## Working with Biopython to transform CDR3 & epitope sequences

- The Biopython library can transform a CDR3 sequence into numerical features, which makes it easier to train a classifier on the sequences.

In [2]:
# Uncomment to install Biopython into the kernel's environment (restart the kernel afterwards):
# %pip install biopython


Collecting biopython
  Downloading biopython-1.83-cp39-cp39-macosx_10_9_x86_64.whl (2.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: biopython
Successfully installed biopython-1.83
Note: you may need to restart the kernel to use updated packages.


In [7]:
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import numpy as np

def physicochemical_properties(sequence):
    """
    Compute a list of Biopython ``ProteinAnalysis`` descriptors for a sequence.

    Parameters
    ----------
    sequence : str
        Amino-acid sequence, passed unchanged to ``ProteinAnalysis``.

    Returns
    -------
    list
        ``[molecular_weight, aromaticity, instability_index,
        isoelectric_point, flexibility]`` in that order. The last entry is
        whatever ``ProteinAnalysis.flexibility()`` returns; in the example run
        shown in this notebook it is itself a list of floats, so the result is
        nested rather than a flat numeric vector.
    """
    protein = ProteinAnalysis(sequence)

    # Evaluate each descriptor in the same order the result list reports them.
    weight = protein.molecular_weight()
    aromaticity = protein.aromaticity()
    instability = protein.instability_index()
    isoelectric_point = protein.isoelectric_point()
    flexibility = protein.flexibility()

    return [weight, aromaticity, instability, isoelectric_point, flexibility]

def extract_motifs(sequence, motif_length=3):
    """
    Return every contiguous window of ``motif_length`` characters in a sequence.

    Windows are taken left to right with a step of one, so consecutive motifs
    overlap by ``motif_length - 1`` characters.

    Parameters
    ----------
    sequence : str
        Sequence to slice into motifs.
    motif_length : int, optional
        Number of characters per motif (default 3). Must be at least 1.

    Returns
    -------
    list of str
        The motifs in order of their start position. The list is empty when
        the sequence is shorter than ``motif_length``.

    Raises
    ------
    ValueError
        If ``motif_length`` is less than 1. Without this check a length of 0
        yields a list of empty strings and a negative length yields truncated
        slices, neither of which is a meaningful motif.
    """
    if motif_length < 1:
        raise ValueError(
            f"motif_length must be a positive integer, got {motif_length!r}"
        )

    # A negative window count makes range() empty, giving [] for short sequences.
    window_count = len(sequence) - motif_length + 1
    return [
        sequence[start:start + motif_length]
        for start in range(window_count)
    ]

def main():
    """Run both feature helpers on one example CDR3 sequence and print the results."""
    example_cdr3 = "CASSPQTGTGGYGYTF"

    # Compute both feature sets first, then report them.
    properties = physicochemical_properties(example_cdr3)
    sequence_motifs = extract_motifs(example_cdr3)

    print("Physicochemical Features:", properties)
    print("Extracted Motifs:", sequence_motifs)


# Run the demo only when this code executes as the main module.
if __name__ == "__main__":
    main()


Physicochemical Features: [1596.6722000000002, 0.1875, 39.19375000000002, 5.516758537292478, [1.021642857142857, 1.0200119047619047, 1.0319880952380953, 1.0120714285714285, 1.0198095238095237, 1.006190476190476, 0.9800000000000001]]
Extracted Motifs: ['CAS', 'ASS', 'SSP', 'SPQ', 'PQT', 'QTG', 'TGT', 'GTG', 'TGG', 'GGY', 'GYG', 'YGY', 'GYT', 'YTF']


In [8]:
import pandas as pd

# Load the tab-separated VDJdb export into a DataFrame.
df = pd.read_csv(
    '../Tasking/Github/vdjdb-2023-06-01/vdjdb.txt',
    sep="\t",
)

# Keep only the first 100 rows as a small working sample, then preview it.
df_sample = df.iloc[:100]
df_sample.head(5)

Unnamed: 0,complex.id,gene,cdr3,v.segm,j.segm,species,mhc.a,mhc.b,mhc.class,antigen.epitope,...,antigen.species,reference.id,method,meta,cdr3fix,vdjdb.score,web.method,web.method.seq,web.cdr3fix.nc,web.cdr3fix.unmp
0,1,TRA,CIVRAPGRADMRF,TRAV26-1*01,TRAJ43*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,...,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CIVRAPGRADMRF"", ""cdr3_old"": ""CIVRAPG...",2,sort,sanger,no,no
1,1,TRB,CASSYLPGQGDHYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,...,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CASSYLPGQGDHYSNQPQHF"", ""cdr3_old"": ""...",2,sort,sanger,no,no
2,0,TRB,CASSFEAGQGFFSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,...,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CASSFEAGQGFFSNQPQHF"", ""cdr3_old"": ""C...",2,sort,sanger,no,no
3,2,TRA,CAVPSGAGSYQLTF,TRAV20*01,TRAJ28*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,...,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CAVPSGAGSYQLTF"", ""cdr3_old"": ""CAVPSG...",2,sort,sanger,no,no
4,2,TRB,CASSFEPGQGFYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,...,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CASSFEPGQGFYSNQPQHF"", ""cdr3_old"": ""C...",2,sort,sanger,no,no


In [9]:
# Apply physicochemical_properties to every CDR3 sequence in the sample;
# the result is a Series holding one descriptor list per row.
df_sample.loc[:, "cdr3"].map(physicochemical_properties)

0     [1491.7833999999998, 0.07692307692307693, 5.18...
1     [2236.335, 0.15000000000000002, 61.035, 5.9679...
2     [2089.2024, 0.21052631578947367, 30.4578947368...
3     [1400.5550999999998, 0.14285714285714285, 39.5...
4     [2131.2391, 0.21052631578947367, 57.7531578947...
                            ...                        
95    [1536.6633000000002, 0.21428571428571427, 13.9...
96    [1957.1642000000006, 0.1, 16.025000000000002, ...
97    [1588.6568000000004, 0.0625, 52.125, 5.2399526...
98    [1663.7614000000003, 0.2, 12.120000000000001, ...
99    [1490.5570000000002, 0.07692307692307693, 43.5...
Name: cdr3, Length: 100, dtype: object