In [3]:
import pandas as pd

In [4]:
# Read the tab-delimited table stored in 'vdjdb.txt' and preview its first rows.
file_path = 'vdjdb.txt'
df = pd.read_csv(file_path, delimiter='\t')  # `delimiter` is pandas' alias for `sep`
df.head()

Unnamed: 0,complex.id,gene,cdr3,v.segm,j.segm,species,mhc.a,mhc.b,mhc.class,antigen.epitope,...,antigen.species,reference.id,method,meta,cdr3fix,vdjdb.score,web.method,web.method.seq,web.cdr3fix.nc,web.cdr3fix.unmp
0,1,TRA,CIVRAPGRADMRF,TRAV26-1*01,TRAJ43*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,...,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CIVRAPGRADMRF"", ""cdr3_old"": ""CIVRAPG...",2,sort,sanger,no,no
1,1,TRB,CASSYLPGQGDHYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,...,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CASSYLPGQGDHYSNQPQHF"", ""cdr3_old"": ""...",2,sort,sanger,no,no
2,0,TRB,CASSFEAGQGFFSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,...,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CASSFEAGQGFFSNQPQHF"", ""cdr3_old"": ""C...",2,sort,sanger,no,no
3,2,TRA,CAVPSGAGSYQLTF,TRAV20*01,TRAJ28*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,...,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CAVPSGAGSYQLTF"", ""cdr3_old"": ""CAVPSG...",2,sort,sanger,no,no
4,2,TRB,CASSFEPGQGFYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,...,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CASSFEPGQGFYSNQPQHF"", ""cdr3_old"": ""C...",2,sort,sanger,no,no


In [5]:
# complex.id: 数据记录的复合体ID，用于唯一标识数据记录。
# gene: TCR的基因类型，如TRA（α链）或TRB（β链）。
# cdr3: TCR中CDR3区的氨基酸序列，这是决定TCR特异性的关键区域。
# v.segm: 变异（V）区基因段的名称，对应于TCRα或β链的V区。
# j.segm: 连接（J）区基因段的名称，对应于TCRα或β链的J区。
# species: TCR来源的物种，如HomoSapiens（人类）。
# mhc.a: 主要组织相容性复合体（MHC）分子的A链类型。
# mhc.b: MHC分子的B链类型或其他相关标记（如B2M，β2-微球蛋白）。
# mhc.class: MHC分子的类别，如MHCI或MHCII。
# antigen.epitope: TCR识别的抗原表位的氨基酸序列。
# antigen.gene: 抗原的基因名称。
# antigen.species: 抗原来源的物种，如HIV-1。
# reference.id: 发表这一数据的文献的参考ID，如PMID:15596521。
# method: 数据收集的方法或技术，例如通过tetramer-sort、sanger测序等。
# meta: 元数据，包含有关样本的额外信息，如细胞亚群、捐献者MHC信息、样本来源等。
# cdr3fix: CDR3区序列修正的信息，包括是否需要修正、修正前后的序列等。
# vdjdb.score: 数据记录的质量评分。
# web.method: 数据提交或验证时使用的网页方法。
# web.method.seq: 与web.method相关的序列信息。
# web.cdr3fix.nc: 网络上CDR3修正的非规范（non-canonical）信息。
# web.cdr3fix.unmp: 网络上CDR3修正的未映射（unmapped）信息。
# complex.id: Identifier of the TCR complex; the TRA and TRB records of one paired TCR share the same ID (0 marks a record with no paired chain), so it is not unique per row.
# gene: The gene type of the TCR, such as TRA (alpha chain) or TRB (beta chain).
# cdr3: Amino acid sequence of the CDR3 region in the TCR, which is the key region that determines the specificity of the TCR.
# v.segm: Name of the Variable (V) gene segment of the TCR alpha or beta chain, e.g. TRBV13*01.
# j.segm: Name of the Joining (J) gene segment of the TCR alpha or beta chain, e.g. TRBJ1-5*01.
# species: The species from which the TCR originated, e.g. HomoSapiens (human).
# mhc.a: A-chain type of the major histocompatibility complex (MHC) molecule.
# mhc.b: B-chain type of MHC molecule or other relevant marker (e.g. B2M, β2-microglobulin).
# mhc.class: Class of the MHC molecule, e.g. MHCI or MHCII.
# antigen.epitope: Amino acid sequence of the antigenic epitope recognised by the TCR.
# antigen.gene: The gene name of the antigen.
# antigen.species: The species from which the antigen originated, e.g. HIV-1.
# reference.id: The reference ID of the literature in which this data was published, e.g. PMID:15596521.
# method: The method or technique used for data collection, e.g. by tetramer-sort, sanger sequencing, etc.
# meta: Metadata containing additional information about the sample, e.g. cell subpopulations, donor MHC information, sample source, etc.
# cdr3fix: Information about sequence correction of the CDR3 region, including whether a correction was needed and the sequence before and after correction.
# vdjdb.score: Quality score of the data record.
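# web.method: Simplified label of the identification method, as shown in the VDJdb web interface (e.g. sort).
# web.method.seq: Simplified label of the sequencing method, as shown in the web interface (e.g. sanger).
# web.cdr3fix.nc: yes/no flag shown in the web interface indicating whether the fixed CDR3 is non-canonical.
# web.cdr3fix.unmp: yes/no flag shown in the web interface indicating whether a segment was left unmapped during CDR3 fixing.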

In [7]:
# Restrict the frame to the columns of interest: TCR chain descriptors,
# the MHC class, and the antigen annotation.
relevant_columns = (
    ['gene', 'cdr3', 'v.segm', 'j.segm', 'species']
    + ['mhc.class']
    + ['antigen.epitope', 'antigen.gene', 'antigen.species']
)
df = df.loc[:, relevant_columns]
df.head()

Unnamed: 0,gene,cdr3,v.segm,j.segm,species,mhc.class,antigen.epitope,antigen.gene,antigen.species
0,TRA,CIVRAPGRADMRF,TRAV26-1*01,TRAJ43*01,HomoSapiens,MHCI,FLKEKGGL,Nef,HIV-1
1,TRB,CASSYLPGQGDHYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,MHCI,FLKEKGGL,Nef,HIV-1
2,TRB,CASSFEAGQGFFSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,MHCI,FLKEKGGL,Nef,HIV-1
3,TRA,CAVPSGAGSYQLTF,TRAV20*01,TRAJ28*01,HomoSapiens,MHCI,FLKEKGGL,Nef,HIV-1
4,TRB,CASSFEPGQGFYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,MHCI,FLKEKGGL,Nef,HIV-1


In [8]:
# Data cleaning: drop every row that has a missing value in any retained column.
# The derived columns are then produced in a single `assign`, so they are written
# into a fresh DataFrame instead of being set on the result of chained filtering
# (which can trigger pandas' SettingWithCopyWarning):
#   - cdr3_length: number of characters in each CDR3 string, via the vectorised
#     `.str.len()` accessor rather than a Python-level `.apply(len)`.
#   - mhc.class:   replaced in place by the integer code of its category.
# 'mhc.class' contains a dot, so it has to be passed through a dict unpacking
# rather than as a plain keyword argument.
df = df.dropna().assign(
    cdr3_length=lambda frame: frame['cdr3'].str.len(),
    **{'mhc.class': lambda frame: frame['mhc.class'].astype('category').cat.codes},
)

# Show the first few rows of data after preprocessing
df.head()

Unnamed: 0,gene,cdr3,v.segm,j.segm,species,mhc.class,antigen.epitope,antigen.gene,antigen.species,cdr3_length
0,TRA,CIVRAPGRADMRF,TRAV26-1*01,TRAJ43*01,HomoSapiens,0,FLKEKGGL,Nef,HIV-1,13
1,TRB,CASSYLPGQGDHYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,0,FLKEKGGL,Nef,HIV-1,20
2,TRB,CASSFEAGQGFFSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,0,FLKEKGGL,Nef,HIV-1,19
3,TRA,CAVPSGAGSYQLTF,TRAV20*01,TRAJ28*01,HomoSapiens,0,FLKEKGGL,Nef,HIV-1,14
4,TRB,CASSFEPGQGFYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,0,FLKEKGGL,Nef,HIV-1,19


In [None]:
# The main limitation of using the distance or similarity score matrix between TCR sequences to predict specificity 
# in the method you provided is the inconsistency in the length of the CDR3 region. Since the CDR3 regions of TCR 
# sequences can have different lengths, this complicates direct comparisons or the creation of uniform feature 
# representations. One-hot encoding produces highly sparse data when dealing with this type of problem, which 
# may be detrimental to model training and efficiency. In addition, simple one-hot encoding may not be able to 
# capture the complex interactions and biological properties between amino acids that are critical for determining 
# the specificity of TCRs.

# To overcome these limitations, several strategies can be employed. Firstly, the length of the CDR3 region can be 
# standardised by padding or truncating sequences to ensure that all sequences have the same dimension when encoded. 
# Second, other biological features of the sequences, such as the physicochemical properties of amino acids, can be 
# introduced, as well as the use of advanced sequence encoding methods, such as word embedding techniques, to reduce 
# sparsity and provide a richer representation of the sequences. In addition, the use of deep learning models to 
# process variable-length sequences and learn complex patterns of sequences, or sequence alignment algorithms to 
# ensure accuracy of comparisons can be considered. With these approaches, the performance of prediction models can 
# be improved and the specificity of TCRs can be predicted more accurately.

# 在您提供的方法中，利用TCR序列之间的距离或相似度分数矩阵来预测特异性的主要局限性在于CDR3区域长度的不一致性。由于TCR序列的CDR3区域可以有不
# 同的长度，这使得直接比较或创建统一的特征表示变得复杂。独热编码在处理这类问题时会产生高度稀疏的数据，这可能不利于模型的训练和效率。此外，简单
# 的独热编码可能无法捕捉氨基酸之间的复杂相互作用和生物学特性，这些特性对于确定TCR的特异性至关重要。

# 为了克服这些局限性，可以采取多种策略。首先，可以通过对序列进行填充或截断来标准化CDR3区域的长度，以确保所有序列在编码时具有相同的维度。其次
# ，可以引入序列的其他生物学特征，如氨基酸的物理化学属性，以及使用高级的序列编码方法，如词嵌入技术，以降低稀疏性并提供更丰富的序列表示。此外
# ，还可以考虑使用深度学习模型来处理变长序列并学习序列的复杂模式，或者采用序列对齐算法来确保比较的准确性。通过这些方法，可以提高预测模型的性
# 能，更准确地预测TCR的特异性。
