In [40]:
import pandas as pd
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neighbors import KNeighborsRegressor

import numpy as np

# Path to the tab-separated VDJdb export (replace with your own file path).
file_path = 'vdjdb.txt'

# complex.id value whose records are discarded as unpaired chains.
UNPAIRED_COMPLEX_ID = '0'

# Parse the file: the first line holds the column names, every following
# non-blank line is one chain record stored as a {column: value} dict.
with open(file_path, 'r', encoding='utf-8') as file:
    # Strip only the line terminator. A plain strip() would also remove
    # trailing tabs and silently drop empty trailing columns from a record.
    header = file.readline().rstrip('\r\n').split('\t')
    tcr_data = [
        dict(zip(header, line.rstrip('\r\n').split('\t')))
        for line in file
        if line.strip()  # skip blank lines instead of creating broken records
    ]
print(header)

# Group the chain records by complex.id, preserving first-appearance order.
rows_by_complex = {}
for row in tcr_data:
    complex_id = row['complex.id']
    if complex_id != UNPAIRED_COMPLEX_ID:
        rows_by_complex.setdefault(complex_id, []).append(row)

# Keep only complexes made of exactly one TRA and one TRB record.
# The chain is identified through the `gene` column rather than through the
# order of the rows in the file, so a TRB-first pair is not mislabelled.
# NOTE(review): assumes `gene` holds the literal values 'TRA' / 'TRB' -- confirm
# against the export in use.
cdr3_dict = {}
for complex_id, rows in rows_by_complex.items():
    cdr3_by_gene = {row['gene']: row['cdr3'] for row in rows}
    if len(rows) != 2 or set(cdr3_by_gene) != {'TRA', 'TRB'}:
        # Incomplete or malformed complexes would otherwise produce lists of
        # a different length and make the DataFrame construction fail.
        continue
    first_row = rows[0]  # annotation columns are taken from the first record
    cdr3_dict[complex_id] = [
        cdr3_by_gene['TRA'],
        cdr3_by_gene['TRB'],
        first_row['antigen.epitope'],
        first_row['vdjdb.score'],
        first_row['species'],
        first_row['mhc.class'],
    ]

if not cdr3_dict:
    raise ValueError(
        f"No paired TRA/TRB complexes found in {file_path!r}; "
        "check the 'complex.id' and 'gene' columns."
    )

# One column per complex, then transpose so that each row is one paired TCR
# indexed by its complex.id.
df_cdr3 = pd.DataFrame(cdr3_dict)
df_cdr3_trans = df_cdr3.transpose()
names = ['TRA', 'TRB', 'antigen_epitope', 'vdjdb.score','species','mhc']
df_cdr3_trans.columns = names
print(df_cdr3_trans)

['complex.id', 'gene', 'cdr3', 'v.segm', 'j.segm', 'species', 'mhc.a', 'mhc.b', 'mhc.class', 'antigen.epitope', 'antigen.gene', 'antigen.species', 'reference.id', 'method', 'meta', 'cdr3fix', 'vdjdb.score', 'web.method', 'web.method.seq', 'web.cdr3fix.nc', 'web.cdr3fix.unmp']
                  TRA                   TRB antigen_epitope vdjdb.score  \
1       CIVRAPGRADMRF  CASSYLPGQGDHYSNQPQHF        FLKEKGGL           2   
2      CAVPSGAGSYQLTF   CASSFEPGQGFYSNQPQHF        FLKEKGGL           2   
3         CAVKASGSRLT  CASSYEPGQVSHYSNQPQHF        FLKEKGGL           2   
4       CAYRPPGTYKYIF        CASSALASLNEQFF        FLKEKGGL           2   
5       CIVRAPGRADMRF  CASSYLPGQGDHYSNQPQHF        FLKEQGGL           2   
...               ...                   ...             ...         ...   
30590   CMDEGGSNYKLTF         CASSVRSTDTQYF    PQPELPYPQPQL           0   
30591     CSLYNNNDMRF         CASSLRYTDTQYF    PQPELPYPQPQL           0   
30592   CALSTDSWGKLQF       CASSPGQGGDNEQFF   PQ

In [41]:
# Restrict the paired TCR table to human records.
is_human = df_cdr3_trans['species'] == 'HomoSapiens'
df_cdr3_trans = df_cdr3_trans[is_human]

# Records with vdjdb.score '0' form the negative class, everything else the
# positive class. Scores are compared as strings because the file was parsed
# as text.
has_zero_score = df_cdr3_trans['vdjdb.score'] == '0'
neg_data = df_cdr3_trans[has_zero_score]
pos_data = df_cdr3_trans[~has_zero_score]

# Size of the positive class; the negatives are down-sampled to match it.
num_positive_samples = len(pos_data)

# Draw the same number of negatives (fixed seed), renumber the rows and
# attach the class label: 0 = negative, 1 = positive.
neg_data_sampled = (
    neg_data
    .sample(n=num_positive_samples, random_state=42)
    .reset_index(drop=True)
    .assign(label=0)
)
pos_data = pos_data.reset_index(drop=True).assign(label=1)

print(neg_data_sampled)
print(pos_data)

                   TRA               TRB antigen_epitope vdjdb.score  \
0        CADSGGGADGLTF   CASSEEAGEYNEQFF        RAKFKQLL           0   
1      CAGLNYGGSQGNLIF  CASSVRERAGANVLTF  TFEYVSQPFLMDLE           0   
2       CAGNGGGSQGNLIF     CASSGRSTGELFF       GILGFVFTL           0   
3         CAVVQGAQKLVF  CASSYLTGTGAYEQYF      ELAGIGILTV           0   
4     CAFMKHENSGTYKYIF   CATSDSGRVNTEAFF      LLDFVRFMGV           0   
...                ...               ...             ...         ...   
2035    CAASAGGSQGNLIF    CASSQDLGEETQYF       KSKRTPMGF           0   
2036    CAVGWGGATNKLIF   CAWSVGVGQLDGYTF       KLGGALQAK           0   
2037  CALSDSGGTSYGKLTF       CASSGTGELFF       KLGGALQAK           0   
2038     CADSGGGADGLTF    CASAPDGFFYGYTF        RAKFKQLL           0   
2039   CAVRDRTGGYNKLIF  CASSLVPGTGEYEQYF       KLGGALQAK           0   

          species    mhc  label  
0     HomoSapiens   MHCI      0  
1     HomoSapiens  MHCII      0  
2     HomoSapiens   MHCI      0  

In [42]:
# Stack the sampled negatives on top of the positives into one table with a
# fresh 0..n-1 index, and add the TRA and TRB CDR3 strings joined end to end.
balanced_dataset = pd.concat(
    [neg_data_sampled, pos_data],
    axis=0,
    ignore_index=True,
)
balanced_dataset['TRA_TRB_Combined'] = balanced_dataset["TRA"] + balanced_dataset["TRB"]

In [43]:
from sklearn.feature_extraction.text import CountVectorizer

# Raw strings to encode: the concatenated TRA+TRB CDR3 and the epitope.
cdr_sequences = balanced_dataset['TRA_TRB_Combined'].tolist()
antigen_epitope_sequences = balanced_dataset['antigen_epitope'].tolist()

# Case-sensitive per-character counts. Each sequence type gets its own
# vectorizer: refitting a single instance on the epitopes would throw away
# the vocabulary learned from the CDR3 strings, so new CDR3 sequences could
# no longer be transformed into the same columns used for training.
cdr_vectorizer = CountVectorizer(analyzer='char', lowercase=False)
cdr_code = cdr_vectorizer.fit_transform(cdr_sequences)

epitope_vectorizer = CountVectorizer(analyzer='char', lowercase=False)
antigen_epitope_code = epitope_vectorizer.fit_transform(antigen_epitope_sequences)

# Kept for backward compatibility: previously `vectorizer` ended this cell
# fitted on the epitope sequences.
vectorizer = epitope_vectorizer

# Convert the sparse count matrices to plain nested lists (one list per row).
cdr_code_list = cdr_code.toarray().tolist()
antigen_epitope_code_list = antigen_epitope_code.toarray().tolist()

# Store the per-row count vectors in balanced_dataset.
balanced_dataset['cdr3_code'] = cdr_code_list
balanced_dataset['antigen_code'] = antigen_epitope_code_list

In [44]:
# Splice cdr3_code and antigen_code: for every row, the CDR3 count vector
# followed by the epitope count vector forms one flat feature list.
balanced_dataset['input'] = [
    list(cdr3_counts) + list(epitope_counts)
    for cdr3_counts, epitope_counts in zip(
        balanced_dataset['cdr3_code'], balanced_dataset['antigen_code']
    )
]

In [45]:
# Build the feature matrix and the target vector, then hold out a test set.
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Features: the concatenated CDR3 + candidate-epitope count vectors.
X = np.array(balanced_dataset['input'].to_list())
# Target: the 0/1 class label.
y = np.array(balanced_dataset['label'].to_list())

# 80 % training / 20 % test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [46]:
## Logistic Regression Model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score, precision_score, f1_score

# Create the classifier (at most 1000 solver iterations) and fit it on the
# training split; fit() returns the estimator itself.
logistic_regression = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = logistic_regression.predict(X_test)

# Score the predictions against the true test labels.
recall, precision, f1, accuracy = (
    score_fn(y_test, y_pred)
    for score_fn in (recall_score, precision_score, f1_score, accuracy_score)
)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)
print("Accuracy:", accuracy)

Recall: 0.9331619537275064
Precision: 0.7773019271948608
F1 Score: 0.8481308411214953
Accuracy: 0.8406862745098039


In [47]:
# DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score, precision_score, f1_score

# Create the decision tree. The seed is fixed so that the fitted tree, and
# therefore the metrics printed below, are identical on every run; without
# it the scores drift between executions.
decision_tree = DecisionTreeClassifier(random_state=42)

# Fit on the training split.
decision_tree.fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = decision_tree.predict(X_test)

# Score the predictions against the true test labels.
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)
print("Accuracy:", accuracy)

Recall: 0.8766066838046273
Precision: 0.8567839195979899
F1 Score: 0.8665819567979669
Accuracy: 0.8713235294117647


In [48]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score, precision_score, f1_score

# RandomForestClassifier: 150 trees, unlimited depth, log2(n_features)
# candidate features per split. The seed is fixed so that the forest, and
# therefore the metrics printed below, are reproducible between runs.
random_forest_model = RandomForestClassifier(
    n_estimators=150,
    max_depth=None,
    max_features='log2',
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=42,
)
random_forest_model.fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = random_forest_model.predict(X_test)

# Score the predictions against the true test labels.
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)
print("Accuracy:", accuracy)

Recall: 0.9485861182519281
Precision: 0.821826280623608
F1 Score: 0.8806682577565632
Accuracy: 0.8774509803921569


In [49]:
## Naive Bayes Model
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score, precision_score, f1_score

# Create the Gaussian naive Bayes classifier and fit it on the training split.
naive_bayes = GaussianNB()
naive_bayes.fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = naive_bayes.predict(X_test)

# Score the predictions once, keyed by the caption used when printing.
scores = {
    "Recall:": recall_score(y_test, y_pred),
    "Precision:": precision_score(y_test, y_pred),
    "F1 Score:": f1_score(y_test, y_pred),
    "Accuracy:": accuracy_score(y_test, y_pred),
}
recall, precision, f1, accuracy = scores.values()
for caption, score in scores.items():
    print(caption, score)

Recall: 0.7712082262210797
Precision: 0.7537688442211056
F1 Score: 0.7623888182973316
Accuracy: 0.7708333333333334


In [50]:

# KNN Model
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score, precision_score, f1_score

# Create a k-nearest-neighbours classifier that votes over the 3 closest
# training samples, and fit it on the training split.
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model = knn_model.fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = knn_model.predict(X_test)

# Score the predictions against the true test labels.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)
print("Accuracy:", accuracy)

Recall: 0.8688946015424165
Precision: 0.8047619047619048
F1 Score: 0.8355995055624227
Accuracy: 0.8370098039215687
