In [2]:

from torch.utils.data import TensorDataset, DataLoader

import numpy as np
import pandas as pd


## ------------------ Load the VDJdb export ------------------
# Path of the tab-separated input file, relative to the working directory.
file_path = 'vdjdb.txt'

with open(file_path, 'r', encoding='utf-8') as file:
    # The first line carries the column names.
    header = file.readline().rstrip('\r\n').split('\t')
    # Only the line terminator is removed before splitting: a bare strip()
    # would also eat leading/trailing tabs, silently dropping empty first or
    # last fields and shifting/truncating the record when zipped with the
    # header. Blank lines are skipped so they do not become partial records.
    tcr_data = [
        dict(zip(header, line.rstrip('\r\n').split('\t')))
        for line in file
        if line.strip()
    ]
print(header)

['complex.id', 'gene', 'cdr3', 'v.segm', 'j.segm', 'species', 'mhc.a', 'mhc.b', 'mhc.class', 'antigen.epitope', 'antigen.gene', 'antigen.species', 'reference.id', 'method', 'meta', 'cdr3fix', 'vdjdb.score', 'web.method', 'web.method.seq', 'web.cdr3fix.nc', 'web.cdr3fix.unmp']


In [3]:
# -------- Cleaning step 1: keep only the attributes that are used --------
wanted_columns = ('cdr3', 'antigen.epitope', 'vdjdb.score')
selected_data = [{column: entry[column] for column in wanted_columns}
                 for entry in tcr_data]

# -------- Cleaning step 2: tabulate and drop score-'0' rows --------
# Scores are compared as strings because every field was read as text.
# NOTE(review): the original heading also mentioned removing duplicates,
# but no de-duplication is performed in this cell.
df_raw = pd.DataFrame(selected_data)
df_clean = df_raw.loc[df_raw['vdjdb.score'] != '0'].reset_index(drop=True)

In [4]:
# Split the raw table on the score column: rows scored '0' are used as
# negatives, every other row as a positive.
is_zero_score = df_raw['vdjdb.score'] == '0'
neg_data = df_raw[is_zero_score].reset_index(drop=True)
pos_data = df_raw[~is_zero_score].reset_index(drop=True)
num_positive_samples = len(pos_data)

# Draw as many negatives as there are positives (fixed seed), then renumber.
neg_data_sampled = (
    neg_data
    .sample(n=num_positive_samples, random_state=42)
    .reset_index(drop=True)
)

# Attach the binary target: 0 for sampled negatives, 1 for positives.
neg_data_sampled['label'] = 0
pos_data['label'] = 1

In [5]:
# Stack the sampled negatives on top of the positives and renumber rows 0..n-1.
balanced_dataset = pd.concat([neg_data_sampled, pos_data], axis=0, ignore_index=True)

In [6]:
# Bare expression: shows the balanced frame as this cell's output.
balanced_dataset

Unnamed: 0,cdr3,antigen.epitope,vdjdb.score,label
0,CAVIGTTDSWGKLQF,KLGGALQAK,0,0
1,CAFMMNYGGSQGNLIF,KLGGALQAK,0,0
2,CASSGAGGEVFF,SYIGSINNI,0,0
3,CAASSLYGQNFVF,LLWNGPMAV,0,0
4,CARPPETQYF,ELAGIGILTV,0,0
...,...,...,...,...
24823,CASSQGSGGNEQFF,FPQPEQPFPWQP,2,1
24824,CAASVLYGSSNTGKLIF,QLQPFPQPELPY,2,1
24825,CASSIVGSGGYNEQFF,QLQPFPQPELPY,2,1
24826,CAPQGATNKLIF,PQQPFPQPEQPFP,2,1


In [7]:
## ---------------- One-hot encode the sequences ----------------
# Every letter of the 20-letter alphabet maps to a length-20 indicator list
# whose single 1 sits at the letter's position in the alphabet string.
amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
encoding_map = {
    residue: [1 if slot == position else 0 for slot in range(len(amino_acids))]
    for position, residue in enumerate(amino_acids)
}

# Replace each character of each sequence by its indicator list.
cdr3_encoded = [list(map(encoding_map.__getitem__, sequence))
                for sequence in balanced_dataset['cdr3']]
antigen_encoded = [list(map(encoding_map.__getitem__, sequence))
                   for sequence in balanced_dataset['antigen.epitope']]

# The encoded matrices differ in length, so find the longest sequence of
# each kind; its length is the padding target used afterwards.
longest_cdr3 = max(balanced_dataset['cdr3'], key=len)
longest_antigen_epitope = max(balanced_dataset['antigen.epitope'], key=len)
print("最长的cdr3:", longest_cdr3)
print("最长cdr3的长度:", len(longest_cdr3))
print("最长的antigen_epitope:", longest_antigen_epitope)
print("最长antigen_epitope的长度:", len(longest_antigen_epitope))


def padding_sequence(origin, sequence_length, n_features=20):
    """Zero-pad an encoded sequence to a fixed number of rows.

    Args:
        origin: Sequence of per-position feature vectors (each of length
            ``n_features``). May be empty.
        sequence_length: Number of rows in the returned matrix.
        n_features: Width of each feature vector; defaults to 20, the width
            used by the one-hot encoding in this notebook.

    Returns:
        A float ``numpy`` array of shape ``(sequence_length, n_features)``
        whose first ``len(origin)`` rows hold ``origin`` and whose remaining
        rows are zeros.

    Raises:
        ValueError: If ``origin`` has more rows than ``sequence_length``.
    """
    padded = np.zeros((sequence_length, n_features))
    n_rows = len(origin)
    if n_rows > sequence_length:
        raise ValueError(
            f"sequence of length {n_rows} does not fit into {sequence_length} rows"
        )
    # An empty list converts to shape (0,), which cannot be broadcast into a
    # (0, n_features) slice, so skip the assignment when there is nothing to copy.
    if n_rows:
        padded[:n_rows] = origin
    return padded


# Pad every encoded sequence up to the longest sequence of its kind.
max_cdr3_len = len(longest_cdr3)
max_antigen_len = len(longest_antigen_epitope)
cdr3_encoded_padded = [padding_sequence(encoded, max_cdr3_len) for encoded in cdr3_encoded]
antigen_encoded_padded = [padding_sequence(encoded, max_antigen_len) for encoded in antigen_encoded]

# Flatten each padded matrix into a single vector.
cdr3_encoded_padded_flat = list(map(np.ndarray.flatten, cdr3_encoded_padded))
antigen_encoded_padded_flat = list(map(np.ndarray.flatten, antigen_encoded_padded))

# Store the vectors next to the sequences they were derived from.
balanced_dataset['cdr3_code'] = cdr3_encoded_padded_flat
balanced_dataset['antigen_code'] = antigen_encoded_padded_flat

最长的cdr3: CYSTWRLSCLLLCRDSAGAGSYQLTF
最长cdr3的长度: 26
最长的antigen_epitope: MTEYKLVVVGAVGVGKSALTIQLI
最长antigen_epitope的长度: 24


In [12]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Model input per row: the flattened CDR3 code followed by the flattened
# epitope code, stored as one plain list.
balanced_dataset['input'] = pd.Series(
    [list(cdr3_vector) + list(antigen_vector)
     for cdr3_vector, antigen_vector in zip(balanced_dataset['cdr3_code'],
                                            balanced_dataset['antigen_code'])],
    index=balanced_dataset.index,
)

# Feature matrix (one row per sample) and label vector.
X = np.array(balanced_dataset['input'].tolist())
y = np.array(balanced_dataset['label'].tolist())

# Hold out 20% of the samples for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=55)

In [13]:
# Decision tree classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score, precision_score, f1_score

# Seed the estimator so the fitted tree, and therefore the metrics printed
# below, are reproducible across runs.
decision_tree = DecisionTreeClassifier(random_state=42)

# Fit on the training split.
decision_tree.fit(X_train, y_train)

# Predict the held-out split.
y_pred = decision_tree.predict(X_test)

# Evaluate: recall, precision, F1 and accuracy on the test split.
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)
print("Accuracy:", accuracy)

Recall: 0.8278522127486805
Precision: 0.7946219797349962
F1 Score: 0.810896798568304
Accuracy: 0.8084977849375755


In [14]:
## Gaussian naive Bayes model
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# NOTE(review): the features are one-hot (0/1) vectors; a Bernoulli variant
# may suit them better - confirm before changing the model.
naive_bayes = GaussianNB()

# Fit on the training split, then predict the held-out split
# (fit returns the estimator, so the calls can be chained).
y_pred = naive_bayes.fit(X_train, y_train).predict(X_test)

# Score the predictions on the test split.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)
print("Accuracy:", accuracy)

Recall: 0.15468940316686966
Precision: 0.8318777292576419
F1 Score: 0.2608695652173913
Accuracy: 0.5652436568666935


In [16]:

# k-nearest-neighbours model
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Classifier voting over the 3 nearest training samples.
knn_model = KNeighborsClassifier(n_neighbors=3)

# Fit on the training split, then predict the held-out split
# (fit returns the estimator, so the calls can be chained).
y_pred = knn_model.fit(X_train, y_train).predict(X_test)

# Score the predictions on the test split.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)
print("Accuracy:", accuracy)

Recall: 0.8327243199350386
Precision: 0.7777777777777778
F1 Score: 0.8043137254901962
Accuracy: 0.7990334273056786


In [17]:
## Logistic regression model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Allow the solver up to 1000 iterations.
logistic_regression = LogisticRegression(max_iter=1000)

# Fit on the training split, then predict the held-out split
# (fit returns the estimator, so the calls can be chained).
y_pred = logistic_regression.fit(X_train, y_train).predict(X_test)

# Score the predictions on the test split.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)
print("Accuracy:", accuracy)

Recall: 0.8493706861550954
Precision: 0.7705340699815838
F1 Score: 0.8080339899575126
Accuracy: 0.7998389045509464
