In [1]:
import pandas as pd
from matplotlib import colors
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neighbors import KNeighborsRegressor
import matplotlib.pyplot as plt
import numpy as np

# 定义文件路径
file_path = 'vdjdb.txt'  # 将 'your_file.txt' 替换为你的文件路径

# 读取文件内容
with open(file_path, 'r', encoding='utf-8') as file:
    # 读取文件的第一行，获取所有的信息变量名
    header = file.readline().strip().split('\t')
    tcr_data = [dict(zip(header, line.strip().split('\t'))) for line in file]
cdr3_dict = {}
for row in tcr_data:
    complex_id = row['complex.id']
    cdr3 = row['cdr3']
    # Splice together CDR3 with the same complex ID
    if complex_id in cdr3_dict:
        cdr3_dict[complex_id].append(cdr3)
    else:
        cdr3_dict[complex_id] = [cdr3]
# There is a DataFrame containing the TCR sequence
for row in tcr_data:
    complex_id = row['complex.id']
    antigen_epitope = row['antigen.epitope']
    vdjdb_score = row['vdjdb.score']
    # Splice together CDR3 with the same complex ID
    if len(cdr3_dict[complex_id]) == 2:
        cdr3_dict[complex_id].append(antigen_epitope)
        cdr3_dict[complex_id].append(vdjdb_score)
    else:
        continue
cdr3_dict.pop('0')
##Delete unpaired TCRs
df_cdr3 = pd.DataFrame(cdr3_dict)
df_cdr3_trans = df_cdr3.transpose()
names = ['TRA', 'TRB', 'antigen_epitope', 'vdjdb.score']
df_cdr3_trans.columns = names
print(df_cdr3_trans)
##The first step is to read out the paired data

                  TRA                   TRB antigen_epitope vdjdb.score
1       CIVRAPGRADMRF  CASSYLPGQGDHYSNQPQHF        FLKEKGGL           2
2      CAVPSGAGSYQLTF   CASSFEPGQGFYSNQPQHF        FLKEKGGL           2
3         CAVKASGSRLT  CASSYEPGQVSHYSNQPQHF        FLKEKGGL           2
4       CAYRPPGTYKYIF        CASSALASLNEQFF        FLKEKGGL           2
5       CIVRAPGRADMRF  CASSYLPGQGDHYSNQPQHF        FLKEQGGL           2
...               ...                   ...             ...         ...
30590   CMDEGGSNYKLTF         CASSVRSTDTQYF    PQPELPYPQPQL           0
30591     CSLYNNNDMRF         CASSLRYTDTQYF    PQPELPYPQPQL           0
30592   CALSTDSWGKLQF       CASSPGQGGDNEQFF   PQQPFPQPEQPFP           0
30593    CAPQGATNKLIF       CASSLGAGGQETQYF   PQQPFPQPEQPFP           2
30594  CLVGGSGGYNKLIF         CASSSTAQETQYF   PQQPFPQPEQPFP           0

[30594 rows x 4 columns]


In [2]:
neg_data= df_cdr3_trans[df_cdr3_trans['vdjdb.score'] == '0']
pos_data_1= df_cdr3_trans[df_cdr3_trans['vdjdb.score'] == '1']
pos_data_2= df_cdr3_trans[df_cdr3_trans['vdjdb.score'] == '2']
pos_data_3= df_cdr3_trans[df_cdr3_trans['vdjdb.score'] == '3']
# Copy the data of pos_data_2 twice
pos_data_2_copy = pos_data_2.copy()
pos_data_2_copy = pd.concat([pos_data_2_copy] * 2, ignore_index=True)

# Copy the data of pos_data_3 three times
pos_data_3_copy = pos_data_3.copy()
pos_data_3_copy = pd.concat([pos_data_3_copy] * 3, ignore_index=True)

# Splice the copied data with pos_data_1
pos_data = pd.concat([pos_data_1, pos_data_2_copy, pos_data_3_copy], ignore_index=True)
# Determine the number of positive samples
num_positive_samples = len(pos_data)

# Randomly select samples from negative samples with the same number as positive samples
neg_data_sampled = neg_data.sample(n=num_positive_samples, random_state=42)
neg_data_sampled = neg_data_sampled.reset_index(drop=True)
pos_data = pos_data.reset_index(drop=True)
neg_data_sampled['label']=0
pos_data['label']=1

In [3]:
balanced_dataset = pd.concat([neg_data_sampled, pos_data], axis=0)
balanced_dataset = balanced_dataset.reset_index(drop=True)
balanced_dataset['TRA_TRB_Combined'] = balanced_dataset["TRA"] + balanced_dataset["TRB"]
##Splicing into a complete dataset

In [4]:
##----------------encoding stage-------------------------------
encoding_map = {'A': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                'C': [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                'D': [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                'E': [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                'F': [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                'G': [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                'H': [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                'I': [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                'K': [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                'L': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                'M': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                'N': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
                'P': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
                'Q': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
                'R': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
                'S': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
                'T': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
                'V': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
                'W': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
                'Y': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]}
cdr3_encoded = [[encoding_map[char] for char in sequence] for sequence in balanced_dataset['TRA_TRB_Combined']]
antigen_encoded = [[encoding_map[char] for char in sequence] for sequence in balanced_dataset['antigen_epitope']]
##one-hot representation encoding stage completed
##next step is padding all input into one size.
longest_cdr3 = max(balanced_dataset['TRA_TRB_Combined'], key=len)
print("Longest cdr3:", longest_cdr3)
print("Longest cdr3's length:", len(longest_cdr3))
longest_antigen_epitope = max(balanced_dataset['antigen_epitope'], key=len)
print("Longest antigen_epitope:", longest_antigen_epitope)
print("Longest antigen_epitope's length:", len(longest_antigen_epitope))


def padding_sequence(origin, sequence_length):
    padded = np.zeros((sequence_length, 20))
    padded[:len(origin)] = origin
    return padded


cdr3_encoded_padded = [padding_sequence(seq, len(longest_cdr3)) for seq in cdr3_encoded]
antigen_encoded_padded = [padding_sequence(seq, len(longest_antigen_epitope)) for seq in antigen_encoded]
cdr3_encoded_padded_flat = [seq.flatten() for seq in cdr3_encoded_padded]
antigen_encoded_padded_flat = [seq.flatten() for seq in antigen_encoded_padded]
balanced_dataset['cdr3_code'] = cdr3_encoded_padded_flat
balanced_dataset['antigen_code'] = antigen_encoded_padded_flat

Longest cdr3: CAASRANAGGTSYGKLTFCASSQDKVSWTGVSGGMNTEAFF
Longest cdr3's length: 41
Longest antigen_epitope: MTEYKLVVVGAVGVGKSALTIQLI
Longest antigen_epitope's length: 24


In [5]:
balanced_dataset['input'] = balanced_dataset.apply(lambda row: list(row['cdr3_code']) + list(row['antigen_code']), axis=1)
##splice cdr3_code abd antigen_code

In [51]:

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
X = np.array(balanced_dataset['input'].tolist())  
y = np.array(balanced_dataset['label'].tolist())  
# Split X and y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=66,shuffle=True)
from sklearn.ensemble import RandomForestClassifier
# RandomForestClassfier
random_forest_model = RandomForestClassifier(n_estimators=150,max_depth=None, max_features='log2', min_samples_leaf=1, min_samples_split=2)
random_forest_model.fit(X_train, y_train)

# predict test
y_pred = random_forest_model.predict(X_test)
# accuracy
from sklearn.metrics import accuracy_score

from sklearn.metrics import recall_score, precision_score, f1_score
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)
print("Accuracy:", accuracy)

Recall: 0.9587020648967551
Precision: 0.8895985401459854
F1 Score: 0.922858495030762
Accuracy: 0.919585594474593
