In [1]:
import pandas as pd
from matplotlib import colors
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neighbors import KNeighborsRegressor
import matplotlib.pyplot as plt
import numpy as np

# 定义文件路径
file_path = 'vdjdb.txt'  # 将 'your_file.txt' 替换为你的文件路径

# 读取文件内容
with open(file_path, 'r', encoding='utf-8') as file:
    # 读取文件的第一行，获取所有的信息变量名
    header = file.readline().strip().split('\t')
    tcr_data = [dict(zip(header, line.strip().split('\t'))) for line in file]
cdr3_dict = {}
for row in tcr_data:
    complex_id = row['complex.id']
    cdr3 = row['cdr3']
    # Splice together CDR3 with the same complex ID
    if complex_id in cdr3_dict:
        cdr3_dict[complex_id].append(cdr3)
    else:
        cdr3_dict[complex_id] = [cdr3]
# There is a DataFrame containing the TCR sequence
for row in tcr_data:
    complex_id = row['complex.id']
    antigen_epitope = row['antigen.epitope']
    vdjdb_score = row['vdjdb.score']
    # Splice together CDR3 with the same complex ID
    if len(cdr3_dict[complex_id]) == 2:
        cdr3_dict[complex_id].append(antigen_epitope)
        cdr3_dict[complex_id].append(vdjdb_score)
    else:
        continue
cdr3_dict.pop('0')
##Delete unpaired TCRs
df_cdr3 = pd.DataFrame(cdr3_dict)
df_cdr3_trans = df_cdr3.transpose()
names = ['TRA', 'TRB', 'antigen_epitope', 'vdjdb.score']
df_cdr3_trans.columns = names
print(df_cdr3_trans)
##The first step is to read out the paired data

                  TRA                   TRB antigen_epitope vdjdb.score
1       CIVRAPGRADMRF  CASSYLPGQGDHYSNQPQHF        FLKEKGGL           2
2      CAVPSGAGSYQLTF   CASSFEPGQGFYSNQPQHF        FLKEKGGL           2
3         CAVKASGSRLT  CASSYEPGQVSHYSNQPQHF        FLKEKGGL           2
4       CAYRPPGTYKYIF        CASSALASLNEQFF        FLKEKGGL           2
5       CIVRAPGRADMRF  CASSYLPGQGDHYSNQPQHF        FLKEQGGL           2
...               ...                   ...             ...         ...
30590   CMDEGGSNYKLTF         CASSVRSTDTQYF    PQPELPYPQPQL           0
30591     CSLYNNNDMRF         CASSLRYTDTQYF    PQPELPYPQPQL           0
30592   CALSTDSWGKLQF       CASSPGQGGDNEQFF   PQQPFPQPEQPFP           0
30593    CAPQGATNKLIF       CASSLGAGGQETQYF   PQQPFPQPEQPFP           2
30594  CLVGGSGGYNKLIF         CASSSTAQETQYF   PQQPFPQPEQPFP           0

[30594 rows x 4 columns]


In [3]:
##clean the data
df_clean = df_cdr3_trans[df_cdr3_trans['vdjdb.score'] != '0']
df_clean['TRA_TRB_Combined'] = df_clean["TRA"] + df_clean["TRB"]
df_clean = df_clean.reset_index(drop=True)
print("There are {} categories of data in the current dataset".format(np.shape(df_clean['antigen_epitope'].unique())))
print(df_clean)

There are (391,) categories of data in the current dataset
                    TRA                   TRB antigen_epitope vdjdb.score  \
0         CIVRAPGRADMRF  CASSYLPGQGDHYSNQPQHF        FLKEKGGL           2   
1        CAVPSGAGSYQLTF   CASSFEPGQGFYSNQPQHF        FLKEKGGL           2   
2           CAVKASGSRLT  CASSYEPGQVSHYSNQPQHF        FLKEKGGL           2   
3         CAYRPPGTYKYIF        CASSALASLNEQFF        FLKEKGGL           2   
4         CIVRAPGRADMRF  CASSYLPGQGDHYSNQPQHF        FLKEQGGL           2   
...                 ...                   ...             ...         ...   
2960         CIALNARLMF         CASSLRATDTQYF    PQPELPYPQPQL           2   
2961   CAMREGRYSSASKIIF       CATSRAGGGGEKLFF    FPQPEQPFPWQP           2   
2962   CLVGDGDGGATNKLIF        CASSQGSGGNEQFF    FPQPEQPFPWQP           2   
2963  CAASVLYGSSNTGKLIF      CASSIVGSGGYNEQFF    QLQPFPQPELPY           2   
2964       CAPQGATNKLIF       CASSLGAGGQETQYF   PQQPFPQPEQPFP           2   

                

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


In [6]:
#----------------encoding stage-------------------------------
encoding_map = {'A': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                'C': [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                'D': [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                'E': [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                'F': [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                'G': [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                'H': [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                'I': [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                'K': [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                'L': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                'M': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                'N': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
                'P': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
                'Q': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
                'R': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
                'S': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
                'T': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
                'V': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
                'W': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
                'Y': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]}
cdr3_encoded = [[encoding_map[char] for char in sequence] for sequence in df_clean['TRA_TRB_Combined']]
antigen_encoded = [[encoding_map[char] for char in sequence] for sequence in df_clean['antigen_epitope']]
##The unique hot code was successfully edited, but the matrix length is inconsistent.
##So the second step is to unify all inputs into the longest sequence
longest_cdr3 = max(df_clean['TRA_TRB_Combined'], key=len)
print("Longest CDR3:", longest_cdr3)
print("Longest CDR3's length:", len(longest_cdr3))
longest_antigen_epitope = max(df_clean['antigen_epitope'], key=len)
print("Longest antigen_epitope:", longest_antigen_epitope)
print("Longest antigen_epitope's length:", len(longest_antigen_epitope))

##padding function!
def padding_sequence(origin, sequence_length):
    padded = np.zeros((sequence_length, 20))
    padded[:len(origin)] = origin
    return padded


cdr3_encoded_padded = [padding_sequence(seq, len(longest_cdr3)) for seq in cdr3_encoded]
antigen_encoded_padded = [padding_sequence(seq, len(longest_antigen_epitope)) for seq in antigen_encoded]
cdr3_encoded_padded_flat = [seq.flatten() for seq in cdr3_encoded_padded]
antigen_encoded_padded_flat = [seq.flatten() for seq in antigen_encoded_padded]
##After expansion, expand to a one-dimensional vector
df_clean['cdr3_code'] = cdr3_encoded_padded_flat
df_clean['antigen_code'] = antigen_encoded_padded_flat
print("one-hot encoding completed")

Longest CDR3: CALNPMYSGGGADGLTFCASSVTLWTGTSTRSADTQYF
Longest CDR3's length: 38
Longest antigen_epitope: MTEYKLVVVGAVGVGKSALTIQLI
Longest antigen_epitope's length: 24
one-hot encoding completed
