# Original model prediction

In [12]:
import torch.optim as optim
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef,roc_curve
import torch
import torch.nn as nn
import time
from torch.utils.data import Dataset, DataLoader
from tpbte import Model
import pandas as pd
import pickle as pk


def data_renew(pairs, emb_type):
    """Split concatenated pair features into TCR and epitope tensors.

    The last axis of ``pairs`` is cut into two slices and both slices are
    moved to ``cuda:0`` when CUDA is available, otherwise to the CPU.

    Args:
        pairs: 3-D tensor indexed as ``pairs[:, :, k]``.
        emb_type: ``'onehot'`` (columns 0-20 / 21 up to, but excluding, the
            last column, both cast to ``LongTensor``), ``'BLOSUM62'``
            (columns 0-19 / the rest); any other value uses columns 0-4 /
            the rest.

    Returns:
        Tuple ``(tcr, epi)``.
    """
    target = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    if emb_type == 'onehot':
        # NOTE(review): the epitope slice stops at -1, i.e. the final column
        # is dropped -- confirm this is intended.
        tcr_part = pairs[:, :, 0:21].type(torch.LongTensor)
        epi_part = pairs[:, :, 21:-1].type(torch.LongTensor)
    else:
        width = 20 if emb_type == 'BLOSUM62' else 5
        tcr_part = pairs[:, :, 0:width]
        epi_part = pairs[:, :, width:]
    return tcr_part.to(target), epi_part.to(target)

class MyDataset(Dataset):
    """Dataset of encoded (CDR3B, Epitope) pairs with their Affinity labels.

    The CSV at ``path`` must provide the columns ``CDR3B``, ``Epitope`` and
    ``Affinity``. Both sequences are encoded according to ``emb_type`` and
    concatenated along the last axis into ``self.pair``.
    """

    def __init__(self, path, emb_type):
        self.data = pd.read_csv(path, low_memory=False)
        self.CDR3B = self.data['CDR3B']
        self.Epitope = self.data['Epitope']
        self.Affinity = self.data['Affinity']
        self.emb_type = emb_type
        columns = (self.CDR3B, self.Epitope, self.Affinity)
        # NOTE(review): `onehot` and `BLOSUM_62` are not defined in the
        # visible part of this file; only the Atchley branch is backed here.
        if self.emb_type == 'onehot':
            encoded = onehot(*columns)
        elif self.emb_type == 'BLOSUM62':
            encoded = BLOSUM_62(*columns, 20)
        else:
            encoded = Atchley(*columns, 20)
        # The encoder's third return value replaces the raw Affinity column.
        tcr_features, epi_features, self.Affinity = encoded
        self.pair = torch.cat((tcr_features, epi_features), -1)

    def __getitem__(self, index):
        """Return ``(pair_features, label)`` for one sample."""
        features = self.pair[index]
        target = self.Affinity[index]
        return features, target

    def __len__(self):
        """Return the number of samples (first dimension of the labels)."""
        labels = torch.LongTensor(self.Affinity)
        return labels.shape[0]
def Atchley(TCR, Epitope, Label, Length):
    """Encode TCR and epitope sequences with the table stored in 'atchley.pk'.

    Every sequence is right-padded with spaces to ``Length`` characters
    (longer sequences are cut to their first ``Length`` characters) and each
    character is replaced by its 6-value vector from the pickle; only the
    first 5 values are returned.

    Args:
        TCR: iterable of TCR sequence strings.
        Epitope: iterable of epitope sequence strings.
        Label: labels accepted by ``torch.LongTensor``; one per sequence.
        Length: fixed number of positions per encoded sequence.

    Returns:
        ``(tcr, epi, label)`` with ``tcr``/``epi`` of shape
        ``(n, Length, 5)`` and ``label`` of shape ``(n, 1)``.

    Raises:
        KeyError: if a character (including the padding space) has no entry
            in the pickle.
    """
    # NOTE: pickle can execute arbitrary code -- only load a trusted file.
    # The context manager closes the handle (the bare open() leaked it).
    with open('atchley.pk', 'rb') as handle:
        aa_vec = pk.load(handle)
    Label = torch.LongTensor(Label).view(-1, 1)
    n = Label.size()[0]

    def embed(sequences):
        # Shared by both inputs: one (n, Length, 6) table, filled per residue.
        table = torch.zeros(n, Length, 6)
        for row, seq in enumerate(sequences):
            padded = seq.ljust(Length)
            for pos in range(Length):
                table[row, pos, :] = torch.from_numpy(aa_vec[padded[pos]])
        return table[:, :, 0:5]

    return embed(TCR), embed(Epitope), Label

def Convert_letters(seq):
    """Map encoded sequences back to letters by nearest table entry.

    Each position vector is replaced by the key of 'atchley.pk' whose vector
    (with its last component removed) is closest in Euclidean distance; on a
    tie the key that comes first in the pickle wins. Keys that are not
    alphabetic (e.g. the padding character) are dropped from the result.

    Args:
        seq: array-like indexed as ``(batch, position, feature)``.

    Returns:
        List with one decoded string per batch row; ``[]`` for an empty batch
        (previously an IndexError).
    """
    import numpy as np

    # NOTE: pickle can execute arbitrary code -- only load a trusted file.
    with open('atchley.pk', 'rb') as handle:
        atchley = pk.load(handle)
    seq = np.asarray(seq)
    if len(seq) == 0:
        return []
    keys = list(atchley)
    # (n_keys, feature) reference matrix without the last table component.
    table = np.stack([np.asarray(atchley[key])[..., :-1] for key in keys])
    # Broadcast to (batch, position, n_keys) distances in one shot instead of
    # a Python loop per position and key.
    distances = np.linalg.norm(seq[:, :, np.newaxis, :] - table, axis=-1)
    nearest = distances.argmin(axis=-1)
    return [
        ''.join(keys[idx] for idx in row if keys[idx].isalpha())
        for row in nearest
    ]


In [13]:
def train_main(trainfile_path,save_model_path): 
    """Train the Atchley-embedded model and save its final weights.

    Runs 2 epochs of Adam (lr 5e-5) with cross-entropy loss over batches of
    32 samples (incomplete last batch dropped) and writes the resulting
    ``state_dict`` to ``save_model_path + 'model.pth'``.

    Args:
        trainfile_path: CSV readable by ``MyDataset``.
        save_model_path: path prefix of the checkpoint file; missing parent
            directories are created.
    """
    import os

    vocab = 21
    d_model = 512
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    e_type = 'Atchley'
    criterion = nn.CrossEntropyLoss()
    model = Model(src_vocab=vocab, tgt_vocab=vocab, emb_type=e_type, N=6, h=8, d_model=d_model,dropout=0.1, device=device)
    model = model.to(device)
    opt = torch.optim.Adam(model.parameters(), lr=0.00005, betas=(0.9, 0.98), eps=1e-9)
    n_epoch = 2
    b_size = 32
    train = MyDataset(trainfile_path, emb_type=e_type)
    train_dataset = DataLoader(dataset=train, batch_size=b_size, shuffle=True, drop_last=True)
    model.train()
    # The former best-loss bookkeeping was removed: its snapshot was never
    # read and only aliased the live weights, so it had no effect.
    for epoch in range(n_epoch):
        print('epoch:', epoch + 1)
        for pairs, label in train_dataset:
            tcr, epi = data_renew(pairs=pairs, emb_type=e_type)
            # Two scores per sample, flattened to (batch, 2) for the loss.
            logits = model(tcr, epi).view(-1, 2)
            target = label.to(device).long().view(-1)
            loss = criterion(logits, target)
            opt.zero_grad()
            loss.backward()
            opt.step()
    checkpoint = save_model_path + 'model.pth'
    # Create the target directory so a finished run cannot fail on saving.
    checkpoint_dir = os.path.dirname(checkpoint)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)
    torch.save(model.state_dict(), checkpoint)



In [14]:
import os

# Indices of the GPUs exported through CUDA_VISIBLE_DEVICES (comma-joined).
GPU_NUMBER = [1]
os.environ.update({
    "CUDA_VISIBLE_DEVICES": ",".join(map(str, GPU_NUMBER)),
    "NCCL_DEBUG": "INFO",
})

In [15]:
# Train one model per source database (VDJdb first, then McPAS); each call
# writes "<save_model_path>model.pth".
for trainfile_path, save_model_path in (
    ("/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/TPBTE/VDJdb/all_vdj.csv",
     "./model/Original/VDJdb_"),
    ("/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/TPBTE/McPAS/all_McPAS.csv",
     "./model/Original/McPAS_"),
):
    train_main(trainfile_path, save_model_path)

epoch: 1
epoch: 2
epoch: 1
epoch: 2


In [17]:
import torch.optim as optim
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef
import torch
import torch.nn as nn
import time
from torch.utils.data import Dataset, DataLoader
from tpbte import Model
import pandas as pd
import pickle as pk


class MyDataset(Dataset):
    """Dataset of encoded (CDR3B, Epitope) pairs with their Affinity labels.

    The CSV at ``path`` must provide the columns ``CDR3B``, ``Epitope`` and
    ``Affinity``. Both sequences are encoded according to ``emb_type`` and
    concatenated along the last axis into ``self.pair``.
    """

    def __init__(self, path, emb_type):
        self.data = pd.read_csv(path, low_memory=False)
        self.CDR3B = self.data['CDR3B']
        self.Epitope = self.data['Epitope']
        self.Affinity = self.data['Affinity']
        self.emb_type = emb_type
        columns = (self.CDR3B, self.Epitope, self.Affinity)
        # NOTE(review): `onehot` and `BLOSUM_62` are not defined in the
        # visible part of this file; only the Atchley branch is backed here.
        if self.emb_type == 'onehot':
            encoded = onehot(*columns)
        elif self.emb_type == 'BLOSUM62':
            encoded = BLOSUM_62(*columns, 20)
        else:
            encoded = Atchley(*columns, 20)
        # The encoder's third return value replaces the raw Affinity column.
        tcr_features, epi_features, self.Affinity = encoded
        self.pair = torch.cat((tcr_features, epi_features), -1)

    def __getitem__(self, index):
        """Return ``(pair_features, label)`` for one sample."""
        features = self.pair[index]
        target = self.Affinity[index]
        return features, target

    def __len__(self):
        """Return the number of samples (first dimension of the labels)."""
        labels = torch.LongTensor(self.Affinity)
        return labels.shape[0]

def Atchley(TCR, Epitope, Label, Length):
    """Encode TCR and epitope sequences with the table stored in 'atchley.pk'.

    Every sequence is right-padded with spaces to ``Length`` characters
    (longer sequences are cut to their first ``Length`` characters) and each
    character is replaced by its 6-value vector from the pickle; only the
    first 5 values are returned.

    Args:
        TCR: iterable of TCR sequence strings.
        Epitope: iterable of epitope sequence strings.
        Label: labels accepted by ``torch.LongTensor``; one per sequence.
        Length: fixed number of positions per encoded sequence.

    Returns:
        ``(tcr, epi, label)`` with ``tcr``/``epi`` of shape
        ``(n, Length, 5)`` and ``label`` of shape ``(n, 1)``.

    Raises:
        KeyError: if a character (including the padding space) has no entry
            in the pickle.
    """
    # NOTE: pickle can execute arbitrary code -- only load a trusted file.
    # The context manager closes the handle (the bare open() leaked it).
    with open('atchley.pk', 'rb') as handle:
        aa_vec = pk.load(handle)
    Label = torch.LongTensor(Label).view(-1, 1)
    n = Label.size()[0]

    def embed(sequences):
        # Shared by both inputs: one (n, Length, 6) table, filled per residue.
        table = torch.zeros(n, Length, 6)
        for row, seq in enumerate(sequences):
            padded = seq.ljust(Length)
            for pos in range(Length):
                table[row, pos, :] = torch.from_numpy(aa_vec[padded[pos]])
        return table[:, :, 0:5]

    return embed(TCR), embed(Epitope), Label

def Convert_letters(seq):
    """Map encoded sequences back to letters by nearest table entry.

    Each position vector is replaced by the key of 'atchley.pk' whose vector
    (with its last component removed) is closest in Euclidean distance; on a
    tie the key that comes first in the pickle wins. Keys that are not
    alphabetic (e.g. the padding character) are dropped from the result.

    Args:
        seq: array-like indexed as ``(batch, position, feature)``.

    Returns:
        List with one decoded string per batch row; ``[]`` for an empty batch
        (previously an IndexError).
    """
    # Imported locally so the function does not depend on a module-level
    # `np` that is only bound further down in this cell.
    import numpy as np

    # NOTE: pickle can execute arbitrary code -- only load a trusted file.
    with open('atchley.pk', 'rb') as handle:
        atchley = pk.load(handle)
    seq = np.asarray(seq)
    if len(seq) == 0:
        return []
    keys = list(atchley)
    # (n_keys, feature) reference matrix without the last table component.
    table = np.stack([np.asarray(atchley[key])[..., :-1] for key in keys])
    # Broadcast to (batch, position, n_keys) distances in one shot instead of
    # a Python loop per position and key.
    distances = np.linalg.norm(seq[:, :, np.newaxis, :] - table, axis=-1)
    nearest = distances.argmin(axis=-1)
    return [
        ''.join(keys[idx] for idx in row if keys[idx].isalpha())
        for row in nearest
    ]

def data_renew(pairs, emb_type):
    """Split concatenated pair features into TCR and epitope tensors.

    The last axis of ``pairs`` is cut into two slices and both slices are
    moved to ``cuda:0`` when CUDA is available, otherwise to the CPU.

    Args:
        pairs: 3-D tensor indexed as ``pairs[:, :, k]``.
        emb_type: ``'onehot'`` (columns 0-20 / 21 up to, but excluding, the
            last column, both cast to ``LongTensor``), ``'BLOSUM62'``
            (columns 0-19 / the rest); any other value uses columns 0-4 /
            the rest.

    Returns:
        Tuple ``(tcr, epi)``.
    """
    target = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    if emb_type == 'onehot':
        # NOTE(review): the epitope slice stops at -1, i.e. the final column
        # is dropped -- confirm this is intended.
        tcr_part = pairs[:, :, 0:21].type(torch.LongTensor)
        epi_part = pairs[:, :, 21:-1].type(torch.LongTensor)
    else:
        width = 20 if emb_type == 'BLOSUM62' else 5
        tcr_part = pairs[:, :, 0:width]
        epi_part = pairs[:, :, width:]
    return tcr_part.to(target), epi_part.to(target)

import pandas as pd
import torch
import numpy as np
import warnings
warnings.filterwarnings("ignore")


def validation_main(testfile_path,modelfile_path,result_path):
    """Score a saved model on a CSV and write per-pair predictions.

    Loads the ``state_dict`` at ``modelfile_path``, runs the model in eval
    mode over ``testfile_path`` and writes
    ``result_path + 'probability.csv'`` with the columns ``Epitope``,
    ``CDR3B``, ``y_true``, ``y_prob`` and ``y_pred``.

    Args:
        testfile_path: CSV readable by ``MyDataset``.
        modelfile_path: checkpoint written with ``torch.save(state_dict)``.
        result_path: path prefix of the output CSV; missing parent
            directories are created.
    """
    import os

    vocab = 21
    d_model = 512
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(device)
    e_type = 'Atchley'
    test = MyDataset(testfile_path, emb_type=e_type)
    b_size = 32
    # Evaluation does not need shuffling; keeping the input order makes the
    # output file reproducible and row-aligned with the test CSV.
    test_dataset = DataLoader(dataset=test, batch_size=b_size, shuffle=False, drop_last=False)
    model = Model(src_vocab=vocab, tgt_vocab=vocab, emb_type=e_type, N=6, h=8, d_model=d_model, dropout=0.1, device=device)
    # map_location lets a checkpoint saved on a GPU load on a CPU-only host.
    model.load_state_dict(torch.load(modelfile_path, map_location=device))
    model = model.to(device)
    model.eval()
    columns = ['Epitope', 'CDR3B', 'y_true', 'y_prob', 'y_pred']
    rows = []
    with torch.no_grad():
        for pairs, label in test_dataset:
            # data_renew already places both tensors on the active device.
            tcr, epi = data_renew(pairs=pairs, emb_type=e_type)
            output = model(tcr, epi)
            # NOTE(review): column 1 is written as 'y_prob' unchanged; whether
            # the model emits probabilities or raw scores is not visible here.
            scores = output[:, 1].cpu().numpy()
            classes = output.argmax(dim=1).cpu().numpy()
            # Flatten the labels so each row stores a scalar.
            truths = label.reshape(-1).cpu().numpy()
            tcr_letters = Convert_letters(tcr.cpu().numpy())
            epi_letters = Convert_letters(epi.cpu().numpy())
            for epi_seq, tcr_seq, truth, score, cls in zip(epi_letters, tcr_letters, truths, scores, classes):
                rows.append({'Epitope': epi_seq, 'CDR3B': tcr_seq, 'y_true': truth,
                             'y_prob': score,
                             'y_pred': cls})
    # Explicit columns keep the header even when the test set is empty.
    df_data = pd.DataFrame(rows, columns=columns)
    out_file = result_path + 'probability.csv'
    out_dir = os.path.dirname(out_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df_data.to_csv(out_file)


In [None]:
# Score the two original checkpoints on every benchmark split.
# These runs only evaluate, so they must call validation_main(testfile_path,
# modelfile_path, result_path); the previous train_main(...) calls passed
# three arguments to the two-argument training function defined above.
name = ['1', '2', '3']
me = ['health', 'Diseases', 'antigen_specificity']
prediction_root = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/database/benchmark_dataset/10cross_validation/Prediction/all/"
# (checkpoint, test sub-directory below prediction_root, result prefix)
# NOTE(review): the first entry writes below ./result/Original while all
# others write below ./evaluate/repeat10 -- kept as found, confirm intent.
prediction_runs = [
    ("./model/Original/VDJdb_model.pth", "seen/three/neg_pos/", "./result/Original/seen/three/VDJdb_"),
    ("./model/Original/VDJdb_model.pth", "seen/seven/neg_pos/", "./evaluate/repeat10/Prediction/all/seen/seven/VDJdb_"),
    ("./model/Original/McPAS_model.pth", "seen/three/neg_pos/", "./evaluate/repeat10/Prediction/all/seen/three/McPAS_"),
    ("./model/Original/McPAS_model.pth", "seen/seven/neg_pos/", "./evaluate/repeat10/Prediction/all/seen/seven/McPAS_"),
    ("./model/Original/McPAS_model.pth", "unseen/CF/", "./evaluate/repeat10/Prediction/all/unseen/McPAS_"),
    ("./model/Original/VDJdb_model.pth", "unseen/CF/", "./evaluate/repeat10/Prediction/all/unseen/VDJdb_"),
]
for modelfile_path, test_subdir, result_prefix in prediction_runs:
    for i in name:
        for j in me:
            testfile_path = prediction_root + test_subdir + i + '_' + j + '.csv'
            result_path = result_prefix + i + '_' + j
            validation_main(testfile_path, modelfile_path, result_path)
  



# Model retraining

In [1]:
import torch.optim as optim
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef,roc_curve
import torch
import torch.nn as nn
import time
from torch.utils.data import Dataset, DataLoader
from tpbte import Model
import pandas as pd
import pickle as pk


def data_renew(pairs, emb_type):
    """Split concatenated pair features into TCR and epitope tensors.

    The last axis of ``pairs`` is cut into two slices and both slices are
    moved to ``cuda:0`` when CUDA is available, otherwise to the CPU.

    Args:
        pairs: 3-D tensor indexed as ``pairs[:, :, k]``.
        emb_type: ``'onehot'`` (columns 0-20 / 21 up to, but excluding, the
            last column, both cast to ``LongTensor``), ``'BLOSUM62'``
            (columns 0-19 / the rest); any other value uses columns 0-4 /
            the rest.

    Returns:
        Tuple ``(tcr, epi)``.
    """
    target = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    if emb_type == 'onehot':
        # NOTE(review): the epitope slice stops at -1, i.e. the final column
        # is dropped -- confirm this is intended.
        tcr_part = pairs[:, :, 0:21].type(torch.LongTensor)
        epi_part = pairs[:, :, 21:-1].type(torch.LongTensor)
    else:
        width = 20 if emb_type == 'BLOSUM62' else 5
        tcr_part = pairs[:, :, 0:width]
        epi_part = pairs[:, :, width:]
    return tcr_part.to(target), epi_part.to(target)

class MyDataset(Dataset):
    """Dataset of encoded (CDR3B, Epitope) pairs with their Affinity labels.

    The CSV at ``path`` must provide the columns ``CDR3B``, ``Epitope`` and
    ``Affinity``. Both sequences are encoded according to ``emb_type`` and
    concatenated along the last axis into ``self.pair``.
    """

    def __init__(self, path, emb_type):
        self.data = pd.read_csv(path, low_memory=False)
        self.CDR3B = self.data['CDR3B']
        self.Epitope = self.data['Epitope']
        self.Affinity = self.data['Affinity']
        self.emb_type = emb_type
        columns = (self.CDR3B, self.Epitope, self.Affinity)
        # NOTE(review): `onehot` and `BLOSUM_62` are not defined in the
        # visible part of this file; only the Atchley branch is backed here.
        if self.emb_type == 'onehot':
            encoded = onehot(*columns)
        elif self.emb_type == 'BLOSUM62':
            encoded = BLOSUM_62(*columns, 20)
        else:
            encoded = Atchley(*columns, 20)
        # The encoder's third return value replaces the raw Affinity column.
        tcr_features, epi_features, self.Affinity = encoded
        self.pair = torch.cat((tcr_features, epi_features), -1)

    def __getitem__(self, index):
        """Return ``(pair_features, label)`` for one sample."""
        features = self.pair[index]
        target = self.Affinity[index]
        return features, target

    def __len__(self):
        """Return the number of samples (first dimension of the labels)."""
        labels = torch.LongTensor(self.Affinity)
        return labels.shape[0]
def Atchley(TCR, Epitope, Label, Length):
    """Encode TCR and epitope sequences with the table stored in 'atchley.pk'.

    Every sequence is right-padded with spaces to ``Length`` characters
    (longer sequences are cut to their first ``Length`` characters) and each
    character is replaced by its 6-value vector from the pickle; only the
    first 5 values are returned.

    Args:
        TCR: iterable of TCR sequence strings.
        Epitope: iterable of epitope sequence strings.
        Label: labels accepted by ``torch.LongTensor``; one per sequence.
        Length: fixed number of positions per encoded sequence.

    Returns:
        ``(tcr, epi, label)`` with ``tcr``/``epi`` of shape
        ``(n, Length, 5)`` and ``label`` of shape ``(n, 1)``.

    Raises:
        KeyError: if a character (including the padding space) has no entry
            in the pickle.
    """
    # NOTE: pickle can execute arbitrary code -- only load a trusted file.
    # The context manager closes the handle (the bare open() leaked it).
    with open('atchley.pk', 'rb') as handle:
        aa_vec = pk.load(handle)
    Label = torch.LongTensor(Label).view(-1, 1)
    n = Label.size()[0]

    def embed(sequences):
        # Shared by both inputs: one (n, Length, 6) table, filled per residue.
        table = torch.zeros(n, Length, 6)
        for row, seq in enumerate(sequences):
            padded = seq.ljust(Length)
            for pos in range(Length):
                table[row, pos, :] = torch.from_numpy(aa_vec[padded[pos]])
        return table[:, :, 0:5]

    return embed(TCR), embed(Epitope), Label

def Convert_letters(seq):
    """Map encoded sequences back to letters by nearest table entry.

    Each position vector is replaced by the key of 'atchley.pk' whose vector
    (with its last component removed) is closest in Euclidean distance; on a
    tie the key that comes first in the pickle wins. Keys that are not
    alphabetic (e.g. the padding character) are dropped from the result.

    Args:
        seq: array-like indexed as ``(batch, position, feature)``.

    Returns:
        List with one decoded string per batch row; ``[]`` for an empty batch
        (previously an IndexError).
    """
    import numpy as np

    # NOTE: pickle can execute arbitrary code -- only load a trusted file.
    with open('atchley.pk', 'rb') as handle:
        atchley = pk.load(handle)
    seq = np.asarray(seq)
    if len(seq) == 0:
        return []
    keys = list(atchley)
    # (n_keys, feature) reference matrix without the last table component.
    table = np.stack([np.asarray(atchley[key])[..., :-1] for key in keys])
    # Broadcast to (batch, position, n_keys) distances in one shot instead of
    # a Python loop per position and key.
    distances = np.linalg.norm(seq[:, :, np.newaxis, :] - table, axis=-1)
    nearest = distances.argmin(axis=-1)
    return [
        ''.join(keys[idx] for idx in row if keys[idx].isalpha())
        for row in nearest
    ]


In [2]:
def train_main(trainfile_path,testfile_path,save_model_path,result_path): 
    """Train on one CSV, save the model, then score a test CSV.

    Runs 2 epochs of Adam (lr 5e-5) with cross-entropy loss over batches of
    32 samples. The final weights are written to
    ``save_model_path + 'model.pth'``. The test set is then scored with the
    weights that produced the lowest mini-batch training loss, and the
    per-pair results are written to ``result_path + 'probability.csv'``
    (columns ``Epitope``, ``CDR3B``, ``y_true``, ``y_prob``, ``y_pred``).

    Args:
        trainfile_path: training CSV readable by ``MyDataset``.
        testfile_path: test CSV readable by ``MyDataset``.
        save_model_path: path prefix of the checkpoint file.
        result_path: path prefix of the result CSV.

    Missing parent directories of both output files are created.
    """
    import copy
    import os
    import pandas as pd

    vocab = 21
    d_model = 512
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(device)
    e_type = 'Atchley'
    criterion = nn.CrossEntropyLoss()
    model = Model(src_vocab=vocab, tgt_vocab=vocab, emb_type=e_type, N=6, h=8, d_model=d_model,dropout=0.1, device=device)
    model = model.to(device)
    opt = torch.optim.Adam(model.parameters(), lr=0.00005, betas=(0.9, 0.98), eps=1e-9)
    n_epoch = 2
    train = MyDataset(trainfile_path, emb_type=e_type)
    test = MyDataset(testfile_path, emb_type=e_type)
    b_size = 32
    train_dataset = DataLoader(dataset=train, batch_size=b_size, shuffle=True, drop_last=True)
    # Evaluation does not need shuffling; keeping the input order makes the
    # result file reproducible and row-aligned with the test CSV.
    test_dataset = DataLoader(dataset=test, batch_size=b_size, shuffle=False, drop_last=False)
    model.train()
    # state_dict() returns references to the live parameters, so the snapshot
    # must be deep-copied; otherwise "best" silently tracks the final weights.
    # Starting from +inf with an initial snapshot guarantees `param` is
    # always bound (it was undefined when no batch loss reached <= 1).
    best_loss = float('inf')
    param = copy.deepcopy(model.state_dict())
    for epoch in range(n_epoch):
        print('epoch:', epoch + 1)
        for pairs, label in train_dataset:
            tcr, epi = data_renew(pairs=pairs, emb_type=e_type)
            # Two scores per sample, flattened to (batch, 2) for the loss.
            logits = model(tcr, epi).view(-1, 2)
            target = label.to(device).long().view(-1)
            loss = criterion(logits, target)
            # Compare plain floats; the snapshot is taken before the step so
            # it holds the weights that actually produced this loss.
            batch_loss = loss.item()
            if batch_loss <= best_loss:
                best_loss = batch_loss
                param = copy.deepcopy(model.state_dict())
            opt.zero_grad()
            loss.backward()
            opt.step()
    checkpoint = save_model_path + 'model.pth'
    checkpoint_dir = os.path.dirname(checkpoint)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)
    # NOTE: as before, the file on disk holds the final-step weights, while
    # the evaluation below uses the best-loss snapshot.
    torch.save(model.state_dict(), checkpoint)
    print('End training. Begin testing')
    good_model = model
    good_model.load_state_dict(param)
    good_model.eval()
    good_model.to(device)
    columns = ['Epitope', 'CDR3B', 'y_true', 'y_prob', 'y_pred']
    rows = []
    with torch.no_grad():
        for pairs, label in test_dataset:
            tcr, epi = data_renew(pairs=pairs, emb_type=e_type)
            output = good_model(tcr, epi)
            # NOTE(review): column 1 is written as 'y_prob' unchanged; whether
            # the model emits probabilities or raw scores is not visible here.
            scores = output[:, 1].cpu().numpy()
            classes = output.argmax(dim=1).cpu().numpy()
            # Flatten the labels so each row stores a scalar.
            truths = label.reshape(-1).cpu().numpy()
            tcr_letters = Convert_letters(tcr.cpu().numpy())
            epi_letters = Convert_letters(epi.cpu().numpy())
            for epi_seq, tcr_seq, truth, score, cls in zip(epi_letters, tcr_letters, truths, scores, classes):
                rows.append({'Epitope': epi_seq, 'CDR3B': tcr_seq, 'y_true': truth,
                             'y_prob': score,
                             'y_pred': cls})
            torch.cuda.empty_cache()
    # Explicit columns keep the header even when the test set is empty.
    df_data = pd.DataFrame(rows, columns=columns)
    out_file = result_path + 'probability.csv'
    out_dir = os.path.dirname(out_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df_data.to_csv(out_file)


# pair50

In [None]:
# Retrain on every pair50 fold (name1) and train-file variant (name); the
# test file of a fold is always its "1_1" file.
pair50_data = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/database/benchmark_dataset/10cross_validation/pair50/"
pair50_models = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/TPBTE/model/Retrain/pair50/"
name1 = ['1', '2', '3', '4', '5']
# (databases, train-file variants) -- processed in this order.
for database, name in (
    (['healthy', 'patient'], ['1_1', '1_2', '1_4', '1_6', '1_8']),
    (['Antigen_specificity'], ['1_1', '1_2', '1_4']),
):
    for i in database:
        for j in name:
            for k in name1:
                trainfile_path = f"{pair50_data}{i}/{k}_{j}train.csv"
                testfile_path = f"{pair50_data}{i}/{k}_1_1test.csv"
                save_model_path = f"{pair50_models}{i}/{k}_{j}"
                result_path = f"./result/pair50/seen/test/{i}/{k}_{j}"
                train_main(trainfile_path, testfile_path, save_model_path, result_path)


# pair300

In [None]:
# Retrain on every pair300 subset (pair); every run is tested on the
# "more300" test file of the same database and fold.
pair300_data = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/database/benchmark_dataset/10cross_validation/pair300/"
pair300_models = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/TPBTE/model/Retrain/pair300/seen/"
pair = ['more300', '300', '200', '100', '10']
name1 = ['1', '2', '3', '4', '5']
name = ['1_1']
# Both database groups share the same folds, variants and subsets.
for database in (['healthy', 'patient'], ['Antigen_specificity']):
    for i in database:
        for j in name:
            for k in name1:
                for l in pair:
                    trainfile_path = f"{pair300_data}{l}/{i}/{k}_{j}train.csv"
                    testfile_path = f"{pair300_data}more300/{i}/{k}_{j}test.csv"
                    save_model_path = f"{pair300_models}{l}/{i}/{k}_{j}"
                    result_path = f"./result/pair300/seen/{l}/{i}/{k}_{j}"
                    train_main(trainfile_path, testfile_path, save_model_path, result_path)


cuda:0
epoch: 1
epoch: 2
End training. Begin testing
cuda:0
epoch: 1
epoch: 2
End training. Begin testing
cuda:0
epoch: 1
epoch: 2
End training. Begin testing
cuda:0
epoch: 1
epoch: 2
End training. Begin testing
cuda:0
epoch: 1
epoch: 2
End training. Begin testing
cuda:0
epoch: 1
epoch: 2
End training. Begin testing
cuda:0
epoch: 1
epoch: 2
End training. Begin testing
cuda:0
epoch: 1
epoch: 2
End training. Begin testing
cuda:0
epoch: 1
epoch: 2
End training. Begin testing
cuda:0
epoch: 1
epoch: 2
End training. Begin testing
cuda:0
epoch: 1
epoch: 2
End training. Begin testing
cuda:0
epoch: 1
epoch: 2
End training. Begin testing
cuda:0
epoch: 1
epoch: 2
End training. Begin testing
cuda:0
epoch: 1
epoch: 2
End training. Begin testing
cuda:0
epoch: 1
epoch: 2
End training. Begin testing
cuda:0
epoch: 1
epoch: 2
End training. Begin testing
cuda:0
epoch: 1
epoch: 2
End training. Begin testing
cuda:0
epoch: 1
epoch: 2
End training. Begin testing
cuda:0
epoch: 1
epoch: 2
End training. Begin t

In [3]:
def validation_main(testfile_path,modelfile_path,result_path):
    """Score a saved model on a CSV and write per-pair predictions.

    Loads the ``state_dict`` at ``modelfile_path``, runs the model in eval
    mode over ``testfile_path`` and writes
    ``result_path + 'probability.csv'`` with the columns ``Epitope``,
    ``CDR3B``, ``y_true``, ``y_prob`` and ``y_pred``.

    Args:
        testfile_path: CSV readable by ``MyDataset``.
        modelfile_path: checkpoint written with ``torch.save(state_dict)``.
        result_path: path prefix of the output CSV; missing parent
            directories are created.
    """
    import os

    vocab = 21
    d_model = 512
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(device)
    e_type = 'Atchley'
    test = MyDataset(testfile_path, emb_type=e_type)
    b_size = 32
    # Evaluation does not need shuffling; keeping the input order makes the
    # output file reproducible and row-aligned with the test CSV.
    test_dataset = DataLoader(dataset=test, batch_size=b_size, shuffle=False, drop_last=False)
    model = Model(src_vocab=vocab, tgt_vocab=vocab, emb_type=e_type, N=6, h=8, d_model=d_model, dropout=0.1, device=device)
    # map_location lets a checkpoint saved on a GPU load on a CPU-only host.
    model.load_state_dict(torch.load(modelfile_path, map_location=device))
    model = model.to(device)
    model.eval()
    columns = ['Epitope', 'CDR3B', 'y_true', 'y_prob', 'y_pred']
    rows = []
    with torch.no_grad():
        for pairs, label in test_dataset:
            # data_renew already places both tensors on the active device.
            tcr, epi = data_renew(pairs=pairs, emb_type=e_type)
            output = model(tcr, epi)
            # NOTE(review): column 1 is written as 'y_prob' unchanged; whether
            # the model emits probabilities or raw scores is not visible here.
            scores = output[:, 1].cpu().numpy()
            classes = output.argmax(dim=1).cpu().numpy()
            # Flatten the labels so each row stores a scalar.
            truths = label.reshape(-1).cpu().numpy()
            tcr_letters = Convert_letters(tcr.cpu().numpy())
            epi_letters = Convert_letters(epi.cpu().numpy())
            for epi_seq, tcr_seq, truth, score, cls in zip(epi_letters, tcr_letters, truths, scores, classes):
                rows.append({'Epitope': epi_seq, 'CDR3B': tcr_seq, 'y_true': truth,
                             'y_prob': score,
                             'y_pred': cls})
    # Explicit columns keep the header even when the test set is empty.
    df_data = pd.DataFrame(rows, columns=columns)
    out_file = result_path + 'probability.csv'
    out_dir = os.path.dirname(out_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df_data.to_csv(out_file)

# validation

In [None]:
import pandas as pd

# Score every retrained pair50 model on the "1_1" validation file of its
# database and fold.
validation_data = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/database/benchmark_dataset/10cross_validation/Validation/"
name1 = ['1', '2', '3', '4', '5']
# (databases, train-file variants) -- processed in this order.
for database, name in (
    (['healthy', 'patient'], ['1_1', '1_2', '1_4', '1_6', '1_8']),
    (['Antigen_specificity'], ['1_1', '1_2', '1_4']),
):
    for i in database:
        for j in name:
            for k in name1:
                testfile_path = f"{validation_data}{i}/{k}_1_1Validation.csv"
                modelfile_path = f"./model/Retrain/pair50/{i}/{k}_{j}model.pth"
                result_path = f"./result/pair50/seen/validation/{i}/{k}_{j}"
                validation_main(testfile_path, modelfile_path, result_path)

cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0


# unknown-test

In [None]:
import pandas as pd

# Score every retrained pair50 model on the "1_1" test file of the "unknow"
# split for its database and fold.
unseen_test_data = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/database/benchmark_dataset/10cross_validation/unknow/"
name1 = ['1', '2', '3', '4', '5']
# (databases, train-file variants) -- processed in this order.
for database, name in (
    (['healthy', 'patient'], ['1_1', '1_2', '1_4', '1_6', '1_8']),
    (['Antigen_specificity'], ['1_1', '1_2', '1_4']),
):
    for i in database:
        for j in name:
            for k in name1:
                testfile_path = f"{unseen_test_data}{i}/{k}_1_1test.csv"
                modelfile_path = f"./model/Retrain/pair50/{i}/{k}_{j}model.pth"
                result_path = f"./result/pair50/unseen/test/{i}/{k}_{j}"
                validation_main(testfile_path, modelfile_path, result_path)


# unknown-validation

In [4]:
import pandas as pd

# Score every retrained pair50 model on the "1_1" validation file of the
# "unknow" split for its database and fold.
unseen_validation_data = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/database/benchmark_dataset/10cross_validation/unknow/validation/"
name1 = ['1', '2', '3', '4', '5']
# (databases, train-file variants) -- processed in this order.
for database, name in (
    (['healthy', 'patient'], ['1_1', '1_2', '1_4', '1_6', '1_8']),
    (['Antigen_specificity'], ['1_1', '1_2', '1_4']),
):
    for i in database:
        for j in name:
            for k in name1:
                testfile_path = f"{unseen_validation_data}{i}/{k}_1_1validation.csv"
                modelfile_path = f"./model/Retrain/pair50/{i}/{k}_{j}model.pth"
                result_path = f"./result/pair50/unseen/validation/{i}/{k}_{j}"
                validation_main(testfile_path, modelfile_path, result_path)


cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
