# 1.Original model prediction

In [5]:
import pickle
import torch
import argparse
import ae_utils as ae
import lstm_utils as lstm
import ergo_data_loader
import numpy as np
from ERGO_models import AutoencoderLSTMClassifier, DoubleLSTMClassifier
import csv

import torch.optim as optim
import torch.nn as nn
from random import shuffle
import time
from sklearn.metrics import roc_auc_score,roc_curve
import pandas as pd
import ast

def Original_model_prediction(args):
    """Score TCR-peptide pairs with a pretrained ERGO model and save the result.

    Reads ``tcr, peptide, label`` rows from a header-less CSV, loads either the
    autoencoder-based ('ae') or the double-LSTM ('lstm') classifier from a
    checkpoint, predicts one probability per pair and writes the table to
    ``args['result_path'] + '_probability.csv'``.

    Args:
        args (dict): Run configuration. Keys read by this function:
            'model_type'     -- 'ae' or 'lstm'.
            'model_file'     -- checkpoint path, or 'auto' to build the path
                                from 'dataset', 'sampling' and the optional
                                'protein' flag.
            'test_data_file' -- CSV path, or 'auto' for 'pairs_example.csv'.
            'result_path'    -- prefix of the output CSV.
            'device'         -- device handed to torch and to the model.
            The dict is updated in place: 'ae_file' is always overwritten and
            'model_file' / 'test_data_file' are replaced when set to 'auto'.

    Raises:
        ValueError: If 'model_type' is neither 'ae' nor 'lstm', or a CSV row
            does not have exactly three fields.
        KeyError: If a required key is missing (including 'dataset' /
            'sampling' when 'model_file' is 'auto').
    """
    import pandas as pd

    model_type = args['model_type']
    if model_type not in ('ae', 'lstm'):
        # Fail early instead of hitting an unbound `model` / `preds` later on.
        raise ValueError("model_type must be 'ae' or 'lstm', got %r" % (model_type,))

    # Word to index dictionary
    amino_acids = list('ARNDCEQGHILKMFPSTWYV')
    if model_type == 'lstm':
        amino_to_ix = {amino: index for index, amino in enumerate(['PAD'] + amino_acids)}
    else:
        pep_atox = {amino: index for index, amino in enumerate(['PAD'] + amino_acids)}
        tcr_atox = {amino: index for index, amino in enumerate(amino_acids + ['X'])}

    args['ae_file'] = 'TCR_Autoencoder/tcr_ae_dim_100.pt'
    if args['model_file'] == 'auto':
        # `args` is a dict, so every option is read with item access.
        model_dir = 'models'
        p_key = 'protein' if args.get('protein') else ''
        args['model_file'] = model_dir + '/' + '_'.join(
            [model_type, args['dataset'], args['sampling'], p_key, 'model.pt'])
    if args['test_data_file'] == 'auto':
        args['test_data_file'] = 'pairs_example.csv'

    # Read test data
    tcrs = []
    peps = []
    labels = []
    signs = []
    max_len = 28
    with open(args['test_data_file'], 'r') as csv_file:
        for tcr, pep, label in csv.reader(csv_file):
            # The AE branch skips TCRs that do not fit into max_len positions.
            if model_type == 'ae' and len(tcr) >= max_len:
                continue
            tcrs.append(tcr)
            peps.append(pep)
            labels.append(label)
            signs.append(0.0)  # placeholder sign handed to the batch builders

    # Keep the raw sequences for the report; lstm.convert_data is called
    # without using its return value, so it presumably rewrites the lists
    # in place -- TODO confirm against lstm_utils.
    raw_tcrs = list(tcrs)
    raw_peps = list(peps)

    # Load model
    device = args['device']
    if model_type == 'ae':
        model = AutoencoderLSTMClassifier(10, device, max_len, 21, 100, 50, args['ae_file'], False)
    else:
        model = DoubleLSTMClassifier(10, 500, 0.1, device)
    checkpoint = torch.load(args['model_file'], map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(device)
    model.eval()

    # Predict
    batch_size = 50
    if model_type == 'ae':
        test_batches = ae.get_full_batches(tcrs, peps, signs, tcr_atox, pep_atox, batch_size, max_len)
        preds = ae.predict(model, test_batches, device)
    else:
        lstm.convert_data(tcrs, peps, amino_to_ix)
        test_batches = lstm.get_full_batches(tcrs, peps, signs, batch_size, amino_to_ix)
        preds = lstm.predict(model, test_batches, device)

    # Build the report in one go; zip stops at the shortest input.
    records = [
        {'CDR3B': tcr, 'Class': pep, 'y_true': label, 'y_prob': pred}
        for tcr, pep, label, pred in zip(raw_tcrs, raw_peps, labels, preds)
    ]
    data = pd.DataFrame(records, columns=['CDR3B', 'Class', 'y_true', 'y_prob'])
    data['y_pred'] = data['y_prob'].apply(lambda x: 1 if x >= 0.5 else 0)
    data.to_csv(args['result_path'] + '_probability.csv', index=False)
 

When training and predicting with the model, 
the data must provide the columns 
['CDR3B', 'Epitope', 'Affinity'] in exactly this order, with matching column names. 
The CSV passed to `Original_model_prediction` must contain only these three columns, without a header row or any extra rows.


In [10]:
# Keep only the three columns that Original_model_prediction unpacks per row
# (tcr, peptide, label) and write them without a header line or index column.
testfile = pd.read_csv("../data/test.csv").loc[:, ['CDR3B', 'Epitope', 'Affinity']]
testfile.to_csv("../data/ERGO_test.csv", index=False, header=False)

In [None]:
# Run the pretrained autoencoder-based (AE) ERGO model on the exported test set.
# Original_model_prediction writes to result_path + '_probability.csv', so the
# prefix carries an '_ae' suffix to keep this run's output apart from the
# LSTM run, which would otherwise write to the very same file.
args = {
    'test_data_file': "../data/ERGO_test.csv",
    'model_type': "ae",
    'model_file': "../Original_model/ERGO_AE_mc.pt",
    'result_path': "../result_path/Original_model_prediction_ae",
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
}

Original_model_prediction(args)

In [12]:
# Run the pretrained double-LSTM ERGO model on the exported test set.
# Original_model_prediction writes to result_path + '_probability.csv', so the
# prefix carries an '_lstm' suffix to keep this run's output apart from the
# AE run, which would otherwise be overwritten by this one.
args = {
    'test_data_file': "../data/ERGO_test.csv",
    'model_type': "lstm",
    'model_file': "../Original_model/ERGO_lstm_mc.pt",
    'result_path': "../result_path/Original_model_prediction_lstm",
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
}

Original_model_prediction(args)

# 2.Model retraining

# lstm

In [17]:
import random
import numpy as np
import csv
import os
import sklearn.model_selection as skl

def lstm_get_lists_from_pairs(pairs):
    """Split ``(tcr, pep, label)`` records into three parallel lists.

    Args:
        pairs: Iterable of 3-item records. ``pep`` is indexable and only its
            first item is kept; ``label`` is 'p' (positive) or 'n' (negative).

    Returns:
        tuple: ``(tcrs, peps, signs)`` where ``signs`` holds 1.0 for 'p' and
        0.0 for 'n'. All three lists have the same length.

    Raises:
        ValueError: If a label is neither 'p' nor 'n'. Skipping only the sign
            would silently shift every following label by one position.
    """
    tcrs = []
    peps = []
    signs = []
    for tcr, pep, label in pairs:
        if label == 'p':
            sign = 1.0
        elif label == 'n':
            sign = 0.0
        else:
            raise ValueError("label must be 'p' or 'n', got %r" % (label,))
        tcrs.append(tcr)
        peps.append(pep[0])
        signs.append(sign)
    return tcrs, peps, signs

def train_epoch(batches, model, loss_function, optimizer, device):
    """Run one training pass of the LSTM classifier over all batches.

    Each batch is unpacked as ``(padded_tcrs, tcr_lens, padded_peps, pep_lens,
    signs)``. ``batches`` is shuffled in place before the pass, and
    ``loss_function.weight`` is overwritten for every batch with per-sample
    weights (0.84 where the sign is 1, 0.14 where it is 0).

    NOTE: the AE section of this notebook defines another ``train_epoch`` with
    a different batch layout; whichever cell was executed last is the one in
    effect.

    Returns:
        float: Mean of the per-batch losses.
    """
    model.train()
    shuffle(batches)
    running_loss = 0
    for tcr_batch, tcr_lengths, pep_batch, pep_lengths, signs in batches:
        # Move to GPU
        tcr_batch = tcr_batch.to(device)
        tcr_lengths = tcr_lengths.to(device)
        pep_batch = pep_batch.to(device)
        pep_lengths = pep_lengths.to(device)
        targets = torch.tensor(signs).to(device).squeeze()
        model.zero_grad()
        outputs = model(tcr_batch, tcr_lengths, pep_batch, pep_lengths).squeeze()
        # Compute loss with per-sample weights derived from the targets
        loss_function.weight = targets * 0.84 + (1 - targets) * 0.14
        batch_loss = loss_function(outputs, targets)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
    return running_loss / len(batches)

def converted_seq(test_tcrs, test_peps, amino_to_ix):
    """Turn index-encoded TCR and peptide sequences back into letter strings.

    Args:
        test_tcrs: Iterable of index sequences (lists or 1-D integer arrays).
        test_peps: Iterable of index sequences, decoded the same way.
        amino_to_ix (dict): Mapping from token to integer index.

    Returns:
        tuple: ``(tcr_strings, pep_strings)``, two lists of str in input order.
        Index 0 is skipped, and so is any index missing from ``amino_to_ix``.
    """
    # Invert the mapping once instead of scanning it for every residue.
    # setdefault keeps the first token of a repeated index, which is what a
    # front-to-back scan of the dict would find.
    ix_to_amino = {}
    for amino, encoding in amino_to_ix.items():
        ix_to_amino.setdefault(encoding, amino)

    def decode(indices):
        letters = []
        for value in indices:
            # Unwrap NumPy / tensor scalars so the dict lookup sees a plain number.
            key = value.item() if hasattr(value, 'item') else value
            if key == 0:
                continue
            amino = ix_to_amino.get(key)
            if amino is not None:
                letters.append(amino)
        return "".join(letters)

    encoded_tcrs = [decode(tcr) for tcr in test_tcrs]
    encoded_peps = [decode(pep) for pep in test_peps]
    return encoded_tcrs, encoded_peps

def evaluate(model, batches, device):
    """Compute ROC AUC and the ROC curve of the LSTM classifier on ``batches``.

    Each batch is unpacked as ``(padded_tcrs, tcr_lens, padded_peps, pep_lens,
    signs)``. ``batches`` is shuffled in place and the model is left in eval
    mode.

    NOTE: the AE section of this notebook defines another ``evaluate`` with a
    different batch layout; whichever cell was executed last is the one in
    effect.

    Returns:
        tuple: ``(auc, (fpr, tpr, thresholds))`` as produced by
        ``roc_auc_score`` and ``roc_curve``.
    """
    model.eval()
    true = []
    scores = []
    shuffle(batches)
    # Inference only: skip autograd bookkeeping for the forward passes.
    with torch.no_grad():
        for padded_tcrs, tcr_lens, padded_peps, pep_lens, batch_signs in batches:
            padded_tcrs = padded_tcrs.to(device)
            tcr_lens = tcr_lens.to(device)
            padded_peps = padded_peps.to(device)
            pep_lens = pep_lens.to(device)
            probs = model(padded_tcrs, tcr_lens, padded_peps, pep_lens)
            true.extend(np.array(batch_signs).astype(int))
            scores.extend(probs.detach().cpu().numpy())
    auc = roc_auc_score(true, scores)
    fpr, tpr, thresholds = roc_curve(true, scores)
    return auc, (fpr, tpr, thresholds)


In [18]:
import torch
import pickle
import lstm_utils as lstm
from ERGO_models import AutoencoderLSTMClassifier, DoubleLSTMClassifier
import torch.optim as optim
import torch.nn as nn
from random import shuffle
import time
from sklearn.metrics import roc_auc_score,roc_curve
import pandas as pd

def Model_retraining(trainfile_path,testfile_path,save_model_path,result_path):
    """Train the double-LSTM classifier and log test predictions every epoch.

    Both CSV files must provide the columns 'CDR3B', 'Epitope' and 'Affinity';
    an Affinity equal to 1 is treated as positive, anything else as negative.
    After every epoch the model's state dict is saved to ``save_model_path``
    and the current test predictions are written to
    ``result_path + 'lstm_probability.csv'`` (both files are overwritten each
    epoch, so they reflect the last completed epoch).

    NOTE: the AE section of this notebook defines another ``Model_retraining``
    as well as other ``train_epoch`` / ``evaluate`` helpers; the definitions
    from the most recently executed cells are the ones in effect.

    Args:
        trainfile_path: CSV with the training pairs.
        testfile_path: CSV with the test pairs.
        save_model_path: Destination of the saved state dict.
        result_path: Prefix of the prediction CSV.

    Returns:
        None
    """
    import pandas as pd

    def load_pairs(csv_path):
        # Build (tcr, (peptide,), 'p' | 'n') records, the layout that
        # lstm_get_lists_from_pairs unpacks.
        frame = pd.read_csv(csv_path)[['CDR3B', 'Epitope', 'Affinity']]
        labels = ['p' if x == 1 else 'n' for x in frame['Affinity']]
        return [(tcr, (pep,), label)
                for tcr, pep, label in zip(frame['CDR3B'], frame['Epitope'], labels)]

    train = load_pairs(trainfile_path)
    test = load_pairs(testfile_path)

    amino_acids = list('ARNDCEQGHILKMFPSTWYV')
    amino_to_ix = {amino: index for index, amino in enumerate(['PAD'] + amino_acids)}

    batch_size = 50
    train_tcrs, train_peps, train_signs = lstm_get_lists_from_pairs(train)
    lstm.convert_data(train_tcrs, train_peps, amino_to_ix)
    train_batches = lstm.get_batches(train_tcrs, train_peps, train_signs, batch_size)

    test_tcrs, test_peps, test_signs = lstm_get_lists_from_pairs(test)
    lstm.convert_data(test_tcrs, test_peps, amino_to_ix)
    test_batches = lstm.get_batches(test_tcrs, test_peps, test_signs, batch_size)

    emb_dim = 10
    lstm_dim = 500
    dropout = 0.1
    lr = 1e-4
    wd = 0
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = DoubleLSTMClassifier(emb_dim, lstm_dim, dropout, device)
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    loss_function = nn.BCELoss()
    epochs = 30
    for epoch in range(epochs):
        print('epoch:', epoch + 1)
        train_epoch(train_batches, model, loss_function, optimizer, device)
        train_auc = evaluate(model, train_batches, device)[0]
        print('train auc:', train_auc)
        torch.save(model.state_dict(), save_model_path)

        # Score the test set and remember the padded inputs so the sequences
        # can be written next to their predictions.
        model.eval()
        true = []
        scores = []
        seen_tcrs = []
        seen_peps = []
        shuffle(test_batches)
        with torch.no_grad():  # inference only, no autograd graph needed
            for padded_tcrs, tcr_lens, padded_peps, pep_lens, batch_signs in test_batches:
                padded_tcrs = padded_tcrs.to(device)
                tcr_lens = tcr_lens.to(device)
                padded_peps = padded_peps.to(device)
                pep_lens = pep_lens.to(device)
                seen_tcrs.extend(padded_tcrs.cpu().numpy())
                seen_peps.extend(padded_peps.cpu().numpy())
                probs = model(padded_tcrs, tcr_lens, padded_peps, pep_lens)
                true.extend(np.array(batch_signs).astype(int))
                scores.extend(probs.detach().cpu().numpy())
        test_auc = roc_auc_score(true, scores)
        tcrs, peps = converted_seq(seen_tcrs, seen_peps, amino_to_ix)
        probability = pd.DataFrame({'Epitope': peps, 'CDR3B': tcrs, 'y_true': true, 'y_prob': scores})
        # Each score is stored as a small array; go through its text form to
        # strip the brackets and obtain a plain float column.
        probability['y_prob'] = probability['y_prob'].astype(str)
        probability['y_prob'] = (probability['y_prob']
                                 .str.replace('[', '', regex=False)
                                 .str.replace(']', '', regex=False))
        probability['y_prob'] = probability['y_prob'].astype(float)
        probability['y_pred'] = probability['y_prob'].apply(lambda x: 1 if x >= 0.5 else 0)
        probability.to_csv(result_path+'lstm_probability.csv', index=False)
        print('test auc:', test_auc)

In [None]:
# Retrain the LSTM model on train.csv; test.csv is scored after every epoch.
trainfile_path = "../data/train.csv"
testfile_path = "../data/test.csv"
save_modle_path = "../Retraining_model/Retraining_lstm_model.pth"
result_path = "../result_path/Retraining_model_prediction"
Model_retraining(
    trainfile_path=trainfile_path,
    testfile_path=testfile_path,
    save_model_path=save_modle_path,
    result_path=result_path,
)


# 3.Retraining_model_prediction

In [9]:
def Retraining_model_prediction(testfile_path,modelfile_path,result_path):
    """Score a CSV of pairs with a retrained double-LSTM model and save the result.

    The CSV must provide the columns 'CDR3B', 'Epitope' and 'Affinity'; an
    Affinity equal to 1 is treated as positive, anything else as negative.
    Predictions are written once, after all batches have been scored, to
    ``result_path + 'lstm_probability.csv'``.

    NOTE: the AE section of this notebook defines another
    ``Retraining_model_prediction``; the definition from the most recently
    executed cell is the one in effect.

    Args:
        testfile_path: CSV with the pairs to score.
        modelfile_path: State dict saved by the LSTM retraining step.
        result_path: Prefix of the prediction CSV.

    Returns:
        None
    """
    import pandas as pd
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    emb_dim = 10
    lstm_dim = 500
    dropout = 0.1
    batch_size = 50
    model = DoubleLSTMClassifier(emb_dim, lstm_dim, dropout, device)
    model.to(device)
    # map_location lets a checkpoint saved on GPU load on a CPU-only machine.
    model.load_state_dict(torch.load(modelfile_path, map_location=device))
    model.eval()

    test = pd.read_csv(testfile_path)[['CDR3B', 'Epitope', 'Affinity']]
    labels = ['p' if x == 1 else 'n' for x in test['Affinity']]
    test = [(tcr, (pep,), label)
            for tcr, pep, label in zip(test['CDR3B'], test['Epitope'], labels)]

    amino_acids = list('ARNDCEQGHILKMFPSTWYV')
    amino_to_ix = {amino: index for index, amino in enumerate(['PAD'] + amino_acids)}
    test_tcrs, test_peps, test_signs = lstm_get_lists_from_pairs(test)
    lstm.convert_data(test_tcrs, test_peps, amino_to_ix)
    test_batches = lstm.get_batches(test_tcrs, test_peps, test_signs, batch_size)

    true = []
    scores = []
    seen_tcrs = []
    seen_peps = []
    with torch.no_grad():  # inference only, no autograd graph needed
        for padded_tcrs, tcr_lens, padded_peps, pep_lens, batch_signs in test_batches:
            padded_tcrs = padded_tcrs.to(device)
            tcr_lens = tcr_lens.to(device)
            padded_peps = padded_peps.to(device)
            pep_lens = pep_lens.to(device)
            seen_tcrs.extend(padded_tcrs.cpu().numpy())
            seen_peps.extend(padded_peps.cpu().numpy())
            probs = model(padded_tcrs, tcr_lens, padded_peps, pep_lens)
            true.extend(np.array(batch_signs).astype(int))
            scores.extend(probs.detach().cpu().numpy())

    # Build and write the table once, after the loop, instead of rebuilding
    # and rewriting the whole file for every batch.
    tcrs, peps = converted_seq(seen_tcrs, seen_peps, amino_to_ix)
    probability = pd.DataFrame({'Epitope': peps, 'CDR3B': tcrs, 'y_true': true, 'y_prob': scores})
    # Each score is stored as a small array; go through its text form to
    # strip the brackets and obtain a plain float column.
    probability['y_prob'] = probability['y_prob'].astype(str)
    probability['y_prob'] = (probability['y_prob']
                             .str.replace('[', '', regex=False)
                             .str.replace(']', '', regex=False))
    probability['y_prob'] = probability['y_prob'].astype(float)
    probability['y_pred'] = probability['y_prob'].apply(lambda x: 1 if x >= 0.5 else 0)
    probability.to_csv(result_path+'lstm_probability.csv', index=False)

In [None]:
# Score the validation set with the retrained LSTM model.
# The paths use the same '../' root as every other cell of this notebook; in
# particular the model is loaded from the location the LSTM training cell
# saved it to ("../Retraining_model/Retraining_lstm_model.pth").
# NOTE(review): with this prefix the output file is the same one the LSTM
# training cell writes its per-epoch test predictions to -- pick a different
# result_path if both tables are needed.
testfile_path="../data/Validation.csv"
modelfile_path="../Retraining_model/Retraining_lstm_model.pth"
result_path="../result_path/Retraining_model_prediction"
Retraining_model_prediction(testfile_path,modelfile_path,result_path)


# 2.Model_retraining

# AE

In [8]:
import random
import numpy as np
import csv
import os
import sklearn.model_selection as skl
import torch
import pickle
import lstm_utils as lstm
from ERGO_models import AutoencoderLSTMClassifier, DoubleLSTMClassifier
import torch.optim as optim
import torch.nn as nn
from random import shuffle
import time
from sklearn.metrics import roc_auc_score,roc_curve
import pandas as pd
def ae_get_lists_from_pairs(pairs, max_len):
    """Split ``(tcr, pep, label)`` records into three parallel lists for the AE model.

    Args:
        pairs: Iterable of 3-item records. ``pep`` is indexable and only its
            first item is kept; ``label`` is 'p' (positive) or 'n' (negative).
        max_len (int): Records whose TCR has ``len(tcr) >= max_len`` are
            dropped entirely.

    Returns:
        tuple: ``(tcrs, peps, signs)`` where ``signs`` holds 1.0 for 'p' and
        0.0 for 'n'. All three lists have the same length.

    Raises:
        ValueError: If a kept record has a label other than 'p' or 'n'.
            Skipping only the sign would silently shift every following label
            by one position.
    """
    tcrs = []
    peps = []
    signs = []
    for tcr, pep, label in pairs:
        if len(tcr) >= max_len:
            continue
        if label == 'p':
            sign = 1.0
        elif label == 'n':
            sign = 0.0
        else:
            raise ValueError("label must be 'p' or 'n', got %r" % (label,))
        tcrs.append(tcr)
        peps.append(pep[0])
        signs.append(sign)
    return tcrs, peps, signs


def train_epoch(batches, model, loss_function, optimizer, device):
    """Run one training pass of the autoencoder-based classifier over all batches.

    Each batch is unpacked as ``(tcrs, padded_peps, pep_lens, signs)``.
    ``batches`` is shuffled in place before the pass. The signs are turned
    into a column (``unsqueeze(1)``) before being compared with the model
    output.

    NOTE: this definition replaces the LSTM ``train_epoch`` defined earlier in
    this notebook once its cell has been executed.

    Returns:
        float: Mean of the per-batch losses.
    """
    model.train()
    shuffle(batches)
    running_loss = 0
    for tcr_batch, pep_batch, pep_lengths, signs in batches:
        # Move to GPU
        tcr_batch = tcr_batch.to(device)
        pep_batch = pep_batch.to(device)
        pep_lengths = pep_lengths.to(device)
        targets = torch.tensor(signs).to(device)
        model.zero_grad()
        outputs = model(tcr_batch, pep_batch, pep_lengths)
        # Compute loss against the targets shaped as a column
        batch_loss = loss_function(outputs, targets.unsqueeze(1))
        # Update model weights
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()

    return running_loss / len(batches)


def Sequence_conversion(data):
    """Decode the textual 'Epitope' and 'CDR3B' columns back to letter sequences.

    The columns are expected to hold the text form of arrays, as obtained when
    a frame containing arrays is written to CSV and read back:

    * 'Epitope': a bracketed, whitespace-separated list of integer indices
      (0 = 'PAD', 1..20 = amino acids in 'ARNDCEQGHILKMFPSTWYV' order).
    * 'CDR3B': a bracketed matrix with one row per position, rows separated
      by newlines; the position of the row maximum selects the letter
      (columns 0..19 = amino acids, column 20 = 'X').

    Args:
        data (pandas.DataFrame): Frame with 'Epitope' and 'CDR3B' columns.
            Both columns are replaced in place.

    Returns:
        pandas.DataFrame: The same ``data`` object with both columns decoded
        to plain strings.

    Raises:
        KeyError: If an epitope index is outside the 0..20 range.
    """
    amino_acids = list('ARNDCEQGHILKMFPSTWYV')
    pep_ix_to_amino = dict(enumerate(['PAD'] + amino_acids))
    tcr_ix_to_amino = dict(enumerate(amino_acids + ['X']))

    def decode_epitope(text):
        indices = [int(num) for num in text.strip('[]').split()]
        # Index 0 is padding: drop it instead of spelling out the 'PAD' token,
        # whose letters would otherwise read like extra residues.
        return ''.join(pep_ix_to_amino[ix] for ix in indices if ix != 0)

    def decode_tcr(text):
        # Entries are written like "0." / "1."; dropping the dots leaves
        # integer tokens.
        rows = [
            [int(val) for val in row.strip().strip('[]').split()]
            for row in str(text).replace('.', '').strip('[]').split('\n')
        ]
        matrix = np.array(rows)
        # All-zero rows are padding positions.
        matrix = matrix[np.any(matrix != 0, axis=1)]
        sequence = ''.join(tcr_ix_to_amino.get(int(np.argmax(row)), '') for row in matrix)
        # Drop one trailing 'X'; endswith also copes with an empty sequence.
        return sequence[:-1] if sequence.endswith('X') else sequence

    # Whole-column assignment avoids chained indexing (data[col][i] = ...).
    data['Epitope'] = data['Epitope'].apply(decode_epitope)
    data['CDR3B'] = data['CDR3B'].apply(decode_tcr)
    return data


def evaluate(model, batches, device):
    """Compute ROC AUC and the ROC curve of the AE classifier on ``batches``.

    Each batch is unpacked as ``(tcrs, padded_peps, pep_lens, signs)``.
    ``batches`` is shuffled in place and the model is left in eval mode.

    NOTE: this definition replaces the LSTM ``evaluate`` defined earlier in
    this notebook once its cell has been executed.

    Returns:
        tuple: ``(auc, (fpr, tpr, thresholds))`` as produced by
        ``roc_auc_score`` and ``roc_curve``.
    """
    model.eval()
    true = []
    scores = []
    shuffle(batches)
    # Inference only: skip autograd bookkeeping for the forward passes.
    with torch.no_grad():
        for tcrs, padded_peps, pep_lens, batch_signs in batches:
            # as_tensor accepts an existing tensor as-is (no copy-construct
            # warning) and still converts array-like input.
            tcrs = torch.as_tensor(tcrs).to(device)
            padded_peps = padded_peps.to(device)
            pep_lens = pep_lens.to(device)
            probs = model(tcrs, padded_peps, pep_lens)
            true.extend(np.array(batch_signs).astype(int))
            scores.extend(probs.detach().cpu().numpy())
    auc = roc_auc_score(true, scores)
    fpr, tpr, thresholds = roc_curve(true, scores)
    return auc, (fpr, tpr, thresholds)


In [15]:
import torch
import pickle
import argparse
import ae_utils as ae
import lstm_utils as lstm
import ergo_data_loader
import numpy as np
from ERGO_models import AutoencoderLSTMClassifier, DoubleLSTMClassifier
import csv

import torch.optim as optim
import torch.nn as nn
from random import shuffle
import time
from sklearn.metrics import roc_auc_score,roc_curve
import pandas as pd
import ast
def Model_retraining(trainfile_path,testfile_path,save_model_path,result_path):
    """Train the autoencoder-based classifier and log test predictions every epoch.

    Both CSV files must provide the columns 'CDR3B', 'Epitope' and 'Affinity';
    an Affinity equal to 1 is treated as positive, anything else as negative.
    ``max_len`` and ``batch_size`` are taken from the TCR autoencoder
    checkpoint. After every epoch the model's state dict is saved to
    ``save_model_path`` and the current test predictions are written to
    ``result_path + 'ae_probability.csv'`` (both files are overwritten each
    epoch, so they reflect the last completed epoch).

    NOTE: this definition replaces the LSTM ``Model_retraining`` defined
    earlier in this notebook once its cell has been executed; it relies on the
    AE versions of ``train_epoch`` and ``evaluate``.

    Args:
        trainfile_path: CSV with the training pairs.
        testfile_path: CSV with the test pairs.
        save_model_path: Destination of the saved state dict.
        result_path: Prefix of the prediction CSV.

    Returns:
        None
    """
    import pandas as pd
    from sklearn.metrics import roc_auc_score

    def load_pairs(csv_path):
        # Build (tcr, (peptide,), 'p' | 'n') records, the layout that
        # ae_get_lists_from_pairs unpacks.
        frame = pd.read_csv(csv_path)[['CDR3B', 'Epitope', 'Affinity']]
        labels = ['p' if x == 1 else 'n' for x in frame['Affinity']]
        return [(tcr, (pep,), label)
                for tcr, pep, label in zip(frame['CDR3B'], frame['Epitope'], labels)]

    train = load_pairs(trainfile_path)
    test = load_pairs(testfile_path)

    emb_dim = 10
    enc_dim = 100
    train_ae = True
    lr = 1e-4
    wd = 0
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    amino_acids = list('ARNDCEQGHILKMFPSTWYV')
    pep_atox = {amino: index for index, amino in enumerate(['PAD'] + amino_acids)}
    tcr_atox = {amino: index for index, amino in enumerate(amino_acids + ['X'])}
    ae_file = 'TCR_Autoencoder/tcr_ae_dim_' + str(enc_dim) + '.pt'
    # The autoencoder checkpoint dictates the TCR length and the batch size.
    checkpoint = torch.load(ae_file, map_location=device)
    max_len = checkpoint['max_len']
    batch_size = checkpoint['batch_size']

    train_tcrs, train_peps, train_signs = ae_get_lists_from_pairs(train, max_len)
    train_batches = ae.get_batches(train_tcrs, train_peps, train_signs, tcr_atox, pep_atox, batch_size, max_len)
    test_tcrs, test_peps, test_signs = ae_get_lists_from_pairs(test, max_len)
    test_batches = ae.get_batches(test_tcrs, test_peps, test_signs, tcr_atox, pep_atox, batch_size, max_len)

    loss_function = nn.BCELoss()
    model = AutoencoderLSTMClassifier(emb_dim, device, max_len, 21, enc_dim, batch_size, ae_file, train_ae)
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    epochs = 50
    output_file = result_path+'ae_probability.csv'
    for epoch in range(epochs):
        print('epoch:', epoch + 1)
        train_epoch(train_batches, model, loss_function, optimizer, device)
        train_auc = evaluate(model, train_batches, device)[0]
        print('train auc:', train_auc)
        torch.save(model.state_dict(), save_model_path)

        # Score the test set and remember the encoded inputs so the sequences
        # can be written next to their predictions.
        model.eval()
        true = []
        scores = []
        seen_tcrs = []
        seen_peps = []
        with torch.no_grad():  # inference only, no autograd graph needed
            for tcrs, padded_peps, pep_lens, batch_signs in test_batches:
                # as_tensor accepts an existing tensor as-is (no
                # copy-construct warning) and still converts array-like input.
                tcrs = torch.as_tensor(tcrs).to(device)
                padded_peps = padded_peps.to(device)
                pep_lens = pep_lens.to(device)
                seen_tcrs.extend(tcrs.cpu().numpy())
                seen_peps.extend(padded_peps.cpu().numpy())
                probs = model(tcrs, padded_peps, pep_lens)
                true.extend(np.array(batch_signs).astype(int))
                scores.extend(probs.detach().cpu().numpy())

        test_auc = roc_auc_score(true, scores)
        probability = pd.DataFrame({'Epitope': seen_peps, 'CDR3B': seen_tcrs, 'y_true': true, 'y_prob': scores})
        # Each score is stored as a small array; go through its text form to
        # strip the brackets and obtain a plain float column.
        probability['y_prob'] = probability['y_prob'].astype(str)
        probability['y_prob'] = (probability['y_prob']
                                 .str.replace('[', '', regex=False)
                                 .str.replace(']', '', regex=False))
        probability['y_prob'] = probability['y_prob'].astype(float)
        probability['y_pred'] = probability['y_prob'].apply(lambda x: 1 if x >= 0.5 else 0)
        # Sequence_conversion parses the text form of the encoded arrays, so
        # the table is written, read back as text, decoded and written again.
        probability.to_csv(output_file, index=False)
        probability = pd.read_csv(output_file)
        probability = Sequence_conversion(probability)
        probability.to_csv(output_file, index=False)
        print('test auc:', test_auc)

In [None]:
# Retrain the AE model on train.csv; test.csv is scored after every epoch.
trainfile_path = "../data/train.csv"
testfile_path = "../data/test.csv"
save_modle_path = "../Retraining_model/Retraining_ae_model.pth"
result_path = "../result_path/Retraining_model_prediction"
Model_retraining(
    trainfile_path=trainfile_path,
    testfile_path=testfile_path,
    save_model_path=save_modle_path,
    result_path=result_path,
)

# 3.Retraining_model_prediction

In [13]:
def Retraining_model_prediction(testfile_path,modelfile_path,result_path):
    """Score a CSV of pairs with a retrained AE model and save the result.

    The CSV must provide the columns 'CDR3B', 'Epitope' and 'Affinity'; an
    Affinity equal to 1 is treated as positive, anything else as negative.
    ``max_len`` and ``batch_size`` are taken from the TCR autoencoder
    checkpoint. Predictions are written to
    ``result_path + 'ae_probability.csv'``.

    No ROC/AUC is computed here, so the function also works when the labels
    contain a single class.

    NOTE: this definition replaces the LSTM ``Retraining_model_prediction``
    defined earlier in this notebook once its cell has been executed.

    Args:
        testfile_path: CSV with the pairs to score.
        modelfile_path: State dict saved by the AE retraining step.
        result_path: Prefix of the prediction CSV.

    Returns:
        None
    """
    import pandas as pd

    test = pd.read_csv(testfile_path)[['CDR3B', 'Epitope', 'Affinity']]
    labels = ['p' if x == 1 else 'n' for x in test['Affinity']]
    test = [(tcr, (pep,), label)
            for tcr, pep, label in zip(test['CDR3B'], test['Epitope'], labels)]

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    amino_acids = list('ARNDCEQGHILKMFPSTWYV')
    emb_dim = 10
    enc_dim = 100
    pep_atox = {amino: index for index, amino in enumerate(['PAD'] + amino_acids)}
    tcr_atox = {amino: index for index, amino in enumerate(amino_acids + ['X'])}
    ae_file = 'TCR_Autoencoder/tcr_ae_dim_' + str(enc_dim) + '.pt'
    # The autoencoder checkpoint dictates the TCR length and the batch size.
    checkpoint = torch.load(ae_file, map_location=device)
    max_len = checkpoint['max_len']
    batch_size = checkpoint['batch_size']

    model = AutoencoderLSTMClassifier(emb_dim, device, max_len, 21, enc_dim, batch_size, ae_file, False)
    model.load_state_dict(torch.load(modelfile_path, map_location=device))
    model.to(device)
    model.eval()

    test_tcrs, test_peps, test_signs = ae_get_lists_from_pairs(test, max_len)
    test_batches = ae.get_batches(test_tcrs, test_peps, test_signs, tcr_atox, pep_atox, batch_size, max_len)

    true = []
    scores = []
    seen_tcrs = []
    seen_peps = []
    with torch.no_grad():  # inference only, no autograd graph needed
        for tcrs, padded_peps, pep_lens, batch_signs in test_batches:
            # as_tensor accepts an existing tensor as-is (no copy-construct
            # warning) and still converts array-like input.
            tcrs = torch.as_tensor(tcrs).to(device)
            padded_peps = padded_peps.to(device)
            pep_lens = pep_lens.to(device)
            seen_tcrs.extend(tcrs.cpu().numpy())
            seen_peps.extend(padded_peps.cpu().numpy())
            probs = model(tcrs, padded_peps, pep_lens)
            true.extend(np.array(batch_signs).astype(int))
            scores.extend(probs.detach().cpu().numpy())

    probability = pd.DataFrame({'Epitope': seen_peps, 'CDR3B': seen_tcrs, 'y_true': true, 'y_prob': scores})
    # Each score is stored as a small array; go through its text form to
    # strip the brackets and obtain a plain float column.
    probability['y_prob'] = probability['y_prob'].astype(str)
    probability['y_prob'] = (probability['y_prob']
                             .str.replace('[', '', regex=False)
                             .str.replace(']', '', regex=False))
    probability['y_prob'] = probability['y_prob'].astype(float)
    probability['y_pred'] = probability['y_prob'].apply(lambda x: 1 if x >= 0.5 else 0)
    # Sequence_conversion parses the text form of the encoded arrays, so the
    # table is written, read back as text, decoded and written again.
    output_file = result_path+'ae_probability.csv'
    probability.to_csv(output_file, index=False)
    probability = pd.read_csv(output_file)
    probability = Sequence_conversion(probability)
    probability.to_csv(output_file, index=False)
  

In [None]:
# Score the validation set with the retrained AE model.
testfile_path = "../data/Validation.csv"
modelfile_path = "../Retraining_model/Retraining_ae_model.pth"
result_path = "../result_path/Retraining_model_prediction"
Retraining_model_prediction(
    testfile_path=testfile_path,
    modelfile_path=modelfile_path,
    result_path=result_path,
)
