# 1. Original model prediction

In [1]:
import sys
import os
import torch
import torch.nn as nn
import pandas as pd
import matplotlib.pyplot as plt
import argparse
from scripts.DataProcess import TEPDataset,dropInvalid
from torch.utils.data import DataLoader
from scripts.model import TEPDA
from sklearn.model_selection import GroupShuffleSplit
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score,precision_score,f1_score,matthews_corrcoef,roc_curve,auc
def Original_model_prediction(testfile_path, modelfile_path, result_path):
    """Score a labelled test set with a saved TEPDA checkpoint.

    Reads the test CSV, runs the model in evaluation mode and writes one row
    per TCR-epitope pair to ``result_path + '_probability.csv'`` with the
    columns ``CDR3B``, ``Epitope``, ``y_true``, ``y_prob`` (softmax probability
    of class 1) and ``y_pred`` (``y_prob >= 0.5``).

    Args:
        testfile_path: CSV with ``CDR3B``, ``Epitope`` and ``Affinity`` columns.
        modelfile_path: State-dict checkpoint matching
            ``TEPDA(batch_size=128, d_model=32, modelseed=1000, n_heads=6)``.
        result_path: Output path prefix (without extension).
    """
    import torch
    from scripts.DataProcess import TEPDataset
    from scripts.model import TEPDA
    from torch.utils.data import DataLoader
    import pandas as pd
    from tqdm import tqdm

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    test = pd.read_csv(testfile_path)
    x_test = test[['CDR3B', 'Epitope']]
    y_test = test['Affinity']
    dataset_test = TEPDataset(x_test, y_test, align=True)
    test_loader = DataLoader(dataset_test, batch_size=128, shuffle=False)

    model = TEPDA(batch_size=128, d_model=32, modelseed=1000, n_heads=6)
    # map_location remaps GPU-saved tensors onto the selected device, so the
    # checkpoint also loads on a CPU-only machine.
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    state_dict = torch.load(modelfile_path, map_location=device)
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()

    all_predictions = []
    all_labels = []
    all_tcrs = []
    all_peptides = []
    with torch.no_grad():
        for tcrs, peptides, labels, _, _ in tqdm(test_loader):
            tcrs, peptides = tcrs.to(device), peptides.to(device)
            # Only the last of the model's six outputs (the logits) is needed.
            _, _, _, _, _, outputs = model(tcrs, peptides)
            probabilities = torch.softmax(outputs, dim=1)[:, 1]
            all_predictions.extend(probabilities.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            all_tcrs.extend(tcrs.cpu().numpy())
            all_peptides.extend(peptides.cpu().numpy())

    # Index -> symbol table used to turn encoded sequences back into text.
    # NOTE(review): assumes this ordering matches the encoding used by
    # TEPDataset -- confirm against scripts.DataProcess.
    AA_List = "RHKDESTNQCUGPAVILMFYW" + "-" + "." + "?" + "|" + "*"
    idx_to_aa = {i: aa for i, aa in enumerate(AA_List)}

    def decode_sequence(encoded_sequence):
        """Map indices back to symbols, skipping unknown indices and '-'."""
        symbols = (idx_to_aa[idx] for idx in encoded_sequence if idx in idx_to_aa)
        return ''.join(symbols).replace("-", "")

    predictions_df = pd.DataFrame({
        'CDR3B': [decode_sequence(seq) for seq in all_tcrs],
        'Epitope': [decode_sequence(seq) for seq in all_peptides],
        'y_true': all_labels,
        'y_prob': all_predictions,
    })
    predictions_df['y_pred'] = (predictions_df['y_prob'] >= 0.5).astype(int)
    predictions_df.to_csv(result_path + '_probability.csv', index=False)


In [None]:
# Inputs and output prefix for scoring the test set with the original model.
testfile_path = "../data/test.csv"
modelfile_path = "../Original_model/TEPCAM.pt"
result_path = "../result_path/Original_model_prediction"

Original_model_prediction(
    testfile_path=testfile_path,
    modelfile_path=modelfile_path,
    result_path=result_path,
)


# 2. Model retraining

In [3]:
import sys
import os
import torch
import torch.nn as nn
import pandas as pd
import matplotlib.pyplot as plt
import argparse
from scripts.DataProcess import TEPDataset,dropInvalid
from torch.utils.data import DataLoader
from scripts.model import TEPDA
from sklearn.model_selection import GroupShuffleSplit
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score,precision_score,f1_score,matthews_corrcoef,roc_curve,auc


batch_size = 128


def Model_retraining(trainfile_path, testfile_path, save_model_path, result_path):
    """Train a fresh TEPDA model, save its weights and score the test set.

    The model is trained for 30 epochs with AdamW (lr 5e-4, weight decay 1e-4),
    a MultiStepLR schedule (x0.25 at epochs 10 and 20) and cross-entropy loss.
    The state dict is saved to ``save_model_path + '.pth'``. Test predictions
    are written to ``result_path + 'probability.csv'`` with the columns
    ``CDR3B``, ``Epitope``, ``y_true``, ``y_prob`` (softmax probability of
    class 1) and ``y_pred`` (``y_prob >= 0.5``).

    Args:
        trainfile_path: CSV with ``CDR3B``, ``Epitope`` and ``Affinity`` columns.
        testfile_path: CSV with the same columns, used for evaluation only.
        save_model_path: Checkpoint path prefix; ``'.pth'`` is appended.
        result_path: Output path prefix; ``'probability.csv'`` is appended.

    Raises:
        ValueError: If the training set yields no full batch of ``batch_size``.
    """
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    if torch.cuda.is_available():
        print('The code runs on GPU0.')

    save_model_file = save_model_path + ".pth"
    result_file = result_path + 'probability.csv'
    # Create the output directories up front so a long training run cannot be
    # lost to a missing folder at save time.
    for target in (save_model_file, result_file):
        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)

    train = pd.read_csv(trainfile_path)
    test = pd.read_csv(testfile_path)
    x_train = train[['CDR3B', 'Epitope']]
    x_test = test[['CDR3B', 'Epitope']]
    y_train = train['Affinity']
    y_test = test['Affinity']
    dataset_train = TEPDataset(x_train, y_train, align=True)
    train_loader = DataLoader(dataset_train, batch_size=batch_size, shuffle=True, drop_last=True)
    dataset_test = TEPDataset(x_test, y_test, align=True)
    print(dataset_test)
    test_loader = DataLoader(dataset_test, batch_size=batch_size, shuffle=False)

    # drop_last=True discards a trailing partial batch, so a training set
    # smaller than one batch would produce no batches at all.
    if len(train_loader) == 0:
        raise ValueError(
            f"Training set must contain at least {batch_size} samples "
            "(one full batch)."
        )

    model = TEPDA(batch_size=batch_size, d_model=32, modelseed=1000, n_heads=6)
    model.to(device)

    optimizer = torch.optim.AdamW(params=model.parameters(), lr=0.0005, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[10, 20], gamma=0.25)
    loss_fn = nn.CrossEntropyLoss()
    loss_fn.to(device)

    epoch = 30
    # Use a named epoch counter: the batch unpacking below rebinds `_`, so `_`
    # cannot double as the epoch index.
    for epoch_idx in range(epoch):
        total_loss = 0
        batches = 0
        model.train()
        for tcrs, peptides, labels, _, _ in tqdm(train_loader):
            tcrs, peptides = tcrs.to(device), peptides.to(device)
            _, _, _, _, _, output = model(tcrs, peptides)
            labels = labels.long().to(device)
            optimizer.zero_grad()
            loss = loss_fn(output, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            batches += 1
        train_loss = round(total_loss / batches, 4)
        print(f"Loss for Epoch{epoch_idx + 1}: {train_loss}")
        scheduler.step()

    torch.save(model.state_dict(), save_model_file)

    model.eval()
    all_predictions = []
    all_tcrs = []
    all_peptides = []
    all_labels = []
    with torch.no_grad():
        for tcrs, peptides, labels, _, _ in tqdm(test_loader):
            tcrs, peptides = tcrs.to(device), peptides.to(device)
            _, _, _, _, _, outputs = model(tcrs, peptides)
            # Probability assigned to class 1.
            probabilities = torch.softmax(outputs, dim=1)[:, 1]
            all_predictions.extend(probabilities.cpu().numpy())
            all_tcrs.extend(tcrs.cpu().numpy())
            all_peptides.extend(peptides.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Index -> symbol table used to turn encoded sequences back into text.
    # NOTE(review): assumes this ordering matches the encoding used by
    # TEPDataset -- confirm against scripts.DataProcess.
    AA_List = "RHKDESTNQCUGPAVILMFYW" + "-" + "." + "?" + "|" + "*"
    idx_to_aa = {i: aa for i, aa in enumerate(AA_List)}

    def decode_sequence(encoded_sequence):
        """Map indices back to symbols, skipping unknown indices and '-'."""
        symbols = (idx_to_aa[idx] for idx in encoded_sequence if idx in idx_to_aa)
        return ''.join(symbols).replace("-", "")

    predictions_df = pd.DataFrame({
        'CDR3B': [decode_sequence(seq) for seq in all_tcrs],
        'Epitope': [decode_sequence(seq) for seq in all_peptides],
        'y_true': all_labels,
        'y_prob': all_predictions,
    })
    predictions_df['y_pred'] = (predictions_df['y_prob'] >= 0.5).astype(int)
    # index=False keeps the file layout identical to the prediction functions.
    predictions_df.to_csv(result_file, index=False)


In [None]:
# Training/test inputs and output prefixes for the retraining run.
trainfile_path = "../data/train.csv"
testfile_path = "../data/test.csv"
# Model_retraining appends ".pth" to this prefix when saving the weights.
save_model_path = "../Retraining_model/Retraining_model.pickle"
result_path = "../result_path/Retraining_model_prediction"

Model_retraining(
    trainfile_path=trainfile_path,
    testfile_path=testfile_path,
    save_model_path=save_model_path,
    result_path=result_path,
)

# 3. Retrained model prediction

In [3]:
def Retraining_model_prediction(testfile_path, modelfile_path, result_path):
    """Score a labelled data set with a retrained TEPDA checkpoint.

    Reads the CSV, runs the model in evaluation mode and writes one row per
    TCR-epitope pair to ``result_path + 'probability.csv'`` with the columns
    ``CDR3B``, ``Epitope``, ``y_true``, ``y_prob`` (softmax probability of
    class 1) and ``y_pred`` (``y_prob >= 0.5``).

    Args:
        testfile_path: CSV with ``CDR3B``, ``Epitope`` and ``Affinity`` columns.
        modelfile_path: State-dict checkpoint matching
            ``TEPDA(batch_size=128, d_model=32, modelseed=1000, n_heads=6)``.
        result_path: Output path prefix; ``'probability.csv'`` is appended
            directly, with no separator.
    """
    import torch
    from scripts.DataProcess import TEPDataset
    from scripts.model import TEPDA
    from torch.utils.data import DataLoader
    import pandas as pd
    from tqdm import tqdm

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    test = pd.read_csv(testfile_path)
    x_test = test[['CDR3B', 'Epitope']]
    y_test = test['Affinity']
    dataset_test = TEPDataset(x_test, y_test, align=True)
    test_loader = DataLoader(dataset_test, batch_size=128, shuffle=False)

    model = TEPDA(batch_size=128, d_model=32, modelseed=1000, n_heads=6)
    # map_location remaps GPU-saved tensors onto the selected device, so the
    # checkpoint also loads on a CPU-only machine.
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    state_dict = torch.load(modelfile_path, map_location=device)
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()

    all_predictions = []
    all_labels = []
    all_tcrs = []
    all_peptides = []
    with torch.no_grad():
        for tcrs, peptides, labels, _, _ in tqdm(test_loader):
            tcrs, peptides = tcrs.to(device), peptides.to(device)
            # Only the last of the model's six outputs (the logits) is needed.
            _, _, _, _, _, outputs = model(tcrs, peptides)
            probabilities = torch.softmax(outputs, dim=1)[:, 1]
            all_predictions.extend(probabilities.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            all_tcrs.extend(tcrs.cpu().numpy())
            all_peptides.extend(peptides.cpu().numpy())

    # Index -> symbol table used to turn encoded sequences back into text.
    # NOTE(review): assumes this ordering matches the encoding used by
    # TEPDataset -- confirm against scripts.DataProcess.
    AA_List = "RHKDESTNQCUGPAVILMFYW" + "-" + "." + "?" + "|" + "*"
    idx_to_aa = {i: aa for i, aa in enumerate(AA_List)}

    def decode_sequence(encoded_sequence):
        """Map indices back to symbols, skipping unknown indices and '-'."""
        symbols = (idx_to_aa[idx] for idx in encoded_sequence if idx in idx_to_aa)
        return ''.join(symbols).replace("-", "")

    predictions_df = pd.DataFrame({
        'CDR3B': [decode_sequence(seq) for seq in all_tcrs],
        'Epitope': [decode_sequence(seq) for seq in all_peptides],
        'y_true': all_labels,
        'y_prob': all_predictions,
    })
    predictions_df['y_pred'] = (predictions_df['y_prob'] >= 0.5).astype(int)
    predictions_df.to_csv(result_path + 'probability.csv', index=False)

In [None]:
# Score the validation set with the weights produced by Model_retraining.
testfile_path = "../data/validation.csv"
# Model_retraining saves to save_model_path + ".pth", so the checkpoint on
# disk is "Retraining_model.pickle.pth", not "Retraining_model.pickle".
modelfile_path = "../Retraining_model/Retraining_model.pickle.pth"
# NOTE(review): this is the same prefix the retraining cell uses, so the
# test-set predictions written there are overwritten -- confirm intended.
result_path = "../result_path/Retraining_model_prediction"
Retraining_model_prediction(testfile_path, modelfile_path, result_path)
