# 1. Original model prediction

In [5]:
import sys
import os
sys.path.append('./vittcr/orig/code')
from TcrPepTransform_utils import DropPath, trunc_normal_, lecun_normal_
from TcrPepTransform_beta import *
from utils_train_new import *
import torch
import datetime
import random
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
from torch.utils.data import DataLoader, Dataset
import io
import joblib
import pickle
from collections import defaultdict

# Set random seed
def setup_seed(seed):
    """Seed Python, NumPy and PyTorch RNGs and switch cuDNN to deterministic mode.

    Args:
        seed: Integer seed handed to every random number generator below.
    """
    # Same seeding order as before: stdlib, NumPy, torch CPU, current CUDA
    # device, then all CUDA devices.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Ask cuDNN for deterministic kernels and turn off its auto-tuner.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


# Optimized function to process the input file and return a dictionary
def process_file_return_dict(inpath, length_cdr3=20, length_pep=15, chain='beta'):
    """Encode every CDR3/peptide pair of a CSV file as Atchley-factor interaction maps.

    The CSV is read with ``pandas.read_csv``. The column names ``CDR3B``,
    ``CDR3A``, ``Epitope`` and ``Affinity`` are accepted as aliases of
    ``cdr3b``, ``cdr3a``, ``peptide`` and ``Binding``. Sequences are
    upper-cased, truncated to ``length_cdr3`` / ``length_pep`` residues and
    zero-padded up to those lengths; residues without an entry in the Atchley
    table are encoded as zeros as well. For each of the five factors the map
    stores ``|cdr3_factor[i] - peptide_factor[j]|``.

    Args:
        inpath: CSV file to read (passed straight to ``pandas.read_csv``).
        length_cdr3: Number of CDR3 positions (rows) in each map.
        length_pep: Number of peptide positions (columns) in each map.
        chain: ``'beta'`` to read the ``cdr3b`` column, ``'alpha'`` to read
            ``cdr3a``.

    Returns:
        A ``defaultdict(list)`` with the keys ``'combined_map_<chain>'``
        (nested lists shaped ``[n_rows][5][length_cdr3][length_pep]``),
        ``'cdr3'``, ``'peptide'`` (upper-cased sequences) and ``'Binding'``.

    Raises:
        ValueError: If ``chain`` is neither ``'alpha'`` nor ``'beta'``.
        KeyError: If a required column is missing from the file.
    """
    # Reject unknown chains up front; previously this surfaced much later as
    # an UnboundLocalError.
    if chain not in ('alpha', 'beta'):
        raise ValueError("Invalid chain type. Must be 'alpha' or 'beta'.")

    Atchley = {
        'A': np.array((-0.591, -1.302, -0.733, 1.57, -0.146)),
        'C': np.array((-1.343, 0.465, -0.862, -1.02, -0.255)),
        'D': np.array((1.05, 0.302, -3.656, -0.259, -3.242)),
        'E': np.array((1.357, -1.453, 1.477, 0.113, -0.837)),
        'F': np.array((-1.006, -0.59, 1.891, -0.397, 0.412)),
        'G': np.array((-0.384, 1.652, 1.33, 1.045, 2.064)),
        'H': np.array((0.336, -0.417, -1.673, -1.474, -0.078)),
        'I': np.array((-1.239, -0.547, 2.131, 0.393, 0.816)),
        'K': np.array((1.831, -0.561, 0.533, -0.277, 1.648)),
        'L': np.array((-1.019, -0.987, -1.505, 1.266, -0.912)),
        'M': np.array((-0.663, -1.524, 2.219, -1.005, 1.212)),
        'N': np.array((0.945, 0.828, 1.299, -0.169, 0.933)),
        'P': np.array((0.189, 2.081, -1.628, 0.421, -1.392)),
        'Q': np.array((0.931, -0.179, -3.005, -0.503, -1.853)),
        'R': np.array((1.538, -0.055, 1.502, 0.44, 2.897)),
        'S': np.array((-0.228, 1.399, -4.76, 0.67, -2.647)),
        'T': np.array((-0.032, 0.326, 2.213, 0.908, 1.313)),
        'V': np.array((-1.337, -0.279, -0.544, 1.242, -1.262)),
        'W': np.array((-0.595, 0.009, 0.672, -2.128, -0.184)),
        'Y': np.array((0.26, 0.83, 3.097, -0.838, 1.512))
    }
    n_factors = len(next(iter(Atchley.values())))

    def encode(sequences, max_len):
        """Return a zero-padded (n_sequences, max_len, n_factors) Atchley encoding."""
        encoded = np.zeros((len(sequences), max_len, n_factors))
        for row, sequence in enumerate(sequences):
            for col, residue in enumerate(sequence[:max_len]):
                # Plain dict lookup; unknown residues keep their zero row.
                factors = Atchley.get(residue)
                if factors is not None:
                    encoded[row, col, :] = factors
        return encoded

    data = pd.read_csv(inpath)
    data = data.rename(columns={
        'CDR3B': 'cdr3b',
        'CDR3A': 'cdr3a',
        'Epitope': 'peptide',
        'Affinity': 'Binding',
    })

    # Pick the CDR3 column that matches the requested chain (the alpha chain
    # used to be unreachable because only 'cdr3b' was ever selected).
    cdr3_column = 'cdr3b' if chain == 'beta' else 'cdr3a'
    required = [cdr3_column, 'peptide', 'Binding']
    missing = [column for column in required if column not in data.columns]
    if missing:
        raise KeyError(
            f"Input file {inpath!r} is missing required column(s): {missing}")

    cdr3s = data[cdr3_column].str.upper().tolist()
    peptides = data['peptide'].str.upper().tolist()

    cdr3_encoded = encode(cdr3s, length_cdr3)
    peptide_encoded = encode(peptides, length_pep)

    # Broadcast to (n, length_cdr3, length_pep, n_factors), then move the
    # factor axis to position 1 so each factor becomes one map channel.
    pairwise = np.abs(
        cdr3_encoded[:, :, None, :] - peptide_encoded[:, None, :, :])
    combinedmap = np.moveaxis(pairwise, -1, 1)

    dict_combined = defaultdict(list)
    dict_combined[f'combined_map_{chain}'] = combinedmap.tolist()
    dict_combined['cdr3'] = cdr3s
    dict_combined['peptide'] = peptides
    dict_combined['Binding'] = data['Binding'].tolist()
    return dict_combined


In [6]:
def Original_model_prediction(testfile, save_model_path, resultfile_path, chain='beta'):
    """Evaluate a saved checkpoint on a labelled test file and write the result CSV.

    Builds a ``TcrPepTransform_single`` model with the fixed hyper-parameters
    below, loads the weights stored at ``save_model_path``, runs
    ``eval_model`` on the whole test set as a single batch and writes the
    returned table to ``resultfile_path + '_probability.csv'``.

    Args:
        testfile: CSV file handed to ``process_file_return_dict``.
        save_model_path: File containing the model ``state_dict``.
        resultfile_path: Output path prefix; ``'_probability.csv'`` is appended.
        chain: ``'beta'`` or ``'alpha'``; selects which interaction map is used.

    Raises:
        ValueError: If ``chain`` is neither ``'alpha'`` nor ``'beta'``.
    """
    # Validate first: before this check moved here, an invalid chain crashed
    # inside process_file_return_dict and the ValueError was never reached.
    if chain not in ('alpha', 'beta'):
        raise ValueError("Invalid chain type. Must be 'alpha' or 'beta'.")

    setup_seed(1234)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    model = TcrPepTransform_single(
        input_height=20,
        input_width=15,
        in_chans=5,
        patch_size=4,
        num_classes=2,
        embed_dim=256,
        depth=1,
        num_heads=4,
        mlp_ratio=4,
        qkv_bias=True,
        drop_rate=0.1,
        attn_drop_rate=0.05,
        drop_path_rate=0,
        act_layer=torch.nn.GELU)

    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)
    model.to(device)
    # These attributes are attached to the model object; presumably
    # eval_model reads them from there - confirm in utils_train_new.
    model.loss_func = torch.nn.CrossEntropyLoss()
    model.acc_func = metrics.accuracy_score
    model.auc_roc_func = metrics.roc_auc_score
    model.auc_pr_func = pr_auc_score
    model.f1_score_func = metrics.f1_score
    model.optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    test_dicts = process_file_return_dict(testfile, chain=chain)
    label_valid = np.array(test_dicts['Binding'])
    label_valid_initialized = Labels_Initialization(num_classes=2, labels=label_valid)
    data_valid = torch.from_numpy(np.array(test_dicts[f'combined_map_{chain}']))
    raw_cdr3_test = test_dicts['cdr3']
    raw_epitope_test = test_dicts['peptide']

    dataset_valid = MyDataset(data=data_valid, labl=label_valid_initialized, raw_cdr3=raw_cdr3_test, raw_epitope=raw_epitope_test)
    # The whole test set is evaluated as one batch.
    dl_valid = DataLoader(dataset_valid, batch_size=len(data_valid), shuffle=False, num_workers=5)

    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only host.
    state_dict = torch.load(save_model_path, map_location=device)
    # DataParallel prefixes every key with 'module.'. Reconcile the checkpoint
    # with the current wrapping so single- and multi-GPU checkpoints both load.
    wrapped = isinstance(model, torch.nn.DataParallel)
    has_prefix = bool(state_dict) and all(
        key.startswith('module.') for key in state_dict)
    if has_prefix and not wrapped:
        state_dict = {key[len('module.'):]: value for key, value in state_dict.items()}
    elif wrapped and not has_prefix:
        state_dict = {f'module.{key}': value for key, value in state_dict.items()}
    model.load_state_dict(state_dict)
    model.eval()

    dfhistory = eval_model(
        model_initial=model,
        dl_valid=dl_valid,
        device=device)

    output_file = resultfile_path + '_probability.csv'
    # Create the target directory so the write cannot fail on a missing folder.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    dfhistory.to_csv(output_file, header=True, index=False)


In [None]:
# Run the prediction cell: evaluate the checkpoint under ../Original_model
# on ../data/test.csv and write the results below ../result_path.
testfile_path = "../data/test.csv"
modelfile_path = "../Original_model/VitTCR.pt"
result_path = "../result_path/Original_model_prediction"
Original_model_prediction(
    testfile=testfile_path,
    save_model_path=modelfile_path,
    resultfile_path=result_path,
)

# 2. Model retraining

In [8]:
import sys
import os
sys.path.append('/home/bingxing2/home/scx6666/zhengli/VitTCR/vittcr/orig/code')
from TcrPepTransform_utils import DropPath, trunc_normal_, lecun_normal_
from TcrPepTransform_beta import *
from utils_train_new import *
import torch
import datetime
import random
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
from torch.utils.data import DataLoader, Dataset
import io
import joblib
import pickle
from collections import defaultdict

# Set random seed
def setup_seed(seed):
    """Make runs repeatable by seeding every RNG in use and pinning cuDNN behaviour.

    Args:
        seed: Integer seed applied to the stdlib, NumPy and PyTorch generators.
    """
    # Order is unchanged: stdlib random, NumPy, torch CPU, current CUDA
    # device, all CUDA devices.
    for seed_fn in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_fn(seed)

    # Deterministic cuDNN kernels, auto-tuner disabled.
    cudnn_backend = torch.backends.cudnn
    cudnn_backend.deterministic = True
    cudnn_backend.benchmark = False


# Optimized function to process the input file and return a dictionary
def process_file_return_dict(inpath, length_cdr3=20, length_pep=15, chain='beta'):
    """Encode every CDR3/peptide pair of a CSV file as Atchley-factor interaction maps.

    The CSV is read with ``pandas.read_csv``. The column names ``CDR3B``,
    ``CDR3A``, ``Epitope`` and ``Affinity`` are accepted as aliases of
    ``cdr3b``, ``cdr3a``, ``peptide`` and ``Binding``. Sequences are
    upper-cased, truncated to ``length_cdr3`` / ``length_pep`` residues and
    zero-padded up to those lengths; residues without an entry in the Atchley
    table are encoded as zeros as well. For each of the five factors the map
    stores ``|cdr3_factor[i] - peptide_factor[j]|``.

    Args:
        inpath: CSV file to read (passed straight to ``pandas.read_csv``).
        length_cdr3: Number of CDR3 positions (rows) in each map.
        length_pep: Number of peptide positions (columns) in each map.
        chain: ``'beta'`` to read the ``cdr3b`` column, ``'alpha'`` to read
            ``cdr3a``.

    Returns:
        A ``defaultdict(list)`` with the keys ``'combined_map_<chain>'``
        (nested lists shaped ``[n_rows][5][length_cdr3][length_pep]``),
        ``'cdr3'``, ``'peptide'`` (upper-cased sequences) and ``'Binding'``.

    Raises:
        ValueError: If ``chain`` is neither ``'alpha'`` nor ``'beta'``.
        KeyError: If a required column is missing from the file.
    """
    # Reject unknown chains up front; previously this surfaced much later as
    # an UnboundLocalError.
    if chain not in ('alpha', 'beta'):
        raise ValueError("Invalid chain type. Must be 'alpha' or 'beta'.")

    Atchley = {
        'A': np.array((-0.591, -1.302, -0.733, 1.57, -0.146)),
        'C': np.array((-1.343, 0.465, -0.862, -1.02, -0.255)),
        'D': np.array((1.05, 0.302, -3.656, -0.259, -3.242)),
        'E': np.array((1.357, -1.453, 1.477, 0.113, -0.837)),
        'F': np.array((-1.006, -0.59, 1.891, -0.397, 0.412)),
        'G': np.array((-0.384, 1.652, 1.33, 1.045, 2.064)),
        'H': np.array((0.336, -0.417, -1.673, -1.474, -0.078)),
        'I': np.array((-1.239, -0.547, 2.131, 0.393, 0.816)),
        'K': np.array((1.831, -0.561, 0.533, -0.277, 1.648)),
        'L': np.array((-1.019, -0.987, -1.505, 1.266, -0.912)),
        'M': np.array((-0.663, -1.524, 2.219, -1.005, 1.212)),
        'N': np.array((0.945, 0.828, 1.299, -0.169, 0.933)),
        'P': np.array((0.189, 2.081, -1.628, 0.421, -1.392)),
        'Q': np.array((0.931, -0.179, -3.005, -0.503, -1.853)),
        'R': np.array((1.538, -0.055, 1.502, 0.44, 2.897)),
        'S': np.array((-0.228, 1.399, -4.76, 0.67, -2.647)),
        'T': np.array((-0.032, 0.326, 2.213, 0.908, 1.313)),
        'V': np.array((-1.337, -0.279, -0.544, 1.242, -1.262)),
        'W': np.array((-0.595, 0.009, 0.672, -2.128, -0.184)),
        'Y': np.array((0.26, 0.83, 3.097, -0.838, 1.512))
    }
    n_factors = len(next(iter(Atchley.values())))

    def encode(sequences, max_len):
        """Return a zero-padded (n_sequences, max_len, n_factors) Atchley encoding."""
        encoded = np.zeros((len(sequences), max_len, n_factors))
        for row, sequence in enumerate(sequences):
            for col, residue in enumerate(sequence[:max_len]):
                # Plain dict lookup; unknown residues keep their zero row.
                factors = Atchley.get(residue)
                if factors is not None:
                    encoded[row, col, :] = factors
        return encoded

    data = pd.read_csv(inpath)
    data = data.rename(columns={
        'CDR3B': 'cdr3b',
        'CDR3A': 'cdr3a',
        'Epitope': 'peptide',
        'Affinity': 'Binding',
    })

    # Pick the CDR3 column that matches the requested chain (the alpha chain
    # used to be unreachable because only 'cdr3b' was ever selected).
    cdr3_column = 'cdr3b' if chain == 'beta' else 'cdr3a'
    required = [cdr3_column, 'peptide', 'Binding']
    missing = [column for column in required if column not in data.columns]
    if missing:
        raise KeyError(
            f"Input file {inpath!r} is missing required column(s): {missing}")

    cdr3s = data[cdr3_column].str.upper().tolist()
    peptides = data['peptide'].str.upper().tolist()

    cdr3_encoded = encode(cdr3s, length_cdr3)
    peptide_encoded = encode(peptides, length_pep)

    # Broadcast to (n, length_cdr3, length_pep, n_factors), then move the
    # factor axis to position 1 so each factor becomes one map channel.
    pairwise = np.abs(
        cdr3_encoded[:, :, None, :] - peptide_encoded[:, None, :, :])
    combinedmap = np.moveaxis(pairwise, -1, 1)

    dict_combined = defaultdict(list)
    dict_combined[f'combined_map_{chain}'] = combinedmap.tolist()
    dict_combined['cdr3'] = cdr3s
    dict_combined['peptide'] = peptides
    dict_combined['Binding'] = data['Binding'].tolist()
    return dict_combined


In [10]:
def Model_retraining(trainfile, testfile, save_model_path, resultfile_path, chain='beta'):
    """Train a fresh model, save its weights, then evaluate it on a test file.

    A ``TcrPepTransform_single`` model with the fixed hyper-parameters below
    is trained for 100 epochs on ``trainfile`` (batch size 512, shuffled),
    its ``state_dict`` is written to ``save_model_path``, and the reloaded
    model is run through ``eval_model`` on ``testfile`` as a single batch.
    The returned table is written to ``resultfile_path + 'probability.csv'``.

    Args:
        trainfile: Training CSV handed to ``process_file_return_dict``.
        testfile: Test CSV handed to ``process_file_return_dict``.
        save_model_path: Destination file for the trained ``state_dict``.
        resultfile_path: Output path prefix; ``'probability.csv'`` is appended
            with no separator (unlike ``Original_model_prediction``, which
            appends ``'_probability.csv'``).
        chain: ``'beta'`` or ``'alpha'``; selects which interaction map is used.

    Raises:
        ValueError: If ``chain`` is neither ``'alpha'`` nor ``'beta'``.
    """
    # Validate first: before this check moved here, an invalid chain crashed
    # inside process_file_return_dict and the ValueError was never reached.
    if chain not in ('alpha', 'beta'):
        raise ValueError("Invalid chain type. Must be 'alpha' or 'beta'.")

    # Create both output directories before the long training run, so a
    # missing folder cannot discard the trained weights at save time.
    output_file = resultfile_path + 'probability.csv'
    for target in (save_model_path, output_file):
        target_dir = os.path.dirname(target)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

    setup_seed(1234)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    model = TcrPepTransform_single(
        input_height=20,
        input_width=15,
        in_chans=5,
        patch_size=4,
        num_classes=2,
        embed_dim=256,
        depth=1,
        num_heads=4,
        mlp_ratio=4,
        qkv_bias=True,
        drop_rate=0.1,
        attn_drop_rate=0.05,
        drop_path_rate=0,
        act_layer=torch.nn.GELU
    )

    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)
    model.to(device)
    # These attributes are attached to the model object; presumably
    # train_model / eval_model read them from there - confirm in utils_train_new.
    model.loss_func = torch.nn.CrossEntropyLoss()
    model.acc_func = metrics.accuracy_score
    model.auc_roc_func = metrics.roc_auc_score
    model.auc_pr_func = pr_auc_score
    model.f1_score_func = metrics.f1_score
    model.optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    print("Loading training data...")
    train_dicts = process_file_return_dict(trainfile, chain=chain)
    label_train = np.array(train_dicts['Binding'])
    label_train_initialized = Labels_Initialization(num_classes=2, labels=label_train)
    data_train = torch.from_numpy(np.array(train_dicts[f'combined_map_{chain}']))
    raw_cdr3 = train_dicts['cdr3']
    raw_epitope = train_dicts['peptide']
    dataset_train = MyDataset(data=data_train, labl=label_train_initialized, raw_cdr3=raw_cdr3, raw_epitope=raw_epitope)
    # drop_last=True discards the final incomplete batch, so a training file
    # with fewer than 512 rows yields no batches at all.
    dl_train = DataLoader(dataset_train, batch_size=512, shuffle=True, drop_last=True, num_workers=5)

    num_epochs = 100
    print(f"Starting training for {num_epochs} epochs...")
    train_model(
        model=model,
        dl_train=dl_train,
        device=device,
        num_epochs=num_epochs )
    torch.save(model.state_dict(), save_model_path)

    test_dicts = process_file_return_dict(testfile, chain=chain)
    label_valid = np.array(test_dicts['Binding'])
    label_valid_initialized = Labels_Initialization(num_classes=2, labels=label_valid)
    data_valid = torch.from_numpy(np.array(test_dicts[f'combined_map_{chain}']))
    raw_cdr3_test = test_dicts['cdr3']
    raw_epitope_test = test_dicts['peptide']

    dataset_valid = MyDataset(data=data_valid, labl=label_valid_initialized, raw_cdr3=raw_cdr3_test, raw_epitope=raw_epitope_test)
    # The whole test set is evaluated as one batch.
    dl_valid = DataLoader(dataset_valid, batch_size=len(data_valid), shuffle=False, num_workers=5)

    # Reload the checkpoint that was just written; map_location keeps the
    # tensors on the device this run is using.
    model.load_state_dict(torch.load(save_model_path, map_location=device))
    model.eval()
    dfhistory = eval_model(
        model_initial=model,
        dl_valid=dl_valid,
        device=device)
    dfhistory.to_csv(output_file, header=True, index=False)


In [None]:
# Run the retraining cell: train on ../data/train.csv, store the weights
# under ../Retraining_model and evaluate on ../data/test.csv.
trainfile_path = "../data/train.csv"
testfile_path = "../data/test.csv"
save_model_path = "../Retraining_model/Retraining_model.pickle"
result_path = "../result_path/Retraining_model_prediction"
Model_retraining(
    trainfile=trainfile_path,
    testfile=testfile_path,
    save_model_path=save_model_path,
    resultfile_path=result_path,
)

Using device: cpu
Loading training data...
Starting training for 100 epochs...
Epoch 1: Loss: 0.6935177635401487, AUC-PR: 0.5906857694846539


# 3. Retraining model prediction

In [7]:
def Retraining_model_prediction(testfile, save_model_path, resultfile_path, chain='beta'):
    """Evaluate a retrained checkpoint on a labelled file and write the result CSV.

    Builds a ``TcrPepTransform_single`` model with the fixed hyper-parameters
    below, loads the weights stored at ``save_model_path``, runs
    ``eval_model`` on the whole file as a single batch and writes the
    returned table to ``resultfile_path + 'probability.csv'``.

    Args:
        testfile: CSV file handed to ``process_file_return_dict``.
        save_model_path: File containing the model ``state_dict``.
        resultfile_path: Output path prefix; ``'probability.csv'`` is appended
            with no separator (unlike ``Original_model_prediction``, which
            appends ``'_probability.csv'``).
        chain: ``'beta'`` or ``'alpha'``; selects which interaction map is used.

    Raises:
        ValueError: If ``chain`` is neither ``'alpha'`` nor ``'beta'``.
    """
    # Validate first: before this check moved here, an invalid chain crashed
    # inside process_file_return_dict and the ValueError was never reached.
    if chain not in ('alpha', 'beta'):
        raise ValueError("Invalid chain type. Must be 'alpha' or 'beta'.")

    setup_seed(1234)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    model = TcrPepTransform_single(
        input_height=20,
        input_width=15,
        in_chans=5,
        patch_size=4,
        num_classes=2,
        embed_dim=256,
        depth=1,
        num_heads=4,
        mlp_ratio=4,
        qkv_bias=True,
        drop_rate=0.1,
        attn_drop_rate=0.05,
        drop_path_rate=0,
        act_layer=torch.nn.GELU)

    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)
    model.to(device)
    # These attributes are attached to the model object; presumably
    # eval_model reads them from there - confirm in utils_train_new.
    model.loss_func = torch.nn.CrossEntropyLoss()
    model.acc_func = metrics.accuracy_score
    model.auc_roc_func = metrics.roc_auc_score
    model.auc_pr_func = pr_auc_score
    model.f1_score_func = metrics.f1_score
    model.optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    test_dicts = process_file_return_dict(testfile, chain=chain)
    label_valid = np.array(test_dicts['Binding'])
    label_valid_initialized = Labels_Initialization(num_classes=2, labels=label_valid)
    data_valid = torch.from_numpy(np.array(test_dicts[f'combined_map_{chain}']))
    raw_cdr3_test = test_dicts['cdr3']
    raw_epitope_test = test_dicts['peptide']

    dataset_valid = MyDataset(data=data_valid, labl=label_valid_initialized, raw_cdr3=raw_cdr3_test, raw_epitope=raw_epitope_test)
    # The whole test set is evaluated as one batch.
    dl_valid = DataLoader(dataset_valid, batch_size=len(data_valid), shuffle=False, num_workers=5)

    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only host.
    state_dict = torch.load(save_model_path, map_location=device)
    # Model_retraining saves model.state_dict() as-is, so a checkpoint trained
    # under DataParallel carries a 'module.' prefix on every key. Reconcile it
    # with the current wrapping so either kind of checkpoint loads.
    wrapped = isinstance(model, torch.nn.DataParallel)
    has_prefix = bool(state_dict) and all(
        key.startswith('module.') for key in state_dict)
    if has_prefix and not wrapped:
        state_dict = {key[len('module.'):]: value for key, value in state_dict.items()}
    elif wrapped and not has_prefix:
        state_dict = {f'module.{key}': value for key, value in state_dict.items()}
    model.load_state_dict(state_dict)
    model.eval()

    dfhistory = eval_model(
        model_initial=model,
        dl_valid=dl_valid,
        device=device)

    output_file = resultfile_path + 'probability.csv'
    # Create the target directory so the write cannot fail on a missing folder.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    dfhistory.to_csv(output_file, header=True, index=False)


In [None]:
# Run the prediction cell: evaluate the retrained checkpoint on
# ../data/validation.csv.
# NOTE(review): result_path is identical to the one used by the retraining
# cell and both functions append 'probability.csv', so this run appears to
# overwrite that cell's result file - confirm this is intended.
testfile_path = "../data/validation.csv"
modelfile_path = "../Retraining_model/Retraining_model.pickle"
result_path = "../result_path/Retraining_model_prediction"
Retraining_model_prediction(
    testfile=testfile_path,
    save_model_path=modelfile_path,
    resultfile_path=result_path,
)
