# 1.Original model prediction

In [2]:
import os 
os.chdir("./tcr/")
import numpy as np
import model_utils 
tcrbert_trb_cls = model_utils.load_classification_pipeline("wukevin/tcr-bert", device=0)

In [3]:
import pandas as pd
def Original_model_prediction(data_path,result_path):
    data=pd.read_csv(data_path)
    formatted_strings = [' '.join(list(s)) for s in (data['CDR3B'].tolist())]
    pred = model_utils.reformat_classification_pipeline_preds(tcrbert_trb_cls(formatted_strings))
    unique_epitopes = pd.unique(data['Epitope'])
    df_filtered = pred[unique_epitopes]
    result = pd.concat([data, df_filtered], axis=1)
    result['y_prob'] = result.apply(lambda row: row[row['Epitope']] if row['Epitope'] in result.columns else None, axis=1)
    result=result[['Epitope','CDR3B','antigen_species','Affinity','y_prob']]
    result = result.rename(columns={'Affinity': 'y_true'})
    result['y_pred'] = result['y_prob'].apply(lambda x: 1 if x >= 0.5 else 0)
    result.to_csv(result_path+'probability.csv')

In [None]:
testfile_path="../../data/test.csv"
result_path="../../result_path/Original_model_prediction"
Original_model_prediction(testfile_path,result_path)

In [17]:
testfile_path="../../data/test.csv"
result_path="../../result_path/Original_model_prediction"
Original_model_prediction(testfile_path,result_path)

KeyError: "['LLFNKVTLA', 'TPINLVRDL', 'IQYIDIGNY', 'YLDAYNMMI', 'KLSYGIATV', 'FLNGSCGSV', 'VTIAEILLI', 'HLVDFQVTI', 'TLVPQEHYV', 'KLWAQCVQL', 'SSNVANYQK', 'KLGGALQAK', 'VLFGLGFAI', 'GTHWFVTQR', 'ILHCANFNV', 'YLQPRTFLL', 'VLWAHGFEL', 'CINGVCWTV', 'NLNCCSVPV', 'FLPRVFSAV', 'QYIKWPWYI', 'FLNRFTTTL', 'YFPLQSYGF', 'IPSINVHHY', 'TTAATHREK', 'ILGLPTQTV', 'KAYNVTQAF', 'LPPAYTNSF', 'FIAGLIAIV', 'NQKLIANQF', 'CTELKLSDY', 'YIFFASFYY', 'KEIDRLNEV', 'TLDSKTQSL', 'YLNTLTLAV', 'KLNVGDYFV', 'FVDGVPFVV', 'TLIGDCATV', 'SPRWYFYYL', 'VLAWLYAAV', 'SEETGTLIV', 'GPGHKARVL', 'FPPTSFGPL', 'IVDTVSALV', 'LPAADLDDF', 'GTITSGWTF', 'LVLSVNPYV', 'RQLLFVVEV', 'NLNESLIDL', 'MPASWVMRI', 'IPIQASLPF', 'LLMPILTLT', 'SLVKPSFYV', 'ALSKGVHFV', 'VYGIRLEHF', 'TSNQVAVLY', 'IPRRNVATL', 'EEHVQIHTI', 'LLFGYPVYV', 'VVYRGTTTY', 'SFHSLHLLF', 'WICLLQFAY', 'LEPLVDLPI'] not in index"

# 2.Model retraining

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
import os
import sys
import logging
import argparse
import json
from joblib import dump
from typing import *
import numpy as np
import pandas as pd
import sklearn
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.base import BaseEstimator
from sklearn import metrics
import git 

import sys
sys.path.append("./tcr/")
import featurization as ft
import data_loader as dl
import model_utils
import canonical_models as models
logging.basicConfig(level=logging.INFO)


def get_model(keyword: str, n_components: int) -> BaseEstimator:
    """
    Return a sklearn type model given a keyword
    """
    if keyword == "pcasvm":
        cls = models.ModelOnPCA(
            SVC, n_components=n_components, probability=True, kernel="rbf"
        )
    elif keyword == "svm":
        cls = SVC(probability=True, kernel="rbf", random_state=6489)
    elif keyword == "lr":
        cls = LogisticRegression(penalty="l2", solver="liblinear")
    elif keyword == "gpc":
        cls = GaussianProcessClassifier()
    else:
        raise ValueError(f"Unrecognized classifier: {keyword}")
    logging.info(f"Classifier {cls}")
    return cls


        
def Model_retraining(trainfile_path,testfile_path,save_model_path,result_path):
    import torch
    import pandas as pd
    import numpy as np
    from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, f1_score, matthews_corrcoef, precision_score,auc,roc_curve
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    train = pd.read_csv(trainfile_path)
    test = pd.read_csv(testfile_path)
    epitope = pd.unique(train['Epitope'])
    results_list = []
    probab_list=[]
    for i in epitope:
        df_train = train[train['Epitope'] == i].sample(frac=1, random_state=42)  # Filter the dataframe for current epitope
        train_seqs = df_train['CDR3B'].tolist()
        train_labels = df_train['Affinity'].tolist()
       
        df_test = test[test['Epitope'] == i].sample(frac=1, random_state=42)  # Filter the dataframe for current epitope
        test_seqs= df_test['CDR3B'].tolist()
        test_labels = df_test['Affinity'].tolist()
        transformer="wukevin/tcr-bert"
        layer=-1
        train_embed = model_utils.get_transformer_embeddings(
                model_dir=transformer,
                seqs=train_seqs,
                layers=[layer],
                method="mean",
                device=device,
            )
        test_embed = model_utils.get_transformer_embeddings(
                model_dir=transformer,
                seqs=test_seqs,
                layers=[layer],
                method="mean",
                device=device,
            )
        numpcs=50
        classifier="svm"
        cls = get_model(classifier, numpcs)
        cls.fit(train_embed, train_labels)
        model_path = save_model_path + '_' + i + '_model.pt'
        torch.save(cls, model_path)

        test_preds = cls.predict_proba(test_embed)[:, 1]
        y_pred = test_preds.round()
        test_labels = np.array(test_labels)
        probab= { 'Epitope': [i] * len(df_test['CDR3B']),'CDR3B': df_test['CDR3B'],'y_true': test_labels, 'y_pred': y_pred,'y_prob': test_preds}
        probab_list.append(probab)   
        probability = pd.DataFrame(probab_list)
        probability = pd.DataFrame(probability)
        probability = probability.apply(pd.Series.explode)
        probability.to_csv(result_path+'probability.csv')

        

In [None]:
trainfile_path ="../../data/train.csv"
testfile_path="../../data/test.csv"
save_modle_path="../../Retraining_model/Retraining_model"
result_path="../../result_path/Retraining_model_prediction"
Model_retraining(trainfile_path,testfile_path,save_modle_path,result_path) 

# 3.Retraining_model_prediction

In [7]:
import os
import torch
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, f1_score, matthews_corrcoef, precision_score, auc, roc_curve

def load_model(model_path):
    """
    Load a pre-trained model from the specified path.
    """
    return torch.load(model_path)

def validation_main(testfile_path, modelfile_path, result_path):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    test = pd.read_csv(testfile_path)
    epitope = pd.unique(test['Epitope'])
    results_list = []
    probab_list = []
    for i in epitope:
        model_path = modelfile_path + '_' + i + '_model.pt'
        cls = load_model(model_path)
        df_test = test[test['Epitope'] == i].sample(frac=1, random_state=42)
        test_seqs = df_test['CDR3B'].tolist()
        test_labels = df_test['Affinity'].tolist()
        transformer = "wukevin/tcr-bert"
        layer = -1
        test_embed = model_utils.get_transformer_embeddings(
            model_dir=transformer,
            seqs=test_seqs,
            layers=[layer],
            method="mean",
            device=device,
        )

        test_preds = cls.predict_proba(test_embed)[:, 1]
        y_pred = test_preds.round()
        test_labels = np.array(test_labels)
        probab = {
            'Epitope': [i] * len(df_test['CDR3B']),
            'CDR3B': df_test['CDR3B'],
            'y_true': test_labels,
            'y_pred': y_pred,
            'y_prob': test_preds
        }
        probab_list.append(probab)
    probability = pd.DataFrame(probab_list)
    probability = pd.DataFrame(probability)
    probability = probability.apply(pd.Series.explode)
    probability.to_csv(result_path + 'probability.csv', index=False)

In [None]:
testfile_path="../../data/Validation.csv"
modelfile_path="../../Retraining_model/Retraining_model"
result_path="../../result_path/Retraining_model_prediction"
Retraining_model_prediction(testfile_path,modelfile_path,result_path)