# 1. Original model prediction

In [1]:
import tcrgp
import pickle
import ast
import csv
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
plt.style.use('ggplot')

Before this model can be used for prediction or training,
the dataset must be reformatted as follows:

In [2]:
def fix(datafile,resultpath):
    """Split a dataset into one reformatted CSV file per epitope.

    The CSV at ``datafile`` must provide the columns ``Epitope``, ``CDR3B``
    and ``Affinity``. For every distinct ``Epitope`` value the matching rows
    are shuffled (fixed seed 42), reduced to the columns
    ``epitope, Affinity, TRBV, TRBJ, cdr3a, cdr3b`` and written to
    ``resultpath + <epitope> + '.csv'`` (the row index is written as well).

    ``TRBV`` and ``TRBJ`` are filled with empty strings and ``cdr3a`` with the
    string ``'NaN'``. Rows whose ``Affinity`` is 0 get ``'none'`` as their
    epitope label.

    Parameters
    ----------
    datafile : str
        Path of the input CSV file.
    resultpath : str
        Prefix of the output files; no path separator is inserted between the
        prefix and the epitope name.
    """
    source = pd.read_csv(datafile)
    column_order = ['Epitope', 'Affinity', 'TRBV', 'TRBJ', 'cdr3a', 'CDR3B']
    for name in pd.unique(source['Epitope']):
        # Shuffle with a fixed seed so repeated runs produce identical files.
        shuffled = source.loc[source['Epitope'] == name].sample(frac=1, random_state=42)
        formatted = (
            shuffled
            .assign(TRBV='', TRBJ='', cdr3a='NaN')[column_order]
            .rename(columns={'Epitope': 'epitope', 'CDR3B': 'cdr3b'})
        )
        # Rows with Affinity 0 are relabelled with the placeholder 'none'.
        formatted.loc[formatted['Affinity'] == 0, 'epitope'] = 'none'
        formatted.to_csv(resultpath + name + '.csv')
        
# Convert the raw test split into per-epitope input files.
# Output files are named resultpath + <epitope> + '.csv' (no separator is inserted).
datafile="../data/test.csv"
resultpath="../data/TCRGP_test"
fix(datafile,resultpath)

In [5]:
def Original_model_prediction(index_path,testfile_path,modelfile_path,result_path,file_suffix='.csv'):
    """Predict every epitope listed in the index file with its pre-trained model.

    For each distinct value ``e`` of the ``Epitope`` column of ``index_path``
    the pickled parameters ``modelfile_path + e + '_tcb'`` are loaded and
    ``tcrgp.predict`` is run on ``testfile_path + e + file_suffix``. The
    per-epitope results are concatenated and written to
    ``result_path + 'probability.csv'`` (without the index).

    Output columns: ``epitope``, ``cdr3b``, ``y_true`` (the ``Affinity``
    column of the test file), ``y_prob`` (value returned by ``tcrgp.predict``),
    ``y_pred`` (1 when ``y_prob >= 0.5`` else 0) and ``Epitope`` (``epitope``
    with the placeholder ``'none'`` replaced by the current epitope).

    Parameters
    ----------
    index_path : str
        CSV file with an ``Epitope`` column naming the epitopes to predict.
    testfile_path : str
        Prefix of the per-epitope test files.
    modelfile_path : str
        Prefix of the pickled per-epitope model files.
    result_path : str
        Prefix of the output file.
    file_suffix : str, optional
        Suffix appended after the epitope name to build the test file name.
        Defaults to ``'.csv'``, the naming used by ``fix``; pass
        ``'_Validation.csv'`` to read files that follow the older naming.
    """
    index = pd.read_csv(index_path)
    per_epitope = []
    for i in pd.unique(index['Epitope']):
        # NOTE: pickle.load can execute arbitrary code; only load trusted model files.
        with open(modelfile_path+i+"_tcb",'rb') as f:
            params = pickle.load(f)
        test_csv = testfile_path + i + file_suffix
        preds = tcrgp.predict(test_csv, params)
        data = pd.read_csv(test_csv)
        probability = data[['epitope', 'cdr3b', 'Affinity']].rename(columns={'Affinity': 'y_true'})
        probability['y_prob'] = preds
        probability['y_pred'] = (probability['y_prob'] >= 0.5).astype(int)
        # Literal (non-regex) replacement of the negative-sample placeholder.
        probability['Epitope'] = probability['epitope'].str.replace('none', i, regex=False)
        per_epitope.append(probability)
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    all_results = pd.concat(per_epitope) if per_epitope else pd.DataFrame()
    all_results.to_csv(result_path+'probability.csv', index=False)

When predicting with the original model, make sure that a pre-trained model exists for every epitope to be predicted.

In [None]:
# Run the pre-trained models on the per-epitope test files.
index_path="../data/test.csv"
testfile_path="../data/TCRGP_test"
modelfile_path="../Original_model/model_vdj_"
# Give the original-model results their own output prefix so they are not
# written to (and later overwritten in) the retrained-model result file.
result_path="../result_path/Original_model_prediction"
Original_model_prediction(index_path,testfile_path,modelfile_path,result_path)

# 2. Model retraining

In [7]:
import pandas as pd
import pickle
import tcrgp

def Model_retraining(index_path,trainfile_path, testfile_path,save_model_path,esult_path):
    """Train one classifier per epitope, save it, and score it on the test files.

    For each distinct value ``e`` of the ``Epitope`` column of ``index_path``:

    1. ``tcrgp.train_classifier`` is run on ``trainfile_path + e + '.csv'``;
    2. the returned parameters are pickled to ``save_model_path + e``;
    3. ``tcrgp.predict`` is run on ``testfile_path + e + '.csv'``.

    All per-epitope predictions are concatenated and written to
    ``esult_path + 'probability.csv'`` (without the index).

    Output columns: ``epitope`` (placeholder ``'none'`` replaced by the current
    epitope), ``cdr3b``, ``y_true`` (the ``Affinity`` column of the test file),
    ``y_prob`` (value returned by ``tcrgp.predict``) and ``y_pred`` (1 when
    ``y_prob >= 0.5`` else 0).

    Parameters
    ----------
    index_path : str
        CSV file with an ``Epitope`` column naming the epitopes to train.
    trainfile_path : str
        Prefix of the per-epitope training files.
    testfile_path : str
        Prefix of the per-epitope test files.
    save_model_path : str
        Prefix of the pickled model files that are written.
    esult_path : str
        Prefix of the output file. The misspelled name is kept so existing
        callers keep working; the value is the result path.
    """
    subsmat = tcrgp.subsmatFromAA2('HENS920102')
    pc_blo = tcrgp.get_pcs(subsmat, d=21)
    index = pd.read_csv(index_path)
    per_epitope = []
    for i in pd.unique(index['Epitope']):
        # The first value returned by training (bound as AUC originally) is not used.
        _auc, params = tcrgp.train_classifier(trainfile_path + i+'.csv' , 'human', i, pc_blo,
                                              cdr_types=[[], ['cdr3']], m_iters=20, lr=0.005, nZ=0, mbs=0, lmax3=18,
                                              va=None, vb=None, cdr3a=None, cdr3b='cdr3b', epis='epitope')
        with open(save_model_path + i, 'wb') as f:
            pickle.dump(params, f)
        test_csv = testfile_path + i + '.csv'
        preds = tcrgp.predict(test_csv, params)
        data = pd.read_csv(test_csv)
        probability = data[['epitope', 'cdr3b', 'Affinity']].rename(columns={'Affinity': 'y_true'})
        probability['y_prob'] = preds
        probability['y_pred'] = (probability['y_prob'] >= 0.5).astype(int)
        # Literal (non-regex) replacement of the negative-sample placeholder.
        probability['epitope'] = probability['epitope'].str.replace('none', i, regex=False)
        per_epitope.append(probability)
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    all_results = pd.concat(per_epitope) if per_epitope else pd.DataFrame()
    # Write to the path passed by the caller, not to a notebook-level global.
    all_results.to_csv(esult_path+'probability.csv', index=False)

In [8]:
# Convert the raw training split into per-epitope input files
# (written to resultpath + <epitope> + '.csv').
# NOTE: trainfile_path holds the raw CSV path here; a later cell rebinds the
# same name to the per-epitope file prefix.
trainfile_path ="../data/train.csv"
resultpath="../data/TCRGP_train"
fix(trainfile_path,resultpath)

`index_path` is the path of the original, unmodified training dataset,
while `trainfile_path` is the path prefix of the per-epitope files produced by `fix` for this model.

In [None]:
# Retrain one classifier per epitope found in train.csv and score each on its test file.
index_path="../data/train.csv"
trainfile_path ="../data/TCRGP_train"
testfile_path="../data/TCRGP_test"
save_modle_path="../Retraining_model/Retraining_model"
# NOTE(review): this is the same output prefix as the section-3 prediction call,
# so that call overwrites the file written here — confirm this is intended.
result_path="../result_path/Retraining_model_prediction"
Model_retraining(index_path,trainfile_path,testfile_path,save_modle_path,result_path) 

# 3. Retraining model prediction

In [5]:
def Retraining_model_prediction(index_path,testfile_path,modelfile_path,result_path):
    """Predict every epitope listed in the index file with its retrained model.

    For each distinct value ``e`` of the ``Epitope`` column of ``index_path``
    the pickled parameters ``modelfile_path + e`` are loaded and
    ``tcrgp.predict`` is run on ``testfile_path + e + '.csv'``. The
    per-epitope results are concatenated and written to
    ``result_path + 'probability.csv'`` (without the index).

    Output columns: ``epitope``, ``cdr3b``, ``y_true`` (the ``Affinity``
    column of the test file), ``y_prob`` (value returned by ``tcrgp.predict``),
    ``y_pred`` (1 when ``y_prob >= 0.5`` else 0) and ``Epitope`` (``epitope``
    with the placeholder ``'none'`` replaced by the current epitope).

    Parameters
    ----------
    index_path : str
        CSV file with an ``Epitope`` column naming the epitopes to predict.
    testfile_path : str
        Prefix of the per-epitope test files.
    modelfile_path : str
        Prefix of the pickled per-epitope model files.
    result_path : str
        Prefix of the output file.
    """
    index = pd.read_csv(index_path)
    per_epitope = []
    for i in pd.unique(index['Epitope']):
        # NOTE: pickle.load can execute arbitrary code; only load trusted model files.
        with open(modelfile_path+i,'rb') as f:
            params = pickle.load(f)
        test_csv = testfile_path + i + '.csv'
        preds = tcrgp.predict(test_csv, params)
        data = pd.read_csv(test_csv)
        probability = data[['epitope', 'cdr3b', 'Affinity']].rename(columns={'Affinity': 'y_true'})
        probability['y_prob'] = preds
        probability['y_pred'] = (probability['y_prob'] >= 0.5).astype(int)
        # Literal (non-regex) replacement of the negative-sample placeholder.
        probability['Epitope'] = probability['epitope'].str.replace('none', i, regex=False)
        per_epitope.append(probability)
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    all_results = pd.concat(per_epitope) if per_epitope else pd.DataFrame()
    all_results.to_csv(result_path+'probability.csv', index=False)

In [None]:
# Convert the raw validation split into per-epitope input files
# (written to resultpath + <epitope> + '.csv').
# NOTE: testfile_path holds the raw CSV path here; the next cell rebinds the
# same name to the per-epitope file prefix.
testfile_path ="../data/Validation.csv"
resultpath="../data/TCRGP_Validation"
fix(testfile_path,resultpath)

In [None]:
# Score the per-epitope validation files with the retrained models.
# NOTE(review): the epitope list is read from no_fix_Validation.csv, whereas the
# per-epitope files above were generated from Validation.csv — confirm that both
# files list the same epitopes.
index_path="../data/no_fix_Validation.csv"
testfile_path="../data/TCRGP_Validation"
modelfile_path="../Retraining_model/Retraining_model"
result_path="../result_path/Retraining_model_prediction"
Retraining_model_prediction(index_path,testfile_path,modelfile_path,result_path)
