# 1. Original model prediction

In [3]:
import os, sys
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import keras
import time
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Conv1D, GlobalMaxPooling1D, concatenate
from tensorflow.keras.optimizers import Adam
from keras.initializers import glorot_normal
from keras.activations import sigmoid
from sklearn.metrics import roc_auc_score
import utils
import keras.backend as K
from keras.callbacks import EarlyStopping
from nettcr_architectures import nettcr_ab, nettcr_one_chain 
# Options for Pandas DataFrame printing: setting each limit to None removes
# the cap on displayed rows, displayed columns and output width.
pd.set_option(
    'display.max_rows', None,
    'display.max_columns', None,
    'display.width', None,
)
#pd.set_option('display.max_colwidth', -1)
from argparse import ArgumentParser

class Args:
    """Plain holder for the settings of one run.

    ``epochs`` is a class-level default (100) shared by every instance; the
    two file paths and the chain selector are stored per instance and all
    default to ``None``. No validation is performed here.
    """

    epochs = 100

    def __init__(self, trainfile_path=None, testfile_path=None, chain=None):
        # Store the three values exactly as given.
        (self.trainfile_path,
         self.testfile_path,
         self.chain) = (trainfile_path, testfile_path, chain)

def Original_model_prediction(testfile_path, modelfile_path, result_path, chain):
    """Score a test CSV with saved model weights and write the predictions.

    The model is built with ``nettcr_ab()`` for chain ``'ab'`` and with
    ``nettcr_one_chain()`` otherwise, its weights are loaded from
    ``modelfile_path``, and one probability per test row is written to
    ``result_path + 'probability.csv'``.

    Args:
        testfile_path: CSV file read with ``pd.read_csv``. It must contain
            the peptide, CDR3 and label columns of the selected chain (see
            ``chain_columns`` below).
        modelfile_path: Weights file handed to ``load_weights``.
        result_path: Output filename prefix. ``'probability.csv'`` is
            appended directly; no path separator is inserted.
        chain: ``'a'``, ``'b'`` or ``'ab'``.

    Raises:
        ValueError: If ``chain`` is not one of the supported values. This is
            checked before any file is read.
    """
    # Per chain: (peptide column, CDR3 columns in model-input order, label column).
    # NOTE(review): 'binder' is the label column the retraining code in this
    # file uses for chains 'a' and 'ab' -- confirm the test CSV carries it.
    chain_columns = {
        "ab": ("peptide", ("CDR3a", "CDR3b"), "binder"),
        "a": ("peptide", ("CDR3a",), "binder"),
        "b": ("Epitope", ("CDR3B",), "Affinity"),
    }
    if chain not in chain_columns:
        raise ValueError("Invalid chain type. Please choose from 'a', 'b', or 'ab'.")
    pep_col, tcr_cols, label_col = chain_columns[chain]

    encoding = utils.blosum50_20aa
    test_data = pd.read_csv(testfile_path)

    mdl = nettcr_ab() if chain == "ab" else nettcr_one_chain()
    # Model inputs are ordered CDR3 chain(s) first, peptide last.
    test_inputs = [utils.enc_list_bl_max_len(test_data[col], encoding, 30)
                   for col in tcr_cols]
    test_inputs.append(utils.enc_list_bl_max_len(test_data[pep_col], encoding, 9))

    mdl.load_weights(modelfile_path)
    preds = mdl.predict(test_inputs, verbose=0)
    pred_df = pd.concat([test_data, pd.Series(np.ravel(preds), name='y_prob')], axis=1)
    # Hard label: 1 when the probability is strictly above 0.5, else 0.
    pred_df['pred'] = (pred_df['y_prob'] > 0.5).astype(int)

    # Select the columns that exist for this chain; the original always
    # selected the beta-chain names and failed for 'a' and 'ab'.
    result = pred_df[[pep_col, *tcr_cols, label_col, 'pred', 'y_prob']].rename(
        columns={label_col: 'y_true', 'pred': 'y_pred'})
    result.to_csv(result_path + 'probability.csv', index=False)
    

In [4]:
# Run the prediction step with the weights stored in NetTCR.h5, chain 'b'.
testfile_path = "../data/test.csv"
modelfile_path = "../Original_model/NetTCR.h5"
# Used as a filename prefix: 'probability.csv' is appended to it directly.
result_path = "../result_path/Original_model_prediction"
chain = 'b'
Original_model_prediction(
    testfile_path=testfile_path,
    modelfile_path=modelfile_path,
    result_path=result_path,
    chain=chain,
)

# 2. Model retraining

In [6]:
import os, sys
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import keras
import time
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Conv1D, GlobalMaxPooling1D, concatenate
from tensorflow.keras.optimizers import Adam
from keras.initializers import glorot_normal
from keras.activations import sigmoid
from sklearn.metrics import roc_auc_score
import utils
import keras.backend as K
from keras.callbacks import EarlyStopping
from nettcr_architectures import nettcr_ab, nettcr_one_chain 
# Options for Pandas DataFrame printing: setting each limit to None removes
# the cap on displayed rows, displayed columns and output width.
pd.set_option(
    'display.max_rows', None,
    'display.max_columns', None,
    'display.width', None,
)
#pd.set_option('display.max_colwidth', -1)
from argparse import ArgumentParser

class Args:
    """Plain holder for the settings of one training run.

    ``epochs`` is a class-level default (100) shared by every instance; the
    two file paths and the chain selector are stored per instance and all
    default to ``None``. No validation is performed here.
    """

    epochs = 100

    def __init__(self, trainfile_path=None, testfile_path=None, chain=None):
        # Store the three values exactly as given.
        (self.trainfile_path,
         self.testfile_path,
         self.chain) = (trainfile_path, testfile_path, chain)

    
def Model_retraining(trainfile_path, testfile_path,save_model_path,result_path,chain):
    """Train a model on the training CSV, save its weights, and score the test CSV.

    The model is built with ``nettcr_ab()`` for chain ``'ab'`` and with
    ``nettcr_one_chain()`` otherwise. It is compiled with binary
    cross-entropy and Adam (learning rate 0.001) and fitted for up to
    ``Args.epochs`` epochs with batch size 128. Early stopping monitors the
    training loss with a patience of 10 and restores the best weights.

    Args:
        trainfile_path: Training CSV read with ``pd.read_csv``.
        testfile_path: Test CSV read with ``pd.read_csv``.
        save_model_path: Destination handed to ``save_weights``.
        result_path: Output filename prefix. ``'probability.csv'`` is
            appended directly; no path separator is inserted.
        chain: ``'a'``, ``'b'`` or ``'ab'``. Both CSVs must contain the
            peptide, CDR3 and label columns of the selected chain (see
            ``chain_columns`` below).

    Raises:
        ValueError: If ``chain`` is not one of the supported values. This is
            checked before any file is read, so a typo cannot surface only
            after the data has been loaded.
    """
    # Per chain: (peptide column, CDR3 columns in model-input order, label column).
    chain_columns = {
        "ab": ("peptide", ("CDR3a", "CDR3b"), "binder"),
        "a": ("peptide", ("CDR3a",), "binder"),
        "b": ("Epitope", ("CDR3B",), "Affinity"),
    }
    if chain not in chain_columns:
        # The original only printed this message and then crashed later with
        # an unbound `mdl`; fail explicitly instead.
        raise ValueError("Invalid chain. You can select a (alpha), b (beta), ab (alpha+beta)")
    pep_col, tcr_cols, label_col = chain_columns[chain]

    args = Args(trainfile_path=trainfile_path, testfile_path=testfile_path, chain=chain)
    EPOCHS = int(args.epochs)
    print('Loading and encoding the data..')
    train_data = pd.read_csv(args.trainfile_path)
    test_data = pd.read_csv(args.testfile_path)
    # Encode data
    encoding = utils.blosum50_20aa
    early_stop = EarlyStopping(monitor='loss', min_delta=0,
                               patience=10, verbose=0, mode='min', restore_best_weights=True)

    def encode_inputs(frame):
        # Model inputs are ordered CDR3 chain(s) first, peptide last.
        encoded = [utils.enc_list_bl_max_len(frame[col], encoding, 30)
                   for col in tcr_cols]
        encoded.append(utils.enc_list_bl_max_len(frame[pep_col], encoding, 9))
        return encoded

    train_inputs = encode_inputs(train_data)
    y_train = np.array(train_data[label_col])
    test_inputs = encode_inputs(test_data)

    # Call and compile the model
    mdl = nettcr_ab() if args.chain == 'ab' else nettcr_one_chain()
    mdl.compile(loss="binary_crossentropy", optimizer=Adam(learning_rate=0.001))
    mdl.fit(train_inputs, y_train,
            epochs=EPOCHS, batch_size=128, verbose=1, callbacks=[early_stop])
    mdl.save_weights(save_model_path)

    print('Evaluating..')
    preds = mdl.predict(test_inputs, verbose=0)
    pred_df = pd.concat([test_data, pd.Series(np.ravel(preds), name='y_prob')], axis=1)
    # Hard label: 1 when the probability is strictly above 0.5, else 0.
    pred_df['pred'] = (pred_df['y_prob'] > 0.5).astype(int)

    # Select the columns that exist for this chain; the original always
    # selected the beta-chain names and failed for 'a' and 'ab'.
    result = pred_df[[pep_col, *tcr_cols, label_col, 'pred', 'y_prob']].rename(
        columns={label_col: 'y_true', 'pred': 'y_pred'})
    # index=False keeps the file layout identical to the prediction-only
    # functions in this file, which write the same columns.
    result.to_csv(result_path + 'probability.csv', index=False)


In [None]:
# Retrain with chain 'b' on train.csv, then score test.csv.
trainfile_path = "../data/train.csv"
testfile_path = "../data/test.csv"
# NOTE: the 'modle' spelling is kept so any later cell using this name still works.
save_modle_path = "../Retraining_model/Retraining_model.h5"
# Used as a filename prefix: 'probability.csv' is appended to it directly.
result_path = "../result_path/Retraining_model_prediction"
chain = 'b'
Model_retraining(
    trainfile_path=trainfile_path,
    testfile_path=testfile_path,
    save_model_path=save_modle_path,
    result_path=result_path,
    chain=chain,
)


# 3. Retraining model prediction

In [8]:
import os, sys
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import tensorflow as tf
import numpy as np
import pandas as pd
from keras.models import Model
from tensorflow.keras.optimizers import Adam
import utils
from nettcr_architectures import nettcr_one_chain, nettcr_ab 
from sklearn.metrics import roc_auc_score
from keras.callbacks import EarlyStopping
from argparse import ArgumentParser
def Retraining_model_prediction(testfile_path, modelfile_path, result_path, chain):
    """Score a test CSV with retrained model weights and write the predictions.

    The model is built with ``nettcr_ab()`` for chain ``'ab'`` and with
    ``nettcr_one_chain()`` otherwise, its weights are loaded from
    ``modelfile_path``, and one probability per test row is written to
    ``result_path + 'probability.csv'``.

    Args:
        testfile_path: CSV file read with ``pd.read_csv``. It must contain
            the peptide, CDR3 and label columns of the selected chain (see
            ``chain_columns`` below).
        modelfile_path: Weights file handed to ``load_weights``.
        result_path: Output filename prefix. ``'probability.csv'`` is
            appended directly; no path separator is inserted.
        chain: ``'a'``, ``'b'`` or ``'ab'``.

    Raises:
        ValueError: If ``chain`` is not one of the supported values. This is
            checked before any file is read.
    """
    # Per chain: (peptide column, CDR3 columns in model-input order, label column).
    # NOTE(review): 'binder' is the label column the retraining code in this
    # file uses for chains 'a' and 'ab' -- confirm the test CSV carries it.
    chain_columns = {
        "ab": ("peptide", ("CDR3a", "CDR3b"), "binder"),
        "a": ("peptide", ("CDR3a",), "binder"),
        "b": ("Epitope", ("CDR3B",), "Affinity"),
    }
    if chain not in chain_columns:
        raise ValueError("Invalid chain type. Please choose from 'a', 'b', or 'ab'.")
    pep_col, tcr_cols, label_col = chain_columns[chain]

    encoding = utils.blosum50_20aa
    test_data = pd.read_csv(testfile_path)

    mdl = nettcr_ab() if chain == "ab" else nettcr_one_chain()
    # Model inputs are ordered CDR3 chain(s) first, peptide last.
    test_inputs = [utils.enc_list_bl_max_len(test_data[col], encoding, 30)
                   for col in tcr_cols]
    test_inputs.append(utils.enc_list_bl_max_len(test_data[pep_col], encoding, 9))

    mdl.load_weights(modelfile_path)
    preds = mdl.predict(test_inputs, verbose=0)
    pred_df = pd.concat([test_data, pd.Series(np.ravel(preds), name='y_prob')], axis=1)
    # Hard label: 1 when the probability is strictly above 0.5, else 0.
    pred_df['pred'] = (pred_df['y_prob'] > 0.5).astype(int)

    # Select the columns that exist for this chain; the original always
    # selected the beta-chain names and failed for 'a' and 'ab'.
    result = pred_df[[pep_col, *tcr_cols, label_col, 'pred', 'y_prob']].rename(
        columns={label_col: 'y_true', 'pred': 'y_pred'})
    result.to_csv(result_path + 'probability.csv', index=False)


In [9]:
# Score Validation.csv with the retrained weights, chain 'b'.
testfile_path = "../data/Validation.csv"
modelfile_path = "../Retraining_model/Retraining_model.h5"
# Used as a filename prefix: 'probability.csv' is appended to it directly.
result_path = "../result_path/Retraining_model_prediction"
chain = "b"
Retraining_model_prediction(
    testfile_path=testfile_path,
    modelfile_path=modelfile_path,
    result_path=result_path,
    chain=chain,
)