# Original model prediction

In [1]:
import os, sys
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import keras
import time
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Conv1D, GlobalMaxPooling1D, concatenate
from tensorflow.keras.optimizers import Adam
from keras.initializers import glorot_normal
from keras.activations import sigmoid
from sklearn.metrics import roc_auc_score
import utils
import keras.backend as K
from keras.callbacks import EarlyStopping
from nettcr_architectures import nettcr_ab, nettcr_one_chain 
# Pandas DataFrame printing: set the row, column and width display options
# to None so printed frames are not cut off at a fixed limit.
for _display_option in ('display.max_rows', 'display.max_columns', 'display.width'):
    pd.set_option(_display_option, None)
#pd.set_option('display.max_colwidth', -1)
from argparse import ArgumentParser

class Args:
    """Plain holder for the settings of one training run.

    It carries the training CSV path, the test CSV path and the chain
    selector handed to the constructor; every field defaults to ``None``.
    """

    # Number of training epochs, shared by every instance.
    epochs = 100

    def __init__(self, trainfile_path=None, testfile_path=None, chain=None):
        self.trainfile_path, self.testfile_path, self.chain = (
            trainfile_path,
            testfile_path,
            chain,
        )

    
def train_main(trainfile_path, testfile_path, save_model_path, result_path, chain):
    """Train a NetTCR model, save its weights and write test-set predictions.

    Parameters
    ----------
    trainfile_path : str
        CSV file holding the training examples.
    testfile_path : str
        CSV file holding the examples scored after training.
    save_model_path : str
        The trained weights are saved to ``save_model_path + ".h5"``.
    result_path : str
        The predictions are written to ``result_path + "probability.csv"``.
    chain : str
        ``"a"`` (alpha), ``"b"`` (beta) or ``"ab"`` (alpha+beta). It selects
        the model builder and the CSV columns that are read.

    Raises
    ------
    ValueError
        If ``chain`` is not one of the supported values.
    """
    # Fail fast: previously an invalid chain was only printed and the function
    # crashed later on the never-assigned model.
    if chain not in ("a", "b", "ab"):
        raise ValueError("Invalid chain. You can select a (alpha), b (beta), ab (alpha+beta)")

    # Per chain: CDR3 column(s), peptide column, label column, model builder.
    chain_setup = {
        "ab": (["CDR3a", "CDR3b"], "peptide", "binder", nettcr_ab),
        "a": (["CDR3a"], "peptide", "binder", nettcr_one_chain),
        "b": (["CDR3B"], "Epitope", "Affinity", nettcr_one_chain),
    }
    tcr_columns, peptide_column, label_column, build_model = chain_setup[chain]

    args = Args(trainfile_path=trainfile_path, testfile_path=testfile_path, chain=chain)
    EPOCHS = int(args.epochs)

    print('Loading and encoding the data..')
    train_data = pd.read_csv(args.trainfile_path)
    test_data = pd.read_csv(args.testfile_path)
    encoding = utils.blosum50_20aa

    def encode_inputs(frame):
        # Model inputs are ordered CDR3 chain(s) first, peptide last. The
        # encoder is called with 30 for CDR3 columns and 9 for the peptide.
        encoded = [utils.enc_list_bl_max_len(frame[col], encoding, 30) for col in tcr_columns]
        encoded.append(utils.enc_list_bl_max_len(frame[peptide_column], encoding, 9))
        return encoded

    train_inputs = encode_inputs(train_data)
    y_train = np.array(train_data[label_column])
    test_inputs = encode_inputs(test_data)

    # Stop once the training loss has not improved for 10 epochs and roll
    # back to the best weights seen.
    early_stop = EarlyStopping(monitor='loss', min_delta=0,
                               patience=10, verbose=0, mode='min', restore_best_weights=True)
    mdl = build_model()
    mdl.compile(loss="binary_crossentropy", optimizer=Adam(learning_rate=0.001))
    mdl.fit(train_inputs, y_train,
            epochs=EPOCHS, batch_size=128, verbose=1, callbacks=[early_stop])
    mdl.save_weights(save_model_path + ".h5")

    print('Evaluating..')
    preds = mdl.predict(test_inputs, verbose=0)
    pred_df = pd.concat([test_data, pd.Series(np.ravel(preds), name='y_prob')], axis=1)
    # Binarise the predicted probability at 0.5.
    pred_df['pred'] = pred_df['y_prob'].apply(lambda x: 1 if x > 0.5 else 0)

    # Keep the sequence columns of the selected chain (the beta-chain names
    # used to be hard-coded, which broke chains "a" and "ab").
    sequence_columns = [peptide_column] + tcr_columns
    result = pred_df[sequence_columns + [label_column, 'pred', 'y_prob']]
    result.columns = sequence_columns + ['y_true', 'y_pred', 'y_prob']
    # The predictions used to be computed and then discarded; persist them.
    result.to_csv(result_path + 'probability.csv')
# name=['1']
# me=['health']
# for i in name:
#     for j in me:
#         trainfile_path ="/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/NetTCR/data/train_beta_99.csv"
#         testfile_path="/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/database/benchmark_dataset/10cross_validation/Prediction/all/seen/three/no_CF/"+i+'_'+j+'.csv'
#         save_modle_path="/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/NetTCR/model/Original/NetTCR99"
#         result_path="./evaluate/repeat10/Prediction/all/seen/three/"+i+j
#         chain='b'
#         train_main(trainfile_path,testfile_path,save_modle_path,result_path,chain) 

In [2]:
def validation_main(testfile_path, modelfile_path, result_path, chain):
    """Score a CSV file with saved NetTCR weights and write the predictions.

    Parameters
    ----------
    testfile_path : str
        CSV file holding the examples to score.
    modelfile_path : str
        Weights file loaded into the freshly built model.
    result_path : str
        The predictions are written to ``result_path + "probability.csv"``
        (without the index column).
    chain : str
        ``"a"``, ``"b"`` or ``"ab"``. It selects the model builder and the CSV
        columns that are read.

    Raises
    ------
    ValueError
        If ``chain`` is not one of the supported values.
    """
    # Validate before touching any file so a bad chain fails immediately.
    if chain not in ("a", "b", "ab"):
        raise ValueError("Invalid chain type. Please choose from 'a', 'b', or 'ab'.")

    # Per chain: CDR3 column(s), peptide column, label column, model builder.
    chain_setup = {
        "ab": (["CDR3a", "CDR3b"], "peptide", "binder", nettcr_ab),
        "a": (["CDR3a"], "peptide", "binder", nettcr_one_chain),
        "b": (["CDR3B"], "Epitope", "Affinity", nettcr_one_chain),
    }
    tcr_columns, peptide_column, label_column, build_model = chain_setup[chain]

    encoding = utils.blosum50_20aa
    test_data = pd.read_csv(testfile_path)
    mdl = build_model()

    # Model inputs are ordered CDR3 chain(s) first, peptide last. The encoder
    # is called with 30 for CDR3 columns and 9 for the peptide.
    test_inputs = [utils.enc_list_bl_max_len(test_data[col], encoding, 30) for col in tcr_columns]
    test_inputs.append(utils.enc_list_bl_max_len(test_data[peptide_column], encoding, 9))

    mdl.load_weights(modelfile_path)
    preds = mdl.predict(test_inputs, verbose=0)
    pred_df = pd.concat([test_data, pd.Series(np.ravel(preds), name='y_prob')], axis=1)
    # Binarise the predicted probability at 0.5.
    pred_df['pred'] = pred_df['y_prob'].apply(lambda x: 1 if x > 0.5 else 0)

    # Keep the sequence columns of the selected chain (the beta-chain names
    # used to be hard-coded, which broke chains "a" and "ab").
    sequence_columns = [peptide_column] + tcr_columns
    result = pred_df[sequence_columns + [label_column, 'pred', 'y_prob']]
    result.columns = sequence_columns + ['y_true', 'y_pred', 'y_prob']
    result.to_csv(result_path + 'probability.csv', index=False)


In [3]:
# Score the original NetTCR99 weights on every benchmark file, first for the
# "seen/three" folder and then for the "unseen" folder.
name = ['1', '2', '3']
me = ['health', 'Diseases', 'antigen_specificity']
chain = 'b'
modelfile_path = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/NetTCR/model/Original/NetTCR99.h5"
prediction_dir = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/database/benchmark_dataset/10cross_validation/Prediction/all/"
for seen_dir in ("seen/three", "unseen"):
    for i in name:
        for j in me:
            testfile_path = f"{prediction_dir}{seen_dir}/no_CF/{i}_{j}.csv"
            result_path = f"./result/Original/{seen_dir}/{i}_{j}"
            validation_main(testfile_path, modelfile_path, result_path, chain)

# Model retraining

In [6]:
import os, sys
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import keras
import time
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Conv1D, GlobalMaxPooling1D, concatenate
from tensorflow.keras.optimizers import Adam
from keras.initializers import glorot_normal
from keras.activations import sigmoid
from sklearn.metrics import roc_auc_score
import utils
import keras.backend as K
from keras.callbacks import EarlyStopping
from nettcr_architectures import nettcr_ab, nettcr_one_chain 
# Pandas DataFrame printing: set the row, column and width display options
# to None so printed frames are not cut off at a fixed limit.
for _display_option in ('display.max_rows', 'display.max_columns', 'display.width'):
    pd.set_option(_display_option, None)
#pd.set_option('display.max_colwidth', -1)
from argparse import ArgumentParser

class Args:
    """Plain holder for the settings of one training run.

    It carries the training CSV path, the test CSV path and the chain
    selector handed to the constructor; every field defaults to ``None``.
    """

    # Number of training epochs, shared by every instance.
    epochs = 100

    def __init__(self, trainfile_path=None, testfile_path=None, chain=None):
        self.trainfile_path, self.testfile_path, self.chain = (
            trainfile_path,
            testfile_path,
            chain,
        )

    
def train_main(trainfile_path, testfile_path, save_model_path, result_path, chain):
    """Train a NetTCR model, save its weights and write test-set predictions.

    Parameters
    ----------
    trainfile_path : str
        CSV file holding the training examples.
    testfile_path : str
        CSV file holding the examples scored after training.
    save_model_path : str
        The trained weights are saved to ``save_model_path + ".h5"``.
    result_path : str
        The predictions are written to ``result_path + "probability.csv"``.
    chain : str
        ``"a"`` (alpha), ``"b"`` (beta) or ``"ab"`` (alpha+beta). It selects
        the model builder and the CSV columns that are read.

    Raises
    ------
    ValueError
        If ``chain`` is not one of the supported values.
    """
    # Fail fast: previously an invalid chain was only printed and the function
    # crashed later on the never-assigned model.
    if chain not in ("a", "b", "ab"):
        raise ValueError("Invalid chain. You can select a (alpha), b (beta), ab (alpha+beta)")

    # Per chain: CDR3 column(s), peptide column, label column, model builder.
    chain_setup = {
        "ab": (["CDR3a", "CDR3b"], "peptide", "binder", nettcr_ab),
        "a": (["CDR3a"], "peptide", "binder", nettcr_one_chain),
        "b": (["CDR3B"], "Epitope", "Affinity", nettcr_one_chain),
    }
    tcr_columns, peptide_column, label_column, build_model = chain_setup[chain]

    args = Args(trainfile_path=trainfile_path, testfile_path=testfile_path, chain=chain)
    EPOCHS = int(args.epochs)

    print('Loading and encoding the data..')
    train_data = pd.read_csv(args.trainfile_path)
    test_data = pd.read_csv(args.testfile_path)
    encoding = utils.blosum50_20aa

    def encode_inputs(frame):
        # Model inputs are ordered CDR3 chain(s) first, peptide last. The
        # encoder is called with 30 for CDR3 columns and 9 for the peptide.
        encoded = [utils.enc_list_bl_max_len(frame[col], encoding, 30) for col in tcr_columns]
        encoded.append(utils.enc_list_bl_max_len(frame[peptide_column], encoding, 9))
        return encoded

    train_inputs = encode_inputs(train_data)
    y_train = np.array(train_data[label_column])
    test_inputs = encode_inputs(test_data)

    # Stop once the training loss has not improved for 10 epochs and roll
    # back to the best weights seen.
    early_stop = EarlyStopping(monitor='loss', min_delta=0,
                               patience=10, verbose=0, mode='min', restore_best_weights=True)
    mdl = build_model()
    mdl.compile(loss="binary_crossentropy", optimizer=Adam(learning_rate=0.001))
    mdl.fit(train_inputs, y_train,
            epochs=EPOCHS, batch_size=128, verbose=1, callbacks=[early_stop])
    mdl.save_weights(save_model_path + ".h5")

    print('Evaluating..')
    preds = mdl.predict(test_inputs, verbose=0)
    pred_df = pd.concat([test_data, pd.Series(np.ravel(preds), name='y_prob')], axis=1)
    # Binarise the predicted probability at 0.5.
    pred_df['pred'] = pred_df['y_prob'].apply(lambda x: 1 if x > 0.5 else 0)

    # Keep the sequence columns of the selected chain (the beta-chain names
    # used to be hard-coded, which broke chains "a" and "ab").
    sequence_columns = [peptide_column] + tcr_columns
    result = pred_df[sequence_columns + [label_column, 'pred', 'y_prob']]
    result.columns = sequence_columns + ['y_true', 'y_pred', 'y_prob']
    result.to_csv(result_path + 'probability.csv')


In [7]:
import os

# Expose only the listed CUDA device indices to this process and set
# NCCL_DEBUG to INFO.
GPU_NUMBER = [1]
os.environ.update({
    "CUDA_VISIBLE_DEVICES": ",".join(map(str, GPU_NUMBER)),
    "NCCL_DEBUG": "INFO",
})

# pair50

In [None]:
import pandas as pd

# Retrain one beta-chain model for every combination of dataset folder (i),
# training-file tag (j) and file prefix (k). Each run is scored on the
# "<k>_1_1test.csv" file of the same dataset folder.
chain = 'b'
name1 = ['1', '2', '3', '4', '5']
pair50_data_dir = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/database/benchmark_dataset/10cross_validation/pair50/"
pair50_model_dir = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/NetTCR/model/Retrain/pair50/"
pair50_result_dir = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/NetTCR/result/pair50/seen/test/"
for database, name in (
    (['healthy', 'patient'], ['1_1', '1_2', '1_4', '1_6', '1_8']),
    (['Antigen_specificity'], ['1_1', '1_2', '1_4']),
):
    for i in database:
        for j in name:
            for k in name1:
                trainfile_path = f"{pair50_data_dir}{i}/{k}_{j}train.csv"
                testfile_path = f"{pair50_data_dir}{i}/{k}_1_1test.csv"
                save_model_path = f"{pair50_model_dir}{i}/{k}_{j}"
                result_path = f"{pair50_result_dir}{i}/{k}_{j}"
                train_main(trainfile_path, testfile_path, save_model_path, result_path, chain)

# pair300

In [None]:
# Retrain one beta-chain model for every combination of dataset folder (i),
# training-file tag (j), file prefix (k) and pair300 sub-folder (l). Every
# run is scored on the test file taken from the "more300" sub-folder.
chain = 'b'
pair = ['more300', '300', '200', '100', '10']
name1 = ['1', '2', '3', '4', '5']
name = ['1_1']
pair300_data_dir = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/database/benchmark_dataset/10cross_validation/pair300/"
pair300_model_dir = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/NetTCR/model/Retrain/pair300/seen/"
pair300_result_dir = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/NetTCR/result/pair300/seen/"
for database in (['healthy', 'patient'], ['Antigen_specificity']):
    for i in database:
        for j in name:
            for k in name1:
                testfile_path = f"{pair300_data_dir}more300/{i}/{k}_1_1test.csv"
                for l in pair:
                    trainfile_path = f"{pair300_data_dir}{l}/{i}/{k}_{j}train.csv"
                    save_model_path = f"{pair300_model_dir}{l}/{i}/{k}_{j}"
                    result_path = f"{pair300_result_dir}{l}/{i}/{k}_{j}"
                    train_main(trainfile_path, testfile_path, save_model_path, result_path, chain)

In [8]:
import os, sys
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import tensorflow as tf
import numpy as np
import pandas as pd
from keras.models import Model
from tensorflow.keras.optimizers import Adam
import utils
from nettcr_architectures import nettcr_one_chain, nettcr_ab 
from sklearn.metrics import roc_auc_score
from keras.callbacks import EarlyStopping
from argparse import ArgumentParser
def validation_main(testfile_path, modelfile_path, result_path, chain):
    """Score a CSV file with saved NetTCR weights and write the predictions.

    Parameters
    ----------
    testfile_path : str
        CSV file holding the examples to score.
    modelfile_path : str
        Weights file loaded into the freshly built model.
    result_path : str
        The predictions are written to ``result_path + "probability.csv"``
        (without the index column).
    chain : str
        ``"a"``, ``"b"`` or ``"ab"``. It selects the model builder and the CSV
        columns that are read.

    Raises
    ------
    ValueError
        If ``chain`` is not one of the supported values.
    """
    # Validate before touching any file so a bad chain fails immediately.
    if chain not in ("a", "b", "ab"):
        raise ValueError("Invalid chain type. Please choose from 'a', 'b', or 'ab'.")

    # Per chain: CDR3 column(s), peptide column, label column, model builder.
    chain_setup = {
        "ab": (["CDR3a", "CDR3b"], "peptide", "binder", nettcr_ab),
        "a": (["CDR3a"], "peptide", "binder", nettcr_one_chain),
        "b": (["CDR3B"], "Epitope", "Affinity", nettcr_one_chain),
    }
    tcr_columns, peptide_column, label_column, build_model = chain_setup[chain]

    encoding = utils.blosum50_20aa
    test_data = pd.read_csv(testfile_path)
    mdl = build_model()

    # Model inputs are ordered CDR3 chain(s) first, peptide last. The encoder
    # is called with 30 for CDR3 columns and 9 for the peptide.
    test_inputs = [utils.enc_list_bl_max_len(test_data[col], encoding, 30) for col in tcr_columns]
    test_inputs.append(utils.enc_list_bl_max_len(test_data[peptide_column], encoding, 9))

    mdl.load_weights(modelfile_path)
    preds = mdl.predict(test_inputs, verbose=0)
    pred_df = pd.concat([test_data, pd.Series(np.ravel(preds), name='y_prob')], axis=1)
    # Binarise the predicted probability at 0.5.
    pred_df['pred'] = pred_df['y_prob'].apply(lambda x: 1 if x > 0.5 else 0)

    # Keep the sequence columns of the selected chain (the beta-chain names
    # used to be hard-coded, which broke chains "a" and "ab").
    sequence_columns = [peptide_column] + tcr_columns
    result = pred_df[sequence_columns + [label_column, 'pred', 'y_prob']]
    result.columns = sequence_columns + ['y_true', 'y_pred', 'y_prob']
    result.to_csv(result_path + 'probability.csv', index=False)


# validation

In [29]:
import pandas as pd

# Score every retrained pair50 model on the "<k>_1_1Validation.csv" file of
# its dataset folder: Antigen_specificity first, then healthy and patient.
chain = "b"
name1 = ['1', '2', '3', '4', '5']
validation_data_dir = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/database/benchmark_dataset/10cross_validation/Validation/"
pair50_model_dir = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/NetTCR/model/Retrain/pair50/"
validation_result_dir = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/NetTCR/result/pair50/seen/validation/"
for database, name in (
    (['Antigen_specificity'], ['1_1', '1_2', '1_4']),
    (['healthy', 'patient'], ['1_1', '1_2', '1_4', '1_6', '1_8']),
):
    for i in database:
        for j in name:
            for k in name1:
                testfile_path = f"{validation_data_dir}{i}/{k}_1_1Validation.csv"
                modelfile_path = f"{pair50_model_dir}{i}/{k}_{j}.h5"
                result_path = f"{validation_result_dir}{i}/{k}_{j}"
                validation_main(testfile_path, modelfile_path, result_path, chain)


            

# unknown-test

In [None]:
import pandas as pd

# Score every retrained pair50 model on the "<k>_1_1test.csv" file found in
# the "unknow" folder of its dataset: healthy and patient first, then
# Antigen_specificity.
chain = "b"
name1 = ['1', '2', '3', '4', '5']
unseen_test_dir = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/database/benchmark_dataset/10cross_validation/unknow/"
pair50_model_dir = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/NetTCR/model/Retrain/pair50/"
unseen_test_result_dir = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/NetTCR/result/pair50/unseen/test/"
for database, name in (
    (['healthy', 'patient'], ['1_1', '1_2', '1_4', '1_6', '1_8']),
    (['Antigen_specificity'], ['1_1', '1_2', '1_4']),
):
    for i in database:
        for j in name:
            for k in name1:
                testfile_path = f"{unseen_test_dir}{i}/{k}_1_1test.csv"
                modelfile_path = f"{pair50_model_dir}{i}/{k}_{j}.h5"
                result_path = f"{unseen_test_result_dir}{i}/{k}_{j}"
                validation_main(testfile_path, modelfile_path, result_path, chain)
            

# unknown-validation

In [9]:
import pandas as pd

# Score every retrained pair50 model on the "<k>_1_1validation.csv" file found
# in the "unknow/validation" folder of its dataset: healthy and patient
# first, then Antigen_specificity.
chain = "b"
name1 = ['1', '2', '3', '4', '5']
unseen_validation_dir = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/database/benchmark_dataset/10cross_validation/unknow/validation/"
pair50_model_dir = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/NetTCR/model/Retrain/pair50/"
unseen_validation_result_dir = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/NetTCR/result/pair50/unseen/validation/"
for database, name in (
    (['healthy', 'patient'], ['1_1', '1_2', '1_4', '1_6', '1_8']),
    (['Antigen_specificity'], ['1_1', '1_2', '1_4']),
):
    for i in database:
        for j in name:
            for k in name1:
                testfile_path = f"{unseen_validation_dir}{i}/{k}_1_1validation.csv"
                modelfile_path = f"{pair50_model_dir}{i}/{k}_{j}.h5"
                result_path = f"{unseen_validation_result_dir}{i}/{k}_{j}"
                validation_main(testfile_path, modelfile_path, result_path, chain)
            

# Top5 parameter adjustment

In [5]:
import os, sys
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import keras
import time
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Conv1D, GlobalMaxPooling1D, concatenate
from tensorflow.keras.optimizers import Adam
from keras.initializers import glorot_normal
from keras.activations import sigmoid
from sklearn.metrics import roc_auc_score
import utils
import keras.backend as K
from keras.callbacks import EarlyStopping
from nettcr_architectures import nettcr_ab, nettcr_one_chain 
# Pandas DataFrame printing: set the row, column and width display options
# to None so printed frames are not cut off at a fixed limit.
for _display_option in ('display.max_rows', 'display.max_columns', 'display.width'):
    pd.set_option(_display_option, None)
#pd.set_option('display.max_colwidth', -1)
from argparse import ArgumentParser

class Args:
    """Plain holder for the settings of one training run.

    It carries the training CSV path, the test CSV path and the chain
    selector handed to the constructor; every field defaults to ``None``.
    """

    # Number of training epochs, shared by every instance.
    epochs = 100

    def __init__(self, trainfile_path=None, testfile_path=None, chain=None):
        self.trainfile_path, self.testfile_path, self.chain = (
            trainfile_path,
            testfile_path,
            chain,
        )

    
def train_main(trainfile_path, testfile_path, result_path, chain, EPOCHS):
    """Train a NetTCR model for a given epoch budget and write predictions.

    Unlike the weight-saving variant, this version does not persist the
    trained model; it only writes the test-set predictions.

    Parameters
    ----------
    trainfile_path : str
        CSV file holding the training examples.
    testfile_path : str
        CSV file holding the examples scored after training.
    result_path : str
        The predictions are written to ``result_path + "probability.csv"``.
    chain : str
        ``"a"`` (alpha), ``"b"`` (beta) or ``"ab"`` (alpha+beta). It selects
        the model builder and the CSV columns that are read.
    EPOCHS : int
        Maximum number of epochs passed to ``fit``.

    Raises
    ------
    ValueError
        If ``chain`` is not one of the supported values.
    """
    # Fail fast: previously an invalid chain was only printed and the function
    # crashed later on the never-assigned model.
    if chain not in ("a", "b", "ab"):
        raise ValueError("Invalid chain. You can select a (alpha), b (beta), ab (alpha+beta)")

    # Per chain: CDR3 column(s), peptide column, label column, model builder.
    chain_setup = {
        "ab": (["CDR3a", "CDR3b"], "peptide", "binder", nettcr_ab),
        "a": (["CDR3a"], "peptide", "binder", nettcr_one_chain),
        "b": (["CDR3B"], "Epitope", "Affinity", nettcr_one_chain),
    }
    tcr_columns, peptide_column, label_column, build_model = chain_setup[chain]

    args = Args(trainfile_path=trainfile_path, testfile_path=testfile_path, chain=chain)

    print('Loading and encoding the data..')
    train_data = pd.read_csv(args.trainfile_path)
    test_data = pd.read_csv(args.testfile_path)
    encoding = utils.blosum50_20aa

    def encode_inputs(frame):
        # Model inputs are ordered CDR3 chain(s) first, peptide last. The
        # encoder is called with 30 for CDR3 columns and 9 for the peptide.
        encoded = [utils.enc_list_bl_max_len(frame[col], encoding, 30) for col in tcr_columns]
        encoded.append(utils.enc_list_bl_max_len(frame[peptide_column], encoding, 9))
        return encoded

    train_inputs = encode_inputs(train_data)
    y_train = np.array(train_data[label_column])
    test_inputs = encode_inputs(test_data)

    # Stop once the training loss has not improved for 10 epochs and roll
    # back to the best weights seen.
    early_stop = EarlyStopping(monitor='loss', min_delta=0,
                               patience=10, verbose=0, mode='min', restore_best_weights=True)
    mdl = build_model()
    mdl.compile(loss="binary_crossentropy", optimizer=Adam(learning_rate=0.001))
    mdl.fit(train_inputs, y_train,
            epochs=EPOCHS, batch_size=128, verbose=1, callbacks=[early_stop])
    #mdl.save_weights(save_model_path  + ".h5")

    print('Evaluating..')
    preds = mdl.predict(test_inputs, verbose=0)
    pred_df = pd.concat([test_data, pd.Series(np.ravel(preds), name='y_prob')], axis=1)
    # Binarise the predicted probability at 0.5.
    pred_df['pred'] = pred_df['y_prob'].apply(lambda x: 1 if x > 0.5 else 0)

    # Keep the sequence columns of the selected chain (the beta-chain names
    # used to be hard-coded, which broke chains "a" and "ab").
    sequence_columns = [peptide_column] + tcr_columns
    result = pred_df[sequence_columns + [label_column, 'pred', 'y_prob']]
    result.columns = sequence_columns + ['y_true', 'y_pred', 'y_prob']
    result.to_csv(result_path + 'probability.csv')


In [None]:
# Sweep the top5/healthy data: for every repeat (i), top-N tag (j),
# training-file size tag (k) and epoch budget, train a beta-chain model and
# score it on the "<i>_<j>_test.csv" file.
reap = ['reap1', 'reap2', 'reap3', 'reap4', 'reap5']
name = ['top1', 'top2', 'top3', 'top4', 'top5']
number = ['3000', '2800', '2600', '2400', '2200', '2000', '1800', '1600', '1400', '1200', '1000', '800', '600', '400', '200', '50']
epoch = [60, 80, 100, 120, 140]
chain = 'b'
top5_data_dir = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/database/benchmark_dataset/10cross_validation/top5/healthy/"
top5_result_dir = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/NetTCR/result/top5/seen/healthy/"
for i in reap:
    for j in name:
        # The test file depends only on the repeat and the top-N tag.
        testfile_path = f"{top5_data_dir}{i}_{j}_test.csv"
        for k in number:
            trainfile_path = f"{top5_data_dir}{i}_{j}_{k}.csv"
            for EPOCHS in epoch:
                result_path = f"{top5_result_dir}{i}_{j}_{k}_{EPOCHS}"
                train_main(trainfile_path, testfile_path, result_path, chain, EPOCHS)