# Original model prediction

In [73]:
import tensorflow.compat.v1 as tf

try:
    import tensorflow.python.keras as keras
except:
    import tensorflow.keras as keras

from tensorflow.python.keras import layers
from tensorflow.python.keras import backend as K

from sklearn.model_selection import KFold

from sklearn import metrics
from sklearn.metrics import accuracy_score,matthews_corrcoef,classification_report,confusion_matrix,precision_score,recall_score
from sklearn.metrics import f1_score,roc_auc_score, auc

from keras import regularizers

import os
import scipy.io as sio
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
from tensorflow.python.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
from keras.utils import plot_model
from sklearn.utils import shuffle
from tensorflow.python.keras.models import load_model
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
import csv
import pandas as pd
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  

In [74]:
# Work from the DLpTCR code directory. Presumably this is what lets the
# project-local `aaindexValues` import below and the relative './model/...'
# paths used by `prediction` resolve -- confirm against the repository layout.
os.chdir("/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/DLpTCR/code/")
from aaindexValues import aaindex1PCAValues
def pca_Validation(testFile):
    """Encode the CDR3B/epitope pairs of a CSV at the three PCA widths.

    The CSV is parsed once and the untrimmed ``CDR3B`` and ``Epitope``
    sequences are encoded for each model input width.  (The previous
    implementation re-read the file and rebuilt a discarded one-hot label
    array for every width.)

    Args:
        testFile: path of a CSV file with ``CDR3B`` and ``Epitope`` columns.

    Returns:
        ``(TCRB_FULL_Feature, TCRB_CNN_Feature, TCRB_RESNET_Feature)`` --
        arrays of shape ``(n_pairs, 20, width + 1, 2)`` for widths 18, 20
        and 10 respectively.  Channel 0 holds the CDR3B encoding, channel 1
        the epitope encoding.

    Raises:
        KeyError: if a sequence contains a residue missing from the table
            returned by ``aaindex1PCAValues``.
    """
    row = 20  # number of sequence positions kept per sequence

    def pca_code(seqs, row=30, n_features=16):
        """Return a ``(len(seqs), row, n_features + 1)`` encoding of *seqs*.

        The first ``n_features`` columns of a position hold the residue's
        values from ``aaindex1PCAValues``; the last column is 1 on padding
        positions (past the end of the sequence) and 0 elsewhere.
        Sequences longer than *row* are truncated.
        """
        aadict = aaindex1PCAValues(n_features)
        # Preallocating keeps the shape 3-D even when *seqs* is empty.
        encoded = np.zeros(shape=(len(seqs), row, n_features + 1))
        for k, seq in enumerate(seqs):
            for j, residue in enumerate(seq[:row]):
                encoded[k, j, :-1] = aadict[residue]
            encoded[k, len(seq):, -1] = 1
        return encoded

    def load_data(cdr3_seqs, epit_seqs, col=20, row=9):
        """Stack the CDR3B and epitope encodings as two channels."""
        x_test = np.empty(shape=(len(cdr3_seqs), row, col + 1, 2))
        x_test[:, :, :, 0] = pca_code(cdr3_seqs, row, col)
        x_test[:, :, :, 1] = pca_code(epit_seqs, row, col)
        return x_test

    data = pd.read_csv(testFile)
    cdr3_seqs = data.CDR3B.tolist()
    epit_seqs = data.Epitope.tolist()

    TCRB_RESNET_Feature = load_data(cdr3_seqs, epit_seqs, col=10, row=row)
    TCRB_FULL_Feature = load_data(cdr3_seqs, epit_seqs, col=18, row=row)
    TCRB_CNN_Feature = load_data(cdr3_seqs, epit_seqs, col=20, row=row)
    return TCRB_FULL_Feature, TCRB_CNN_Feature, TCRB_RESNET_Feature


In [76]:
import pandas as pd

def prediction(data_path,result_path):
    """Score one CSV with the three pre-trained models and save the results.

    Args:
        data_path: CSV with ``CDR3B``, ``Epitope`` and ``Affinity`` columns.
        result_path: string prefix; ``FULL_probability.csv``,
            ``CNN_probability.csv`` and ``RESNET_probability.csv`` are
            appended to it directly.

    Each output table holds the input pair, ``y_true`` (the ``Affinity``
    column), ``y_prob`` (column 1 of the model output) and ``y_pred``
    (1 when ``y_prob >= 0.5``, otherwise 0).
    """
    # Start from an empty Keras session / default graph.
    K.clear_session()
    tf.reset_default_graph()

    table = pd.read_csv(data_path)
    full_feat, cnn_feat, resnet_feat = pca_Validation(data_path)

    full_net = load_model('./model/FULL_B_ALL_pca18.h5')
    cnn_net = load_model('./model/CNN_B_ALL_pca20.h5')
    resnet_net = load_model('./model/RESNET_B_ALL_pca10.h5')

    full_in = full_feat.reshape([len(full_feat), 20, 19, 2])
    cnn_in = cnn_feat.reshape([len(cnn_feat), 20, 21, 2])
    resnet_in = resnet_feat.reshape([len(resnet_feat), 20, 11, 2])

    full_out = full_net.predict(full_in)
    cnn_out = cnn_net.predict(cnn_in)
    resnet_out = resnet_net.predict(resnet_in)

    del full_net
    del cnn_net
    del resnet_net

    def _to_label(prob):
        # Threshold the probability at 0.5.
        if prob >= 0.5:
            return 1
        return 0

    outputs = (
        ('FULL_probability.csv', full_out[:, 1]),
        ('CNN_probability.csv', cnn_out[:, 1]),
        ('RESNET_probability.csv', resnet_out[:, 1]),
    )
    for file_name, prob in outputs:
        frame = pd.DataFrame({
            'CDR3B': table.CDR3B,
            'Epitope': table.Epitope,
            'y_true': table.Affinity,
            'y_prob': prob,
        })
        frame['y_pred'] = frame['y_prob'].apply(_to_label)
        frame.to_csv(result_path + file_name, index=False)

In [None]:
# Score every benchmark file <repeat>_<cohort>.csv of the three evaluation
# settings with the original models.
name = ['1', '2', '3']
me = ['health', 'Diseases', 'antigen_specificity']

# (input directory prefix, result prefix) for each setting, in run order.
_settings = [
    ("/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/database/benchmark_dataset/10cross_validation/Prediction/all/seen/seven/neg_pos/",
     "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/DLpTCR/result/Original/seen/seven/"),
    ("/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/database/benchmark_dataset/10cross_validation/Prediction/all/seen/three/neg_pos/",
     "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/DLpTCR/result/Original/seen/three/"),
    ("/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/database/benchmark_dataset/10cross_validation/Prediction/all/unseen/CF/",
     "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/DLpTCR/result/Original/unseen/"),
]
for _src_prefix, _dst_prefix in _settings:
    for i in name:
        for j in me:
            data_path = _src_prefix + i + '_' + j + '.csv'
            result_path = _dst_prefix + i + '_' + j + '_'
            prediction(data_path, result_path)

# Model Retraining

In [3]:
import pandas
import numpy as np
import os

from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, matthews_corrcoef, confusion_matrix
from sklearn.metrics import f1_score,roc_auc_score,recall_score,precision_score

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import LearningRateScheduler, ReduceLROnPlateau, ModelCheckpoint

from aaindexValues import aaindex1PCAValues

# time

In [9]:
def pca_Validation( data_path,save_path):
    """Encode ``<data_path>.csv`` at PCA widths 10, 18 and 20 and save the arrays.

    For each width two ``.npy`` files are written with ``np.save``:
    ``<save_path>_TCRB_PCA<width>_feature_array`` (shape
    ``(n, 20, width + 1, 2)``; channel 0 CDR3B, channel 1 epitope) and
    ``<save_path>_TCRB_PCA<width>_label_array`` (two-class one-hot of the
    ``Affinity`` column).
    """
    def pca_code(seqs: list, row=30, n_features=16):
        # One (row, n_features + 1) grid per sequence; the last column is
        # set to 1 on the positions past the end of the sequence.
        table = aaindex1PCAValues(n_features)
        grids = []
        for seq in seqs:
            grid = np.zeros(shape=(row, n_features + 1))
            for pos, residue in enumerate(seq[:row]):
                grid[pos, :-1] = table[residue]
            grid[len(seq):, -1] = 1
            grids.append(grid)
        return np.array(grids)

    def read_seqs(file, model=1):
        # model == 1 drops the first two and the last CDR3B residue;
        # any other value keeps the sequence whole.
        frame = pandas.read_csv(file)
        labels = frame.Affinity
        cdr3_col = frame.CDR3B
        epitope_col = frame.Epitope
        n_rows = len(epitope_col)

        cdr3_seqs = []
        for k in range(n_rows):
            if model == 1:
                cdr3_seqs.append(cdr3_col[k][2:-1])
            else:
                cdr3_seqs.append(cdr3_col[k])
        epit_seqs = [epitope_col[k] for k in range(n_rows)]
        return cdr3_seqs, epit_seqs, labels

    def load_data(test_file, col=20, row=9, m=1):
        cdr3_seqs, epit_seqs, labels = read_seqs(test_file, m)
        features = np.ndarray(shape=(len(cdr3_seqs), row, col + 1, 2))
        features[..., 0] = pca_code(cdr3_seqs, row, col)
        features[..., 1] = pca_code(epit_seqs, row, col)
        onehot = to_categorical(np.array(labels), 2)
        return features, onehot

    source_csv = data_path +".csv"

    for width in (10, 18, 20):
        features, onehot = load_data(source_csv, col=width, row=20, m=2)
        np.save(f"{save_path}_TCRB_PCA{width}_feature_array", features)
        np.save(f"{save_path}_TCRB_PCA{width}_label_array", onehot)
        
# Encode each timing-benchmark file data_<size>.csv.
# The loop previously iterated over the undefined name `size` (NameError);
# it now iterates over the list actually defined here.
dsize = ['1000', '5000', '10000', '100000', '1000000']
for i in dsize:
    data_path = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/database/benchmark_dataset/time/data_" + i
    save_path = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/DLpTCR/data/top5/" + i
    pca_Validation(data_path, save_path)

In [None]:
def pca_Validation(validation_path, ratio_name, name1, save_path, prefix='train'):
    """Encode one training CSV at PCA widths 10, 18 and 20.

    Reads ``<validation_path>/<name1>_<ratio_name>train.csv`` and, for each
    width, saves ``<save_path>/<name1>_<ratio_name><prefix>_TCRB_PCA<width>_feature_array``
    and the matching ``..._label_array`` with ``np.save``.  *prefix* only
    affects the output names; the input name always ends in ``train.csv``.
    """
    def _encode(seqs: list, row=30, n_features=16):
        # Per-residue feature rows plus a trailing padding-flag column.
        residue_values = aaindex1PCAValues(n_features)
        width = n_features + 1
        out = []
        for seq in seqs:
            block = np.zeros(shape=(row, width))
            kept = min(len(seq), row)
            pos = 0
            while pos < kept:
                block[pos, :-1] = residue_values[seq[pos]]
                pos += 1
            block[len(seq):, -1] = 1
            out.append(block)
        return np.array(out)

    def _read(file, model=1):
        # Returns (CDR3B list, epitope list, Affinity column).
        frame = pandas.read_csv(file)
        affinity = frame.Affinity
        cdr3 = frame.CDR3B
        epitope = frame.Epitope
        idx = range(len(epitope))
        if model == 1:
            cdr3_list = [cdr3[k][2:-1] for k in idx]
        else:
            cdr3_list = [cdr3[k] for k in idx]
        return cdr3_list, [epitope[k] for k in idx], affinity

    def _load(test_file, col=20, row=9, m=1):
        cdr3_list, epitope_list, affinity = _read(test_file, m)
        x = np.ndarray(shape=(len(cdr3_list), row, col + 1, 2))
        x[:, :, :, 0] = _encode(cdr3_list, row, col)
        x[:, :, :, 1] = _encode(epitope_list, row, col)
        y = to_categorical(np.array(affinity), 2)
        return x, y

    csv_file = f"{validation_path}/{name1}_{ratio_name}train.csv"
    stem = f"{save_path}/{name1}_{ratio_name}{prefix}"

    for n_pca in [10, 18, 20]:
        x, y = _load(csv_file, col=n_pca, row=20, m=2)
        np.save(f"{stem}_TCRB_PCA{n_pca}_feature_array", x)
        np.save(f"{stem}_TCRB_PCA{n_pca}_label_array", y)


def execute_validation(database_list, ratio_name_list, name1_list, base_data_path, base_save_path, prefix='train'):
    """Run ``pca_Validation`` for every (database, ratio, fold) combination.

    Input and output directories are ``<base_data_path>/<db>`` and
    ``<base_save_path>/<db>``; iteration order is database, then ratio,
    then fold.
    """
    for db in database_list:
        # Both directories depend on the database only.
        source_dir = f"{base_data_path}/{db}"
        target_dir = f"{base_save_path}/{db}"
        for ratio in ratio_name_list:
            for fold in name1_list:
                pca_Validation(source_dir, ratio, fold, target_dir, prefix=prefix)
                
# Encode the pair50 folds: five ratios for the healthy/patient databases,
# three for Antigen_specificity.
data_path = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/database/benchmark_dataset/10cross_validation/pair50"
save_path = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/DLpTCR/data/pair50/seen/test"

_pair50_jobs = [
    (['healthy', 'patient'], ['1_1', '1_2', '1_4', '1_6', '1_8']),
    (['Antigen_specificity'], ['1_1', '1_2', '1_4']),
]
for _databases, _ratios in _pair50_jobs:
    execute_validation(
        database_list=_databases,
        ratio_name_list=_ratios,
        name1_list=['1', '2', '3', '4', '5'],
        base_data_path=data_path,
        base_save_path=save_path)


In [32]:
def execute_validation_v2(database_list, ratio_name_list, pair_list, name1_list, base_train_path, base_save_path, prefix='train'):
    """Run ``pca_Validation`` over database x ratio x pair x fold.

    Files are read from ``<base_train_path>/<pair>/<db>`` and written to
    ``<base_save_path>/<pair>/<db>``.
    """
    for db in database_list:
        for ratio in ratio_name_list:
            for pair in pair_list:
                # Directories are fixed for all folds of this (pair, db).
                source_dir = f"{base_train_path}/{pair}/{db}"
                target_dir = f"{base_save_path}/{pair}/{db}"
                for fold in name1_list:
                    pca_Validation(source_dir, ratio, fold, target_dir, prefix=prefix)

# Encode the 1:1 folds of every pair300 subset for all three databases.
base_train_path = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/database/benchmark_dataset/10cross_validation/pair300"
base_save_path = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/DLpTCR/data/pair300"
_pair300_args = dict(
    database_list=['healthy', 'patient', 'Antigen_specificity'],
    ratio_name_list=['1_1'],
    pair_list=['more300', '300', '200', '100', '10'],
    name1_list=['1', '2', '3', '4', '5'],
)
execute_validation_v2(
    base_train_path=base_train_path,
    base_save_path=base_save_path,
    **_pair300_args)




In [47]:
def pca_Validation(validation_path, ratio_name, name1, save_path, prefix='test'):
    """Encode one test CSV at PCA widths 10, 18 and 20.

    Input: ``<validation_path>/<name1>_<ratio_name>test.csv``.
    Output (via ``np.save``), per width:
    ``<save_path>/<name1>_<ratio_name><prefix>_TCRB_PCA<width>_feature_array``
    and ``..._label_array``.
    """
    def pca_code(seqs: list, row=30, n_features=16):
        # Feature rows from aaindex1PCAValues; final column marks padding.
        values = aaindex1PCAValues(n_features)
        n_cols = n_features + 1
        matrices = []
        for sequence in seqs:
            m = np.zeros(shape=(row, n_cols))
            for k, aa in zip(range(row), sequence):
                m[k, :-1] = values[aa]
            m[len(sequence):, -1] = 1
            matrices.append(m)
        return np.array(matrices)

    def read_seqs(file, model=1):
        # model == 1 trims two leading and one trailing CDR3B residue.
        df = pandas.read_csv(file)
        labels = df.Affinity
        cdr3_col = df.CDR3B
        epitope_col = df.Epitope
        cdr3_seqs, epit_seqs = [], []
        for k in range(len(epitope_col)):
            raw = cdr3_col[k]
            cdr3_seqs.append(raw[2:-1] if model == 1 else raw)
        for k in range(len(epitope_col)):
            epit_seqs.append(epitope_col[k])
        return cdr3_seqs, epit_seqs, labels

    def load_data(test_file, col=20, row=9, m=1):
        cdr3_seqs, epit_seqs, labels = read_seqs(test_file, m)
        n_pairs = len(cdr3_seqs)
        x = np.ndarray(shape=(n_pairs, row, col + 1, 2))
        x[:, :, :, 0] = pca_code(cdr3_seqs, row, col)
        x[:, :, :, 1] = pca_code(epit_seqs, row, col)
        y = np.array(labels)
        return x, to_categorical(y, 2)

    test_csv = f"{validation_path}/{name1}_{ratio_name}test.csv"

    for pca_width in [10, 18, 20]:
        x, y = load_data(test_csv, col=pca_width, row=20, m=2)
        np.save(f"{save_path}/{name1}_{ratio_name}{prefix}_TCRB_PCA{pca_width}_feature_array", x)
        np.save(f"{save_path}/{name1}_{ratio_name}{prefix}_TCRB_PCA{pca_width}_label_array", y)

def execute_validation(database_list, ratio_name_list, name1_list, base_data_path, base_save_path, prefix='test'):
    """Call ``pca_Validation`` once per database, ratio and fold.

    The per-database sub-directory ``<db>`` is appended to both base paths.
    """
    for db in database_list:
        in_dir, out_dir = f"{base_data_path}/{db}", f"{base_save_path}/{db}"
        for ratio in ratio_name_list:
            for fold in name1_list:
                pca_Validation(in_dir, ratio, fold, out_dir, prefix=prefix)
                
# Encode the 1:1 test folds of the seen (pair50) and unseen (unknow) splits.
_test_splits = (
    ("/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/database/benchmark_dataset/10cross_validation/pair50",
     "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/DLpTCR/data/pair50/seen/test"),
    ("/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/database/benchmark_dataset/10cross_validation/unknow/",
     "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/DLpTCR/data/pair50/unseen/test"),
)
for data_path, save_path in _test_splits:
    execute_validation(
        database_list=['healthy', 'patient', 'Antigen_specificity'],
        ratio_name_list=['1_1'],
        name1_list=['1', '2', '3', '4', '5'],
        base_data_path=data_path,
        base_save_path=save_path)





In [35]:
def execute_validation_v2(database_list, ratio_name_list, pair_list, name1_list, base_train_path, base_save_path, prefix='test'):
    """Call ``pca_Validation`` for each database, ratio, pair subset and fold.

    Input directory: ``<base_train_path>/<pair>/<db>``; output directory:
    ``<base_save_path>/<pair>/<db>``.
    """
    for db in database_list:
        for ratio in ratio_name_list:
            for pair in pair_list:
                sub_dir = f"{pair}/{db}"
                for fold in name1_list:
                    pca_Validation(
                        f"{base_train_path}/{sub_dir}",
                        ratio,
                        fold,
                        f"{base_save_path}/{sub_dir}",
                        prefix=prefix,
                    )

# Encode the 1:1 test folds of the pair300 "more300" subset.
base_train_path = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/database/benchmark_dataset/10cross_validation/pair300"
base_save_path = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/DLpTCR/data/pair300"
_databases = ['healthy', 'patient', 'Antigen_specificity']
_folds = ['1', '2', '3', '4', '5']
execute_validation_v2(
    database_list=_databases,
    ratio_name_list=['1_1'],
    pair_list=['more300'],
    name1_list=_folds,
    base_train_path=base_train_path,
    base_save_path=base_save_path)

In [50]:
def pca_Validation(validation_path, ratio_name, name1, save_path, prefix='validation'):
    """Encode one validation CSV at PCA widths 10, 18 and 20.

    Reads ``<validation_path>/<name1>_<ratio_name>Validation.csv`` (capital
    ``V``) and saves, per width,
    ``<save_path>/<name1>_<ratio_name><prefix>_TCRB_PCA<width>_feature_array``
    and ``..._label_array`` with ``np.save``.
    """
    def pca_code(seqs: list, row=30, n_features=16):
        # Last column of each row is the padding flag.
        aa_table = aaindex1PCAValues(n_features)
        stack = []
        for s in seqs:
            arr = np.zeros(shape=(row, n_features + 1))
            limit = min(len(s), row)
            for p in range(limit):
                arr[p, :-1] = aa_table[s[p]]
            arr[len(s):, -1] = 1
            stack.append(arr)
        return np.array(stack)

    def load_data(test_file, col=20, row=9, m=1):
        # Read the table, then build the two-channel features and the
        # two-class one-hot labels.  m == 1 trims the CDR3B ends.
        frame = pandas.read_csv(test_file)
        affinity = frame.Affinity
        cdr3_col = frame.CDR3B
        epitope_col = frame.Epitope
        positions = range(len(epitope_col))
        cdr3_list = [cdr3_col[k][2:-1] if m == 1 else cdr3_col[k] for k in positions]
        epitope_list = [epitope_col[k] for k in positions]

        feats = np.ndarray(shape=(len(cdr3_list), row, col + 1, 2))
        feats[:, :, :, 0] = pca_code(cdr3_list, row, col)
        feats[:, :, :, 1] = pca_code(epitope_list, row, col)
        return feats, to_categorical(np.array(affinity), 2)

    validation_csv = f"{validation_path}/{name1}_{ratio_name}Validation.csv"

    for col in (10, 18, 20):
        feats, onehot = load_data(validation_csv, col=col, row=20, m=2)
        target = f"{save_path}/{name1}_{ratio_name}{prefix}_TCRB_PCA{col}"
        np.save(f"{target}_feature_array", feats)
        np.save(f"{target}_label_array", onehot)
def execute_validation(database_list, ratio_name_list, name1_list, base_data_path, base_save_path, prefix='validation'):
    """Encode every (database, ratio, fold) validation file via ``pca_Validation``."""
    for db in database_list:
        db_input = f"{base_data_path}/{db}"
        db_output = f"{base_save_path}/{db}"
        for ratio in ratio_name_list:
            for fold in name1_list:
                pca_Validation(db_input, ratio, fold, db_output, prefix=prefix)
                
                
                
# Encode the 1:1 validation folds of the seen split.
data_path = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/database/benchmark_dataset/10cross_validation/Validation"
save_path = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/DLpTCR/data/pair50/seen/validation"
_seen_validation = {
    'database_list': ['healthy', 'patient', 'Antigen_specificity'],
    'ratio_name_list': ['1_1'],
    'name1_list': ['1', '2', '3', '4', '5'],
    'base_data_path': data_path,
    'base_save_path': save_path,
}
execute_validation(**_seen_validation)
def pca_Validation(validation_path, ratio_name, name1, save_path, prefix='validation'):
    """Encode one validation CSV at PCA widths 10, 18 and 20.

    Same as the preceding definition except that the input file name is
    ``<validation_path>/<name1>_<ratio_name>validation.csv`` (lower-case
    ``v``).  Outputs go to
    ``<save_path>/<name1>_<ratio_name><prefix>_TCRB_PCA<width>_{feature,label}_array``.
    """
    def encode_sequences(seqs: list, row=30, n_features=16):
        # Build one zero-initialised grid per sequence, fill the residue
        # features, then flag the rows beyond the sequence end.
        lookup = aaindex1PCAValues(n_features)
        n_cols = n_features + 1

        def one(seq):
            grid = np.zeros(shape=(row, n_cols))
            for j in range(min(len(seq), row)):
                grid[j, :-1] = lookup[seq[j]]
            grid[len(seq):, -1] = 1
            return grid

        return np.array([one(seq) for seq in seqs])

    def read_table(file, model=1):
        table = pandas.read_csv(file)
        labels = table.Affinity
        cdr3 = table.CDR3B
        epitope = table.Epitope
        count = len(epitope)
        trimmed = model == 1  # trim two leading / one trailing residue
        cdr3_seqs = [cdr3[k][2:-1] if trimmed else cdr3[k] for k in range(count)]
        epit_seqs = [epitope[k] for k in range(count)]
        return cdr3_seqs, epit_seqs, labels

    def build_arrays(test_file, col=20, row=9, m=1):
        cdr3_seqs, epit_seqs, labels = read_table(test_file, m)
        x = np.ndarray(shape=(len(cdr3_seqs), row, col + 1, 2))
        x[..., 0] = encode_sequences(cdr3_seqs, row, col)
        x[..., 1] = encode_sequences(epit_seqs, row, col)
        y = to_categorical(np.array(labels), 2)
        return x, y

    csv_path = f"{validation_path}/{name1}_{ratio_name}validation.csv"
    out_stem = f"{save_path}/{name1}_{ratio_name}{prefix}_TCRB_PCA"

    for width in [10, 18, 20]:
        x, y = build_arrays(csv_path, col=width, row=20, m=2)
        np.save(f"{out_stem}{width}_feature_array", x)
        np.save(f"{out_stem}{width}_label_array", y)
def execute_validation(database_list, ratio_name_list, name1_list, base_data_path, base_save_path, prefix='validation'):
    """Run ``pca_Validation`` for each database, ratio and fold, in that nesting order."""
    for db in database_list:
        for ratio in ratio_name_list:
            for fold in name1_list:
                pca_Validation(
                    f"{base_data_path}/{db}",
                    ratio,
                    fold,
                    f"{base_save_path}/{db}",
                    prefix=prefix,
                )
                

                
# Encode the 1:1 validation folds of the unseen split.
data_path = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/database/benchmark_dataset/10cross_validation/unknow/validation/"
save_path = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/DLpTCR/data/pair50/unseen/validation"
_unseen_databases = ['healthy', 'patient', 'Antigen_specificity']
_unseen_folds = ['1', '2', '3', '4', '5']
execute_validation(
    _unseen_databases,
    ['1_1'],
    _unseen_folds,
    base_data_path=data_path,
    base_save_path=save_path)


# FULL

In [36]:
def FULL_train_main(trainfile_path,testfile_path,test_seq_path,save_model_path,result_path):
    """Train the fully connected PCA-18 model and score it on the test split.

    Every path argument is a string *prefix*; fixed file names are appended
    to it directly (no separator is inserted).

    Args:
        trainfile_path: prefix of ``train_TCRB_PCA18_feature_array.npy`` and
            ``train_TCRB_PCA18_label_array.npy``.
        testfile_path: prefix of ``test_TCRB_PCA18_feature_array.npy`` and
            ``test_TCRB_PCA18_label_array.npy``.
        test_seq_path: CSV with ``Epitope`` and ``CDR3B`` columns; assumed to
            list the pairs in the same order as the test arrays.
        save_model_path: prefix of the checkpoint ``FULL_B_ALL_pca18_1.h5``.
        result_path: prefix of ``FULL_pca18_probability.csv`` (per-pair
            scores) and ``FULL_B_ALL_pca18_result.csv`` (summary metrics).

    Returns:
        None.  Results are written to disk.
    """
    import csv
    import os

    import numpy as np
    import pandas as pd
    import tensorflow as tf
    from sklearn import metrics
    from sklearn.metrics import accuracy_score, matthews_corrcoef, confusion_matrix
    from sklearn.metrics import precision_score, recall_score, f1_score, auc
    from sklearn.utils import shuffle
    from tensorflow.python.keras.callbacks import ModelCheckpoint
    from tensorflow.python.keras.models import load_model

    try:
        import tensorflow.python.keras as keras
    except ImportError:
        import tensorflow.keras as keras

    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

    def FULL_pca18(modelfile, Dropout1=0, Epochs=20, Batch_size=64, PCA_num=18):
        """Train one network, checkpointing the lowest-val_loss epoch to *modelfile*."""
        X_train = np.load(trainfile_path + "train_TCRB_PCA{}_feature_array.npy".format(PCA_num))
        Y_train = np.load(trainfile_path + "train_TCRB_PCA{}_label_array.npy".format(PCA_num))
        X_test = np.load(testfile_path + "test_TCRB_PCA{}_feature_array.npy".format(PCA_num))
        Y_test = np.load(testfile_path + "test_TCRB_PCA{}_label_array.npy".format(PCA_num))

        # Shuffle once here; model.fit itself runs with shuffle=False.
        X_train, Y_train = shuffle(X_train, Y_train)
        X_test, Y_test = shuffle(X_test, Y_test)
        X_train = X_train.reshape([len(X_train), 20, PCA_num + 1, 2])
        X_test = X_test.reshape([len(X_test), 20, PCA_num + 1, 2])
        X_test = tf.cast(X_test, tf.float32)

        model = tf.keras.models.Sequential([
            tf.keras.layers.Flatten(input_shape=(20, PCA_num + 1, 2)),
            tf.keras.layers.Dense(256, activation='relu'),
            tf.keras.layers.Dense(512, activation='relu'),
            tf.keras.layers.Dense(256, activation='relu'),
            tf.keras.layers.Dense(128, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dropout(Dropout1),
            tf.keras.layers.Dense(2, activation='softmax'),
        ])
        model.compile(optimizer="Adam",
                      loss=keras.losses.binary_crossentropy,
                      metrics=['accuracy'])

        # NOTE(review): the checkpoint is selected on the *test* split,
        # which is passed as validation_data below.
        checkpoint = ModelCheckpoint(filepath=modelfile,
                                     monitor='val_loss',
                                     verbose=0,
                                     save_best_only=True)
        history = model.fit(X_train,
                            Y_train,
                            epochs=Epochs,
                            batch_size=Batch_size,
                            verbose=0,
                            validation_data=(X_test, Y_test),
                            shuffle=False,
                            callbacks=[checkpoint])
        return history

    def computing_result(Feature_array, Label_array, model):
        """Score *model*, write the per-pair CSV and return the metrics.

        Returns ``([[TP, FN], [FP, TN]], accuracy, precision, recall, f1,
        MCC, fpr, tpr, roc_auc)``.
        """
        Y_PRED = model.predict(Feature_array)
        # Labels are one-hot with the positive class in column 1, and the
        # AUC below is computed from column 1.  The predicted class and the
        # reported probability must therefore use argmax / column 1 too;
        # the earlier argmin / column 0 inverted every prediction.
        Y_pred2 = np.argmax(Y_PRED, axis=-1)
        Y_test2 = np.argmax(Label_array, axis=-1)

        test = pd.read_csv(test_seq_path)
        df = pd.DataFrame({'Class': test.Epitope, 'CDR3B': test.CDR3B,
                           'y_prob': Y_PRED[:, 1], 'y_pred': Y_pred2,
                           'y_true': Y_test2})
        df.to_csv(result_path + "FULL_pca18_probability.csv", index=False)

        # labels=[0, 1] keeps the matrix 2x2 even if a class is absent.
        cm = confusion_matrix(Y_test2, Y_pred2, labels=[0, 1])
        new_confusion_matrix1 = [[cm[1, 1], cm[1, 0]], [cm[0, 1], cm[0, 0]]]
        accuracy = accuracy_score(Y_test2, Y_pred2)
        precision = precision_score(Y_test2, Y_pred2)
        recall = recall_score(Y_test2, Y_pred2)
        f1 = f1_score(Y_test2, Y_pred2)
        MCC = matthews_corrcoef(Y_test2, Y_pred2)
        fpr, tpr, thresholds = metrics.roc_curve(Label_array[:, 1], Y_PRED[:, 1])
        roc_auc = auc(fpr, tpr)
        return new_confusion_matrix1, accuracy, precision, recall, f1, MCC, fpr, tpr, roc_auc

    # --- training ---------------------------------------------------------
    for model_number in range(1, 2):
        modelfile = save_model_path + 'FULL_B_ALL_pca18_{}.h5'.format(model_number)
        FULL_pca18(modelfile, 0.3, 50, 128, 18)

    # --- evaluation -------------------------------------------------------
    fileHeader = ['model_number', 'dataset', 'TP', 'FN', 'FP', 'TN',
                  'ACC', 'precision', 'recall', 'f1', 'MCC', 'AUC']
    PCA_num = 18
    # The context manager closes the file even if evaluation raises.
    with open(result_path + "FULL_B_ALL_pca18_result.csv", "w", newline='') as csvFile:
        csv_writer = csv.writer(csvFile)
        csv_writer.writerow(fileHeader)
        for model_number in range(1, 2):
            modelfile = save_model_path + 'FULL_B_ALL_pca18_{}.h5'.format(model_number)
            model = load_model(modelfile)
            # Loaded unshuffled so rows line up with test_seq_path.
            X_test = np.load(testfile_path + "test_TCRB_PCA{}_feature_array.npy".format(PCA_num))
            Y_test = np.load(testfile_path + "test_TCRB_PCA{}_label_array.npy".format(PCA_num))
            X_test = X_test.reshape([len(X_test), 20, PCA_num + 1, 2])
            test_CM, accuracy1, precision1, recall1, f11, MCC1, fpr1, tpr1, roc_auc1 = computing_result(X_test, Y_test, model)
            test_row = [model_number, 'TEST',
                        test_CM[0][0], test_CM[0][1],
                        test_CM[1][0], test_CM[1][1],
                        accuracy1, precision1, recall1, f11, MCC1, roc_auc1]
            csv_writer.writerow(test_row)
            del model

In [None]:
# Train one FULL model per ratio, database and fold on the pair50 data.
database = ['healthy', 'patient']
name = ['1_1', '1_2', '1_4', '1_6', '1_8']
name1 = ['1', '2', '3', '4', '5']
for i in name:
    for j in database:
        for k in name1:
            # Train on the arrays of ratio `i`.  The prefix was hard-coded
            # to '_1_1', so every ratio retrained on the same 1:1 data while
            # saving under a per-ratio model name.
            trainfile_path = "./data/pair50/seen/test/" + j + '/' + k + '_' + i
            # Test arrays are used at ratio 1_1 only.
            testfile_path = "./data/pair50/seen/test/" + j + '/' + k + '_1_1'
            test_seq_path = '/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/database/benchmark_dataset/10cross_validation/pair50/' + j + '/' + k + '_' + '1_1test.csv'
            save_model_path = "./model/Retrain/pair50/" + j + '/' + k + '_' + i
            result_path = "./result/pair50/seen/test/" + j + '/' + k + '_' + i
            FULL_train_main(trainfile_path, testfile_path, test_seq_path, save_model_path, result_path)
        
        
# Train one FULL model per ratio and fold for the Antigen_specificity data.
database = ['Antigen_specificity']
name = ['1_1', '1_2', '1_4']
name1 = ['1', '2', '3', '4', '5']
for i in name:
    for j in database:
        for k in name1:
            # Train on the arrays of ratio `i` (previously hard-coded to
            # '_1_1', so all ratios trained on identical data).
            trainfile_path = "./data/pair50/seen/test/" + j + '/' + k + '_' + i
            # Read the test arrays from pair50, matching test_seq_path and
            # the healthy/patient loop; this previously pointed at
            # './data/repeat10/...'.
            testfile_path = "./data/pair50/seen/test/" + j + '/' + k + '_1_1'
            test_seq_path = '/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/database/benchmark_dataset/10cross_validation/pair50/' + j + '/' + k + '_' + '1_1test.csv'
            save_model_path = "./model/Retrain/pair50/" + j + '/' + k + '_' + i
            result_path = "./result/pair50/seen/test/" + j + '/' + k + '_' + i
            FULL_train_main(trainfile_path, testfile_path, test_seq_path, save_model_path, result_path)
            
            
import pandas as pd
# Train on each pair300 subset and always test on the "more300" subset.
database = ['healthy', 'patient', 'Antigen_specificity']
name = ['1_1']
pair = ['more300', '300', '200', '100', '10']
name1 = ['1', '2', '3', '4', '5']
_seq_root = '/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/database/benchmark_dataset/10cross_validation/pair300/more300'
for i in name:
    for j in database:
        for k in pair:
            for l in name1:
                trainfile_path = f"./data/pair300/{k}/{j}/{l}_1_1"
                testfile_path = f"./data/pair300/more300/{j}/{l}_1_1"
                test_seq_path = f"{_seq_root}/{j}/{l}_1_1test.csv"
                save_model_path = f"./model/Retrain/pair300/{k}/{j}/{l}_1_1"
                result_path = f"./result/Retrain/pair300/seen/{k}/{j}/{l}_1_1"
                FULL_train_main(trainfile_path, testfile_path, test_seq_path, save_model_path, result_path)

# validation

In [41]:
import os
# Switch to the DLpTCR project root; the validation cells below use paths
# relative to it ('./data/...', './model/...', './result/...').
os.chdir("/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/DLpTCR/")

In [56]:
import psutil
import time
import pandas as pd
def validation_main(testfile_path,modelfile_path,test_seq_path,result_path):
    """Run a saved FULL (PCA18) model on one data split and write per-row predictions.

    Args:
        testfile_path: Prefix of the input arrays; ``_TCRB_PCA18_feature_array.npy``
            and ``_TCRB_PCA18_label_array.npy`` are appended to it.
        modelfile_path: Prefix of the saved model; ``FULL_B_ALL_pca18_1.h5`` is appended.
        test_seq_path: CSV whose ``Epitope`` and ``CDR3B`` columns are copied into the
            output (assumes its rows are in the same order as the arrays — TODO confirm).
        result_path: Prefix of the output file; ``FULL_pca18_probability.csv`` is appended.

    Returns:
        None. The predictions are written to disk.
    """
    from tensorflow.python.keras import backend as K
    import tensorflow.compat.v1 as tf
    from tensorflow.python.keras.models import load_model
    import numpy as np
    model_number=1
    PCA_num=18

    # Reset Keras/TF state BEFORE loading the model. Resetting after load_model (as
    # was done previously) throws away the graph the freshly loaded model belongs to
    # when running in graph mode, and still lets state pile up across repeated calls.
    K.clear_session()
    tf.reset_default_graph()

    modelfile = modelfile_path+'FULL_B_ALL_pca18_{}.h5'.format(model_number)
    model = load_model(modelfile)
    X_TEST = np.load(testfile_path+"_TCRB_PCA{}_feature_array.npy".format(PCA_num))
    Y_TEST = np.load(testfile_path+"_TCRB_PCA{}_label_array.npy".format(PCA_num))
    X_TEST = X_TEST.reshape([len(X_TEST),20,PCA_num+1,2])

    Y_PRED = model.predict(X_TEST)
    # NOTE(review): y_pred is the argmin of the model output while y_true is the argmax
    # of the label array, so the two columns use opposite class-index conventions unless
    # that is deliberate (y_prob is output column 0) — confirm against how labels were encoded.
    Y_pred2 = np.argmin(Y_PRED, axis=-1)
    Y_test2 = np.argmax(Y_TEST, axis=-1)

    test=pd.read_csv(test_seq_path)
    df = pd.DataFrame({'Class':test.Epitope,'CDR3B':test.CDR3B,'y_prob':Y_PRED[:, 0],'y_pred': Y_pred2, 'y_true': Y_test2})
    df.to_csv(result_path+"FULL_pca18_probability.csv", index=False)


In [None]:
# Score every retrained pair50 model on the "seen" validation split.
# Each cohort group is paired with the ratio names for which models were trained;
# the groups are processed in the order listed.
name1 = ['1', '2', '3', '4', '5']
bench_root = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/database/benchmark_dataset/10cross_validation/Validation/"
for database, name in (
    (['healthy', 'patient'], ['1_1', '1_2', '1_4', '1_6', '1_8']),
    (['Antigen_specificity'], ['1_1', '1_2', '1_4']),
):
    for i in name:
        for j in database:
            for k in name1:
                # The validation arrays always use the "_1_1" prefix; only the model and
                # result prefixes depend on the ratio name `i`.
                testfile_path = "./data/pair50/seen/validation/{}/{}_1_1validation".format(j, k)
                modelfile_path = "./model/Retrain/pair50/{}/{}_{}".format(j, k, i)
                test_seq_path = bench_root + "{}/{}_1_1Validation.csv".format(j, k)
                result_path = "./result/pair50/seen/validation/{}/{}_{}".format(j, k, i)
                validation_main(testfile_path, modelfile_path, test_seq_path, result_path)

# unseen

In [57]:
# Score every retrained pair50 model on the "unseen" test split and then on the
# "unseen" validation split. For each split the healthy/patient cohorts are processed
# first, followed by Antigen_specificity.
name1 = ['1', '2', '3', '4', '5']
bench_root = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/database/benchmark_dataset/10cross_validation/"
for split, seq_dir in (("test", "unknow/"), ("validation", "unknow/validation/")):
    for database, name in (
        (['healthy', 'patient'], ['1_1', '1_2', '1_4', '1_6', '1_8']),
        (['Antigen_specificity'], ['1_1', '1_2', '1_4']),
    ):
        for i in name:
            for j in database:
                for k in name1:
                    # Array and CSV prefixes always use "_1_1"; only the model and result
                    # prefixes depend on the ratio name `i`.
                    testfile_path = "./data/pair50/unseen/{0}/{1}/{2}_1_1{0}".format(split, j, k)
                    modelfile_path = "./model/Retrain/pair50/{}/{}_{}".format(j, k, i)
                    test_seq_path = bench_root + seq_dir + "{}/{}_1_1{}.csv".format(j, k, split)
                    result_path = "./result/pair50/unseen/{}/{}/{}_{}".format(split, j, k, i)
                    validation_main(testfile_path, modelfile_path, test_seq_path, result_path)

# RESNET_pca10

In [58]:
def train_main(trainfile_path,testfile_path,test_seq_path,save_model_path,result_path):
    """Train the PCA10 attention-ResNet, then score its saved checkpoint on the test arrays.

    Args:
        trainfile_path: Prefix of the training arrays; ``train_TCRB_PCA10_feature_array.npy``
            and ``train_TCRB_PCA10_label_array.npy`` are appended to it.
        testfile_path: Prefix of the test arrays (``test_TCRB_PCA10_feature_array.npy`` /
            ``test_TCRB_PCA10_label_array.npy``). They are used both as validation data
            while fitting and as the evaluation set afterwards.
        test_seq_path: CSV whose ``Epitope`` and ``CDR3B`` columns are written next to
            the predictions (assumes the same row order as the test arrays — TODO confirm).
        save_model_path: Prefix of the checkpoint; ``RESNET_B_ALL_test_pca10_1.h5`` is appended.
        result_path: Prefix of the two outputs, ``RESNET_pca10_probability.csv`` and
            ``RESNET_pca10_result.csv``.

    Returns:
        None. The checkpoint and both CSV files are written to disk.

    NOTE(review): the checkpoint is chosen by ``val_loss`` on the test arrays, and the
    reported metrics are computed on those same arrays.
    """
    import csv
    import os

    import numpy as np
    import pandas as pd
    import tensorflow.compat.v1 as tf
    try:
        import tensorflow.python.keras as keras
    except ImportError:
        import tensorflow.keras as keras
    from sklearn import metrics
    from sklearn.metrics import accuracy_score,matthews_corrcoef,confusion_matrix,precision_score,recall_score
    from sklearn.metrics import f1_score, auc
    from sklearn.utils import shuffle
    from tensorflow.keras import models
    from tensorflow.keras.regularizers import l2
    from tensorflow.python.keras import backend as K
    from tensorflow.python.keras import layers
    from tensorflow.python.keras.callbacks import ReduceLROnPlateau,LearningRateScheduler, ModelCheckpoint
    from tensorflow.python.keras.models import load_model

    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # show errors and warnings only ('3' would show errors only)
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # comment this line out to run on CPU; keep it to use GPU 0

    def RESNET_pca10(model_number,modelfile,Epochs= 20,Batch_size=32,PCA_num = 10):
        """Load the train/test arrays and fit one model, checkpointing to ``modelfile``."""
        X_train = np.load(trainfile_path+"train_TCRB_PCA{}_feature_array.npy".format(PCA_num))
        Y_train = np.load(trainfile_path+"train_TCRB_PCA{}_label_array.npy".format(PCA_num))
        X_test = np.load(testfile_path+"test_TCRB_PCA{}_feature_array.npy".format(PCA_num))
        Y_test = np.load(testfile_path+"test_TCRB_PCA{}_label_array.npy".format(PCA_num))
        # Features and labels are shuffled together so the pairs stay aligned.
        X_train,Y_train = shuffle(X_train,Y_train)
        X_test,Y_test = shuffle(X_test,Y_test)
        X_train = X_train.reshape([len(X_train),20,PCA_num+1,2])
        X_test = X_test.reshape([len(X_test),20,PCA_num+1,2])
        X_test = tf.cast(X_test, tf.float32)
        resnet_attention_train_predict(20, PCA_num ,model_number,modelfile,2,X_train,Y_train,X_test,Y_test,Epochs,Batch_size)

    def lr_schedule(epoch):
        """Return the learning rate for ``epoch``: 1e-3 decayed by a factor 0.9 per epoch."""
        lr = 1e-3
        # Fixed: this used to be lr*0.9*epoch, which yields a learning rate of 0 for the
        # first epoch and then grows linearly with the epoch index.
        return lr * 0.9 ** epoch

    def resnet_layer(inputs, num_filters, kernel_size=3, strides=1,
                     activation='relu', batch_normalization=True, conv_first=True):
        """Apply Conv2D with optional BatchNormalization/Activation.

        With ``conv_first`` the order is conv -> BN -> activation, otherwise
        BN -> activation -> conv.
        """
        conv = layers.Conv2D(num_filters, kernel_size=kernel_size, strides=strides,
                             padding='same', kernel_initializer='he_normal',
                             kernel_regularizer=l2(1e-4))
        x = inputs
        if conv_first:
            x = conv(x)
            if batch_normalization:
                x = layers.BatchNormalization()(x)
            if activation is not None:
                x = layers.Activation(activation)(x)
        else:
            if batch_normalization:
                x = layers.BatchNormalization()(x)
            if activation is not None:
                x = layers.Activation(activation)(x)
            x = conv(x)
        return x

    def resnet_v1(input_shape, depth, num_classes=2):
        """Build the residual network with a channel-weighting head.

        Raises:
            ValueError: if ``depth`` is not of the form 6n+2.
        """
        if (depth-2)%6 != 0:
            raise ValueError('depth should be 6n+2')
        # Start model definition.
        num_filters = 32
        num_res_blocks = (depth-2)//6
        inputs = tf.keras.Input(shape=input_shape)
        x = resnet_layer(inputs, num_filters)
        # Instantiate the stack of residual units.
        for stack in range(3):
            for res_block in range(num_res_blocks):
                # First block of every stack except the first one downsamples.
                downsample = stack > 0 and res_block == 0
                strides = 2 if downsample else 1
                y = resnet_layer(x, num_filters, strides=strides)
                y = resnet_layer(y, num_filters, activation=None)
                if downsample:
                    # Linear projection of the shortcut so its dimensions match `y`.
                    x = resnet_layer(x, num_filters, kernel_size=1, strides=strides,
                                     activation=None, batch_normalization=False)
                x = layers.add([x, y])
                x = layers.Activation('relu')(x)
            num_filters *= 2
        # num_filters was doubled once more after the last stack, so num_filters//2 is
        # the channel count of `x`: the softmax vector below holds one weight per channel.
        ax = layers.GlobalAveragePooling2D()(x)
        ax = layers.Dense(num_filters//8, activation='relu')(ax)
        ax = layers.Dense(num_filters//2, activation='softmax')(ax)
        ax = layers.Reshape((1,1,num_filters//2))(ax)
        ax = layers.Multiply()([ax, x])
        y = layers.Flatten()(ax)
        outputs = layers.Dense(num_classes, activation='softmax',
                               kernel_initializer='he_normal')(y)
        # Instantiate model
        model = models.Model(inputs=inputs, outputs=outputs)
        return model

    def resnet_attention_train_predict(row, PCA_num,model_number, modelfile, m, x_train, y_train,x_test, y_test,Epochs,Batch_size):
        """Compile and fit the network, saving the lowest-``val_loss`` model to ``modelfile``."""
        model = resnet_v1(input_shape=(row, PCA_num+1, m), depth=20, num_classes=2)
        model.compile(optimizer="Adam",
                  loss=keras.losses.binary_crossentropy,
                  metrics=['accuracy'])
        # NOTE(review): LearningRateScheduler sets the rate at the start of every epoch,
        # so it presumably overrides whatever ReduceLROnPlateau lowered — confirm both are wanted.
        lr_scheduler = LearningRateScheduler(lr_schedule)
        lr_reducer = ReduceLROnPlateau(factor=np.sqrt(0.1), cooldown=0, patience=5,
                                       min_lr=0.5e-6)
        checkpoint = ModelCheckpoint(filepath=modelfile, monitor='val_loss',
                                    verbose=0, save_best_only=True)
        cbs = [checkpoint, lr_reducer, lr_scheduler]
        model.fit(x_train, y_train,
                      batch_size=Batch_size,
                      epochs=Epochs,
                      verbose=0,
                      validation_data=(x_test, y_test),
                      shuffle=False,
                      callbacks=cbs)

    def computing_result(Feature_array,Label_array,model):
        """Predict with ``model``, write the probability CSV and return the metrics.

        Returns:
            ``([[TP, FN], [FP, TN]], accuracy, precision, recall, f1, MCC, fpr, tpr, roc_auc)``
        """
        X_TEST = Feature_array
        Y_TEST = Label_array
        Y_PRED = model.predict(X_TEST)
        # NOTE(review): predictions use argmin while the truth uses argmax, so the two
        # use opposite class-index conventions unless that is deliberate — confirm
        # against how the label arrays were encoded.
        Y_pred2 = np.argmin(Y_PRED, axis=-1)
        Y_test2 = np.argmax(Y_TEST, axis=-1)
        test=pd.read_csv(test_seq_path)
        df = pd.DataFrame({'Class':test.Epitope,'CDR3B':test.CDR3B,'y_prob':Y_PRED[:, 0],'y_pred': Y_pred2, 'y_true': Y_test2})
        df.to_csv(result_path+"RESNET_pca10_probability.csv", index=False)

        # labels=[0, 1] keeps the matrix 2x2 even when a class is missing from the split.
        confusion_matrix1 = confusion_matrix(Y_test2,Y_pred2,labels=[0, 1])
        new_confusion_matrix1 = [[confusion_matrix1[1,1],confusion_matrix1[1,0]],[confusion_matrix1[0,1],confusion_matrix1[0,0]]]
        accuracy = accuracy_score(Y_test2,Y_pred2)
        precision = precision_score(Y_test2,Y_pred2)
        recall = recall_score(Y_test2,Y_pred2)
        f1= f1_score(Y_test2,Y_pred2)
        MCC = matthews_corrcoef(Y_test2,Y_pred2)
        fpr, tpr, thresholds = metrics.roc_curve(Y_TEST[:,1], Y_PRED[:,1])
        roc_auc = auc(fpr, tpr)
        return new_confusion_matrix1,accuracy,precision,recall,f1,MCC,fpr,tpr,roc_auc

    PCA_num=10

    # Phase 1: train and checkpoint.
    for model_number in range(1,2):
        modelfile =save_model_path+'RESNET_B_ALL_test_pca10_{}.h5'.format(model_number)
        RESNET_pca10(model_number,modelfile,20,32,10)

    # Phase 2: reload the checkpoint and write one metrics row per model.
    fileHeader =['model_number','dataset','TP','FN','FP','TN','ACC','precision','recall','f1','MCC','AUC']
    with open(result_path+"RESNET_pca10_result.csv", "w" , newline='') as csvFile:
        csv_writer = csv.writer(csvFile)
        csv_writer.writerow(fileHeader)
        for model_number in range(1,2):
            modelfile = save_model_path+'RESNET_B_ALL_test_pca10_{}.h5'.format(model_number)
            # Drop the training graph/state before loading; resetting after load_model
            # would discard the graph the loaded model lives in when in graph mode.
            K.clear_session()
            tf.reset_default_graph()
            model = load_model(modelfile)
            X_test = np.load(testfile_path+"test_TCRB_PCA{}_feature_array.npy".format(PCA_num))
            Y_test = np.load(testfile_path+"test_TCRB_PCA{}_label_array.npy".format(PCA_num))
            X_test = X_test.reshape([len(X_test),20,PCA_num+1,2])
            test_CM,accuracy1,precision1,recall1,f11,MCC1,fpr1,tpr1,roc_auc1 = computing_result(X_test,Y_test,model)
            test_row = [model_number,'TEST',
                        test_CM[0][0],test_CM[0][1],
                        test_CM[1][0],test_CM[1][1],
                        accuracy1,precision1,recall1,f11,MCC1,roc_auc1]
            csv_writer.writerow(test_row)
            del model

In [None]:
# Train the PCA10 ResNet for every (ratio name, cohort, fold) combination of the
# pair50 data: healthy/patient first, then Antigen_specificity.
# NOTE(review): the ratio name `i` only appears in the model/result prefixes; the
# training arrays are always read from the "<fold>_1_1" prefix — confirm this is intended.
# NOTE(review): Antigen_specificity reads its test arrays from "repeat10" while the
# sequence CSV comes from "pair50" — confirm both describe the same rows.
name1 = ['1', '2', '3', '4', '5']
bench_root = '/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/database/benchmark_dataset/10cross_validation/pair50/'
for database, name, test_root in (
    (['healthy', 'patient'], ['1_1', '1_2', '1_4', '1_6', '1_8'], "./data/pair50/seen/test/"),
    (['Antigen_specificity'], ['1_1', '1_2', '1_4'], "./data/repeat10/seen/test/"),
):
    for i in name:
        for j in database:
            for k in name1:
                fold_stem = "{}/{}_1_1".format(j, k)
                trainfile_path = "./data/pair50/seen/test/" + fold_stem
                testfile_path = test_root + fold_stem
                test_seq_path = bench_root + fold_stem + "test.csv"
                save_model_path = "./model/Retrain/pair50/{}/{}_{}".format(j, k, i)
                result_path = "./result/pair50/seen/test/{}/{}_{}".format(j, k, i)
                train_main(
                    trainfile_path,
                    testfile_path,
                    test_seq_path,
                    save_model_path,
                    result_path,
                )
            
            
import pandas as pd
# Train the PCA10 ResNet on each pair300 training subset; every run is evaluated
# against the "more300" test arrays and sequence CSV of the same cohort and fold.
database = ['healthy', 'patient', 'Antigen_specificity']
name = ['1_1']
pair = ['more300', '300', '200', '100', '10']
name1 = ['1', '2', '3', '4', '5']
bench_root = '/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/database/benchmark_dataset/10cross_validation/pair300/more300/'
for i in name:  # `i` is not used in any path below; every prefix hard-codes "_1_1"
    for j in database:
        for k in pair:
            for l in name1:
                subset_stem = "{}/{}/{}_1_1".format(k, j, l)
                trainfile_path = "./data/pair300/" + subset_stem
                testfile_path = "./data/pair300/more300/{}/{}_1_1".format(j, l)
                test_seq_path = bench_root + "{}/{}_1_1test.csv".format(j, l)
                save_model_path = "./model/Retrain/pair300/" + subset_stem
                result_path = "./result/Retrain/pair300/seen/" + subset_stem
                train_main(
                    trainfile_path,
                    testfile_path,
                    test_seq_path,
                    save_model_path,
                    result_path,
                )

In [70]:
import psutil
import time
import pandas as pd

def validation_main(testfile_path,modelfile_path,test_seq_path,result_path):
    """Run a saved RESNET (PCA10) model on one data split and write per-row predictions.

    Args:
        testfile_path: Prefix of the input arrays; ``_TCRB_PCA10_feature_array.npy``
            and ``_TCRB_PCA10_label_array.npy`` are appended to it.
        modelfile_path: Prefix of the saved model; ``RESNET_B_ALL_test_pca10_1.h5`` is appended.
        test_seq_path: CSV whose ``Epitope`` and ``CDR3B`` columns are copied into the
            output (assumes its rows are in the same order as the arrays — TODO confirm).
        result_path: Prefix of the output file; ``RESNET_pca10_probability.csv`` is appended.

    Returns:
        None. The predictions are written to disk.
    """
    from tensorflow.python.keras import backend as K
    import tensorflow.compat.v1 as tf
    from tensorflow.python.keras.models import load_model
    import numpy as np
    model_number=1
    PCA_num=10

    # Reset Keras/TF state BEFORE loading the model. Resetting after load_model (as
    # was done previously) throws away the graph the freshly loaded model belongs to
    # when running in graph mode, and still lets state pile up across repeated calls.
    K.clear_session()
    tf.reset_default_graph()

    modelfile = modelfile_path+ 'RESNET_B_ALL_test_pca10_{}.h5'.format(model_number)
    model = load_model(modelfile)
    X_TEST = np.load(testfile_path+"_TCRB_PCA{}_feature_array.npy".format(PCA_num))
    Y_TEST = np.load(testfile_path+"_TCRB_PCA{}_label_array.npy".format(PCA_num))
    X_TEST = X_TEST.reshape([len(X_TEST),20,PCA_num+1,2])

    Y_PRED = model.predict(X_TEST)
    # NOTE(review): y_pred is the argmin of the model output while y_true is the argmax
    # of the label array, so the two columns use opposite class-index conventions unless
    # that is deliberate (y_prob is output column 0) — confirm against how labels were encoded.
    Y_pred2 = np.argmin(Y_PRED, axis=-1)
    Y_test2 = np.argmax(Y_TEST, axis=-1)

    test=pd.read_csv(test_seq_path)
    df = pd.DataFrame({'Class':test.Epitope,'CDR3B':test.CDR3B,'y_prob':Y_PRED[:, 0],'y_pred': Y_pred2, 'y_true': Y_test2})
    df.to_csv(result_path+"RESNET_pca10_probability.csv", index=False)


In [72]:

# Score every retrained pair50 model on three splits, in this order:
# seen/validation, unseen/test, unseen/validation. Within each split the
# healthy/patient cohorts are processed first, followed by Antigen_specificity.
name1 = ['1', '2', '3', '4', '5']
bench_root = "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/database/benchmark_dataset/10cross_validation/"
# (data sub-directory, array-file tag, sequence-CSV sub-directory, sequence-CSV tag)
for data_dir, file_tag, seq_dir, seq_tag in (
    ("seen/validation", "validation", "Validation/", "Validation"),
    ("unseen/test", "test", "unknow/", "test"),
    ("unseen/validation", "validation", "unknow/validation/", "validation"),
):
    for database, name in (
        (['healthy', 'patient'], ['1_1', '1_2', '1_4', '1_6', '1_8']),
        (['Antigen_specificity'], ['1_1', '1_2', '1_4']),
    ):
        for i in name:
            for j in database:
                for k in name1:
                    # Array and CSV prefixes always use "_1_1"; only the model and result
                    # prefixes depend on the ratio name `i`.
                    testfile_path = "./data/pair50/{}/{}/{}_1_1{}".format(data_dir, j, k, file_tag)
                    modelfile_path = "./model/Retrain/pair50/{}/{}_{}".format(j, k, i)
                    test_seq_path = bench_root + seq_dir + "{}/{}_1_1{}.csv".format(j, k, seq_tag)
                    result_path = "./result/pair50/{}/{}/{}_{}".format(data_dir, j, k, i)
                    validation_main(testfile_path, modelfile_path, test_seq_path, result_path)

# CNN_pca20 (CNN on 20 PCA components)

In [63]:
def CNN_train_main(trainfile_path,testfile_path,test_seq_path,save_model_path,result_path):
    """Train the PCA20 CNN, then score its saved checkpoint on the test arrays.

    Args:
        trainfile_path: Prefix of the training arrays; ``train_TCRB_PCA20_feature_array.npy``
            and ``train_TCRB_PCA20_label_array.npy`` are appended to it.
        testfile_path: Prefix of the test arrays (``test_TCRB_PCA20_feature_array.npy`` /
            ``test_TCRB_PCA20_label_array.npy``). They are used both as validation data
            while fitting and as the evaluation set afterwards.
        test_seq_path: CSV whose ``Epitope`` and ``CDR3B`` columns are written next to
            the predictions (assumes the same row order as the test arrays — TODO confirm).
        save_model_path: Prefix of the checkpoint; ``CNN_B_ALL_pca20_plt_1.h5`` is appended.
        result_path: Prefix of the two outputs, ``CNN_pca20_probability.csv`` and
            ``CNN_pca20_result.csv``.

    Returns:
        None. The checkpoint and both CSV files are written to disk.

    NOTE(review): the checkpoint is chosen by ``val_loss`` on the test arrays, and the
    reported metrics are computed on those same arrays.
    """
    import csv
    import os

    import numpy as np
    import pandas as pd
    import tensorflow as tf
    try:
        import tensorflow.python.keras as keras
    except ImportError:
        import tensorflow.keras as keras
    from sklearn import metrics
    from sklearn.metrics import accuracy_score,matthews_corrcoef,confusion_matrix,precision_score,recall_score
    from sklearn.metrics import f1_score, auc
    from sklearn.utils import shuffle
    from tensorflow.python.keras.callbacks import ModelCheckpoint
    from tensorflow.python.keras.models import load_model

    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

    def CNN_pca20(modelfile,Dropout1=0,Epochs= 20,Batch_size=64,PCA_num = 20):
        """Load the arrays, fit the CNN and return the Keras ``History`` object.

        The model with the lowest ``val_loss`` is saved to ``modelfile``.
        """
        X_train = np.load(trainfile_path+"train_TCRB_PCA{}_feature_array.npy".format(PCA_num))
        Y_train = np.load(trainfile_path+"train_TCRB_PCA{}_label_array.npy".format(PCA_num))
        X_test = np.load(testfile_path+"test_TCRB_PCA{}_feature_array.npy".format(PCA_num))
        Y_test = np.load(testfile_path+"test_TCRB_PCA{}_label_array.npy".format(PCA_num))
        # Features and labels are shuffled together so the pairs stay aligned.
        X_train,Y_train = shuffle(X_train,Y_train)
        X_test,Y_test = shuffle(X_test,Y_test)
        X_train = X_train.reshape([len(X_train),20,PCA_num+1,2])
        X_test = X_test.reshape([len(X_test),20,PCA_num+1,2])
        X_test = tf.cast(X_test, tf.float32)
        model = tf.keras.models.Sequential([
            tf.keras.layers.Conv2D(64, (5,5),padding = 'same', input_shape=(20,PCA_num+1,2),activation='relu'),
            tf.keras.layers.AveragePooling2D(2,2),
            tf.keras.layers.Conv2D(128, (3,3),padding = 'same',activation='relu'),
            tf.keras.layers.AveragePooling2D(2,2),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(512,activation='relu'),
            tf.keras.layers.Dense(256,activation='relu'),
            tf.keras.layers.Dense(128,activation='relu'),
            tf.keras.layers.Dense(64,activation='relu'),
            tf.keras.layers.Dropout(Dropout1),
            tf.keras.layers.Dense(2, activation='softmax')
        ])
        model.compile(optimizer="Adam",
                      loss=keras.losses.binary_crossentropy,
                      metrics=['accuracy'])
        checkpoint = ModelCheckpoint(filepath=modelfile,
                                     monitor='val_loss',
                                     verbose=0,
                                     save_best_only=True)
        history = model.fit(X_train,
                            Y_train,
                            epochs= Epochs ,
                            batch_size= Batch_size,
                            verbose=0,
                            validation_data=(X_test, Y_test),
                            shuffle=False,
                            callbacks=[checkpoint])
        return history

    def computing_result(Feature_array,Label_array,model):
        """Predict with ``model``, write the probability CSV and return the metrics.

        Returns:
            ``([[TP, FN], [FP, TN]], accuracy, precision, recall, f1, MCC, fpr, tpr, roc_auc)``
        """
        X_TEST = Feature_array
        Y_TEST = Label_array
        Y_PRED = model.predict(X_TEST)
        # NOTE(review): predictions use argmin while the truth uses argmax, so the two
        # use opposite class-index conventions unless that is deliberate — confirm
        # against how the label arrays were encoded.
        Y_pred2 = np.argmin(Y_PRED, axis=-1)
        Y_test2 = np.argmax(Y_TEST, axis=-1)
        test=pd.read_csv(test_seq_path)
        df = pd.DataFrame({'Class':test.Epitope,'CDR3B':test.CDR3B,'y_prob':Y_PRED[:, 0],'y_pred': Y_pred2, 'y_true': Y_test2})
        df.to_csv(result_path+"CNN_pca20_probability.csv", index=False)

        # labels=[0, 1] keeps the matrix 2x2 even when a class is missing from the split.
        confusion_matrix1 = confusion_matrix(Y_test2,Y_pred2,labels=[0, 1])
        new_confusion_matrix1 = [[confusion_matrix1[1,1],confusion_matrix1[1,0]],[confusion_matrix1[0,1],confusion_matrix1[0,0]]]
        accuracy = accuracy_score(Y_test2,Y_pred2)
        precision = precision_score(Y_test2,Y_pred2)
        recall = recall_score(Y_test2,Y_pred2)
        f1= f1_score(Y_test2,Y_pred2)
        MCC = matthews_corrcoef(Y_test2,Y_pred2)
        fpr, tpr, thresholds = metrics.roc_curve(Y_TEST[:,1], Y_PRED[:,1])
        roc_auc = auc(fpr, tpr)
        return new_confusion_matrix1,accuracy,precision,recall,f1,MCC,fpr,tpr,roc_auc

    # Phase 1: train and checkpoint. The History object is not needed afterwards; the
    # former lookup of history.history['val_accuracy'] was unused and is dropped because
    # it raises KeyError wherever the metric key is named differently.
    for model_number in range(1,2):
        print(model_number)
        modelfile =save_model_path+'CNN_B_ALL_pca20_plt_{}.h5'.format(model_number)
        CNN_pca20(modelfile,0.3,300,128,20)

    # Phase 2: reload the checkpoint and write one metrics row per model.
    fileHeader =['model_number','dataset','TP','FN','FP','TN','ACC','precision','recall','f1','MCC','AUC']
    PCA_num = 20
    with open(result_path+"CNN_pca20_result.csv", "w" , newline='') as csvFile:
        csv_writer = csv.writer(csvFile)
        csv_writer.writerow(fileHeader)
        for model_number in range(1,2):
            modelfile =save_model_path+'CNN_B_ALL_pca20_plt_{}.h5'.format(model_number)
            model = load_model(modelfile)
            X_test = np.load(testfile_path+"test_TCRB_PCA{}_feature_array.npy".format(PCA_num))
            Y_test = np.load(testfile_path+"test_TCRB_PCA{}_label_array.npy".format(PCA_num))
            X_test = X_test.reshape([len(X_test),20,PCA_num+1,2])
            test_CM,accuracy1,precision1,recall1,f11,MCC1,fpr1,tpr1,roc_auc1 = computing_result(X_test,Y_test,model)
            test_row = [model_number,'TEST',
                        test_CM[0][0],test_CM[0][1],
                        test_CM[1][0],test_CM[1][1],
                        accuracy1,precision1,recall1,f11,MCC1,roc_auc1]
            csv_writer.writerow(test_row)
            del model


In [None]:
# Train the PCA20 CNN for every (ratio name, cohort, fold) combination of the
# pair50 data: healthy/patient first, then Antigen_specificity.
# NOTE(review): the ratio name `i` only appears in the model/result prefixes; the
# training arrays are always read from the "<fold>_1_1" prefix — confirm this is intended.
# NOTE(review): Antigen_specificity reads its test arrays from "repeat10" while the
# sequence CSV comes from "pair50" — confirm both describe the same rows.
name1 = ['1', '2', '3', '4', '5']
bench_root = '/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/database/benchmark_dataset/10cross_validation/pair50/'
for database, name, test_root in (
    (['healthy', 'patient'], ['1_1', '1_2', '1_4', '1_6', '1_8'], "./data/pair50/seen/test/"),
    (['Antigen_specificity'], ['1_1', '1_2', '1_4'], "./data/repeat10/seen/test/"),
):
    for i in name:
        for j in database:
            for k in name1:
                fold_stem = "{}/{}_1_1".format(j, k)
                trainfile_path = "./data/pair50/seen/test/" + fold_stem
                testfile_path = test_root + fold_stem
                test_seq_path = bench_root + fold_stem + "test.csv"
                save_model_path = "./model/Retrain/pair50/{}/{}_{}".format(j, k, i)
                result_path = "./result/pair50/seen/test/{}/{}_{}".format(j, k, i)
                CNN_train_main(
                    trainfile_path,
                    testfile_path,
                    test_seq_path,
                    save_model_path,
                    result_path,
                )
            
            
import pandas as pd
# Train the PCA20 CNN on each pair300 training subset; every run is evaluated
# against the "more300" test arrays and sequence CSV of the same cohort and fold.
database = ['healthy', 'patient', 'Antigen_specificity']
name = ['1_1']
pair = ['more300', '300', '200', '100', '10']
name1 = ['1', '2', '3', '4', '5']
bench_root = '/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/database/benchmark_dataset/10cross_validation/pair300/more300/'
for i in name:  # `i` is not used in any path below; every prefix hard-codes "_1_1"
    for j in database:
        for k in pair:
            for l in name1:
                subset_stem = "{}/{}/{}_1_1".format(k, j, l)
                trainfile_path = "./data/pair300/" + subset_stem
                testfile_path = "./data/pair300/more300/{}/{}_1_1".format(j, l)
                test_seq_path = bench_root + "{}/{}_1_1test.csv".format(j, l)
                save_model_path = "./model/Retrain/pair300/" + subset_stem
                result_path = "./result/Retrain/pair300/seen/" + subset_stem
                CNN_train_main(
                    trainfile_path,
                    testfile_path,
                    test_seq_path,
                    save_model_path,
                    result_path,
                )

In [66]:
import pandas as pd
def validation_main(testfile_path,modelfile_path,test_seq_path,result_path):
    """Run a saved CNN on pre-computed PCA features and dump its predictions.

    All four arguments are plain strings. ``testfile_path``,
    ``modelfile_path`` and ``result_path`` are used as *prefixes*: fixed
    suffixes are appended by string concatenation, so no path separator is
    inserted automatically.

    Args:
        testfile_path: Prefix of the ``_TCRB_PCA20_feature_array.npy`` and
            ``_TCRB_PCA20_label_array.npy`` files to load.
        modelfile_path: Prefix of the ``CNN_B_ALL_pca20_plt_1.h5`` model file.
        test_seq_path: CSV file providing the ``Epitope`` and ``CDR3B``
            columns written next to the predictions.
        result_path: Prefix of the ``CNN_pca20_probability.csv`` output file.

    Returns:
        None. The predictions are written to
        ``result_path + "CNN_pca20_probability.csv"`` with the columns
        ``Class``, ``CDR3B``, ``y_prob``, ``y_pred`` and ``y_true``.
    """
    from tensorflow.python.keras import backend as K
    import tensorflow.compat.v1 as tf
    from tensorflow.python.keras.models import load_model
    import numpy as np
    model_number = 1
    PCA_num = 20

    # Reset Keras/TF state BEFORE loading the model. Doing it between
    # load_model() and predict() (as before) tears down the session/graph
    # the freshly loaded model lives in when running in graph mode, and it
    # never releases the previous call's model before a new one is built.
    K.clear_session()
    tf.reset_default_graph()

    modelfile = modelfile_path + 'CNN_B_ALL_pca20_plt_{}.h5'.format(model_number)
    model = load_model(modelfile)

    X_TEST = np.load(testfile_path + "_TCRB_PCA{}_feature_array.npy".format(PCA_num))
    Y_TEST = np.load(testfile_path + "_TCRB_PCA{}_label_array.npy".format(PCA_num))
    # One sample per row, reshaped to (20, PCA_num + 1, 2) for the CNN input.
    X_TEST = X_TEST.reshape([len(X_TEST), 20, PCA_num + 1, 2])

    Y_PRED = model.predict(X_TEST)
    # NOTE(review): the predicted class uses argmin while the true class
    # uses argmax, so for two-column outputs the two columns follow opposite
    # conventions unless the stored label array is encoded accordingly.
    # argmin is at least consistent with y_prob being column 0 (y_pred == 1
    # when column 0 is the larger one) — confirm the label encoding before
    # comparing y_pred against y_true.
    Y_pred2 = np.argmin(Y_PRED, axis=-1)
    Y_test2 = np.argmax(Y_TEST, axis=-1)

    test = pd.read_csv(test_seq_path)
    df = pd.DataFrame({
        'Class': test.Epitope,
        'CDR3B': test.CDR3B,
        'y_prob': Y_PRED[:, 0],
        'y_pred': Y_pred2,
        'y_true': Y_test2,
    })
    df.to_csv(result_path + "CNN_pca20_probability.csv", index=False)


In [67]:

# Evaluate the retrained pair50 models on the "seen" validation split.
# Two configurations are run back to back with the same loop body:
# first healthy/patient with five tags, then Antigen_specificity with three.
# The feature prefix and sequence CSV depend only on dataset j and fold k;
# tag i selects which model prefix is loaded and where results are written.
name1 = ['1', '2', '3', '4', '5']
for name, database in (
    (['1_1', '1_2', '1_4', '1_6', '1_8'], ['healthy', 'patient']),
    (['1_1', '1_2', '1_4'], ['Antigen_specificity']),
):
    for i in name:
        for j in database:
            for k in name1:
                testfile_path = "./data/pair50/seen/validation/{}/{}_1_1validation".format(j, k)
                modelfile_path = "./model/Retrain/pair50/{}/{}_{}".format(j, k, i)
                test_seq_path = (
                    "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/"
                    "database/benchmark_dataset/10cross_validation/Validation/"
                    "{}/{}_1_1Validation.csv".format(j, k)
                )
                result_path = "./result/pair50/seen/validation/{}/{}_{}".format(j, k, i)
                validation_main(testfile_path, modelfile_path, test_seq_path, result_path)
            
# Evaluate the retrained pair50 models on the "unseen" test split.
# Two configurations share one loop body and run in this order:
# healthy/patient with five tags, then Antigen_specificity with three.
# Inputs are keyed by dataset j and fold k; tag i picks the model prefix
# to load and the result prefix to write.
name1 = ['1', '2', '3', '4', '5']
for name, database in (
    (['1_1', '1_2', '1_4', '1_6', '1_8'], ['healthy', 'patient']),
    (['1_1', '1_2', '1_4'], ['Antigen_specificity']),
):
    for i in name:
        for j in database:
            for k in name1:
                testfile_path = "./data/pair50/unseen/test/{}/{}_1_1test".format(j, k)
                modelfile_path = "./model/Retrain/pair50/{}/{}_{}".format(j, k, i)
                test_seq_path = (
                    "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/"
                    "database/benchmark_dataset/10cross_validation/unknow/"
                    "{}/{}_1_1test.csv".format(j, k)
                )
                result_path = "./result/pair50/unseen/test/{}/{}_{}".format(j, k, i)
                validation_main(testfile_path, modelfile_path, test_seq_path, result_path)
            
# Evaluate the retrained pair50 models on the "unseen" validation split.
# Two configurations share one loop body and run in this order:
# healthy/patient with five tags, then Antigen_specificity with three.
# Inputs are keyed by dataset j and fold k; tag i picks the model prefix
# to load and the result prefix to write.
name1 = ['1', '2', '3', '4', '5']
for name, database in (
    (['1_1', '1_2', '1_4', '1_6', '1_8'], ['healthy', 'patient']),
    (['1_1', '1_2', '1_4'], ['Antigen_specificity']),
):
    for i in name:
        for j in database:
            for k in name1:
                testfile_path = "./data/pair50/unseen/validation/{}/{}_1_1validation".format(j, k)
                modelfile_path = "./model/Retrain/pair50/{}/{}_{}".format(j, k, i)
                test_seq_path = (
                    "/home/luyanping/data/TCR_epitope_prediction/Compare_models_same_data/"
                    "database/benchmark_dataset/10cross_validation/unknow/validation/"
                    "{}/{}_1_1validation.csv".format(j, k)
                )
                result_path = "./result/pair50/unseen/validation/{}/{}_{}".format(j, k, i)
                validation_main(testfile_path, modelfile_path, test_seq_path, result_path)