# 1. Original model prediction

In [6]:
import numpy as np
import sys
import pickle
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from tqdm import tqdm
from IPython.display import display
from argparse import ArgumentParser
import src.modules.processor as Processor
import src.modules.model as Model

# Command-line option definitions for the prediction step.
# NOTE(review): nothing visible in this section calls parser.parse_args();
# the function below receives its settings as ordinary arguments instead.
parser = ArgumentParser(
    description="Specifying Input Parameters",
)
parser.add_argument(
    "-te",
    "--testfile_path",
    help="Specify the full path of the file with TCR sequences",
)
parser.add_argument(
    "-mf",
    "--modelfile_path",
    help="Specify the full path of the file with trained model",
)
parser.add_argument(
    "-c",
    "--chain",
    default="ce",
    help="Specify the chain (s) to use (ce, cem). Default: ce",
)
parser.add_argument(
    "-o",
    "--result_path",
    default=sys.stdout,
    help="Specify output file",
)

class Args:
    """Plain attribute holder for the settings of a prediction run.

    Each constructor argument is stored unchanged on the instance under the
    same name: ``testfile_path``, ``modelfile_path``, ``chain`` and
    ``result_path``.
    """

    def __init__(self, testfile_path, modelfile_path, chain, result_path):
        # Input locations: the test data and the trained model.
        self.testfile_path, self.modelfile_path = testfile_path, modelfile_path
        # Which chain(s) to use and where the results go.
        self.chain, self.result_path = chain, result_path

def Original_model_prediction(testfile_path,modelfile_path,result_path,chain):
    """Score a test set with an already trained, pickled model.

    The test CSV is read, its ``CDR3B`` / ``Epitope`` / ``Affinity`` columns
    are renamed to ``CDR3b`` / ``epitope`` / ``binder``, the rows are encoded
    with the ``Processor`` encoder matching ``chain``, and
    ``Model.predicMLModel`` is run on the encoded data.  Afterwards
    ``result_path + 'result.csv'`` is read back (presumably written by
    ``Model.predicMLModel`` -- confirm against that module), its columns are
    renamed to ``y_true`` / ``y_prob`` / ``y_pred`` and the table is saved as
    ``result_path + 'probability.csv'``.

    Args:
        testfile_path: Path of the CSV file holding the test data.
        modelfile_path: Path of the pickled model to load.
        result_path: String prefix for the output files; file names are
            appended to it directly, without a path separator.
        chain: ``"ce"`` (cdr3b+epitope) or ``"cem"`` (cdr3b+epitope+mhc).

    Raises:
        ValueError: If ``chain`` is neither ``"ce"`` nor ``"cem"``.  This is
            checked before any file is opened.
    """
    import pandas as pd

    # Validate up front with a real exception: an `assert` is stripped under
    # `python -O`, and here it also made the explanatory message unreachable.
    if chain not in ("ce", "cem"):
        raise ValueError(
            "Invalid chain. You can select ce (cdr3b+epitope), cem (cdr3b+epitope+mhc)"
        )

    test = pd.read_csv(testfile_path)
    new_column_names = {'CDR3B': 'CDR3b',
                        'Epitope': 'epitope',
                        'Affinity': 'binder'}
    test = test.rename(columns=new_column_names)

    # The two chains differ only in which encoder is applied to the test set.
    if chain == 'ce':
        encode = Processor.dataRepresentationBlosum62WithoutMHCb
    else:
        encode = Processor.dataRepresentationBlosum62WithMHCb
    pX_test, py_test = encode(test), test[["binder"]]

    # SECURITY NOTE: unpickling can execute arbitrary code; only load model
    # files that come from a trusted source.
    with open(modelfile_path, 'rb') as model_file:
        model = pickle.load(model_file)

    print('Evaluating..')
    # The metrics returned by predicMLModel are not used by this function.
    Model.predicMLModel(model, test, pX_test, py_test, result_path)
    print('Done!')

    result = pd.read_csv(result_path + 'result.csv')
    result = result.rename(columns={'binder': 'y_true', 'predict_proba': 'y_prob', 'binder_pred': 'y_pred'})
    result.to_csv(result_path + 'probability.csv')


In [None]:
# Settings for scoring ../data/test.csv with the original epiTCR pickle.
testfile_path = "../data/test.csv"
modelfile_path = "../Original_model/epiTCR.pickle"
# Used as a string prefix: output file names are appended to it directly.
result_path = "../result_path/Original_model_prediction"
# 'ce' selects cdr3b+epitope (no MHC).
chain = 'ce'

Original_model_prediction(
    testfile_path=testfile_path,
    modelfile_path=modelfile_path,
    result_path=result_path,
    chain=chain,
)


# 2. Model retraining

In [21]:
import numpy as np
import sys
import pickle
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from IPython.display import display
import src.modules.processor as Processor
import src.modules.model as Model
from argparse import ArgumentParser


# Command-line option definitions for the retraining step.
# NOTE(review): nothing visible in this section calls parser.parse_args();
# Model_retraining receives its settings as ordinary arguments instead.
parser = ArgumentParser(
    description="Specifying Input Parameters",
)
parser.add_argument(
    "-tr",
    "--trainfile_path",
    help="Specify the full path of the training file with TCR sequences",
)
parser.add_argument(
    "-te",
    "--testfile_path",
    help="Specify the full path of the file with TCR sequences",
)
parser.add_argument(
    "-c",
    "--chain",
    default="ce",
    help="Specify the chain(s) to use (ce, cem). Default: ce",
)
parser.add_argument(
    "-o",
    "--result_path",
    default=sys.stdout,
    help="Specify output file",
)
# A "-sm/--savemodel" option is intentionally not registered here; the
# save location is passed to Model_retraining as `save_modle_path`.

class Args:
    """Plain attribute holder for the settings of a retraining run.

    Each constructor argument is stored unchanged on the instance under the
    same name: ``trainfile_path``, ``testfile_path``, ``chain`` and
    ``result_path``.
    """

    def __init__(self, trainfile_path, testfile_path, chain, result_path):
        # Input locations: the training data and the test data.
        self.trainfile_path, self.testfile_path = trainfile_path, testfile_path
        # Which chain(s) to use and where the results go.
        self.chain, self.result_path = chain, result_path

def Model_retraining(trainfile_path,testfile_path,save_modle_path,result_path,chain):
    """Train a random forest on the training CSV, save it, and score the test CSV.

    Both CSV files are read and their ``CDR3B`` / ``Epitope`` / ``Affinity``
    columns renamed to ``CDR3b`` / ``epitope`` / ``binder``.  The training set
    is encoded with the ``Processor`` "Downsampling" encoder for the chosen
    chain and the test set with the matching "Blosum62" encoder.  A
    ``RandomForestClassifier`` is fitted, stored with ``Model.saveByPickle``
    and evaluated with ``Model.predicMLModel``.  Afterwards
    ``result_path + 'result.csv'`` is read back (presumably written by
    ``Model.predicMLModel`` -- confirm against that module), saved with the
    columns ``y_true`` / ``y_prob`` / ``y_pred`` as
    ``result_path + 'probability.csv'``, and the intermediate ``result.csv``
    is deleted.

    Args:
        trainfile_path: Path of the CSV file holding the training data.
        testfile_path: Path of the CSV file holding the test data.
        save_modle_path: Where the fitted model is pickled.  For ``"cem"`` a
            ``".pickle"`` suffix is appended when the path does not already
            end with one.  (The parameter name is kept as-is for callers.)
        result_path: String prefix for the output files; file names are
            appended to it directly, without a path separator.
        chain: ``"ce"`` (cdr3b+epitope) or ``"cem"`` (cdr3b+epitope+mhc).

    Raises:
        ValueError: If ``chain`` is neither ``"ce"`` nor ``"cem"``.  This is
            checked before any file is opened.
    """
    import os
    import pandas as pd

    # Validate up front with a real exception; an `assert` would be stripped
    # under `python -O` and let an invalid chain fall into the "cem" branch.
    if chain not in ("ce", "cem"):
        raise ValueError(
            "Invalid chain. You can select ce (cdr3b+epitope), cem (cdr3b+epitope+mhc)"
        )

    print('Loading and encoding the dataset..')
    train = pd.read_csv(trainfile_path)
    test = pd.read_csv(testfile_path)
    new_column_names = {'CDR3B': 'CDR3b',
                        'Epitope': 'epitope',
                        'Affinity': 'binder'}
    train = train.rename(columns=new_column_names)
    test = test.rename(columns=new_column_names)

    # Only the classifier and encoders for the requested chain are set up.
    model_path = save_modle_path
    if chain == 'ce':
        # Random Forest - without MHC
        classifier = RandomForestClassifier(bootstrap=False, max_features=15,
                                            n_estimators=300, n_jobs=-1, random_state=42)
        encode_train = Processor.dataRepresentationDownsamplingWithoutMHCb
        encode_test = Processor.dataRepresentationBlosum62WithoutMHCb
    else:
        # Random Forest - with MHC
        classifier = RandomForestClassifier(max_features=20,
                                            n_estimators=300, n_jobs=-1, random_state=42)
        encode_train = Processor.dataRepresentationDownsamplingWithMHCb
        encode_test = Processor.dataRepresentationBlosum62WithMHCb
        # This branch historically appended ".pickle" to the path.  Keep that
        # for extension-less paths, but do not turn "model.pickle" into
        # "model.pickle.pickle", which a later load of the given path misses.
        if not str(model_path).endswith(".pickle"):
            model_path = f"{model_path}.pickle"

    pX_train, py_train = encode_train(train)
    pX_test, py_test = encode_test(test), test[["binder"]]

    print('Training..')
    model = classifier.fit(pX_train, np.ravel(py_train))
    Model.saveByPickle(model, model_path)

    print('Evaluating..')
    # The metrics returned by predicMLModel are not used by this function.
    Model.predicMLModel(model, test, pX_test, py_test, result_path)

    result = pd.read_csv(result_path + 'result.csv')
    result = result.rename(columns={'binder': 'y_true', 'predict_proba': 'y_prob', 'binder_pred': 'y_pred'})
    result.to_csv(result_path + 'probability.csv')
    # The intermediate file is no longer needed once probability.csv exists.
    os.remove(result_path + 'result.csv')

In [None]:
# Settings for retraining on ../data/train.csv and scoring ../data/test.csv.
trainfile_path = "../data/train.csv"
testfile_path = "../data/test.csv"
# Destination of the pickled, retrained model.
save_modle_path = "../Retraining_model/Retraining_model.pickle"
# Used as a string prefix: output file names are appended to it directly.
result_path = "../result_path/Retraining_model_prediction"
# 'ce' selects cdr3b+epitope (no MHC).
chain = 'ce'

Model_retraining(
    trainfile_path=trainfile_path,
    testfile_path=testfile_path,
    save_modle_path=save_modle_path,
    result_path=result_path,
    chain=chain,
)

# 3. Retraining model prediction

In [23]:
import numpy as np
import sys
import pickle
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from tqdm import tqdm
from IPython.display import display
from argparse import ArgumentParser

# Command-line option definitions for predicting with the retrained model.
# NOTE(review): nothing visible in this section calls parser.parse_args();
# the function below receives its settings as ordinary arguments instead.
parser = ArgumentParser(
    description="Specifying Input Parameters",
)
parser.add_argument(
    "-te",
    "--testfile_path",
    help="Specify the full path of the file with TCR sequences",
)
parser.add_argument(
    "-mf",
    "--modelfile_path",
    help="Specify the full path of the file with trained model",
)
parser.add_argument(
    "-c",
    "--chain",
    default="ce",
    help="Specify the chain (s) to use (ce, cem). Default: ce",
)
parser.add_argument(
    "-o",
    "--result_path",
    default=sys.stdout,
    help="Specify output file",
)

class Args:
    """Plain attribute holder for the settings of a prediction run.

    Each constructor argument is stored unchanged on the instance under the
    same name: ``testfile_path``, ``modelfile_path``, ``chain`` and
    ``result_path``.
    """

    def __init__(self, testfile_path, modelfile_path, chain, result_path):
        # Input locations: the test data and the trained model.
        self.testfile_path, self.modelfile_path = testfile_path, modelfile_path
        # Which chain(s) to use and where the results go.
        self.chain, self.result_path = chain, result_path

def Retraining_model_prediction(testfile_path,modelfile_path,result_path,chain):
    """Score a data set with a retrained, pickled model.

    The CSV is read, its ``CDR3B`` / ``Epitope`` / ``Affinity`` columns are
    renamed to ``CDR3b`` / ``epitope`` / ``binder``, the rows are encoded with
    the ``Processor`` encoder matching ``chain``, and ``Model.predicMLModel``
    is run on the encoded data.  Afterwards ``result_path + 'result.csv'`` is
    read back (presumably written by ``Model.predicMLModel`` -- confirm
    against that module), saved with the columns ``y_true`` / ``y_prob`` /
    ``y_pred`` as ``result_path + 'probability.csv'``, and the intermediate
    ``result.csv`` is deleted.

    Args:
        testfile_path: Path of the CSV file holding the data to score.
        modelfile_path: Path of the pickled model to load.
        result_path: String prefix for the output files; file names are
            appended to it directly, without a path separator.
        chain: ``"ce"`` (cdr3b+epitope) or ``"cem"`` (cdr3b+epitope+mhc).

    Raises:
        ValueError: If ``chain`` is neither ``"ce"`` nor ``"cem"``.  This is
            checked before any file is opened.
    """
    import os
    import pandas as pd

    # Validate up front with a real exception: an `assert` is stripped under
    # `python -O`, and here it also made the explanatory message unreachable.
    if chain not in ("ce", "cem"):
        raise ValueError(
            "Invalid chain. You can select ce (cdr3b+epitope), cem (cdr3b+epitope+mhc)"
        )

    test = pd.read_csv(testfile_path)
    new_column_names = {'CDR3B': 'CDR3b',
                        'Epitope': 'epitope',
                        'Affinity': 'binder'}
    test = test.rename(columns=new_column_names)

    # The two chains differ only in which encoder is applied to the data.
    if chain == 'ce':
        encode = Processor.dataRepresentationBlosum62WithoutMHCb
    else:
        encode = Processor.dataRepresentationBlosum62WithMHCb
    pX_test, py_test = encode(test), test[["binder"]]

    # SECURITY NOTE: unpickling can execute arbitrary code; only load model
    # files that come from a trusted source.
    with open(modelfile_path, 'rb') as model_file:
        model = pickle.load(model_file)

    # The metrics returned by predicMLModel are not used by this function.
    Model.predicMLModel(model, test, pX_test, py_test, result_path)
    # Reported for both chains; previously only the "cem" branch printed it.
    print('Done!')

    result = pd.read_csv(result_path + 'result.csv')
    result = result.rename(columns={'binder': 'y_true', 'predict_proba': 'y_prob', 'binder_pred': 'y_pred'})
    result.to_csv(result_path + 'probability.csv')
    # The intermediate file is no longer needed once probability.csv exists.
    os.remove(result_path + 'result.csv')

In [None]:
# Settings for scoring ../data/Validation.csv with the retrained model.
testfile_path = "../data/Validation.csv"
modelfile_path = "../Retraining_model/Retraining_model.pickle"
# Used as a string prefix: output file names are appended to it directly.
result_path = "../result_path/Retraining_model_prediction"
# 'ce' selects cdr3b+epitope (no MHC).
chain = 'ce'

Retraining_model_prediction(
    testfile_path=testfile_path,
    modelfile_path=modelfile_path,
    result_path=result_path,
    chain=chain,
)
