# 2.Retraining_model

In [1]:
import sys
import os
import pandas as pd
from DeepTCR.DeepTCR import DeepTCR_SS

def fix_train(data_ori,trainfile_out):
    """Write one CSV of positive beta-chain sequences per epitope for training.

    Reads ``<data_ori>train.csv`` (expected columns: ``CDR3B``, ``Epitope``,
    ``Affinity``), keeps only the rows whose ``Affinity`` equals 1, and for
    every epitope writes ``<trainfile_out>train/<epitope>/<epitope>.csv``
    containing a single ``beta`` column.

    Both arguments are used as raw string prefixes (no separator is inserted),
    the same convention used by the other helpers in this notebook and by
    ``Model_retraining`` when it loads ``trainfile_path + 'train'``. Pass a
    directory with its trailing slash, e.g. ``"../data/"``.

    Parameters
    ----------
    data_ori : str
        Prefix in front of ``train.csv``.
    trainfile_out : str
        Prefix in front of the generated ``train/`` folder tree.
    """
    data = pd.read_csv(data_ori + 'train.csv')
    # Source columns: CDR3B / Epitope / Affinity -> beta / epitope / label.
    data = data.rename(columns={'CDR3B':'beta','Epitope':'epitope','Affinity':'label'})
    df_all = data[['beta', 'epitope','label']]
    positives = df_all.loc[df_all.label == 1]

    for epitope, group in positives.groupby('epitope'):
        # No leading "/" before "train": the prefix already carries its own
        # separator, and the training loader reads `trainfile_path + 'train'`.
        output_dir = f"{trainfile_out}train/{epitope}/"
        os.makedirs(output_dir, exist_ok=True)
        beta_only = group[['beta']]
        # Every sequence is written twice, as in the original pipeline.
        # NOTE(review): presumably to guarantee enough rows per class for the
        # later train/valid/test split - confirm before changing.
        doubled = pd.concat([beta_only, beta_only])
        doubled.to_csv(os.path.join(output_dir, f"{epitope}.csv"), index=False)
    
def fix_test(data_ori,trainfile_out):
    """Prepare the test set: one grouped label file plus one CSV per epitope.

    Reads ``<data_ori>test.csv`` (expected columns: ``CDR3B``, ``Epitope``,
    ``Affinity``) and writes:

    * ``<trainfile_out>test_group.csv`` - the original rows reordered so that
      rows of the same epitope are contiguous (groups in pandas ``groupby``
      key order, original row order kept inside each group; rows with a
      missing ``Epitope`` are not part of any group and are left out).
      ``Model_retraining`` later pairs these rows positionally with the
      predictions.
    * ``<trainfile_out>test/<epitope>/<epitope>.csv`` - the ``beta`` column of
      every row of that epitope.

    Both arguments are raw string prefixes; no path separator is inserted.

    Parameters
    ----------
    data_ori : str
        Prefix in front of ``test.csv``.
    trainfile_out : str
        Prefix in front of the generated files and folders.
    """
    data = pd.read_csv(data_ori + 'test.csv')

    # Concatenate all groups once instead of growing a frame inside the loop.
    per_epitope = [group for _, group in data.groupby('Epitope')]
    # With no groups, keep the header so the file can still be read back.
    grouped = pd.concat(per_epitope) if per_epitope else data.iloc[0:0]
    grouped.to_csv(f"{trainfile_out}test_group.csv", index=False)

    # Source columns: CDR3B / Epitope / Affinity -> beta / epitope / label.
    renamed = data.rename(columns={'CDR3B':'beta','Epitope':'epitope','Affinity':'label'})
    df = renamed[['beta', 'epitope','label']]
    for epitope, group in df.groupby('epitope'):
        output_dir = f"{trainfile_out}test/{epitope}/"
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, f"{epitope}.csv")
        group[['beta']].to_csv(output_path, index=False)


def Model_retraining(trainfile_path, testfile_path, save_models_path, resultfile_path):
    """Retrain a DeepTCR_SS model on the training set and score the test set.

    Steps:

    1. ``fix_train`` / ``fix_test`` turn ``train.csv`` and ``test.csv`` into
       the per-epitope folder layout loaded below.
    2. A ``DeepTCR_SS`` model named ``save_models_path`` is trained on
       ``<trainfile_path>train``.
    3. The beta sequences loaded from ``<testfile_path>test`` are scored with
       ``Sequence_Inference``.
    4. The predictions are paired row by row with
       ``<testfile_path>test_group.csv`` and written to
       ``<resultfile_path>probability.csv`` with the columns
       ``Epitope, CDR3B, y_true, y_pred, y_prob``.

    All path arguments are raw string prefixes; no separator is inserted
    between the prefix and the appended name.

    Parameters
    ----------
    trainfile_path : str
        Prefix of ``train.csv`` and of the generated ``train`` folder.
    testfile_path : str
        Prefix of ``test.csv``, ``test_group.csv`` and the ``test`` folder.
    save_models_path : str
        Name handed to ``DeepTCR_SS`` for both the training and test objects.
    resultfile_path : str
        Prefix of the output file ``probability.csv``.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of rows in
        ``test_group.csv``; the two are joined positionally, so a mismatch
        would silently attach predictions to the wrong sequences.
    """
    fix_train(trainfile_path, trainfile_path)
    fix_test(testfile_path, testfile_path)

    # --- training -----------------------------------------------------------
    DTCR_SS_train = DeepTCR_SS(save_models_path)
    print('load training data....')
    DTCR_SS_train.Get_Data(directory=trainfile_path+'train',Load_Prev_Data=False,aggregate_by_aa=True,
                   aa_column_beta=0,sep=',',n_jobs=20)
    DTCR_SS_train.Get_Train_Valid_Test(test_size=0.5)
    print('training model....')
    DTCR_SS_train.Train(stop_criterion = 0.01)

    # --- inference on the test set -------------------------------------------
    DTCR_SS_test = DeepTCR_SS(save_models_path)
    print('load test data....')
    DTCR_SS_test.Get_Data(directory=testfile_path+'test',Load_Prev_Data=False,aggregate_by_aa=True,
                   aa_column_beta=0,sep=',',n_jobs=20)
    beta = DTCR_SS_test.beta_sequences

    # models=None: per the original author's note, the models are then taken
    # from a default location.
    predict_prob=DTCR_SS_test.Sequence_Inference(alpha_sequences=None, beta_sequences=beta, v_beta=None, d_beta=None, j_beta=None,
                      v_alpha=None, j_alpha=None, p=None,hla=None, batch_size=10000, models=None,return_dist=False)
    df_pre = pd.DataFrame(predict_prob)
    # Probability columns are labelled with the classes seen during training.
    df_pre.columns = list(DTCR_SS_train.classes)

    # --- attach predictions to the ground-truth rows -------------------------
    group_csv = f"{testfile_path}test_group.csv"
    df_truth = pd.read_csv(group_csv)
    df_truth = df_truth[['Epitope', 'CDR3B', 'Affinity']].rename(columns={'Affinity': 'y_true'})

    # The concat below aligns on the default 0..n-1 index, i.e. by position.
    # NOTE(review): a mismatch could occur if Get_Data(aggregate_by_aa=True)
    # collapses duplicate sequences - confirm against the DeepTCR docs.
    if len(df_truth) != len(df_pre):
        raise ValueError(
            f"prediction/label row-count mismatch: {len(df_pre)} predictions "
            f"vs {len(df_truth)} rows in {group_csv}"
        )

    df_pre_final = pd.concat([df_truth, pd.DataFrame({
        'Predicted_Epitope': df_pre.idxmax(axis=1),
        'y_prob': df_pre.max(axis=1)
    })], axis=1)

    # 1.0 when the top-scoring epitope equals the row's epitope, else 0.0
    # (kept as float, which is what filling a new column through .loc yields).
    df_pre_final['y_pred'] = (df_pre_final['Epitope'] == df_pre_final['Predicted_Epitope']).astype(float)
    df_pre_final = df_pre_final[['Epitope', 'CDR3B','y_true','y_pred','y_prob']]

    # Create the output folder up front so the run does not fail at the very
    # last step after training has already completed.
    out_path = f'{resultfile_path}probability.csv'
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df_pre_final.to_csv(out_path, index=False)
    print('done saving!')

In [None]:
# Inputs and outputs of the retraining run. Every value is a raw string
# prefix: the code appends names such as 'train.csv' or 'probability.csv'
# directly, so directory prefixes need their trailing slash.
trainfile_path = "../data/"
testfile_path = "../data/"
save_models_path = "../Retraining_model/Retraining_model"
# NOTE(review): without a trailing slash the result file becomes
# "../result_path/Retraining_model_predictionprobability.csv", the same file
# the prediction cell further down writes to - confirm this is intended.
result_path = "../result_path/Retraining_model_prediction"

Model_retraining(
    trainfile_path=trainfile_path,
    testfile_path=testfile_path,
    save_models_path=save_models_path,
    resultfile_path=result_path,
)

# 3.Retraining_model_prediction

In [3]:
import sys
import os
import pandas as pd
from DeepTCR.DeepTCR import DeepTCR_SS
def fix_validation(data_ori,trainfile_out):
    """Prepare the validation set: a grouped label file plus one CSV per epitope.

    Reads ``<data_ori>Validation.csv`` (expected columns: ``CDR3B``,
    ``Epitope``, ``Affinity``) and writes:

    * ``<trainfile_out>validation_group.csv`` - the original rows reordered so
      that rows of the same epitope are contiguous (groups in pandas
      ``groupby`` key order, original row order kept inside each group; rows
      with a missing ``Epitope`` are not part of any group and are left out).
    * ``<trainfile_out>validation/<epitope>/<epitope>.csv`` - the ``beta``
      column of every row of that epitope.

    Both arguments are raw string prefixes; no path separator is inserted.

    Parameters
    ----------
    data_ori : str
        Prefix in front of ``Validation.csv``.
    trainfile_out : str
        Prefix in front of the generated files and folders.
    """
    data = pd.read_csv(data_ori + 'Validation.csv')

    # Concatenate all groups once instead of growing a frame inside the loop.
    per_epitope = [group for _, group in data.groupby('Epitope')]
    # With no groups, keep the header so the file can still be read back.
    grouped = pd.concat(per_epitope) if per_epitope else data.iloc[0:0]
    grouped.to_csv(f"{trainfile_out}validation_group.csv", index=False)

    # Source columns: CDR3B / Epitope / Affinity -> beta / epitope / label.
    renamed = data.rename(columns={'CDR3B':'beta','Epitope':'epitope','Affinity':'label'})
    df = renamed[['beta', 'epitope','label']]
    for epitope, group in df.groupby('epitope'):
        output_dir = f"{trainfile_out}validation/{epitope}/"
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, f"{epitope}.csv")
        group[['beta']].to_csv(output_path, index=False)

def Retraining_model_prediction(trainfile_path, testfile_path, result_path_all, resultfile_path):
    """Score the validation set with a previously retrained DeepTCR_SS model.

    Steps:

    1. ``fix_validation`` turns ``<testfile_path>Validation.csv`` into
       ``<testfile_path>validation/`` and ``<testfile_path>validation_group.csv``.
    2. The training folder ``<trainfile_path>train`` is loaded only to recover
       the class names used to label the probability columns.
    3. The beta sequences loaded from ``<testfile_path>validation`` are scored
       with ``Sequence_Inference``.
    4. The predictions are paired row by row with ``validation_group.csv`` and
       written to ``<resultfile_path>probability.csv`` with the columns
       ``Epitope, CDR3B, y_true, y_pred, y_prob``.

    All path arguments are raw string prefixes; no separator is inserted
    between the prefix and the appended name.

    Parameters
    ----------
    trainfile_path : str
        Prefix of the ``train`` folder (must already exist).
    testfile_path : str
        Prefix of ``Validation.csv`` and of the files generated from it.
    result_path_all : str
        Name handed to ``DeepTCR_SS``.
        NOTE(review): presumably the same name the model was trained under -
        confirm.
    resultfile_path : str
        Prefix of the output file ``probability.csv``.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of rows in
        ``validation_group.csv``; the two are joined positionally, so a
        mismatch would silently attach predictions to the wrong sequences.
    """
    # Generate the validation folder and grouped label file read below; without
    # this call nothing in the notebook produces them on a fresh run.
    fix_validation(testfile_path, testfile_path)

    DTCR_SS_train = DeepTCR_SS(result_path_all)
    print('load training data....')
    DTCR_SS_train.Get_Data(directory=trainfile_path+'train',Load_Prev_Data=False,aggregate_by_aa=True,
                   aa_column_beta=0,sep=',',n_jobs=20)

    # --- inference on the validation set --------------------------------------
    DTCR_SS_test = DeepTCR_SS(result_path_all)
    print('load test data....')
    # Lower-case 'validation' matches the folder written by fix_validation.
    DTCR_SS_test.Get_Data(directory=testfile_path+'validation',Load_Prev_Data=False,aggregate_by_aa=True,
                   aa_column_beta=0,sep=',',n_jobs=20)
    beta = DTCR_SS_test.beta_sequences

    # models=None: per the original author's note, the models are then taken
    # from a default location.
    predict_prob=DTCR_SS_test.Sequence_Inference(alpha_sequences=None, beta_sequences=beta, v_beta=None, d_beta=None, j_beta=None,
                      v_alpha=None, j_alpha=None, p=None,hla=None, batch_size=10000, models=None,return_dist=False)
    df_pre = pd.DataFrame(predict_prob)
    # Probability columns are labelled with the classes of the training data.
    df_pre.columns = list(DTCR_SS_train.classes)

    # --- attach predictions to the ground-truth rows -------------------------
    group_csv = f"{testfile_path}validation_group.csv"
    df_truth = pd.read_csv(group_csv)
    df_truth = df_truth[['Epitope', 'CDR3B', 'Affinity']].rename(columns={'Affinity': 'y_true'})

    # The concat below aligns on the default 0..n-1 index, i.e. by position.
    # NOTE(review): a mismatch could occur if Get_Data(aggregate_by_aa=True)
    # collapses duplicate sequences - confirm against the DeepTCR docs.
    if len(df_truth) != len(df_pre):
        raise ValueError(
            f"prediction/label row-count mismatch: {len(df_pre)} predictions "
            f"vs {len(df_truth)} rows in {group_csv}"
        )

    df_pre_final = pd.concat([df_truth, pd.DataFrame({
        'Predicted_Epitope': df_pre.idxmax(axis=1),
        'y_prob': df_pre.max(axis=1)
    })], axis=1)

    # 1.0 when the top-scoring epitope equals the row's epitope, else 0.0
    # (kept as float, which is what filling a new column through .loc yields).
    df_pre_final['y_pred'] = (df_pre_final['Epitope'] == df_pre_final['Predicted_Epitope']).astype(float)
    df_pre_final = df_pre_final[['Epitope', 'CDR3B','y_true','y_pred','y_prob']]

    # Create the output folder up front so the write cannot fail on a missing
    # directory.
    out_path = f'{resultfile_path}probability.csv'
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df_pre_final.to_csv(out_path, index=False)
    print('done saving!')

In [None]:
# Inputs and outputs of the validation run. Every value is a raw string
# prefix: the code appends names such as 'train' or 'probability.csv'
# directly, so directory prefixes need their trailing slash.
trainfile_path = "../data/"
testfile_path = "../data/"
modelfile_path = "../Retraining_model/Retraining_model"
result_path = "../result_path/Retraining_model_prediction"

Retraining_model_prediction(
    trainfile_path=trainfile_path,
    testfile_path=testfile_path,
    result_path_all=modelfile_path,
    resultfile_path=result_path,
)