In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import auc, roc_curve, roc_auc_score, f1_score, precision_recall_curve, accuracy_score, recall_score,  precision_score
from sklearn import preprocessing

In [2]:
# Load the training table; the first CSV column becomes the DataFrame index.
PS_training_df = pd.read_csv('./results/PS_training.csv', index_col=0)

# TD proba: min-max scale the MFI_ratio column (MinMaxScaler with its default
# settings) and store the flattened result as a new 'TD_proba' column.
min_max_scaler = preprocessing.MinMaxScaler()
scaled_mfi_ratio = min_max_scaler.fit_transform(PS_training_df[['MFI_ratio']])
PS_training_df['TD_proba'] = scaled_mfi_ratio.ravel()
PS_training_df

Unnamed: 0,peptide,peptide_num,sample_name,affinity,allele,PG_proba,PS_proba,presentation_percentile,PS_y,BA_proba,...,TAP_y,HLA,MFI_ratio,SD,Source,ID,Sequence,length,TD_y,TD_proba
0,AAAAAAAAAE,13269,sample1,24912.726279,HLA-C*07:01,0.008204,0.004151,62.744674,0,0.064386,...,0,C*07:01,1.38,0.45,Bashirova,HLA:HLA00433,MRVMAPRALLLLLSGGLALTETWACSHSMRYFDTAVSRPGRGEPRF...,366,0,0.000737
1,AAAQFTKLR,11747,sample1,14617.001321,HLA-C*07:01,0.275157,0.017675,13.500897,0,0.113666,...,1,C*07:01,1.38,0.45,Bashirova,HLA:HLA00433,MRVMAPRALLLLLSGGLALTETWACSHSMRYFDTAVSRPGRGEPRF...,366,0,0.000737
2,AAEHSAQVGDP,2430,sample1,28223.131061,HLA-C*07:01,0.004377,0.003630,99.286603,0,0.052855,...,0,C*07:01,1.38,0.45,Bashirova,HLA:HLA00433,MRVMAPRALLLLLSGGLALTETWACSHSMRYFDTAVSRPGRGEPRF...,366,0,0.000737
3,AAGGALGP,13285,sample1,28448.045100,HLA-C*07:01,0.005589,0.003617,99.286603,0,0.052122,...,1,C*07:01,1.38,0.45,Bashirova,HLA:HLA00433,MRVMAPRALLLLLSGGLALTETWACSHSMRYFDTAVSRPGRGEPRF...,366,0,0.000737
4,AAKEWEMW,14316,sample1,23971.646142,HLA-C*07:01,0.292726,0.011703,20.232255,0,0.067945,...,1,C*07:01,1.38,0.45,Bashirova,HLA:HLA00433,MRVMAPRALLLLLSGGLALTETWACSHSMRYFDTAVSRPGRGEPRF...,366,0,0.000737
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70654,WLTPFLQAVY,560,sample1,10617.317775,HLA-A*26:01,0.270236,0.023554,10.231984,0,0.143213,...,1,A*26:01,8.33,2.14,Bashirova,HLA:HLA00073,MAVMAPRTLVLLLSGALALTQTWAGSHSMRYFYTSVSRPGRGEPRF...,365,1,0.025123
70655,YPIELHGIGKY,2175,sample1,10936.552986,HLA-A*26:01,0.773798,0.122202,2.543641,1,0.140475,...,0,A*26:01,8.33,2.14,Bashirova,HLA:HLA00073,MAVMAPRTLVLLLSGALALTQTWAGSHSMRYFYTSVSRPGRGEPRF...,365,1,0.025123
70656,YVFGGTTGY,1122,sample1,24.690406,HLA-A*26:01,0.642856,0.970225,0.018723,1,0.703652,...,1,A*26:01,8.33,2.14,Bashirova,HLA:HLA00073,MAVMAPRTLVLLLSGALALTQTWAGSHSMRYFYTSVSRPGRGEPRF...,365,1,0.025123
70657,YVIGRRKSY,2400,sample1,117.123267,HLA-A*26:01,0.442252,0.779350,0.310707,1,0.559767,...,1,A*26:01,8.33,2.14,Bashirova,HLA:HLA00073,MAVMAPRTLVLLLSGALALTQTWAGSHSMRYFYTSVSRPGRGEPRF...,365,1,0.025123


In [3]:
# Keep only the peptide/allele identifiers plus the *_proba and *_y columns
# listed below; every other column of the loaded table is dropped.
relevant_columns = [
    'peptide', 'allele',
    'PG_proba', 'TAP_proba', 'BA_proba', 'TD_proba',
    'PG_y', 'TAP_y', 'BA_y', 'TD_y',
    'PS_proba', 'PS_y',
]
PS_training_df = PS_training_df[relevant_columns]


##### NOTE: here PS_y is the hit label (the real y)

In [4]:
# Build the feature matrix X and the target vector y.
# Rows with a missing value in any feature or in PS_y are dropped first,
# so X and y stay row-aligned.
feature_columns = ['PG_proba', 'TAP_proba', 'BA_proba', 'TD_proba']

X_y_df = PS_training_df[feature_columns + ['PS_y']].dropna()

X = X_y_df[feature_columns].to_numpy()
y = X_y_df['PS_y'].to_numpy()

In [5]:
# Stratified k-fold cross-validation of the logistic-regression model on (X, y).
# For every fold a fresh classifier is fitted on the training split and scored
# on the held-out split; the per-fold metrics end up in `fold_df`.
nfold = 10
skf = StratifiedKFold(n_splits=nfold, )

# Not used in this cell; kept so that any other cell referencing them still runs.
auroc_ls = []
auprc_ls = []
f1_ls = []
precision_ls = []
recall_ls = []

# One dict of metrics per fold; converted to a DataFrame once after the loop.
fold_records = []

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    clf = LogisticRegression(random_state=0, fit_intercept=True)
    clf.fit(X[train_index], y[train_index])

    y_test = y[test_index]
    y_pred = clf.predict(X[test_index])
    y_proba_pred = clf.predict_proba(X[test_index])
    # Column 1 of predict_proba is the second class in clf.classes_
    # (label 1, assuming y holds 0/1 labels).
    pos_proba = y_proba_pred[:, 1]

    # Precision-recall curve; kept under separate names so the curve arrays
    # are not confused with the scalar precision/recall scores below.
    pr_precision, pr_recall, _ = precision_recall_curve(y_test, pos_proba)

    fold_records.append({
        'auroc': roc_auc_score(y_test, pos_proba),
        # area under the PR curve via trapezoidal integration
        'auprc': auc(pr_recall, pr_precision),
        # the remaining metrics use the hard predictions from clf.predict
        'f1': f1_score(y_test, y_pred),
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'fold': i,
    })

# Columns follow the key order above: auroc, auprc, f1, accuracy, precision, recall, fold.
fold_df = pd.DataFrame(fold_records)
fold_df

Unnamed: 0,auroc,auprc,f1,accuracy,precision,recall,fold
0,0.958509,0.935905,0.871846,0.927399,0.912657,0.834529,0
1,0.981814,0.966886,0.925337,0.956836,0.947844,0.903874,1
2,0.97804,0.956614,0.903801,0.941976,0.887149,0.92109,2
3,0.979378,0.971456,0.916627,0.950184,0.908024,0.925395,3
4,0.977332,0.962835,0.910387,0.946646,0.905009,0.91583,4
5,0.976123,0.949418,0.906603,0.94494,0.90988,0.903349,5
6,0.977384,0.954324,0.912962,0.947346,0.893724,0.933046,6
7,0.97944,0.959413,0.908792,0.94494,0.891444,0.926829,7
8,0.984809,0.96701,0.916055,0.948195,0.880123,0.955045,8
9,0.980405,0.954574,0.884599,0.925548,0.817187,0.964132,9


In [7]:
# final model: fit on all of X / y (no rows held out)
clf = LogisticRegression(random_state=0,
                         fit_intercept=True,)

clf.fit(X, y)

# save model
# The `with` block closes the file on exit, so no explicit f.close() is needed.
with open('./models/PS_SOTA_clf_proba.pkl', 'wb') as f:
    pickle.dump(clf, f)

In [8]:
# check if model is saved correctly: reload the pickle from disk and display it
# NOTE: pickle.load can execute arbitrary code contained in the file, so only
# load pickle files that were produced by a trusted source.
# The `with` block closes the file on exit, so no explicit f.close() is needed.
with open('./models/PS_SOTA_clf_proba.pkl', 'rb') as f:
    clf = pickle.load(f)

clf