In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import auc, roc_curve, roc_auc_score, f1_score, precision_recall_curve, accuracy_score, recall_score,  precision_score


In [2]:
# Load the PS training table; the first CSV column is used as the index.
relevant_columns = [
    'peptide', 'allele',
    'PG_proba', 'TAP_proba', 'BA_proba',
    'PG_y', 'TAP_y', 'BA_y', 'TD_y',
    'PS_proba', 'PS_y',
]

# Restrict the frame to the columns listed above, in that order.
PS_training_df = (
    pd.read_csv('./results/PS_training.csv', index_col=0)
      .loc[:, relevant_columns]
)

# NOTE: here PS_y is the hit label (the real y).

In [3]:
# Create X and y
# Feature matrix: one column per entry of feature_columns, in this order.
feature_columns = ['PG_y', 'TAP_y', 'BA_y', 'TD_y']
X = PS_training_df[feature_columns].to_numpy()

# Label vector. Single brackets select a Series, so y has shape (n_samples,)
# rather than the (n_samples, 1) column vector produced by [['PS_y']], which
# makes scikit-learn emit a DataConversionWarning on every fit.
y = PS_training_df['PS_y'].to_numpy()

In [4]:
# Stratified k-fold cross-validation of the logistic-regression model.
# Produces fold_df: one row per fold with auroc, auprc, f1, accuracy,
# precision, recall and the fold number.
nfold = 10
skf = StratifiedKFold(n_splits=nfold)

# scikit-learn expects labels of shape (n_samples,). Flattening once here
# avoids the DataConversionWarning that a (n, 1) column vector triggers on
# every fit; it is a no-op when y is already 1-D.
y_flat = np.ravel(y)

fold_records = []
for i, (train_index, test_index) in enumerate(skf.split(X, y_flat)):
    X_test, y_test = X[test_index], y_flat[test_index]

    clf = LogisticRegression(random_state=0, fit_intercept=True)
    clf.fit(X[train_index], y_flat[train_index])

    # Column 1 of predict_proba is the probability of the second class in
    # clf.classes_ (the positive class when labels are 0/1).
    y_proba_pos = clf.predict_proba(X_test)[:, 1]

    # ROC curve, then the threshold that maximises the geometric mean of
    # sensitivity (tpr) and specificity (1 - fpr).
    fpr, tpr, roc_thresholds = roc_curve(y_test, y_proba_pos)
    gmeans = np.sqrt(tpr * (1 - fpr))
    best_threshold = roc_thresholds[np.argmax(gmeans)]

    # NOTE(review): the threshold is selected on the held-out fold itself, so
    # the threshold-dependent metrics (f1, accuracy, precision, recall) are
    # optimistic. Consider selecting it on the training fold instead.
    y_pred = np.where(y_proba_pos < best_threshold, 0, 1)

    # Precision-recall curve for the area-under-curve computation; kept under
    # separate names so it is not confused with the scalar scores below.
    pr_precision, pr_recall, _ = precision_recall_curve(y_test, y_proba_pos)

    fold_records.append({
        'auroc': roc_auc_score(y_test, y_proba_pos),
        'auprc': auc(pr_recall, pr_precision),
        'f1': f1_score(y_test, y_pred),
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'fold': i,
    })

# Build the frame once from the per-fold records.
fold_df = pd.DataFrame(fold_records)
fold_df

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Unnamed: 0,auroc,auprc,f1,accuracy,precision,recall,fold
0,0.932718,0.910687,0.848058,0.905321,0.807526,0.892874,0
1,0.986558,0.969684,0.912088,0.943391,0.843839,0.992348,1
2,0.962946,0.902129,0.903418,0.940419,0.868166,0.941655,2
3,0.959118,0.921856,0.888839,0.929946,0.837849,0.946437,3
4,0.963149,0.939278,0.878496,0.923153,0.825484,0.938785,4
5,0.956086,0.922309,0.879982,0.924144,0.827368,0.939742,5
6,0.957959,0.930946,0.881553,0.924851,0.826087,0.945002,6
7,0.952243,0.910587,0.888392,0.92938,0.834454,0.949785,7
8,0.969266,0.932099,0.888889,0.928389,0.821762,0.967958,8
9,0.966136,0.919833,0.865659,0.91196,0.789287,0.958393,9


In [5]:
# final model: refit on the full training set (X, y)
clf = LogisticRegression(random_state=0,
                         fit_intercept=True)

# np.ravel gives fit() a 1-D label vector, avoiding the DataConversionWarning
# raised for a (n, 1) column vector; it is a no-op when y is already 1-D.
clf.fit(X, np.ravel(y))

# save model; the with-block closes the file on exit, so no explicit
# f.close() is needed afterwards
with open('./models/PS_SOTA_clf_binary.pkl', 'wb') as f:
    pickle.dump(clf, f)

  y = column_or_1d(y, warn=True)


In [6]:
# check if model is saved correctly by loading it back from disk
# NOTE: pickle.load can execute arbitrary code; only load pickle files that
# come from a trusted source (here, the file written by this notebook).
# The with-block closes the file on exit, so no explicit f.close() is needed.
with open('./models/PS_SOTA_clf_binary.pkl', 'rb') as f:
    clf = pickle.load(f)

clf

In [7]:
# Display the learned weights and intercept of the reloaded model.
# The coef_ columns follow the feature order used to build X:
# PG_y, TAP_y, BA_y, TD_y.
clf.coef_, clf.intercept_

(array([[2.35750946, 0.11810054, 4.42171564, 0.5648065 ]]),
 array([-4.30499985]))