In [52]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import auc, roc_auc_score, f1_score, precision_recall_curve, accuracy_score,recall_score, precision_score

In [48]:
# load BA data
BA_df = pd.read_csv('./../../data/BA/training_df.csv')
BA_df = BA_df[['allele','peptide','y']].rename(columns={'y':'BA_y'})
print("Number of BA dataset peptides: ", len(BA_df['peptide'].unique()))

# load PG data
PG_df = pd.read_csv('./../../data/PG/PG.csv')
PG_df = PG_df[['peptide','hit']].rename(columns={'hit':'PG_y'})
print("Number of PG dataset peptides: ", len(PG_df['peptide'].unique().tolist()))

# load TAP data
TAP_df = pd.read_csv('./../../data/TAP/classification_DS868.csv', sep='\t')
TAP_df = TAP_df[['peptide','label']].rename(columns={'label':'TAP_y'})
print("Number of TAP dataset peptides: ", len(TAP_df['peptide'].unique().tolist()))

# load PS data
PS_df = pd.read_csv('./../../data/PS/Data_S6.csv')
PS_df = PS_df[['peptide','hit','hla']].rename(columns={'hit':'PS_y','hla':'allele'})
print("Number of PS dataset peptides: ", len(PS_df['peptide'].unique().tolist()))

# load TD data
TD_df = pd.read_csv('./../../data/TD/TD_MFI_ratio.csv')
TD_df = TD_df[TD_df['Source']=='Bashirova']
TD_df = TD_df[['HLA_full','MFI_ratio']].rename(columns={'HLA_full':'allele'})
TD_df['TD_y'] = [1 if i>2 else 0 for i in TD_df['MFI_ratio']]
print("Number of HLA in TD: ", len(TD_df))


Number of BA dataset peptides:  35814
Number of PG dataset peptides:  297548
Number of TAP dataset peptides:  868
Number of PS dataset peptides:  71427
Number of HLA in TD:  97


In [49]:
# correct PS_df
ps_ls = []

for idx in tqdm(range(len(PS_df))):
    dict_ = PS_df.iloc[idx].to_dict()
    dict_['allele'] = dict_['allele'].split(' ')
    for k in dict_.keys():
        if k !='allele':
            dict_[k] = [dict_[k]]*len(dict_['allele'])

    ps_ls = ps_ls + [pd.DataFrame(dict_)]
    
PS_df = pd.concat(ps_ls).reset_index(drop=True)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 75378/75378 [00:50<00:00, 1493.49it/s]


In [45]:
#  Get common data
common_df = BA_df.merge(PS_df, how='left',on=['peptide','allele']).fillna(0.0).merge(PG_df,on='peptide')
common_df = common_df.merge(TAP_df, on='peptide').merge(TD_df, on='allele')
common_df.to_csv('./../../data/PS/combined_df_ground_truth.csv')

In [43]:
# print stats
print("Number of pairs: ", len(common_df))
print("Number of unique peptides: ", len(common_df['peptide'].unique()))
print("Number of hla: ", len(common_df['allele'].unique()))

print("\n")
print("% positive BA_y: ", common_df['BA_y'].sum()/len(common_df))
print("% positive PG_y: ", common_df['PG_y'].sum()/len(common_df))
print("% positive TAP_y: ", common_df['TAP_y'].sum()/len(common_df))
print("% positive TD_y: ", common_df['TD_y'].sum()/len(common_df))
print("% positive PS_y: ", common_df['PS_y'].sum()/len(common_df))

Number of pairs:  210
Number of unique peptides:  27
Number of hla:  46


% positive BA_y:  0.10952380952380952
% positive PG_y:  1.0
% positive TAP_y:  0.6904761904761905
% positive TD_y:  0.861904761904762
% positive PS_y:  0.3380952380952381


# baselines

In [58]:
# Baseline
baseline_df = {'f1':[], 'accuracy': [], 'precision':[], 'recall':[]}
col_ls = ['BA_y','PG_y','TAP_y','TD_y']

for col in col_ls:
    
    if 'BA' in col:
        common_df = BA_df.merge(PS_df, how='left',on=['peptide','allele']).fillna(0.0)
        print('positive  PS: ',common_df['PS_y'].sum()/len(common_df),
              ' positive  BA: ',common_df['BA_y'].sum()/len(common_df))
    elif 'PG' in col:
        common_df = PG_df.merge(PS_df, on='peptide')
        print('positive  PS: ',common_df['PS_y'].sum()/len(common_df),
              ' positive  PG: ',common_df['PG_y'].sum()/len(common_df))
    elif 'TAP' in col:
        common_df = TAP_df.merge(PS_df, on='peptide')
        print('positive  PS: ',common_df['PS_y'].sum()/len(common_df),
              ' positive  TAP: ',common_df['TAP_y'].sum()/len(common_df))
    elif 'TD' in col:
        common_df = PS_df.merge(TD_df, on='allele')
        print('positive  PS: ',common_df['PS_y'].sum()/len(common_df),
              ' positive  TD: ',common_df['TD_y'].sum()/len(common_df))
    
    # calculate f1 score
    f1 = f1_score(common_df['PS_y'],common_df[col])
    baseline_df['f1'] = baseline_df['f1'] + [f1]

    # calculate accuracy score
    accuracy = accuracy_score(common_df['PS_y'],common_df[col])
    baseline_df['accuracy'] = baseline_df['accuracy'] + [accuracy]

    # calculate precision score
    precision = precision_score(common_df['PS_y'],common_df[col])
    baseline_df['precision'] = baseline_df['precision'] + [precision]
    
    # calculate recall score
    recall = recall_score(common_df['PS_y'],common_df[col])
    baseline_df['recall'] = baseline_df['recall'] + [recall]
    
pd.DataFrame(baseline_df, index=col_ls)

positive  PS:  0.0015400822446698715  positive  BA:  0.24040790789452576
positive  PS:  0.9273912031649988  positive  PG:  0.8475954496297006
positive  PS:  1.0  positive  TAP:  0.6844660194174758
positive  PS:  0.33161143376543767  positive  TD:  0.796579805420945


Unnamed: 0,f1,accuracy,precision,recall
BA_y,0.004244,0.759079,0.002135,0.333333
PG_y,0.930673,0.876946,0.974482,0.890634
TAP_y,0.81268,0.684466,1.0,0.684466
TD_y,0.467884,0.399672,0.331331,0.795906
