# Peptide generation + binding prediction using MHCflurry

This is essentially AP predictor from MHCflurry

1. The training data for PS is S6 dataset
2. The evaluation is done on Multiallelic-recent S1 data

In [1]:
import pandas as pd
import numpy as np
from mhcflurry import Class1PresentationPredictor
from tqdm import tqdm

In [28]:
# read training data
training_df = pd.read_csv('./../../data/PS/PS_df.csv', index_col=0)
training_df

Unnamed: 0,peptide,PS_y,allele
0,ITTQATKAGF,0,HLA-A*02:01
1,ITTQATKAGF,0,HLA-A*11:01
2,ITTQATKAGF,0,HLA-B*40:01
3,ITTQATKAGF,0,HLA-B*44:03
4,KGHLDAEL,0,HLA-A*02:01
...,...,...,...
387236,ELKERKSSL,1,HLA-A*29:02
387237,ELKERKSSL,1,HLA-B*08:01
387238,ELKERKSSL,1,HLA-B*44:03
387239,ELKERKSSL,1,HLA-C*07:01


In [29]:
# allele list
allele_ls = training_df['allele'].unique()
allele_ls

array(['HLA-A*02:01', 'HLA-A*11:01', 'HLA-B*40:01', 'HLA-B*44:03',
       'HLA-B*39:06', 'HLA-C*07:02', 'HLA-A*01:01', 'HLA-B*07:02',
       'HLA-C*16:01', 'HLA-A*26:01', 'HLA-B*44:02', 'HLA-C*03:04',
       'HLA-C*05:01', 'HLA-A*24:02', 'HLA-B*15:11', 'HLA-B*35:01',
       'HLA-C*03:03', 'HLA-A*03:01', 'HLA-C*04:01', 'HLA-B*18:01',
       'HLA-B*38:01', 'HLA-C*06:02', 'HLA-C*12:03', 'HLA-A*29:02',
       'HLA-B*57:01', 'HLA-C*07:01', 'HLA-B*15:01', 'HLA-B*13:02',
       'HLA-A*68:01', 'HLA-B*27:05', 'HLA-B*35:03', 'HLA-C*02:02',
       'HLA-A*23:01', 'HLA-B*14:01', 'HLA-C*08:02', 'HLA-B*14:02',
       'HLA-A*02:05', 'HLA-B*50:01', 'HLA-B*08:01', 'HLA-A*02:20',
       'HLA-B*39:01', 'HLA-A*02:06', 'HLA-B*55:01', 'HLA-C*01:02',
       'HLA-B*45:01', 'HLA-C*14:02', 'HLA-B*15:18', 'HLA-C*07:04',
       'HLA-A*30:01', 'HLA-A*68:02', 'HLA-B*15:03', 'HLA-B*15:10',
       'HLA-C*02:10', 'HLA-B*39:24', 'HLA-B*73:01', 'HLA-C*15:05',
       'HLA-A*32:01', 'HLA-B*56:01', 'HLA-A*31:01', 'HLA-B*35:

In [25]:
### load
predictor = Class1PresentationPredictor.load()
predictor.weights_dataframe

Unnamed: 0,intercept,affinity_score,processing_score
without_flanks,-6.185913,10.509827,3.538129
with_flanks,-6.361002,10.535336,3.802439


In [122]:

### Predict and make df
predictor_result_df = [None]*len(allele_ls)

for idx, allele in tqdm(enumerate(allele_ls)):
    allele_df = training_df[training_df['allele']==allele]
    
    predictor_result_df[idx] = predictor.predict(
                                    peptides=allele_df['peptide'].to_list(),
                                    alleles=[allele],
                                    verbose=0, throw=False)
    
predictor_result_df = pd.concat(predictor_result_df)
predictor_result_df = predictor_result_df.rename(columns={'best_allele':'allele'}) 
predictor_result_df = predictor_result_df.merge(training_df, on=['peptide','allele'])
predictor_result_df

0it [00:00, ?it/s]



1it [00:10, 10.45s/it]



2it [00:12,  5.41s/it]



3it [00:14,  3.72s/it]



4it [00:17,  3.74s/it]



5it [00:20,  3.22s/it]



6it [00:28,  4.84s/it]



7it [00:35,  5.65s/it]



8it [00:42,  6.23s/it]



9it [00:44,  4.94s/it]



10it [00:46,  3.87s/it]



11it [00:50,  3.94s/it]



12it [00:52,  3.31s/it]



13it [00:56,  3.64s/it]



14it [01:02,  4.34s/it]



15it [01:03,  3.29s/it]



16it [01:05,  2.96s/it]



17it [01:08,  2.91s/it]



18it [01:16,  4.40s/it]



19it [01:21,  4.49s/it]



20it [01:23,  3.76s/it]



21it [01:25,  3.40s/it]



22it [01:29,  3.52s/it]



23it [01:33,  3.68s/it]



24it [01:35,  3.08s/it]



25it [01:37,  2.72s/it]



26it [01:42,  3.47s/it]



27it [01:45,  3.22s/it]



28it [01:46,  2.58s/it]



29it [01:49,  2.76s/it]



30it [01:52,  2.84s/it]



31it [01:55,  2.96s/it]



32it [01:58,  2.86s/it]



33it [02:00,  2.68s/it]



34it [02:01,  2.05s/it]



35it [02:01,  1.66s/it]



36it [02:02,  1.37s/it]



37it [02:03,  1.22s/it]



38it [02:04,  1.21s/it]



39it [02:08,  1.92s/it]



40it [02:09,  1.77s/it]



41it [02:10,  1.64s/it]



42it [02:11,  1.33s/it]



43it [02:12,  1.11s/it]



44it [02:12,  1.04it/s]



45it [02:13,  1.11it/s]



46it [02:14,  1.03it/s]



47it [02:15,  1.07it/s]



48it [02:16,  1.11it/s]



49it [02:17,  1.16it/s]



50it [02:17,  1.20it/s]



51it [02:18,  1.22it/s]



52it [02:19,  1.24it/s]



53it [02:20,  1.25it/s]



54it [02:20,  1.37it/s]



55it [02:21,  1.39it/s]



56it [02:22,  1.41it/s]



57it [02:22,  1.43it/s]



58it [02:23,  1.50it/s]



59it [02:24,  1.38it/s]



60it [02:24,  1.43it/s]



61it [02:25,  1.47it/s]



62it [02:26,  1.58it/s]



63it [02:26,  2.33s/it]


Unnamed: 0,peptide,peptide_num,sample_name,affinity,allele,processing_score,presentation_score,presentation_percentile,PS_y
0,ITTQATKAGF,0,sample1,24497.361883,HLA-A*02:01,0.029817,0.004553,62.744674,0
1,KGHLDAEL,1,sample1,28783.099531,HLA-A*02:01,0.082184,0.004685,62.744674,0
2,LPPETQPLHEV,2,sample1,17979.853804,HLA-A*02:01,0.105367,0.008005,31.643179,0
3,SLLEKSLGL,3,sample1,18.502050,HLA-A*02:01,0.773742,0.985617,0.003125,1
4,SLLEKSLGL,3,sample1,18.502050,HLA-A*02:01,0.773742,0.985617,0.003125,1
...,...,...,...,...,...,...,...,...,...
434708,DFSRAVAEEYL,208,sample1,29697.795869,HLA-B*41:01,0.176059,0.006325,37.359049,0
434709,QGHQAPEG,209,sample1,27583.963237,HLA-B*41:01,0.000487,0.003661,99.286603,0
434710,SVPGSCYLKLL,210,sample1,29006.515279,HLA-B*41:01,0.102479,0.004994,62.744674,0
434711,ENVLAIISL,211,sample1,3104.301461,HLA-B*41:01,0.101127,0.041947,6.047908,1


In [141]:
# # save final df
# predictor_result_df = predictor_result_df.sort_values(['peptide','affinity'])
# predictor_result_df = predictor_result_df.drop_duplicates(['peptide','PS_y'])
# predictor_result_df.to_csv('./results/PS_training_MHCflurry_result.csv')

In [144]:
# check peptide count
predictor_result_df = pd.read_csv('./results/PS_training_MHCflurry_result.csv',index_col=0)
len(predictor_result_df['peptide'].unique()) == len(training_df['peptide'].unique()) 

True