In [1]:
import pandas as pd
import sqlite3
import matplotlib.pyplot as plt

In [2]:
# Absolute path of the SQLite database file that is opened below to read the features table.
EXTRACTED_FEATURES_DB_NAME = (
    '/data/experiments/YHE/extracted-features/extracted-features.sqlite'
)

In [3]:
# Load the entire `features` table into a DataFrame.
# The connection is closed in a `finally` clause so it is released even when
# the query raises (e.g. the table is missing); previously a failure here
# left the connection open for the lifetime of the kernel.
db_conn = sqlite3.connect(EXTRACTED_FEATURES_DB_NAME)
try:
    sequences_df = pd.read_sql_query("select * from features", db_conn)
finally:
    db_conn.close()

In [4]:
# Show five randomly sampled rows, restricted to the identifying columns
# plus the classification label.
preview_columns = ['sequence','charge','run_name','number_of_runs_extracted','human','group','classed_as']
sequences_df[preview_columns].sample(n=5)

Unnamed: 0,sequence,charge,run_name,number_of_runs_extracted,human,group,classed_as
167547,CMTNTPVVVR,2,YHE114_07_Slot1-1_1_3014,31,1,YHE114,target
49636,YRAPGEQDGDAMPLGSAVDILATDDPNFSQEDQQDTQIYEK,3,YHE211_01_Slot1-1_1_2997,18,1,YHE211,target
414235,AECEILMMVGLPAAGK,2,YHE114_11_Slot1-1_1_3043,31,1,YHE114,decoy
632757,HPSVNQEVR,2,YHE211_05_Slot1-1_1_3001,31,1,YHE211,target
543718,NMDPLNDNVATLLHQSSDR,2,YHE211_08_Slot1-1_1_3004,31,1,YHE211,target


In [5]:
# The group is the part of the run name that precedes the first underscore.
sequences_df['group'] = sequences_df.run_name.map(lambda run_name: run_name.split('_', 1)[0])

In [6]:
# Spot-check the freshly assigned group column on five random rows.
sequences_df.loc[:, ['sequence','charge','run_name','number_of_runs_extracted','human','group']].sample(n=5)

Unnamed: 0,sequence,charge,run_name,number_of_runs_extracted,human,group
128954,LSDSFSLYPQFMFHLR,3,YHE114_10_Slot1-1_1_3017,18,1,YHE114
506036,WASEPEHDHR,2,YHE114_08_Slot1-1_1_3015,6,1,YHE114
523747,SSDASTAQPPESQPLPASQTPASNQPK,2,YHE010_04_Slot1-1_1_2989,29,1,YHE010
14230,LVNSVAGCADDALAGLVACNPNLQLLQGHR,3,YHE114_02_Slot1-1_1_3009,22,1,YHE114
152851,LHFFMPGFAPLTSR,3,YHE211_09_Slot1-1_1_3005,13,1,YHE211


In [7]:
# Keep the rows belonging to the YHE211 or YHE114 groups...
in_yhe_group = sequences_df.group.isin(['YHE211', 'YHE114'])
yhe_only_df = sequences_df[in_yhe_group]
# ...and, from those, the rows classified as targets.
is_yhe_target = yhe_only_df.classed_as == 'target'
yhe_only_targets_df = yhe_only_df[is_yhe_target]

In [8]:
# Each distinct file_idx among the YHE211/YHE114 rows is counted as one run.
yhe_file_indexes = yhe_only_df.file_idx.unique()
number_of_yhe_runs_in_experiment = len(yhe_file_indexes)
print('there are {} runs in the experiment'.format(number_of_yhe_runs_in_experiment))

there are 21 runs in the experiment


In [9]:
# Count how many distinct runs each sequence/charge was extracted from as a target.
#
# Runs are identified by file_idx (the same column used to count the runs in
# the experiment), so distinct file_idx values are counted per group rather
# than rows: a sequence/charge that appears more than once in a single run is
# still one run, which keeps the later "missing files" figure from going
# negative. The aggregation is vectorised instead of materialising one
# sub-frame per group in a Python loop.
yhe_runs_per_sequence_charge = (
    yhe_only_targets_df
    .groupby(['sequence','charge'])['file_idx']
    .nunique()
)

# Same columns, in the same order, as before; `sequence_charge` holds the
# (sequence, charge) tuple that keys each group.
yhe_sequence_occurences_df = pd.DataFrame(
    {
        'sequence_charge': list(yhe_runs_per_sequence_charge.index),
        'number_of_runs_targets_extracted': yhe_runs_per_sequence_charge.values,
        'sequence': yhe_runs_per_sequence_charge.index.get_level_values('sequence').values,
        'charge': yhe_runs_per_sequence_charge.index.get_level_values('charge').values,
    },
    columns=['sequence_charge','number_of_runs_targets_extracted','sequence','charge'],
)

In [10]:
# Report how many sequence/charge values are missing across the YHE211/YHE114 runs.
# NOTE(review): this first count is over sequences only and includes rows of
# every classification, whereas the proportion below is over target
# sequence/charge pairs - confirm that is the intended comparison.
yhe_unique_sequence_count = len(yhe_only_df['sequence'].unique())
print("there are {} unique extracted sequences.".format(yhe_unique_sequence_count))

# A sequence/charge is missing from every run it was not extracted from as a target.
yhe_sequence_occurences_df['missing_files'] = (
    number_of_yhe_runs_in_experiment
    - yhe_sequence_occurences_df.number_of_runs_targets_extracted
)

# Missing values as a fraction of all (sequence/charge, run) combinations.
yhe_missing_total = yhe_sequence_occurences_df.missing_files.sum()
yhe_possible_total = len(yhe_sequence_occurences_df) * number_of_yhe_runs_in_experiment
proportion_of_missing_values = yhe_missing_total / yhe_possible_total
print("{}% of values are missing across the experiment.".format(round(proportion_of_missing_values*100)))

there are 27265 unique extracted sequences.
28.0% of values are missing across the experiment.


In [11]:
# Show ten random sequence/charge rows with their run counts and missing-file counts.
yhe_sampled_rows = yhe_sequence_occurences_df.sample(n=10)
yhe_sampled_rows[['sequence','charge','number_of_runs_targets_extracted','missing_files']]

Unnamed: 0,sequence,charge,number_of_runs_targets_extracted,missing_files
15119,LVNMLDAVQENQHK,2,21,0
5017,EETYYQESAGVADLITTCAGGR,2,15,6
20498,SLFFGSILAPVR,2,21,0
7314,FNEEHIPDSPFVVPVASPSGDAR,3,4,17
26555,YTPEEIAMATVTALR,2,13,8
7538,FSEAEHWLDYFPPLAIQDLK,2,16,5
17919,QHMENEMAHYACDCWDAESK,3,20,1
18820,Q[-17.0265]HPQPYIFPDSPGGTSYER,2,21,0
15354,MAGDPVANVR,2,15,6
21562,TAEHLVEQLNVPK,2,17,4


In [12]:
# Inspect every YHE211/YHE114 row for one example sequence at charge 2,
# including how each occurrence was classified.
is_example_sequence = (yhe_only_df.sequence == 'EHHIDPTAITR') & (yhe_only_df.charge == 2)
df = yhe_only_df.loc[is_example_sequence, ['sequence','charge','file_idx','classed_as','prob_target']]
df

Unnamed: 0,sequence,charge,file_idx,classed_as,prob_target
4989,EHHIDPTAITR,2,16,target,0.769184
30081,EHHIDPTAITR,2,28,target,0.949875
75195,EHHIDPTAITR,2,23,decoy,0.004988
145215,EHHIDPTAITR,2,9,target,0.987639
170275,EHHIDPTAITR,2,25,decoy,0.230228
195448,EHHIDPTAITR,2,24,target,0.996354
241602,EHHIDPTAITR,2,27,target,0.961668
285211,EHHIDPTAITR,2,20,decoy,0.192898
309215,EHHIDPTAITR,2,29,decoy,0.091118
394380,EHHIDPTAITR,2,14,target,0.550377


In [13]:
# Number of rows found for the example sequence.
df.shape[0]

20

#### Now analyse the human-only runs

In [14]:
# Keep the rows belonging to the YHE010 group...
in_human_only_group = sequences_df.group == 'YHE010'
human_only_df = sequences_df[in_human_only_group]
# ...and, from those, the rows classified as targets.
is_human_only_target = human_only_df.classed_as == 'target'
human_only_targets_df = human_only_df[is_human_only_target]

In [15]:
# Count how many distinct runs each sequence/charge was extracted from as a
# target, and carry along the `human` flag of the first row in each group.
#
# Runs are identified by file_idx (the same column used to count the runs in
# the experiment), so distinct file_idx values are counted per group rather
# than rows: a sequence/charge that appears more than once in a single run is
# still one run, which keeps the later "missing files" figure from going
# negative. The aggregation is vectorised instead of materialising one
# sub-frame per group in a Python loop.
human_targets_by_sequence_charge = human_only_targets_df.groupby(['sequence','charge'])
human_runs_per_sequence_charge = human_targets_by_sequence_charge['file_idx'].nunique()
# iloc[0] (rather than the null-skipping groupby `first`) keeps the value of
# the very first row of each group, as before.
human_flag_per_sequence_charge = human_targets_by_sequence_charge['human'].agg(lambda flags: flags.iloc[0])

# Same columns, in the same order, as before; `sequence_charge` holds the
# (sequence, charge) tuple that keys each group. Both aggregates come from the
# same groupby, so their rows are in the same group order.
human_sequence_occurences_df = pd.DataFrame(
    {
        'sequence_charge': list(human_runs_per_sequence_charge.index),
        'number_of_runs_targets_extracted': human_runs_per_sequence_charge.values,
        'sequence': human_runs_per_sequence_charge.index.get_level_values('sequence').values,
        'charge': human_runs_per_sequence_charge.index.get_level_values('charge').values,
        'human': human_flag_per_sequence_charge.values,
    },
    columns=['sequence_charge','number_of_runs_targets_extracted','sequence','charge','human'],
)


In [16]:
# proportion of false positives i.e. non-human peptides found in the human-only runs
len(human_sequence_occurences_df[(human_sequence_occurences_df.human == 0)]) / len(human_sequence_occurences_df)

0.08954940411186553

In [17]:
# Each distinct file_idx among the YHE010 rows is counted as one run.
human_file_indexes = human_only_df.file_idx.unique()
number_of_human_runs_in_experiment = len(human_file_indexes)
print('there are {} human-only runs in the experiment'.format(number_of_human_runs_in_experiment))

there are 10 human-only runs in the experiment


In [18]:
# Report how many sequence/charge values are missing across the human-only (YHE010) runs.
# NOTE(review): this first count is over sequences only and includes rows of
# every classification, whereas the proportion below is over target
# sequence/charge pairs - confirm that is the intended comparison.
human_unique_sequence_count = len(human_only_df['sequence'].unique())
print("there are {} unique extracted sequences.".format(human_unique_sequence_count))

# A sequence/charge is missing from every run it was not extracted from as a target.
human_sequence_occurences_df['missing_files'] = (
    number_of_human_runs_in_experiment
    - human_sequence_occurences_df.number_of_runs_targets_extracted
)

# Missing values as a fraction of all (sequence/charge, run) combinations.
human_missing_total = human_sequence_occurences_df.missing_files.sum()
human_possible_total = len(human_sequence_occurences_df) * number_of_human_runs_in_experiment
proportion_of_missing_values = human_missing_total / human_possible_total
print("{}% of values are missing across the experiment.".format(round(proportion_of_missing_values*100)))

there are 23414 unique extracted sequences.
30.0% of values are missing across the experiment.


In [19]:
# Show ten random sequence/charge rows with their run counts and missing-file counts.
human_sampled_rows = human_sequence_occurences_df.sample(n=10)
human_sampled_rows[['sequence','charge','number_of_runs_targets_extracted','missing_files']]

Unnamed: 0,sequence,charge,number_of_runs_targets_extracted,missing_files
7398,GVTIIGPATVGGIKPGCFK,2,10,0
17977,TQTSDPAMLPTMIGLLAEAGVR,3,1,9
2205,DDQEWESPSPPKPTVFISGVIAR,3,10,0
16449,SPTLYGISHDDLK,2,9,1
151,ACFKEEIITHEEPLFDER,2,10,0
13519,NQGDEEGTEIDTLQFR,2,10,0
1142,APVAIGVAAMSTAEMLTSGLK,2,5,5
2099,DANNGNLQLR,2,10,0
7517,HAVSDPSILDSLDLNEDEREVLINNINR,3,8,2
14554,QWQQHIQSEK,2,5,5


In [23]:
# Inspect every human-only row for one example sequence at charge 2,
# including how each occurrence was classified.
is_example_sequence = (human_only_df.sequence == 'APVAIGVAAMSTAEMLTSGLK') & (human_only_df.charge == 2)
df = human_only_df.loc[is_example_sequence, ['sequence','charge','file_idx','classed_as','prob_target']]
df

Unnamed: 0,sequence,charge,file_idx,classed_as,prob_target
50967,APVAIGVAAMSTAEMLTSGLK,2,4,target,0.998877
216599,APVAIGVAAMSTAEMLTSGLK,2,1,decoy,0.215683
328981,APVAIGVAAMSTAEMLTSGLK,2,12,target,0.646673
350284,APVAIGVAAMSTAEMLTSGLK,2,3,target,0.563356
370485,APVAIGVAAMSTAEMLTSGLK,2,2,decoy,0.033295
464829,APVAIGVAAMSTAEMLTSGLK,2,5,decoy,0.18832
508429,APVAIGVAAMSTAEMLTSGLK,2,19,target,0.999673
578727,APVAIGVAAMSTAEMLTSGLK,2,7,target,0.999144


In [24]:
# Number of rows found for the example sequence.
df.shape[0]

8