In [1]:
import pandas as pd
import sqlite3
import matplotlib.pyplot as plt

In [2]:
# Location of the SQLite database that the loading cell opens with sqlite3.connect().
# NOTE(review): this is an absolute, machine-specific path - presumably it has to be
# edited before the notebook can run on another host; confirm.
EXTRACTED_FEATURES_DB_NAME = '/data/experiments/YHE/extracted-features/extracted-features.sqlite'

In [3]:
# Read the complete `features` table into a DataFrame.
db_conn = sqlite3.connect(EXTRACTED_FEATURES_DB_NAME)
try:
    sequences_df = pd.read_sql_query("select * from features", db_conn)
finally:
    # Always release the connection, even when the query fails (e.g. the table
    # is missing), so the kernel does not keep the database file open.
    db_conn.close()

In [4]:
# Display the column labels of the frame read from the `features` table.
sequences_df.columns

Index(['sequence', 'charge', 'file_idx', 'peak_idx', 'target_coords',
       'theoretical_mz', 'q_value', 'delta_mz_ppm', 'delta_rt', 'delta_scan',
       'fwhm_rt_0', 'fwhm_scan_0', 'geometric_mean_0_1',
       'geometric_mean_0_1_2', 'isotope_0_1_mz_delta_ppm',
       'isotope_0_1_rt_delta', 'isotope_0_1_scan_delta',
       'isotope_0_2_mz_delta_ppm', 'isotope_0_2_rt_delta',
       'isotope_0_2_scan_delta', 'monoisotope_auc_over_isotope_peak_auc_sum',
       'monoisotope_int_over_isotope_peak_int_sum', 'mz_delta_ppm_std_dev_0',
       'mz_delta_ppm_std_dev_1', 'number_of_frames_0', 'number_of_frames_1',
       'number_of_frames_2', 'number_of_missing_frames_0',
       'number_of_missing_frames_1', 'number_of_missing_frames_2',
       'peak_base_width_rt_0', 'peak_base_width_scan_0', 'r_squared_phr',
       'rt_isotope_correlation', 'rt_isotope_cv', 'rt_peak_symmetry_0',
       'rt_peak_symmetry_1', 'rt_peak_symmetry_2', 'scan_isotope_correlation',
       'scan_isotope_cv', 'scan_peak

In [7]:
# Show five randomly drawn rows for a subset of columns. No random_state is passed,
# so a different set of rows is displayed on every execution.
# NOTE(review): 'group' is assigned from run_name in the following cell - confirm the
# column already exists in the features table, otherwise this cell raises KeyError
# when the notebook is run top to bottom on a fresh kernel.
sequences_df[['sequence','charge','run_name','number_of_runs_extracted','human','group','classed_as']].sample(n=5)

Unnamed: 0,sequence,charge,run_name,number_of_runs_extracted,human,group,classed_as
289495,HLSYQR,2,YHE211_02_Slot1-1_1_2998,20,0,YHE211,decoy
213406,VMTIAPGLFGTPLLTSLPEK,2,YHE211_03_Slot1-1_1_2999,31,1,YHE211,target
649403,APLATGEDDDDEVPDLVENFDEASK,2,YHE114_03_Slot1-1_1_3010,31,1,YHE114,target
134177,SGYHQSASEHGLVVIAPDTSPR,2,YHE114_10_Slot1-1_1_3017,29,1,YHE114,decoy
39797,MQAQMQMQMQGGDGDGGALGHHV,3,YHE211_01_Slot1-1_1_2997,28,1,YHE211,decoy


In [8]:
# Derive the group label from the run name: the text before the first underscore
# (e.g. 'YHE211_02_Slot1-1_1_2998' -> 'YHE211').
# The vectorised .str accessor replaces a per-row Python lambda and propagates a
# missing run_name as a missing group instead of raising AttributeError.
sequences_df['group'] = sequences_df['run_name'].str.split('_').str[0]

In [9]:
# Spot-check the derived 'group' column next to run_name on five randomly drawn rows.
# No random_state is passed, so the rows shown change on every execution.
sequences_df[['sequence','charge','run_name','number_of_runs_extracted','human','group']].sample(n=5)

Unnamed: 0,sequence,charge,run_name,number_of_runs_extracted,human,group
127958,LITSHLVDTDPEVDSIIK,2,YHE114_10_Slot1-1_1_3017,21,0,YHE114
474741,M[15.9949]DDSALQLNHTANEFAR,2,YHE010_07_Slot1-1_1_2992,30,1,YHE010
576182,VVEVCLADLQGSEDHSFR,2,YHE211_07_Slot1-1_1_3003,21,0,YHE211
442768,DTTALSFFHMLNGAALR,2,YHE211_10_Slot1-1_1_3006,31,1,YHE211
414144,ADGGTGDSSPSPGSSLPSPLSDPR,2,YHE114_11_Slot1-1_1_3043,31,1,YHE114


In [10]:
# Keep only the rows whose group is YHE211 or YHE114, then narrow that to the
# rows classed as targets.
in_yhe_groups = sequences_df['group'].isin(['YHE211', 'YHE114'])
yhe_only_df = sequences_df[in_yhe_groups]
classed_as_target = yhe_only_df['classed_as'] == 'target'
yhe_only_targets_df = yhe_only_df[classed_as_target]

In [11]:
# Each distinct file_idx among the YHE rows counts as one run.
yhe_file_idxs = yhe_only_df['file_idx'].unique()
number_of_yhe_runs_in_experiment = len(yhe_file_idxs)
print('there are {} runs in the experiment'.format(number_of_yhe_runs_in_experiment))

there are 21 runs in the experiment


In [12]:
# Count how many distinct runs each target sequence/charge was extracted from.
# Runs are identified by file_idx. Counting distinct file_idx values (rather than
# rows per group) keeps the figure a true run count even if a sequence/charge has
# more than one row in the same run, so it can never exceed the number of runs.
# The grouped aggregation also replaces a Python-level loop over every group.
yhe_sequence_occurences_df = (
    yhe_only_targets_df
    .groupby(['sequence', 'charge'])['file_idx']
    .nunique()
    .reset_index(name='number_of_runs_extracted')
)
# Keep the (sequence, charge) tuple column that the loop-based version produced.
yhe_sequence_occurences_df['sequence_charge'] = list(
    zip(yhe_sequence_occurences_df['sequence'], yhe_sequence_occurences_df['charge'])
)
# Restore the established column order.
yhe_sequence_occurences_df = yhe_sequence_occurences_df[
    ['sequence_charge', 'number_of_runs_extracted', 'sequence', 'charge']
]

In [26]:
# Pull every YHE row for one example precursor (sequence CQNIYLTTGYAGSK at
# charge 2) and show which file it came from and how it was classed.
is_example_precursor = (yhe_only_df['sequence'] == 'CQNIYLTTGYAGSK') & (yhe_only_df['charge'] == 2)
df = yhe_only_df.loc[is_example_precursor, ['sequence','charge','file_idx','classed_as']]
df

Unnamed: 0,sequence,charge,file_idx,classed_as
2365,CQNIYLTTGYAGSK,2,16,decoy
27477,CQNIYLTTGYAGSK,2,28,decoy
72559,CQNIYLTTGYAGSK,2,23,decoy
117473,CQNIYLTTGYAGSK,2,15,decoy
142576,CQNIYLTTGYAGSK,2,9,decoy
167617,CQNIYLTTGYAGSK,2,25,target
192832,CQNIYLTTGYAGSK,2,24,target
238919,CQNIYLTTGYAGSK,2,27,decoy
282673,CQNIYLTTGYAGSK,2,20,target
306781,CQNIYLTTGYAGSK,2,29,target


In [15]:
# Row count of `df`, the single-precursor frame assigned in the cell above.
# NOTE(review): the execution counts show the defining cell as In [26] and this one
# as In [15], so the recorded output may predate the current definition of `df`.
len(df)

19

In [16]:
# Report how sparse the sequence/charge-by-run table is for the YHE runs.
# NOTE(review): the unique-sequence count is taken from yhe_only_df, whereas
# yhe_sequence_occurences_df was built from yhe_only_targets_df - confirm that
# mixing the two frames here is intended.
yhe_unique_sequence_count = len(yhe_only_df['sequence'].unique())
print("there are {} unique extracted sequences.".format(yhe_unique_sequence_count))

# Runs in which each sequence/charge was NOT extracted.
yhe_sequence_occurences_df['missing_files'] = number_of_yhe_runs_in_experiment - yhe_sequence_occurences_df['number_of_runs_extracted']

# Missing cells divided by all cells (sequence/charge rows x runs).
yhe_missing_cell_count = yhe_sequence_occurences_df['missing_files'].sum()
yhe_total_cell_count = len(yhe_sequence_occurences_df) * number_of_yhe_runs_in_experiment
proportion_of_missing_values = yhe_missing_cell_count / yhe_total_cell_count
print("{}% of values are missing across the experiment.".format(round(proportion_of_missing_values*100)))

there are 27265 unique extracted sequences.
38.0% of values are missing across the experiment.


In [25]:
# Spot-check ten randomly drawn rows of the per-sequence/charge counts.
# No random_state is passed, so the rows shown change on every execution.
yhe_sequence_occurences_df.sample(n=10)

Unnamed: 0,sequence_charge,number_of_runs_extracted,sequence,charge,missing_files
2122,"(CQNIYLTTGYAGSK, 2)",8,CQNIYLTTGYAGSK,2,13
11286,"(LEPSTSTDQPVTPEPTSQATR, 2)",3,LEPSTSTDQPVTPEPTSQATR,2,18
11311,"(LESEMEDAYHEHQANLLR, 3)",21,LESEMEDAYHEHQANLLR,3,0
21968,"(VVQLHEYSEELEK, 2)",13,VVQLHEYSEELEK,2,8
11620,"(LGPLFSLLNENHR, 2)",21,LGPLFSLLNENHR,2,0
10266,"(ITLQDVVSHSK, 2)",5,ITLQDVVSHSK,2,16
20198,"(TWYPVSIPAFYNPVTSLLKPVGEK, 3)",21,TWYPVSIPAFYNPVTSLLKPVGEK,3,0
9695,"(ILLANFLAQTEALMR, 2)",21,ILLANFLAQTEALMR,2,0
12644,"(LQEGYDHSYYFIATFITDHIR, 3)",18,LQEGYDHSYYFIATFITDHIR,3,3
2242,"(DAAASASTPAQAPTSDSPVAEDASR, 2)",11,DAAASASTPAQAPTSDSPVAEDASR,2,10


In [17]:
# Largest number_of_runs_extracted over all sequence/charge pairs.
# Presumably a sanity check that it does not exceed number_of_yhe_runs_in_experiment.
yhe_sequence_occurences_df.number_of_runs_extracted.max()

21

In [18]:
# Keep only the rows whose group is YHE010.
in_human_group = sequences_df['group'] == 'YHE010'
human_only_df = sequences_df[in_human_group]

In [19]:
# Count how many distinct runs each sequence/charge was extracted from in the
# YHE010 rows, and carry the group's `human` flag along.
# Runs are identified by file_idx. Counting distinct file_idx values (rather than
# rows per group) keeps the figure a true run count even if a sequence/charge has
# more than one row in the same run. The grouped aggregation also avoids building
# a row Series per group just to read one field.
# 'first' takes the first non-null `human` value of each group.
human_sequence_occurences_df = (
    human_only_df
    .groupby(['sequence', 'charge'])
    .agg({'file_idx': 'nunique', 'human': 'first'})
    .rename(columns={'file_idx': 'number_of_runs_extracted'})
    .reset_index()
)
# Keep the (sequence, charge) tuple column that the loop-based version produced.
human_sequence_occurences_df['sequence_charge'] = list(
    zip(human_sequence_occurences_df['sequence'], human_sequence_occurences_df['charge'])
)
# Restore the established column order.
human_sequence_occurences_df = human_sequence_occurences_df[
    ['sequence_charge', 'number_of_runs_extracted', 'sequence', 'charge', 'human']
]


In [20]:
# Spot-check ten randomly drawn rows of the per-sequence/charge counts for the YHE010 rows.
# No random_state is passed, so the rows shown change on every execution.
human_sequence_occurences_df.sample(n=10)

Unnamed: 0,sequence_charge,number_of_runs_extracted,sequence,charge,human
17752,"(Q[-17.0265]GIQGQGSFQAQWQESHK, 2)",9,Q[-17.0265]GIQGQGSFQAQWQESHK,2,0
20423,"(TAAYAIPMLQLLLHR, 2)",9,TAAYAIPMLQLLLHR,2,1
9332,"(HIDFSLR, 2)",5,HIDFSLR,2,1
24563,"(YFNSYTLTGR, 2)",10,YFNSYTLTGR,2,1
10379,"(IGNFSTDIK, 2)",8,IGNFSTDIK,2,1
456,"(AEVSDVGNAILDGADCVMLSGETAK, 2)",8,AEVSDVGNAILDGADCVMLSGETAK,2,0
2083,"(CFIVGADNVGSK, 2)",10,CFIVGADNVGSK,2,1
20702,"(TEELEEESFPER, 2)",10,TEELEEESFPER,2,1
11797,"(LAPEECFSPLDLFNK, 2)",10,LAPEECFSPLDLFNK,2,1
15272,(M[15.9949]DALM[15.9949]PISGTSFAVGIDFHAENDTIYW...,6,M[15.9949]DALM[15.9949]PISGTSFAVGIDFHAENDTIYWT...,3,1


In [21]:
# Number of rows, i.e. distinct sequence/charge pairs, in the YHE010 occurrences table.
len(human_sequence_occurences_df)

25309

In [22]:
# False-positive rate: the share of sequence/charge pairs found in the human-only
# runs that are flagged non-human (human == 0).
non_human_pair_count = int((human_sequence_occurences_df['human'] == 0).sum())
non_human_pair_count / len(human_sequence_occurences_df)

0.1297562132047888

In [23]:
# Each distinct file_idx among the human-only rows counts as one run.
human_file_idxs = human_only_df['file_idx'].unique()
number_of_human_runs_in_experiment = len(human_file_idxs)
print('there are {} human-only runs in the experiment'.format(number_of_human_runs_in_experiment))

there are 10 human-only runs in the experiment


In [24]:
# Report how sparse the sequence/charge-by-run table is for the human-only runs.
human_unique_sequence_count = len(human_only_df['sequence'].unique())
print("there are {} unique extracted sequences.".format(human_unique_sequence_count))

# Runs in which each sequence/charge was NOT extracted.
human_sequence_occurences_df['missing_files'] = number_of_human_runs_in_experiment - human_sequence_occurences_df['number_of_runs_extracted']

# Missing cells divided by all cells (sequence/charge rows x runs).
human_missing_cell_count = human_sequence_occurences_df['missing_files'].sum()
human_total_cell_count = len(human_sequence_occurences_df) * number_of_human_runs_in_experiment
proportion_of_missing_values = human_missing_cell_count / human_total_cell_count
print("{}% of values are missing across the experiment.".format(round(proportion_of_missing_values*100)))

there are 23414 unique extracted sequences.
21.0% of values are missing across the experiment.
