In [1]:
import pandas as pd
import numpy as np
import sqlite3
import json
import matplotlib.pyplot as plt
from scipy.spatial import distance
from sklearn import preprocessing

In [2]:
# Path of the SQLite database whose `features` table is loaded in the next cell.
EXTRACTED_FEATURES_DB_NAME = '/data/experiments/YHE/extracted-features/extracted-features.sqlite'

In [3]:
# Load the entire `features` table into a DataFrame.
# The connection is closed in a `finally` block so it is released even when the
# query raises (e.g. missing table); previously a failure left it open.
db_conn = sqlite3.connect(EXTRACTED_FEATURES_DB_NAME)
try:
    sequences_df = pd.read_sql_query("select * from features", db_conn)
finally:
    db_conn.close()

In [15]:
# Keep only the rows classed as 'target'; take a copy so that columns can be
# added to the subset later without touching `sequences_df`.
targets_only_df = sequences_df.loc[sequences_df['classed_as'] == 'target'].copy()

In [6]:
# Pickled DataFrame of the experiment's features.
# NOTE(review): unpickling can execute arbitrary code embedded in the file, so
# only load this from a trusted location.
EXPERIMENT_FEATURES_NAME = '/data/experiments/YHE/recalibrated-features/experiment-features.pkl'
experiment_features_df = pd.read_pickle(EXPERIMENT_FEATURES_NAME)

In [9]:
# Read the percolator identifications (the target PSMs file) for this experiment.
EXPERIMENT_NAME = 'YHE'
# Both the output directory and the PSM file name are derived from the experiment name.
PERCOLATOR_OUTPUT_DIR = "/data/experiments/{}/recalibrated-percolator-output".format(EXPERIMENT_NAME)
PERCOLATOR_OUTPUT_FILE_NAME = "{}/{}.percolator.target.psms.txt".format(PERCOLATOR_OUTPUT_DIR, EXPERIMENT_NAME)

# The file is tab-separated.
psms_df = pd.read_csv(PERCOLATOR_OUTPUT_FILE_NAME, sep='\t')

In [28]:
# Left-join the percolator PSMs onto the experiment features, pairing
# (percolator_idx, feature_id) with the PSM's (file_idx, scan). Being a left
# join, every feature row is kept; features without a PSM get NaN in the
# PSM-derived columns.
detected_features_df = experiment_features_df.merge(
    psms_df,
    how='left',
    left_on=['percolator_idx', 'feature_id'],
    right_on=['file_idx', 'scan'],
)

In [29]:
# Show the merged frame's columns; a name present in both inputs (here `charge`)
# appears twice with the `_x` (features) and `_y` (PSMs) suffixes.
detected_features_df.columns

Index(['feature_id', 'charge_x', 'rt_apex', 'rt_peak_width', 'scan_apex',
       'scan_peak_width', 'intensity', 'precursor_id', 'monoisotopic_mass',
       'predicted_mass_error', 'recalibrated_monoisotopic_mass',
       'recalibrated_monoisotopic_mz', 'percolator_idx', 'file_idx', 'scan',
       'charge_y', 'spectrum precursor m/z', 'spectrum neutral mass',
       'peptide mass', 'percolator score', 'percolator q-value',
       'percolator PEP', 'total matches/spectrum', 'sequence', 'protein id',
       'flanking aa'],
      dtype='object')

In [30]:
# Give the q-value column an identifier-friendly name so it can be read as an
# attribute (e.g. `row.q_value` on itertuples rows). The result is rebound
# rather than using `inplace=True`, which pandas discourages.
detected_features_df = detected_features_df.rename(columns={"percolator q-value": "q_value"})

In [31]:
# Peek at a few random rows of the merged frame. The fixed seed makes the
# displayed sample the same on every re-run of the notebook.
detected_features_df.sample(n=10, random_state=0)

Unnamed: 0,feature_id,charge_x,rt_apex,rt_peak_width,scan_apex,scan_peak_width,intensity,precursor_id,monoisotopic_mass,predicted_mass_error,...,spectrum precursor m/z,spectrum neutral mass,peptide mass,percolator score,q_value,percolator PEP,total matches/spectrum,sequence,protein id,flanking aa
4334148,951002,2,1529.54,3.32,694.86,44.2,82860.723261,9510,1261.565285,-0.001739,...,,,,,,,,,,
4173344,3030301,3,1808.92,5.52,546.98,104.07,62530.0,30303,3606.621088,-0.008125,...,,,,,,,,,,
70353,5861202,2,2146.39,20.0,498.41,31.81,88071.0,58612,1798.97479,-0.003992,...,899.993,1797.9715,1741.9621,-0.385855,0.577081,0.988581,628.84,ILRAEKAVACSGAAQVR,sp|Q92797|SYMPK_HUMAN,RI
719785,2740501,2,1718.0,5.5,841.71,19.32,11092.0,27405,862.437484,-0.009358,...,,,,,,,,,,
3946924,5294501,2,2043.12,31.13,692.63,18.24,52288.0,52945,1202.576632,-0.008301,...,,,,,,,,,,
2892985,1862603,2,1564.15,46.03,740.78,47.67,39278.0,18626,1112.539933,-0.006144,...,556.7767,1111.5388,1112.5462,-0.092858,0.050527,0.643702,628.84,VDASGPDVPTR,sp|Q8N158|GPC2_HUMAN,KR
2613734,1031203,2,1480.01,43.1,812.97,83.24,12362.0,10312,969.469208,-0.00283,...,485.2397,968.4648,969.4702,-0.326957,0.50274,0.970577,628.84,TAM[15.9949]RANYK,sp|Q9P0W8|SPAT7_HUMAN,KN
3651135,3002402,2,1741.84,89.96,487.26,86.08,117773.0,30024,1854.900973,-0.002001,...,,,,,,,,,,
1546076,4434303,2,1909.09,40.95,538.51,45.27,109481.0,44343,1696.835672,-0.010288,...,,,,,,,,,,
3282198,4263001,2,1903.33,8.99,445.25,75.0,344057.0,42630,1964.019079,-0.012592,...,982.5195,1963.0244,1964.0367,0.103005,0.001588,0.036716,628.84,HNQLPLVIEFTEQTAPK,sp|P07237|PDIA1_HUMAN,KI


In [13]:
def myround(x, base=5):
    """Snap ``x`` to the nearest multiple of ``base``.

    Args:
        x: value to round; anything that supports division by ``base`` and the
            built-in ``round`` (the notebook also passes pandas Series).
        base: spacing of the grid to snap to (default 5).

    Returns:
        ``base`` multiplied by the rounded quotient ``x / base``.

    Note:
        The built-in ``round`` is used, so for plain Python numbers an exact
        tie goes to the even quotient: ``myround(25, base=10)`` gives 20.
    """
    n_steps = round(x / base)
    return base * n_steps

In [33]:
# Derive coarse coordinates for the extracted targets so they can be compared
# by equality: m/z rounded to 2 decimal places, RT apex and scan apex snapped
# to multiples of 10.
targets_only_df['rounded_monoisotopic_mz_centroid'] = targets_only_df['monoisotopic_mz_centroid'].round(2)
targets_only_df['rounded_rt_apex'] = myround(targets_only_df['rt_apex'], base=10)
targets_only_df['rounded_scan_apex'] = myround(targets_only_df['scan_apex'], base=10)

In [34]:
# Same coarse coordinates for the detected features; here the m/z comes from
# the recalibrated monoisotopic m/z column (rounded to 2 decimal places), and
# RT apex and scan apex are snapped to multiples of 10.
detected_features_df['rounded_monoisotopic_mz_centroid'] = detected_features_df['recalibrated_monoisotopic_mz'].round(2)
detected_features_df['rounded_rt_apex'] = myround(detected_features_df['rt_apex'], base=10)
detected_features_df['rounded_scan_apex'] = myround(detected_features_df['scan_apex'], base=10)

In [35]:
# For a random sample of extracted target features: was a feature detected in
# the same run with the same charge and the same rounded m/z, RT-apex and
# scan-apex coordinates? Prints the identifications attached to each match.
# The sample is seeded so the printed results are reproducible across re-runs.
# NOTE(review): `file_idx` comes from the PSM side of the left merge, so
# detected features without a PSM presumably have NaN there and never match.
for row in targets_only_df.sample(n=100, random_state=0).itertuples():
    match_mask = (
        (detected_features_df.file_idx == row.file_idx)
        & (detected_features_df.charge_x == row.charge)
        & (detected_features_df.rounded_monoisotopic_mz_centroid == row.rounded_monoisotopic_mz_centroid)
        & (detected_features_df.rounded_rt_apex == row.rounded_rt_apex)
        & (detected_features_df.rounded_scan_apex == row.rounded_scan_apex)
    )
    df = detected_features_df[match_mask]
    if len(df) == 0:
        print('not detected')
    else:
        print('detected {}'.format(len(df)))
        print('\textracted sequence: {}'.format(row.sequence))
        # one line per identification that shares the feature's coordinates
        for ident_row in df.itertuples():
            print('\t\tident seq: {}, {}'.format(ident_row.sequence, ident_row.q_value))


detected 1
	extracted sequence: ILDILGETCK
		ident seq: EITALAPSTMK, 0.52421536
detected 1
	extracted sequence: LFLASLAAAGSGTDAQVALVNEVK
		ident seq: LFLASLAAAGSGTDAQVALVNEVK, 3.7296314999999997e-06
detected 1
	extracted sequence: CIALAQLLVEQNFPAIAIHR
		ident seq: CIALAQLLVEQNFPAIAIHR, 8.6396074e-06
detected 2
	extracted sequence: LLGNTFVALSDLR
		ident seq: LLGNTFVALSDLR, 3.7296314999999997e-06
		ident seq: LLGNTFVALSDLR, 3.7296314999999997e-06
detected 1
	extracted sequence: VTWFLDDEAGALIPENC
		ident seq: VTWFLDDEAGALIPENC, 8.6396074e-06
not detected
not detected
not detected
detected 1
	extracted sequence: QEALIESLCEK
		ident seq: QEALIESLCEK, 0.0044217496
detected 2
	extracted sequence: TM[15.9949]TSEIASQEEGATIQWM[15.9949]SSVEAK
		ident seq: SCNSIVDCISWLEVFCAELTSR, 0.23521339
		ident seq: THVISHSYCEHM[15.9949]AVVALTCGDSR, 0.18811958
not detected
detected 2
	extracted sequence: YIAIVSTTVETKEPEK
		ident seq: YIAIVSTTVETKEPEK, 8.6396074e-06
		ident seq: YIAIVSTTVETKEPEK, 3.729631499999