In [1]:
import pandas as pd
import numpy as np
import sqlite3
import json
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# identifiers of the experiment and of the single run whose features are processed here;
# they are used to build the input/output paths and to filter the features table by run_name
EXPERIMENT_NAME = 'P3856'
RUN_NAME = 'P3856_YHE211_1_Slot1-1_1_5104'

In [3]:
# tolerances that define the window around a feature inside which another feature counts as a duplicate
MZ_TOLERANCE_PPM = 5
# the same m/z tolerance expressed as a percentage (1 ppm == 10**-4 percent)
MZ_TOLERANCE_PERCENT = MZ_TOLERANCE_PPM * 10**-4
# half-width of the window around scan_apex, in the same units as scan_apex
SCAN_TOLERANCE = 2
# half-width of the window around rt_apex, in the same units as rt_apex (presumably seconds - TODO confirm)
RT_TOLERANCE = 1

##### 3DID parameters

##### PASEF parameters

In [4]:
# locations of the detected features (input, SQLite) and of the de-duplicated features (output, pickle)
EXPERIMENT_DIR = '/data2/experiments/{}'.format(EXPERIMENT_NAME)
FEATURES_DIR = '{}/features'.format(EXPERIMENT_DIR)
FEATURES_FILE = '{}/detected-features-no-recal.sqlite'.format(FEATURES_DIR)
FEATURES_DEDUP_FILE = '{}/exp-{}-run-{}-features-pasef-dedup.pkl'.format(FEATURES_DIR, EXPERIMENT_NAME, RUN_NAME)
# name of the features column holding the m/z value used for duplicate matching
MZ_COLUMN_NAME = 'monoisotopic_mz'

In [5]:
# retention-time window (same units as rt_apex) of the features to load
RT_LOWER = 1650
RT_UPPER = 2200

# Load this run's features whose apex falls inside the RT window. The values are bound as
# query parameters rather than formatted into the SQL text, and the connection is closed
# even if the query raises.
db_conn = sqlite3.connect(FEATURES_FILE)
try:
    features_df = pd.read_sql_query(
        "select * from features where rt_apex >= ? and rt_apex <= ? and run_name = ?",
        db_conn,
        params=(RT_LOWER, RT_UPPER, RUN_NAME),
    )
finally:
    db_conn.close()

In [6]:
# report how many features were loaded before any duplicates are removed
print('there are {} features prior to de-dup'.format(features_df.shape[0]))

there are 252796 features prior to de-dup


In [8]:
# add temporary 'dup_' helper columns describing the matching window around each feature
features_df['dup_mz'] = features_df[MZ_COLUMN_NAME]  # shorthand to reduce verbosity
# absolute m/z tolerance: MZ_TOLERANCE_PERCENT is a percentage, hence the division by 100
features_df['dup_mz_ppm_tolerance'] = features_df.dup_mz * MZ_TOLERANCE_PERCENT / 100
features_df['dup_mz_lower'] = features_df.dup_mz - features_df.dup_mz_ppm_tolerance
features_df['dup_mz_upper'] = features_df.dup_mz + features_df.dup_mz_ppm_tolerance
features_df['dup_scan_lower'] = features_df.scan_apex - SCAN_TOLERANCE
features_df['dup_scan_upper'] = features_df.scan_apex + SCAN_TOLERANCE
features_df['dup_rt_lower'] = features_df.rt_apex - RT_TOLERANCE
features_df['dup_rt_upper'] = features_df.rt_apex + RT_TOLERANCE
# key of the form '<feature_id>,<precursor_id>' used to record which features to keep.
# Built with vectorised string concatenation rather than a row-wise apply: it is much
# faster on a large frame and also works when the frame is empty.
features_df['dup_composite_key'] = features_df.feature_id.astype(str) + ',' + features_df.precursor_id.astype(str)

In [9]:
# temporary helper columns to remove once the de-duplication is finished
columns_to_drop_l = [
    'dup_mz',
    'dup_mz_ppm_tolerance',
    'dup_mz_lower',
    'dup_mz_upper',
    'dup_scan_lower',
    'dup_scan_upper',
    'dup_rt_lower',
    'dup_rt_upper',
    'dup_composite_key',
]


In [10]:
# order the features from most to least intense, so the first row of any subset is its most intense member
features_df = features_df.sort_values('intensity', ascending=False)

In [13]:
# For every feature, find all the features that fall strictly inside its m/z, scan and RT
# windows (this always includes the feature itself) and, for each charge among them, keep
# the first one in row order. features_df is sorted by descending intensity before this
# cell runs, so the first row of each charge group is the most intense one.
keep_keys = set()  # a set, so a key selected via many neighbours is stored only once
for row in features_df.itertuples():
    in_window = (
        (features_df.dup_mz > row.dup_mz_lower) & (features_df.dup_mz < row.dup_mz_upper)
        & (features_df.scan_apex > row.dup_scan_lower) & (features_df.scan_apex < row.dup_scan_upper)
        & (features_df.rt_apex > row.dup_rt_lower) & (features_df.rt_apex < row.dup_rt_upper)
    )
    # groupby keeps the row order within each group, so first() yields the key of the
    # most intense feature for each charge
    keep_keys.update(features_df[in_window].groupby('charge')['dup_composite_key'].first())
# downstream cells expect a list
keep_l = list(keep_keys)


In [15]:
# retain only the features whose composite key is in the keep list
dedup_df = features_df.loc[features_df['dup_composite_key'].isin(keep_l)].copy()

In [16]:
# report how many features were removed as duplicates
number_of_dups = len(features_df)-len(dedup_df)
# guard the percentage so an empty features_df reports 0% instead of raising ZeroDivisionError
percent_removed = round(number_of_dups/len(features_df)*100) if len(features_df) > 0 else 0
print('removed {} duplicates ({}% of the original detections)'.format(number_of_dups, percent_removed))
print('there are {} detected de-duplicated features'.format(len(dedup_df)))

removed 21287 duplicates (8% of the original detections)
there are 231509 detected de-duplicated features


In [17]:
# remove the helper columns we added earlier. Rebinding with errors='ignore' (instead of an
# in-place drop) means re-running this cell after the columns are gone does not raise KeyError.
dedup_df = dedup_df.drop(columns=columns_to_drop_l, errors='ignore')

In [18]:
# show a few random de-duplicated features; n is capped so fewer than 3 rows does not raise ValueError
dedup_df.sample(n=min(3, len(dedup_df)))

Unnamed: 0,monoisotopic_mz,charge,intensity,intensity_full_rt_extent,intensity_six_sigma,scan_apex,scan_lower,scan_upper,rt_apex,rt_lower,rt_upper,precursor_id,envelope,deconvolution_score,envelope_mono_peak_intensity,feature_id,monoisotopic_mass,run_name
6246,822.400316,1,20287.0,43200,39894,489.35,469.35,509.35,1822.69,1812.69,1832.69,19345,"[[822.3993952060689, 10762.196394618164], [823...",118.649454,10762.196395,1934505,821.393016,P3856_YHE211_1_Slot1-1_1_5104
91383,1075.882019,3,16947.0,987,809,128.83,108.83,148.83,1965.46,1955.46,1975.46,29128,"[[1076.5471696522732, 5460.620970188946], [107...",58.31879,5460.62097,2912803,3224.624158,P3856_YHE211_1_Slot1-1_1_5104
136775,979.41737,4,137678.0,429884,427483,494.71,474.71,514.71,1964.34,1954.34,1974.34,28210,"[[979.4311231131832, 19148.0], [979.6694982165...",612.129611,19148.0,2821001,3913.640281,P3856_YHE211_1_Slot1-1_1_5104


In [19]:
# save the de-duplicated features as a pickle at FEATURES_DEDUP_FILE
dedup_df.to_pickle(FEATURES_DEDUP_FILE)