In [8]:
import pandas as pd
import pickle
import numpy as np

In [9]:
# Tolerances that define when two features count as duplicates of each other.
# Each one is applied as a +/- window around the corresponding feature attribute.
DUP_MZ_TOLERANCE_PPM = 5  # m/z window half-width, in parts-per-million of the feature's m/z
DUP_SCAN_TOLERANCE = 10  # window half-width around scan_apex (presumably in scans - TODO confirm)
DUP_RT_TOLERANCE = 5  # window half-width around rt_apex (presumably in seconds - TODO confirm)

In [13]:
# Identifies which features file to load; these values are interpolated into
# the directory and file names built in the following cell.
# NOTE(review): experiment_base_dir is an absolute, machine-specific path - adjust per host.
experiment_base_dir = '/media/big-ssd/experiments'
experiment_name = 'P3856'
precursor_definition_method = 'pasef'
run_name = 'P3856_YHE211_1_Slot1-1_1_5104'

In [34]:
# locate the pre-dedup features pickle for this experiment / run
# (note the features directory used here carries a '-backup' suffix)
EXPERIMENT_DIR = f'{experiment_base_dir}/{experiment_name}'
FEATURES_DIR = f'{EXPERIMENT_DIR}/features-{precursor_definition_method}-backup'
FEATURES_FILE = f'{FEATURES_DIR}/exp-{experiment_name}-run-{run_name}-features-{precursor_definition_method}.pkl'


In [35]:
# load the features: the pickle holds a dict-like payload, and the dataframe
# of features is stored under its 'features_df' key
# NOTE: pickle.load can execute arbitrary code - only open files from a trusted source
with open(FEATURES_FILE, 'rb') as handle:
    features_df = pickle.load(handle)['features_df']

In [36]:
# number of features loaded, i.e. before any de-duplication
len(features_df)

155262

In [38]:
# Trial run of the duplicate-detection logic on a random sample of features.
#
# For each sampled feature, every feature whose monoisotopic m/z, scan apex and
# RT apex all fall strictly inside the sampled feature's tolerance windows is
# treated as a duplicate candidate. Within each candidate group, the most
# intense feature of each charge state is kept.
#
# Produces:
#   selected_features_df - the sampled features that were examined
#   dups_l               - one dataframe per sampled feature that had more than one candidate
#   keep_l               - feature_ids to keep (may contain repeats)
#   dedup_df             - the rows of features_df whose feature_id is in keep_l
#
# Side effects: adds the dup_* helper columns to features_df and re-sorts it in place.

# sample size and seed are fixed so that re-running the cell examines the same
# features (and therefore positional look-ups into dups_l stay meaningful)
DUP_SAMPLE_SIZE = 100
DUP_SAMPLE_SEED = 0

# set up dup definitions
# ppm * 10**-4 is the tolerance as a percentage; dividing by 100 below turns it
# into a fraction, so the m/z tolerance is mz * DUP_MZ_TOLERANCE_PPM * 1e-6
MZ_TOLERANCE_PERCENT = DUP_MZ_TOLERANCE_PPM * 10**-4
features_df['dup_mz'] = features_df['monoisotopic_mz']  # shorthand to reduce verbosity
features_df['dup_mz_ppm_tolerance'] = features_df.dup_mz * MZ_TOLERANCE_PERCENT / 100
features_df['dup_mz_lower'] = features_df.dup_mz - features_df.dup_mz_ppm_tolerance
features_df['dup_mz_upper'] = features_df.dup_mz + features_df.dup_mz_ppm_tolerance
features_df['dup_scan_lower'] = features_df.scan_apex - DUP_SCAN_TOLERANCE
features_df['dup_scan_upper'] = features_df.scan_apex + DUP_SCAN_TOLERANCE
features_df['dup_rt_lower'] = features_df.rt_apex - DUP_RT_TOLERANCE
features_df['dup_rt_upper'] = features_df.rt_apex + DUP_RT_TOLERANCE

# helper columns to remove after we're finished
# NOTE: this list is only built here; nothing in this cell applies it, so the
# helper columns are still present on features_df, dups_l entries and dedup_df
columns_to_drop_l = [
    'dup_mz',
    'dup_mz_ppm_tolerance',
    'dup_mz_lower',
    'dup_mz_upper',
    'dup_scan_lower',
    'dup_scan_upper',
    'dup_rt_lower',
    'dup_rt_upper',
]

# sort by decreasing intensity, so the first row of any subset is its most intense feature
features_df.sort_values(by=['feature_intensity'], ascending=False, inplace=True)

# seeded for reproducibility; capped so a small frame does not raise ValueError
selected_features_df = features_df.sample(
    n=min(DUP_SAMPLE_SIZE, len(features_df)),
    random_state=DUP_SAMPLE_SEED,
)

# see if any detections have a duplicate - if so, find the dup with the highest intensity (i.e. the first in the group) and keep it
keep_l = []
dups_l = []
for row in selected_features_df.itertuples():
    # strict inequalities: a feature sitting exactly on a window edge is not a candidate
    in_window = (
        (features_df.dup_mz > row.dup_mz_lower) & (features_df.dup_mz < row.dup_mz_upper)
        & (features_df.scan_apex > row.dup_scan_lower) & (features_df.scan_apex < row.dup_scan_upper)
        & (features_df.rt_apex > row.dup_rt_lower) & (features_df.rt_apex < row.dup_rt_upper)
    )
    dup_df = features_df[in_window].copy()
    # the sampled feature always matches itself, so more than one row means a real duplicate
    if len(dup_df) > 1:
        dups_l.append(dup_df)
    # group the dups by charge - take the most intense for each charge
    # (scalar key: a length-1 list key warns in pandas 1.5 and yields tuple keys in 2.x)
    for _, group_df in dup_df.groupby('charge'):
        # index the column rather than the row so feature_id keeps its own dtype
        keep_l.append(group_df.feature_id.iloc[0])

# remove any features that are not in the keep list
dedup_df = features_df[features_df.feature_id.isin(keep_l)].copy()


In [46]:
# inspect one group of duplicate candidates, showing the identifying columns and the m/z window bounds
# NOTE(review): index 50 is positional in dups_l, so which group is shown (and whether
# it exists at all) depends on which features were sampled when dups_l was built
dups_l[50][['monoisotopic_mz','charge','scan_apex','rt_apex','feature_intensity','feature_id','dup_mz_lower','dup_mz_upper']]

Unnamed: 0,monoisotopic_mz,charge,scan_apex,rt_apex,feature_intensity,feature_id,dup_mz_lower,dup_mz_upper
3,936.443668,2,342.0,1706.014571,677582.0,1208704,936.438986,936.448351
4,936.442928,2,342.0,1705.486022,633215.0,1231505,936.438246,936.44761
3,936.441622,2,341.0,1706.014571,622063.0,1194904,936.436939,936.446304
3,936.441549,2,341.0,1706.014571,618879.0,1193404,936.436867,936.446231
2,936.444472,2,337.0,1705.486022,560329.0,1156203,936.43979,936.449155
4,936.444792,2,341.0,1705.486022,557413.0,1266005,936.44011,936.449474
1,936.443811,2,342.0,1704.957376,551478.0,1262602,936.439129,936.448493
4,936.440322,2,343.0,1705.486022,502464.0,1202805,936.43564,936.445004


In [48]:
# sanity check: half the width of the m/z window (dup_mz_upper - dup_mz_lower) taken
# from the first row of the table above; it should be about mz * DUP_MZ_TOLERANCE_PPM * 1e-6
(936.448351-936.438986)/2

0.004682500000001255