In [1]:
import pandas as pd
import numpy as np
import sqlite3
import json
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# name of the experiment; substituted into the experiment directory path below
EXPERIMENT_NAME = 'P3856'

In [3]:
# m/z matching tolerance, in parts-per-million
MZ_TOLERANCE_PPM = 5
# the same tolerance expressed as a percentage (1 ppm == 10**-4 percent)
MZ_TOLERANCE_PERCENT = MZ_TOLERANCE_PPM * 10**-4
# half-width of the window around a feature's scan_apex (subtracted from and added to the apex)
SCAN_TOLERANCE = 2
# half-width of the window around a feature's rt_apex (subtracted from and added to the apex)
# NOTE(review): units are not stated in this notebook - presumably seconds; confirm
RT_TOLERANCE = 1

In [21]:
# locations of the detected-features input and of the de-duplicated output
EXPERIMENT_DIR = '/Users/darylwilding-mcbride/Downloads/experiments/{}'.format(EXPERIMENT_NAME)
# PASEF (disabled)
# FEATURES_FILE_NAME = '{}/features/P3856_YHE211_1-features.pkl'.format(EXPERIMENT_DIR)
# DEDUP_FEATURES_FILE_NAME = '{}/features/P3856_YHE211_1-features-dedup.pkl'.format(EXPERIMENT_DIR)
# 3DID (active) - both file names are expanded from the same experiment directory
FEATURES_FILE_NAME, DEDUP_FEATURES_FILE_NAME = (
    template.format(EXPERIMENT_DIR)
    for template in (
        '{}/features-3did/experiment-features.pkl',
        '{}/features-3did/experiment-features-dedup.pkl',
    )
)

# alternative location (disabled)
# EXPERIMENT_DIR = '/data2/experiments/{}'.format(EXPERIMENT_NAME)
# FEATURES_FILE_NAME = '{}/recalibrated-features/experiment-features.pkl'.format(EXPERIMENT_DIR)

In [22]:
# hard-coded index compared against percolator_idx; in the cells shown here it is
# only referenced by a commented-out filter on the loaded features
perc_id = 29

# (disabled) look the index up from the mapping file for a selected run instead of hard-coding it
# MAPPING_FILE = '{}/recalibrated-percolator-output/percolator-idx-mapping.csv'.format(EXPERIMENT_DIR)
# SELECTED_RUN = 'P3856_YHE211_1_Slot1-1_1_5104'
# map_df = pd.read_csv(MAPPING_FILE, sep=',')
# perc_id = map_df[(map_df.run_name == SELECTED_RUN)].iloc[0].file_idx

In [24]:
# load the detected features from the pickle at FEATURES_FILE_NAME
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source
features_df = pd.read_pickle(FEATURES_FILE_NAME)
# (disabled) keep only the rows whose percolator_idx equals perc_id
# features_df = features_df[(features_df.percolator_idx == perc_id)]

In [25]:
# report how many features were loaded before de-duplication (message typo 'priot' corrected)
print('there are {} features prior to de-dup'.format(len(features_df)))

there are 24400 features priot to de-dup


In [26]:
# display the column names of the loaded features
features_df.columns

Index(['monoisotopic_mz', 'charge', 'intensity', 'intensity_full_rt_extent',
       'scan_apex', 'scan_curve_fit', 'scan_lower', 'scan_upper', 'rt_apex',
       'rt_curve_fit', 'rt_lower', 'rt_upper', 'precursor_id', 'envelope',
       'feature_id', 'candidate_phr_error', 'mono_adjusted',
       'original_phr_error', 'original_phr', 'monoisotopic_mass', 'run_name'],
      dtype='object')

In [29]:
# features_df['mz'] = features_df.recalibrated_monoisotopic_mz  # shorthand to reduce verbosity
features_df['mz'] = features_df.monoisotopic_mz  # shorthand to reduce verbosity
features_df['mz_ppm_tolerance'] = features_df.mz * MZ_TOLERANCE_PERCENT / 100
features_df['mz_lower'] = features_df.mz - features_df.mz_ppm_tolerance
features_df['mz_upper'] = features_df.mz + features_df.mz_ppm_tolerance
features_df['scan_lower'] = features_df.scan_apex - SCAN_TOLERANCE
features_df['scan_upper'] = features_df.scan_apex + SCAN_TOLERANCE
features_df['rt_lower'] = features_df.rt_apex - RT_TOLERANCE
features_df['rt_upper'] = features_df.rt_apex + RT_TOLERANCE
# features_df['composite_key'] = features_df.apply(lambda row: '{},{},{}'.format(row.feature_id, row.precursor_id, row.percolator_idx), axis=1)
features_df['composite_key'] = features_df.apply(lambda row: '{},{}'.format(row.feature_id, row.precursor_id), axis=1)

In [30]:
# remove these after we're finished
columns_to_drop_l = []
columns_to_drop_l.append('mz')
columns_to_drop_l.append('mz_ppm_tolerance')
columns_to_drop_l.append('mz_lower')
columns_to_drop_l.append('mz_upper')
columns_to_drop_l.append('scan_lower')
columns_to_drop_l.append('scan_upper')
columns_to_drop_l.append('rt_lower')
columns_to_drop_l.append('rt_upper')
columns_to_drop_l.append('composite_key')


In [31]:
features_df.sort_values(by=['intensity'], ascending=False, inplace=True)

In [32]:
# see if any detections have a duplicate - if so, find the dup with the highest intensity and keep it
keep_l = []
for row in features_df.itertuples():
    dup_df = features_df[(features_df.mz > row.mz_lower) & (features_df.mz < row.mz_upper) & (features_df.scan_apex > row.scan_lower) & (features_df.scan_apex < row.scan_upper) & (features_df.rt_apex > row.rt_lower) & (features_df.rt_apex < row.rt_upper)].copy()
    # group the dups by charge - take the most intense for each charge
    for group_name,group_df in dup_df.groupby(['charge'], as_index=False):
        keep_l.append(group_df.iloc[0].composite_key)


In [33]:
# keep only the features whose composite key made it into the keep list
dedup_df = features_df.loc[features_df['composite_key'].isin(keep_l)].copy()

In [34]:
# summarise how many features the de-duplication removed and how many remain
number_of_dups = len(features_df) - len(dedup_df)
print(
    'removed {} duplicates ({}% of the original detections)'.format(
        number_of_dups,
        round(number_of_dups / len(features_df) * 100),
    )
)
print('there are {} detected de-duplicated features'.format(len(dedup_df)))

removed 333 duplicates (1% of the original detections)
there are 24067 detected de-duplicated features


In [35]:
# drop the helper columns named in columns_to_drop_l from the de-duplicated frame
dedup_df.drop(columns=columns_to_drop_l, inplace=True)

In [36]:
# display three randomly-sampled rows of the de-duplicated features as a spot check
dedup_df.sample(n=3)

Unnamed: 0,monoisotopic_mz,charge,intensity,intensity_full_rt_extent,scan_apex,scan_curve_fit,rt_apex,rt_curve_fit,precursor_id,envelope,feature_id,candidate_phr_error,mono_adjusted,original_phr_error,original_phr,monoisotopic_mass,run_name
1,705.060853,1,1079.0,591,563.15,False,1694.9,False,3788,"((705.0609, 591.00), (706.0585, 488.00), (707....",378802,,False,1.19888,0.825719,704.053553,P3856_YHE211_1_Slot1-1_1_5104
0,848.927529,2,54134.0,20649,393.42,True,1703.37,False,9199,"((848.9274, 21147.00), (849.4293, 20378.00), (...",919901,,False,0.0544124,0.963636,1695.840458,P3856_YHE211_1_Slot1-1_1_5104
1,918.229762,1,434.0,104,338.0,False,1930.43,False,11294,"((918.2454, 269.00), (919.2494, 165.00), (920....",1129402,,False,0.25025,0.613383,917.222462,P3856_YHE211_1_Slot1-1_1_5104


In [37]:
# write the de-duplicated features to the pickle at DEDUP_FEATURES_FILE_NAME
dedup_df.to_pickle(DEDUP_FEATURES_FILE_NAME)