In [1]:
import json
import os
import sqlite3

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline

In [2]:
# identifier of the experiment; it is substituted into the experiment directory path below
EXPERIMENT_NAME = 'P3856'

In [19]:
# Half-widths of the matching window used when looking for duplicate extractions.
# Units of the scan and RT tolerances follow the units of scan_apex and rt_apex
# (presumably scan index and seconds - TODO confirm).
SCAN_TOLERANCE = 2
RT_TOLERANCE = 1

# m/z tolerance in parts-per-million, and the same value expressed as a percentage
# (1 ppm == 10**-4 percent)
MZ_TOLERANCE_PPM = 5
MZ_TOLERANCE_PERCENT = MZ_TOLERANCE_PPM * 10**-4

In [20]:
# location of the extracted features to load
# The base directory can be overridden with the EXPERIMENT_BASE_DIR environment variable so the
# notebook is not tied to one machine; when the variable is unset the original location is used.
EXPERIMENT_BASE_DIR = os.environ.get('EXPERIMENT_BASE_DIR', '/Users/darylwilding-mcbride/Downloads/experiments')
EXPERIMENT_DIR = '{}/{}'.format(EXPERIMENT_BASE_DIR, EXPERIMENT_NAME)
# NOTE: the run name (YHE211_1) is part of the file name, so this notebook analyses that single run
EXTRACTIONS_FILE_NAME = '{}/extracted-features/extracted-features-YHE211_1.pkl'.format(EXPERIMENT_DIR)

In [85]:
# read the extracted features into a DataFrame
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source
ext_df = pd.read_pickle(EXTRACTIONS_FILE_NAME)

In [86]:
# show which columns the extractions file provides
ext_df.columns

Index(['sequence', 'charge', 'run_name', 'short_run_name', 'run_group',
       'theoretical_mz', 'monoisotopic_mz_centroid', 'rt_apex', 'scan_apex'],
      dtype='object')

In [87]:
# Add the temporary helper columns used by the de-duplication step below.
ext_df['mz'] = ext_df.monoisotopic_mz_centroid  # shorthand to reduce verbosity

# absolute m/z tolerance of each extraction: MZ_TOLERANCE_PERCENT percent of its own m/z
ext_df['mz_ppm_tolerance'] = ext_df.mz * MZ_TOLERANCE_PERCENT / 100

# bounds of the matching window around each extraction in m/z, scan and retention time
ext_df['mz_lower'] = ext_df.mz - ext_df.mz_ppm_tolerance
ext_df['mz_upper'] = ext_df.mz + ext_df.mz_ppm_tolerance
ext_df['scan_lower'] = ext_df.scan_apex - SCAN_TOLERANCE
ext_df['scan_upper'] = ext_df.scan_apex + SCAN_TOLERANCE
ext_df['rt_lower'] = ext_df.rt_apex - RT_TOLERANCE
ext_df['rt_upper'] = ext_df.rt_apex + RT_TOLERANCE

# absolute difference between the theoretical and the measured m/z (the mass error)
ext_df['mz_delta'] = (ext_df.theoretical_mz - ext_df.mz).abs()

# 'sequence,charge,short_run_name' key for each extraction; built with vectorised string
# concatenation, which avoids a Python-level function call for every row
ext_df['composite_key'] = (
    ext_df['sequence'].astype(str)
    + ',' + ext_df['charge'].astype(str)
    + ',' + ext_df['short_run_name'].astype(str)
)

In [88]:
# helper columns added for the de-duplication; they are removed again once it is finished
columns_to_drop_l = [
    'mz',
    'mz_ppm_tolerance',
    'mz_lower',
    'mz_upper',
    'scan_lower',
    'scan_upper',
    'rt_lower',
    'rt_upper',
    'mz_delta',
    'composite_key',
]

In [89]:
# see if any extractions have a duplicate - if so, find the dup with the lowest mass error and keep it
#
# For every extraction we select all extractions (normally including itself) whose m/z, scan apex
# and RT apex fall strictly inside its tolerance window, and record the one with the smallest
# mz_delta. Each iteration scans the whole frame, so the run time grows quadratically with the
# number of extractions.
keep_columns_l = ['sequence', 'charge', 'short_run_name', 'composite_key']

# these column lookups do not change inside the loop, so do them once
mz_s = ext_df.mz
scan_apex_s = ext_df.scan_apex
rt_apex_s = ext_df.rt_apex

keep_l = []
for row in ext_df.itertuples():
    in_window = (
        (mz_s > row.mz_lower) & (mz_s < row.mz_upper)
        & (scan_apex_s > row.scan_lower) & (scan_apex_s < row.scan_upper)
        & (rt_apex_s > row.rt_lower) & (rt_apex_s < row.rt_upper)
    )
    dup_df = ext_df[in_window]
    if dup_df.empty:
        # The row is not even inside its own window, e.g. because one of its m/z, scan or RT
        # values is missing (comparisons with NaN are False). It cannot be a duplicate of
        # anything, so keep the row itself rather than failing on an empty selection.
        keep_l.append(pd.Series({name: getattr(row, name) for name in keep_columns_l}, name=row.Index))
        continue
    # keep the dup with the lowest m/z delta
    keep_l.append(dup_df.sort_values(by=['mz_delta'], ascending=True).iloc[0][keep_columns_l])

# with no extractions at all, still provide the expected columns so later cells can use them
keep_df = pd.DataFrame(keep_l) if keep_l else pd.DataFrame(columns=keep_columns_l)

In [90]:
# retain only the extractions whose composite key appears in the keep list
dedup_df = ext_df.loc[ext_df['composite_key'].isin(keep_df['composite_key'])].copy()

In [91]:
# number of extractions before de-duplication
ext_df.shape[0]

20784

In [92]:
# number of extractions remaining after de-duplication
dedup_df.shape[0]

20680

In [93]:
# remove the columns we added earlier
# Rebinding the result (instead of dropping in place) and ignoring columns that are already
# gone means this cell can be re-run without raising a KeyError.
dedup_df = dedup_df.drop(columns=columns_to_drop_l, errors='ignore')

In [94]:
# peek at three randomly chosen rows (no random_state is passed, so the rows can differ between runs)
dedup_df.sample(3)

Unnamed: 0,sequence,charge,run_name,short_run_name,run_group,theoretical_mz,monoisotopic_mz_centroid,rt_apex,scan_apex
242529,GLSPAQADSQFLENAK,2,P3856_YHE211_1_Slot1-1_1_5104,YHE211_1,YHE211,838.417926,838.416762,1833.579093,460
255242,VQENLLANGVDLVTYITR,2,P3856_YHE211_1_Slot1-1_1_5104,YHE211_1,YHE211,1009.549476,1009.548148,2282.897074,249
248536,NALESYAFNMK,2,P3856_YHE211_1_Slot1-1_1_5104,YHE211_1,YHE211,644.305526,644.30508,1874.861377,708
