In [68]:
import sqlite3
import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt

In [69]:
# Load the feature characteristics table that the rest of the notebook de-duplicates.
ms1_features_df = pd.read_csv(filepath_or_buffer='/Users/darylwilding-mcbride/Downloads/HeLa_20KInt-feature-characteristics.csv')

In [70]:
# number of feature rows loaded, before any de-duplication
ms1_features_df.shape[0]

72970

In [71]:
# Matching tolerances. Each one is applied as +/- this amount around a feature's own value.
# m/z tolerance, in parts per million of the feature's m/z
MZ_TOLERANCE_PPM = 5
# the same m/z tolerance expressed as a percentage (1 ppm is 10**-4 percent)
MZ_TOLERANCE_PERCENT = MZ_TOLERANCE_PPM * 10**-4
# tolerance applied to scan_apex -- presumably a number of scans; TODO confirm
SCAN_TOLERANCE = 10
# tolerance applied to rt_apex -- in whatever units the rt_apex column uses; TODO confirm
RT_TOLERANCE = 0.1

In [72]:
# De-duplicate the MS1 features.
#
# Repeatedly take the first remaining feature as a seed, gather every remaining feature
# whose m/z, scan apex and RT apex all fall within the configured tolerances of the seed
# (the seed matches itself as long as its own values are not NaN), keep only the most
# intense member of that group, and remove the whole group from further consideration.
#
# Result: ms1_features_l, a list with one tuple per group holding the values of the most
# intense row followed by the number of rows the group contained ('duplicates').
scratch_df = ms1_features_df.copy() # work on a copy because matched rows are removed as we go
ms1_features_l = []
while len(scratch_df) > 0:
    # the first remaining row is the seed for this group
    row = scratch_df.iloc[0]
    mz = row.monoisotopic_mz
    scan = row.scan_apex
    rt = row.rt_apex

    # calculate the matching bounds; the m/z window scales with the seed's m/z
    mz_ppm_tolerance = mz * MZ_TOLERANCE_PERCENT / 100
    mz_lower = mz - mz_ppm_tolerance
    mz_upper = mz + mz_ppm_tolerance
    scan_lower = scan - SCAN_TOLERANCE
    scan_upper = scan + SCAN_TOLERANCE
    rt_lower = rt - RT_TOLERANCE
    rt_upper = rt + RT_TOLERANCE

    # find the matches within these tolerances (between() includes both bounds)
    within_mz = scratch_df.monoisotopic_mz.between(mz_lower, mz_upper)
    within_scan = scratch_df.scan_apex.between(scan_lower, scan_upper)
    within_rt = scratch_df.rt_apex.between(rt_lower, rt_upper)
    matches_df = scratch_df[within_mz & within_scan & within_rt]

    # the most intense match represents the group; record how many rows it stands for
    peak_df = matches_df.loc[matches_df.intensity.idxmax()].copy()
    peak_df['duplicates'] = len(matches_df)

    # add the most intense to the list
    ms1_features_l.append(tuple(peak_df))

    # Remove the matches by index label. This replaces the previous
    # isin()/boolean-mask/dropna() approach, which compared every cell of the frame on
    # each pass, upcast integer columns to float by blanking cells to NaN, and only
    # removed rows as a side effect of them becoming all-NaN. The index labels are
    # unique here (default index from read_csv), so exactly the matched rows are dropped.
    scratch_df = scratch_df.drop(index=matches_df.index)


In [73]:
# number of features remaining after de-duplication (one list entry per group of matches)
len(ms1_features_l)

53910

In [78]:
# report how many input rows were folded into another, more intense, feature
removed_count = len(ms1_features_df) - len(ms1_features_l)
print("de-duping removed {} peaks".format(removed_count))

de-duping removed 19060 peaks


In [76]:
# Turn the de-duplicated tuples back into a table. The names below are applied by
# position, so they have to line up with the order of the values in each tuple (the
# source row's values followed by the 'duplicates' count).
# NOTE(review): presumably the first nine names mirror the input CSV's column order -- confirm.
peak_columns = [
    'monoisotopic_mass',
    'charge',
    'monoisotopic_mz',
    'intensity',
    'scan_apex',
    'scan_curve_fit',
    'rt_apex',
    'rt_curve_fit',
    'precursor_id',
    'duplicates',
]
ms1_peaks_df = pd.DataFrame(data=ms1_features_l, columns=peak_columns)

In [77]:
# Save the de-duplicated features. With to_csv's default settings the row index is
# written as well, as an unnamed first column.
ms1_peaks_df.to_csv(path_or_buf='/Users/darylwilding-mcbride/Downloads/HeLa_20KInt-feature-characteristics-deduped.csv')