In [44]:
import sqlite3
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [45]:
# Load the MS1 feature characteristics.
# The file is resolved relative to the current user's home directory instead of
# a hard-coded /Users/<name> prefix, so the notebook is not tied to one account.
FEATURES_CSV = Path.home() / 'Downloads' / 'HeLa_20KInt-feature-characteristics.csv'
ms1_features_df = pd.read_csv(FEATURES_CSV)

In [46]:
# number of MS1 features loaded (one row per feature)
ms1_features_df.shape[0]

72970

In [47]:
# retention-time apex range covered by the loaded features, as (min, max)
ms1_features_df['rt_apex'].min(), ms1_features_df['rt_apex'].max()

(4345.07, 4570.72)

In [48]:
# Matching tolerances: two features are treated as duplicates when they differ
# by no more than +/- these amounts in every dimension.
MZ_TOLERANCE_PPM = 5  # m/z tolerance in parts-per-million
MZ_TOLERANCE_PERCENT = MZ_TOLERANCE_PPM * 10**-4  # the same tolerance as a percentage (1 ppm = 10**-4 percent)
SCAN_TOLERANCE = 10  # applied to scan_apex; presumably in scan units — TODO confirm
RT_TOLERANCE = 0.1  # applied to rt_apex; presumably in seconds — TODO confirm

In [84]:
# Work on an independent deep copy: the de-duplication step removes rows from
# scratch_df, and ms1_features_df must stay intact.
scratch_df = ms1_features_df.copy(deep=True)

In [92]:
# Greedy de-duplication of the MS1 features held in scratch_df.
# Each pass takes the first remaining feature as a seed, gathers every remaining
# feature within the m/z, scan and RT tolerances of that seed, records the most
# intense member of that group (with the group size in 'duplicates'), and then
# removes the whole group from scratch_df.
# NOTE: this cell consumes scratch_df in place and resets ms1_features_l, so
# re-running it without re-creating scratch_df processes the *next* batch of seeds.
MAX_SEED_FEATURES = 20  # cap on the number of seed features processed per run

ms1_features_l = []
for _ in range(MAX_SEED_FEATURES):
    # nothing left to de-duplicate; iloc[0] on an empty frame would raise IndexError
    if scratch_df.empty:
        break

    # take the first remaining row as the seed
    row = scratch_df.iloc[0]
    mz = row.monoisotopic_mz
    scan = row.scan_apex
    rt = row.rt_apex

    # calculate the matching bounds (the ppm tolerance scales with the seed's m/z)
    mz_ppm_tolerance = mz * MZ_TOLERANCE_PERCENT / 100
    mz_lower = mz - mz_ppm_tolerance
    mz_upper = mz + mz_ppm_tolerance
    scan_lower = scan - SCAN_TOLERANCE
    scan_upper = scan + SCAN_TOLERANCE
    rt_lower = rt - RT_TOLERANCE
    rt_upper = rt + RT_TOLERANCE

    # find the matches within these tolerances (bounds are inclusive at both ends)
    cond_1 = (
        scratch_df.monoisotopic_mz.between(mz_lower, mz_upper)
        & scratch_df.scan_apex.between(scan_lower, scan_upper)
        & scratch_df.rt_apex.between(rt_lower, rt_upper)
    )
    matching_rows = scratch_df.loc[cond_1, :]

    # of those, find the most intense; ties are resolved by taking the first row below
    cond_2 = (matching_rows.intensity == matching_rows.intensity.max())
    most_intense_row = matching_rows.loc[cond_2, :].copy()
    # group size including the seed itself, so 1 means no other feature matched
    most_intense_row['duplicates'] = len(matching_rows)

    # add it to the list
    ms1_features_l.append(tuple(most_intense_row.iloc[0]))

    # drop the whole matched group so none of its members is used as a seed later
    scratch_df.drop(matching_rows.index, inplace=True)


In [93]:
# inspect the collected tuples: one per seed, column values followed by the duplicates count
ms1_features_l

[(1856.8879379999998,
  2,
  928.9475954151453,
  3028,
  349.4534121536828,
  True,
  4367.53,
  True,
  148945,
  1),
 (904.9775210000001,
  1,
  904.9774977427787,
  1355,
  368.3106423777565,
  False,
  4386.48,
  True,
  148946,
  1),
 (905.467474,
  1,
  905.4674504043717,
  3063,
  366.95249815274263,
  True,
  4380.57,
  True,
  148946,
  1),
 (1803.7140420000003,
  2,
  902.3606476060656,
  3210,
  365.0101129422561,
  True,
  4369.89,
  True,
  148946,
  1),
 (1803.911232,
  2,
  902.4592423814943,
  2319,
  379.50902777777776,
  False,
  4363.99,
  True,
  148946,
  1),
 (1804.950345,
  2,
  902.9787989755637,
  15719,
  363.66569667921937,
  False,
  4388.85,
  True,
  148946,
  2),
 (1806.961693,
  2,
  903.9844730125074,
  2561,
  362.9440617249931,
  False,
  4388.85,
  True,
  148946,
  1),
 (1380.646591,
  2,
  690.8269220662338,
  143074,
  506.91032272273185,
  True,
  4367.53,
  True,
  148947,
  2),
 (1381.149547,
  2,
  691.0784002159912,
  10358,
  529.4961925816

In [94]:
# column labels for the collected tuples: the feature columns plus the extra
# 'duplicates' value appended to each tuple
cols = pd.Index(list(scratch_df.columns) + ['duplicates'])

In [95]:
# check the column labels, with 'duplicates' as the last entry
cols

Index(['monoisotopic_mass', 'charge', 'monoisotopic_mz', 'intensity',
       'scan_apex', 'scan_curve_fit', 'rt_apex', 'rt_curve_fit',
       'precursor_id', 'duplicates'],
      dtype='object')

In [96]:
# build the de-duplicated feature table from the collected tuples, labelled with cols
ms1_peaks_df = pd.DataFrame(ms1_features_l, columns=cols)

In [97]:
# display the de-duplicated features; 'duplicates' is the size of each matched group
ms1_peaks_df

Unnamed: 0,monoisotopic_mass,charge,monoisotopic_mz,intensity,scan_apex,scan_curve_fit,rt_apex,rt_curve_fit,precursor_id,duplicates
0,1856.887938,2,928.947595,3028,349.453412,True,4367.53,True,148945,1
1,904.977521,1,904.977498,1355,368.310642,False,4386.48,True,148946,1
2,905.467474,1,905.46745,3063,366.952498,True,4380.57,True,148946,1
3,1803.714042,2,902.360648,3210,365.010113,True,4369.89,True,148946,1
4,1803.911232,2,902.459242,2319,379.509028,False,4363.99,True,148946,1
5,1804.950345,2,902.978799,15719,363.665697,False,4388.85,True,148946,2
6,1806.961693,2,903.984473,2561,362.944062,False,4388.85,True,148946,1
7,1380.646591,2,690.826922,143074,506.910323,True,4367.53,True,148947,2
8,1381.149547,2,691.0784,10358,529.496193,False,4366.35,True,148947,1
9,1382.673457,2,691.840355,11273,508.894917,False,4380.57,True,148947,1
