In [178]:
import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt
from sys import getsizeof
from numba import njit

In [229]:
feature_id = 1
mz_scaling_factor = 1000
charge = 1
DELTA_MZ = 1.003355  # Mass difference between Carbon-12 and Carbon-13 isotopes, in Da. For calculating the spacing between isotopic peaks.
PROTON_MASS = 1.0073  # Mass of a proton in unified atomic mass units, or Da. For calculating the monoisotopic mass.

In [254]:
feature_raw_ms2_df = pd.read_pickle('/Users/darylwilding-mcbride/Downloads/feature-{}-ms2-raw-points.pkl'.format(feature_id))
feature_raw_ms2_df.sort_values(by=['mz'], ascending=True, inplace=True)
feature_raw_ms2_df['decharged_mass'] = (feature_raw_ms2_df.mz + PROTON_MASS) * charge

In [255]:
feature_raw_ms2_df.head()

Unnamed: 0,frame_id,mz,intensity,decharged_mass
0,27938,133.596214,9,134.603514
1,27938,180.467447,9,181.474747
2,27938,213.41826,9,214.42556
3,27938,266.156364,9,267.163664
4,27938,268.404245,9,269.411545


In [256]:
len(feature_raw_ms2_df)

215

In [257]:
MZ_MIN = 100
MZ_MAX = 1800

In [258]:
# generate a charge-1 mask of mass defect windows
bin_edges_l = []
for nominal_mass in range(MZ_MIN,MZ_MAX):
    mass_centre = nominal_mass * 1.00048
    width = 0.19 + (0.0001 * nominal_mass)
    lower_mass = mass_centre - (width / 2)
    upper_mass = mass_centre + (width / 2)
    bin_edges_l.append(lower_mass)
    bin_edges_l.append(upper_mass)
bins = np.asarray(bin_edges_l)

In [259]:
bins[:10]

array([ 99.948  , 100.148  , 100.94843, 101.14853, 101.94886, 102.14906,
       102.94929, 103.14959, 103.94972, 104.15012])

In [260]:
np.digitize(99.95, bins)

1

In [268]:
decharged_mass_a = feature_raw_ms2_df.decharged_mass.to_numpy()
mz_a = feature_raw_ms2_df.mz.to_numpy()
intensity_a = feature_raw_ms2_df.intensity.to_numpy()

In [262]:
digitised_mass = np.digitize(decharged_mass_a, bins)  # an odd index means the point is inside a mass defect window
digitised_mass

array([  70,  164,  230,  335,  340,  384,  385,  391,  409,  441,  453,
        459,  511,  522,  525,  525,  553,  597,  603,  607,  609,  619,
        626,  627,  629,  633,  637,  651,  661,  665,  665,  669,  677,
        709,  719,  734,  737,  757,  773,  782,  801,  801,  811,  817,
        825,  825,  827,  828,  845,  847,  861,  861,  863,  863,  865,
        867,  867,  883,  883,  883,  885,  885,  887,  887,  887,  891,
        903,  905,  911,  918,  951,  955,  960,  965,  979,  987, 1001,
       1019, 1021, 1033, 1033, 1037, 1051, 1053, 1086, 1087, 1088, 1089,
       1105, 1133, 1141, 1143, 1147, 1149, 1194, 1202, 1203, 1219, 1231,
       1258, 1302, 1302, 1318, 1341, 1353, 1371, 1373, 1373, 1375, 1375,
       1375, 1391, 1425, 1453, 1511, 1511, 1511, 1513, 1513, 1513, 1513,
       1513, 1513, 1515, 1515, 1515, 1515, 1515, 1515, 1515, 1515, 1515,
       1515, 1515, 1515, 1517, 1517, 1517, 1517, 1517, 1517, 1517, 1517,
       1517, 1518, 1522, 1581, 1645, 1661, 1699, 17

In [134]:
# consecutive odd indexes means consecutive windows, suggesting a series of isotopic peaks

In [263]:
# remove all the even indexes - the odd indexes are the mass defect windows
mass_defect_window_indexes = digitised_mass[(digitised_mass % 2) == 1]
mass_defect_window_indexes

array([ 335,  385,  391,  409,  441,  453,  459,  511,  525,  525,  553,
        597,  603,  607,  609,  619,  627,  629,  633,  637,  651,  661,
        665,  665,  669,  677,  709,  719,  737,  757,  773,  801,  801,
        811,  817,  825,  825,  827,  845,  847,  861,  861,  863,  863,
        865,  867,  867,  883,  883,  883,  885,  885,  887,  887,  887,
        891,  903,  905,  911,  951,  955,  965,  979,  987, 1001, 1019,
       1021, 1033, 1033, 1037, 1051, 1053, 1087, 1089, 1105, 1133, 1141,
       1143, 1147, 1149, 1203, 1219, 1231, 1341, 1353, 1371, 1373, 1373,
       1375, 1375, 1375, 1391, 1425, 1453, 1511, 1511, 1511, 1513, 1513,
       1513, 1513, 1513, 1513, 1515, 1515, 1515, 1515, 1515, 1515, 1515,
       1515, 1515, 1515, 1515, 1515, 1517, 1517, 1517, 1517, 1517, 1517,
       1517, 1517, 1517, 1581, 1645, 1661, 1699, 1707, 1739, 1739, 1739,
       1741, 1741, 1743, 1745, 1745, 1883, 1899, 1913, 1933, 1939, 1939,
       1941, 1941, 1941, 1941, 1941, 1943, 1943, 19

In [264]:
# remove the duplicates
unique_mass_defect_window_indexes = np.unique(mass_defect_window_indexes)
unique_mass_defect_window_indexes

array([ 335,  385,  391,  409,  441,  453,  459,  511,  525,  553,  597,
        603,  607,  609,  619,  627,  629,  633,  637,  651,  661,  665,
        669,  677,  709,  719,  737,  757,  773,  801,  811,  817,  825,
        827,  845,  847,  861,  863,  865,  867,  883,  885,  887,  891,
        903,  905,  911,  951,  955,  965,  979,  987, 1001, 1019, 1021,
       1033, 1037, 1051, 1053, 1087, 1089, 1105, 1133, 1141, 1143, 1147,
       1149, 1203, 1219, 1231, 1341, 1353, 1371, 1373, 1375, 1391, 1425,
       1453, 1511, 1513, 1515, 1517, 1581, 1645, 1661, 1699, 1707, 1739,
       1741, 1743, 1745, 1883, 1899, 1913, 1933, 1939, 1941, 1943, 1945,
       1947, 1949, 2013, 2055, 2073, 2123, 2163, 2165, 2167, 2199, 2201,
       2203, 2205, 2267, 2271, 2319, 2369, 2403, 2521, 2567, 2967])

In [265]:
# find where the windows are consecutive
condition = (np.diff(unique_mass_defect_window_indexes) == 2)
condition

array([False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True, False,  True, False,
        True,  True,  True, False,  True,  True, False, False,  True,
       False, False, False, False, False, False, False, False,  True,
       False, False, False,  True, False,  True, False, False, False,
        True, False,  True, False, False, False, False, False, False,
        True,  True, False, False, False, False,  True,  True,  True,
       False, False, False, False, False, False,  True,  True,  True,
       False, False, False, False, False,  True,  True,  True,  True,
        True, False, False, False, False, False,  True,  True, False,
        True,  True,  True, False, False, False, False, False, False,
       False, False])

In [266]:
chunk_sizes = np.diff(np.where(np.concatenate(([condition[0]],
                                     condition[:-1] != condition[1:],
                                     [True])))[0])[::2]
chunk_sizes

array([1, 1, 1, 1, 3, 2, 1, 1, 1, 1, 1, 1, 2, 3, 3, 5, 2, 3])

In [267]:
consecutive_window_indexes = np.where(condition == True)[0]
consecutive_window_indexes

array([ 12,  15,  32,  34,  36,  37,  38,  40,  41,  44,  53,  57,  59,
        63,  65,  72,  73,  78,  79,  80,  87,  88,  89,  95,  96,  97,
        98,  99, 105, 106, 108, 109, 110])

In [280]:
expected_peak_spacing = DELTA_MZ / charge
for idx in range(len(chunk_sizes)):
    number_of_windows = chunk_sizes[idx]
    start_idx = chunk_sizes[:idx].sum()
    index_list = consecutive_window_indexes[start_idx:start_idx+number_of_windows].tolist()
    index_list.append(index_list[-1]+1)
    print("isotope series {} ({} peaks):".format(idx+1, number_of_windows+1))
    mz_l = []
    int_l = []
    for i in index_list:
        lower_mass = round(bins[unique_mass_defect_window_indexes[i]-1],4)
        upper_mass = round(bins[unique_mass_defect_window_indexes[i]],4)
        print("mass defect window [{}]: {}-{} Da".format(unique_mass_defect_window_indexes[i], lower_mass, upper_mass))
        # get the raw points allocated to this bin
        peak_indexes = np.where(digitised_mass == unique_mass_defect_window_indexes[i])[0]
        print("\tpeak m/z {}".format(np.round(mz_a[peak_indexes],4)))
        mz_centroid = np.average(mz_a[peak_indexes], weights=intensity_a[peak_indexes])
        intensity = np.sum(intensity_a[peak_indexes])
        mz_l.append(mz_centroid)
        int_l.append(intensity)
        print("\tpeak centroid {} m/z, {}".format(np.round(mz_centroid,4), intensity))
    # de-isotope the peaks
    peaks_mz_a = np.array(mz_l)
    peaks_int_a = np.array(int_l)
    for i in range(len(peaks_mz_a)):
        peaks_mz_a[i] = peaks_mz_a[i] - (i * expected_peak_spacing)
    print("de-isotoped m/z {}".format(np.round(peaks_mz_a,4)))
    deisotoped_mz = np.average(peaks_mz_a, weights=peaks_int_a)
    print("de-isotoped m/z centroid {}".format(np.round(deisotoped_mz,4)))
    monoisotopic_mass = (deisotoped_mz - PROTON_MASS) * charge
    print("monoisotopic mass {} Da".format(np.round(monoisotopic_mass,4)))
    print()
    

isotope series 1 (2 peaks):
mass defect window [607]: 403.0783-403.3086 Da
	peak m/z [402.2189]
	peak centroid 402.2189 m/z, 10
mass defect window [609]: 404.0787-404.3091 Da
	peak m/z [403.1935]
	peak centroid 403.1935 m/z, 23
de-isotoped m/z [402.2189 402.1902]
de-isotoped m/z centroid 402.1989
monoisotopic mass 401.1916 Da

isotope series 2 (2 peaks):
mass defect window [627]: 413.0826-413.3139 Da
	peak m/z [412.2579]
	peak centroid 412.2579 m/z, 27
mass defect window [629]: 414.083-414.3144 Da
	peak m/z [413.2542]
	peak centroid 413.2542 m/z, 114
de-isotoped m/z [412.2579 412.2509]
de-isotoped m/z centroid 412.2522
monoisotopic mass 411.2449 Da

isotope series 3 (2 peaks):
mass defect window [825]: 512.1252-512.3664 Da
	peak m/z [511.2531 512.2554]
	peak centroid 511.8302 m/z, 99
mass defect window [827]: 513.1256-513.3669 Da
	peak m/z [512.4303]
	peak centroid 512.4303 m/z, 24
de-isotoped m/z [511.8302 511.4269]
de-isotoped m/z centroid 511.7515
monoisotopic mass 510.7442 Da

isot

IndexError: index 194 is out of bounds for axis 0 with size 194