In [145]:
import sqlite3
import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt
from ms_deisotope import deconvolute_peaks, averagine, scoring
from ms_deisotope.deconvolution import peak_retention_strategy
from numba import njit

In [130]:
# Retention-time (RT) settings for this run.
# RT_LIMIT_LOWER and RT_LIMIT_UPPER are substituted into BASE_NAME to locate the converted data directory.
# NOTE(review): units are presumably seconds, going by the *_SECS names below - confirm.
RT_LIMIT_LOWER = 3000  # RT range in the database
RT_LIMIT_UPPER = 3600
# NOTE(review): the two *_SECS values are not referenced by any cell shown here.
RT_BASE_PEAK_WIDTH_SECS = 30.0  # assumption about base peak width in RT
RT_FRAGMENT_EVENT_DELTA_SECS = 3.5  # use this window for constraining RT to focus on the fragmentation event
# NOTE(review): presumably the collision energy value that identifies MS1 frames; not referenced by any cell shown here - confirm.
MS1_CE = 10

In [131]:
# File-system locations used by this notebook.
# NOTE(review): these are absolute paths on one user's machine; anyone else has to edit them before running.
# Directory of the converted run; its name embeds the RT range configured above.
BASE_NAME = "/Users/darylwilding-mcbride/Downloads/HeLa_20KInt-rt-{}-{}-denoised".format(RT_LIMIT_LOWER,RT_LIMIT_UPPER)
# Directory holding the MaxQuant text output, and two files inside it.
# NOTE(review): ALLPEPTIDES_FILENAME and PASEF_MSMS_SCANS_FILENAME are not read by any cell shown here.
BASE_MAXQUANT_TXT_DIR = '/Users/darylwilding-mcbride/Downloads/maxquant_results/txt'
ALLPEPTIDES_FILENAME = '{}/allPeptides.txt'.format(BASE_MAXQUANT_TXT_DIR)
PASEF_MSMS_SCANS_FILENAME = '{}/pasefMsmsScans.txt'.format(BASE_MAXQUANT_TXT_DIR)
# SQLite database that process_ms2 opens to read the raw points from its 'frames' table.
CONVERTED_DATABASE_NAME = '{}/HeLa_20KInt.sqlite'.format(BASE_NAME)

In [132]:
# Mass-related constants.
# PROTON_MASS is used to de-charge m/z values before the mass defect test, and is added back to
# each deconvoluted neutral mass in process_ms2.
PROTON_MASS = 1.0073  # Mass of a proton in unified atomic mass units, or Da. For calculating the monoisotopic mass.
# NOTE(review): DELTA_MZ and INSTRUMENT_RESOLUTION are not referenced by any cell shown here.
DELTA_MZ = 1.003355     # Mass difference between Carbon-12 and Carbon-13 isotopes, in Da. For calculating the spacing between isotopic peaks.
INSTRUMENT_RESOLUTION = 40000.0
# Nominal-mass range over which generate_mass_defect_windows builds windows.
# It iterates range(MIN, MAX), so MIN is included and MAX is excluded.
MASS_DEFECT_WINDOW_DA_MIN = 100  # range in Daltons
MASS_DEFECT_WINDOW_DA_MAX = 5200

In [133]:
# Load the isolation-window table that drives the rest of the notebook (grouped by 'Precursor' further down).
# NOTE(review): read_pickle can execute arbitrary code while loading; only open pickles from a trusted source.
# NOTE(review): absolute path on one user's machine - edit before running elsewhere.
isolation_window_df = pd.read_pickle('/Users/darylwilding-mcbride/isolation_windows.pkl')

In [134]:
# Order the isolation windows from the highest precursor id down to the lowest.
isolation_window_df = isolation_window_df.sort_values(by='Precursor', ascending=False)

In [135]:
# Show the first five rows as a quick sanity check of the sorted table.
isolation_window_df.head()

Unnamed: 0,Frame,ScanNumBegin,ScanNumEnd,IsolationMz,IsolationWidth,Precursor,frame_id,retention_time_secs,mz_lower,mz_upper
202634,28495,705,730,641.396052,2.0,94168,28495.0,3059.990139,639.696052,643.096052
202633,28495,649,674,839.504336,3.0,94167,28495.0,3059.990139,837.304336,841.704336
202632,28495,602,627,579.62179,2.0,94166,28495.0,3059.990139,577.92179,581.32179
202629,28495,435,460,798.343136,2.983431,94165,28495.0,3059.990139,796.151421,800.534852
202636,28495,797,822,431.41008,2.0,94164,28495.0,3059.990139,429.71008,433.11008


In [136]:
# Select every isolation window recorded for one example precursor (id 92076) and display it.
df = isolation_window_df.loc[isolation_window_df['Precursor'] == 92076]
df

Unnamed: 0,Frame,ScanNumBegin,ScanNumEnd,IsolationMz,IsolationWidth,Precursor,frame_id,retention_time_secs,mz_lower,mz_upper
198101,27950,322,347,967.542835,3.0,92076,27950.0,3001.428397,965.342835,969.742835
198082,27947,322,347,967.542835,3.0,92076,27947.0,3001.110042,965.342835,969.742835
198111,27951,322,347,967.542835,3.0,92076,27951.0,3001.536969,965.342835,969.742835
198091,27949,322,347,967.542835,3.0,92076,27949.0,3001.321952,965.342835,969.742835
198129,27953,322,347,967.542835,3.0,92076,27953.0,3001.753915,965.342835,969.742835
198120,27952,322,347,967.542835,3.0,92076,27952.0,3001.645923,965.342835,969.742835


In [137]:
# Frame ids of the example precursor's isolation windows, collected into a tuple and displayed.
ms2_frame_ids = tuple(df['Frame'])
ms2_frame_ids

(27950, 27947, 27951, 27949, 27953, 27952)

In [138]:
# create the bins for mass defect windows in Da space
def generate_mass_defect_windows(mass_min=None, mass_max=None):
    """Build the ascending array of mass defect window edges.

    One window is generated per nominal mass in ``range(mass_min, mass_max)``
    (lower bound included, upper bound excluded). Each window is centred on
    ``nominal_mass * 1.00048`` and is ``0.19 + 0.0001 * nominal_mass`` wide
    (see https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3184890/).

    Parameters
    ----------
    mass_min : int, optional
        First nominal mass. Defaults to MASS_DEFECT_WINDOW_DA_MIN.
    mass_max : int, optional
        Nominal mass at which to stop (exclusive). Defaults to
        MASS_DEFECT_WINDOW_DA_MAX.

    Returns
    -------
    numpy.ndarray
        1-D float array ``[lower_0, upper_0, lower_1, upper_1, ...]``. Fed to
        ``np.digitize``, an odd bin index means "inside a window".

    Raises
    ------
    ValueError
        If the requested range makes neighbouring windows touch or overlap, so
        the edges are no longer strictly ascending and the odd/even bin test
        would be meaningless.
    """
    if mass_min is None:
        mass_min = MASS_DEFECT_WINDOW_DA_MIN
    if mass_max is None:
        mass_max = MASS_DEFECT_WINDOW_DA_MAX

    nominal_mass_a = np.arange(mass_min, mass_max).astype(np.float64)
    mass_centre_a = nominal_mass_a * 1.00048  # from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3184890/
    width_a = 0.19 + (0.0001 * nominal_mass_a)
    lower_mass_a = mass_centre_a - (width_a / 2)
    upper_mass_a = mass_centre_a + (width_a / 2)

    # interleave the edges: lower_0, upper_0, lower_1, upper_1, ...
    bins = np.column_stack((lower_mass_a, upper_mass_a)).ravel()

    # the window width grows with mass, so a large enough range makes windows collide
    if np.any(np.diff(bins) <= 0):
        raise ValueError("mass defect windows overlap for the nominal mass range {}..{}".format(mass_min, mass_max))
    return bins

In [139]:
# keep a point if, when treated as charge 3, 2 or 1 and de-charged, it lands inside at least one mass defect window
# ms2_peaks_a is a numpy array of [mz,intensity]
def remove_points_outside_mass_defect_windows(ms2_peaks_a, mass_defect_window_bins):
    """Filter out the points that fit no mass defect window at any tested charge.

    ms2_peaks_a is a 2-D array whose column 0 is m/z; the remaining columns are
    carried through untouched. mass_defect_window_bins is the interleaved
    lower/upper edge array built by generate_mass_defect_windows.

    Returns the rows of ms2_peaks_a, in their original order, for which at
    least one of the charge-3, charge-2 or charge-1 de-charged masses falls
    inside a window.
    """
    mz_values = ms2_peaks_a[:, 0]
    keep_mask = np.zeros(len(mz_values), dtype=bool)
    for z in (3, 2, 1):
        neutral_mass = (mz_values * z) - (PROTON_MASS * z)
        # edges alternate lower/upper, so an odd bin number means "between a lower and its upper edge"
        bin_numbers = np.digitize(neutral_mass, mass_defect_window_bins)
        keep_mask |= (bin_numbers % 2) == 1
    return ms2_peaks_a[keep_mask]

In [140]:
# Half-width of the m/z window (apex m/z +/- this value) that ms2_intensity_descent gathers into one peak.
ms2_peak_delta = 0.01

In [141]:
@njit(fastmath=True)
def mz_centroid(_int_f, _mz_f):
    """Return the intensity-weighted mean of the m/z values.

    _int_f holds the intensities and _mz_f the matching m/z values; each m/z
    is weighted by its intensity's share of the total intensity.
    """
    weights = _int_f / _int_f.sum()
    weighted_mz = weights * _mz_f
    return weighted_mz.sum()

In [142]:
# ms2_peaks_a is a numpy array of [mz,intensity]
# returns a numpy array of [mz_centroid,summed_intensity]
def ms2_intensity_descent(ms2_peaks_a):
    """Collapse raw points into peaks by repeatedly claiming the region around the most intense point.

    On each pass the most intense remaining point is taken as the apex, and
    every remaining point within +/- ms2_peak_delta of the apex m/z is merged
    into one peak: the m/z is the intensity-weighted centroid (mz_centroid)
    and the intensity is the sum. The merged points are then removed and the
    process repeats until no points remain.

    Parameters
    ----------
    ms2_peaks_a : numpy.ndarray
        2-D array with m/z in column 0 and intensity in column 1.

    Returns
    -------
    numpy.ndarray
        One row of (mz_centroid, summed_intensity) per peak, in the order the
        peaks were claimed. Empty when there were no points.
    """
    ms2_peaks_l = []
    while len(ms2_peaks_a) > 0:
        mz_a = ms2_peaks_a[:, 0]
        intensity_a = ms2_peaks_a[:, 1]

        # find the most intense point
        apex_index = np.argmax(intensity_a)
        peak_mz = mz_a[apex_index]
        peak_mz_lower = peak_mz - ms2_peak_delta
        peak_mz_upper = peak_mz + ms2_peak_delta

        # boolean mask of all the raw points within this m/z region
        in_peak = (mz_a >= peak_mz_lower) & (mz_a <= peak_mz_upper)
        if in_peak.any():
            peak_intensity_a = intensity_a[in_peak]
            mz_cent = mz_centroid(peak_intensity_a, mz_a[in_peak])
            summed_intensity = peak_intensity_a.sum()
            ms2_peaks_l.append((mz_cent, summed_intensity))
        else:
            # Only reachable when the apex m/z is NaN (it then fails its own range test).
            # Discard the apex without emitting a peak, otherwise nothing is removed and
            # the loop never ends.
            in_peak[apex_index] = True

        # remove the raw points assigned to this peak
        ms2_peaks_a = ms2_peaks_a[~in_peak]
    return np.array(ms2_peaks_l)

In [143]:
# return a list of deconvoluted spectra for this precursor
def process_ms2(idx, precursor_group_df):
    """Extract, filter, centroid and deconvolute the MS2 points of one precursor.

    Parameters
    ----------
    idx :
        Group key supplied by the caller's groupby. Not used here; kept so the
        call signature stays the same.
    precursor_group_df : pandas.DataFrame
        The isolation-window rows of a single precursor. The precursor id and
        the scan range are taken from the first row; the frame ids are taken
        from every row.

    Returns
    -------
    list of tuple
        One (precursor_id, neutral_mass + PROTON_MASS rounded to 4 dp, charge,
        intensity, score, signal_to_noise) tuple per deconvoluted peak. Empty
        when no peaks survive the filtering.
    """
    first_window = precursor_group_df.iloc[0]
    precursor_id = int(first_window.Precursor)

    # determine the target raw data
    # Convert to native ints: sqlite3 cannot bind numpy integer scalars as query parameters.
    # NOTE(review): assumes scan numbers and frame ids hold whole numbers - confirm.
    scan_lower = int(first_window.ScanNumBegin)
    scan_upper = int(first_window.ScanNumEnd)
    ms2_frame_ids = [int(frame_id) for frame_id in precursor_group_df.Frame]

    # extract the raw data
    # One '?' placeholder per frame id. This handles a single frame without a special case and
    # does not depend on how a tuple of numpy scalars happens to print.
    frame_placeholders = ",".join("?" * len(ms2_frame_ids))
    query = "select frame_id,mz,scan,intensity from frames where frame_id in ({}) and scan >= ? and scan <= ? and intensity > 0 order by mz".format(frame_placeholders)
    db_conn = sqlite3.connect(CONVERTED_DATABASE_NAME)
    try:
        ms2_raw_points_df = pd.read_sql_query(query, db_conn, params=ms2_frame_ids + [scan_lower, scan_upper])
    finally:
        # close the connection even when the query raises
        db_conn.close()

    # remove the points that are not within a mass defect window
    raw_points_a = ms2_raw_points_df[['mz','intensity']].to_numpy()
    filtered_raw_points_a = remove_points_outside_mass_defect_windows(raw_points_a, mass_defect_window_bins)

    # perform intensity descent to resolve peaks
    peaks_a = ms2_intensity_descent(filtered_raw_points_a)
    peaks_l = list(map(tuple, peaks_a))
    if len(peaks_l) == 0:
        # nothing to deconvolute for this precursor
        return []

    # deconvolute the spectra
    deconvoluted_peaks, _ = deconvolute_peaks(peaks_l, use_quick_charge=True, averagine=averagine.peptide, charge_range=(1,5), scorer=scoring.MSDeconVFitter(minimum_score=8, mass_error_tolerance=0.1), error_tolerance=4e-5, truncate_after=0.8, retention_strategy=peak_retention_strategy.TopNRetentionStrategy(n_peaks=100, base_peak_coefficient=1e-6, max_mass=1800.0))

    # package the spectra as rows for a dataframe
    return [
        (precursor_id, round(peak.neutral_mass+PROTON_MASS, 4), int(peak.charge), peak.intensity, peak.score, peak.signal_to_noise)
        for peak in deconvoluted_peaks
    ]

In [144]:
# Build the window edges once; process_ms2 reads mass_defect_window_bins from module scope.
mass_defect_window_bins = generate_mass_defect_windows()

# One list of deconvoluted peak tuples per precursor.
peaks_l = [
    process_ms2(idx, precursor_group_df)
    for idx, precursor_group_df in isolation_window_df.groupby('Precursor')
]

# Collapse the per-precursor lists into a single list of rows.
flattened_peaks_l = []
for peaks_for_precursor_l in peaks_l:
    flattened_peaks_l.extend(peaks_for_precursor_l)

# Tabulate the peaks and save them.
# NOTE(review): the file name says rt-3000-3060 while RT_LIMIT_UPPER is 3600 - confirm which is intended.
ms2_deconvoluted_peaks_df = pd.DataFrame(
    flattened_peaks_l,
    columns=['precursor','mz','charge','intensity','score','SN'],
)
ms2_deconvoluted_peaks_df.to_pickle('/Users/darylwilding-mcbride/Downloads/rt-3000-3060-test-ms2.pkl')