Associates the ms2 spectra (produced with 'prototyping ms2 processing from PASEF isolation windows') with the features detected in ms1, and writes each feature together with its fragment ions to an MGF file.

In [10]:
import sqlite3
import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt
from ms_deisotope import deconvolute_peaks, averagine, scoring
from ms_deisotope.deconvolution import peak_retention_strategy
from numba import njit
import os.path
from pyteomics import mgf

In [11]:
# get the ms2 spectra: load the pickled table of deconvoluted ms2 peaks
# (the preview that follows shows the columns precursor, mz, charge, intensity, score, SN)
# NOTE(review): unpickling can execute arbitrary code - only load pickle files from a trusted source
ms2_deconvoluted_peaks_df = pd.read_pickle('/Users/darylwilding-mcbride/Downloads/Hela_Ecoli_1to1-rt-0-1800-ms2.pkl')

In [12]:
# show the first rows of the ms2 peak table
ms2_deconvoluted_peaks_df.head()

Unnamed: 0,precursor,mz,charge,intensity,score,SN
0,1,151.165,1,9.0,0.0,9.0
1,1,206.0931,1,9.0,0.0,9.0
2,1,215.1453,1,69.0,8.306624,69.0
3,1,216.14,1,96.0,9.797959,96.0
4,1,252.4246,1,9.0,0.0,9.0


In [22]:
# get the features detected in ms1: load the pickled feature table
# NOTE(review): unpickling can execute arbitrary code - only load pickle files from a trusted source
features_df = pd.read_pickle('/Users/darylwilding-mcbride/Downloads/Hela_Ecoli_1to1-rt-0-1800-ms1.pkl')

In [23]:
# list the columns available for each ms1 feature
features_df.columns

Index(['candidate_phr_error', 'charge', 'envelope', 'feature_id', 'intensity',
       'intensity_full_rt_extent', 'mono_adjusted', 'monoisotopic_mz',
       'original_phr', 'original_phr_error', 'precursor_id', 'rt_apex',
       'rt_curve_fit', 'rt_lower', 'rt_upper', 'scan_apex', 'scan_curve_fit',
       'scan_lower', 'scan_upper'],
      dtype='object')

In [24]:
def collate_spectra_for_feature(feature_df, ms2_a, raw_file_name=None):
    """Package one ms1 feature and its ms2 fragment ions as a spectrum dictionary for MGF creation.

    Parameters
    ----------
    feature_df : row-like object
        A single feature, read by attribute: charge, intensity, feature_id, rt_apex,
        precursor_id and monoisotopic_mz.
    ms2_a : numpy array
        Two-dimensional array with columns [precursor_id, mz, intensity] holding the
        fragment ions for this feature. It may have zero rows, in which case the
        returned spectrum arrays are empty.
    raw_file_name : str, optional
        Path or name of the raw/converted file; its base name up to the first '.' is
        written into the TITLE. When omitted, the notebook-level constant
        CONVERTED_DATABASE_NAME is used, so existing two-argument calls are unchanged.

    Returns
    -------
    dict
        "m/z array"       - fragment m/z values sorted in increasing order
        "intensity array" - the matching intensities, cast to int
        "params"          - dict with TITLE, INSTRUMENT, PEPMASS, CHARGE and RTINSECONDS
    """
    if raw_file_name is None:
        # fall back to the notebook global; it must be defined before the first such call
        raw_file_name = CONVERTED_DATABASE_NAME
    raw_file_label = os.path.basename(raw_file_name).split('.')[0]

    # order the fragments by m/z increasing, keeping each intensity with its m/z
    ms2_sorted_a = ms2_a[ms2_a[:,1].argsort()]

    # these values appear in more than one parameter, so round them once
    feature_intensity = round(feature_df.intensity)
    rt_apex = round(feature_df.rt_apex,2)

    params = {}
    params["TITLE"] = "RawFile: {} Charge: {} FeatureIntensity: {} Feature#: {} RtApex: {} Precursor: {}".format(raw_file_label, feature_df.charge, feature_intensity, feature_df.feature_id, rt_apex, feature_df.precursor_id)
    params["INSTRUMENT"] = "ESI-QUAD-TOF"
    # PEPMASS carries the feature's monoisotopic m/z followed by its intensity
    params["PEPMASS"] = "{} {}".format(round(feature_df.monoisotopic_mz,6), feature_intensity)
    params["CHARGE"] = "{}+".format(feature_df.charge)
    params["RTINSECONDS"] = "{}".format(rt_apex)

    spectrum = {}
    spectrum["m/z array"] = ms2_sorted_a[:,1]
    spectrum["intensity array"] = ms2_sorted_a[:,2].astype(int)
    spectrum["params"] = params
    return spectrum

In [25]:
# path of the converted database; collate_spectra_for_feature reads its base name for the spectrum TITLE
CONVERTED_DATABASE_NAME = '/'.join(['/Users/darylwilding-mcbride/Downloads', 'HeLa_20KInt.sqlite'])

In [26]:
# ms2 peaks as a numpy array with columns [precursor, mz, intensity] (indexed by position downstream)
ms2_peaks_a = ms2_deconvoluted_peaks_df.loc[:, ['precursor', 'mz', 'intensity']].to_numpy()

In [27]:
# Order the ms2 peaks by precursor id once. The sort is stable, so the peaks of each
# precursor keep their original relative order, and every precursor's peaks end up in
# one contiguous run that can be located by binary search instead of scanning all
# peaks for every feature.
ms2_precursor_order = np.argsort(ms2_peaks_a[:,0], kind='stable')
ms2_by_precursor_a = ms2_peaks_a[ms2_precursor_order]
ms2_precursor_ids_a = ms2_by_precursor_a[:,0]

feature_results = []
for i in range(len(features_df)):
    feature_df = features_df.iloc[i]
    # package the feature and its fragment ions for writing out to the MGF
    # [run_start, run_end) bounds the rows whose precursor id equals this feature's precursor_id;
    # the slice is empty when the feature has no ms2 peaks
    run_start = np.searchsorted(ms2_precursor_ids_a, feature_df.precursor_id, side='left')
    run_end = np.searchsorted(ms2_precursor_ids_a, feature_df.precursor_id, side='right')
    ms2_a = ms2_by_precursor_a[run_start:run_end]
    result = collate_spectra_for_feature(feature_df, ms2_a)
    feature_results.append(result)

In [9]:
# generate the MGF for all the features: one entry per item in feature_results
fn = '/Users/darylwilding-mcbride/Downloads/rt-3000-3060.mgf'
print("writing {} entries to {}".format(len(feature_results), fn))
# remove any existing file first
# NOTE(review): presumably so entries from an earlier run are not appended to - confirm against mgf.write's file mode
if os.path.isfile(fn):
    os.remove(fn)
# the return value of mgf.write is not needed; assigning it to _ keeps it out of the cell output
_ = mgf.write(output=fn, spectra=feature_results)

writing 4664 entries to /Users/darylwilding-mcbride/Downloads/rt-3000-3060.mgf
