Associates the ms2 spectra (produced with 'prototyping ms2 processing from PASEF isolation windows') with the features detected in ms1, and writes each feature together with its fragment ions to an MGF file.

In [1]:
import sqlite3
import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt
from ms_deisotope import deconvolute_peaks, averagine, scoring
from ms_deisotope.deconvolution import peak_retention_strategy
from numba import njit
import os.path
from pyteomics import mgf

In [2]:
# get the ms2 spectra: load the pickled DataFrame of deconvoluted ms2 peaks
# NOTE(review): this is an absolute, user-specific path - consider a configurable data directory.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
ms2_deconvoluted_peaks_df = pd.read_pickle('/Users/darylwilding-mcbride/Downloads/rt-3000-3060-ms2.pkl')

In [3]:
# preview the first rows of the ms2 peaks to check the columns that were loaded
ms2_deconvoluted_peaks_df.head()

Unnamed: 0,precursor,mz,charge,intensity,score,SN
0,92011,362.2027,1,69.0,0.0,69.0
1,92011,376.1919,1,9.0,0.0,9.0
2,92011,429.2472,1,63.0,0.0,63.0
3,92011,465.9346,1,9.0,0.0,9.0
4,92011,486.8721,1,9.0,0.0,9.0


In [4]:
# get the features detected in ms1: load the pickled DataFrame of features
# NOTE(review): this is an absolute, user-specific path - consider a configurable data directory.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
ms1_dedup_features_df = pd.read_pickle('/Users/darylwilding-mcbride/Downloads/rt-3000-3060-cmms1.pkl')

In [5]:
# list the feature columns; the MGF packaging below reads charge, intensity, feature_id,
# rt_apex, monoisotopic_mz and precursor_id from each feature row
ms1_dedup_features_df.columns

Index(['candidate_phr_error', 'charge', 'envelope', 'feature_id', 'intensity',
       'intensity_full_rt_extent', 'mono_adjusted', 'monoisotopic_mz',
       'original_phr', 'original_phr_error', 'precursor_id', 'rt_apex',
       'rt_curve_fit', 'rt_lower', 'rt_upper', 'scan_apex', 'scan_curve_fit',
       'scan_lower', 'scan_upper'],
      dtype='object')

In [6]:
def collate_spectra_for_feature(feature_df, ms2_deconvoluted_df, database_name=None):
    """Package one ms1 feature and its ms2 fragment ions as a spectrum dictionary for MGF creation.

    Parameters
    ----------
    feature_df : row-like object
        A single ms1 feature exposing the attributes charge, intensity, feature_id,
        rt_apex, monoisotopic_mz and precursor_id (for example a row yielded by
        DataFrame.iterrows()).
    ms2_deconvoluted_df : pandas.DataFrame
        The fragment peaks to attach to the feature; only the 'mz' and 'intensity'
        columns are read. An empty frame yields empty arrays.
    database_name : str, optional
        Path of the file to report as RawFile in the TITLE. When omitted, the
        notebook-level CONVERTED_DATABASE_NAME is used, so existing two-argument
        calls behave exactly as before.

    Returns
    -------
    dict
        A dictionary with the keys "m/z array" (fragment m/z values in ascending
        order), "intensity array" (the matching intensities cast to int) and
        "params" (TITLE, INSTRUMENT, PEPMASS, CHARGE and RTINSECONDS strings).
    """
    if database_name is None:
        # fall back to the notebook-level constant; it must be defined before the first call
        database_name = CONVERTED_DATABASE_NAME

    # the RawFile label is the file's base name up to its first '.'
    raw_file_label = os.path.basename(database_name).split('.')[0]
    feature_intensity = round(feature_df.intensity)
    rt_apex = round(feature_df.rt_apex, 2)

    # sort_values returns a new frame, so the caller's DataFrame is left untouched
    pairs_df = ms2_deconvoluted_df[['mz', 'intensity']].sort_values(by=['mz'], ascending=True)

    params = {}
    params["TITLE"] = "RawFile: {} Charge: {} FeatureIntensity: {} Feature#: {} RtApex: {} Precursor: {}".format(raw_file_label, feature_df.charge, feature_intensity, feature_df.feature_id, rt_apex, feature_df.precursor_id)
    params["INSTRUMENT"] = "ESI-QUAD-TOF"
    params["PEPMASS"] = "{} {}".format(round(feature_df.monoisotopic_mz, 6), feature_intensity)
    params["CHARGE"] = "{}+".format(feature_df.charge)
    params["RTINSECONDS"] = "{}".format(rt_apex)

    spectrum = {}
    spectrum["m/z array"] = pairs_df.mz.values
    # astype(int) truncates fractional intensities toward zero
    spectrum["intensity array"] = pairs_df.intensity.values.astype(int)
    spectrum["params"] = params
    return spectrum

In [7]:
# path of the converted database; its base name is reported as RawFile in each MGF TITLE
# NOTE(review): absolute, user-specific path - consider a configurable data directory
CONVERTED_DATABASE_NAME = '/'.join(['/Users/darylwilding-mcbride/Downloads', 'HeLa_20KInt.sqlite'])

In [9]:
# index the ms2 fragments by precursor once, instead of re-scanning the whole frame for every feature
ms2_by_precursor = {precursor_id: peaks_df for precursor_id, peaks_df in ms2_deconvoluted_peaks_df.groupby('precursor')}
# features whose precursor has no ms2 fragments still get an entry, with empty arrays
no_fragments_df = ms2_deconvoluted_peaks_df.iloc[0:0]

feature_results = []
for idx,feature_df in ms1_dedup_features_df.iterrows():
    # package the feature and its fragment ions for writing out to the MGF
    ms2_df = ms2_by_precursor.get(feature_df.precursor_id, no_fragments_df)
    result = collate_spectra_for_feature(feature_df, ms2_df)
    feature_results.append(result)

# generate the MGF for all the features
# NOTE(review): absolute, user-specific output path - consider a configurable data directory
fn = '/Users/darylwilding-mcbride/Downloads/rt-3000-3060.mgf'
print("writing {} entries to {}".format(len(feature_results), fn))
# remove any previous output first so a re-run replaces the file
# (presumably mgf.write would otherwise append to an existing file - confirm for the installed pyteomics version)
if os.path.isfile(fn):
    os.remove(fn)
_ = mgf.write(output=fn, spectra=feature_results)


writing 4664 entries to /Users/darylwilding-mcbride/Downloads/rt-3000-3060.mgf
