In [27]:
import sqlite3
import pandas as pd
import numpy as np
import sys
import glob, os
from pyteomics import mgf

In [28]:
# Retention-time window of interest. Features are later kept only when their
# 'rt' lies between RT_LIMIT_LOWER+RT_BUFFER_WIDTH and RT_LIMIT_UPPER-RT_BUFFER_WIDTH.
# NOTE(review): presumably seconds, since 'rt' is later written to the MGF
# RTINSECONDS field - confirm against the MaxQuant output.
RT_LIMIT_LOWER = 3000
RT_LIMIT_UPPER = 3060

In [29]:
# Margin trimmed from each end of the retention-time window when the features
# are filtered (same units as RT_LIMIT_LOWER / RT_LIMIT_UPPER).
RT_BUFFER_WIDTH = 15

In [30]:
# Root directory of the MaxQuant results used by this notebook.
BASE_MAXQUANT_DIR = '/Users/darylwilding-mcbride/Downloads/maxquant_results'
# Sub-directory holding the tab-separated text tables.
MAXQUANT_TXT_DIR = '{}/txt'.format(BASE_MAXQUANT_DIR)
# Feature table that is loaded into allpeptides_df.
ALLPEPTIDES_FILENAME = '{}/allPeptides.txt'.format(MAXQUANT_TXT_DIR)
# Directory that is searched for the *.apl peak list files.
APL_DIR = '{}/andromeda'.format(BASE_MAXQUANT_DIR)

In [32]:
# features need at least this isotope correlation to be retained
MIN_ISOTOPE_CORRELATION = 0.9

# MaxQuant column headings mapped to the short names used in this notebook
ALLPEPTIDES_COLUMN_NAMES = {
    'Number of isotopic peaks': 'isotope_count',
    'm/z': 'mz',
    'Number of data points': 'number_data_points',
    'Intensity': 'intensity',
    'Ion mobility index': 'scan',
    'Ion mobility index length': 'scan_length',
    'Ion mobility index length (FWHM)': 'scan_length_fwhm',
    'Retention time': 'rt',
    'Retention length': 'rt_length',
    'Retention length (FWHM)': 'rt_length_fwhm',
    'Charge': 'charge_state',
    'Number of pasef MS/MS': 'number_pasef_ms2_ids',
    'Pasef MS/MS IDs': 'pasef_msms_ids',
    'MS/MS scan number': 'msms_scan_number',
    'Isotope correlation': 'isotope_correlation',
}
allpeptides_df = pd.read_csv(ALLPEPTIDES_FILENAME, sep='\t').rename(columns=ALLPEPTIDES_COLUMN_NAMES)

# one boolean mask per selection criterion; a feature must satisfy all of them
has_intensity = allpeptides_df.intensity.notnull()
has_pasef_msms = (allpeptides_df.number_pasef_ms2_ids > 0) & allpeptides_df.pasef_msms_ids.notnull()
has_msms_scan = allpeptides_df.msms_scan_number >= 0
is_well_correlated = allpeptides_df.isotope_correlation >= MIN_ISOTOPE_CORRELATION
# the retention-time window is narrowed by the buffer at both ends (bounds inclusive)
is_in_rt_window = allpeptides_df.rt.between(RT_LIMIT_LOWER + RT_BUFFER_WIDTH, RT_LIMIT_UPPER - RT_BUFFER_WIDTH)

allpeptides_df = allpeptides_df[has_intensity & has_pasef_msms & has_msms_scan & is_well_correlated & is_in_rt_window].copy()


In [33]:
# store the MS/MS scan numbers as integers
allpeptides_df['msms_scan_number'] = allpeptides_df['msms_scan_number'].map(int)

In [34]:
# largest MS/MS scan number among the retained features
allpeptides_df.msms_scan_number.max()

100522

In [35]:
# inspect the last few retained features
allpeptides_df.tail()

Unnamed: 0,Raw file,charge_state,mz,Mass,Resolution,number_data_points,Number of frames,isotope_count,isotope_correlation,Mass fractional part,...,Min frame index,Max frame index,scan,scan_length,scan_length_fwhm,intensity,Intensities,number_pasef_ms2_ids,pasef_msms_ids,msms_scan_number
317877,HeLa_20KInt_2KIT_Slot1-46_01_1179,2,1186.0386,2370.0627,23652.669585,1914,12,3,0.918753,0.062657,...,2561,2572,228,45,24,19775.0,,9,200196;200205;200214;200223;200232;200292;2003...,92215
321399,HeLa_20KInt_2KIT_Slot1-46_01_1179,2,1263.0392,2524.0638,24217.124322,23646,19,5,0.997538,0.063765,...,2559,2577,210,138,63,233650.0,,40,199857;199867;199877;199888;199898;199908;1999...,94898
325384,HeLa_20KInt_2KIT_Slot1-46_01_1179,3,670.30404,2007.8903,25049.187714,62271,28,7,0.99996,0.890286,...,2554,2581,708,120,48,1999200.0,,3,199529;199777;201014,81401
325494,HeLa_20KInt_2KIT_Slot1-46_01_1179,3,688.35495,2062.043,24291.598528,10295,18,5,0.997244,0.043033,...,2563,2580,717,72,33,165630.0,,2,200445;200455,83450
327447,HeLa_20KInt_2KIT_Slot1-46_01_1179,2,673.38573,1344.7569,23387.664514,1698,12,2,0.954187,0.756915,...,2551,2562,771,54,45,23594.0,,1,199498,40002


In [36]:
# list the column names after renaming
allpeptides_df.columns

Index(['Raw file', 'charge_state', 'mz', 'Mass', 'Resolution',
       'number_data_points', 'Number of frames', 'isotope_count',
       'isotope_correlation', 'Mass fractional part', 'Mass deficit', 'rt',
       'rt_length', 'rt_length_fwhm', 'Min frame index', 'Max frame index',
       'scan', 'scan_length', 'scan_length_fwhm', 'intensity', 'Intensities',
       'number_pasef_ms2_ids', 'pasef_msms_ids', 'msms_scan_number'],
      dtype='object')

In [37]:
def collate_spectra_for_feature(ms1_d, ms2_df):
    """Build the spectrum dictionary for one feature, for MGF creation.

    Parameters
    ----------
    ms1_d : dict
        Attributes of the feature. The keys read here are 'raw_file',
        'mq_index', 'charge', 'intensity', 'rt_apex' and 'monoisotopic_mz'.
    ms2_df : pandas.DataFrame
        Fragment peaks of the feature, with 'mz' and 'intensity' columns.
        The frame is not modified.

    Returns
    -------
    dict
        'm/z array' and 'intensity array' hold the fragment peaks ordered by
        descending intensity; 'params' holds the TITLE, INSTRUMENT, PEPMASS,
        CHARGE, RTINSECONDS and SCANS entries as strings.
    """
    # order the fragment peaks from most to least intense
    fragments = ms2_df[['mz', 'intensity']].sort_values(by=['intensity'], ascending=False)

    charge = ms1_d['charge']
    feature_intensity = ms1_d['intensity']
    rt_apex_rounded = round(ms1_d['rt_apex'], 2)

    title = "RawFile: {} Index: {} Charge: {} FeatureIntensity: {} RtApex: {}".format(
        ms1_d['raw_file'], ms1_d['mq_index'], charge, feature_intensity, rt_apex_rounded
    )
    return {
        "m/z array": fragments.mz.values,
        "intensity array": fragments.intensity.values,
        "params": {
            "TITLE": title,
            "INSTRUMENT": "ESI-QUAD-TOF",
            "PEPMASS": "{} {}".format(round(ms1_d['monoisotopic_mz'], 6), feature_intensity),
            "CHARGE": "{}+".format(charge),
            "RTINSECONDS": "{}".format(rt_apex_rounded),
            # the whole-number part of the retention time apex is used as the scan value
            "SCANS": "{}".format(int(ms1_d['rt_apex'])),
        },
    }

In [38]:
# build a list of indexes from the APL files
ms2_peaks = []
apl_indexes = []
for file in glob.glob("{}/*.apl".format(APL_DIR)):
    with open(file, 'r') as f:
        for line in f:
            line = line.rstrip()
            if len(line) > 0:
                if line.startswith("header="):
                    mq_index = int(line.split(' ')[3])
                if line[0].isdigit():
                    line_a = line.split('\t')
                    mz = float(line_a[0])
                    intensity = round(float(line_a[1]))
                    ms2_peaks.append((mz, intensity))
                if line.startswith("peaklist end"):
                    apl_indexes.append((mq_index, ms2_peaks.copy()))
                    del ms2_peaks[:]
                    mq_index = 0
apl_indexes_df = pd.DataFrame(apl_indexes, columns=['mq_index','ms2_peaks'])

In [39]:
# inspect the first few parsed peak lists
apl_indexes_df.head()

Unnamed: 0,mq_index,ms2_peaks
0,18180,"[(198.12835, 15), (226.11905, 16), (235.11728,..."
1,18181,"[(204.08961, 10), (231.11226, 21), (235.04234,..."
2,18182,"[(166.28784, 9), (171.15202, 26), (195.07293, ..."
3,18183,"[(212.10498, 15), (217.11978, 9), (237.10405, ..."
4,18184,"[(226.11908, 16), (227.12076, 10), (244.16614,..."


In [40]:
# Group the parsed peak lists by their index once, so each feature is matched
# with a dictionary lookup instead of a scan over the whole APL frame.
peaks_by_mq_index = {}
for apl_mq_index, apl_peaks in zip(apl_indexes_df.mq_index, apl_indexes_df.ms2_peaks):
    peaks_by_mq_index.setdefault(apl_mq_index, []).append(apl_peaks)

# Build one MGF spectrum per feature whose MS/MS scan number identifies exactly one peak list
mgf_spectra = []
for idx,row in allpeptides_df.iterrows():
    mq_index = row.msms_scan_number
    ms1_d = {'monoisotopic_mass':row.Mass,
             'charge':row.charge_state,
             'monoisotopic_mz':row.mz,
             'intensity':int(row.intensity),
             'scan_apex':row.scan,
             'rt_apex':row.rt,
             'raw_file':row['Raw file'],
             'mq_index':mq_index}
    candidate_peak_lists = peaks_by_mq_index.get(mq_index, [])
    if len(candidate_peak_lists) == 1:
        ms2_peaks_df = pd.DataFrame(candidate_peak_lists[0], columns=['mz','intensity'])
        feature_spectra = collate_spectra_for_feature(ms1_d, ms2_peaks_df)
        mgf_spectra.append(feature_spectra)
    elif len(candidate_peak_lists) == 0:
        print("no match for mq_index {}".format(mq_index))
    else:
        # several peak lists share this index, so the feature cannot be assigned
        # unambiguously; report it separately from a genuine miss
        print("ambiguous match for mq_index {}: {} peak lists".format(mq_index, len(candidate_peak_lists)))

In [41]:
# number of spectra collected for the MGF
len(mgf_spectra)

419

In [42]:
# number of distinct MS/MS scan numbers among the retained features
allpeptides_df.msms_scan_number.nunique()

419

In [50]:
# The file name carries the retention-time window, so derive it from the
# constants rather than repeating their values as literals.
MGF_FILE_NAME = '/Users/darylwilding-mcbride/Downloads/mq_features-rt-{}-{}.mgf'.format(RT_LIMIT_LOWER, RT_LIMIT_UPPER)
# generate the MGF for all the features
print("generating the MGF: {}".format(MGF_FILE_NAME))
# Start from a clean file.
# NOTE(review): presumably needed because mgf.write appends to an existing
# file by default - confirm against the pyteomics documentation.
if os.path.isfile(MGF_FILE_NAME):
    os.remove(MGF_FILE_NAME)
# binding the return value keeps it from being echoed as the cell output
f = mgf.write(output=MGF_FILE_NAME, spectra=mgf_spectra)


generating the MGF: /Users/darylwilding-mcbride/Downloads/mq_features-rt-3000-3060.mgf


In [44]:
# Collect the MS1 attributes of each retained feature. Only the seven columns
# that are needed are iterated, as plain tuples, rather than materialising a
# full Series for every row with iterrows().
ms1_source_columns = ['mz', 'charge_state', 'intensity', 'Mass', 'rt', 'scan', 'msms_scan_number']
ms1_features = [
    (mz, charge, int(intensity), monoisotopic_mass, rt, scan, msms_scan_number)
    for mz, charge, intensity, monoisotopic_mass, rt, scan, msms_scan_number
    in allpeptides_df[ms1_source_columns].itertuples(index=False, name=None)
]
ms1_df = pd.DataFrame(ms1_features, columns=['mz','charge','intensity','monoisotopic_mass','rt','scan','msms_scan_number'])

In [45]:
# inspect the first few MS1 feature records
ms1_df.head()

Unnamed: 0,mz,charge,intensity,monoisotopic_mass,rt,scan,msms_scan_number
0,326.70863,2,7996,651.4027,3017.769,852,1790
1,364.74869,2,35051,727.48282,3018.951,768,3086
2,363.87617,3,12509,1088.6067,3029.586,825,20281
3,367.21087,3,168590,1098.6108,3040.218,825,20900
4,371.21264,3,27040,1110.6161,3021.314,771,21731


In [46]:
# Save the MS1 features as CSV and as a pickle. The file names carry the
# retention-time window, so derive them from the constants rather than
# repeating their values as literals.
ms1_output_base = '/Users/darylwilding-mcbride/Downloads/mq-ms1-rt-{}-{}'.format(RT_LIMIT_LOWER, RT_LIMIT_UPPER)
ms1_df.to_csv('{}.csv'.format(ms1_output_base), index=False)
ms1_df.to_pickle('{}.pkl'.format(ms1_output_base))

In [47]:
# NOTE(review): 'pickle' is not referenced by name in the visible cells;
# pd.read_pickle does not need this import. If it is kept, it belongs with
# the other imports in the first cell.
import pickle
# NOTE(review): this reads 'my-ms1-rt-...' rather than the 'mq-ms1-rt-...'
# pickle written earlier in this notebook; presumably it was produced by a
# different pipeline - confirm.
# Only unpickle files from a trusted source: loading a pickle can execute arbitrary code.
ms1_deduped_df = pd.read_pickle('/Users/darylwilding-mcbride/Downloads/my-ms1-rt-3000-3060.pkl')

In [48]:
# number of rows in the loaded frame
len(ms1_deduped_df)

4930

In [49]:
# write the loaded frame out as CSV, without the index column
ms1_deduped_df.to_csv('/Users/darylwilding-mcbride/Downloads/my-ms1-rt-3000-3060.csv', index=False)