In [1]:
import sqlite3
import pandas as pd
import numpy as np
import sys
import glob, os
from pyteomics import mgf

In [2]:
# Lower / upper retention-time limits for selecting features.
# NOTE(review): the RT-window filter that reads these is currently disabled in the
# allPeptides loading cell, so they have no effect on the output — confirm intent.
# Units presumably match the 'Retention time' column of allPeptides.txt — TODO confirm.
RT_LIMIT_LOWER = 0
RT_LIMIT_UPPER = 3000

In [3]:
# Margin added to RT_LIMIT_LOWER and subtracted from RT_LIMIT_UPPER by the
# (currently disabled) RT-window filter in the allPeptides loading cell.
RT_BUFFER_WIDTH = 0

In [4]:
# Root of the MaxQuant 'combined' output folder.
# NOTE(review): absolute, user-specific path — adjust before running elsewhere.
BASE_MAXQUANT_DIR = '/Users/darylwilding-mcbride/Downloads/analyses/combined'
# Sub-folder holding the tab-separated result tables.
MAXQUANT_TXT_DIR = '/'.join([BASE_MAXQUANT_DIR, 'txt'])
# Feature table read by the loading cell.
ALLPEPTIDES_FILENAME = '/'.join([MAXQUANT_TXT_DIR, 'allPeptides.txt'])
# Folder searched for *.apl peak-list files.
APL_DIR = '/'.join([BASE_MAXQUANT_DIR, 'andromeda'])

In [5]:
# Experiment identifier used to name the combined MGF file.
BASE_NAME = 'P3795'
# NOTE(review): the MGF-writing cell builds its own per-raw-file names and does not
# appear to read MGF_FILE_NAME — confirm whether this constant is still needed.
MGF_FILE_NAME = BASE_MAXQUANT_DIR + '/' + BASE_NAME + '-mq.mgf'

In [17]:
# Minimum 'Isotope correlation' a feature must have to be kept.
MIN_ISOTOPE_CORRELATION = 0.9

# Load the allPeptides table, give the columns used downstream short names, keep only
# features that have an intensity, at least one PASEF MS/MS id, a non-negative MS/MS
# scan number and a sufficient isotope correlation, then order by MS/MS scan number.
# The retention-time window filter
#   (rt >= RT_LIMIT_LOWER+RT_BUFFER_WIDTH) & (rt <= RT_LIMIT_UPPER-RT_BUFFER_WIDTH)
# is intentionally not applied here.
allpeptides_df = (
    pd.read_csv(ALLPEPTIDES_FILENAME, sep='\t')
    .rename(columns={
        'Number of isotopic peaks': 'isotope_count',
        'm/z': 'mz',
        'Number of data points': 'number_data_points',
        'Intensity': 'intensity',
        'Ion mobility index': 'scan',
        'Ion mobility index length': 'scan_length',
        'Ion mobility index length (FWHM)': 'scan_length_fwhm',
        'Retention time': 'rt',
        'Retention length': 'rt_length',
        'Retention length (FWHM)': 'rt_length_fwhm',
        'Charge': 'charge_state',
        'Number of pasef MS/MS': 'number_pasef_ms2_ids',
        'Pasef MS/MS IDs': 'pasef_msms_ids',
        'MS/MS scan number': 'msms_scan_number',
        'Isotope correlation': 'isotope_correlation',
    })
    .loc[lambda features: (
        features.intensity.notnull()
        & (features.number_pasef_ms2_ids > 0)
        & (features.msms_scan_number >= 0)
        & features.pasef_msms_ids.notnull()
        & (features.isotope_correlation >= MIN_ISOTOPE_CORRELATION)
    )]
    .copy()
    .sort_values(by=['msms_scan_number'], ascending=True)
)

In [18]:
# Convert every MS/MS scan number to a Python int so it compares cleanly with the
# integer indexes parsed from the APL headers.
allpeptides_df['msms_scan_number'] = allpeptides_df['msms_scan_number'].map(int)

In [19]:
# Largest msms_scan_number value present in allpeptides_df.
allpeptides_df.msms_scan_number.max()

28687

In [20]:
# Preview the last rows of allpeptides_df.
allpeptides_df.tail()

Unnamed: 0,Raw file,charge_state,mz,Mass,Resolution,number_data_points,Number of frames,isotope_count,isotope_correlation,Mass fractional part,...,Min frame index,Max frame index,scan,scan_length,scan_length_fwhm,intensity,Intensities,number_pasef_ms2_ids,pasef_msms_ids,msms_scan_number
59883,P3795_noCompound_Slot1-14_1_3450,5,1666.7395,8328.6611,22639.073216,18868,29,9,0.989239,0.661119,...,3388,3416,510,105,45,159310.0,,4,152391;152725;152735;153072,28680
59967,P3795_noCompound_Slot1-14_1_3450,5,1668.5325,8337.6263,23446.433319,4971,11,9,0.97888,0.626271,...,3336,3346,501,87,36,95395.0,,9,146106;146130;146283;146296;146532;146543;1465...,28681
59940,P3795_noCompound_Slot1-14_1_3450,5,1674.7259,8368.5932,22969.669261,1376,10,6,0.925773,0.593178,...,3336,3345,504,33,30,26585.0,,3,146554;146630;146642,28683
60014,P3795_noCompound_Slot1-14_1_3450,5,1681.1297,8400.6121,23472.955759,8141,19,9,0.991652,0.612135,...,3496,3514,486,90,57,113660.0,,8,160290;160298;160306;160312;160528;160529;1605...,28684
60143,P3795_noCompound_Slot1-14_1_3450,5,1701.5219,8502.5732,24175.431004,26371,19,10,0.982443,0.573159,...,2813,2831,522,105,45,246330.0,,14,93119;93127;93211;93221;93229;93298;93308;9331...,28687


In [21]:
# Distinct values of the 'Raw file' column.
allpeptides_df['Raw file'].unique()

array(['P3795_withCompound_Slot1-15_1_3452',
       'P3795_noCompound_Slot1-14_1_3450'], dtype=object)

In [22]:
def collate_spectra_for_feature(ms1_d, ms2_df):
    """Build the spectrum dictionary for one feature, ready to be written as MGF.

    Parameters:
        ms1_d: dict describing the precursor; the keys read here are 'raw_file',
            'mq_index', 'charge', 'intensity', 'rt_apex' and 'monoisotopic_mz'.
        ms2_df: DataFrame of fragment peaks with 'mz' and 'intensity' columns.
            It is not modified.

    Returns:
        dict with "m/z array" and "intensity array" (fragment peaks ordered by
        descending intensity) and "params" (the spectrum header fields).
    """
    # work on a copy of the two peak columns, most intense fragment first
    ranked_peaks = ms2_df[['mz', 'intensity']].copy().sort_values(by=['intensity'], ascending=False)

    feature_intensity = ms1_d['intensity']
    feature_charge = ms1_d['charge']
    rt_apex_rounded = round(ms1_d['rt_apex'],2)

    header_fields = {
        "TITLE": "RawFile: {} Index: {} Charge: {} FeatureIntensity: {} RtApex: {}".format(
            ms1_d['raw_file'], ms1_d['mq_index'], feature_charge, feature_intensity, rt_apex_rounded
        ),
        "INSTRUMENT": "ESI-QUAD-TOF",
        "PEPMASS": "{} {}".format(round(ms1_d['monoisotopic_mz'],6), feature_intensity),
        "CHARGE": "{}+".format(feature_charge),
        "RTINSECONDS": "{}".format(rt_apex_rounded),
        # SCANS carries the RT apex truncated to an integer
        "SCANS": "{}".format(int(ms1_d['rt_apex'])),
    }

    return {
        "m/z array": ranked_peaks.mz.values,
        "intensity array": ranked_peaks.intensity.values,
        "params": header_fields,
    }

In [23]:
# build a list of indexes from the APL files
ms2_peaks = []
apl_indexes = []
for file in glob.glob("{}/*.apl".format(APL_DIR)):
    with open(file, 'r') as f:
        for line in f:
            line = line.rstrip()
            if len(line) > 0:
                if line.startswith("header="):
                    line_a = line.split(' ')
                    mq_index = int(line_a[3])
                    raw_file = line_a[1]
                if line[0].isdigit():
                    line_a = line.split('\t')
                    mz = float(line_a[0])
                    intensity = round(float(line_a[1]))
                    ms2_peaks.append((mz, intensity))
                if line.startswith("peaklist end"):
                    apl_indexes.append((mq_index, ms2_peaks.copy(), raw_file))
                    del ms2_peaks[:]
                    mq_index = 0
apl_indexes_df = pd.DataFrame(apl_indexes, columns=['mq_index','ms2_peaks','raw_file'])
apl_indexes_df.sort_values(by=['mq_index'], ascending=True, inplace=True)

In [24]:
# Write one MGF file per raw file. Every feature in allpeptides_df becomes one
# spectrum: its precursor details come from the feature row and its fragment peaks
# from the APL peak list of the same raw file whose index equals the feature's
# msms_scan_number.
for group_name,group_df in allpeptides_df.groupby('Raw file'):
    mgf_spectra = []
    mgf_file_name = '{}/{}-mq.mgf'.format(BASE_MAXQUANT_DIR, group_name)
    file_apl_indexes_df = apl_indexes_df[(apl_indexes_df['raw_file'] == group_name)]
    # Build the index -> peak list lookup once per raw file instead of scanning the
    # APL frame for every feature. setdefault keeps the first record for an index,
    # which is the one the positional .iloc[0] lookup used to select.
    peaks_by_index = {}
    for apl_index, apl_peaks in zip(file_apl_indexes_df['mq_index'], file_apl_indexes_df['ms2_peaks']):
        peaks_by_index.setdefault(apl_index, apl_peaks)
    for idx,row in group_df.iterrows():
        mq_index = row.msms_scan_number
        if mq_index not in peaks_by_index:
            # fail with context instead of an anonymous out-of-bounds IndexError
            raise IndexError("no APL peak list found for raw file {} with index {}".format(group_name, mq_index))
        ms1_d = {'monoisotopic_mass':row.Mass,
                 'charge':row.charge_state,
                 'monoisotopic_mz':row.mz,
                 'intensity':int(row.intensity),
                 'scan_apex':row.scan,
                 'rt_apex':row.rt,
                 'raw_file':row['Raw file'],
                 'mq_index':mq_index}
        ms2_peaks_df = pd.DataFrame(peaks_by_index[mq_index], columns=['mz','intensity'])
        feature_spectra = collate_spectra_for_feature(ms1_d, ms2_peaks_df)
        mgf_spectra.append(feature_spectra)
    # generate the MGF for all the features
    print("generating MGF: {}".format(mgf_file_name))
    # remove any previous output first so the spectra are not added to an old file
    if os.path.isfile(mgf_file_name):
        os.remove(mgf_file_name)
    mgf.write(output=mgf_file_name, spectra=mgf_spectra)


generating MGF: /Users/darylwilding-mcbride/Downloads/analyses/combined/P3795_noCompound_Slot1-14_1_3450-mq.mgf
generating MGF: /Users/darylwilding-mcbride/Downloads/analyses/combined/P3795_withCompound_Slot1-15_1_3452-mq.mgf
