# mzML and psm

Some useful functions:
* load psm data and sort it 
* mzML helper function
* join data function
* save file function
* controller function

In [138]:
import pandas as pd
import numpy as np
import os
import pyteomics.mzml
import spectrum_utils.spectrum as sus
from pathlib import Path
import json

In [169]:
def load_psm(psm_file_path):
    # read the psm file into a pandas dataframe
    psm_dataframe = pd.read_table(psm_file_path, delimiter='\t')

    # sort dataframe by QValue
    psm_dataframe = psm_dataframe.sort_values("QValue")

    # drop duplicates
    psm_dataframe = psm_dataframe.drop_duplicates(subset=["Scan Number"], keep="first")
    psm_dataframe["Protein Accession"] = psm_dataframe["Protein Accession"].astype(str)

    # rename the "Full Sequence" column
    psm_dataframe = psm_dataframe.rename({"Full Sequence": "Peptide"})

    return psm_dataframe

def load_psm_df_msfragger(psm_file_path):
    # read in the psm file as a dataframe
    psm_df = pd.read_table(psm_file_path)

    # split the "Spectrum" column into a list at each period and store it under 
    # the "temp_split_column"
    psm_df["temp_split_column"] = psm_df["Spectrum"].str.split(".")
    # store the element located at index 1 of the "temp_split_column" in a 
    # "Scan Number" column
    psm_df["Scan Number"] = psm_df["temp_split_column"].map(lambda x:x[1]).apply(pd.to_numeric)
    # drop unneeded columns
    columns_to_drop = ['Spectrum', 'temp_split_column','Mapped Genes', 'Mapped Proteins']
    psm_df = psm_df.drop(columns=columns_to_drop)

    # rename 2 headers to match MM file formats
    psm_df = psm_df.rename({'Protein ID': 'Protein Accession', 'Spectrum File': 'File Name'}, axis=1)

    # drop duplicates
    psm_df = psm_df.drop_duplicates(subset=["Scan Number"], keep="first")
    
    return psm_df

In [170]:
def load_mzml_dict(mzml_file_path):
    # use pyteomics.mzml.read() to generate an iterator over the dicts with spectrum properties
    mzml_dicts = pyteomics.mzml.read(source=mzml_file_path)
    return mzml_dicts

psm and peptideQ (for total as well) and mzml_psm
['File Name','Protein Accession','Peptide', 'Scan Number']

peptideQ and protein
['Protein Accession', 'Peptide']

mzml
['Scan Number']

protein
['Protein Accession']


In [171]:
def load_mzml_df(mzml_dicts):
    # load dataframe from the list of mzml dictionaires
    # drop the extra index column
    mzml_df = pd.DataFrame(mzml_dicts).drop(columns='index')

    # create a new dataframe containing only the ms/ms scans
    relevant_info = mzml_df.loc[(mzml_df['ms level'] == 2)]

    # reset the index to make up for the ms scans that were not included in this database
    relevant_info = relevant_info.reset_index(drop=True)

    # drop irrelevent columns (Note: We can change this if needed.)
    columns_to_drop = ["spectrum title", "count", "positive scan", "centroid spectrum", "defaultArrayLength", "MSn spectrum", "dataProcessingRef", "scanList", "MS1 spectrum", "ms level"]
    relevant_info = relevant_info.drop(columns=columns_to_drop)

    # create a new "Scan Number" column
    # the scan number info is contained within the "id" column so we will pull out the scan number and then delete the "id" column
    relevant_info["temp_split_column"] = (relevant_info["id"].str.split(" "))
    relevant_info["Scan Number"] = relevant_info["temp_split_column"].map(lambda x:x[2]).str.replace("scan=", "")
    relevant_info["Scan Number"] = relevant_info["Scan Number"].apply(pd.to_numeric)
    relevant_info = relevant_info.drop(columns=['temp_split_column', 'id'])

    # next, we'll want to pull out some info about the precursor in the "precursorList" columm
    # we will store the info we want under the "precursor info" column
    # then we'll drop the "precursorList" column
    relevant_info["precursor info"] = relevant_info["precursorList"].map(lambda x:x['precursor'][0]['selectedIonList']['selectedIon'][0]).astype(str)
    relevant_info = relevant_info.drop(columns=["precursorList"])

    # the precursor info is stored as a string in the format of a dictionary
    # json.loads() requires " instead of ' so we will fix that then convert the string into a dictionary
    dict_list = relevant_info["precursor info"].tolist()

    for index, dictionary in enumerate(dict_list):
        dictionary = dictionary.replace("'", '"')
        dict_list[index] = json.loads(dictionary)

    # we'll then load the dictionary data into a temporary dataframe
    three_column_df = pd.DataFrame.from_dict(dict_list)

    # next, we'll concatenate these two dataframes along the columns based on the index
    complete_mzml_df = pd.concat([relevant_info, three_column_df], axis="columns")

    # drop the "precursor info" column because we don't need it anymore
    complete_mzml_df = complete_mzml_df.drop(columns=['precursor info'])

    # as the scan number is the index we care about, we will use it as our index in the dataframe
    complete_mzml_df= complete_mzml_df.set_index("Scan Number")

    return complete_mzml_df 



In [172]:
# save the joined dataframe as a tsv file

def save_df(joined_dataframe, file_path):
    joined_dataframe.to_csv(file_path, sep="\t", index=False)
    print(f'Dataframe saved.')

In [184]:
def mzml_and_psm_controller(file_type, mzml_file_path,psm_file_path, mzml_and_psm_file_path, columns_to_keep=None):

    ''' Joins an mzml and psm file. Files are joined into a pandas dataframe and saved as a tsv.
    
    Required Parameters:
        * file_type: "mm" for metamorpheus files, or "msfragger" for msfragger files
        * mzml_file_path: File path to the mzML file
        * psm_file_path: File path to the psm file
        * mzml_and_psm_file_path: Output file path
        
    Optional Parameters: 
        * columns_to_keep: List of columns to include in the dataframe. Note that column names may vary based on whether your files were generated with MetaMorpheus or MSFragger.'''

    # load psm dataframe based on psm file type
    if file_type.lower() == 'mm':
        psm_dataframe = load_psm(psm_file_path)
    elif file_type.lower() == 'msfragger':
        psm_dataframe = load_psm_df_msfragger(psm_file_path=psm_file_path)
    else:
        print('invalid file type')
        return 
    
    # load mzML dataframe
    list_of_mzml_dicts = load_mzml_dict(mzml_file_path=mzml_file_path)
    mzml_dataframe = load_mzml_df(mzml_dicts=list_of_mzml_dicts)

    # merge datafames based on "Scan Number"
    joined_dataframe = mzml_dataframe.join(other=psm_dataframe, on='Scan Number', how='inner')

    # select all columns to keep, if this parameter was not passed in, return dataframe with all columns
    if columns_to_keep != None:
        joined_dataframe = joined_dataframe[columns_to_keep]

    # set up the multiIndex
    joined_dataframe = joined_dataframe.set_index(['File Name', 'Protein Accession', 'Peptide', 'Scan Number'])

    # save merged dataframes
    save_df(joined_dataframe=joined_dataframe, file_path=mzml_and_psm_file_path)

    return joined_dataframe

In [185]:
help(mzml_and_psm_controller)

Help on function mzml_and_psm_controller in module __main__:

mzml_and_psm_controller(file_type, mzml_file_path, psm_file_path, mzml_and_psm_file_path)
    Joins an mzml and a psm file. Files are joined into a pandas dataframe and saved as a tsv.
    
    Required Parameters:
        * file_type: "mm" for metamorpheus files, or "msfragger" for msfragger files
        * mzml_file_path: File path to the mzML file
        * psm_file_path: File path to the psm file
        * mzml_and_psm_file_path: Output file path
        
    Optional Parameters: 
        * columns_to_keep: List of columns to include in the dataframe. Note that column names may vary based on whether your files were generated with MetaMorpheus or MSFragger.



In [174]:
def short_controller_function():

    psm_dataframe = load_psm(psm_file_path)
    # merge datafames
    joined_dataframe = relevant_info.join(other=psm_dataframe, on="Scan Number", how='inner', lsuffix='_mzml', rsuffix='_psm')
    joined_dataframe = joined_dataframe.drop(columns=['Scan Number_mzml'])
    joined_dataframe = joined_dataframe.set_index(['File Name', 'Protein Accession'])

    # save merged dataframes
    save_df(joined_dataframe=joined_dataframe, file_path=output_file_path)

    return joined_dataframe

In [175]:
mzml_file_path = "C:\\Users\\Sarah Curtis\\OneDrive - BYU\\Documents\\Single Cell Team Documents\\API_dev\\MetaM\\2ng\\Ex_Auto_J3_30umTB_2ngQC_60m_1.mzML"
psm_file_path = "C:\\Users\\Sarah Curtis\\OneDrive - BYU\\Documents\\Single Cell Team Documents\\API_dev\\msfragger\\psm1.tsv"
output_file_path = "C:\\Users\\Sarah Curtis\\OneDrive - BYU\\Documents\\Single Cell Team Documents\\API_dev\\MetaM\\2ng\\mzml_and_psm.tsv"

In [176]:
# testing the function
new_database = (file_type='msfragger', psm_file_path=psm_file_path, mzml_and_psm_file_path=output_file_path)

Dataframe saved.


In [179]:
new_database

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,base peak m/z,base peak intensity,total ion current,lowest observed m/z,highest observed m/z,m/z array,intensity array,selected ion m/z,charge state,peak intensity,...,Protein Start,Protein End,Intensity,Assigned Modifications,Observed Modifications,Is Unique,Protein,Entry Name,Gene,Protein Description
File Name,Protein Accession,Peptide,Scan Number,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
interact-Ex_Auto_J3_30umTB_02ngQC_60m_1.pep.xml,P18206,QQELTHQEHR,4761,3045.887847,1042.273804,11691.656250,134.218124,3229.287598,"[134.21812438964844, 140.13174438476562, 199.8...","[672.3894653320312, 728.9189453125, 797.305297...",1119.239705,3,,...,179,188,169547.050,,,True,sp|P18206|VINC_HUMAN,VINC_HUMAN,VCL,Vinculin
interact-Ex_Auto_J3_30umTB_02ngQC_60m_1.pep.xml,P18077,NNTVTPGGKPNK,4770,226.334431,1360.701294,14130.234375,125.694862,2568.897461,"[125.69486236572266, 226.3344268798828, 270.30...","[672.412841796875, 1360.7012939453125, 709.098...",975.462233,3,,...,55,66,135834.620,,,True,sp|P18077|RL35A_HUMAN,RL35A_HUMAN,RPL35A,60S ribosomal protein L35a
interact-Ex_Auto_J3_30umTB_02ngQC_60m_1.pep.xml,P67809,ENQGDETQGQQPPQR,4772,3684.796666,1703.962402,17414.609375,120.513962,3684.796631,"[120.51396179199219, 127.50184631347656, 172.5...","[698.4600830078125, 779.7194213867188, 753.386...",1269.767464,3,,...,265,279,811776.200,,,True,sp|P67809|YBOX1_HUMAN,YBOX1_HUMAN,YBX1,Y-box-binding protein 1
interact-Ex_Auto_J3_30umTB_02ngQC_60m_1.pep.xml,P11413,PASTNSDDVRDEK,4777,1438.513781,823.853760,12231.937500,118.969582,1438.513794,"[118.9695816040039, 158.41725158691406, 272.99...","[650.7579956054688, 775.4783325195312, 680.673...",765.991264,3,,...,276,288,287172.500,,,True,sp|P11413|G6PD_HUMAN,G6PD_HUMAN,G6PD,Glucose-6-phosphate 1-dehydrogenase
interact-Ex_Auto_J3_30umTB_02ngQC_60m_1.pep.xml,Q14980,AVQAQGGESQQEAQR,4779,355.070984,2075.281982,24020.359375,136.373032,1674.768433,"[136.37303161621094, 149.04534912109375, 164.6...","[818.3619995117188, 1050.46240234375, 670.3148...",817.699647,2,12893.723633,...,1574,1588,0.000,,,True,sp|Q14980|NUMA1_HUMAN,NUMA1_HUMAN,NUMA1,Nuclear mitotic apparatus protein 1
interact-Ex_Auto_J3_30umTB_02ngQC_60m_1.pep.xml,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
interact-Ex_Auto_J3_30umTB_02ngQC_60m_1.pep.xml,P46782,TIAECLADELINAAK,20457,1018.494557,72908.125000,430894.437500,115.527031,1274.619019,"[115.52703094482422, 118.18751525878906, 118.6...","[669.1881713867188, 717.433837890625, 655.3529...",685.839766,2,209264.781250,...,168,182,125678.660,5C(57.0215),,True,sp|P46782|RS5_HUMAN,RS5_HUMAN,RPS5,40S ribosomal protein S5
interact-Ex_Auto_J3_30umTB_02ngQC_60m_1.pep.xml,O43175,DLPLLLFR,20487,251.103354,45406.714844,639398.187500,110.071579,727.849854,"[110.07157897949219, 112.08744812011719, 113.0...","[7018.53515625, 2156.968505859375, 846.3520507...",408.207844,2,131629.562500,...,462,469,756196.250,,,True,sp|O43175|SERA_HUMAN,SERA_HUMAN,PHGDH,D-3-phosphoglycerate dehydrogenase
interact-Ex_Auto_J3_30umTB_02ngQC_60m_1.pep.xml,P08195,LLTSFLPAQLLR,20534,129.102685,20830.025391,401944.187500,110.071602,1203.187012,"[110.07160186767578, 110.47235870361328, 112.0...","[6838.19482421875, 641.6944580078125, 701.3181...",414.564027,3,119998.945312,...,440,451,84497.550,,,True,sp|P08195|4F2_HUMAN,4F2_HUMAN,SLC3A2,4F2 cell-surface antigen heavy chain
interact-Ex_Auto_J3_30umTB_02ngQC_60m_1.pep.xml,P13639,TFCQLILDPIFK,20541,427.747145,107592.500000,739924.687500,110.071609,796.442139,"[110.07160949707031, 112.08729553222656, 113.0...","[13752.9521484375, 7470.677734375, 930.8485717...",428.188728,2,265198.343750,...,288,299,87594.164,3C(57.0215),,True,sp|P13639|EF2_HUMAN,EF2_HUMAN,EEF2,Elongation factor 2


### Using this code, lets do the mzml and protein

In [134]:
def join_protein_to_df(protein_df, mzml_psm_df):
    # here we'll delete the duplicate columns in the protein_df
    duplicate_columns = []
    for column in protein_df.columns:
        if column in mzml_psm_df.columns:
            duplicate_columns.append(column)
    protein_df=protein_df.drop(axis=1,columns=duplicate_columns)
    new_df = mzml_psm_df.join(other=protein_df, on="Protein Accession", how='left', lsuffix="_psm")
    return new_df


def mzml_and_protein_controller(mzml_file_path, psm_file_path, protein_file_path, output_file_path, all_data=False):
    # load dataframes
    psm_dataframe = load_psm(psm_file_path)
    protein_dataframe = pd.read_table(protein_file_path)

    count = 0
    for index, row in psm_dataframe.iterrows():
        # break put here to faciliate testing
        # remove this conditional statement when parsing through an entire mzml file
        if count == 2:
            break
        print(f"{count} : {index}")
        scan_num = str(row['Scan Number'])
        relevant_info = mzml_helper(scan_num, mzml_file_path, all_data)
        psm_dataframe = join_data(relevant_info, psm_dataframe, index)
        count += 1
    
    # join dataframes and save as a tsv
    joined_df = join_protein_to_df(protein_df=protein_dataframe, mzml_psm_df=psm_dataframe)
    save_file(psm_dataframe=joined_df, file_path=output_file_path)
    
    return joined_df

In [135]:
# test
protein_file_path = "C:\\Users\\Sarah Curtis\\OneDrive - BYU\\Documents\\Single Cell Team Documents\\API_dev\\MetaM\\02ng\\AllQuantifiedProteinGroups.tsv"
outfile_path = "C:\\Users\\Sarah Curtis\\OneDrive - BYU\\Documents\\Single Cell Team Documents\\API_dev\\MetaM\\02ng\\mzml_and_protein.tsv"
test_df = mzml_and_protein_controller(mzml_file_path=mzml_file_path, psm_file_path=psm_file_path, protein_file_path=protein_file_path, output_file_path=outfile_path)


0 : 0


NameError: name 'mzml_helper' is not defined

In [None]:
test_df.head()


Unnamed: 0,File Name,Scan Number,Scan Retention Time,Num Experimental Peaks,Total Ion Current,Precursor Scan Number,Precursor Charge,Precursor MZ,Precursor Mass,Score,...,Intensity_Ex_Auto_K13_30umTA_2ngQC_60m_2-calib,Intensity_Ex_Auto_W17_30umTB_2ngQC_60m_1-calib,Intensity_Ex_Auto_W17_30umTB_2ngQC_60m_2-calib,Number of PSMs,Protein Decoy/Contaminant/Target,Protein Cumulative Target,Protein Cumulative Decoy,Protein QValue,Best Peptide Score,Best Peptide Notch QValue
0,Ex_Auto_J3_30umTB_2ngQC_60m_1-calib,16668,52.52059,101.0,335251.91071,16649,2.0,1280.62784,2559.24113,28.51,...,19350380.0,18234990.0,355.0,T,2.0,0.0,0.0,29.506938,0.0,
1227,Ex_Auto_J3_30umTB_2ngQC_60m_1-calib,20217,60.99236,49.0,76071.94836,20203,3.0,800.73355,2399.17882,13.388,...,1413019.0,1530572.0,62.0,T,45.0,0.0,0.0,24.292929,0.0,
1226,Ex_Auto_J3_30umTB_2ngQC_60m_1-calib,21843,64.88158,77.0,185690.32166,21831,3.0,515.26251,1542.76569,13.388,...,16621310.0,14759070.0,462.0,T,129.0,0.0,0.0,21.589114,0.0,
1225,Ex_Auto_J3_30umTB_2ngQC_60m_1-calib,16915,53.10976,99.0,224170.22906,16902,3.0,506.92071,1517.74029,13.39,...,3405293.0,3366013.0,395.0,T,27.0,0.0,0.0,25.311353,0.0,
1224,Ex_Auto_J3_30umTB_2ngQC_60m_1-calib,19666,59.67404,72.0,282899.35223,19653,2.0,721.90597,1441.79739,13.39,...,5032313.0,4587147.0,109.0,T,286.0,0.0,0.0,19.439414,0.0,
