# PSM to PeptideQ
Some useful functions:
* load psm data and sort it 
* load peptideQ data 
* join data function
* save file function
* controller function

In [1]:
import pandas as pd
import numpy as np
import os
import pyteomics.mzml
import spectrum_utils.spectrum as sus
from pathlib import Path

In [2]:
# We'll need to identify the matching columns
# we should not join the matching columns to the psm file 
# we should add the peptide data to the end of the psm file based on peptide sequence

In [70]:
# let's find the matching rows
psm_file_path = "C:\\Users\\Sarah Curtis\\OneDrive - BYU\\Documents\\Single Cell Team Documents\\API_dev\\MetaM\\2ng\\Ex_Auto_J3_30umTB_2ngQC_60m_1-calib_Peptides.psmtsv"

peptideQ_file_path = "C:\\Users\\Sarah Curtis\OneDrive - BYU\\Documents\\Single Cell Team Documents\\API_dev\\MetaM\\2ng\\AllQuantifiedPeptides.tsv"
psm_df = pd.read_table(psm_file_path, delimiter="\t")
peptideQ_df = pd.read_table(peptideQ_file_path, delimiter="\t")

duplicate_columns = []
for column in psm_df.columns:
    if column in peptideQ_df.columns:
        duplicate_columns.append(column)

duplicate_columns

['Base Sequence']

In [71]:
# looks like the only matching column is the 'Base Sequence' one so we don't need to worry about any of the other 
# columns

In [72]:
# read the psm file into a pandas dataframe
# sort dataframe by QValue
# drop duplicates

def load_psm(psm_file_path):
    # read the psm file into a pandas dataframe
    psm_dataframe = pd.read_table(psm_file_path, delimiter='\t')

    # sort dataframe by QValue
    psm_dataframe = psm_dataframe.sort_values("QValue")

    # drop duplicates
    psm_dataframe = psm_dataframe.drop_duplicates(subset=["Scan Number"], keep="first")
    psm_dataframe["Protein Accession"] = psm_dataframe["Protein Accession"].astype(str)

    # rename the "Full Sequence" column
    psm_dataframe = psm_dataframe.rename({"Full Sequence": "Peptide"})

    return psm_dataframe

def load_peptideQ(peptideQ_file_path):
    # read the peptideQ file into a pandas dataframe
    peptideQ_dataframe = pd.read_table(peptideQ_file_path, delimiter='\t')

    # rename the "Protein Groups" header so we can use the df.merge function later
    peptideQ_dataframe = peptideQ_dataframe.rename({"Protein Groups": "Protein Accession", "Sequence" : "Peptide"}, axis=1)
    
    return peptideQ_dataframe

def load_psm_df_msfragger(psm_file_path):
    # read in the psm file as a dataframe
    psm_df = pd.read_table(psm_file_path)

    # split the "Spectrum" column into a list at each period and store it under 
    # the "temp_split_column"
    psm_df["temp_split_column"] = psm_df["Spectrum"].str.split(".")
    # store the element located at index 1 of the "temp_split_column" in a 
    # "Scan Number" column
    psm_df["Scan Number"] = psm_df["temp_split_column"].map(lambda x:x[1]).apply(pd.to_numeric)
    # drop unneeded columns
    columns_to_drop = ['Spectrum', 'temp_split_column','Mapped Genes', 'Mapped Proteins']
    psm_df = psm_df.drop(columns=columns_to_drop)

    # rename 2 headers to match MM file formats
    psm_df = psm_df.rename({'Protein ID': 'Protein Accession', 'Spectrum File': 'File Name'}, axis=1)

    # drop duplicates
    psm_df = psm_df.drop_duplicates(subset=["Scan Number"], keep="first")
    
    return psm_df

def load_msfragger_peptideQ(peptideQ_file_path):
    # read the peptideQ file into a pandas dataframe
    peptideQ_dataframe = pd.read_table(peptideQ_file_path, delimiter='\t')

    # rename the "Protein Groups" header so we can use the df.merge function later
    peptideQ_dataframe = peptideQ_dataframe.rename({'Protein ID': 'Protein Accession', 'Peptide Sequence': 'Peptide'}, axis=1)
    
    return peptideQ_dataframe

In [73]:
# save the joined dataframe as a tsv file

def save_file(psm_dataframe, file_path):
    psm_dataframe.to_csv(file_path, sep="\t", index=False)
    print(f'Dataframe saved.')

In [74]:
def join_psm_and_peptideQ_dataframes(psm_df, peptideQ_df):
    # join based on the "Base Sequence"
    joined_dataframe = psm_df.merge(right=peptideQ_df, on="Base Sequence", how='inner')

    # generate multiIndex
    joined_dataframe = joined_dataframe.set_index(['File Name','Protein Accession','Peptide', 'Scan Number']).drop(columns=["Protein Groups"])

    return joined_dataframe


In [79]:
def psm_and_peptideQ_controller(file_type, psm_file_path, peptideQ_file_path, psm_and_peptideQ_file_path, columns_to_keep=None):
    
    ''' Joins a psm and a Peptide Quantification file. Files are joined into a pandas dataframe and saved as a tsv.
    
    Required Parameters:
        * file_type: "mm" for metamorpheus files, or "msfragger" for msfragger files
        * psm_file_path: File path to the psm file
        * peptideQ_file_path: File path to the Peptide Quantification file
        * psm_and_peptideQ_file_path: Output file path
        
    Optional Parameters: 
        * columns_to_keep: List of columns to include in the dataframe. Note that column names may vary based on whether your files were generated with MetaMorpheus or MSFragger.'''
        
    # load dataframes

    if file_type.lower() == 'mm':
        psm_df = load_psm(psm_file_path)
        peptideQ_df = load_peptideQ(peptideQ_file_path)
    elif file_type.lower() == 'msfragger':
        peptideQ_df = load_msfragger_peptideQ(peptideQ_file_path=peptideQ_file_path)
        psm_df = load_psm_df_msfragger(psm_file_path)
    else:
        print('invalid file type')
        return
    

    # join dataframes
    joined_df = join_psm_and_peptideQ_dataframes(psm_df, peptideQ_df)

    # select all columns to keep, if this parameter was not passed in, return dataframe with all columns
    if columns_to_keep != None:
        joined_df = joined_df[columns_to_keep]

    # save dataframe
    save_file(psm_dataframe=joined_df, file_path=psm_and_peptideQ_file_path)
    
    return joined_df

In [80]:
help(psm_and_peptideQ_controller)

Help on function psm_and_peptideQ_controller in module __main__:

psm_and_peptideQ_controller(file_type, psm_file_path, peptideQ_file_path, psm_and_peptideQ_file_path, columns_to_keep=None)
    Joins a psm and a Peptide Quantification file. Files are joined into a pandas dataframe and saved as a tsv.
    
    Required Parameters:
        * file_type: "mm" for metamorpheus files, or "msfragger" for msfragger files
        * psm_file_path: File path to the psm file
        * peptideQ_file_path: File path to the Peptide Quantification file
        * psm_and_peptideQ_file_path: Output file path
        
    Optional Parameters: 
        * columns_to_keep: List of columns to include in the dataframe. Note that column names may vary based on whether your files were generated with MetaMorpheus or MSFragger.



In [76]:
# testing the parser
outfile_path = "C:\\Users\\Sarah Curtis\\OneDrive - BYU\\Documents\\Single Cell Team Documents\\API_dev\\psm_and_peptideQ.tsv"
test_dataframe = controller(file_type='mm',psm_file_path=psm_file_path, peptideQ_file_path=peptideQ_file_path, psm_and_peptideQ_file_path=outfile_path)

KeyError: "None of ['Protein Accession', 'Peptide'] are in the columns"

In [64]:
test_dataframe.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Scan Retention Time,Num Experimental Peaks,Total Ion Current,Precursor Scan Number,Precursor Charge,Precursor MZ,Precursor Mass,Score,Delta Score,Notch,...,Intensity_Ex_Auto_K13_30umTA_2ngQC_60m_1-calib,Intensity_Ex_Auto_K13_30umTA_2ngQC_60m_2-calib,Intensity_Ex_Auto_W17_30umTB_2ngQC_60m_1-calib,Intensity_Ex_Auto_W17_30umTB_2ngQC_60m_2-calib,Detection Type_Ex_Auto_J3_30umTB_2ngQC_60m_1-calib,Detection Type_Ex_Auto_J3_30umTB_2ngQC_60m_2-calib,Detection Type_Ex_Auto_K13_30umTA_2ngQC_60m_1-calib,Detection Type_Ex_Auto_K13_30umTA_2ngQC_60m_2-calib,Detection Type_Ex_Auto_W17_30umTB_2ngQC_60m_1-calib,Detection Type_Ex_Auto_W17_30umTB_2ngQC_60m_2-calib
File Name,Protein Accession,Sequence,Scan Number,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
Ex_Auto_J3_30umTB_2ngQC_60m_1-calib,P10809,LVQDVANNTNEEAGDGTTTATVLAR,16668,52.52059,101.0,335251.91071,16649,2.0,1280.62784,2559.24113,28.51,23.451,0,...,0.0,0.0,2809633.0,3020242.0,MSMS,MSMS,NotDetected,NotDetected,MSMS,MSMS
Ex_Auto_J3_30umTB_2ngQC_60m_1-calib,P61106,IYQNIQDGSLDLNAAESGVQHK,20217,60.99236,49.0,76071.94836,20203,3.0,800.73355,2399.17882,13.388,8.388,0,...,85992.68,176640.6,364845.5,402614.3,MSMS,MSMS,MSMS,MSMS,MSMS,MSMS
Ex_Auto_J3_30umTB_2ngQC_60m_1-calib,P13639,ARPFPDGLAEDIDK,21843,64.88158,77.0,185690.32166,21831,3.0,515.26251,1542.76569,13.388,8.388,0,...,0.0,0.0,233807.3,207856.6,MSMS,MSMS,NotDetected,NotDetected,MSMS,MSMS
Ex_Auto_J3_30umTB_2ngQC_60m_1-calib,Q00610,HSSLAGC[Common Fixed:Carbamidomethyl on C]QIINYR,16915,53.10976,99.0,224170.22906,16902,3.0,506.92071,1517.74029,13.39,7.261,0,...,192202.5,228741.4,426113.6,560141.8,MSMS,MSMS,MSMS,MSMS,MSMS,MSMS
Ex_Auto_J3_30umTB_2ngQC_60m_1-calib,Q15365,LVVPATQC[Common Fixed:Carbamidomethyl on C]GSLIGK,19666,59.67404,72.0,282899.35223,19653,2.0,721.90597,1441.79739,13.39,4.133,0,...,884890.8,1234482.0,1505525.0,1380007.0,MSMS,MSMS,MSMS,MSMS,MSMS,MSMS
Ex_Auto_J3_30umTB_2ngQC_60m_1-calib,P13667,MDATANDVPSDR,8038,31.98848,105.0,320462.01965,8025,2.0,646.28304,1290.55154,13.391,8.391,0,...,0.0,0.0,155637.8,130715.5,MSMS,MSMS,NotDetected,NotDetected,MSMS,MSMS
Ex_Auto_J3_30umTB_2ngQC_60m_1-calib,P07437,ISVYYNEATGGK,13743,45.5415,120.0,550682.62354,13722,2.0,651.32166,1300.62877,13.392,8.392,0,...,2396292.0,3031950.0,4520206.0,4502793.0,MSMS,MSMS,MSMS,MSMS,MSMS,MSMS
Ex_Auto_J3_30umTB_2ngQC_60m_1-calib,P50395,DLGTESQIFISR,22397,66.2047,78.0,222097.00421,22382,2.0,683.354,1364.69345,13.392,8.392,0,...,22196.13,95540.14,431825.0,381380.4,MSMS,MSMS,MSMS,MSMS,MSMS,MSMS
Ex_Auto_J3_30umTB_2ngQC_60m_1-calib,P07737,SSFYVNGLTLGGQK,24022,70.10585,55.0,156740.86102,24010,2.0,735.88549,1469.75644,13.394,8.394,0,...,1956430.0,2391598.0,3483574.0,4424726.0,MSMS,MSMS,MSMS,MSMS,MSMS,MSMS
Ex_Auto_J3_30umTB_2ngQC_60m_1-calib,Q16698,VAFITGGGTGLGK,19384,58.99909,71.0,168562.39172,19367,2.0,589.33251,1176.65047,13.394,6.155,0,...,91264.32,126524.2,155740.5,126546.6,MSMS,MSMS,MSMS,MSMS,MSMS,MSMS
