# PeptideQ to Protein
### Approach: Merge by protein accession \#s
* Note that a possible problem would be that a given sequence can be associated with multiple proteins. This means that in the PeptideQ file, protein accession numbers are list under "Protein Groups"
* To remedy this we may need to split the Protein Groups into individual columns per protein accession \#... This may not be very useful though because there is an unknown maximum \# of proteins per group
* depending on how important this is we can add a catch for that but for the moment I'll leave it like this


In [51]:
import pandas as pd
import numpy as np
import os
import pyteomics.mzml
import spectrum_utils.spectrum as sus
from pathlib import Path

In [97]:
def save_file(psm_dataframe, file_path):
    psm_dataframe.to_csv(file_path, sep="\t", index=False)
    print(f'Dataframe saved.')


def load_peptideQ(peptideQ_file_path):
    peptideQ_dataframe = pd.read_table(peptideQ_file_path, delimiter='\t')
    #print(peptideQ_dataframe.columns)
    # rename the "Protein Groups" header so we can use the df.merge function later
    peptideQ_dataframe = peptideQ_dataframe.rename({"Protein Groups": "Protein Accession", "Sequence" : "Peptide"}, axis=1)
    #print(peptideQ_dataframe.columns)
    return peptideQ_dataframe

def load_protein(protein_file_path):
    # load the protein file into a pandas dataframe
    protein_dataframe = pd.read_table(protein_file_path)
    return protein_dataframe
    
def load_msfragger_peptideQ(peptideQ_file_path):
    peptideQ_dataframe = pd.read_table(peptideQ_file_path, delimiter='\t')
    # rename the "Protein Groups" header so we can use the df.merge function later
    peptideQ_dataframe = peptideQ_dataframe.rename({'Protein ID': 'Protein Accession', 'Peptide Sequence': 'Peptide'}, axis=1)
    return peptideQ_dataframe

def load_msfragger_protein(protein_file_path):
    # load the protein file into a pandas dataframe
    protein_dataframe = pd.read_table(protein_file_path)

    # Rename the "Protein ID" column to faciliate merging
    protein_dataframe = protein_dataframe.rename({'Protein ID': 'Protein Accession'}, axis=1)
    
    return protein_dataframe
    


In [103]:
def join_peptideQ_and_protein_dataframes(protein_df, peptideQ_df):
    # join based on the "Protein Accession"
    joined_dataframe = peptideQ_df.merge(right=protein_df, on="Protein Accession", how='inner', suffixes=('_protein', '_peptide'))
    
    # generate multiIndex
    joined_dataframe = joined_dataframe.set_index(['Protein Accession', 'Peptide'])
    
    return joined_dataframe

In [126]:
def peptideQ_and_protein_controller (file_type, peptideQ_file_path, protein_file_path, peptideQ_and_protein_file_path, columns_to_keep=None):
    
    ''' Joins a Peptide and Protein Quantification files. Files are joined into a pandas dataframe and saved as a tsv.
    
    Required Parameters:
        * file_type: "mm" for metamorpheus files, or "msfragger" for msfragger files
        * peptideQ_file_path: File path to the Peptide Quantification file
        * protein_file_path: File path to the Protein Quantification file
        * peptideQ_and_protein_file_path: Output file path
        
    Optional Parameters: 
        * columns_to_keep: List of columns to include in the dataframe. Note that column names may vary based on whether your files were generated with MetaMorpheus or MSFragger. '''

    # load dataframes
    if file_type.lower() == 'mm':
        protein_df = load_protein(protein_file_path)
        peptideQ_df = load_peptideQ(peptideQ_file_path)
    elif file_type.lower() == 'msfragger':
        peptideQ_df = load_msfragger_peptideQ(peptideQ_file_path=peptideQ_file_path)
        protein_df = load_msfragger_protein(protein_file_path=protein_file_path)
    else:
        print('invalid file type')
        return
    

    # create joined dataframe and save as csv
    joined_df = join_peptideQ_and_protein_dataframes(protein_df=protein_df, peptideQ_df=peptideQ_df)

    # select all columns to keep, if this parameter was not passed in, return dataframe with all columns
    if columns_to_keep != None:
        joined_df = joined_df[columns_to_keep]

    save_file(joined_df, file_path=peptideQ_and_protein_file_path)

    return joined_df

In [127]:
help(controller)

Help on function controller in module __main__:

controller(file_type, peptideQ_file_path, protein_file_path, peptideQ_and_protein_file_path, columns_to_keep=None)
    Joins a Peptide and Protein Quantification files. Files are joined into a pandas dataframe and saved as a tsv.
    
    Required Parameters:
        * file_type: "mm" for metamorpheus files, or "msfragger" for msfragger files
        * peptideQ_file_path: File path to the Peptide Quantification file
        * protein_file_path: File path to the Protein Quantification file
        * peptideQ_and_protein_file_path: Output file path
        
    Optional Parameters: 
        * columns_to_keep: List of columns to include in the dataframe. Note that column names may vary based on whether your files were generated with MetaMorpheus or MSFragger.



In [105]:
# testing the parser
protein_file_path = "C:\\Users\\Sarah Curtis\\OneDrive - BYU\\Documents\\Single Cell Team Documents\\API_dev\\MetaM\\2ng\\AllQuantifiedProteinGroups.tsv"
peptideQ_file_path = "C:\\Users\\Sarah Curtis\\OneDrive - BYU\\Documents\\Single Cell Team Documents\\API_dev\\MetaM\\2ng\\AllQuantifiedPeptides.tsv"
outfile_path = "C:\\Users\\Sarah Curtis\\OneDrive - BYU\\Documents\\Single Cell Team Documents\\API_dev\\MetaM\\02ng\\peptideQ_and_protein.tsv"
test_df = controller(file_type='mm',peptideQ_file_path=peptideQ_file_path, protein_file_path=protein_file_path, peptideQ_and_protein_file_path=outfile_path)


Dataframe saved.


In [106]:
test_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Base Sequence,Gene Names,Organism_protein,Intensity_Ex_Auto_J3_30umTB_2ngQC_60m_1-calib_protein,Intensity_Ex_Auto_J3_30umTB_2ngQC_60m_2-calib_protein,Intensity_Ex_Auto_K13_30umTA_2ngQC_60m_1-calib_protein,Intensity_Ex_Auto_K13_30umTA_2ngQC_60m_2-calib_protein,Intensity_Ex_Auto_W17_30umTB_2ngQC_60m_1-calib_protein,Intensity_Ex_Auto_W17_30umTB_2ngQC_60m_2-calib_protein,Detection Type_Ex_Auto_J3_30umTB_2ngQC_60m_1-calib,...,Intensity_Ex_Auto_K13_30umTA_2ngQC_60m_2-calib_peptide,Intensity_Ex_Auto_W17_30umTB_2ngQC_60m_1-calib_peptide,Intensity_Ex_Auto_W17_30umTB_2ngQC_60m_2-calib_peptide,Number of PSMs,Protein Decoy/Contaminant/Target,Protein Cumulative Target,Protein Cumulative Decoy,Protein QValue,Best Peptide Score,Best Peptide Notch QValue
Protein Accession,Peptide,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Q99453,AAAAAAAAAK,AAAAAAAAAK,PHOX2B,Homo sapiens,0.00000,0.00000,0.00000,0.0000,0.00000,102806.00880,NotDetected,...,,,1.028060e+05,1.0,T,2674.0,142.0,0.052926,8.041378,0.004496
P37108,AAAAAAAAAPAAAATAPTTAATTAATAAQ,AAAAAAAAAPAAAATAPTTAATTAATAAQ,SRP14,Homo sapiens,90183.00651,173906.74480,106717.67170,156981.3472,225347.22530,225444.92770,MSMS,...,622614.889,1.687516e+06,1.360443e+06,40.0,T,789.0,0.0,0.000000,15.592306,0.000000
P37108,FQMAYSNLLR,FQMAYSNLLR,SRP14,Homo sapiens,0.00000,0.00000,62228.73901,33431.1230,145823.13620,137590.13940,NotDetected,...,622614.889,1.687516e+06,1.360443e+06,40.0,T,789.0,0.0,0.000000,15.592306,0.000000
P37108,GTVEGFEPADNK,GTVEGFEPADNK,SRP14,Homo sapiens,0.00000,0.00000,0.00000,0.0000,0.00000,66569.80859,NotDetected,...,622614.889,1.687516e+06,1.360443e+06,40.0,T,789.0,0.0,0.000000,15.592306,0.000000
P37108,ISTVVSSK,ISTVVSSK,SRP14,Homo sapiens,253830.39650,343235.27150,125725.10250,0.0000,195137.92380,166419.87700,MSMS,...,622614.889,1.687516e+06,1.360443e+06,40.0,T,789.0,0.0,0.000000,15.592306,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q14865,YPESLSR,YPESLSR,ARID5B,Homo sapiens,57361.70404,0.00000,0.00000,0.0000,86110.37598,100701.88480,MSMS,...,,1.007019e+05,1.177659e+05,3.0,T,2805.0,187.0,0.066667,7.176256,0.005971
Q9BQP7,YSNLVQSVLSSR,YSNLVQSVLSSR,MGME1,Homo sapiens,0.00000,0.00000,0.00000,0.0000,0.00000,27345.30762,NotDetected,...,,,2.734531e+04,1.0,T,2833.0,201.0,0.070924,7.152819,0.006442
O95167,YSVMINK,YSVMINK,NDUFA3,Homo sapiens,0.00000,0.00000,0.00000,0.0000,64896.74265,0.00000,NotDetected,...,,6.489674e+04,,1.0,T,2844.0,215.0,0.075598,7.143181,0.006800
Q9UBM7,YTAAVPYR,YTAAVPYR,DHCR7,Homo sapiens,32908.93552,50476.98682,0.00000,0.0000,0.00000,0.00000,MSMS,...,,,,4.0,T,2164.0,12.0,0.005533,9.288596,0.000535


In [109]:
# let's test this with ms fragger
msfragger_protein_file_path = "C:\\Users\\Sarah Curtis\\OneDrive - BYU\\Documents\\Single Cell Team Documents\\API_dev\\msfragger\\combined_protein.tsv"
msfragger_peptide_file_path = "C:\\Users\\Sarah Curtis\\OneDrive - BYU\\Documents\\Single Cell Team Documents\\API_dev\\msfragger\\combined_peptide.tsv"
msfragger_outfile_path = "C:\\Users\\Sarah Curtis\\OneDrive - BYU\\Documents\\Single Cell Team Documents\\API_dev\\msfragger\\peptideQ_and_protein.tsv"
try3 = controller(file_type='msfragger', peptideQ_file_path=msfragger_peptide_file_path, protein_file_path=msfragger_protein_file_path, peptideQ_and_protein_file_path=msfragger_outfile_path)
try3

Dataframe saved.


Unnamed: 0_level_0,Unnamed: 1_level_0,Prev AA,Next AA,Start,End,Peptide Length,Charges,Protein_protein,Entry Name_protein,Gene_protein,Protein Description,...,Ex_Auto_K13_30umTA_02ngQC_60m_2 MaxLFQ Unique Intensity,Ex_Auto_W17_30umTA_02ngQC_60m_3 MaxLFQ Unique Intensity,Ex_Auto_W17_30umTA_02ngQC_60m_4 MaxLFQ Unique Intensity,Ex_Auto_J3_30umTB_02ngQC_60m_1 MaxLFQ Total Intensity,Ex_Auto_J3_30umTB_02ngQC_60m_2 MaxLFQ Total Intensity,Ex_Auto_K13_30umTA_02ngQC_60m_1 MaxLFQ Total Intensity,Ex_Auto_K13_30umTA_02ngQC_60m_2 MaxLFQ Total Intensity,Ex_Auto_W17_30umTA_02ngQC_60m_3 MaxLFQ Total Intensity,Ex_Auto_W17_30umTA_02ngQC_60m_4 MaxLFQ Total Intensity,Indistinguishable Proteins
Protein Accession,Peptide,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Q86U42,AAAAAAAAAAGAAGGR,M,G,2,17,16,2,sp|Q86U42|PABP2_HUMAN,PABP2_HUMAN,PABPN1,Polyadenylate-binding protein 2,...,374784.94,0.0,0.0,295894.44,326265.50,424528.03,374784.94,0.0,0.0,
Q86U42,ELQNEVEK,K,Q,138,145,8,2,sp|Q86U42|PABP2_HUMAN,PABP2_HUMAN,PABPN1,Polyadenylate-binding protein 2,...,374784.94,0.0,0.0,295894.44,326265.50,424528.03,374784.94,0.0,0.0,
Q86U42,GFAYIEFSDK,K,E,214,223,10,2,sp|Q86U42|PABP2_HUMAN,PABP2_HUMAN,PABPN1,Polyadenylate-binding protein 2,...,374784.94,0.0,0.0,295894.44,326265.50,424528.03,374784.94,0.0,0.0,
Q86U42,TSLALDESLFR,R,G,228,238,11,2,sp|Q86U42|PABP2_HUMAN,PABP2_HUMAN,PABPN1,Polyadenylate-binding protein 2,...,374784.94,0.0,0.0,295894.44,326265.50,424528.03,374784.94,0.0,0.0,
P37108,AAAAAAAAAPAAAATAPTTAATTAATAAQ,K,-,108,136,29,3,sp|P37108|SRP14_HUMAN,SRP14_HUMAN,SRP14,Signal recognition particle 14 kDa protein,...,0.00,618273.7,454599.7,444297.88,659193.25,0.00,0.00,618273.7,454599.7,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
O43768,YFDSGDYNMAK,K,A,64,74,11,2,sp|O43768|ENSA_HUMAN,ENSA_HUMAN,ENSA,Alpha-endosulfine,...,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.0,0.0,sp|P56211|ARP19_HUMAN
P61964,YILAATLDNTLK,K,L,228,239,12,2,sp|P61964|WDR5_HUMAN,WDR5_HUMAN,WDR5,WD repeat-containing protein 5,...,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.0,0.0,
P50402,YNIPHGPVVGSTR,R,R,19,31,13,3,sp|P50402|EMD_HUMAN,EMD_HUMAN,EMD,Emerin,...,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.0,0.0,
contam_sp|P02788|TRFL_HUMAN Lactotransferrin OS=Homo sapiens GN=LTF PE=1 SV=6,YYGYTGAFR,R,C,544,552,9,2,contam_sp|P02788|TRFL_HUMAN,contam_sp|P02788|TRFL_HUMAN Lactotransferrin O...,,Lactotransferrin,...,0.00,0.0,0.0,0.00,0.00,0.00,0.00,0.0,0.0,
