In [1]:
import os
import mokapot
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
sys.path.append("..")
sys.path
import data_loader as dl


In [2]:
def filter_data(df, prob_column):
    """Reduce a PSM table to one non-decoy row per scan.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least a "decoy" column, a "scan" column and the
        column named by ``prob_column``.
    prob_column : str
        Column used for ranking. Rows are ordered ascending on it, so the
        row with the smallest value is the one retained for each scan.

    Returns
    -------
    pandas.DataFrame
        New frame without decoy rows, ordered by ``prob_column``, with one
        row per scan and the "scan" column renamed to "ScanNr".
    """
    targets_only = df.loc[df["decoy"] == False]
    return (
        targets_only
        .sort_values(prob_column)
        # the first row of each scan after the ascending sort is the one kept
        .drop_duplicates(subset=["scan"], keep="first")
        .rename(columns={"scan": "ScanNr"})
    )

In [3]:
#Reading in and formatting the original MetaMorpheus data
def get_orgininal_mm_data(file):
    """Return the pre-mokapot MetaMorpheus table for one dataset key.

    The frame from ``dl.clean_metamorph(file)`` is passed through
    ``filter_data`` ranked on "QValue", then narrowed to the columns
    'ScanNr', 'peptide', 'QValue' and 'PEP' (any of these that are absent
    are silently skipped by ``DataFrame.filter``).
    """
    wanted_columns = ['ScanNr', 'peptide', 'QValue', 'PEP']
    best_per_scan = filter_data(dl.clean_metamorph(file), "QValue")
    return best_per_scan.filter(items=wanted_columns)

In [4]:
#Reading in and formatting the original MsFragger data
def set_probablility(row):
    """Return 1 minus the row's "PeptideProphet Probability" value."""
    return 1 - row["PeptideProphet Probability"]

#pulling only scan numbers out
def extractScanNum(row):
    """Extract the scan number from a dot-delimited spectrum identifier.

    The scan number is the field between the first and second "." of the
    identifier, with every leading zero removed.
    NOTE(review): assumes identifiers look like "<run>.<scan>.<scan>.<charge>"
    with a zero-padded scan field and no "." inside <run> -- confirm against
    the loader's output.

    Parameters
    ----------
    row : str
        Spectrum identifier string.

    Returns
    -------
    str
        The scan field without leading zeros ("0" when the field is all zeros).

    Raises
    ------
    ValueError
        If the identifier has no non-empty field after its first ".".
    """
    fields = row.split(".")
    if len(fields) < 2 or not fields[1]:
        raise ValueError("cannot find a scan number in %r" % (row,))

    # Strip *all* zero padding ("00123" -> "123"); the previous version removed
    # only a single zero, leaving keys such as "0123" that never match.
    return fields[1].lstrip("0") or "0"

def get_original_msf_data(file):
    """Return the pre-mokapot MSFragger table for one dataset key.

    Steps:
      1. Load the frame with ``dl.clean_msfragger(file)``.
      2. Replace the "scan" column by the scan number parsed out of it with
         ``extractScanNum``.
      3. Add "Updated_probability" = 1 - "PeptideProphet Probability", so that
         smaller means better, like the q-value/PEP columns of the other tools.
      4. Run ``filter_data`` ranked on "Updated_probability".
      5. Keep only 'ScanNr', 'peptide' and 'Updated_probability'.

    The complement is computed *before* filtering on purpose: ``filter_data``
    sorts ascending and keeps the first row per scan, so ranking on the raw
    "PeptideProphet Probability" retained the lowest-probability match of
    every scan instead of the highest.

    Parameters
    ----------
    file : str
        Dataset key understood by ``dl.clean_msfragger``.

    Returns
    -------
    pandas.DataFrame
        One row per scan with columns 'ScanNr', 'peptide' and
        'Updated_probability' (those that exist in the loaded frame).
    """
    raw_df = dl.clean_msfragger(file)

    # assign() builds a new frame, so the frame handed back by the loader is
    # left untouched.
    msf_original_df = raw_df.assign(
        scan=raw_df['scan'].apply(extractScanNum),
        Updated_probability=1 - raw_df["PeptideProphet Probability"],
    )

    # Ascending sort on the complemented probability puts the most confident
    # match of each scan first, which is the row filter_data keeps.
    msf_original_df = filter_data(msf_original_df, 'Updated_probability')

    #filter out just the ScanNr and probability columns
    return msf_original_df.filter(['ScanNr', 'peptide', 'Updated_probability'])

In [5]:
#Reading in and formatting the original MsgfPlus data
def get_original_msg_data(file):
    """Return the pre-mokapot MSGF+ table for one dataset key.

    The frame from ``dl.clean_msgfplus(file)`` is passed through
    ``filter_data`` ranked on "QValue", then narrowed to the columns
    "ScanNr", 'peptide' and "QValue".
    """
    kept_columns = ["ScanNr", 'peptide', "QValue"]
    cleaned = dl.clean_msgfplus(file)
    one_per_scan = filter_data(cleaned, "QValue")
    return one_per_scan.filter(kept_columns)

In [6]:
#Reading in and formatting the original MaxQuant data
def get_original_mq_data(file):
    """Return the pre-mokapot MaxQuant table for one dataset key.

    The frame from ``dl.clean_maxquant(file)`` is passed through
    ``filter_data`` ranked on 'PEP', then narrowed to the columns
    "ScanNr", 'peptide' and "PEP".
    """
    kept_columns = ["ScanNr", 'peptide', "PEP"]
    cleaned = dl.clean_maxquant(file)

    # Disabled step, kept for reference: dropping rows with a missing sequence
    #     cleaned['Sequence'].replace(' ', np.nan, inplace = True)
    #     cleaned.dropna(subset=['Sequence'], inplace=True)

    one_per_scan = filter_data(cleaned, 'PEP')
    return one_per_scan.filter(kept_columns)

In [7]:
#Reading in and formatting the data from PD
# Loads one dataset ("2ng_rep1") through data_loader's
# clean_proteome_discover helper. pd_df is not referenced again in the cells
# visible here, so this cell appears to be for inspection only.
pd_df = dl.clean_proteome_discover("2ng_rep1")
# Bare last expression: the notebook renders the frame as this cell's output.
pd_df

Unnamed: 0,Checked,Confidence,Identifying Node,PSM Ambiguity,Annotated Sequence,Modifications,# Proteins,Master Protein Accessions,Protein Accessions,# Missed Cleavages,...,RT [min],First Scan,Spectrum File,File ID,Peptides Matched,XCorr,# Protein Groups,Percolator q-Value,Percolator PEP,Percolator SVMScore
12207,False,High,Sequest HT (A3),Unambiguous,SVDEVFDEVVQIFDK,,1,P30085,P30085,0,...,104.7650,36961,Ex_Auto_J3_30umTB_2ngQC_60m_1.raw,F3,345,1.85,1,0.000019,1.484000e-05,0.912
12231,False,High,Sequest HT (A3),Unambiguous,QITDNIFLTTAEVIAQQVSDK,,1,P48163,P48163,0,...,105.0655,37067,Ex_Auto_J3_30umTB_2ngQC_60m_1.raw,F3,352,1.26,1,0.000019,8.835000e-05,0.742
12233,False,High,Sequest HT (A3),Unambiguous,VPSTEAEALASSLMGLFEK,,1,P50395,P50395,0,...,105.0476,37060,Ex_Auto_J3_30umTB_2ngQC_60m_1.raw,F3,422,2.36,1,0.000019,6.671000e-07,1.207
12615,False,High,Sequest HT (A3),Unambiguous,DSTLIMQLLR,,7,P63104; P31947; P31946; Q04917; P61981; P27348...,P63104; P31947; P31946; Q04917; P61981; P27348...,0,...,92.9377,33485,Ex_Auto_J3_30umTB_2ngQC_60m_1.raw,F3,349,2.49,7,0.000019,2.803000e-05,0.852
12620,False,High,Sequest HT (A3),Unambiguous,DSTLImQLLR,M6(Oxidation),7,P63104; P31947; P31946; Q04917; P61981; P27348...,P63104; P31947; P31946; Q04917; P61981; P27348...,0,...,92.9642,33496,Ex_Auto_J3_30umTB_2ngQC_60m_1.raw,F3,394,2.33,7,0.000019,2.284000e-05,0.871
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176623,False,High,Sequest HT (A3),Unambiguous,KAVHHFVNKK,,1,Q9H0K6,Q9H0K6,2,...,120.5039,41647,Ex_Auto_J3_30umTB_2ngQC_60m_1.raw,F3,80,0.15,1,0.373500,1.000000e+00,-2.701
176643,False,High,Sequest HT (A3),Unambiguous,KLKPKLLK,,1,P50748,P50748,2,...,112.1758,39207,Ex_Auto_J3_30umTB_2ngQC_60m_1.raw,F3,3,0.08,1,0.373600,1.000000e+00,-2.771
176654,False,High,Sequest HT (A3),Unambiguous,IKGPGPAK,,1,Q8IXM7,Q8IXM7,1,...,67.7630,23046,Ex_Auto_J3_30umTB_2ngQC_60m_1.raw,F3,2,0.11,1,0.373700,1.000000e+00,-3.043
176655,False,High,Sequest HT (A3),Unambiguous,LNVSHNKLK,,1,Q9H9A6,Q9H9A6,1,...,112.6422,39400,Ex_Auto_J3_30umTB_2ngQC_60m_1.raw,F3,112,0.33,1,0.373700,1.000000e+00,-2.932


In [8]:
# Dataset keys: replicates 1-6 for each of the two prefixes, "2ng" first.
preMP_data = [
    prefix + "_rep" + str(replicate)
    for prefix in ("2ng", "0.2ng")
    for replicate in range(1, 7)
]

# One lookup per search tool: dataset key -> path of that tool's mokapot
# output CSV. Every path follows MokaPot_Output/<Tool>/<abbr>_<key>.csv.
msg_postMP_data = {key: "MokaPot_Output/MsgfPlus/msg_" + key + ".csv" for key in preMP_data}

mq_postMP_data = {key: "MokaPot_Output/MaxQuant/mq_" + key + ".csv" for key in preMP_data}

mm_postMP_data = {key: "MokaPot_Output/MetaMorpheus/mm_" + key + ".csv" for key in preMP_data}

msf_postMP_data = {key: "MokaPot_Output/MsFragger/msf_" + key + ".csv" for key in preMP_data}

Reading in the results from running data through MokaPot 

In [9]:
def _read_mokapot_scores(path):
    """Read one mokapot output CSV, keeping only scan number, score and q-value."""
    return pd.read_csv(path).filter(['ScanNr', 'mokapot score', "mokapot q-value"])


def _index_by_scan(df):
    """Return a copy of ``df`` indexed by a numeric "ScanNr".

    The scan numbers parsed for MSFragger are strings while the ones read from
    the mokapot CSVs are parsed as numbers by ``read_csv``; an index join
    between the two never matches. Coercing every key with ``pd.to_numeric``
    gives all frames a comparable index. A non-numeric scan value raises
    ``ValueError`` here instead of silently failing to align.
    """
    return df.assign(ScanNr=pd.to_numeric(df["ScanNr"])).set_index("ScanNr")


for file in preMP_data:

    # mokapot results for this dataset, one frame per search tool
    msg_df = _index_by_scan(_read_mokapot_scores(msg_postMP_data[file]))
    mm_df = _index_by_scan(_read_mokapot_scores(mm_postMP_data[file]))
    msf_df = _index_by_scan(_read_mokapot_scores(msf_postMP_data[file]))
    mq_df = _index_by_scan(_read_mokapot_scores(mq_postMP_data[file]))

    # original (pre-mokapot) results, already reduced to one row per scan
    mm_original_df = _index_by_scan(get_orgininal_mm_data(file))
    msf_original_df = _index_by_scan(get_original_msf_data(file))
    msg_original_df = _index_by_scan(get_original_msg_data(file))
    mq_original_df = _index_by_scan(get_original_mq_data(file))

    #Joining data from the original and postMokapot into a df for each tool individually.
    #The outer join keeps scans that appear in only one of the two tables.
    MsFragger = msf_original_df.join(msf_df, how="outer")
    MsgfPlus = msg_original_df.join(msg_df, how="outer")
    MetaMorpheus = mm_original_df.join(mm_df, how="outer")
    MaxQuant = mq_original_df.join(mq_df, how="outer")

    #combining all the dataframes together into one megaScript; the dict keys
    #become the top level of the column MultiIndex
    megaScript = pd.concat(
        {
            "MsFragger": MsFragger,
            "MsgfPlus": MsgfPlus,
            "MetaMorpheus": MetaMorpheus,
            "MaxQuant": MaxQuant,
        },
        axis=1,
    ).reset_index()

    megaScript.to_csv("MegaScript_" + file + ".csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


FileNotFoundError: [Errno 2] No such file or directory: '/Users/daishavanderwatt/Payne_Lab/SingleCellBenchMark/data/maxquant/msms02ng.txt.gz'

In [None]:
# #Why is there a difference in length in msgplus? We already know that mm was a little different. 

# print("msf_orig: " + str(len(msf_original_df)))
# print("msf_post: " + str(len(msf_df)))

# print("msg_orig: " + str(len(msg_original_df)))
# print("msg_post: " + str(len(msg_df)))

# print("mq_orig: " + str(len(mq_original_df)))
# print("mq_post: " + str(len(mq_df)))

# print("mm_orig: " + str(len(mm_original_df)))
# print("mm_post: " + str(len(mm_df)))

In [None]:
# msf_original_df = dl.clean_msfragger('2ng_rep1')
# msf_original_df['scan'] =msf_original_df['scan'].apply(extractScanNum) 
    
# msf_original_df = filter_data(msf_original_df, 'PeptideProphet Probability')

In [None]:
# msf_original_df