In [1]:
import os
import mokapot
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
sys.path.append("..")
sys.path
import data_loader as dl


The purpose of this function is to clean up the original 'before' data so that we are not counting decoys or duplicate scans.

In [2]:
def filter_data(df, prob_column, ascending=True):
    """Drop decoy rows and keep only the best-ranked row for every scan.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least a "decoy" column, a "scan" column and
        the column named by ``prob_column``.
    prob_column : str
        Name of the column used to rank rows that share a scan.
    ascending : bool, default True
        Ranking direction. The default (smallest value first) is right for
        q-values and PEPs, where lower is better. Pass ``False`` for
        scores or probabilities where a higher value is better; otherwise
        the worst row of each scan would be the one that is kept.

    Returns
    -------
    pandas.DataFrame
        A new frame, sorted by ``prob_column``, holding one row per scan,
        with the "scan" column renamed to "ScanNr". ``df`` is not modified.
    """
    # Explicit comparison with False (rather than `~`): rows whose decoy flag
    # is missing compare unequal to False and are therefore dropped as well.
    targets = df[df["decoy"] == False]

    # A stable sort keeps tied rows in their incoming order, so the row that
    # survives de-duplication is the same on every run.
    ranked = targets.sort_values(prob_column, ascending=ascending, kind="mergesort")

    # After sorting, the first row seen for a scan is its best-ranked one.
    best_per_scan = ranked.drop_duplicates(subset=["scan"], keep="first")

    return best_per_scan.rename(columns={"scan": "ScanNr"})

The next functions read in the dataframes that hold the 'before' data. This is the data that has not been run through MokaPot. We will use it to check whether MokaPot was able to increase the number of scans below a certain cutoff. 

In [3]:
#Reading in and formatting the original MetaMorpheus data
def get_orgininal_mm_data(file):
    """Return the pre-MokaPot MetaMorpheus results for ``file``.

    The data is loaded with ``dl.clean_metamorph``, passed through
    ``filter_data`` ranked on "QValue", and then trimmed to the scan number,
    peptide, q-value and PEP columns (any of those that are present).
    """
    kept_columns = ['ScanNr', 'peptide', 'QValue', 'PEP']
    best_per_scan = filter_data(dl.clean_metamorph(file), "QValue")
    return best_per_scan.filter(items=kept_columns)

In [4]:
#Reading in and formatting the original MsFragger data
def set_probablility(row):
    """Return one minus the row's "PeptideProphet Probability" value."""
    return 1 - row["PeptideProphet Probability"]

#pulling only scan numbers out
def extractScanNum(row):
    """Extract the scan number from a dot-delimited spectrum name.

    The scan number is the text between the first and second '.' of the
    name (or everything after the first '.' when there is no second one),
    with all leading zeros removed.

    Parameters
    ----------
    row : str
        Spectrum name, e.g. "sample.00123.00123.2".

    Returns
    -------
    str
        The scan number as a string without zero padding ("0" if the field
        consists only of zeros).

    Raises
    ------
    ValueError
        If the name has no '.'-delimited second field to read a scan from.
    """
    fields = row.split(".")
    if len(fields) < 2 or not fields[1]:
        raise ValueError(
            "Cannot extract a scan number from spectrum name {!r}".format(row)
        )

    # Strip every padding zero, not just the first one, so "00123" -> "123";
    # fall back to "0" when the field was made up of zeros only.
    return fields[1].lstrip("0") or "0"

def get_original_msf_data(file):
    """Return the pre-MokaPot MsFragger results for ``file``.

    Steps:
      1. Load the data with ``dl.clean_msfragger``.
      2. Replace the "scan" column (a spectrum name) with the integer scan
         number extracted by ``extractScanNum``.
      3. Add "Updated_probability" = 1 - "PeptideProphet Probability", so
         that a lower value is better.
      4. Run ``filter_data`` ranked on "Updated_probability".
      5. Keep only the ScanNr, peptide and Updated_probability columns.

    Returns
    -------
    pandas.DataFrame
        One row per scan with the columns listed in step 5.
    """
    msf_original_df = dl.clean_msfragger(file)

    # extractScanNum returns strings; cast to integers so the scan numbers
    # compare equal to the numeric ScanNr values they are joined with later.
    scan_numbers = msf_original_df['scan'].apply(extractScanNum).astype("int64")

    msf_original_df = msf_original_df.assign(
        scan=scan_numbers,
        # Vectorised form of the row-wise `1 - probability` conversion.
        Updated_probability=1 - msf_original_df["PeptideProphet Probability"],
    )

    # Rank on the inverted probability: filter_data sorts ascending and keeps
    # the first row per scan, so ranking on the raw (higher-is-better)
    # probability would keep the *worst* match of every scan.
    msf_original_df = filter_data(msf_original_df, "Updated_probability")

    return msf_original_df.filter(['ScanNr', 'peptide', 'Updated_probability'])

In [5]:
#Reading in and formatting the original MsgfPlus data
def get_original_msg_data(file):
    """Return the pre-MokaPot MsgfPlus results for ``file``.

    The data is loaded with ``dl.clean_msgfplus``, passed through
    ``filter_data`` ranked on "QValue", and trimmed to the ScanNr, peptide
    and QValue columns.
    """
    kept_columns = ["ScanNr", 'peptide', "QValue"]
    best_per_scan = filter_data(dl.clean_msgfplus(file), "QValue")
    return best_per_scan.filter(kept_columns)

In [6]:
#Reading in and formatting the original MaxQuant data
def get_original_mq_data(file):
    """Return the pre-MokaPot MaxQuant results for ``file``.

    The data is loaded with ``dl.clean_maxquant``. Rows whose "Sequence" is
    a single space or missing are dropped, the remainder is passed through
    ``filter_data`` ranked on "PEP", and the result is trimmed to the
    ScanNr, peptide and PEP columns.
    """
    mq_original_df = dl.clean_maxquant(file)

    # Treat a blank (' ') sequence as missing, then drop those rows.
    # The replacement is assigned back instead of calling
    # `df['Sequence'].replace(..., inplace=True)`: an in-place call on a
    # column selection is not guaranteed to update the parent frame (it does
    # not under pandas Copy-on-Write).
    sequences = mq_original_df['Sequence'].replace(' ', np.nan)
    mq_original_df = (
        mq_original_df
        .assign(Sequence=sequences)
        .dropna(subset=['Sequence'])
    )

    mq_original_df = filter_data(mq_original_df, 'PEP')

    # Keep just the ScanNr, peptide and PEP columns.
    return mq_original_df.filter(["ScanNr", 'peptide', "PEP"])

In [7]:
#Reading in and formatting the data from PD
def get_pd_data(file):
    """Return the Proteome Discoverer results for ``file``, one row per scan.

    The data is loaded with ``dl.clean_proteome_discover``, sorted by
    "Percolator q-Value" (ascending), "First Scan" is renamed to "ScanNr",
    only the ScanNr and q-value columns are kept, and for each scan the
    first row (lowest q-value) is retained.
    """
    ranked = dl.clean_proteome_discover(file).sort_values("Percolator q-Value")
    trimmed = (
        ranked
        .rename(columns={"First Scan": "ScanNr"})
        .filter(['ScanNr', 'Percolator q-Value'])
    )
    return trimmed.drop_duplicates(subset=["ScanNr"], keep="first")

This cell holds the names of all the saved data files produced after the data has been run through MokaPot. 

In [8]:
# Sample names: two loading amounts, six replicates each.
preMP_data = [
    "2ng_rep1", "2ng_rep2", "2ng_rep3", "2ng_rep4", "2ng_rep5", "2ng_rep6",
    "0.2ng_rep1", "0.2ng_rep2", "0.2ng_rep3", "0.2ng_rep4", "0.2ng_rep5", "0.2ng_rep6",
]

# For every search tool, map each sample name to the CSV that MokaPot wrote
# for it. The paths follow the pattern
# MokaPot_Output/<tool folder>/<tool prefix>_<sample>.csv
msg_postMP_data = {
    sample: "MokaPot_Output/MsgfPlus/msg_" + sample + ".csv" for sample in preMP_data
}

mq_postMP_data = {
    sample: "MokaPot_Output/MaxQuant/mq_" + sample + ".csv" for sample in preMP_data
}

mm_postMP_data = {
    sample: "MokaPot_Output/MetaMorpheus/mm_" + sample + ".csv" for sample in preMP_data
}

msf_postMP_data = {
    sample: "MokaPot_Output/MsFragger/msf_" + sample + ".csv" for sample in preMP_data
}

Here we read in the results from running each tool's data through MokaPot. After we read in all the data we format the data so it can all be read into a large megascript. The dataframes are joined based on their scan number. We then join all of the before and after data together and save the data from each raw file into its own new megascript.  

In [9]:
# Columns kept from every MokaPot output file.
mokapot_columns = ['ScanNr', 'mokapot score', "mokapot q-value"]

for file in preMP_data:

    # Post-MokaPot results of each tool, indexed by scan number so they can
    # be joined with the matching 'before' data.
    msg_df = pd.read_csv(msg_postMP_data[file]).filter(mokapot_columns).set_index('ScanNr')
    mm_df = pd.read_csv(mm_postMP_data[file]).filter(mokapot_columns).set_index('ScanNr')
    msf_df = pd.read_csv(msf_postMP_data[file]).filter(mokapot_columns).set_index('ScanNr')
    mq_df = pd.read_csv(mq_postMP_data[file]).filter(mokapot_columns).set_index('ScanNr')

    # Proteome Discoverer has no separate before/after data.
    pd_df = get_pd_data(file).set_index('ScanNr')

    # 'Before' data of each tool, indexed the same way.
    mm_original_df = get_orgininal_mm_data(file).set_index('ScanNr')
    msf_original_df = get_original_msf_data(file).set_index('ScanNr')
    msg_original_df = get_original_msg_data(file).set_index('ScanNr')
    mq_original_df = get_original_mq_data(file).set_index('ScanNr')

    # One frame per tool: original columns next to the MokaPot columns. The
    # outer join keeps scans that appear on only one side.
    MsFragger = msf_original_df.join(msf_df, how="outer")
    MsgfPlus = msg_original_df.join(msg_df, how="outer")
    MetaMorpheus = mm_original_df.join(mm_df, how="outer")
    MaxQuant = mq_original_df.join(mq_df, how="outer")
    Proteome_Discoverer = pd_df

    # Put all tools side by side (the tool name becomes the top column
    # level), turn the scan-number index back into a column, and save one
    # megascript per raw file.
    per_tool_frames = {
        "MsFragger": MsFragger,
        "MsgfPlus": MsgfPlus,
        "MetaMorpheus": MetaMorpheus,
        "MaxQuant": MaxQuant,
        "Proteome_Discoverer": Proteome_Discoverer,
    }
    megaScript = pd.concat(per_tool_frames, axis=1).reset_index()

    megaScript.to_csv("MegaScript_" + file + ".csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['temp_peptide'] = df.apply(lambda row: format_oxidation(row, "Modified sequence", "(Oxidation (M))"), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["temp_peptide"] = df["temp_peptide"].str[1:-1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/p

In [20]:
# Proteome Discoverer results for one sample, indexed by scan number.
pd_df = get_pd_data("2ng_rep5").set_index('ScanNr')

# Number of scans whose Percolator q-value is at or below the 0.01 cutoff.
passes_cutoff = pd_df['Percolator q-Value'] <= 0.01
pd_under_cutoff = int(passes_cutoff.sum())
pd_under_cutoff

14796