In [1]:
import os
import mokapot
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
sys.path.append("..")
sys.path
import data_loader as dl

This takes the 'before' data and filters it by dropping decoys and duplicates.

In [2]:
def filter_data(df, prob_column):
    #drop decoys
    df = df[df["decoy"]==False]
    #sort by qvalue
    df = df.sort_values(prob_column)
    #drop duplicate scans
    df = df.drop_duplicates(subset=["scan"], keep="first") #keep highest scoring
    
    return df

Reading in all of our data from the parser and formatting it to be combined in a megascript.

In [3]:
#Reading in and formatting the original MetaMorpheus data
def get_orgininal_mm_data(file):
    mm_original_df = dl.clean_metamorph(file)
    mm_original_df = filter_data( mm_original_df, "QValue")
    
    #filter out just the ScanNr and QValue and/or PEP columns
    mm_original_df = mm_original_df.filter(items = ['scan', 'peptide', 'QValue', 'PEP'])
    
    return mm_original_df

In [4]:
#Reading in and formatting the orginial MsFragger data
def set_probablility(row):
    new_prob = 1 - row["PeptideProphet Probability"]
    return new_prob

#pulling only scan numbers out
def extractScanNum(row):
    string = row
    spot = string.find('.')
    new_st = string[spot + 1:]
    spot = new_st.find('.')
    final_st = new_st[:spot]
    
    if final_st[0] == "0":
        final_st = final_st[1:]
    return final_st

def get_original_msf_data(file):
    msf_original_df = dl.clean_msfragger(file)
    
    #Extracting scan number from file number
    msf_original_df['scan'] =msf_original_df['scan'].apply(extractScanNum) 
    
    msf_original_df = filter_data(msf_original_df, 'PeptideProphet Probability')
    
    #Changing the probabilities to the same scale all the other tools use 
    msf_original_df["Updated_probability"] =  msf_original_df.apply(set_probablility, 1)
    
    #filter out just the ScanNr and QValue and/or PEP columns
    msf_original_df = msf_original_df.filter(['scan', 'peptide','Updated_probability'])
    
    return  msf_original_df

In [5]:
#Reading in and formatting the orginial MsgfPlus data
def get_original_msg_data(file):
    msg_original_df = dl.clean_msgfplus(file)
    msg_original_df = filter_data(msg_original_df, "QValue")
    
    #filter out just the ScanNr and QValue and/or PEP columns
    msg_original_df = msg_original_df.filter(['scan', 'peptide', "QValue"])
    
    return msg_original_df

In [6]:
#Reading in and formatting the orginial MaxQuant data
def get_original_mq_data(file):
     mq_original_df =  dl.clean_maxquant(file)
    
#Formatting and dropping any rows that are missing the sequence
     mq_original_df['Sequence'].replace(' ', np.nan, inplace = True)
     mq_original_df.dropna(subset=['Sequence'], inplace=True)
    
     mq_original_df = filter_data(mq_original_df, 'PEP')
    
    #filter out the ScanNr, peptide, and PEP columns
     mq_original_df = mq_original_df.filter(['scan', 'peptide', 'PEP'])
    
     return  mq_original_df

In [7]:
#Reading in and formatting the data to be compared to the benchmarked data
def get_input_data(file, probability):
    df = pd.read_csv(file)
    
    #These need to come out, for testing only with the data I have 
    df['decoy'].replace('False', False, inplace = True) 
    df['decoy'].replace('True', True, inplace = True)
    
    df = filter_data(df, probability)
    
    #filter out the ScanNr, peptide, and probability columns
    df = df.filter(['scan', probability])
    
    return df

Here we will read in the megascript that contains the output data from a single raw file that was ran through each tool.

In [8]:
#read in the megaScript and reformat it
def clean_meagScript(file):
    df = pd.read_csv(file, low_memory=False,  header=[0,1])
    df.drop(columns = {"Unnamed: 0_level_0"})
    
    return df

Slicing out the Peptide Prophet Probability values for MsFragger. There is no qvalue or PEP, so this is the row we are using. 
Counting how many are at or under the cutoff

In [9]:
def get_msf_prob_len(df, cutoff):
    msf_probability = df["MsFragger"]['Updated_probability']
    msf_probability =  msf_probability.dropna()
    msf_under_cutoff = len(msf_probability.loc[msf_probability <= cutoff])
    return msf_under_cutoff

Slicing out the qvalues from MetaMorpheus and counting how many are at or under the cutoff

In [10]:
def get_mm_Qval_len(df, cutoff):
    mm_qval = df["MetaMorpheus"]["QValue"] 
    mm_qval =  mm_qval.dropna() 
    mm_under_cutoff = len(mm_qval.loc[mm_qval <= cutoff])
    return mm_under_cutoff

Slicing out the PEP values from MetaMorpheus and counting how many are at or under the cutoff
**Are we keeping this?

In [11]:
def get_mm_PEP_len(df, cutoff):
    mm_PEP = df["MetaMorpheus"]["PEP"] 
    mm_PEP =  mm_PEP.dropna() 
    value_under_cutoff = len(mm_PEP.loc[mm_PEP <= cutoff])
    return value_under_cutoff

Slicing out the qvalues from MsgfPlus and counting how many are at or under the cutoff

In [12]:
def get_msg_Qval_len(df, cutoff):
    msg_qval = df["MsgfPlus"]["QValue"] 
    msg_qval =  msg_qval.dropna() 
    msg_under_cutoff = len(msg_qval.loc[msg_qval <= cutoff])
    return msg_under_cutoff

Slicing out the PEP from MaxQuant. Counting how many are at or under the cutoff

In [13]:
def get_mq_PEP_len(df, cutoff):
    mq_PEP = df["MaxQuant"]["PEP"] 
    mq_PEP =  mq_PEP.dropna() 
    mq_under_cutoff = len(mq_PEP.loc[mq_PEP <= cutoff])
    return mq_under_cutoff

Slicing out the probability column from the inputted data and counting how many scans are at or under the cutoff

In [14]:
def get_input_len(df, cutoff, probability):
    df = df["InputData"][probability] 
    df =  df.dropna() 
    df_under_cutoff = len(df.loc[df <= cutoff])
    return df_under_cutoff

This function gets the number of scan values that were at or below the cutoff for each tool and returns them.

In [15]:
def get_file_values(file, cutoff, inputs_probability):
    df = clean_meagScript(file)
    msf = get_msf_prob_len(df, cutoff)
    MM_QVal = get_mm_Qval_len(df, cutoff)
    MM_PEP = get_mm_PEP_len(df, cutoff)
    msg_QVal = get_msg_Qval_len(df, cutoff)
    MQ_PEP = get_mq_PEP_len(df, cutoff)
    input_data = get_input_len(df, cutoff, inputs_probability)
    values_list = {"msf" : msf, "MM_QVal" : MM_QVal, "MM_PEP" : MM_PEP, "msg_QVal" : msg_QVal, "MQ_PEP" : MQ_PEP, "input_data": input_data}
    return values_list

Reading in the data and making the graph for the 2ng data at a certain cutoff

In [16]:
def make_2ng_graph(cutoff, input_probability):
    File1 = get_file_values("benchmark_MegaScript_2ng_rep1.csv", cutoff, input_probability)
    File2 = get_file_values("benchmark_MegaScript_2ng_rep2.csv", cutoff, input_probability)
    File3 = get_file_values("benchmark_MegaScript_2ng_rep3.csv", cutoff, input_probability)
    File4 = get_file_values("benchmark_MegaScript_2ng_rep4.csv", cutoff, input_probability)
    File5 = get_file_values("benchmark_MegaScript_2ng_rep5.csv", cutoff, input_probability)
    File6 = get_file_values("benchmark_MegaScript_2ng_rep6.csv", cutoff, input_probability)

    # set width of bars
    barWidth = 0.14

    # set heights of bars
    msf_prob = [File1['msf'], File2['msf'], File3['msf'], File4['msf'], File5['msf'], File6['msf']]
    MM_PEP = [File1['MM_PEP'], File2['MM_PEP'], File3['MM_PEP'], File4['MM_PEP'], File5['MM_PEP'], File6['MM_PEP']]
    MM_qval = [File1['MM_QVal'], File2['MM_QVal'], File3['MM_QVal'], File4['MM_QVal'], File5['MM_QVal'], File6['MM_QVal']]
    msg_qval = [File1['msg_QVal'], File2['msg_QVal'], File3['msg_QVal'], File4['msg_QVal'], File5['msg_QVal'], File6['msg_QVal']]
    mq_PEP = [File1['MQ_PEP'], File2['MQ_PEP'], File3['MQ_PEP'], File4['MQ_PEP'], File5['MQ_PEP'], File6['MQ_PEP']]
    input_prob = [File1['input_data'], File2['input_data'], File3['input_data'], File4['input_data'], File5['input_data'], File6['input_data']]
    
    # Set position of bar on X axis
    r1 = np.arange(len(msf_prob))
    r2 = [x + barWidth for x in r1]
    r3 = [x + barWidth for x in r2]
    r4 = [x + barWidth for x in r3]
    r5 = [x + barWidth for x in r4]
    r6 = [x + barWidth for x in r5]

    # Make the plot
    plt.bar(r1, msf_prob, width=barWidth, edgecolor='white', label='MsFragger Peptide Prophet Probability')
    plt.bar(r2, MM_qval, width=barWidth, edgecolor='white', label='MetaMorpheus Q-Value')
    plt.bar(r3, msg_qval, width=barWidth, edgecolor='white', label='MsgfPlus Q-Value')
    plt.bar(r4, mq_PEP, width=barWidth, edgecolor='white', label='MaxQuant PEP')
    plt.bar(r5, MM_PEP, width=barWidth, edgecolor='white', label='MetaMorpheus PEP')
    plt.bar(r6, input_prob, width=barWidth, edgecolor='white', label='Input ' + input_probability)

    # Add xticks on the middle of the group bars
    plt.ylabel('#PSMs')
    plt.xlabel('# PSMs using native score and cutoff')
    plt.title('2ng')
    plt.xticks([r + barWidth for r in range(len(msf_prob))], ['File1', 'File2', 'File3', 'File4', 'File5', 'File6'])

    # Create legend & Show graph
    plt.legend(loc = "upper right", bbox_to_anchor=(1.73, 1))
    plt.show()
    #plt.savefig('2ng_PSM_native_score.png')

Reading in the data and making the graph for the 0.2ng data at a certain cutoff

In [17]:
def make_02ng_graph(cutoff, input_probability):
    File1 = get_file_values("benchmark_MegaScript_0.2ng_rep1.csv", cutoff, input_probability)
    File2 = get_file_values("benchmark_MegaScript_0.2ng_rep2.csv", cutoff, input_probability)
    File3 = get_file_values("benchmark_MegaScript_0.2ng_rep3.csv", cutoff, input_probability)
    File4 = get_file_values("benchmark_MegaScript_0.2ng_rep4.csv", cutoff, input_probability)
    File5 = get_file_values("benchmark_MegaScript_0.2ng_rep5.csv", cutoff, input_probability)
    File6 = get_file_values("benchmark_MegaScript_0.2ng_rep6.csv", cutoff, input_probability)


    # set width of bars
    barWidth = 0.14

    # set heights of bars
    msf_prob = [File1['msf'], File2['msf'], File3['msf'], File4['msf'], File5['msf'], File6['msf']]
    MM_PEP = [File1['MM_PEP'], File2['MM_PEP'], File3['MM_PEP'], File4['MM_PEP'], File5['MM_PEP'], File6['MM_PEP']]
    MM_qval = [File1['MM_QVal'], File2['MM_QVal'], File3['MM_QVal'], File4['MM_QVal'], File5['MM_QVal'], File6['MM_QVal']]
    msg_qval = [File1['msg_QVal'], File2['msg_QVal'], File3['msg_QVal'], File4['msg_QVal'], File5['msg_QVal'], File6['msg_QVal']]
    mq_PEP = [File1['MQ_PEP'], File2['MQ_PEP'], File3['MQ_PEP'], File4['MQ_PEP'], File5['MQ_PEP'], File6['MQ_PEP']]
    input_prob = [File1['input_data'], File2['input_data'], File3['input_data'], File4['input_data'], File5['input_data'], File6['input_data']]
    
    # Set position of bar on X axis
    r1 = np.arange(len(msf_prob))
    r2 = [x + barWidth for x in r1]
    r3 = [x + barWidth for x in r2]
    r4 = [x + barWidth for x in r3]
    r5 = [x + barWidth for x in r4]
    r6 = [x + barWidth for x in r5]

    # Make the plot
    plt.bar(r1, msf_prob, width=barWidth, edgecolor='white', label='MsFragger Peptide Prophet Probability')
    plt.bar(r2, MM_qval, width=barWidth, edgecolor='white', label='MetaMorpheus Q-Value')
    plt.bar(r3, msg_qval, width=barWidth, edgecolor='white', label='MsgfPlus Q-Value')
    plt.bar(r4, mq_PEP, width=barWidth, edgecolor='white', label='MaxQuant PEP')
    plt.bar(r5, MM_PEP, width=barWidth, edgecolor='white', label='MetaMorpheus PEP')
    plt.bar(r6, input_prob, width=barWidth, edgecolor='white', label='Input ' + input_probability)


    # Add xticks on the middle of the group bars
    plt.ylabel('#PSMs')
    plt.xlabel('# PSMs using native score and cutoff')
    plt.title('0.2ng')
    plt.xticks([r + barWidth for r in range(len(msf_prob))], ['File1', 'File2', 'File3', 'File4', 'File5', 'File6'])

    # Create legend & Show graph
    plt.legend(loc = "upper right", bbox_to_anchor=(1.73, 1))
    plt.show()
    #plt.savefig('0.2ng_PSM_native_score.pdf')

In [25]:
#names of all the files we are going to read in to upload our data
preMP_data = ["2ng_rep1"] #, "2ng_rep2", "2ng_rep3", "2ng_rep4", "2ng_rep5", "2ng_rep6",
             #"0.2ng_rep1", "0.2ng_rep2", "0.2ng_rep3", "0.2ng_rep4", "0.2ng_rep5", "0.2ng_rep6"]

Our data is split into 12 output files. We will read in the data one output file at a time for each tool and make a megascript. The megascript allows us to look at the output for each tool from that specific raw input file. 

We begin by reading in the output file from each tool. These are the output files that each tool gives us from running a specific raw file. We will then set the index to the scan column for each of the individual dataframes. 

In [26]:
#in the .py script, this has been renamed to make_megascript

def compare_data(input_files, cutoff, probability):

    for file in preMP_data:

        mm_df = get_orgininal_mm_data(file)
        msf_df = get_original_msf_data(file)
        msg_df = get_original_msg_data(file)
        mq_df = get_original_mq_data(file)

        #Switching index to ScanNr to join dataframes based on their scan numbers.  
        MsgfPlus = msg_df.set_index('scan')
        MsFragger = msf_df.set_index('scan')
        MetaMorpheus = mm_df.set_index('scan')
        MaxQuant = mq_df.set_index('scan')
        

        #concating all the individual joined dataframes togehter to make one megascript
        megaScript = pd.concat(dict(MsFragger = MsFragger, MsgfPlus = MsgfPlus, MetaMorpheus = MetaMorpheus, 
                                    MaxQuant = MaxQuant), axis=1)
        megaScript.reset_index(inplace=True)

        #saving the megascript
        megaScript.to_csv("benchmark_MegaScript_" + file + ".csv")
        
    #make the graphs
#     make_2ng_graph(cutoff, probability)
#     make_02ng_graph(cutoff, probability)


In order to benchmark your program's output compared to our benchmarked you will need to run the same raw files  through your program. Begin by downloading six 2ng and six 0.2ng files from PRIDE (URL?). Run these through your program and save the files. 

In order to accuratly run your files against ours, each file will need to have a "scan", "decoy", and some type of probability column.
- "scan" is the scan number column . 
- "decoy" is a boolean column that tracks whether a scan was tagged as a decoy or not. True denotes a deocy, others wise it will be false.
- The probability column that presents the score that the tool gave to a specific scan. If your tool has a  qvalue or PEP value column, use this. (How to say that 0 is better than 1?) 

Once you have ran the raw files and saved the correctly formatted output files, insert the file paths into the correct spot in the input_files list below. Make sure that the right file is loaded into the right spot or your tool's output will not be correctly compared to the benchmarked data. 

**Not putting the peptide in

The function to run the program will ask for the list of files, a cutoff, and the name of the probability column. The cutoff entered will be used to measure the number of PSM's at or below the cutoff.(need to say more eloquently)   

In [20]:
input_files = {} 
    #2ng files
input_files["2ng_rep1"] = "Insert Ex_Auto_J3_30umTB_2ngQC_60m_1 file path"
input_files["2ng_rep2"] = "Insert Ex_Auto_J3_30umTB_2ngQC_60m_2 file path"
input_files["2ng_rep3"] = "Insert Ex_Auto_K13_30umTA_2ngQC_60m_1 file path"
input_files["2ng_rep4"] = "Insert Ex_Auto_K13_30umTA_2ngQC_60m_2 file path"
input_files["2ng_rep5"] = "Insert Ex_Auto_W17_30umTA_2ngQC_60m_3 file path"
input_files["2ng_rep6"] = "Insert Ex_Auto_W17_30umTA_2ngQC_60m_4 file path"

    #0.2ng files
input_files["0.2ng_rep1"] = "Insert Ex_Auto_J3_30umTB_02ngQC_60m_1 file path"
input_files["0.2ng_rep2"] = "Insert Ex_Auto_J3_30umTB_02ngQC_60m_2 file path"
input_files["0.2ng_rep3"] = "Insert Ex_Auto_K13_30umTA_02ngQC_60m_1 file path"
input_files["0.2ng_rep4"] = "Insert Ex_Auto_K13_30umTA_02ngQC_60m_2 file path"
input_files["0.2ng_rep5"] = "Insert Ex_Auto_W17_30umTA_02ngQC_60m_3 file path"
input_files["0.2ng_rep6"] = "Insert Ex_Auto_W17_30umTA_02ngQC_60m_4 file path"



In [21]:
#For testing only. Remove later
#2ng files
input_files["2ng_rep1"] = "2ng_rep1_new_features.csv"
input_files["2ng_rep2"] = "2ng_rep1_new_features.csv"
input_files["2ng_rep3"] = "2ng_rep1_new_features.csv"
input_files["2ng_rep4"] = "2ng_rep1_new_features.csv"
input_files["2ng_rep5"] = "2ng_rep1_new_features.csv"
input_files["2ng_rep6"] = "2ng_rep1_new_features.csv"

    #0.2ng files
input_files["0.2ng_rep1"] = "2ng_rep1_new_features.csv"
input_files["0.2ng_rep2"] = "2ng_rep1_new_features.csv"
input_files["0.2ng_rep3"] = "2ng_rep1_new_features.csv"
input_files["0.2ng_rep4"] = "2ng_rep1_new_features.csv"
input_files["0.2ng_rep5"] = "2ng_rep1_new_features.csv"
input_files["0.2ng_rep6"] = "2ng_rep1_new_features.csv"


This function will take the input data and will make the 2ng and 0.2ng graphs comparing the inputted data to our benchamarked data. 

In [28]:
compare_data(input_files, 0.01, "probability")

  compare_data(input_files, 0.01, "probability")


In [None]:
#Delete from here down

In [164]:
for file in preMP_data:

    mm_df = get_orgininal_mm_data(file)
    msf_df = get_original_msf_data(file)
    msg_df = get_original_msg_data(file)
    mq_df = get_original_mq_data(file)

    #Switching index to ScanNr to join dataframes based on their scan numbers.  
    MsgfPlus = msg_df.set_index('scan')
    MsFragger = msf_df.set_index('scan')
    MetaMorpheus = mm_df.set_index('scan')
    MaxQuant = mq_df.set_index('scan')


    #concating all the individual joined dataframes togehter to make one megascript
    megaScript = pd.concat(dict(MsFragger = MsFragger, MsgfPlus = MsgfPlus, MetaMorpheus = MetaMorpheus, 
                                MaxQuant = MaxQuant), axis=1)
    megaScript.reset_index(inplace=True)

    #saving the megascript
    megaScript.to_csv("benchmark_MegaScript_" + file + ".csv")
    


  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['temp_peptide'] = df.apply(lambda row: format_oxidation(row, "Modified sequence", "(Oxidation (M))"), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["temp_peptide"] = df["temp_peptide"].str[1:-1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus

In [165]:
megaScript 

Unnamed: 0_level_0,scan,MsFragger,MsFragger,MsgfPlus,MsgfPlus,MetaMorpheus,MetaMorpheus,MetaMorpheus,MaxQuant,MaxQuant
Unnamed: 0_level_1,Unnamed: 1_level_1,peptide,Updated_probability,peptide,QValue,peptide,QValue,PEP,peptide,PEP
0,5,,,QSKSEHETSDAKKSVEDRGKRCPTPEIQK,0.086185,,,,SSKAYYVLSDAAMSLQKYGR,3.24090
1,7,,,,,,,,LSLLVAQEVTRLLDILGLTLVMK,5.57320
2,8,,,CTMAPFWAHSDPEEM+15.995QWR,0.307555,,,,ELIVLLLVAAAHLR,0.45389
3,11,,,RTEDCGHCDFCRDMKK,0.303415,,,,,
4,13,,,,,,,,MMQCVLHVYK,0.11660
...,...,...,...,...,...,...,...,...,...,...
41802,9990,LDDPSCPRPECYR,0.0000,,,,,,,
41803,9991,SDPVVSYR,0.0024,,,,,,,
41804,9992,DTQEVPLEK,0.0022,,,,,,,
41805,9994,RLEFENQK,0.3401,,,,,,,


In [166]:
 megaScript.to_csv("deleteMe.csv")

In [169]:
mi_df = pd.read_csv("deleteMe.csv", header=[0,1])

mi_df = mi_df.drop(columns = {"Unnamed: 0_level_0"})
mid_df = mi_df.set_index('scan')
mi_df

ValueError: Index data must be 1-dimensional