In [1]:
import os
import mokapot
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
sys.path.append("..")
sys.path
import data_loader as dl

In [2]:
def filter_data(df, prob_column):
    """Return the target rows of ``df``, reduced to one row per scan.

    Rows whose ``decoy`` value equals False are kept and ordered by ascending
    ``prob_column``; for every ``scan`` value only the first row in that
    order is retained.
    """
    is_target = df["decoy"].eq(False)
    ranked = df.loc[is_target].sort_values(prob_column)
    # After the ascending sort, the first row seen for a scan is the one with
    # the smallest prob_column value.
    return ranked.drop_duplicates(subset=["scan"], keep="first")

In [3]:
def get_PreMokaPot_data(file):
    """Load ``file`` with ``dl.clean_metamorph`` and filter it on ``'QValue'``.

    The loaded table is passed through ``filter_data`` using the ``'QValue'``
    column as the sort key, and the filtered table is returned.
    """
    return filter_data(dl.clean_metamorph(file), 'QValue')

In [4]:
def get_data_for_MokaPot(file):
    """Load the pin table for ``file`` and keep one row per ``ScanNr``.

    The table comes from ``dl.get_pin_file``; its first row is skipped and,
    for each ``ScanNr`` value, only the first remaining row is kept.
    """
    pin_rows = dl.get_pin_file(file)
    # NOTE(review): the first row is discarded unconditionally - presumably a
    # non-data row produced by the loader; confirm against dl.get_pin_file.
    return pin_rows.iloc[1:].drop_duplicates(subset=["ScanNr"], keep="first")

In [5]:
def calculate_diff(row):
    """Return ``row['y']`` minus ``row['y_pred']`` for a single row."""
    observed, predicted = row['y'], row['y_pred']
    return observed - predicted

MetaMorpheus gives an output file that is specifically formatted for MokaPot. However, this file does not have q-values, which we need in order to compare the before and after data. Once the decoys are dropped from both files, the regular "before" file and the specially formatted file contain the same scan numbers. The before file is used to compute the maximum number of consecutive y peaks and "perc_in_ladder". These columns are then added to the specially formatted file.

Here we read in the specially formatted file and the file that has the additional feature columns in it. Decoys are dropped from both of them, giving us the same number of scans in both files. We splice out the scan, max_consecutive_y, and "perc_in_ladder" columns. These columns are then joined with the formatted file based on their scan numbers. We can then run this file through MokaPot and check whether the extra feature columns help MokaPot score more PSMs below our cutoff.

In [6]:
def get_addedFeat_data(pin_df, before_df):
    """Build the pin table with an extra ``difference`` feature column.

    Parameters
    ----------
    pin_df
        Passed unchanged to ``dl.get_pin_file`` to load the pin table.
        Despite the name, callers pass an identifier/path, not a DataFrame.
    before_df
        Path of a tab-separated file read with ``pd.read_csv``. It must
        provide the columns ``QValue``, ``scan``, ``y`` and ``y_pred``.

    Returns
    -------
    pandas.DataFrame
        The pin table (first row skipped, one row per ``ScanNr``) outer-joined
        on ``ScanNr`` with ``difference = y - y_pred`` taken from the
        lowest-``QValue`` row of each scan in ``before_df``. ``ScanNr`` is a
        regular column of the result.
    """
    features = pd.read_csv(before_df, sep="\t")

    # One row per scan: after the ascending QValue sort the first row of each
    # scan is the one with the smallest QValue.
    features = (
        features.sort_values('QValue')
        .drop_duplicates(subset=["scan"], keep="first")
        .astype({'y_pred': float})
    )

    # Column-wise subtraction instead of a Python-level call for every row.
    features = features.assign(difference=features['y'] - features['y_pred'])

    # Only the join key and the new feature are carried over to the pin table.
    features = (
        features.rename(columns={"scan": "ScanNr"})
        .filter(['ScanNr', 'difference'])
        .set_index("ScanNr")
    )

    # Prepare the pin table the same way get_data_for_MokaPot does, but with
    # an integer ScanNr so that it matches the key type of the feature table.
    pin = dl.get_pin_file(pin_df)
    pin = pin.iloc[1:]
    pin = pin.astype({"ScanNr": int})
    pin = pin.drop_duplicates(subset=["ScanNr"], keep="first")
    pin = pin.set_index("ScanNr")

    # NOTE(review): an outer join keeps scans that occur in only one of the
    # two tables; those rows carry NaN in the columns of the other table.
    # Confirm both inputs cover the same scans before feeding this to MokaPot.
    joined_df = pin.join(features, how="outer")

    return joined_df.reset_index()
    

In [7]:
def plot_qvalues(qvalues, threshold=0.01, ax=None, **kwargs):
    """Draw a step curve of the cumulative count of entries versus q-value.

    ``qvalues`` is converted to a pandas Series. ``threshold`` sets the right
    edge of the x-axis (plus a 5 % margin). ``ax`` is the axes to draw on;
    ``plt.gca()`` is used when it is None. Extra keyword arguments are passed
    to ``ax.step``. The axes object is returned.
    """
    axes = plt.gca() if ax is None else ax

    # Number of entries at or below each distinct q-value.
    counts = (
        pd.Series(qvalues, name="qvalue")
        .sort_values(ascending=True)
        .to_frame()
    )
    counts["target"] = 1
    counts["num"] = counts["target"].cumsum()
    counts = counts.groupby(["qvalue"]).max().reset_index()[["qvalue", "num"]]

    # Start the curve at a count of zero on the smallest q-value.
    origin = pd.DataFrame({"qvalue": counts["qvalue"][0], "num": 0}, index=[-1])
    curve = pd.concat([origin, counts], sort=True).reset_index(drop=True)

    pad_x = threshold * 0.05
    in_window = curve["qvalue"] <= (threshold + pad_x)
    top = curve.num[in_window].max()
    pad_y = top * 0.05

    # Grow the y-range only when the current upper limit is too small.
    if axes.get_ylim()[1] < top + pad_y:
        axes.set_ylim(0 - pad_y, top + pad_y)

    axes.set_xlim(0 - pad_x, threshold + pad_x)
    axes.set_xlabel("q-value")
    axes.set_ylabel("Discoveries")

    axes.step(curve["qvalue"].values, curve.num.values, where="post", **kwargs)

    return axes

In [8]:
def plot_qvalues(qvalues, threshold=0.01, ax=None, **kwargs):
    """Plot the cumulative number of entries at or below each q-value.

    NOTE(review): an earlier cell defines a function with this same name; the
    definition in this cell is the one in effect after the notebook has been
    run top to bottom. One of the two definitions should be removed.

    Parameters
    ----------
    qvalues
        Values convertible to a pandas Series. Missing values are ignored.
    threshold : float
        Right edge of the x-axis; a margin of 5 % of ``threshold`` is added
        on both sides.
    ax
        Axes to draw on. ``plt.gca()`` is used when None.
    **kwargs
        Forwarded to ``ax.step``.

    Returns
    -------
    The axes that were drawn on. When there are no (non-missing) q-values,
    only the x-limits and axis labels are set and no curve is drawn.
    """
    if ax is None:
        ax = plt.gca()

    xmargin = threshold * 0.05

    # The groupby below discards missing q-values anyway; dropping them up
    # front lets the "nothing to plot" case be detected before the first
    # element is looked up.
    qvals = pd.Series(qvalues, name="qvalue").dropna()
    if qvals.empty:
        ax.set_xlim(0 - xmargin, threshold + xmargin)
        ax.set_xlabel("q-value")
        ax.set_ylabel("Discoveries")
        return ax

    # Calculate cumulative targets at each q-value
    qvals = qvals.sort_values(ascending=True).to_frame()
    qvals["target"] = 1
    qvals["num"] = qvals["target"].cumsum()
    qvals = qvals.groupby(["qvalue"]).max().reset_index()
    qvals = qvals[["qvalue", "num"]]

    # Start the curve at a count of zero on the smallest q-value. Positional
    # access makes the "first row" intent explicit.
    zero = pd.DataFrame({"qvalue": qvals["qvalue"].iloc[0], "num": 0}, index=[-1])
    qvals = pd.concat([zero, qvals], sort=True).reset_index(drop=True)

    # If no q-value lies inside the plotted window, ymax is NaN; the
    # comparison below is then False and the y-limits are left untouched.
    ymax = qvals.num[qvals["qvalue"] <= (threshold + xmargin)].max()
    ymargin = ymax * 0.05

    # Set margins
    curr_ylims = ax.get_ylim()
    if curr_ylims[1] < ymax + ymargin:
        ax.set_ylim(0 - ymargin, ymax + ymargin)

    ax.set_xlim(0 - xmargin, threshold + xmargin)
    ax.set_xlabel("q-value")
    ax.set_ylabel("Discoveries")

    ax.step(qvals["qvalue"].values, qvals.num.values, where="post", **kwargs)

    return ax

In [9]:
# Names of the input files/datasets to process.
# NOTE(review): the benchmark cell further down passes "2ng_rep1" directly to
# the loader functions instead of reading this list - confirm whether that
# cell is meant to iterate over file_names.
file_names = ["2ng_rep1"]

In [10]:
rounds = 0
limit = 25

# Number of PSMs with a mokapot q-value <= 0.01, one entry per round.
no_addedFeats_list = []
addedFeats_list = []

# The input tables are built once and reused by every round.
mm_df = get_PreMokaPot_data("2ng_rep1")
df = get_data_for_MokaPot("2ng_rep1")
join_df = get_addedFeat_data("2ng_rep1", "test.tsv")

for rounds in range(1, limit + 1):
    # Baseline: pin table without the extra feature column.
    mm_for_MP = mokapot.read_pin(df)
    results, models = mokapot.brew(mm_for_MP)

    # Same pin table with the added feature column.
    mm_newFeat_MP = mokapot.read_pin(join_df)
    added_results, added_models = mokapot.brew(mm_newFeat_MP)

    n_without = len(results.psms[results.psms['mokapot q-value'] <= 0.01])
    no_addedFeats_list.append(n_without)
    print("without added feat: " + str(n_without))

    n_with = len(added_results.psms[added_results.psms['mokapot q-value'] <= 0.01])
    addedFeats_list.append(n_with)
    print("with added feat: " + str(n_with))

    print("finished round " + str(rounds))
    

  exec(code_obj, self.user_global_ns, self.user_ns)
  if (await self.run_code(code, result,  async_=asy)):


without added feat: 14673
with added feat: 14315
finished round 1
without added feat: 14654
with added feat: 14090
finished round 2
without added feat: 14148
with added feat: 14637
finished round 3
without added feat: 14339
with added feat: 14113
finished round 4
without added feat: 14678
with added feat: 14383
finished round 5
without added feat: 14289
with added feat: 14622
finished round 6
without added feat: 14369
with added feat: 14405
finished round 7
without added feat: 14691
with added feat: 14530
finished round 8
without added feat: 14674
with added feat: 14339
finished round 9
without added feat: 14412
with added feat: 14134
finished round 10
without added feat: 14404
with added feat: 14335
finished round 11
without added feat: 14383
with added feat: 14345
finished round 12
without added feat: 14622
with added feat: 14647
finished round 13
without added feat: 14402
with added feat: 14134
finished round 14
without added feat: 14383
with added feat: 14249
finished round 15
with

In [11]:
def get_avg(num_list):
    """Return the arithmetic mean of the numbers in ``num_list``.

    ``num_list`` must be a sized collection of numbers. An empty collection
    raises ``ZeroDivisionError``, because the sum is divided by its length.
    """
    # Built-in sum() replaces the hand-written accumulation loop.
    return sum(num_list) / len(num_list)
    
# Report the mean number of accepted PSMs per configuration. The run count in
# the header is taken from the collected results, so it stays correct if the
# number of rounds is changed.
n_runs = len(no_addedFeats_list)
print(f"Average scores after {n_runs} runs:")
no_add = get_avg(no_addedFeats_list)
print("\t" + "MetaMorpheus and MokaPot without additional features: " + str(no_add))

yes_add = get_avg(addedFeats_list)
print("\t" + "MetaMorpheus and MokaPot with additional features of retenetion time difference: " + str(yes_add))

        

Average scores after 25 runs:
	MetaMorpheus and MokaPot without additional features: 14457.08
	MetaMorpheus and MokaPot with additional features of retenetion time difference: 14411.64
