In [1]:
import os
import mokapot
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
sys.path.append("..")
sys.path
import data_loader as dl

In [2]:
def filter_data(df, prob_column='QValue'):
    """Return one target row per scan, chosen by the smallest ``prob_column``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``"decoy"``, ``"scan"`` and ``prob_column``.
    prob_column : str
        Name of the column the rows are ranked by (ascending).

    Returns
    -------
    pandas.DataFrame
        Rows whose ``decoy`` value equals ``False``, sorted by ``prob_column``,
        with only the first row kept for every distinct ``scan``.
    """
    # Keep targets only (rows where decoy == False)
    is_target = df["decoy"] == False  # noqa: E712 - element-wise comparison
    targets = df[is_target]

    # Ascending sort puts the lowest value of prob_column first for each scan ...
    ranked = targets.sort_values(prob_column)

    # ... so keeping the first occurrence keeps the best-scoring row per scan
    return ranked.drop_duplicates(subset=["scan"], keep="first")

In [3]:
def get_PreMokaPot_data(file):
    """Load the pre-mokapot MetaMorpheus table for ``file`` and filter it.

    ``file`` is handed to ``dl.clean_metamorph``; the frame it returns is then
    passed through ``filter_data`` using the ``"QValue"`` column.
    """
    return filter_data(dl.clean_metamorph(file), "QValue")


Reading in the data. MetaMorpheus writes an output file that is formatted specifically for Percolator; that is the file we use here.

In [4]:
def get_data_for_MokaPot(file):
    """Read a tab-separated file and return it without its first data row.

    Parameters
    ----------
    file : str or path-like
        Location of the tab-delimited file handed to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Every row after the first one; the original row labels are kept.
    """
    # get help so that this isn't hardcoded in
    pin_table = pd.read_csv(file, sep="\t")

    # The first row after the header is not a real record, so skip it
    return pin_table.iloc[1:]

In [5]:
def plot_qvalues(qvalues, threshold=0.01, ax=None, **kwargs):
    """Draw a step curve of the cumulative number of entries versus q-value.

    NOTE(review): plot_qvalues is defined again in the following cell; after a
    top-to-bottom run the later definition is the one in effect. Consider
    keeping only one of the two.

    Parameters
    ----------
    qvalues : array-like
        The q-values to accumulate. Missing values are ignored. An empty (or
        all-missing) input is accepted and produces a single point at (0, 0).
    threshold : float
        Upper end of the x-axis; a 5% margin is added on both sides.
    ax : axes-like, optional
        Axes to draw on. When omitted, ``plt.gca()`` is used.
    **kwargs
        Forwarded to ``ax.step``.

    Returns
    -------
    The axes that were drawn on.
    """
    if ax is None:
        ax = plt.gca()

    # Missing q-values never reach the curve (groupby drops them), so discard
    # them up front; this also lets the empty case be detected early.
    qvals = pd.Series(qvalues, name="qvalue").dropna()

    if qvals.empty:
        # Nothing to accumulate: a lone origin point keeps the axes set-up
        # below working instead of failing on a lookup of the first q-value.
        qvals = pd.DataFrame({"qvalue": [0.0], "num": [0]})
    else:
        # Calculate cumulative targets at each q-value
        qvals = qvals.sort_values(ascending=True).to_frame()
        qvals["target"] = 1
        qvals["num"] = qvals["target"].cumsum()
        # Tied q-values collapse to one point carrying the highest running count
        qvals = qvals.groupby(["qvalue"]).max().reset_index()
        qvals = qvals[["qvalue", "num"]]

        # Prepend a zero-count point at the smallest q-value so the curve starts at 0
        zero = pd.DataFrame({"qvalue": qvals["qvalue"].iloc[0], "num": 0}, index=[-1])
        qvals = pd.concat([zero, qvals], sort=True).reset_index(drop=True)

    xmargin = threshold * 0.05
    # ymax is NaN when no point lies inside the visible window; the comparison
    # below is then False and the current y-limits are left untouched.
    ymax = qvals.num[qvals["qvalue"] <= (threshold + xmargin)].max()
    ymargin = ymax * 0.05

    # Set margins (only ever grow the y-range, never shrink it)
    curr_ylims = ax.get_ylim()
    if curr_ylims[1] < ymax + ymargin:
        ax.set_ylim(0 - ymargin, ymax + ymargin)

    ax.set_xlim(0 - xmargin, threshold + xmargin)
    ax.set_xlabel("q-value")
    ax.set_ylabel("Discoveries")

    ax.step(qvals["qvalue"].values, qvals.num.values, where="post", **kwargs)

    return ax

In [6]:
def plot_qvalues(qvalues, threshold=0.01, ax=None, **kwargs):
    """Draw a step curve of the cumulative number of entries versus q-value.

    NOTE(review): this re-defines plot_qvalues from the preceding cell and
    shadows it. Consider keeping only one of the two definitions.

    Parameters
    ----------
    qvalues : array-like
        The q-values to accumulate. Missing values are ignored. An empty (or
        all-missing) input is accepted and produces a single point at (0, 0).
    threshold : float
        Upper end of the x-axis; a 5% margin is added on both sides.
    ax : axes-like, optional
        Axes to draw on. When omitted, ``plt.gca()`` is used.
    **kwargs
        Forwarded to ``ax.step``.

    Returns
    -------
    The axes that were drawn on.
    """
    if ax is None:
        ax = plt.gca()

    # Missing q-values never reach the curve (groupby drops them), so discard
    # them up front; this also lets the empty case be detected early.
    qvals = pd.Series(qvalues, name="qvalue").dropna()

    if qvals.empty:
        # Nothing to accumulate: a lone origin point keeps the axes set-up
        # below working instead of failing on a lookup of the first q-value.
        qvals = pd.DataFrame({"qvalue": [0.0], "num": [0]})
    else:
        # Calculate cumulative targets at each q-value
        qvals = qvals.sort_values(ascending=True).to_frame()
        qvals["target"] = 1
        qvals["num"] = qvals["target"].cumsum()
        # Tied q-values collapse to one point carrying the highest running count
        qvals = qvals.groupby(["qvalue"]).max().reset_index()
        qvals = qvals[["qvalue", "num"]]

        # Prepend a zero-count point at the smallest q-value so the curve starts at 0
        zero = pd.DataFrame({"qvalue": qvals["qvalue"].iloc[0], "num": 0}, index=[-1])
        qvals = pd.concat([zero, qvals], sort=True).reset_index(drop=True)

    xmargin = threshold * 0.05
    # ymax is NaN when no point lies inside the visible window; the comparison
    # below is then False and the current y-limits are left untouched.
    ymax = qvals.num[qvals["qvalue"] <= (threshold + xmargin)].max()
    ymargin = ymax * 0.05

    # Set margins (only ever grow the y-range, never shrink it)
    curr_ylims = ax.get_ylim()
    if curr_ylims[1] < ymax + ymargin:
        ax.set_ylim(0 - ymargin, ymax + ymargin)

    ax.set_xlim(0 - xmargin, threshold + xmargin)
    ax.set_xlabel("q-value")
    ax.set_ylabel("Discoveries")

    ax.step(qvals["qvalue"].values, qvals.num.values, where="post", **kwargs)

    return ax

In [7]:
# Inputs for the run: file_names[i] labels the data set that
# MokaPot_input[i] (the Percolator-formatted PSM file) belongs to.
file_names = [
    "2ng",
]
MokaPot_input = [
    "Ex_Auto_DrM3_30umT4_2ngQC_60m_half_PSMsFormattedForPercolator.tab",
]

Here we are going to start our data processing

1. First we need the 'before' data, which comes from a 2 ng MetaMorpheus file.
We have to use a different file for the 'before' data than the one we send to mokapot, because
the file we send to mokapot does not contain any probability values.

Now we're getting a data file to send into mokapot

In [15]:
# Run mokapot on every Percolator-formatted input and save the re-scored PSMs.
# mm_df, results and models are overwritten on each pass, so the cells that
# follow only see the values from the last file in the lists.
output_dir = "MokaPot_Output"
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before the first write.
os.makedirs(output_dir, exist_ok=True)

for num in range(len(MokaPot_input)):
    # 'Before' data: the MetaMorpheus results, filtered to one target row per scan
    mm_df = get_PreMokaPot_data(file_names[num])

    # Percolator-formatted PSMs for the same data set, handed to mokapot
    df = get_data_for_MokaPot(MokaPot_input[num])
    psms = mokapot.read_pin(df)
    results, models = mokapot.brew(psms)

    results_df = results.psms
    results_df.to_csv(output_dir + "/metamorpheus_" + file_names[num] + ".csv")



Graphing the results: a comparison of the data before it was run through MokaPot with the data after it was run through MokaPot.

In [None]:
# Draw both q-value curves on the current pyplot axes: first the MetaMorpheus
# q-values via the notebook's plot_qvalues helper, then the mokapot results.
plot_qvalues(mm_df["QValue"], label="Pre-mokapot")
plt.title("Mokapot vs MetaMorpheus")
results.plot_qvalues(label="mokapot")
# The explicit list below supplies the legend text, in plotting order.
plt.legend(["preMoka", "Mokapot"])
#plt.vlines(x = 0.01, ymin = 0, ymax = 16000)
plt.tight_layout()

plt.show()

In [None]:
# Report how many PSMs pass the 0.01 q-value cut-off before and after mokapot.
# The filters keep rows with q-value <= 0.01, so the heading says "at or below".
print("The number of PSMs found at or below a q-value of 0.01: ")

print("\t" + "MetaMorpheus: " + str(len(mm_df[mm_df['QValue'] <= 0.01])))

print("\t" + "MetaMorpheus and MokaPot: " + str(len(results.psms[results.psms['mokapot q-value'] <= 0.01])))