In [None]:
import os
from collections import defaultdict

import mokapot
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from wispy import theme

# NOTE(review): presumably activates the wispy "paper" plotting theme and
# returns its palette -- confirm against the wispy package.
pal = theme.paper()

# Figure dimensions. The widths are given in millimetres and converted to
# inches (25.4 mm per inch); HEIGHT is presumably already in inches.
TWO_COL = 180 / 25.4
HEIGHT = 3.5
ONE_COL = 88 / 25.4

# Keep only the ".txt" tables in mokapot-out whose name contains ".modified".
res_files = []
for fname in os.listdir("mokapot-out"):
    if ".modified" in fname and fname.endswith(".txt"):
        res_files.append(fname)
res_files

print(TWO_COL)

## Load the confidence estimates

In [None]:
# Read every result table, tag it with the model it came from, and bucket the
# tables by confidence level.
tables_by_level = defaultdict(list)
for fname in res_files:
    # File names must have exactly five dot-separated fields: the first is the
    # model name and the fourth is the level.
    model, _, _, level, _ = fname.split(".")
    table = pd.read_csv(os.path.join("mokapot-out", fname), sep="\t")
    tables_by_level[level].append(table.assign(model=model))

# One concatenated frame per level, covering all models.
disc = {level: pd.concat(tables) for level, tables in tables_by_level.items()}
disc.keys()

## Number of Accepted Discoveries (q-value ≤ 0.01)

In [None]:
# For each level, count the rows each model accepts at q-value <= 0.01 and
# report the absolute and relative gain of XGBoost over the linear SVM.
for level, df in disc.items():
    passing = df[df["mokapot q-value"] <= 0.01]
    groups = passing.groupby("model")["mokapot q-value"].count()
    n_linear = groups["linear"]
    gain = groups["xgb"] - n_linear

    print(level)
    print(groups)
    print(gain, gain / n_linear)
    print("\n")

## Feature Importance

In [None]:
# Load the tab-separated feature-importance table and add a readable model
# label that is used for the legend.
imp = pd.read_csv("featimp-out/importance.txt", sep="\t")
new_labs = {"linear": "Linear SVM", "xgb": "XGBoost"}
imp["mokapot model"] = imp["model"].apply(new_labs.get)

# Sum only the numeric "importance" column. Summing the whole frame also
# "sums" (concatenates) the string columns on pandas >= 2.0, and dividing
# those by 5 raises a TypeError.
# NOTE(review): the divisor 5 is kept from the original; presumably the number
# of importance estimates per feature and model -- confirm.
sums = imp.groupby("model")[["importance"]].sum() / 5

# Normalise each importance by its model's total (vectorised lookup instead of
# a row-wise apply).
imp["norm_imp"] = imp["importance"] / imp["model"].map(sums["importance"])

fig, ax = plt.subplots(figsize=(TWO_COL, 6))
sns.barplot(data=imp, x="norm_imp", y="feature", hue="mokapot model", ax=ax)
ax.set_ylabel("Feature")
ax.set_xlabel("Relative Feature Importance")
fig.tight_layout()
fig.savefig("figures/feature_importance.png", dpi=300)

## Make a Mass Shift Histogram

In [None]:
# NOTE: `x` is the same object as disc["psms"], so the "massdiff" column is
# also added to that shared frame (behaviour kept from the original).
x = disc["psms"]

# The mass shift is the text after the last "[" of the peptide string with any
# "]" removed. Both patterns are matched literally: on pandas >= 2.0
# `str.replace` defaults to regex=False, so the old "\]" pattern never matched
# and the float conversion failed on values such as "79.97]".
# NOTE(review): assumes every peptide string ends in a bracketed mass -- confirm.
x["massdiff"] = (
    x["Peptide"]
    .str.rsplit("[", n=1)
    .str[-1]
    .str.replace("]", "", regex=False)
    .astype(float)
)

# Keep the mokapot models only (drop "fragger") and the PSMs accepted at
# q-value <= 0.01.
x = x.loc[(x["model"] != "fragger") & (x["mokapot q-value"] <= 0.01), :]

# Count the accepted PSMs per mass shift, one column per model. A mass shift
# absent for a model counts as 0.
sums = pd.concat(
    {mod: group["massdiff"].value_counts() for mod, group in x.groupby("model")},
    axis=1,
)
sums = sums.fillna(0).sort_index()

# PSMs gained by XGBoost over the linear SVM at each mass shift.
diff = (sums["xgb"] - sums["linear"]).sort_index(ascending=True)

# Display the mass shifts with the largest gains first.
diff.sort_values(ascending=False)

## Plot the Main Figure

In [None]:
fig = plt.figure(figsize=(TWO_COL, 4))
gs = fig.add_gridspec(2, 3)

# Legend text for each scoring model and axis text for each confidence level.
score = {"fragger": "E-value", "linear": "Linear SVM", "xgb": "XGBoost"}
labels = {"psms": "PSMs", "peptides": "Peptides", "proteins": "Proteins"}

# Top row: one q-value panel per level (PSMs, peptides, proteins) with one
# curve per model.
for idx, level in enumerate(labels):
    qval_ax = fig.add_subplot(gs[0, idx])
    for model, df in disc[level].groupby("model"):
        mokapot.plot_qvalues(df["mokapot q-value"], label=score[model], ax=qval_ax)

    qval_ax.set_ylabel(f"Modified {labels[level]}")
    qval_ax.legend(fontsize="small")

# Bottom row: PSMs gained per mass shift, annotated at these shifts:
#   79.97  = Phosphorylation
#   43.01  = Carbamylation
#   958.13 = ACU
#   0.98   = Deamidation
#   653.09 = AU
offset = 10
annotations = [
    ("Phosphorylation\n(+79.97)", (79.97 + offset, 200), {}),
    ("AU\n(+653.09)", (653.09, 120), {"ha": "right"}),
    ("ACU\n(+958.13)", (958.13, 120), {"ha": "right"}),
    ("Carbamylation\n(+43.01)", (43.01, 180), {"ha": "right"}),
    ("Deamidation\n(+0.98)", (0.98 - offset, 75), {"ha": "right"}),
]

ax = fig.add_subplot(gs[1, :])
ax.plot(diff.index, diff.values)
ax.set_xlabel("Mass shift (Da)")
ax.set_ylabel("PSMs gained")
for text, xy, text_kwargs in annotations:
    ax.annotate(text, xy, **text_kwargs)

fig.align_ylabels()

# Letter each panel just outside the upper-left corner of its axes.
for panel, letter in zip(fig.axes, "abcd"):
    panel.annotate(
        letter,
        (-10, 105),
        xycoords="axes points",
        fontweight="bold",
        va="top",
        ha="right",
    )

plt.tight_layout()

plt.savefig("figures/oms_figure.png", dpi=300)