In [1]:
import pandas as pd

from denovo_utils.analysis.evaluation import (
    calculate_all_accuracy_metrics,
    subset_spectra_on_psmtype,
    evaluate_prediction,
    load_and_preprocess,
    annotate_GMM_clusters,
    filter_gmm_cluster,
    calculate_metrics
)


from matplotlib import pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm

import json
import os

In [2]:
from functools import partialmethod

# Replace the tqdm constructor with a partial that passes disable=True by
# default, so tqdm instances created after this cell runs are disabled.
# NOTE(review): re-running this cell wraps the already-patched constructor again.
tqdm.__init__ = partialmethod(tqdm.__init__, disable=True)

# Filtering all PSMs with Gaussian mixture models

Filter the PSM lists while recording some metrics along the way.

# 1. Load data

In [3]:
# Base names of the files to process. The numbered fraction / replicate
# names are generated instead of being spelled out one by one.
filenames = [
    *(f"F01_Fraction{k}" for k in range(1, 5)),
    "F06",
    *(f"F07_Fraction{k}" for k in range(1, 5)),
    *(f"F08_Rep{k}" for k in range(1, 4)),
    # "S03" is left out: missing in PT1 (casanovo instanovo ... --> rerun).
    # "S05" is left out as well.
    "S07",
    "S08",
    *(f"S11_Fraction{k}" for k in range(1, 5)),
    *(f"S14_Rep{k}" for k in range(1, 4)),
]

# Engine names handed to `calculate_metrics`; the order is kept as is.
engines = [
    "Casanovo4.2.0",
    "InstaNovo",
    "PepNet",
    "ContraNovo",
    "NovoB",
    "Novor",
    "PepNovo+",
]

In [4]:
# Root folder of the project data; the filtered PSM lists and their metrics
# are written to a sub-folder of it.
root_data = "/home/samva/Doctorate/data_directory/denovo_project"
save_directory = f"{root_data}/denovo_results/psmlists"

# For a single file

In [5]:
# Read everything for one file into a single dataframe, with the basic
# preprocessing already applied.
df = load_and_preprocess(filename="F01_Fraction1", root=root_data)

['Casanovo4.2.0', 'InstaNovo', 'PepNet', 'ContraNovo']
['NovoB', 'Novor', 'PepNovo+']
['sage']


In [6]:
# Fit GMMs and assign cluster labels to PSMs.
# The result is unpacked into two values: the first replaces `df`, the second
# is kept as `gmm_metrics`. Because `df` is rebound, re-running this cell
# feeds the already-annotated dataframe back into the function.
# NOTE(review): "sage" is presumably the database search engine used as the
# reference — confirm against `annotate_GMM_clusters`.
df, gmm_metrics = annotate_GMM_clusters(df, db_engine="sage", only_df=False)

In [7]:
# Keep only the PSMs whose GMM cluster label is 1.
df_filtered = filter_gmm_cluster(df=df, keep_cluster_label=1)

# Compute the metrics of the filtered PSMs for the configured engines.
metrics = calculate_metrics(df_filtered, engines=engines)

In [8]:
# Write the filtered PSMs of this file to `<save_directory>/df/`, the same
# place the all-files loop uses. The path is built from `save_directory`
# rather than repeating the absolute path, so the two cannot drift apart.
df_filtered.to_csv(
    os.path.join(save_directory, "df", "F01_Fraction1.csv"),
    index=False,
)

# For all files

In [None]:
# Three separate empty dicts for the all-files section. Note that `metrics`
# and `df_filtered` rebind the names used by the single-file cells above.
metrics, metrics_gmm, df_filtered = {}, {}, {}

In [5]:
import json
def save_to_json(data, path):
    """Write ``data`` to ``path`` as JSON indented by four spaces.

    The JSON text is produced before the file is opened, so data that cannot
    be serialised raises without creating or truncating ``path``.

    Parameters
    ----------
    data
        Any object accepted by ``json.dumps``.
    path
        Destination file; it is overwritten if it already exists.

    Raises
    ------
    TypeError, ValueError
        If ``json.dumps`` cannot serialise ``data``.
    OSError
        If ``path`` cannot be opened for writing.
    """
    # Serialise first: opening in "w" mode truncates the file immediately,
    # so a failure afterwards would leave an empty file behind.
    json_text = json.dumps(data, indent=4)
    with open(path, "w") as f:
        f.write(json_text)

In [6]:
# Resolve the two output folders once and make sure they exist, so that a
# missing directory cannot abort the run after a file has already been
# loaded and processed.
df_directory = os.path.join(save_directory, "df")
metrics_directory = os.path.join(save_directory, "metrics")
os.makedirs(df_directory, exist_ok=True)
os.makedirs(metrics_directory, exist_ok=True)

for filename in filenames:
    print(filename)

    # Load data, parse and preprocess a bit
    df = load_and_preprocess(
        root=root_data,
        filename=filename
    )

    # Fit GMMs and assign cluster labels to PSMs, then keep cluster label 1
    df, metrics_gmm_file = annotate_GMM_clusters(df, db_engine="sage", only_df=False)
    df_filtered_file = filter_gmm_cluster(df=df, keep_cluster_label=1)

    # Get some accuracy metrics for evaluation of the filtering
    metrics_file = calculate_metrics(
        df_filtered_file,
        engines=engines
    )

    # Store the filtered PSMs as <save_directory>/df/<filename>.csv
    df_filtered_file.to_csv(
        os.path.join(df_directory, filename + ".csv"),
        index=False,
    )

    # Store both metric objects as JSON under <save_directory>/metrics/
    save_to_json(
        metrics_file,
        os.path.join(metrics_directory, filename + "_accuracy.json"),
    )
    save_to_json(
        metrics_gmm_file,
        os.path.join(metrics_directory, filename + "_gmm.json"),
    )

F01_Fraction1
['Casanovo4.2.0', 'InstaNovo', 'PepNet', 'ContraNovo']
['NovoB', 'Novor', 'PepNovo+']
['sage']
F01_Fraction2
['Casanovo4.2.0', 'InstaNovo', 'PepNet', 'ContraNovo']
['NovoB', 'Novor', 'PepNovo+']
['sage']
F01_Fraction3
['Casanovo4.2.0', 'InstaNovo', 'PepNet', 'ContraNovo']
['NovoB', 'Novor', 'PepNovo+']
['sage']
F01_Fraction4
['Casanovo4.2.0', 'InstaNovo', 'PepNet', 'ContraNovo']
['NovoB', 'Novor', 'PepNovo+']
['sage']
F06
['Casanovo4.2.0', 'InstaNovo', 'PepNet', 'ContraNovo']
['NovoB', 'Novor', 'PepNovo+']
['sage']
F07_Fraction1
['Casanovo4.2.0', 'InstaNovo', 'PepNet', 'ContraNovo']
['NovoB', 'Novor', 'PepNovo+']
['sage']
F07_Fraction2
['Casanovo4.2.0', 'InstaNovo', 'PepNet', 'ContraNovo']
['NovoB', 'Novor', 'PepNovo+']
['sage']
F07_Fraction3
['Casanovo4.2.0', 'InstaNovo', 'PepNet', 'ContraNovo']
['NovoB', 'Novor', 'PepNovo+']
['sage']
F07_Fraction4
['Casanovo4.2.0', 'InstaNovo', 'PepNet', 'ContraNovo']
['NovoB', 'Novor', 'PepNovo+']
['sage']
F08_Rep1
['Casanovo4.2.0', 'I