In [1]:
import pyopenms as oms
from pyteomics import mgf
import os
import pandas as pd
from pyteomics.mass import calculate_mass
from psm_utils import PSM, PSMList

from spectrum_utils.proforma import Proteoform, Modification
from spectrum_utils.fragment_annotation import get_theoretical_fragments
from psm_utils import Peptidoform
from spectrum_utils import proforma

from denovo_utils.analysis import calculate_hyperscore
from denovo_utils.parsers import proforma_to_theoretical_spectrum
from denovo_utils.parsers.converters import SpectralisParser, DenovoEngineConverter
from ms2rescore.feature_generators import BasicFeatureGenerator, MS2PIPFeatureGenerator, DeepLCFeatureGenerator

import deeplc
from deeplc.plot import scatter
from ms2rescore.report.charts import (
    calculate_feature_qvalues,
    feature_ecdf_auc_bar,
    fdr_plot,
    ms2pip_correlation,
)

from tqdm import tqdm

import spectrum_utils.plot as sup

from denovo_utils.utils.pandas import get_psm_type, get_spectralis_score
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np
%matplotlib inline

2024-08-09 15:30:45.359914: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-09 15:30:45.359948: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-09 15:30:45.360792: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Modification already exists in ModificationsDB. Skipping.


Here, we split the spectra into categories depending on how the predictions of the de novo tools match the reference (Sage) identification of the same spectrum.

Either they are:
- Perfectly matched on sequence level
- Isobarically matched (indistinguishable at the spectrum level)
- Better match (Based on the hyperscore)
- Worse match (Based on the hyperscore)
- Extra predictions (No fasta match)

To save memory, the dataframe is restricted to the following columns:
- proforma
- sequence
- spectrum_id
- run
- engine
- score
- qvalue
- spectralis_score
- hyperscore
- is_decoy
- has_modification
- rescoring_features

Additionally, only spectra with a Sage PSM below 1% FDR (q-value < 0.01) are kept!

In [2]:
def peptidoform_has_modification(peptidoform, allowed_modificiations=["[UNIMOD:4]", "[UNIMOD:35]"]):
    """Tell whether a peptidoform carries a modification outside the allowed set.

    Parameters
    ----------
    peptidoform
        Any object exposing its ProForma notation as the string attribute
        ``proforma``.
    allowed_modificiations
        Modification tags (exact substrings of the ProForma string) that do
        not count as a modification. The default list is only read, never
        mutated.

    Returns
    -------
    bool
        ``True`` if a ``[`` is still present in the ProForma string once every
        allowed tag has been stripped from it, ``False`` otherwise.
    """
    remaining = peptidoform.proforma
    # Tags are stripped one after the other, in the order they were given.
    for tag in allowed_modificiations:
        remaining = remaining.replace(tag, "")

    # Any bracket left over belongs to a tag that is not in the allowed set.
    has_other_tag = "[" in remaining
    return has_other_tag

def hyperscore_difference(row, reference):
    """Return the hyperscore of ``row`` minus the reference hyperscore.

    Parameters
    ----------
    row
        Mapping-like object (e.g. a dataframe row) providing the keys
        ``"spectrum_id"`` and ``"hyperscore"``.
    reference
        Mapping from spectrum id to the reference hyperscore.

    Returns
    -------
    float or None
        ``float(row["hyperscore"]) - float(reference[spectrum_id])``, or
        ``None`` when the spectrum has no reference entry or one of the two
        scores cannot be converted to ``float`` (for instance ``None``).
    """
    try:
        reference_hyperscore = reference[row["spectrum_id"]]
        return float(row["hyperscore"]) - float(reference_hyperscore)
    except (LookupError, TypeError, ValueError):
        # Only the expected failure modes (missing key, unconvertible score)
        # map to None. A bare ``except`` would also swallow KeyboardInterrupt
        # and SystemExit, making a long row-wise ``apply`` impossible to stop.
        return None
    
def evaluate_prediction_isobaricity(
        row, ground_truth_peptide, ground_truth_hyperscore
):
    """Classify a PSM against the ground-truth identification of its spectrum.

    Parameters
    ----------
    row
        Mapping-like object (e.g. a dataframe row) providing the keys
        ``"spectrum_id"``, ``"peptide"`` and ``"hyperscore"``.
    ground_truth_peptide
        Mapping from spectrum id to the ground-truth peptide sequence.
    ground_truth_hyperscore
        Mapping from spectrum id to the ground-truth hyperscore.

    Returns
    -------
    str
        - ``"Match"``: the peptide equals the ground-truth peptide.
        - ``"Isobaric"``: different peptide, exactly equal hyperscore.
        - ``"Better"`` / ``"Worse"``: different peptide with a higher / lower
          hyperscore than the ground truth.
        - ``"Error?"``: the two hyperscores are unordered (e.g. one is NaN).
        - ``"Unpredicted"``: the spectrum has no ground-truth entry, a
          required key is missing, or the scores cannot be compared
          (for instance one of them is ``None``).
    """
    try:
        spectrum_id = row["spectrum_id"]
        if ground_truth_peptide[spectrum_id] == row["peptide"]:
            return "Match"

        ref_hyperscore = ground_truth_hyperscore[spectrum_id]
        hyperscore = row["hyperscore"]

        if hyperscore == ref_hyperscore:
            return "Isobaric"
        if hyperscore > ref_hyperscore:
            return "Better"
        if hyperscore < ref_hyperscore:
            return "Worse"

        # None of ==, >, < held: the scores are unordered (NaN involved).
        return "Error?"

    except (LookupError, TypeError, ValueError):
        # Expected failure modes only. A bare ``except`` would also swallow
        # KeyboardInterrupt/SystemExit during a long row-wise ``apply``.
        return "Unpredicted"

In [3]:
# Runs to process, in processing order. Each entry is the stem of an MGF file
# and of the matching "<stem>_annotated_rescoring.csv" result files.
# Entries that are commented out are currently excluded.
filenames = [
    # "F01_Fraction2",
    # "F01_Fraction4",
    "S14_Rep2",
    "S14_Rep1",
    # "S08",
    "S03",
    "S14_Rep3",
    "F07_Fraction4",
    "S11_Fraction3",
    "S11_Fraction1",
    "S07",
    "F07_Fraction3",
    "F08_Rep2",
    "F07_Fraction2",
    "F07_Fraction1",
    "S11_Fraction2",
    "S11_Fraction4",
    "F08_Rep1",
    "F01_Fraction1",
    "F01_Fraction3",
    "F08_Rep3",
    "F06",
    # "S05",
]

# Columns intended to be kept in the per-run result frame.
# NOTE(review): this list is not referenced by the processing loop in this
# cell, which builds a "peptide" column and reads a "source" column; whether
# "sequence" and "engine" exist in that frame is not visible here. Confirm the
# names before using this list for column selection.
keep_cols = [
    # peptide identity
    "proforma", "sequence",
    # spectrum provenance
    "spectrum_id", "run", "engine",
    # scores
    "score", "qvalue", "spectralis_score", "hyperscore",
    # flags and the raw feature dictionary
    "is_decoy", "has_modification", "rescoring_features",
]

# Build one filtered, annotated PSM table per run and pickle it.
#
# For every run: parse the Spectralis-rescored results, keep only spectra with
# a confident Sage PSM, add basic features and a hyperscore to every PSM,
# label each PSM relative to the Sage identification of its spectrum, and
# write the resulting dataframe to ./filtered_results/<run>.pkl.

# `DataFrame.progress_apply` only exists after tqdm has been registered with
# pandas. Do it explicitly so the cell does not depend on an imported package
# having done so as a side effect.
tqdm.pandas()

# Loop-invariant locations.
root_data="/home/samva/Doctorate/data_directory/denovo_project"
results_dir=os.path.join(root_data, "denovo_results")
output_dir = "./filtered_results"
# Make sure the target directory exists before the first pickle is written.
os.makedirs(output_dir, exist_ok=True)

for filename in filenames:
    print(filename)
    mgf_path=os.path.join(root_data, "mgf_filtered", filename + ".mgf")

    parser_spectralis = SpectralisParser(
        mgf_path=mgf_path,
        results_dir=results_dir
    )

    # The Spectralis output is split over three batches:
    #   pt1: Casanovo, instanovo, pepnet, contranovo ran together
    #   pt2: NovoB, Novor, PepNovo+ ran together
    #   pt3: Sage results ran separately
    for part in ("pt1", "pt2", "pt3"):
        parser_spectralis.parse(
            path_spectralis=os.path.join(
                results_dir,
                "refinement/spectralis/" + part, filename + "_annotated_rescoring.csv"
            )
        )

    psmlist = parser_spectralis.psmlist
    psmlist["run"] = [filename]*len(psmlist)

    # Missing decoy flags are replaced by False. `== None` rather than
    # `is None` is deliberate: the comparison must be element-wise
    # (presumably `psmlist["is_decoy"]` is a numpy array - TODO confirm).
    decoy_status = psmlist["is_decoy"]
    decoy_status = np.where(decoy_status == None, False, decoy_status)  # noqa: E711
    psmlist["is_decoy"] = decoy_status

    # PSMs without a q-value get q = 1 so they never pass the threshold below.
    psmlist["qvalue"] = [1 if x is None else x for x in psmlist["qvalue"]]
    # Spectra that have a Sage PSM with q-value < 0.01.
    spectrum_ids_to_keep = psmlist[
        (psmlist["source"]=="sage") &
        (psmlist["qvalue"]<.01)
    ]["spectrum_id"]

    basic_fgen = BasicFeatureGenerator()
    basic_fgen.add_features(psmlist)
    print("Added basic features.")
    # MS2PIP and DeepLC feature generation is currently disabled.

    # Close the MGF reader as soon as scoring is done; the previous version
    # left one reader open per processed run.
    with mgf.read(mgf_path) as mgf_file:
        for psm in tqdm(psmlist):
            hyperscore = calculate_hyperscore(
                psm=psm,
                mgf_file=mgf_file,
                engine="pyopenms"
            )
            psm["rescoring_features"].update(
                {"hyperscore": hyperscore}
            )
    print("Added hyperscore.")

    df = psmlist.to_dataframe()
    df = df[df.spectrum_id.isin(spectrum_ids_to_keep)].reset_index(drop=True)
    df["spectralis_score"] = df.apply(get_spectralis_score, axis=1)
    # The index was reset above, so the frame expanded from the feature dicts
    # (default RangeIndex) aligns row for row with `df`.
    df["hyperscore"] = pd.DataFrame(df["rescoring_features"].tolist())["hyperscore"]
    df["psm_type"] = df.apply(get_psm_type, axis=1)
    df["proforma"] = df.peptidoform.apply(lambda x: x.proforma)
    df["peptide"] = df.peptidoform.apply(lambda x: x.sequence)
    df["has_modification"] = df.peptidoform.apply(peptidoform_has_modification)

    # The Sage PSM of each spectrum is the ground truth: spectrum_id -> value.
    ground_truth_peptide = df.loc[df.source=="sage", ["spectrum_id", "peptide"]].set_index("spectrum_id").to_dict()["peptide"]
    ground_truth_hyperscore = df.loc[df.source=="sage", ["spectrum_id", "hyperscore"]].set_index("spectrum_id").to_dict()["hyperscore"]

    df["match_type"] = df.progress_apply(
        lambda x: evaluate_prediction_isobaricity(
            x,
            ground_truth_peptide=ground_truth_peptide,
            ground_truth_hyperscore=ground_truth_hyperscore
        ), axis=1
    )

    # NOTE(review): `keep_cols` is defined in this cell but never applied, so
    # the full frame is pickled. Confirm whether a column selection was
    # intended here.
    df.to_pickle(os.path.join(output_dir, filename+".pkl"))

S14_Rep2


100%|██████████| 60809/60809 [00:01<00:00, 33569.26it/s]
100%|██████████| 60809/60809 [00:00<00:00, 159585.18it/s]
100%|██████████| 60012/60012 [00:01<00:00, 37496.70it/s]
100%|██████████| 60012/60012 [00:00<00:00, 164001.21it/s]
100%|██████████| 70634/70634 [00:01<00:00, 47460.05it/s]
100%|██████████| 70634/70634 [00:00<00:00, 161933.26it/s]
100%|██████████| 30624/30624 [00:00<00:00, 45063.59it/s]
100%|██████████| 30624/30624 [00:00<00:00, 162842.85it/s]
100%|██████████| 57865/57865 [00:02<00:00, 23878.24it/s]
100%|██████████| 56089/56089 [00:00<00:00, 163303.58it/s]
100%|██████████| 2021/2021 [00:00<00:00, 39980.42it/s]
100%|██████████| 2021/2021 [00:00<00:00, 153707.99it/s]
100%|██████████| 2430/2430 [00:00<00:00, 38790.77it/s]
100%|██████████| 2430/2430 [00:00<00:00, 155413.29it/s]
100%|██████████| 55537/55537 [00:00<00:00, 147367.67it/s]


Added basic features.


  0%|          | 483/338156 [00:00<01:09, 4825.13it/s]



  2%|▏         | 6300/338156 [00:01<01:19, 4197.72it/s]



  2%|▏         | 7522/338156 [00:01<01:24, 3911.50it/s]



  4%|▎         | 12365/338156 [00:03<01:26, 3768.46it/s]



  5%|▌         | 17114/338156 [00:04<01:31, 3525.55it/s]



 10%|▉         | 33764/338156 [00:09<01:34, 3212.54it/s]



 11%|█         | 36706/338156 [00:10<01:32, 3267.46it/s]



 12%|█▏        | 40881/338156 [00:11<01:35, 3106.36it/s]



 14%|█▎        | 46206/338156 [00:13<01:29, 3266.66it/s]



 57%|█████▋    | 192929/338156 [00:52<00:32, 4487.25it/s]



 57%|█████▋    | 193815/338156 [00:53<00:33, 4304.39it/s]



 58%|█████▊    | 195095/338156 [00:53<00:34, 4090.64it/s]



 58%|█████▊    | 195907/338156 [00:53<00:35, 3997.85it/s]



 58%|█████▊    | 197494/338156 [00:53<00:36, 3872.51it/s]



 59%|█████▊    | 198265/338156 [00:54<00:37, 3757.74it/s]



 59%|█████▉    | 199013/338156 [00:54<00:37, 3708.84it/s]



 59%|█████▉    | 199747/338156 [00:54<00:38, 3603.97it/s]



 59%|█████▉    | 200466/338156 [00:54<00:40, 3422.11it/s]



 59%|█████▉    | 201154/338156 [00:55<00:40, 3396.86it/s]



 60%|█████▉    | 202177/338156 [00:55<00:40, 3396.91it/s]



 60%|██████    | 203193/338156 [00:55<00:40, 3358.13it/s]



 60%|██████    | 204205/338156 [00:55<00:40, 3342.02it/s]



 61%|██████    | 204871/338156 [00:56<00:40, 3301.73it/s]



 61%|██████    | 205530/338156 [00:56<00:40, 3262.48it/s]



 61%|██████▏   | 207170/338156 [00:56<00:40, 3249.32it/s]



 62%|██████▏   | 208150/338156 [00:57<00:40, 3238.99it/s]



 62%|██████▏   | 208794/338156 [00:57<00:41, 3133.88it/s]



 62%|██████▏   | 209733/338156 [00:57<00:41, 3088.03it/s]



 62%|██████▏   | 210693/338156 [00:57<00:40, 3156.81it/s]



 63%|██████▎   | 212296/338156 [00:58<00:39, 3175.13it/s]



 63%|██████▎   | 212945/338156 [00:58<00:39, 3208.32it/s]



 63%|██████▎   | 213923/338156 [00:58<00:38, 3248.98it/s]



 63%|██████▎   | 214585/338156 [00:59<00:37, 3278.41it/s]



 64%|██████▎   | 215240/338156 [00:59<00:37, 3242.58it/s]



 64%|██████▍   | 216247/338156 [00:59<00:36, 3330.08it/s]



 64%|██████▍   | 218089/338156 [01:00<00:32, 3695.38it/s]



 65%|██████▍   | 218881/338156 [01:00<00:31, 3838.41it/s]



 65%|██████▌   | 220133/338156 [01:00<00:29, 4063.70it/s]



 66%|██████▌   | 221825/338156 [01:01<00:27, 4180.82it/s]



100%|██████████| 338156/338156 [01:33<00:00, 3615.58it/s]


Added hyperscore.


100%|██████████| 192097/192097 [00:00<00:00, 195022.36it/s]


S14_Rep1


 61%|██████    | 37187/61007 [00:00<00:00, 41356.94it/s]

: 