In [11]:

import pandas as pd
import ipywidgets as widgets
from IPython.display import display
import requests
from bs4 import BeautifulSoup
import os
import zipfile
from tqdm import tqdm
from pathlib import Path

import json

from collections import defaultdict
import toml

from proteobench.modules.quant.quant_lfq_ion_DDA_QExactive import DDAQuantIonModuleQExactive
from proteobench.io.parsing.parse_settings import ParseSettingsBuilder
from proteobench.plotting import plot_quant
from proteobench.io.parsing.parse_ion import load_input_file
from proteobench.modules.constants import MODULE_SETTINGS_DIRS

In [12]:
from proteobench.utils.server_io import get_merged_json
# Fetch the merged submission table from the public results repository archive.
df = get_merged_json(
    repo_url="https://github.com/Proteobench/Results_quant_ion_DDA/archive/refs/heads/main.zip"
)
df.shape

# The repository root is taken to be seven levels above the current working directory.
proteobench_directory = os.path.abspath(os.path.join(*([os.pardir] * 7)))
print(proteobench_directory)

Combined 58 JSON files into 'combined_results.json'.
/Users/witoldwolski/__projects/ProteoBench


In [13]:
from proteobench.utils.server_io import get_raw_data


def get_datasets_to_downlod(df, output_directory="temp_results"):
    """
    Split the submissions into those already on disk and those still to fetch.

    A dataset counts as present only when ``output_directory`` contains a
    *directory* named exactly like its ``intermediate_hash``. A regular file
    that happens to carry a hash as its name is not treated as a downloaded
    dataset, so the corresponding row stays in the download list.

    Args:
        df: DataFrame containing an ``intermediate_hash`` column.
        output_directory: Directory where datasets are stored, one
            sub-directory per hash.

    Returns:
        Tuple containing:
        - DataFrame with only the rows that still need to be downloaded
          (``df`` itself when nothing is present locally)
        - Dictionary mapping each locally present intermediate_hash to its
          directory path
    """
    # Set membership keeps the directory scan linear in the number of entries.
    wanted_hashes = set(df["intermediate_hash"])

    hash_vis_dir = {}
    if os.path.isdir(output_directory):
        for entry in os.listdir(output_directory):
            entry_path = os.path.join(output_directory, entry)
            if entry in wanted_hashes and os.path.isdir(entry_path):
                hash_vis_dir[entry] = entry_path

    # Nothing cached locally: every row still has to be downloaded.
    if not hash_vis_dir:
        return df, hash_vis_dir

    already_present = df["intermediate_hash"].isin(list(hash_vis_dir))
    return df[~already_present], hash_vis_dir

# Separate the rows that still have to be fetched from the hashes already present locally.
df_to_download, hash_vis_dirs = get_datasets_to_downlod(
    df,
    output_directory="temp_results",
)

df_to_download.shape

(4, 88)

In [14]:
# Display the intermediate_hash -> local directory mapping returned by get_datasets_to_downlod.
hash_vis_dirs

{'e8e80290fb48ff02de5ee54eb6b0114ff661bace': 'temp_results/e8e80290fb48ff02de5ee54eb6b0114ff661bace',
 'bbd079eda4dd3d6a51b37924d83db50022530bb6': 'temp_results/bbd079eda4dd3d6a51b37924d83db50022530bb6',
 'b9f217a22df2e15b1144830498fcb65f1dbcff57': 'temp_results/b9f217a22df2e15b1144830498fcb65f1dbcff57',
 '0d33cae7fd689507eb8b31c7ee2adc72adc3befe': 'temp_results/0d33cae7fd689507eb8b31c7ee2adc72adc3befe',
 '00e2f863939301a2a71178652972dad895b27520': 'temp_results/00e2f863939301a2a71178652972dad895b27520',
 'c9b6df263cfd4491f6188ac49043fa78d0f8acdb': 'temp_results/c9b6df263cfd4491f6188ac49043fa78d0f8acdb',
 '4decb9e0d2d9ed9f9be6a9ad5aa066b1dcd1e616': 'temp_results/4decb9e0d2d9ed9f9be6a9ad5aa066b1dcd1e616',
 '2805f6b6d1fabd65448540bee743f2b05c1dda59': 'temp_results/2805f6b6d1fabd65448540bee743f2b05c1dda59',
 '7a1b33450db849167eac7f1ad2f96f25753e5484': 'temp_results/7a1b33450db849167eac7f1ad2f96f25753e5484',
 'e0ab1339a5354cb27d895ff252383ee4a3365b2e': 'temp_results/e0ab1339a5354cb27d895ff

In [15]:
# Download whatever is still missing. The merge into hash_vis_dirs lives inside the
# guard because hash_vis_dirs_2 is only bound when a download was attempted; merging
# unconditionally raised NameError on a fresh kernel once every dataset was cached.
if df_to_download.shape[0] > 0:
    hash_vis_dirs_2 = get_raw_data(
        df_to_download,
        base_url="https://proteobench.cubimed.rub.de/datasets/",
        output_directory="temp_results",
    )
    hash_vis_dirs.update(hash_vis_dirs_2)

# Report every submission whose dataset directory is still unknown ...
is_available = df["intermediate_hash"].isin(list(hash_vis_dirs))
for missing_hash in df.loc[~is_available, "intermediate_hash"]:
    print(f"intermediate_hash {missing_hash} not in hash_vis_dirs")

# ... and keep only the submissions that can actually be processed below.
df = df[is_available]


intermediate_hash e01ca9dd0be7bae4ec717df19842cef4061b5521 not in hash_vis_dirs
intermediate_hash 45fbb2dc2468abd6dbaec4520c2a0e96cb7f7e0f not in hash_vis_dirs
intermediate_hash 47db7ef37a0fb5fec79f3bedbfb4f67835774f10 not in hash_vis_dirs
intermediate_hash 0b2ce2ef84562945a7ad4865ff11ff720f8d68ad not in hash_vis_dirs


In [16]:
#%%script false --no-raise-error
# Optional interactive row picker. It is switched off so the notebook result does not
# depend on manual widget state; flip the condition to True to choose rows by hand.
if False:
    # Each option carries the row's index *label* as its value.
    row_selector = widgets.SelectMultiple(
        options=[
            (
                f"{row['id']} (hash: {row['intermediate_hash']}, submission comments: {row['submission_comments']})",
                idx,
            )
            for idx, row in df.iterrows()
        ],
        description='Select Rows:',
        rows=10,  # Number of visible rows in the widget
        layout=widgets.Layout(width='50%')  # Adjust layout as needed
    )

    # Button to confirm selection
    button = widgets.Button(description='Filter Rows')

    # Output widget to display the filtered DataFrame
    output = widgets.Output()

    def on_button_click(b):
        """Store the rows selected in the widget in the global ``filtered_df`` and show them."""
        global filtered_df  # Later cells read the selection from this global
        with output:
            output.clear_output()
            selected_labels = list(row_selector.value)
            # The option values are index labels, not positions, so they must be
            # resolved with .loc; df has been filtered earlier and its labels are
            # no longer guaranteed to equal row positions.
            filtered_df = df.loc[selected_labels]
            print("Filtered DataFrame:")
            display(filtered_df)

    # Attach callback
    button.on_click(on_button_click)

    # Display the widgets
    display(row_selector, button, output)

else:
    filtered_df = df

In [17]:

# Resolve the local directory of the first submission in the current selection.
first_hash = filtered_df["intermediate_hash"].iloc[0]
location = hash_vis_dirs[first_hash]

In [18]:

software_name = filtered_df["software_name"].iloc[0]

# List all files in the directory
all_files = os.listdir(location)

# First regular file whose name starts with the given prefix; the extension is ignored.
matching_file = os.path.join(
    location,
    [f for f in all_files if f.startswith('input_file') and os.path.isfile(os.path.join(location, f))][0],
)
matching_file_params = os.path.join(
    location,
    [f for f in all_files if f.startswith('param') and os.path.isfile(os.path.join(location, f))][0],
)

# Every user-supplied metadata field defaults to an empty string.
user_config = defaultdict(lambda: "")

module_obj = DDAQuantIonModuleQExactive(token="")

results_performance, all_datapoints, result_df = module_obj.benchmarking(
    matching_file, software_name, user_input=user_config, all_datapoints=[]
)

fig1 = plot_quant.PlotDataPoint.plot_CV_violinplot(results_performance)
fig1.show()

# Take the parse-settings directory from MODULE_SETTINGS_DIRS, exactly as the timing
# loop further down does, instead of assembling it from a path that depends on the
# directory the notebook happens to be started from.
parse_settings = ParseSettingsBuilder(
    parse_settings_dir=MODULE_SETTINGS_DIRS["quant_lfq_DDA_ion_QExactive"],
    module_id="quant_lfq_DDA_ion_QExactive",
).build_parser(software_name)

fig2 = plot_quant.PlotDataPoint.plot_fold_change_histogram(
    results_performance, parse_settings.species_expected_ratio()
)
fig2.show()



In [19]:
import time

performance_dict = {}
timings = {}

# Restrict the run to submissions produced by these tools.
valid_software_names = ["ProlineStudio", "FragPipe", "MSAngel"]
filtered_df = df[df["software_name"].isin(valid_software_names)]

for idx, row in filtered_df.iterrows():
    location = hash_vis_dirs[row["intermediate_hash"]]
    software_name = row["software_name"]

    # List all files in the directory
    all_files = os.listdir(location)
    if len(all_files) == 0:
        print(f"Directory {location} is empty.")
        continue

    # Regular files whose name starts with the expected prefix (extension is ignored).
    input_candidates = [
        f for f in all_files if f.startswith("input_file") and os.path.isfile(os.path.join(location, f))
    ]
    param_candidates = [
        f for f in all_files if f.startswith("param") and os.path.isfile(os.path.join(location, f))
    ]
    if not input_candidates:
        # A directory with files but no input file cannot be benchmarked; skip it
        # instead of aborting the whole run with an IndexError.
        print(f"No input_file found in {location}.")
        continue
    matching_file = os.path.join(location, input_candidates[0])
    # The parameter file is not used by the timing code, so its absence is tolerated.
    matching_file_params = os.path.join(location, param_candidates[0]) if param_candidates else None

    user_config = defaultdict(lambda: "")
    module_obj = DDAQuantIonModuleQExactive(token="")

    # Time loading the input file. perf_counter is monotonic, so the measured
    # durations cannot be distorted by system clock adjustments.
    start_time = time.perf_counter()
    input_df = load_input_file(matching_file, software_name)
    load_time = time.perf_counter() - start_time
    print(f"Time to load {software_name} input file: {load_time:.2f} seconds")

    parse_settings = ParseSettingsBuilder(
        parse_settings_dir=MODULE_SETTINGS_DIRS["quant_lfq_DDA_ion_QExactive"], module_id="quant_lfq_DDA_ion_QExactive"
    ).build_parser(software_name)

    # Time convert_to_standard_format
    start_time = time.perf_counter()
    standard_format, replicate_to_raw = parse_settings.convert_to_standard_format(input_df)
    conversion_time = time.perf_counter() - start_time
    print(f"Time to convert {software_name} to standard format: {conversion_time:.2f} seconds")

    # NOTE(review): the mapping returned by the conversion is replaced here by the
    # parser's own replicate mapping (a private method) - confirm this is intended.
    replicate_to_raw = parse_settings._create_replicate_mapping()
    # Build a fresh list so the mapping's own "A" entry is not extended in place.
    runs = [*replicate_to_raw["A"], *replicate_to_raw["B"]]

    # Time the full benchmarking call
    start_time = time.perf_counter()
    results_performance, all_datapoints, result_df = module_obj.benchmarking(
        matching_file, software_name, user_input=user_config, all_datapoints=[]
    )
    benchmarking_time = time.perf_counter() - start_time
    print(f"Time to benchmark {software_name}: {benchmarking_time:.2f} seconds")

    # Missing-value statistics over the per-run columns only.
    results_performance_runs = results_performance.loc[:, runs]
    total_missing = results_performance_runs.isna().sum().sum()
    total_peptide_ions = results_performance_runs.shape[0]
    total_runs = results_performance_runs.shape[1]
    total_max = total_peptide_ions * total_runs
    performance_dict[row["id"]] = {
        "software_name": software_name,
        "total_missing": total_missing,
        "total_max": total_max,
        "total_peptide_ions": total_peptide_ions,
        "total_runs": total_runs,
        "total_missing_ratio": total_missing / total_max,
        "total_missing_percentage": (total_missing / total_max) * 100,
        "benchmarking_time": benchmarking_time,
        "load_time": load_time,
        "conversion_time": conversion_time,
        "locations": location,
    }

# collect all timing data into a dataframe


KeyboardInterrupt: 

In [10]:
# One row per submission id. orient="index" builds the rows directly, so each metric
# column keeps its own dtype; transposing a column-per-submission frame instead
# collapses every column to object because each original column mixed strings and numbers.
performance_df = pd.DataFrame.from_dict(performance_dict, orient="index")
performance_df



Unnamed: 0,software_name,total_missing,total_max,total_peptide_ions,total_runs,total_missing_ratio,total_missing_percentage,benchmarking_time,load_time,conversion_time,locations
WOMBAT_20250522_133359,WOMBAT,60584,287682,47947,6,0.210594,21.059364,8.141222,0.087561,0.384492,temp_results/6636a8c845b94a386a2f8ab8827e47e13...
AlphaPept_20250122_154315,AlphaPept,145148,349122,58187,6,0.415752,41.575151,10.518389,2.919729,1.263126,temp_results/fd9834a584a2248fdf6ff35b3584092fa...
MaxQuant_20241216_122819,MaxQuant,62988,298074,49679,6,0.211317,21.131665,11.802985,1.984647,1.657888,temp_results/254d6c77ce656888918e738772ad5f5f6...
AlphaPept_20250122_155121,AlphaPept,144945,348540,58090,6,0.415863,41.586332,10.82127,3.119355,1.304009,temp_results/94c4c0b7d00761d24fdde05276053b087...
MaxQuant_20241216_122228,MaxQuant,120910,302022,50337,6,0.400335,40.033507,8.528185,1.576692,1.38755,temp_results/a1140a31b414d7b3110ee9b9c0456cc4f...


In [13]:
# Pure benchmarking time = total benchmarking time minus (load time + conversion time).
performance_df['pure_benchmarking_time'] = (
    performance_df['benchmarking_time']
    - (performance_df['load_time'] + performance_df['conversion_time'])
)

# Write the metrics next to the notebook as a tab-separated file, index included.
performance_df.to_csv('performance_metrics_new.tsv', sep='\t', index=True)

# Sanity check: there should be more than one software category to colour by.
software_names = performance_df["software_name"]
print(software_names.nunique(), software_names.unique())




3 ['WOMBAT' 'AlphaPept' 'MaxQuant']


In [14]:
import plotly.express as px
import plotly.io as pio
pio.templates.default = "plotly"     # the default Plotly template


def create_scatter_plot(performance_df, x_column="total_peptide_ions", y_column="pure_benchmarking_time",
                        title="Benchmarking Time vs Total Peptide Ions by Software"):
    """
    Show a scatter plot of two performance columns, coloured by software.

    Args:
        performance_df: DataFrame with a ``software_name`` column plus the
            columns named by ``x_column`` and ``y_column``.
        x_column: Column plotted on the x axis.
        y_column: Column plotted on the y axis.
        title: Figure title.

    Returns:
        None. The figure is displayed via ``fig.show()``.
    """
    # Readable axis/legend names for every column this notebook plots. The mapping
    # previously only knew 'benchmarking_time', so the timing columns actually passed
    # as y_column fell back to their raw column names.
    axis_labels = {
        'total_peptide_ions': 'Total Peptide Ions',
        'benchmarking_time': 'Benchmarking Time (s)',
        'pure_benchmarking_time': 'Pure Benchmarking Time (s)',
        'load_time': 'Load Time (s)',
        'conversion_time': 'Conversion Time (s)',
        'software_name': 'Software',
    }

    fig = px.scatter(
        performance_df,
        x=x_column,
        y=y_column,
        color='software_name',
        labels=axis_labels,
        title=title,
    )

    fig.update_layout(hovermode='closest')
    fig.show()

# Pure benchmarking time against the number of peptide ions.
create_scatter_plot(
    performance_df,
    x_column="total_peptide_ions",
    y_column="pure_benchmarking_time",
)

In [15]:
import plotly.express as px
import plotly.io as pio
pio.templates.default = "plotly"     # the default Plotly template

# Input-file load time against the number of peptide ions.
create_scatter_plot(
    performance_df,
    x_column="total_peptide_ions",
    y_column="load_time",
    title="Load Time vs Total Peptide Ions by Software",
)


In [16]:
# Standard-format conversion time against the number of peptide ions. The title now
# names the plotted column; it had been copied from the load-time plot.
create_scatter_plot(performance_df, x_column="total_peptide_ions",
                    y_column="conversion_time",
                    title="Conversion Time vs Total Peptide Ions by Software")

In [17]:
import plotly.express as px
# Expose the submission id (the index) as a column so it can be used as hover title.
performance_df["IDX"] = performance_df.index

# Missing-value percentage against the number of peptide ions. The label mapping is
# keyed by the columns that are actually plotted; the x axis was left with its raw
# column name because the mapping referred to 'total_missing' instead.
fig = px.scatter(
    performance_df,
    x="total_peptide_ions",
    y="total_missing_percentage",
    hover_name="IDX",
    labels={
        "total_peptide_ions": "Total Peptide Ions",
        "total_missing_percentage": "Total Missing Percentage",
    },
    title="Interactive Scatter Plot of Missing Data",
)

fig.update_layout(hovermode="closest")
fig.show()

In [16]:
import pandas as pd
import plotly.express as px

# Expose the submission id (the index) as a column so it can be used as hover title.
performance_df["IDX"] = performance_df.index

# Absolute number of missing values against the missing-value percentage.
missing_axis_labels = {"total_missing": "Total Missing", "total_missing_percentage": "Total Missing Percentage"}
fig = px.scatter(
    data_frame=performance_df,
    x="total_missing",
    y="total_missing_percentage",
    hover_name="IDX",
    labels=missing_axis_labels,
    title="Interactive Scatter Plot of Missing Data",
)
fig.update_layout(hovermode="closest")
fig.show()