In [1]:
# !pip install ipywidgets

import pandas as pd
import ipywidgets as widgets
from IPython.display import display
import requests
from bs4 import BeautifulSoup
import os
import zipfile
from tqdm import tqdm
from pathlib import Path

import json

from collections import defaultdict
import toml

from proteobench.modules.quant.quant_lfq_ion_DDA import DDAQuantIonModule
from proteobench.io.parsing.parse_settings import ParseSettingsBuilder
from proteobench.plotting import plot_quant
from proteobench.io.parsing.parse_ion import load_input_file
from proteobench.modules.constants import MODULE_SETTINGS_DIRS

In [24]:
from proteobench.utils.server_io import get_merged_json

# Fetch the public results archive and merge its JSON files into a single DataFrame.
df = get_merged_json(repo_url="https://github.com/Proteobench/Results_quant_ion_DDA/archive/refs/heads/main.zip")
# A bare `df.shape` in the middle of a cell is evaluated and thrown away;
# print it so the size of the merged table is actually shown.
print(df.shape)

# Directory seven levels above the notebook's working directory.
# NOTE(review): this only points at the ProteoBench checkout when the notebook
# is started from its original folder - confirm before relying on it.
proteobench_directory = os.path.abspath("../../../../../../../")
print(proteobench_directory)

Combined 51 JSON files into 'combined_results.json'.
/Users/witoldwolski/__projects/ProteoBench


In [18]:
from proteobench.utils.server_io import get_raw_data


def get_datasets_to_downlod(df, output_directory="temp_results"):
    """
    Split the datasets listed in ``df`` into "already on disk" and "still to download".

    A dataset counts as already downloaded when ``output_directory`` contains a
    sub-directory whose name equals the row's ``intermediate_hash``. Plain files
    that happen to carry a hash as their name are ignored.

    Args:
        df: DataFrame containing an ``intermediate_hash`` column.
        output_directory: Directory that holds one sub-directory per dataset.

    Returns:
        Tuple containing:
        - DataFrame with only the rows that still need to be downloaded
          (``df`` itself, unfiltered, when no dataset was found on disk)
        - Dictionary mapping each ``intermediate_hash`` found on disk to its
          directory path
    """
    # A set gives constant-time membership tests for every directory entry.
    wanted_hashes = set(df["intermediate_hash"])
    hash_vis_dir = {}

    # isdir (rather than exists) also guards against output_directory being a file.
    if os.path.isdir(output_directory):
        for entry in os.listdir(output_directory):
            entry_path = os.path.join(output_directory, entry)
            # Only real directories can hold a downloaded dataset.
            if entry in wanted_hashes and os.path.isdir(entry_path):
                hash_vis_dir[entry] = entry_path

    if not hash_vis_dir:
        return df, hash_vis_dir

    already_present = df["intermediate_hash"].isin(list(hash_vis_dir))
    return df[~already_present], hash_vis_dir

# Separate the submissions that are already in temp_results from those still to fetch
df_to_download, hash_vis_dirs = get_datasets_to_downlod(
    df,
    output_directory="temp_results",
)

df_to_download.shape

(5, 88)

In [19]:
# Show the hash -> local directory mapping for the datasets found on disk.
hash_vis_dirs

{'e8e80290fb48ff02de5ee54eb6b0114ff661bace': 'temp_results/e8e80290fb48ff02de5ee54eb6b0114ff661bace',
 'bbd079eda4dd3d6a51b37924d83db50022530bb6': 'temp_results/bbd079eda4dd3d6a51b37924d83db50022530bb6',
 '0d33cae7fd689507eb8b31c7ee2adc72adc3befe': 'temp_results/0d33cae7fd689507eb8b31c7ee2adc72adc3befe',
 '00e2f863939301a2a71178652972dad895b27520': 'temp_results/00e2f863939301a2a71178652972dad895b27520',
 'c9b6df263cfd4491f6188ac49043fa78d0f8acdb': 'temp_results/c9b6df263cfd4491f6188ac49043fa78d0f8acdb',
 '4decb9e0d2d9ed9f9be6a9ad5aa066b1dcd1e616': 'temp_results/4decb9e0d2d9ed9f9be6a9ad5aa066b1dcd1e616',
 '2805f6b6d1fabd65448540bee743f2b05c1dda59': 'temp_results/2805f6b6d1fabd65448540bee743f2b05c1dda59',
 '7a1b33450db849167eac7f1ad2f96f25753e5484': 'temp_results/7a1b33450db849167eac7f1ad2f96f25753e5484',
 'e0ab1339a5354cb27d895ff252383ee4a3365b2e': 'temp_results/e0ab1339a5354cb27d895ff252383ee4a3365b2e',
 '0c88ca16d30d2911dc78febe087650427365e963': 'temp_results/0c88ca16d30d2911dc78feb

In [20]:
# Fetch only the datasets that are not on disk yet and add their locations
# to the hash -> directory mapping.
if len(df_to_download) > 0:
    hash_vis_dirs_2 = get_raw_data(
        df_to_download,
        base_url="https://proteobench.cubimed.rub.de/datasets/",
        output_directory="temp_results",
    )
    # Merge inside the branch: when nothing has to be downloaded,
    # `hash_vis_dirs_2` is never assigned and an unconditional update
    # would raise NameError on a fresh kernel.
    hash_vis_dirs.update(hash_vis_dirs_2)

hash_vis_dirs

{'e8e80290fb48ff02de5ee54eb6b0114ff661bace': 'temp_results/e8e80290fb48ff02de5ee54eb6b0114ff661bace',
 'bbd079eda4dd3d6a51b37924d83db50022530bb6': 'temp_results/bbd079eda4dd3d6a51b37924d83db50022530bb6',
 '0d33cae7fd689507eb8b31c7ee2adc72adc3befe': 'temp_results/0d33cae7fd689507eb8b31c7ee2adc72adc3befe',
 '00e2f863939301a2a71178652972dad895b27520': 'temp_results/00e2f863939301a2a71178652972dad895b27520',
 'c9b6df263cfd4491f6188ac49043fa78d0f8acdb': 'temp_results/c9b6df263cfd4491f6188ac49043fa78d0f8acdb',
 '4decb9e0d2d9ed9f9be6a9ad5aa066b1dcd1e616': 'temp_results/4decb9e0d2d9ed9f9be6a9ad5aa066b1dcd1e616',
 '2805f6b6d1fabd65448540bee743f2b05c1dda59': 'temp_results/2805f6b6d1fabd65448540bee743f2b05c1dda59',
 '7a1b33450db849167eac7f1ad2f96f25753e5484': 'temp_results/7a1b33450db849167eac7f1ad2f96f25753e5484',
 'e0ab1339a5354cb27d895ff252383ee4a3365b2e': 'temp_results/e0ab1339a5354cb27d895ff252383ee4a3365b2e',
 '0c88ca16d30d2911dc78febe087650427365e963': 'temp_results/0c88ca16d30d2911dc78feb

In [22]:
# Set to True to choose submissions interactively with a widget.
# Left off by default so the notebook result does not depend on manual clicks.
USE_ROW_SELECTOR = False

# Default to every submission, so the following cells also work before
# (or without) any button click.
filtered_df = df

if USE_ROW_SELECTOR:
    # Create a SelectMultiple widget with names as options.
    # Each option carries the row's *position*, because the callback selects
    # with `df.iloc`; the index labels yielded by iterrows() are only equal to
    # positions when `df` has a default RangeIndex.
    row_selector = widgets.SelectMultiple(
        options=[
            (
                f"{row['id']} (hash: {row['intermediate_hash']}, submission comments: {row['submission_comments']})",
                position,
            )
            for position, (_label, row) in enumerate(df.iterrows())
        ],
        description='Select Rows:',
        rows=10,  # Number of visible rows in the widget
        layout=widgets.Layout(width='50%')  # Adjust layout as needed
    )

    # Button to confirm selection
    button = widgets.Button(description='Filter Rows')

    # Output widget to display the filtered DataFrame
    output = widgets.Output()

    def on_button_click(b):
        """Store the rows selected in the widget in the global `filtered_df` and display them."""
        global filtered_df  # Store filtered DataFrame globally
        with output:
            output.clear_output()
            selected_positions = list(row_selector.value)
            filtered_df = df.iloc[selected_positions]
            print("Filtered DataFrame:")
            display(filtered_df)

    # Attach callback
    button.on_click(on_button_click)

    # Display the widgets
    display(row_selector, button, output)

In [None]:

# Benchmark a single submission (the first row of `filtered_df`) and plot its results.
location = hash_vis_dirs[filtered_df["intermediate_hash"].iloc[0]]
software_name = filtered_df["software_name"].iloc[0]

# Regular files of the dataset directory, in the order the OS lists them
all_files = os.listdir(location)
regular_files = [f for f in all_files if os.path.isfile(os.path.join(location, f))]

# The submitted result file is stored as 'input_file' plus an arbitrary extension
input_files = [f for f in regular_files if f.startswith('input_file')]
if not input_files:
    # Fail with a message that names the directory instead of a bare IndexError
    raise FileNotFoundError(f"No file starting with 'input_file' found in {location}")
matching_file = os.path.join(location, input_files[0])

# The parameter file is not used below, so its absence must not abort the cell
param_files = [f for f in regular_files if f.startswith('param')]
matching_file_params = os.path.join(location, param_files[0]) if param_files else None

# Every user-supplied setting defaults to an empty string
user_config = defaultdict(lambda: "")

module_obj = DDAQuantIonModule(token="")
# NOTE(review): the returned frame is not used in this cell; the call is kept
# in case it has side effects on `module_obj` - confirm.
results_df = module_obj.obtain_all_data_points(all_datapoints=None)

results_performance, all_datapoints, result_df = module_obj.benchmarking(
    matching_file, software_name, user_input=user_config, all_datapoints=[]
)

fig1 = plot_quant.PlotDataPoint.plot_CV_violinplot(results_performance)
fig1.show()

# Use the settings directory registered for this module (the same lookup the
# batch cell uses) rather than a path assembled from `proteobench_directory`,
# which only resolves when the notebook runs from one specific folder depth.
parse_settings = ParseSettingsBuilder(
    parse_settings_dir=MODULE_SETTINGS_DIRS["quant_lfq_DDA_ion"], module_id="quant_lfq_DDA_ion"
).build_parser(software_name)

fig2 = plot_quant.PlotDataPoint.plot_fold_change_histogram(results_performance, parse_settings.species_expected_ratio())
fig2.show()



FileNotFoundError: The following parse settings files are missing: ['../../../../../proteobench/io/parsing/io_parse_settings/Quant/lfq/DDA/ion/parse_settings_alphapept.toml', '../../../../../proteobench/io/parsing/io_parse_settings/Quant/lfq/DDA/ion/parse_settings_fragpipe.toml', '../../../../../proteobench/io/parsing/io_parse_settings/Quant/lfq/DDA/ion/parse_settings_i2massChroQ.toml', '../../../../../proteobench/io/parsing/io_parse_settings/Quant/lfq/DDA/ion/parse_settings_maxquant.toml', '../../../../../proteobench/io/parsing/io_parse_settings/Quant/lfq/DDA/ion/parse_settings_msangel.toml', '../../../../../proteobench/io/parsing/io_parse_settings/Quant/lfq/DDA/ion/parse_settings_peaks.toml', '../../../../../proteobench/io/parsing/io_parse_settings/Quant/lfq/DDA/ion/parse_settings_proline.toml', '../../../../../proteobench/io/parsing/io_parse_settings/Quant/lfq/DDA/ion/parse_settings_msstats.toml', '../../../../../proteobench/io/parsing/io_parse_settings/Quant/lfq/DDA/ion/parse_settings_sage.toml', '../../../../../proteobench/io/parsing/io_parse_settings/Quant/lfq/DDA/ion/parse_settings_wombat.toml', '../../../../../proteobench/io/parsing/io_parse_settings/Quant/lfq/DDA/ion/parse_settings_custom.toml', '../../../../../proteobench/io/parsing/io_parse_settings/Quant/lfq/DDA/ion/module_settings.toml']

In [None]:
# Missing-value statistics per submission, keyed by the submission id.
performance_dict = {}

for idx, row in df.iterrows():
    # A dataset whose download failed has no entry in the mapping; skip it
    # like an empty directory instead of aborting the whole loop.
    location = hash_vis_dirs.get(row["intermediate_hash"])
    if location is None or not os.path.isdir(location):
        print(f"No local directory for {row['id']} (hash: {row['intermediate_hash']}).")
        continue
    software_name = row["software_name"]

    # List all files in the directory
    all_files = os.listdir(location)
    if len(all_files) == 0:
        print(f"Directory {location} is empty.")
        continue

    regular_files = [f for f in all_files if os.path.isfile(os.path.join(location, f))]

    # The submitted result file is stored as 'input_file' plus an arbitrary extension
    input_files = [f for f in regular_files if f.startswith("input_file")]
    if not input_files:
        print(f"Directory {location} has no file starting with 'input_file'.")
        continue
    matching_file = os.path.join(location, input_files[0])

    # The parameter file is not used below, so its absence must not abort the loop
    param_files = [f for f in regular_files if f.startswith("param")]
    matching_file_params = os.path.join(location, param_files[0]) if param_files else None

    # Every user-supplied setting defaults to an empty string
    user_config = defaultdict(lambda: "")

    # NOTE(review): a new module object is built for every submission and the
    # frame returned by obtain_all_data_points is not used; both are kept
    # per-iteration because it is not visible here whether they carry state.
    module_obj = DDAQuantIonModule(token="")
    results_df = module_obj.obtain_all_data_points(all_datapoints=None)

    input_df = load_input_file(matching_file, software_name)

    parse_settings = ParseSettingsBuilder(
        parse_settings_dir=MODULE_SETTINGS_DIRS["quant_lfq_DDA_ion"], module_id="quant_lfq_DDA_ion"
    ).build_parser(software_name)
    standard_format, replicate_to_raw = parse_settings.convert_to_standard_format(input_df)
    # Build a new list: extending replicate_to_raw["A"] in place would also
    # alter the mapping returned by the parser.
    runs = list(replicate_to_raw["A"]) + list(replicate_to_raw["B"])

    results_performance, all_datapoints, result_df = module_obj.benchmarking(
        matching_file, software_name, user_input=user_config, all_datapoints=[]
    )

    # Restrict to the columns of conditions A and B, then count missing cells
    results_performance_runs = results_performance.loc[:, runs]
    total_peptide_ions, total_runs = results_performance_runs.shape
    total_missing = results_performance_runs.isna().sum().sum()
    total_max = total_peptide_ions * total_runs
    # An empty table has no cells; report NaN rather than dividing by zero
    missing_ratio = total_missing / total_max if total_max else float("nan")

    performance_dict[row["id"]] = {
        "total_missing": total_missing,
        "total_max": total_max,
        "total_peptide_ions": total_peptide_ions,
        "total_runs": total_runs,
        "total_missing_ratio": missing_ratio,
        "total_missing_percentage": missing_ratio * 100,
    }

In [13]:
# One row per submission id, one column per metric. orient="index" builds that
# layout directly and lets every metric keep its own dtype (integer counts stay
# integers); transposing a frame built from the dict would give all columns a
# single common dtype.
performance_df = pd.DataFrame.from_dict(performance_dict, orient="index")

In [None]:
import pandas as pd
import plotly.express as px

# Expose the submission id (the index) as a column so it can serve as hover title.
# Re-running the cell just rewrites the same column.
performance_df["IDX"] = performance_df.index

# Number of peptide ions vs. percentage of missing values, one point per submission.
# The label mapping names the two columns that are actually plotted, and the
# title says which relationship is shown.
fig = px.scatter(
    performance_df,
    x="total_peptide_ions",
    y="total_missing_percentage",
    hover_name="IDX",
    labels={
        "total_peptide_ions": "Total Peptide Ions",
        "total_missing_percentage": "Total Missing Percentage",
    },
    title="Missing Data vs. Number of Peptide Ions",
)

fig.update_layout(hovermode="closest")
fig.show()

In [None]:
import pandas as pd
import plotly.express as px

# The submission id lives in the index; copy it into a column for the hover title
performance_df["IDX"] = performance_df.index

# Human-readable axis names for the two plotted columns
axis_labels = {"total_missing": "Total Missing", "total_missing_percentage": "Total Missing Percentage"}

# Absolute number of missing values against their percentage, one point per submission
fig = px.scatter(
    data_frame=performance_df,
    x="total_missing",
    y="total_missing_percentage",
    hover_name="IDX",
    labels=axis_labels,
    title="Interactive Scatter Plot of Missing Data",
).update_layout(hovermode="closest")
fig.show()