In [9]:
import pandas as pd
import ipywidgets as widgets
from IPython.display import display
import requests
from bs4 import BeautifulSoup
import os
import zipfile
from tqdm import tqdm
from pathlib import Path

import json

from collections import defaultdict
import toml

from proteobench.modules.quant.quant_lfq_ion_DDA import DDAQuantIonModule
from proteobench.io.parsing.parse_settings import ParseSettingsBuilder
from proteobench.plotting import plot_quant
from proteobench.io.parsing.parse_ion import load_input_file
from proteobench.modules.constants import MODULE_SETTINGS_DIRS

In [10]:
# Examples to choose from:
#
# "https://api.github.com/repos/Proteobench/Results_quant_ion_DDA/contents/",
# "https://api.github.com/repos/Proteobench/Results_quant_peptidoform_DIA/contents/",
# "https://api.github.com/repos/Proteobench/Results_subcellprofile_DOMLFQ_protein_DIA_EXPL/contents/",
# "https://api.github.com/repos/Proteobench/Results_quant_peptidoform_DDA/contents/",
# "https://api.github.com/repos/Proteobench/Results_quant_ion_DIA_diaPASEF/contents/",
# "https://api.github.com/repos/Proteobench/Results_quant_ion_DIA/contents/",
# "https://api.github.com/repos/Proteobench/Results_quant_ion_DIA_singlecell/contents/",

repo_url = "https://api.github.com/repos/Proteobench/Results_quant_ion_DDA/contents/"

# ".../repos/<owner>/<repo>/contents/" -> "<repo>". Deriving the name here means that
# switching repo_url to one of the examples above also switches the raw downloads below
# (previously the repository name was hard-coded in the download URL).
repo_name = repo_url.rstrip("/").split("/")[-2]

file_names = []

# List the repository contents; a timeout keeps the cell from hanging on a stalled connection.
response = requests.get(repo_url, timeout=30)

if response.status_code == 200:
    repo_contents = response.json()
    # Keep only entries that carry a name and are JSON result files.
    file_names = [
        item["name"] for item in repo_contents if item.get("name") and item["name"].endswith(".json")
    ]
else:
    print(f"Failed to retrieve the repository contents. Status code: {response.status_code}")
    print(repo_url)

json_files_content = []

for f in file_names:
    # NOTE(review): the listing is read from the "Proteobench" organisation while the raw
    # files are fetched from "Proteobot" -- kept as in the original; confirm this is intended.
    url = f"https://raw.githubusercontent.com/Proteobot/{repo_name}/refs/heads/main/{f}"
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # Raise an error if the request fails

    data = response.json()

    json_files_content.append(data)

# Flatten the nested JSON documents into one row per result file.
df = pd.json_normalize(json_files_content)

df.head(5)

Unnamed: 0,id,old_new,software_name,software_version,search_engine,search_engine_version,ident_fdr_psm,ident_fdr_peptide,ident_fdr_protein,enable_match_between_runs,...,results.2.CV_q75,results.2.CV_q95,results.1.median_abs_epsilon,results.1.mean_abs_epsilon,results.1.variance_epsilon,results.1.nr_prec,results.1.CV_median,results.1.CV_q90,results.1.CV_q75,results.1.CV_q95
0,MaxQuant_20241216_122433,old,MaxQuant,2.5.1.0,Andromeda,,,0.01,0.01,False,...,0.291818,0.512293,0.199852,0.274547,0.160229,50302,0.202619,0.416519,0.291818,0.512293
1,MaxQuant_20241216_124040,old,MaxQuant,2.3.1.0,Andromeda,,,0.01,0.01,True,...,0.312472,0.569765,0.207123,0.304261,0.222469,50339,0.215507,0.46077,0.312472,0.569765
2,MaxQuant_20250506_151623,new,MaxQuant,2.3.1.0,Andromeda,,0.01,,0.01,True,...,0.312472,0.569765,0.207123,0.304261,0.222469,50339,0.215507,0.46077,0.312472,0.569765
3,FragPipe_20250504_164850,new,FragPipe,23.0,MSFragger,4.2,0.01,,0.01,False,...,0.276832,0.560573,0.199952,0.309088,0.230707,57021,0.156473,0.437619,0.276832,0.560573
4,WOMBAT_20250506_073548,new,WOMBAT,0.9.11,Proline,,0.01,0.01,0.01,True,...,0.335215,0.643571,0.204178,0.329195,0.284523,61819,0.23037,0.507628,0.335215,0.643571


In [11]:
%%script false --no-raise-error
# Download and unpack the intermediate results of every submission in `df`.
# The cell magic above disables this cell; remove it to actually run the download.

# Directory to save the unzipped files
unzip_dir = "temp_results"
os.makedirs(unzip_dir, exist_ok=True)

# Maps intermediate hash -> directory holding the unpacked files of that submission.
hash_vis_dirs = {}

# Loop over each hash and download the corresponding zip file, unzip, and remove it
for ihash in df["intermediate_hash"]:
    result_path = os.path.join(unzip_dir, ihash)
    # An existing directory is treated as an already downloaded result.
    if os.path.isdir(result_path):
        hash_vis_dirs[ihash] = result_path
        continue

    url = f"https://proteobench.cubimed.rub.de/datasets/{ihash}/{ihash}_data.zip"

    # Send the GET request to download the file
    response = requests.get(url, timeout=120)

    if response.status_code != 200:
        # Nothing is created on disk for a failed download, so the next run retries it
        # instead of mistaking an empty directory for a cached result.
        print(f"Failed to download {ihash}.zip - Status code: {response.status_code}")
        continue

    # Create the target directory only once there is something to put into it.
    os.makedirs(result_path, exist_ok=True)

    # Temporary file path for the zip file
    zip_file_path = os.path.join(result_path, f"{ihash}.zip")

    # Write the zip file to disk
    with open(zip_file_path, 'wb') as f:
        f.write(response.content)

    # Now unzip the file into the desired directory
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(result_path)

    # After extracting, remove the zip file
    os.remove(zip_file_path)
    hash_vis_dirs[ihash] = result_path
    print(f"Downloaded, unzipped, and deleted {ihash}.zip")

In [12]:
# Use local folder for visualization
current_dir = os.getcwd()
print(current_dir)

# Locate the ProteoBench checkout by walking up from the working directory until a folder
# containing both "jupyter_notebooks" and "proteobench" is found. The original
# machine-specific path is kept as a fallback so existing setups keep working.
_cwd = Path(current_dir)
proteobench_dir = next(
    (
        str(candidate)
        for candidate in (_cwd, *_cwd.parents)
        if (candidate / "jupyter_notebooks").is_dir() and (candidate / "proteobench").is_dir()
    ),
    "/Users/witoldwolski/__projects/ProteoBench",
)
unzip_dir = f"{proteobench_dir}/jupyter_notebooks/analysis/temp_results"

# Map every intermediate hash to the directory its unpacked results are expected in.
# Nothing is downloaded here and the directory is not required to exist.
hash_vis_dirs = {ihash: os.path.join(unzip_dir, ihash) for ihash in df["intermediate_hash"]}



/Users/witoldwolski/__projects/ProteoBench/jupyter_notebooks/analysis/post_analysis/quant/lfq/ion/dda


In [13]:
%%script false --no-raise-error
# Interactive row selection. The cell magic above disables this cell; remove it to use
# the widget instead of analysing every locally available submission.

# Create a SelectMultiple widget with names as options.
# Each option is a (label, value) pair; the value is the row's index label in `df`.
row_selector = widgets.SelectMultiple(
    options=[
        (
            f"{row['id']} (hash: {row['intermediate_hash']}, submission comments: {row['submission_comments']})",
            idx,
        )
        for idx, row in df.iterrows()
    ],
    description='Select Rows:',
    rows=10,  # Number of visible rows in the widget
    layout=widgets.Layout(width='50%')  # Adjust layout as needed
)

# Button to confirm selection
button = widgets.Button(description='Filter Rows')

# Output widget to display the filtered DataFrame
output = widgets.Output()


# Callback for filtering rows
def on_button_click(b):
    """Store the rows selected in `row_selector` in the global `filtered_df` and display them.

    Parameters
    ----------
    b
        The clicked button (passed by ipywidgets, unused).
    """
    global filtered_df  # Store filtered DataFrame globally
    with output:
        output.clear_output()
        selected_indices = list(row_selector.value)
        # The option values are index labels (from iterrows), so select by label, not position.
        filtered_df = df.loc[selected_indices]
        print("Filtered DataFrame:")
        display(filtered_df)


# Attach callback
button.on_click(on_button_click)

# Display the widgets
display(row_selector, button, output)

In [14]:
# Benchmark the first locally available submission and plot its CV and fold-change distributions.

# Keep only the submissions whose intermediate results are present in unzip_dir.
filtered_df = df[df["intermediate_hash"].isin(os.listdir(unzip_dir))]
if filtered_df.empty:
    # Fail with an explanation instead of an IndexError from .iloc[0] below.
    raise FileNotFoundError(f"No intermediate results of any submission found in {unzip_dir}")

location = hash_vis_dirs[filtered_df["intermediate_hash"].iloc[0]]
software_name = filtered_df["software_name"].iloc[0]

# List all files in the directory
all_files = os.listdir(location)

# Filter for files that start with 'input_file' and ignore their extensions
matching_file = os.path.join(
    location, [f for f in all_files if f.startswith("input_file") and os.path.isfile(os.path.join(location, f))][0]
)
matching_file_params = os.path.join(
    location, [f for f in all_files if f.startswith("param") and os.path.isfile(os.path.join(location, f))][0]
)

# Every user-supplied setting defaults to the empty string.
user_config = defaultdict(lambda: "")

module_obj = DDAQuantIonModule(token="")
results_df = module_obj.obtain_all_data_points(all_datapoints=None)

results_performance, all_datapoints, result_df = module_obj.benchmarking(
    matching_file, software_name, user_input=user_config, all_datapoints=[]
)

fig1 = plot_quant.PlotDataPoint.plot_CV_violinplot(results_performance)
fig1.show()

# Use the settings directory shipped with the installed package (same lookup as the
# per-submission loop further down) instead of a path inside one particular checkout.
parse_settings = ParseSettingsBuilder(
    parse_settings_dir=MODULE_SETTINGS_DIRS["quant_lfq_DDA_ion"], module_id="quant_lfq_DDA_ion"
).build_parser(software_name)

fig2 = plot_quant.PlotDataPoint.plot_fold_change_histogram(
    results_performance, parse_settings.species_expected_ratio()
)
fig2.show()




                because they are implicit. Only after providing the appropriate parameter file,

                fixed modifications will be added correctly.
                





In [None]:
def _first_file_with_prefix(directory, file_names, prefix):
    """Return the path of the first regular file in `file_names` starting with `prefix`, or None."""
    for name in file_names:
        path = os.path.join(directory, name)
        if name.startswith(prefix) and os.path.isfile(path):
            return path
    return None


# Missing-value statistics per submission, keyed by submission id.
performance_dict = {}

for idx, row in filtered_df.iterrows():
    location = hash_vis_dirs[row["intermediate_hash"]]
    software_name = row["software_name"]

    # List all files in the directory
    all_files = os.listdir(location)
    if len(all_files) == 0:
        print(f"Directory {location} is empty.")
        continue

    # Files that start with 'input_file' / 'param', whatever their extension.
    matching_file = _first_file_with_prefix(location, all_files, "input_file")
    matching_file_params = _first_file_with_prefix(location, all_files, "param")
    if matching_file is None:
        # Skip incomplete downloads instead of aborting the whole loop with an IndexError.
        print(f"Directory {location} contains no input file.")
        continue

    # Every user-supplied setting defaults to the empty string.
    user_config = defaultdict(lambda: "")

    # NOTE(review): the module object and the data-point fetch do not depend on the loop
    # variable and `results_df` is not used below -- candidates for hoisting out of the
    # loop once it is confirmed that benchmarking() does not rely on per-call state.
    module_obj = DDAQuantIonModule(token="")
    results_df = module_obj.obtain_all_data_points(all_datapoints=None)

    input_df = load_input_file(matching_file, software_name)

    parse_settings = ParseSettingsBuilder(
        parse_settings_dir=MODULE_SETTINGS_DIRS["quant_lfq_DDA_ion"], module_id="quant_lfq_DDA_ion"
    ).build_parser(software_name)
    standard_format, replicate_to_raw = parse_settings.convert_to_standard_format(input_df)
    # Build a new list: extending replicate_to_raw["A"] directly would modify the mapping itself.
    runs = [*replicate_to_raw["A"], *replicate_to_raw["B"]]

    results_performance, all_datapoints, result_df = module_obj.benchmarking(
        matching_file, software_name, user_input=user_config, all_datapoints=[]
    )

    # One column per run of conditions A and B; NaN marks a value missing in that run.
    results_performance_runs = results_performance.loc[:, runs]
    total_missing = results_performance_runs.isna().sum().sum()
    total_peptide_ions = results_performance_runs.shape[0]
    total_runs = results_performance_runs.shape[1]
    total_max = total_peptide_ions * total_runs

    performance_dict[row["id"]] = {
        "total_missing": total_missing,
        "total_max": total_max,
        "total_peptide_ions": total_peptide_ions,
        "total_runs": total_runs,
        "total_missing_ratio": total_missing / total_max,
        "total_missing_percentage": (total_missing / total_max) * 100,
    }




                because they are implicit. Only after providing the appropriate parameter file,

                fixed modifications will be added correctly.
                





Traceback (most recent call last):
  File "/Users/witoldwolski/__projects/ProteoBench/.venv/lib/python3.13/site-packages/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_vars.py", line 629, in change_attr_expression
    value = eval(expression, frame.f_globals, frame.f_locals)
  File "<string>", line 1
    Sequence  Length            Modifications  \0                AAAAAAAAAAGAAGGR      16  Acetyl (Protein N-term)   1                AAAAAAAAAAGAAGGR      16  Acetyl (Protein N-term)   4                AAAAAAAAAAGAAGGR      16  Acetyl (Protein N-term)   6       AAAAAAAGDSDSWDADAFSVEDPVR      25  Acetyl (Protein N-term)   7       AAAAAAAGDSDSWDADAFSVEDPVR      25  Acetyl (Protein N-term)   ...                           ...     ...                      ...   204967              YYYVPADFVEYEK      13               Unmodified   204968              YYYVPADFVEYEK      13               Unmodified   204969              YYYVPADFVEYEK      13               Unmodified   204970              YYYVPADFV

In [16]:
# One row per submission id, one column per metric. Building the frame row-wise (instead of
# column-wise followed by a transpose) lets every metric column keep its own dtype, so the
# count columns are not upcast together with the ratio columns.
performance_df = pd.DataFrame.from_dict(performance_dict, orient="index")

In [17]:
import pandas as pd
import plotly.express as px

# Expose the submission id (the frame's index) as a column so it can be used as hover label.
performance_df["IDX"] = performance_df.index

# Interactive scatter plot: number of peptide ions vs. percentage of missing values.
fig = px.scatter(
    performance_df,
    x="total_peptide_ions",
    y="total_missing_percentage",
    hover_name="IDX",
    # Label the columns that are actually plotted; the x axis previously showed the raw
    # column name because the mapping named "total_missing" instead.
    labels={
        "total_peptide_ions": "Total Peptide Ions",
        "total_missing_percentage": "Total Missing Percentage",
    },
    title="Interactive Scatter Plot of Missing Data",
)

fig.update_layout(hovermode="closest")
fig.show()

In [18]:
import pandas as pd
import plotly.express as px

# The submission id lives in the index; copy it into a column to use it as hover label.
performance_df["IDX"] = performance_df.index

# Plot settings collected in one place: absolute vs. relative amount of missing values.
axis_labels = {"total_missing": "Total Missing", "total_missing_percentage": "Total Missing Percentage"}
scatter_options = dict(
    x="total_missing",
    y="total_missing_percentage",
    hover_name="IDX",
    labels=axis_labels,
    title="Interactive Scatter Plot of Missing Data",
)

# Draw the interactive scatter plot; hovering shows the point closest to the cursor.
fig = px.scatter(performance_df, **scatter_options)
fig.update_layout(hovermode="closest")
fig.show()