In [1]:
import pandas as pd
import ipywidgets as widgets
from IPython.display import display
import requests
from bs4 import BeautifulSoup
import os
import zipfile
from tqdm import tqdm
from pathlib import Path

import json

from collections import defaultdict
import toml

from proteobench.modules.quant.quant_lfq_ion_DDA import DDAQuantIonModule
from proteobench.io.parsing.parse_settings import ParseSettingsBuilder
from proteobench.plotting import plot_quant
from proteobench.io.parsing.parse_ion import load_input_file
from proteobench.modules.constants import MODULE_SETTINGS_DIRS

In [2]:
# Examples to choose from:
#
# "https://api.github.com/repos/Proteobench/Results_quant_ion_DDA/contents/",
# "https://api.github.com/repos/Proteobench/Results_quant_peptidoform_DIA/contents/",
# "https://api.github.com/repos/Proteobench/Results_subcellprofile_DOMLFQ_protein_DIA_EXPL/contents/",
# "https://api.github.com/repos/Proteobench/Results_quant_peptidoform_DDA/contents/",
# "https://api.github.com/repos/Proteobench/Results_quant_ion_DIA_diaPASEF/contents/",
# "https://api.github.com/repos/Proteobench/Results_quant_ion_DIA/contents/",
# "https://api.github.com/repos/Proteobench/Results_quant_ion_DIA_singlecell/contents/",

repo_url = "https://api.github.com/repos/Proteobench/Results_quant_ion_DDA/contents/"

# Seconds to wait for GitHub before giving up instead of hanging the kernel
request_timeout = 30

# Maps each JSON file name in the repository to the raw URL it can be downloaded from.
# The URL is taken from the listing itself ("download_url"), so the files are always
# fetched from the same repository that `repo_url` points at.
json_download_urls = {}

response = requests.get(repo_url, timeout=request_timeout)

if response.status_code == 200:
    for item in response.json():
        file_name = item.get("name")
        download_url = item.get("download_url")  # None for directories
        if file_name and file_name.endswith(".json") and download_url:
            json_download_urls[file_name] = download_url
else:
    print(f"Failed to retrieve the repository contents. Status code: {response.status_code}")
    print(repo_url)

file_names = list(json_download_urls)
json_files_content = []

for f, url in json_download_urls.items():
    response = requests.get(url, timeout=request_timeout)
    response.raise_for_status()  # Raise an error if the request fails

    json_files_content.append(response.json())

# One row per downloaded JSON file; nested keys are flattened into dotted column names
df = pd.json_normalize(json_files_content)

df.head(5)

Unnamed: 0,id,old_new,software_name,software_version,search_engine,search_engine_version,ident_fdr_psm,ident_fdr_peptide,ident_fdr_protein,enable_match_between_runs,...,results.2.CV_q75,results.2.CV_q95,results.1.median_abs_epsilon,results.1.mean_abs_epsilon,results.1.variance_epsilon,results.1.nr_prec,results.1.CV_median,results.1.CV_q90,results.1.CV_q75,results.1.CV_q95
0,MaxQuant_20241216_122433,old,MaxQuant,2.5.1.0,Andromeda,,,0.01,0.01,False,...,0.291818,0.512293,0.199852,0.274547,0.160229,50302,0.202619,0.416519,0.291818,0.512293
1,MaxQuant_20241216_124040,old,MaxQuant,2.3.1.0,Andromeda,,,0.01,0.01,True,...,0.312472,0.569765,0.207123,0.304261,0.222469,50339,0.215507,0.46077,0.312472,0.569765
2,MaxQuant_20250506_151623,new,MaxQuant,2.3.1.0,Andromeda,,0.01,,0.01,True,...,0.312472,0.569765,0.207123,0.304261,0.222469,50339,0.215507,0.46077,0.312472,0.569765
3,FragPipe_20250504_164850,new,FragPipe,23.0,MSFragger,4.2,0.01,,0.01,False,...,0.276832,0.560573,0.199952,0.309088,0.230707,57021,0.156473,0.437619,0.276832,0.560573
4,WOMBAT_20250506_073548,new,WOMBAT,0.9.11,Proline,,0.01,0.01,0.01,True,...,0.335215,0.643571,0.204178,0.329195,0.284523,61819,0.23037,0.507628,0.335215,0.643571


In [None]:
# Directory to save the unzipped files
unzip_dir = "temp_results"
os.makedirs(unzip_dir, exist_ok=True)

# Maps each intermediate hash to the directory that holds (or should hold) its files
hash_vis_dirs = {}

# For each hash: reuse an already extracted dataset, otherwise download the zip file,
# unzip it and remove the archive.
for ihash in df["intermediate_hash"]:
    result_path = os.path.join(unzip_dir, ihash)
    zip_file_path = os.path.join(result_path, f"{ihash}.zip")
    os.makedirs(result_path, exist_ok=True)

    # Register the directory even when the download below fails, so that the hash can
    # always be looked up afterwards; an empty directory marks an unavailable dataset.
    hash_vis_dirs[ihash] = result_path

    # Only a directory with extracted content counts as cached. An empty directory
    # (or one holding just a leftover archive) comes from an earlier failed attempt
    # and is retried instead of being skipped forever.
    already_extracted = any(
        entry != os.path.basename(zip_file_path) for entry in os.listdir(result_path)
    )
    if already_extracted:
        continue

    url = f"https://proteobench.cubimed.rub.de/datasets/{ihash}/{ihash}_data.zip"

    # Send the GET request to download the file; the timeout prevents an unresponsive
    # server from blocking the notebook indefinitely.
    response = requests.get(url, timeout=60)

    if response.status_code != 200:
        print(f"Failed to download {ihash}.zip - Status code: {response.status_code}")
        continue

    # Write the zip file to disk
    with open(zip_file_path, "wb") as f:
        f.write(response.content)

    # Unzip into the result directory; the archive is removed even if extraction fails
    # so that a corrupt download does not linger and can be retried on the next run.
    try:
        with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
            zip_ref.extractall(result_path)
    finally:
        os.remove(zip_file_path)

    print(f"Downloaded, unzipped, and deleted {ihash}.zip")

In [None]:
# Use local folder for visualization
unzip_dir = "../../../temp_results/"

# Map every intermediate hash to its expected sub-directory of the local folder.
# Nothing is downloaded or checked here: a hash is mapped whether or not its
# directory actually exists on disk.
hash_vis_dirs = {ihash: os.path.join(unzip_dir, ihash) for ihash in df["intermediate_hash"]}



In [4]:
# Default selection (first submission) so that the cells below also work on
# "Restart & Run All", i.e. before the button has ever been clicked.
filtered_df = df.head(1)

# Create a SelectMultiple widget with names as options.
# Each option's value is the row's index label in `df`.
row_selector = widgets.SelectMultiple(
    options=[
        (
            f"{row['id']} (hash: {row['intermediate_hash']}, submission comments: {row['submission_comments']})",
            idx,
        )
        for idx, row in df.iterrows()
    ],
    description='Select Rows:',
    rows=10,  # Number of visible rows in the widget
    layout=widgets.Layout(width='50%')  # Adjust layout as needed
)

# Button to confirm selection
button = widgets.Button(description='Filter Rows')

# Output widget to display the filtered DataFrame
output = widgets.Output()


# Callback for filtering rows
def on_button_click(b):
    """Store the rows selected in ``row_selector`` in the global ``filtered_df`` and show them.

    Parameters
    ----------
    b
        The clicked button, passed in by ipywidgets; not used.

    If nothing is selected, ``filtered_df`` is left unchanged so that cells which
    read its first row keep working.
    """
    global filtered_df  # Store filtered DataFrame globally
    with output:
        output.clear_output()
        selected_labels = list(row_selector.value)
        if not selected_labels:
            print("No rows selected; keeping the previous selection.")
            return
        # The option values are index labels (from iterrows), so select by label
        # rather than by position.
        filtered_df = df.loc[selected_labels]
        print("Filtered DataFrame:")
        display(filtered_df)


# Attach callback
button.on_click(on_button_click)

# Display the widgets
display(row_selector, button, output)

SelectMultiple(description='Select Rows:', layout=Layout(width='50%'), options=(('MaxQuant_20241216_122433 (ha…

Button(description='Filter Rows', style=ButtonStyle())

Output()

In [20]:
# Re-run the benchmark for the first selected submission and plot its results.
selected_row = filtered_df.iloc[0]
location = hash_vis_dirs[selected_row["intermediate_hash"]]
software_name = selected_row["software_name"]

# List all files in the directory
all_files = os.listdir(location)

# Files whose name starts with 'input_file' / 'param', whatever their extension
input_candidates = [
    f for f in all_files if f.startswith("input_file") and os.path.isfile(os.path.join(location, f))
]
param_candidates = [
    f for f in all_files if f.startswith("param") and os.path.isfile(os.path.join(location, f))
]

if not input_candidates:
    # Fail with a message that names the directory instead of a bare IndexError
    raise FileNotFoundError(f"No file starting with 'input_file' found in {location}")

matching_file = os.path.join(location, input_candidates[0])
# The parameter file is not used below, so its absence is not treated as an error
matching_file_params = os.path.join(location, param_candidates[0]) if param_candidates else None

# Every user-supplied field defaults to an empty string
user_config = defaultdict(lambda: "")

module_obj = DDAQuantIonModule(token="")
# NOTE(review): results_df is not used below; the call is kept because it may
# initialise state inside the module - confirm whether it is required.
results_df = module_obj.obtain_all_data_points(all_datapoints=None)

results_performance, all_datapoints, result_df = module_obj.benchmarking(
    matching_file, software_name, user_input=user_config, all_datapoints=[]
)

fig1 = plot_quant.PlotDataPoint.plot_CV_violinplot(results_performance)
fig1.show()

# Use the settings directory shipped with the installed package (same lookup as the
# per-submission loop further down) instead of a path relative to the working directory.
parse_settings = ParseSettingsBuilder(
    parse_settings_dir=MODULE_SETTINGS_DIRS["quant_lfq_DDA_ion"], module_id="quant_lfq_DDA_ion"
).build_parser(software_name)

fig2 = plot_quant.PlotDataPoint.plot_fold_change_histogram(
    results_performance, parse_settings.species_expected_ratio()
)
fig2.show()

In [11]:
# Per-submission missing-value statistics, keyed by the submission id
performance_dict = {}

for idx, row in df.iterrows():
    location = hash_vis_dirs.get(row["intermediate_hash"])
    software_name = row["software_name"]

    # Skip submissions whose data was never downloaded instead of aborting the loop
    if location is None or not os.path.isdir(location):
        print(f"No result directory available for {row['id']} (hash: {row['intermediate_hash']}).")
        continue

    # List all files in the directory
    all_files = os.listdir(location)
    if len(all_files) == 0:
        print(f"Directory {location} is empty.")
        continue

    # Files whose name starts with 'input_file' / 'param', whatever their extension
    input_candidates = [
        f for f in all_files if f.startswith("input_file") and os.path.isfile(os.path.join(location, f))
    ]
    param_candidates = [
        f for f in all_files if f.startswith("param") and os.path.isfile(os.path.join(location, f))
    ]
    if not input_candidates:
        print(f"Directory {location} contains no file starting with 'input_file'.")
        continue

    matching_file = os.path.join(location, input_candidates[0])
    # The parameter file is not used below, so its absence is not treated as an error
    matching_file_params = os.path.join(location, param_candidates[0]) if param_candidates else None

    # Every user-supplied field defaults to an empty string
    user_config = defaultdict(lambda: "")

    module_obj = DDAQuantIonModule(token="")
    # NOTE(review): results_df is not used below; the call is kept because it may
    # initialise state inside the module - confirm whether it can be hoisted or removed.
    results_df = module_obj.obtain_all_data_points(all_datapoints=None)

    input_df = load_input_file(matching_file, software_name)

    parse_settings = ParseSettingsBuilder(
        parse_settings_dir=MODULE_SETTINGS_DIRS["quant_lfq_DDA_ion"], module_id="quant_lfq_DDA_ion"
    ).build_parser(software_name)
    standard_format, replicate_to_raw = parse_settings.convert_to_standard_format(input_df)

    # Column labels of the runs under keys "A" and "B". Build a new list rather than
    # extending replicate_to_raw["A"] in place, which would silently alter the mapping.
    runs = list(replicate_to_raw["A"]) + list(replicate_to_raw["B"])

    results_performance, all_datapoints, result_df = module_obj.benchmarking(
        matching_file, software_name, user_input=user_config, all_datapoints=[]
    )

    # Restrict to the per-run columns; a NaN cell means a missing value for that row in that run
    results_performance_runs = results_performance.loc[:, runs]
    total_peptide_ions, total_runs = results_performance_runs.shape
    total_missing = results_performance_runs.isna().sum().sum()
    total_max = total_peptide_ions * total_runs
    # An empty result table has no cells, so the ratio is undefined rather than a division error
    total_missing_ratio = total_missing / total_max if total_max else float("nan")

    performance_dict[row["id"]] = {
        "total_missing": total_missing,
        "total_max": total_max,
        "total_peptide_ions": total_peptide_ions,
        "total_runs": total_runs,
        "total_missing_ratio": total_missing_ratio,
        "total_missing_percentage": total_missing_ratio * 100,
    }


                because they are implicit. Only after providing the appropriate parameter file,

                fixed modifications will be added correctly.
                

                because they are implicit. Only after providing the appropriate parameter file,

                fixed modifications will be added correctly.
                


{'total_missing': 120782, 'total_max': 301812, 'total_peptide_ions': 50302, 'total_runs': 6, 'total_missing_ratio': 0.4001895219540641, 'total_missing_percentage': 40.01895219540641}



                because they are implicit. Only after providing the appropriate parameter file,

                fixed modifications will be added correctly.
                

                because they are implicit. Only after providing the appropriate parameter file,

                fixed modifications will be added correctly.
                


{'total_missing': 65242, 'total_max': 302034, 'total_peptide_ions': 50339, 'total_runs': 6, 'total_missing_ratio': 0.2160087937119662, 'total_missing_percentage': 21.600879371196623}
{'total_missing': 189073, 'total_max': 467694, 'total_peptide_ions': 77949, 'total_runs': 6, 'total_missing_ratio': 0.40426646482529177, 'total_missing_percentage': 40.42664648252918}
{'total_missing': 148403, 'total_max': 359208, 'total_peptide_ions': 59868, 'total_runs': 6, 'total_missing_ratio': 0.41313946237277566, 'total_missing_percentage': 41.31394623727756}



                because they are implicit. Only after providing the appropriate parameter file,

                fixed modifications will be added correctly.
                

                because they are implicit. Only after providing the appropriate parameter file,

                fixed modifications will be added correctly.
                


{'total_missing': 62988, 'total_max': 298074, 'total_peptide_ions': 49679, 'total_runs': 6, 'total_missing_ratio': 0.21131665291169308, 'total_missing_percentage': 21.13166529116931}



                because they are implicit. Only after providing the appropriate parameter file,

                fixed modifications will be added correctly.
                

                because they are implicit. Only after providing the appropriate parameter file,

                fixed modifications will be added correctly.
                


{'total_missing': 63842, 'total_max': 302028, 'total_peptide_ions': 50338, 'total_runs': 6, 'total_missing_ratio': 0.21137775305600806, 'total_missing_percentage': 21.137775305600805}


  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")


{'total_missing': 63705, 'total_max': 396804, 'total_peptide_ions': 66134, 'total_runs': 6, 'total_missing_ratio': 0.16054525660023589, 'total_missing_percentage': 16.05452566002359}



                because they are implicit. Only after providing the appropriate parameter file,

                fixed modifications will be added correctly.
                

                because they are implicit. Only after providing the appropriate parameter file,

                fixed modifications will be added correctly.
                


{'total_missing': 120914, 'total_max': 302028, 'total_peptide_ions': 50338, 'total_runs': 6, 'total_missing_ratio': 0.4003403657938999, 'total_missing_percentage': 40.03403657938999}
{'total_missing': 116654, 'total_max': 409746, 'total_peptide_ions': 68291, 'total_runs': 6, 'total_missing_ratio': 0.2846983253039688, 'total_missing_percentage': 28.469832530396882}
{'total_missing': 53205, 'total_max': 438264, 'total_peptide_ions': 73044, 'total_runs': 6, 'total_missing_ratio': 0.12139943048025847, 'total_missing_percentage': 12.139943048025847}
{'total_missing': 156106, 'total_max': 433632, 'total_peptide_ions': 72272, 'total_runs': 6, 'total_missing_ratio': 0.35999649472363665, 'total_missing_percentage': 35.999649472363664}
{'total_missing': 170108, 'total_max': 401208, 'total_peptide_ions': 66868, 'total_runs': 6, 'total_missing_ratio': 0.423989551554306, 'total_missing_percentage': 42.3989551554306}
Directory temp_results\45fbb2dc2468abd6dbaec4520c2a0e96cb7f7e0f is empty.
Directory


                because they are implicit. Only after providing the appropriate parameter file,

                fixed modifications will be added correctly.
                

                because they are implicit. Only after providing the appropriate parameter file,

                fixed modifications will be added correctly.
                


{'total_missing': 119632, 'total_max': 298494, 'total_peptide_ions': 49749, 'total_runs': 6, 'total_missing_ratio': 0.40078527541592124, 'total_missing_percentage': 40.07852754159212}
{'total_missing': 50950, 'total_max': 400008, 'total_peptide_ions': 66668, 'total_runs': 6, 'total_missing_ratio': 0.12737245255094898, 'total_missing_percentage': 12.737245255094898}



                because they are implicit. Only after providing the appropriate parameter file,

                fixed modifications will be added correctly.
                

                because they are implicit. Only after providing the appropriate parameter file,

                fixed modifications will be added correctly.
                


{'total_missing': 120829, 'total_max': 302160, 'total_peptide_ions': 50360, 'total_runs': 6, 'total_missing_ratio': 0.39988416732856763, 'total_missing_percentage': 39.98841673285676}



                because they are implicit. Only after providing the appropriate parameter file,

                fixed modifications will be added correctly.
                

                because they are implicit. Only after providing the appropriate parameter file,

                fixed modifications will be added correctly.
                


{'total_missing': 64266, 'total_max': 304404, 'total_peptide_ions': 50734, 'total_runs': 6, 'total_missing_ratio': 0.21112074742776046, 'total_missing_percentage': 21.112074742776045}
{'total_missing': 105110, 'total_max': 401208, 'total_peptide_ions': 66868, 'total_runs': 6, 'total_missing_ratio': 0.2619838088971307, 'total_missing_percentage': 26.198380889713068}



                because they are implicit. Only after providing the appropriate parameter file,

                fixed modifications will be added correctly.
                

                because they are implicit. Only after providing the appropriate parameter file,

                fixed modifications will be added correctly.
                


{'total_missing': 63842, 'total_max': 302034, 'total_peptide_ions': 50339, 'total_runs': 6, 'total_missing_ratio': 0.211373553970745, 'total_missing_percentage': 21.1373553970745}



                because they are implicit. Only after providing the appropriate parameter file,

                fixed modifications will be added correctly.
                

                because they are implicit. Only after providing the appropriate parameter file,

                fixed modifications will be added correctly.
                


{'total_missing': 121483, 'total_max': 307158, 'total_peptide_ions': 51193, 'total_runs': 6, 'total_missing_ratio': 0.3955065471190723, 'total_missing_percentage': 39.55065471190723}
{'total_missing': 144945, 'total_max': 348540, 'total_peptide_ions': 58090, 'total_runs': 6, 'total_missing_ratio': 0.4158633155448442, 'total_missing_percentage': 41.58633155448442}
{'total_missing': 147161, 'total_max': 386112, 'total_peptide_ions': 64352, 'total_runs': 6, 'total_missing_ratio': 0.3811355254433947, 'total_missing_percentage': 38.11355254433947}



                because they are implicit. Only after providing the appropriate parameter file,

                fixed modifications will be added correctly.
                

                because they are implicit. Only after providing the appropriate parameter file,

                fixed modifications will be added correctly.
                


{'total_missing': 120910, 'total_max': 302022, 'total_peptide_ions': 50337, 'total_runs': 6, 'total_missing_ratio': 0.4003350749283165, 'total_missing_percentage': 40.033507492831646}



                because they are implicit. Only after providing the appropriate parameter file,

                fixed modifications will be added correctly.
                

                because they are implicit. Only after providing the appropriate parameter file,

                fixed modifications will be added correctly.
                


{'total_missing': 121327, 'total_max': 291372, 'total_peptide_ions': 48562, 'total_runs': 6, 'total_missing_ratio': 0.4163989676427385, 'total_missing_percentage': 41.63989676427385}
Directory temp_results\bc6d394a74fdfe29e4af7c0154a3732037d64759 is empty.
{'total_missing': 55932, 'total_max': 440280, 'total_peptide_ions': 73380, 'total_runs': 6, 'total_missing_ratio': 0.12703733987462523, 'total_missing_percentage': 12.703733987462524}
{'total_missing': 46489, 'total_max': 390300, 'total_peptide_ions': 65050, 'total_runs': 6, 'total_missing_ratio': 0.11911094030233153, 'total_missing_percentage': 11.911094030233153}



                because they are implicit. Only after providing the appropriate parameter file,

                fixed modifications will be added correctly.
                

                because they are implicit. Only after providing the appropriate parameter file,

                fixed modifications will be added correctly.
                


{'total_missing': 63467, 'total_max': 302160, 'total_peptide_ions': 50360, 'total_runs': 6, 'total_missing_ratio': 0.2100443473656341, 'total_missing_percentage': 21.00443473656341}



                because they are implicit. Only after providing the appropriate parameter file,

                fixed modifications will be added correctly.
                

                because they are implicit. Only after providing the appropriate parameter file,

                fixed modifications will be added correctly.
                


{'total_missing': 122169, 'total_max': 308070, 'total_peptide_ions': 51345, 'total_runs': 6, 'total_missing_ratio': 0.3965624695686045, 'total_missing_percentage': 39.65624695686045}
{'total_missing': 67326, 'total_max': 495198, 'total_peptide_ions': 82533, 'total_runs': 6, 'total_missing_ratio': 0.13595773811687445, 'total_missing_percentage': 13.595773811687446}
{'total_missing': 143518, 'total_max': 292656, 'total_peptide_ions': 48776, 'total_runs': 6, 'total_missing_ratio': 0.4903982833087311, 'total_missing_percentage': 49.03982833087311}
{'total_missing': 44737, 'total_max': 357654, 'total_peptide_ions': 59609, 'total_runs': 6, 'total_missing_ratio': 0.1250845789506059, 'total_missing_percentage': 12.508457895060591}



                because they are implicit. Only after providing the appropriate parameter file,

                fixed modifications will be added correctly.
                

                because they are implicit. Only after providing the appropriate parameter file,

                fixed modifications will be added correctly.
                


{'total_missing': 122222, 'total_max': 307914, 'total_peptide_ions': 51319, 'total_runs': 6, 'total_missing_ratio': 0.3969355079665101, 'total_missing_percentage': 39.69355079665101}
{'total_missing': 145148, 'total_max': 349122, 'total_peptide_ions': 58187, 'total_runs': 6, 'total_missing_ratio': 0.41575151379746905, 'total_missing_percentage': 41.575151379746906}


In [13]:
# One row per key of performance_dict, one column per collected metric
performance_df = pd.DataFrame(performance_dict).transpose()

In [17]:
import plotly.express as px

# Expose the index as a regular column so it can serve as the hover title of each point
performance_df["IDX"] = performance_df.index

# Creating an interactive scatter plot using Plotly:
# number of peptide ions (x) against the percentage of missing values (y).
fig = px.scatter(
    performance_df,
    x="total_peptide_ions",
    y="total_missing_percentage",
    hover_name="IDX",
    # Human-readable names for the plotted columns, including the x-axis column
    labels={
        "total_peptide_ions": "Total Peptide Ions",
        "total_missing": "Total Missing",
        "total_missing_percentage": "Total Missing Percentage",
    },
    title="Interactive Scatter Plot of Missing Data",
)

fig.update_layout(hovermode="closest")
fig.show()

In [19]:
import pandas as pd
import plotly.express as px

# Copy the index into a column so each point can be titled with it on hover
performance_df["IDX"] = performance_df.index

# Display names for the two plotted columns
axis_labels = {
    "total_missing": "Total Missing",
    "total_missing_percentage": "Total Missing Percentage",
}

# Interactive Plotly scatter: absolute number of missing values (x)
# against the percentage of missing values (y)
fig = px.scatter(
    data_frame=performance_df,
    x="total_missing",
    y="total_missing_percentage",
    hover_name="IDX",
    labels=axis_labels,
    title="Interactive Scatter Plot of Missing Data",
)

fig.update_layout(hovermode="closest")
fig.show()