# DIA Astral Module Testing and Visualization

This notebook tests the new DIA Astral protein-group quantification module and provides visualizations of the results.

In [1]:
# Check the current working directory
import os
import sys

print("üìÇ Current working directory:", os.getcwd())
print("üìç Notebook file location:", "/Users/locard/Documents/GitHub/ProteoBench/jupyter_notebooks/dev_tests/test_astral_module_notebook.ipynb")
print("üêç Python executable:", sys.executable)
print("üìÅ ProteoBench root directory:", os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd()))))

# List files in current directory to see what's here
print("\nüìã Files in current directory:")
for item in sorted(os.listdir(".")):
    print(f"  ‚Ä¢ {item}")

üìÇ Current working directory: /Users/locard/Documents/GitHub/ProteoBench/jupyter_notebooks/dev_tests
üìç Notebook file location: /Users/locard/Documents/GitHub/ProteoBench/jupyter_notebooks/dev_tests/test_astral_module_notebook.ipynb
üêç Python executable: /Users/locard/anaconda3/envs/ProteoBench-jupyter/bin/python
üìÅ ProteoBench root directory: /Users/locard/Documents/GitHub

üìã Files in current directory:
  ‚Ä¢ test_astral_module_notebook.ipynb


In [2]:
#!/usr/bin/env python3
"""
Test script for the DIA Astral module with visualizations.
"""

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict

# Set plotting style
plt.style.use('default')
sns.set_palette("husl")

print("üì¶ Libraries imported successfully!")

üì¶ Libraries imported successfully!


## Import ProteoBench Module

Import the DIA Astral protein groups module and set up the configuration.

In [3]:
# Import your module
from proteobench.modules.quant.quant_lfq_proteingroup_DIA_Astral import DIAQuantProteingroupModuleAstral

print("üîß ProteoBench modules imported successfully!")

üîß ProteoBench modules imported successfully!


## Configuration

Set up the test parameters and file paths.

In [4]:
# Configuration
token = "dummy_token_for_testing"  # Replace with real token if needed

# Test data - replace these paths with your actual test files
input_file = "/Users/locard/Documents/Projets_en_cours/2022_ProteoBench/Dev/20260219_Module_PG/test_data_Robbe/diann_23._defaultpg_matrix_cleanHeaders.tsv"
input_format = "DIA-NN"

# Sample user input configuration -> FAKE VALUES, replace with actual user input as needed
user_input = {
    "software_name": "DIA-NN",
    "software_version": "1.0",
    "search_engine_version": "1.0", 
    "search_engine": "DIA-NN",
    "ident_fdr_peptide": 0.01,
    "ident_fdr_psm": 0.01,
    "ident_fdr_protein": 0.01,
    "enable_match_between_runs": 1,
    "enzyme": "Trypsin",
    "allowed_miscleavages": 2,
    "min_peptide_length": 6,
    "max_peptide_length": 40,
    "precursor_mass_tolerance": 20,
    "fragment_mass_tolerance": 20,
}

print("‚úÖ Configuration set!")
print(f"üìÅ Input file: {input_file}")
print(f"üîß Input format: {input_format}")

‚úÖ Configuration set!
üìÅ Input file: /Users/locard/Documents/Projets_en_cours/2022_ProteoBench/Dev/20260219_Module_PG/test_data_Robbe/diann_23._defaultpg_matrix_cleanHeaders.tsv
üîß Input format: DIA-NN


## Initialize Core Components (Git-Free Approach)

Since the full module requires Git repository access, we'll test the core functionality using individual components. This approach lets you:

- ‚úÖ Test your DIA Astral processing pipeline  
- ‚úÖ Validate input parsing and quantification
- ‚úÖ Generate visualizations and metrics
- ‚ùå Skip Git repository operations (pull requests, data submission)

In [5]:
# Since the full module requires Git, let's use the core components independently

# Import the individual components we need
from proteobench.io.parsing.parse_proteingroup import load_input_file
from proteobench.io.parsing.parse_settings import ParseSettingsBuilder
from proteobench.score.quantscores import QuantScoresHYE
from proteobench.datapoint.quant_datapoint import QuantDatapointHYE

print("üöÄ Core components imported successfully!")
print("üìã We can now test the functionality without Git dependencies")

üöÄ Core components imported successfully!
üìã We can now test the functionality without Git dependencies


In [6]:
# Test the core functionality step by step (without Git)
try:
    print("üîÑ Step 1: Loading input file...")
    input_df = load_input_file(input_file, input_format)
    print(f"‚úÖ Input file loaded! Shape: {input_df.shape}")
    
    print("\nüìä First 3 rows (transposed for better readability):")
    display(input_df.head(3).T)  # Transpose to show columns as rows
    
    print("\nüîÑ Step 2: Setting up parse settings...")
    # Import the constants to get the correct parse settings directory
    import os
    from proteobench.modules.constants import MODULE_SETTINGS_DIRS
    
    module_id = "quant_lfq_DIA_proteingroup_Astral"
    parse_settings_dir = MODULE_SETTINGS_DIRS[module_id]
    
    print(f"üìÇ Parse settings directory: {parse_settings_dir}")
    print(f"üìÇ Directory exists: {os.path.exists(parse_settings_dir)}")
    
    # Try to create parse settings
    parse_settings_builder = ParseSettingsBuilder(
        parse_settings_dir=parse_settings_dir,
        module_id=module_id
    )
    parse_settings = parse_settings_builder.build_parser(input_format)
    print("‚úÖ Parse settings created!")
    
    print("\nüîÑ Step 3: Converting to standard format...")
    standard_format, replicate_to_raw = parse_settings.convert_to_standard_format(input_df) 
    print(f"‚úÖ Standard format created! Shape: {standard_format.shape}")
    ## Display the first few rows of the standard format
    print("\nüìä First 3 rows of standard format (transposed):")
    display(standard_format.head(3).T)  # Transpose to show columns as rows
    ## save the standard format to a temporary file to check it
    temp_standard_format_path = "temp_standard_format.tsv"
    standard_format.to_csv(temp_standard_format_path, sep="\t", index=False)
    print(f"‚úÖ Standard format saved to {temp_standard_format_path}")

except Exception as e:
    print(f"‚ùå Workflow failed at some step: {e}")
    import traceback
    traceback.print_exc()

üîÑ Step 1: Loading input file...
‚úÖ Input file loaded! Shape: (11448, 12)

üìä First 3 rows (transposed for better readability):


Unnamed: 0,0,1,2
Protein.Group,sp|A0A024RBG1|NUD4B_HUMAN,sp|A0A096LP01|SIM26_HUMAN,sp|A0A0B4J2D5|GAL3B_HUMAN;sp|P0DPI2|GAL3A_HUMAN
Protein.Names,NUD4B_HUMAN,SIM26_HUMAN,GAL3A_HUMAN;GAL3B_HUMAN
Genes,,,
First.Protein.Description,,,
N.Sequences,5,4,10
N.Proteotypic.Sequences,1,4,0
LFQ_Astral_DIA_15min_50ng_Condition_A_REP1,5510000.0,303819.0,6480000.0
LFQ_Astral_DIA_15min_50ng_Condition_A_REP2,4700000.0,370917.0,6130000.0
LFQ_Astral_DIA_15min_50ng_Condition_A_REP3,4850000.0,324628.0,6310000.0
LFQ_Astral_DIA_15min_50ng_Condition_B_REP1,5670000.0,242392.0,6290000.0



üîÑ Step 2: Setting up parse settings...
üìÇ Parse settings directory: /Users/locard/Documents/GitHub/ProteoBench/proteobench/io/parsing/io_parse_settings/Quant/lfq/DIA/proteingroup/Astral
üìÇ Directory exists: True
‚úÖ Parse settings created!

üîÑ Step 3: Converting to standard format...
‚úÖ Standard format created! Shape: (64096, 20)

üìä First 3 rows of standard format (transposed):


Unnamed: 0,0,1,2
N.Proteotypic.Sequences,1,4,0
Protein.Names,NUD4B_HUMAN,SIM26_HUMAN,GAL3A_HUMAN;GAL3B_HUMAN
ECOLI,False,False,False
First.Protein.Description,,,
MULTI_SPEC,False,False,False
Genes,,,
contaminant,False,False,False
YEAST,False,False,False
N.Sequences,5,4,10
HUMAN,True,True,True


‚úÖ Standard format saved to temp_standard_format.tsv


In [None]:

try:
    
    print("\nüîÑ Step 4: Computing quantification scores...")
    quant_score = QuantScoresHYE(
        "Proteins",  # Use Proteins for protein-group level,
        parse_settings.species_expected_ratio(),
        parse_settings.species_dict()
    )
    intermediate_df = quant_score.generate_intermediate(standard_format, replicate_to_raw)
    print(f"‚úÖ Quantification scores computed! Shape: {intermediate_df.shape}")
    ## Display the first few rows of the intermediate dataframe
    print("\nüìä First 3 rows of intermediate dataframe (transposed):")
    display(intermediate_df.head(3).T)  # Transpose to show columns as rows
    ## save the intermediate format to a temporary file to check it
    temp_intermediate_format_path = "temp_intermediate_format.tsv"
    intermediate_df.to_csv(temp_intermediate_format_path, sep="\t", index=False)
    print(f"‚úÖ Intermediate format saved to {temp_intermediate_format_path}")

except Exception as e:
    print(f"‚ùå Workflow failed at some step: {e}")
    import traceback
    traceback.print_exc()

SyntaxError: positional argument follows keyword argument (1100459363.py, line 8)

In [None]:
try:
    
    print("\nüîÑ Step 5: Generating datapoint...")
    current_datapoint = QuantDatapointHYE.generate_datapoint(
        intermediate_df, input_format, user_input, default_cutoff_min_feature=3
    )
    print("‚úÖ Datapoint generated!")
    
    # Create a simple DataFrame for all_datapoints
    all_datapoints = pd.DataFrame([current_datapoint])
    
    print(f"\nüéâ Complete workflow successful!")
    print(f"üìä Intermediate dataframe shape: {intermediate_df.shape}")
    print(f"üìà All datapoints shape: {all_datapoints.shape}")
    print(f"üìÑ Input dataframe shape: {input_df.shape}")
    ## Display the first few rows of all_datapoints
    print("\nüìä First 3 rows of input_df (transposed)")
    display(input_df.head(3).T)  # Transpose to show columns as rows:
    ## Print all values in all_datapoints
    print("\nüìä all_datapoints")
    display(all_datapoints)
    
    # Save the results for inspection
    temp_all_datapoints_path = "temp_all_datapoints.csv"
    all_datapoints.to_csv(temp_all_datapoints_path, index=False)
    print(f"‚úÖ All datapoints saved to {temp_all_datapoints_path}")
    # Set all datapoints to old
    all_datapoints["old_new"] = "old"
    
    temp_intermediate_format_path = "temp_intermediate_format.tsv"
    intermediate_df.to_csv(temp_intermediate_format_path, sep="\t", index=False)
    print(f"‚úÖ Intermediate data saved to {temp_intermediate_format_path}")
    
except Exception as e:
    print(f"‚ùå Workflow failed at some step: {e}")
    import traceback
    traceback.print_exc()

# ProteoBench Plotting and Visualization

Now let's use ProteoBench's built-in plotting capabilities to create publication-ready visualizations!

## Import ProteoBench Plotting Modules

Let's import ProteoBench's specialized plotting functions for quantification analysis:

In [None]:
# Import ProteoBench's LFQHYEPlotGenerator
from proteobench.plotting.plot_generator_lfq_HYE import LFQHYEPlotGenerator

print("‚úÖ LFQHYEPlotGenerator imported successfully!")

# Create an instance of the plot generator
plot_generator = LFQHYEPlotGenerator()
print("üé® LFQHYEPlotGenerator instance created!")

# Display information about available plotting methods
print("\nüìä Available LFQHYEPlotGenerator methods:")
print("  ‚Ä¢ generate_in_depth_plots() - Create standard LFQ HYE plots")
print("  ‚Ä¢ plot_main_metric() - Generate main performance metric plot") 
print("  ‚Ä¢ get_in_depth_plot_layout() - Get plot layout configuration")
print("  ‚Ä¢ get_in_depth_plot_descriptions() - Get plot descriptions")

## 1. Main Quantification Plot

Create the primary ProteoBench quantification visualization:

In [None]:
# Create the main ProteoBench quantification plot using plot_main_metric
try:
    print("üé® Generating main ProteoBench quantification plot...")
    
    # Since we don't have a complete module instance here, we'll create a mock one for testing
    # In real usage, you would get this from your module instance
    
    # Reload the module to pick up changes (needed when module classes are modified)
    import importlib
    import proteobench.modules.quant.quant_lfq_proteingroup_DIA_Astral
    importlib.reload(proteobench.modules.quant.quant_lfq_proteingroup_DIA_Astral)
    
    from proteobench.modules.quant.quant_lfq_proteingroup_DIA_Astral import DIAQuantProteingroupModuleAstral
    
    # Create a mock module to get the y_axis_title
    token = "dummy"  # We won't use GitHub features
    mock_module = DIAQuantProteingroupModuleAstral(token=token, use_github=False)
    
    # Check if y_axis_title exists, provide fallback
    y_axis_title = getattr(mock_module, 'y_axis_title', 
                          "Total number of protein groups quantified in the selected number of raw files")
    
    print(f"üìå Mock module y_axis_title: {y_axis_title}")
    
    # Generate the main metric plot using the module's y_axis_title
    fig_main = plot_generator.plot_main_metric(
        benchmark_metrics_df=all_datapoints,
        metric="Median",  # Can be "Median" or "Mean"
        label="None",     # Use "None" for no labels, or try "id" or "software_name" for column-based labels
        mode="Global",    # Can be "Global" or "Species-weighted" 
        annotation="Test Run",
        feature_column_name="nr_feature",
        y_axis_title=y_axis_title,  # Use the module's y_axis_title
    )
    
    # Display the plot
    fig_main.show()
    
    print("‚úÖ Main ProteoBench quantification plot generated successfully!")
    print(f"üìä Y-axis title: {y_axis_title}")
    
    # You can also try different combinations:
    print("\nüìä Generating alternative versions...")
    
    # Mean with Species-weighted approach - using "id" for labels this time
    fig_mean = plot_generator.plot_main_metric(
        benchmark_metrics_df=all_datapoints,
        metric="Mean", 
        mode="Species-weighted",
        label="id",  # Use the "id" column for labels
        annotation="Alternative View",
        feature_column_name="nr_feature",
        y_axis_title=y_axis_title,  # Use the module's y_axis_title
    )
    fig_mean.show()
    
    print("‚úÖ Alternative plots generated successfully!")
    
except Exception as e:
    print(f"‚ùå Error generating main metric plot: {e}")
    import traceback
    traceback.print_exc()

## 2. In depth plots

In [None]:
# Generate the standard ProteoBench in-depth plots
try:
    print("üé® Generating ProteoBench standard plots...")
    
    # Generate all the standard LFQ HYE plots
    plots = plot_generator.generate_in_depth_plots(
        performance_data=intermediate_df,
        parse_settings=parse_settings
    )
    
    print(f"‚úÖ Generated {len(plots)} ProteoBench plots:")
    for plot_name in plots.keys():
        print(f"  ‚Ä¢ {plot_name}")
    
    # Display each plot
    plot_descriptions = plot_generator.get_in_depth_plot_descriptions()
    
    for plot_name, fig in plots.items():
        print(f"\nüìä {plot_name.upper()} PLOT:")
        print(f"üìù Description: {plot_descriptions.get(plot_name, 'No description available')}")
        
        # Show the plot
        fig.show()
    
    print("\n‚úÖ All ProteoBench standard plots generated successfully!")
    
except Exception as e:
    print(f"‚ùå Error generating ProteoBench plots: {e}")
    import traceback
    traceback.print_exc()
    
    # Fallback to basic plotting if needed
    print("\n‚ö†Ô∏è  Falling back to basic plotting...")
    
    # Create a simple fold change plot as fallback
    import plotly.express as px
    
    if 'log2_A_vs_B' in intermediate_df.columns and 'species' in intermediate_df.columns:
        fig = px.histogram(
            intermediate_df, 
            x='log2_A_vs_B', 
            color='species',
            title='Fold Change Distribution by Species',
            nbins=50,
            opacity=0.7
        )
        fig.update_layout(
            xaxis_title='Log2 Fold Change (A vs B)',
            yaxis_title='Count'
        )
        fig.show()
        print("‚úÖ Basic fold change plot created as fallback!")