### Load libraries

In [12]:

import sys
sys.path.insert(0, '../src/')

from load_modify_sample_utils import load_model, get_objective_functions, get_reaction_bounds, modify_model
from load_modify_sample_utils import sample_dingo, sample_optgp

from distributions_comparison_utils import significantly_altered_reactions

from pathways_utils import sort_reactions_in_pathways_by_reactions_in_model_order, subset_sampling_df_from_reaction_ids
from pathways_utils import map_model_to_kegg_reactions, read_json_file, bigg_to_kegg_id, fill_missing_kegg_ids_in_dict
from pathways_utils import get_kegg_pathways_from_reaction_ids, subset_model_reactions_from_pathway_info
from pathways_utils import dictionary_bigg_id_to_pathway_names, dictionary_forward_reverse_bigg_id_to_pathway_names
from pathways_utils import reaction_in_pathway_binary_matrix, plot_reaction_in_pathway_heatmap


### Load model

In [3]:
# Load the E. coli core SBML model. `load_model` returns four objects that are unpacked here;
# presumably a cobra model, its reaction list, a dingo model and its reaction list
# (inferred from the variable names — confirm against load_modify_sample_utils).
ec_cobra_model, ec_cobra_reactions, ec_dingo_model, ec_dingo_reactions = load_model("../ext_data/models/e_coli_core.xml")

# Objective function(s) of the cobra model, printed for inspection.
objective_functions = get_objective_functions(ec_cobra_model)
print(objective_functions)

# Bounds of the unmodified model, printed for inspection
# (the printed output is a dict of reaction ID -> (lower, upper)).
default_reaction_bounds = get_reaction_bounds(ec_cobra_model)
print(default_reaction_bounds)


Set parameter Username
Set parameter LicenseID to value 2642044
Academic license - for non-commercial use only - expires 2026-03-25
['BIOMASS_Ecoli_core_w_GAM']
{'PFK': (0.0, 1000.0), 'PFL': (0.0, 1000.0), 'PGI': (-1000.0, 1000.0), 'PGK': (-1000.0, 1000.0), 'PGL': (0.0, 1000.0), 'ACALD': (-1000.0, 1000.0), 'AKGt2r': (-1000.0, 1000.0), 'PGM': (-1000.0, 1000.0), 'PIt2r': (-1000.0, 1000.0), 'ALCD2x': (-1000.0, 1000.0), 'ACALDt': (-1000.0, 1000.0), 'ACKr': (-1000.0, 1000.0), 'PPC': (0.0, 1000.0), 'ACONTa': (-1000.0, 1000.0), 'ACONTb': (-1000.0, 1000.0), 'ATPM': (8.39, 1000.0), 'PPCK': (0.0, 1000.0), 'ACt2r': (-1000.0, 1000.0), 'PPS': (0.0, 1000.0), 'ADK1': (-1000.0, 1000.0), 'AKGDH': (0.0, 1000.0), 'ATPS4r': (-1000.0, 1000.0), 'PTAr': (-1000.0, 1000.0), 'PYK': (0.0, 1000.0), 'BIOMASS_Ecoli_core_w_GAM': (0.0, 1000.0), 'PYRt2': (-1000.0, 1000.0), 'CO2t': (-1000.0, 1000.0), 'RPE': (-1000.0, 1000.0), 'CS': (0.0, 1000.0), 'RPI': (-1000.0, 1000.0), 'SUCCt2_2': (0.0, 1000.0), 'CYTBD': (0.0, 1000.

### Modify model and sample

In [None]:

# Condition "100": model modified with optimal_percentage=100 for the biomass objective.
# NOTE(review): the exact meaning of optimal_percentage is defined in modify_model —
# presumably the percentage of the optimal objective value that is enforced; confirm.
ec_cobra_model_condition_100, ec_dingo_model_condition_100 = modify_model(ec_cobra_model, objective_function="BIOMASS_Ecoli_core_w_GAM", optimal_percentage=100)

# Condition "0": same objective, optimal_percentage=0.
ec_cobra_model_condition_0, ec_dingo_model_condition_0 = modify_model(ec_cobra_model, objective_function="BIOMASS_Ecoli_core_w_GAM", optimal_percentage=0)


# OptGP sampling on the cobra models: 3000 samples with thinning=100 per condition.
# NOTE(review): no random seed is passed here, so repeated runs may give different samples.
samples_optgp_condition_100 = sample_optgp(ec_cobra_model_condition_100, n_samples = 3000, thinning=100, reaction_in_rows = True)
samples_optgp_condition_0 = sample_optgp(ec_cobra_model_condition_0, n_samples = 3000, thinning=100, reaction_in_rows = True)


# dingo sampling on the dingo models with ess=2000 per condition
# (presumably the target effective sample size — confirm in sample_dingo).
samples_dingo_condition_100 = sample_dingo(ec_dingo_model_condition_100, reaction_in_rows = True, ess=2000)
samples_dingo_condition_0 = sample_dingo(ec_dingo_model_condition_0, reaction_in_rows = True, ess=2000)



Read LP format model from file /tmp/tmpnk19l_qi.lp
Reading time = 0.01 seconds
: 72 rows, 190 columns, 720 nonzeros
Read LP format model from file /tmp/tmpvo8ymhne.lp
Reading time = 0.00 seconds
: 72 rows, 190 columns, 720 nonzeros
Read LP format model from file /tmp/tmprp52wrnq.lp
Reading time = 0.00 seconds
: 72 rows, 190 columns, 720 nonzeros
Read LP format model from file /tmp/tmple2lg763.lp
Reading time = 0.00 seconds
: 72 rows, 190 columns, 720 nonzeros
phase 1: number of correlated samples = 500, effective sample size = 7, ratio of the maximum singilar value over the minimum singular value = 1458.08
phase 2: number of correlated samples = 500, effective sample size = 9, ratio of the maximum singilar value over the minimum singular value = 355.132
phase 3: number of correlated samples = 500, effective sample size = 4, ratio of the maximum singilar value over the minimum singular value = 149.989
phase 4: number of correlated samples = 500, effective sample size = 96, ratio of the 

[5]maximum marginal PSRF: 1.10543


phase 1: number of correlated samples = 500, effective sample size = 11, ratio of the maximum singilar value over the minimum singular value = 718.573
phase 2: number of correlated samples = 500, effective sample size = 159, ratio of the maximum singilar value over the minimum singular value = 2.62672
phase 3: number of correlated samples = 2400, effective sample size = 967
phase 4: number of correlated samples = 2200, effective sample size = 895
[5]total ess 2032: number of correlated samples = 5600




[5]maximum marginal PSRF: 1.00684


In [13]:
# Step 1: initial mapping for the model's reactions
# (presumably BiGG reaction ID -> KEGG reaction ID, judging by the variable name — confirm).
initial_bigg_to_kegg_dictionary = map_model_to_kegg_reactions(ec_cobra_model)


# Step 2: read the reactions JSON file; the helper returns two objects
# (named here as a JSON object and a pandas object).
reactions_json, reactions_pandas = read_json_file("../ext_data/reactions/reactions.json")


# Step 3: complete the initial mapping using the pandas object read above.
final_bigg_to_kegg_dictionary = fill_missing_kegg_ids_in_dict(initial_bigg_to_kegg_dictionary, reactions_pandas)


# Step 4: pathway information for the mapped reactions. A later cell reads the
# 'bigg_reaction' and 'pathway_names' columns of this result.
df_kegg_pathways = get_kegg_pathways_from_reaction_ids(final_bigg_to_kegg_dictionary)



### Compare distributions of reactions across conditions

In [29]:


# Sample sets to compare. The OptGP samples are used; the commented-out line below is the
# dingo alternative.
#conditions=[samples_dingo_condition_100, samples_dingo_condition_0]
conditions=[samples_optgp_condition_100, samples_optgp_condition_0]

# Pairs of conditions to compare (presumably indices into `conditions` — confirm in
# distributions_comparison_utils).
selected_comparisons = [(0, 1)]

# The first return value is discarded; the other two are kept as the reactions that did /
# did not differ significantly, using fold_change_cutoff=0.5.
_, significant_diff_reactions, not_significant_diff_reactions = significantly_altered_reactions(conditions, 
                                                                                                selected_comparisons, 
                                                                                                cobra_model=ec_cobra_model,
                                                                                                fold_change_cutoff = 0.5)


# Show the significantly altered reactions and how many there are.
print(significant_diff_reactions)
print(len(significant_diff_reactions))


['PFL', 'PGI', 'ACALD', 'AKGt2r', 'PIt2r', 'ALCD2x', 'ACALDt', 'ACKr', 'PPC', 'PPCK', 'ACt2r', 'PPS', 'ADK1', 'PTAr', 'PYK', 'BIOMASS_Ecoli_core_w_GAM', 'PYRt2', 'SUCCt2_2', 'D_LACt2', 'SUCCt3', 'ETOHt2r', 'THD2', 'EX_ac_e', 'EX_acald_e', 'EX_akg_e', 'EX_etoh_e', 'EX_for_e', 'EX_glu__L_e', 'EX_lac__D_e', 'EX_nh4_e', 'EX_pi_e', 'EX_pyr_e', 'EX_succ_e', 'FBP', 'FORt2', 'FORt', 'GLNS', 'GLUDy', 'GLUN', 'GLUSy', 'GLUt2r', 'ICL', 'LDH_D', 'MALS', 'MDH', 'ME1', 'ME2', 'NADTRHD', 'NH4t']
49


In [40]:

from scipy.stats import hypergeom
import pandas as pd
from statsmodels.stats.multitest import multipletests


def hypergeometric_pathway_enrichment(significant_reactions, all_reactions, reaction_to_pathways):
    """
    Perform a one-sided hypergeometric (over-representation) test per pathway.

    Parameters:
    - significant_reactions: set or list of reaction IDs with altered flux.
      IDs that are not part of `all_reactions` are dropped (a warning is issued),
      because counting them would let the overlap exceed the pathway size and
      inflate the number of significant reactions.
    - all_reactions: set or list of all reaction IDs considered in the analysis
      (the background).
    - reaction_to_pathways: dict mapping reaction ID -> list of pathway names.

    Returns:
    - DataFrame with columns 'pathway', 'overlap', 'pathway_size', 'significant',
      'total', 'pval' and 'fdr' (Benjamini-Hochberg adjusted), sorted by ascending
      'pval'. When no pathway has a reaction in the background, an empty DataFrame
      with the same columns is returned.
    """
    import warnings

    result_columns = ['pathway', 'overlap', 'pathway_size', 'significant', 'total', 'pval', 'fdr']

    # Convert inputs to sets for efficiency
    all_reactions = set(all_reactions)
    significant_reactions = set(significant_reactions)

    # Keep the test well-defined: every significant reaction must belong to the background.
    outside_background = significant_reactions - all_reactions
    if outside_background:
        warnings.warn(
            f"{len(outside_background)} significant reaction(s) are not in the background "
            f"and are ignored: {sorted(map(str, outside_background))}"
        )
        significant_reactions &= all_reactions

    # Build pathway -> reactions mapping
    pathway_to_reactions = {}
    for rxn, pathways in reaction_to_pathways.items():
        for pw in pathways:
            pathway_to_reactions.setdefault(pw, set()).add(rxn)

    results = []
    M = len(all_reactions)             # Total number of reactions
    n = len(significant_reactions)     # Number of significant reactions (within the background)

    for pathway, pathway_reactions in pathway_to_reactions.items():
        K = len(all_reactions & pathway_reactions)                      # Reactions in this pathway
        k = len(significant_reactions & pathway_reactions)              # Significant reactions in this pathway

        # Skip if the pathway has no reactions in the background
        if K == 0:
            continue

        # Hypergeometric test (sf: survival function = 1 - cdf, so sf(k - 1) = P(X >= k)).
        # The distribution is symmetric in its last two arguments, so passing
        # (n significant, K pathway size) in this order is equivalent to the reverse.
        pval = hypergeom.sf(k - 1, M, n, K)

        results.append({
            'pathway': pathway,
            'overlap': k,
            'pathway_size': K,
            'significant': n,
            'total': M,
            'pval': pval
        })

    # Nothing to test: return a frame that still exposes the documented columns
    if not results:
        return pd.DataFrame(columns=result_columns)

    # Multiple testing correction (Benjamini-Hochberg FDR)
    df = pd.DataFrame(results)
    df['fdr'] = multipletests(df['pval'], method='fdr_bh')[1]

    return df.sort_values('pval')


In [42]:


def build_reaction_to_pathways_dict(df, reaction_col='bigg_reaction', pathway_col='pathway_names'):
    """
    Collect, for every row of `df`, the reaction identifier and its pathway list.

    Parameters:
    - df: pandas DataFrame holding one reaction per row.
    - reaction_col: name of the column with the reaction identifiers.
    - pathway_col: name of the column with the pathway lists.

    Returns:
    - dict: {reaction_id: [pathway1, pathway2, ...]}. Rows whose pathway entry is
      not a list, or is an empty list, are left out. If a reaction identifier
      occurs in several rows, the last qualifying row wins.
    """
    # Walk the rows once, pulling out (reaction, pathways) pairs lazily
    row_pairs = (
        (record[reaction_col], record[pathway_col])
        for _, record in df.iterrows()
    )

    # Only non-empty lists are accepted as pathway annotations
    return {
        rxn_id: pathway_list
        for rxn_id, pathway_list in row_pairs
        if isinstance(pathway_list, list) and pathway_list
    }


# Build the reaction -> pathway-names mapping from the KEGG pathway table, using the
# function's default column names ('bigg_reaction' and 'pathway_names').
reaction_to_pathways = build_reaction_to_pathways_dict(df_kegg_pathways)
# Prints the whole mapping, which is long; the output is for inspection only.
print(reaction_to_pathways)



{'PFK': ['Glycolysis / Gluconeogenesis', 'Pentose phosphate pathway', 'Fructose and mannose metabolism', 'Methane metabolism', 'Metabolic pathways', 'Biosynthesis of secondary metabolites', 'Microbial metabolism in diverse environments', 'Carbon metabolism', 'Biosynthesis of amino acids'], 'PFL': ['Pyruvate metabolism', 'Butanoate metabolism', 'Metabolic pathways', 'Microbial metabolism in diverse environments'], 'PGI': ['Starch and sucrose metabolism', 'Metabolic pathways', 'Biosynthesis of secondary metabolites'], 'PGK': ['Glycolysis / Gluconeogenesis', 'Carbon fixation by Calvin cycle', 'Metabolic pathways', 'Biosynthesis of secondary metabolites', 'Microbial metabolism in diverse environments', 'Carbon metabolism', 'Biosynthesis of amino acids'], 'PGL': ['Pentose phosphate pathway', 'Metabolic pathways', 'Biosynthesis of secondary metabolites', 'Microbial metabolism in diverse environments', 'Carbon metabolism'], 'ACALD': ['Phenylalanine metabolism', 'Benzoate degradation', 'Pyruva

In [None]:

# Pathway enrichment of the significantly altered reactions. The background passed here is
# ec_dingo_reactions (the fourth value returned by load_model).
# NOTE(review): significant_diff_reactions was computed with cobra_model=ec_cobra_model;
# this assumes the dingo reaction list uses the same reaction IDs — confirm.
hypergeometric_pathway_enrichment_df = hypergeometric_pathway_enrichment(significant_diff_reactions, ec_dingo_reactions, reaction_to_pathways)
print(hypergeometric_pathway_enrichment_df)

                                         pathway  overlap  pathway_size  \
9                            Pyruvate metabolism       14            15   
31   Alanine, aspartate and glutamate metabolism        4             4   
32                           Nitrogen metabolism        3             3   
30                         Arginine biosynthesis        3             3   
19            Taurine and hypotaurine metabolism        2             2   
3                             Methane metabolism        6            10   
17             Degradation of aromatic compounds        1             1   
26                     Biosynthesis of cofactors        1             1   
16                            Xylene degradation        1             1   
15                            Dioxin degradation        1             1   
14                          Benzoate degradation        1             1   
13                      Phenylalanine metabolism        1             1   
34        Nicotinate and 

In [50]:

import numpy as np
import pandas as pd


def add_enrichment_metrics(hypergeometric_pathway_enrichment_df):
    """
    Add fold enrichment and -log10(p-value) columns to the enrichment results DataFrame.

    Parameters:
    - hypergeometric_pathway_enrichment_df: DataFrame from hypergeometric_pathway_enrichment;
      the columns 'overlap', 'pathway_size', 'significant', 'total' and 'pval' are read.

    Returns:
    - A copy of the input with two extra columns:
      'fold_enrichment' = (overlap / pathway_size) / (significant / total)
      'log10_pval'      = -log10(pval), with pval clipped at 1e-300 so that a
                          p-value of 0 does not produce infinity.
      The input DataFrame is not modified.
    """
    # Work on a copy of the argument (not on a notebook-level variable) so the
    # function is independent of kernel state and leaves the caller's frame untouched.
    enrichment_metrics_df = hypergeometric_pathway_enrichment_df.copy()

    # Fraction of the pathway that is significant vs. fraction of the background that is
    pathway_hit_rate = enrichment_metrics_df['overlap'] / enrichment_metrics_df['pathway_size']
    background_hit_rate = enrichment_metrics_df['significant'] / enrichment_metrics_df['total']
    enrichment_metrics_df['fold_enrichment'] = pathway_hit_rate / background_hit_rate

    # Column is named 'log10_pval' but stores the negated value (larger = more significant)
    enrichment_metrics_df['log10_pval'] = -np.log10(enrichment_metrics_df['pval'].clip(lower=1e-300))
    return enrichment_metrics_df


import plotly.express as px

def plot_pathway_enrichment(enrichment_metrics_df, pval_threshold=0.05, use_fdr=True):
    """
    Show a bubble plot of pathway enrichment results.

    Parameters:
    - enrichment_metrics_df: enrichment DataFrame that already carries the
      'fold_enrichment' and 'log10_pval' columns (plus 'pathway', 'overlap'
      and the column used for filtering).
    - pval_threshold: rows are kept only when the filtering column is strictly
      below this value.
    - use_fdr: filter on the 'fdr' column when True, on 'pval' otherwise.

    Returns:
    - None. The figure is rendered with fig.show(); it is not returned.
    """
    # Pick the column that decides which pathways are displayed
    significance_col = 'fdr' if use_fdr else 'pval'

    # Copy, filter on significance, then order by fold enrichment (largest first)
    plot_df = enrichment_metrics_df.copy()
    plot_df = plot_df[plot_df[significance_col] < pval_threshold]
    plot_df = plot_df.sort_values('fold_enrichment', ascending=False)

    axis_labels = {
        'fold_enrichment': 'Fold Enrichment',
        'log10_pval': '-log10(p-value)',
        'overlap': '# Reactions',
        'pathway': 'Pathway'
    }

    # Bubble size encodes the overlap, colour encodes -log10(p-value)
    fig = px.scatter(
        plot_df,
        x='fold_enrichment',
        y='pathway',
        size='overlap',
        color='log10_pval',
        color_continuous_scale='Reds',
        size_max=20,
        labels=axis_labels,
        title='Pathway Enrichment Bubble Plot'
    )

    # Reverse the y axis so the first row of the sorted frame is drawn at the top
    fig.update_layout(yaxis={'autorange': "reversed"})
    fig.show()



# Add fold enrichment and -log10(p-value) columns to the enrichment table.
enrichment_metrics_df = add_enrichment_metrics(hypergeometric_pathway_enrichment_df)

# pval_threshold=2 is above any possible p-value / adjusted p-value (both are at most 1),
# so the significance filter (on 'fdr', since use_fdr defaults to True) keeps every pathway.
plot_pathway_enrichment(enrichment_metrics_df, pval_threshold=2)

