### Load libraries and functions

In [None]:

import sys
sys.path.insert(0, '../src/')

from loopless_utils import loops_enumeration_from_fva

from load_modify_sample_utils import load_model, get_objective_functions, get_reaction_bounds, modify_model
from load_modify_sample_utils import sample_optgp

from correlations_utils import correlated_reactions, plot_correlation_matrix, split_forward_reverse
from correlations_utils import find_reactants_products, sharing_metabolites_square_matrix

from pathways_utils import dictionary_reaction_id_to_kegg_id, get_kegg_pathways_from_reaction_ids
from pathways_utils import map_model_to_kegg_reactions_dictionary, read_json_file, fill_missing_kegg_ids_in_initial_dictionary
from pathways_utils import subset_model_reactions_from_pathway_info, subset_sampling_array_from_reaction_ids, sort_reactions_by_model_order


### Load and inspect model (for more info see `load_modify_sample.ipynb`)

In [2]:

# Load the E. coli core model; `load_model` yields three values that are
# unpacked into the model, its reactions and the reaction ids.
(
    ec_cobra_model,
    ec_cobra_reactions,
    ec_cobra_reaction_ids,
) = load_model("../ext_data/models/e_coli_core.xml")

# Report the objective function(s) currently set on the unmodified model.
objective_functions = get_objective_functions(ec_cobra_model)
print(objective_functions)

# Report the default bounds of the biomass reaction, looked up by its id.
default_reaction_bounds = get_reaction_bounds(ec_cobra_model)
print(
    default_reaction_bounds.get("BIOMASS_Ecoli_core_w_GAM")
)


Set parameter Username
Set parameter LicenseID to value 2642044
Academic license - for non-commercial use only - expires 2026-03-25
['BIOMASS_Ecoli_core_w_GAM']
(0.0, 1000.0)


### Modify the model to create two different conditions (for more info see `load_modify_sample.ipynb`)

In [3]:

# Condition "100": biomass objective with optimal_percentage = 100.
ec_cobra_model_condition_100 = modify_model(
    cobra_model=ec_cobra_model,
    objective_function="BIOMASS_Ecoli_core_w_GAM",
    optimal_percentage=100,
    objective_direction="max",
)

# Inspect the objective and the biomass bounds of the modified model.
updated_objective_functions = get_objective_functions(ec_cobra_model_condition_100)
print(updated_objective_functions)

updated_reaction_bounds = get_reaction_bounds(ec_cobra_model_condition_100)
print(
    updated_reaction_bounds.get("BIOMASS_Ecoli_core_w_GAM")
)

# -----------

# Condition "0": same objective and direction, but optimal_percentage = 0.
ec_cobra_model_condition_0 = modify_model(
    cobra_model=ec_cobra_model,
    objective_function="BIOMASS_Ecoli_core_w_GAM",
    optimal_percentage=0,
    objective_direction="max",
)

# The two inspection variables are reused (overwritten) for this condition.
updated_objective_functions = get_objective_functions(ec_cobra_model_condition_0)
print(updated_objective_functions)

updated_reaction_bounds = get_reaction_bounds(ec_cobra_model_condition_0)
print(
    updated_reaction_bounds.get("BIOMASS_Ecoli_core_w_GAM")
)


Read LP format model from file /tmp/tmp1dd955dj.lp
Reading time = 0.01 seconds
: 72 rows, 190 columns, 720 nonzeros


['BIOMASS_Ecoli_core_w_GAM']
(0.872922, 1000)
Read LP format model from file /tmp/tmp0h9w8s_y.lp
Reading time = 0.00 seconds
: 72 rows, 190 columns, 720 nonzeros
['BIOMASS_Ecoli_core_w_GAM']
(0.0, 1000)


### Identify loopy reactions in model (for more info see `loopless.ipynb`)

In [5]:

# Condition 100: enumerate loop-participating reactions from FVA.
# Note the fraction_of_optimum used here is 0.999, not 1.
loopy_reactions_fva_100 = loops_enumeration_from_fva(
    ec_cobra_model_condition_100,
    fraction_of_optimum=0.999,
)
print(loopy_reactions_fva_100)

# Keep only the first field of every entry (the reaction id in the printed output).
loopy_reactions_100 = [fva_entry[0] for fva_entry in loopy_reactions_fva_100]
print(loopy_reactions_100)


# Condition 0: same enumeration with fraction_of_optimum = 0.
loopy_reactions_fva_0 = loops_enumeration_from_fva(
    ec_cobra_model_condition_0,
    fraction_of_optimum=0,
)
print(loopy_reactions_fva_0)

loopy_reactions_0 = [fva_entry[0] for fva_entry in loopy_reactions_fva_0]
print(loopy_reactions_0)


[('SUCDi', 994.7794007141792), ('FRD7', 995.0539767141795)]
['SUCDi', 'FRD7']
[('SUCDi', 980.0), ('FRD7', 1000.0)]
['SUCDi', 'FRD7']


### Remove loopy reactions from the 2 models created above to reduce the thermodynamically infeasible solutions from sampling

In [4]:

# Fix both bounds of reaction "FRD7" to zero in each condition-specific model,
# so the reaction can no longer carry flux during sampling.
for condition_model in (ec_cobra_model_condition_100, ec_cobra_model_condition_0):
    frd7_reaction = condition_model.reactions.get_by_id("FRD7")
    frd7_reaction.bounds = (0, 0)


### Perform sampling on the modified models with the loopy reaction "FRD7" removed (for more info see `load_modify_sample.ipynb`)

In [5]:

# Sample both condition-specific models with identical settings.
# NOTE(review): no random seed is passed, so repeated runs presumably give
# different samples — confirm whether `sample_optgp` accepts a seed.
samples_optgp_condition_100 = sample_optgp(
    ec_cobra_model_condition_100,
    n_samples=3000,
    thinning=100,
    reaction_in_rows=True,
)


samples_optgp_condition_0 = sample_optgp(
    ec_cobra_model_condition_0,
    n_samples=3000,
    thinning=100,
    reaction_in_rows=True,
)


Read LP format model from file /tmp/tmp34045yhi.lp
Reading time = 0.00 seconds
: 72 rows, 190 columns, 720 nonzeros


Read LP format model from file /tmp/tmp591qhwxi.lp
Reading time = 0.01 seconds
: 72 rows, 190 columns, 720 nonzeros


### Map KEGG pathway information to reactions BiGG ids  

Functions that are used to map information from KEGG pathways to the model reactions (BiGG/SEED ids). For more info see `pathways.ipynb`

In [6]:

# Step 1: initial BiGG -> KEGG dictionary derived from the model.
initial_bigg_to_kegg_dictionary = map_model_to_kegg_reactions_dictionary(ec_cobra_model)

# Step 2: read the external reactions file (returned in two representations).
reactions_json, reactions_pandas = read_json_file(
    "../ext_data/reactions/reactions.json"
)

# Step 3: build the BiGG -> KEGG and SEED -> KEGG lookups from that table.
bigg_to_kegg, seed_to_kegg = dictionary_reaction_id_to_kegg_id(reactions_pandas)

# Step 4: complete the initial dictionary using the lookups (BiGG model type).
final_bigg_to_kegg_dictionary = fill_missing_kegg_ids_in_initial_dictionary(
    initial_bigg_to_kegg_dictionary,
    modeltype="BiGG",
    bigg_to_kegg=bigg_to_kegg,
    seed_to_kegg=seed_to_kegg,
)

# Step 5: retrieve KEGG pathway information for the mapped reactions.
df_kegg_pathways = get_kegg_pathways_from_reaction_ids(final_bigg_to_kegg_dictionary)


### From now on, to better illustrate functions related to computing pairwise correlations, we will focus only on certain subsystems of the model (selected pathways of interest).

We will subset the sampling array to include only reactions from Glycolysis, Gluconeogenesis and the Pentose phosphate pathway. For more information on how these functions work see: `load_modify_sample.ipynb`

In [None]:

# Reactions belonging to each KEGG pathway of interest.
Glycolysis = subset_model_reactions_from_pathway_info(df_kegg_pathways, "Glycolysis / Gluconeogenesis")
PPP = subset_model_reactions_from_pathway_info(df_kegg_pathways, "Pentose phosphate pathway")

# Merge both pathway lists, ordered as in the model's reaction id list.
# The result can contain a reaction more than once (hence the de-duplication below).
reactions_in_pathways_ordered_duplicates = sort_reactions_by_model_order(ec_cobra_reaction_ids, Glycolysis, PPP)


# Remove duplicates while keeping the first occurrence of each reaction id.
# dict keys preserve insertion order, so this matches an "append if not seen"
# loop without building a throwaway list or doing quadratic membership checks.
reactions_in_pathways_ordered = list(dict.fromkeys(reactions_in_pathways_ordered_duplicates))


# Restrict each condition's sampling array to the selected reactions.
subset_pathways_optgp_condition_100 = subset_sampling_array_from_reaction_ids(
                                                                samples_optgp_condition_100,
                                                                ec_cobra_reaction_ids,
                                                                subset_reactions = reactions_in_pathways_ordered)


subset_pathways_optgp_condition_0 = subset_sampling_array_from_reaction_ids(
                                                                samples_optgp_condition_0,
                                                                ec_cobra_reaction_ids,
                                                                subset_reactions = reactions_in_pathways_ordered)


### Split bidirectional-reversible reactions (having both positive and negative flux values) into two separate reactions (forward and reverse).

Given a flux sampling (steady states) array with reactions as rows, the `split_forward_reverse` function splits all bidirectional reactions having at least 1 positive and 1 negative flux value into separate forward and reverse reactions.

The `find_reactants_products` function identifies and saves in separate lists the reactants, products and directionality status of each reaction in a given metabolic model. Cofactors do not count as reactants/products.

This step is to avoid losing information, when computing correlations from a set that includes reversible reactions. 

In some cases, the forward direction of reactions (as defined in the model) does not align with the observed network topology. As a result, pairs of reactions may show identical correlation magnitudes but with opposite signs.

To deal with this, we split all reversible reactions with both positive and negative flux into separate reactions. Thus, one of the 2 (forward or reverse) has the expected correlations with reactions from the rest of the network.

In [8]:

# --- Condition 100 ---
# Split the subset sampling array; returns the extended array and the
# matching extended reaction list.
subset_extended_steady_states_100, subset_extended_reactions_100 = split_forward_reverse(
    subset_pathways_optgp_condition_100,
    reactions=reactions_in_pathways_ordered,
)

# Per-reaction reversibility, reactants and products for the extended list.
(
    reversibility_list_all_reactions_100,
    reactants_list_all_reactions_100,
    products_list_all_reactions_100,
) = find_reactants_products(ec_cobra_model, subset_extended_reactions_100)

print(len(reversibility_list_all_reactions_100), len(reactants_list_all_reactions_100))



# --- Condition 0 ---
subset_extended_steady_states_0, subset_extended_reactions_0 = split_forward_reverse(
    subset_pathways_optgp_condition_0,
    reactions=reactions_in_pathways_ordered,
)

(
    reversibility_list_all_reactions_0,
    reactants_list_all_reactions_0,
    products_list_all_reactions_0,
) = find_reactants_products(ec_cobra_model, subset_extended_reactions_0)

print(len(reversibility_list_all_reactions_0), len(reactants_list_all_reactions_0))


21 21
25 25


### Calculate a boolean square matrix with dimensions equal to the number of reactions (here, reactions after the forward and reverse split) with information on reactions sharing metabolites. 

Given the lists with reactants, products and reversibility information, the `sharing_metabolites_square_matrix` function creates a square boolean matrix with `True` values representing reactions sharing a common metabolite, as implemented in the `sharing_metabolites` function.

This is done in case the user wants to filter out correlations between reactions that do not share metabolites.

In [9]:

# Condition 100: every argument must come from the condition-100 split.
# The reversibility/reactants/products lists are per-reaction and therefore
# have to be index-aligned with `subset_extended_reactions_100`; the
# condition-0 lists have a different length (25 vs 21 entries) and would
# pair reactions with the wrong metabolites.
subset_boolean_sharing_metabolites_matrix_100 = sharing_metabolites_square_matrix(
    subset_extended_reactions_100,
    reversibility_list_all_reactions_100,
    reactants_list_all_reactions_100,
    products_list_all_reactions_100,
)

print(subset_boolean_sharing_metabolites_matrix_100.shape)



# Condition 0: same construction with the condition-0 lists.
subset_boolean_sharing_metabolites_matrix_0 = sharing_metabolites_square_matrix(
    subset_extended_reactions_0,
    reversibility_list_all_reactions_0,
    reactants_list_all_reactions_0,
    products_list_all_reactions_0,
)

print(subset_boolean_sharing_metabolites_matrix_0.shape)


(21, 21)
(25, 25)


### Compute pairwise linear correlations and non-linear copula dependencies from flux samples and filter (remove) correlations between reactions not sharing any metabolites

The `correlated_reactions` function calculates pairwise linear correlations and non-linear copula dependencies given a flux sampling array. The user can choose the preferred coefficient for the pairwise linear correlations (pearson or spearman). Moreover, they can filter out correlations not meeting a cutoff value and choose whether to proceed with the calculation of non-linear dependencies using copulas.

In [11]:

# Settings shared by both conditions, collected once so the two calls
# cannot drift apart.
correlation_settings_mets_filter = dict(
    linear_coeff="pearson",
    linear_corr_cutoff=0.3,
    indicator_cutoff=1.2,
    jensenshannon_cutoff=0.05,
    std_cutoff=1e-2,
    include_non_linear=True,
    cells=5,
    cop_coeff=0.2,
    lower_triangle=False,
    verbose=True,
)

# Condition 100, restricted by the sharing-metabolites matrix.
# Returns the linear, non-linear and mixed matrices plus a per-pair dictionary.
(
    subset_linear_correlation_matrix_100_mets_filter,
    subset_non_linear_correlation_matrix_100_mets_filter,
    subset_mixed_correlation_matrix_100_mets_filter,
    subset_correlations_dictionary_100_mets_filter,
) = correlated_reactions(
    steady_states=subset_extended_steady_states_100,
    boolean_sharing_metabolites_matrix=subset_boolean_sharing_metabolites_matrix_100,
    reactions=subset_extended_reactions_100,
    **correlation_settings_mets_filter,
)


# Condition 0, restricted by its own sharing-metabolites matrix.
(
    subset_linear_correlation_matrix_0_mets_filter,
    subset_non_linear_correlation_matrix_0_mets_filter,
    subset_mixed_correlation_matrix_0_mets_filter,
    subset_correlations_dictionary_0_mets_filter,
) = correlated_reactions(
    steady_states=subset_extended_steady_states_0,
    boolean_sharing_metabolites_matrix=subset_boolean_sharing_metabolites_matrix_0,
    reactions=subset_extended_reactions_0,
    **correlation_settings_mets_filter,
)


Completed the process of 30 from 103 copulas
Completed the process of 31 from 103 copulas
Completed the process of 32 from 103 copulas
Completed the process of 33 from 103 copulas
Completed the process of 34 from 103 copulas
Completed the process of 35 from 103 copulas
Completed the process of 44 from 103 copulas
Completed the process of 49 from 103 copulas
Completed the process of 54 from 103 copulas
Completed the process of 59 from 103 copulas
Completed the process of 64 from 103 copulas
Completed the process of 69 from 103 copulas
Completed the process of 74 from 103 copulas
Completed the process of 96 from 103 copulas
Completed the process of 102 from 103 copulas
Completed the process of 1 from 144 copulas
Completed the process of 2 from 144 copulas
Completed the process of 3 from 144 copulas
Completed the process of 4 from 144 copulas
Completed the process of 5 from 144 copulas
Completed the process of 6 from 144 copulas
Completed the process of 7 from 144 copulas
Completed the pr

### Here, we show some pairs of reactions with extreme non-linear dependencies based on the `jensenshannon` distance (this excludes sharing metabolite information present in the 3 matrices calculated above)

In [12]:

# Pairs whose 'jensenshannon' value exceeds 0.10 and whose 'indicator'
# exceeds 1.2 (condition 100, metabolite-filtered run).
filtered_positive = dict(
    (reaction_pair, pair_metrics)
    for reaction_pair, pair_metrics in subset_correlations_dictionary_100_mets_filter.items()
    if pair_metrics['jensenshannon'] > 0.10 and pair_metrics['indicator'] > 1.2
)

print(filtered_positive)
print(filtered_positive.keys())

# Pairs whose 'jensenshannon' value is below -0.10 and whose 'indicator' is
# below 1.2.
# NOTE(review): this only selects anything if the stored 'jensenshannon' value
# is signed — confirm against `correlated_reactions`; also confirm that
# `indicator < 1.2` (rather than a reciprocal cutoff) is the intended bound.
filtered_negative = dict(
    (reaction_pair, pair_metrics)
    for reaction_pair, pair_metrics in subset_correlations_dictionary_100_mets_filter.items()
    if pair_metrics['jensenshannon'] < -0.10 and pair_metrics['indicator'] < 1.2
)

print(filtered_negative)
print(filtered_negative.keys())


{'PYK~PGK': {'pearson': 0, 'jensenshannon': 0.11517866713970133, 'indicator': 1.6059379192742054, 'classification': 'positive_upper_lower_tail'}, 'PYK~PGL': {'pearson': 0, 'jensenshannon': 0.11448754945687666, 'indicator': 1.5989232815655856, 'classification': 'positive_upper_lower_tail'}, 'PYK~PGM': {'pearson': 0, 'jensenshannon': 0.11531913318452802, 'indicator': 1.6059379192742058, 'classification': 'positive_upper_lower_tail'}, 'RPE~PYK': {'pearson': 0, 'jensenshannon': 0.11423930055008931, 'indicator': 1.598923281565586, 'classification': 'positive_upper_lower_tail'}, 'RPI_rev~PYK': {'pearson': 0, 'jensenshannon': 0.1149624249498473, 'indicator': 1.6037735824645274, 'classification': 'positive_upper_lower_tail'}, 'TKT1~PYK': {'pearson': 0, 'jensenshannon': 0.1142705843943744, 'indicator': 1.5989232815655858, 'classification': 'positive_upper_lower_tail'}, 'GND~PYK': {'pearson': 0, 'jensenshannon': 0.11448754945687664, 'indicator': 1.598923281565586, 'classification': 'positive_upp

### Plot 3 correlation matrices, (a) only with linear correlations, (b) only with non-linear correlations and (c) with both linear and non-linear correlations

The `plot_correlation_matrix` function plots a correlation matrix created with the `correlated_reactions` function

`REMINDER`: In these matrices, correlations for reactions not sharing metabolites are filtered out, and that is why we don't see values in the `b` plot. In the cells below, this filter is removed and we can examine all possible correlations and dependencies.

In [18]:

# Plot the three condition-100 matrices from the metabolite-filtered run.
# All three use the extended (forward/reverse split) reaction list as labels.

# a, only linear correlations
plot_correlation_matrix(subset_linear_correlation_matrix_100_mets_filter, 
                        subset_extended_reactions_100, 
                        label_font_size=10)

# b, only non-linear copula dependencies
plot_correlation_matrix(subset_non_linear_correlation_matrix_100_mets_filter, 
                        subset_extended_reactions_100, 
                        label_font_size=10)

# c, both linear and non-linear
plot_correlation_matrix(subset_mixed_correlation_matrix_100_mets_filter, 
                        subset_extended_reactions_100, 
                        label_font_size=10)


### Calculate pairwise linear correlations and non-linear copula dependencies without filtering for sharing metabolites

In [None]:

# Settings shared by both conditions for the unfiltered run; passing
# boolean_sharing_metabolites_matrix=None is the only difference from the
# metabolite-filtered calls.
correlation_settings_no_filter = dict(
    boolean_sharing_metabolites_matrix=None,
    linear_coeff="pearson",
    linear_corr_cutoff=0.3,
    indicator_cutoff=1.2,
    jensenshannon_cutoff=0.05,
    std_cutoff=1e-2,
    include_non_linear=True,
    cells=5,
    cop_coeff=0.2,
    lower_triangle=False,
    verbose=True,
)

# Condition 100.
(
    subset_linear_correlation_matrix_100,
    subset_non_linear_correlation_matrix_100,
    subset_mixed_correlation_matrix_100,
    subset_correlations_dictionary_100,
) = correlated_reactions(
    steady_states=subset_extended_steady_states_100,
    reactions=subset_extended_reactions_100,
    **correlation_settings_no_filter,
)


# Condition 0.
(
    subset_linear_correlation_matrix_0,
    subset_non_linear_correlation_matrix_0,
    subset_mixed_correlation_matrix_0,
    subset_correlations_dictionary_0,
) = correlated_reactions(
    steady_states=subset_extended_steady_states_0,
    reactions=subset_extended_reactions_0,
    **correlation_settings_no_filter,
)


Completed the process of 30 from 103 copulas
Completed the process of 31 from 103 copulas
Completed the process of 32 from 103 copulas
Completed the process of 33 from 103 copulas
Completed the process of 34 from 103 copulas
Completed the process of 35 from 103 copulas
Completed the process of 44 from 103 copulas
Completed the process of 49 from 103 copulas
Completed the process of 54 from 103 copulas
Completed the process of 59 from 103 copulas
Completed the process of 64 from 103 copulas
Completed the process of 69 from 103 copulas
Completed the process of 74 from 103 copulas
Completed the process of 96 from 103 copulas
Completed the process of 102 from 103 copulas
Completed the process of 1 from 144 copulas
Completed the process of 2 from 144 copulas
Completed the process of 3 from 144 copulas
Completed the process of 4 from 144 copulas
Completed the process of 5 from 144 copulas
Completed the process of 6 from 144 copulas
Completed the process of 7 from 144 copulas
Completed the pr

### Here, we show some pairs of reactions with extreme non-linear dependencies based on the `jensenshannon` distance

In [15]:

# Pairs from the unfiltered condition-100 run whose 'jensenshannon' value
# exceeds 0.1 and whose 'indicator' exceeds 1.2.
filtered_g = dict(
    (reaction_pair, pair_metrics)
    for reaction_pair, pair_metrics in subset_correlations_dictionary_100.items()
    if pair_metrics['jensenshannon'] > 0.1 and pair_metrics['indicator'] > 1.2
)

print(filtered_g)
print(filtered_g.keys())


# Pairs whose 'jensenshannon' value is below -0.1 and whose 'indicator' is
# below 1.2.
# NOTE(review): this only selects anything if the stored 'jensenshannon' value
# is signed — confirm against `correlated_reactions`.
filtered_l = dict(
    (reaction_pair, pair_metrics)
    for reaction_pair, pair_metrics in subset_correlations_dictionary_100.items()
    if pair_metrics['jensenshannon'] < -0.1 and pair_metrics['indicator'] < 1.2
)

print(filtered_l)
print(filtered_l.keys())


{'PYK~PGK': {'pearson': 0, 'jensenshannon': 0.11517866713970133, 'indicator': 1.6059379192742054, 'classification': 'positive_upper_lower_tail'}, 'PYK~PGL': {'pearson': 0, 'jensenshannon': 0.11448754945687666, 'indicator': 1.5989232815655856, 'classification': 'positive_upper_lower_tail'}, 'PYK~PGM': {'pearson': 0, 'jensenshannon': 0.11531913318452802, 'indicator': 1.6059379192742058, 'classification': 'positive_upper_lower_tail'}, 'RPE~PYK': {'pearson': 0, 'jensenshannon': 0.11423930055008931, 'indicator': 1.598923281565586, 'classification': 'positive_upper_lower_tail'}, 'RPI_rev~PYK': {'pearson': 0, 'jensenshannon': 0.1149624249498473, 'indicator': 1.6037735824645274, 'classification': 'positive_upper_lower_tail'}, 'TKT1~PYK': {'pearson': 0, 'jensenshannon': 0.1142705843943744, 'indicator': 1.5989232815655858, 'classification': 'positive_upper_lower_tail'}, 'GND~PYK': {'pearson': 0, 'jensenshannon': 0.11448754945687664, 'indicator': 1.598923281565586, 'classification': 'positive_upp

### Plot 3 correlation matrices, (a) only with linear correlations, (b) only with non-linear correlations and (c) with both linear and non-linear correlations

`REMINDER`: In these matrices correlations for reactions not sharing metabolites are `NOT` filtered and that is why we see values in the `b` plot.

In [None]:

# Plot the three condition-100 matrices from the run without the
# sharing-metabolites filter. Labels are the extended reaction list.

# a, only linear correlations
plot_correlation_matrix(subset_linear_correlation_matrix_100, 
                        subset_extended_reactions_100, 
                        label_font_size=10)

# b, only non-linear copula dependencies
plot_correlation_matrix(subset_non_linear_correlation_matrix_100, 
                        subset_extended_reactions_100, 
                        label_font_size=10)

# c, both linear correlations and non-linear dependencies
plot_correlation_matrix(subset_mixed_correlation_matrix_100, 
                        subset_extended_reactions_100, 
                        label_font_size=10)
