### Load libraries and functions

In [1]:

import sys
# Put '../src/' first on the module search path so that the project-local imports
# that follow (`load_modify_sample_utils`, `pathways_utils`) can be resolved.
# NOTE(review): the path is relative, so this presumably assumes the notebook is run
# from its own directory — confirm.
sys.path.insert(0, '../src/')

from load_modify_sample_utils import load_model, get_objective_functions, get_reaction_bounds

from pathways_utils import map_model_to_kegg_reactions_dictionary, read_json_file, reaction_id_to_kegg_id
from pathways_utils import dictionary_reaction_id_to_kegg_id, fill_missing_kegg_ids_in_initial_dictionary
from pathways_utils import get_kegg_pathways_from_reaction_ids, subset_model_reactions_from_pathway_info
from pathways_utils import dictionary_reaction_id_to_pathway, reaction_in_pathway_binary_matrix, plot_reaction_in_pathway_heatmap 

### Load and inspect model (for more info see `load_modify_sample.ipynb`)

In [2]:

# Load the E. coli core model; `load_model` returns three values, unpacked here
# as the model object, its reactions and its reaction ids.
(
    ec_cobra_model,
    ec_cobra_reactions,
    ec_cobra_reaction_ids,
) = load_model("../ext_data/models/e_coli_core.xml")

# Objective function(s) reported for the model.
objective_functions = get_objective_functions(ec_cobra_model)
print(objective_functions)

# Reaction bounds keyed by reaction id; print the biomass reaction's bounds as an example.
default_reaction_bounds = get_reaction_bounds(ec_cobra_model)
print(default_reaction_bounds.get("BIOMASS_Ecoli_core_w_GAM"))


Set parameter Username
Set parameter LicenseID to value 2642044
Academic license - for non-commercial use only - expires 2026-03-25
['BIOMASS_Ecoli_core_w_GAM']
(0.0, 1000.0)


### Extract KEGG terms from reaction IDs

The `map_model_to_kegg_reactions_dictionary` function creates a dictionary that assigns KEGG ids (values) to BiGG/SEED ids (keys) using only the information stored in the model (without querying online databases). Reactions for which the model holds no KEGG id are mapped to `None`.

In [3]:

# Build the initial reaction id -> KEGG id mapping from the loaded model.
# Reactions without a KEGG id show up with the value None in the printed dictionary.
initial_bigg_to_kegg_dictionary = map_model_to_kegg_reactions_dictionary(ec_cobra_model)
print(initial_bigg_to_kegg_dictionary)


{'PFK': None, 'PFL': 'R00212', 'PGI': None, 'PGK': 'R01512', 'PGL': 'R02035', 'ACALD': 'R00228', 'AKGt2r': None, 'PGM': 'R01518', 'PIt2r': None, 'ALCD2x': 'R00754', 'ACALDt': None, 'ACKr': 'R00315', 'PPC': 'R00345', 'ACONTa': 'R01325', 'ACONTb': 'R01900', 'ATPM': 'R00086', 'PPCK': 'R00341', 'ACt2r': None, 'PPS': 'R00199', 'ADK1': 'R00127', 'AKGDH': 'R08549', 'ATPS4r': None, 'PTAr': 'R00230', 'PYK': 'R00200', 'BIOMASS_Ecoli_core_w_GAM': None, 'PYRt2': None, 'CO2t': None, 'RPE': 'R01529', 'CS': 'R00351', 'RPI': None, 'SUCCt2_2': None, 'CYTBD': None, 'D_LACt2': None, 'ENO': 'R00658', 'SUCCt3': None, 'ETOHt2r': None, 'SUCDi': None, 'SUCOAS': 'R00405', 'TALA': 'R01827', 'THD2': None, 'TKT1': None, 'TKT2': 'R01830', 'TPI': 'R01015', 'EX_ac_e': None, 'EX_acald_e': None, 'EX_akg_e': None, 'EX_co2_e': None, 'EX_etoh_e': None, 'EX_for_e': None, 'EX_fru_e': None, 'EX_fum_e': None, 'EX_glc__D_e': None, 'EX_gln__L_e': None, 'EX_glu__L_e': None, 'EX_h_e': None, 'EX_h2o_e': None, 'EX_lac__D_e': None,

### Load JSON file with KEGG information

The `read_json_file` function reads the JSON file where the KEGG information is stored and returns it both as a JSON object and as a pandas dataframe. This JSON file is available from https://raw.githubusercontent.com/MGXlab/DNNGIOR/refs/heads/main/docs/biochemistry/reactions.json


In [4]:

# Unpack the two return values: the parsed JSON content and the same data as a pandas dataframe.
reactions_json, reactions_pandas = read_json_file("../ext_data/reactions/reactions.json")


### Inspect the structure of the JSON file

Inspect the first entry of the columns of interest (e.g. `aliases` and `linked_reaction`).

In [5]:

# Show the entry at index label 0 of each column.
# `aliases` prints as a list of "Database: id; id; ..." strings,
# `linked_reaction` as a ';'-separated string of rxn ids.
print(reactions_pandas['aliases'][0])
print(reactions_pandas['linked_reaction'][0])


['AraCyc: INORGPYROPHOSPHAT-RXN', 'BiGG: IPP1; PPA; PPA_1; PPAm', 'BrachyCyc: INORGPYROPHOSPHAT-RXN', 'KEGG: R00004', 'MetaCyc: INORGPYROPHOSPHAT-RXN', 'Name: Diphosphate phosphohydrolase; Inorganic diphosphatase; Inorganic pyrophosphatase; Pyrophosphate phosphohydrolase; diphosphate phosphohydrolase; inorganic diphosphatase; inorganic diphosphatase (one proton translocation); inorganicdiphosphatase; pyrophosphate phosphohydrolase']
rxn27946;rxn27947;rxn27948;rxn32487;rxn38157;rxn38158


### Create dictionaries mapping BiGG/SEED ids to KEGG ids

The `dictionary_reaction_id_to_kegg_id` function works on the `reactions_pandas` dataframe to store for each entry (BiGG or SEED reaction ids) the corresponding KEGG ids

In [6]:

# Build two lookup dictionaries from the dataframe: BiGG id -> KEGG id(s) and SEED id -> KEGG id(s).
# In the printed BiGG dictionary, several KEGG ids for one reaction appear as a single
# '; '-separated string (e.g. 'R00010; R06103').
bigg_to_kegg, seed_to_kegg = dictionary_reaction_id_to_kegg_id(reactions_pandas)


In [7]:
# Print both lookup dictionaries in full (the output is long).
print(bigg_to_kegg)
print(seed_to_kegg)

{'IPP1': 'R00004', 'PPA': 'R00004', 'PPA_1': 'R00004', 'PPAm': 'R00004', 'ALPHNH': 'R00005', 'DUR1_2': 'R00005', 'ILV2_2': 'R00006', 'CAT': 'R00009', 'CATp': 'R00009', 'CTA1': 'R00009', 'CTT1': 'R00009', 'ATH1': 'R00010; R06103', 'NTH1': 'R00010; R06103', 'NTH2': 'R00010; R06103', 'TREH': 'R00010; R06103', 'TREHe': 'R00010; R06103', 'TREHpp': 'R00010; R06103', 'TREHv': 'R00010; R06103', 'GLXCBL': 'R00013', 'GLXCL': 'R00013', 'CCP': 'R00017', 'CCP2m': 'R00017', 'CYCPO': 'R00017', 'HSPMS': 'R00018', 'HXAD': 'R00022', 'RBPC': 'R00024', 'FSP2_1': 'R00028; R06084', 'MAL12': 'R00028; R06084', 'MAL32': 'R00028; R06084', 'MALT': 'R00028; R06084', 'YGR287C': 'R00028; R06084', 'YIL172C': 'R00028; R06084', 'YJL216C': 'R00028; R06084', 'HEM2': 'R00036', 'PPBNGS': 'R00036', 'AHXDH': 'R00059', 'FEROpp': 'R00078', 'HEM3': 'R00084', 'HMBS': 'R00084', 'ATPH1': 'R00085', 'ATPM': 'R00086; R10531', 'ATPOBJ': 'R00086; R10531', 'FATP': 'R00086; R10531', 'NTP1': 'R00086; R10531', 'U214': 'R00086; R10531', 'N

### Convert reaction IDs

The `reaction_id_to_kegg_id` function takes as arguments: a BiGG or a SEED id, the modeltype and the mapping dictionaries created above. It returns the corresponding KEGG id.

In [8]:

# Three example lookups: a BiGG id, a SEED id, and a BiGG id ("THD2") with no
# KEGG mapping, for which the function returns "NA".
example_queries = [
    ("IPP1", "BiGG"),
    ("rxn19264", "SEED"),
    ("THD2", "BiGG"),
]
for query_id, query_modeltype in example_queries:
    kegg_id = reaction_id_to_kegg_id(query_id, query_modeltype, bigg_to_kegg, seed_to_kegg)
    print(kegg_id)


R00004
R00009
NA


### Update initial mapping dictionary with additional information

The `fill_missing_kegg_ids_in_initial_dictionary` function is used to further map BiGG/SEED ids to KEGG ids when the initial model lacks some of this information. Ids that still cannot be mapped are set to `'NA'`.

In [9]:

# Complete the model-derived mapping using the BiGG/SEED -> KEGG lookup dictionaries.
# In the printed result, entries that were None are either filled with a KEGG id or set to 'NA'.
final_bigg_to_kegg_dictionary = fill_missing_kegg_ids_in_initial_dictionary(
    initial_bigg_to_kegg_dictionary,
    modeltype="BiGG",
    bigg_to_kegg=bigg_to_kegg,
    seed_to_kegg=seed_to_kegg,
)

print(final_bigg_to_kegg_dictionary)


{'PFK': 'R00756', 'PFL': 'R00212', 'PGI': 'R00771', 'PGK': 'R01512', 'PGL': 'R02035', 'ACALD': 'R00228', 'AKGt2r': 'NA', 'PGM': 'R01518', 'PIt2r': 'NA', 'ALCD2x': 'R00754', 'ACALDt': 'NA', 'ACKr': 'R00315', 'PPC': 'R00345', 'ACONTa': 'R01325', 'ACONTb': 'R01900', 'ATPM': 'R00086', 'PPCK': 'R00341', 'ACt2r': 'NA', 'PPS': 'R00199', 'ADK1': 'R00127', 'AKGDH': 'R08549', 'ATPS4r': 'NA', 'PTAr': 'R00230', 'PYK': 'R00200', 'BIOMASS_Ecoli_core_w_GAM': 'NA', 'PYRt2': 'NA', 'CO2t': 'NA', 'RPE': 'R01529', 'CS': 'R00351', 'RPI': 'R01056', 'SUCCt2_2': 'NA', 'CYTBD': 'NA', 'D_LACt2': 'NA', 'ENO': 'R00658', 'SUCCt3': 'NA', 'ETOHt2r': 'NA', 'SUCDi': 'NA', 'SUCOAS': 'R00405', 'TALA': 'R01827', 'THD2': 'NA', 'TKT1': 'R01641', 'TKT2': 'R01830', 'TPI': 'R01015', 'EX_ac_e': 'NA', 'EX_acald_e': 'NA', 'EX_akg_e': 'NA', 'EX_co2_e': 'NA', 'EX_etoh_e': 'NA', 'EX_for_e': 'NA', 'EX_fru_e': 'NA', 'EX_fum_e': 'NA', 'EX_glc__D_e': 'NA', 'EX_gln__L_e': 'NA', 'EX_glu__L_e': 'NA', 'EX_h_e': 'NA', 'EX_h2o_e': 'NA', 'EX_

### Retrieve KEGG pathways IDs and names for all the model reactions

The `get_kegg_pathways_from_reaction_ids` function takes the final mapping dictionary as input and retrieves pathway information from KEGG. Thus, each reaction is mapped to one or multiple pathway IDs and names.

In [10]:

# Build a dataframe with one row per model reaction and the columns
# model_reaction, kegg_reaction, pathway_ids and pathway_names.
# NOTE(review): pathway information is retrieved from KEGG, so this presumably needs
# network access and may be slow — confirm.
df_kegg_pathways = get_kegg_pathways_from_reaction_ids(final_bigg_to_kegg_dictionary)


In [11]:

# Preview the first five rows; reactions without a KEGG id ('NA') have empty pathway lists.
print(df_kegg_pathways.head(5))


  model_reaction kegg_reaction  \
0            PFL        R00212   
1            O2t            NA   
2            PGL        R02035   
3            PGM        R01518   
4            PGK        R01512   

                                         pathway_ids  \
0               [rn00620, rn00650, rn01100, rn01120]   
1                                                 []   
2      [rn00030, rn01100, rn01110, rn01120, rn01200]   
3  [rn00010, rn00260, rn00680, rn01100, rn01110, ...   
4  [rn00010, rn00710, rn01100, rn01110, rn01120, ...   

                                       pathway_names  
0  [Pyruvate metabolism, Butanoate metabolism, Me...  
1                                                 []  
2  [Pentose phosphate pathway, Metabolic pathways...  
3  [Glycolysis / Gluconeogenesis, Glycine, serine...  
4  [Glycolysis / Gluconeogenesis, Carbon fixation...  


### Subset reactions from pathway of interest

The `subset_model_reactions_from_pathway_info` function takes a pathway name or ID as input alongside the `df_kegg_pathways` dataframe. It returns a list of all reactions belonging to the given pathway.

In [12]:

# Reactions of the pentose phosphate pathway, selected by pathway name.
PPP = subset_model_reactions_from_pathway_info(df_kegg_pathways, "Pentose phosphate pathway")
print(PPP)

# Glycolysis is looked up twice, first by pathway name and then by KEGG pathway ID,
# to show that both kinds of query return the same list of reactions.
for glycolysis_query in ("Glycolysis / Gluconeogenesis", "rn00010"):
    Glycolysis = subset_model_reactions_from_pathway_info(df_kegg_pathways, glycolysis_query)
    print(Glycolysis)



['FBA', 'FBP', 'GND', 'PFK', 'PGL', 'RPE', 'RPI', 'TKT1']
['ALCD2x', 'ENO', 'FBA', 'FBP', 'GAPD', 'PFK', 'PGK', 'PGM', 'PPCK', 'PPS', 'PYK', 'TPI']
['ALCD2x', 'ENO', 'FBA', 'FBP', 'GAPD', 'PFK', 'PGK', 'PGM', 'PPCK', 'PPS', 'PYK', 'TPI']


### Create a dictionary mapping selected reactions to pathway names

The `dictionary_reaction_id_to_pathway` function takes one or multiple lists containing reaction IDs (corresponding to pathways as shown above) and creates a dictionary that maps the IDs to pathway names. This is useful for plotting to work with subsets of reactions and to replace names from the `df_kegg_pathways` dataframe like `Glycolysis / Gluconeogenesis` to `Glycolysis` and `Pentose phosphate pathway` to `PPP`.

In [13]:

# Each keyword name is the pathway label to use; its value is the list of reaction ids in that pathway.
bigg_to_pathway_dict = dictionary_reaction_id_to_pathway(Glycolysis=Glycolysis, PPP=PPP)


### Create a binary matrix (0/1) as an alternative way to illustrate the presence or absence of a reaction in a certain pathway

The `reaction_in_pathway_binary_matrix` function is used to create a new pandas dataframe with reactions as rows and different pathways as columns. Each cell of the dataframe shows whether a reaction belongs to a certain pathway (1) or not (0). If a reaction belongs to more than one pathway, a "Multiple-Pathways" column is created and that reaction gets True (1) only there and not in the individual pathway columns (e.g. 1 in Multiple-Pathways, 0 in Glycolysis and 0 in PPP).

In [14]:

# Build the 0/1 reaction-by-pathway dataframe from the reaction id -> pathway dictionary.
binary_df = reaction_in_pathway_binary_matrix(bigg_to_pathway_dict)


### Heatmap of the binary dataframe

The function `plot_reaction_in_pathway_heatmap` is used to plot a heatmap of the `binary_df` to better illustrate the connection between reactions and pathways.

In [15]:

# Plot the binary reaction/pathway dataframe as a heatmap.
# NOTE(review): the title is passed as an empty string, and fig_width/fig_height are presumably
# in pixels — confirm against `plot_reaction_in_pathway_heatmap`.
plot_reaction_in_pathway_heatmap(binary_df, font_size=8, fig_width=600, fig_height=600, title="")
