In [1]:
# setup

# general imports
import pathlib
import os
import cobra
import sys
from openpyxl import load_workbook
import pandas as pd


# Working directory of the notebook; every other path is derived from it.
current_dir = pathlib.Path.cwd()

# Make the autopacmen scripts importable: the "autopacmen" folder is expected
# two levels above the working directory.
sys.path.append(str(current_dir.parent.parent / "autopacmen"))

# Input (setup) and output (project) folders share the same "datasets/sMOMENT" root.
smoment_data_dir = current_dir.parent / "datasets" / "sMOMENT"
setup_dir = smoment_data_dir / "autopacmen_setup"
project_dir = smoment_data_dir / "autopacmen_project"

# Create both folders; already existing folders are left as they are.
for data_folder in (setup_dir, project_dir):
    os.makedirs(data_folder, exist_ok=True)

In [2]:
# ****************************************************************************************************
# Step 7:   Creating sMOMENT model
# ****************************************************************************************************

'''
In order for the script to work, your project folder should look like this:

project/
├── autopacmen/
│   ├── __init__.py
│   ├── ... .py
│   └── submodules/
│       ├── __init__.py
│       └── ... .py
├── thesis_scripts/
│   ├── models/
│   │   └── your_ftINIT_model.mat
│   └── sMOMENT/
│       └── autopacmen_workflow.ipynb


Before executing the scripts, there are a few necessary precautions:
-   download BIGG metabolites data from http://bigg.ucsd.edu/data_access (last accessed: 09/12/2024) and 
    save it into "datasets/sMOMENT/autopacmen_setup/bigg_models_metabolites.txt"
-   download BRENDA database data from https://www.brenda-enzymes.org/download.php (last accessed: 09/12/2024) and 
    save it into "datasets/sMOMENT/autopacmen_setup/brenda_2023_1.txt"
-   input the name of your model created in ftINIT and the project name below:
'''

# ---- user inputs ----

# file name of the model (inside the "models" folder) that the protein data is integrated into
model_name = "A375_ftINIT_1+1_thr100_model.xml"
# project name for the model to be created
project_name = "A375_11"
# file name of the output model
output_model_name = "A375_1+1_sMOMENT_model.xml"

# downloaded database files inside the setup folder
# (both variables hold file paths, despite the "_dir" suffix)
bigg_dir = setup_dir.joinpath("bigg_models_metabolites.txt")
brenda_dir = setup_dir.joinpath("brenda_2023_1.txt")

# protein data tsv file produced in step 6
# NOTE(review): the file name says "A735" while the model names above say "A375" —
# confirm this matches the file that step 6 actually wrote.
protein_data_path = current_dir.parent.joinpath("datasets", "A735_protein_data_prep.tsv")

# total protein pool parameters; the keys are the labels looked up in the
# "Total protein data" sheet of the protein data spreadsheet
protein_pool_data = {
    "Total protein content [g/gDW]:":
        0.637553516819572,
    "Fraction of masses of model-included enzymes in comparison to all enzymes (0.0 to 1.0):":
        0.25,
    "Average saturation level (0.0 to 1.0):":
        1,
}


In [3]:
# full path to the model file inside the "models" folder
# (a file path, despite the "_dir" suffix)
model_dir = current_dir.parent.joinpath("models", model_name)

In [4]:
# Data preparation: turn the downloaded BiGG and BRENDA files into JSON files in the setup folder.

# string versions of the paths shared by the steps below
setup_folder = str(setup_dir)
brenda_json_path = str(setup_dir / "brenda.json")
brenda_model_json_path = str(setup_dir / "brenda_model_specific.json")

# 1) BiGG metabolites file -> JSON
#    inputs:  - path to the "bigg_models_metabolites.txt" file (as string)
#             - path to the folder the JSON file is saved in (as string)
#    outputs: - JSON file named "bigg_id_name_mapping.json"
#    BiGG metabolites data downloaded from http://bigg.ucsd.edu/data_access (accessed: 09/12/2024)
from autopacmen.submodules.parse_bigg_metabolites_file import parse_bigg_metabolites_file
parse_bigg_metabolites_file(str(bigg_dir), setup_folder)

# 2) sMOMENT-relevant data of the BRENDA text file -> JSON
#    inputs:  - path to the "brenda_2023_1.txt" file (as string)
#             - path to the folder (not the file) holding the "bigg_id_name_mapping.json"
#               generated in step 1 (as string)
#             - path to the file (not the folder!) the JSON is saved to (as string)
#    outputs: - JSON file at the given output path (here "brenda.json")
#    BRENDA database data downloaded from https://www.brenda-enzymes.org/download.php (accessed: 09/12/2024)
from autopacmen.submodules.parse_brenda_textfile import parse_brenda_textfile
parse_brenda_textfile(str(brenda_dir), setup_folder, brenda_json_path)

# 3) make the "brenda.json" of step 2 specific for the model
#    inputs:  - path to the model SBML file (as string)
#             - path to the "brenda.json" file of step 2 (as string)
#             - path to the output file (not the folder)
#    outputs: - JSON file at the given output path (here "brenda_model_specific.json")
from autopacmen.submodules.parse_brenda_json_for_model import parse_brenda_json_for_model
parse_brenda_json_for_model(str(model_dir), brenda_json_path, brenda_model_json_path)





In [5]:
# Maps kcat values derived from the SABIO-RK database to the EC numbers of the model,
# allowing for "wildcards" (meaning values from related proteins are used if no kcat values are available).
# inputs:   - path to the model SBML file (as string)
#           - path to the output file (not folder) (as string)
#           - path to the "bigg_id_name_mapping.json" file generated during data preparation (as string)
# outputs:  - JSON file at the given output path (here "sabio_rk.json")
# SABIO-RK was accessed on 10/12/2024
from autopacmen.submodules.parse_sabio_rk_for_model import parse_sabio_rk_for_model_with_sbml
parse_sabio_rk_for_model_with_sbml(
    # passed as str (not pathlib.Path), as documented above and as done in every
    # other autopacmen call of this notebook
    str(model_dir),
    str(setup_dir / "sabio_rk.json"),
    str(setup_dir / "bigg_id_name_mapping.json")
)

Starting EC numbers kcat search in SABIO-RK...
Wildcard level 0...
['2.7.7.3', '1.5.1.2', '1.1.1.179', '2.4.2.10', '2.6.1.63', '2.7.6.2', '3.1.4.44', '4.2.1.-', '2.4.1.174', '2.4.99.1', '1.1.1.42', '3.2.1.1', '1.1.1.15', '6.3.4.3', '3.1.4.14', '1.2.4.1', '1.3.8.7', '2.3.2.4', '2.6.1.6', '2.7.1.-', '2.4.99.9', '3.1.4.35', '5.4.2.1', '1.11.1.8', '2.6.1.52', '1.14.99.1', '3.2.1.35', '3.5.1.24', '3.6.1.3', '3.1.3.36', '3.5.1.49', '4.2.1.24', '1.1.1.105', '3.4.14.9', '1.1.1.27', '3.1.3.13', '2.1.1.137', '2.5.1.26', '1.1.1.239', '2.2.1.1', '1.13.11.52', '2.1.3.3', '1.14.13.13', '6.2.1.28', '3.1.3.35', '2.7.7.8', '1.4.1.3', '3.4.16.5', '2.3.1.6', '1.-.-.-', '2.7.1.22', '2.7.8.1', '3.1.2.6', '1.5.99.8', '2.1.1.1', '2.1.1.63', '6.2.1.-', '3.1.4.17', '2.7.11.19', '3.1.3.66', '3.1.3.17', '2.4.1.11', '3.6.1.12', '1.2.1.24', '6.3.5.5', '2.1.1.28', '2.4.1.25', '6.1.1.17', '1.1.1.284', '2.7.4.3', '3.5.1.14', '3.6.3.14', '3.5.1.6', '4.1.2.4', '2.6.1.21', '1.2.1.39', '1.7.1.7', '3.4.21.89', '2.4.1.206'

In [6]:
# Merges the EC-number-wise kcat values of SABIO-RK and BRENDA into a single JSON.
# inputs:   - path to the SABIO-RK JSON mapping kcat values to the EC numbers of the model reactions (as string)
#           - path to the model-specific BRENDA kcat JSON (as string)
#           - path to the output file (not folder)
# output:   - combined JSON file named "sabio.json"
from autopacmen.submodules.create_combined_kcat_database import create_combined_kcat_database

sabio_rk_json_path = str(setup_dir / "sabio_rk.json")
brenda_specific_json_path = str(setup_dir / "brenda_model_specific.json")
combined_kcat_json_path = str(setup_dir / "sabio.json")

create_combined_kcat_database(sabio_rk_json_path, brenda_specific_json_path, combined_kcat_json_path)

In [7]:
# Creates the xlsx spreadsheets that take the input information about the organism.
# inputs (all passed as strings, in this order):
#           - path to the SBML model
#           - path to the folder the sheets are created in
#           - project name
# outputs:  - xlsx spreadsheets for the organism data
#             (only the enzyme_stoichiometries and protein_data sheets are essential)
from autopacmen.submodules.get_initial_spreadsheets import get_initial_spreadsheets_with_sbml

spreadsheet_args = [str(argument) for argument in (model_dir, project_dir, project_name)]
get_initial_spreadsheets_with_sbml(*spreadsheet_args)

INFO: Reaction MAR08360 does not have a KEGG ID annotation
INFO: Reaction MAR08652 does not have a KEGG ID annotation
INFO: Reaction MAR08757 does not have a KEGG ID annotation
INFO: Reaction MAR05396 does not have a KEGG ID annotation
INFO: Reaction MAR09727 does not have a KEGG ID annotation
INFO: Reaction MAR05397 does not have a KEGG ID annotation
INFO: Reaction MAR05398 does not have a KEGG ID annotation
INFO: Reaction MAR05399 does not have a KEGG ID annotation
INFO: Reaction MAR05400 does not have a KEGG ID annotation
INFO: Reaction MAR05401 does not have a KEGG ID annotation
INFO: Reaction MAR08568 does not have a KEGG ID annotation
INFO: Reaction MAR08569 does not have a KEGG ID annotation
INFO: Reaction MAR08570 does not have a KEGG ID annotation
INFO: Reaction MAR08571 does not have a KEGG ID annotation
INFO: Reaction MAR08572 does not have a KEGG ID annotation
INFO: Reaction MAR08573 does not have a KEGG ID annotation
INFO: Reaction MAR08574 does not have a KEGG ID annotati

In [None]:
# Writes the protein data generated in step 6 into the xlsx file created by
# get_initial_spreadsheets_with_sbml:
#   - the total protein pool parameters go into the sheet "Total protein data"
#   - the per-protein concentrations go into the sheet "Single protein data"

# load the protein data from the specified path and keep only the two required columns
conc_df = pd.read_csv(protein_data_path, delimiter="\t")
conc_df = conc_df[["Gene ID", "Protein Concentration"]]

# load the Excel file created in the previous autopacmen function
excel_path = project_dir / f"{project_name}_protein_data.xlsx"
wb = load_workbook(excel_path)
try:
    # select the sheet "Total protein data"
    sheet = wb["Total protein data"]

    # Look for the labels of protein_pool_data in the first two columns and write the
    # matching value into the cell to the right of each label that is found.
    found_labels = set()
    for row in sheet.iter_rows(min_row=1, max_row=sheet.max_row, min_col=1, max_col=2):
        for cell in row:
            if cell.value in protein_pool_data:
                sheet.cell(row=cell.row, column=cell.column + 1, value=protein_pool_data[cell.value])
                found_labels.add(cell.value)

    # A label that is not present in the sheet would otherwise be skipped silently,
    # leaving the corresponding protein pool parameter unset in the spreadsheet.
    missing_labels = set(protein_pool_data) - found_labels
    if missing_labels:
        raise ValueError(
            f"Labels not found in sheet 'Total protein data' of {excel_path}: {sorted(missing_labels)}"
        )

    # save changes
    wb.save(excel_path)
finally:
    wb.close()

# Write the protein data into the existing sheet, starting below its header row.
# "overlay" writes on top of the existing sheet content without clearing it first, so
# rows from an earlier, longer run are not removed when this cell is executed again.
with pd.ExcelWriter(excel_path, mode="a", engine="openpyxl", if_sheet_exists="overlay") as writer:
    conc_df.to_excel(writer, sheet_name="Single protein data", startrow=1, index=False, header=False)

print(f"{excel_path} updated successfully!")


In [8]:
# Builds a JSON with the UniProt protein masses of all proteins that occur in the
# gene rules of the given metabolic model.
# inputs:   - path to the model SBML file (as string)
#           - path to the project folder (as string)
#           - project name
# outputs:  - JSON file with the mapping, named project name + "_protein_id_mass_mapping.json"

from autopacmen.submodules.get_protein_mass_mapping import get_protein_mass_mapping_with_sbml

model_sbml_path = str(model_dir)
project_folder_path = str(project_dir)
get_protein_mass_mapping_with_sbml(model_sbml_path, project_folder_path, str(project_name))

{'O60762': ['ENSG00000000419'], 'Q9BTY2': ['ENSG00000001036'], 'P48506': ['ENSG00000001084'], 'Q16850': ['ENSG00000001630'], 'P28838': ['ENSG00000002549'], 'O14792': ['ENSG00000002587'], 'P19801': ['ENSG00000002726'], 'Q76N89': ['ENSG00000002746'], 'Q9NR63': ['ENSG00000003137'], 'Q9Y216': ['ENSG00000003987'], 'P52569': ['ENSG00000003989'], 'P54819': ['ENSG00000004455'], 'P28907': ['ENSG00000004468'], 'Q02790': ['ENSG00000004478'], 'O14561': ['ENSG00000004779'], 'Q16654': ['ENSG00000004799'], 'Q86VW1': ['ENSG00000004809'], 'Q9UJS0': ['ENSG00000004864'], 'P02730': ['ENSG00000004939'], 'P53701': ['ENSG00000004961'], 'P05141': ['ENSG00000005022'], 'P52435': ['ENSG00000005075'], 'Q53FZ2': ['ENSG00000005187'], 'Q92793': ['ENSG00000005339'], 'P05164': ['ENSG00000005381'], 'P27169': ['ENSG00000005421'], 'Q9UKG9': ['ENSG00000005469'], 'P21439': ['ENSG00000005471'], 'O75592': ['ENSG00000005810'], 'Q15119': ['ENSG00000005882'], 'Q9NZC3': ['ENSG00000006007'], 'Q09428': ['ENSG00000006071'], 'Q53H12

In [10]:
# Assigns kcat values from the BRENDA/SABIO-RK databases (plus optional custom values)
# to the reactions of the model.
# output:   - JSON with the kcat values mapped to the reactions

from autopacmen.submodules.get_reactions_kcat_mapping import get_reactions_kcat_mapping

kcat_mapping_options = {
    # path to the SBML of the metabolic model
    "sbml_path": str(model_dir),
    # path to the output folder
    "project_folder": str(project_dir),
    # project name
    "project_name": str(project_name),
    # scientific name of the organism (used for the taxonomy-dependent search of kcat values)
    "organism": "Homo sapiens",
    # path to the SABIO-RK&BRENDA kcat<->reaction mapping JSON (from data_create_combined_kcat_database.py)
    "kcat_database_path": str(setup_dir / "sabio.json"),
    # *optional* path to the custom user-defined kcat<->protein JSON (default: "")
    "protein_kcat_database_path": "",
    # type of kcat selection; can be "mean", "median" or "random"
    "type_of_kcat_selection": "mean",
}
get_reactions_kcat_mapping(**kcat_mapping_options)

***
Reaction: MAR03905
Forward kcat: 2.1973736575757576
Reverse kcat: 399.5392410555556

***
Reaction: MAR03907
Forward kcat: 3.944748461345614
Reverse kcat: 38.717

***
Reaction: MAR04097
Forward kcat: 88.53999999999999
Reverse kcat: 107.02857142857142

***
Reaction: MAR04099
Forward kcat: 88.53999999999999
Reverse kcat: 107.02857142857142

***
Reaction: MAR04108
Forward kcat: 107.02857142857142
Reverse kcat: 107.02857142857142

***
Reaction: MAR04133
Forward kcat: 986.53
Reverse kcat: 107.02857142857142

***
Reaction: MAR04281
Forward kcat: 882.8334071428571
Reverse kcat: 71.43518166666667

***
Reaction: MAR04388
Forward kcat: 882.8334071428571
Reverse kcat: 71.43518166666667

***
Reaction: MAR04283
Forward kcat: 8.2
Reverse kcat: 8.2

***
Reaction: MAR08357
Forward kcat: 7.907437153846153
Reverse kcat: 18.170115717216483

***
Reaction: MAR04379
Forward kcat: 218.5313568974359
Reverse kcat: 241.21004633333334

***
Reaction: MAR04301
Forward kcat: 542.5036
Reverse kcat: 241.2100463333

In [None]:
# Applies the sMOMENT method to the given SBML model.
# output:   - protein-constraint-enhanced version of the input model

# *** Important Note ***
# Because of the size of the Human1 model, this step might take days to compute on a standard computer.
# This is why computation on a cluster is recommended. For this, it is easiest to execute the script
# from the console using this command:
#
# (<your_conda_env>) .../path/to/autopacmen$
# python -m autopacmen.modeling_create_smoment_model
# --input_sbml_path /path/to/thesis_scripts/models/<your_model>.xml
# --project_folder /path/to/thesis_scripts/datasets/sMOMENT/autopacmen_project/
# --project_name <your_project_name>
# --output_sbml_name <your_output_name>
# --excluded_reactions ""
# --type_of_default_kcat_selection "mean"

from autopacmen.submodules.create_smoment_model_reaction_wise import create_smoment_model_reaction_wise_with_sbml

smoment_options = {
    # path to the input SBML model
    "input_sbml_path": str(model_dir),
    # name of the output SBML (protein-constraint-enhanced model)
    "output_sbml_name": str(output_model_name),
    # project folder containing reaction<->kcat mapping, protein<->mass mapping
    # and enzyme stoichiometry spreadsheet
    "project_folder": str(project_dir),
    # project name
    "project_name": str(project_name),
    # reactions for which the pseudometabolite of the protein pool shall not be introduced
    "excluded_reactions": [],
    # type of default kcat selection (can be "mean", "median" or "random")
    "type_of_default_kcat_selection": "mean",
}
create_smoment_model_reaction_wise_with_sbml(**smoment_options)