In [2]:
import sys
import os
import time
import numpy as np
import pandas as pd
import warnings
from tqdm.auto import tqdm
from anndata import AnnData
from pathlib import Path
from multiprocessing import Pool
import cobra
from google.colab import drive

# Silence the warning categories that would otherwise flood the notebook output.
for _category in (FutureWarning, RuntimeWarning, UserWarning):
    warnings.filterwarnings("ignore", category=_category)
warnings.filterwarnings("ignore", message="ChainedAssignmentError")

# The project root is the parent of the current working directory; it must be
# importable so that the `scripts` package below resolves.
project_root = Path(os.getcwd()).parent
# Guard the append so re-running this cell does not keep growing sys.path
# with duplicate entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))
print("Added to sys.path:", project_root)

# Project-local import: only resolvable after the sys.path update above.
from scripts.utils_scFBApy import scFBApy, repairNeg
print("Environment initialized.")

Added to sys.path: /home/sadegh/python_projects/HRplus-BC-Multimodal
Environment initialized.


In [None]:
# 1. Mount Google Drive
# The data paths configured further down are rooted under this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [3]:
# Set directories of files
# Everything is resolved relative to a single data root on the mounted drive.
base_dir = Path("/content/drive/MyDrive/Data")
csv_dir = base_dir / "dataset" / "csv"

# Inputs
input_file = csv_dir / "transcriptomic_features.csv"
meta_data = csv_dir / "metadata.csv"
model_file = base_dir / "models" / "model.xml"

# Outputs: one CSV per batch goes to `output_dir`; the merged table goes to
# `final_output`.
output_dir = base_dir / "flux_batch"
output_dir.mkdir(exist_ok=True)
final_output = csv_dir / "fluxomic_features.csv"

# Echo the resolved locations so a wrong path is spotted before the long run.
path_summary = [
    "",
    "---------------------------------",
    f"Input data:  {input_file}",
    f"Metadata:    {meta_data}",
    f"Model:       {model_file}",
    f"Output dir:  {output_dir}",
    f"Final file:  {final_output}",
    "",
]
print("\n".join(path_summary))


---------------------------------
Input data:  /home/sadegh/python_projects/HRplus-BC-Multimodal/dataset/csv/transcriptomic_features.csv
Metadata:    /home/sadegh/python_projects/HRplus-BC-Multimodal/dataset/csv/metadata.csv
Model:       /home/sadegh/python_projects/HRplus-BC-Multimodal/models/model.xml
Output dir:  /home/sadegh/python_projects/HRplus-BC-Multimodal/flux_batch
Final file:  /home/sadegh/python_projects/HRplus-BC-Multimodal/dataset/csv/fluxomic_features.csv



In [4]:
# Load Transcriptomic Data
print("Loading transcriptomics data...")
# The first CSV column carries the row labels, so it becomes the index.
trans = pd.read_csv(input_file, index_col=0)
print("Expression matrix shape: {}".format(trans.shape))
# Preview the first rows as the cell's rich output.
trans.head()

Loading transcriptomics data...
Expression matrix shape: (52831, 5000)


Unnamed: 0,ENSG00000241860,ENSG00000237491,ENSG00000188290,ENSG00000187608,ENSG00000186891,ENSG00000186827,ENSG00000184163,ENSG00000260179,ENSG00000230415,ENSG00000272455,...,ENSG00000198899,ENSG00000198938,ENSG00000198840,ENSG00000212907,ENSG00000198786,ENSG00000198695,ENSG00000198727,ENSG00000276256,ENSG00000277856,ENSG00000275063
AAACCTGAGACTGTAA-1,0.0,0.0,0.0,1.149485,0.0,0.0,0.0,0.0,0.0,0.0,...,2.264496,3.569675,3.368515,3.44016,2.778569,0.0,3.78717,0.0,0.0,0.0
AAACCTGAGCAGCGTA-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.938584,4.504747,4.436543,1.940615,3.212943,0.0,3.885766,0.0,0.0,0.0
AAACCTGAGCCAACAG-1,1.139273,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.42566,4.429093,4.570602,4.322241,3.42566,2.452941,4.136931,0.0,0.0,0.0
AAACCTGAGCGTGAAC-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.248333,4.107691,3.791163,4.374079,3.073749,1.634045,3.88017,0.0,0.0,0.0
AAACCTGAGCTACCTA-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.409472,4.590032,4.485799,4.55648,3.906986,3.409472,4.85005,0.0,0.0,0.0


In [5]:
# Load Cobra model
print("Loading COBRA metabolic model...")
# Parse the SBML file into a COBRA model object.
model = cobra.io.read_sbml_model(model_file)
# Collect the identifiers of every gene the model knows about; later cells use
# this list to restrict the expression matrix.
model_genes = [gene.id for gene in model.genes]
print(
    "Model loaded: {} genes, {} reactions.".format(
        len(model_genes), len(model.reactions)
    )
)


Loading COBRA metabolic model...
Model loaded: 497 genes, 449 reactions.


In [6]:
# Filter to Model Genes and Build AnnData
# Use a set for membership tests: checking every expression column against a
# list is O(columns x model genes), against a set it is O(columns).
model_gene_ids = set(model_genes)
# Keep the column order of the expression matrix.
overlap_genes = [g for g in trans.columns if g in model_gene_ids]

# Fail fast: with no shared genes the flux computation below would run on an
# empty matrix and waste the whole batch run.
if not overlap_genes:
    raise ValueError(
        "No overlap between expression columns and model gene ids; "
        "check that both use the same gene identifier scheme."
    )

expr_filtered = trans[overlap_genes]
adata_expr = AnnData(expr_filtered)

print(f"Overlapping genes with model: {len(overlap_genes)}")
print(f"AnnData object: {adata_expr.shape}")
adata_expr

Overlapping genes with model: 75
AnnData object: (52831, 75)


AnnData object with n_obs × n_vars = 52831 × 75

In [7]:
# Set Parameters
objective = "Biomass"  # objective name passed to scFBApy for every batch
batch_size = 100       # number of cells processed per batch
n_cells = adata_expr.shape[0]
# Ceiling division, so a final partial batch is still counted.
n_batches = -(-n_cells // batch_size)

for _label, _value in (
    ("Total cells", n_cells),
    ("Batch size", batch_size),
    ("Total batches", n_batches),
):
    print(f"{_label}: {_value}")

Total cells: 52831
Batch size: 100
Total batches: 529


In [8]:
# Run scFBApy on each batch
def run_batch(batch_idx):
    """Run scFBApy flux computation on one batch of cells and save it as CSV.

    The batch covers rows ``[batch_idx * batch_size, (batch_idx + 1) * batch_size)``
    of the notebook-level ``adata_expr`` (clipped to ``n_cells``). The result
    is written to ``output_dir / f"flux_batch_{batch_idx}.csv"``.

    Relies on notebook globals: ``batch_size``, ``n_cells``, ``n_batches``,
    ``output_dir``, ``adata_expr``, ``model`` and ``objective``.

    Parameters
    ----------
    batch_idx : int
        Zero-based index of the batch to process.

    Returns
    -------
    str
        A human-readable status line ("Skipped ...", "Finished ..." or
        "Error in batch ..."). Failures are reported through this string
        rather than raised, so one bad batch does not abort the whole pool.
        Status lines use 1-based batch numbers.
    """
    # Imports are repeated here so the function is self-contained when it is
    # executed inside a worker process.
    import pandas as pd
    import numpy as np
    import warnings
    from scripts.utils_scFBApy import scFBApy

    warnings.filterwarnings("ignore", category=FutureWarning)
    warnings.filterwarnings("ignore", category=RuntimeWarning)
    warnings.filterwarnings("ignore", category=UserWarning)
    warnings.filterwarnings("ignore", message="ChainedAssignmentError")

    start = batch_idx * batch_size
    end = min((batch_idx + 1) * batch_size, n_cells)
    batch_file = output_dir / f"flux_batch_{batch_idx}.csv"
    # Results are first written next to the final file under a name that the
    # `flux_batch_*.csv` pattern does not match, then renamed into place.
    tmp_file = batch_file.with_name(batch_file.name + ".tmp")

    # Resume support: a finished batch is never recomputed.
    if batch_file.exists():
        return f"Skipped batch {batch_idx+1} (already done)"

    try:
        # Slicing is inside the try block so that a failure here is reported
        # like any other batch error instead of killing the pool.
        adata_batch = adata_expr[start:end, :].copy()

        adata_flux_batch = scFBApy(
            model_orig=model,
            adata=adata_batch,
            objective=objective,
            cooperation=True,
            compute_fva=True,
            npop_fva=5,
            eps=0.001,
            type_ras_normalization="max",
            and_expression=np.nanmin,
            or_expression=np.nansum,
            fraction_of_optimum=0,
            processes=1,
            round_c=10
        )

        flux_df = pd.DataFrame(
            adata_flux_batch.X,
            index=adata_flux_batch.obs.index,
            columns=adata_flux_batch.var.index
        )
        # Write-then-rename: if the process is interrupted mid-write, no
        # truncated `flux_batch_N.csv` is left behind to be mistaken for a
        # completed batch on the next run.
        flux_df.to_csv(tmp_file)
        tmp_file.replace(batch_file)
        return f"Finished batch {batch_idx+1}/{n_batches}"

    except Exception as e:
        # Best-effort cleanup of a partial temp file.
        if tmp_file.exists():
            tmp_file.unlink()
        # Include the exception type: some exceptions have an empty message.
        return f"Error in batch {batch_idx+1}: {type(e).__name__}: {e}"


In [None]:
# Generate Fluxomics data from Transcriptomics
print("Starting parallel flux computation...")
start_time = time.time()

# Use at most 8 workers, but never more than the machine has CPUs: the flux
# computation is CPU-bound, so oversubscribing only adds overhead.
n_workers = min(8, os.cpu_count() or 1)

# `imap` yields results in batch order, which keeps the status lines ordered.
with Pool(processes=n_workers) as pool:
    results = list(tqdm(pool.imap(run_batch, range(n_batches)), total=n_batches))

for r in results:
    print(r)

# run_batch reports failures as "Error in batch ..." strings instead of
# raising; surface them explicitly so they are not lost among the status lines.
failed = [r for r in results if r.startswith("Error in batch")]
if failed:
    print(f"\n{len(failed)} of {n_batches} batches failed; re-run this cell to retry them.")

print(f"\n Total runtime: {(time.time() - start_time)/60:.1f} minutes")

In [None]:
# Combine All Batch Files
print("Combining all batch files...")


def _batch_number(path):
    """Return the integer N encoded in a ``flux_batch_N.csv`` file name."""
    return int(path.stem.rsplit("_", 1)[-1])


# Sort by the numeric batch index. A plain sorted() on the paths is
# lexicographic ("flux_batch_10.csv" < "flux_batch_2.csv"), which would
# scramble the row order relative to the original cell order.
# Files whose suffix is not a number are not batch outputs and are skipped.
batch_files = sorted(
    (
        f for f in output_dir.glob("flux_batch_*.csv")
        if f.stem.rsplit("_", 1)[-1].isdecimal()
    ),
    key=_batch_number,
)
if not batch_files:
    raise FileNotFoundError(f"No flux_batch_*.csv files found in {output_dir}")

# A gap in the batch numbering means some cells are missing from the result.
found_batches = {_batch_number(f) for f in batch_files}
missing_batches = sorted(set(range(max(found_batches) + 1)) - found_batches)
if missing_batches:
    print(
        f"WARNING: no output file for batch indices {missing_batches}; "
        "the combined table is incomplete."
    )

combined_flux = pd.concat([pd.read_csv(f, index_col=0) for f in batch_files])
print(f"Combined flux shape: {combined_flux.shape}")
combined_flux.head()

In [None]:
# Add Metadata and Save Final File
meta_df = pd.read_csv(meta_data, index_col=0)

n_flux_rows = combined_flux.shape[0]

# Label-based alignment is only safe when every flux row label identifies
# exactly one metadata row.
labels_align = (
    combined_flux.index.is_unique
    and meta_df.index.is_unique
    and bool(combined_flux.index.isin(meta_df.index).all())
)

if labels_align:
    # Look up each row's response by its own label, so the result is correct
    # even if the flux rows are ordered differently from the metadata or some
    # rows are missing.
    combined_flux["response"] = meta_df.loc[combined_flux.index, "response"].values
    # Keep the index unnamed, as in the positional path below.
    combined_flux.index = combined_flux.index.rename(None)
else:
    # Fallback: positional alignment. This is only valid when the flux rows
    # are in exactly the same order as the metadata rows.
    if n_flux_rows != meta_df.shape[0]:
        print(
            f"WARNING: flux table has {n_flux_rows} rows but metadata has "
            f"{meta_df.shape[0]}; positional alignment may mislabel cells."
        )
    combined_flux.index = meta_df.index.values[:n_flux_rows]
    combined_flux["response"] = meta_df["response"].values[:n_flux_rows]

combined_flux.to_csv(final_output, index=True)
print(f"Final fluxomics file saved to: {final_output}")