In [None]:
import pandas as pd
import numpy as np
import gc
import time
import warnings
from pathlib import Path
from tqdm.auto import tqdm
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from tensorflow.keras import models, layers, optimizers
from tensorflow.keras.callbacks import EarlyStopping

import shap

warnings.filterwarnings("ignore")

base_dir = Path().resolve().parent
dataset_dir = base_dir / "dataset"
results_dir = base_dir / "results"
results_dir.mkdir(exist_ok=True)


✅ Environment ready.


In [None]:
# Select Number of Feature for Each Modality
tr_n = 250
fl_n = 200

In [3]:
def evaluate_model(model, X_test, y_test, model_name):
    y_pred = model.predict(X_test)
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)[:, 1]
    else:
        y_prob = y_pred  # for ANN sigmoid output
    
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)
    print(f"✅ {model_name}: acc={acc:.3f}, f1={f1:.3f}, auc={auc:.3f}")
    return acc, prec, rec, f1, auc


In [4]:
def load_transcriptomic(base_dir, meta):
    trans = pd.read_csv(base_dir / "dataset/transcriptomic_hvg.csv", index_col=0)
    trans = pd.merge(trans, meta[["response"]], left_index=True, right_index=True, how="inner")
    trans = trans.loc[:, ~trans.columns.str.upper().str.startswith("MT-")]
    trans["response"] = trans["response"].map({"Responder": 1, "Non-responder": 0})
    y = trans["response"].astype(np.int8)
    X = trans.drop(columns=["response"]).astype(np.float32)
    return X, y


def load_fluxomic(base_dir):
    flux = pd.read_csv(base_dir / "dataset/fluxomics.csv", index_col=0)
    rxns = [c for c in flux.columns if c != "response" and not c.startswith("EX_")]
    flux = flux[rxns + ["response"]]
    flux["response"] = flux["response"].map({"Responder": 1, "Non-responder": 0})
    y = flux["response"].astype(np.int8)
    X = flux.drop(columns=["response"]).astype(np.float32)
    return X, y

In [None]:
# Load Data and select features
meta = pd.read_csv(base_dir / "dataset/metadata.csv", index_col=0)
hvg_genes = pd.read_csv(base_dir / "dataset/hvg_genes.csv", header=None).squeeze().tolist()

X_trans, y_trans = load_transcriptomic(base_dir, meta)
X_flux, y_flux = load_fluxomic(base_dir)

# select top features
tr_top = [g for g in hvg_genes if g in X_trans.columns][:tr_n]
fl_top = X_flux.var().sort_values(ascending=False).index[:fl_n]

X_tr_250 = X_trans[tr_top]
X_fl_200 = X_flux[fl_top]

# multimodal merge
X_multi_full = pd.merge(X_trans, X_flux, left_index=True, right_index=True, how="inner")
y_multi = meta.loc[X_multi_full.index, "response"].map({"Responder": 1, "Non-responder": 0}).astype(np.int8)
X_multi = X_multi_full[list(tr_top) + list(fl_top)]
print("✅ Data loaded and selected.")


✅ Data loaded and selected.


In [8]:
def get_models():
    models_dict = {}

    # Logistic Regression
    models_dict["LogReg"] = LogisticRegression(
        solver="lbfgs", C=0.5, max_iter=1000
    )

    # Random Forest
    models_dict["RandomForest"] = RandomForestClassifier(
        n_estimators=500, max_depth=10, min_samples_split=2, 
        min_samples_leaf=1, max_features="sqrt", n_jobs=-1, random_state=42
    )

    # XGBoost
    models_dict["XGBoost"] = xgb.XGBClassifier(
        objective="binary:logistic", 
        eval_metric="logloss",
        learning_rate=0.05,
        n_estimators=800,
        max_depth=8,
        subsample=0.9,
        colsample_bytree=0.8,
        tree_method="hist",
        n_jobs=-1,
        random_state=42,
        verbosity=0
    )

    # ANN
    def build_ann(input_dim):
        model = models.Sequential([
            layers.Input(shape=(input_dim,)),
            layers.Dense(64, activation="relu"),
            layers.BatchNormalization(),
            layers.Dropout(0.5),
            layers.Dense(32, activation="relu"),
            layers.BatchNormalization(),
            layers.Dense(1, activation="sigmoid")
        ])
        model.compile(optimizer=optimizers.Adam(learning_rate=0.001),
                      loss="binary_crossentropy",
                      metrics=["accuracy"])
        return model
    models_dict["ANN_builder"] = build_ann

    return models_dict


In [13]:
results = []
models_dict = get_models()

modalities = {
    "Transcriptomic (250)": (X_tr_250, y_trans),
    "Fluxomic (200)": (X_fl_200, y_flux),
    "Multimodal (250+200)": (X_multi, y_multi)
}

for label, (X, y) in modalities.items():
    print(f"\n==============================\n🧬 {label}\n==============================")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    for model_name, model in models_dict.items():
        print(f"\n🚀 Training {model_name} ...")

        if model_name == "ANN_builder":
            ann = model(X_train_scaled.shape[1])
            early_stop = EarlyStopping(patience=10, restore_best_weights=True)
            ann.fit(
                X_train_scaled, y_train,
                validation_data=(X_test_scaled, y_test),
                epochs=50,
                batch_size=32,
                verbose=0,
                callbacks=[early_stop]
            )
            y_pred = (ann.predict(X_test_scaled) > 0.5).astype(int)
            y_prob = ann.predict(X_test_scaled)
            acc = accuracy_score(y_test, y_pred)
            prec = precision_score(y_test, y_pred)
            rec = recall_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_prob)
        else:
            model.fit(X_train_scaled, y_train)
            acc, prec, rec, f1, auc = evaluate_model(model, X_test_scaled, y_test, model_name)

        results.append({
            "Modality": label,
            "Model": model_name.replace("_builder", ""),
            "Acc": acc, "Prec": prec, "Rec": rec, "F1": f1, "AUC": auc
        })

        # --- SHAP only for Multimodal XGBoost ---
        if label == "Multimodal (250+200)" and model_name == "XGBoost":
            print("🧩 Computing SHAP contributions for multimodal XGBoost...")
            sample_X = pd.DataFrame(X_test_scaled, columns=X.columns).sample(n=min(100, len(X_test)), random_state=42)
            explainer = shap.TreeExplainer(model)
            shap_vals = explainer.shap_values(sample_X)
            shap_df = pd.DataFrame({
                "Feature": sample_X.columns,
                "MeanAbsSHAP": np.abs(shap_vals).mean(axis=0)
            })
            shap_df["Modality"] = ["Fluxomic" if f in X_flux.columns else "Transcriptomic" for f in shap_df["Feature"]]
            contrib = shap_df.groupby("Modality")["MeanAbsSHAP"].sum()
            perc = 100 * contrib / contrib.sum()
            tr_perc, fl_perc = perc.get("Transcriptomic", 0), perc.get("Fluxomic", 0)

            results[-1]["SHAP_Tr_%"] = tr_perc
            results[-1]["SHAP_Fl_%"] = fl_perc
            print(f"📊 SHAP contribution → Transcriptomic: {tr_perc:.1f}%, Fluxomic: {fl_perc:.1f}%")

        gc.collect()



🧬 Transcriptomic (250)

🚀 Training LogReg ...
✅ LogReg: acc=0.799, f1=0.683, auc=0.873

🚀 Training RandomForest ...
✅ RandomForest: acc=0.801, f1=0.671, auc=0.875

🚀 Training XGBoost ...
✅ XGBoost: acc=0.847, f1=0.764, auc=0.922

🚀 Training ANN_builder ...
[1m331/331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 991us/step
[1m331/331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 880us/step

🧬 Fluxomic (200)

🚀 Training LogReg ...
✅ LogReg: acc=0.663, f1=0.006, auc=0.553

🚀 Training RandomForest ...
✅ RandomForest: acc=0.663, f1=0.001, auc=0.584

🚀 Training XGBoost ...
✅ XGBoost: acc=0.646, f1=0.185, auc=0.565

🚀 Training ANN_builder ...
[1m324/324[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m324/324[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 828us/step

🧬 Multimodal (250+200)

🚀 Training LogReg ...
✅ LogReg: acc=0.790, f1=0.678, auc=0.868

🚀 Training RandomForest ...
✅ RandomForest: acc=0.781, f1=0.630, auc=0.861

🚀 Training XGBoost 

In [None]:
# Save results as csv
pd.DataFrame(results).to_csv(results_dir / 'comparison_results.csv',index=False)