# Predicting the hydrogen storage capacity of alumina pillared interlayer clays using interpretable ensemble machine learning

Makungu M. Madirisha a , Lenganji Simwanda b ,  Regina P. Mtei a


https://doi.org/10.1016/j.ijhydene.2025.03.216

CODE BY LENGANJI SIMWANDA, COPYRIGHT © 2025

# Save-only mode + output folder + helpers

In [1]:
# ==============================
# CELL 1: SAVE-ONLY OUTPUT CONFIG
# ==============================
from pathlib import Path
import os, sys, warnings, contextlib, io, json
warnings.filterwarnings("ignore")

# All outputs saved here
OUTPUT_DIR = Path("outputs")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# --- Matplotlib: never display, only save
import matplotlib
matplotlib.use("Agg")  # MUST be before importing pyplot
import matplotlib.pyplot as plt
plt.ioff()

def savefig(filename: str, fig=None, dpi: int = 300, tight: bool = True):
    """Save current (or given) figure into OUTPUT_DIR, then close it. No display."""
    if fig is None:
        fig = plt.gcf()
    out_path = OUTPUT_DIR / filename
    fig.savefig(out_path, dpi=dpi, bbox_inches="tight" if tight else None)
    plt.close(fig)
    return out_path

def save_text(filename: str, text: str):
    out_path = OUTPUT_DIR / filename
    out_path.write_text(text, encoding="utf-8")
    return out_path

def save_json(filename: str, obj):
    out_path = OUTPUT_DIR / filename
    out_path.write_text(json.dumps(obj, indent=2, default=str), encoding="utf-8")
    return out_path

# Log file (use log(...) instead of print)
LOG_FILE = OUTPUT_DIR / "run.log"

def log(*args):
    msg = " ".join(str(a) for a in args)
    with LOG_FILE.open("a", encoding="utf-8") as f:
        f.write(msg + "\n")

# Optional: capture accidental prints within a block and write to log instead
@contextlib.contextmanager
def capture_stdout(to_file: Path = LOG_FILE):
    old_out, old_err = sys.stdout, sys.stderr
    buffer = io.StringIO()
    sys.stdout = sys.stderr = buffer
    try:
        yield
    finally:
        sys.stdout, sys.stderr = old_out, old_err
        content = buffer.getvalue()
        if content.strip():
            with to_file.open("a", encoding="utf-8") as f:
                f.write(content + ("\n" if not content.endswith("\n") else ""))

log(f"✅ Save-only mode ON. Outputs folder: {OUTPUT_DIR.resolve()}")


# Imports (core + ML + Optuna + SHAP)

In [2]:
# ==============================
# CELL 2: IMPORTS
# ==============================
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    r2_score, mean_squared_error, mean_absolute_error
)

from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

# Optional models (install if needed)
try:
    from xgboost import XGBRegressor
except Exception:
    XGBRegressor = None

try:
    from catboost import CatBoostRegressor
except Exception:
    CatBoostRegressor = None

import optuna

try:
    import shap
except Exception:
    shap = None

log("✅ Imports loaded.")


# Paths + Load data (Excel) + basic cleaning

In [3]:
# ==============================
# CELL 3: LOAD DATA
# ==============================
DATA_FILE = Path("database.xlsx")      # <-- update if needed
SHEET_NAME = 0                      # or "Sheet1"
TARGET_COL = "C"                    # <-- CHANGE this to your target column name

assert DATA_FILE.exists(), f"Missing file: {DATA_FILE.resolve()}"

with capture_stdout():
    df = pd.read_excel(DATA_FILE, sheet_name=SHEET_NAME)

# Basic numeric coercion (like your notebook style)
df = df.copy()
for c in df.columns:
    df[c] = pd.to_numeric(df[c], errors="ignore")

# Drop rows where target missing
df = df.dropna(subset=[TARGET_COL])

# Save a snapshot of data info
save_text("data_head.txt", df.head(20).to_string(index=False))
save_text("data_info.txt", str(df.info()))
save_json("data_shape.json", {"rows": int(df.shape[0]), "cols": int(df.shape[1])})

log("✅ Data loaded:", df.shape, "Target:", TARGET_COL)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 191 entries, 0 to 190
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   P        191 non-null    float64
 1   T        191 non-null    int64  
 2   Al/clay  191 non-null    int64  
 3   C        191 non-null    float64
dtypes: float64(2), int64(2)
memory usage: 6.1 KB


# Split features/target + preprocessing pipeline

In [4]:
# ==============================
# CELL 4: FEATURES / TARGET + PREPROCESS
# ==============================
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL].astype(float)

# Detect numeric vs categorical (keep simple)
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in X.columns if c not in num_cols]

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    # (Optional) OneHotEncoder if you have categoricals:
    # ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols)
    ],
    remainder="drop"
)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

log("✅ Split done. Train:", X_train.shape, "Test:", X_test.shape)
save_json("columns.json", {"num_cols": num_cols, "cat_cols": cat_cols})


WindowsPath('outputs/columns.json')

# EDA plots saved only (pairplot + correlations)

In [5]:
# ==============================
# CELL 5: EDA (SAVE-ONLY)
# ==============================
import matplotlib.pyplot as plt

# 5A) Correlation heatmap (numeric only)
if len(num_cols) >= 2:
    corr = df[num_cols + [TARGET_COL]].corr(numeric_only=True)
    fig = plt.figure()
    plt.imshow(corr.values)
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.index)), corr.index)
    plt.colorbar()
    savefig("corr_heatmap.png", fig=fig)
    save_text("corr_matrix.txt", corr.to_string())
    log("✅ Saved corr heatmap + matrix.")
else:
    log("⚠️ Skipped corr heatmap: not enough numeric columns.")


# Metrics + plotting helpers (save-only)

In [6]:
# ==============================
# CELL 6: METRICS + PLOT HELPERS (SAVE-ONLY)
# ==============================
import numpy as np
import matplotlib.pyplot as plt

def mape(y_true, y_pred, eps=1e-9):
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return np.mean(np.abs((y_true - y_pred) / (np.abs(y_true) + eps))) * 100.0

def evaluate_regression(y_true, y_pred):
    from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
    return {
        "r2": float(r2_score(y_true, y_pred)),
        "rmse": float(np.sqrt(mean_squared_error(y_true, y_pred))),
        "mae": float(mean_absolute_error(y_true, y_pred)),
        "mape": float(mape(y_true, y_pred)),
    }

def parity_plot(ax, y_train, y_train_pred, y_test, y_test_pred, title,
                xlabel=r"$H_{2,exp}$ (mmmol STP/g)", ylabel=r"$H_{2,pred}$ (mmmol STP/g)"):
    # Training: blue circles
    ax.scatter(y_train, y_train_pred, c="blue", s=18, marker="o", label="Training set")
    # Testing: red squares (matches your sample better than triangles)
    ax.scatter(y_test, y_test_pred, c="red", s=22, marker="s", label="Testing set")

    # Perfect prediction line using combined min/max
    mn = float(min(np.min(y_train), np.min(y_test), np.min(y_train_pred), np.min(y_test_pred)))
    mx = float(max(np.max(y_train), np.max(y_test), np.max(y_train_pred), np.max(y_test_pred)))
    ax.plot([mn, mx], [mn, mx], color="gray", linewidth=2, label="Perfect Prediction")

    ax.set_title(title, fontsize=14)
    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)

    ax.legend(loc="upper left", frameon=False)
    ax.grid(False)

    # Optional: start axes at 0 if your data is non-negative
    ax.set_xlim(left=0)
    ax.set_ylim(bottom=0)

def plot_metrics_2x2(metrics_dict, filename="metrics_2x2.png"):
    """
    metrics_dict example:
      {
        "AdaBoost": {"train": {...}, "test": {...}},
        "GBM": {...}, "CatBoost": {...}, "XGBoost": {...}
      }
    """
    # Keep order like your figure
    order = ["XGBoost", "CatBoost", "GBM", "AdaBoost"]
    order = [m for m in order if m in metrics_dict]

    fig, axes = plt.subplots(2, 2, figsize=(10, 8))
    axes = axes.ravel()

    keys = [("r2", r"$(a)\ r^2$"),
            ("rmse", "(b) RMSE"),
            ("mae", "(c) MAE"),
            ("mape", "(d) MAPE(%)")]

    for ax, (k, title) in zip(axes, keys):
        y_pos = np.arange(len(order))

        train_vals = [metrics_dict[m]["train"][k] for m in order]
        test_vals  = [metrics_dict[m]["test"][k]  for m in order]

        # Horizontal bars (training blue, testing red) like your sample
        ax.barh(y_pos - 0.18, train_vals, height=0.35, label="Training set")
        ax.barh(y_pos + 0.18, test_vals,  height=0.35, label="Testing set")

        ax.set_yticks(y_pos)
        ax.set_yticklabels(order)
        ax.set_title(title)

        # Annotate values on bars
        for i, (tv, sv) in enumerate(zip(train_vals, test_vals)):
            ax.text(tv, i - 0.18, f" {tv:.3f}", va="center", fontsize=9)
            ax.text(sv, i + 0.18, f" {sv:.3f}", va="center", fontsize=9)

        ax.grid(False)

    axes[0].legend(loc="upper right", frameon=False)
    savefig(filename, fig=fig)
    log(f"✅ Saved metrics figure: {filename}")


# Define models

In [7]:
# ==============================
# CELL 7: MODELS
# ==============================
models = {}

# Baselines
models["DecisionTree"] = DecisionTreeRegressor(random_state=42)
models["AdaBoost"] = AdaBoostRegressor(random_state=42)
models["GradientBoosting"] = GradientBoostingRegressor(random_state=42)

# Optional advanced
if XGBRegressor is not None:
    models["XGBoost"] = XGBRegressor(
        random_state=42,
        n_estimators=500,
        learning_rate=0.05,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0
    )
else:
    log("⚠️ XGBoost not available (xgboost not installed).")

if CatBoostRegressor is not None:
    models["CatBoost"] = CatBoostRegressor(
        random_state=42,
        verbose=False,
        iterations=1000,
        learning_rate=0.05,
        depth=6
    )
else:
    log("⚠️ CatBoost not available (catboost not installed).")

log("✅ Models defined:", list(models.keys()))


# Train + evaluate all models (save metrics + plots)

In [10]:
# ==============================
# CELL 8: TRAIN + EVALUATE + PARITY(2x2) + METRICS(2x2) (SAVE-ONLY)
# ==============================
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import joblib
import pandas as pd
import numpy as np
import optuna
import json
from pathlib import Path
import logging
import warnings

# ------------------------------
# Silence Optuna & warnings
# ------------------------------
optuna.logging.set_verbosity(optuna.logging.WARNING)
logging.getLogger("optuna").setLevel(logging.WARNING)
warnings.filterwarnings("ignore")

# ------------------------------
# Containers
# ------------------------------
results_rows = []
pred_store = {}
metrics_store = {}
artifacts = {}

# ------------------------------
# Optuna setup
# ------------------------------
TUNING_DIR = OUTPUT_DIR / "optuna"
TUNING_DIR.mkdir(parents=True, exist_ok=True)

N_TRIALS = 60
CV_SPLITS = 5

cv = KFold(n_splits=CV_SPLITS, shuffle=True, random_state=42)

# ------------------------------
# CV RMSE (version-safe)
# ------------------------------
def _cv_rmse(model):
    rmses = []
    for tr_idx, va_idx in cv.split(X_train):
        Xtr, Xva = X_train.iloc[tr_idx], X_train.iloc[va_idx]
        ytr, yva = y_train.iloc[tr_idx], y_train.iloc[va_idx]

        pipe = Pipeline([
            ("preprocess", preprocess),
            ("model", model)
        ])

        pipe.fit(Xtr, ytr)
        pred = pipe.predict(Xva)

        rmse = np.sqrt(mean_squared_error(yva, pred))
        rmses.append(rmse)

    return float(np.mean(rmses))

# ------------------------------
# Optuna tuning function
# ------------------------------
def tune_model(model_name):

    sampler = optuna.samplers.TPESampler(seed=42)
    pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)

    def objective(trial):

        if model_name == "AdaBoost":
            max_depth = trial.suggest_int("base_estimator_max_depth", 1, 10)
            base_tree = DecisionTreeRegressor(max_depth=max_depth, random_state=42)

            params = dict(
                n_estimators=trial.suggest_int("n_estimators", 50, 500),
                learning_rate=trial.suggest_float("learning_rate", 0.01, 1.0, log=True),
                loss=trial.suggest_categorical("loss", ["linear", "square", "exponential"]),
                random_state=42
            )

            try:
                model = AdaBoostRegressor(estimator=base_tree, **params)
            except TypeError:
                model = AdaBoostRegressor(base_estimator=base_tree, **params)

        elif model_name == "GradientBoosting":
            model = GradientBoostingRegressor(
                n_estimators=trial.suggest_int("n_estimators", 100, 500),
                learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5, log=True),
                max_depth=trial.suggest_int("max_depth", 3, 10),
                min_samples_split=trial.suggest_int("min_samples_split", 2, 10),
                min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 5),
                max_features=trial.suggest_categorical("max_features", ["sqrt", "log2"]),
                subsample=trial.suggest_float("subsample", 0.5, 1.0),
                random_state=42
            )

        elif model_name == "XGBoost":
            if XGBRegressor is None:
                raise optuna.exceptions.TrialPruned()

            model = XGBRegressor(
                objective="reg:squarederror",
                n_estimators=trial.suggest_int("n_estimators", 100, 500),
                learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3),
                max_depth=trial.suggest_int("max_depth", 3, 10),
                min_child_weight=trial.suggest_int("min_child_weight", 1, 10),
                subsample=trial.suggest_float("subsample", 0.6, 1.0),
                colsample_bytree=trial.suggest_float("colsample_bytree", 0.6, 1.0),
                reg_alpha=trial.suggest_float("reg_alpha", 0.0, 1.0),
                reg_lambda=trial.suggest_float("reg_lambda", 0.0, 1.0),
                random_state=42,
                n_jobs=-1
            )

        elif model_name == "CatBoost":
            if CatBoostRegressor is None:
                raise optuna.exceptions.TrialPruned()

            model = CatBoostRegressor(
                iterations=trial.suggest_int("iterations", 100, 1000),
                depth=trial.suggest_int("depth", 4, 10),
                learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
                random_strength=trial.suggest_int("random_strength", 0, 100),
                bagging_temperature=trial.suggest_float("bagging_temperature", 0.01, 1.0, log=True),
                l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1.0, 10.0, log=True),
                border_count=trial.suggest_int("border_count", 50, 255),
                loss_function="RMSE",
                verbose=False,
                random_seed=42
            )

        else:
            return _cv_rmse(models[model_name])

        return _cv_rmse(model)

    study = optuna.create_study(
        direction="minimize",
        sampler=sampler,
        pruner=pruner
    )

    study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=False)

    best_params = dict(study.best_params)

    # --------------------------
    # Rebuild best model
    # --------------------------
    if model_name == "AdaBoost":
        md = best_params.pop("base_estimator_max_depth")
        base_tree = DecisionTreeRegressor(max_depth=md, random_state=42)
        try:
            best_model = AdaBoostRegressor(estimator=base_tree, random_state=42, **best_params)
        except TypeError:
            best_model = AdaBoostRegressor(base_estimator=base_tree, random_state=42, **best_params)

    elif model_name == "GradientBoosting":
        best_model = GradientBoostingRegressor(random_state=42, **best_params)

    elif model_name == "XGBoost":
        best_model = XGBRegressor(
            objective="reg:squarederror",
            random_state=42,
            n_jobs=-1,
            **best_params
        )

    elif model_name == "CatBoost":
        best_model = CatBoostRegressor(
            loss_function="RMSE",
            verbose=False,
            random_seed=42,
            **best_params
        )

    else:
        best_model = models[model_name]

    # Save best params only (no history)
    with open(TUNING_DIR / f"{model_name}_best_params.json", "w") as f:
        json.dump(
            {
                "model": model_name,
                "best_cv_rmse": float(study.best_value),
                "best_params": best_params
            },
            f,
            indent=2
        )

    log(f"✅ Optuna tuned {model_name}")

    return best_model

# ------------------------------
# Tune selected models
# ------------------------------
to_tune = ["AdaBoost", "GradientBoosting", "XGBoost", "CatBoost"]
to_tune = [m for m in to_tune if m in models]

for mname in to_tune:
    try:
        models[mname] = tune_model(mname)
    except Exception as e:
        log(f"⚠️ Tuning skipped for {mname}: {e}")

# ------------------------------
# Train & evaluate models
# ------------------------------
for name, model in models.items():

    pipe = Pipeline([
        ("preprocess", preprocess),
        ("model", clone(model))
    ])

    with capture_stdout():
        pipe.fit(X_train, y_train)

    ytr = pipe.predict(X_train)
    yte = pipe.predict(X_test)

    m_tr = evaluate_regression(y_train, ytr)
    m_te = evaluate_regression(y_test, yte)

    pred_store[name] = {"ytr": ytr, "yte": yte}
    metrics_store[name] = {"train": m_tr, "test": m_te}

    results_rows.append({"model": name, "split": "train", **m_tr})
    results_rows.append({"model": name, "split": "test", **m_te})

    model_path = OUTPUT_DIR / f"{name}_pipeline.joblib"
    joblib.dump(pipe, model_path)
    artifacts[name] = str(model_path)

    log(f"✅ {name} done | R2(test)={m_te['r2']:.4f}")

# ------------------------------
# Save metrics
# ------------------------------
results_df = pd.DataFrame(results_rows)
results_df.to_csv(OUTPUT_DIR / "metrics_all_models.csv", index=False)
save_text("metrics_all_models.txt", results_df.to_string(index=False))
save_json("model_artifacts.json", artifacts)

# ------------------------------
# Parity plots (2x2)
# ------------------------------
parity_order = ["AdaBoost", "GradientBoosting", "CatBoost", "XGBoost"]
titles = {
    "AdaBoost": "(a) AdaBoost",
    "GradientBoosting": "(b) GBM",
    "CatBoost": "(c) CatBoost",
    "XGBoost": "(d) XGBoost",
}

fig, axes = plt.subplots(2, 2, figsize=(7, 7))
axes = axes.ravel()

for ax, mname in zip(axes, parity_order):
    if mname not in pred_store:
        ax.axis("off")
        continue

    parity_plot(
        ax=ax,
        y_train=y_train,
        y_train_pred=pred_store[mname]["ytr"],
        y_test=y_test,
        y_test_pred=pred_store[mname]["yte"],
        title=titles[mname],
        xlabel=r"$H_{2,exp}$ (mmmol STP/g)",
        ylabel=r"$H_{2,pred}$ (mmmol STP/g)",
    )

plt.tight_layout()
savefig("parity_2x2.png", fig=fig)

# ------------------------------
# Metrics plots
# ------------------------------
plot_metrics_2x2(metrics_store, filename="metrics_2x2.png")

log("✅ All models tuned, trained, evaluated, and saved.")


# SHAP explainability (save-only, no display)

In [11]:
# ==============================
# CELL 10: SHAP (SAVE-ONLY) WITH FEATURE LABELS + BAR + DEPENDENCE
# ==============================
if shap is None:
    log("⚠️ SHAP not available (shap not installed). Skipping.")
else:
    import joblib
    
    
model_name = "XGBoost"

FEATURE_LABELS = [
    "P", "T", "TSU$_{mont-Al}$"
    # add the rest in the correct order...
    # e.g., "Al/clay", "Si/clay", ...
]

# Use your actual column names if you prefer:
# FEATURE_LABELS = list(X.columns)

# ---- 3) Transform X_test into model input space
X_test_proc = pipe.named_steps["preprocess"].transform(X_test)
model = pipe.named_steps["model"]

# If preprocess changes feature count (e.g., one-hot), we fallback safely
n_features_proc = X_test_proc.shape[1]
if len(FEATURE_LABELS) != n_features_proc:
    log(f"⚠️ FEATURE_LABELS length ({len(FEATURE_LABELS)}) != processed features ({n_features_proc}).")
    # fallback: generic labels
    FEATURE_LABELS = [f"f{i}" for i in range(n_features_proc)]

# SHAP wants a DataFrame to show names nicely
Xshap = pd.DataFrame(X_test_proc, columns=FEATURE_LABELS)

# ---- 4) Explainer
with capture_stdout():
    try:
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(Xshap)
    except Exception:
        explainer = shap.Explainer(model, Xshap)
        shap_values = explainer(Xshap)

# Standardize shap_values to an array for plotting
sv = getattr(shap_values, "values", shap_values)

# ---- (a) Beeswarm summary
fig = plt.figure(figsize=(6, 3))
with capture_stdout():
    shap.summary_plot(sv, Xshap, show=False)
savefig(f"shap_summary_beeswarm_{model_name}.png", fig=fig)
log(f"✅ Saved shap_summary_beeswarm_{model_name}.png")

# ---- (b) Mean(|SHAP|) bar plot
fig = plt.figure(figsize=(6, 3))
with capture_stdout():
    shap.summary_plot(sv, Xshap, plot_type="bar", show=False)
savefig(f"shap_mean_bar_{model_name}.png", fig=fig)
log(f"✅ Saved shap_mean_bar_{model_name}.png")

# ---- (c) Dependence plots like your example
# Choose feature pairs here (edit names to match your labels)
dependence_pairs = [
    ("P", "T"),
    ("T", "P"),
    ("TSU$_{mont-Al}$", "P"),
]

for main_feat, interaction_feat in dependence_pairs:
    if main_feat in Xshap.columns and interaction_feat in Xshap.columns:
        fig = plt.figure(figsize=(5, 4))
        ax = plt.gca()
        with capture_stdout():
            shap.dependence_plot(
                main_feat, sv, Xshap,
                interaction_index=interaction_feat,
                show=False, ax=ax
            )
        # title like your figure panels
        ax.set_title(f"{main_feat} against {interaction_feat}")
        savefig(f"shap_dependence_{main_feat}_vs_{interaction_feat}_{model_name}.png", fig=fig)
        log(f"✅ Saved dependence: {main_feat} vs {interaction_feat}")
    else:
        log(f"⚠️ Skipped dependence plot: {main_feat} or {interaction_feat} not in labels.")


# Final “run report” saved

In [12]:
# ==============================
# CELL 11: FINAL REPORT
# ==============================
report_lines = []
report_lines.append("RUN COMPLETE\n")
report_lines.append(f"Data file: {DATA_FILE}\n")
report_lines.append(f"Rows/Cols: {df.shape}\n")
report_lines.append(f"Target: {TARGET_COL}\n")
report_lines.append(f"Numeric cols: {len(num_cols)} | Categorical cols: {len(cat_cols)}\n")

metrics_path = OUTPUT_DIR / "metrics_all_models.csv"
if metrics_path.exists():
    report_lines.append(f"Metrics: {metrics_path.name}\n")

if (OUTPUT_DIR / "BEST_pipeline.joblib").exists():
    report_lines.append("Best model saved: BEST_pipeline.joblib\n")
    if (OUTPUT_DIR / "best_model_metrics.json").exists():
        report_lines.append("Best model metrics: best_model_metrics.json\n")

save_text("RUN_REPORT.txt", "".join(report_lines))
log("✅ Saved RUN_REPORT.txt")
