In [1]:
# %pip install -U pandas numpy scikit-learn xgboost catboost lightgbm torch optuna joblib

## 0) Environment configuration and required library imports

In [2]:
import os, json, math, warnings, gc, time
from typing import Dict, List, Optional, Tuple
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, GroupKFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor  
from sklearn.base import clone
warnings.filterwarnings("ignore")
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import optuna

## 1) Utility functions: metric evaluation and data handling

In [3]:
def _rmse(y_true, y_pred) -> float:
    """Return the root-mean-squared error between targets and predictions as a plain float."""
    mse = mean_squared_error(y_true, y_pred)
    root = np.sqrt(mse)
    return float(root)

def _to_arr(a):
    if isinstance(a, (pd.DataFrame, pd.Series)):
        return a.to_numpy()
    return np.asarray(a)

def _make_dir(p):
    os.makedirs(p, exist_ok=True)

## 2) GPU and PyTorch device detection

In [4]:
# Check if GPU is available (using CUDA)
def _gpu_available() -> bool:
    import shutil, os
    try:
        import torch
        if torch.cuda.is_available():
            return True
    except Exception:
        pass
    try:
        import cupy
        if cupy.cuda.runtime.getDeviceCount() > 0:
            return True
    except Exception:
        pass
    if shutil.which("nvidia-smi"):
        return True
    if os.environ.get("CUDA_VISIBLE_DEVICES", "") not in ("", "-1", None):
        return True
    return False

# Check if PyTorch is installed and available
def _torch_ok() -> bool:
    try:
        import torch
        return True
    except Exception:
        return False

# Set device (GPU or CPU) for PyTorch
def _torch_device():
    try:
        import torch
        return torch.device("cuda" if torch.cuda.is_available() else "cpu")
    except Exception:
        return None

## 3) Set all random seeds for reproducibility across libraries

In [5]:
def _set_all_seeds(seed: int = 42):
    import random
    random.seed(seed)
    np.random.seed(seed)
    try:
        import torch
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    except Exception:
        pass

## 4) MLP model construction and training (PyTorch implementation)

In [6]:
# Create a simple MLP model using PyTorch
def _make_mlp(in_dim: int, out_dim: int, width: int, nlayer: int, dropout: float):
    import torch.nn as nn
    layers = []
    last = in_dim
    act = nn.ReLU()
    for _ in range(nlayer):
        layers += [nn.Linear(last, width), act, nn.Dropout(dropout)]
        last = width
    layers += [nn.Linear(last, out_dim)]
    return nn.Sequential(*layers)

# Training one epoch for the MLP model
def _train_one_epoch(model, loader, criterion, optimizer, device):
    import torch
    model.train()
    loss_sum, n = 0.0, 0
    for xb, yb in loader:
        xb = xb.to(device); yb = yb.to(device)
        optimizer.zero_grad(set_to_none=True)
        pred = model(xb).squeeze(-1)
        loss = criterion(pred, yb)
        loss.backward()
        optimizer.step()
        bs = yb.size(0)
        loss_sum += loss.item() * bs
        n += bs
    return loss_sum / max(1, n)

# Evaluate RMSE on validation set for the model
def _eval_rmse(model, loader, device):
    import torch
    model.eval()
    se_sum, n = 0.0, 0
    with torch.no_grad():
        for xb, yb in loader:
            xb = xb.to(device); yb = yb.to(device)
            pred = model(xb).squeeze(-1)
            se_sum += torch.sum((pred - yb) ** 2).item()
            n += yb.size(0)
    return _safe_sqrt(se_sum / max(1, n))

## 5) PyTorch model training: data standardisation and model optimisation

In [7]:
# Fit the PyTorch MLP on standardized features, with early stopping on validation RMSE
def _fit_torch_mlp(
    X_tr, y_tr, X_va, y_va,
    width: int, nlayer: int, dropout: float,
    lr: float, weight_decay: float,
    batch: int, epochs: int, patience: int,
    seed: int = 42, verbose: bool = False
):
    """Train an MLP with AdamW/MSE and early stopping on validation RMSE.

    This is the single definition of the function; the notebook previously
    defined it twice with identical bodies, the second shadowing the first.

    Args:
        X_tr, y_tr: training features and targets (targets are flattened to 1-D).
        X_va, y_va: validation features and targets used for early stopping.
        width, nlayer, dropout: architecture settings forwarded to ``_make_mlp``.
        lr, weight_decay: AdamW settings.
        batch: training mini-batch size (validation uses a fixed 4096).
        epochs: maximum number of epochs.
        patience: stop after this many consecutive epochs without improvement.
        seed: passed to ``_set_all_seeds`` before loaders and the net are built.
        verbose: print validation RMSE every 10 epochs and on the last epoch.

    Returns:
        (scaler, best_state, best_rmse): the StandardScaler fitted on ``X_tr``
        only, a CPU copy of the best ``state_dict`` (``None`` if no epoch
        improved on ``inf``, e.g. zero epochs or NaN losses), and the best
        validation RMSE.
    """
    import torch
    import torch.nn as nn
    from torch.utils.data import TensorDataset, DataLoader

    # Standardize with statistics from the training split only.
    scaler = StandardScaler(with_mean=True, with_std=True)
    X_tr_std = scaler.fit_transform(X_tr)
    X_va_std = scaler.transform(X_va)

    # np.asarray lets targets arrive as ndarray, pandas Series or list.
    Xtr = torch.tensor(X_tr_std, dtype=torch.float32)
    ytr = torch.tensor(np.asarray(y_tr).reshape(-1), dtype=torch.float32)
    Xva = torch.tensor(X_va_std, dtype=torch.float32)
    yva = torch.tensor(np.asarray(y_va).reshape(-1), dtype=torch.float32)

    _set_all_seeds(seed)
    device = _torch_device() or "cpu"

    train_loader = DataLoader(TensorDataset(Xtr, ytr), batch_size=int(batch), shuffle=True)
    val_loader   = DataLoader(TensorDataset(Xva, yva), batch_size=4096, shuffle=False)

    net = _make_mlp(
        in_dim=X_tr_std.shape[1], out_dim=1,
        width=int(width), nlayer=int(nlayer), dropout=float(dropout)
    ).to(device)
    optimizer = torch.optim.AdamW(net.parameters(), lr=float(lr), weight_decay=float(weight_decay))
    criterion = nn.MSELoss()

    n_epochs = int(epochs)
    max_wait = int(patience)
    best_rmse = float("inf")
    best_state = None
    wait = 0

    for ep in range(n_epochs):
        _ = _train_one_epoch(net, train_loader, criterion, optimizer, device)
        rmse = _eval_rmse(net, val_loader, device)
        if verbose and (ep % 10 == 0 or ep == n_epochs - 1):
            print(f"[Torch-MLP] epoch {ep:03d}  val RMSE={rmse:.5f}")

        # Require an improvement larger than 1e-6 so noise does not reset patience.
        if rmse + 1e-6 < best_rmse:
            best_rmse = rmse
            # Snapshot on CPU so the best weights outlive later updates.
            best_state = {k: v.detach().cpu().clone() for k, v in net.state_dict().items()}
            wait = 0
        else:
            wait += 1
            if wait >= max_wait:
                break

    del Xtr, ytr, Xva, yva
    try:
        torch.cuda.empty_cache()
    except Exception:
        pass

    return scaler, best_state, best_rmse

class TorchMLPRegressor:
    """Minimal ``fit``/``predict`` wrapper around the notebook's PyTorch MLP helpers.

    ``params`` must contain the keys ``width``, ``nlayer``, ``dropout``, ``lr``,
    ``weight_decay``, ``batch``, ``epochs`` and ``patience``; they are forwarded
    to ``_fit_torch_mlp`` / ``_make_mlp``.
    """

    def __init__(self, **params):
        self.params = dict(params)
        self.scaler_ = None   # StandardScaler fitted during fit()
        self.state_  = None   # best state_dict returned by _fit_torch_mlp
        self.in_dim_ = None   # number of input features seen in fit()

    def fit(self, X, y):
        """Train on ``(X, y)``, holding out 10% of the rows for early stopping.

        Raises:
            RuntimeError: if PyTorch cannot be imported.
            ValueError: if fewer than two samples are given.
        """
        if not _torch_ok():
            raise RuntimeError("PyTorch is not available.")
        X = np.asarray(X)
        y = np.asarray(y).reshape(-1)
        n = len(y)
        if n < 2:
            raise ValueError("Need at least 2 samples to form train and early-stopping splits.")
        m = max(1, int(0.1 * n))
        # Draw the hold-out at random (fixed seed, local RNG) rather than
        # taking the first 10% of rows: when rows arrive in dataset order the
        # head of the array is not representative, which biases early stopping.
        order = np.random.RandomState(42).permutation(n)
        va_idx, tr_idx = order[:m], order[m:]
        X_tr, y_tr = X[tr_idx], y[tr_idx]
        X_va, y_va = X[va_idx], y[va_idx]
        self.in_dim_ = X.shape[1]
        self.scaler_, self.state_, _ = _fit_torch_mlp(
            X_tr, y_tr, X_va, y_va,
            self.params["width"], self.params["nlayer"], self.params["dropout"],
            self.params["lr"], self.params["weight_decay"],
            self.params["batch"], self.params["epochs"], self.params["patience"],
            seed=42, verbose=False
        )
        return self

    def predict(self, X):
        """Return a 1-D NumPy array of predictions for ``X``.

        Raises:
            RuntimeError: if PyTorch is unavailable or the model has no fitted
                state (``fit`` not called, or training never improved).
        """
        if not _torch_ok():
            raise RuntimeError("PyTorch is not available.")
        import torch
        if self.state_ is None or self.scaler_ is None:
            raise RuntimeError("Model not fitted yet.")
        X_std = self.scaler_.transform(X)
        X_te = torch.tensor(X_std, dtype=torch.float32)
        device = _torch_device() or "cpu"
        # Rebuild the architecture from the stored params, then load the best weights.
        net = _make_mlp(self.in_dim_, 1,
                        width=int(self.params["width"]),
                        nlayer=int(self.params["nlayer"]),
                        dropout=float(self.params["dropout"])).to(device)
        net.load_state_dict(self.state_)
        net.eval()
        with torch.no_grad():
            yhat = net(X_te.to(device)).cpu().numpy().reshape(-1)
        return yhat

## 6) Optuna optimisation: hyperparameter tuning objectives for each model

In [8]:
def obj_xgb(X_tr, y_tr, X_va, y_va):
    """Return an Optuna objective that fits XGBRegressor on the training split
    and scores it by RMSE on the validation split."""
    def _obj(trial: optuna.Trial):
        # Suggestions are drawn in a fixed order so a seeded sampler
        # reproduces the same sequence of trials.
        depth = trial.suggest_int("max_depth", 6, 14)
        eta = trial.suggest_float("eta", 0.01, 0.2, log=True)
        child_weight = trial.suggest_float("min_child_weight", 1.0, 12.0)
        row_frac = trial.suggest_float("subsample", 0.6, 1.0)
        col_frac = trial.suggest_float("colsample_bytree", 0.6, 1.0)
        l2 = trial.suggest_float("lambda", 1e-4, 10.0, log=True)
        l1 = trial.suggest_float("alpha", 1e-6, 1.0, log=True)

        regressor = XGBRegressor(
            n_estimators=1200,
            tree_method="hist",
            random_state=42,
            n_jobs=-1,
            max_depth=depth,
            learning_rate=eta,
            min_child_weight=child_weight,
            subsample=row_frac,
            colsample_bytree=col_frac,
            reg_lambda=l2,
            reg_alpha=l1,
        )
        regressor.fit(X_tr, y_tr)
        return _rmse(y_va, regressor.predict(X_va))
    return _obj

In [9]:
def obj_lgb(X_tr, y_tr, X_va, y_va):
    """Return an Optuna objective that fits LGBMRegressor on the training split
    and scores it by RMSE on the validation split."""
    def _obj(trial: optuna.Trial):
        # Suggestions are drawn in a fixed order so a seeded sampler
        # reproduces the same sequence of trials.
        lr = trial.suggest_float("learning_rate", 0.01, 0.10, log=True)
        leaves = trial.suggest_int("num_leaves", 64, 512, step=32)
        min_leaf = trial.suggest_int("min_data_in_leaf", 20, 200, step=10)
        feat_frac = trial.suggest_float("feature_fraction", 0.6, 1.0)
        # NOTE(review): LightGBM documents that bagging_fraction only takes
        # effect when bagging_freq > 0; no bagging_freq is set here, so this
        # sampled value may be inert -- confirm and set it in both the
        # objective and the final model if bagging is intended.
        bag_frac = trial.suggest_float("bagging_fraction", 0.6, 1.0)
        l1 = trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True)
        l2 = trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True)

        regressor = LGBMRegressor(
            learning_rate=lr,
            num_leaves=leaves,
            min_data_in_leaf=min_leaf,
            feature_fraction=feat_frac,
            bagging_fraction=bag_frac,
            lambda_l1=l1,
            lambda_l2=l2,
            n_estimators=2000,
            random_state=42,
            n_jobs=-1,
            force_col_wise=True,
            verbosity=-1,
        )
        regressor.fit(X_tr, y_tr)
        return _rmse(y_va, regressor.predict(X_va))
    return _obj

In [10]:
def obj_cat(X_tr, y_tr, X_va, y_va):
    """Return an Optuna objective that fits CatBoostRegressor on the training
    split and scores it by RMSE on the validation split."""
    def _obj(trial: optuna.Trial):
        # Suggestions are drawn in a fixed order so a seeded sampler
        # reproduces the same sequence of trials.
        tree_depth = trial.suggest_int("depth", 6, 10)
        lr = trial.suggest_float("learning_rate", 0.02, 0.2, log=True)
        l2 = trial.suggest_float("l2_leaf_reg", 1e-3, 20.0, log=True)
        row_frac = trial.suggest_float("subsample", 0.6, 1.0)

        regressor = CatBoostRegressor(
            depth=tree_depth,
            learning_rate=lr,
            l2_leaf_reg=l2,
            subsample=row_frac,
            iterations=2000,
            loss_function="RMSE",
            random_seed=42,
            verbose=False,
        )
        regressor.fit(X_tr, y_tr)
        return _rmse(y_va, regressor.predict(X_va))
    return _obj

In [11]:
def obj_mlp_torch(X_tr, y_tr, X_va, y_va):
    """Return an Optuna objective for the MLP.

    With PyTorch importable the trial trains ``_fit_torch_mlp`` and returns its
    best validation RMSE. Otherwise it falls back to a scikit-learn
    ``StandardScaler`` + ``MLPRegressor`` pipeline scored on ``(X_va, y_va)``;
    ``dropout`` is sampled but unused on that path.

    A failing PyTorch trial still returns ``inf`` so the study continues, but
    the failure is now printed instead of being silently discarded.
    """
    def _objective(trial: optuna.Trial):
        width        = trial.suggest_categorical("width",  [128, 256, 384, 512, 768, 1024])
        nlayer       = trial.suggest_int("nlayer", 2, 4)
        dropout      = trial.suggest_float("dropout", 0.0, 0.30)
        lr           = trial.suggest_float("lr", 1e-4, 5e-3, log=True)
        weight_decay = trial.suggest_float("weight_decay", 1e-6, 5e-4, log=True)
        batch        = trial.suggest_categorical("batch", [2048, 4096, 8192, 16384])
        epochs       = trial.suggest_int("epochs", 120, 320, step=40)
        patience     = trial.suggest_int("patience", 10, 35)

        if not _torch_ok():
            hidden = tuple([int(width)] * int(nlayer))
            mdl = Pipeline([
                ("scaler", StandardScaler()),
                ("mdl", MLPRegressor(
                    hidden_layer_sizes=hidden, activation="relu",
                    alpha=float(weight_decay),
                    learning_rate_init=float(lr),
                    batch_size=int(batch),
                    max_iter=int(epochs),
                    early_stopping=True, n_iter_no_change=int(patience),
                    random_state=42))
            ])
            mdl.fit(X_tr, y_tr)
            pred = mdl.predict(X_va)
            return _rmse(y_va, pred)
        try:
            _, _, rmse = _fit_torch_mlp(
                X_tr, y_tr, X_va, y_va,
                width, nlayer, dropout, lr, weight_decay, batch, epochs, patience,
                seed=42, verbose=False
            )
        except Exception as exc:
            # Score the trial as worst-possible so the study keeps going, but
            # report why it failed: a silent `inf` hides systematic errors
            # (e.g. a NameError in a helper) that make every trial fail.
            # print() is used because the notebook silences warnings globally.
            print(f"[obj_mlp_torch] trial {trial.number} failed: {type(exc).__name__}: {exc}")
            rmse = float("inf")
        return rmse
    return _objective

## 7) Model training and results logging

In [12]:
def run_one_dataset(
    X, y, X_std=None,
    groups: Optional[np.ndarray] = None,
    region_tag: Optional[str] = None,
    n_repeats: int = 1,
    xgb_trials: int = 40,
    cat_trials: int = 40,
    lgb_trials: int = 40,
    mlp_trials: int = 20,
    save_dir: str = "./runs"
) -> Tuple[pd.DataFrame, Dict[str, dict]]:
    """Tune and cross-validate XGBoost, CatBoost, LightGBM and an MLP on one dataset.

    Two scenarios are evaluated: "with_supervision" (shuffled 5-fold KFold) and,
    only when ``groups`` is given, "without_supervision" (5-fold GroupKFold).
    In each scenario hyperparameters are tuned once with Optuna on the first
    fold, then every fold is fitted and scored with those parameters.

    Args:
        X, y: features and target; converted to NumPy, ``y`` flattened to float.
        X_std: accepted but not referenced anywhere in this function.
        groups: group labels for the grouped scenario; must match ``len(y)``.
        region_tag: label written to the "region" column ("GLOBAL" if None) and
            used in the output file names ("global" if None/empty).
        n_repeats: number of times each scenario's splitter is re-split.
        xgb_trials, cat_trials, lgb_trials, mlp_trials: Optuna trial counts.
        save_dir: directory (created if missing) for the CSV log and JSON params.

    Returns:
        (logs_df, best_param_dict): one row per (scenario, rep, fold, model)
        with rmse/mae/r2, and the tuned parameters keyed by model name.

    Raises:
        ValueError: if ``groups`` is given with a length different from ``y``.

    Side effects:
        Writes ``logs_<tag>.csv`` and ``best_params_<tag>.json`` into ``save_dir``.
    """

    GPU_OK = _gpu_available()
    _make_dir(save_dir)
    X = _to_arr(X); y = _to_arr(y).reshape(-1).astype(float)
    n = len(y)

    records: List[dict] = []
    best_param_dict: Dict[str, dict] = {}

    def with_device_params(model_name: str, params: dict, prefer_gpu: bool = True) -> dict:
        """Return a copy of ``params`` with GPU settings added when GPU_OK is true.

        With ``prefer_gpu=False`` the copy is returned untouched.
        """
        p = dict(params)
        if not prefer_gpu:
            return p
        if model_name == "xgboost":
            # NOTE(review): newer XGBoost releases reportedly deprecate "gpu_hist"
            # in favour of device="cuda" -- confirm against the installed version.
            p["tree_method"] = "gpu_hist" if GPU_OK else p.get("tree_method", "hist")
        elif model_name == "lightgbm":
            if GPU_OK:
                p["device"] = "gpu"
        elif model_name == "catboost":
            if GPU_OK:
                p["task_type"] = "GPU"
                p.setdefault("devices", "0")
        return p

    def _tune_once_on_fold(X_tr, y_tr, X_va, y_va) -> Dict[str, dict]:
        """Run one seeded Optuna study per model on a single train/validation split.

        Returns a dict keyed by model name; each value holds the constructor
        "params" for the best trial and that trial's validation "rmse".
        """
        out: Dict[str, dict] = {}

        # XGB
        study = optuna.create_study(direction="minimize",
                                    sampler=optuna.samplers.TPESampler(seed=42),
                                    study_name="xgb_study")
        study.optimize(obj_xgb(X_tr, y_tr, X_va, y_va), n_trials=xgb_trials, show_progress_bar=False)
        bp = dict(study.best_trial.params)
        # Map Optuna's suggestion names (eta/lambda/alpha) to XGBRegressor keyword names.
        xgb_best = {
            "max_depth":        int(bp["max_depth"]),
            "learning_rate":    float(bp["eta"]),
            "min_child_weight": float(bp["min_child_weight"]),
            "subsample":        float(bp["subsample"]),
            "colsample_bytree": float(bp["colsample_bytree"]),
            "reg_lambda":       float(bp["lambda"]),
            "reg_alpha":        float(bp["alpha"]),
            "n_estimators":     1200,
            "tree_method":      "hist",
            "random_state":     42,
        }
        out["xgboost"] = {"params": xgb_best, "rmse": float(study.best_trial.value)}

        # Cat
        study = optuna.create_study(direction="minimize",
                                    sampler=optuna.samplers.TPESampler(seed=42),
                                    study_name="cat_study")
        study.optimize(obj_cat(X_tr, y_tr, X_va, y_va), n_trials=cat_trials, show_progress_bar=False)
        bp = dict(study.best_trial.params)
        cat_best = {
            "depth":         int(bp["depth"]),
            "learning_rate": float(bp["learning_rate"]),
            "l2_leaf_reg":   float(bp["l2_leaf_reg"]),
            "subsample":     float(bp["subsample"]),
            "iterations":    2000,
            "loss_function": "RMSE",
            "random_seed":   42,
            "verbose":       False,
        }
        out["catboost"] = {"params": cat_best, "rmse": float(study.best_trial.value)}

        # LGBM
        study = optuna.create_study(direction="minimize",
                                    sampler=optuna.samplers.TPESampler(seed=42),
                                    study_name="lgbm_study")
        study.optimize(obj_lgb(X_tr, y_tr, X_va, y_va), n_trials=lgb_trials, show_progress_bar=False)
        bp = dict(study.best_trial.params)
        lgb_best = {
            "learning_rate":    float(bp["learning_rate"]),
            "num_leaves":       int(bp["num_leaves"]),
            "min_data_in_leaf": int(bp["min_data_in_leaf"]),
            "feature_fraction": float(bp["feature_fraction"]),
            "bagging_fraction": float(bp["bagging_fraction"]),
            "lambda_l1":        float(bp["lambda_l1"]),
            "lambda_l2":        float(bp["lambda_l2"]),
            "n_estimators":     2000,
            "random_state":     42,
            "force_col_wise":   True,
            "verbosity":        -1,
        }
        out["lightgbm"] = {"params": lgb_best, "rmse": float(study.best_trial.value)}

        # MLP: parameters are stored in torch form when torch imports, else in
        # scikit-learn MLPRegressor form (marked with "_backend": "sklearn").
        study = optuna.create_study(direction="minimize",
                                    sampler=optuna.samplers.TPESampler(seed=42),
                                    study_name="mlp_study")
        study.optimize(obj_mlp_torch(X_tr, y_tr, X_va, y_va), n_trials=mlp_trials, show_progress_bar=False)
        bp = dict(study.best_trial.params)

        if _torch_ok():
            mlp_best = {
                "width":         int(bp["width"]),
                "nlayer":        int(bp["nlayer"]),
                "dropout":       float(bp.get("dropout", 0.0)),
                "lr":            float(bp["lr"]),
                "weight_decay":  float(bp.get("weight_decay", 1e-5)),
                "batch":         int(bp["batch"]),
                "epochs":        int(bp["epochs"]),
                "patience":      int(bp["patience"]),
            }
        else:
            hidden = tuple([int(bp["width"])] * int(bp["nlayer"]))
            mlp_best = {
                "hidden_layer_sizes": hidden,
                "alpha":              float(bp.get("weight_decay", 1e-5)),  
                "learning_rate_init": float(bp["lr"]),
                "batch_size":         int(bp["batch"]),
                "max_iter":           int(bp["epochs"]),
                "early_stopping":     True,
                "n_iter_no_change":   int(bp["patience"]),
                "random_state":       42,
                "_backend":           "sklearn",
            }
        out["mlp"] = {"params": mlp_best, "rmse": float(study.best_trial.value)}
        return out

    def _run_scenario(name: str, splitter):
        """Tune once on the scenario's first fold, then fit and score all four
        models on every fold of every repeat, appending rows to ``records``."""
        # NOTE(review): best_param_dict is reassigned per scenario, so when both
        # scenarios run, the returned/saved parameters are those of the last one.
        nonlocal best_param_dict
        tuned = False

        for rep in range(n_repeats):
            # NOTE(review): the same splitter object is re-split on every repeat;
            # with a fixed integer random_state the folds are presumably identical
            # across repeats -- confirm whether repeats are meant to differ.
            iterator = splitter.split(X, y, groups if name == "without_supervision" else None)
            for fold_id, (tr_idx, va_idx) in enumerate(iterator):
                X_tr, y_tr = X[tr_idx], y[tr_idx]
                X_va, y_va = X[va_idx], y[va_idx]

                # Tuning uses this fold's validation split, and the same fold is
                # scored below, so its metrics are selected on the data they report.
                if not tuned:
                    best_param_dict = _tune_once_on_fold(X_tr, y_tr, X_va, y_va)
                    tuned = True

                xgb_params = with_device_params("xgboost",  best_param_dict["xgboost"]["params"])
                cat_params = with_device_params("catboost", best_param_dict["catboost"]["params"])
                lgb_params = with_device_params("lightgbm", best_param_dict["lightgbm"]["params"])

                # Torch-format params -> TorchMLPRegressor; sklearn-format -> Pipeline.
                if _torch_ok() and "_backend" not in best_param_dict["mlp"]["params"]:
                    mlp_model = TorchMLPRegressor(**best_param_dict["mlp"]["params"])
                else:
                    mlp_model = Pipeline([("scaler", StandardScaler()),
                                          ("mdl", MLPRegressor(**{k: v for k, v in best_param_dict["mlp"]["params"].items()
                                                                  if k != "_backend"}))])

                models = {
                    "xgboost":  XGBRegressor(**xgb_params),
                    "catboost": CatBoostRegressor(**cat_params),
                    "lightgbm": LGBMRegressor(**lgb_params),
                    "mlp":      mlp_model,
                }

                for mname, mdl in models.items():
                    try:
                        mdl.fit(X_tr, y_tr)
                    except Exception:
                        # Any fit failure triggers a rebuild without GPU settings
                        # (or, for the torch MLP, a scikit-learn equivalent) and
                        # one retry; the original exception is not reported.
                        if mname == "xgboost":
                            cpu = dict(xgb_params); cpu["tree_method"] = "hist"
                            mdl = XGBRegressor(**cpu)
                        elif mname == "lightgbm":
                            cpu = dict(lgb_params); cpu.pop("device", None)
                            mdl = LGBMRegressor(**cpu)
                        elif mname == "catboost":
                            cpu = dict(cat_params); cpu.pop("task_type", None); cpu.pop("devices", None)
                            mdl = CatBoostRegressor(**cpu)
                        elif mname == "mlp":
                            # When torch is not importable the model is already the
                            # scikit-learn pipeline, so the same object is refit.
                            if _torch_ok():
                                p = best_param_dict["mlp"]["params"]
                                hidden = tuple([int(p["width"])] * int(p["nlayer"]))
                                mdl = Pipeline([
                                    ("scaler", StandardScaler()),
                                    ("mdl", MLPRegressor(
                                        hidden_layer_sizes=hidden, activation="relu",
                                        alpha=float(p.get("weight_decay", 1e-5)),
                                        learning_rate_init=float(p["lr"]),
                                        batch_size=int(p["batch"]),
                                        max_iter=int(p["epochs"]),
                                        early_stopping=True,
                                        n_iter_no_change=int(p["patience"]),
                                        random_state=42))
                                ])
                            
                        mdl.fit(X_tr, y_tr)

                    pred = mdl.predict(X_va).reshape(-1)
                    rmse = _rmse(y_va, pred)
                    mae  = float(mean_absolute_error(y_va, pred))
                    r2   = float(r2_score(y_va, pred))
                    records.append({
                        "scenario": name,
                        "region": region_tag if region_tag is not None else "GLOBAL",
                        "rep": rep,
                        "fold": fold_id,
                        "model": mname,
                        "rmse": rmse,
                        "mae": mae,
                        "r2":  r2,
                    })

    # with supervision
    sup_splitter = KFold(n_splits=5, shuffle=True, random_state=42)
    _run_scenario("with_supervision", sup_splitter)

    # without supervision (grouped)
    if groups is not None:
        groups = _to_arr(groups).reshape(-1)
        if len(groups) != n:
            raise ValueError("groups length mismatch with y.")
        grp_split = GroupKFold(n_splits=5)
        _run_scenario("without_supervision", grp_split)
    else:
        print("No groups provided; skipping 'without_supervision' scenario.")

    # Persist the per-fold metrics and the tuned parameters.
    logs_df = pd.DataFrame.from_records(records)
    logs_path   = os.path.join(save_dir, f"logs_{region_tag or 'global'}.csv")
    params_path = os.path.join(save_dir, f"best_params_{region_tag or 'global'}.json")
    logs_df.to_csv(logs_path, index=False)
    with open(params_path, "w") as f:
        json.dump(best_param_dict, f, indent=2)

    return logs_df, best_param_dict

## 8) Preliminary data preprocessing

In [13]:
# Input file and output directory for this run.
DATA_PATH = "./data/global_clean.csv"
SAVE_DIR = "./runs_demo"
os.makedirs(SAVE_DIR, exist_ok=True)

df = pd.read_csv(DATA_PATH)

# Columns removed up front whenever they are present in the file.
drop_cols_hard = [
    'Unnamed: 0', 'ij_grid', 'i_grid', 'j_grid', 'x_proj', 'y_proj',
    'survey_id', 'date', 'rgi_id',
    'consensus_ice_thickness', 'millan_ice_thickness', 'itslive_v', 'hugonnet_dhdt',
    'glacier_length', 'glacier_area_km2', 'glacier_oggm_volume',
    'glacier_min_elev', 'glacier_max_elev', 'glacier_median_elev', 'glacier_outline_year',
    'lin_mb_above_z', 'oggm_mb_above_z', 'thickness_uncertainty',
]
# errors="ignore" skips any listed column that the file does not contain.
df = df.drop(columns=drop_cols_hard, errors="ignore")

TARGET_COL = "thickness"
if "RGI" in df.columns:
    RGI_COL = "RGI"
else:
    RGI_COL = "region"
if "glacier_id" in df.columns:
    ID_COL = "glacier_id"
else:
    ID_COL = None

# Non-feature columns: target, region label, coordinates/time, and the id if any.
# NOTE(review): this set names "lat"/"lon", but the feature preview printed by
# this cell lists 'latitude' and 'longitude' among the features -- if the
# coordinates were meant to be excluded, these names do not match; confirm.
drop_cols = {TARGET_COL, RGI_COL, "lat", "lon", "year", "date"} | ({ID_COL} if ID_COL else set())
num_df = df.select_dtypes(include=[np.number])
feat_cols = [name for name in num_df.columns if name not in drop_cols]

X = df[feat_cols].astype(float).copy()
y = df[TARGET_COL].astype(float).copy()
regions = df[RGI_COL].astype(str).copy()
if ID_COL:
    groups_glacier = df[ID_COL].astype(str).copy()
else:
    groups_glacier = None

# Standardized copy of the features, fitted on all rows.
scaler = StandardScaler().fit(X)
X_std = pd.DataFrame(scaler.transform(X), columns=feat_cols, index=X.index)

print(f"Samples: {len(df)} | Features: {len(feat_cols)}")
print("Feature preview:", feat_cols[:10])

Samples: 284558 | Features: 18
Feature preview: ['latitude', 'longitude', 'topo', 'topo_smoothed', 'slope', 'slope_factor', 'aspect', 'dis_from_border', 'catchment_area', 'millan_v']


## 9) Model training and tuning

In [4]:
# Tune and cross-validate all models on the full dataset (no grouped scenario).
logs_global, best_params_global = run_one_dataset(
    X,
    y,
    X_std=None,
    groups=None,
    region_tag=None,
    n_repeats=1,
    xgb_trials=80,
    cat_trials=100,
    lgb_trials=100,
    mlp_trials=40,
    save_dir=SAVE_DIR,
)
# Mean and spread of the fold-level RMSE for each (scenario, model) pair.
display(
    logs_global
    .groupby(["scenario", "model"])["rmse"]
    .agg(["mean", "std"])
    .round(5)
)

[I 2025-09-02 11:20:20,550] A new study created in memory with name: xgb_study
[I 2025-09-02 11:20:26,153] Trial 0 finished with value: 38.02684822580678 and parameters: {'max_depth': 9, 'eta': 0.17254716573280354, 'min_child_weight': 9.051933359925457, 'subsample': 0.8394633936788146, 'colsample_bytree': 0.6624074561769746, 'lambda': 0.000602521573620386, 'alpha': 2.231010801867921e-06}. Best is trial 0 with value: 38.02684822580678.
[I 2025-09-02 11:20:42,807] Trial 1 finished with value: 35.50529089012883 and parameters: {'max_depth': 13, 'eta': 0.06054365855469246, 'min_child_weight': 8.788798355756501, 'subsample': 0.608233797718321, 'colsample_bytree': 0.9879639408647978, 'lambda': 1.452824663751602, 'alpha': 1.879466824163847e-05}. Best is trial 1 with value: 35.50529089012883.
[I 2025-09-02 11:20:46,302] Trial 2 finished with value: 59.37419360088611 and parameters: {'max_depth': 7, 'eta': 0.017322667470546258, 'min_child_weight': 4.346664672554915, 'subsample': 0.8099025726528

No groups provided; skipping 'without_supervision' scenario.


Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std
scenario,model,Unnamed: 2_level_1,Unnamed: 3_level_1
with_supervision,catboost,39.47228,0.45796
with_supervision,lightgbm,35.75134,0.4054
with_supervision,mlp,107.98805,12.40498
with_supervision,xgboost,35.37767,0.49433
