Solution generated by https://github.com/bogoconic1/Qgentic-AI

This result is probably suboptimal because a codebase bug terminated the pipeline midway.

In [None]:
# coding: utf-8
# MobileNetV4 Hybrid-Medium (11.1M) + CatBoost 1.2.5 for CSIRO biomass
# v9: 5-fold grouped CV for image branch with FiveCrop+hflip TTA, OOF nested affine calibration (slope-constrained, no-harm guard),
#     fold ensembling; optional CatBoost per fold with Optuna (FULL mode, fold 0) and late fusion when test has metadata.
# Logging -> ./code_1_7_v9.txt (OUTPUT_DIR is the current working directory)
# Submission -> ./submission.csv (FULL mode only)

import os, sys, time, math, random, logging
from pathlib import Path
from typing import Tuple, List, Dict, Optional, NamedTuple

# ---------------------------------------------------------------------
# Paths and logging (v9)
# Input directory holding train.csv, test.csv and the images they reference.
BASE_DIR = Path("/kaggle/input/csiro-biomass")
# All artifacts (log file, submission) are written to the current working directory.
OUTPUT_DIR = Path(".")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
LOG_PATH = OUTPUT_DIR / "code_1_7_v9.txt"
SUB_PATH = OUTPUT_DIR / "submission.csv"

# Records emitted through the `logging` module go to LOG_PATH; the progress
# messages below use print() and therefore go to stdout instead.
logging.basicConfig(
    filename=str(LOG_PATH),
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
)
print("==== Script start (v9: 5-fold OOF nested affine calibration + fold ensemble) ====")
print(f"Base: {BASE_DIR.resolve()} | Out: {OUTPUT_DIR.resolve()}")
print(f"Log: {LOG_PATH.resolve()} | Submission (FULL only): {SUB_PATH.resolve()}")

# Determinism
def set_seed(seed: int = 42):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Value passed to every random number generator.
    """
    import numpy as np
    import torch

    # Same seeding order as before: Python, NumPy, torch CPU, then all CUDA devices.
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all)
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

# Optional token read from the environment; only its presence is printed, never the value.
HF_TOKEN = os.environ.get("HF_TOKEN", None)
print(f"HF_TOKEN present: {bool(HF_TOKEN)} (value not logged)")

# ---------------------------------------------------------------------
# Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedGroupKFold, KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.transforms import InterpolationMode
from PIL import Image, ImageOps

import timm
from timm.data import resolve_data_config, create_transform

from catboost import CatBoostRegressor, Pool
import optuna
import catboost

# Environment report
# Report library versions; getattr guards against builds that lack __version__.
print(f"timm version: {getattr(timm, '__version__', 'unknown')}")
print(f"CatBoost version: {getattr(catboost, '__version__', 'unknown')} (require ~1.2.5)")
# Global torch device used by the CNN training and inference code below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    print(f"CUDA: True | GPU: {torch.cuda.get_device_name(0)} | VRAM≈{torch.cuda.get_device_properties(0).total_memory/1e9:.2f} GB")
else:
    print("CUDA: False | Running CNN on CPU (AMP disabled).")

# ---------------------------------------------------------------------
# Constants
SEED = 42  # shared seed for set_seed(), the CV splitters and CatBoost
N_FOLDS = 5  # number of StratifiedGroupKFold folds
# Regression targets, in the column order used for every prediction matrix.
TARGETS = ["Dry_Green_g", "Dry_Dead_g", "Dry_Clover_g", "GDM_g", "Dry_Total_g"]
# Per-target weights (same order as TARGETS) used by the weighted R² metric and the WN-MSE loss.
TARGET_WEIGHTS = np.array([0.1, 0.1, 0.1, 0.2, 0.5], dtype=np.float32)
IMG_SIZE = 224  # square input resolution fed to the CNN
BATCH_SIZE_FULL = 32
BATCH_SIZE_DEBUG = 16
EPOCHS_FULL = 25
EPOCHS_DEBUG = 1
DEFAULT_IMAGE_WEIGHT = 0.07  # used when metadata available
print("Task: multi-output regression; metric: weighted R² across 5 outputs; training aligned via WN-MSE/label scaling.")

# ---------------------------------------------------------------------
# Dataset and transforms
class PadToSquare:
    """Transform that pads a PIL image with a constant fill until it is square."""

    def __init__(self, fill=0):
        self.fill = fill

    def __call__(self, img: Image.Image) -> Image.Image:
        """Return ``img`` unchanged if already square, otherwise a padded copy."""
        width, height = img.size
        gap = abs(width - height)
        if gap == 0:
            return img
        # Split the missing pixels between the two sides; any odd pixel goes
        # to the second side (bottom or right).
        first_side = gap // 2
        second_side = gap - first_side
        if width > height:
            # Landscape: pad top and bottom (border is left, top, right, bottom).
            border = (0, first_side, 0, second_side)
        else:
            # Portrait: pad left and right.
            border = (first_side, 0, second_side, 0)
        return ImageOps.expand(img, border=border, fill=self.fill)

def make_train_transform(model, input_px: int = IMG_SIZE):
    """Build the training transform: pad to square, then timm's training pipeline.

    The timm data config resolved from ``model`` is reused, with the input size
    overridden to ``input_px`` x ``input_px``. Only horizontal flips (p=0.5) are
    enabled; vertical flip, auto-augment and random erasing are switched off.
    """
    train_cfg = dict(resolve_data_config({}, model=model))
    train_cfg["input_size"] = (3, input_px, input_px)
    timm_pipeline = create_transform(
        is_training=True,
        hflip=0.5,
        vflip=0.0,
        auto_augment=None,
        re_prob=0.0,
        **train_cfg,
    )
    return transforms.Compose([PadToSquare(fill=0), timm_pipeline])

def make_eval_transform(model, input_px: int = IMG_SIZE):
    """Build the evaluation transform: pad to square, then timm's eval pipeline.

    The timm data config resolved from ``model`` is reused, with the input size
    overridden to ``input_px`` x ``input_px`` and all augmentation disabled.
    """
    eval_cfg = dict(resolve_data_config({}, model=model))
    eval_cfg["input_size"] = (3, input_px, input_px)
    timm_pipeline = create_transform(
        is_training=False,
        hflip=0.0,
        vflip=0.0,
        auto_augment=None,
        re_prob=0.0,
        **eval_cfg,
    )
    return transforms.Compose([PadToSquare(fill=0), timm_pipeline])

def get_fivecrop_preprocess(model, input_px: int = IMG_SIZE):
    """Return the two transform stages used around ``FiveCrop`` at inference.

    Returns:
        A ``(preprocess, norm)`` pair. ``preprocess`` pads a PIL image to square
        and resizes it to ``round(input_px / crop_pct)`` so crops of ``input_px``
        can be taken from it; ``norm`` converts one PIL crop to a tensor
        normalized with the model's mean/std.
    """
    cfg = resolve_data_config({}, model=model)
    mean, std = cfg['mean'], cfg['std']
    # crop_pct comes from the timm config; 0.875 is the fallback when it is absent.
    resize_px = int(round(input_px / float(cfg.get('crop_pct', 0.875))))

    # NOTE: the resize always uses bilinear interpolation, whatever the timm config says.
    preprocess = transforms.Compose([
        PadToSquare(fill=0),
        transforms.Resize(resize_px, interpolation=InterpolationMode.BILINEAR),
    ])
    norm = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std),
    ])
    return preprocess, norm

class BiomassImageDataset(Dataset):
    """Image dataset over a wide (one row per image) biomass table.

    Each item is ``(image_tensor, targets)`` when ``y_cols`` is given, otherwise
    ``(image_tensor, image_path_value)``.
    """

    def __init__(self, df_wide: pd.DataFrame, image_root: Path, img_col: str, y_cols, transform, is_train: bool):
        # Positional access below relies on a clean 0..N-1 index.
        self.df = df_wide.reset_index(drop=True)
        self.image_root = image_root
        self.img_col = img_col
        self.y_cols = y_cols
        self.transform = transform
        self.is_train = is_train  # stored only; not read by this class

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        rel_path = record[self.img_col]
        image = Image.open(self.image_root / rel_path).convert("RGB")
        tensor = self.transform(image)
        if self.y_cols is None:
            # Unlabelled mode: hand back the path so predictions can be keyed by it.
            return tensor, rel_path
        labels = record[self.y_cols].values.astype(np.float32)
        return tensor, torch.tensor(labels)

# ---------------------------------------------------------------------
# Loss and metrics
class WeightedNormalizedMSELoss(nn.Module):
    """Weighted sum over targets of the batch MSE divided by a per-target constant.

    Args:
        sst: Per-target normalizer (callers pass the total sum of squares of
            the training targets).
        weights: Per-target weights applied to the normalized errors.
    """

    def __init__(self, sst: np.ndarray, weights: np.ndarray):
        super().__init__()
        # Buffers follow the module across .to(device) without being trained.
        for buffer_name, values in (("den", sst), ("w", weights)):
            self.register_buffer(buffer_name, torch.tensor(values.astype(np.float32)))

    def forward(self, preds: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
        squared_error = (preds - targets) ** 2
        per_target_mse = torch.sum(squared_error, dim=0) / preds.shape[0]
        # The epsilon keeps the division finite for a constant target.
        normalized = per_target_mse / (self.den + 1e-9)
        return torch.sum(self.w * normalized)

def weighted_r2(y_true: np.ndarray, y_pred: np.ndarray, weights: np.ndarray = TARGET_WEIGHTS) -> Tuple[float, Dict[str, float]]:
    """Compute the weighted sum of per-target R² scores.

    Args:
        y_true: Ground truth, one column per entry of ``TARGETS``.
        y_pred: Predictions with the same layout as ``y_true``.
        weights: Per-target weights, in ``TARGETS`` order.

    Returns:
        ``(score, per_target)`` where ``per_target`` maps target name to its R².
    """
    scores = [r2_score(y_true[:, col], y_pred[:, col]) for col in range(len(TARGETS))]
    per_map = dict(zip(TARGETS, (float(s) for s in scores)))
    weighted_total = float(np.sum(np.array(scores, dtype=float) * weights))
    return weighted_total, per_map

def compute_sst(y: np.ndarray) -> np.ndarray:
    """Return the total sum of squares around the mean for each column of ``y``.

    The result is a float64 vector with one entry per column.
    """
    # Iterating the transpose visits the columns one at a time.
    per_column = [np.sum((column - column.mean()) ** 2) for column in y.T]
    return np.array(per_column, dtype=np.float64)

def make_composite_strata(df_wide: pd.DataFrame) -> np.ndarray:
    """Build integer stratification labels from state, sampling month and target decile.

    Each row gets the composite key ``"<State>_<month>_<decile of Dry_Total_g>"``;
    the distinct keys are then mapped to integer codes.

    Args:
        df_wide: Wide training table. Must contain ``Dry_Total_g``; ``State`` and
            ``Sampling_Date`` are optional.

    Returns:
        1-D integer array with one stratum code per row of ``df_wide``.
    """
    if "State" in df_wide.columns:
        state = df_wide["State"]
    else:
        state = pd.Series(["UNK"] * len(df_wide), index=df_wide.index)
    # Replace missing values BEFORE casting to str: astype(str) turns NaN/None
    # into the literal strings "nan"/"None", after which fillna() has nothing
    # left to fill and missing states never become "UNK".
    s = state.astype(object).where(state.notna(), "UNK").astype(str)

    sd_series = df_wide.get("Sampling_Date", pd.Series(np.nan, index=df_wide.index))
    # Unparseable or missing dates fall into month 0.
    m = pd.to_datetime(sd_series, errors="coerce").dt.month.fillna(0).astype(int)

    y_total = df_wide["Dry_Total_g"].astype(float).values
    qs = np.quantile(y_total, np.linspace(0, 1, 11))
    # The interior decile edges give bin indices 0..9.
    dec = np.digitize(y_total, qs[1:-1], right=True)

    comp = s + "_" + m.astype(str) + "_" + dec.astype(str)
    _, inv = np.unique(comp, return_inverse=True)
    return inv

# ---------------------------------------------------------------------
# Tabular features (for CatBoost when metadata is available)
def prepare_long_to_wide(train_long: pd.DataFrame) -> pd.DataFrame:
    """Pivot the long training table (one row per image/target) to one row per image.

    The result carries whichever metadata columns are present plus one column
    per entry of ``TARGETS``; rows missing any target value are dropped.
    """
    candidate_meta = ['sample_id', 'image_path', 'Sampling_Date', 'State', 'Species', 'Pre_GSHH_NDVI', 'Height_Ave_cm']
    present_meta = [col for col in candidate_meta if col in train_long.columns]

    # One column per target name, one row per image.
    targets_wide = train_long.pivot_table(
        index='image_path', columns='target_name', values='target', aggfunc='first'
    ).reset_index()

    # Metadata repeats on every target row; keep a single row per image.
    unique_rows = train_long[present_meta].drop_duplicates(subset=['image_path'])
    per_image_meta = unique_rows.groupby('image_path').first().reset_index()

    wide = pd.merge(per_image_meta, targets_wide, on='image_path', how='inner')
    for target in TARGETS:
        if target not in wide.columns:
            # A target absent from the data becomes all-NaN, so dropna removes every row.
            wide[target] = np.nan
    return wide.dropna(subset=TARGETS)

def build_tabular_features(df: pd.DataFrame, encoders: Dict[str, any], fit: bool, y_cols: List[str]) -> Tuple[pd.DataFrame, Dict[str, any]]:
    """Build the tabular feature matrix from image metadata.

    Features: numeric height/NDVI (median-imputed) with squares and their
    product, optional per-target species target encodings and species
    frequency, and one-hot encoded ``State`` and ``Month``.

    Args:
        df: Table with ``Sampling_Date``, ``State`` and ``Species`` columns;
            ``Height_Ave_cm`` and ``Pre_GSHH_NDVI`` are optional.
        encoders: Encoder state. Reads ``species_te``, ``global_means`` and
            ``species_freq`` when present; reads ``ohe`` when ``fit`` is False
            and stores a freshly fitted one-hot encoder there when ``fit`` is True.
        fit: Whether to fit the one-hot encoder on ``df``.
        y_cols: Target names, in the order matching ``encoders['global_means']``.

    Returns:
        ``(X_final, encoders)`` where ``X_final`` shares ``df``'s index.

    Raises:
        KeyError: If ``fit`` is False and ``encoders`` has no ``'ohe'`` entry, or
            a required column is missing from ``df``.
    """

    def _as_category(col: pd.Series) -> pd.Series:
        # Fill BEFORE casting: astype(str) maps NaN/None to the literal strings
        # "nan"/"None", after which fillna("UNK") can no longer match anything.
        return col.astype(object).where(col.notna(), "UNK").astype(str)

    def _numeric_or_zero(name: str) -> pd.Series:
        # A missing column becomes zeros built on df.index, so the assignment
        # into X aligns even when df does not carry a default RangeIndex.
        if name not in df.columns:
            return pd.Series(0.0, index=df.index)
        # Coerce first so the median is computed on numbers only.
        values = pd.to_numeric(df[name], errors='coerce')
        return values.fillna(values.median())

    X = pd.DataFrame(index=df.index)
    X['Month'] = pd.to_datetime(df['Sampling_Date'], errors='coerce').dt.month.fillna(0).astype(int)
    X['State'] = _as_category(df['State'])
    X['Height_Ave_cm'] = _numeric_or_zero('Height_Ave_cm')
    X['Pre_GSHH_NDVI'] = _numeric_or_zero('Pre_GSHH_NDVI')
    X['Height2'] = X['Height_Ave_cm'] ** 2
    X['NDVI2'] = X['Pre_GSHH_NDVI'] ** 2
    X['H_x_N'] = X['Height_Ave_cm'] * X['Pre_GSHH_NDVI']

    species = _as_category(df['Species'])
    te_dict = encoders.get('species_te', None)
    global_means = encoders.get('global_means', None)
    if te_dict is not None and global_means is not None:
        for i, t in enumerate(y_cols):
            # Species not seen when the encoding was built fall back to the global mean.
            X[f"Species_TE_{t}"] = species.map(te_dict.get(t, {})).fillna(global_means[i])
    freq_map = encoders.get('species_freq', None)
    if freq_map is not None:
        X['Species_freq'] = species.map(freq_map).fillna(0.0)

    if fit:
        ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        ohe.fit(X[['State', 'Month']])
        encoders['ohe'] = ohe
    else:
        ohe = encoders['ohe']
    ohe_mat = ohe.transform(X[['State', 'Month']])
    ohe_cols = [f"OHE_{c}" for c in ohe.get_feature_names_out(['State', 'Month'])]
    Xo = pd.DataFrame(ohe_mat, columns=ohe_cols, index=X.index)
    # The raw categorical columns are replaced by their one-hot expansion.
    X_final = pd.concat([X.drop(columns=['State', 'Month']), Xo], axis=1)
    return X_final, encoders

def compute_species_encodings(df_train: pd.DataFrame, y_cols: List[str]) -> Dict[str, any]:
    """Compute species-level encodings from a training table.

    Returns:
        A dict with ``species_te`` (target -> {species: mean target}),
        ``global_means`` (float32 array of per-target means, in ``y_cols``
        order) and ``species_freq`` ({species: relative frequency}). When
        ``df_train`` has no ``Species`` column the mappings are empty and the
        global means are zeros.
    """
    if 'Species' not in df_train.columns:
        print("Species missing; TE/freq encodings empty.")
        return {
            'species_te': {},
            'global_means': np.zeros(len(y_cols), dtype=np.float32),
            'species_freq': {},
        }

    te = {t: df_train.groupby('Species')[t].mean().to_dict() for t in y_cols}
    overall = [df_train[t].mean() for t in y_cols]
    freq = df_train['Species'].value_counts(normalize=True)
    enc = {
        'species_te': te,
        'global_means': np.array(overall, dtype=np.float32),
        'species_freq': freq.to_dict(),
    }
    print(f"Species TE sizes: { {t:len(te[t]) for t in y_cols} } | #species: {len(freq)}")
    return enc

# ---------------------------------------------------------------------
# Model utilities
def get_param_groups(model: nn.Module, base_lr_backbone: float, lr_head: float) -> List[dict]:
    """Split parameters into backbone and head optimizer groups by name.

    A parameter goes to the head group when its name contains any of
    ``'classifier'``, ``'head.fc'``, ``'head'`` or ``'fc'``; everything else
    goes to the backbone group.

    Returns:
        ``[backbone_group, head_group]``, each a dict with ``params`` and ``lr``.
    """
    head_markers = ('classifier', 'head.fc', 'head', 'fc')
    backbone, head = [], []
    for param_name, param in model.named_parameters():
        is_head = any(marker in param_name for marker in head_markers)
        (head if is_head else backbone).append(param)
    return [
        {"params": backbone, "lr": base_lr_backbone},
        {"params": head, "lr": lr_head},
    ]

def partial_unfreeze_mnv4(model: nn.Module):
    """Freeze the whole model, then re-enable gradients for the head and late layers.

    Head parameters (name contains ``'classifier'``, ``'head.fc'`` or ``'head'``)
    are always unfrozen. If any parameter name contains ``'stages.'``, parameters
    whose name starts with ``stages.3``, ``stages.4`` or ``stages.5`` are unfrozen
    too; otherwise the last 30% of parameters in registration order are.

    Returns:
        The same ``model`` instance, modified in place.
    """
    model.requires_grad_(False)
    named = list(model.named_parameters())
    unfrozen = []

    def _enable(param_name, param):
        param.requires_grad = True
        unfrozen.append(param_name)

    head_markers = ('classifier', 'head.fc', 'head')
    for param_name, param in named:
        if any(marker in param_name for marker in head_markers):
            _enable(param_name, param)

    if any('stages.' in param_name for param_name, _ in named):
        late_prefixes = tuple(f"stages.{k}" for k in ('3', '4', '5'))
        for param_name, param in named:
            if param_name.startswith(late_prefixes):
                _enable(param_name, param)
    else:
        # No stage naming to go by: fall back to the trailing 30% of parameters.
        cutoff = int(len(named) * 0.7)
        for param_name, param in named[cutoff:]:
            _enable(param_name, param)

    print(f"[CNN] Unfrozen params: {len(unfrozen)} (sample: {unfrozen[:8]})")
    return model

def freeze_batchnorm_layers(m: nn.Module):
    """Put ``m`` in eval mode if it is a batch-norm layer (for use with ``Module.apply``)."""
    batchnorm_types = (nn.BatchNorm1d, nn.BatchNorm2d, nn.SyncBatchNorm)
    if not isinstance(m, batchnorm_types):
        return
    m.eval()

# ---------------------------------------------------------------------
# CNN training (single fold)
def train_cnn_fold(train_df: pd.DataFrame, val_df: pd.DataFrame, sst: np.ndarray,
                   epochs: int, base_lr_backbone: float, lr_head: float, weight_decay: float, debug: bool):
    """Fine-tune one MobileNetV4 hybrid-medium regressor on a single CV fold.

    The model is partially unfrozen with batch-norm layers kept in eval mode,
    trained with the weighted normalized MSE loss, and scored after every
    epoch with a single-view weighted R² on ``val_df``. The weights of the
    best-scoring epoch are restored before returning.

    Args:
        train_df: Wide training rows for this fold.
        val_df: Wide validation rows for this fold.
        sst: Per-target normalizer passed to the loss.
        epochs: Number of training epochs.
        base_lr_backbone: Learning rate of the backbone parameter group.
        lr_head: Learning rate of the head parameter group.
        weight_decay: AdamW weight decay.
        debug: Selects the debug batch size and disables the epoch-1 NaN abort.

    Returns:
        ``(model, info)`` where ``info`` holds ``best_val_wr2``, ``best_epoch``
        and ``train_minutes``.
    """
    model_name = 'mobilenetv4_hybrid_medium.e500_r224_in1k'
    print(f"Creating timm model: {model_name} (pretrained=True, num_classes=5)")
    model = timm.create_model(
        model_name,
        pretrained=True,
        # Pretrained weights are loaded from a local file rather than downloaded.
        pretrained_cfg_overlay=dict(file='/kaggle/input/timm-mobilenet-v4/model.safetensors'),
        num_classes=5
    )
    model = partial_unfreeze_mnv4(model)
    model.apply(freeze_batchnorm_layers)
    model.to(device)

    tr_tf = make_train_transform(model, input_px=IMG_SIZE)
    va_tf = make_eval_transform(model, input_px=IMG_SIZE)
    ds_tr = BiomassImageDataset(train_df, BASE_DIR, 'image_path', TARGETS, transform=tr_tf, is_train=True)
    ds_va = BiomassImageDataset(val_df, BASE_DIR, 'image_path', TARGETS, transform=va_tf, is_train=False)
    bs = (BATCH_SIZE_DEBUG if debug else BATCH_SIZE_FULL)
    # With fewer than two samples drop_last would leave no batch at all.
    if len(ds_tr) < 2:
        bs, drop_last = 1, False
    else:
        drop_last = True
    dl_tr = DataLoader(ds_tr, batch_size=bs, shuffle=True, num_workers=4, pin_memory=True, drop_last=drop_last)
    dl_va = DataLoader(ds_va, batch_size=bs, shuffle=False, num_workers=4, pin_memory=True, drop_last=False)
    print(f"[CNN] TrainLoader: batches={len(dl_tr)}, bs={bs}, drop_last={drop_last} | ValLoader={len(dl_va)}")

    use_amp = torch.cuda.is_available()
    optimizer = torch.optim.AdamW(get_param_groups(model, base_lr_backbone, lr_head), weight_decay=weight_decay, betas=(0.9, 0.999), eps=1e-8)
    total_steps = max(1, epochs * max(1, len(dl_tr)))
    # The scheduler is only stepped once this many optimizer steps have passed;
    # until then the learning rates stay at their initial values.
    warmup_steps = max(1, int(0.08 * total_steps))
    min_lr_ratio = 0.1
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=max(1, total_steps - warmup_steps), eta_min=base_lr_backbone * min_lr_ratio)
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
    criterion = WeightedNormalizedMSELoss(sst=sst, weights=TARGET_WEIGHTS).to(device)

    best_val, best_epoch = -1e9, -1
    best_state = None
    nan_flagged = False
    start_time = time.time()
    global_step = 0
    first_bs, last_bs = None, None

    for epoch in range(epochs):
        # model.train() flips batch-norm back to train mode, so re-freeze it every epoch.
        model.train(); model.apply(freeze_batchnorm_layers)
        tr_loss = 0.0
        if len(dl_tr) == 0: logging.warning("[CNN] No training batches; skipping epoch.")
        for i, (xb, yb) in enumerate(dl_tr):
            if first_bs is None: first_bs = xb.size(0)
            last_bs = xb.size(0)
            xb, yb = xb.to(device, non_blocking=True), yb.to(device, non_blocking=True)
            optimizer.zero_grad(set_to_none=True)
            with torch.cuda.amp.autocast(enabled=use_amp):
                preds = model(xb); loss = criterion(preds, yb)
            scaler.scale(loss).backward()
            # Gradients are still multiplied by the AMP loss scale at this point.
            # Unscale them first so the max-norm of 1.0 applies to the true
            # gradients (a no-op when the scaler is disabled).
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optimizer); scaler.update()
            if global_step >= warmup_steps: scheduler.step()
            tr_loss += float(loss.item()); global_step += 1
        tr_loss = tr_loss / max(1, len(dl_tr))
        if first_bs is not None:
            print(f"[CNN] Epoch {epoch+1}: first/last batch sizes {first_bs}/{last_bs}")

        # Quick val (single pass; final val uses TTA predictor outside)
        model.eval()
        preds_all, ys_all = [], []
        with torch.no_grad():
            for xb, yb in dl_va:
                xb = xb.to(device, non_blocking=True)
                with torch.cuda.amp.autocast(enabled=use_amp):
                    pb = model(xb).detach().float().cpu().numpy()
                preds_all.append(pb); ys_all.append(yb.numpy())
        if len(preds_all) > 0:
            y_pred = np.concatenate(preds_all, axis=0); y_true = np.concatenate(ys_all, axis=0)
            val_wr2, _ = weighted_r2(y_true, y_pred, TARGET_WEIGHTS)
        else:
            val_wr2 = float("nan")
        print(f"[CNN] Epoch {epoch+1}/{epochs} | train_loss(WN-MSE): {tr_loss:.6f} | quick_val_weighted_R2: {val_wr2:.6f}")

        if not debug and epoch == 0 and (math.isnan(tr_loss) or math.isnan(val_wr2)):
            print("NaN after epoch 1 (FULL). Stopping CNN training and proceeding to inference.")
            nan_flagged = True; break

        if (not math.isnan(val_wr2)) and val_wr2 > best_val:
            best_val = val_wr2; best_epoch = epoch + 1
            # Force a real copy: on a CPU model `.cpu()` returns the live tensor,
            # so later epochs would silently overwrite the snapshot.
            best_state = {k: v.detach().to("cpu", copy=True) for k, v in model.state_dict().items()}

    train_time = time.time() - start_time
    if not nan_flagged and best_epoch > 0 and best_state is not None:
        model.load_state_dict(best_state, strict=True)
    print(f"[CNN] Best quick-val weighted R²: {best_val:.6f} at epoch {best_epoch} | train time: {train_time/60:.1f} min")
    return model, {"best_val_wr2": best_val, "best_epoch": best_epoch, "train_minutes": train_time/60.0}

# ---------------------------------------------------------------------
# TTA predictors (FiveCrop + hflip)
def predict_cnn_views(model, imgs: torch.Tensor) -> torch.Tensor:
    """Average the model output over a batch and its horizontally flipped copy.

    ``imgs`` is flipped along dimension 3; autocast is enabled only when CUDA
    is available.
    """
    amp_enabled = torch.cuda.is_available()
    with torch.cuda.amp.autocast(enabled=amp_enabled):
        plain = model(imgs)
        mirrored = model(imgs.flip(3))
        return 0.5 * (plain + mirrored)

def predict_cnn_fivecrop(model: nn.Module, df_wide: pd.DataFrame, input_px: int = IMG_SIZE) -> np.ndarray:
    """Predict with FiveCrop + horizontal-flip test-time augmentation.

    Every image listed in ``df_wide['image_path']`` (relative to ``BASE_DIR``)
    is padded, resized and cut into five ``input_px`` crops. Each crop is scored
    together with its mirror image, and the five crop scores are averaged.

    Returns:
        Float array with one row per image, in ``df_wide`` order; an empty
        ``(0, len(TARGETS))`` float32 array when there are no images.
    """
    model.eval()
    to_resized_pil, to_norm_tensor = get_fivecrop_preprocess(model, input_px=input_px)
    five_crop = transforms.FiveCrop(input_px)

    class TTA_DS(Dataset):
        """Yields a ``[5, C, H, W]`` stack of crops and the image path."""

        def __init__(self, df):
            self.df = df.reset_index(drop=True)

        def __len__(self):
            return len(self.df)

        def __getitem__(self, idx):
            rel_path = self.df.loc[idx, 'image_path']
            resized = to_resized_pil(Image.open(BASE_DIR / rel_path).convert("RGB"))
            views = [to_norm_tensor(crop) for crop in five_crop(resized)]
            return torch.stack(views, dim=0), rel_path

    loader = DataLoader(TTA_DS(df_wide), batch_size=8, shuffle=False, num_workers=2, pin_memory=True)
    chunks = []
    with torch.no_grad():
        for crops, _ in loader:
            n_img, n_crop, chans, height, width = crops.shape
            # Fold the crop axis into the batch axis for a single forward pass.
            flat = crops.view(n_img * n_crop, chans, height, width).to(device, non_blocking=True)
            scores = predict_cnn_views(model, flat).detach().float().cpu()
            # Unfold and average over the five crops of each image.
            chunks.append(scores.view(n_img, n_crop, -1).mean(dim=1).numpy())
    if not chunks:
        return np.zeros((0, len(TARGETS)), np.float32)
    return np.concatenate(chunks, axis=0)

# ---------------------------------------------------------------------
# Nested slope-constrained affine calibration (no-harm guard)
class AffineCal(NamedTuple):
    a: float
    b: float
    use: bool  # apply or fallback to identity

def _fit_affine(x, y, l2=1e-3, slope_bounds=(0.0, 1.3)) -> AffineCal:
    x = np.asarray(x).reshape(-1)
    y = np.asarray(y).reshape(-1)
    X = np.stack([x, np.ones_like(x)], axis=1)  # [N,2]
    XtX = X.T @ X
    XtX[0, 0] += l2
    beta = np.linalg.pinv(XtX) @ (X.T @ y)
    a, b = float(beta[0]), float(beta[1])
    a = float(np.clip(a, slope_bounds[0], slope_bounds[1]))
    return AffineCal(a=a, b=b, use=True)

def fit_affine_calibration_nested(oof_pred: np.ndarray, y_true: np.ndarray,
                                  target_weights: np.ndarray,
                                  n_splits: int = 5, l2: float = 1e-3,
                                  slope_bounds=(0.0, 1.3),
                                  min_gain: float = 0.002) -> List[AffineCal]:
    """Fit one affine calibration per target, keeping it only if it helps out of sample.

    For each target column an affine map is fitted on the training part of
    each K-fold split of the OOF predictions, and its R² gain is measured on
    the held-out part. If the mean gain reaches ``min_gain``, the median
    slope and intercept across splits are kept; otherwise the identity is
    returned with ``use=False``.

    Args:
        oof_pred: Out-of-fold predictions, one column per target.
        y_true: Ground truth with the same layout.
        target_weights: Accepted for interface compatibility; not used here.
        n_splits: Number of K-fold splits.
        l2: Ridge penalty forwarded to ``_fit_affine``.
        slope_bounds: Slope clipping interval forwarded to ``_fit_affine``.
        min_gain: Minimum mean held-out R² gain required to keep a calibration.

    Returns:
        One ``AffineCal`` per target column.
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    identity = AffineCal(a=1.0, b=0.0, use=False)
    calibs: List[AffineCal] = []
    for col in range(y_true.shape[1]):
        fold_gains, slopes, intercepts = [], [], []
        for fit_idx, held_idx in splitter.split(oof_pred):
            fitted = _fit_affine(oof_pred[fit_idx, col], y_true[fit_idx, col],
                                 l2=l2, slope_bounds=slope_bounds)
            held_x, held_y = oof_pred[held_idx, col], y_true[held_idx, col]
            r2_before = r2_score(held_y, held_x)
            r2_after = r2_score(held_y, fitted.a * held_x + fitted.b)
            fold_gains.append(r2_after - r2_before)
            slopes.append(fitted.a)
            intercepts.append(fitted.b)
        if float(np.mean(fold_gains)) < min_gain:
            # No-harm guard: calibration did not pay off on held-out data.
            calibs.append(identity)
            continue
        calibs.append(AffineCal(a=float(np.median(slopes)), b=float(np.median(intercepts)), use=True))
    return calibs

def apply_affine_cal(pred: np.ndarray, calibs: List[AffineCal]) -> np.ndarray:
    """Apply per-target affine calibrations and clamp the results at zero.

    Columns whose calibration has ``use=False`` keep their raw values; every
    column listed in ``calibs`` is clamped to be non-negative. ``pred`` is not
    modified.
    """
    out = pred.copy()
    for col, cal in enumerate(calibs):
        column = cal.a * pred[:, col] + cal.b if cal.use else out[:, col]
        out[:, col] = np.maximum(column, 0.0)  # nonnegativity only
    return out

# ---------------------------------------------------------------------
# CatBoost training (only if test has metadata)
def train_catboost_fold(X_tr: pd.DataFrame, y_tr: np.ndarray, X_va: pd.DataFrame, y_va: np.ndarray,
                        debug: bool, base_params: Optional[dict] = None, tune: bool = False) -> Tuple[CatBoostRegressor, Dict[str, any], dict]:
    """Train a multi-output CatBoost regressor for one fold, optionally tuning it.

    A model is always fitted first with ``base_params`` (or the built-in
    defaults when none are given). When ``tune`` is set and ``debug`` is off,
    an Optuna study (up to 100 trials or 300 seconds) maximizes the mean
    per-target R² on the validation set, and the model is refitted with the
    best trial's parameters.

    Returns:
        ``(model, info, best_params)`` where ``info`` holds ``base_time_sec``
        and ``best_params``.
    """
    if base_params:
        params = base_params
    else:
        params = {
            'loss_function': 'MultiRMSE',
            'depth': 8,
            'iterations': 150 if debug else 1500,
            'learning_rate': 0.03,
            'l2_leaf_reg': 5.0,
            'subsample': 0.8,
            'bootstrap_type': 'MVS',
            'random_strength': 1.0,
            'border_count': 254,
            'task_type': 'CPU',
            'eval_metric': 'MultiRMSE',
            'early_stopping_rounds': 100,
            'random_seed': SEED,
            'verbose': 50,
        }
    train_pool = Pool(X_tr, y_tr)
    valid_pool = Pool(X_va, y_va)

    model = CatBoostRegressor(**params)
    fit_started = time.time()
    model.fit(train_pool, eval_set=valid_pool, use_best_model=True)
    base_time = time.time() - fit_started
    print(f"[CatBoost] Base fit time: {base_time:.1f}s")

    best_params = params.copy()
    if not tune or debug:
        return model, {"base_time_sec": base_time, "best_params": best_params}, best_params

    tuned_keys = ("depth", "learning_rate", "l2_leaf_reg", "subsample",
                  "random_strength", "border_count", "iterations")

    def objective(trial: optuna.Trial):
        # Suggestion order is kept fixed; it determines how the sampler is consumed.
        trial_params = {
            **params,
            "depth": trial.suggest_int("depth", 6, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.07, log=True),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1.0, 10.0, log=True),
            "subsample": trial.suggest_float("subsample", 0.6, 1.0),
            "random_strength": trial.suggest_float("random_strength", 0.5, 2.0),
            "border_count": trial.suggest_int("border_count", 128, 254),
            "iterations": trial.suggest_int("iterations", 800, 2000),
        }
        candidate = CatBoostRegressor(**trial_params)
        candidate.fit(train_pool, eval_set=valid_pool, use_best_model=True, verbose=False)
        val_pred = candidate.predict(X_va)
        per_target = [r2_score(y_va[:, col], val_pred[:, col]) for col in range(y_va.shape[1])]
        return float(np.mean(per_target))

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, timeout=300, n_trials=100, gc_after_trial=True)
    winner = study.best_trial
    best_params.update({key: winner.params[key] for key in tuned_keys})

    model = CatBoostRegressor(**best_params)
    refit_started = time.time()
    model.fit(train_pool, eval_set=valid_pool, use_best_model=True, verbose=50)
    refit_finished = time.time()
    print(f"[CatBoost] Retrain with best params in {refit_finished - refit_started:.1f}s")

    info = {"base_time_sec": base_time, "best_params": best_params}
    return model, info, best_params

# ---------------------------------------------------------------------
# Main pipeline with 5-fold OOF affine calibration and fold ensembling
def run_pipeline(DEBUG: bool):
    """Run the full train / calibrate / (optionally) fuse / predict pipeline once.

    Steps: load the long train/test CSVs, pivot train to one row per image,
    train one CNN per grouped-stratified fold and collect out-of-fold (OOF)
    TTA predictions, fit the nested affine calibration on the OOF predictions,
    optionally train a CatBoost model per fold when the test CSV carries all
    metadata columns, and finally (non-DEBUG only) write the submission CSV.

    Args:
        DEBUG: When True, uses the debug epoch count / batch size, skips
            CatBoost tuning and does not write a submission.
    """
    mode = "DEBUG" if DEBUG else "FULL"
    print(f"==== Running in {mode} mode ====")
    set_seed(SEED)

    # Load
    train_csv = BASE_DIR / "train.csv"; test_csv = BASE_DIR / "test.csv"
    assert train_csv.exists(), f"train.csv not found at {train_csv}"
    assert test_csv.exists(), f"test.csv not found at {test_csv}"
    df_train_long = pd.read_csv(train_csv); df_test_long = pd.read_csv(test_csv)
    print(f"Loaded train rows: {len(df_train_long)} | test rows: {len(df_test_long)}")

    # Determine test metadata availability and fusion weights
    # (image-only when any metadata column is missing from the test CSV)
    TEST_HAS_META = all(c in df_test_long.columns for c in ['Sampling_Date','State','Species','Pre_GSHH_NDVI','Height_Ave_cm'])
    IMAGE_WEIGHT = 1.0 if not TEST_HAS_META else DEFAULT_IMAGE_WEIGHT
    TAB_WEIGHT = 1.0 - IMAGE_WEIGHT
    print(f"TEST_HAS_META={TEST_HAS_META} -> fusion weights (tab={TAB_WEIGHT:.2f}, img={IMAGE_WEIGHT:.2f})")

    # Prepare wide training
    df_wide = prepare_long_to_wide(df_train_long)
    # Group key for the splitter: the image file name without its extension.
    df_wide['group'] = df_wide['image_path'].apply(lambda p: Path(p).stem)
    strata = make_composite_strata(df_wide)
    print(f"Wide train images: {len(df_wide)}")

    # Fixed integer random_state: both split() calls below yield the same folds.
    sgkf = StratifiedGroupKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

    # OOF store and per-fold models
    oof_pred_img = np.zeros((len(df_wide), len(TARGETS)), dtype=np.float32)
    fold_models = []
    fold_val_idx = []

    # Train MobileNetV4 per fold
    for f, (tr_idx, va_idx) in enumerate(sgkf.split(df_wide, y=strata, groups=df_wide['group'])):
        df_tr = df_wide.iloc[tr_idx].reset_index(drop=True)
        df_va = df_wide.iloc[va_idx].reset_index(drop=True)
        y_va = df_va[TARGETS].values.astype(np.float32)

        # Compute SST on training fold for WN-MSE
        sst_fold = compute_sst(df_tr[TARGETS].values.astype(np.float32))
        print(f"[Fold {f}] Train={len(df_tr)} Val={len(df_va)} | SST: {sst_fold.tolist()}")

        # CNN fold training
        model_f, info_f = train_cnn_fold(
            train_df=df_tr, val_df=df_va, sst=sst_fold,
            epochs=EPOCHS_DEBUG if DEBUG else EPOCHS_FULL,
            base_lr_backbone=2e-4, lr_head=1e-3, weight_decay=0.05, debug=DEBUG
        )
        fold_models.append(model_f); fold_val_idx.append(va_idx)

        # Predict validation with FiveCrop + hflip TTA
        pred_va = predict_cnn_fivecrop(model_f, df_va, input_px=IMG_SIZE)
        oof_pred_img[va_idx] = pred_va
        wr2_f, _ = weighted_r2(y_va, pred_va, TARGET_WEIGHTS)
        print(f"[Fold {f}] Val weighted R² (image-only + FiveCrop TTA): {wr2_f:.6f}")

    # OOF nested affine calibration (image branch)
    y_all = df_wide[TARGETS].values.astype(np.float32)
    wr2_img_pre, _ = weighted_r2(y_all, oof_pred_img, TARGET_WEIGHTS)
    cal_models = fit_affine_calibration_nested(
        oof_pred=oof_pred_img, y_true=y_all, target_weights=TARGET_WEIGHTS,
        n_splits=5, l2=1e-3, slope_bounds=(0.0, 1.3), min_gain=0.002
    )
    oof_img_cal = apply_affine_cal(oof_pred_img, cal_models)
    wr2_img_post, _ = weighted_r2(y_all, oof_img_cal, TARGET_WEIGHTS)
    print(f"[OOF Image] weighted R² pre-cal: {wr2_img_pre:.6f} | post-cal (affine+no-harm): {wr2_img_post:.6f}")

    # Optional CatBoost across folds if test has metadata
    if TEST_HAS_META:
        oof_pred_tab = np.zeros_like(oof_pred_img)
        test_unique = df_test_long['image_path'].unique().tolist()
        df_test_img = pd.DataFrame({"image_path": test_unique})
        fold_test_tab_preds = []
        best_cb_params_global = None

        for f, (tr_idx, va_idx) in enumerate(sgkf.split(df_wide, y=strata, groups=df_wide['group'])):
            df_tr = df_wide.iloc[tr_idx].reset_index(drop=True)
            df_va = df_wide.iloc[va_idx].reset_index(drop=True)
            y_tr = df_tr[TARGETS].values.astype(np.float32)
            y_va = df_va[TARGETS].values.astype(np.float32)
            sst_fold = compute_sst(y_tr)
            # Targets are divided by sqrt(SST) for training and multiplied back after predict.
            scale = np.sqrt(np.maximum(sst_fold, 1e-9)).astype(np.float32)
            encoders = compute_species_encodings(df_tr, TARGETS)
            X_tr, encoders = build_tabular_features(df_tr, encoders, fit=True, y_cols=TARGETS)
            X_va, _ = build_tabular_features(df_va, encoders, fit=False, y_cols=TARGETS)

            # Tune only on fold 0 in FULL mode; later folds reuse the tuned parameters.
            tune_flag = (not DEBUG) and (f == 0)
            cb_model, cb_info, best_params = train_catboost_fold(
                X_tr, y_tr/scale, X_va, y_va/scale, debug=DEBUG,
                base_params=best_cb_params_global, tune=tune_flag
            )
            if tune_flag: best_cb_params_global = best_params

            pred_va_tab = cb_model.predict(X_va) * scale
            oof_pred_tab[va_idx] = pred_va_tab
            wr2_tab_f, _ = weighted_r2(y_va, pred_va_tab, TARGET_WEIGHTS)
            print(f"[Fold {f} CatBoost] Val weighted R² (tabular-only): {wr2_tab_f:.6f}")

            # Test preds per fold
            X_te_full = pd.merge(
                df_test_img,
                df_test_long[['image_path', 'Sampling_Date', 'State', 'Species', 'Pre_GSHH_NDVI', 'Height_Ave_cm']].drop_duplicates('image_path'),
                on='image_path', how='left'
            )
            X_te, _ = build_tabular_features(X_te_full, encoders, fit=False, y_cols=TARGETS)
            pred_te_tab = cb_model.predict(X_te) * scale
            fold_test_tab_preds.append(pred_te_tab)

        # Fused OOF (calibrated image + tabular)
        oof_fused = TAB_WEIGHT * oof_pred_tab + IMAGE_WEIGHT * oof_img_cal
        wr2_fused, _ = weighted_r2(y_all, oof_fused, TARGET_WEIGHTS)
        print(f"[OOF Fusion] weighted R² (tab={TAB_WEIGHT:.2f}, img(cal)={IMAGE_WEIGHT:.2f}): {wr2_fused:.6f}")
    else:
        oof_pred_tab = None
        df_test_img = pd.DataFrame({"image_path": df_test_long['image_path'].unique().tolist()})
        fold_test_tab_preds = None
        print("Skipping CatBoost branch (test has no metadata).")

    # Inference for submission (FULL only)
    if not DEBUG:
        # Image predictions on test for each fold; ensemble average; apply OOF affine calibration
        all_fold_img_preds = []
        for f, model_f in enumerate(fold_models):
            pred_f = predict_cnn_fivecrop(model_f, df_test_img, input_px=IMG_SIZE)
            all_fold_img_preds.append(pred_f)
        test_img_raw = np.mean(all_fold_img_preds, axis=0)
        test_img_cal = apply_affine_cal(test_img_raw, cal_models)

        if TEST_HAS_META:
            test_tab_avg = np.mean(fold_test_tab_preds, axis=0)
            test_pred = TAB_WEIGHT * test_tab_avg + IMAGE_WEIGHT * test_img_cal
            print("Test meta present: fused predictions (tabular ensemble + calibrated image ensemble).")
        else:
            test_pred = test_img_cal
            print("Test meta absent: using calibrated image ensemble only.")

        # Build submission
        # Map each image to its prediction vector, then read off the entry
        # for the target named on each long-format test row.
        pred_map = {p: test_pred[i] for i, p in enumerate(df_test_img['image_path'].tolist())}
        preds = []
        for _, row in df_test_long.iterrows():
            vec = pred_map[row['image_path']]
            idx = TARGETS.index(row['target_name'])
            preds.append(float(vec[idx]))
        sub = pd.DataFrame({"sample_id": df_test_long['sample_id'], "target": preds})
        sub.to_csv(SUB_PATH, index=False)
        print(f"Submission saved: {SUB_PATH} | shape: {sub.shape}")
        print(f"Submission target summary: N={sub['target'].shape[0]}, mean={sub['target'].mean():.4f}, std={sub['target'].std():.4f}, min={sub['target'].min():.4f}, max={sub['target'].max():.4f}, q05={sub['target'].quantile(0.05):.4f}, q95={sub['target'].quantile(0.95):.4f}")
    else:
        print("DEBUG mode: submission not created (as required).")

# ---------------------------------------------------------------------
if __name__ == "__main__":
    t_all = time.time()
    # Run the DEBUG pass first, then the FULL pass that writes the submission.
    for debug_flag in [True, False]:
        run_pipeline(DEBUG=debug_flag)
    print(f"==== Script end | total wall time: {(time.time()-t_all)/60:.1f} min ====")

# Notes:
# - MobileNetV4-HM timm ID: 'mobilenetv4_hybrid_medium.e500_r224_in1k' (~11.1M params); pretrained=True; 5-output head.
# - BN frozen + drop_last to avoid BatchNorm size-1 errors.
# - Image CV: 5-fold StratifiedGroupKFold on image_id-like group; FiveCrop+hflip TTA; OOF nested affine calibration with slope bounds and no-harm guard.
# - CatBoost: MultiRMSE; eval_metric=MultiRMSE; Optuna tuning for 300s on fold 0 (FULL), reused params; per-fold CB test preds averaged and fused with calibrated image preds when metadata present.
# - Fusion weights default to (tab=0.93, img=0.07) when metadata available; image-only otherwise.