# Load Data & Initial Inspection

In [None]:
# ============================================================
# STAGE 1 — Load Data & Initial Inspection (ONE CELL, Kaggle)
# Paths (given):
#   /kaggle/input/ts-forecasting/train.parquet
#   /kaggle/input/ts-forecasting/test.parquet
# Output globals:
#   df_train, df_test, TARGET_COL, ID_COL, WEIGHT_COL, TIME_COL, CAT_COLS, FEAT_COLS, NUM_COLS
# ============================================================

import os, gc
from pathlib import Path
import numpy as np
import pandas as pd

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 200)

TRAIN_PATH = Path("/kaggle/input/ts-forecasting/train.parquet")
TEST_PATH  = Path("/kaggle/input/ts-forecasting/test.parquet")

for p in [TRAIN_PATH, TEST_PATH]:
    if not p.exists():
        raise FileNotFoundError(f"Missing file: {p}")

print("Loading parquet...")
df_train = pd.read_parquet(TRAIN_PATH)
df_test  = pd.read_parquet(TEST_PATH)

print("\n==================== BASIC SHAPES ====================")
print("train:", df_train.shape)
print("test :", df_test.shape)

print("\n==================== COLUMNS ====================")
print("train cols:", len(df_train.columns))
print("test  cols:", len(df_test.columns))

# ---- Standard column names (from competition description)
ID_COL     = "id"
WEIGHT_COL = "weight"
TIME_COL   = "ts_index"
BASE_CATS  = ["code", "sub_code", "sub_category", "horizon"]

# ---- Detect target column (must exist in train, not in test)
# Columns that can never be the target, even when they are train-only
# (a `weight` column, for instance, may be absent from test).
_known_non_target = {ID_COL, WEIGHT_COL, TIME_COL, *BASE_CATS}
_test_cols = set(df_test.columns)

# Every fallback below works on this single filtered list, so a known
# non-target column can never be selected by accident.
train_only_cols = [c for c in df_train.columns
                   if c not in _test_cols and c not in _known_non_target]

# Prefer common target names if present
preferred_names = ["target", "y", "label", "value", "prediction_target"]
TARGET_COL = next(
    (name for name in preferred_names
     if name in df_train.columns and name not in _test_cols),
    None,
)

if TARGET_COL is None:
    if not train_only_cols:
        raise RuntimeError("Could not detect target column (no candidate train-only columns).")

    if len(train_only_cols) == 1:
        # Exactly one train-only col remains -> pick it
        TARGET_COL = train_only_cols[0]
    else:
        # Fallback: restrict to numeric train-only columns
        cand = [c for c in train_only_cols if pd.api.types.is_numeric_dtype(df_train[c])]
        if len(cand) == 1:
            TARGET_COL = cand[0]
        elif len(cand) > 1:
            # pick the one with highest variance (usually the true target);
            # an all-NaN column yields NaN variance and must never win.
            vars_ = {c: float(np.nanvar(df_train[c].to_numpy(dtype=np.float64))) for c in cand}
            TARGET_COL = max(cand, key=lambda c: vars_[c] if np.isfinite(vars_[c]) else -np.inf)
            print("\n[WARN] Multiple numeric train-only cols found; picked by variance:", TARGET_COL)
        else:
            # last resort: no numeric candidate, take the first remaining train-only col
            TARGET_COL = train_only_cols[0]
            print("\n[WARN] Could not confidently detect target; picked first train-only col:", TARGET_COL)

print("\n==================== KEY COLS ====================")
print("ID_COL     :", ID_COL, "| exists:", ID_COL in df_train.columns and ID_COL in df_test.columns)
print("TIME_COL   :", TIME_COL, "| exists:", TIME_COL in df_train.columns and TIME_COL in df_test.columns)
print("WEIGHT_COL :", WEIGHT_COL, "| exists:", WEIGHT_COL in df_train.columns and WEIGHT_COL in df_test.columns)
print("TARGET_COL :", TARGET_COL, "| exists in train:", TARGET_COL in df_train.columns, "| exists in test:", TARGET_COL in df_test.columns)

# ---- Categorical columns: the known base categoricals that are present ...
CAT_COLS = [c for c in BASE_CATS if c in df_train.columns]
# ... plus every other object/category-typed column (id and target are skipped)
for c in df_train.columns:
    if c in (ID_COL, TARGET_COL) or c in CAT_COLS:
        continue
    if pd.api.types.is_object_dtype(df_train[c]) or str(df_train[c].dtype).startswith("category"):
        CAT_COLS.append(c)

# ---- Feature columns: everything except id, target and weight
EXCLUDE = {ID_COL, TARGET_COL, WEIGHT_COL}
FEAT_COLS = [c for c in df_train.columns if c not in EXCLUDE]

# ---- Numeric feature columns (weight is already absent from FEAT_COLS)
NUM_COLS = [c for c in FEAT_COLS if pd.api.types.is_numeric_dtype(df_train[c])]

print("\n==================== QUICK CHECKS ====================")
# id uniqueness
if ID_COL in df_train.columns:
    print("train id unique:", df_train[ID_COL].nunique(), "/", len(df_train))
if ID_COL in df_test.columns:
    print("test  id unique:", df_test[ID_COL].nunique(), "/", len(df_test))

# ts_index ranges
if TIME_COL in df_train.columns and TIME_COL in df_test.columns:
    print("train ts_index range:", int(df_train[TIME_COL].min()), "->", int(df_train[TIME_COL].max()))
    print("test  ts_index range:", int(df_test[TIME_COL].min()),  "->", int(df_test[TIME_COL].max()))

# horizon distribution (small peek)
if "horizon" in df_train.columns:
    print("\ntrain horizon value counts (top):")
    print(df_train["horizon"].value_counts(dropna=False).head(10))
if "horizon" in df_test.columns:
    print("\ntest horizon value counts (top):")
    print(df_test["horizon"].value_counts(dropna=False).head(10))

# missingness summary (top 15 columns)
print("\n==================== MISSING VALUES (TOP) ====================")
miss_train = df_train.isna().mean().sort_values(ascending=False)
miss_test  = df_test.isna().mean().sort_values(ascending=False)
print("train missing rate top 15:")
print(miss_train.head(15))
print("\ntest missing rate top 15:")
print(miss_test.head(15))

# target stats
if TARGET_COL in df_train.columns and pd.api.types.is_numeric_dtype(df_train[TARGET_COL]):
    y = df_train[TARGET_COL].to_numpy(dtype=np.float64)
    print("\n==================== TARGET STATS ====================")
    print("count:", np.isfinite(y).sum(), " / ", len(y))
    print("mean :", float(np.nanmean(y)))
    print("std  :", float(np.nanstd(y)))
    print("min  :", float(np.nanmin(y)))
    print("p1   :", float(np.nanpercentile(y, 1)))
    print("p50  :", float(np.nanpercentile(y, 50)))
    print("p99  :", float(np.nanpercentile(y, 99)))
    print("max  :", float(np.nanmax(y)))

# weight stats (reminder: do NOT use as feature)
if WEIGHT_COL in df_train.columns and pd.api.types.is_numeric_dtype(df_train[WEIGHT_COL]):
    w = df_train[WEIGHT_COL].to_numpy(dtype=np.float64)
    print("\n==================== WEIGHT STATS (NOT A FEATURE) ====================")
    print("mean :", float(np.nanmean(w)))
    print("min  :", float(np.nanmin(w)))
    print("p50  :", float(np.nanpercentile(w, 50)))
    print("p99  :", float(np.nanpercentile(w, 99)))
    print("max  :", float(np.nanmax(w)))

print("\n==================== FEATURE SET SUMMARY ====================")
print("CAT_COLS :", CAT_COLS)
print("NUM_COLS :", len(NUM_COLS), "(numeric features excluding weight/target/id)")
print("FEAT_COLS:", len(FEAT_COLS), "(all usable columns excluding target and weight; id excluded)")

print("\n==================== HEAD (train) ====================")
display(df_train.head(3))
print("\n==================== HEAD (test) ====================")
display(df_test.head(3))

gc.collect()


# Sanity Checks & Leakage Rules Setup

In [None]:
# ============================================================
# STAGE 2 — Sanity Checks & Leakage Rules Setup (ONE CELL, Kaggle)
# Assumes STAGE 1 already ran and created:
#   df_train, df_test, TARGET_COL, ID_COL, TIME_COL, CAT_COLS, FEAT_COLS
# This stage:
# - Validates schema and uniqueness
# - Confirms time ordering (train < test)
# - Sets leakage-safe column lists
# - Defines "DO NOT USE" columns and lightweight guards
# Outputs/Globals:
#   DO_NOT_USE_COLS, FEATURE_COLS_NUM, FEATURE_COLS_CAT, FEATURE_COLS_ALL
#   TRAIN_MAX_TS, TEST_MIN_TS, TEST_MAX_TS
# ============================================================

import gc, re
import numpy as np
import pandas as pd

# ----------------------------
# 0) Require STAGE 1 globals
# ----------------------------
need = ["df_train","df_test","TARGET_COL","ID_COL","TIME_COL","CAT_COLS","FEAT_COLS"]
for k in need:
    if k not in globals():
        raise RuntimeError(f"Missing global '{k}'. Jalankan STAGE 1 dulu.")

assert isinstance(df_train, pd.DataFrame) and isinstance(df_test, pd.DataFrame)

# ----------------------------
# 1) Core column existence
# ----------------------------
must_in_train = [ID_COL, TIME_COL, TARGET_COL]
must_in_test  = [ID_COL, TIME_COL]
for c in must_in_train:
    if c not in df_train.columns:
        raise RuntimeError(f"Train missing required column: {c}")
for c in must_in_test:
    if c not in df_test.columns:
        raise RuntimeError(f"Test missing required column: {c}")

# Optional: weight may exist only in train
WEIGHT_COL = "weight"
has_weight_train = WEIGHT_COL in df_train.columns
has_weight_test  = WEIGHT_COL in df_test.columns

print("==================== STAGE 2: SANITY ====================")
print("Has weight in train:", has_weight_train, "| in test:", has_weight_test)
if has_weight_test:
    print("[WARN] weight also exists in test. We'll still exclude it as a feature.")
print("Target col:", TARGET_COL)

# ----------------------------
# 2) ID uniqueness checks
# ----------------------------
# Row counts versus distinct-id counts (NaN counts as a value of its own).
ntr, nts = len(df_train), len(df_test)
nuniq_tr = df_train[ID_COL].nunique(dropna=False)
nuniq_ts = df_test[ID_COL].nunique(dropna=False)

# Abort on duplicated ids, reporting up to ten offending values.
if nuniq_tr != ntr:
    dup = df_train.loc[df_train[ID_COL].duplicated(keep=False), ID_COL].head(10).tolist()
    raise RuntimeError(f"Train id not unique: {nuniq_tr}/{ntr}. Example dups: {dup}")
if nuniq_ts != nts:
    dup = df_test.loc[df_test[ID_COL].duplicated(keep=False), ID_COL].head(10).tolist()
    raise RuntimeError(f"Test id not unique: {nuniq_ts}/{nts}. Example dups: {dup}")

# Ids shared between train and test are only reported, not treated as fatal.
intersect = np.intersect1d(df_train[ID_COL].values, df_test[ID_COL].values)
if intersect.size:
    print(f"[WARN] Train/Test share {len(intersect)} ids (unexpected). Example:", intersect[:5])

print("ID uniqueness: OK")

# ----------------------------
# 3) Time ordering checks
# ----------------------------
# The time column must be numeric in both frames (integer dtypes are numeric too).
for _split_name, _split_df in (("train", df_train), ("test", df_test)):
    if not pd.api.types.is_numeric_dtype(_split_df[TIME_COL]):
        raise RuntimeError(f"{TIME_COL} in {_split_name} is not numeric.")

_train_ts_values = df_train[TIME_COL].values
_test_ts_values = df_test[TIME_COL].values

TRAIN_MIN_TS = int(np.nanmin(_train_ts_values))
TRAIN_MAX_TS = int(np.nanmax(_train_ts_values))
TEST_MIN_TS  = int(np.nanmin(_test_ts_values))
TEST_MAX_TS  = int(np.nanmax(_test_ts_values))

print("Train ts_index range:", TRAIN_MIN_TS, "->", TRAIN_MAX_TS)
print("Test  ts_index range:", TEST_MIN_TS,  "->", TEST_MAX_TS)

# Test is expected to start strictly after train ends; an overlap is flagged, not fatal.
if TEST_MIN_TS > TRAIN_MAX_TS:
    print("Time ordering (train -> test): OK (test starts after train).")
else:
    print("[WARN] Test min ts_index <= Train max ts_index. Check competition rules / possible overlap.")

# ----------------------------
# 4) Leakage rules + feature lists
# ----------------------------
# DO NOT USE columns as model input features:
# - id, target, and weight (even if present in train/test)
DO_NOT_USE_COLS = {ID_COL, TARGET_COL, WEIGHT_COL}

# Basic categorical feature columns: from STAGE 1 CAT_COLS
FEATURE_COLS_CAT = [c for c in CAT_COLS if c not in DO_NOT_USE_COLS and c in df_train.columns]

# Numeric candidate features: all numeric columns except
#   - the forbidden columns,
#   - ts_index (excluded by default),
#   - anything already listed as categorical. A numerically-coded categorical
#     (e.g. an integer `horizon`) would otherwise appear twice in FEATURE_COLS_ALL.
numeric_cols = [c for c in df_train.columns if pd.api.types.is_numeric_dtype(df_train[c])]
_not_numeric_feature = DO_NOT_USE_COLS | {TIME_COL} | set(FEATURE_COLS_CAT)
FEATURE_COLS_NUM = [c for c in numeric_cols if c not in _not_numeric_feature]

# Full feature set used by "tabular model" baseline (categoricals first, no duplicates):
FEATURE_COLS_ALL = FEATURE_COLS_CAT + FEATURE_COLS_NUM

print("\n==================== FEATURE LISTS ====================")
print("Categorical features:", FEATURE_COLS_CAT)
print("Numeric features (excluding ts_index):", len(FEATURE_COLS_NUM))
print("Total features:", len(FEATURE_COLS_ALL))

# ----------------------------
# 5) Minimal integrity checks (dtypes, NaNs)
# ----------------------------
# Categorical columns should exist in both train and test
missing_cats_test = [c for c in FEATURE_COLS_CAT if c not in df_test.columns]
if missing_cats_test:
    raise RuntimeError(f"Categorical cols missing in test: {missing_cats_test}")

# Numeric columns should exist in both train and test for inference
missing_num_test = [c for c in FEATURE_COLS_NUM if c not in df_test.columns]
if missing_num_test:
    # It's possible, but unusual; better fail fast
    raise RuntimeError(f"Numeric feature cols missing in test: {missing_num_test[:10]} ... ({len(missing_num_test)} total)")

# Check target has no NaN (important)
y_nan = df_train[TARGET_COL].isna().mean()
print("\nTarget NaN rate:", float(y_nan))
if y_nan > 0:
    print("[WARN] Target has missing values. We'll need to drop or impute target rows later (usually drop).")

# Weight sanity (if exists)
if has_weight_train:
    w = df_train[WEIGHT_COL].to_numpy(dtype=np.float64)
    w_nan = np.isnan(w).mean()
    w_neg = np.mean(w < 0)
    w_zero = np.mean(w == 0)
    print("\n==================== WEIGHT SANITY (TRAIN) ====================")
    print("NaN rate:", float(w_nan), "| negative rate:", float(w_neg), "| zero rate:", float(w_zero))
    if w_neg > 0:
        print("[WARN] Found negative weights. Usually unexpected; we'll handle carefully later.")

# ----------------------------
# 6) Leakage-safe reminders (printed)
# ----------------------------
print("\n==================== LEAKAGE RULES (REMINDER) ====================")
print("- Do NOT use 'weight' as a feature (only as sample_weight).")
print("- Any preprocessing (imputer/encoder/scaler) must be fit on TRAIN-FOLD only.")
print("- Any time-based features (rolling/expanding) must be computed with shift(1) per group.")
print("- Do NOT compute statistics using future rows (ts_index > t) for predicting time t.")
print("- Avoid fitting encoders on train+test combined.")

gc.collect()

# Implement Official Metric

In [None]:
# ============================================================
# STAGE 3 — Implement Official Metric (ONE CELL, Kaggle)
# Assumes STAGE 1–2 already ran and created:
#   df_train, df_test, TARGET_COL, ID_COL, TIME_COL, (optional) WEIGHT_COL="weight"
# This stage:
# - Implements competition metric exactly
# - Adds helpers to score arrays / dataframes
# - Provides a few baselines sanity checks (zero, weighted-mean)
# Outputs/Globals:
#   weighted_rmse_score, score_df, score_arrays
# ============================================================

import numpy as np
import pandas as pd

# ----------------------------
# 0) Require minimal globals
# ----------------------------
need = ["df_train", "TARGET_COL", "ID_COL", "TIME_COL"]
for k in need:
    if k not in globals():
        raise RuntimeError(f"Missing global '{k}'. Jalankan STAGE 1–2 dulu.")

WEIGHT_COL = "weight"
HAS_W = WEIGHT_COL in df_train.columns

# ----------------------------
# 1) Official metric (as provided by host)
# ----------------------------
def _clip01(x: float) -> float:
    """Clamp ``x`` to the closed interval [0, 1] and return it as a Python float."""
    return float(np.clip(x, 0.0, 1.0))

def weighted_rmse_score(y_target, y_pred, w) -> float:
    """
    Competition metric.

        denom = sum(w * y^2)
        ratio = sum(w * (y - yhat)^2) / denom
        score = sqrt(1 - clip01(ratio))

    All three inputs are converted to float64 arrays and must share one shape,
    otherwise ``ValueError`` is raised. When ``denom`` is non-finite or not
    strictly positive the metric is ill-defined and 0.0 is returned.
    """
    truth, guess, weights = (
        np.asarray(arr, dtype=np.float64) for arr in (y_target, y_pred, w)
    )

    if not (truth.shape == guess.shape == weights.shape):
        raise ValueError(f"Shape mismatch: y={truth.shape}, yhat={guess.shape}, w={weights.shape}")

    weighted_energy = np.sum(weights * np.square(truth))
    if (not np.isfinite(weighted_energy)) or weighted_energy <= 0:
        return 0.0

    residual = truth - guess
    ratio = np.sum(weights * np.square(residual)) / weighted_energy
    # The clip already keeps this within [0, 1]; the max() is a numerical guard.
    remainder = max(1.0 - _clip01(ratio), 0.0)
    return float(np.sqrt(remainder))

# ----------------------------
# 2) Convenience wrappers
# ----------------------------
def score_arrays(y_true: np.ndarray, y_pred: np.ndarray, w: np.ndarray | None = None) -> float:
    """Score two arrays with the competition metric; unit weights are used when ``w`` is None."""
    weights = np.ones_like(y_true, dtype=np.float64) if w is None else w
    return weighted_rmse_score(y_true, y_pred, weights)

def score_df(df: pd.DataFrame, y_col: str, pred_col: str, w_col: str = "weight") -> float:
    """Score ``df[pred_col]`` against ``df[y_col]`` with the competition metric.

    Weights come from ``df[w_col]`` when that column exists; otherwise every
    row gets weight 1.
    """
    if w_col in df.columns:
        weights = df[w_col].to_numpy(dtype=np.float64)
    else:
        weights = np.ones(len(df), dtype=np.float64)
    targets = df[y_col].to_numpy(dtype=np.float64)
    preds = df[pred_col].to_numpy(dtype=np.float64)
    return weighted_rmse_score(targets, preds, weights)

# ----------------------------
# 3) Sanity check on train (simple baselines)
# ----------------------------
print("==================== STAGE 3: OFFICIAL METRIC ====================")
y = df_train[TARGET_COL].to_numpy(dtype=np.float64)

if HAS_W:
    w = df_train[WEIGHT_COL].to_numpy(dtype=np.float64)
else:
    w = np.ones_like(y, dtype=np.float64)

# Baseline A: predict 0
pred0 = np.zeros_like(y, dtype=np.float64)
s0 = weighted_rmse_score(y, pred0, w)

# Baseline B: predict weighted mean (best constant under weighted MSE)
# Use small epsilon to avoid /0
w_sum = float(np.sum(w))
c = float(np.sum(w * y) / (w_sum + 1e-18))
predc = np.full_like(y, c, dtype=np.float64)
sc = weighted_rmse_score(y, predc, w)

# Baseline C: predict unweighted median (often robust)
m = float(np.median(y))
predm = np.full_like(y, m, dtype=np.float64)
sm = weighted_rmse_score(y, predm, w)

print(f"Using weight column: {HAS_W}")
print(f"Baseline (predict 0)            score = {s0:.6f}")
print(f"Baseline (predict w-mean {c:.6f}) score = {sc:.6f}")
print(f"Baseline (predict median {m:.6f}) score = {sm:.6f}")

# Some extra diagnostics about denom / ratio scaling
denom = float(np.sum(w * (y ** 2)))
sse0  = float(np.sum(w * ((y - pred0) ** 2)))
ratio0 = sse0 / denom if denom > 0 else np.nan
print("\nDiagnostics:")
print(f"denom sum(w*y^2) = {denom:.6e}")
print(f"ratio(predict0)  = {ratio0:.6f}  (should be ~1.0 => score~0)")

print("\nGlobals exported: weighted_rmse_score, score_arrays, score_df")


# Time-based Validation Split

In [None]:
# ============================================================
# STAGE 4 — Time-based Validation Split (Leakage-Safe CV) (ONE CELL, Kaggle)
# Assumes STAGE 1–3 already ran and created:
#   df_train, df_test, ID_COL, TIME_COL, TARGET_COL, CAT_COLS
#
# This stage:
# - Builds walk-forward (blocked) time splits on ts_index
# - Optionally makes splits per-horizon (recommended later), but here we create a global fold id
# - Exports df_folds (id -> fold) and adds df_train["fold"]
#
# Outputs/Globals:
#   df_folds, df_train (with 'fold'), FOLD_CFG
#   fold_boundaries (list of dicts)
#
# Notes:
# - We use last portion of time as validation windows.
# - Training for fold k uses all data with ts_index <= train_end
#   Validation uses (train_end, valid_end] (strict future)
# ============================================================

import numpy as np
import pandas as pd

# ----------------------------
# 0) Require
# ----------------------------
need = ["df_train", "ID_COL", "TIME_COL"]
for k in need:
    if k not in globals():
        raise RuntimeError(f"Missing global '{k}'. Jalankan STAGE 1–3 dulu.")

# ----------------------------
# 1) Config (tune-friendly)
# ----------------------------
# Number of folds (walk-forward windows)
N_FOLDS = 4

# Validation window size in ts_index units.
# If None, we auto-set based on the last ~20% of time span.
VALID_WINDOW = None  # e.g., 150, 200, 300; or None for auto

# Gap between train_end and valid_start to reduce leakage via feature smoothing (usually 0 is OK)
GAP = 0

# Ensure we validate only on the tail period (mimics test)
TAIL_FRACTION = 0.25  # last 25% of time used to place validation windows

# Minimum validation rows per fold (a warning is printed for any fold below this; the run is not aborted)
MIN_VALID_ROWS = 200_000

# ----------------------------
# 2) Prepare timeline
# ----------------------------
ts = df_train[TIME_COL].to_numpy(dtype=np.int64)
ts_min, ts_max = int(ts.min()), int(ts.max())
ts_unique = np.unique(ts)  # np.unique already returns its values sorted

span = ts_max - ts_min + 1
# First ts_index of the tail region that may host validation windows.
tail_start_ts = max(int(ts_min + (1.0 - TAIL_FRACTION) * span), ts_min)

# Auto-size the validation window: split the tail into N_FOLDS windows with a bit of buffer.
if VALID_WINDOW is None:
    tail_span = ts_max - tail_start_ts + 1
    VALID_WINDOW = max(1, int(np.floor(tail_span / (N_FOLDS + 0.5))))
VALID_WINDOW = int(VALID_WINDOW)

# Walk backwards from ts_max: the last fold gets the most recent window and each
# earlier fold ends where the following fold's training data ended.
fold_boundaries = []
valid_end = ts_max
for k in reversed(range(N_FOLDS)):
    # the window never starts before the tail region
    valid_start = max(valid_end - VALID_WINDOW + 1, tail_start_ts)
    train_end = valid_start - 1 - GAP
    fold_boundaries.append(dict(
        fold=k,
        train_end=int(train_end),
        valid_start=int(valid_start),
        valid_end=int(valid_end),
        gap=int(GAP),
        valid_window=int(valid_end - valid_start + 1),
    ))
    valid_end = train_end

# present folds in ascending fold-id order
fold_boundaries.sort(key=lambda d: d["fold"])

FOLD_CFG = {
    "N_FOLDS": N_FOLDS,
    "VALID_WINDOW": VALID_WINDOW,
    "GAP": GAP,
    "TAIL_FRACTION": TAIL_FRACTION,
    "MIN_VALID_ROWS": MIN_VALID_ROWS,
    "TIME_COL": TIME_COL,
    "ID_COL": ID_COL,
}

# Summary of the split configuration and of each fold's boundaries.
print("==================== STAGE 4: TIME SPLITS ====================")
print("Train ts_index:", ts_min, "->", ts_max, "| span:", span)
print("Tail fraction:", TAIL_FRACTION, "| tail_start_ts:", tail_start_ts)
print("N_FOLDS:", N_FOLDS, "| VALID_WINDOW:", VALID_WINDOW, "| GAP:", GAP)
print("\nFold boundaries:")
for b in fold_boundaries:
    # Fold assignment selects valid_start <= ts_index <= valid_end, so the
    # interval is closed on both ends and is printed as such.
    print(f"  fold {b['fold']}: train <= {b['train_end']} | valid [{b['valid_start']}, {b['valid_end']}] | window={b['valid_window']}")

# ----------------------------
# 3) Assign folds
# ----------------------------
# Default fold = -1 (train-only, never used for validation)
ts_series = df_train[TIME_COL].to_numpy(dtype=np.int64)

# Every row starts at -1; rows inside a fold's [valid_start, valid_end] window get that fold id.
fold_arr = np.full(len(df_train), -1, dtype=np.int16)
for b in fold_boundaries:
    k = b["fold"]
    vs, ve = b["valid_start"], b["valid_end"]
    mask = np.logical_and(ts_series >= vs, ts_series <= ve)
    fold_arr[mask] = k

df_train["fold"] = fold_arr

# Stand-alone id -> fold lookup table
df_folds = df_train.loc[:, [ID_COL, "fold"]].copy()

# ----------------------------
# 4) Diagnostics
# ----------------------------
vc = df_train["fold"].value_counts(dropna=False).sort_index()
print("\nFold row counts (fold=-1 means never validated):")
print(vc)

# Ensure each fold has enough validation rows
ok = True
for b in fold_boundaries:
    k = b["fold"]
    n_valid = int((df_train["fold"] == k).sum())
    if n_valid < MIN_VALID_ROWS:
        print(f"[WARN] fold {k} valid rows too small: {n_valid} < {MIN_VALID_ROWS}")
        ok = False
if ok:
    print("Validation sizes: OK")

# Quick check: validation is strictly in the future of its training end
viol = []
for b in fold_boundaries:
    if not (b["train_end"] < b["valid_start"]):
        viol.append(b["fold"])
if viol:
    raise RuntimeError(f"Invalid split: folds where train_end >= valid_start: {viol}")

print("\nGlobals exported: df_train['fold'], df_folds, FOLD_CFG, fold_boundaries")


# Feature Preparation & Weighting Strategy

In [None]:
# ============================================================
# STAGE 5 — Feature Preparation & Weighting Strategy (UPGRADE v4: TS + LAG + CS) (ONE CELL, Kaggle)
# - Adds leakage-safe time-series features (lags/diffs/gaps) and optional cross-sectional (same-ts) features
# - Updates FEATURE_COLS_* so Stage 6/7 can use the engineered columns
#
# REQUIRE (from STAGE 1–4):
#   df_train, df_test, TARGET_COL, ID_COL, TIME_COL, WEIGHT_COL
#   df_train["fold"]
#
# OUTPUT globals:
#   df_train, df_test (with engineered features)
#   SERIES_KEYS, FEATURE_COLS_CAT_ALL, FEATURE_COLS_NUM_ALL, FEATURE_COLS_ALL
#   CAT_FEATURE_IDXS_ALL
#   make_sample_weight(), fit_median_imputer(), apply_median_imputer()
# ============================================================

import gc
import numpy as np
import pandas as pd

# ----------------------------
# 0) Config
# ----------------------------
USE_TS_AS_FEATURE = True
ADD_AGE_FEATURE   = True

# Feature engineering intensity (start moderate; scale up after you see CV lift)
TOPK_NUM_FOR_FE   = 24          # how many numeric base features to expand
LAGS              = (1, 2, 3)    # lag steps per series
ADD_DIFFS         = True
ADD_TIME_GAP      = True

# Cross-sectional features at SAME ts_index (no future). Turn on after baseline FE works.
ADD_CS_STATS      = True        # z-score within (ts_index, horizon)
ADD_CS_RANK       = True        # rank(pct) within (ts_index, horizon)
CS_BUCKET         = [TIME_COL, "horizon"]  # will auto-prune if column missing

# Speed
MAX_ROWS_FOR_CORR = 250_000
RANDOM_STATE      = 42

# ----------------------------
# 1) Series keys + basic cols
# ----------------------------
# Base categorical columns present in train; the same columns identify a series.
base_cats = [c for c in ["code", "sub_code", "sub_category", "horizon"] if c in df_train.columns]
SERIES_KEYS = list(base_cats)
TRAIN_MAX_TS = int(df_train[TIME_COL].max())

# Cast categorical to category (RAM saver + CatBoost friendly)
for df in (df_train, df_test):
    for c in base_cats:
        if c in df.columns:
            df[c] = df[c].astype("category")

# ----------------------------
# 2) Pick numeric features to expand (TOPK) using weighted abs-corr (train-only)
# ----------------------------
def _weighted_abs_corr(x: np.ndarray, y: np.ndarray, w: np.ndarray, eps: float = 1e-12) -> float:
    """Return the absolute weighted Pearson correlation between ``x`` and ``y``.

    ``eps`` is added to the total weight and to the normalising denominator,
    so a zero-variance input produces a value near 0 rather than a division error.
    """
    total_w = np.sum(w) + eps
    x_dev = x - np.sum(w * x) / total_w
    y_dev = y - np.sum(w * y) / total_w
    cov_xy = np.sum(w * x_dev * y_dev) / total_w
    var_x = np.sum(w * x_dev * x_dev) / total_w
    var_y = np.sum(w * y_dev * y_dev) / total_w
    scale = np.sqrt(var_x * var_y) + eps
    return float(abs(cov_xy / scale))

HELPER_COLS = {"fold"}
DROP_ALWAYS = {ID_COL, TARGET_COL, WEIGHT_COL} | HELPER_COLS

# Numeric candidates for expansion. TIME_COL is left out: it is selected
# explicitly below for the recency sampling, and a duplicated label would make
# df_corr[TIME_COL] two-dimensional.
num_candidates = [c for c in df_train.columns
                  if (c not in DROP_ALWAYS)
                  and (c != TIME_COL)
                  and pd.api.types.is_numeric_dtype(df_train[c])]

# Prefer anonymized features (feature_*)
feat_candidates = [c for c in num_candidates if str(c).startswith("feature_")]
if len(feat_candidates) >= 5:
    num_candidates = feat_candidates
num_candidates = list(num_candidates)

# The weight column is optional; fall back to unit weights when it is absent.
_corr_has_weight = WEIGHT_COL in df_train.columns
_corr_cols = [*num_candidates, TARGET_COL, TIME_COL]
if _corr_has_weight:
    _corr_cols.append(WEIGHT_COL)
df_corr = df_train[_corr_cols]

# Rows without a finite target cannot contribute to a correlation; keeping
# them would turn every score into NaN.
_target_ok = np.isfinite(df_corr[TARGET_COL].to_numpy(np.float64))
if not _target_ok.all():
    df_corr = df_corr.loc[_target_ok]

# Recency-biased subsample to bound the cost of the correlation scan.
if len(df_corr) > MAX_ROWS_FOR_CORR:
    t = df_corr[TIME_COL].to_numpy(np.float64)
    p = np.exp(-(TRAIN_MAX_TS - t) / 600.0)
    p = p / p.sum()
    take = np.random.RandomState(RANDOM_STATE).choice(len(df_corr), size=MAX_ROWS_FOR_CORR, replace=False, p=p)
    df_corr = df_corr.iloc[take]

y = df_corr[TARGET_COL].to_numpy(np.float64)
if _corr_has_weight:
    w_off = df_corr[WEIGHT_COL].to_numpy(np.float64)
    w_off = np.where(np.isfinite(w_off), w_off, 0.0)
    w_off = np.maximum(w_off, 0.0)
else:
    w_off = np.ones(len(df_corr), dtype=np.float64)
if w_off.sum() <= 0:
    w_off = np.ones_like(w_off)

corr_scores = []
for c in num_candidates:
    x = df_corr[c].to_numpy(np.float64)
    finite = np.isfinite(x)
    if not finite.any():
        # Nothing to correlate; rank the column last instead of emitting NaN.
        corr_scores.append((0.0, c))
        continue
    med = np.median(x[finite])
    x = np.where(finite, x, med)
    score = _weighted_abs_corr(x, y, w_off)
    # A NaN key would make the sort order below undefined.
    corr_scores.append((score if np.isfinite(score) else 0.0, c))

corr_scores.sort(reverse=True, key=lambda kv: kv[0])
TOPK = [c for _, c in corr_scores[:min(TOPK_NUM_FOR_FE, len(corr_scores))]]

print("Selected TOPK numeric features for FE:", TOPK[:12], "..." if len(TOPK) > 12 else "")
del df_corr
gc.collect()

# ----------------------------
# 3) Build engineered features on concatenated (train+test) in TIME order
# ----------------------------
# Columns that exist only in train (target, fold, possibly weight). pd.concat
# would otherwise hand them to test as all-NaN columns and upcast integer ones
# (e.g. the int16 `fold`) to float, so remember them and their dtypes.
_train_only_cols = [c for c in df_train.columns if c not in df_test.columns]
_train_only_dtypes = {c: df_train[c].dtype for c in _train_only_cols}
_n_train = len(df_train)

# Train rows come first in the concatenation, so the origin flag can be
# derived from the position without adding a helper column to the inputs.
df_all = pd.concat([df_train, df_test], axis=0, ignore_index=True)
df_all["_is_train"] = (np.arange(len(df_all)) < _n_train).astype(np.int8)

# Concatenating categoricals whose category sets differ yields object dtype;
# re-cast so train and test end up sharing one category dtype per column.
for c in base_cats:
    if c in df_all.columns and not isinstance(df_all[c].dtype, pd.CategoricalDtype):
        df_all[c] = df_all[c].astype("category")

df_all["__row_id"] = np.arange(len(df_all), dtype=np.int64)
sort_cols = [c for c in SERIES_KEYS if c in df_all.columns] + [TIME_COL, "__row_id"]
df_all = df_all.sort_values(sort_cols, kind="mergesort").reset_index(drop=True)

# observed=True: only key combinations that actually occur form groups
# (relevant because the grouping keys are categorical).
_group_kw = dict(sort=False, observed=True)

# Time gap per series
if ADD_TIME_GAP and len(SERIES_KEYS) > 0:
    dt = df_all.groupby(SERIES_KEYS, **_group_kw)[TIME_COL].diff()
    df_all["fe_dt"] = dt.fillna(0).astype(np.float32)
    df_all["fe_dt_clip"] = np.clip(df_all["fe_dt"].to_numpy(np.float32), 0.0, 50.0).astype(np.float32)

# Age feature
if USE_TS_AS_FEATURE and ADD_AGE_FEATURE:
    df_all["fe_age"] = (TRAIN_MAX_TS - df_all[TIME_COL].to_numpy(np.int32)).astype(np.int32)

# Lags / diffs
if len(SERIES_KEYS) > 0 and len(TOPK) > 0:
    g = df_all.groupby(SERIES_KEYS, **_group_kw)
    for f in TOPK:
        for L in LAGS:
            df_all[f"fe_{f}_lag{L}"] = g[f].shift(L).astype(np.float32)
        if ADD_DIFFS:
            # Reuse the lag-1 column when it exists; compute it on the fly
            # otherwise so ADD_DIFFS does not depend on 1 being in LAGS.
            if 1 in LAGS:
                lag1 = df_all[f"fe_{f}_lag1"].to_numpy(np.float32)
            else:
                lag1 = g[f].shift(1).to_numpy(np.float32)
            cur  = df_all[f].to_numpy(np.float32)
            df_all[f"fe_{f}_diff1"] = (cur - lag1).astype(np.float32)

# Cross-sectional (same ts_index) stats: z-score + rank pct within (ts_index, horizon)
bucket = [c for c in CS_BUCKET if c in df_all.columns]
if ADD_CS_STATS and len(bucket) > 0 and len(TOPK) > 0:
    gb = df_all.groupby(bucket, **_group_kw)
    for f in TOPK:
        mu = gb[f].transform("mean").astype(np.float32)
        sd = gb[f].transform("std").astype(np.float32)
        df_all[f"fe_{f}_z_{'_'.join(bucket)}"] = ((df_all[f].astype(np.float32) - mu) / (sd + 1e-6)).astype(np.float32)

if ADD_CS_RANK and len(bucket) > 0 and len(TOPK) > 0:
    gb = df_all.groupby(bucket, **_group_kw)
    for f in TOPK:
        df_all[f"fe_{f}_r_{'_'.join(bucket)}"] = gb[f].rank(pct=True).astype(np.float32)

# Restore original order
df_all = df_all.sort_values("__row_id", kind="mergesort").reset_index(drop=True)

# Split back
_train_rows = df_all["_is_train"].to_numpy() == 1
_helper_cols = ["_is_train", "__row_id"]
df_train = df_all.loc[_train_rows].drop(columns=_helper_cols).reset_index(drop=True)
# Test keeps only its own columns plus the engineered ones.
df_test  = (df_all.loc[~_train_rows]
            .drop(columns=[*_helper_cols, TARGET_COL, *_train_only_cols], errors="ignore")
            .reset_index(drop=True))

# Undo the dtype upcast that the NaN padding of test rows forced on train-only columns.
for c, _orig_dtype in _train_only_dtypes.items():
    if c in df_train.columns and df_train[c].dtype != _orig_dtype:
        df_train[c] = df_train[c].astype(_orig_dtype)

gc.collect()

# ----------------------------
# 4) Finalize feature lists (include engineered columns)
# ----------------------------
# Categorical features: the base categorical columns that are present in train.
FEATURE_COLS_CAT_ALL = [c for c in ("code", "sub_code", "sub_category", "horizon") if c in df_train.columns]

HELPER_COLS = {"fold"}
DO_NOT_USE = HELPER_COLS | {ID_COL, TARGET_COL, WEIGHT_COL}

# Numeric features: every numeric column that is neither forbidden nor categorical.
_non_numeric_feature = DO_NOT_USE.union(FEATURE_COLS_CAT_ALL)
FEATURE_COLS_NUM_ALL = [
    c for c in df_train.columns
    if c not in _non_numeric_feature and pd.api.types.is_numeric_dtype(df_train[c])
]

# Categoricals come first, so their positions are simply 0..len(cat)-1.
FEATURE_COLS_ALL = [*FEATURE_COLS_CAT_ALL, *FEATURE_COLS_NUM_ALL]
CAT_FEATURE_IDXS_ALL = list(range(len(FEATURE_COLS_CAT_ALL)))

# ----------------------------
# 5) Weight strategy (official weight * optional recency)
# ----------------------------
def make_sample_weight(df: pd.DataFrame,
                       use_recency: bool = True,
                       tau: float = 600.0,
                       clip_w_quantile: float | None = None,
                       eps: float = 1e-12) -> np.ndarray:
    w = df[WEIGHT_COL].to_numpy(dtype=np.float64)
    if clip_w_quantile is not None:
        q = float(np.nanquantile(w, clip_w_quantile))
        if np.isfinite(q) and q > 0:
            w = np.minimum(w, q)
    if use_recency:
        t = df[TIME_COL].to_numpy(dtype=np.float64)
        rec = np.exp(-(TRAIN_MAX_TS - t) / float(tau))
        w = w * rec
    w = np.where(np.isfinite(w), w, 0.0)
    w = np.maximum(w, 0.0)
    if float(w.sum()) <= eps:
        w = np.ones(len(df), dtype=np.float64)
    return w

# ----------------------------
# 6) Median imputer (optional, in case a linear model is added later)
# ----------------------------
def fit_median_imputer(df_fit: pd.DataFrame, num_cols: list[str]) -> dict:
    """Compute one fill value per column in ``num_cols``.

    The fill value is the column median over ``df_fit``; columns whose median
    is unavailable or not finite (e.g. all-NaN) get ``0.0``.

    Returns:
        Mapping ``column name -> float`` covering every entry of ``num_cols``.
    """
    col_medians = df_fit[num_cols].median(numeric_only=True)
    fill_values = {}
    for col in num_cols:
        fill = 0.0
        if col in col_medians.index:
            candidate = col_medians[col]
            if np.isfinite(candidate):
                fill = float(candidate)
        fill_values[col] = fill
    return fill_values

def apply_median_imputer(df_apply: pd.DataFrame, medians: dict, num_cols: list[str]) -> pd.DataFrame:
    """Return a copy of ``df_apply`` with NaNs in ``num_cols`` filled.

    Each listed column that exists in the frame and contains at least one NaN
    is filled with ``medians[column]`` (``0.0`` when the column has no entry).
    The input frame is left untouched.
    """
    filled = df_apply.copy()
    for col in num_cols:
        if col not in filled.columns:
            continue
        series = filled[col]
        if not series.isna().any():
            continue
        filled[col] = series.fillna(medians.get(col, 0.0))
    return filled

# Human-readable recap of what this stage produced.
print("\n==================== STAGE 5 UPGRADE SUMMARY ====================")
print("Train/Test shapes:", df_train.shape, df_test.shape)
print("SERIES_KEYS:", SERIES_KEYS)
print("FEATURE_COLS_CAT_ALL:", FEATURE_COLS_CAT_ALL)
print("NUM features:", len(FEATURE_COLS_NUM_ALL), "| TOTAL features:", len(FEATURE_COLS_ALL))
# Show at most the first 20 engineered ("fe_"-prefixed) column names.
print("Example engineered cols:", [c for c in df_train.columns if c.startswith('fe_')][:20])

# Explicitly publish the stage outputs as notebook globals for later cells.
globals()["df_train"] = df_train
globals()["df_test"] = df_test
globals()["SERIES_KEYS"] = SERIES_KEYS
globals()["FEATURE_COLS_CAT_ALL"] = FEATURE_COLS_CAT_ALL
globals()["FEATURE_COLS_NUM_ALL"] = FEATURE_COLS_NUM_ALL
globals()["FEATURE_COLS_ALL"] = FEATURE_COLS_ALL
globals()["CAT_FEATURE_IDXS_ALL"] = CAT_FEATURE_IDXS_ALL


# Model Training, OOF Evaluation, and Model Selection

In [None]:
# ============================================================
# STAGE 6 — Model Training, OOF Evaluation, and Model Selection (UPGRADE v4: Ensemble + Blend + Alpha)
# ============================================================

import gc, json, time
from pathlib import Path
import numpy as np
import pandas as pd

# Fail fast if an earlier notebook stage has not populated the globals this
# stage reads.
need = ["df_train","TARGET_COL","TIME_COL","ID_COL","WEIGHT_COL","FEATURE_COLS_ALL","FEATURE_COLS_CAT_ALL",
        "fold_boundaries","weighted_rmse_score","make_sample_weight"]
for k in need:
    if k not in globals():
        raise RuntimeError(f"Missing global: {k}. Jalankan stage sebelumnya.")

# "per_horizon" makes the loops below fit a separate model set for each
# distinct value of the "horizon" column.
MODE = "per_horizon"

# In per-horizon mode "horizon" is removed from the categorical model inputs
# (each model only ever sees rows of a single horizon).
if MODE == "per_horizon" and "horizon" in FEATURE_COLS_CAT_ALL:
    CAT_COLS_MODEL = [c for c in FEATURE_COLS_CAT_ALL if c != "horizon"]
else:
    CAT_COLS_MODEL = list(FEATURE_COLS_CAT_ALL)

NUM_COLS_MODEL = [c for c in FEATURE_COLS_ALL if c not in FEATURE_COLS_CAT_ALL]
FEATURE_COLS_MODEL = CAT_COLS_MODEL + NUM_COLS_MODEL
# Positions of the categorical columns inside FEATURE_COLS_MODEL; passed to
# CatBoost as cat_features.
CAT_FEATURE_IDXS = [i for i,c in enumerate(FEATURE_COLS_MODEL) if c in CAT_COLS_MODEL]

print("MODE:", MODE)
print("Total features:", len(FEATURE_COLS_MODEL), "| cat idx count:", len(CAT_FEATURE_IDXS))

# weights — forwarded to make_sample_weight (recency decay constant and the
# quantile at which raw weights are capped).
USE_RECENCY = True
TAU = 600.0
CLIP_W_Q = 0.9995

# Per-fold training rows are subsampled to at most this many rows;
# SAMPLE_WEIGHTED selects weight-proportional instead of uniform sampling.
TRAIN_SAMPLE_CAP = 450_000
SAMPLE_WEIGHTED = True

# Model families to train.
USE_CATBOOST = True
USE_LGBM = True

# Base seeds per family; the training loop offsets them by the fold id.
CB_SEEDS = [42, 52, 62]
LGB_SEEDS = [41, 51]

# CatBoost hyper-parameter variants; every variant is trained with every seed.
CB_PARAM_LIST = [
    dict(iterations=2500, learning_rate=0.04, depth=8,  l2_leaf_reg=6.0,  random_strength=1.0, rsm=0.9, min_data_in_leaf=120, bootstrap_type="Bernoulli", subsample=0.8),
    dict(iterations=3500, learning_rate=0.03, depth=10, l2_leaf_reg=8.0,  random_strength=1.2, rsm=0.9, min_data_in_leaf=70,  bootstrap_type="Bernoulli", subsample=0.8),
]
# Early-stopping patience used for the CatBoost fits.
EARLY_STOPPING_ROUNDS = 200

# LightGBM parameters shared by all LightGBM seeds (the seed is added per run).
LGB_PARAMS = dict(
    objective="regression",
    metric="rmse",
    learning_rate=0.03,
    num_leaves=192,
    min_data_in_leaf=200,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    lambda_l2=8.0,
    max_bin=255,
    verbose=-1,
)

# Output directory for this stage's JSON artifacts.
OUT_DIR = Path("/kaggle/working/tsf_stage6_models_v4")
OUT_DIR.mkdir(parents=True, exist_ok=True)

def _maybe_sample(df: pd.DataFrame, cap: int, weighted: bool, seed: int) -> pd.DataFrame:
    """Down-sample ``df`` to at most ``cap`` rows (without replacement).

    Args:
        df: Frame to sample from; must contain ``WEIGHT_COL`` when
            ``weighted`` is true.
        cap: Maximum number of rows. ``None``, a non-positive value, or a
            value >= ``len(df)`` returns ``df`` unchanged.
        weighted: Draw rows with probability proportional to the cleaned
            weight (non-finite and negative weights count as 0) instead of
            uniformly.
        seed: Seed for the private ``RandomState`` used for the draw.

    Returns:
        ``df`` itself when no sampling is needed, otherwise ``df.iloc[idx]``
        with exactly ``cap`` distinct rows.
    """
    if cap is None or cap <= 0 or len(df) <= cap:
        return df

    rs = np.random.RandomState(seed)
    n_rows = len(df)

    if weighted:
        w = df[WEIGHT_COL].to_numpy(np.float64)
        w = np.where(np.isfinite(w), w, 0.0)
        w = np.maximum(w, 0.0)
        total = w.sum()
        if total > 0:
            # Normalise by the exact sum: adding an epsilon to the denominator
            # makes p fail numpy's sum-to-one check when the total is tiny.
            p = w / total
            positive = np.flatnonzero(p > 0)
            if len(positive) >= cap:
                idx = rs.choice(n_rows, size=cap, replace=False, p=p)
            else:
                # numpy cannot draw `cap` distinct rows when fewer than `cap`
                # have non-zero probability: keep every positive-weight row
                # and top up uniformly from the zero-weight rows.
                rest = np.flatnonzero(p <= 0)
                extra = rs.choice(rest, size=cap - len(positive), replace=False)
                idx = np.concatenate([positive, extra])
            return df.iloc[idx]

    # Unweighted request, or no usable weight mass: uniform sample.
    idx = rs.choice(n_rows, size=cap, replace=False)
    return df.iloc[idx]

def fit_alpha(y: np.ndarray, pred: np.ndarray, w: np.ndarray, clip=(0.0, 3.0)) -> float:
    """Fit the scalar ``a`` minimising the weighted squared error of ``a * pred``.

    The closed-form solution ``sum(w*y*pred) / sum(w*pred^2)`` is computed
    (with a 1e-12 term added to the denominator) and then clipped to the
    ``clip`` interval.

    Returns:
        The clipped scale factor as a Python float.
    """
    cross_term = float(np.sum(w * y * pred))
    pred_energy = float(np.sum(w * pred * pred) + 1e-12)
    lower, upper = clip[0], clip[1]
    return float(np.clip(cross_term / pred_energy, lower, upper))

def fit_blend_weights(pred_mat: np.ndarray, y: np.ndarray, w: np.ndarray, nonneg=True) -> np.ndarray:
    """Fit blend coefficients for the columns of ``pred_mat``.

    Solves the weighted least-squares problem ``pred_mat @ b ~ y`` (rows
    scaled by ``sqrt(max(w, 0))``), optionally clips negative coefficients to
    zero, and rescales the result to sum to 1. When the coefficient sum is not
    finite or not positive, equal weights are returned instead.

    Args:
        pred_mat: Array of shape (n_rows, n_models).
        y: Target values, one per row.
        w: Row weights; negatives are treated as 0.
        nonneg: Clip negative coefficients to 0 before normalising.

    Returns:
        float64 array of length n_models.
    """
    root_w = np.sqrt(np.maximum(w, 0.0)).astype(np.float64)
    solution = np.linalg.lstsq(pred_mat * root_w[:, None], y * root_w, rcond=None)
    coef = solution[0].astype(np.float64)
    if nonneg:
        coef = np.clip(coef, 0.0, None)

    total = coef.sum()
    if np.isfinite(total) and total > 0:
        return coef / total

    # Degenerate fit: fall back to a plain average of the models.
    n_models = pred_mat.shape[1]
    return np.ones(n_models, dtype=np.float64) / n_models

from catboost import CatBoostRegressor, Pool
try:
    import lightgbm as lgb
    _HAS_LGB = True
except Exception:
    _HAS_LGB = False
    USE_LGBM = False
    print("[WARN] lightgbm not available; skipping LGBM.")

# Copy the frame before normalising the "fold" column to int.
df_train = df_train.copy()
df_train["fold"] = df_train["fold"].astype(int)

fold_ids = sorted(df_train["fold"].unique().tolist())
# One entry per distinct horizon in per-horizon mode; [None] means a single
# model set trained on all rows.
horizons = sorted(df_train["horizon"].unique().tolist()) if (MODE == "per_horizon" and "horizon" in df_train.columns) else [None]

# Model name -> out-of-fold prediction vector aligned with df_train rows.
oof_store = {}
def _new_oof():
    """Return a zero-filled float32 buffer with one slot per df_train row."""
    return np.zeros(len(df_train), dtype=np.float32)

# One dict per fitted model (family, name, horizon, fold, validation score).
model_registry = []

# Train every configured model on every (horizon, fold) split and record its
# predictions on the held-out fold.
t0 = time.time()
for h in horizons:
    # Row mask of the current horizon (all rows when not in per-horizon mode).
    if h is None:
        mask_h = np.ones(len(df_train), dtype=bool)
        h_key = "__all__"
    else:
        mask_h = (df_train["horizon"].astype(str) == str(h))
        h_key = str(h)

    if mask_h.sum() == 0:
        continue

    print("\n" + "="*80)
    print("HORIZON:", h_key, "| rows:", int(mask_h.sum()), "| folds:", fold_ids)

    for k in fold_ids:
        # Fold k is held out; all other folds of this horizon are training data.
        va_mask = mask_h & (df_train["fold"] == k)
        tr_mask = mask_h & (df_train["fold"] != k)

        df_tr = df_train.loc[tr_mask]
        df_va = df_train.loc[va_mask]
        if len(df_va) == 0 or len(df_tr) == 0:
            continue

        # Cap the training rows; the sampling seed depends on fold and horizon.
        # NOTE(review): int(h) requires horizon values to be integer-like.
        df_tr_s = _maybe_sample(df_tr, TRAIN_SAMPLE_CAP, SAMPLE_WEIGHTED, seed=1234 + k + (0 if h is None else int(h)*7))

        X_tr = df_tr_s[FEATURE_COLS_MODEL]
        y_tr = df_tr_s[TARGET_COL].to_numpy(np.float32)
        w_tr = make_sample_weight(df_tr_s, use_recency=USE_RECENCY, tau=TAU, clip_w_quantile=CLIP_W_Q)

        X_va = df_va[FEATURE_COLS_MODEL]
        y_va = df_va[TARGET_COL].to_numpy(np.float32)
        # Scoring uses the raw weight column (no recency factor, no clipping).
        w_va_eval = df_va[WEIGHT_COL].to_numpy(np.float64)

        if USE_CATBOOST:
            # Every parameter variant is trained with every seed.
            for pi, pbase in enumerate(CB_PARAM_LIST):
                for sd in CB_SEEDS:
                    name = f"cb_p{pi}_s{sd}"
                    if name not in oof_store:
                        oof_store[name] = _new_oof()

                    params = dict(
                        loss_function="RMSE",
                        eval_metric="RMSE",
                        task_type="CPU",
                        thread_count=-1,
                        # Seed is offset per fold so folds do not share RNG state.
                        random_seed=int(sd + 1000*k),
                        allow_writing_files=False,
                        **pbase
                    )
                    train_pool = Pool(X_tr, label=y_tr, weight=w_tr, cat_features=CAT_FEATURE_IDXS)
                    # The validation pool carries no weights, so early stopping
                    # monitors unweighted RMSE on the held-out fold.
                    valid_pool = Pool(X_va, label=y_va, cat_features=CAT_FEATURE_IDXS)

                    model = CatBoostRegressor(**params)
                    model.fit(train_pool, eval_set=valid_pool, use_best_model=True,
                              verbose=False, early_stopping_rounds=EARLY_STOPPING_ROUNDS)

                    pred_va = model.predict(valid_pool).astype(np.float32)
                    oof_store[name][va_mask] = pred_va

                    sc = weighted_rmse_score(y_va.astype(np.float64), pred_va.astype(np.float64), w_va_eval.astype(np.float64))
                    model_registry.append(dict(family="catboost", name=name, horizon=h_key, fold=int(k), score=float(sc)))

        if USE_LGBM and _HAS_LGB:
            # LightGBM receives categorical columns as pandas "category" dtype.
            X_tr_l = X_tr.copy()
            X_va_l = X_va.copy()
            for c in CAT_COLS_MODEL:
                if c in X_tr_l.columns:
                    X_tr_l[c] = X_tr_l[c].astype("category")
                    X_va_l[c] = X_va_l[c].astype("category")

            # The datasets are built once per fold and shared by all seeds.
            dtrain = lgb.Dataset(X_tr_l, label=y_tr, weight=w_tr, categorical_feature=CAT_COLS_MODEL, free_raw_data=True)
            dvalid = lgb.Dataset(X_va_l, label=y_va, weight=None, categorical_feature=CAT_COLS_MODEL, free_raw_data=True)

            for sd in LGB_SEEDS:
                name = f"lgb_s{sd}"
                if name not in oof_store:
                    oof_store[name] = _new_oof()

                params = dict(LGB_PARAMS)
                params["seed"] = int(sd + 1000*k)

                model = lgb.train(
                    params, dtrain,
                    num_boost_round=8000,
                    valid_sets=[dvalid],
                    valid_names=["valid"],
                    # Patience of 250 rounds; per-iteration logging disabled.
                    callbacks=[lgb.early_stopping(250, verbose=False), lgb.log_evaluation(0)],
                )
                pred_va = model.predict(X_va_l, num_iteration=model.best_iteration).astype(np.float32)
                oof_store[name][va_mask] = pred_va

                sc = weighted_rmse_score(y_va.astype(np.float64), pred_va.astype(np.float64), w_va_eval.astype(np.float64))
                model_registry.append(dict(family="lgbm", name=name, horizon=h_key, fold=int(k), score=float(sc)))

        gc.collect()

print("\nTraining done in", round(time.time() - t0, 1), "sec")
print("OOF variants:", len(oof_store))

def _avg_oof(names):
    """Average the OOF vectors stored under ``names`` in ``oof_store``.

    The mean is accumulated in float64 and returned as float32.
    """
    stacked = np.stack([oof_store[key] for key in names], axis=0)
    mean_pred = np.mean(stacked.astype(np.float64), axis=0)
    return mean_pred.astype(np.float32)

# Collapse each model family into one OOF vector (mean over its variants).
cb_names = [n for n in oof_store.keys() if n.startswith("cb_")]
lgb_names = [n for n in oof_store.keys() if n.startswith("lgb_")]

components = {}
if len(cb_names):  components["cb"]  = _avg_oof(cb_names)
if len(lgb_names): components["lgb"] = _avg_oof(lgb_names)
if len(components) == 0:
    raise RuntimeError("No OOF predictions built.")

# Per-horizon blend weights and scale factor, plus the blended OOF vector.
blend_weights_by_h = {}
alpha_by_h = {}
oof_pred_blend = np.zeros(len(df_train), dtype=np.float32)

y_all = df_train[TARGET_COL].to_numpy(np.float64)
# Evaluation weights: non-finite values become 0, negatives are floored at 0.
w_all = df_train[WEIGHT_COL].to_numpy(np.float64)
w_all = np.where(np.isfinite(w_all), w_all, 0.0)
w_all = np.maximum(w_all, 0.0)

for h in horizons:
    if h is None:
        idx = np.ones(len(df_train), dtype=bool)
        h_key = "__all__"
    else:
        idx = (df_train["horizon"].astype(str) == str(h))
        h_key = str(h)

    y = y_all[idx]
    w = w_all[idx]
    # One column per component, in components' insertion order.
    P = np.vstack([components[k][idx].astype(np.float64) for k in components.keys()]).T
    names = list(components.keys())

    b = fit_blend_weights(P, y, w, nonneg=True)
    blend_weights_by_h[h_key] = {names[i]: float(b[i]) for i in range(len(names))}

    # Scale factor fitted on the already-blended prediction.
    pred_bl = (P @ b).astype(np.float64)
    a = fit_alpha(y, pred_bl, w, clip=(0.0, 3.0))
    alpha_by_h[h_key] = float(a)

    oof_pred_blend[idx] = (pred_bl * a).astype(np.float32)

# NOTE(review): weights and alpha are fitted on the same OOF rows that are
# scored here, so this figure is not a fully held-out estimate.
cv_score = weighted_rmse_score(y_all, oof_pred_blend.astype(np.float64), w_all)
print("\nCV score (OOF blend):", float(cv_score))
print("blend_weights_by_h:", blend_weights_by_h)
print("alpha_by_h:", alpha_by_h)

# Snapshot of every setting the final-fit stage needs to reproduce this
# stage's models. USE_CATBOOST is recorded from the actual flag (like
# USE_LGBM) so that disabling CatBoost here is honoured downstream instead of
# being silently re-enabled by a hard-coded True.
model_cfg_used = dict(
    MODE=MODE,
    USE_RECENCY=USE_RECENCY, TAU=float(TAU), CLIP_W_Q=CLIP_W_Q,
    TRAIN_SAMPLE_CAP=int(TRAIN_SAMPLE_CAP), SAMPLE_WEIGHTED=bool(SAMPLE_WEIGHTED),
    USE_CATBOOST=bool(USE_CATBOOST), USE_LGBM=bool(USE_LGBM),
    CB_SEEDS=CB_SEEDS, LGB_SEEDS=LGB_SEEDS,
    CB_PARAM_LIST=CB_PARAM_LIST, LGB_PARAMS=LGB_PARAMS,
    FEATURE_COLS_MODEL=FEATURE_COLS_MODEL,
    CAT_COLS_MODEL=CAT_COLS_MODEL,
    CAT_FEATURE_IDXS=CAT_FEATURE_IDXS,
    blend_weights_by_h=blend_weights_by_h,
    alpha_by_h=alpha_by_h,
    cv_score=float(cv_score),
)

# Persist the configuration and the per-model validation scores as JSON.
(OUT_DIR / "stage6_cfg.json").write_text(json.dumps(model_cfg_used, indent=2))
(OUT_DIR / "stage6_model_registry.json").write_text(json.dumps(model_registry, indent=2))
print("Saved:", str(OUT_DIR / "stage6_cfg.json"))

# Publish the stage outputs as notebook globals for later cells.
globals()["oof_pred_blend"] = oof_pred_blend
globals()["blend_weights_by_h"] = blend_weights_by_h
globals()["alpha_by_h"] = alpha_by_h
globals()["model_cfg_used"] = model_cfg_used
globals()["FEATURE_COLS_MODEL"] = FEATURE_COLS_MODEL
globals()["CAT_FEATURE_IDXS"] = CAT_FEATURE_IDXS
globals()["CAT_COLS_MODEL"] = CAT_COLS_MODEL

gc.collect()


# Final Fit, Test Inference, and Submission Packaging

In [None]:
# ============================================================
# STAGE 7 — Final Fit, Test Inference, and Submission Packaging (UPGRADE v4: Ensemble + Blend)
# ============================================================

import gc, json, time
from pathlib import Path
import numpy as np
import pandas as pd

# Fail fast if an earlier notebook stage has not populated the globals this
# stage reads.
need = ["df_train","df_test","TARGET_COL","ID_COL","TIME_COL","WEIGHT_COL",
        "FEATURE_COLS_MODEL","CAT_FEATURE_IDXS","CAT_COLS_MODEL","make_sample_weight","model_cfg_used"]
for k in need:
    if k not in globals():
        raise RuntimeError(f"Missing global: {k}. Jalankan stage sebelumnya.")

from catboost import CatBoostRegressor, Pool
# LightGBM is optional; its absence only disables the LightGBM component.
try:
    import lightgbm as lgb
    _HAS_LGB = True
except Exception:
    _HAS_LGB = False

# Everything below is driven by the configuration saved by the training stage.
CFG = model_cfg_used
blend_weights_by_h = CFG["blend_weights_by_h"]
alpha_by_h = CFG["alpha_by_h"]

USE_CATBOOST = bool(CFG.get("USE_CATBOOST", True))
# LightGBM runs only if it was enabled in the config AND is importable here.
USE_LGBM = bool(CFG.get("USE_LGBM", False)) and _HAS_LGB

CB_SEEDS = CFG.get("CB_SEEDS", [42, 52, 62])
LGB_SEEDS = CFG.get("LGB_SEEDS", [41, 51])
CB_PARAM_LIST = CFG.get("CB_PARAM_LIST", [])
LGB_PARAMS = CFG.get("LGB_PARAMS", {})

# Sample-weight settings forwarded to make_sample_weight.
USE_RECENCY = bool(CFG.get("USE_RECENCY", True))
TAU = float(CFG.get("TAU", 600.0))
CLIP_W_Q = CFG.get("CLIP_W_Q", 0.9995)

MODE = CFG.get("MODE", "per_horizon")
# One entry per distinct training horizon in per-horizon mode; [None] means a
# single model set over all rows.
horizons = sorted(df_train["horizon"].unique().tolist()) if (MODE == "per_horizon" and "horizon" in df_train.columns) else [None]

# Bundle directory; fitted models are saved under final_models/.
OUT_DIR = Path("/kaggle/working/tsf_stage7_bundle_v4")
MODEL_DIR = OUT_DIR / "final_models"
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Final predictions, one slot per df_test row. Rows whose horizon is never
# processed by the loop below keep the initial 0.0.
test_pred = np.zeros(len(df_test), dtype=np.float64)

t0 = time.time()
for h in horizons:
    # Row masks of the current horizon (all rows when not in per-horizon mode).
    if h is None:
        tr_idx = np.ones(len(df_train), dtype=bool)
        te_idx = np.ones(len(df_test), dtype=bool)
        h_key = "__all__"
    else:
        tr_idx = (df_train["horizon"].astype(str) == str(h))
        te_idx = (df_test["horizon"].astype(str) == str(h)) if "horizon" in df_test.columns else np.ones(len(df_test), dtype=bool)
        h_key = str(h)

    if tr_idx.sum() == 0 or te_idx.sum() == 0:
        continue

    # Refit on ALL training rows of this horizon (no hold-out, no sub-sampling).
    X_tr = df_train.loc[tr_idx, FEATURE_COLS_MODEL]
    y_tr = df_train.loc[tr_idx, TARGET_COL].to_numpy(np.float32)
    w_tr = make_sample_weight(df_train.loc[tr_idx], use_recency=USE_RECENCY, tau=TAU, clip_w_quantile=CLIP_W_Q)

    X_te = df_test.loc[te_idx, FEATURE_COLS_MODEL]
    # Component name ("cb" / "lgb") -> mean test prediction of that family.
    comps = {}

    if USE_CATBOOST and len(CB_PARAM_LIST):
        preds = []
        for pi, pbase in enumerate(CB_PARAM_LIST):
            for sd in CB_SEEDS:
                params = dict(
                    loss_function="RMSE",
                    eval_metric="RMSE",
                    task_type="CPU",
                    thread_count=-1,
                    random_seed=int(sd + 777),
                    allow_writing_files=False,
                    **pbase
                )
                model = CatBoostRegressor(**params)
                pool_tr = Pool(X_tr, label=y_tr, weight=w_tr, cat_features=CAT_FEATURE_IDXS)
                # No eval set here, so the configured iteration count is used as is.
                model.fit(pool_tr, verbose=False)

                pred_te = model.predict(Pool(X_te, cat_features=CAT_FEATURE_IDXS)).astype(np.float64)
                preds.append(pred_te)

                model.save_model(str(MODEL_DIR / f"cb_h{h_key}_p{pi}_s{sd}.cbm"))
        comps["cb"] = np.mean(np.stack(preds, axis=0), axis=0)

    if USE_LGBM:
        # LightGBM receives categorical columns as pandas "category" dtype.
        X_tr_l = X_tr.copy()
        X_te_l = X_te.copy()
        for c in CAT_COLS_MODEL:
            if c in X_tr_l.columns:
                X_tr_l[c] = X_tr_l[c].astype("category")
                X_te_l[c] = X_te_l[c].astype("category")

        dtrain = lgb.Dataset(X_tr_l, label=y_tr, weight=w_tr, categorical_feature=CAT_COLS_MODEL, free_raw_data=True)

        preds = []
        for sd in LGB_SEEDS:
            params = dict(LGB_PARAMS)
            params["seed"] = int(sd + 777)
            # Fixed number of rounds: CFG["LGB_NUM_BOOST"] if present, else 3500.
            model = lgb.train(params, dtrain, num_boost_round=CFG.get("LGB_NUM_BOOST", 3500))
            preds.append(model.predict(X_te_l).astype(np.float64))
            model.save_model(str(MODEL_DIR / f"lgb_h{h_key}_s{sd}.txt"))
        comps["lgb"] = np.mean(np.stack(preds, axis=0), axis=0)

    if not comps:
        print(f"[WARN] H={h_key}: no model component produced predictions; its test rows stay at 0.0.")

    # Blend weights fitted on OOF predictions; equal weights if none were saved
    # for this horizon.
    wts = blend_weights_by_h.get(h_key, None)
    if wts is None:
        keys = list(comps.keys())
        wts = {k: 1.0/len(keys) for k in keys}

    # Restrict the weights to the components that actually exist. If a
    # positively-weighted component is unavailable (e.g. its library could not
    # be imported), renormalise the remaining weights so the blend is not
    # silently shrunk towards zero.
    eff_wts = {k: float(wts.get(k, 0.0)) for k in comps}
    dropped = [k for k in wts if k not in comps and float(wts[k]) > 0.0]
    if comps and dropped:
        total_w = sum(eff_wts.values())
        if total_w > 0:
            eff_wts = {k: v / total_w for k, v in eff_wts.items()}
        else:
            eff_wts = {k: 1.0 / len(comps) for k in comps}
        print(f"[WARN] H={h_key}: blend components {dropped} unavailable; weights renormalised to {eff_wts}")
    wts = eff_wts

    pred = np.zeros(len(X_te), dtype=np.float64)
    for k, v in comps.items():
        pred += wts[k] * v

    # Apply the per-horizon scale factor (1.0 when none was saved).
    pred *= float(alpha_by_h.get(h_key, 1.0))
    test_pred[te_idx] = pred

    print(f"H={h_key} | n_tr={int(tr_idx.sum())} n_te={int(te_idx.sum())} | comps={list(comps.keys())} | wts={wts} | alpha={alpha_by_h.get(h_key,1.0)}")

print("Final inference done in", round(time.time()-t0, 1), "sec")

# Write the submission file: the id column (as strings) plus a "prediction"
# column, in df_test row order.
SUB_PATH = Path("/kaggle/working/submission.csv")
sub = pd.DataFrame({ID_COL: df_test[ID_COL].astype(str).values, "prediction": test_pred.astype(np.float64)})
sub.to_csv(SUB_PATH, index=False)
print("Saved submission:", str(SUB_PATH), "| shape:", sub.shape)

# Metadata describing how the predictions were produced.
bundle = dict(
    # Seconds since the epoch as returned by time.time().
    created_utc=time.time(),
    cfg=model_cfg_used,
    features=FEATURE_COLS_MODEL,
    cat_cols=CAT_COLS_MODEL,
    id_col=ID_COL, time_col=TIME_COL, weight_col=WEIGHT_COL, target_col=TARGET_COL,
    train_shape=list(df_train.shape),
    test_shape=list(df_test.shape),
)
OUT_DIR.mkdir(parents=True, exist_ok=True)
(OUT_DIR / "bundle.json").write_text(json.dumps(bundle, indent=2))
print("Saved bundle:", str(OUT_DIR / "bundle.json"))

# Publish the stage outputs as notebook globals.
globals()["test_pred"] = test_pred
globals()["SUB_PATH"] = str(SUB_PATH)
globals()["BUNDLE_DIR"] = str(OUT_DIR)

gc.collect()
