# Load Data & Initial Inspection

In [2]:
# ============================================================
# STAGE 1 — Load Data & Initial Inspection (FULL REVISION v3, RAM-SAFE + ROBUST)
# - weight is required in train, optional in test
# - FEAT_COLS = intersection(train,test) - {id, weight, target}
# Output globals:
#   df_train, df_test, TARGET_COL, ID_COL, WEIGHT_COL, TIME_COL
#   BASE_CATS, CAT_COLS, SERIES_KEYS
#   FEAT_COLS, NUM_COLS, BASE_NUM_COLS, DROP_COLS
# ============================================================

import gc
from pathlib import Path
import numpy as np
import pandas as pd

# Widen pandas display limits so wide frames print without column truncation.
pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 200)

# Seed NumPy's global RNG with a fixed value.
SEED = 42
np.random.seed(SEED)

# Input parquet files (paths look like Kaggle dataset mounts).
TRAIN_PATH = Path("/kaggle/input/ts-forecasting/train.parquet")
TEST_PATH  = Path("/kaggle/input/ts-forecasting/test.parquet")

# Fail fast with an explicit message if either input file is absent.
for p in [TRAIN_PATH, TEST_PATH]:
    if not p.exists():
        raise FileNotFoundError(f"Missing file: {p}")

print("Loading parquet...")
# Both frames are loaded fully into memory.
df_train = pd.read_parquet(TRAIN_PATH)
df_test  = pd.read_parquet(TEST_PATH)

print("\n==================== BASIC SHAPES ====================")
print("train:", df_train.shape)
print("test :", df_test.shape)

# ----------------------------
# 0) Robust key column detection
# ----------------------------
def pick_first_existing(cols, candidates):
    """Return the first entry of `candidates` that is contained in `cols`.

    Candidates are tried in the order given, so earlier names take priority.
    Returns None when no candidate is present.
    """
    return next((name for name in candidates if name in cols), None)

# ID column: prefer an exact "id"/"ID" name; otherwise fall back to the first train column.
ID_COL = pick_first_existing(df_train.columns, ["id", "ID"])
if ID_COL is None:
    ID_COL = df_train.columns[0]
    print(f"[WARN] id column not found by name; fallback to first col: {ID_COL}")

# TIME column: try the known exact names first, in priority order.
TIME_COL = pick_first_existing(df_train.columns, ["ts_index", "time_index", "t", "time"])
if TIME_COL is None:
    # Heuristic: first column whose lowercased name contains both 'ts' and 'index'.
    heur = [c for c in df_train.columns if ("ts" in str(c).lower() and "index" in str(c).lower())]
    TIME_COL = heur[0] if heur else None
if TIME_COL is None:
    # Unlike the id column there is no positional fallback for time: abort.
    raise RuntimeError("Could not detect TIME_COL (ts_index).")

# WEIGHT (required in train, optional in test)
def detect_weight_col(cols):
    """Guess which entry of `cols` holds the sample weight.

    Lookup order:
      1. the exact name "weight";
      2. the common alternatives "w", "sample_weight", "weights", "wgt", "wt";
      3. the first column whose lowercased name contains "weight".

    Returns the matching column name, or None when nothing matches.
    """
    exact_names = ("weight", "w", "sample_weight", "weights", "wgt", "wt")
    for name in exact_names:
        if name in cols:
            return name
    # Substring fallback; str() keeps non-string column labels from raising.
    return next((col for col in cols if "weight" in str(col).lower()), None)

# Detect the weight column on TRAIN only; test is not consulted here.
WEIGHT_COL = detect_weight_col(df_train.columns)
if WEIGHT_COL is None:
    # No weight column found: synthesize a constant-1.0 column so later code can
    # still index df_train[WEIGHT_COL].
    print("[WARN] weight column not found in TRAIN. Will create WEIGHT_COL=1.0 (not ideal).")
    WEIGHT_COL = "__weight__"
    df_train[WEIGHT_COL] = 1.0

# Base categorical columns; each one is used below only if it is actually present.
BASE_CATS = ["code", "sub_code", "sub_category", "horizon"]

print("\n==================== KEY COLS ====================")
print("ID_COL     :", ID_COL)
print("TIME_COL   :", TIME_COL)
# Also report whether the weight column exists in each frame.
print("WEIGHT_COL :", WEIGHT_COL, "| present in train:", WEIGHT_COL in df_train.columns, "| present in test:", WEIGHT_COL in df_test.columns)

# ----------------------------
# 1) Detect TARGET_COL (train-only numeric)
# ----------------------------
# Columns that exist only in train are the target candidates.
train_only = [c for c in df_train.columns if c not in df_test.columns]
# Remove obvious non-targets: the key columns and the base categoricals.
train_only = [c for c in train_only if c not in [ID_COL, WEIGHT_COL, TIME_COL] + BASE_CATS]

# First choice: a conventionally named column present in train and absent from test.
preferred_names = ["target", "y", "label", "value", "prediction_target"]
TARGET_COL = None
for nm in preferred_names:
    if nm in df_train.columns and nm not in df_test.columns:
        TARGET_COL = nm
        break

if TARGET_COL is None:
    # Second choice: numeric train-only columns.
    cand = [c for c in train_only if pd.api.types.is_numeric_dtype(df_train[c])]
    if len(cand) == 1:
        TARGET_COL = cand[0]
    elif len(cand) > 1:
        # Ambiguous: pick the candidate with the largest NaN-ignoring variance and warn.
        vars_ = {c: float(np.nanvar(df_train[c].to_numpy(np.float64))) for c in cand}
        TARGET_COL = sorted(vars_.items(), key=lambda kv: kv[1], reverse=True)[0][0]
        print("\n[WARN] Multiple numeric train-only cols found; picked by variance:", TARGET_COL)
    else:
        # Last resort: accept a single non-numeric train-only column.
        if len(train_only) == 1:
            TARGET_COL = train_only[0]
            print("\n[WARN] Non-numeric train-only target picked:", TARGET_COL)
        else:
            # Zero or several non-numeric candidates: cannot decide automatically.
            raise RuntimeError(f"Could not detect target. train_only candidates: {train_only[:20]}")

print("TARGET_COL :", TARGET_COL)

# ----------------------------
# 2) RAM-safe downcast numeric
# ----------------------------
def downcast_numeric(df: pd.DataFrame, skip_cols=None) -> pd.DataFrame:
    """Return a copy of `df` with float and integer columns downcast.

    Float columns go through ``pd.to_numeric(..., downcast="float")`` and
    integer columns through ``pd.to_numeric(..., downcast="integer")``.
    Columns of any other dtype are left as they are.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame. It is copied first and is not modified.
    skip_cols : str or iterable of column labels, optional
        Columns to leave untouched. When omitted, the module-level ``ID_COL``
        is skipped if it is defined (the historical behaviour); if it is not
        defined, no column is skipped.

    Returns
    -------
    pd.DataFrame
        A new frame with the same columns and downcast numeric dtypes.
    """
    if skip_cols is None:
        # Backward compatible default: honour the notebook-level ID_COL when
        # present instead of raising NameError in standalone use.
        default_skip = globals().get("ID_COL")
        skip = set() if default_skip is None else {default_skip}
    elif isinstance(skip_cols, str):
        # A bare string is one column label, not an iterable of characters.
        skip = {skip_cols}
    else:
        skip = set(skip_cols)

    out = df.copy()
    for c in out.columns:
        if c in skip:
            continue
        if pd.api.types.is_float_dtype(out[c]):
            out[c] = pd.to_numeric(out[c], downcast="float")
        elif pd.api.types.is_integer_dtype(out[c]):
            out[c] = pd.to_numeric(out[c], downcast="integer")
    return out

# Replace both frames with their downcast copies.
df_train = downcast_numeric(df_train)
df_test  = downcast_numeric(df_test)

# ----------------------------
# 3) Feature columns = intersection(train,test) excluding id/weight/target
# ----------------------------
DROP_COLS = {ID_COL, TARGET_COL, WEIGHT_COL}

# Keep train column order; a column is a feature only if test has it too.
common_cols = [c for c in df_train.columns if c in df_test.columns]
FEAT_COLS = [c for c in common_cols if c not in DROP_COLS]

# Categorical columns = base cats (if in FEAT_COLS) + any object/category (also must be in FEAT_COLS)
CAT_COLS = [c for c in BASE_CATS if c in FEAT_COLS]
for c in FEAT_COLS:
    if c in CAT_COLS:
        continue
    if pd.api.types.is_object_dtype(df_train[c]) or str(df_train[c].dtype).startswith("category"):
        CAT_COLS.append(c)

# Unify categories across train+test so both frames share one category set per column.
# Every CAT_COLS entry becomes Categorical here, including base cats that were
# stored as numbers, so none of them passes the numeric check for NUM_COLS below.
for c in CAT_COLS:
    if c in df_train.columns and c in df_test.columns:
        both = pd.concat([df_train[c], df_test[c]], axis=0, ignore_index=True)
        both = both.astype("category")
        df_train[c] = pd.Categorical(df_train[c], categories=both.cat.categories)
        df_test[c]  = pd.Categorical(df_test[c],  categories=both.cat.categories)

# Numeric feature cols (dtype judged on train; TIME_COL is still included here).
NUM_COLS = [c for c in FEAT_COLS if pd.api.types.is_numeric_dtype(df_train[c])]

# Prefer anonymized base numeric features for later FE expansion
BASE_NUM_COLS = [c for c in NUM_COLS if str(c).startswith("feature_")]
if len(BASE_NUM_COLS) < 10:
    # Too few "feature_*" columns: fall back to every numeric feature except the time column.
    BASE_NUM_COLS = [c for c in NUM_COLS if c != TIME_COL]

# Series keys for sequential FE (same names as BASE_CATS, filtered to available features).
SERIES_KEYS = [c for c in ["code", "sub_code", "sub_category", "horizon"] if c in FEAT_COLS]

# ----------------------------
# 4) Quick checks
# ----------------------------
print("\n==================== QUICK CHECKS ====================")
# Unique id count vs. row count for each frame.
print("train id unique:", int(df_train[ID_COL].nunique()), "/", len(df_train))
print("test  id unique:", int(df_test[ID_COL].nunique()),  "/", len(df_test))

tr_min, tr_max = int(df_train[TIME_COL].min()), int(df_train[TIME_COL].max())
te_min, te_max = int(df_test[TIME_COL].min()),  int(df_test[TIME_COL].max())
print("train ts range:", tr_min, "->", tr_max)
print("test  ts range:", te_min, "->", te_max)
# Strict comparison: a test timestamp equal to the last train timestamp is an
# overlap, not "after train" (STAGE 2 treats TEST_MIN_TS <= TRAIN_MAX_TS as overlap).
print("test_after_train_max?:", te_min > tr_max)

# Missing rates: per-feature NaN fraction, ten highest per frame.
print("\nMissing rate top 10 (train):")
print(df_train[FEAT_COLS].isna().mean().sort_values(ascending=False).head(10))
print("\nMissing rate top 10 (test):")
print(df_test[FEAT_COLS].isna().mean().sort_values(ascending=False).head(10))

# Target stats (NaN-ignoring aggregates; the finite count is reported separately).
y = df_train[TARGET_COL].to_numpy(np.float64)
print("\n==================== TARGET STATS ====================")
print("finite:", int(np.isfinite(y).sum()), "/", len(y))
print("mean:", float(np.nanmean(y)), "std:", float(np.nanstd(y)))
print("p1:", float(np.nanpercentile(y, 1)), "p50:", float(np.nanpercentile(y, 50)), "p99:", float(np.nanpercentile(y, 99)))

# Weight stats: non-finite weights are replaced by 0 before summarising.
w = df_train[WEIGHT_COL].to_numpy(np.float64)
w = np.where(np.isfinite(w), w, 0.0)
print("\n==================== WEIGHT STATS (TRAIN ONLY) ====================")
print("min:", float(np.min(w)), "p50:", float(np.percentile(w, 50)), "p99:", float(np.percentile(w, 99)), "max:", float(np.max(w)))

print("\n==================== FEATURE SET SUMMARY ====================")
print("SERIES_KEYS:", SERIES_KEYS)
print("CAT_COLS   :", CAT_COLS)
print("NUM_COLS   :", len(NUM_COLS))
print("BASE_NUM_COLS(for FE):", len(BASE_NUM_COLS))
print("FEAT_COLS  :", len(FEAT_COLS))

# `display` is not imported in this cell; it is assumed to be supplied by the
# notebook environment.
print("\n==================== HEAD (train) ====================")
display(df_train.head(3))
print("\n==================== HEAD (test) ====================")
display(df_test.head(3))

# Publish the stage outputs into the module namespace. Every name below was
# already assigned at top level in this cell, so this rebinds each one to the
# same object; it mainly serves as an explicit list of what this stage exports.
globals().update({
    "df_train": df_train,
    "df_test": df_test,
    "TARGET_COL": TARGET_COL,
    "ID_COL": ID_COL,
    "WEIGHT_COL": WEIGHT_COL,
    "TIME_COL": TIME_COL,
    "BASE_CATS": BASE_CATS,
    "CAT_COLS": CAT_COLS,
    "SERIES_KEYS": SERIES_KEYS,
    "FEAT_COLS": FEAT_COLS,
    "NUM_COLS": NUM_COLS,
    "BASE_NUM_COLS": BASE_NUM_COLS,
    "DROP_COLS": DROP_COLS,
    "SEED": SEED,
})

# Run a garbage-collection pass after loading and copying the frames.
gc.collect()


Loading parquet...

train: (5337414, 94)
test : (1447107, 92)

ID_COL     : id
TIME_COL   : ts_index
WEIGHT_COL : weight | present in train: True | present in test: False
TARGET_COL : y_target

train id unique: 5337414 / 5337414
test  id unique: 1447107 / 1447107
train ts range: 1 -> 3601
test  ts range: 3602 -> 4376
test_after_train_max?: True

Missing rate top 10 (train):
feature_at    0.124719
feature_by    0.110192
feature_ay    0.085420
feature_cd    0.074964
feature_ce    0.051678
feature_cf    0.044289
feature_al    0.042233
feature_aw    0.038444
feature_bz    0.028426
feature_bi    0.027622
dtype: float64

Missing rate top 10 (test):
feature_y     0.385765
feature_x     0.385765
feature_w     0.385765
feature_z     0.385765
feature_at    0.092342
feature_by    0.092043
feature_ay    0.057988
feature_cd    0.057969
feature_aw    0.038026
feature_bz    0.037755
dtype: float64

finite: 5337414 / 5337414
mean: -0.6659048370404826 std: 32.52763850194126
p1: -82.79721488952636 p50: 

Unnamed: 0,id,code,sub_code,sub_category,horizon,ts_index,feature_a,feature_b,feature_c,feature_d,feature_e,feature_f,feature_g,feature_h,feature_i,feature_j,feature_k,feature_l,feature_m,feature_n,feature_o,feature_p,feature_q,feature_r,feature_s,feature_t,feature_u,feature_v,feature_w,feature_x,feature_y,feature_z,feature_aa,feature_ab,feature_ac,feature_ad,feature_ae,feature_af,feature_ag,feature_ah,feature_ai,feature_aj,feature_ak,feature_al,feature_am,feature_an,feature_ao,feature_ap,feature_aq,feature_ar,feature_as,feature_at,feature_au,feature_av,feature_aw,feature_ax,feature_ay,feature_az,feature_ba,feature_bb,feature_bc,feature_bd,feature_be,feature_bf,feature_bg,feature_bh,feature_bi,feature_bj,feature_bk,feature_bl,feature_bm,feature_bn,feature_bo,feature_bp,feature_bq,feature_br,feature_bs,feature_bt,feature_bu,feature_bv,feature_bw,feature_bx,feature_by,feature_bz,feature_ca,feature_cb,feature_cc,feature_cd,feature_ce,feature_cf,feature_cg,feature_ch,y_target,weight
0,W2MW3G2L__J0G2B0KU__PZ9S1Z4V__25__89,W2MW3G2L,J0G2B0KU,PZ9S1Z4V,25,89,29,16.364094,7.464023,5.966933,1.622184,10.26136,4.914369,0.000467,0.023686,0.006409,0.000187,0.744244,2.001013,-0.01687,0.009892,0.013162,0.021502,0.901966,0.402125,0.038566,0.177947,0.091141,-84.968735,-1.765306,10.109641,145.320404,0.08958,0.868698,0.080088,0.101631,0.026555,0.092776,0.004,1.298972,7.321646,3.628258,0.453027,-0.080212,0.192181,0.510727,17.136629,0.267856,7.745722,4.037853,4.856791,,5.188995,79.423474,244.471191,13.848771,,0.01707,0.709292,21.80395,0.120968,26999.430482,34126.269444,791.709562,0.15467,9499.742248,1.266071,429.318704,2540.88981,0.008927,1.122459,23.815924,0.54985,0.067941,0.076033,0.02759,-0.47269,-0.202944,-3.769914,0.104535,3.040304,4.499546,,-0.058543,-0.001686,-0.105328,-0.005045,,-0.133697,2.849819,0.112068,1,-0.551324,40.982572
1,W2MW3G2L__J0G2B0KU__PZ9S1Z4V__1__89,W2MW3G2L,J0G2B0KU,PZ9S1Z4V,1,89,53,2.858806,5.050617,15.906651,10.879453,3.072151,4.091032,0.000467,0.023686,0.006409,0.000187,0.744244,2.001013,-0.01687,0.009892,0.013162,0.021502,0.901966,0.402125,0.038566,0.177947,0.091141,-84.968735,-1.765306,10.109641,145.320404,0.08958,0.868698,0.080088,0.101631,0.026555,0.092776,0.004,1.298972,7.321646,3.628258,0.453027,0.00148,0.192181,0.510727,17.136629,0.267856,7.745722,4.037853,4.856791,,5.188995,79.423474,244.471191,13.848771,,0.01707,0.709292,21.80395,0.120968,26999.430482,34126.269444,791.709562,0.15467,9499.742248,1.266071,429.318704,2540.88981,0.008927,1.122459,23.815924,0.54985,0.067941,0.076033,0.02759,-0.47269,-0.202944,-3.769914,0.104535,3.040304,4.499546,,-0.058543,-0.001686,-0.105328,-0.005045,,-0.133697,2.849819,0.112068,1,-0.315583,150.075406
2,W2MW3G2L__J0G2B0KU__PZ9S1Z4V__3__89,W2MW3G2L,J0G2B0KU,PZ9S1Z4V,3,89,51,9.585452,1.076268,9.004147,16.74049,15.166901,11.427982,0.000467,0.023686,0.006409,0.000187,0.744244,2.001013,-0.01687,0.009892,0.013162,0.021502,0.901966,0.402125,0.038566,0.177947,0.091141,-84.968735,-1.765306,10.109641,145.320404,0.08958,0.868698,0.080088,0.101631,0.026555,0.092776,0.004,1.298972,7.321646,3.628258,0.453027,-0.045494,0.192181,0.510727,17.136629,0.267856,7.745722,4.037853,4.856791,,5.188995,79.423474,244.471191,13.848771,,0.01707,0.709292,21.80395,0.120968,26999.430482,34126.269444,791.709562,0.15467,9499.742248,1.266071,429.318704,2540.88981,0.008927,1.122459,23.815924,0.54985,0.067941,0.076033,0.02759,-0.47269,-0.202944,-3.769914,0.104535,3.040304,4.499546,,-0.058543,-0.001686,-0.105328,-0.005045,,-0.133697,2.849819,0.112068,1,-0.362894,115.953552





Unnamed: 0,id,code,sub_code,sub_category,horizon,ts_index,feature_a,feature_b,feature_c,feature_d,feature_e,feature_f,feature_g,feature_h,feature_i,feature_j,feature_k,feature_l,feature_m,feature_n,feature_o,feature_p,feature_q,feature_r,feature_s,feature_t,feature_u,feature_v,feature_w,feature_x,feature_y,feature_z,feature_aa,feature_ab,feature_ac,feature_ad,feature_ae,feature_af,feature_ag,feature_ah,feature_ai,feature_aj,feature_ak,feature_al,feature_am,feature_an,feature_ao,feature_ap,feature_aq,feature_ar,feature_as,feature_at,feature_au,feature_av,feature_aw,feature_ax,feature_ay,feature_az,feature_ba,feature_bb,feature_bc,feature_bd,feature_be,feature_bf,feature_bg,feature_bh,feature_bi,feature_bj,feature_bk,feature_bl,feature_bm,feature_bn,feature_bo,feature_bp,feature_bq,feature_br,feature_bs,feature_bt,feature_bu,feature_bv,feature_bw,feature_bx,feature_by,feature_bz,feature_ca,feature_cb,feature_cc,feature_cd,feature_ce,feature_cf,feature_cg,feature_ch
0,W2MW3G2L__495MGHFJ__PZ9S1Z4V__3__3647,W2MW3G2L,495MGHFJ,PZ9S1Z4V,3,3647,95,10.365266,3.209321,8.10934,9.043471,10.123041,15.722121,0.000243,0.021819,0.00142,7.3e-05,0.572125,1.265875,1.341192,0.005564,0.011987,0.035243,0.833918,1.791284,0.020539,0.218876,0.08066,-50.981239,-4.854592,-8.087713,119.237251,0.040442,0.635006,0.105355,0.075415,0.03444,0.09455,0.006728,1.986904,4.411098,3.050746,0.484755,0.020247,0.186578,0.528456,15.395411,0.219483,4.83955,2.420423,2.652015,0.0,4.151196,1012.649294,425.853042,197.344987,209.253182,0.016366,0.552138,108.859861,2.369993,66589.814887,34282.221003,1316.738008,0.04801,11660.961097,0.116372,11.122246,716.158132,0.008559,1.772256,38.452076,0.872948,0.06611,0.078856,0.030888,-0.480743,-0.197747,-3.659776,0.100295,3.131395,4.554258,-0.000832,-0.032241,-0.00083,-0.058961,-0.002774,-0.00148,-0.25646,1.665532,0.071324,2
1,W2MW3G2L__495MGHFJ__PZ9S1Z4V__10__3647,W2MW3G2L,495MGHFJ,PZ9S1Z4V,10,3647,88,2.571476,15.234848,16.505699,0.230426,10.145378,10.15964,0.000243,0.021819,0.00142,7.3e-05,0.572125,1.265875,1.341192,0.005564,0.011987,0.035243,0.833918,1.791284,0.020539,0.218876,0.08066,-50.981239,-4.854592,-8.087713,119.237251,0.040442,0.635006,0.105355,0.075415,0.03444,0.09455,0.006728,1.986904,4.411098,3.050746,0.484755,0.052623,0.186578,0.528456,15.395411,0.219483,4.83955,2.420423,2.652015,0.0,4.151196,1012.649294,425.853042,197.344987,209.253182,0.016366,0.552138,108.859861,2.369993,66589.814887,34282.221003,1316.738008,0.04801,11660.961097,0.116372,11.122246,716.158132,0.008559,1.772256,38.452076,0.872948,0.06611,0.078856,0.030888,-0.480743,-0.197747,-3.659776,0.100295,3.131395,4.554258,-0.000832,-0.032241,-0.00083,-0.058961,-0.002774,-0.00148,-0.25646,1.665532,0.071324,2
2,W2MW3G2L__495MGHFJ__PZ9S1Z4V__25__3647,W2MW3G2L,495MGHFJ,PZ9S1Z4V,25,3647,71,5.524709,6.931663,8.939537,0.668187,16.578701,3.150691,0.000243,0.021819,0.00142,7.3e-05,0.572125,1.265875,1.341192,0.005564,0.011987,0.035243,0.833918,1.791284,0.020539,0.218876,0.08066,-50.981239,-4.854592,-8.087713,119.237251,0.040442,0.635006,0.105355,0.075415,0.03444,0.09455,0.006728,1.986904,4.411098,3.050746,0.484755,0.041667,0.186578,0.528456,15.395411,0.219483,4.83955,2.420423,2.652015,0.0,4.151196,1012.649294,425.853042,197.344987,209.253182,0.016366,0.552138,108.859861,2.369993,66589.814887,34282.221003,1316.738008,0.04801,11660.961097,0.116372,11.122246,716.158132,0.008559,1.772256,38.452076,0.872948,0.06611,0.078856,0.030888,-0.480743,-0.197747,-3.659776,0.100295,3.131395,4.554258,-0.000832,-0.032241,-0.00083,-0.058961,-0.002774,-0.00148,-0.25646,1.665532,0.071324,2


0

# Sanity Checks & Leakage Rules Setup

In [3]:
# ============================================================
# STAGE 2 — Sanity Checks & Leakage Rules Setup (FULL REVISION v2, RANK1-READY)
# Assumes STAGE 1 already ran and created:
#   df_train, df_test, TARGET_COL, ID_COL, TIME_COL, WEIGHT_COL, CAT_COLS, FEAT_COLS, SERIES_KEYS
#
# Outputs/Globals:
#   DO_NOT_USE_COLS
#   FEATURE_COLS_CAT, FEATURE_COLS_NUM, FEATURE_COLS_ALL
#   TRAIN_MIN_TS, TRAIN_MAX_TS, TEST_MIN_TS, TEST_MAX_TS
#   HAS_WEIGHT_TRAIN, HAS_WEIGHT_TEST
#   DROP_TARGET_NAN (bool), N_TARGET_NAN
#   ALLOW_TIME_FEATURES (bool)
# ============================================================

import gc
import numpy as np
import pandas as pd

# ----------------------------
# 0) Require STAGE 1 globals
# ----------------------------
# Names this stage expects STAGE 1 to have defined at module level.
need = ["df_train","df_test","TARGET_COL","ID_COL","TIME_COL","CAT_COLS","FEAT_COLS","WEIGHT_COL","SERIES_KEYS"]
for k in need:
    if k not in globals():
        # Message text (Indonesian): "Run STAGE 1 first."
        raise RuntimeError(f"Missing global '{k}'. Jalankan STAGE 1 dulu.")

# Both inputs must be DataFrames before any column check is attempted.
assert isinstance(df_train, pd.DataFrame) and isinstance(df_test, pd.DataFrame)

print("==================== STAGE 2: SANITY (v2) ====================")
print("ID_COL:", ID_COL, "| TIME_COL:", TIME_COL, "| WEIGHT_COL:", WEIGHT_COL, "| TARGET_COL:", TARGET_COL)

# ----------------------------
# 1) Core column existence
# ----------------------------
# Train needs id, time and target; test needs only id and time.
must_in_train = [ID_COL, TIME_COL, TARGET_COL]
must_in_test  = [ID_COL, TIME_COL]
for c in must_in_train:
    if c not in df_train.columns:
        raise RuntimeError(f"Train missing required column: {c}")
for c in must_in_test:
    if c not in df_test.columns:
        raise RuntimeError(f"Test missing required column: {c}")

# Weight presence is recorded per frame rather than required.
HAS_WEIGHT_TRAIN = WEIGHT_COL in df_train.columns
HAS_WEIGHT_TEST  = WEIGHT_COL in df_test.columns

print("Has weight in train:", HAS_WEIGHT_TRAIN, "| in test:", HAS_WEIGHT_TEST)
if HAS_WEIGHT_TEST:
    # Warning text (Indonesian): weight is still forbidden as a feature.
    print("[WARN] weight exists in test too; tetap dilarang sebagai fitur.")

# ----------------------------
# 2) ID uniqueness checks
# ----------------------------
ntr = len(df_train)
nts = len(df_test)

# dropna=False counts NaN as a value, so missing ids cannot hide duplicates.
nuniq_tr = df_train[ID_COL].nunique(dropna=False)
nuniq_ts = df_test[ID_COL].nunique(dropna=False)

# Duplicate ids are fatal; up to 10 offending ids are included in the error.
if nuniq_tr != ntr:
    dup = df_train.loc[df_train[ID_COL].duplicated(keep=False), ID_COL].head(10).tolist()
    raise RuntimeError(f"Train id not unique: {nuniq_tr}/{ntr}. Example dups: {dup}")
if nuniq_ts != nts:
    dup = df_test.loc[df_test[ID_COL].duplicated(keep=False), ID_COL].head(10).tolist()
    raise RuntimeError(f"Test id not unique: {nuniq_ts}/{nts}. Example dups: {dup}")

# It's fine if ids don't overlap; warn if overlap.
# Ids are compared as strings, so ids of different dtypes in train and test still match.
intersect = np.intersect1d(df_train[ID_COL].astype(str).values, df_test[ID_COL].astype(str).values)
if len(intersect) > 0:
    print(f"[WARN] Train/Test share {len(intersect)} ids. Example:", intersect[:5])

print("ID uniqueness: OK")

# ----------------------------
# 3) Time ordering checks
# ----------------------------
# The time column must be numeric in both frames for the range logic below.
if not pd.api.types.is_numeric_dtype(df_train[TIME_COL]):
    raise RuntimeError(f"{TIME_COL} in train is not numeric.")
if not pd.api.types.is_numeric_dtype(df_test[TIME_COL]):
    raise RuntimeError(f"{TIME_COL} in test is not numeric.")

# NaN-ignoring min/max, stored as Python ints.
TRAIN_MIN_TS = int(np.nanmin(df_train[TIME_COL].values))
TRAIN_MAX_TS = int(np.nanmax(df_train[TIME_COL].values))
TEST_MIN_TS  = int(np.nanmin(df_test[TIME_COL].values))
TEST_MAX_TS  = int(np.nanmax(df_test[TIME_COL].values))

print("Train ts range:", TRAIN_MIN_TS, "->", TRAIN_MAX_TS)
print("Test  ts range:", TEST_MIN_TS,  "->", TEST_MAX_TS)

# Overlap (test starting at or before the last train timestamp) only warns; it is not fatal.
# Warning text (Indonesian): possible overlap, make sure time-series features
# are shifted to avoid look-ahead.
if TEST_MIN_TS <= TRAIN_MAX_TS:
    print("[WARN] Test min ts_index <= Train max ts_index. Bisa overlap. Pastikan fitur time-series di-shift agar anti look-forward.")
else:
    print("Time ordering: OK (test after train).")

# time feature policy: we won't use raw ts_index as feature by default, but we WILL engineer age/bin in Stage 5
ALLOW_TIME_FEATURES = True

# ----------------------------
# 4) Leakage rules + feature lists (SAFE for inference)
# ----------------------------
# IMPORTANT: FEAT_COLS from Stage 1 is already intersection(train,test) minus id/target/weight,
# so use FEAT_COLS as the feature universe to avoid "missing in test" problems.
# "fold" is listed as well so a fold column can never be picked up as a feature.
DO_NOT_USE_COLS = set([ID_COL, TARGET_COL, WEIGHT_COL, "fold"])

# Categoricals: only those that are in FEAT_COLS
FEATURE_COLS_CAT = [c for c in CAT_COLS if (c in FEAT_COLS) and (c not in DO_NOT_USE_COLS)]

# Numerics: only numeric columns that are in FEAT_COLS (dtype judged on train)
FEATURE_COLS_NUM = [c for c in FEAT_COLS
                    if (c not in DO_NOT_USE_COLS)
                    and pd.api.types.is_numeric_dtype(df_train[c])]

# Exclude raw ts_index from numeric features (we'll engineer time features later)
FEATURE_COLS_NUM = [c for c in FEATURE_COLS_NUM if c != TIME_COL]

# Final feature order: categoricals first, then numerics.
FEATURE_COLS_ALL = FEATURE_COLS_CAT + FEATURE_COLS_NUM

print("\n==================== FEATURE LISTS ====================")
print("SERIES_KEYS:", SERIES_KEYS)
print("Categorical features:", FEATURE_COLS_CAT)
print("Numeric features (excluding ts_index):", len(FEATURE_COLS_NUM))
print("Total features:", len(FEATURE_COLS_ALL))

# ----------------------------
# 5) Integrity checks
# ----------------------------
# Ensure all features exist in test
# Every selected feature must exist in test; a miss means FEAT_COLS was not a true intersection.
missing_in_test = [c for c in FEATURE_COLS_ALL if c not in df_test.columns]
if missing_in_test:
    raise RuntimeError(f"Some feature columns missing in test (shouldn't happen if FEAT_COLS is intersection): {missing_in_test[:10]}")

# Target NaNs: counted here and flagged; no rows are dropped in this block.
N_TARGET_NAN = int(df_train[TARGET_COL].isna().sum())
DROP_TARGET_NAN = (N_TARGET_NAN > 0)
print("\nTarget NaN count:", N_TARGET_NAN, "| drop later:", DROP_TARGET_NAN)

# Weight sanity (train)
if HAS_WEIGHT_TRAIN:
    w = df_train[WEIGHT_COL].to_numpy(np.float64)
    # Non-finite weights are counted as 0, so they add to the zero rate below.
    w = np.where(np.isfinite(w), w, 0.0)
    w_neg = float(np.mean(w < 0))
    w_zero = float(np.mean(w == 0))
    print("\n==================== WEIGHT SANITY (TRAIN) ====================")
    print("min:", float(np.min(w)), "p50:", float(np.percentile(w, 50)), "p99:", float(np.percentile(w, 99)), "max:", float(np.max(w)))
    print("negative rate:", w_neg, "| zero rate:", w_zero)
    if w_neg > 0:
        # Only a warning here; no clipping is performed in this block.
        print("[WARN] Negative weights exist. We'll clip to >=0 later.")

# ----------------------------
# 6) Extra diagnostics (helps rank-1 tuning)
# ----------------------------
# Cardinality of key categorical cols (drift hint)
# Per-categorical unique-value counts in train vs. test (NaN counted as a value).
print("\n==================== CATEGORY CARDINALITY ====================")
for c in FEATURE_COLS_CAT:
    tr_u = int(df_train[c].nunique(dropna=False))
    te_u = int(df_test[c].nunique(dropna=False))
    print(f"{c:15s} | train uniq={tr_u:6d} | test uniq={te_u:6d}")

# Quick ts density check: number of distinct timestamps per frame.
print("\n==================== TS DENSITY (unique ts_index) ====================")
print("train unique ts:", int(df_train[TIME_COL].nunique()), "| test unique ts:", int(df_test[TIME_COL].nunique()))

# Printed reminders (Indonesian text), in English:
# - never use WEIGHT as a feature, only as sample_weight;
# - time-series features must be computed per group and shifted by 1 (no future data);
# - encoders/imputers/scalers are fit on the train fold only, then applied to valid/test;
# - never compute statistics from ts_index > t when predicting t.
print("\n==================== LEAKAGE RULES (REMINDER) ====================")
print("- Jangan gunakan WEIGHT sebagai fitur (hanya sample_weight).")
print("- FE time-series wajib computed per group dan SHIFT(1) (no future).")
print("- Encoder/imputer/scaler fit hanya di TRAIN-fold, apply ke valid/test.")
print("- Hindari compute statistik menggunakan data ts_index > t untuk prediksi t.")

# Publish this stage's outputs into the module namespace. Every name was already
# assigned at top level above, so this rebinds each one to the same object.
globals().update({
    "DO_NOT_USE_COLS": DO_NOT_USE_COLS,
    "FEATURE_COLS_CAT": FEATURE_COLS_CAT,
    "FEATURE_COLS_NUM": FEATURE_COLS_NUM,
    "FEATURE_COLS_ALL": FEATURE_COLS_ALL,
    "TRAIN_MIN_TS": TRAIN_MIN_TS,
    "TRAIN_MAX_TS": TRAIN_MAX_TS,
    "TEST_MIN_TS": TEST_MIN_TS,
    "TEST_MAX_TS": TEST_MAX_TS,
    "HAS_WEIGHT_TRAIN": HAS_WEIGHT_TRAIN,
    "HAS_WEIGHT_TEST": HAS_WEIGHT_TEST,
    "DROP_TARGET_NAN": DROP_TARGET_NAN,
    "N_TARGET_NAN": N_TARGET_NAN,
    "ALLOW_TIME_FEATURES": ALLOW_TIME_FEATURES,
})

gc.collect()


ID_COL: id | TIME_COL: ts_index | WEIGHT_COL: weight | TARGET_COL: y_target
Has weight in train: True | in test: False
ID uniqueness: OK
Train ts range: 1 -> 3601
Test  ts range: 3602 -> 4376
Time ordering: OK (test after train).

SERIES_KEYS: ['code', 'sub_code', 'sub_category', 'horizon']
Categorical features: ['code', 'sub_code', 'sub_category', 'horizon']
Numeric features (excluding ts_index): 86
Total features: 90

Target NaN count: 0 | drop later: False

min: 0.0 p50: 1699.3843705131449 p99: 303840772.74105984 max: 13912217783333.135
negative rate: 0.0 | zero rate: 0.0009332234673945098

code            | train uniq=    23 | test uniq=    23
sub_code        | train uniq=   180 | test uniq=    47
sub_category    | train uniq=     5 | test uniq=     5
horizon         | train uniq=     4 | test uniq=     4

train unique ts: 3601 | test unique ts: 775

- Jangan gunakan WEIGHT sebagai fitur (hanya sample_weight).
- FE time-series wajib computed per group dan SHIFT(1) (no future).
- En

0

# Implement Official Metric

In [4]:
# ============================================================
# STAGE 3 — Implement Official Metric (FULL REVISION v2, ROBUST)
# Assumes STAGE 1–2 already ran and created:
#   df_train, TARGET_COL, ID_COL, TIME_COL, WEIGHT_COL
# Outputs/Globals:
#   weighted_rmse_score, score_df, score_arrays
# ============================================================

import numpy as np
import pandas as pd

# ----------------------------
# 0) Require minimal globals
# ----------------------------
# Names this stage expects earlier stages to have defined at module level.
need = ["df_train", "TARGET_COL", "ID_COL", "TIME_COL", "WEIGHT_COL"]
for k in need:
    if k not in globals():
        # Message text (Indonesian): "Run STAGE 1–2 first."
        raise RuntimeError(f"Missing global '{k}'. Jalankan STAGE 1–2 dulu.")

# Whether train actually carries the weight column; the baselines below fall
# back to unit weights when it does not.
HAS_W = WEIGHT_COL in df_train.columns

# ----------------------------
# 1) Official metric (host)
# ----------------------------
def _clip01(x: float) -> float:
    """Clamp `x` into the closed interval [0, 1] and return it as a Python float."""
    floored = np.maximum(x, 0.0)
    return float(np.minimum(floored, 1.0))

def weighted_rmse_score(y_target, y_pred, w) -> float:
    """
    Competition metric (as implemented here):
      denom = sum(w * y^2)
      ratio = sum(w * (y - yhat)^2) / denom
      score = sqrt( 1 - clip01(ratio) )

    Inputs are converted to float64 arrays and must share one shape, otherwise
    ValueError is raised. Non-finite weights count as 0 and negative weights
    are raised to 0. Rows whose target or prediction is non-finite are dropped.
    Returns 0.0 when the denominator is non-finite or not positive.
    """
    truth = np.asarray(y_target, dtype=np.float64)
    preds = np.asarray(y_pred, dtype=np.float64)
    weights = np.asarray(w, dtype=np.float64)

    if not (truth.shape == preds.shape == weights.shape):
        raise ValueError(f"Shape mismatch: y={truth.shape}, yhat={preds.shape}, w={weights.shape}")

    # Sanitize weights: non-finite -> 0, then floor at 0.
    weights = np.maximum(np.where(np.isfinite(weights), weights, 0.0), 0.0)

    # Keep only rows where both target and prediction are finite.
    usable = np.isfinite(truth) & np.isfinite(preds)
    if not np.all(usable):
        truth, preds, weights = truth[usable], preds[usable], weights[usable]

    denom = np.sum(weights * (truth ** 2))
    if denom <= 0 or not np.isfinite(denom):
        return 0.0

    weighted_sq_err = np.sum(weights * ((truth - preds) ** 2))
    remaining = 1.0 - _clip01(weighted_sq_err / denom)
    # max() guards against a tiny negative value before the square root.
    return float(np.sqrt(max(remaining, 0.0)))

# ----------------------------
# 2) Convenience wrappers
# ----------------------------
def score_arrays(y_true: np.ndarray, y_pred: np.ndarray, w: np.ndarray | None = None) -> float:
    """Score array-like targets and predictions with `weighted_rmse_score`.

    All inputs are converted to float64 arrays. When `w` is None, every
    observation gets weight 1.
    """
    truth = np.asarray(y_true, dtype=np.float64)
    preds = np.asarray(y_pred, dtype=np.float64)
    weights = (
        np.ones_like(truth, dtype=np.float64)
        if w is None
        else np.asarray(w, dtype=np.float64)
    )
    return weighted_rmse_score(truth, preds, weights)

def score_df(df: pd.DataFrame, y_col: str, pred_col: str, w_col: str | None = None) -> float:
    """Score two columns of `df` with `weighted_rmse_score`.

    `y_col` holds the targets and `pred_col` the predictions. Weights are read
    from `w_col` when it is given and present in `df`; otherwise every row
    gets weight 1.
    """
    targets = df[y_col].to_numpy(np.float64)
    preds = df[pred_col].to_numpy(np.float64)
    has_weights = (w_col is not None) and (w_col in df.columns)
    if has_weights:
        weights = df[w_col].to_numpy(np.float64)
    else:
        weights = np.ones(len(df), dtype=np.float64)
    return weighted_rmse_score(targets, preds, weights)

# ----------------------------
# 3) Sanity checks / baselines
# ----------------------------
print("==================== STAGE 3: OFFICIAL METRIC (v2) ====================")

y = df_train[TARGET_COL].to_numpy(np.float64)

# Use the train weights when available, otherwise unit weights.
if HAS_W:
    w = df_train[WEIGHT_COL].to_numpy(np.float64)
else:
    w = np.ones_like(y, dtype=np.float64)

# Clean w: non-finite -> 0, negative -> 0 (the same rules the metric applies).
w = np.where(np.isfinite(w), w, 0.0)
w = np.maximum(w, 0.0)

# For baselines, keep only rows with a finite target.
m = np.isfinite(y)
y_m = y[m]
w_m = w[m]
if len(y_m) == 0:
    raise RuntimeError("All targets are NaN/Inf after masking.")

# Baseline A: predict 0
pred0 = np.zeros_like(y_m, dtype=np.float64)
s0 = weighted_rmse_score(y_m, pred0, w_m)

# Baseline B: predict weighted mean (best constant for weighted MSE).
# The 1e-18 term keeps the division defined when all weights are zero.
w_sum = float(np.sum(w_m))
c = float(np.sum(w_m * y_m) / (w_sum + 1e-18))
predc = np.full_like(y_m, c, dtype=np.float64)
sc = weighted_rmse_score(y_m, predc, w_m)

# Baseline C: predict weighted median (more robust than mean).
# Sort targets, accumulate their weights, and take the first target whose
# cumulative weight reaches half of the total; if the total weight is zero,
# fall back to the plain median.
order = np.argsort(y_m)
ys = y_m[order]
ws = w_m[order]
cum = np.cumsum(ws)
wm = ys[np.searchsorted(cum, 0.5 * cum[-1])] if cum[-1] > 0 else float(np.median(y_m))
predwm = np.full_like(y_m, float(wm), dtype=np.float64)
swm = weighted_rmse_score(y_m, predwm, w_m)

print(f"Using WEIGHT_COL: {WEIGHT_COL} | present: {HAS_W}")
print(f"Baseline (predict 0)                score = {s0:.6f}")
print(f"Baseline (predict weighted-mean {c:.6f}) score = {sc:.6f}")
print(f"Baseline (predict weighted-med  {float(wm):.6f}) score = {swm:.6f}")

# Diagnostics for the predict-0 baseline. sse0 is the same sum as denom
# (y - 0 == y), so ratio0 equals 1 whenever denom > 0.
denom = float(np.sum(w_m * (y_m ** 2)))
sse0  = float(np.sum(w_m * ((y_m - 0.0) ** 2)))
ratio0 = sse0 / denom if denom > 0 else np.nan
print("\nDiagnostics:")
print(f"denom sum(w*y^2) = {denom:.6e}")
print(f"ratio(predict0)  = {ratio0:.6f} (score -> ~0 if ratio~1)")

print("\nGlobals exported: weighted_rmse_score, score_arrays, score_df")

# Publish the metric helpers into the module namespace. They are already defined
# at top level, so this rebinds each name to the same function.
globals().update({
    "weighted_rmse_score": weighted_rmse_score,
    "score_arrays": score_arrays,
    "score_df": score_df,
})


Using WEIGHT_COL: weight | present: True
Baseline (predict 0)                score = 0.000000
Baseline (predict weighted-mean -0.000024) score = 0.011117
Baseline (predict weighted-med  0.000009) score = 0.000000

Diagnostics:
denom sum(w*y^2) = 4.082630e+08
ratio(predict0)  = 1.000000 (score -> ~0 if ratio~1)

Globals exported: weighted_rmse_score, score_arrays, score_df


# Time-based Validation Split

In [5]:
# ============================================================
# STAGE 4 — Time-based Validation Split (FULL REVISION v2, PURGED + TAIL)
# Assumes STAGE 1–3 already ran and created:
#   df_train, ID_COL, TIME_COL
#
# This stage:
# - Builds purged time CV on unique ts_index (tail-focused)
# - Adds df_train["fold"] where fold>=0 means validation membership, -1 means never validated
#
# Outputs/Globals:
#   df_train (with 'fold'), df_folds
#   fold_boundaries, fold_info, FOLD_CFG
#   CUT_TS, TAIL_TS
# ============================================================

import numpy as np
import pandas as pd
import gc

# ----------------------------
# 0) Require
# ----------------------------
need = ["df_train", "ID_COL", "TIME_COL"]
for k in need:
    if k not in globals():
        raise RuntimeError(f"Missing global '{k}'. Jalankan STAGE 1–3 dulu.")

# ----------------------------
# 1) Config (rank1-oriented defaults)
# ----------------------------
N_FOLDS = 5

# Use tail portion for validation placement (mimic test regime)
CV_START_FRAC = 0.70   # validate on last 30% ts points (lebih informatif dari 25%)

# Purge gap to avoid subtle leakage through smoothing/rolling
GAP = 2                # 1-2 biasanya bagus; 0 boleh, tapi lebih risk

# Minimal validation rows per fold (adaptive, warn-only)
MIN_VALID_ROWS = 50_000

# ----------------------------
# 2) Prepare unique ts timeline (robust if ts_index has gaps)
# ----------------------------
ts_unique = np.sort(df_train[TIME_COL].dropna().unique().astype(np.int64))
if len(ts_unique) < (N_FOLDS + 5):
    raise RuntimeError(f"Unique ts_index terlalu sedikit ({len(ts_unique)}) untuk N_FOLDS={N_FOLDS}.")

cut_idx = int(len(ts_unique) * CV_START_FRAC)
cut_idx = min(max(cut_idx, 0), len(ts_unique) - 1)
CUT_TS = int(ts_unique[cut_idx])

TAIL_TS = ts_unique[ts_unique >= CUT_TS]
if len(TAIL_TS) < N_FOLDS:
    raise RuntimeError(f"TAIL_TS terlalu kecil ({len(TAIL_TS)}) untuk N_FOLDS={N_FOLDS}. Turunkan N_FOLDS atau CV_START_FRAC.")

# split tail ts into contiguous segments (by order, not by value)
segments = np.array_split(TAIL_TS, N_FOLDS)

fold_boundaries = []
fold_info = {}
for k, seg in enumerate(segments):
    seg = np.array(seg, dtype=np.int64)
    if len(seg) == 0:
        continue
    vmin = int(seg.min())
    vmax = int(seg.max())
    train_max = int(vmin - GAP)  # purged

    fold_boundaries.append({
        "fold": int(k),
        "val_min_ts": vmin,
        "val_max_ts": vmax,
        "train_max_ts": train_max,
        "gap": int(GAP),
        "n_val_ts": int(len(seg)),
    })
    fold_info[int(k)] = (vmin, vmax, train_max)

fold_ids = sorted(fold_info.keys())

FOLD_CFG = dict(
    N_FOLDS=int(N_FOLDS),
    CV_START_FRAC=float(CV_START_FRAC),
    GAP=int(GAP),
    MIN_VALID_ROWS=int(MIN_VALID_ROWS),
    CUT_TS=int(CUT_TS),
)

print("==================== STAGE 4: PURGED TIME CV ====================")
print("Unique ts:", len(ts_unique), "| ts_min:", int(ts_unique.min()), "| ts_max:", int(ts_unique.max()))
print("CV_START_FRAC:", CV_START_FRAC, "| CUT_TS:", CUT_TS, "| tail unique ts:", len(TAIL_TS))
print("N_FOLDS:", N_FOLDS, "| GAP:", GAP)
print("\nFold boundaries:")
for b in fold_boundaries:
    print(f"  fold {b['fold']}: train <= {b['train_max_ts']} | val [{b['val_min_ts']}, {b['val_max_ts']}] | n_val_ts={b['n_val_ts']}")

# ----------------------------
# 3) Assign folds by ts_index membership in validation segments
# ----------------------------
# Row-level ts values as int64 so they compare exactly against the segment ts values.
ts_arr = df_train[TIME_COL].to_numpy(np.int64)

# ts -> fold lookup built from the explicit validation segments
# (segment position == fold id), so only ts values that are part of a
# validation segment are mapped; everything else ends up as -1.
ts_to_fold = {
    int(t): int(b["fold"])
    for b in fold_boundaries
    for t in segments[int(b["fold"])]
}

# Vectorized assignment: binary-search every row's ts in the sorted list of
# mapped ts values instead of doing one Python dict lookup per training row.
if ts_to_fold:
    _map_ts = np.fromiter(ts_to_fold.keys(), dtype=np.int64, count=len(ts_to_fold))
    _map_fold = np.fromiter(ts_to_fold.values(), dtype=np.int16, count=len(ts_to_fold))
    _srt = np.argsort(_map_ts, kind="stable")
    _map_ts, _map_fold = _map_ts[_srt], _map_fold[_srt]
    # Clamp the insertion position so rows beyond the last mapped ts stay in bounds;
    # the equality test below decides whether the row really belongs to a fold.
    _pos = np.minimum(np.searchsorted(_map_ts, ts_arr), len(_map_ts) - 1)
    fold_arr = np.where(_map_ts[_pos] == ts_arr, _map_fold[_pos], -1).astype(np.int16)
    del _map_ts, _map_fold, _srt, _pos
else:
    fold_arr = np.full(len(df_train), -1, dtype=np.int16)

df_train["fold"] = fold_arr
df_folds = df_train[[ID_COL, "fold"]].copy()

# ----------------------------
# 4) Diagnostics
# ----------------------------
vc = df_train["fold"].value_counts(dropna=False).sort_index()
print("\nFold row counts (fold=-1 means never validated):")
print(vc.to_dict())

for b in fold_boundaries:
    k = int(b["fold"])
    n_valid = int((df_train["fold"] == k).sum())
    if n_valid < MIN_VALID_ROWS:
        print(f"[WARN] fold {k} valid rows kecil: {n_valid} < {MIN_VALID_ROWS} (tetap lanjut)")

# Strict validity check: every fold's purged training cutoff must lie
# strictly before the first validation ts of that fold.
viol = [
    int(b["fold"])
    for b in fold_boundaries
    if int(b["train_max_ts"]) >= int(b["val_min_ts"])
]
if len(viol) > 0:
    raise RuntimeError(f"Invalid split: folds where train_max_ts >= val_min_ts: {viol}")

print("\nGlobals exported: df_train['fold'], df_folds, fold_boundaries, fold_info, FOLD_CFG, CUT_TS, TAIL_TS")

globals().update({
    "df_train": df_train,
    "df_folds": df_folds,
    "fold_boundaries": fold_boundaries,
    "fold_info": fold_info,
    "FOLD_CFG": FOLD_CFG,
    "CUT_TS": CUT_TS,
    "TAIL_TS": TAIL_TS,
})

gc.collect()


Unique ts: 3601 | ts_min: 1 | ts_max: 3601
CV_START_FRAC: 0.7 | CUT_TS: 2521 | tail unique ts: 1081
N_FOLDS: 5 | GAP: 2

Fold boundaries:
  fold 0: train <= 2519 | val [2521, 2737] | n_val_ts=217
  fold 1: train <= 2736 | val [2738, 2953] | n_val_ts=216
  fold 2: train <= 2952 | val [2954, 3169] | n_val_ts=216
  fold 3: train <= 3168 | val [3170, 3385] | n_val_ts=216
  fold 4: train <= 3384 | val [3386, 3601] | n_val_ts=216

Fold row counts (fold=-1 means never validated):
{-1: 3493999, 0: 379658, 1: 374318, 2: 361889, 3: 361991, 4: 365559}

Globals exported: df_train['fold'], df_folds, fold_boundaries, fold_info, FOLD_CFG, CUT_TS, TAIL_TS


0

# Feature Preparation & Weighting Strategy

In [6]:
# ============================================================
# STAGE 5 — Feature Engineering (REVISI FULL v5, RAM-SAFE + TS/LAG + CS subset)
# REQUIRE (from STAGE 1–4):
#   df_train, df_test, TARGET_COL, ID_COL, TIME_COL, WEIGHT_COL, SERIES_KEYS
#   df_train["fold"], fold_boundaries (optional, for tail focus)
#
# OUTPUT globals:
#   df_train, df_test (added engineered features)
#   TOPK, CS_FEATS
#   FEATURE_COLS_CAT_ALL, FEATURE_COLS_NUM_ALL, FEATURE_COLS_ALL
#   CAT_FEATURE_IDXS_ALL
#   make_sample_weight(), fit_median_imputer(), apply_median_imputer()
# ============================================================

import gc
import numpy as np
import pandas as pd

# ----------------------------
# 0) Require
# ----------------------------
need = ["df_train","df_test","TARGET_COL","ID_COL","TIME_COL","WEIGHT_COL","SERIES_KEYS"]
for k in need:
    if k not in globals():
        raise RuntimeError(f"Missing global '{k}'. Jalankan STAGE 1–4 dulu.")
if "fold" not in df_train.columns:
    raise RuntimeError("Missing df_train['fold']. Jalankan STAGE 4 dulu.")

# ----------------------------
# 1) Config (rank1-oriented tapi RAM-safe)
# ----------------------------
TOPK_NUM_FOR_FE   = 20          # naikkan ke 30 kalau RAM kuat
CS_TOPK           = 8           # hanya subset untuk CS (biar aman)
LAGS              = (1, 2, 3)    # kalau berat: (1,2)
ADD_DIFF1         = True
ADD_TIME_GAP      = True
ADD_AGE           = True
ADD_TS_BIN        = True
TS_BIN_SIZE       = 20

# Cross-sectional within SAME ts (no future): zscore + rank pct
ADD_CS_Z          = True
ADD_CS_RANK       = True        # kalau lambat: False
CS_BUCKET_CAND    = [TIME_COL, "horizon"]

# TOPK selection sampling
MAX_ROWS_FOR_CORR = 180_000
TAU_SELECT        = 450.0       # recency emphasis for selecting TOPK
RANDOM_STATE      = 42

# Weighting config (used later in make_sample_weight)
RECENCY_TAU_DEFAULT = 450.0
CLIP_W_Q_DEFAULT    = 0.9995

TRAIN_MAX_TS = int(df_train[TIME_COL].max())

# ----------------------------
# 2) Pick numeric features to expand (TOPK) using weighted abs-corr on tail-ish data
# ----------------------------
def _weighted_abs_corr(x: np.ndarray, y: np.ndarray, w: np.ndarray, eps: float = 1e-12) -> float:
    m_w = np.sum(w) + eps
    wx = np.sum(w * x) / m_w
    wy = np.sum(w * y) / m_w
    xc = x - wx
    yc = y - wy
    cov = np.sum(w * xc * yc) / m_w
    vx = np.sum(w * xc * xc) / m_w
    vy = np.sum(w * yc * yc) / m_w
    return float(abs(cov / (np.sqrt(vx * vy) + eps)))

DROP_ALWAYS = {ID_COL, TARGET_COL, WEIGHT_COL, "fold"}

# numeric candidates restricted to common numeric features (prefer feature_*)
num_candidates = [c for c in df_train.columns
                  if (c not in DROP_ALWAYS)
                  and pd.api.types.is_numeric_dtype(df_train[c])]

feat_candidates = [c for c in num_candidates if str(c).startswith("feature_")]
if len(feat_candidates) >= 10:
    num_candidates = feat_candidates

# Focus correlation selection on the validation tail (fold >= 0) when it has
# at least 50,000 rows; otherwise fall back to every training row.
# Only the columns needed for scoring are materialized, so the copy never
# includes the full set of df_train columns.
use_cols = list(dict.fromkeys([*num_candidates, TARGET_COL, WEIGHT_COL, TIME_COL]))
_tail_mask = (df_train["fold"] >= 0).to_numpy()
if int(_tail_mask.sum()) >= 50_000:
    df_sel = df_train.loc[_tail_mask, use_cols].copy()
else:
    df_sel = df_train[use_cols].copy()
del _tail_mask

# sample with recency
if len(df_sel) > MAX_ROWS_FOR_CORR:
    t = df_sel[TIME_COL].to_numpy(np.float64)
    p = np.exp(-(TRAIN_MAX_TS - t) / float(TAU_SELECT))
    p = p / (p.sum() + 1e-12)
    take = np.random.RandomState(RANDOM_STATE).choice(len(df_sel), size=MAX_ROWS_FOR_CORR, replace=False, p=p)
    df_sel = df_sel.iloc[take].copy()

y = df_sel[TARGET_COL].to_numpy(np.float64)
w = df_sel[WEIGHT_COL].to_numpy(np.float64)
w = np.where(np.isfinite(w), w, 0.0)
w = np.maximum(w, 0.0)
# recency multiply for TOPK selection
t = df_sel[TIME_COL].to_numpy(np.float64)
w = w * np.exp(-(TRAIN_MAX_TS - t) / float(TAU_SELECT))
if w.sum() <= 0:
    w = np.ones_like(y, dtype=np.float64)

# Score every candidate by weighted |corr| with the target. Non-finite values
# in a feature are replaced by that feature's median before scoring.
corr_scores = []
for c in num_candidates:
    x = df_sel[c].to_numpy(np.float64)
    if np.isnan(x).all():
        # Nothing to correlate (also avoids the all-NaN median warning).
        score = 0.0
    else:
        med = np.nanmedian(x)
        x = np.where(np.isfinite(x), x, med)
        score = _weighted_abs_corr(x, y, w)
    # A NaN/inf score would make the sort below order-dependent, because NaN
    # compares False against everything; rank such features last instead.
    if not np.isfinite(score):
        score = 0.0
    corr_scores.append((score, c))

corr_scores.sort(reverse=True, key=lambda kv: kv[0])
TOPK = [c for _, c in corr_scores[:min(TOPK_NUM_FOR_FE, len(corr_scores))]]
CS_FEATS = TOPK[:min(CS_TOPK, len(TOPK))]

print("TOPK selected:", TOPK[:12], "..." if len(TOPK) > 12 else "")
print("CS_FEATS subset:", CS_FEATS)

del df_sel, y, w, t
gc.collect()

# ----------------------------
# 3) Build engineered features on MINIMAL concatenation (RAM-safe)
# ----------------------------
tr_n = len(df_train)
te_n = len(df_test)

bucket = [c for c in CS_BUCKET_CAND if c in df_train.columns and c in df_test.columns]
# columns needed for FE
min_cols = list(dict.fromkeys([TIME_COL, *SERIES_KEYS, *bucket, *TOPK]))
min_cols = [c for c in min_cols if c in df_train.columns and c in df_test.columns]

min_train = df_train[min_cols].copy()
min_test  = df_test[min_cols].copy()

min_train["__is_train"] = 1
min_test["__is_train"]  = 0
min_train["__rid"] = np.arange(tr_n, dtype=np.int64)
min_test["__rid"]  = np.arange(te_n, dtype=np.int64)

min_all = pd.concat([min_train, min_test], axis=0, ignore_index=True)
del min_train, min_test
gc.collect()

# Optional time bin
if ADD_TS_BIN:
    min_all["fe_ts_bin"] = (min_all[TIME_COL].astype(np.int32) // int(TS_BIN_SIZE)).astype(np.int32)

# Sort for sequential FE (stable)
sort_cols = [c for c in SERIES_KEYS if c in min_all.columns] + [TIME_COL, "__is_train", "__rid"]
min_all = min_all.sort_values(sort_cols, kind="mergesort").reset_index(drop=True)

g = min_all.groupby([c for c in SERIES_KEYS if c in min_all.columns], sort=False)

# Time gap (dt)
if ADD_TIME_GAP:
    dt = g[TIME_COL].diff()
    min_all["fe_dt"] = dt.fillna(0).astype(np.float32)
    min_all["fe_dt_clip"] = np.clip(min_all["fe_dt"].to_numpy(np.float32), 0.0, 50.0).astype(np.float32)

# Age
if ADD_AGE:
    min_all["fe_age"] = (TRAIN_MAX_TS - min_all[TIME_COL].to_numpy(np.int32)).astype(np.int32)

# Lags + diff1 with safety: a lagged value is only kept when the shifted row's
# ts is strictly earlier than the current row's ts; otherwise it becomes NaN
# (this also covers the first rows of each series, where the shift is NaN).
cur_ts = min_all[TIME_COL].to_numpy(np.int64)
_cur_ts_f = cur_ts.astype(np.float64)

# The "strictly earlier" mask depends only on the lag, not on the feature,
# so compute it once per lag instead of once per (feature, lag) pair.
_needed_lags = list(dict.fromkeys(LAGS))
if ADD_DIFF1 and 1 not in _needed_lags:
    _needed_lags.append(1)  # diff1 needs lag 1 even when it is not an output lag
_lag_ok = {
    L: _cur_ts_f > g[TIME_COL].shift(L).to_numpy(np.float64)
    for L in _needed_lags
}

for f in TOPK:
    for L in LAGS:
        val_shift = g[f].shift(L).astype(np.float32).to_numpy()
        min_all[f"fe_{f}_lag{L}"] = np.where(_lag_ok[L], val_shift, np.nan).astype(np.float32)
    if ADD_DIFF1:
        cur = min_all[f].astype(np.float32).to_numpy()
        if 1 in LAGS:
            lag1 = min_all[f"fe_{f}_lag1"].to_numpy(np.float32)
        else:
            # Lag 1 is not an output column here; build it only for the difference.
            lag1 = np.where(
                _lag_ok[1],
                g[f].shift(1).astype(np.float32).to_numpy(),
                np.nan,
            ).astype(np.float32)
        min_all[f"fe_{f}_diff1"] = (cur - lag1).astype(np.float32)

del _lag_ok, _needed_lags, _cur_ts_f

# small missing-count feature (cheap & often helps)
min_all["fe_nan_cnt_topk"] = np.zeros(len(min_all), dtype=np.float32)
if len(TOPK) > 0:
    arr = min_all[TOPK].to_numpy()
    min_all["fe_nan_cnt_topk"] = np.isnan(arr).sum(axis=1).astype(np.float32)
    del arr
    gc.collect()

# Cross-sectional stats within same ts bucket (safe)
if ADD_CS_Z and len(bucket) > 0 and len(CS_FEATS) > 0:
    gb = min_all.groupby(bucket, sort=False)
    for f in CS_FEATS:
        mu = gb[f].transform("mean").astype(np.float32)
        sd = gb[f].transform("std").astype(np.float32)
        min_all[f"fe_{f}_z_{'_'.join(bucket)}"] = ((min_all[f].astype(np.float32) - mu) / (sd + 1e-6)).astype(np.float32)

if ADD_CS_RANK and len(bucket) > 0 and len(CS_FEATS) > 0:
    gb = min_all.groupby(bucket, sort=False)
    for f in CS_FEATS:
        min_all[f"fe_{f}_r_{'_'.join(bucket)}"] = gb[f].rank(pct=True).astype(np.float32)

# Split back by __rid WITHOUT reordering original df_train/df_test
eng_cols = [c for c in min_all.columns if c.startswith("fe_")]
tr_eng = min_all[min_all["__is_train"] == 1][["__rid"] + eng_cols].sort_values("__rid")
te_eng = min_all[min_all["__is_train"] == 0][["__rid"] + eng_cols].sort_values("__rid")

for c in eng_cols:
    df_train[c] = tr_eng[c].to_numpy()
    df_test[c]  = te_eng[c].to_numpy()

del min_all, tr_eng, te_eng
gc.collect()

# ----------------------------
# 4) Finalize feature lists for Stage 6/7
# ----------------------------
# categorical (base cats) + fe_ts_bin as categorical if enabled
FEATURE_COLS_CAT_ALL = [c for c in ["code","sub_code","sub_category","horizon"] if c in df_train.columns]
if ADD_TS_BIN and "fe_ts_bin" in df_train.columns:
    FEATURE_COLS_CAT_ALL = FEATURE_COLS_CAT_ALL + ["fe_ts_bin"]

# do-not-use
DO_NOT_USE = {ID_COL, TARGET_COL, WEIGHT_COL, "fold"}

# numeric = all numeric excluding do-not-use & excluding categorical
FEATURE_COLS_NUM_ALL = [c for c in df_train.columns
                        if (c not in DO_NOT_USE)
                        and (c not in FEATURE_COLS_CAT_ALL)
                        and pd.api.types.is_numeric_dtype(df_train[c])]

# (optional) do NOT include raw ts_index; we use engineered time instead
FEATURE_COLS_NUM_ALL = [c for c in FEATURE_COLS_NUM_ALL if c != TIME_COL]

FEATURE_COLS_ALL = FEATURE_COLS_CAT_ALL + FEATURE_COLS_NUM_ALL
CAT_FEATURE_IDXS_ALL = list(range(len(FEATURE_COLS_CAT_ALL)))

# ----------------------------
# 5) Weight strategy
# ----------------------------
def make_sample_weight(df: pd.DataFrame,
                       use_recency: bool = True,
                       tau: float = RECENCY_TAU_DEFAULT,
                       clip_w_quantile: float | None = CLIP_W_Q_DEFAULT,
                       eps: float = 1e-12) -> np.ndarray:
    """Build per-row training weights from ``df[WEIGHT_COL]``.

    Steps, in order:
      1. Non-finite weights become 0 and negative weights are floored at 0.
      2. If ``clip_w_quantile`` is not None, weights are capped at that
         quantile (only when the quantile is finite and positive).
      3. If ``use_recency`` is true, each weight is multiplied by
         ``exp(-(TRAIN_MAX_TS - ts) / tau)`` using ``df[TIME_COL]``.

    Returns a float64 array with one entry per row. When the resulting total
    weight is not greater than ``eps`` an all-ones array is returned instead.
    """
    raw = df[WEIGHT_COL].to_numpy(np.float64)
    weights = np.maximum(np.where(np.isfinite(raw), raw, 0.0), 0.0)

    if clip_w_quantile is not None:
        cap = float(np.nanquantile(weights, float(clip_w_quantile)))
        if np.isfinite(cap) and cap > 0:
            weights = np.minimum(weights, cap)

    if use_recency:
        ts = df[TIME_COL].to_numpy(np.float64)
        weights = weights * np.exp(-(TRAIN_MAX_TS - ts) / float(tau))

    if float(weights.sum()) <= eps:
        return np.ones(len(df), dtype=np.float64)
    return weights

# ----------------------------
# 6) Median imputer (optional)
# ----------------------------
def fit_median_imputer(df_fit: pd.DataFrame, num_cols: list[str]) -> dict:
    """Return a ``{column: fill value}`` mapping for ``num_cols``.

    The fill value is the column median computed on ``df_fit``. Columns whose
    median is missing from the result or is not finite get 0.0.
    """
    col_medians = df_fit[num_cols].median(numeric_only=True)
    fills = {}
    for col in num_cols:
        if col in col_medians.index and np.isfinite(col_medians[col]):
            fills[col] = float(col_medians[col])
        else:
            fills[col] = 0.0
    return fills

def apply_median_imputer(df_apply: pd.DataFrame, medians: dict, num_cols: list[str]) -> pd.DataFrame:
    """Return a copy of ``df_apply`` with missing values in ``num_cols`` filled.

    Each listed column that exists in the frame is filled with its entry in
    ``medians`` (0.0 when the column has no entry). Listed columns that are not
    in the frame are ignored; the input frame is not modified.
    """
    filled = df_apply.copy()
    for col in (c for c in num_cols if c in filled.columns):
        filled[col] = filled[col].fillna(medians.get(col, 0.0))
    return filled

print("\n==================== STAGE 5 SUMMARY (v5) ====================")
print("Train/Test shapes:", df_train.shape, df_test.shape)
print("SERIES_KEYS:", SERIES_KEYS)
print("TOPK:", len(TOPK), "| CS_FEATS:", len(CS_FEATS), "| bucket:", bucket)
print("CAT:", FEATURE_COLS_CAT_ALL)
print("NUM features:", len(FEATURE_COLS_NUM_ALL), "| TOTAL:", len(FEATURE_COLS_ALL))
print("Example engineered:", [c for c in df_train.columns if c.startswith("fe_")][:25])

globals().update({
    "df_train": df_train,
    "df_test": df_test,
    "TOPK": TOPK,
    "CS_FEATS": CS_FEATS,
    "FEATURE_COLS_CAT_ALL": FEATURE_COLS_CAT_ALL,
    "FEATURE_COLS_NUM_ALL": FEATURE_COLS_NUM_ALL,
    "FEATURE_COLS_ALL": FEATURE_COLS_ALL,
    "CAT_FEATURE_IDXS_ALL": CAT_FEATURE_IDXS_ALL,
    "make_sample_weight": make_sample_weight,
    "fit_median_imputer": fit_median_imputer,
    "apply_median_imputer": apply_median_imputer,
    "TRAIN_MAX_TS": TRAIN_MAX_TS,
})


TOPK selected: ['feature_u', 'feature_bn', 'feature_ca', 'feature_cd', 'feature_v', 'feature_am', 'feature_cb', 'feature_ap', 'feature_bo', 'feature_an', 'feature_bm', 'feature_cc'] ...
CS_FEATS subset: ['feature_u', 'feature_bn', 'feature_ca', 'feature_cd', 'feature_v', 'feature_am', 'feature_cb', 'feature_ap']


  g = min_all.groupby([c for c in SERIES_KEYS if c in min_all.columns], sort=False)
  gb = min_all.groupby(bucket, sort=False)
  gb = min_all.groupby(bucket, sort=False)
  min_all[f"fe_{f}_r_{'_'.join(bucket)}"] = gb[f].rank(pct=True).astype(np.float32)
  min_all[f"fe_{f}_r_{'_'.join(bucket)}"] = gb[f].rank(pct=True).astype(np.float32)
  min_all[f"fe_{f}_r_{'_'.join(bucket)}"] = gb[f].rank(pct=True).astype(np.float32)
  min_all[f"fe_{f}_r_{'_'.join(bucket)}"] = gb[f].rank(pct=True).astype(np.float32)
  min_all[f"fe_{f}_r_{'_'.join(bucket)}"] = gb[f].rank(pct=True).astype(np.float32)
  df_train[c] = tr_eng[c].to_numpy()
  df_train[c] = tr_eng[c].to_numpy()
  df_train[c] = tr_eng[c].to_numpy()
  df_train[c] = tr_eng[c].to_numpy()
  df_test[c]  = te_eng[c].to_numpy()
  df_train[c] = tr_eng[c].to_numpy()
  df_test[c]  = te_eng[c].to_numpy()
  df_train[c] = tr_eng[c].to_numpy()
  df_test[c]  = te_eng[c].to_numpy()
  df_train[c] = tr_eng[c].to_numpy()
  df_test[c]  = te_eng[c].to_numpy()
  d


Train/Test shapes: (5337414, 196) (1447107, 193)
SERIES_KEYS: ['code', 'sub_code', 'sub_category', 'horizon']
TOPK: 20 | CS_FEATS: 8 | bucket: ['ts_index', 'horizon']
CAT: ['code', 'sub_code', 'sub_category', 'horizon', 'fe_ts_bin']
NUM features: 186 | TOTAL: 191
Example engineered: ['fe_ts_bin', 'fe_dt', 'fe_dt_clip', 'fe_age', 'fe_feature_u_lag1', 'fe_feature_u_lag2', 'fe_feature_u_lag3', 'fe_feature_u_diff1', 'fe_feature_bn_lag1', 'fe_feature_bn_lag2', 'fe_feature_bn_lag3', 'fe_feature_bn_diff1', 'fe_feature_ca_lag1', 'fe_feature_ca_lag2', 'fe_feature_ca_lag3', 'fe_feature_ca_diff1', 'fe_feature_cd_lag1', 'fe_feature_cd_lag2', 'fe_feature_cd_lag3', 'fe_feature_cd_diff1', 'fe_feature_v_lag1', 'fe_feature_v_lag2', 'fe_feature_v_lag3', 'fe_feature_v_diff1', 'fe_feature_am_lag1']


# Model Training, OOF Evaluation, and Model Selection

In [None]:
# ============================================================
# STAGE 6 — Model Training, OOF Evaluation, and Model Selection (REVISI FULL v5, PURGED + RANK1-READY + RAM-SAFE)
# - FIX leakage: training fold-k hanya pakai ts_index <= train_max_ts(fold-k) (purged) + horizon mask
# - OOF predictions built per row (per horizon if MODE="per_horizon")
# - Select top models by OOF, then per-horizon nonneg blend + alpha calibration
# - Saves cfg + registry (no heavy model saving; Stage 7 akan final-fit)
#
# REQUIRE:
#   df_train, TARGET_COL, TIME_COL, ID_COL, WEIGHT_COL
#   FEATURE_COLS_ALL, FEATURE_COLS_CAT_ALL
#   fold_boundaries, weighted_rmse_score, make_sample_weight
#   df_train["fold"]
#
# OUTPUT globals:
#   oof_pred_blend, blend_weights_by_h, alpha_by_h, model_cfg_used
#   top_model_names, oof_store
# ============================================================

import gc, json, time, warnings
from pathlib import Path
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

need = ["df_train","TARGET_COL","TIME_COL","ID_COL","WEIGHT_COL","FEATURE_COLS_ALL","FEATURE_COLS_CAT_ALL",
        "fold_boundaries","weighted_rmse_score","make_sample_weight"]
for k in need:
    if k not in globals():
        raise RuntimeError(f"Missing global: {k}. Jalankan stage sebelumnya.")
if "fold" not in df_train.columns:
    raise RuntimeError("Missing df_train['fold']. Jalankan STAGE 4 dulu.")

# ----------------------------
# 0) Config (moderate -> aman RAM; bisa dinaikkan setelah stabil)
# ----------------------------
MODE = "per_horizon"  # "per_horizon" recommended

# sampling
TRAIN_SAMPLE_CAP = 450_000
SAMPLE_WEIGHTED  = True

# sample weights (official * recency); untuk fold-k recency anchor di train_max_ts fold-k
USE_RECENCY = True
TAU = 450.0
CLIP_W_Q = 0.9995

# model families (mulai tidak terlalu banyak)
USE_CATBOOST = True
USE_LGBM = True

# CatBoost (2 configs x 2 seeds) -> 4 model per fold/horizon
CB_SEEDS = [42, 62]
CB_PARAM_LIST = [
    dict(iterations=5000, learning_rate=0.03, depth=10, l2_leaf_reg=8.0,  random_strength=1.1, rsm=0.9,
         min_data_in_leaf=80, bootstrap_type="Bernoulli", subsample=0.8),
    dict(iterations=3500, learning_rate=0.04, depth=8,  l2_leaf_reg=6.0,  random_strength=1.0, rsm=0.9,
         min_data_in_leaf=120, bootstrap_type="Bernoulli", subsample=0.85),
]
EARLY_STOPPING_ROUNDS = 250

# LightGBM (2 seeds) -> 2 model per fold/horizon
LGB_SEEDS = [41, 51]
LGB_PARAMS = dict(
    objective="regression",
    metric="rmse",
    learning_rate=0.03,
    num_leaves=192,
    min_data_in_leaf=180,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    lambda_l2=8.0,
    max_bin=255,
    verbose=-1,
)

# target transform (asinh stabil untuk heavy-tail & bisa negatif)
USE_TARGET_ASINH = True
ASINH_CLIP_PRED = 8.0  # clamp pred in transformed space (stability)

# Blend selection
TOP_MODELS_KEEP = 6   # pilih top-N model (across all names) untuk blending
NONNEG_BLEND = True

OUT_DIR = Path("/kaggle/working/tsf_stage6_models_v5")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ----------------------------
# 1) Feature columns for model (per_horizon exclude horizon from inputs)
# ----------------------------
if MODE == "per_horizon" and "horizon" in df_train.columns and ("horizon" in FEATURE_COLS_CAT_ALL):
    CAT_COLS_MODEL = [c for c in FEATURE_COLS_CAT_ALL if c != "horizon"]
else:
    CAT_COLS_MODEL = list(FEATURE_COLS_CAT_ALL)

# Ensure categorical dtypes for the model's cat columns (once, in-place).
# Columns that are already categorical or object-typed are left as they are.
# The dtype is checked with isinstance(..., pd.CategoricalDtype) because
# pd.api.types.is_categorical_dtype is deprecated in recent pandas releases.
for c in CAT_COLS_MODEL:
    if c not in df_train.columns:
        continue
    _col_dtype = df_train[c].dtype
    if isinstance(_col_dtype, pd.CategoricalDtype) or pd.api.types.is_object_dtype(_col_dtype):
        continue
    df_train[c] = df_train[c].astype("category")

NUM_COLS_MODEL = [c for c in FEATURE_COLS_ALL if c not in FEATURE_COLS_CAT_ALL]
FEATURE_COLS_MODEL = CAT_COLS_MODEL + NUM_COLS_MODEL
CAT_FEATURE_IDXS = [i for i, c in enumerate(FEATURE_COLS_MODEL) if c in CAT_COLS_MODEL]

print("MODE:", MODE)
print("Total features:", len(FEATURE_COLS_MODEL), "| cat idx:", len(CAT_FEATURE_IDXS))

# ----------------------------
# 2) Fold boundaries map (PURGED)
# ----------------------------
fold_map = {int(b["fold"]): b for b in fold_boundaries}
fold_ids = sorted(fold_map.keys())

# horizons list
if MODE == "per_horizon" and "horizon" in df_train.columns:
    horizons = sorted(df_train["horizon"].astype(str).unique().tolist())
else:
    horizons = ["__all__"]

print("fold_ids:", fold_ids)
print("horizons:", horizons[:10], "..." if len(horizons) > 10 else "")

# ----------------------------
# 3) Helpers
# ----------------------------
def _new_oof(n: int) -> np.ndarray:
    return np.full(n, np.nan, dtype=np.float32)

def _clean_weight(w: np.ndarray) -> np.ndarray:
    w = np.asarray(w, dtype=np.float64)
    w = np.where(np.isfinite(w), w, 0.0)
    w = np.maximum(w, 0.0)
    return w

def _sample_df(df: pd.DataFrame, cap: int, p: np.ndarray, seed: int) -> pd.DataFrame:
    if cap is None or cap <= 0 or len(df) <= cap:
        return df
    rs = np.random.RandomState(seed)
    p = np.asarray(p, dtype=np.float64)
    if (not np.isfinite(p).all()) or p.sum() <= 0:
        idx = rs.choice(len(df), size=cap, replace=False)
        return df.iloc[idx]
    p = p / (p.sum() + 1e-12)
    idx = rs.choice(len(df), size=cap, replace=False, p=p)
    return df.iloc[idx]

def _fold_sample_weight(df: pd.DataFrame, anchor_ts: int, use_recency: bool, tau: float, clip_w_q: float | None) -> np.ndarray:
    """Build per-row weights for one fold, with recency anchored at ``anchor_ts``.

    Starts from ``df[WEIGHT_COL]`` (all ones when that column is absent),
    cleaned by ``_clean_weight``. If ``clip_w_q`` is not None the weights are
    capped at that quantile (only when it is finite and positive). If
    ``use_recency`` is true each weight is multiplied by
    ``exp(-(anchor_ts - ts) / tau)`` from ``df[TIME_COL]``, with non-finite
    decay factors replaced by 0. An all-ones array is returned when the total
    weight is at most 1e-12.
    """
    n_rows = len(df)
    if WEIGHT_COL in df.columns:
        base = df[WEIGHT_COL].to_numpy(np.float64)
    else:
        base = np.ones(n_rows, dtype=np.float64)
    weights = _clean_weight(base)

    if clip_w_q is not None:
        cap = float(np.nanquantile(weights, float(clip_w_q)))
        if np.isfinite(cap) and cap > 0:
            weights = np.minimum(weights, cap)

    if use_recency:
        ts = df[TIME_COL].to_numpy(np.float64)
        decay = np.exp(-(float(anchor_ts) - ts) / float(tau))
        weights = weights * np.where(np.isfinite(decay), decay, 0.0)

    if weights.sum() <= 1e-12:
        return np.ones(n_rows, dtype=np.float64)
    return weights

def fit_alpha(y: np.ndarray, pred: np.ndarray, w: np.ndarray, clip=(0.0, 3.0)) -> float:
    """Return the scalar ``a`` minimizing the weighted squared error of ``a * pred`` vs ``y``.

    Rows where ``y`` or ``pred`` is not finite are ignored; weights are passed
    through ``_clean_weight``. The closed-form ratio
    ``sum(w*y*pred) / (sum(w*pred^2) + 1e-12)`` is clipped to
    ``[clip[0], clip[1]]``. Returns 1.0 when no row is usable.
    """
    y_arr = np.asarray(y, np.float64)
    p_arr = np.asarray(pred, np.float64)
    w_arr = _clean_weight(w)

    valid = np.isfinite(y_arr) & np.isfinite(p_arr)
    if not valid.any():
        return 1.0

    y_v, p_v, w_v = y_arr[valid], p_arr[valid], w_arr[valid]
    numerator = float(np.sum(w_v * y_v * p_v))
    denominator = float(np.sum(w_v * p_v * p_v) + 1e-12)
    lower, upper = clip[0], clip[1]
    return float(np.clip(numerator / denominator, lower, upper))

def fit_blend_weights(pred_mat: np.ndarray, y: np.ndarray, w: np.ndarray, nonneg=True) -> np.ndarray:
    """Fit blend coefficients for the columns of ``pred_mat`` by weighted least squares.

    Rows with a non-finite target or any non-finite prediction are dropped.
    The remaining rows are scaled by the square root of their cleaned weight
    and solved with ``np.linalg.lstsq``. With ``nonneg`` the coefficients are
    clipped at 0 (a clip of the unconstrained solution, not a constrained
    solve). The result is normalized to sum to 1; equal weights are returned
    when no row is usable or the coefficient sum is not a positive finite
    number.
    """
    target = np.asarray(y, np.float64)
    preds = np.asarray(pred_mat, np.float64)
    weights = _clean_weight(w)

    def _equal_weights() -> np.ndarray:
        # Fallback: every model gets the same share.
        n_models = preds.shape[1]
        return np.ones(n_models, dtype=np.float64) / n_models

    keep = np.isfinite(target) & np.isfinite(preds).all(axis=1)
    if not keep.any():
        return _equal_weights()

    # sqrt(w) row scaling turns the weighted problem into an ordinary one.
    root_w = np.sqrt(np.maximum(weights[keep], 0.0))
    design = preds[keep] * root_w[:, None]
    rhs = target[keep] * root_w

    coef = np.linalg.lstsq(design, rhs, rcond=None)[0].astype(np.float64)
    if nonneg:
        coef = np.clip(coef, 0.0, None)

    total = coef.sum()
    if np.isfinite(total) and total > 0:
        return coef / total
    return _equal_weights()

# ----------------------------
# 4) Imports for models
# ----------------------------
# CatBoost is imported unconditionally: a failure here aborts the cell.
from catboost import CatBoostRegressor, Pool

# LightGBM is optional. If the import fails for any reason, remember that in
# _HAS_LGB and force USE_LGBM off so the LightGBM branch of the train loop is
# skipped instead of crashing.
try:
    import lightgbm as lgb
    _HAS_LGB = True
except Exception:
    _HAS_LGB = False
    USE_LGBM = False
    print("[WARN] lightgbm not available; skipping LGBM.")

# ----------------------------
# 5) Train loop (PURGED per fold)
# ----------------------------
# zlib.crc32 gives a hash of the horizon key that is identical in every Python
# process. The builtin hash() of a str is randomised per process, which made
# the sub-sampling seed (and therefore the sampled training rows) change from
# one kernel session to the next.
import zlib

# Rebind df_train to a copy, then make sure the fold column is integer typed.
df_train = df_train.copy()
df_train["fold"] = df_train["fold"].astype(int)

N = len(df_train)
oof_store = {}          # name -> oof pred vector (len N)
model_registry = []     # list of dicts with fold/horizon score

# Time index as int64, extracted once: it does not change inside the loops.
_ts_all = df_train[TIME_COL].to_numpy(np.int64)

t0 = time.time()

for h_key in horizons:
    # Row selector for this horizon ("__all__" means a single model over every row).
    if h_key == "__all__":
        mask_h = np.ones(N, dtype=bool)
    else:
        mask_h = (df_train["horizon"].astype(str).to_numpy() == str(h_key))

    n_h = int(mask_h.sum())
    if n_h == 0:
        continue

    # Deterministic per-horizon offset for the sub-sampling seed.
    _h_seed = 0 if h_key == "__all__" else (zlib.crc32(str(h_key).encode("utf-8")) % 1000)

    print("\n" + "="*90)
    print("HORIZON:", h_key, "| rows:", n_h)

    for k in fold_ids:
        b = fold_map[int(k)]
        train_max_ts = int(b["train_max_ts"])
        vmin = int(b["val_min_ts"])
        vmax = int(b["val_max_ts"])

        # VALID strictly inside [vmin,vmax]
        va_mask = mask_h & (_ts_all >= vmin) & (_ts_all <= vmax)
        # TRAIN strictly <= train_max_ts (purged)
        tr_mask = mask_h & (_ts_all <= train_max_ts)

        n_va = int(va_mask.sum())
        n_tr = int(tr_mask.sum())
        if n_va == 0 or n_tr == 0:
            continue

        df_tr = df_train.loc[tr_mask]
        df_va = df_train.loc[va_mask]

        # weights for sampling/training anchored at train_max_ts
        w_tr_full = _fold_sample_weight(df_tr, anchor_ts=train_max_ts, use_recency=USE_RECENCY, tau=TAU, clip_w_q=CLIP_W_Q)

        # optional sampling (RAM/time); weights are recomputed on the sampled rows
        if TRAIN_SAMPLE_CAP is not None and TRAIN_SAMPLE_CAP > 0 and len(df_tr) > TRAIN_SAMPLE_CAP:
            df_tr = _sample_df(df_tr, cap=TRAIN_SAMPLE_CAP, p=w_tr_full, seed=10000 + 31*k + _h_seed)
            w_tr_full = _fold_sample_weight(df_tr, anchor_ts=train_max_ts, use_recency=USE_RECENCY, tau=TAU, clip_w_q=CLIP_W_Q)

        # build matrices
        X_tr = df_tr[FEATURE_COLS_MODEL]
        X_va = df_va[FEATURE_COLS_MODEL]
        y_tr_raw = df_tr[TARGET_COL].to_numpy(np.float64)
        y_va_raw = df_va[TARGET_COL].to_numpy(np.float64)

        # eval weights should be official (NO recency) to match metric
        w_va_eval = df_va[WEIGHT_COL].to_numpy(np.float64) if (WEIGHT_COL in df_va.columns) else np.ones(len(df_va), np.float64)
        w_va_eval = _clean_weight(w_va_eval)

        # target transform (fit scale only on TRAIN fold subset)
        if USE_TARGET_ASINH:
            s = float(np.nanmedian(np.abs(y_tr_raw)))
            if not np.isfinite(s) or s <= 1e-6:
                s = 1.0
            y_tr = np.arcsinh(y_tr_raw / s).astype(np.float32)
            y_va = np.arcsinh(y_va_raw / s).astype(np.float32)
            # inverse transform; predictions are clipped before sinh to avoid overflow
            inv = lambda z: (np.sinh(np.clip(z, -ASINH_CLIP_PRED, ASINH_CLIP_PRED)) * s).astype(np.float64)
        else:
            y_tr = y_tr_raw.astype(np.float32)
            y_va = y_va_raw.astype(np.float32)
            inv = lambda z: np.asarray(z, np.float64)

        w_tr = _clean_weight(w_tr_full)

        print(f"  fold {k}: train(ts<= {train_max_ts})={len(df_tr):,} | valid[{vmin},{vmax}]={len(df_va):,}")

        # --- CatBoost
        if USE_CATBOOST:
            # Pools are built once per fold and shared by every param set / seed.
            train_pool = Pool(X_tr, label=y_tr, weight=w_tr, cat_features=CAT_FEATURE_IDXS)
            valid_pool = Pool(X_va, label=y_va, cat_features=CAT_FEATURE_IDXS)

            for pi, pbase in enumerate(CB_PARAM_LIST):
                for sd in CB_SEEDS:
                    name = f"cb_p{pi}_s{sd}"
                    if name not in oof_store:
                        oof_store[name] = _new_oof(N)

                    params = dict(
                        loss_function="RMSE",
                        eval_metric="RMSE",
                        task_type="CPU",
                        thread_count=-1,
                        random_seed=int(sd + 1000*k),
                        allow_writing_files=False,
                        od_type="Iter",
                        **pbase
                    )

                    model = CatBoostRegressor(**params)
                    model.fit(
                        train_pool,
                        eval_set=valid_pool,
                        use_best_model=True,
                        verbose=False,
                        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                    )

                    # predictions come back in the (possibly transformed) target space
                    pred_va_t = model.predict(valid_pool).astype(np.float64)
                    pred_va = inv(pred_va_t)

                    # store OOF
                    oof_store[name][va_mask] = pred_va.astype(np.float32)

                    sc = weighted_rmse_score(y_va_raw.astype(np.float64), pred_va.astype(np.float64), w_va_eval.astype(np.float64))
                    model_registry.append(dict(family="catboost", name=name, horizon=h_key, fold=int(k), score=float(sc)))

        # --- LightGBM
        if USE_LGBM and _HAS_LGB:
            # LightGBM gets its own copies so the categorical cast does not touch X_tr / X_va.
            X_tr_l = X_tr.copy()
            X_va_l = X_va.copy()
            for c in CAT_COLS_MODEL:
                if c in X_tr_l.columns:
                    X_tr_l[c] = X_tr_l[c].astype("category")
                    X_va_l[c] = X_va_l[c].astype("category")

            # LightGBM trains on transformed y if enabled
            y_tr_l = y_tr.astype(np.float32)
            y_va_l = y_va.astype(np.float32)

            dtrain = lgb.Dataset(X_tr_l, label=y_tr_l, weight=w_tr, categorical_feature=CAT_COLS_MODEL, free_raw_data=True)
            dvalid = lgb.Dataset(X_va_l, label=y_va_l, weight=None, categorical_feature=CAT_COLS_MODEL, free_raw_data=True)

            for sd in LGB_SEEDS:
                name = f"lgb_s{sd}"
                if name not in oof_store:
                    oof_store[name] = _new_oof(N)

                params = dict(LGB_PARAMS)
                params["seed"] = int(sd + 1000*k)

                model = lgb.train(
                    params,
                    dtrain,
                    num_boost_round=12000,
                    valid_sets=[dvalid],
                    valid_names=["valid"],
                    callbacks=[lgb.early_stopping(350, verbose=False), lgb.log_evaluation(0)],
                )

                pred_va_t = model.predict(X_va_l, num_iteration=model.best_iteration).astype(np.float64)
                pred_va = inv(pred_va_t)

                oof_store[name][va_mask] = pred_va.astype(np.float32)

                sc = weighted_rmse_score(y_va_raw.astype(np.float64), pred_va.astype(np.float64), w_va_eval.astype(np.float64))
                model_registry.append(dict(family="lgbm", name=name, horizon=h_key, fold=int(k), score=float(sc)))

        gc.collect()

print("\nTraining done in", round(time.time() - t0, 1), "sec")
print("OOF variants:", len(oof_store))

# ----------------------------
# 6) Choose top models by global OOF (across all horizons)
# ----------------------------
# Full-length target and official-weight vectors (length N, same row order as
# the OOF vectors in oof_store).
y_all = df_train[TARGET_COL].to_numpy(np.float64)
w_all = df_train[WEIGHT_COL].to_numpy(np.float64) if (WEIGHT_COL in df_train.columns) else np.ones(N, np.float64)
w_all = _clean_weight(w_all)

# Score every OOF variant on the rows where both its prediction and the target
# are finite; variants without any such row are dropped.
oof_scores_global = []
for name, pred in oof_store.items():
    p = pred.astype(np.float64)
    m = np.isfinite(p) & np.isfinite(y_all)
    if not np.any(m):
        continue
    sc = weighted_rmse_score(y_all[m], p[m], w_all[m])
    oof_scores_global.append((float(sc), name))

# Sorted descending, so the LARGEST scores are kept.
# NOTE(review): this assumes weighted_rmse_score is higher-is-better — confirm
# against its definition earlier in the notebook.
oof_scores_global.sort(reverse=True, key=lambda x: x[0])
top_model_names = [n for _, n in oof_scores_global[:min(TOP_MODELS_KEEP, len(oof_scores_global))]]

print("\nTop models kept for blending:")
for s, n in oof_scores_global[:min(10, len(oof_scores_global))]:
    tag = " <KEEP>" if n in top_model_names else ""
    print(f"  {n:10s}  score={s:.6f}{tag}")

# Nothing scorable means blending below cannot work; stop here.
if len(top_model_names) == 0:
    raise RuntimeError("No valid OOF models found (all NaN).")

# ----------------------------
# 7) Per-horizon blend + alpha
# ----------------------------
# Per-horizon outputs: blend weights (model name -> weight), the scalar
# calibration factor alpha, and the blended OOF prediction for every row.
blend_weights_by_h = {}
alpha_by_h = {}
oof_pred_blend = np.full(N, np.nan, dtype=np.float32)

# pre-stack top preds for speed
P_top = np.stack([oof_store[n] for n in top_model_names], axis=1).astype(np.float64)  # (N, M)

for h_key in horizons:
    # Row selector and dictionary key for this horizon.
    if h_key == "__all__":
        idx = np.ones(N, dtype=bool)
        hk = "__all__"
    else:
        idx = (df_train["horizon"].astype(str).to_numpy() == str(h_key))
        hk = str(h_key)

    y = y_all[idx]
    w = w_all[idx]
    P = P_top[idx]

    # fit_blend_weights ignores rows with non-finite targets/predictions itself.
    b = fit_blend_weights(P, y, w, nonneg=NONNEG_BLEND)
    blend_weights_by_h[hk] = {top_model_names[i]: float(b[i]) for i in range(len(top_model_names))}

    # Blended prediction; stays non-finite wherever any input column is non-finite.
    pred_bl = (P @ b).astype(np.float64)

    # Scalar rescaling of the blend, clipped to [0, 3].
    a = fit_alpha(y, pred_bl, w, clip=(0.0, 3.0))
    alpha_by_h[hk] = float(a)

    oof_pred_blend[idx] = (pred_bl * a).astype(np.float32)

# global CV (only rows with a finite blended prediction and a finite target)
m_ok = np.isfinite(oof_pred_blend) & np.isfinite(y_all)
cv_score = weighted_rmse_score(y_all[m_ok], oof_pred_blend[m_ok].astype(np.float64), w_all[m_ok])

print("\nCV score (OOF blend):", float(cv_score))
print("alpha_by_h:", alpha_by_h)

# ----------------------------
# 8) Save cfg
# ----------------------------
# Configuration actually used by this stage, stored for the final-fit stage and
# for reproducibility.
# CLIP_W_Q and TRAIN_SAMPLE_CAP may legitimately be None (the weighting helper
# and the train loop both treat None as "disabled"), so None is passed through
# instead of being cast, which would raise TypeError after training finished.
model_cfg_used = dict(
    MODE=MODE,
    USE_RECENCY=USE_RECENCY, TAU=float(TAU),
    CLIP_W_Q=(None if CLIP_W_Q is None else float(CLIP_W_Q)),
    TRAIN_SAMPLE_CAP=(None if TRAIN_SAMPLE_CAP is None else int(TRAIN_SAMPLE_CAP)),
    SAMPLE_WEIGHTED=bool(SAMPLE_WEIGHTED),
    USE_CATBOOST=bool(USE_CATBOOST), USE_LGBM=bool(USE_LGBM and _HAS_LGB),
    CB_SEEDS=CB_SEEDS, LGB_SEEDS=LGB_SEEDS,
    CB_PARAM_LIST=CB_PARAM_LIST, LGB_PARAMS=LGB_PARAMS,
    USE_TARGET_ASINH=bool(USE_TARGET_ASINH), ASINH_CLIP_PRED=float(ASINH_CLIP_PRED),
    FEATURE_COLS_MODEL=FEATURE_COLS_MODEL,
    CAT_COLS_MODEL=CAT_COLS_MODEL,
    CAT_FEATURE_IDXS=CAT_FEATURE_IDXS,
    fold_boundaries=fold_boundaries,
    top_model_names=top_model_names,
    blend_weights_by_h=blend_weights_by_h,
    alpha_by_h=alpha_by_h,
    cv_score=float(cv_score),
)

# Persist the config, the per-fold model scores and the (at most 200 best)
# global OOF scores as JSON next to the other stage outputs.
(OUT_DIR / "stage6_cfg.json").write_text(json.dumps(model_cfg_used, indent=2))
(OUT_DIR / "stage6_model_registry.json").write_text(json.dumps(model_registry, indent=2))
(OUT_DIR / "stage6_oof_scores_global.json").write_text(json.dumps(oof_scores_global[:200], indent=2))
print("Saved:", str(OUT_DIR / "stage6_cfg.json"))

# export globals
# Publish the stage results under fixed global names for the following cells.
globals().update({
    "oof_pred_blend": oof_pred_blend,
    "blend_weights_by_h": blend_weights_by_h,
    "alpha_by_h": alpha_by_h,
    "model_cfg_used": model_cfg_used,
    "FEATURE_COLS_MODEL": FEATURE_COLS_MODEL,
    "CAT_FEATURE_IDXS": CAT_FEATURE_IDXS,
    "CAT_COLS_MODEL": CAT_COLS_MODEL,
    "top_model_names": top_model_names,
    "oof_store": oof_store,
    "OUT_DIR_STAGE6": str(OUT_DIR),
})

# Free whatever became unreachable during this stage.
gc.collect()


MODE: per_horizon
Total features: 190 | cat idx: 4
fold_ids: [0, 1, 2, 3, 4]
horizons: ['1', '10', '25', '3'] 

HORIZON: 1 | rows: 1394653
  fold 0: train(ts<= 2519)=450,000 | valid[2521,2737]=99,142


# Final Fit, Test Inference, and Submission Packaging

In [None]:
# ============================================================
# STAGE 7 — Final Fit, Test Inference, and Submission Packaging (FULL REVISION v5)
# - CONSISTENT with STAGE 6 v5:
#     * Purged-CV used asinh target transform -> apply same transform in final fit
#     * Blend uses TOP model names (top_model_names) not family averages
# - RAM-safe:
#     * trains models ONE-BY-ONE, predicts, discards model (no big stacks)
#     * uses float32 matrices where possible
# - Per-horizon final fit (recommended), with robust fallback if horizon missing in test
# - Saves lightweight artifacts (cfg + selected model list + submission)
#
# REQUIRE:
#   df_train, df_test, TARGET_COL, ID_COL, TIME_COL, WEIGHT_COL
#   FEATURE_COLS_MODEL, CAT_FEATURE_IDXS, CAT_COLS_MODEL
#   (sample weights are rebuilt in this cell by _make_final_weight from the USE_RECENCY / TAU / CLIP_W_Q settings in CFG)
#   model_cfg_used (from Stage 6 v5)
#
# OUTPUT:
#   /kaggle/working/submission.csv
#   /kaggle/working/tsf_stage7_bundle_v5/bundle.json
# ============================================================

import gc, json, time, warnings
from pathlib import Path
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

# Globals that earlier stages must have created; fail fast with a clear
# message if any of them is missing.
need = ["df_train","df_test","TARGET_COL","ID_COL","TIME_COL","WEIGHT_COL",
        "FEATURE_COLS_MODEL","CAT_FEATURE_IDXS","CAT_COLS_MODEL","model_cfg_used"]
for k in need:
    if k not in globals():
        raise RuntimeError(f"Missing global: {k}. Jalankan stage sebelumnya.")

# CatBoost is required; LightGBM is optional and only flagged as available
# when its import succeeds.
from catboost import CatBoostRegressor, Pool
try:
    import lightgbm as lgb
    _HAS_LGB = True
except Exception:
    _HAS_LGB = False

# Everything below is read from the config produced by the CV stage so that the
# final fit uses the same settings.
CFG = model_cfg_used
MODE = CFG.get("MODE", "per_horizon")

# blend maps produced by Stage 6 v5
blend_weights_by_h = CFG["blend_weights_by_h"]
alpha_by_h = CFG["alpha_by_h"]
top_model_names = CFG.get("top_model_names", None)
if not top_model_names:
    raise RuntimeError("CFG missing top_model_names. Pastikan pakai STAGE 6 v5 (yang menyimpan top_model_names).")

# Model families: LightGBM is only enabled when it is both configured and importable.
USE_CATBOOST = bool(CFG.get("USE_CATBOOST", True))
USE_LGBM = bool(CFG.get("USE_LGBM", False)) and _HAS_LGB

CB_SEEDS = CFG.get("CB_SEEDS", [42, 62])
LGB_SEEDS = CFG.get("LGB_SEEDS", [41, 51])
CB_PARAM_LIST = CFG.get("CB_PARAM_LIST", [])
LGB_PARAMS = CFG.get("LGB_PARAMS", {})

# Sample-weight settings. CLIP_W_Q may be stored as None ("no clipping");
# _make_final_weight checks for None, so it must not be forced through float().
USE_RECENCY = bool(CFG.get("USE_RECENCY", True))
TAU = float(CFG.get("TAU", 450.0))
_clip_w_q_cfg = CFG.get("CLIP_W_Q", 0.9995)
CLIP_W_Q = None if _clip_w_q_cfg is None else float(_clip_w_q_cfg)

# Target transform settings.
USE_TARGET_ASINH = bool(CFG.get("USE_TARGET_ASINH", True))
ASINH_CLIP_PRED = float(CFG.get("ASINH_CLIP_PRED", 8.0))

# training anchor for recency: use TRAIN_MAX_TS (final training sees all past)
TRAIN_MAX_TS = int(df_train[TIME_COL].max())

# Output locations (created if missing).
OUT_DIR = Path("/kaggle/working/tsf_stage7_bundle_v5")
MODEL_DIR = OUT_DIR / "final_models"
OUT_DIR.mkdir(parents=True, exist_ok=True)
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# ----------------------------
# Helpers
# ----------------------------
def _clean_weight(w: np.ndarray) -> np.ndarray:
    """Return a sanitised float64 copy of a weight vector.

    Non-finite entries and negative entries become 0. If the cleaned weights
    sum to (almost) nothing, an all-ones array of the same shape is returned
    instead.
    """
    arr = np.asarray(w, dtype=np.float64)
    finite_only = np.where(np.isfinite(arr), arr, 0.0)
    cleaned = np.maximum(finite_only, 0.0)
    if cleaned.sum() <= 1e-12:
        return np.ones_like(cleaned, dtype=np.float64)
    return cleaned

def _make_final_weight(df: pd.DataFrame) -> np.ndarray:
    """Build the per-row weights used for the final fit.

    Takes the ``WEIGHT_COL`` column (all ones when absent), sanitises it,
    optionally caps it at its ``CLIP_W_Q`` quantile, optionally multiplies by
    ``exp(-(TRAIN_MAX_TS - t) / TAU)`` with ``t`` taken from ``TIME_COL``, and
    sanitises the result once more.
    """
    if WEIGHT_COL in df.columns:
        weights = df[WEIGHT_COL].to_numpy(np.float64)
    else:
        weights = np.ones(len(df), np.float64)
    weights = _clean_weight(weights)

    # Cap very large weights for stability (skipped when the cap is unusable).
    if CLIP_W_Q is not None:
        cap = float(np.nanquantile(weights, float(CLIP_W_Q)))
        if np.isfinite(cap) and cap > 0:
            weights = np.minimum(weights, cap)

    # Recency factor anchored at the last training time index; non-finite
    # factors contribute zero weight.
    if USE_RECENCY:
        ts = df[TIME_COL].to_numpy(np.float64)
        decay = np.exp(-(float(TRAIN_MAX_TS) - ts) / float(TAU))
        weights = weights * np.where(np.isfinite(decay), decay, 0.0)

    return _clean_weight(weights)

def _asinh_fit_scale(y: np.ndarray) -> float:
    """Return the scale for the asinh target transform.

    The scale is the NaN-ignoring median of ``|y|``; ``1.0`` is used when that
    median is not finite or is at most ``1e-6``.
    """
    scale = float(np.nanmedian(np.abs(y)))
    usable = np.isfinite(scale) and scale > 1e-6
    return scale if usable else 1.0

def _asinh_transform(y: np.ndarray, s: float) -> np.ndarray:
    """Apply the forward target transform ``asinh(y / s)`` and return it as float32."""
    scaled = y / s
    transformed = np.arcsinh(scaled)
    return transformed.astype(np.float32)

def _asinh_inverse(z: np.ndarray, s: float) -> np.ndarray:
    """Invert the asinh target transform: ``sinh(clip(z)) * s`` as float64.

    ``z`` is clipped to ``[-ASINH_CLIP_PRED, ASINH_CLIP_PRED]`` before ``sinh``
    is applied.
    """
    bounded = np.clip(np.asarray(z, np.float64), -ASINH_CLIP_PRED, ASINH_CLIP_PRED)
    restored = np.sinh(bounded) * s
    return restored.astype(np.float64)

# Parse which exact model instances to train from top_model_names
# Expected names: "cb_p{pi}_s{sd}" and "lgb_s{sd}"
def _parse_top_models(top_names):
    """Split blend model names into CatBoost and LightGBM training specs.

    Recognised name formats are ``cb_p{pi}_s{sd}`` (CatBoost parameter-set
    index and seed) and ``lgb_s{sd}`` (LightGBM seed).

    Args:
        top_names: Iterable of model-name strings.

    Returns:
        ``(cb_list, lgb_list)`` where ``cb_list`` holds ``(pi, sd, name)``
        tuples and ``lgb_list`` holds ``(sd, name)`` tuples, both in input order.

    Names that cannot be parsed, or that belong to no known family, are skipped
    as before, but a ``[WARN]`` line is now printed for each of them: such a
    model is never trained, so its share of the blend would otherwise vanish
    without any trace.
    """
    cb_list = []
    lgb_list = []
    for nm in top_names:
        try:
            if nm.startswith("cb_"):
                # cb_p{pi}_s{sd}
                parts = nm.split("_")
                pi = int(parts[1].replace("p", ""))
                sd = int(parts[2].replace("s", ""))
                cb_list.append((pi, sd, nm))
            elif nm.startswith("lgb_"):
                # lgb_s{sd}
                sd = int(nm.split("s")[-1])
                lgb_list.append((sd, nm))
            else:
                print(f"[WARN] Unknown model family in top_model_names, skipped: {nm}")
        except (ValueError, IndexError):
            # int() on a non-numeric field, or a missing "_" separated field.
            print(f"[WARN] Could not parse model name in top_model_names, skipped: {nm}")
    return cb_list, lgb_list

# Model instances to train, derived from the blend's model names.
CB_KEEP, LGB_KEEP = _parse_top_models(top_model_names)
if USE_CATBOOST and (len(CB_KEEP) == 0):
    print("[WARN] No CatBoost model in top_model_names (blend). CatBoost inference may be skipped.")
if USE_LGBM and (len(LGB_KEEP) == 0):
    print("[WARN] No LGBM model in top_model_names (blend). LGBM inference may be skipped.")

# horizons: one final fit per distinct train horizon, or a single "__all__" fit
if MODE == "per_horizon" and ("horizon" in df_train.columns):
    horizons = sorted(df_train["horizon"].astype(str).unique().tolist())
else:
    horizons = ["__all__"]

# Pre-init predictions (test rows never reached by the main loop keep 0.0)
test_pred = np.zeros(len(df_test), dtype=np.float64)

# ----------------------------
# Main loop
# ----------------------------
t0 = time.time()
for h in horizons:
    # Row selectors for this horizon. When the test set has no "horizon"
    # column, every test row is scored by every horizon's models and the last
    # horizon processed wins.
    if h == "__all__":
        tr_idx = np.ones(len(df_train), dtype=bool)
        te_idx = np.ones(len(df_test), dtype=bool)
        h_key = "__all__"
    else:
        tr_idx = (df_train["horizon"].astype(str).to_numpy() == str(h))
        if "horizon" in df_test.columns:
            te_idx = (df_test["horizon"].astype(str).to_numpy() == str(h))
        else:
            te_idx = np.ones(len(df_test), dtype=bool)
        h_key = str(h)

    ntr = int(tr_idx.sum())
    nte = int(te_idx.sum())
    if ntr == 0 or nte == 0:
        continue

    # prepare data (keep as float32 where possible)
    X_tr = df_train.loc[tr_idx, FEATURE_COLS_MODEL]
    y_tr_raw = df_train.loc[tr_idx, TARGET_COL].to_numpy(np.float64)
    w_tr = _make_final_weight(df_train.loc[tr_idx])

    X_te = df_test.loc[te_idx, FEATURE_COLS_MODEL]

    # target transform consistent with Stage 6 v5
    if USE_TARGET_ASINH:
        s = _asinh_fit_scale(y_tr_raw)
        y_tr = _asinh_transform(y_tr_raw, s)
        inv = lambda z: _asinh_inverse(z, s)
    else:
        y_tr = y_tr_raw.astype(np.float32)
        inv = lambda z: np.asarray(z, np.float64)

    # Which models to use for this horizon blend?
    # Weights are stored by horizon key, falling back to the "__all__" entry.
    # If neither exists (wts is None) every kept model is trained and the
    # predictions are averaged uniformly (see the normalisation further down).
    wts = blend_weights_by_h.get(h_key, None)
    if wts is None:
        wts = blend_weights_by_h.get("__all__", None)

    # We'll accumulate prediction by iterating each kept model (RAM-safe).
    pred_acc = np.zeros(nte, dtype=np.float64)
    used_any = False
    n_used = 0  # number of models actually added to pred_acc

    # ---------- CatBoost models ----------
    if USE_CATBOOST and len(CB_PARAM_LIST) and len(CB_KEEP):
        pool_tr = Pool(X_tr, label=y_tr, weight=w_tr, cat_features=CAT_FEATURE_IDXS)
        pool_te = Pool(X_te, cat_features=CAT_FEATURE_IDXS)

        for (pi, sd, name) in CB_KEEP:
            # skip if weight for this model is 0 (for this horizon)
            if wts is not None and float(wts.get(name, 0.0)) <= 0:
                continue

            # parameter-set index must exist in the configured list
            if pi < 0 or pi >= len(CB_PARAM_LIST):
                continue

            pbase = CB_PARAM_LIST[pi]
            params = dict(
                loss_function="RMSE",
                eval_metric="RMSE",
                task_type="CPU",
                thread_count=-1,
                random_seed=int(sd + 777),
                allow_writing_files=False,
                **pbase
            )
            model = CatBoostRegressor(**params)
            model.fit(pool_tr, verbose=False)

            pred_t = model.predict(pool_te).astype(np.float64)  # transformed space if asinh enabled
            pred = inv(pred_t)

            wt = float(wts.get(name, 0.0)) if wts is not None else 1.0
            pred_acc += wt * pred
            used_any = True
            n_used += 1

            # optional save
            model.save_model(str(MODEL_DIR / f"cb_h{h_key}_{name}.cbm"))
            del model, pred_t, pred
            gc.collect()

    # ---------- LightGBM models ----------
    if USE_LGBM and len(LGB_KEEP):
        # separate copies so the categorical cast does not touch X_tr / X_te
        X_tr_l = X_tr.copy()
        X_te_l = X_te.copy()
        for c in CAT_COLS_MODEL:
            if c in X_tr_l.columns:
                X_tr_l[c] = X_tr_l[c].astype("category")
                X_te_l[c] = X_te_l[c].astype("category")

        dtrain = lgb.Dataset(X_tr_l, label=y_tr.astype(np.float32), weight=w_tr,
                             categorical_feature=CAT_COLS_MODEL, free_raw_data=True)

        # fixed number of rounds: there is no validation set in the final fit
        num_boost = int(CFG.get("LGB_NUM_BOOST", 6000))
        for (sd, name) in LGB_KEEP:
            if wts is not None and float(wts.get(name, 0.0)) <= 0:
                continue

            params = dict(LGB_PARAMS)
            params["seed"] = int(sd + 777)

            model = lgb.train(params, dtrain, num_boost_round=num_boost)
            pred_t = model.predict(X_te_l).astype(np.float64)
            pred = inv(pred_t)

            wt = float(wts.get(name, 0.0)) if wts is not None else 1.0
            pred_acc += wt * pred
            used_any = True
            n_used += 1

            model.save_model(str(MODEL_DIR / f"lgb_h{h_key}_{name}.txt"))
            del model, pred_t, pred
            gc.collect()

        del X_tr_l, X_te_l, dtrain
        gc.collect()

    if not used_any:
        # No model contributed (all weights <= 0, or nothing trainable):
        # the prediction for this horizon is left at zero.
        print(f"[WARN] No model used for horizon {h_key}. Fallback to mean=0.")
        pred_acc = np.zeros(nte, dtype=np.float64)
    elif wts is None:
        # Without blend weights each model was added with weight 1.0, so
        # pred_acc holds a SUM over n_used models. Divide to get the uniform
        # average; previously the sum itself was submitted.
        pred_acc /= float(n_used)

    # apply alpha calibration (per horizon, fallback to __all__)
    a = alpha_by_h.get(h_key, alpha_by_h.get("__all__", 1.0))
    pred_acc *= float(a)

    # assign
    test_pred[te_idx] = pred_acc

    print(f"H={h_key} | n_tr={ntr} n_te={nte} | used_models={used_any} | alpha={a}")

    del X_tr, X_te, y_tr_raw, y_tr, w_tr, pred_acc
    gc.collect()

print("Final inference done in", round(time.time()-t0, 1), "sec")

# ----------------------------
# Submission
# ----------------------------
# Two-column submission file: the test ids (as strings) and the blended,
# alpha-calibrated predictions, in df_test row order.
SUB_PATH = Path("/kaggle/working/submission.csv")
sub = pd.DataFrame({ID_COL: df_test[ID_COL].astype(str).values, "prediction": test_pred.astype(np.float64)})
sub.to_csv(SUB_PATH, index=False)
print("Saved submission:", str(SUB_PATH), "| shape:", sub.shape)

# ----------------------------
# Bundle for reproducibility
# ----------------------------
# Lightweight description of this run: the CV-stage config, the model names in
# the blend, the feature/categorical column lists, the key column names and the
# data shapes. created_utc is a Unix timestamp from time.time().
bundle = dict(
    created_utc=float(time.time()),
    cfg=CFG,
    top_model_names=top_model_names,
    features=FEATURE_COLS_MODEL,
    cat_cols=CAT_COLS_MODEL,
    cat_feature_idxs=CAT_FEATURE_IDXS,
    id_col=ID_COL, time_col=TIME_COL, weight_col=WEIGHT_COL, target_col=TARGET_COL,
    train_shape=list(df_train.shape),
    test_shape=list(df_test.shape),
)
(OUT_DIR / "bundle.json").write_text(json.dumps(bundle, indent=2))
print("Saved bundle:", str(OUT_DIR / "bundle.json"))

# Publish the predictions and output locations for any following cell.
globals().update({
    "test_pred": test_pred,
    "SUB_PATH": str(SUB_PATH),
    "BUNDLE_DIR": str(OUT_DIR),
})

gc.collect()
