# Load Data & Initial Inspection

In [1]:
# ============================================================
# STAGE 1 — Load Data & Initial Inspection (ONE CELL, Kaggle)
# Paths (given):
#   /kaggle/input/ts-forecasting/train.parquet
#   /kaggle/input/ts-forecasting/test.parquet
# Output globals:
#   df_train, df_test, TARGET_COL, ID_COL, WEIGHT_COL, TIME_COL, CAT_COLS, FEAT_COLS, NUM_COLS
# ============================================================

import os, gc
from pathlib import Path
import numpy as np
import pandas as pd

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 200)

TRAIN_PATH = Path("/kaggle/input/ts-forecasting/train.parquet")
TEST_PATH  = Path("/kaggle/input/ts-forecasting/test.parquet")

for p in [TRAIN_PATH, TEST_PATH]:
    if not p.exists():
        raise FileNotFoundError(f"Missing file: {p}")

print("Loading parquet...")
df_train = pd.read_parquet(TRAIN_PATH)
df_test  = pd.read_parquet(TEST_PATH)

print("\n==================== BASIC SHAPES ====================")
print("train:", df_train.shape)
print("test :", df_test.shape)

print("\n==================== COLUMNS ====================")
print("train cols:", len(df_train.columns))
print("test  cols:", len(df_test.columns))

# ---- Standard column names (from competition description)
ID_COL     = "id"
WEIGHT_COL = "weight"
TIME_COL   = "ts_index"
BASE_CATS  = ["code", "sub_code", "sub_category", "horizon"]

# ---- Detect target column (must exist in train, not in test)
train_only_cols = [c for c in df_train.columns if c not in df_test.columns]
# remove any obviously non-target extras if present
train_only_cols = [c for c in train_only_cols if c not in [ID_COL, WEIGHT_COL, TIME_COL] + BASE_CATS]

# Prefer common target names if present
preferred_names = ["target", "y", "label", "value", "prediction_target"]
TARGET_COL = None
for name in preferred_names:
    if name in df_train.columns and name not in df_test.columns:
        TARGET_COL = name
        break

if TARGET_COL is None:
    # If exactly one train-only col remains -> pick it
    if len(train_only_cols) == 1:
        TARGET_COL = train_only_cols[0]
    else:
        # Fallback: pick numeric column(s) absent in test
        cand = []
        for c in [c for c in df_train.columns if c not in df_test.columns]:
            if pd.api.types.is_numeric_dtype(df_train[c]):
                cand.append(c)
        # remove known non-target just in case
        cand = [c for c in cand if c not in [WEIGHT_COL, TIME_COL]]
        if len(cand) == 1:
            TARGET_COL = cand[0]
        elif len(cand) > 1:
            # pick the one with highest variance (usually the true target)
            vars_ = {c: float(np.nanvar(df_train[c].to_numpy(dtype=np.float64))) for c in cand}
            TARGET_COL = sorted(vars_.items(), key=lambda kv: kv[1], reverse=True)[0][0]
            print("\n[WARN] Multiple numeric train-only cols found; picked by variance:", TARGET_COL)
        else:
            # last resort: if any train-only cols exist, pick the first
            if len([c for c in df_train.columns if c not in df_test.columns]) > 0:
                TARGET_COL = [c for c in df_train.columns if c not in df_test.columns][0]
                print("\n[WARN] Could not confidently detect target; picked first train-only col:", TARGET_COL)
            else:
                raise RuntimeError("Could not detect target column (no train-only columns).")

print("\n==================== KEY COLS ====================")
print("ID_COL     :", ID_COL, "| exists:", ID_COL in df_train.columns and ID_COL in df_test.columns)
print("TIME_COL   :", TIME_COL, "| exists:", TIME_COL in df_train.columns and TIME_COL in df_test.columns)
print("WEIGHT_COL :", WEIGHT_COL, "| exists:", WEIGHT_COL in df_train.columns and WEIGHT_COL in df_test.columns)
print("TARGET_COL :", TARGET_COL, "| exists in train:", TARGET_COL in df_train.columns, "| exists in test:", TARGET_COL in df_test.columns)

# ---- Determine categorical columns that exist
CAT_COLS = [c for c in BASE_CATS if c in df_train.columns]
# Also include any object/category cols (excluding id)
for c in df_train.columns:
    if c == ID_COL or c == TARGET_COL:
        continue
    if pd.api.types.is_object_dtype(df_train[c]) or str(df_train[c].dtype).startswith("category"):
        if c not in CAT_COLS:
            CAT_COLS.append(c)

# ---- Determine feature columns (exclude id, target, weight; keep everything else)
EXCLUDE = set([ID_COL, TARGET_COL, WEIGHT_COL])
FEAT_COLS = [c for c in df_train.columns if c not in EXCLUDE]

# ---- Determine numeric feature columns
NUM_COLS = [c for c in FEAT_COLS if pd.api.types.is_numeric_dtype(df_train[c]) and c != WEIGHT_COL]

print("\n==================== QUICK CHECKS ====================")
# id uniqueness
if ID_COL in df_train.columns:
    print("train id unique:", df_train[ID_COL].nunique(), "/", len(df_train))
if ID_COL in df_test.columns:
    print("test  id unique:", df_test[ID_COL].nunique(), "/", len(df_test))

# ts_index ranges
if TIME_COL in df_train.columns and TIME_COL in df_test.columns:
    print("train ts_index range:", int(df_train[TIME_COL].min()), "->", int(df_train[TIME_COL].max()))
    print("test  ts_index range:", int(df_test[TIME_COL].min()),  "->", int(df_test[TIME_COL].max()))

# horizon distribution (small peek)
if "horizon" in df_train.columns:
    print("\ntrain horizon value counts (top):")
    print(df_train["horizon"].value_counts(dropna=False).head(10))
if "horizon" in df_test.columns:
    print("\ntest horizon value counts (top):")
    print(df_test["horizon"].value_counts(dropna=False).head(10))

# missingness summary (top 15 columns)
print("\n==================== MISSING VALUES (TOP) ====================")
miss_train = df_train.isna().mean().sort_values(ascending=False)
miss_test  = df_test.isna().mean().sort_values(ascending=False)
print("train missing rate top 15:")
print(miss_train.head(15))
print("\ntest missing rate top 15:")
print(miss_test.head(15))

# target stats
if TARGET_COL in df_train.columns and pd.api.types.is_numeric_dtype(df_train[TARGET_COL]):
    y = df_train[TARGET_COL].to_numpy(dtype=np.float64)
    print("\n==================== TARGET STATS ====================")
    print("count:", np.isfinite(y).sum(), " / ", len(y))
    print("mean :", float(np.nanmean(y)))
    print("std  :", float(np.nanstd(y)))
    print("min  :", float(np.nanmin(y)))
    print("p1   :", float(np.nanpercentile(y, 1)))
    print("p50  :", float(np.nanpercentile(y, 50)))
    print("p99  :", float(np.nanpercentile(y, 99)))
    print("max  :", float(np.nanmax(y)))

# weight stats (reminder: do NOT use as feature)
if WEIGHT_COL in df_train.columns and pd.api.types.is_numeric_dtype(df_train[WEIGHT_COL]):
    w = df_train[WEIGHT_COL].to_numpy(dtype=np.float64)
    print("\n==================== WEIGHT STATS (NOT A FEATURE) ====================")
    print("mean :", float(np.nanmean(w)))
    print("min  :", float(np.nanmin(w)))
    print("p50  :", float(np.nanpercentile(w, 50)))
    print("p99  :", float(np.nanpercentile(w, 99)))
    print("max  :", float(np.nanmax(w)))

print("\n==================== FEATURE SET SUMMARY ====================")
print("CAT_COLS :", CAT_COLS)
print("NUM_COLS :", len(NUM_COLS), "(numeric features excluding weight/target/id)")
print("FEAT_COLS:", len(FEAT_COLS), "(all usable columns excluding target and weight; id excluded)")

print("\n==================== HEAD (train) ====================")
display(df_train.head(3))
print("\n==================== HEAD (test) ====================")
display(df_test.head(3))

gc.collect()


Loading parquet...

train: (5337414, 94)
test : (1447107, 92)

train cols: 94
test  cols: 92

ID_COL     : id | exists: True
TIME_COL   : ts_index | exists: True
WEIGHT_COL : weight | exists: False
TARGET_COL : y_target | exists in train: True | exists in test: False

train id unique: 5337414 / 5337414
test  id unique: 1447107 / 1447107
train ts_index range: 1 -> 3601
test  ts_index range: 3602 -> 4376

train horizon value counts (top):
horizon
1     1394653
3     1385816
10    1337236
25    1219709
Name: count, dtype: int64

test horizon value counts (top):
horizon
1     379617
3     376558
10    362057
25    328875
Name: count, dtype: int64

train missing rate top 15:
feature_at    0.124719
feature_by    0.110192
feature_ay    0.085420
feature_cd    0.074964
feature_ce    0.051678
feature_cf    0.044289
feature_al    0.042233
feature_aw    0.038444
feature_bz    0.028426
feature_bi    0.027622
feature_k     0.011059
feature_i     0.011059
feature_h     0.010954
feature_j     0.010954

Unnamed: 0,id,code,sub_code,sub_category,horizon,ts_index,feature_a,feature_b,feature_c,feature_d,feature_e,feature_f,feature_g,feature_h,feature_i,feature_j,feature_k,feature_l,feature_m,feature_n,feature_o,feature_p,feature_q,feature_r,feature_s,feature_t,feature_u,feature_v,feature_w,feature_x,feature_y,feature_z,feature_aa,feature_ab,feature_ac,feature_ad,feature_ae,feature_af,feature_ag,feature_ah,feature_ai,feature_aj,feature_ak,feature_al,feature_am,feature_an,feature_ao,feature_ap,feature_aq,feature_ar,feature_as,feature_at,feature_au,feature_av,feature_aw,feature_ax,feature_ay,feature_az,feature_ba,feature_bb,feature_bc,feature_bd,feature_be,feature_bf,feature_bg,feature_bh,feature_bi,feature_bj,feature_bk,feature_bl,feature_bm,feature_bn,feature_bo,feature_bp,feature_bq,feature_br,feature_bs,feature_bt,feature_bu,feature_bv,feature_bw,feature_bx,feature_by,feature_bz,feature_ca,feature_cb,feature_cc,feature_cd,feature_ce,feature_cf,feature_cg,feature_ch,y_target,weight
0,W2MW3G2L__J0G2B0KU__PZ9S1Z4V__25__89,W2MW3G2L,J0G2B0KU,PZ9S1Z4V,25,89,29,16.364093,7.464023,5.966933,1.622184,10.26136,4.914369,0.000467,0.023686,0.006409,0.000187,0.744244,2.001013,-0.01687,0.009892,0.013162,0.021502,0.901966,0.402125,0.038566,0.177947,0.091141,-84.968733,-1.765306,10.109641,145.320404,0.08958,0.868698,0.080088,0.101631,0.026555,0.092776,0.004,1.298973,7.321646,3.628258,0.453027,-0.080212,0.192181,0.510727,17.136629,0.267856,7.745722,4.037853,4.85679,,5.188995,79.423474,244.471191,13.848771,,0.01707,0.709292,21.80395,0.120968,26999.430482,34126.269444,791.709562,0.15467,9499.742248,1.266071,429.318704,2540.88981,0.008927,1.122459,23.815924,0.54985,0.067941,0.076033,0.02759,-0.47269,-0.202944,-3.769914,0.104535,3.040304,4.499546,,-0.058543,-0.001686,-0.105328,-0.005045,,-0.133697,2.849819,0.112068,1,-0.551324,40.982572
1,W2MW3G2L__J0G2B0KU__PZ9S1Z4V__1__89,W2MW3G2L,J0G2B0KU,PZ9S1Z4V,1,89,53,2.858806,5.050617,15.906651,10.879453,3.072151,4.091032,0.000467,0.023686,0.006409,0.000187,0.744244,2.001013,-0.01687,0.009892,0.013162,0.021502,0.901966,0.402125,0.038566,0.177947,0.091141,-84.968733,-1.765306,10.109641,145.320404,0.08958,0.868698,0.080088,0.101631,0.026555,0.092776,0.004,1.298973,7.321646,3.628258,0.453027,0.00148,0.192181,0.510727,17.136629,0.267856,7.745722,4.037853,4.85679,,5.188995,79.423474,244.471191,13.848771,,0.01707,0.709292,21.80395,0.120968,26999.430482,34126.269444,791.709562,0.15467,9499.742248,1.266071,429.318704,2540.88981,0.008927,1.122459,23.815924,0.54985,0.067941,0.076033,0.02759,-0.47269,-0.202944,-3.769914,0.104535,3.040304,4.499546,,-0.058543,-0.001686,-0.105328,-0.005045,,-0.133697,2.849819,0.112068,1,-0.315583,150.075406
2,W2MW3G2L__J0G2B0KU__PZ9S1Z4V__3__89,W2MW3G2L,J0G2B0KU,PZ9S1Z4V,3,89,51,9.585452,1.076268,9.004147,16.74049,15.166901,11.427983,0.000467,0.023686,0.006409,0.000187,0.744244,2.001013,-0.01687,0.009892,0.013162,0.021502,0.901966,0.402125,0.038566,0.177947,0.091141,-84.968733,-1.765306,10.109641,145.320404,0.08958,0.868698,0.080088,0.101631,0.026555,0.092776,0.004,1.298973,7.321646,3.628258,0.453027,-0.045494,0.192181,0.510727,17.136629,0.267856,7.745722,4.037853,4.85679,,5.188995,79.423474,244.471191,13.848771,,0.01707,0.709292,21.80395,0.120968,26999.430482,34126.269444,791.709562,0.15467,9499.742248,1.266071,429.318704,2540.88981,0.008927,1.122459,23.815924,0.54985,0.067941,0.076033,0.02759,-0.47269,-0.202944,-3.769914,0.104535,3.040304,4.499546,,-0.058543,-0.001686,-0.105328,-0.005045,,-0.133697,2.849819,0.112068,1,-0.362894,115.953552





Unnamed: 0,id,code,sub_code,sub_category,horizon,ts_index,feature_a,feature_b,feature_c,feature_d,feature_e,feature_f,feature_g,feature_h,feature_i,feature_j,feature_k,feature_l,feature_m,feature_n,feature_o,feature_p,feature_q,feature_r,feature_s,feature_t,feature_u,feature_v,feature_w,feature_x,feature_y,feature_z,feature_aa,feature_ab,feature_ac,feature_ad,feature_ae,feature_af,feature_ag,feature_ah,feature_ai,feature_aj,feature_ak,feature_al,feature_am,feature_an,feature_ao,feature_ap,feature_aq,feature_ar,feature_as,feature_at,feature_au,feature_av,feature_aw,feature_ax,feature_ay,feature_az,feature_ba,feature_bb,feature_bc,feature_bd,feature_be,feature_bf,feature_bg,feature_bh,feature_bi,feature_bj,feature_bk,feature_bl,feature_bm,feature_bn,feature_bo,feature_bp,feature_bq,feature_br,feature_bs,feature_bt,feature_bu,feature_bv,feature_bw,feature_bx,feature_by,feature_bz,feature_ca,feature_cb,feature_cc,feature_cd,feature_ce,feature_cf,feature_cg,feature_ch
0,W2MW3G2L__495MGHFJ__PZ9S1Z4V__3__3647,W2MW3G2L,495MGHFJ,PZ9S1Z4V,3,3647,95,10.365266,3.209321,8.109339,9.043471,10.123041,15.722121,0.000243,0.021819,0.00142,7.3e-05,0.572125,1.265875,1.341192,0.005564,0.011987,0.035243,0.833918,1.791284,0.020539,0.218876,0.08066,-50.98124,-4.854592,-8.087713,119.237254,0.040442,0.635006,0.105355,0.075415,0.03444,0.09455,0.006728,1.986904,4.411098,3.050746,0.484755,0.020247,0.186578,0.528456,15.395411,0.219483,4.83955,2.420422,2.652015,0.0,4.151196,1012.649294,425.853042,197.344987,209.253182,0.016366,0.552138,108.859861,2.369993,66589.814887,34282.221003,1316.738008,0.04801,11660.961097,0.116372,11.122246,716.158132,0.008559,1.772256,38.452077,0.872948,0.06611,0.078856,0.030888,-0.480743,-0.197747,-3.659776,0.100295,3.131395,4.554259,-0.000832,-0.032241,-0.00083,-0.058961,-0.002774,-0.00148,-0.25646,1.665532,0.071324,2
1,W2MW3G2L__495MGHFJ__PZ9S1Z4V__10__3647,W2MW3G2L,495MGHFJ,PZ9S1Z4V,10,3647,88,2.571477,15.234848,16.505699,0.230426,10.145378,10.159641,0.000243,0.021819,0.00142,7.3e-05,0.572125,1.265875,1.341192,0.005564,0.011987,0.035243,0.833918,1.791284,0.020539,0.218876,0.08066,-50.98124,-4.854592,-8.087713,119.237254,0.040442,0.635006,0.105355,0.075415,0.03444,0.09455,0.006728,1.986904,4.411098,3.050746,0.484755,0.052623,0.186578,0.528456,15.395411,0.219483,4.83955,2.420422,2.652015,0.0,4.151196,1012.649294,425.853042,197.344987,209.253182,0.016366,0.552138,108.859861,2.369993,66589.814887,34282.221003,1316.738008,0.04801,11660.961097,0.116372,11.122246,716.158132,0.008559,1.772256,38.452077,0.872948,0.06611,0.078856,0.030888,-0.480743,-0.197747,-3.659776,0.100295,3.131395,4.554259,-0.000832,-0.032241,-0.00083,-0.058961,-0.002774,-0.00148,-0.25646,1.665532,0.071324,2
2,W2MW3G2L__495MGHFJ__PZ9S1Z4V__25__3647,W2MW3G2L,495MGHFJ,PZ9S1Z4V,25,3647,71,5.524709,6.931663,8.939537,0.668187,16.578701,3.15069,0.000243,0.021819,0.00142,7.3e-05,0.572125,1.265875,1.341192,0.005564,0.011987,0.035243,0.833918,1.791284,0.020539,0.218876,0.08066,-50.98124,-4.854592,-8.087713,119.237254,0.040442,0.635006,0.105355,0.075415,0.03444,0.09455,0.006728,1.986904,4.411098,3.050746,0.484755,0.041667,0.186578,0.528456,15.395411,0.219483,4.83955,2.420422,2.652015,0.0,4.151196,1012.649294,425.853042,197.344987,209.253182,0.016366,0.552138,108.859861,2.369993,66589.814887,34282.221003,1316.738008,0.04801,11660.961097,0.116372,11.122246,716.158132,0.008559,1.772256,38.452077,0.872948,0.06611,0.078856,0.030888,-0.480743,-0.197747,-3.659776,0.100295,3.131395,4.554259,-0.000832,-0.032241,-0.00083,-0.058961,-0.002774,-0.00148,-0.25646,1.665532,0.071324,2


0

# Sanity Checks & Leakage Rules Setup

In [2]:
# ============================================================
# STAGE 2 — Sanity Checks & Leakage Rules Setup (ONE CELL, Kaggle)
# Assumes STAGE 1 already ran and created:
#   df_train, df_test, TARGET_COL, ID_COL, TIME_COL, CAT_COLS, FEAT_COLS
# This stage:
# - Validates schema and uniqueness
# - Confirms time ordering (train < test)
# - Sets leakage-safe column lists
# - Defines "DO NOT USE" columns and lightweight guards
# Outputs/Globals:
#   DO_NOT_USE_COLS, FEATURE_COLS_NUM, FEATURE_COLS_CAT, FEATURE_COLS_ALL
#   TRAIN_MAX_TS, TEST_MIN_TS, TEST_MAX_TS
# ============================================================

import gc, re
import numpy as np
import pandas as pd

# ----------------------------
# 0) Require STAGE 1 globals
# ----------------------------
need = ["df_train","df_test","TARGET_COL","ID_COL","TIME_COL","CAT_COLS","FEAT_COLS"]
for k in need:
    if k not in globals():
        raise RuntimeError(f"Missing global '{k}'. Jalankan STAGE 1 dulu.")

assert isinstance(df_train, pd.DataFrame) and isinstance(df_test, pd.DataFrame)

# ----------------------------
# 1) Core column existence
# ----------------------------
must_in_train = [ID_COL, TIME_COL, TARGET_COL]
must_in_test  = [ID_COL, TIME_COL]
for c in must_in_train:
    if c not in df_train.columns:
        raise RuntimeError(f"Train missing required column: {c}")
for c in must_in_test:
    if c not in df_test.columns:
        raise RuntimeError(f"Test missing required column: {c}")

# Optional: weight may exist only in train
WEIGHT_COL = "weight"
has_weight_train = WEIGHT_COL in df_train.columns
has_weight_test  = WEIGHT_COL in df_test.columns

print("==================== STAGE 2: SANITY ====================")
print("Has weight in train:", has_weight_train, "| in test:", has_weight_test)
if has_weight_test:
    print("[WARN] weight also exists in test. We'll still exclude it as a feature.")
print("Target col:", TARGET_COL)

# ----------------------------
# 2) ID uniqueness checks
# ----------------------------
ntr = len(df_train)
nts = len(df_test)

nuniq_tr = df_train[ID_COL].nunique(dropna=False)
nuniq_ts = df_test[ID_COL].nunique(dropna=False)

if nuniq_tr != ntr:
    dup = df_train[df_train[ID_COL].duplicated(keep=False)][ID_COL].head(10).tolist()
    raise RuntimeError(f"Train id not unique: {nuniq_tr}/{ntr}. Example dups: {dup}")
if nuniq_ts != nts:
    dup = df_test[df_test[ID_COL].duplicated(keep=False)][ID_COL].head(10).tolist()
    raise RuntimeError(f"Test id not unique: {nuniq_ts}/{nts}. Example dups: {dup}")

intersect = np.intersect1d(df_train[ID_COL].values, df_test[ID_COL].values)
if len(intersect) > 0:
    print(f"[WARN] Train/Test share {len(intersect)} ids (unexpected). Example:", intersect[:5])

print("ID uniqueness: OK")

# ----------------------------
# 3) Time ordering checks
# ----------------------------
if not pd.api.types.is_integer_dtype(df_train[TIME_COL]) and not pd.api.types.is_numeric_dtype(df_train[TIME_COL]):
    raise RuntimeError(f"{TIME_COL} in train is not numeric.")
if not pd.api.types.is_integer_dtype(df_test[TIME_COL]) and not pd.api.types.is_numeric_dtype(df_test[TIME_COL]):
    raise RuntimeError(f"{TIME_COL} in test is not numeric.")

TRAIN_MAX_TS = int(np.nanmax(df_train[TIME_COL].values))
TRAIN_MIN_TS = int(np.nanmin(df_train[TIME_COL].values))
TEST_MIN_TS  = int(np.nanmin(df_test[TIME_COL].values))
TEST_MAX_TS  = int(np.nanmax(df_test[TIME_COL].values))

print("Train ts_index range:", TRAIN_MIN_TS, "->", TRAIN_MAX_TS)
print("Test  ts_index range:", TEST_MIN_TS,  "->", TEST_MAX_TS)

# Expect test period after train; allow small overlaps but flag loudly
if TEST_MIN_TS <= TRAIN_MAX_TS:
    print("[WARN] Test min ts_index <= Train max ts_index. Check competition rules / possible overlap.")
else:
    print("Time ordering (train -> test): OK (test starts after train).")

# ----------------------------
# 4) Leakage rules + feature lists
# ----------------------------
# DO NOT USE columns as model input features:
# - id, target, and weight (even if present in train/test)
DO_NOT_USE_COLS = {ID_COL, TARGET_COL, WEIGHT_COL}

# Basic categorical feature columns: from STAGE 1 CAT_COLS
FEATURE_COLS_CAT = [c for c in CAT_COLS if c not in DO_NOT_USE_COLS and c in df_train.columns]

# Numeric candidate features: all numeric columns except forbidden
numeric_cols = [c for c in df_train.columns if pd.api.types.is_numeric_dtype(df_train[c])]
FEATURE_COLS_NUM = [c for c in numeric_cols if c not in DO_NOT_USE_COLS and c != TIME_COL]  # exclude ts_index by default

# Full feature set used by "tabular model" baseline:
FEATURE_COLS_ALL = FEATURE_COLS_CAT + FEATURE_COLS_NUM

print("\n==================== FEATURE LISTS ====================")
print("Categorical features:", FEATURE_COLS_CAT)
print("Numeric features (excluding ts_index):", len(FEATURE_COLS_NUM))
print("Total features:", len(FEATURE_COLS_ALL))

# ----------------------------
# 5) Minimal integrity checks (dtypes, NaNs)
# ----------------------------
# Categorical columns should exist in both train and test
missing_cats_test = [c for c in FEATURE_COLS_CAT if c not in df_test.columns]
if missing_cats_test:
    raise RuntimeError(f"Categorical cols missing in test: {missing_cats_test}")

# Numeric columns should exist in both train and test for inference
missing_num_test = [c for c in FEATURE_COLS_NUM if c not in df_test.columns]
if missing_num_test:
    # It's possible, but unusual; better fail fast
    raise RuntimeError(f"Numeric feature cols missing in test: {missing_num_test[:10]} ... ({len(missing_num_test)} total)")

# Check target has no NaN (important)
y_nan = df_train[TARGET_COL].isna().mean()
print("\nTarget NaN rate:", float(y_nan))
if y_nan > 0:
    print("[WARN] Target has missing values. We'll need to drop or impute target rows later (usually drop).")

# Weight sanity (if exists)
if has_weight_train:
    w = df_train[WEIGHT_COL].to_numpy(dtype=np.float64)
    w_nan = np.isnan(w).mean()
    w_neg = np.mean(w < 0)
    w_zero = np.mean(w == 0)
    print("\n==================== WEIGHT SANITY (TRAIN) ====================")
    print("NaN rate:", float(w_nan), "| negative rate:", float(w_neg), "| zero rate:", float(w_zero))
    if w_neg > 0:
        print("[WARN] Found negative weights. Usually unexpected; we'll handle carefully later.")

# ----------------------------
# 6) Leakage-safe reminders (printed)
# ----------------------------
print("\n==================== LEAKAGE RULES (REMINDER) ====================")
print("- Do NOT use 'weight' as a feature (only as sample_weight).")
print("- Any preprocessing (imputer/encoder/scaler) must be fit on TRAIN-FOLD only.")
print("- Any time-based features (rolling/expanding) must be computed with shift(1) per group.")
print("- Do NOT compute statistics using future rows (ts_index > t) for predicting time t.")
print("- Avoid fitting encoders on train+test combined.")

gc.collect()

Has weight in train: True | in test: False
Target col: y_target
ID uniqueness: OK
Train ts_index range: 1 -> 3601
Test  ts_index range: 3602 -> 4376
Time ordering (train -> test): OK (test starts after train).

Categorical features: ['code', 'sub_code', 'sub_category', 'horizon']
Numeric features (excluding ts_index): 87
Total features: 91

Target NaN rate: 0.0

NaN rate: 0.0 | negative rate: 0.0 | zero rate: 0.0009332234673945098

- Do NOT use 'weight' as a feature (only as sample_weight).
- Any preprocessing (imputer/encoder/scaler) must be fit on TRAIN-FOLD only.
- Any time-based features (rolling/expanding) must be computed with shift(1) per group.
- Do NOT compute statistics using future rows (ts_index > t) for predicting time t.
- Avoid fitting encoders on train+test combined.


0

# Implement Official Metric

In [3]:
# ============================================================
# STAGE 3 — Implement Official Metric (ONE CELL, Kaggle)
# Assumes STAGE 1–2 already ran and created:
#   df_train, df_test, TARGET_COL, ID_COL, TIME_COL, (optional) WEIGHT_COL="weight"
# This stage:
# - Implements competition metric exactly
# - Adds helpers to score arrays / dataframes
# - Provides a few baselines sanity checks (zero, weighted-mean)
# Outputs/Globals:
#   weighted_rmse_score, score_df, score_arrays
# ============================================================

import numpy as np
import pandas as pd

# ----------------------------
# 0) Require minimal globals
# ----------------------------
need = ["df_train", "TARGET_COL", "ID_COL", "TIME_COL"]
for k in need:
    if k not in globals():
        raise RuntimeError(f"Missing global '{k}'. Jalankan STAGE 1–2 dulu.")

WEIGHT_COL = "weight"
HAS_W = WEIGHT_COL in df_train.columns

# ----------------------------
# 1) Official metric (as provided by host)
# ----------------------------
def _clip01(x: float) -> float:
    return float(np.minimum(np.maximum(x, 0.0), 1.0))

def weighted_rmse_score(y_target, y_pred, w) -> float:
    """
    Competition metric:
      denom = sum(w * y^2)
      ratio = sum(w * (y - yhat)^2) / denom
      score = sqrt( 1 - clip01(ratio) )
    """
    y_target = np.asarray(y_target, dtype=np.float64)
    y_pred   = np.asarray(y_pred, dtype=np.float64)
    w        = np.asarray(w, dtype=np.float64)

    # Robust guards
    if y_target.shape != y_pred.shape or y_target.shape != w.shape:
        raise ValueError(f"Shape mismatch: y={y_target.shape}, yhat={y_pred.shape}, w={w.shape}")

    # If denom is 0, the metric is ill-defined; return 0 safely
    denom = np.sum(w * (y_target ** 2))
    if not np.isfinite(denom) or denom <= 0:
        return 0.0

    ratio = np.sum(w * ((y_target - y_pred) ** 2)) / denom
    clipped = _clip01(ratio)
    val = 1.0 - clipped
    # Numerical safety
    val = max(val, 0.0)
    return float(np.sqrt(val))

# ----------------------------
# 2) Convenience wrappers
# ----------------------------
def score_arrays(y_true: np.ndarray, y_pred: np.ndarray, w: np.ndarray | None = None) -> float:
    if w is None:
        w = np.ones_like(y_true, dtype=np.float64)
    return weighted_rmse_score(y_true, y_pred, w)

def score_df(df: pd.DataFrame, y_col: str, pred_col: str, w_col: str = "weight") -> float:
    if w_col not in df.columns:
        w = np.ones(len(df), dtype=np.float64)
    else:
        w = df[w_col].to_numpy(dtype=np.float64)
    return weighted_rmse_score(df[y_col].to_numpy(dtype=np.float64),
                               df[pred_col].to_numpy(dtype=np.float64),
                               w)

# ----------------------------
# 3) Sanity check on train (simple baselines)
# ----------------------------
print("==================== STAGE 3: OFFICIAL METRIC ====================")
y = df_train[TARGET_COL].to_numpy(dtype=np.float64)

if HAS_W:
    w = df_train[WEIGHT_COL].to_numpy(dtype=np.float64)
else:
    w = np.ones_like(y, dtype=np.float64)

# Baseline A: predict 0
pred0 = np.zeros_like(y, dtype=np.float64)
s0 = weighted_rmse_score(y, pred0, w)

# Baseline B: predict weighted mean (best constant under weighted MSE)
# Use small epsilon to avoid /0
w_sum = float(np.sum(w))
c = float(np.sum(w * y) / (w_sum + 1e-18))
predc = np.full_like(y, c, dtype=np.float64)
sc = weighted_rmse_score(y, predc, w)

# Baseline C: predict unweighted median (often robust)
m = float(np.median(y))
predm = np.full_like(y, m, dtype=np.float64)
sm = weighted_rmse_score(y, predm, w)

print(f"Using weight column: {HAS_W}")
print(f"Baseline (predict 0)            score = {s0:.6f}")
print(f"Baseline (predict w-mean {c:.6f}) score = {sc:.6f}")
print(f"Baseline (predict median {m:.6f}) score = {sm:.6f}")

# Some extra diagnostics about denom / ratio scaling
denom = float(np.sum(w * (y ** 2)))
sse0  = float(np.sum(w * ((y - pred0) ** 2)))
ratio0 = sse0 / denom if denom > 0 else np.nan
print("\nDiagnostics:")
print(f"denom sum(w*y^2) = {denom:.6e}")
print(f"ratio(predict0)  = {ratio0:.6f}  (should be ~1.0 => score~0)")

print("\nGlobals exported: weighted_rmse_score, score_arrays, score_df")


Using weight column: True
Baseline (predict 0)            score = 0.000000
Baseline (predict w-mean -0.000024) score = 0.011117
Baseline (predict median -0.000577) score = 0.000000

Diagnostics:
denom sum(w*y^2) = 4.082630e+08
ratio(predict0)  = 1.000000  (should be ~1.0 => score~0)

Globals exported: weighted_rmse_score, score_arrays, score_df


# Time-based Validation Split

In [4]:
# ============================================================
# STAGE 4 — Time-based Validation Split (Leakage-Safe CV) (ONE CELL, Kaggle)
# Assumes STAGE 1–3 already ran and created:
#   df_train, df_test, ID_COL, TIME_COL, TARGET_COL, CAT_COLS
#
# This stage:
# - Builds walk-forward (blocked) time splits on ts_index
# - Optionally makes splits per-horizon (recommended later), but here we create a global fold id
# - Exports df_folds (id -> fold) and adds df_train["fold"]
#
# Outputs/Globals:
#   df_folds, df_train (with 'fold'), FOLD_CFG
#   fold_boundaries (list of dicts)
#
# Notes:
# - We use last portion of time as validation windows.
# - Training for fold k uses all data with ts_index <= train_end
#   Validation uses (train_end, valid_end] (strict future)
# ============================================================

import numpy as np
import pandas as pd

# ----------------------------
# 0) Require
# ----------------------------
need = ["df_train", "ID_COL", "TIME_COL"]
for k in need:
    if k not in globals():
        raise RuntimeError(f"Missing global '{k}'. Jalankan STAGE 1–3 dulu.")

# ----------------------------
# 1) Config (tune-friendly)
# ----------------------------
# Number of folds (walk-forward windows)
N_FOLDS = 4

# Validation window size in ts_index units.
# If None, we auto-set based on the last ~20% of time span.
VALID_WINDOW = None  # e.g., 150, 200, 300; or None for auto

# Gap between train_end and valid_start to reduce leakage via feature smoothing (usually 0 is OK)
GAP = 0

# Ensure we validate only on the tail period (mimics test)
TAIL_FRACTION = 0.25  # last 25% of time used to place validation windows

# Minimum validation samples per fold (fail-fast if too small)
MIN_VALID_ROWS = 200_000

# ----------------------------
# 2) Prepare timeline
# ----------------------------
ts = df_train[TIME_COL].to_numpy(dtype=np.int64)
ts_min = int(ts.min())
ts_max = int(ts.max())
ts_unique = np.unique(ts)
ts_unique.sort()

span = ts_max - ts_min + 1
tail_start_ts = int(ts_min + (1.0 - TAIL_FRACTION) * span)
tail_start_ts = max(tail_start_ts, ts_min)

# determine VALID_WINDOW
if VALID_WINDOW is None:
    tail_span = ts_max - tail_start_ts + 1
    # split tail into N_FOLDS windows, with a bit of buffer
    VALID_WINDOW = max(1, int(np.floor(tail_span / (N_FOLDS + 0.5))))
VALID_WINDOW = int(VALID_WINDOW)

# Build fold boundaries ending at ts_max
# Fold k validates on (train_end, valid_end], where valid_end increases toward ts_max
fold_boundaries = []
valid_end = ts_max
for k in range(N_FOLDS-1, -1, -1):
    valid_start = valid_end - VALID_WINDOW + 1
    # ensure validation window stays in tail
    if valid_start < tail_start_ts:
        valid_start = tail_start_ts
    train_end = valid_start - 1 - GAP
    fold_boundaries.append({
        "fold": k,
        "train_end": int(train_end),
        "valid_start": int(valid_start),
        "valid_end": int(valid_end),
        "gap": int(GAP),
        "valid_window": int(valid_end - valid_start + 1),
    })
    valid_end = train_end  # next fold ends where this train ended

# sort by fold id ascending
fold_boundaries = sorted(fold_boundaries, key=lambda d: d["fold"])

FOLD_CFG = {
    "N_FOLDS": N_FOLDS,
    "VALID_WINDOW": VALID_WINDOW,
    "GAP": GAP,
    "TAIL_FRACTION": TAIL_FRACTION,
    "MIN_VALID_ROWS": MIN_VALID_ROWS,
    "TIME_COL": TIME_COL,
    "ID_COL": ID_COL,
}

print("==================== STAGE 4: TIME SPLITS ====================")
print("Train ts_index:", ts_min, "->", ts_max, "| span:", span)
print("Tail fraction:", TAIL_FRACTION, "| tail_start_ts:", tail_start_ts)
print("N_FOLDS:", N_FOLDS, "| VALID_WINDOW:", VALID_WINDOW, "| GAP:", GAP)
print("\nFold boundaries:")
for b in fold_boundaries:
    print(f"  fold {b['fold']}: train <= {b['train_end']} | valid ({b['valid_start']}, {b['valid_end']}] | window={b['valid_window']}")

# ----------------------------
# 3) Assign folds
# ----------------------------
# Default fold = -1 (train-only, never used for validation)
fold_arr = np.full(len(df_train), -1, dtype=np.int16)

ts_series = df_train[TIME_COL].to_numpy(dtype=np.int64)

for b in fold_boundaries:
    k = b["fold"]
    vs, ve = b["valid_start"], b["valid_end"]
    mask = (ts_series >= vs) & (ts_series <= ve)
    fold_arr[mask] = k

df_train["fold"] = fold_arr

# df_folds mapping (id -> fold)
df_folds = df_train[[ID_COL, "fold"]].copy()

# ----------------------------
# 4) Diagnostics
# ----------------------------
vc = df_train["fold"].value_counts(dropna=False).sort_index()
print("\nFold row counts (fold=-1 means never validated):")
print(vc)

# Ensure each fold has enough validation rows
ok = True
for b in fold_boundaries:
    k = b["fold"]
    n_valid = int((df_train["fold"] == k).sum())
    if n_valid < MIN_VALID_ROWS:
        print(f"[WARN] fold {k} valid rows too small: {n_valid} < {MIN_VALID_ROWS}")
        ok = False
if ok:
    print("Validation sizes: OK")

# Quick check: validation is strictly in the future of its training end
viol = []
for b in fold_boundaries:
    if not (b["train_end"] < b["valid_start"]):
        viol.append(b["fold"])
if viol:
    raise RuntimeError(f"Invalid split: folds where train_end >= valid_start: {viol}")

print("\nGlobals exported: df_train['fold'], df_folds, FOLD_CFG, fold_boundaries")


Train ts_index: 1 -> 3601 | span: 3601
Tail fraction: 0.25 | tail_start_ts: 2701
N_FOLDS: 4 | VALID_WINDOW: 200 | GAP: 0

Fold boundaries:
  fold 0: train <= 2801 | valid (2802, 3001] | window=200
  fold 1: train <= 3001 | valid (3002, 3201] | window=200
  fold 2: train <= 3201 | valid (3202, 3401] | window=200
  fold 3: train <= 3401 | valid (3402, 3601] | window=200

Fold row counts (fold=-1 means never validated):
fold
-1    3981259
 0     343037
 1     342025
 2     335612
 3     335481
Name: count, dtype: int64
Validation sizes: OK

Globals exported: df_train['fold'], df_folds, FOLD_CFG, fold_boundaries


# Feature Preparation & Weighting Strategy

In [5]:
# ============================================================
# STAGE 5 — Feature Preparation & Weighting Strategy (ONE CELL, Kaggle)
# REVISI FULL (FIX: exclude 'fold' + any non-feature helper cols)
#
# Assumes STAGE 1–4 already ran and created:
#   df_train, df_test, TARGET_COL, ID_COL, TIME_COL
#   df_train["fold"] from STAGE 4
#
# This stage:
# - Finalize feature column lists (cat + num) with an option to include ts_index
# - Excludes helper columns like 'fold' from features
# - Optimize dtypes for categorical columns (category) to save RAM
# - Defines leakage-safe helper functions:
#     * make_sample_weight(...) -> uses official weight + optional recency weighting + optional clipping
#     * fit_median_imputer(...) / apply_median_imputer(...) (for linear models)
# - Prepares CatBoost cat feature indices for later stages
#
# Outputs/Globals:
#   WEIGHT_COL, USE_TS_AS_FEATURE
#   FEATURE_COLS_CAT, FEATURE_COLS_NUM, FEATURE_COLS_ALL
#   CAT_FEATURE_IDXS
#   make_sample_weight, fit_median_imputer, apply_median_imputer
#   TRAIN_MAX_TS
# ============================================================

import gc
import numpy as np
import pandas as pd

# ----------------------------
# 0) Require
# ----------------------------
need = ["df_train", "df_test", "TARGET_COL", "ID_COL", "TIME_COL"]
for k in need:
    if k not in globals():
        raise RuntimeError(f"Missing global '{k}'. Jalankan STAGE 1–4 dulu.")

WEIGHT_COL = "weight"
if WEIGHT_COL not in df_train.columns:
    raise RuntimeError("Kolom 'weight' tidak ada di train (harusnya ada di dataset ini).")

# Helper cols that must never be treated as features
HELPER_COLS = {"fold"}

# ----------------------------
# 1) Feature list finalization
# ----------------------------
USE_TS_AS_FEATURE = False  # ubah True untuk eksperimen drift

# Base categorical columns (as per competition)
BASE_CATS = ["code", "sub_code", "sub_category", "horizon"]
FEATURE_COLS_CAT = [c for c in BASE_CATS if c in df_train.columns and c not in HELPER_COLS]

# Numeric features: all numeric columns excluding forbidden + helpers
DO_NOT_USE = {ID_COL, TARGET_COL, WEIGHT_COL} | HELPER_COLS

numeric_cols = [c for c in df_train.columns if pd.api.types.is_numeric_dtype(df_train[c])]
FEATURE_COLS_NUM = [c for c in numeric_cols if c not in DO_NOT_USE]

# Remove ts_index if not used
if not USE_TS_AS_FEATURE and TIME_COL in FEATURE_COLS_NUM:
    FEATURE_COLS_NUM.remove(TIME_COL)

# Full feature set
FEATURE_COLS_ALL = FEATURE_COLS_CAT + FEATURE_COLS_NUM

# Ensure all selected features exist in test
missing_in_test = [c for c in FEATURE_COLS_ALL if c not in df_test.columns]
if missing_in_test:
    raise RuntimeError(f"Fitur berikut hilang di test: {missing_in_test[:20]} (total {len(missing_in_test)})")

# ----------------------------
# 2) Categorical dtype optimization (RAM + CatBoost friendliness)
# ----------------------------
for c in FEATURE_COLS_CAT:
    if str(df_train[c].dtype) != "category":
        df_train[c] = df_train[c].astype("category")
    if str(df_test[c].dtype) != "category":
        df_test[c] = df_test[c].astype("category")

# CatBoost expects cat feature indices in the final X column order
CAT_FEATURE_IDXS = list(range(len(FEATURE_COLS_CAT)))  # cat cols placed first

# ----------------------------
# 3) Weighting strategy helpers
# ----------------------------
TRAIN_MAX_TS = int(df_train[TIME_COL].max())

def make_sample_weight(df: pd.DataFrame,
                       use_recency: bool = True,
                       tau: float = 600.0,
                       clip_w_quantile: float | None = None,
                       eps: float = 1e-12) -> np.ndarray:
    """
    sample_weight = weight * recency_decay(optional)
    - clip_w_quantile: e.g. 0.999 or 0.9995, only if training unstable due to huge weights.
    """
    w = df[WEIGHT_COL].to_numpy(dtype=np.float64)

    if clip_w_quantile is not None:
        q = float(np.nanquantile(w, clip_w_quantile))
        if np.isfinite(q) and q > 0:
            w = np.minimum(w, q)

    if use_recency:
        t = df[TIME_COL].to_numpy(dtype=np.float64)
        rec = np.exp(-(TRAIN_MAX_TS - t) / float(tau))
        w = w * rec

    w = np.where(np.isfinite(w), w, 0.0)
    w = np.maximum(w, 0.0)
    if float(w.sum()) <= eps:
        w = np.ones(len(df), dtype=np.float64)
    return w

# ----------------------------
# 4) Median imputer (fit per train-fold only; leakage-safe)
# ----------------------------
def fit_median_imputer(df_fit: pd.DataFrame, num_cols: list[str]) -> dict:
    med = df_fit[num_cols].median(numeric_only=True)
    return {c: float(med[c]) if c in med.index and np.isfinite(med[c]) else 0.0 for c in num_cols}

def apply_median_imputer(df_apply: pd.DataFrame, medians: dict, num_cols: list[str]) -> pd.DataFrame:
    out = df_apply.copy()
    # fill only cols that contain NaN
    for c in num_cols:
        if c in out.columns and out[c].isna().any():
            out[c] = out[c].fillna(medians.get(c, 0.0))
    return out

# ----------------------------
# 5) Prints + quick stats
# ----------------------------
print("==================== STAGE 5: FEATURE PREP & WEIGHTING ====================")
print("USE_TS_AS_FEATURE:", USE_TS_AS_FEATURE, f"(ts_index {'included' if USE_TS_AS_FEATURE else 'excluded'})")
print("Helper cols excluded:", sorted(list(HELPER_COLS)))
print("Categorical cols:", FEATURE_COLS_CAT)
print("Numeric cols     :", len(FEATURE_COLS_NUM))
print("Total features   :", len(FEATURE_COLS_ALL))
print("Cat idxs for CatBoost:", CAT_FEATURE_IDXS)

# missingness overview (top 8) for selected numeric features (train vs test)
if len(FEATURE_COLS_NUM) > 0:
    miss_tr = df_train[FEATURE_COLS_NUM].isna().mean().sort_values(ascending=False).head(8)
    miss_te = df_test[FEATURE_COLS_NUM].isna().mean().sort_values(ascending=False).head(8)
    print("\nTop missing numeric features (train):")
    print(miss_tr)
    print("\nTop missing numeric features (test):")
    print(miss_te)

# weight sanity
w0 = df_train[WEIGHT_COL].to_numpy(dtype=np.float64)
print("\nWeight stats (train):")
print("  min:", float(np.nanmin(w0)),
      "p50:", float(np.nanpercentile(w0, 50)),
      "p99.9:", float(np.nanpercentile(w0, 99.9)),
      "max:", float(np.nanmax(w0)))
print("  zero_rate:", float(np.mean(w0 == 0.0)),
      "neg_rate:", float(np.mean(w0 < 0.0)))

print("\nLeakage reminder:")
print("- Fit imputer/encoder ONLY on train-fold (ts_index <= train_end).")
print("- If building rolling/expanding features: sort by ts_index and use shift(1) per group.")
print("- Do NOT use 'weight' as a feature; only as sample_weight.")

gc.collect()


USE_TS_AS_FEATURE: False (ts_index excluded)
Helper cols excluded: ['fold']
Categorical cols: ['code', 'sub_code', 'sub_category', 'horizon']
Numeric cols     : 87
Total features   : 91
Cat idxs for CatBoost: [0, 1, 2, 3]

Top missing numeric features (train):
feature_at    0.124719
feature_by    0.110192
feature_ay    0.085420
feature_cd    0.074964
feature_ce    0.051678
feature_cf    0.044289
feature_al    0.042233
feature_aw    0.038444
dtype: float64

Top missing numeric features (test):
feature_y     0.385765
feature_x     0.385765
feature_w     0.385765
feature_z     0.385765
feature_at    0.092342
feature_by    0.092043
feature_ay    0.057988
feature_cd    0.057969
dtype: float64

Weight stats (train):
  min: 0.0 p50: 1699.3843705131449 p99.9: 1321398915.3320074 max: 13912217783333.135
  zero_rate: 0.0009332234673945098 neg_rate: 0.0

Leakage reminder:
- Fit imputer/encoder ONLY on train-fold (ts_index <= train_end).
- If building rolling/expanding features: sort by ts_index an

0

# Model Training, OOF Evaluation, and Model Selection

In [6]:
# ============================================================
# STAGE 6 — Model Training, OOF Evaluation, and Model Selection (ONE CELL, Kaggle)
# REVISI FULL v3 (FIX: CatBoost bootstrap_type Bayesian vs subsample conflict)
# - Uses bootstrap_type="Bernoulli" so subsample is allowed
# - PER-HORIZON mode default; 'horizon' excluded from features (used only for filtering)
# ============================================================

import gc, json, time
from pathlib import Path
import numpy as np
import pandas as pd

# ----------------------------
# 0) Require
# ----------------------------
need = ["df_train","TARGET_COL","TIME_COL","ID_COL","FEATURE_COLS_NUM","fold_boundaries","weighted_rmse_score","make_sample_weight"]
for k in need:
    if k not in globals():
        raise RuntimeError(f"Missing global '{k}'. Jalankan STAGE 1–5 dulu.")

WEIGHT_COL = "weight"
if WEIGHT_COL not in df_train.columns:
    raise RuntimeError("Kolom 'weight' tidak ada di df_train.")
if "fold" not in df_train.columns:
    raise RuntimeError("Kolom 'fold' belum ada. Jalankan STAGE 4 dulu.")

try:
    from catboost import CatBoostRegressor, Pool
except Exception as e:
    raise RuntimeError(f"CatBoost import gagal: {e}")

# ----------------------------
# 1) Mode + Feature list
# ----------------------------
MODE = "per_horizon"  # "per_horizon" (recommended) or "single_model"

HELPER_COLS = {"fold"}
DROP_ALWAYS = {ID_COL, TARGET_COL, WEIGHT_COL} | HELPER_COLS

if MODE == "per_horizon":
    CAT_COLS = ["code", "sub_code", "sub_category"]
    DROP_ALWAYS = DROP_ALWAYS | {"horizon"}  # horizon hanya untuk filter
else:
    CAT_COLS = ["code", "sub_code", "sub_category", "horizon"]

CAT_COLS = [c for c in CAT_COLS if c in df_train.columns]
cat_set = set(CAT_COLS)

# Numeric cols from Stage 5, excluding forbidden/helper/cat
NUM_COLS = [c for c in FEATURE_COLS_NUM if c not in DROP_ALWAYS and c not in cat_set]

FEATURE_COLS_ALL = CAT_COLS + NUM_COLS
# dedup
seen = set()
FEATURE_COLS_ALL = [c for c in FEATURE_COLS_ALL if not (c in seen or seen.add(c))]

missing = [c for c in FEATURE_COLS_ALL if c not in df_train.columns]
if missing:
    raise RuntimeError(f"Missing features in train: {missing[:20]} (total {len(missing)})")

def get_cat_feature_indices(df: pd.DataFrame, cols: list[str]) -> list[int]:
    idxs = []
    for i, c in enumerate(cols):
        dt = df[c].dtype
        if str(dt) == "category" or dt == object:
            idxs.append(i)
    return idxs

CAT_FEATURE_IDXS = get_cat_feature_indices(df_train, FEATURE_COLS_ALL)

print("==================== STAGE 6: CATBOOST OOF ====================")
print("MODE:", MODE)
print("Total features:", len(FEATURE_COLS_ALL), "| cat idx count:", len(CAT_FEATURE_IDXS))
print("Cat cols:", CAT_COLS)
if MODE == "per_horizon":
    print("NOTE: 'horizon' excluded from features (used only for filtering).")

# ----------------------------
# 2) Config (CPU-safe)
# ----------------------------
SEED = 42

# Weight strategy for TRAIN
USE_RECENCY = True
TAU = 600.0
CLIP_W_Q = None  # kalau instabil, coba 0.999 atau 0.9995

# Runtime control
TRAIN_SAMPLE_CAP = 400_000   # per (fold,horizon)
SAMPLE_WEIGHTED = True

# CatBoost params (FIX: Bernoulli supports subsample)
CB_PARAMS = dict(
    loss_function="RMSE",
    eval_metric="RMSE",
    iterations=1500,
    learning_rate=0.06,
    depth=8,
    l2_leaf_reg=6.0,
    random_strength=1.0,
    rsm=0.9,
    min_data_in_leaf=300,

    bootstrap_type="Bernoulli",  # FIX
    subsample=0.8,               # allowed with Bernoulli

    task_type="CPU",
    thread_count=-1,
    random_seed=SEED,
    allow_writing_files=False,
)
EARLY_STOPPING_ROUNDS = 150

OUT_DIR = Path("/kaggle/working/tsf_stage6_catboost_models")
OUT_DIR.mkdir(parents=True, exist_ok=True)

CB_CFG_USED = dict(
    MODE=MODE, SEED=SEED,
    USE_RECENCY=USE_RECENCY, TAU=TAU, CLIP_W_Q=CLIP_W_Q,
    TRAIN_SAMPLE_CAP=TRAIN_SAMPLE_CAP, SAMPLE_WEIGHTED=SAMPLE_WEIGHTED,
    CB_PARAMS=CB_PARAMS, EARLY_STOPPING_ROUNDS=EARLY_STOPPING_ROUNDS,
    FEATURE_COLS_ALL=FEATURE_COLS_ALL, CAT_FEATURE_IDXS=CAT_FEATURE_IDXS
)

# ----------------------------
# 3) Arrays
# ----------------------------
n = len(df_train)
oof_pred = np.full(n, np.nan, dtype=np.float32)

y_all    = df_train[TARGET_COL].to_numpy(dtype=np.float64)
w_off    = df_train[WEIGHT_COL].to_numpy(dtype=np.float64)   # official weight for scoring
ts_all   = df_train[TIME_COL].to_numpy(dtype=np.int64)
fold_all = df_train["fold"].to_numpy(dtype=np.int16)

if "horizon" in df_train.columns:
    h_all = df_train["horizon"].astype(int).to_numpy(dtype=np.int16)
    h_vals = np.unique(h_all); h_vals.sort()
else:
    h_all = None
    h_vals = np.array([0], dtype=np.int16)

rng = np.random.default_rng(SEED)

# ----------------------------
# 4) Helpers
# ----------------------------
def sample_cap_indices(idx: np.ndarray, w_aligned: np.ndarray, cap: int | None) -> np.ndarray:
    if cap is None or len(idx) <= cap:
        return idx
    if not SAMPLE_WEIGHTED:
        return rng.choice(idx, size=cap, replace=False)
    ww = np.asarray(w_aligned, dtype=np.float64)
    ww = np.where(np.isfinite(ww), ww, 0.0)
    ww = np.maximum(ww, 0.0)
    s = ww.sum()
    if s <= 0:
        return rng.choice(idx, size=cap, replace=False)
    p = ww / s
    return rng.choice(idx, size=cap, replace=False, p=p)

def fold_train_valid_indices(b: dict) -> tuple[np.ndarray, np.ndarray]:
    k = int(b["fold"])
    train_end = int(b["train_end"])
    vs, ve = int(b["valid_start"]), int(b["valid_end"])
    tr_idx = np.where(ts_all <= train_end)[0]
    va_idx = np.where((ts_all >= vs) & (ts_all <= ve))[0]
    # sanity compare with fold markers
    va2 = np.where(fold_all == k)[0]
    if abs(len(va_idx) - len(va2)) > 1000:
        print(f"[WARN] Fold {k}: time-valid {len(va_idx)} vs fold-mark {len(va2)}")
    return tr_idx, va_idx

# ----------------------------
# 5) Train OOF
# ----------------------------
models_cb = {}
oof_score_by_fold = {}
oof_score_by_horizon = {}

t0 = time.time()

for b in fold_boundaries:
    k = int(b["fold"])
    tr_idx_all, va_idx_all = fold_train_valid_indices(b)

    print("\n" + "-"*72)
    print(f"FOLD {k} | train<= {b['train_end']} | valid {b['valid_start']}..{b['valid_end']} | "
          f"train_rows={len(tr_idx_all):,} valid_rows={len(va_idx_all):,}")

    if MODE == "single_model":
        tr_idx = tr_idx_all
        va_idx = va_idx_all

        w_tr_full = make_sample_weight(df_train.iloc[tr_idx], use_recency=USE_RECENCY, tau=TAU, clip_w_quantile=CLIP_W_Q)
        tr_idx_cap = sample_cap_indices(tr_idx, w_tr_full, TRAIN_SAMPLE_CAP)
        w_tr = make_sample_weight(df_train.iloc[tr_idx_cap], use_recency=USE_RECENCY, tau=TAU, clip_w_quantile=CLIP_W_Q)

        X_tr = df_train.iloc[tr_idx_cap][FEATURE_COLS_ALL]
        y_tr = y_all[tr_idx_cap]
        X_va = df_train.iloc[va_idx][FEATURE_COLS_ALL]
        y_va = y_all[va_idx]
        w_va = w_off[va_idx].astype(np.float64)

        train_pool = Pool(X_tr, label=y_tr, weight=w_tr, cat_features=CAT_FEATURE_IDXS)
        valid_pool = Pool(X_va, label=y_va, weight=w_va, cat_features=CAT_FEATURE_IDXS)

        model = CatBoostRegressor(**CB_PARAMS)
        model.fit(train_pool, eval_set=valid_pool, use_best_model=True,
                  verbose=200, early_stopping_rounds=EARLY_STOPPING_ROUNDS)

        pred_va = model.predict(valid_pool).astype(np.float32)
        oof_pred[va_idx] = pred_va

        best_it = int(model.get_best_iteration() if model.get_best_iteration() is not None else CB_PARAMS["iterations"])
        model_path = OUT_DIR / f"cb_single_fold{k}_best{best_it}.cbm"
        model.save_model(str(model_path))
        models_cb[("single", k)] = str(model_path)

        s_fold = weighted_rmse_score(y_va, pred_va.astype(np.float64), w_off[va_idx].astype(np.float64))
        oof_score_by_fold[k] = float(s_fold)
        print(f"FOLD {k} single-model score: {s_fold:.6f}")

        del model, train_pool, valid_pool, X_tr, X_va
        gc.collect()

    else:
        for h in h_vals:
            h = int(h)
            tr_idx = tr_idx_all[h_all[tr_idx_all] == h]
            va_idx = va_idx_all[h_all[va_idx_all] == h]
            if len(tr_idx) == 0 or len(va_idx) == 0:
                print(f"  horizon={h}: skipped (train={len(tr_idx)}, valid={len(va_idx)})")
                continue

            w_tr_full = make_sample_weight(df_train.iloc[tr_idx], use_recency=USE_RECENCY, tau=TAU, clip_w_quantile=CLIP_W_Q)
            tr_idx_cap = sample_cap_indices(tr_idx, w_tr_full, TRAIN_SAMPLE_CAP)
            w_tr = make_sample_weight(df_train.iloc[tr_idx_cap], use_recency=USE_RECENCY, tau=TAU, clip_w_quantile=CLIP_W_Q)

            X_tr = df_train.iloc[tr_idx_cap][FEATURE_COLS_ALL]
            y_tr = y_all[tr_idx_cap]

            X_va = df_train.iloc[va_idx][FEATURE_COLS_ALL]
            y_va = y_all[va_idx]
            w_va = w_off[va_idx].astype(np.float64)

            train_pool = Pool(X_tr, label=y_tr, weight=w_tr, cat_features=CAT_FEATURE_IDXS)
            valid_pool = Pool(X_va, label=y_va, weight=w_va, cat_features=CAT_FEATURE_IDXS)

            model = CatBoostRegressor(**CB_PARAMS)
            model.fit(train_pool, eval_set=valid_pool, use_best_model=True,
                      verbose=200, early_stopping_rounds=EARLY_STOPPING_ROUNDS)

            pred_va = model.predict(valid_pool).astype(np.float32)
            oof_pred[va_idx] = pred_va

            best_it = int(model.get_best_iteration() if model.get_best_iteration() is not None else CB_PARAMS["iterations"])
            model_path = OUT_DIR / f"cb_h{h}_fold{k}_best{best_it}.cbm"
            model.save_model(str(model_path))
            models_cb[(h, k)] = str(model_path)

            s = weighted_rmse_score(y_va, pred_va.astype(np.float64), w_off[va_idx].astype(np.float64))
            print(f"  horizon={h}: train={len(tr_idx_cap):,} (cap from {len(tr_idx):,}) | valid={len(va_idx):,} | "
                  f"best_it={best_it} | score={s:.6f}")

            del model, train_pool, valid_pool, X_tr, X_va
            gc.collect()

        idx_fold = np.where(fold_all == k)[0]
        pf = oof_pred[idx_fold].astype(np.float64)
        yf = y_all[idx_fold]
        wf = w_off[idx_fold]
        m = np.isfinite(pf)
        s_fold = weighted_rmse_score(yf[m], pf[m], wf[m])
        oof_score_by_fold[k] = float(s_fold)
        print(f"FOLD {k} aggregate OOF score: {s_fold:.6f} | n={m.sum():,}")

# ----------------------------
# 6) Aggregate OOF + per-horizon
# ----------------------------
valid_idx_all = np.where(fold_all >= 0)[0]
p_all = oof_pred[valid_idx_all].astype(np.float64)
y_v   = y_all[valid_idx_all]
w_v   = w_off[valid_idx_all]
m = np.isfinite(p_all)
oof_score_all = float(weighted_rmse_score(y_v[m], p_all[m], w_v[m]))

if MODE == "per_horizon":
    for h in h_vals:
        h = int(h)
        idx_h = valid_idx_all[h_all[valid_idx_all] == h]
        if len(idx_h) == 0:
            continue
        ph = oof_pred[idx_h].astype(np.float64)
        yh = y_all[idx_h]
        wh = w_off[idx_h]
        mm = np.isfinite(ph)
        oof_score_by_horizon[h] = float(weighted_rmse_score(yh[mm], ph[mm], wh[mm]))

elapsed = int(time.time() - t0)

print("\n" + "="*72)
print("OOF SCORE (ALL FOLDS):", f"{oof_score_all:.6f}")
print("OOF SCORE BY FOLD:", oof_score_by_fold)
if MODE == "per_horizon":
    print("OOF SCORE BY HORIZON:", oof_score_by_horizon)
print("Models saved under:", str(OUT_DIR))
print("Elapsed (sec):", elapsed)

# Save report
report_path = OUT_DIR / "stage6_oof_report.json"
with open(report_path, "w") as f:
    json.dump(
        {
            "oof_score_all": oof_score_all,
            "oof_score_by_fold": oof_score_by_fold,
            "oof_score_by_horizon": oof_score_by_horizon,
            "fold_boundaries": fold_boundaries,
            "cfg": CB_CFG_USED,
            "models": {str(k): v for k, v in models_cb.items()},
        },
        f,
        indent=2
    )
print("Saved report:", str(report_path))

# Export globals
globals()["oof_pred"] = oof_pred
globals()["oof_score_all"] = oof_score_all
globals()["oof_score_by_fold"] = oof_score_by_fold
globals()["oof_score_by_horizon"] = oof_score_by_horizon
globals()["models_cb"] = models_cb
globals()["CB_CFG_USED"] = CB_CFG_USED

gc.collect()


MODE: per_horizon
Total features: 89 | cat idx count: 3
Cat cols: ['code', 'sub_code', 'sub_category']
NOTE: 'horizon' excluded from features (used only for filtering).

------------------------------------------------------------------------
FOLD 0 | train<= 2801 | valid 2802..3001 | train_rows=3,981,259 valid_rows=343,037
0:	learn: 0.0005893	test: 0.0007221	best: 0.0007221 (0)	total: 402ms	remaining: 10m 2s
200:	learn: 0.0005874	test: 0.0007221	best: 0.0007220 (76)	total: 53.9s	remaining: 5m 48s
Stopped by overfitting detector  (150 iterations wait)

bestTest = 0.0007220254686
bestIteration = 76

Shrink model to first 77 iterations.
  horizon=1: train=400,000 (cap from 1,039,446) | valid=89,622 | best_it=76 | score=0.016606
0:	learn: 0.0009301	test: 0.0012907	best: 0.0012907 (0)	total: 370ms	remaining: 9m 14s
200:	learn: 0.0009258	test: 0.0012903	best: 0.0012902 (160)	total: 54.8s	remaining: 5m 53s
400:	learn: 0.0009115	test: 0.0012899	best: 0.0012898 (394)	total: 1m 51s	remaining: 5

33

# Final Fit, Test Inference, and Submission Packaging

In [7]:
# ============================================================
# STAGE 7 — Final Fit, Test Inference, and Submission Packaging (ONE CELL, Kaggle)
# Baseline: CatBoostRegressor final training per-horizon on FULL TRAIN (all ts_index <= 3601)
# Then predict TEST and write /kaggle/working/submission.csv
#
# Requires globals from STAGE 1–6:
#   df_train, df_test, TARGET_COL, ID_COL, TIME_COL, WEIGHT_COL="weight"
#   FEATURE_COLS_ALL, CAT_FEATURE_IDXS   (from STAGE 6 cell)
#   make_sample_weight                  (from STAGE 5)
#   CB_CFG_USED                         (from STAGE 6)  (for params)
#
# Output:
#   /kaggle/working/submission.csv
#   /kaggle/working/tsf_stage7_bundle/final_models/*.cbm
#   /kaggle/working/tsf_stage7_bundle/bundle.json
# ============================================================

import os, gc, json, time
from pathlib import Path
import numpy as np
import pandas as pd

# ----------------------------
# 0) Require
# ----------------------------
need = ["df_train","df_test","ID_COL","TIME_COL","TARGET_COL","make_sample_weight"]
for k in need:
    if k not in globals():
        raise RuntimeError(f"Missing global '{k}'. Jalankan STAGE 1–6 dulu.")

WEIGHT_COL = "weight"
if WEIGHT_COL not in df_train.columns:
    raise RuntimeError("Kolom 'weight' tidak ada di df_train.")

# Feature globals from Stage 6 (or fallback from Stage 5)
if "FEATURE_COLS_ALL" not in globals() or "CAT_FEATURE_IDXS" not in globals():
    raise RuntimeError("Missing FEATURE_COLS_ALL / CAT_FEATURE_IDXS. Jalankan STAGE 6 (revisi) dulu.")

# CatBoost import
try:
    from catboost import CatBoostRegressor, Pool
except Exception as e:
    raise RuntimeError(f"CatBoost import gagal: {e}")

# ----------------------------
# 1) Load training config from Stage 6
# ----------------------------
if "CB_CFG_USED" in globals() and isinstance(CB_CFG_USED, dict) and "CB_PARAMS" in CB_CFG_USED:
    CB_PARAMS = dict(CB_CFG_USED["CB_PARAMS"])
    MODE = CB_CFG_USED.get("MODE", "per_horizon")
    USE_RECENCY = bool(CB_CFG_USED.get("USE_RECENCY", True))
    TAU = float(CB_CFG_USED.get("TAU", 600.0))
    CLIP_W_Q = CB_CFG_USED.get("CLIP_W_Q", None)
else:
    # fallback defaults
    MODE = "per_horizon"
    USE_RECENCY = True
    TAU = 600.0
    CLIP_W_Q = None
    CB_PARAMS = dict(
        loss_function="RMSE",
        eval_metric="RMSE",
        iterations=1500,
        learning_rate=0.06,
        depth=8,
        l2_leaf_reg=6.0,
        random_strength=1.0,
        rsm=0.9,
        min_data_in_leaf=300,
        bootstrap_type="Bernoulli",
        subsample=0.8,
        task_type="CPU",
        thread_count=-1,
        random_seed=42,
        allow_writing_files=False,
    )

print("==================== STAGE 7: FINAL FIT + TEST INFER + SUBMISSION ====================")
print("MODE:", MODE)
print("USE_RECENCY:", USE_RECENCY, "| TAU:", TAU, "| CLIP_W_Q:", CLIP_W_Q)
print("Features:", len(FEATURE_COLS_ALL), "| Cat idx:", len(CAT_FEATURE_IDXS))

# ----------------------------
# 2) Bundle dirs
# ----------------------------
BUNDLE_DIR = Path("/kaggle/working/tsf_stage7_bundle")
MODEL_DIR = BUNDLE_DIR / "final_models"
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# ----------------------------
# 3) Prepare horizon handling
# ----------------------------
if "horizon" not in df_train.columns or "horizon" not in df_test.columns:
    raise RuntimeError("Kolom 'horizon' wajib ada di train & test.")

h_train = df_train["horizon"].astype(int).to_numpy()
h_test  = df_test["horizon"].astype(int).to_numpy()

h_vals = np.unique(h_train)
h_vals.sort()

# In per_horizon mode, make sure horizon is NOT a feature (it was excluded in Stage 6)
if MODE == "per_horizon" and "horizon" in FEATURE_COLS_ALL:
    raise RuntimeError("MODE per_horizon tapi 'horizon' masih ada di FEATURE_COLS_ALL. Jalankan ulang STAGE 6 revisi.")

# ----------------------------
# 4) Train final model(s) and predict test
# ----------------------------
t0 = time.time()

# Prediction array for test
test_pred = np.zeros(len(df_test), dtype=np.float32)

# Training data
y = df_train[TARGET_COL].to_numpy(dtype=np.float64)

final_models = {}

if MODE == "single_model":
    # Train one model on full train and predict all test
    w_tr = make_sample_weight(df_train, use_recency=USE_RECENCY, tau=TAU, clip_w_quantile=CLIP_W_Q)

    X_tr = df_train[FEATURE_COLS_ALL]
    X_te = df_test[FEATURE_COLS_ALL]

    train_pool = Pool(X_tr, label=y, weight=w_tr, cat_features=CAT_FEATURE_IDXS)
    test_pool  = Pool(X_te, cat_features=CAT_FEATURE_IDXS)

    model = CatBoostRegressor(**CB_PARAMS)
    model.fit(train_pool, verbose=200)

    pred_te = model.predict(test_pool).astype(np.float32)
    test_pred[:] = pred_te

    model_path = MODEL_DIR / "cb_final_single.cbm"
    model.save_model(str(model_path))
    final_models["single"] = str(model_path)

    del model, train_pool, test_pool, X_tr, X_te
    gc.collect()

else:
    # Per-horizon: train one model for each horizon value and predict test rows for that horizon
    for h in h_vals:
        h = int(h)
        tr_idx = np.where(h_train == h)[0]
        te_idx = np.where(h_test == h)[0]

        if len(tr_idx) == 0:
            print(f"[WARN] horizon={h}: no train rows, skipped")
            continue
        if len(te_idx) == 0:
            print(f"[WARN] horizon={h}: no test rows, skipped")
            continue

        df_tr_h = df_train.iloc[tr_idx]
        w_tr = make_sample_weight(df_tr_h, use_recency=USE_RECENCY, tau=TAU, clip_w_quantile=CLIP_W_Q)

        X_tr = df_tr_h[FEATURE_COLS_ALL]
        y_tr = y[tr_idx]

        X_te = df_test.iloc[te_idx][FEATURE_COLS_ALL]

        train_pool = Pool(X_tr, label=y_tr, weight=w_tr, cat_features=CAT_FEATURE_IDXS)
        test_pool  = Pool(X_te, cat_features=CAT_FEATURE_IDXS)

        model = CatBoostRegressor(**CB_PARAMS)
        model.fit(train_pool, verbose=200)

        pred_te = model.predict(test_pool).astype(np.float32)
        test_pred[te_idx] = pred_te

        model_path = MODEL_DIR / f"cb_final_h{h}.cbm"
        model.save_model(str(model_path))
        final_models[str(h)] = str(model_path)

        print(f"horizon={h}: train={len(tr_idx):,} test={len(te_idx):,} -> saved {model_path.name}")

        del model, train_pool, test_pool, X_tr, X_te
        gc.collect()

elapsed = int(time.time() - t0)
print("Final fit + inference done. Elapsed (sec):", elapsed)

# ----------------------------
# 5) Build submission.csv
# ----------------------------
sub = pd.DataFrame({
    "id": df_test[ID_COL].astype(str).values,
    "prediction": test_pred.astype(np.float64)  # keep float64 for CSV
})

# safety: unique ids
if sub["id"].nunique() != len(sub):
    raise RuntimeError("Submission id tidak unik (unexpected).")

SUB_PATH = Path("/kaggle/working/submission.csv")
sub.to_csv(SUB_PATH, index=False)
print("Saved:", str(SUB_PATH), "| shape:", sub.shape)
print(sub.head(5))

# ----------------------------
# 6) Package bundle (models + config + feature list)
# ----------------------------
bundle = {
    "created_at_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    "mode": MODE,
    "feature_cols": FEATURE_COLS_ALL,
    "cat_feature_idxs": CAT_FEATURE_IDXS,
    "cat_cols_intended": [c for c in ["code","sub_code","sub_category","horizon"] if c in df_train.columns],
    "cb_params": CB_PARAMS,
    "weighting": {"use_recency": USE_RECENCY, "tau": TAU, "clip_w_q": CLIP_W_Q},
    "final_models": final_models,
    "train_ts_index_max": int(df_train[TIME_COL].max()),
    "test_ts_index_min": int(df_test[TIME_COL].min()),
    "test_ts_index_max": int(df_test[TIME_COL].max()),
    "n_train": int(len(df_train)),
    "n_test": int(len(df_test)),
}
BUNDLE_JSON = BUNDLE_DIR / "bundle.json"
BUNDLE_JSON.write_text(json.dumps(bundle, indent=2))
print("Saved bundle:", str(BUNDLE_JSON))

# Export globals
globals()["test_pred"] = test_pred
globals()["SUB_PATH"] = str(SUB_PATH)
globals()["BUNDLE_DIR"] = str(BUNDLE_DIR)

gc.collect()


MODE: per_horizon
USE_RECENCY: True | TAU: 600.0 | CLIP_W_Q: None
Features: 89 | Cat idx: 3
0:	learn: 0.0010246	total: 1.14s	remaining: 28m 32s
200:	learn: 0.0010236	total: 2m 46s	remaining: 17m 54s
400:	learn: 0.0010208	total: 5m 32s	remaining: 15m 12s
600:	learn: 0.0010184	total: 8m 37s	remaining: 12m 54s
800:	learn: 0.0010165	total: 11m 41s	remaining: 10m 12s
1000:	learn: 0.0010145	total: 14m 47s	remaining: 7m 22s
1200:	learn: 0.0010129	total: 17m 54s	remaining: 4m 27s
1400:	learn: 0.0010114	total: 20m 58s	remaining: 1m 28s
1499:	learn: 0.0010108	total: 22m 29s	remaining: 0us
horizon=1: train=1,394,653 test=379,617 -> saved cb_final_h1.cbm
0:	learn: 0.0017059	total: 1s	remaining: 24m 59s
200:	learn: 0.0017030	total: 2m 37s	remaining: 16m 57s
400:	learn: 0.0016956	total: 5m 26s	remaining: 14m 53s
600:	learn: 0.0016901	total: 8m 30s	remaining: 12m 44s
800:	learn: 0.0016841	total: 11m 37s	remaining: 10m 8s
1000:	learn: 0.0016796	total: 14m 42s	remaining: 7m 19s
1200:	learn: 0.0016749	t

33