In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/penyisihan-datavidia-10/sample_submission.csv
/kaggle/input/penyisihan-datavidia-10/ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2012-komponen-data.csv
/kaggle/input/penyisihan-datavidia-10/ISPU/data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-2023-komponen-data.csv
/kaggle/input/penyisihan-datavidia-10/ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2010-komponen-data.csv
/kaggle/input/penyisihan-datavidia-10/ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2018-komponen-data.csv
/kaggle/input/penyisihan-datavidia-10/ISPU/data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-komponen-data-2025.csv
/kaggle/input/penyisihan-datavidia-10/ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2014-komponen-data.csv
/kaggle/input/penyisihan-datavidia-10/ISPU/data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-komponen-data-2024.csv
/kaggle/input/penyisihan-datavidia-10/ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2011-kompone

# Data Loading & Sanity Checks

In [2]:
# ============================================================
# STAGE 1 — Data Loading & Sanity Checks (REV8: OUT-OF-BOUNDS SAFE + MASK MERGE)
# Fix utama:
# - Date parse: multi-strategy + VALID RANGE clamp (anti tahun ngaco 124904)
# - periode_data YYYYMM divalidasi (min/max) sebelum dipakai
# - Merge kandidat tanggal pakai mask assignment (bukan fillna chain) -> anti pandas AssertionError
# - Robust CSV reader: auto delimiter + skip bad lines + encoding fallback
#
# Output:
#   df_sample, df_ispu_hist, df_ispu_train
# ============================================================

import re
from pathlib import Path
import numpy as np
import pandas as pd

ROOT = Path("/kaggle/input/penyisihan-datavidia-10")
SAMPLE_PATH = ROOT / "sample_submission.csv"
ISPU_DIR = ROOT / "ISPU"

CUTOFF_DATE = pd.Timestamp("2025-09-01")
HIST_END = CUTOFF_DATE - pd.Timedelta(days=1)  # 2025-08-31
VALID_STATIONS = [f"DKI{i}" for i in range(1, 6)]

# Range valid untuk tanggal ISPU (clamp semua parse di luar ini -> NaT)
MIN_VALID_DATE = pd.Timestamp("2009-01-01")
MAX_VALID_DATE = HIST_END

# Range valid untuk periode_data (YYYYMM)
MIN_VALID_YYYYMM = 200901
MAX_VALID_YYYYMM = 202512

# Excel serial range aman (kira-kira 1995 s.d. 2036)
MIN_EXCEL_SERIAL = 35000
MAX_EXCEL_SERIAL = 50000

# -----------------------------
# Helpers
# -----------------------------
def norm_colname(s: str) -> str:
    s = str(s).strip().lower()
    s = s.replace("\ufeff", "")  # BOM
    s = re.sub(r"[^\w]+", "_", s)
    s = re.sub(r"_+", "_", s).strip("_")
    return s

def read_csv_robust(path: Path) -> pd.DataFrame:
    tries = [
        dict(),
        dict(encoding="utf-8-sig"),
        dict(encoding="latin1"),
        dict(engine="python", sep=None, on_bad_lines="skip"),
        dict(engine="python", sep=None, on_bad_lines="skip", encoding="utf-8-sig"),
        dict(engine="python", sep=None, on_bad_lines="skip", encoding="latin1"),
    ]
    last_err = None
    for kw in tries:
        try:
            return pd.read_csv(path, **kw)
        except Exception as e:
            last_err = e
    raise RuntimeError(f"Failed reading {path.name}: {last_err}")

def series_or_na(df: pd.DataFrame, col: str) -> pd.Series:
    return df[col] if col in df.columns else pd.Series(pd.NA, index=df.index)

def coalesce_cols(df: pd.DataFrame, cols: list[str]) -> pd.Series:
    mats = [series_or_na(df, c) for c in cols]
    return pd.concat(mats, axis=1).bfill(axis=1).iloc[:, 0]

def clean_missing_tokens(s: pd.Series) -> pd.Series:
    return s.replace({"---": pd.NA, "—": pd.NA, "-": pd.NA, "": pd.NA, "NA": pd.NA, "N/A": pd.NA})

def to_num(s: pd.Series) -> pd.Series:
    s = clean_missing_tokens(s)
    if s.dtype.name == "string":
        ss = s.str.replace(".", "", regex=False).str.replace(",", ".", regex=False)
        return pd.to_numeric(ss, errors="coerce")
    return pd.to_numeric(s, errors="coerce")

def coalesce_num(df: pd.DataFrame, cols: list[str]) -> pd.Series:
    mats = [to_num(series_or_na(df, c)) for c in cols]
    return pd.concat(mats, axis=1).bfill(axis=1).iloc[:, 0]

def clamp_datetime(s: pd.Series) -> pd.Series:
    # Pastikan datetime64[ns] dan buang yang di luar rentang valid
    s = pd.to_datetime(s, errors="coerce")
    ok = s.between(MIN_VALID_DATE, MAX_VALID_DATE, inclusive="both")
    return s.where(ok, pd.NaT)

def extract_yyyymm(per: pd.Series) -> pd.Series:
    s = clean_missing_tokens(per.astype("string"))

    # 1) ambil 6 digit pertama yang tampil (mis. 202501)
    a = s.str.extract(r"(\d{6})", expand=False)

    # 2) pola YYYY-MM / YYYY/MM
    b = s.str.extract(r"(\d{4})\D(\d{2})", expand=True)
    b_yyyymm = pd.Series(pd.NA, index=s.index, dtype="string")
    if b.shape[1] >= 2:
        b_yyyymm = b[0].astype("string").str.zfill(4) + b[1].astype("string").str.zfill(2)

    out = a.fillna(b_yyyymm)
    yyyymm = to_num(out)

    # VALIDASI range: kalau bukan 200901..202512 -> anggap sampah
    ok = yyyymm.between(MIN_VALID_YYYYMM, MAX_VALID_YYYYMM, inclusive="both")
    return yyyymm.where(ok, np.nan)

def ymd_from_yyyymm_day(yyyymm: pd.Series, day: pd.Series) -> pd.Series:
    yyyymm = to_num(yyyymm)
    day = to_num(day)

    y = np.floor(yyyymm / 100.0)
    m = np.floor(yyyymm % 100.0)
    d = np.floor(day)

    # validasi komponen tanggal
    mask = (
        np.isfinite(y) & np.isfinite(m) & np.isfinite(d) &
        (y >= 2009) & (y <= 2025) &
        (m >= 1) & (m <= 12) &
        (d >= 1) & (d <= 31)
    )

    out = pd.Series(pd.NaT, index=day.index, dtype="datetime64[ns]")
    if mask.any():
        ys = pd.Series(y[mask].astype(np.int64), index=out.index[mask]).astype(str).str.zfill(4)
        ms = pd.Series(m[mask].astype(np.int64), index=out.index[mask]).astype(str).str.zfill(2)
        ds = pd.Series(d[mask].astype(np.int64), index=out.index[mask]).astype(str).str.zfill(2)
        out.loc[mask] = pd.to_datetime(ys + "-" + ms + "-" + ds, errors="coerce")
    return clamp_datetime(out)

def merge_dates(primary: pd.Series, fallback: pd.Series) -> pd.Series:
    # Gabung tanpa fillna chain untuk menghindari pandas AssertionError
    out = primary.copy()
    m = out.isna() & fallback.notna()
    if m.any():
        out.loc[m] = fallback.loc[m].values
    return out

def parse_date_flexible(df: pd.DataFrame) -> pd.Series:
    d_raw = clean_missing_tokens(coalesce_cols(df, ["tanggal", "time", "date"])).astype("string")

    # A) format YYYY-MM-DD (silence warning dayfirst)
    d_iso = pd.to_datetime(d_raw, errors="coerce", format="%Y-%m-%d")
    d_iso = clamp_datetime(d_iso)

    # B) dayfirst parse (dd/mm/yyyy dll)
    d_dayfirst = pd.to_datetime(d_raw, errors="coerce", dayfirst=True)
    d_dayfirst = clamp_datetime(d_dayfirst)

    # C) excel serial (hanya jika serial masuk range aman)
    d_num = to_num(d_raw)
    serial_ok = d_num.between(MIN_EXCEL_SERIAL, MAX_EXCEL_SERIAL, inclusive="both")
    d_excel = pd.Series(pd.NaT, index=df.index, dtype="datetime64[ns]")
    if serial_ok.any():
        d_excel.loc[serial_ok] = pd.to_datetime(d_num.loc[serial_ok], unit="D", origin="1899-12-30", errors="coerce")
    d_excel = clamp_datetime(d_excel)

    # D) periode_data YYYYMM + day-of-month (kalau tanggal hanya 1..31)
    yyyymm = extract_yyyymm(series_or_na(df, "periode_data"))
    d_from_per = ymd_from_yyyymm_day(yyyymm, d_raw)

    # E) fallback YYYYMMDD (8 digit)
    d_yyyymmdd = d_raw.str.extract(r"(\d{8})", expand=False)
    d_full8 = pd.to_datetime(d_yyyymmdd, format="%Y%m%d", errors="coerce")
    d_full8 = clamp_datetime(d_full8)

    # MERGE berurutan (tanpa fillna chain)
    out = pd.Series(pd.NaT, index=df.index, dtype="datetime64[ns]")
    out = merge_dates(out, d_iso)
    out = merge_dates(out, d_dayfirst)
    out = merge_dates(out, d_excel)
    out = merge_dates(out, d_from_per)
    out = merge_dates(out, d_full8)
    return out

_LOC_TO_DKI = [
    (r"BUNDARAN\s*HI|\bHI\b", "DKI1"),
    (r"KELAPA\s*GADING", "DKI2"),
    (r"JAGAKARSA", "DKI3"),
    (r"LUBANG\s*BUAYA", "DKI4"),
    (r"KEBON\s*JERUK", "DKI5"),
]

def parse_station_code(raw: pd.Series) -> pd.Series:
    s = raw.astype("string").str.upper().str.strip()
    s2 = s.str.replace("SPKU", "", regex=False).str.replace("STASIUN", "", regex=False)
    s2 = s2.str.replace(r"\s+", " ", regex=True).str.strip()

    code = s2.str.extract(r"(DKI\s*[-]?\s*[1-5])", expand=False)
    code = code.str.replace(r"\s*-\s*", "", regex=True).str.replace(" ", "", regex=False)

    loc_code = pd.Series(pd.NA, index=s.index, dtype="string")
    for pat, rep in _LOC_TO_DKI:
        hit = s2.str.contains(pat, regex=True, na=False)
        loc_code = loc_code.fillna(pd.Series(np.where(hit, rep, pd.NA), index=s.index, dtype="string"))

    digit = s2.str.extract(r"\b([1-5])\b", expand=False)
    digit_code = ("DKI" + digit).astype("string")

    return code.fillna(loc_code).fillna(digit_code)

def map_label_3class(raw: pd.Series) -> pd.Series:
    s = raw.astype("string").str.upper().str.strip()
    s = s.replace({
        "SANGAT TIDAK SEHAT": "TIDAK SEHAT",
        "BERBAHAYA": "TIDAK SEHAT",
        "TIDAK ADA DATA": pd.NA,
        "NA": pd.NA,
    })
    return s.where(s.isin(["BAIK", "SEDANG", "TIDAK SEHAT"]), pd.NA)

# -----------------------------
# 1) sample_submission
# -----------------------------
df_sample = pd.read_csv(SAMPLE_PATH)
id_col = df_sample.columns[0]
target_col = df_sample.columns[1] if len(df_sample.columns) > 1 else "category"

parts = df_sample[id_col].astype(str).str.split("_", n=1, expand=True)
df_sample["tanggal_target"] = pd.to_datetime(parts[0], errors="coerce")
df_sample["stasiun_code"] = parts[1].astype("string").str.upper().str.strip()

assert df_sample["tanggal_target"].min() == pd.Timestamp("2025-09-01")
assert df_sample["tanggal_target"].max() == pd.Timestamp("2025-11-30")
assert df_sample.shape[0] == 455
assert set(df_sample["stasiun_code"].unique()) == set(VALID_STATIONS)

# -----------------------------
# 2) Load semua ISPU
# -----------------------------
ispu_files = sorted(ISPU_DIR.glob("*.csv"))
if len(ispu_files) == 0:
    raise RuntimeError(f"Tidak menemukan file ISPU di {ISPU_DIR}")

frames = []
for fp in ispu_files:
    dfi = read_csv_robust(fp)
    dfi.columns = [norm_colname(c) for c in dfi.columns]
    dfi["__source_file"] = fp.name
    frames.append(dfi)

df_ispu0 = pd.concat(frames, ignore_index=True)

# -----------------------------
# 3) Normalize core fields
# -----------------------------
raw_station = coalesce_cols(df_ispu0, ["stasiun", "lokasi_spku", "stasiun_id", "lokasi"])
raw_label   = coalesce_cols(df_ispu0, ["kategori", "categori", "category"])
raw_crit    = coalesce_cols(df_ispu0, ["parameter_pencemar_kritis", "critical", "parameter_kritis"])

tanggal = parse_date_flexible(df_ispu0)
st_code = parse_station_code(raw_station)

max_idx = coalesce_num(df_ispu0, ["max", "maks", "indeks", "nilai_indeks"])
pm10 = coalesce_num(df_ispu0, ["pm10", "pm_10", "pm_sepuluh"])
pm25 = coalesce_num(df_ispu0, ["pm25", "pm_25", "pm_duakomalima", "pm_dua_koma_lima", "pm2_5", "pm2_5_"])
so2  = coalesce_num(df_ispu0, ["so2", "sulfur_dioksida"])
co   = coalesce_num(df_ispu0, ["co", "karbon_monoksida"])
o3   = coalesce_num(df_ispu0, ["o3", "ozon"])
no2  = coalesce_num(df_ispu0, ["no2", "nitrogen_dioksida"])

df_ispu = pd.DataFrame({
    "tanggal": tanggal,
    "stasiun_raw": raw_station.astype("string"),
    "stasiun_code": st_code.astype("string"),
    "label_raw": raw_label.astype("string"),
    "critical_raw": raw_crit.astype("string"),
    "max": max_idx,
    "pm10": pm10,
    "pm25": pm25,
    "so2": so2,
    "co": co,
    "o3": o3,
    "no2": no2,
    "__source_file": df_ispu0["__source_file"].astype("string"),
})

df_ispu["label_3"] = map_label_3class(df_ispu["label_raw"])

# -----------------------------
# 4) FILE-LEVEL QA (raw parse)
# -----------------------------
tmp = df_ispu.copy()
tmp["is_valid_station"] = tmp["stasiun_code"].isin(VALID_STATIONS)
tmp["is_valid_date"] = tmp["tanggal"].notna()

file_qa = (tmp.groupby("__source_file")
             .agg(
                 rows=("tanggal","size"),
                 pct_date_nat=("is_valid_date", lambda x: 1 - float(np.mean(x))),
                 date_min=("tanggal","min"),
                 date_max=("tanggal","max"),
                 pct_station_valid=("is_valid_station","mean"),
             )
             .sort_index())

print("=== STAGE 1 FILE QA (raw parse) ===")
print(file_qa)
print()

# -----------------------------
# 5) Filter & clean (cutoff + corrupt + bad station)
# -----------------------------
df_ispu = df_ispu.loc[df_ispu["tanggal"].notna()].copy()
df_ispu = df_ispu.loc[df_ispu["tanggal"] <= HIST_END].copy()

station_upper = df_ispu["stasiun_raw"].fillna("").str.upper()
label_upper   = df_ispu["label_raw"].fillna("").str.upper()

class_tokens = {"BAIK", "SEDANG", "TIDAK SEHAT", "SANGAT TIDAK SEHAT", "BERBAHAYA", "TIDAK ADA DATA"}
poll_tokens  = {"O3", "NO2", "SO2", "CO", "PM10", "PM2.5", "PM25", "PM_10", "PM_2_5"}

is_corrupt = (station_upper.isin(class_tokens) | label_upper.isin(poll_tokens))
is_bad_station = ~df_ispu["stasiun_code"].isin(VALID_STATIONS)

n_before = len(df_ispu)
df_ispu = df_ispu.loc[~(is_corrupt | is_bad_station)].copy()
n_after = len(df_ispu)
n_removed = n_before - n_after

# -----------------------------
# 6) Dedup best record per (tanggal, stasiun)
# -----------------------------
feat_cols = ["max", "pm10", "pm25", "so2", "co", "o3", "no2"]
completeness = df_ispu[feat_cols].notna().sum(axis=1).astype(int)
label_bonus = df_ispu["label_3"].notna().astype(int)
df_ispu["_score"] = completeness * 10 + label_bonus

df_ispu = (
    df_ispu.sort_values(["tanggal", "stasiun_code", "_score"], ascending=[True, True, False])
           .drop_duplicates(["tanggal", "stasiun_code"], keep="first")
           .drop(columns=["_score"])
           .reset_index(drop=True)
)

df_ispu_hist = df_ispu.copy()
df_ispu_train = df_ispu_hist.loc[df_ispu_hist["label_3"].notna()].copy()

# -----------------------------
# 7) Sanity summary
# -----------------------------
print("=== STAGE 1 SUMMARY (REV8) ===")
print(f"ISPU files loaded           : {len(ispu_files)}")
print(f"Rows after date<=HIST_END   : {n_before}")
print(f"Rows removed (corrupt/bad)  : {n_removed}")
print(f"Rows after dedup            : {len(df_ispu_hist)}")
print(f"Train rows (labeled 3c)     : {len(df_ispu_train)}")
print()

rng = df_ispu_hist.groupby("stasiun_code")["tanggal"].agg(["min","max","count"]).reindex(VALID_STATIONS)
print("Date range per station (hist):")
print(rng)
print()

print("Label distribution (train only):")
print(df_ispu_train["label_3"].value_counts(dropna=False))
print()

cov = df_ispu_hist.groupby("tanggal")["stasiun_code"].nunique().sort_index()
print("Last 30 days coverage (unique stations per day):")
print(cov.tail(30))
print()

obs_max = df_ispu_hist["tanggal"].max()
print(f"Expected HIST_END           : {HIST_END.date()}")
print(f"Observed max date (hist)    : {obs_max.date() if pd.notna(obs_max) else obs_max}")

if obs_max != HIST_END:
    print("\n[WARN] Histori belum mencapai 2025-08-31.")
    print("Cek tabel FILE QA: cari file 2024/2025 yang date_max kecil atau pct_date_nat tinggi.")
else:
    print("\n[OK] Histori mencapai cutoff. Lanjut Stage 2 aman.")

print("\n[OK] Stage 1 completed: df_ispu_hist, df_ispu_train, df_sample ready.")

  d_dayfirst = pd.to_datetime(d_raw, errors="coerce", dayfirst=True)


=== STAGE 1 FILE QA (raw parse) ===
                                                    rows  pct_date_nat  \
__source_file                                                            
data-indeks-standar-pencemar-udara-(ispu)-di-pr...  1825      0.000000   
data-indeks-standar-pencemar-udara-(ispu)-di-pr...  1830      0.000000   
data-indeks-standar-pencemar-udara-(ispu)-di-pr...  1215      0.000000   
indeks-standar-pencemaran-udara-(ispu)-tahun-20...  1825      0.000000   
indeks-standar-pencemaran-udara-(ispu)-tahun-20...   365      0.000000   
indeks-standar-pencemaran-udara-(ispu)-tahun-20...   366      0.000000   
indeks-standar-pencemaran-udara-(ispu)-tahun-20...   365      0.082192   
indeks-standar-pencemaran-udara-(ispu)-tahun-20...  1825      0.000000   
indeks-standar-pencemaran-udara-(ispu)-tahun-20...   365      0.000000   
indeks-standar-pencemaran-udara-(ispu)-tahun-20...  1830      0.041530   
indeks-standar-pencemaran-udara-(ispu)-tahun-20...  1825      0.000000   
in

# Master Table Building (Correct Joins)

In [3]:
# ============================================================
# STAGE 2 — Master Table Building (Correct Joins) — REV2 (TOPSCORE FOUNDATION)
#
# Upgrade utama:
# - Build df_master: grid harian (tanggal x stasiun) sampai HIST_END (1 row per key)
# - Calendar + holiday safe untuk future (Sep–Nov 2025) + cyclical time features lengkap
# - df_targets: index submission + fitur kalender + last_obs_date + horizon_days (HARUS 1..91)
# - Load aux tables (NDVI, Weather, Water, Pop) forecast-safe (<= HIST_END) + cleaned + dedup
# - Robust date parsing (ISO first -> dayfirst -> excel) + clamp range
#
# Output:
#   df_sub, df_calendar, df_targets, df_master
#   last_obs_by_station
#   df_ndvi, df_weather, df_water_m, df_pop_y
# ============================================================

from pathlib import Path
import re
import numpy as np
import pandas as pd

# ---------- GUARDS ----------
need = ["df_ispu_hist", "HIST_END", "CUTOFF_DATE", "VALID_STATIONS"]
miss = [k for k in need if k not in globals()]
if miss:
    raise RuntimeError(f"Missing globals from STAGE 1: {miss}. Jalankan STAGE 1 (REV8) dulu.")

DATA_ROOT = Path("/kaggle/input/penyisihan-datavidia-10")

SUB_PATH    = DATA_ROOT / "sample_submission.csv"
HOL_PATH    = DATA_ROOT / "libur-nasional" / "dataset-libur-nasional-dan-weekend.csv"
NDVI_PATH   = DATA_ROOT / "NDVI (vegetation index)" / "indeks-ndvi-jakarta.csv"
WATER_PATH  = DATA_ROOT / "kualitas-air-sungai" / "data-kualitas-air-sungai-komponen-data.csv"
POP_PATH    = DATA_ROOT / "jumlah-penduduk" / "data-jumlah-penduduk-provinsi-dki-jakarta-berdasarkan-kelompok-usia-dan-jenis-kelamin-tahun-2013-2021-komponen-data.csv"
WEATHER_DIR = DATA_ROOT / "cuaca-harian"

MIN_VALID_DATE = pd.Timestamp("2009-01-01")
MAX_TARGET_DATE = pd.Timestamp("2025-11-30")

# ---------- Helpers ----------
def norm_colname(c: str) -> str:
    c = str(c).strip().lower().replace("\ufeff", "")
    c = re.sub(r"[()\[\]{}]", "", c)
    c = re.sub(r"[%/]", "_", c)
    c = re.sub(r"[^a-z0-9]+", "_", c)
    c = re.sub(r"_+", "_", c).strip("_")
    return c

def read_csv_robust(path: Path) -> pd.DataFrame:
    tries = [
        dict(),
        dict(encoding="utf-8-sig"),
        dict(encoding="latin1"),
        dict(engine="python", sep=None, on_bad_lines="skip"),
        dict(engine="python", sep=None, on_bad_lines="skip", encoding="utf-8-sig"),
        dict(engine="python", sep=None, on_bad_lines="skip", encoding="latin1"),
    ]
    last_err = None
    for kw in tries:
        try:
            return pd.read_csv(path, **kw)
        except Exception as e:
            last_err = e
    raise RuntimeError(f"Failed reading {path.name}: {last_err}")

def clean_missing_tokens(s: pd.Series) -> pd.Series:
    return s.replace({"---": pd.NA, "—": pd.NA, "-": pd.NA, "": pd.NA, "NA": pd.NA, "N/A": pd.NA})

def to_num(s: pd.Series) -> pd.Series:
    s = clean_missing_tokens(s)
    if s.dtype.name == "string":
        ss = s.str.replace(".", "", regex=False).str.replace(",", ".", regex=False)
        return pd.to_numeric(ss, errors="coerce")
    return pd.to_numeric(s, errors="coerce")

def clamp_date(dt: pd.Series) -> pd.Series:
    dt = pd.to_datetime(dt, errors="coerce")
    ok = dt.between(MIN_VALID_DATE, MAX_TARGET_DATE, inclusive="both")
    return dt.where(ok, pd.NaT)

def parse_any_date(s: pd.Series) -> pd.Series:
    s = clean_missing_tokens(s).astype("string")

    # ISO first (yyyy-mm-dd) -> no warning
    d_iso = pd.to_datetime(s, errors="coerce", format="%Y-%m-%d")
    d_iso = clamp_date(d_iso)

    # dayfirst fallback (dd/mm/yyyy etc)
    d_df = pd.to_datetime(s, errors="coerce", dayfirst=True)
    d_df = clamp_date(d_df)

    # excel serial fallback (batasi range aman)
    n = to_num(s)
    serial_ok = n.between(35000, 50000, inclusive="both")
    d_xl = pd.Series(pd.NaT, index=s.index, dtype="datetime64[ns]")
    if serial_ok.any():
        d_xl.loc[serial_ok] = pd.to_datetime(n.loc[serial_ok], unit="D", origin="1899-12-30", errors="coerce")
    d_xl = clamp_date(d_xl)

    out = pd.Series(pd.NaT, index=s.index, dtype="datetime64[ns]")
    m = out.isna() & d_iso.notna()
    out.loc[m] = d_iso.loc[m].values
    m = out.isna() & d_df.notna()
    out.loc[m] = d_df.loc[m].values
    m = out.isna() & d_xl.notna()
    out.loc[m] = d_xl.loc[m].values
    return out

def parse_id_to_date_station(id_s: pd.Series) -> tuple[pd.Series, pd.Series]:
    parts = id_s.astype(str).str.split("_", n=1, expand=True)
    dt = pd.to_datetime(parts[0], errors="coerce")
    st = parts[1].astype("string").str.upper().str.strip()
    return dt, st

def normalize_station_code(s: pd.Series) -> pd.Series:
    x = s.astype("string").str.upper().str.strip()
    code = x.str.extract(r"(DKI\s*[-]?\s*[1-5])", expand=False)
    code = code.str.replace(r"\s*-\s*", "", regex=True).str.replace(" ", "", regex=False)
    digit = x.str.extract(r"\b([1-5])\b", expand=False)
    digit_code = ("DKI" + digit).astype("string")
    return code.fillna(digit_code)

def pick_first_col(df: pd.DataFrame, cols: list[str]) -> str | None:
    for c in cols:
        if c in df.columns:
            return c
    return None

# ============================================================
# 2.1 Parse sample_submission -> df_sub
# ============================================================
df_sub0 = pd.read_csv(SUB_PATH)
df_sub0.columns = [norm_colname(c) for c in df_sub0.columns]
id_col = "id" if "id" in df_sub0.columns else df_sub0.columns[0]

df_sub = pd.DataFrame({"id": df_sub0[id_col].astype(str)})
df_sub["tanggal_target"], df_sub["stasiun_code"] = parse_id_to_date_station(df_sub["id"])
df_sub["stasiun_code"] = normalize_station_code(df_sub["stasiun_code"])

df_sub = df_sub.loc[df_sub["tanggal_target"].notna()].copy()
df_sub = df_sub.loc[df_sub["stasiun_code"].isin(VALID_STATIONS)].reset_index(drop=True)

# hard QA (critical)
if len(df_sub) != 455:
    raise RuntimeError(f"Submission index harus 455 baris, sekarang {len(df_sub)}")
if df_sub["tanggal_target"].min() != pd.Timestamp("2025-09-01") or df_sub["tanggal_target"].max() != pd.Timestamp("2025-11-30"):
    raise RuntimeError("Range tanggal submission tidak sesuai 2025-09-01 s.d. 2025-11-30")
if set(df_sub["stasiun_code"].unique()) != set(VALID_STATIONS):
    raise RuntimeError("Stasiun di submission tidak lengkap DKI1..DKI5")

# ============================================================
# 2.2 ISPU history subset (<= HIST_END) + last_obs_date
# ============================================================
df_hist = df_ispu_hist.loc[
    df_ispu_hist["stasiun_code"].isin(VALID_STATIONS) &
    (df_ispu_hist["tanggal"] <= HIST_END)
].copy()

# last obs per stasiun (harusnya 2025-08-31 untuk semua)
last_obs_by_station = df_hist.groupby("stasiun_code")["tanggal"].max().reindex(VALID_STATIONS)

# fallback jika ada station yang kosong (harusnya tidak terjadi)
last_obs_by_station = last_obs_by_station.fillna(HIST_END)

# ============================================================
# 2.3 Build df_master (grid harian stasiun x tanggal) — TOPSCORE FOUNDATION
# ============================================================
hist_min = df_hist["tanggal"].min()
hist_max = df_hist["tanggal"].max()

all_dates = pd.date_range(hist_min, HIST_END, freq="D")
grid = pd.MultiIndex.from_product([all_dates, VALID_STATIONS], names=["tanggal", "stasiun_code"])
df_master = pd.DataFrame(index=grid).reset_index()

# pastikan df_hist unique key (stage 1 sudah dedup, tapi cek lagi)
dup = df_hist.duplicated(["tanggal", "stasiun_code"]).sum()
if dup > 0:
    raise RuntimeError(f"df_hist masih memiliki duplikasi key: {dup} baris. Stage 1 dedup belum aman.")

# merge ISPU fields (label + numeric) ke master
keep_cols = [c for c in ["tanggal","stasiun_code","label_3","max","pm10","pm25","so2","co","o3","no2"] if c in df_hist.columns]
df_master = df_master.merge(df_hist[keep_cols], on=["tanggal","stasiun_code"], how="left")

# ============================================================
# 2.4 Calendar table + holiday (allowed future features)
# ============================================================
cal_min = min(hist_min, df_sub["tanggal_target"].min())
cal_max = max(HIST_END, df_sub["tanggal_target"].max())

df_calendar = pd.DataFrame({"tanggal": pd.date_range(cal_min, cal_max, freq="D")})
df_calendar["year"] = df_calendar["tanggal"].dt.year
df_calendar["month"] = df_calendar["tanggal"].dt.month
df_calendar["day"] = df_calendar["tanggal"].dt.day
df_calendar["dow"] = df_calendar["tanggal"].dt.dayofweek
df_calendar["dayofyear"] = df_calendar["tanggal"].dt.dayofyear
df_calendar["weekofyear"] = df_calendar["tanggal"].dt.isocalendar().week.astype(int)
df_calendar["is_month_start"] = df_calendar["tanggal"].dt.is_month_start.astype(int)
df_calendar["is_month_end"] = df_calendar["tanggal"].dt.is_month_end.astype(int)

# cyclical
df_calendar["sin_doy"] = np.sin(2*np.pi*df_calendar["dayofyear"]/365.25)
df_calendar["cos_doy"] = np.cos(2*np.pi*df_calendar["dayofyear"]/365.25)
df_calendar["sin_month"] = np.sin(2*np.pi*df_calendar["month"]/12.0)
df_calendar["cos_month"] = np.cos(2*np.pi*df_calendar["month"]/12.0)
df_calendar["sin_dow"] = np.sin(2*np.pi*df_calendar["dow"]/7.0)
df_calendar["cos_dow"] = np.cos(2*np.pi*df_calendar["dow"]/7.0)

# holiday table
df_hol = read_csv_robust(HOL_PATH)
df_hol.columns = [norm_colname(c) for c in df_hol.columns]
tcol_h = pick_first_col(df_hol, ["tanggal", "date", "time"])
if tcol_h is None:
    raise RuntimeError("Holiday table tidak punya kolom tanggal/time/date yang bisa diparse.")

df_hol["tanggal"] = parse_any_date(df_hol[tcol_h])
keep_h = [c for c in ["tanggal","is_holiday_nasional","nama_libur","is_weekend","day_name"] if c in df_hol.columns]
df_hol = df_hol[keep_h].dropna(subset=["tanggal"]).drop_duplicates("tanggal")

df_calendar = df_calendar.merge(df_hol, on="tanggal", how="left")

# fallback weekend/holiday
df_calendar["is_weekend"] = df_calendar["is_weekend"].fillna((df_calendar["dow"] >= 5).astype(int)).astype(int)
df_calendar["is_holiday_nasional"] = df_calendar["is_holiday_nasional"].fillna(0).astype(int)
df_calendar["holiday_or_weekend"] = ((df_calendar["is_weekend"] == 1) | (df_calendar["is_holiday_nasional"] == 1)).astype(int)

# ============================================================
# 2.5 df_targets = submission index + calendar features + horizon
# ============================================================
df_targets = df_sub.merge(
    df_calendar,
    left_on="tanggal_target",
    right_on="tanggal",
    how="left"
).drop(columns=["tanggal"])

df_targets["last_obs_date"] = df_targets["stasiun_code"].map(last_obs_by_station)
df_targets["horizon_days"] = (df_targets["tanggal_target"] - df_targets["last_obs_date"]).dt.days

# CRITICAL QA: horizon harus 1..91 (Sep–Nov from 2025-08-31)
hmin, hmax = int(df_targets["horizon_days"].min()), int(df_targets["horizon_days"].max())
if hmin < 1 or hmax > 91:
    raise RuntimeError(f"horizon_days tidak sesuai. Dapat {hmin}..{hmax}, seharusnya 1..91. Cek last_obs_date / cutoff.")

# ============================================================
# 2.6 Aux tables (forecasting-safe: <= HIST_END)
# ============================================================

# ---- NDVI (keep minimal, dedup) ----
df_ndvi = read_csv_robust(NDVI_PATH)
df_ndvi.columns = [norm_colname(c) for c in df_ndvi.columns]

tcol_n = pick_first_col(df_ndvi, ["tanggal", "time", "date"])
if tcol_n:
    df_ndvi["tanggal"] = parse_any_date(df_ndvi[tcol_n])
else:
    df_ndvi["tanggal"] = pd.NaT

sid_col = pick_first_col(df_ndvi, ["stasiun_code", "stasiun_id", "stasiun", "lokasi"])
sid = df_ndvi[sid_col] if sid_col else pd.Series(pd.NA, index=df_ndvi.index)
df_ndvi["stasiun_code"] = normalize_station_code(sid)

if "ndvi" in df_ndvi.columns:
    df_ndvi["ndvi"] = to_num(df_ndvi["ndvi"])
else:
    df_ndvi["ndvi"] = np.nan

df_ndvi = df_ndvi.loc[df_ndvi["tanggal"].notna()].copy()
df_ndvi = df_ndvi.loc[df_ndvi["tanggal"] <= HIST_END].copy()
df_ndvi = df_ndvi.loc[df_ndvi["stasiun_code"].isin(VALID_STATIONS)].copy()
df_ndvi = df_ndvi[["tanggal","stasiun_code","ndvi"]].drop_duplicates(["tanggal","stasiun_code"]).reset_index(drop=True)

# tambahan: versi bulanan (lebih stabil untuk asof join di stage 3)
df_ndvi["year"] = df_ndvi["tanggal"].dt.year
df_ndvi["month"] = df_ndvi["tanggal"].dt.month
df_ndvi_m = (df_ndvi.groupby(["stasiun_code","year","month"], as_index=False)
                    .agg(ndvi_mean=("ndvi","mean"), ndvi_n=("ndvi","count")))
df_ndvi_m["tanggal_bulan"] = pd.to_datetime(df_ndvi_m["year"].astype(int).astype(str) + "-" +
                                            df_ndvi_m["month"].astype(int).astype(str).str.zfill(2) + "-01")

# ---- Weather (per stasiun, forecasting-safe <= HIST_END) ----
w_files = sorted(WEATHER_DIR.glob("cuaca-harian-*.csv"))
w_list = []
for p in w_files:
    m = re.search(r"dki(\d+)", p.name.lower())
    st_code = f"DKI{m.group(1)}" if m else None
    if st_code not in VALID_STATIONS:
        continue

    d = read_csv_robust(p)
    d.columns = [norm_colname(c) for c in d.columns]
    tcol = pick_first_col(d, ["time","tanggal","date","waktu"])
    if not tcol:
        continue

    d["tanggal"] = parse_any_date(d[tcol])
    d["stasiun_code"] = st_code
    d = d.drop(columns=[tcol], errors="ignore")
    d = d.dropna(subset=["tanggal"]).copy()
    d = d.loc[d["tanggal"] <= HIST_END].copy()

    # convert numeric columns except station/date
    for c in d.columns:
        if c in ["tanggal","stasiun_code"]:
            continue
        d[c] = to_num(d[c].astype("string"))

    d = d.drop_duplicates(["tanggal","stasiun_code"]).reset_index(drop=True)
    w_list.append(d)

df_weather = pd.concat(w_list, ignore_index=True) if len(w_list) else pd.DataFrame(columns=["tanggal","stasiun_code"])

# ---- Water quality (aggregate by year-month; forecasting-safe <= HIST_END) ----
df_water = read_csv_robust(WATER_PATH)
df_water.columns = [norm_colname(c) for c in df_water.columns]

for c in ["hasil_pengukuran","baku_mutu"]:
    if c in df_water.columns:
        df_water[c] = to_num(df_water[c].astype("string"))

ycol = pick_first_col(df_water, ["periode_data","tahun"])
mcol = pick_first_col(df_water, ["bulan_sampling","bulan"])
df_water["tahun"] = to_num(df_water[ycol].astype("string")) if ycol else np.nan
df_water["bulan"] = to_num(df_water[mcol].astype("string")) if mcol else np.nan

bm = df_water["baku_mutu"] if "baku_mutu" in df_water.columns else pd.Series(np.nan, index=df_water.index)
hp = df_water["hasil_pengukuran"] if "hasil_pengukuran" in df_water.columns else pd.Series(np.nan, index=df_water.index)
exceed = (hp / bm).replace([np.inf, -np.inf], np.nan)

df_water_m = (
    pd.DataFrame({"tahun": df_water["tahun"], "bulan": df_water["bulan"], "exceed_ratio": exceed})
      .dropna(subset=["tahun","bulan"])
      .query("tahun >= 2009 and tahun <= 2025 and bulan >= 1 and bulan <= 12")
      .groupby(["tahun","bulan"], as_index=False)
      .agg(
          water_exceed_mean=("exceed_ratio","mean"),
          water_exceed_rate=("exceed_ratio", lambda s: float((s > 1).mean())),
          water_n=("exceed_ratio","count")
      )
)
df_water_m["tanggal_bulan"] = pd.to_datetime(df_water_m["tahun"].astype(int).astype(str) + "-" +
                                             df_water_m["bulan"].astype(int).astype(str).str.zfill(2) + "-01")
df_water_m = df_water_m.loc[df_water_m["tanggal_bulan"] <= HIST_END].reset_index(drop=True)

# ---- Population (aggregate yearly total; forecasting-safe by definition) ----
df_pop = read_csv_robust(POP_PATH)
df_pop.columns = [norm_colname(c) for c in df_pop.columns]
ycol_p = pick_first_col(df_pop, ["tahun","periode_data"])
df_pop["tahun"] = to_num(df_pop[ycol_p].astype("string")) if ycol_p else np.nan
df_pop["jumlah_penduduk"] = to_num(df_pop["jumlah_penduduk"].astype("string")) if "jumlah_penduduk" in df_pop.columns else np.nan

df_pop_y = (
    df_pop.dropna(subset=["tahun"])
          .groupby("tahun", as_index=False)
          .agg(pop_total=("jumlah_penduduk","sum"))
          .sort_values("tahun")
          .reset_index(drop=True)
)
df_pop_y["pop_yoy"] = df_pop_y["pop_total"].pct_change()

# ============================================================
# 2.7 QA prints (ringkas, penting)
# ============================================================
print("=== STAGE 2 SUMMARY (REV2) ===")
print("Submission rows:", len(df_sub))
print("Target date range:", df_sub["tanggal_target"].min().date(), "->", df_sub["tanggal_target"].max().date())
print("Stations in submission:", sorted(df_sub["stasiun_code"].unique().tolist()))

print("\nISPU hist max date:", df_hist["tanggal"].max().date(), "| HIST_END:", HIST_END.date())
print("Last observed date per station:")
print(last_obs_by_station)

print("\nHorizon_days range (MUST 1..91):", int(df_targets["horizon_days"].min()), "->", int(df_targets["horizon_days"].max()))

print("\nMaster table (df_master):", df_master.shape, "| missing label_3:", int(df_master["label_3"].isna().sum()))
print("Aux shapes:",
      "NDVI daily", df_ndvi.shape,
      "| NDVI monthly", df_ndvi_m.shape,
      "| Weather", df_weather.shape,
      "| Water monthly", df_water_m.shape,
      "| Pop yearly", df_pop_y.shape)

print("\ndf_targets preview:")
print(df_targets[["id","tanggal_target","stasiun_code","last_obs_date","horizon_days","is_weekend","is_holiday_nasional","holiday_or_weekend"]].head(10))

print("\n[OK] Stage 2 completed: df_sub, df_calendar, df_targets, df_master, df_ndvi/df_ndvi_m, df_weather, df_water_m, df_pop_y ready.")


  d_df = pd.to_datetime(s, errors="coerce", dayfirst=True)
  .query("tahun >= 2009 and tahun <= 2025 and bulan >= 1 and bulan <= 12")


=== STAGE 2 SUMMARY (REV2) ===
Submission rows: 455
Target date range: 2025-09-01 -> 2025-11-30
Stations in submission: ['DKI1', 'DKI2', 'DKI3', 'DKI4', 'DKI5']

ISPU hist max date: 2025-08-31 | HIST_END: 2025-08-31
Last observed date per station:
stasiun_code
DKI1   2025-08-31
DKI2   2025-08-31
DKI3   2025-08-31
DKI4   2025-08-31
DKI5   2025-08-31
Name: tanggal, dtype: datetime64[ns]

Horizon_days range (MUST 1..91): 1 -> 91

Master table (df_master): (28610, 10) | missing label_3: 13886
Aux shapes: NDVI daily (1810, 5) | NDVI monthly (945, 6) | Weather (28610, 25) | Water monthly (4, 6) | Pop yearly (4, 3)

df_targets preview:
                id tanggal_target stasiun_code last_obs_date  horizon_days  \
0  2025-09-01_DKI1     2025-09-01         DKI1    2025-08-31             1   
1  2025-09-01_DKI2     2025-09-01         DKI2    2025-08-31             1   
2  2025-09-01_DKI3     2025-09-01         DKI3    2025-08-31             1   
3  2025-09-01_DKI4     2025-09-01         DKI4    2

# Feature Engineering (Time-Series + Calendar + Robustness)

In [4]:
# ============================================================
# STAGE 3 — Feature Engineering + Supervised Forecasting Table
# REV TOP v3.3 (FIX: df_targets_feat month missing)
#
# Fix utama:
# - Selalu bikin year/month/day/dow/dayofyear/weekofyear dari tanggal_target
#   (jangan tergantung df_calendar punya month atau tidak)
# - merge_asof NDVI: per-station (no global sort pitfalls)
#
# Output:
#   df_ts_feat, df_targets_feat, df_train_sup
#   FEATURE_COLS_MODEL, CAT_FEATURES, NUM_FEATURES
# ============================================================

import numpy as np
import pandas as pd

# ---------- guards ----------
need = ["df_calendar","df_targets","df_ispu_hist","df_weather","df_ndvi","VALID_STATIONS","HIST_END"]
miss = [k for k in need if k not in globals()]
if miss:
    raise RuntimeError(f"Missing globals from previous stages: {miss}. Missing={miss}")

SEED = 42
np.random.seed(SEED)

MAX_H = 91
TRAIN_START = pd.Timestamp("2020-01-01")
BUILD_TRAIN_TABLE = True

WINS_POLL = [7, 14, 30]
WINS_WX   = [7, 14, 30]
MINP_POLL = {7: 3, 14: 5, 30: 10}
MINP_WX   = {7: 3, 14: 5, 30: 10}

USE_SAMPLE_WEIGHT = True
WEIGHT_LONGH_MAX = 2.0
WEIGHT_SEP_NOV = 1.3

LABELS = ["BAIK","SEDANG","TIDAK SEHAT"]
MAP_CODE = {"BAIK": 0, "SEDANG": 1, "TIDAK SEHAT": 2}

# =========================
# Helpers
# =========================
def _dedup_cols(df: pd.DataFrame) -> pd.DataFrame:
    return df.loc[:, ~df.columns.duplicated()].copy()

def _ensure_station(df: pd.DataFrame, col="stasiun_code") -> pd.DataFrame:
    df = _dedup_cols(df)
    if col not in df.columns:
        raise RuntimeError(f"Missing '{col}' in df cols={df.columns.tolist()[:50]}")
    df[col] = df[col].astype("string").str.upper().str.strip()
    return df

def _safe_num(s: pd.Series) -> pd.Series:
    return pd.to_numeric(s, errors="coerce")

def _grp_roll(series_shifted: pd.Series, by: pd.Series, w: int, func: str, minp: int) -> pd.Series:
    r = series_shifted.groupby(by).rolling(w, min_periods=minp)
    out = getattr(r, func)()
    return out.reset_index(level=0, drop=True)

def _merge_safe(left: pd.DataFrame, right: pd.DataFrame, on, how="left", validate=None) -> pd.DataFrame:
    left = _dedup_cols(left)
    right = _dedup_cols(right)
    out = left.merge(right, on=on, how=how, validate=validate)
    return _dedup_cols(out)

def _add_dateparts(df: pd.DataFrame, date_col: str) -> pd.DataFrame:
    """Force year/month/day/dow/dayofyear/weekofyear + cyclic features from a date column."""
    d = pd.to_datetime(df[date_col], errors="coerce")
    df["year"] = d.dt.year.astype("Int64")
    df["month"] = d.dt.month.astype("Int64")
    df["day"] = d.dt.day.astype("Int64")
    df["dow"] = d.dt.dayofweek.astype("Int64")
    df["dayofyear"] = d.dt.dayofyear.astype("Int64")
    df["weekofyear"] = d.dt.isocalendar().week.astype(int)

    # cyclic (float)
    df["sin_doy"] = np.sin(2*np.pi*df["dayofyear"].astype(float)/365.25)
    df["cos_doy"] = np.cos(2*np.pi*df["dayofyear"].astype(float)/365.25)
    df["sin_month"] = np.sin(2*np.pi*df["month"].astype(float)/12.0)
    df["cos_month"] = np.cos(2*np.pi*df["month"].astype(float)/12.0)
    df["sin_dow"] = np.sin(2*np.pi*df["dow"].astype(float)/7.0)
    df["cos_dow"] = np.cos(2*np.pi*df["dow"].astype(float)/7.0)

    df["is_month_start"] = d.dt.is_month_start.astype(int)
    df["is_month_end"]   = d.dt.is_month_end.astype(int)
    return df

# ============================================================
# 3.0 Build base TS (ISPU hist) + weather
# ============================================================
dfh = _ensure_station(df_ispu_hist.copy(), "stasiun_code")
dfh["tanggal"] = pd.to_datetime(dfh["tanggal"], errors="coerce")
dfh = dfh.dropna(subset=["tanggal"]).copy()
dfh = dfh.loc[dfh["stasiun_code"].isin(VALID_STATIONS)].copy()
dfh = dfh.loc[dfh["tanggal"] <= pd.Timestamp(HIST_END)].copy()

LBL_COL = "label_3" if "label_3" in dfh.columns else None
if LBL_COL is None:
    raise RuntimeError("df_ispu_hist harus punya label_3.")

POLL_COLS = [c for c in ["max","pm10","pm25","so2","co","o3","no2"] if c in dfh.columns]
if len(POLL_COLS) == 0:
    raise RuntimeError("POLL_COLS tidak ditemukan di df_ispu_hist.")

for c in POLL_COLS:
    dfh[c] = _safe_num(dfh[c])

dfw = _ensure_station(df_weather.copy(), "stasiun_code")
dfw["tanggal"] = pd.to_datetime(dfw["tanggal"], errors="coerce")
dfw = dfw.dropna(subset=["tanggal"]).copy()
dfw = dfw.loc[dfw["stasiun_code"].isin(VALID_STATIONS)].copy()
dfw = dfw.loc[dfw["tanggal"] <= pd.Timestamp(HIST_END)].copy()

wx_candidates = [
    "temperature_2m_mean","temperature_2m_max","temperature_2m_min",
    "precipitation_sum","precipitation_hours",
    "wind_speed_10m_mean","wind_speed_10m_max","wind_speed_10m_min",
    "wind_gusts_10m_mean","wind_gusts_10m_max","wind_gusts_10m_min",
    "relative_humidity_2m_mean","relative_humidity_2m_max","relative_humidity_2m_min",
    "cloud_cover_mean","cloud_cover_max","cloud_cover_min",
    "surface_pressure_mean","surface_pressure_max","surface_pressure_min",
    "shortwave_radiation_sum",
    "wind_direction_10m_dominant","winddirection_10m_dominant"
]
WX_COLS = [c for c in wx_candidates if c in dfw.columns]
for c in WX_COLS:
    dfw[c] = _safe_num(dfw[c])

df_ts = _merge_safe(
    dfh[["tanggal","stasiun_code",LBL_COL] + POLL_COLS],
    dfw[["tanggal","stasiun_code"] + WX_COLS],
    on=["tanggal","stasiun_code"],
    how="left"
).sort_values(["stasiun_code","tanggal"]).reset_index(drop=True)

# ============================================================
# 3.1 NDVI merge_asof PER STATION
# ============================================================
df_ts["ndvi"] = np.nan
dfn = df_ndvi.copy()
use_ndvi = (len(dfn) > 0) and {"tanggal","stasiun_code","ndvi"}.issubset(dfn.columns)

if use_ndvi:
    dfn = _ensure_station(dfn, "stasiun_code")
    dfn["tanggal"] = pd.to_datetime(dfn["tanggal"], errors="coerce")
    dfn["ndvi"] = _safe_num(dfn["ndvi"])
    dfn = dfn.dropna(subset=["tanggal"]).copy()
    dfn = dfn.loc[dfn["stasiun_code"].isin(VALID_STATIONS)].copy()
    dfn = dfn.loc[dfn["tanggal"] <= pd.Timestamp(HIST_END)].copy()
    dfn = (dfn.sort_values(["stasiun_code","tanggal"])
              .drop_duplicates(["stasiun_code","tanggal"], keep="last")
              .reset_index(drop=True))

    ndvi_out = []
    for st in VALID_STATIONS:
        left = df_ts.loc[df_ts["stasiun_code"] == st, ["tanggal"]].copy()
        right = dfn.loc[dfn["stasiun_code"] == st, ["tanggal","ndvi"]].copy()
        left = left.sort_values("tanggal").reset_index()
        right = right.sort_values("tanggal").reset_index(drop=True)

        if len(right) == 0:
            left["ndvi"] = np.nan
        else:
            tmp = pd.merge_asof(left, right, on="tanggal", direction="backward", allow_exact_matches=True)
            left["ndvi"] = tmp["ndvi"].values

        ndvi_out.append(left.set_index("index")["ndvi"])
    df_ts["ndvi"] = pd.concat(ndvi_out).sort_index().values

# ============================================================
# 3.2 Rolling/Lag features (PAST-ONLY)
# ============================================================
df_ts = df_ts.sort_values(["stasiun_code","tanggal"]).reset_index(drop=True)
by = df_ts["stasiun_code"]
g  = df_ts.groupby("stasiun_code", sort=False)

df_ts["_ycode"] = df_ts[LBL_COL].astype("string").map(MAP_CODE).astype("float")

feat_blocks = {}

# pollutants
for c in POLL_COLS:
    s1 = g[c].shift(1)
    feat_blocks[f"{c}_lag1"] = s1
    feat_blocks[f"{c}_lag7"] = g[c].shift(7)
    for w in WINS_POLL:
        minp = MINP_POLL.get(w, 1)
        feat_blocks[f"{c}_rmean{w}"] = _grp_roll(s1, by, w, "mean", minp)
        feat_blocks[f"{c}_rstd{w}"]  = _grp_roll(s1, by, w, "std",  minp)
        feat_blocks[f"{c}_rmin{w}"]  = _grp_roll(s1, by, w, "min",  minp)
        feat_blocks[f"{c}_rmax{w}"]  = _grp_roll(s1, by, w, "max",  minp)

    feat_blocks[f"{c}_mom_7_30"]  = feat_blocks[f"{c}_rmean7"]  - feat_blocks[f"{c}_rmean30"]
    feat_blocks[f"{c}_mom_14_30"] = feat_blocks[f"{c}_rmean14"] - feat_blocks[f"{c}_rmean30"]

    na_flag = s1.isna().astype(float)
    feat_blocks[f"{c}_na_rate30"] = _grp_roll(na_flag, by, 30, "mean", 10)

# weather rolling
for c in WX_COLS:
    s1 = g[c].shift(1)
    feat_blocks[f"{c}_lag1"] = s1
    feat_blocks[f"{c}_lag7"] = g[c].shift(7)
    for w in WINS_WX:
        minp = MINP_WX.get(w, 1)
        feat_blocks[f"{c}_rmean{w}"] = _grp_roll(s1, by, w, "mean", minp)
        feat_blocks[f"{c}_rstd{w}"]  = _grp_roll(s1, by, w, "std",  minp)

# ndvi rolling
df_ts["ndvi"] = _safe_num(df_ts["ndvi"])
nd1 = g["ndvi"].shift(1)
feat_blocks["ndvi_lag1"] = nd1
feat_blocks["ndvi_rmean30"] = _grp_roll(nd1, by, 30, "mean", 10)

df_ts_feat = pd.concat([df_ts, pd.DataFrame(feat_blocks)], axis=1)
df_ts_feat = _dedup_cols(df_ts_feat)

# label-derived (past-only)
y1 = g["_ycode"].shift(1)
y2 = g["_ycode"].shift(2)
df_ts_feat["y_lag1"] = y1
df_ts_feat["y_lag7"] = g["_ycode"].shift(7)

y_change = (y1 != y2).astype(float)
y_change = y_change.where(y1.notna() & y2.notna(), np.nan)
df_ts_feat["y_change_lag1"] = y_change

for w in [7, 14, 30]:
    minp = MINP_POLL.get(w, 1)
    df_ts_feat[f"p_baik_{w}"]   = _grp_roll((y1 == 0).astype(float), by, w, "mean", minp)
    df_ts_feat[f"p_sedang_{w}"] = _grp_roll((y1 == 1).astype(float), by, w, "mean", minp)
    df_ts_feat[f"p_tidak_{w}"]  = _grp_roll((y1 == 2).astype(float), by, w, "mean", minp)
    df_ts_feat[f"n_trans_{w}"]  = _grp_roll(y_change.fillna(0.0), by, w, "sum",  minp)

# FIX NA->int (aman)
lbl_s = df_ts_feat[LBL_COL].astype("string")
is_tidak = lbl_s.eq("TIDAK SEHAT").fillna(False).astype(np.int8)

last_tidak_date = df_ts_feat["tanggal"].where(is_tidak == 1, pd.NaT)
last_tidak_date = last_tidak_date.groupby(by).ffill().shift(1)
df_ts_feat["days_since_tidak"] = (df_ts_feat["tanggal"] - last_tidak_date).dt.days.astype("float")

lag_lbl  = g[LBL_COL].shift(1)
lag_lbl2 = g[LBL_COL].shift(2)
streak_break = (lag_lbl != lag_lbl2) | lag_lbl.isna()
streak_id = streak_break.groupby(by).cumsum()
df_ts_feat["streak_len_lag1"] = df_ts_feat.groupby(["stasiun_code", streak_id]).cumcount() + 1
df_ts_feat["streak_len_lag1"] = df_ts_feat["streak_len_lag1"].where(lag_lbl.notna(), np.nan)

# add time keys (for climatology)
df_ts_feat["month"] = df_ts_feat["tanggal"].dt.month
df_ts_feat["weekofyear"] = df_ts_feat["tanggal"].dt.isocalendar().week.astype(int)
df_ts_feat["dow"] = df_ts_feat["tanggal"].dt.dayofweek

# ============================================================
# 3.3 Climatology + label priors
# ============================================================
df_pol_mon = (df_ts_feat.groupby(["stasiun_code","month"], as_index=False)[POLL_COLS]
                        .mean()
                        .rename(columns={c: f"{c}_mon_mean" for c in POLL_COLS}))
df_pol_wk  = (df_ts_feat.groupby(["stasiun_code","weekofyear"], as_index=False)[POLL_COLS]
                        .mean()
                        .rename(columns={c: f"{c}_wk_mean" for c in POLL_COLS}))

df_wx_mon = (df_ts_feat.groupby(["stasiun_code","month"], as_index=False)[WX_COLS]
                      .mean()
                      .rename(columns={c: f"{c}_mon_mean" for c in WX_COLS})) if len(WX_COLS) else pd.DataFrame(columns=["stasiun_code","month"])
df_wx_wk  = (df_ts_feat.groupby(["stasiun_code","weekofyear"], as_index=False)[WX_COLS]
                      .mean()
                      .rename(columns={c: f"{c}_wk_mean" for c in WX_COLS})) if len(WX_COLS) else pd.DataFrame(columns=["stasiun_code","weekofyear"])

df_ndvi_mon = (df_ts_feat.groupby(["stasiun_code","month"], as_index=False)["ndvi"]
                        .mean()
                        .rename(columns={"ndvi":"ndvi_mon_mean"}))

# label priors from history
dfh_lbl = dfh.loc[dfh[LBL_COL].notna(), ["tanggal","stasiun_code",LBL_COL]].copy()
dfh_lbl["month"] = dfh_lbl["tanggal"].dt.month
dfh_lbl["weekofyear"] = dfh_lbl["tanggal"].dt.isocalendar().week.astype(int)

tmp_m = pd.crosstab([dfh_lbl["stasiun_code"], dfh_lbl["month"]], dfh_lbl[LBL_COL], normalize="index").reset_index()
tmp_w = pd.crosstab([dfh_lbl["stasiun_code"], dfh_lbl["weekofyear"]], dfh_lbl[LBL_COL], normalize="index").reset_index()
for k in LABELS:
    if k not in tmp_m.columns: tmp_m[k] = 0.0
    if k not in tmp_w.columns: tmp_w[k] = 0.0

df_lbl_mon = tmp_m.rename(columns={"BAIK":"p_baik_mon","SEDANG":"p_sedang_mon","TIDAK SEHAT":"p_tidak_mon"})
df_lbl_wk  = tmp_w.rename(columns={"BAIK":"p_baik_wk","SEDANG":"p_sedang_wk","TIDAK SEHAT":"p_tidak_wk"})
df_lbl_mon = _ensure_station(df_lbl_mon, "stasiun_code")
df_lbl_wk  = _ensure_station(df_lbl_wk, "stasiun_code")

# ============================================================
# 3.4 df_targets_feat (submission) — FORCE month exists (FIX KEYERROR)
# ============================================================
df_targets_feat = _ensure_station(df_targets.copy(), "stasiun_code")
df_targets_feat["tanggal_target"] = pd.to_datetime(df_targets_feat["tanggal_target"], errors="coerce")
df_targets_feat["anchor_date"] = pd.to_datetime(df_targets_feat["last_obs_date"], errors="coerce")

# FORCE dateparts from tanggal_target (ini FIX month hilang)
df_targets_feat = _add_dateparts(df_targets_feat, "tanggal_target")

# attach holiday info from df_calendar if available (optional)
cal = _dedup_cols(df_calendar.copy())
cal["tanggal"] = pd.to_datetime(cal["tanggal"], errors="coerce")
cal = cal.dropna(subset=["tanggal"]).copy()

if "is_weekend" not in cal.columns:
    cal["is_weekend"] = (cal["tanggal"].dt.dayofweek >= 5).astype(int)
if "is_holiday_nasional" not in cal.columns:
    cal["is_holiday_nasional"] = 0
if "holiday_or_weekend" not in cal.columns:
    cal["holiday_or_weekend"] = ((cal["is_weekend"].astype(int) == 1) | (cal["is_holiday_nasional"].astype(int) == 1)).astype(int)

cal_keep = [c for c in ["tanggal","is_weekend","is_holiday_nasional","holiday_or_weekend","day_name","nama_libur"] if c in cal.columns]
df_targets_feat = df_targets_feat.merge(cal[cal_keep], left_on="tanggal_target", right_on="tanggal", how="left")
df_targets_feat = df_targets_feat.drop(columns=["tanggal"], errors="ignore")

# horizon transforms
df_targets_feat["horizon_weeks"]  = df_targets_feat["horizon_days"] / 7.0
df_targets_feat["horizon_months"] = df_targets_feat["horizon_days"] / 30.0
df_targets_feat["log1p_horizon"]  = np.log1p(df_targets_feat["horizon_days"].clip(lower=0))

# anchor snapshot at HIST_END (prefix a_)
drop_raw = {"label_3","_ycode"}
anchor_cols_raw = [c for c in df_ts_feat.columns if c not in drop_raw and c not in ["tanggal","stasiun_code","month","weekofyear","dow"]]
df_anchor = df_ts_feat.loc[df_ts_feat["tanggal"] == pd.Timestamp(HIST_END), ["stasiun_code"] + anchor_cols_raw].copy()
df_anchor = _ensure_station(df_anchor, "stasiun_code")
df_anchor = df_anchor.rename(columns={c: f"a_{c}" for c in anchor_cols_raw})
df_targets_feat = _merge_safe(df_targets_feat, df_anchor, on=["stasiun_code"], how="left", validate="m:1")

# climatology + priors (sekarang month sudah pasti ada -> tidak KeyError)
df_targets_feat = _merge_safe(df_targets_feat, df_pol_mon, on=["stasiun_code","month"], how="left", validate="m:1")
df_targets_feat = _merge_safe(df_targets_feat, df_pol_wk,  on=["stasiun_code","weekofyear"], how="left", validate="m:1")
if len(WX_COLS):
    df_targets_feat = _merge_safe(df_targets_feat, df_wx_mon, on=["stasiun_code","month"], how="left", validate="m:1")
    df_targets_feat = _merge_safe(df_targets_feat, df_wx_wk,  on=["stasiun_code","weekofyear"], how="left", validate="m:1")
df_targets_feat = _merge_safe(df_targets_feat, df_ndvi_mon, on=["stasiun_code","month"], how="left", validate="m:1")
df_targets_feat = _merge_safe(df_targets_feat, df_lbl_mon,  on=["stasiun_code","month"], how="left", validate="m:1")
df_targets_feat = _merge_safe(df_targets_feat, df_lbl_wk,   on=["stasiun_code","weekofyear"], how="left", validate="m:1")

for c in ["p_baik_mon","p_sedang_mon","p_tidak_mon","p_baik_wk","p_sedang_wk","p_tidak_wk"]:
    if c in df_targets_feat.columns:
        df_targets_feat[c] = df_targets_feat[c].fillna(0.0)

# anomalies
for c in ["max","pm25","pm10","o3","no2"]:
    a_col = f"a_{c}_rmean7"
    m_col = f"{c}_mon_mean"
    if a_col in df_targets_feat.columns and m_col in df_targets_feat.columns:
        df_targets_feat[f"{c}_anom_anchor7_vs_mon"] = df_targets_feat[a_col] - df_targets_feat[m_col]

# ============================================================
# 3.5 df_train_sup (vectorized)
# ============================================================
df_train_sup = None
if BUILD_TRAIN_TABLE:
    y = dfh.loc[dfh[LBL_COL].isin(LABELS), ["tanggal","stasiun_code",LBL_COL]].copy()
    y = y.rename(columns={"tanggal":"tanggal_target"}).copy()
    y["tanggal_target"] = pd.to_datetime(y["tanggal_target"])
    y = y.loc[y["tanggal_target"] >= TRAIN_START].copy()

    H = np.arange(1, MAX_H + 1, dtype=np.int16)
    n0 = len(y)
    rep = np.repeat(np.arange(n0), len(H))
    hvec = np.tile(H, n0)

    df_sup = y.iloc[rep].reset_index(drop=True)
    df_sup["horizon_days"] = hvec.astype(np.int16)
    df_sup["anchor_date"] = df_sup["tanggal_target"] - pd.to_timedelta(df_sup["horizon_days"].astype(int), unit="D")

    min_anchor = df_ts_feat["tanggal"].min()
    max_anchor = df_ts_feat["tanggal"].max()
    df_sup = df_sup.loc[(df_sup["anchor_date"] >= min_anchor) & (df_sup["anchor_date"] <= max_anchor)].copy()
    df_sup = df_sup.reset_index(drop=True)

    df_anchor_all = df_ts_feat[["tanggal","stasiun_code"] + anchor_cols_raw].copy()
    df_anchor_all = df_anchor_all.rename(columns={"tanggal":"anchor_date"})
    df_anchor_all = df_anchor_all.rename(columns={c: f"a_{c}" for c in anchor_cols_raw})
    df_anchor_all = _ensure_station(df_anchor_all, "stasiun_code")
    df_anchor_all["anchor_date"] = pd.to_datetime(df_anchor_all["anchor_date"])

    df_sup = _merge_safe(df_sup, df_anchor_all, on=["anchor_date","stasiun_code"], how="left", validate="m:1")

    core = []
    if f"a_{POLL_COLS[0]}_lag1" in df_sup.columns: core.append(f"a_{POLL_COLS[0]}_lag1")
    if f"a_{POLL_COLS[0]}_rmean30" in df_sup.columns: core.append(f"a_{POLL_COLS[0]}_rmean30")
    if len(core):
        df_sup = df_sup.dropna(subset=core).copy()

    # FORCE dateparts from tanggal_target (buat month pasti ada)
    df_sup = _add_dateparts(df_sup, "tanggal_target")

    # merge optional calendar flags
    df_sup = df_sup.merge(cal[cal_keep].rename(columns={"tanggal":"tanggal_target"}), on="tanggal_target", how="left", validate="m:1")

    df_sup["horizon_weeks"]  = df_sup["horizon_days"] / 7.0
    df_sup["horizon_months"] = df_sup["horizon_days"] / 30.0
    df_sup["log1p_horizon"]  = np.log1p(df_sup["horizon_days"].astype(float))

    df_sup = _merge_safe(df_sup, df_pol_mon, on=["stasiun_code","month"], how="left", validate="m:1")
    df_sup = _merge_safe(df_sup, df_pol_wk,  on=["stasiun_code","weekofyear"], how="left", validate="m:1")
    if len(WX_COLS):
        df_sup = _merge_safe(df_sup, df_wx_mon, on=["stasiun_code","month"], how="left", validate="m:1")
        df_sup = _merge_safe(df_sup, df_wx_wk,  on=["stasiun_code","weekofyear"], how="left", validate="m:1")
    df_sup = _merge_safe(df_sup, df_ndvi_mon, on=["stasiun_code","month"], how="left", validate="m:1")
    df_sup = _merge_safe(df_sup, df_lbl_mon,  on=["stasiun_code","month"], how="left", validate="m:1")
    df_sup = _merge_safe(df_sup, df_lbl_wk,   on=["stasiun_code","weekofyear"], how="left", validate="m:1")

    for c in ["p_baik_mon","p_sedang_mon","p_tidak_mon","p_baik_wk","p_sedang_wk","p_tidak_wk"]:
        if c in df_sup.columns:
            df_sup[c] = df_sup[c].fillna(0.0)

    for c in ["max","pm25","pm10","o3","no2"]:
        a_col = f"a_{c}_rmean7"
        m_col = f"{c}_mon_mean"
        if a_col in df_sup.columns and m_col in df_sup.columns:
            df_sup[f"{c}_anom_anchor7_vs_mon"] = df_sup[a_col] - df_sup[m_col]

    df_sup["y"] = df_sup[LBL_COL].map(MAP_CODE).astype(int)
    df_sup["target_date"] = df_sup["tanggal_target"]  # alias utk Stage 4

    if USE_SAMPLE_WEIGHT:
        h = df_sup["horizon_days"].astype(float)
        w_h = 1.0 + (WEIGHT_LONGH_MAX - 1.0) * (h - 1.0) / max(1.0, (MAX_H - 1.0))
        m = df_sup["month"].astype(int)
        w_m = np.where(m.isin([9,10,11]), WEIGHT_SEP_NOV, 1.0)
        df_sup["sample_weight"] = (w_h * w_m).astype("float32")
    else:
        df_sup["sample_weight"] = 1.0

    df_train_sup = _dedup_cols(df_sup).copy()

# ============================================================
# 3.6 Feature list
# ============================================================
CAT_FEATURES = [c for c in ["stasiun_code","day_name","nama_libur"] if c in df_targets_feat.columns]

BASE_NUM = [c for c in [
    "year","month","day","dow","dayofyear","weekofyear",
    "sin_doy","cos_doy","sin_month","cos_month","sin_dow","cos_dow",
    "is_weekend","is_holiday_nasional","holiday_or_weekend",
    "is_month_start","is_month_end",
    "horizon_days","horizon_weeks","horizon_months","log1p_horizon"
] if c in df_targets_feat.columns]

ANCHOR_NUM = [c for c in df_targets_feat.columns if c.startswith("a_")]
PRIOR_NUM  = [c for c in ["p_baik_mon","p_sedang_mon","p_tidak_mon","p_baik_wk","p_sedang_wk","p_tidak_wk","ndvi_mon_mean"] if c in df_targets_feat.columns]
CLIM_NUM   = [c for c in df_targets_feat.columns if c.endswith("_mon_mean") or c.endswith("_wk_mean")]
ANOM_NUM   = [c for c in df_targets_feat.columns if c.endswith("_anom_anchor7_vs_mon")]

def _dedup_list(xs):
    out, seen = [], set()
    for x in xs:
        if x not in seen:
            out.append(x); seen.add(x)
    return out

NUM_FEATURES = _dedup_list(BASE_NUM + PRIOR_NUM + CLIM_NUM + ANOM_NUM + ANCHOR_NUM)
CAT_FEATURES = _dedup_list(CAT_FEATURES)
FEATURE_COLS_MODEL = CAT_FEATURES + NUM_FEATURES

# ============================================================
# 3.7 QA
# ============================================================
print("=== STAGE 3 SUMMARY (REV TOP v3.3) ===")
print("HIST_END:", pd.Timestamp(HIST_END).date())
print("df_ts_feat:", df_ts_feat.shape, "| POLL_COLS:", POLL_COLS, "| WX_COLS:", len(WX_COLS), "| use_ndvi:", bool(use_ndvi))
print("df_targets_feat:", df_targets_feat.shape, "| features:", len(FEATURE_COLS_MODEL), "| cat:", len(CAT_FEATURES), "| num:", len(NUM_FEATURES))

# hard check month existence (fix for your KeyError)
if "month" not in df_targets_feat.columns:
    raise RuntimeError("BUG: month still missing in df_targets_feat. Check tanggal_target parsing.")
if "month" not in df_pol_mon.columns:
    raise RuntimeError("BUG: month missing in df_pol_mon. Check df_ts_feat month creation.")

show_cols = ["id","tanggal_target","stasiun_code","horizon_days","month","weekofyear"]
show_cols += [c for c in ["is_weekend","is_holiday_nasional","p_tidak_mon","p_tidak_wk"] if c in df_targets_feat.columns]
show_cols = [c for c in show_cols if c in df_targets_feat.columns]
print("\ndf_targets_feat preview:")
print(df_targets_feat[show_cols].head(10))

if BUILD_TRAIN_TABLE:
    print("\ndf_train_sup:", df_train_sup.shape)
    print("Train target date range:", df_train_sup["tanggal_target"].min().date(), "->", df_train_sup["tanggal_target"].max().date())
    print("Horizon range:", int(df_train_sup["horizon_days"].min()), "->", int(df_train_sup["horizon_days"].max()))
    print("Label distribution:")
    print(df_train_sup[LBL_COL].value_counts())
    print("Has target_date alias:", "target_date" in df_train_sup.columns)

print("\n[OK] Stage 3 completed: df_ts_feat, df_targets_feat, df_train_sup, FEATURE_COLS_MODEL ready.")

=== STAGE 3 SUMMARY (REV TOP v3.3) ===
HIST_END: 2025-08-31
df_ts_feat: (16183, 225) | POLL_COLS: ['max', 'pm10', 'pm25', 'so2', 'co', 'o3', 'no2'] | WX_COLS: 8 | use_ndvi: True
df_targets_feat: (455, 293) | features: 279 | cat: 1 | num: 278

df_targets_feat preview:
                id tanggal_target stasiun_code  horizon_days  month  \
0  2025-09-01_DKI1     2025-09-01         DKI1             1      9   
1  2025-09-01_DKI2     2025-09-01         DKI2             1      9   
2  2025-09-01_DKI3     2025-09-01         DKI3             1      9   
3  2025-09-01_DKI4     2025-09-01         DKI4             1      9   
4  2025-09-01_DKI5     2025-09-01         DKI5             1      9   
5  2025-09-02_DKI1     2025-09-02         DKI1             2      9   
6  2025-09-02_DKI2     2025-09-02         DKI2             2      9   
7  2025-09-02_DKI3     2025-09-02         DKI3             2      9   
8  2025-09-02_DKI4     2025-09-02         DKI4             2      9   
9  2025-09-02_DKI5    

# Model Training 

In [5]:
# ============================================================
# STAGE 4 — Model Training (Forecasting-safe + Time-based CV)
# REV TOP v6.2 — FIX string[python] dtype (NO np.issubdtype)
#
# Requires from STAGE 3:
#   df_train_sup, FEATURE_COLS_MODEL
#
# Outputs:
#   models_by_fold (dict fold -> list[CatBoostClassifier])
#   oof_pred_proba, oof_macro_f1, cv_report
#   FEATURE_COLS_MODEL_USED, CAT_FEATS_MODEL_USED, LABELS, id_to_label
# ============================================================

import os
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from pandas.api.types import (
    is_datetime64_any_dtype,
    is_timedelta64_dtype,
)
from catboost import CatBoostClassifier, Pool

# ---------- guards ----------
need = ["df_train_sup", "FEATURE_COLS_MODEL"]
miss = [k for k in need if k not in globals()]
if miss:
    raise RuntimeError(f"Missing globals from STAGE 3: {miss}. Jalankan STAGE 3 dulu.")

SEED_BASE = 42
np.random.seed(SEED_BASE)

LABELS = ["BAIK", "SEDANG", "TIDAK SEHAT"]
label_to_id = {k: i for i, k in enumerate(LABELS)}
id_to_label = {i: k for k, i in label_to_id.items()}

# =========================
# helpers
# =========================
def _dedup_cols(df: pd.DataFrame) -> pd.DataFrame:
    return df.loc[:, ~df.columns.duplicated()].copy()

def _pick_col(df: pd.DataFrame, candidates):
    for c in candidates:
        if c in df.columns:
            return c
    return None

def _macro_f1(y_true, proba_3):
    pred = np.asarray(proba_3).argmax(axis=1)
    return float(f1_score(y_true, pred, average="macro"))

def _sanitize_for_cb(df: pd.DataFrame, cat_cols, num_cols) -> pd.DataFrame:
    df = df.copy()
    for c in cat_cols:
        if c in df.columns:
            df[c] = df[c].astype("string").fillna("NA").astype(str)
    for c in num_cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce").astype("float32")
    return df

def _is_datetime_like(s: pd.Series) -> bool:
    # aman untuk dtype "string[python]" dll
    return is_datetime64_any_dtype(s) or is_timedelta64_dtype(s)

def _drop_forbidden_features(cols):
    # pastikan tidak ada leakage/kolom target masuk fitur
    forbid_exact = {
        "label_3", "kategori_3", "kategori", "y",
        "target_date", "tanggal_target", "tanggal", "date", "time",
        "anchor_date", "cutoff_date",
    }
    forbid_substr = ["label", "kategori", "target_date", "tanggal_target", "anchor_date", "cutoff_date"]
    out = []
    for c in cols:
        if c in forbid_exact:
            continue
        low = str(c).lower()
        if any(s in low for s in forbid_substr):
            continue
        out.append(c)
    return out

# =========================
# 4.1 Prepare supervised table
# =========================
df = _dedup_cols(df_train_sup.copy())

date_col  = _pick_col(df, ["target_date", "tanggal_target", "tanggal", "date", "time"])
label_col = _pick_col(df, ["label_3", "kategori_3", "kategori"])
hcol      = _pick_col(df, ["horizon_days", "horizon", "h"])

if date_col is None:
    raise RuntimeError(f"Cannot find target date column in df_train_sup. cols(head)={df.columns.tolist()[:60]}")
if label_col is None:
    raise RuntimeError("Cannot find label column (label_3/kategori_3/kategori) in df_train_sup.")
if hcol is None:
    raise RuntimeError("Cannot find horizon column (horizon_days/horizon/h) in df_train_sup.")
if "stasiun_code" not in df.columns:
    raise RuntimeError("df_train_sup missing stasiun_code. Pastikan Stage 3 benar.")

df["target_date"] = pd.to_datetime(df[date_col], errors="coerce")
df = df.dropna(subset=["target_date"]).copy()

df["label_3"] = df[label_col].astype("string").str.upper().str.strip()
df["label_3"] = df["label_3"].replace({"SANGAT TIDAK SEHAT":"TIDAK SEHAT","BERBAHAYA":"TIDAK SEHAT"})
df = df.loc[df["label_3"].isin(LABELS)].copy()
df["y"] = df["label_3"].map(label_to_id).astype(int)

df["horizon_days"] = pd.to_numeric(df[hcol], errors="coerce").astype("float32")
df = df.dropna(subset=["horizon_days"]).copy()
df["horizon_days"] = df["horizon_days"].clip(1, 91)

df["stasiun_code"] = df["stasiun_code"].astype("string").str.upper().str.strip()

# =========================
# 4.2 Feature list (strict + safe)  (NO np.issubdtype)
# =========================
raw_feats = [c for c in FEATURE_COLS_MODEL if c in df.columns]
raw_feats = _drop_forbidden_features(raw_feats)

FEATURE_COLS_MODEL_USED = []
for c in raw_feats:
    s = df[c]
    if _is_datetime_like(s):
        continue
    FEATURE_COLS_MODEL_USED.append(c)

FEATURE_COLS_MODEL_USED = list(dict.fromkeys(FEATURE_COLS_MODEL_USED))
if len(FEATURE_COLS_MODEL_USED) < 50:
    raise RuntimeError(f"Too few usable features: {len(FEATURE_COLS_MODEL_USED)}. Cek Stage 3 FEATURE_COLS_MODEL.")

# cat features
if "CAT_FEATURES" in globals() and isinstance(globals()["CAT_FEATURES"], (list, tuple)):
    CAT_FEATS_MODEL_USED = [c for c in globals()["CAT_FEATURES"] if c in FEATURE_COLS_MODEL_USED]
else:
    CAT_FEATS_MODEL_USED = [c for c in ["stasiun_code", "day_name", "nama_libur"] if c in FEATURE_COLS_MODEL_USED]

NUM_FEATS_MODEL_USED = [c for c in FEATURE_COLS_MODEL_USED if c not in CAT_FEATS_MODEL_USED]

# sanitize for CatBoost
df = _sanitize_for_cb(df, CAT_FEATS_MODEL_USED, NUM_FEATS_MODEL_USED)

# =========================
# 4.3 Sample weights (LB-oriented: long-horizon + recency)
# =========================
h = df["horizon_days"].to_numpy(dtype="float64")
w_h = np.where(h < 14, 0.85, np.where(h < 30, 1.00, 1.00 + 0.55*((h-30)/(91-30))**1.25)).astype("float64")

tmin, tmax = df["target_date"].min(), df["target_date"].max()
if pd.isna(tmin) or pd.isna(tmax) or tmin == tmax:
    w_t = np.ones(len(df), dtype="float64")
else:
    ti = df["target_date"].astype("datetime64[ns]").astype("int64").to_numpy()
    frac = (ti - int(tmin.value)) / max(1, (int(tmax.value) - int(tmin.value)))
    frac = np.clip(frac.astype("float64"), 0.0, 1.0)
    w_t = (0.85 + 0.70*(frac**1.35)).astype("float64")

sample_weight = (w_h * w_t).astype("float32")

# =========================
# 4.4 Time-based CV (walk-forward by year)
# =========================
years = sorted(df["target_date"].dt.year.unique().tolist())
if len(years) >= 6:
    val_years = years[-3:]
elif len(years) >= 4:
    val_years = years[-2:]
else:
    val_years = years[-1:]

folds = []
for vy in val_years:
    tr = (df["target_date"].dt.year < vy).to_numpy()
    va = (df["target_date"].dt.year == vy).to_numpy()
    folds.append((f"val_year_{vy}", tr, va))

print("=== STAGE 4 SETUP (REV TOP v6.2) ===")
print("Detected date_col:", date_col, "| label_col:", label_col, "| horizon_col:", hcol)
print("Rows:", len(df), "| Years:", years)
print("Val folds:", [f[0] for f in folds])
print("Features:", len(FEATURE_COLS_MODEL_USED), "| cat:", len(CAT_FEATS_MODEL_USED), "| num:", len(NUM_FEATS_MODEL_USED))
print("Label counts:", df["y"].value_counts().to_dict())

TASK_TYPE = "GPU" if os.environ.get("CUDA_VISIBLE_DEVICES") not in (None, "", "-1") else "CPU"

# NOTE: early stop pakai MultiClass (loss) agar tidak "best_iter=0..10" karena noise F1
CB_PARAMS = dict(
    loss_function="MultiClass",
    eval_metric="MultiClass",
    custom_metric=["TotalF1"],
    iterations=8000,
    learning_rate=0.03,
    depth=8,
    l2_leaf_reg=9.0,
    random_strength=1.0,
    bagging_temperature=0.8,
    bootstrap_type="Bayesian",
    boosting_type="Ordered",
    rsm=0.90,
    border_count=254,
    max_ctr_complexity=4,
    od_type="Iter",
    od_wait=900,
    auto_class_weights="Balanced",
    task_type=TASK_TYPE,
    verbose=300
)

# =========================
# 4.5 Train (multi-seed ensemble per fold)
# =========================
SEEDS = [42, 777]
oof_pred_proba = np.full((len(df), len(LABELS)), np.nan, dtype=float)
models_by_fold = {}
cv_rows = []

for fold_name, tr_mask, va_mask in folds:
    ntr, nva = int(tr_mask.sum()), int(va_mask.sum())
    print(f"\n--- Fold {fold_name}: train={ntr} valid={nva} | task_type={TASK_TYPE} ---")
    if ntr == 0 or nva == 0:
        cv_rows.append({"fold": fold_name, "n_train": ntr, "n_valid": nva, "macro_f1_ens": np.nan})
        continue

    X_tr = df.loc[tr_mask, FEATURE_COLS_MODEL_USED]
    y_tr = df.loc[tr_mask, "y"].to_numpy()
    w_tr = sample_weight[tr_mask]

    X_va = df.loc[va_mask, FEATURE_COLS_MODEL_USED]
    y_va = df.loc[va_mask, "y"].to_numpy()
    w_va = sample_weight[va_mask]

    tr_pool = Pool(X_tr, label=y_tr, cat_features=CAT_FEATS_MODEL_USED, weight=w_tr)
    va_pool = Pool(X_va, label=y_va, cat_features=CAT_FEATS_MODEL_USED, weight=w_va)

    proba_sum = np.zeros((nva, len(LABELS)), dtype=float)
    models = []
    seed_scores = []

    for sd in SEEDS:
        model = CatBoostClassifier(**CB_PARAMS, random_seed=int(sd))
        model.fit(tr_pool, eval_set=va_pool, use_best_model=True)

        proba = model.predict_proba(X_va)
        proba_sum += proba

        f1m = _macro_f1(y_va, proba)
        seed_scores.append(f1m)
        models.append(model)
        print(f"  seed={sd} macroF1={f1m:.5f} | best_iter={int(model.get_best_iteration())}")

    proba_ens = proba_sum / float(len(SEEDS))
    f1_ens = _macro_f1(y_va, proba_ens)
    print(f"  ENSEMBLE macroF1={f1_ens:.5f} | seeds={SEEDS}")

    oof_pred_proba[va_mask] = proba_ens
    models_by_fold[fold_name] = models
    cv_rows.append({
        "fold": fold_name,
        "n_train": ntr,
        "n_valid": nva,
        "macro_f1_ens": float(f1_ens),
        "macro_f1_seed_min": float(np.min(seed_scores)),
        "macro_f1_seed_max": float(np.max(seed_scores)),
        "macro_f1_seed_mean": float(np.mean(seed_scores)),
    })

cv_report = pd.DataFrame(cv_rows)

oof_mask = ~np.isnan(oof_pred_proba).any(axis=1)
oof_true = df.loc[oof_mask, "y"].to_numpy()
oof_macro_f1 = _macro_f1(oof_true, oof_pred_proba[oof_mask]) if oof_mask.any() else np.nan

print("\n=== STAGE 4 SUMMARY (REV TOP v6.2) ===")
print(cv_report)
print("OOF macro F1:", oof_macro_f1)

print("\n[OK] Stage 4 completed. Globals ready for Stage 5:")
print("- models_by_fold (dict fold -> list[CatBoostClassifier])")
print("- oof_pred_proba, oof_macro_f1, cv_report")
print("- FEATURE_COLS_MODEL_USED, CAT_FEATS_MODEL_USED, LABELS, id_to_label")

  w_h = np.where(h < 14, 0.85, np.where(h < 30, 1.00, 1.00 + 0.55*((h-30)/(91-30))**1.25)).astype("float64")


=== STAGE 4 SETUP (REV TOP v6.2) ===
Detected date_col: target_date | label_col: label_3 | horizon_col: horizon_days
Rows: 586647 | Years: [2020, 2021, 2022, 2023, 2024, 2025]
Val folds: ['val_year_2023', 'val_year_2024', 'val_year_2025']
Features: 279 | cat: 1 | num: 278
Label counts: {1: 443099, 2: 82250, 0: 61298}

--- Fold val_year_2023: train=180852 valid=142537 | task_type=CPU ---
0:	learn: 1.0748509	test: 1.0884126	best: 1.0884126 (0)	total: 4.97s	remaining: 11h 3m 1s
300:	learn: 0.2372476	test: 1.5215647	best: 0.9383125 (38)	total: 21m 45s	remaining: 9h 16m 35s
600:	learn: 0.1315743	test: 1.9899362	best: 0.9383125 (38)	total: 43m 17s	remaining: 8h 52m 56s
900:	learn: 0.0844847	test: 2.2760111	best: 0.9383125 (38)	total: 1h 4m 46s	remaining: 8h 30m 22s
Stopped by overfitting detector  (900 iterations wait)

bestTest = 0.9383124597
bestIteration = 38

Shrink model to first 39 iterations.
  seed=42 macroF1=0.55625 | best_iter=38
0:	learn: 1.0756165	test: 1.0841223	best: 1.0841223 

# Inference, Ensembling, Submission & QA

In [6]:
# ============================================================
# STAGE 5 — Test Inference (Fold Ensemble) + Submission + QA
# REV TOP v5.0 — MATCH Stage 4 v6.2 output
#
# Expects:
#   df_targets_feat
#   models_by_fold (dict: fold_name -> list[CatBoostClassifier])  [from Stage 4 v6.2]
#   FEATURE_COLS_MODEL_USED, CAT_FEATS_MODEL_USED                [from Stage 4 v6.2]
#
# Output:
#   /kaggle/working/submission.csv
#   /kaggle/working/pred_proba.npy
# ============================================================

import re
import numpy as np
import pandas as pd
from pathlib import Path

# -----------------------------
# Guards
# -----------------------------
if "df_targets_feat" not in globals():
    raise RuntimeError("Missing df_targets_feat. Jalankan Stage 3 dulu.")
if not isinstance(df_targets_feat, pd.DataFrame) or len(df_targets_feat) == 0:
    raise RuntimeError("df_targets_feat kosong / bukan DataFrame.")

DATA_ROOT = Path("/kaggle/input/penyisihan-datavidia-10")
SAMPLE_PATH = DATA_ROOT / "sample_submission.csv"
OUT_SUB = Path("/kaggle/working/submission.csv")
OUT_PROBA = Path("/kaggle/working/pred_proba.npy")

LABELS = ["BAIK", "SEDANG", "TIDAK SEHAT"]
id_to_label = {i: k for i, k in enumerate(LABELS)}

# -----------------------------
# Controls
# -----------------------------
SAVE_PROBA = True

# prior blend (stabil long horizon)
USE_PRIOR_BLEND = True
PRIOR_BLEND_MAX = 0.30   # 0.15–0.45 (naikkan kalau model terlalu "overconfident")

# fold recency weighting (target 2025)
WEIGHT_FOLDS_BY_RECENCY = True
RECENCY_TAU = 1.20       # makin kecil => makin berat ke fold terbaru

# -----------------------------
# Helpers
# -----------------------------
def _dedup_cols(df: pd.DataFrame) -> pd.DataFrame:
    return df.loc[:, ~df.columns.duplicated()].copy()

def sanitize_features(df: pd.DataFrame, cat_cols, num_cols) -> pd.DataFrame:
    df = df.copy()
    for c in cat_cols:
        if c in df.columns:
            df[c] = df[c].astype("string").fillna("NA").astype(str)
        else:
            df[c] = "NA"
    for c in num_cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce").astype("float32")
        else:
            df[c] = np.nan
    return df

def normalize_rows(p: np.ndarray) -> np.ndarray:
    p = np.asarray(p, dtype="float64")
    s = p.sum(axis=1, keepdims=True)
    s = np.where(s <= 0, 1.0, s)
    return p / s

def make_prior_proba(df: pd.DataFrame) -> np.ndarray:
    n = len(df)
    mon_cols = ["p_baik_mon", "p_sedang_mon", "p_tidak_mon"]
    wk_cols  = ["p_baik_wk",  "p_sedang_wk",  "p_tidak_wk"]

    has_mon = all(c in df.columns for c in mon_cols)
    has_wk  = all(c in df.columns for c in wk_cols)

    cnt = 0
    acc = np.zeros((n, 3), dtype="float64")
    if has_mon:
        acc += np.nan_to_num(df[mon_cols].to_numpy(dtype="float64"), nan=0.0); cnt += 1
    if has_wk:
        acc += np.nan_to_num(df[wk_cols].to_numpy(dtype="float64"), nan=0.0); cnt += 1

    if cnt > 0:
        return normalize_rows(acc / cnt)

    # fallback terakhir: uniform
    return np.tile(np.array([1/3, 1/3, 1/3], dtype="float64")[None, :], (n, 1))

def blend_with_prior(proba_model: np.ndarray, proba_prior: np.ndarray, horizon: np.ndarray, prior_max: float) -> np.ndarray:
    h = np.asarray(horizon, dtype="float64")
    hmin, hmax = np.nanmin(h), np.nanmax(h)
    denom = (hmax - hmin) if (hmax > hmin) else 1.0
    t = np.clip((h - hmin) / denom, 0.0, 1.0).reshape(-1, 1)
    w = prior_max * t
    out = (1.0 - w) * proba_model + w * proba_prior
    return normalize_rows(out)

def iter_models(obj):
    # flatten holder: model | list/tuple/set | dict | nested
    out = []
    if obj is None:
        return out
    if hasattr(obj, "predict_proba"):
        return [obj]
    if isinstance(obj, (list, tuple, set)):
        for x in obj:
            out.extend(iter_models(x))
        return out
    if isinstance(obj, dict):
        for v in obj.values():
            out.extend(iter_models(v))
        return out
    return out

def fold_year_from_name(name: str):
    m = re.search(r"(\d{4})", str(name))
    return int(m.group(1)) if m else None

def build_fold_weights(fold_names):
    if not WEIGHT_FOLDS_BY_RECENCY:
        return {n: 1.0 / max(1, len(fold_names)) for n in fold_names}

    years = [fold_year_from_name(n) for n in fold_names]
    years_ok = [y for y in years if y is not None]
    if len(years_ok) == 0:
        return {n: 1.0 / max(1, len(fold_names)) for n in fold_names}

    y_max = max(years_ok)
    w = {}
    for n in fold_names:
        y = fold_year_from_name(n)
        if y is None:
            w[n] = 1.0
        else:
            w[n] = float(np.exp(-(y_max - y) / max(1e-6, RECENCY_TAU)))

    s = sum(w.values())
    if s <= 0:
        return {n: 1.0 / max(1, len(fold_names)) for n in fold_names}
    return {n: v / s for n, v in w.items()}

# -----------------------------
# Load sample_submission (order lock)
# -----------------------------
df_sample = pd.read_csv(SAMPLE_PATH)
sample_id_col = df_sample.columns[0]
sample_target_col = df_sample.columns[1] if len(df_sample.columns) > 1 else "kategori"

# -----------------------------
# Prepare prediction table
# -----------------------------
df_pred = _dedup_cols(df_targets_feat.copy())

if "id" not in df_pred.columns:
    raise RuntimeError("df_targets_feat harus punya kolom 'id'.")
if "horizon_days" not in df_pred.columns:
    raise RuntimeError("df_targets_feat harus punya kolom 'horizon_days'.")

# Feature list from Stage 4 v6.2 (priority)
if "FEATURE_COLS_MODEL_USED" in globals() and isinstance(globals()["FEATURE_COLS_MODEL_USED"], list):
    FEATURE_COLS = globals()["FEATURE_COLS_MODEL_USED"]
else:
    raise RuntimeError("Missing FEATURE_COLS_MODEL_USED (jalankan Stage 4 v6.2 dulu).")

if "CAT_FEATS_MODEL_USED" in globals() and isinstance(globals()["CAT_FEATS_MODEL_USED"], list):
    CAT_COLS = globals()["CAT_FEATS_MODEL_USED"]
else:
    CAT_COLS = [c for c in ["stasiun_code", "day_name", "nama_libur"] if c in df_pred.columns]

CAT_COLS = [c for c in CAT_COLS if c in FEATURE_COLS]
NUM_COLS = [c for c in FEATURE_COLS if c not in CAT_COLS]

df_pred = sanitize_features(df_pred, CAT_COLS, NUM_COLS)
X_test = df_pred[FEATURE_COLS].copy()

# -----------------------------
# Predict using fold ensemble (Stage 4 v6.2 structure)
# -----------------------------
proba = None
used_model = False

models_by_fold = globals().get("models_by_fold", None)
if isinstance(models_by_fold, dict) and len(models_by_fold) > 0:
    fold_names = list(models_by_fold.keys())
    fold_w = build_fold_weights(fold_names)

    psum = np.zeros((len(X_test), 3), dtype="float64")
    wsum = 0.0

    print("=== STAGE 5 MODEL DETECTION ===")
    print("Folds:", fold_names)
    print("Fold weights:", {k: round(v, 4) for k, v in fold_w.items()})

    for fold_name, holder in models_by_fold.items():
        fold_models = iter_models(holder)  # list model
        if len(fold_models) == 0:
            print(f"[WARN] Fold '{fold_name}': no usable model.")
            continue

        p_fold_sum = np.zeros((len(X_test), 3), dtype="float64")
        ok = 0
        for mi, model in enumerate(fold_models):
            try:
                p = model.predict_proba(X_test)
                p = np.asarray(p, dtype="float64")
                if p.ndim != 2 or p.shape[1] != 3:
                    p = normalize_rows(p.reshape(len(X_test), -1)[:, :3])
                p_fold_sum += p
                ok += 1
            except Exception as e:
                print(f"[WARN] Fold '{fold_name}' model#{mi} skipped: {e}")

        if ok == 0:
            print(f"[WARN] Fold '{fold_name}': all models failed.")
            continue

        p_fold = normalize_rows(p_fold_sum / ok)
        w = float(fold_w.get(fold_name, 1.0))
        psum += w * p_fold
        wsum += w
        print(f"[OK] Fold '{fold_name}': used {ok}/{len(fold_models)} models, weight={w:.4f}")

    if wsum > 0:
        proba = normalize_rows(psum / wsum)
        used_model = True

# fallback: prior-only (should almost never happen if Stage 4 OK)
if proba is None:
    print("[WARN] models_by_fold not usable -> using prior baseline.")
    proba = make_prior_proba(df_pred)

# optional: blend with prior for long horizon
if USE_PRIOR_BLEND:
    proba_prior = make_prior_proba(df_pred)
    proba = blend_with_prior(proba, proba_prior, df_pred["horizon_days"].to_numpy(), PRIOR_BLEND_MAX)

proba = normalize_rows(proba)

pred_id = proba.argmax(axis=1)
pred_label = pd.Series(pred_id).map(id_to_label).astype(str)

# -----------------------------
# Build submission (locked to sample order)
# -----------------------------
sub = pd.DataFrame({
    sample_id_col: df_pred["id"].astype(str),
    sample_target_col: pred_label
})

sample_ids = df_sample[sample_id_col].astype(str).tolist()

# safest align
sub = df_sample[[sample_id_col]].merge(sub, on=sample_id_col, how="left")

# -----------------------------
# QA
# -----------------------------
ok_rows = (len(sub) == len(df_sample) == 455)
missing_pred = int(sub[sample_target_col].isna().sum())
ok_labels = set(sub[sample_target_col].dropna().unique()).issubset(set(LABELS))
id_match = sub[sample_id_col].astype(str).tolist() == sample_ids

print("\n=== STAGE 5 QA ===")
print("Used model ensemble:", used_model)
print("Rows:", len(sub), "| expected:", len(df_sample), "| ok:", ok_rows)
print("ID order matches sample:", bool(id_match))
print("Missing pred:", missing_pred)
print("Labels ok:", bool(ok_labels))
print("Label counts:\n", sub[sample_target_col].value_counts(dropna=False))

# degenerate check
if sub[sample_target_col].nunique(dropna=False) == 1:
    print("\n[WARN] Prediksi hanya 1 kelas (degenerate). Biasanya karena:")
    print("  - fitur mismatch (FEATURE_COLS tidak sama saat training vs inference)")
    print("  - model tidak benar-benar kepakai (cek log MODEL DETECTION)")
    print("  - distribusi prior terlalu dominan (turunkan PRIOR_BLEND_MAX)")

if (not ok_rows) or (not id_match) or (missing_pred > 0) or (not ok_labels):
    raise RuntimeError("QA failed. Check Stage 3/4 outputs or feature alignment.")

# save
sub.to_csv(OUT_SUB, index=False)
print(f"\n[OK] Saved submission -> {OUT_SUB}")

if SAVE_PROBA:
    np.save(OUT_PROBA, proba.astype("float32"))
    print(f"[OK] Saved proba -> {OUT_PROBA}")

=== STAGE 5 MODEL DETECTION ===
Folds: ['val_year_2023', 'val_year_2024', 'val_year_2025']
Fold weights: {'val_year_2023': 0.1163, 'val_year_2024': 0.2677, 'val_year_2025': 0.616}
[OK] Fold 'val_year_2023': used 2/2 models, weight=0.1163
[OK] Fold 'val_year_2024': used 2/2 models, weight=0.2677
[OK] Fold 'val_year_2025': used 2/2 models, weight=0.6160

=== STAGE 5 QA ===
Used model ensemble: True
Rows: 455 | expected: 455 | ok: True
ID order matches sample: True
Missing pred: 0
Labels ok: True
Label counts:
 category
SEDANG         366
TIDAK SEHAT     61
BAIK            28
Name: count, dtype: int64

[OK] Saved submission -> /kaggle/working/submission.csv
[OK] Saved proba -> /kaggle/working/pred_proba.npy
