In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/penyisihan-datavidia-10/sample_submission.csv
/kaggle/input/penyisihan-datavidia-10/ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2012-komponen-data.csv
/kaggle/input/penyisihan-datavidia-10/ISPU/data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-2023-komponen-data.csv
/kaggle/input/penyisihan-datavidia-10/ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2010-komponen-data.csv
/kaggle/input/penyisihan-datavidia-10/ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2018-komponen-data.csv
/kaggle/input/penyisihan-datavidia-10/ISPU/data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-komponen-data-2025.csv
/kaggle/input/penyisihan-datavidia-10/ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2014-komponen-data.csv
/kaggle/input/penyisihan-datavidia-10/ISPU/data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-komponen-data-2024.csv
/kaggle/input/penyisihan-datavidia-10/ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2011-kompone

# Data Loading & Sanity Checks

In [2]:
import re
from pathlib import Path
import numpy as np
import pandas as pd

# ============================================================
# CONFIG
# ============================================================
DATA_ROOT = Path("/kaggle/input/penyisihan-datavidia-10")
assert DATA_ROOT.exists(), f"DATA_ROOT not found: {DATA_ROOT}"

SEED = 42
np.random.seed(SEED)

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 200)

# ============================================================
# UTILITIES (ROBUST)
# ============================================================
def read_csv_safe(path: Path) -> pd.DataFrame:
    for sep in [",", ";", "\t", "|"]:
        for enc in ["utf-8", "utf-8-sig", "latin1"]:
            try:
                df = pd.read_csv(path, sep=sep, encoding=enc, low_memory=False)
                if df.shape[1] >= 2:
                    return df
            except Exception:
                pass
    raise RuntimeError(f"Gagal membaca CSV: {path}")

def norm_col(c: str) -> str:
    c = str(c).lower().strip()
    c = re.sub(r"[^\w]+", "_", c)
    c = re.sub(r"_+", "_", c).strip("_")
    return c

def standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    df = df.rename(columns={c: norm_col(c) for c in df.columns})

    rename = {}
    for c in df.columns:
        if c in ["tanggal", "date", "waktu", "time", "datetime"]:
            rename[c] = "tanggal"
        elif c in ["stasiun", "station", "stasiun_id"]:
            rename[c] = "stasiun"
        elif c in ["pm10", "pm_10"]:
            rename[c] = "pm10"
        elif c in ["pm25", "pm2_5", "pm_2_5"]:
            rename[c] = "pm25"
        elif c in ["so2"]:
            rename[c] = "so2"
        elif c in ["co"]:
            rename[c] = "co"
        elif c in ["o3"]:
            rename[c] = "o3"
        elif c in ["no2"]:
            rename[c] = "no2"
        elif c in ["kategori", "category", "categori"]:
            rename[c] = "kategori"
        elif c in ["parameter_pencemar_kritis", "critical", "pencemar_kritis"]:
            rename[c] = "parameter_pencemar_kritis"
        elif c in ["ndvi"]:
            rename[c] = "ndvi"
        elif c in ["is_holiday", "is_holiday_nasional"]:
            rename[c] = "is_holiday_nasional"
        elif c in ["is_weekend"]:
            rename[c] = "is_weekend"

    return df.rename(columns=rename)

def parse_date(s: pd.Series) -> pd.Series:
    s = s.astype(str).str.strip()
    s = s.replace({"": np.nan, "nan": np.nan, "none": np.nan, "None": np.nan})
    d1 = pd.to_datetime(s, errors="coerce", dayfirst=True)
    m = d1.isna()
    if m.any():
        d2 = pd.to_datetime(s[m], errors="coerce", dayfirst=False)
        d1.loc[m] = d2
    return d1

def to_numeric(df: pd.DataFrame, cols):
    for c in cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")
    return df

def find_date_column(df: pd.DataFrame):
    for c in df.columns:
        if c in ["tanggal", "date", "waktu", "time", "datetime"]:
            return c
    return None

# ============================================================
# 0) SAMPLE SUBMISSION (FORECAST TARGET ONLY)
# ============================================================
sub = standardize_columns(read_csv_safe(DATA_ROOT / "sample_submission.csv"))

ID_COL = "id" if "id" in sub.columns else sub.columns[0]
SUB_TARGET_COL = "category" if "category" in sub.columns else sub.columns[-1]

assert sub[ID_COL].is_unique, "ID submission tidak unik"

print("sample_submission:", sub.shape)
print("ID_COL:", ID_COL, "| TARGET:", SUB_TARGET_COL)
display(sub.head())

# ============================================================
# 1) ISPU — HISTORICAL TRAIN DATA
# ============================================================
ispu_files = sorted((DATA_ROOT / "ISPU").glob("*.csv"))
assert len(ispu_files) > 0, "Folder ISPU kosong"

frames = []
for p in ispu_files:
    df = standardize_columns(read_csv_safe(p))
    df["source_file"] = p.name
    frames.append(df)

df_ispu_all = pd.concat(frames, ignore_index=True)

df_ispu_all["tanggal"] = parse_date(df_ispu_all["tanggal"])
df_ispu_all["stasiun"] = df_ispu_all["stasiun"].astype(str).str.strip()

df_ispu_all["stasiun_code"] = (
    df_ispu_all["stasiun"]
    .str.upper()
    .str.extract(r"(DKI\s*\d+)", expand=False)
    .str.replace(" ", "", regex=False)
)

df_ispu_all = to_numeric(
    df_ispu_all,
    ["pm10", "pm25", "so2", "co", "o3", "no2"]
)

df_ispu_all = df_ispu_all.dropna(subset=["tanggal", "stasiun_code"])

# ============================================================
# 2) LABEL FILTERING → 3 KELAS SUBMISSION
# ============================================================
df_ispu_all["kategori"] = (
    df_ispu_all["kategori"]
    .astype(str)
    .str.strip()
    .str.upper()
)

DROP_LABELS = {
    "TIDAK ADA DATA", "NAN",
    "O3", "NO2", "SO2", "CO", "PM10", "PM25"
}

df_ispu_all = df_ispu_all[~df_ispu_all["kategori"].isin(DROP_LABELS)]

LABEL_MAP = {
    "BAIK": "BAIK",
    "SEDANG": "SEDANG",
    "TIDAK SEHAT": "TIDAK SEHAT",
    "SANGAT TIDAK SEHAT": "TIDAK SEHAT",
    "BERBAHAYA": "TIDAK SEHAT",
}

df_ispu_all["kategori"] = df_ispu_all["kategori"].map(LABEL_MAP)
df_ispu_all = df_ispu_all.dropna(subset=["kategori"])

print("\nDistribusi label (TRAIN FINAL):")
print(df_ispu_all["kategori"].value_counts())

df_train = (
    df_ispu_all
    .sort_values(["tanggal", "stasiun_code"])
    .reset_index(drop=True)
)

# ============================================================
# 3) NDVI (HISTORICAL)
# ============================================================
df_ndvi = standardize_columns(
    read_csv_safe(DATA_ROOT / "NDVI (vegetation index)" / "indeks-ndvi-jakarta.csv")
)

df_ndvi["tanggal"] = parse_date(df_ndvi["tanggal"])
df_ndvi["stasiun"] = df_ndvi["stasiun"].astype(str).str.upper().str.replace(" ", "", regex=False)
df_ndvi["stasiun_code"] = df_ndvi["stasiun"].str.extract(r"(DKI\d+)", expand=False)
df_ndvi = to_numeric(df_ndvi, ["ndvi"])
df_ndvi = df_ndvi.dropna(subset=["tanggal", "stasiun_code"])

# ============================================================
# 4) HOLIDAYS (KNOWN FUTURE → ALLOWED)
# ============================================================
df_holiday = standardize_columns(
    read_csv_safe(DATA_ROOT / "libur-nasional" / "dataset-libur-nasional-dan-weekend.csv")
)

df_holiday["tanggal"] = parse_date(df_holiday["tanggal"])

for c in ["is_holiday_nasional", "is_weekend"]:
    if c in df_holiday.columns:
        df_holiday[c] = pd.to_numeric(df_holiday[c], errors="coerce").fillna(0).astype(int)

df_holiday = df_holiday.dropna(subset=["tanggal"]).drop_duplicates(["tanggal"])

# ============================================================
# 5) WEATHER (HISTORICAL, DEFENSIVE)
# ============================================================
weather_files = sorted((DATA_ROOT / "cuaca-harian").glob("*.csv"))
wx_frames = []

for p in weather_files:
    w = standardize_columns(read_csv_safe(p))
    date_col = find_date_column(w)
    if date_col is None:
        print(f"[SKIP WEATHER] {p.name} — no date column")
        continue

    w["tanggal"] = parse_date(w[date_col])
    w = w.dropna(subset=["tanggal"])

    tag = p.stem.lower()
    w["weather_station"] = tag
    w["weather_code"] = (
        pd.Series([tag] * len(w))
        .str.extract(r"(dki\d)", expand=False)
        .str.upper()
    )

    wx_frames.append(w)

if len(wx_frames) == 0:
    raise RuntimeError("Tidak ada data cuaca valid.")

df_weather = pd.concat(wx_frames, ignore_index=True)

# ============================================================
# 6) POPULATION (STATIC)
# ============================================================
df_pop = standardize_columns(
    read_csv_safe(
        DATA_ROOT / "jumlah-penduduk" /
        "data-jumlah-penduduk-provinsi-dki-jakarta-berdasarkan-kelompok-usia-dan-jenis-kelamin-tahun-2013-2021-komponen-data.csv"
    )
)

# ============================================================
# 7) RIVER QUALITY (HISTORICAL)
# ============================================================
df_river = standardize_columns(
    read_csv_safe(
        DATA_ROOT / "kualitas-air-sungai" /
        "data-kualitas-air-sungai-komponen-data.csv"
    )
)

# ============================================================
# FINAL SANITY
# ============================================================
print("\n=== STAGE 1 SELESAI ===")
print("df_train:", df_train.shape)
print("Label unik:", sorted(df_train["kategori"].unique().tolist()))
print("Forecast horizon (submission rows):", len(sub))

display(df_train.head(3))


sample_submission: (455, 2)
ID_COL: id | TARGET: kategori


Unnamed: 0,id,kategori
0,2025-09-01_DKI1,
1,2025-09-01_DKI2,
2,2025-09-01_DKI3,
3,2025-09-01_DKI4,
4,2025-09-01_DKI5,


  d1 = pd.to_datetime(s, errors="coerce", dayfirst=True)
  d2 = pd.to_datetime(s[m], errors="coerce", dayfirst=False)
  d1 = pd.to_datetime(s, errors="coerce", dayfirst=True)



Distribusi label (TRAIN FINAL):
kategori
SEDANG         6652
BAIK           1858
TIDAK SEHAT    1004
Name: count, dtype: int64

=== STAGE 1 SELESAI ===
df_train: (9514, 22)
Label unik: ['BAIK', 'SEDANG', 'TIDAK SEHAT']
Forecast horizon (submission rows): 455


Unnamed: 0,periode_data,tanggal,stasiun,pm_sepuluh,pm_duakomalima,sulfur_dioksida,karbon_monoksida,ozon,nitrogen_dioksida,max,parameter_pencemar_kritis,kategori,source_file,bulan,pm10,so2,co,o3,no2,lokasi_spku,pm25,stasiun_code
0,201001,2010-01-01,DKI1 (Bunderan HI),,,,,,,73,CO,SEDANG,indeks-standar-pencemaran-udara-(ispu)-tahun-2...,,60.0,4.0,73.0,27.0,14.0,,,DKI1
1,201001,2010-01-02,DKI1 (Bunderan HI),,,,,,,33,O3,BAIK,indeks-standar-pencemaran-udara-(ispu)-tahun-2...,,32.0,2.0,16.0,33.0,9.0,,,DKI1
2,201001,2010-01-03,DKI1 (Bunderan HI),,,,,,,27,PM10,BAIK,indeks-standar-pencemaran-udara-(ispu)-tahun-2...,,27.0,2.0,19.0,20.0,9.0,,,DKI1


# Master Table Building (Correct Joins)

In [3]:
# ============================================================
# STAGE 2 — Master Table Building (FORECASTING-SAFE) (ONE CELL)
#
# INPUT (WAJIB dari Stage 1):
# - sub, ID_COL, SUB_TARGET_COL
# - df_train
# - df_ndvi, df_holiday, df_weather, df_pop, df_river
#
# OUTPUT GLOBAL:
# - df_train_master   (train table lengkap hasil join)
# - df_test_master    (forecast table: dibangun dari ID submission -> tanggal+stasiun_code)
#
# CATATAN (FAQ COMPLIANT):
# - Tidak ada fitur test mentah dari panitia (memang forecasting).
# - df_test_master dibuat dari horizon waktu pada sample_submission (ID).
# - Join eksternal yang "future-actual" tidak dipaksa ada; jika future tidak tersedia -> NaN (AMAN).
# ============================================================

import re
import numpy as np
import pandas as pd

# ----------------------------
# Guards
# ----------------------------
need = ["sub","ID_COL","SUB_TARGET_COL","df_train","df_ndvi","df_holiday","df_weather","df_pop","df_river"]
miss = [k for k in need if k not in globals() or globals()[k] is None]
if miss:
    raise RuntimeError(f"Missing globals from Stage 1: {miss}. Jalankan Stage 1 dulu.")

# ----------------------------
# Helpers
# ----------------------------
def _ensure_datetime(df: pd.DataFrame, col="tanggal") -> pd.DataFrame:
    df = df.copy()
    if col in df.columns:
        if not np.issubdtype(df[col].dtype, np.datetime64):
            df[col] = pd.to_datetime(df[col], errors="coerce")
    return df

def _mk_stasiun_code(s: pd.Series) -> pd.Series:
    st = s.astype(str).str.strip().str.upper()
    code = st.str.extract(r"(DKI\s*\d+)", expand=False).str.replace(" ", "", regex=False)
    code = code.str.replace("DKI0", "DKI", regex=False)
    return code

def _parse_id_to_date_station(id_series: pd.Series):
    """
    Expected ID patterns (most common):
      - YYYY-MM-DD_DKI1
      - YYYY/MM/DD_DKI1
      - YYYY-MM-DD-DKI1
      - 2025-09-01_DKI1 (from your example)
    Returns: (tanggal(datetime64), stasiun_code(str))
    """
    s = id_series.astype(str).str.strip()

    # split by underscore first if exists
    has_us = s.str.contains("_", regex=False)

    date_part = pd.Series(np.where(has_us, s.str.split("_").str[0], s), index=s.index)
    st_part   = pd.Series(np.where(has_us, s.str.split("_").str[1], np.nan), index=s.index)

    # if station not found via underscore, try regex capture DKI\d anywhere
    st_part2 = s.str.extract(r"(DKI\s*\d+)", expand=False).str.replace(" ", "", regex=False).str.upper()
    st_part = st_part.fillna(st_part2)

    # date parse (try replace / -> -)
    date_part = date_part.str.replace("/", "-", regex=False)
    tanggal = pd.to_datetime(date_part, errors="coerce")

    return tanggal, st_part

def _prefix_numeric_cols(df: pd.DataFrame, prefix: str, keep_cols: set) -> pd.DataFrame:
    df = df.copy()
    ren = {}
    for c in df.columns:
        if c in keep_cols:
            continue
        if pd.api.types.is_numeric_dtype(df[c]):
            ren[c] = f"{prefix}{c}"
    return df.rename(columns=ren)

# ----------------------------
# Build BASE train (from historical df_train)
# ----------------------------
train_base = df_train.copy()
train_base = _ensure_datetime(train_base, "tanggal")

# ensure station code exists
if "stasiun_code" not in train_base.columns:
    if "stasiun" in train_base.columns:
        train_base["stasiun_code"] = _mk_stasiun_code(train_base["stasiun"])
    else:
        train_base["stasiun_code"] = np.nan

train_base = train_base.dropna(subset=["tanggal", "stasiun_code"]).copy()
train_base = train_base.sort_values(["tanggal","stasiun_code"]).reset_index(drop=True)

# ----------------------------
# Build BASE test (forecast horizon from submission IDs)
# ----------------------------
test_base = sub[[ID_COL]].copy()
tanggal, st_code = _parse_id_to_date_station(test_base[ID_COL])

test_base["tanggal"] = tanggal
test_base["stasiun_code"] = st_code

# If stasiun human-readable not available, create placeholder
test_base["stasiun"] = test_base["stasiun_code"]

# sanity
if test_base["tanggal"].isna().any():
    bad = test_base.loc[test_base["tanggal"].isna(), ID_COL].head(10).tolist()
    raise RuntimeError(
        "Gagal parse tanggal dari ID submission. Contoh ID bermasalah:\n"
        + "\n".join(map(str, bad))
        + "\nPastikan format ID mengandung tanggal (YYYY-MM-DD) dan stasiun (DKI#)."
    )
if test_base["stasiun_code"].isna().any():
    bad = test_base.loc[test_base["stasiun_code"].isna(), ID_COL].head(10).tolist()
    raise RuntimeError(
        "Gagal parse stasiun_code dari ID submission. Contoh ID bermasalah:\n"
        + "\n".join(map(str, bad))
        + "\nPastikan ID mengandung DKI1..DKI5."
    )

test_base = test_base.sort_values(["tanggal","stasiun_code"]).reset_index(drop=True)

# ----------------------------
# Common calendar fields (safe)
# ----------------------------
def add_calendar(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df["year"] = df["tanggal"].dt.year.astype("Int64")
    df["month"] = df["tanggal"].dt.month.astype("Int64")
    df["day"] = df["tanggal"].dt.day.astype("Int64")
    df["dow"] = df["tanggal"].dt.dayofweek.astype("Int64")
    df["dayofyear"] = df["tanggal"].dt.dayofyear.astype("Int64")
    return df

train_base = add_calendar(train_base)
test_base  = add_calendar(test_base)

# ----------------------------
# Join HOLIDAY by date (allowed future)
# ----------------------------
hol = df_holiday.copy()
hol = _ensure_datetime(hol, "tanggal")
hol = hol.dropna(subset=["tanggal"]).drop_duplicates(["tanggal"]).copy()

# keep only relevant columns
hol_keep = [c for c in ["tanggal","is_holiday_nasional","is_weekend","nama_libur","day_name"] if c in hol.columns]
hol = hol[hol_keep].copy()

train_m = train_base.merge(hol, on="tanggal", how="left")
test_m  = test_base.merge(hol,  on="tanggal", how="left")

# ----------------------------
# Join NDVI by date + stasiun_code (future likely NaN -> OK)
# ----------------------------
nd = df_ndvi.copy()
nd = _ensure_datetime(nd, "tanggal")
if "stasiun_code" not in nd.columns:
    if "stasiun" in nd.columns:
        nd["stasiun_code"] = _mk_stasiun_code(nd["stasiun"])
nd = nd.dropna(subset=["tanggal","stasiun_code"]).drop_duplicates(["tanggal","stasiun_code"])

# keep numeric NDVI only + keys
nd_cols = ["tanggal","stasiun_code"] + [c for c in nd.columns if c not in ["tanggal","stasiun","stasiun_code"]]
nd = nd[nd_cols].copy()

train_m = train_m.merge(nd, on=["tanggal","stasiun_code"], how="left")
test_m  = test_m.merge(nd,  on=["tanggal","stasiun_code"], how="left")

# ----------------------------
# Join WEATHER:
# - local station: df_weather weather_code ~= DKI#
# - global fallback by date (mean numeric)
# Future dates mostly NaN -> OK (no leak)
# ----------------------------
wx = df_weather.copy()
wx = _ensure_datetime(wx, "tanggal")

if "weather_code" in wx.columns:
    wx["weather_code"] = wx["weather_code"].astype(str).str.strip().str.upper()
else:
    wx["weather_code"] = np.nan

# Local station weather: rename weather_code -> stasiun_code
wx_loc = wx.dropna(subset=["tanggal","weather_code"]).copy()
wx_loc = wx_loc.rename(columns={"weather_code":"stasiun_code"})
# keep numeric + keys
loc_keys = {"tanggal","stasiun_code","weather_station"}
wx_loc_cols = [c for c in wx_loc.columns if c in loc_keys or pd.api.types.is_numeric_dtype(wx_loc[c])]
wx_loc = wx_loc[wx_loc_cols].drop_duplicates(["tanggal","stasiun_code"])

# prefix numeric columns to avoid clash
wx_loc = _prefix_numeric_cols(wx_loc, "wx_", keep_cols={"tanggal","stasiun_code"})

train_m = train_m.merge(wx_loc, on=["tanggal","stasiun_code"], how="left")
test_m  = test_m.merge(wx_loc,  on=["tanggal","stasiun_code"], how="left")

# Global weather by date mean numeric
wx_g = wx.dropna(subset=["tanggal"]).copy()
wx_g_num = [c for c in wx_g.columns if pd.api.types.is_numeric_dtype(wx_g[c])]
if len(wx_g_num) > 0:
    wx_g = wx_g.groupby("tanggal", as_index=False)[wx_g_num].mean(numeric_only=True)
    wx_g = _prefix_numeric_cols(wx_g, "wxg_", keep_cols={"tanggal"})
    train_m = train_m.merge(wx_g, on="tanggal", how="left")
    test_m  = test_m.merge(wx_g,  on="tanggal", how="left")

    # Fill local wx_ NaNs with global wxg_ if both exist
    for c in [c for c in train_m.columns if c.startswith("wx_")]:
        base = c.replace("wx_", "")
        cg = "wxg_" + base
        if cg in train_m.columns:
            train_m[c] = train_m[c].fillna(train_m[cg])
            test_m[c]  = test_m[c].fillna(test_m[cg])

# ----------------------------
# Population: aggregate by year (static; future year may NaN)
# ----------------------------
pop = df_pop.copy()
if "tahun" in pop.columns and "jumlah_penduduk" in pop.columns:
    pop["tahun"] = pd.to_numeric(pop["tahun"], errors="coerce")
    pop["jumlah_penduduk"] = pd.to_numeric(pop["jumlah_penduduk"], errors="coerce")
    pop_y = pop.groupby("tahun", as_index=False)["jumlah_penduduk"].sum()
    pop_y = pop_y.rename(columns={"tahun":"year", "jumlah_penduduk":"pop_total_year"})
    train_m = train_m.merge(pop_y, on="year", how="left")
    test_m  = test_m.merge(pop_y,  on="year", how="left")

# ----------------------------
# River quality: aggregate by (year, month) (historical; future month likely NaN)
# ----------------------------
riv = df_river.copy()
for cc in ["periode_data","bulan_sampling","baku_mutu","hasil_pengukuran"]:
    if cc in riv.columns:
        riv[cc] = pd.to_numeric(riv[cc], errors="coerce")

if {"periode_data","bulan_sampling","baku_mutu","hasil_pengukuran"}.issubset(riv.columns):
    r = riv.dropna(subset=["periode_data","bulan_sampling","baku_mutu","hasil_pengukuran"]).copy()
    r["ratio_to_std"] = r["hasil_pengukuran"] / (r["baku_mutu"].replace(0, np.nan))
    r["exceed"] = (r["hasil_pengukuran"] > r["baku_mutu"]).astype(int)

    r_agg = r.groupby(["periode_data","bulan_sampling"], as_index=False).agg(
        river_exceed_rate=("exceed","mean"),
        river_ratio_mean=("ratio_to_std","mean"),
        river_n=("exceed","size"),
    )
    r_agg = r_agg.rename(columns={"periode_data":"year", "bulan_sampling":"month"})

    train_m = train_m.merge(r_agg, on=["year","month"], how="left")
    test_m  = test_m.merge(r_agg,  on=["year","month"], how="left")

# ----------------------------
# Final cleanup for CatBoost friendliness
# ----------------------------
for c in ["stasiun","stasiun_code","parameter_pencemar_kritis","nama_libur","day_name","weather_station"]:
    if c in train_m.columns:
        train_m[c] = train_m[c].astype("object")
    if c in test_m.columns:
        test_m[c] = test_m[c].astype("object")

# ensure test has the same key columns + id
if ID_COL not in test_m.columns:
    test_m[ID_COL] = test_base[ID_COL].values

# ----------------------------
# Export globals
# ----------------------------
df_train_master = train_m.sort_values(["tanggal","stasiun_code"]).reset_index(drop=True)
df_test_master  = test_m.sort_values(["tanggal","stasiun_code"]).reset_index(drop=True)

print("df_train_master:", df_train_master.shape)
print("df_test_master :", df_test_master.shape)

print("\nTrain date range:", df_train_master["tanggal"].min(), "->", df_train_master["tanggal"].max())
print("Test  date range:", df_test_master["tanggal"].min(),  "->", df_test_master["tanggal"].max())

# Sanity: submission ids all present
assert df_test_master[ID_COL].nunique() == len(sub), "Mismatch count test vs submission"

display(df_train_master.head(3))
display(df_test_master.head(3))

df_train_master: (9514, 83)
df_test_master : (455, 65)

Train date range: 2010-01-01 00:00:00 -> 2023-11-30 00:00:00
Test  date range: 2025-09-01 00:00:00 -> 2025-11-30 00:00:00


Unnamed: 0,periode_data,tanggal,stasiun,pm_sepuluh,pm_duakomalima,sulfur_dioksida,karbon_monoksida,ozon,nitrogen_dioksida,max,parameter_pencemar_kritis,kategori,source_file,bulan,pm10,so2,co,o3,no2,lokasi_spku,pm25,stasiun_code,year,month,day,dow,dayofyear,is_holiday_nasional,is_weekend,nama_libur,day_name,ndvi,wx_temperature_2m_max_c,wx_temperature_2m_min_c,wx_precipitation_sum_mm,wx_precipitation_hours_h,wx_wind_speed_10m_max_km_h,wx_wind_direction_10m_dominant,wx_shortwave_radiation_sum_mj_m²,wx_temperature_2m_mean_c,wx_relative_humidity_2m_mean,wx_cloud_cover_mean,wx_surface_pressure_mean_hpa,wx_wind_gusts_10m_max_km_h,wx_winddirection_10m_dominant,wx_relative_humidity_2m_max,wx_relative_humidity_2m_min,wx_cloud_cover_max,wx_cloud_cover_min,wx_wind_gusts_10m_mean_km_h,wx_wind_speed_10m_mean_km_h,wx_wind_gusts_10m_min_km_h,wx_wind_speed_10m_min_km_h,wx_surface_pressure_max_hpa,wx_surface_pressure_min_hpa,weather_station,wxg_temperature_2m_max_c,wxg_temperature_2m_min_c,wxg_precipitation_sum_mm,wxg_precipitation_hours_h,wxg_wind_speed_10m_max_km_h,wxg_wind_direction_10m_dominant,wxg_shortwave_radiation_sum_mj_m²,wxg_temperature_2m_mean_c,wxg_relative_humidity_2m_mean,wxg_cloud_cover_mean,wxg_surface_pressure_mean_hpa,wxg_wind_gusts_10m_max_km_h,wxg_winddirection_10m_dominant,wxg_relative_humidity_2m_max,wxg_relative_humidity_2m_min,wxg_cloud_cover_max,wxg_cloud_cover_min,wxg_wind_gusts_10m_mean_km_h,wxg_wind_speed_10m_mean_km_h,wxg_wind_gusts_10m_min_km_h,wxg_wind_speed_10m_min_km_h,wxg_surface_pressure_max_hpa,wxg_surface_pressure_min_hpa,pop_total_year,river_exceed_rate,river_ratio_mean,river_n
0,201001,2010-01-01,DKI1 (Bunderan HI),,,,,,,73,CO,SEDANG,indeks-standar-pencemaran-udara-(ispu)-tahun-2...,,60.0,4.0,73.0,27.0,14.0,,,DKI1,2010,1,1,4,1,1,0,New Year's Day,Friday,0.2023,29.4,24.4,4.0,14.0,16.0,246,16.24,26.6,81,100,1007.5,38.2,246,90,69,100,99,21.0,10.5,11.9,6.9,1009.3,1005.1,cuaca-harian-dki1-bundaranhi,29.58,24.24,4.48,12.4,16.2,249.6,16.484,26.48,82.2,100.0,1004.98,38.2,249.6,90.8,69.6,100.0,99.0,21.04,10.54,11.9,7.1,1006.86,1002.7,,,,
1,201001,2010-01-02,DKI1 (Bunderan HI),,,,,,,33,O3,BAIK,indeks-standar-pencemaran-udara-(ispu)-tahun-2...,,32.0,2.0,16.0,33.0,9.0,,,DKI1,2010,1,2,5,2,0,0,,Monday,,28.9,24.2,6.9,14.0,9.5,260,13.01,26.2,85,99,1010.1,22.0,260,95,72,100,94,13.7,6.0,8.6,2.3,1011.9,1007.4,cuaca-harian-dki1-bundaranhi,29.02,23.88,7.14,14.4,9.5,264.8,12.666,26.08,85.8,99.0,1007.68,23.28,264.8,95.6,72.0,100.0,95.2,13.9,6.08,8.76,1.94,1009.44,1005.04,,,,
2,201001,2010-01-03,DKI1 (Bunderan HI),,,,,,,27,PM10,BAIK,indeks-standar-pencemaran-udara-(ispu)-tahun-2...,,27.0,2.0,19.0,20.0,9.0,,,DKI1,2010,1,3,6,3,0,0,,Monday,,31.4,24.9,11.2,6.0,9.4,224,23.89,27.1,85,93,1009.9,21.2,224,95,70,100,28,15.7,5.7,8.3,1.6,1012.2,1007.0,cuaca-harian-dki1-bundaranhi,31.46,24.34,12.16,6.8,9.12,218.0,23.986,26.98,85.6,93.8,1007.48,22.24,218.0,96.0,70.4,100.0,40.4,15.5,5.58,8.42,1.68,1009.76,1004.78,,,,


Unnamed: 0,id,tanggal,stasiun_code,stasiun,year,month,day,dow,dayofyear,is_holiday_nasional,is_weekend,nama_libur,day_name,ndvi,wx_temperature_2m_max_c,wx_temperature_2m_min_c,wx_precipitation_sum_mm,wx_precipitation_hours_h,wx_wind_speed_10m_max_km_h,wx_wind_direction_10m_dominant,wx_shortwave_radiation_sum_mj_m²,wx_temperature_2m_mean_c,wx_relative_humidity_2m_mean,wx_cloud_cover_mean,wx_surface_pressure_mean_hpa,wx_wind_gusts_10m_max_km_h,wx_winddirection_10m_dominant,wx_relative_humidity_2m_max,wx_relative_humidity_2m_min,wx_cloud_cover_max,wx_cloud_cover_min,wx_wind_gusts_10m_mean_km_h,wx_wind_speed_10m_mean_km_h,wx_wind_gusts_10m_min_km_h,wx_wind_speed_10m_min_km_h,wx_surface_pressure_max_hpa,wx_surface_pressure_min_hpa,weather_station,wxg_temperature_2m_max_c,wxg_temperature_2m_min_c,wxg_precipitation_sum_mm,wxg_precipitation_hours_h,wxg_wind_speed_10m_max_km_h,wxg_wind_direction_10m_dominant,wxg_shortwave_radiation_sum_mj_m²,wxg_temperature_2m_mean_c,wxg_relative_humidity_2m_mean,wxg_cloud_cover_mean,wxg_surface_pressure_mean_hpa,wxg_wind_gusts_10m_max_km_h,wxg_winddirection_10m_dominant,wxg_relative_humidity_2m_max,wxg_relative_humidity_2m_min,wxg_cloud_cover_max,wxg_cloud_cover_min,wxg_wind_gusts_10m_mean_km_h,wxg_wind_speed_10m_mean_km_h,wxg_wind_gusts_10m_min_km_h,wxg_wind_speed_10m_min_km_h,wxg_surface_pressure_max_hpa,wxg_surface_pressure_min_hpa,pop_total_year,river_exceed_rate,river_ratio_mean,river_n
0,2025-09-01_DKI1,2025-09-01,DKI1,DKI1,2025,9,1,0,244,0,0,,Thursday,,27.8,23.9,30.2,21.0,7.6,258.0,7.8,25.7,89.0,100.0,1009.9,20.5,258.0,98.0,78.0,100.0,100.0,12.4,4.4,6.1,2.3,1012.1,1007.9,cuaca-harian-dki1-bundaranhi,27.54,23.7,28.46,20.8,8.74,265.2,7.694,25.42,90.2,99.8,1007.52,22.38,265.2,98.0,78.4,100.0,95.2,13.26,4.96,6.64,2.3,1009.7,1005.6,,,,
1,2025-09-01_DKI2,2025-09-01,DKI2,DKI2,2025,9,1,0,244,0,0,,Thursday,,27.6,24.1,28.1,22.0,11.1,268.0,7.85,25.7,90.0,100.0,1009.6,26.6,268.0,96.0,79.0,100.0,100.0,16.7,6.1,8.3,2.3,1011.9,1007.7,cuaca-harian-dki2-kelapagading,27.54,23.7,28.46,20.8,8.74,265.2,7.694,25.42,90.2,99.8,1007.52,22.38,265.2,98.0,78.4,100.0,95.2,13.26,4.96,6.64,2.3,1009.7,1005.6,,,,
2,2025-09-01_DKI3,2025-09-01,DKI3,DKI3,2025,9,1,0,244,0,0,,Thursday,,27.0,23.2,29.5,21.0,8.6,269.0,7.4,25.0,91.0,100.0,1002.5,22.3,269.0,98.0,79.0,100.0,98.0,13.0,5.2,7.6,2.6,1004.6,1000.7,cuaca-harian-dki3-jagakarsa,27.54,23.7,28.46,20.8,8.74,265.2,7.694,25.42,90.2,99.8,1007.52,22.38,265.2,98.0,78.4,100.0,95.2,13.26,4.96,6.64,2.3,1009.7,1005.6,,,,


# Feature Engineering (Time-Series + Calendar + Robustness)

In [9]:
# ============================================================
# STAGE 3 — Feature Engineering (FORECASTING-SAFE) — FINAL
# - Train + Test master digabung (agar test dapat history dari train)
# - Semua base pollutant dipaksa NUMERIC (fix str-int)
# - Lag & rolling dihitung dari SHIFT(1) => NO LEAK
# - Feature ditambahkan via pd.concat sekali (anti-fragmentation)
#
# REQUIRE:
# - df_train_master
# - df_test_master
#
# OUTPUT:
# - df_train_fe
# - df_test_fe
# ============================================================

import numpy as np
import pandas as pd

need = ["df_train_master", "df_test_master"]
miss = [k for k in need if k not in globals() or globals()[k] is None]
if miss:
    raise RuntimeError(f"Missing required objects: {miss}. Jalankan Stage 2 dulu.")

df_tr = df_train_master.copy()
df_te = df_test_master.copy()

df_tr["_is_train"] = 1
df_te["_is_train"] = 0

df_all = pd.concat([df_tr, df_te], ignore_index=True, sort=False)

# --- guards minimal
for col in ["tanggal", "stasiun_code"]:
    if col not in df_all.columns:
        raise RuntimeError(f"Missing '{col}' in master. Pastikan Stage 2 membuat {col}.")

# --- ensure datetime
if not np.issubdtype(df_all["tanggal"].dtype, np.datetime64):
    df_all["tanggal"] = pd.to_datetime(df_all["tanggal"], errors="coerce")
if df_all["tanggal"].isna().any():
    bad = df_all[df_all["tanggal"].isna()].head(5)
    raise RuntimeError(f"Found NaT in tanggal. Contoh:\n{bad}")

df_all = df_all.sort_values(["stasiun_code", "tanggal"]).reset_index(drop=True)

# ============================================================
# 1) Force numeric for base pollutant columns (fix str-int)
# ============================================================
BASE_COLS = ["pm10","pm25","so2","co","o3","no2","max"]
for c in BASE_COLS:
    if c in df_all.columns:
        df_all[c] = pd.to_numeric(df_all[c], errors="coerce")

base_cols_exist = [c for c in BASE_COLS if c in df_all.columns]
if not base_cols_exist:
    raise RuntimeError("No base pollutant columns found in df_all.")

# ============================================================
# 2) Time features
# ============================================================
df_all["year"] = df_all["tanggal"].dt.year.astype("Int64")
df_all["month"] = df_all["tanggal"].dt.month.astype("Int64")
df_all["day"] = df_all["tanggal"].dt.day.astype("Int64")
df_all["dow"] = df_all["tanggal"].dt.dayofweek.astype("Int64")
df_all["dayofyear"] = df_all["tanggal"].dt.dayofyear.astype("Int64")

doy = df_all["dayofyear"].astype(float)
df_all["doy_sin"] = np.sin(2 * np.pi * doy / 365.25)
df_all["doy_cos"] = np.cos(2 * np.pi * doy / 365.25)

mon = df_all["month"].astype(float)
df_all["mon_sin"] = np.sin(2 * np.pi * mon / 12.0)
df_all["mon_cos"] = np.cos(2 * np.pi * mon / 12.0)

if "is_weekend" not in df_all.columns:
    df_all["is_weekend"] = df_all["dow"].isin([5, 6]).astype(int)

# ============================================================
# 3) Lag + Rolling (past-only) via dict -> concat once
# ============================================================
g = df_all.groupby("stasiun_code", sort=False)

LAGS = (1,2,3,7,14)
WINDOWS = (3,7,14,30)

feat = {}

for c in base_cols_exist:
    # lags
    for L in LAGS:
        feat[f"{c}_lag{L}"] = g[c].shift(L)

    # rolling on shifted (past only)
    s = g[c].shift(1)
    for w in WINDOWS:
        feat[f"{c}_rmean{w}"] = s.rolling(w, min_periods=max(2, w//3)).mean()
        feat[f"{c}_rstd{w}"]  = s.rolling(w, min_periods=max(2, w//3)).std()

    # deltas (safe numeric)
    feat[f"{c}_d12"] = feat[f"{c}_lag1"] - feat[f"{c}_lag2"]
    feat[f"{c}_d1_rm7"] = feat[f"{c}_lag1"] - feat[f"{c}_rmean7"]

df_all = pd.concat([df_all, pd.DataFrame(feat)], axis=1)

# ============================================================
# 4) Weather interactions (past-only, use lag1)
# ============================================================
def _pick_first(cols):
    for c in cols:
        if c in df_all.columns:
            return c
    return None

wind = _pick_first(["wx_wind_speed_10m_mean_km_h", "wxg_wind_speed_10m_mean_km_h"])
prec = _pick_first(["wx_precipitation_sum_mm", "wxg_precipitation_sum_mm"])
rad  = _pick_first(["wx_shortwave_radiation_sum_mj_m²", "wxg_shortwave_radiation_sum_mj_m²"])
rh   = _pick_first(["wx_relative_humidity_2m_mean", "wxg_relative_humidity_2m_mean"])
tmp  = _pick_first(["wx_temperature_2m_mean_c", "wxg_temperature_2m_mean_c"])

if "pm25_lag1" in df_all.columns and wind is not None:
    df_all["pm25_lag1_x_wind"] = df_all["pm25_lag1"] * pd.to_numeric(df_all[wind], errors="coerce")
if "pm10_lag1" in df_all.columns and wind is not None:
    df_all["pm10_lag1_x_wind"] = df_all["pm10_lag1"] * pd.to_numeric(df_all[wind], errors="coerce")
if "o3_lag1" in df_all.columns and rad is not None:
    df_all["o3_lag1_x_rad"] = df_all["o3_lag1"] * pd.to_numeric(df_all[rad], errors="coerce")
if "pm25_lag1" in df_all.columns and prec is not None:
    pnum = pd.to_numeric(df_all[prec], errors="coerce").fillna(0)
    df_all["pm25_lag1_div_prec"] = df_all["pm25_lag1"] / (pnum + 1.0)
if "co_lag1" in df_all.columns and rh is not None:
    df_all["co_lag1_x_rh"] = df_all["co_lag1"] * pd.to_numeric(df_all[rh], errors="coerce")
if "pm25_lag1" in df_all.columns and tmp is not None:
    df_all["pm25_lag1_x_temp"] = df_all["pm25_lag1"] * pd.to_numeric(df_all[tmp], errors="coerce")

# ============================================================
# 5) Finalize CatBoost types
# ============================================================
for c in ["stasiun", "stasiun_code", "parameter_pencemar_kritis",
          "day_name", "nama_libur", "weather_station"]:
    if c in df_all.columns:
        df_all[c] = df_all[c].astype("object")

for c in ["is_weekend", "is_holiday_nasional"]:
    if c in df_all.columns:
        df_all[c] = pd.to_numeric(df_all[c], errors="coerce").fillna(0).astype(int)

# ============================================================
# 6) Split back
# ============================================================
df_train_fe = df_all[df_all["_is_train"] == 1].drop(columns=["_is_train"]).reset_index(drop=True)
df_test_fe  = df_all[df_all["_is_train"] == 0].drop(columns=["_is_train"]).reset_index(drop=True)

# ============================================================
# 7) Sanity
# ============================================================
print("df_train_fe:", df_train_fe.shape)
print("df_test_fe :", df_test_fe.shape)
print("\nTrain date range:", df_train_fe["tanggal"].min(), "->", df_train_fe["tanggal"].max())
print("Test  date range:", df_test_fe["tanggal"].min(),  "->", df_test_fe["tanggal"].max())

lag_cols = [c for c in df_test_fe.columns if c.endswith("_lag1")]
if lag_cols:
    miss_rate = df_test_fe[lag_cols].isna().mean().mean() * 100
    print(f"\nAvg missing % lag features in TEST: {miss_rate:.2f}%")

display(df_train_fe.head(3))
display(df_test_fe.head(3))


df_train_fe: (9514, 199)
df_test_fe : (455, 199)

Train date range: 2010-01-01 00:00:00 -> 2023-11-30 00:00:00
Test  date range: 2025-09-01 00:00:00 -> 2025-11-30 00:00:00

Avg missing % lag features in TEST: 99.84%


Unnamed: 0,periode_data,tanggal,stasiun,pm_sepuluh,pm_duakomalima,sulfur_dioksida,karbon_monoksida,ozon,nitrogen_dioksida,max,parameter_pencemar_kritis,kategori,source_file,bulan,pm10,so2,co,o3,no2,lokasi_spku,pm25,stasiun_code,year,month,day,dow,dayofyear,is_holiday_nasional,is_weekend,nama_libur,day_name,ndvi,wx_temperature_2m_max_c,wx_temperature_2m_min_c,wx_precipitation_sum_mm,wx_precipitation_hours_h,wx_wind_speed_10m_max_km_h,wx_wind_direction_10m_dominant,wx_shortwave_radiation_sum_mj_m²,wx_temperature_2m_mean_c,wx_relative_humidity_2m_mean,wx_cloud_cover_mean,wx_surface_pressure_mean_hpa,wx_wind_gusts_10m_max_km_h,wx_winddirection_10m_dominant,wx_relative_humidity_2m_max,wx_relative_humidity_2m_min,wx_cloud_cover_max,wx_cloud_cover_min,wx_wind_gusts_10m_mean_km_h,wx_wind_speed_10m_mean_km_h,wx_wind_gusts_10m_min_km_h,wx_wind_speed_10m_min_km_h,wx_surface_pressure_max_hpa,wx_surface_pressure_min_hpa,weather_station,wxg_temperature_2m_max_c,wxg_temperature_2m_min_c,wxg_precipitation_sum_mm,wxg_precipitation_hours_h,wxg_wind_speed_10m_max_km_h,wxg_wind_direction_10m_dominant,wxg_shortwave_radiation_sum_mj_m²,wxg_temperature_2m_mean_c,wxg_relative_humidity_2m_mean,wxg_cloud_cover_mean,wxg_surface_pressure_mean_hpa,wxg_wind_gusts_10m_max_km_h,wxg_winddirection_10m_dominant,wxg_relative_humidity_2m_max,wxg_relative_humidity_2m_min,wxg_cloud_cover_max,wxg_cloud_cover_min,wxg_wind_gusts_10m_mean_km_h,wxg_wind_speed_10m_mean_km_h,wxg_wind_gusts_10m_min_km_h,wxg_wind_speed_10m_min_km_h,wxg_surface_pressure_max_hpa,wxg_surface_pressure_min_hpa,pop_total_year,river_exceed_rate,river_ratio_mean,river_n,id,doy_sin,doy_cos,mon_sin,mon_cos,pm10_lag1,pm10_lag2,pm10_lag3,pm10_lag7,pm10_lag14,pm10_rmean3,pm10_rstd3,pm10_rmean7,pm10_rstd7,pm10_rmean14,pm10_rstd14,pm10_rmean30,pm10_rstd30,pm10_d12,pm10_d1_rm7,pm25_lag1,pm25_lag2,pm25_lag3,pm25_lag7,pm25_lag14,pm25_rmean3,pm25_rstd3,pm25_rmean7,pm25_rstd7,pm25_rmean14,pm25_rstd14,pm25_rmean30,pm25_rstd30,pm25_d12,pm25_d1_rm7,so2_lag1,so2_lag2,so2_lag3,so2_lag7,so2_lag14,so2_rmean3,so2_rstd3,so2_rmean7,so2_rstd7,so2_rmean14,so2_rstd14,so2_rmean30,so2_rstd30,so2_d12,so2_d1_rm7,co_lag1,co_lag2,co_lag3,co_lag7,co_lag14,co_rmean3,co_rstd3,co_rmean7,co_rstd7,co_rmean14,co_rstd14,co_rmean30,co_rstd30,co_d12,co_d1_rm7,o3_lag1,o3_lag2,o3_lag3,o3_lag7,o3_lag14,o3_rmean3,o3_rstd3,o3_rmean7,o3_rstd7,o3_rmean14,o3_rstd14,o3_rmean30,o3_rstd30,o3_d12,o3_d1_rm7,no2_lag1,no2_lag2,no2_lag3,no2_lag7,no2_lag14,no2_rmean3,no2_rstd3,no2_rmean7,no2_rstd7,no2_rmean14,no2_rstd14,no2_rmean30,no2_rstd30,no2_d12,no2_d1_rm7,max_lag1,max_lag2,max_lag3,max_lag7,max_lag14,max_rmean3,max_rstd3,max_rmean7,max_rstd7,max_rmean14,max_rstd14,max_rmean30,max_rstd30,max_d12,max_d1_rm7,pm25_lag1_x_wind,pm10_lag1_x_wind,o3_lag1_x_rad,pm25_lag1_div_prec,co_lag1_x_rh,pm25_lag1_x_temp
0,201001.0,2010-01-01,DKI1 (Bunderan HI),,,,,,,73.0,CO,SEDANG,indeks-standar-pencemaran-udara-(ispu)-tahun-2...,,60.0,4.0,73.0,27.0,14.0,,,DKI1,2010,1,1,4,1,1,0,New Year's Day,Friday,0.2023,29.4,24.4,4.0,14.0,16.0,246.0,16.24,26.6,81.0,100.0,1007.5,38.2,246.0,90.0,69.0,100.0,99.0,21.0,10.5,11.9,6.9,1009.3,1005.1,cuaca-harian-dki1-bundaranhi,29.58,24.24,4.48,12.4,16.2,249.6,16.484,26.48,82.2,100.0,1004.98,38.2,249.6,90.8,69.6,100.0,99.0,21.04,10.54,11.9,7.1,1006.86,1002.7,,,,,,0.017202,0.999852,0.5,0.866025,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,201001.0,2010-01-02,DKI1 (Bunderan HI),,,,,,,33.0,O3,BAIK,indeks-standar-pencemaran-udara-(ispu)-tahun-2...,,32.0,2.0,16.0,33.0,9.0,,,DKI1,2010,1,2,5,2,0,0,,Monday,,28.9,24.2,6.9,14.0,9.5,260.0,13.01,26.2,85.0,99.0,1010.1,22.0,260.0,95.0,72.0,100.0,94.0,13.7,6.0,8.6,2.3,1011.9,1007.4,cuaca-harian-dki1-bundaranhi,29.02,23.88,7.14,14.4,9.5,264.8,12.666,26.08,85.8,99.0,1007.68,23.28,264.8,95.6,72.0,100.0,95.2,13.9,6.08,8.76,1.94,1009.44,1005.04,,,,,,0.034398,0.999408,0.5,0.866025,60.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.0,,,,,,,,,,,,,,,73.0,,,,,,,,,,,,,,,27.0,,,,,,,,,,,,,,,14.0,,,,,,,,,,,,,,,73.0,,,,,,,,,,,,,,,,360.0,351.27,,6205.0,
2,201001.0,2010-01-03,DKI1 (Bunderan HI),,,,,,,27.0,PM10,BAIK,indeks-standar-pencemaran-udara-(ispu)-tahun-2...,,27.0,2.0,19.0,20.0,9.0,,,DKI1,2010,1,3,6,3,0,0,,Monday,,31.4,24.9,11.2,6.0,9.4,224.0,23.89,27.1,85.0,93.0,1009.9,21.2,224.0,95.0,70.0,100.0,28.0,15.7,5.7,8.3,1.6,1012.2,1007.0,cuaca-harian-dki1-bundaranhi,31.46,24.34,12.16,6.8,9.12,218.0,23.986,26.98,85.6,93.8,1007.48,22.24,218.0,96.0,70.4,100.0,40.4,15.5,5.58,8.42,1.68,1009.76,1004.78,,,,,,0.051584,0.998669,0.5,0.866025,32.0,60.0,,,,46.0,19.79899,46.0,19.79899,,,,,-28.0,-14.0,,,,,,,,,,,,,,,,2.0,4.0,,,,3.0,1.414214,3.0,1.414214,,,,,-2.0,-1.0,16.0,73.0,,,,44.5,40.305087,44.5,40.305087,,,,,-57.0,-28.5,33.0,27.0,,,,30.0,4.242641,30.0,4.242641,,,,,6.0,3.0,9.0,14.0,,,,11.5,3.535534,11.5,3.535534,,,,,-5.0,-2.5,33.0,73.0,,,,53.0,28.284271,53.0,28.284271,,,,,-40.0,-20.0,,182.4,788.37,,1360.0,


Unnamed: 0,periode_data,tanggal,stasiun,pm_sepuluh,pm_duakomalima,sulfur_dioksida,karbon_monoksida,ozon,nitrogen_dioksida,max,parameter_pencemar_kritis,kategori,source_file,bulan,pm10,so2,co,o3,no2,lokasi_spku,pm25,stasiun_code,year,month,day,dow,dayofyear,is_holiday_nasional,is_weekend,nama_libur,day_name,ndvi,wx_temperature_2m_max_c,wx_temperature_2m_min_c,wx_precipitation_sum_mm,wx_precipitation_hours_h,wx_wind_speed_10m_max_km_h,wx_wind_direction_10m_dominant,wx_shortwave_radiation_sum_mj_m²,wx_temperature_2m_mean_c,wx_relative_humidity_2m_mean,wx_cloud_cover_mean,wx_surface_pressure_mean_hpa,wx_wind_gusts_10m_max_km_h,wx_winddirection_10m_dominant,wx_relative_humidity_2m_max,wx_relative_humidity_2m_min,wx_cloud_cover_max,wx_cloud_cover_min,wx_wind_gusts_10m_mean_km_h,wx_wind_speed_10m_mean_km_h,wx_wind_gusts_10m_min_km_h,wx_wind_speed_10m_min_km_h,wx_surface_pressure_max_hpa,wx_surface_pressure_min_hpa,weather_station,wxg_temperature_2m_max_c,wxg_temperature_2m_min_c,wxg_precipitation_sum_mm,wxg_precipitation_hours_h,wxg_wind_speed_10m_max_km_h,wxg_wind_direction_10m_dominant,wxg_shortwave_radiation_sum_mj_m²,wxg_temperature_2m_mean_c,wxg_relative_humidity_2m_mean,wxg_cloud_cover_mean,wxg_surface_pressure_mean_hpa,wxg_wind_gusts_10m_max_km_h,wxg_winddirection_10m_dominant,wxg_relative_humidity_2m_max,wxg_relative_humidity_2m_min,wxg_cloud_cover_max,wxg_cloud_cover_min,wxg_wind_gusts_10m_mean_km_h,wxg_wind_speed_10m_mean_km_h,wxg_wind_gusts_10m_min_km_h,wxg_wind_speed_10m_min_km_h,wxg_surface_pressure_max_hpa,wxg_surface_pressure_min_hpa,pop_total_year,river_exceed_rate,river_ratio_mean,river_n,id,doy_sin,doy_cos,mon_sin,mon_cos,pm10_lag1,pm10_lag2,pm10_lag3,pm10_lag7,pm10_lag14,pm10_rmean3,pm10_rstd3,pm10_rmean7,pm10_rstd7,pm10_rmean14,pm10_rstd14,pm10_rmean30,pm10_rstd30,pm10_d12,pm10_d1_rm7,pm25_lag1,pm25_lag2,pm25_lag3,pm25_lag7,pm25_lag14,pm25_rmean3,pm25_rstd3,pm25_rmean7,pm25_rstd7,pm25_rmean14,pm25_rstd14,pm25_rmean30,pm25_rstd30,pm25_d12,pm25_d1_rm7,so2_lag1,so2_lag2,so2_lag3,so2_lag7,so2_lag14,so2_rmean3,so2_rstd3,so2_rmean7,so2_rstd7,so2_rmean14,so2_rstd14,so2_rmean30,so2_rstd30,so2_d12,so2_d1_rm7,co_lag1,co_lag2,co_lag3,co_lag7,co_lag14,co_rmean3,co_rstd3,co_rmean7,co_rstd7,co_rmean14,co_rstd14,co_rmean30,co_rstd30,co_d12,co_d1_rm7,o3_lag1,o3_lag2,o3_lag3,o3_lag7,o3_lag14,o3_rmean3,o3_rstd3,o3_rmean7,o3_rstd7,o3_rmean14,o3_rstd14,o3_rmean30,o3_rstd30,o3_d12,o3_d1_rm7,no2_lag1,no2_lag2,no2_lag3,no2_lag7,no2_lag14,no2_rmean3,no2_rstd3,no2_rmean7,no2_rstd7,no2_rmean14,no2_rstd14,no2_rmean30,no2_rstd30,no2_d12,no2_d1_rm7,max_lag1,max_lag2,max_lag3,max_lag7,max_lag14,max_rmean3,max_rstd3,max_rmean7,max_rstd7,max_rmean14,max_rstd14,max_rmean30,max_rstd30,max_d12,max_d1_rm7,pm25_lag1_x_wind,pm10_lag1_x_wind,o3_lag1_x_rad,pm25_lag1_div_prec,co_lag1_x_rh,pm25_lag1_x_temp
0,,2025-09-01,DKI1,,,,,,,,,,,,,,,,,,,DKI1,2025,9,1,0,244,0,0,,Thursday,,27.8,23.9,30.2,21.0,7.6,258.0,7.8,25.7,89.0,100.0,1009.9,20.5,258.0,98.0,78.0,100.0,100.0,12.4,4.4,6.1,2.3,1012.1,1007.9,cuaca-harian-dki1-bundaranhi,27.54,23.7,28.46,20.8,8.74,265.2,7.694,25.42,90.2,99.8,1007.52,22.38,265.2,98.0,78.4,100.0,95.2,13.26,4.96,6.64,2.3,1009.7,1005.6,,,,,2025-09-01_DKI1,-0.870294,-0.492533,-1.0,-1.83697e-16,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,67.0,82.0,96.0,101.0,75.0,81.666667,14.502873,82.857143,11.992061,84.214286,14.305344,83.366667,12.43045,-15.0,-15.857143,,,,,,
1,,2025-09-02,DKI1,,,,,,,,,,,,,,,,,,,DKI1,2025,9,2,1,245,0,1,,Sunday,,29.3,22.8,16.0,18.0,11.1,292.0,18.36,26.1,85.0,93.0,1009.0,28.8,292.0,97.0,68.0,100.0,48.0,18.9,6.5,6.1,2.0,1010.9,1005.8,cuaca-harian-dki1-bundaranhi,29.04,22.68,12.66,17.8,12.1,291.8,18.178,25.86,85.4,94.6,1006.6,30.9,291.8,98.0,69.2,100.0,53.8,19.72,6.98,7.48,2.0,1008.54,1003.46,,,,,2025-09-02_DKI1,-0.878637,-0.477489,-1.0,-1.83697e-16,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,67.0,82.0,83.0,120.0,74.5,10.606602,79.833333,9.786044,84.923077,14.631368,83.275862,12.640345,,,,,,,,
2,,2025-09-03,DKI1,,,,,,,,,,,,,,,,,,,DKI1,2025,9,3,2,246,0,1,,Sunday,,28.2,23.7,15.1,18.0,8.4,62.0,12.24,25.9,88.0,100.0,1009.3,24.8,62.0,97.0,78.0,100.0,99.0,13.1,3.9,6.5,1.5,1011.0,1007.5,cuaca-harian-dki1-bundaranhi,28.32,23.46,11.88,17.4,8.62,76.4,11.958,25.66,89.2,99.8,1006.94,26.0,76.4,97.2,78.0,100.0,97.8,13.12,4.06,5.48,1.06,1008.62,1005.18,,,,,2025-09-03_DKI1,-0.886721,-0.462305,-1.0,-1.83697e-16,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,67.0,75.0,80.0,,,79.2,10.802777,82.0,10.600172,83.25,12.871516,,,,,,,,


# Model Training (Time-Based CV + CatBoost Optimization)

In [10]:
# ============================================================
# STAGE 4 — Model Training (Forecasting-supervised, Time-based CV) — FINAL
#
# Inti:
# - Kita ubah problem jadi supervised forecasting:
#   fitur di waktu t -> target label di waktu t+H (shift(-H))
# - Tidak ada merge ulang fitur (FIX MergeError)
# - Tambah fitur horizon_days sebagai input model
#
# REQUIRE:
# - df_train_fe (Stage 3)
#
# OUTPUT:
# - models, feature_cols, cat_cols
# - classes, class_to_id, id_to_class
# - oof_macro_f1
# - H_LIST
# ============================================================

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import f1_score, classification_report

if "df_train_fe" not in globals():
    raise RuntimeError("Missing df_train_fe. Jalankan Stage 3 dulu.")

SEED = 42
np.random.seed(SEED)

TARGET_COL = "kategori"
DATE_COL = "tanggal"
GROUP_COL = "stasiun_code"
MISSING_CAT = "__MISSING__"

FINAL_CLASSES = ["BAIK", "SEDANG", "TIDAK SEHAT"]

# Horizon set (aman & sederhana dulu)
H_LIST = [1, 3, 7, 14]

# ============================================================
# 1) Target cleaning
# ============================================================
df0 = df_train_fe.copy()
if TARGET_COL not in df0.columns:
    raise RuntimeError("Target kolom 'kategori' tidak ada di df_train_fe")

df0[TARGET_COL] = df0[TARGET_COL].astype(str).str.strip().str.upper()

MAP_TARGET = {
    "BAIK": "BAIK",
    "SEDANG": "SEDANG",
    "TIDAK SEHAT": "TIDAK SEHAT",
    "SANGAT TIDAK SEHAT": "TIDAK SEHAT",
    "BERBAHAYA": "TIDAK SEHAT",
}
df0["target_final"] = df0[TARGET_COL].map(MAP_TARGET)
df0 = df0.dropna(subset=["target_final"]).reset_index(drop=True)

# guard tanggal + group
for c in [DATE_COL, GROUP_COL]:
    if c not in df0.columns:
        raise RuntimeError(f"Missing {c} in df_train_fe")

df0 = df0.dropna(subset=[DATE_COL, GROUP_COL]).copy()
df0 = df0.sort_values([GROUP_COL, DATE_COL]).reset_index(drop=True)

print("Target distribution:")
print(df0["target_final"].value_counts().to_string())

# ============================================================
# 2) Build forecasting-supervised dataset (NO MERGE)
# ============================================================
g = df0.groupby(GROUP_COL, sort=False)

sup_blocks = []
for h in H_LIST:
    d = df0.copy()
    d["y_future"] = g["target_final"].shift(-h)
    d["horizon_days"] = h
    sup_blocks.append(d)

df_sup = pd.concat(sup_blocks, ignore_index=True)
df_sup = df_sup.dropna(subset=["y_future"]).reset_index(drop=True)

print("\nSupervised rows:", len(df_sup))
print("y_future distribution:")
print(df_sup["y_future"].value_counts().to_string())

# ============================================================
# 3) Feature selection
# ============================================================
drop_cols = {
    TARGET_COL, "target_final", "y_future",
    DATE_COL, "source_file", "periode_data",
}
if "id" in df_sup.columns:
    drop_cols.add("id")

# tambah horizon_days sebagai fitur
feature_cols = [c for c in df_sup.columns if c not in drop_cols]
if "horizon_days" not in feature_cols:
    feature_cols.append("horizon_days")

X = df_sup[feature_cols].copy()
y_str = df_sup["y_future"].copy()

# ============================================================
# 4) Auto categorical detection + sanitize
# ============================================================
is_num = X.apply(pd.api.types.is_numeric_dtype)
cat_cols = X.columns[~is_num].tolist()

for c in cat_cols:
    X[c] = X[c].where(X[c].notna(), MISSING_CAT).astype(str)
    X[c] = X[c].replace({"nan": MISSING_CAT, "None": MISSING_CAT, "": MISSING_CAT})

num_cols = X.columns[is_num].tolist()
X[num_cols] = X[num_cols].apply(pd.to_numeric, errors="coerce")

classes = FINAL_CLASSES
class_to_id = {c: i for i, c in enumerate(classes)}
id_to_class = {i: c for c, i in class_to_id.items()}
y = y_str.map(class_to_id).astype(int)

print("\nFeatures:", len(feature_cols), "| cat:", len(cat_cols))

# class weights
counts = y.value_counts().sort_index()
class_weights = [len(y) / (len(classes) * counts[i]) for i in range(len(classes))]

# ============================================================
# 5) Time-based CV on "anchor date" (tanggal t)
# ============================================================
# Validasi selalu masa depan (tahun terakhir)
df_sup["_year"] = df_sup[DATE_COL].dt.year
years = sorted(df_sup["_year"].unique().tolist())

# ambil sampai 4 fold terakhir
N_SPLITS = min(4, max(1, len(years)-1))
val_years = years[-N_SPLITS:]

folds = []
for vy in val_years:
    tr_idx = df_sup.index[df_sup["_year"] < vy].to_numpy()
    va_idx = df_sup.index[df_sup["_year"] == vy].to_numpy()
    if len(tr_idx) and len(va_idx):
        folds.append((tr_idx, va_idx))

if not folds:
    raise RuntimeError("Failed to build time folds. Cek distribusi tahun.")

print("\nFolds:")
for i,(tr,va) in enumerate(folds):
    print(
        f"fold{i}: train={len(tr)} [{df_sup.loc[tr,DATE_COL].min()}..{df_sup.loc[tr,DATE_COL].max()}] | "
        f"valid={len(va)} [{df_sup.loc[va,DATE_COL].min()}..{df_sup.loc[va,DATE_COL].max()}]"
    )

# ============================================================
# 6) Train CatBoost
# ============================================================
ITERATIONS = 6000
LR = 0.05
DEPTH = 8
L2 = 6.0
EARLY_STOP = 400

models = []
oof_pred = np.full(len(df_sup), -1, dtype=int)

for fi,(tr_idx,va_idx) in enumerate(folds):
    print(f"\n=== Fold {fi} ===")
    X_tr, y_tr = X.iloc[tr_idx], y.iloc[tr_idx]
    X_va, y_va = X.iloc[va_idx], y.iloc[va_idx]

    train_pool = Pool(X_tr, y_tr, cat_features=cat_cols)
    valid_pool = Pool(X_va, y_va, cat_features=cat_cols)

    model = CatBoostClassifier(
        loss_function="MultiClass",
        eval_metric="TotalF1",
        classes_count=len(classes),
        class_weights=class_weights,
        iterations=ITERATIONS,
        learning_rate=LR,
        depth=DEPTH,
        l2_leaf_reg=L2,
        random_seed=SEED,
        task_type="CPU",
        od_type="Iter",
        od_wait=EARLY_STOP,
        verbose=300
    )

    model.fit(train_pool, eval_set=valid_pool, use_best_model=True)

    p = model.predict_proba(X_va)
    pred = np.argmax(p, axis=1)
    oof_pred[va_idx] = pred

    f1 = f1_score(y_va, pred, average="macro")
    print(f"[fold {fi}] macroF1={f1:.5f} | best_iter={model.get_best_iteration()}")

    models.append(model)

# ============================================================
# 7) OOF summary
# ============================================================
m = oof_pred >= 0
oof_macro_f1 = f1_score(y[m], oof_pred[m], average="macro")

print("\n=== OOF RESULTS ===")
print("OOF macroF1:", round(oof_macro_f1, 6))
print("\nClassification report:")
print(classification_report(y[m], oof_pred[m], target_names=classes, digits=4))


Target distribution:
target_final
SEDANG         6652
BAIK           1858
TIDAK SEHAT    1004

Supervised rows: 37931
y_future distribution:
y_future
SEDANG         26557
BAIK            7373
TIDAK SEHAT     4001

Features: 195 | cat: 13

Folds:
fold0: train=16428 [2010-01-01 00:00:00..2016-12-31 00:00:00] | valid=7180 [2017-01-01 00:00:00..2017-12-31 00:00:00]
fold1: train=23608 [2010-01-01 00:00:00..2017-12-31 00:00:00] | valid=7232 [2021-01-01 00:00:00..2021-12-31 00:00:00]
fold2: train=30840 [2010-01-01 00:00:00..2021-12-31 00:00:00] | valid=620 [2022-12-01 00:00:00..2022-12-31 00:00:00]
fold3: train=31460 [2010-01-01 00:00:00..2022-12-31 00:00:00] | valid=6471 [2023-01-01 00:00:00..2023-11-29 00:00:00]

=== Fold 0 ===
0:	learn: 0.5962415	test: 0.5439669	best: 0.5439669 (0)	total: 232ms	remaining: 23m 14s
300:	learn: 0.7805393	test: 0.5646670	best: 0.6034764 (11)	total: 1m 9s	remaining: 21m 56s
Stopped by overfitting detector  (400 iterations wait)

bestTest = 0.6034764367
bestIter

# Inference, Ensembling, Submission & QA

In [11]:
# ============================================================
# STAGE 5 — Forecasting Inference, Ensembling, Submission & QA — FINAL
#
# Forecasting-safe:
# - Test tidak punya fitur
# - Kita buat "forecast test matrix" dari snapshot terakhir per stasiun
# - horizon_days dihitung dari ID date - last_train_date_global
#
# REQUIRE:
# - sub, ID_COL (Stage 1)
# - df_train_fe (Stage 3)
# - models, feature_cols, cat_cols, id_to_class, H_LIST (Stage 4)
# ============================================================

import json
import numpy as np
import pandas as pd
from catboost import Pool

need = ["sub","ID_COL","df_train_fe","models","feature_cols","cat_cols","id_to_class","H_LIST"]
miss = [k for k in need if k not in globals() or globals()[k] is None]
if miss:
    raise RuntimeError(f"Missing required objects: {miss}")

SUB_TARGET_COL = "category" if "category" in sub.columns else sub.columns[-1]
MISSING_CAT = "__MISSING__"

# ============================================================
# 1) Parse submission IDs => (date, stasiun_code)
# ============================================================
sub2 = sub[[ID_COL]].copy()
tmp = sub2[ID_COL].astype(str).str.split("_", n=1, expand=True)
sub2["tanggal_id"] = pd.to_datetime(tmp[0], errors="coerce")
sub2["stasiun_code"] = tmp[1].astype(str).str.upper().str.replace(" ", "", regex=False)

if sub2["tanggal_id"].isna().any():
    bad = sub2[sub2["tanggal_id"].isna()].head(5)
    raise RuntimeError(f"Failed parse tanggal from ID. contoh:\n{bad}")

# ============================================================
# 2) Build snapshot per stasiun dari train_fe (last known t)
# ============================================================
last_train_date = df_train_fe["tanggal"].max()
if pd.isna(last_train_date):
    raise RuntimeError("df_train_fe tanggal max is NaT.")

snap = (
    df_train_fe.sort_values(["stasiun_code","tanggal"])
    .groupby("stasiun_code", as_index=False)
    .tail(1)
    .reset_index(drop=True)
)

# map snapshot row per stasiun_code untuk semua ID
snap_map = snap.set_index("stasiun_code")

# ============================================================
# 3) Build forecast test matrix rows == submission rows
# ============================================================
rows = []
for i, r in sub2.iterrows():
    sc = r["stasiun_code"]
    if sc in snap_map.index:
        base = snap_map.loc[sc]
        # base bisa Series atau DataFrame (kalau duplikat), handle:
        if isinstance(base, pd.DataFrame):
            base = base.iloc[-1]
        row = base.to_dict()
    else:
        # fallback: pakai global last row (kalau stasiun tidak ketemu)
        base = df_train_fe.sort_values("tanggal").tail(1).iloc[0]
        row = base.to_dict()

    row[ID_COL] = r[ID_COL]
    row["stasiun_code"] = sc
    row["tanggal_id"] = r["tanggal_id"]

    # horizon_days (clip agar tidak negatif)
    h = int((r["tanggal_id"] - last_train_date).days)
    row["horizon_days"] = max(1, h)

    rows.append(row)

df_test_forecast = pd.DataFrame(rows)

# ============================================================
# 4) Align features exactly like training
# ============================================================
X_test = df_test_forecast.reindex(columns=feature_cols)

# sanitize categoricals
for c in cat_cols:
    if c not in X_test.columns:
        X_test[c] = MISSING_CAT
    X_test[c] = X_test[c].where(X_test[c].notna(), MISSING_CAT).astype(str)
    X_test[c] = X_test[c].replace({"nan": MISSING_CAT, "None": MISSING_CAT, "": MISSING_CAT})

# sanitize numerics
num_cols = [c for c in X_test.columns if c not in cat_cols]
X_test[num_cols] = X_test[num_cols].apply(pd.to_numeric, errors="coerce")

bad_num = [c for c in num_cols if X_test[c].dtype == object]
if bad_num:
    raise RuntimeError(f"Numeric columns still object after coercion: {bad_num[:10]}")

print(f"[OK] Forecast test matrix ready | rows={len(X_test)} | features={len(feature_cols)} | cat={len(cat_cols)}")
print("[OK] horizon_days min/max:", int(df_test_forecast["horizon_days"].min()), int(df_test_forecast["horizon_days"].max()))

# ============================================================
# 5) Inference ensemble (mean prob)
# ============================================================
K = len(id_to_class)
proba_ens = np.zeros((len(X_test), K), dtype=np.float32)

pool = Pool(X_test, cat_features=cat_cols)

for mi, model in enumerate(models):
    p = model.predict_proba(pool)
    if p.shape[1] != K:
        raise RuntimeError(f"Class mismatch on model {mi}")
    proba_ens += p / len(models)
    print(f"[OK] model {mi} inferred")

pred_int = np.argmax(proba_ens, axis=1)
pred_label = np.array([id_to_class[i] for i in pred_int], dtype=object)

# ============================================================
# 6) Submission + QA
# ============================================================
submission = sub[[ID_COL]].copy()
submission[SUB_TARGET_COL] = pred_label

qa = {
    "rows_submission": int(len(submission)),
    "rows_sample": int(len(sub)),
    "id_unique": bool(submission[ID_COL].is_unique),
    "missing_pred": int(submission[SUB_TARGET_COL].isna().sum()),
    "label_distribution": submission[SUB_TARGET_COL].value_counts().to_dict(),
    "labels_expected": sorted(list(id_to_class.values())),
    "horizon_days_minmax": [int(df_test_forecast["horizon_days"].min()), int(df_test_forecast["horizon_days"].max())],
}

print("\n=== QA REPORT ===")
for k,v in qa.items():
    print(f"{k}: {v}")

assert qa["rows_submission"] == qa["rows_sample"]
assert qa["id_unique"]
assert qa["missing_pred"] == 0

OUT_PATH = "/kaggle/working/submission.csv"
QA_PATH  = "/kaggle/working/qa_submission.json"

submission.to_csv(OUT_PATH, index=False)
with open(QA_PATH, "w") as f:
    json.dump(qa, f, indent=2)

print(f"\n[OK] submission saved -> {OUT_PATH}")
print(f"[OK] qa saved -> {QA_PATH}")
display(submission.head(10))


[OK] Forecast test matrix ready | rows=455 | features=195 | cat=13
[OK] horizon_days min/max: 641 731
[OK] model 0 inferred
[OK] model 1 inferred
[OK] model 2 inferred
[OK] model 3 inferred

=== QA REPORT ===
rows_submission: 455
rows_sample: 455
id_unique: True
missing_pred: 0
label_distribution: {'BAIK': 455}
labels_expected: ['BAIK', 'SEDANG', 'TIDAK SEHAT']
horizon_days_minmax: [641, 731]

[OK] submission saved -> /kaggle/working/submission.csv
[OK] qa saved -> /kaggle/working/qa_submission.json


Unnamed: 0,id,kategori
0,2025-09-01_DKI1,BAIK
1,2025-09-01_DKI2,BAIK
2,2025-09-01_DKI3,BAIK
3,2025-09-01_DKI4,BAIK
4,2025-09-01_DKI5,BAIK
5,2025-09-02_DKI1,BAIK
6,2025-09-02_DKI2,BAIK
7,2025-09-02_DKI3,BAIK
8,2025-09-02_DKI4,BAIK
9,2025-09-02_DKI5,BAIK
