In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/penyisihan-datavidia-10/sample_submission.csv
/kaggle/input/penyisihan-datavidia-10/ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2012-komponen-data.csv
/kaggle/input/penyisihan-datavidia-10/ISPU/data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-2023-komponen-data.csv
/kaggle/input/penyisihan-datavidia-10/ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2010-komponen-data.csv
/kaggle/input/penyisihan-datavidia-10/ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2018-komponen-data.csv
/kaggle/input/penyisihan-datavidia-10/ISPU/data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-komponen-data-2025.csv
/kaggle/input/penyisihan-datavidia-10/ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2014-komponen-data.csv
/kaggle/input/penyisihan-datavidia-10/ISPU/data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-komponen-data-2024.csv
/kaggle/input/penyisihan-datavidia-10/ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2011-kompone

# Data Loading & Sanity Checks

In [2]:
# ============================================================
# STEP 1 — Data Loading & Sanity Checks (ONE CELL) — CatBoost Track (REVISED v2)
# Fixes:
# - Series.lower() bug -> use .str.lower()
# - Keeps robust parsing, dedup, and label/critical unification
# Outputs (globals):
#   sub, ID_COL, SUB_TARGET_COL
#   df_ispu_all, df_train, df_ispu_unlabeled
#   df_ndvi, df_holiday, df_weather, df_pop, df_river
#   test_candidates
# ============================================================

import re
from pathlib import Path
import numpy as np
import pandas as pd

DATA_ROOT = Path("/kaggle/input/penyisihan-datavidia-10")
assert DATA_ROOT.exists(), f"DATA_ROOT not found: {DATA_ROOT}"

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 200)

# ----------------------------
# Helpers
# ----------------------------
def _read_csv_smart(path: Path) -> pd.DataFrame:
    seps = [",", ";", "\t", "|"]
    encs = ["utf-8", "utf-8-sig", "latin1"]
    last_err = None
    for sep in seps:
        for enc in encs:
            try:
                df = pd.read_csv(path, sep=sep, encoding=enc, low_memory=False)
                if df.shape[1] >= 2:
                    return df
            except Exception as e:
                last_err = e
    raise RuntimeError(f"Failed to read: {path}\nLast error: {last_err}")

def _norm_col(c: str) -> str:
    c = str(c).strip().lower()
    c = re.sub(r"[^\w]+", "_", c)
    c = re.sub(r"_+", "_", c).strip("_")
    return c

def _standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    cols = {c: _norm_col(c) for c in df.columns}
    df = df.rename(columns=cols)

    rename = {}
    for c in df.columns:
        if c in ["tanggal", "date", "time", "waktu"]:
            rename[c] = "tanggal"
        elif c in ["stasiun", "station", "stasiun_id", "id_stasiun"]:
            rename[c] = "stasiun"
        elif c in ["periode_data", "periode"]:
            rename[c] = "periode_data"
        elif c in ["pm_sepuluh", "pm10", "pm_10"]:
            rename[c] = "pm10"
        elif c in ["pm_duakomalima", "pm2_5", "pm25", "pm_2_5", "pm2_5_"]:
            rename[c] = "pm25"
        elif c in ["sulfur_dioksida", "so2"]:
            rename[c] = "so2"
        elif c in ["karbon_monoksida", "co"]:
            rename[c] = "co"
        elif c in ["ozon", "o3"]:
            rename[c] = "o3"
        elif c in ["nitrogen_dioksida", "no2"]:
            rename[c] = "no2"
        elif c in ["parameter_pencemar_kritis", "parameter_pencemar", "pencemar_kritis"]:
            rename[c] = "parameter_pencemar_kritis"
        elif c in ["max", "maks", "nilai_maks", "indeks_maks"]:
            rename[c] = "max"
        elif c in ["ndvi", "vegetation_index"]:
            rename[c] = "ndvi"
        elif c in ["is_holiday_nasional", "holiday_nasional", "is_holiday"]:
            rename[c] = "is_holiday_nasional"
        elif c in ["is_weekend", "weekend"]:
            rename[c] = "is_weekend"
        elif c in ["day_name", "nama_hari"]:
            rename[c] = "day_name"
        elif c in ["nama_libur", "holiday_name"]:
            rename[c] = "nama_libur"

        # keep common typos as *_alt
        elif c == "categori":
            rename[c] = "kategori_alt"
        elif c == "critical":
            rename[c] = "parameter_pencemar_kritis_alt"

    return df.rename(columns=rename)

def parse_date_twopass(s: pd.Series) -> pd.Series:
    s = s.astype(str).str.strip()
    s = s.replace({"": np.nan, "nan": np.nan, "NaN": np.nan, "None": np.nan})
    d1 = pd.to_datetime(s, errors="coerce", dayfirst=True)
    m = d1.isna()
    if m.any():
        d2 = pd.to_datetime(s[m], errors="coerce", dayfirst=False)
        d1.loc[m] = d2
    return d1

def _coerce_numeric(df: pd.DataFrame, cols):
    for c in cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")
    return df

def _dedup_keep_most_complete(df: pd.DataFrame, key_cols):
    if not all(k in df.columns for k in key_cols):
        return df
    df = df.copy()
    df["_nn"] = df.notna().sum(axis=1)
    idx = df.groupby(key_cols)["_nn"].idxmax()
    df = df.loc[idx].drop(columns=["_nn"]).reset_index(drop=True)
    return df

def _basic_sanity(name: str, df: pd.DataFrame, key_cols=None, date_col="tanggal"):
    print(f"\n--- {name} ---")
    print("shape:", df.shape)
    if key_cols is not None and all(k in df.columns for k in key_cols):
        print(f"duplicates on {key_cols}:", int(df.duplicated(key_cols).sum()))
    if date_col in df.columns:
        print(f"{date_col}: NaT={int(df[date_col].isna().sum())} | range=[{df[date_col].min()} .. {df[date_col].max()}]")
    miss = (df.isna().mean().sort_values(ascending=False).head(8) * 100).round(2)
    print("top missing% cols:")
    print(miss.to_string())

# ----------------------------
# 0) sample_submission
# ----------------------------
sub = _standardize_columns(_read_csv_smart(DATA_ROOT / "sample_submission.csv"))
ID_COL = "id" if "id" in sub.columns else sub.columns[0]
SUB_TARGET_COL = "category" if "category" in sub.columns else sub.columns[-1]
n_test_expected = len(sub)

print("Loaded sample_submission:", sub.shape, "cols:", list(sub.columns))
print("ID_COL:", ID_COL, "| SUB_TARGET_COL:", SUB_TARGET_COL)
print("submission ID unique:", bool(sub[ID_COL].is_unique))

# ----------------------------
# 1) ISPU (concat all years) + CLEAN
# ----------------------------
ispu_files = sorted((DATA_ROOT / "ISPU").glob("*.csv"))
assert len(ispu_files) > 0, "No ISPU CSV files found."

frames = []
for p in ispu_files:
    df = _standardize_columns(_read_csv_smart(p))
    df["source_file"] = p.name
    frames.append(df)

df_ispu_all = pd.concat(frames, ignore_index=True, sort=False)

# unify label + critical columns into canonical names
if "kategori_alt" in df_ispu_all.columns:
    if "kategori" not in df_ispu_all.columns:
        df_ispu_all["kategori"] = df_ispu_all["kategori_alt"]
    else:
        df_ispu_all["kategori"] = df_ispu_all["kategori"].fillna(df_ispu_all["kategori_alt"])

if "parameter_pencemar_kritis_alt" in df_ispu_all.columns:
    if "parameter_pencemar_kritis" not in df_ispu_all.columns:
        df_ispu_all["parameter_pencemar_kritis"] = df_ispu_all["parameter_pencemar_kritis_alt"]
    else:
        df_ispu_all["parameter_pencemar_kritis"] = df_ispu_all["parameter_pencemar_kritis"].fillna(df_ispu_all["parameter_pencemar_kritis_alt"])

# robust date parse
if "tanggal" in df_ispu_all.columns:
    df_ispu_all["tanggal"] = parse_date_twopass(df_ispu_all["tanggal"])

# stasiun cleanup + code
if "stasiun" in df_ispu_all.columns:
    df_ispu_all["stasiun"] = df_ispu_all["stasiun"].astype(str).str.strip()
    df_ispu_all["stasiun_code"] = (
        df_ispu_all["stasiun"]
        .str.upper()
        .str.extract(r"(DKI\s*\d+)", expand=False)
        .str.replace(" ", "", regex=False)
    )
else:
    df_ispu_all["stasiun_code"] = np.nan

# numeric casts
df_ispu_all = _coerce_numeric(df_ispu_all, ["pm10", "pm25", "so2", "co", "o3", "no2", "max"])

# drop rows missing key fields
df_ispu_all = df_ispu_all.dropna(subset=["tanggal", "stasiun"]).copy()

# dedup by key keep most complete
df_ispu_all = _dedup_keep_most_complete(df_ispu_all, ["tanggal", "stasiun"])
df_ispu_all = df_ispu_all.sort_values(["tanggal", "stasiun"]).reset_index(drop=True)

_basic_sanity("ISPU (ALL) CLEAN", df_ispu_all, key_cols=["tanggal", "stasiun"])

# build train vs unlabeled
if "kategori" in df_ispu_all.columns:
    lab = df_ispu_all["kategori"].astype(str).str.strip()
    lab_low = lab.str.lower()
    m_train = df_ispu_all["kategori"].notna() & (lab != "") & (lab_low != "nan")
    df_train = df_ispu_all.loc[m_train].copy()
    df_ispu_unlabeled = df_ispu_all.loc[~m_train].copy()
else:
    df_train = df_ispu_all.copy()
    df_ispu_unlabeled = df_ispu_all.iloc[0:0].copy()

print("\nTrain/unlabeled split:")
print("df_train:", df_train.shape, "| df_ispu_unlabeled:", df_ispu_unlabeled.shape)
if "kategori" in df_train.columns:
    print("\nTarget distribution (df_train):")
    print(df_train["kategori"].astype(str).str.strip().value_counts(dropna=False).to_string())

# ----------------------------
# 2) NDVI + stasiun_code
# ----------------------------
df_ndvi = _standardize_columns(_read_csv_smart(DATA_ROOT / "NDVI (vegetation index)" / "indeks-ndvi-jakarta.csv"))
if "tanggal" in df_ndvi.columns:
    df_ndvi["tanggal"] = parse_date_twopass(df_ndvi["tanggal"])
if "stasiun" in df_ndvi.columns:
    df_ndvi["stasiun"] = df_ndvi["stasiun"].astype(str).str.strip().str.upper().str.replace(" ", "", regex=False)
    df_ndvi["stasiun_code"] = df_ndvi["stasiun"].str.extract(r"(DKI\d+)", expand=False)
df_ndvi = _coerce_numeric(df_ndvi, ["ndvi"])
df_ndvi = df_ndvi.dropna(subset=["tanggal", "stasiun"]).copy()
df_ndvi = _dedup_keep_most_complete(df_ndvi, ["tanggal", "stasiun"])

_basic_sanity("NDVI CLEAN", df_ndvi, key_cols=["tanggal", "stasiun"])

# ----------------------------
# 3) Holidays (clean to one row per date)
# ----------------------------
df_holiday = _standardize_columns(_read_csv_smart(DATA_ROOT / "libur-nasional" / "dataset-libur-nasional-dan-weekend.csv"))
if "tanggal" in df_holiday.columns:
    df_holiday["tanggal"] = parse_date_twopass(df_holiday["tanggal"])
df_holiday = df_holiday.dropna(subset=["tanggal"]).sort_values("tanggal").copy()

for c in ["is_holiday_nasional", "is_weekend"]:
    if c in df_holiday.columns:
        df_holiday[c] = pd.to_numeric(df_holiday[c], errors="coerce").fillna(0).astype(int)

agg = {}
if "is_holiday_nasional" in df_holiday.columns: agg["is_holiday_nasional"] = "max"
if "is_weekend" in df_holiday.columns: agg["is_weekend"] = "max"
if "nama_libur" in df_holiday.columns: agg["nama_libur"] = "first"
df_holiday = df_holiday.groupby("tanggal", as_index=False).agg(agg)
df_holiday["day_name"] = df_holiday["tanggal"].dt.day_name()

_basic_sanity("HOLIDAYS CLEAN", df_holiday, key_cols=["tanggal"])

# ----------------------------
# 4) Weather (multiple stations) clean
# ----------------------------
weather_files = sorted((DATA_ROOT / "cuaca-harian").glob("*.csv"))
assert len(weather_files) > 0, "No weather CSV files found."

w_frames = []
for p in weather_files:
    w = _standardize_columns(_read_csv_smart(p))
    tag = p.stem.lower().replace("cuaca_harian_", "").replace("cuaca-harian-", "")
    w["weather_station"] = tag
    w["weather_code"] = (pd.Series([tag] * len(w)).str.extract(r"(dki\d)", expand=False).str.upper())
    if "tanggal" in w.columns:
        w["tanggal"] = parse_date_twopass(w["tanggal"])
    w_frames.append(w)

df_weather = pd.concat(w_frames, ignore_index=True, sort=False)
df_weather = df_weather.dropna(subset=["tanggal"]).copy()

for c in df_weather.columns:
    if c not in ["tanggal", "weather_station", "weather_code"]:
        if df_weather[c].dtype == object:
            df_weather[c] = pd.to_numeric(df_weather[c], errors="ignore")

df_weather = _dedup_keep_most_complete(df_weather, ["tanggal", "weather_station"])
df_weather = df_weather.sort_values(["weather_station", "tanggal"]).reset_index(drop=True)

_basic_sanity("WEATHER (ALL) CLEAN", df_weather, key_cols=["tanggal", "weather_station"])

# ----------------------------
# 5) Population
# ----------------------------
df_pop = _standardize_columns(_read_csv_smart(DATA_ROOT / "jumlah-penduduk" / "data-jumlah-penduduk-provinsi-dki-jakarta-berdasarkan-kelompok-usia-dan-jenis-kelamin-tahun-2013-2021-komponen-data.csv"))
if "tahun" in df_pop.columns:
    df_pop["tahun"] = pd.to_numeric(df_pop["tahun"], errors="coerce")
if "jumlah_penduduk" in df_pop.columns:
    df_pop["jumlah_penduduk"] = pd.to_numeric(df_pop["jumlah_penduduk"], errors="coerce")
_basic_sanity("POPULATION", df_pop)

# ----------------------------
# 6) River Quality
# ----------------------------
df_river = _standardize_columns(_read_csv_smart(DATA_ROOT / "kualitas-air-sungai" / "data-kualitas-air-sungai-komponen-data.csv"))
for c in ["latitude", "longitude", "baku_mutu", "hasil_pengukuran", "bulan_sampling"]:
    if c in df_river.columns:
        df_river[c] = pd.to_numeric(df_river[c], errors="coerce")
_basic_sanity("RIVER QUALITY", df_river)

# ----------------------------
# 7) Find test mapping file candidates (rows == sample_submission) with an 'id' column
# ----------------------------
test_candidates = []
for p in DATA_ROOT.rglob("*.csv"):
    if p.name == "sample_submission.csv":
        continue
    name = p.name.lower()
    if ("test" in name) or ("submission" in name):
        try:
            tmp = _standardize_columns(_read_csv_smart(p).head(2))
            if "id" in tmp.columns:
                df_full = _standardize_columns(_read_csv_smart(p))
                if len(df_full) == n_test_expected:
                    test_candidates.append((str(p), list(df_full.columns)))
        except Exception:
            pass

print("\n--- Test mapping file candidates (rows == sample_submission) ---")
if len(test_candidates) == 0:
    print("None found by heuristic. Likely there is a separate test file not matching this heuristic name.")
else:
    for fp, cols in test_candidates:
        print(fp, "| cols:", cols)

# ----------------------------
# Preview
# ----------------------------
print("\n--- Preview heads ---")
display(df_train.head(3))
display(df_ndvi.head(3))
display(df_holiday.head(3))
display(df_weather.head(3))


Loaded sample_submission: (455, 2) cols: ['id', 'category']
ID_COL: id | SUB_TARGET_COL: category
submission ID unique: True


  d1 = pd.to_datetime(s, errors="coerce", dayfirst=True)
  d2 = pd.to_datetime(s[m], errors="coerce", dayfirst=False)
  d1 = pd.to_datetime(s, errors="coerce", dayfirst=True)



--- ISPU (ALL) CLEAN ---
shape: (13652, 18)
duplicates on ['tanggal', 'stasiun']: 0
tanggal: NaT=0 | range=[2010-01-01 00:00:00 .. 2023-11-30 00:00:00]
top missing% cols:
bulan                            100.00
lokasi_spku                       78.96
pm25                              73.49
parameter_pencemar_kritis_alt     23.74
stasiun_code                      21.04
pm10                              14.42
kategori_alt                      13.38
o3                                12.80

Train/unlabeled split:
df_train: (13651, 18) | df_ispu_unlabeled: (1, 18)

Target distribution (df_train):
kategori
SEDANG                7997
TIDAK SEHAT           2072
BAIK                  1912
TIDAK ADA DATA        1440
SANGAT TIDAK SEHAT     199
O3                      30
BERBAHAYA                1

--- NDVI CLEAN ---
shape: (1810, 4)
duplicates on ['tanggal', 'stasiun']: 0
tanggal: NaT=0 | range=[2009-12-19 00:00:00 .. 2025-08-29 00:00:00]
top missing% cols:
tanggal         0.0
stasiun         0.

Unnamed: 0,periode_data,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,parameter_pencemar_kritis,kategori,source_file,bulan,parameter_pencemar_kritis_alt,kategori_alt,lokasi_spku,stasiun_code
0,201001,2010-01-01,DKI1 (Bunderan HI),60.0,,4.0,73.0,27.0,14.0,73.0,CO,SEDANG,indeks-standar-pencemaran-udara-(ispu)-tahun-2...,,CO,SEDANG,,DKI1
1,201001,2010-01-01,DKI2 (Kelapa Gading),,,,,,,0.0,,TIDAK ADA DATA,indeks-standar-pencemaran-udara-(ispu)-tahun-2...,,,TIDAK ADA DATA,,DKI2
2,201001,2010-01-01,DKI3 (Jagakarsa),,,,,,,0.0,,TIDAK ADA DATA,indeks-standar-pencemaran-udara-(ispu)-tahun-2...,,,TIDAK ADA DATA,,DKI3


Unnamed: 0,tanggal,stasiun,ndvi,stasiun_code
0,2009-12-19,DKI1,0.1849,DKI1
1,2009-12-19,DKI2,0.2891,DKI2
2,2009-12-19,DKI3,0.5613,DKI3


Unnamed: 0,tanggal,is_holiday_nasional,is_weekend,nama_libur,day_name
0,2010-01-01,1,0,New Year's Day,Friday
1,2010-01-02,0,0,,Saturday
2,2010-01-03,0,0,,Sunday


Unnamed: 0,tanggal,temperature_2m_max_c,temperature_2m_min_c,precipitation_sum_mm,precipitation_hours_h,wind_speed_10m_max_km_h,wind_direction_10m_dominant,shortwave_radiation_sum_mj_m²,temperature_2m_mean_c,relative_humidity_2m_mean,cloud_cover_mean,surface_pressure_mean_hpa,wind_gusts_10m_max_km_h,winddirection_10m_dominant,relative_humidity_2m_max,relative_humidity_2m_min,cloud_cover_max,cloud_cover_min,wind_gusts_10m_mean_km_h,wind_speed_10m_mean_km_h,wind_gusts_10m_min_km_h,wind_speed_10m_min_km_h,surface_pressure_max_hpa,surface_pressure_min_hpa,weather_station,weather_code
0,2010-01-01,29.4,24.4,4.0,14.0,16.0,246,16.24,26.6,81,100,1007.5,38.2,246,90,69,100,99,21.0,10.5,11.9,6.9,1009.3,1005.1,dki1-bundaranhi,DKI1
1,2010-01-02,28.9,24.2,6.9,14.0,9.5,260,13.01,26.2,85,99,1010.1,22.0,260,95,72,100,94,13.7,6.0,8.6,2.3,1011.9,1007.4,dki1-bundaranhi,DKI1
2,2010-01-03,31.4,24.9,11.2,6.0,9.4,224,23.89,27.1,85,93,1009.9,21.2,224,95,70,100,28,15.7,5.7,8.3,1.6,1012.2,1007.0,dki1-bundaranhi,DKI1


# Master Table Building (Correct Joins)

In [3]:
# ============================================================
# STEP 2 — Master Table Building (Correct Joins) (ONE CELL)
# Builds:
#   df_train_master  (for training)
#   df_test_master   (for inference; requires test mapping file with id)
# Notes:
# - Joins are leakage-safe (no future info used here; lags/rolling will be Step 3)
# - Uses safe joins: Holiday (by date), NDVI (date+stasiun_code), Weather (date+stasiun_code + global fallback),
#   Population (year aggregate), River (year-month aggregate, global)
# ============================================================

import re
from pathlib import Path
import numpy as np
import pandas as pd

# ---- guards (assumes Step 1 already ran) ----
need = ["sub","ID_COL","SUB_TARGET_COL","df_train","df_ndvi","df_holiday","df_weather","df_pop","df_river"]
miss = [k for k in need if k not in globals()]
if miss:
    raise RuntimeError(f"Missing globals from Step 1: {miss}. Jalankan Step 1 dulu.")

DATA_ROOT = Path("/kaggle/input/penyisihan-datavidia-10")

def _norm_col(c: str) -> str:
    c = str(c).strip().lower()
    c = re.sub(r"[^\w]+", "_", c)
    c = re.sub(r"_+", "_", c).strip("_")
    return c

def _standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    return df.rename(columns={c: _norm_col(c) for c in df.columns})

def parse_date_twopass(s: pd.Series) -> pd.Series:
    s = s.astype(str).str.strip()
    s = s.replace({"": np.nan, "nan": np.nan, "NaN": np.nan, "None": np.nan})
    d1 = pd.to_datetime(s, errors="coerce", dayfirst=True)
    m = d1.isna()
    if m.any():
        d2 = pd.to_datetime(s[m], errors="coerce", dayfirst=False)
        d1.loc[m] = d2
    return d1

def _mk_stasiun_code(stasiun_series: pd.Series) -> pd.Series:
    st = stasiun_series.astype(str).str.strip().str.upper()
    code = st.str.extract(r"(DKI\s*\d+)", expand=False).str.replace(" ", "", regex=False)
    return code

def _as_numeric_cols(df: pd.DataFrame) -> list:
    num_cols = []
    for c in df.columns:
        if pd.api.types.is_numeric_dtype(df[c]):
            num_cols.append(c)
    return num_cols

def _prefix_cols(df: pd.DataFrame, prefix: str, keep: set) -> pd.DataFrame:
    ren = {c: f"{prefix}{c}" for c in df.columns if c not in keep}
    return df.rename(columns=ren)

def _find_test_mapping_file(data_root: Path, n_rows: int) -> Path | None:
    # Cari CSV (selain sample_submission) yang punya kolom id dan baris == n_rows.
    for p in data_root.rglob("*.csv"):
        if p.name == "sample_submission.csv":
            continue
        try:
            head = _standardize_columns(pd.read_csv(p, nrows=5))
            if "id" not in head.columns:
                continue
            # quick row count (read only id col if possible)
            df0 = _standardize_columns(pd.read_csv(p, usecols=["id"]))
            if len(df0) == n_rows:
                return p
        except Exception:
            continue
    return None

def build_master(df_base: pd.DataFrame, *, has_target: bool, test_mode: bool=False) -> pd.DataFrame:
    df = df_base.copy()

    # --- ensure tanggal + stasiun_code ---
    if "tanggal" not in df.columns:
        raise RuntimeError("Base df missing 'tanggal'.")
    if not np.issubdtype(df["tanggal"].dtype, np.datetime64):
        df["tanggal"] = parse_date_twopass(df["tanggal"])

    if "stasiun_code" not in df.columns:
        if "stasiun" in df.columns:
            df["stasiun_code"] = _mk_stasiun_code(df["stasiun"])
        else:
            df["stasiun_code"] = np.nan

    # --- basic calendar features (safe) ---
    df["year"] = df["tanggal"].dt.year.astype("Int64")
    df["month"] = df["tanggal"].dt.month.astype("Int64")
    df["day"] = df["tanggal"].dt.day.astype("Int64")
    df["dow"] = df["tanggal"].dt.dayofweek.astype("Int64")
    df["dayofyear"] = df["tanggal"].dt.dayofyear.astype("Int64")

    # --- holidays (by tanggal) ---
    hol = df_holiday.copy()
    if not np.issubdtype(hol["tanggal"].dtype, np.datetime64):
        hol["tanggal"] = parse_date_twopass(hol["tanggal"])
    hol = hol.dropna(subset=["tanggal"]).drop_duplicates(["tanggal"])
    df = df.merge(hol, on="tanggal", how="left")

    # --- NDVI (by tanggal + stasiun_code) ---
    nd = df_ndvi.copy()
    if "stasiun_code" not in nd.columns and "stasiun" in nd.columns:
        nd["stasiun_code"] = _mk_stasiun_code(nd["stasiun"])
    if not np.issubdtype(nd["tanggal"].dtype, np.datetime64):
        nd["tanggal"] = parse_date_twopass(nd["tanggal"])
    nd = nd.dropna(subset=["tanggal", "stasiun_code"]).drop_duplicates(["tanggal","stasiun_code"])
    nd = nd[["tanggal","stasiun_code"] + [c for c in nd.columns if c not in ["tanggal","stasiun","stasiun_code"]]]
    df = df.merge(nd, on=["tanggal","stasiun_code"], how="left")

    # --- Weather: station-specific (tanggal + stasiun_code) + global fallback (tanggal) ---
    wx = df_weather.copy()
    if not np.issubdtype(wx["tanggal"].dtype, np.datetime64):
        wx["tanggal"] = parse_date_twopass(wx["tanggal"])
    if "weather_code" in wx.columns:
        wx["weather_code"] = wx["weather_code"].astype(str).str.strip().str.upper()
    else:
        wx["weather_code"] = np.nan

    # station-specific
    wx_loc = wx.dropna(subset=["tanggal","weather_code"]).copy()
    wx_loc = wx_loc.rename(columns={"weather_code":"stasiun_code"})
    # keep numeric + minimal keys
    keep_keys = {"tanggal","stasiun_code","weather_station"}
    wx_loc_num = [c for c in wx_loc.columns if c in keep_keys or pd.api.types.is_numeric_dtype(wx_loc[c])]
    wx_loc = wx_loc[wx_loc_num].drop_duplicates(["tanggal","stasiun_code"])
    wx_loc = _prefix_cols(wx_loc, "wx_", keep={"tanggal","stasiun_code"})
    df = df.merge(wx_loc, on=["tanggal","stasiun_code"], how="left")

    # global by date (mean numeric across stations)
    wx_g = wx.dropna(subset=["tanggal"]).copy()
    wx_g_num = [c for c in wx_g.columns if pd.api.types.is_numeric_dtype(wx_g[c])]
    wx_g = wx_g.groupby("tanggal", as_index=False)[wx_g_num].mean(numeric_only=True)
    wx_g = _prefix_cols(wx_g, "wxg_", keep={"tanggal"})
    df = df.merge(wx_g, on="tanggal", how="left")

    # fill local weather NaNs using global fallback for same base variable
    # (only for columns that exist in both)
    for c in list(df.columns):
        if c.startswith("wx_"):
            base = c.replace("wx_", "")
            cg = "wxg_" + base
            if cg in df.columns:
                df[c] = df[c].fillna(df[cg])

    # --- Population: total per year (global) ---
    pop = df_pop.copy()
    # expected cols: tahun, jumlah_penduduk
    if "tahun" in pop.columns and "jumlah_penduduk" in pop.columns:
        pop_y = pop.groupby("tahun", as_index=False)["jumlah_penduduk"].sum()
        pop_y = pop_y.rename(columns={"tahun":"year","jumlah_penduduk":"pop_total_year"})
        df = df.merge(pop_y, on="year", how="left")

    # --- River: global year-month aggregates ---
    riv = df_river.copy()
    # standard columns might be: periode_data (year), bulan_sampling, baku_mutu, hasil_pengukuran
    for cc in ["periode_data","bulan_sampling","baku_mutu","hasil_pengukuran"]:
        if cc in riv.columns:
            riv[cc] = pd.to_numeric(riv[cc], errors="coerce")

    if {"periode_data","bulan_sampling","baku_mutu","hasil_pengukuran"}.issubset(riv.columns):
        r = riv.dropna(subset=["periode_data","bulan_sampling","baku_mutu","hasil_pengukuran"]).copy()
        r["ratio_to_std"] = r["hasil_pengukuran"] / (r["baku_mutu"].replace(0, np.nan))
        r["exceed"] = (r["hasil_pengukuran"] > r["baku_mutu"]).astype(int)
        r_agg = r.groupby(["periode_data","bulan_sampling"], as_index=False).agg(
            river_exceed_rate=("exceed","mean"),
            river_ratio_mean=("ratio_to_std","mean"),
            river_n=("exceed","size"),
        )
        r_agg = r_agg.rename(columns={"periode_data":"year", "bulan_sampling":"month"})
        df = df.merge(r_agg, on=["year","month"], how="left")

    # --- cleanup: categorical columns kept as object for CatBoost later ---
    for c in ["stasiun","stasiun_code","parameter_pencemar_kritis","day_name","nama_libur"]:
        if c in df.columns:
            df[c] = df[c].astype(str).replace({"nan": np.nan, "None": np.nan})

    # --- final sanity ---
    key_cols = ["tanggal","stasiun_code"] if "stasiun_code" in df.columns else ["tanggal"]
    dup = int(df.duplicated(key_cols).sum())
    if dup > 0:
        # keep most complete if duplicates remain
        df["_nn"] = df.notna().sum(axis=1)
        idx = df.groupby(key_cols)["_nn"].idxmax()
        df = df.loc[idx].drop(columns=["_nn"]).reset_index(drop=True)

    if has_target and "kategori" in df.columns:
        df["kategori"] = df["kategori"].astype(str).str.strip()

    return df

# ----------------------------
# 1) Build TRAIN master
# ----------------------------
df_train_master = build_master(df_train, has_target=True)
print("df_train_master:", df_train_master.shape)
print("train key duplicates:", int(df_train_master.duplicated(["tanggal","stasiun_code"]).sum()) if "stasiun_code" in df_train_master.columns else 0)

# ----------------------------
# 2) Build TEST master (needs id mapping file)
# ----------------------------
n_test_expected = len(sub)

test_path = None
# If Step 1 created test_candidates, try it first
if "test_candidates" in globals() and isinstance(test_candidates, list) and len(test_candidates) > 0:
    test_path = Path(test_candidates[0][0])
else:
    test_path = _find_test_mapping_file(DATA_ROOT, n_test_expected)

if test_path is None:
    print("\n[WARN] Test mapping file (id -> tanggal/stasiun) not found yet.")
    print("       You can still continue feature engineering + CV on df_train_master.")
    df_test_master = None
else:
    print("\nTest mapping file:", str(test_path))
    df_test = _standardize_columns(pd.read_csv(test_path))
    # ensure id exists and length matches submission
    if "id" not in df_test.columns:
        raise RuntimeError(f"Test file has no 'id': {test_path}")
    if len(df_test) != n_test_expected:
        raise RuntimeError(f"Test file rows != submission rows: {len(df_test)} vs {n_test_expected}")

    # normalize date/station fields
    if "tanggal" in df_test.columns:
        df_test["tanggal"] = parse_date_twopass(df_test["tanggal"])
    else:
        raise RuntimeError("Test mapping file missing 'tanggal'.")

    if "stasiun_code" not in df_test.columns:
        if "stasiun" in df_test.columns:
            df_test["stasiun_code"] = _mk_stasiun_code(df_test["stasiun"])
        elif "stasiun_id" in df_test.columns:
            df_test["stasiun_code"] = _mk_stasiun_code(df_test["stasiun_id"])
        else:
            # some datasets store only DKI1..DKI5
            # try infer from any column containing 'dki'
            cand = None
            for c in df_test.columns:
                if df_test[c].astype(str).str.contains("dki", case=False, na=False).any():
                    cand = c
                    break
            if cand is None:
                raise RuntimeError("Cannot infer station from test mapping file.")
            df_test["stasiun_code"] = _mk_stasiun_code(df_test[cand])

    df_test_master = build_master(df_test, has_target=False, test_mode=True)
    print("df_test_master:", df_test_master.shape)
    # keep id for submission
    if "id" not in df_test_master.columns:
        df_test_master["id"] = df_test["id"].values

# ----------------------------
# 3) Quick preview
# ----------------------------
display(df_train_master.head(3))
if df_test_master is not None:
    display(df_test_master.head(3))


df_train_master: (13651, 79)
train key duplicates: 0

[WARN] Test mapping file (id -> tanggal/stasiun) not found yet.
       You can still continue feature engineering + CV on df_train_master.


Unnamed: 0,periode_data,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,parameter_pencemar_kritis,kategori,source_file,bulan,parameter_pencemar_kritis_alt,kategori_alt,lokasi_spku,stasiun_code,year,month,day,dow,dayofyear,is_holiday_nasional,is_weekend,nama_libur,day_name,ndvi,wx_temperature_2m_max_c,wx_temperature_2m_min_c,wx_precipitation_sum_mm,wx_precipitation_hours_h,wx_wind_speed_10m_max_km_h,wx_wind_direction_10m_dominant,wx_shortwave_radiation_sum_mj_m²,wx_temperature_2m_mean_c,wx_relative_humidity_2m_mean,wx_cloud_cover_mean,wx_surface_pressure_mean_hpa,wx_wind_gusts_10m_max_km_h,wx_winddirection_10m_dominant,wx_relative_humidity_2m_max,wx_relative_humidity_2m_min,wx_cloud_cover_max,wx_cloud_cover_min,wx_wind_gusts_10m_mean_km_h,wx_wind_speed_10m_mean_km_h,wx_wind_gusts_10m_min_km_h,wx_wind_speed_10m_min_km_h,wx_surface_pressure_max_hpa,wx_surface_pressure_min_hpa,wx_weather_station,wxg_temperature_2m_max_c,wxg_temperature_2m_min_c,wxg_precipitation_sum_mm,wxg_precipitation_hours_h,wxg_wind_speed_10m_max_km_h,wxg_wind_direction_10m_dominant,wxg_shortwave_radiation_sum_mj_m²,wxg_temperature_2m_mean_c,wxg_relative_humidity_2m_mean,wxg_cloud_cover_mean,wxg_surface_pressure_mean_hpa,wxg_wind_gusts_10m_max_km_h,wxg_winddirection_10m_dominant,wxg_relative_humidity_2m_max,wxg_relative_humidity_2m_min,wxg_cloud_cover_max,wxg_cloud_cover_min,wxg_wind_gusts_10m_mean_km_h,wxg_wind_speed_10m_mean_km_h,wxg_wind_gusts_10m_min_km_h,wxg_wind_speed_10m_min_km_h,wxg_surface_pressure_max_hpa,wxg_surface_pressure_min_hpa,pop_total_year,river_exceed_rate,river_ratio_mean,river_n
0,201001,2010-01-01,DKI1 (Bunderan HI),60.0,,4.0,73.0,27.0,14.0,73.0,CO,SEDANG,indeks-standar-pencemaran-udara-(ispu)-tahun-2...,,CO,SEDANG,,DKI1,2010,1,1,4,1,1,0,New Year's Day,Friday,0.2023,29.4,24.4,4.0,14.0,16.0,246.0,16.24,26.6,81.0,100.0,1007.5,38.2,246.0,90.0,69.0,100.0,99.0,21.0,10.5,11.9,6.9,1009.3,1005.1,dki1-bundaranhi,29.58,24.24,4.48,12.4,16.2,249.6,16.484,26.48,82.2,100.0,1004.98,38.2,249.6,90.8,69.6,100.0,99.0,21.04,10.54,11.9,7.1,1006.86,1002.7,,,,
1,201001,2010-01-01,DKI2 (Kelapa Gading),,,,,,,0.0,,TIDAK ADA DATA,indeks-standar-pencemaran-udara-(ispu)-tahun-2...,,,TIDAK ADA DATA,,DKI2,2010,1,1,4,1,1,0,New Year's Day,Friday,0.0939,29.4,24.6,5.2,10.0,16.5,255.0,16.85,26.7,81.0,100.0,1007.1,38.2,255.0,89.0,69.0,100.0,99.0,21.1,10.6,11.9,7.4,1009.0,1004.9,dki2-kelapagading,29.58,24.24,4.48,12.4,16.2,249.6,16.484,26.48,82.2,100.0,1004.98,38.2,249.6,90.8,69.6,100.0,99.0,21.04,10.54,11.9,7.1,1006.86,1002.7,,,,
2,201001,2010-01-01,DKI3 (Jagakarsa),,,,,,,0.0,,TIDAK ADA DATA,indeks-standar-pencemaran-udara-(ispu)-tahun-2...,,,TIDAK ADA DATA,,DKI3,2010,1,1,4,1,1,0,New Year's Day,Friday,0.5332,29.8,23.7,4.0,14.0,16.0,246.0,16.24,26.1,85.0,100.0,999.9,38.2,246.0,94.0,71.0,100.0,99.0,21.0,10.5,11.9,6.9,1001.8,997.6,dki3-jagakarsa,29.58,24.24,4.48,12.4,16.2,249.6,16.484,26.48,82.2,100.0,1004.98,38.2,249.6,90.8,69.6,100.0,99.0,21.04,10.54,11.9,7.1,1006.86,1002.7,,,,


# Feature Engineering (Time-Series + Calendar + Robustness)

In [4]:
# ============================================================
# STEP 3 — Feature Engineering (Time-Series + Calendar + Robustness) (ONE CELL)
# Requires:
#   df_train_master  (from Step 2)
#   df_test_master   (optional; from Step 2)
# Produces:
#   df_train_fe, df_test_fe
# Notes:
# - Leakage-safe: all rolling features are computed on SHIFTED values (past only).
# - Station-aware: computed per stasiun_code.
# - Keeps CatBoost-friendly categoricals as object; numeric features stay numeric.
# ============================================================

import numpy as np
import pandas as pd

if "df_train_master" not in globals():
    raise RuntimeError("Missing df_train_master. Run Step 2 first.")

def add_time_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    # robust calendar
    df["year"] = df["tanggal"].dt.year.astype("Int64")
    df["month"] = df["tanggal"].dt.month.astype("Int64")
    df["day"] = df["tanggal"].dt.day.astype("Int64")
    df["dow"] = df["tanggal"].dt.dayofweek.astype("Int64")
    df["dayofyear"] = df["tanggal"].dt.dayofyear.astype("Int64")

    # cyclic encoding (helps CatBoost a bit; safe)
    doy = df["dayofyear"].astype(float)
    df["doy_sin"] = np.sin(2 * np.pi * doy / 365.25)
    df["doy_cos"] = np.cos(2 * np.pi * doy / 365.25)
    mon = df["month"].astype(float)
    df["mon_sin"] = np.sin(2 * np.pi * mon / 12.0)
    df["mon_cos"] = np.cos(2 * np.pi * mon / 12.0)

    # weekend if missing
    if "is_weekend" not in df.columns:
        df["is_weekend"] = (df["dow"].isin([5, 6])).astype(int)

    return df

def add_station_lag_rolling(
    df: pd.DataFrame,
    group_col: str = "stasiun_code",
    base_cols = ("pm10","pm25","so2","co","o3","no2","max"),
    lags = (1,2,3,7,14),
    windows = (3,7,14,30),
) -> pd.DataFrame:
    df = df.copy()

    # ensure order
    df = df.sort_values([group_col, "tanggal"]).reset_index(drop=True)

    # only keep existing numeric columns
    base_cols = [c for c in base_cols if c in df.columns]
    if not base_cols:
        raise RuntimeError("No base pollutant columns found for lag/rolling.")

    g = df.groupby(group_col, sort=False)

    # lags
    for c in base_cols:
        for L in lags:
            df[f"{c}_lag{L}"] = g[c].shift(L)

    # rolling on shifted series (leakage-safe)
    for c in base_cols:
        s = g[c].shift(1)  # only past values included
        for w in windows:
            df[f"{c}_rmean{w}"] = s.rolling(w, min_periods=max(2, w//3)).mean().reset_index(level=0, drop=True)
            df[f"{c}_rstd{w}"]  = s.rolling(w, min_periods=max(2, w//3)).std().reset_index(level=0, drop=True)

    # deltas (based on lags; safe)
    for c in base_cols:
        if f"{c}_lag1" in df.columns and f"{c}_lag2" in df.columns:
            df[f"{c}_d12"] = df[f"{c}_lag1"] - df[f"{c}_lag2"]
        if f"{c}_lag1" in df.columns and f"{c}_rmean7" in df.columns:
            df[f"{c}_d1_rm7"] = df[f"{c}_lag1"] - df[f"{c}_rmean7"]

    return df

def add_weather_interactions(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    # pick weather columns (prefer local wx_, fallback already filled at Step 2)
    # examples based on your weather columns
    cand = {
        "wind_mean": ["wx_wind_speed_10m_mean_km_h", "wxg_wind_speed_10m_mean_km_h", "wind_speed_10m_mean_km_h"],
        "precip_sum": ["wx_precipitation_sum_mm", "wxg_precipitation_sum_mm", "precipitation_sum_mm"],
        "rad_sum": ["wx_shortwave_radiation_sum_mj_m²", "wxg_shortwave_radiation_sum_mj_m²", "shortwave_radiation_sum_mj_m²"],
        "rh_mean": ["wx_relative_humidity_2m_mean", "wxg_relative_humidity_2m_mean", "relative_humidity_2m_mean"],
        "temp_mean": ["wx_temperature_2m_mean_c", "wxg_temperature_2m_mean_c", "temperature_2m_mean_c"],
    }
    def pick(cols):
        for c in cols:
            if c in df.columns:
                return c
        return None

    wind = pick(cand["wind_mean"])
    prec = pick(cand["precip_sum"])
    rad  = pick(cand["rad_sum"])
    rh   = pick(cand["rh_mean"])
    tmp  = pick(cand["temp_mean"])

    # build a few robust interactions using lag1 (already leakage-safe)
    if "pm25_lag1" in df.columns and wind is not None:
        df["pm25_lag1_x_wind"] = df["pm25_lag1"] * df[wind]
    if "pm10_lag1" in df.columns and wind is not None:
        df["pm10_lag1_x_wind"] = df["pm10_lag1"] * df[wind]
    if "o3_lag1" in df.columns and rad is not None:
        df["o3_lag1_x_rad"] = df["o3_lag1"] * df[rad]
    if "pm25_lag1" in df.columns and prec is not None:
        df["pm25_lag1_div_prec"] = df["pm25_lag1"] / (df[prec].fillna(0) + 1.0)
    if "co_lag1" in df.columns and rh is not None:
        df["co_lag1_x_rh"] = df["co_lag1"] * df[rh]
    if "pm25_lag1" in df.columns and tmp is not None:
        df["pm25_lag1_x_temp"] = df["pm25_lag1"] * df[tmp]

    return df

def finalize_types_for_catboost(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    # keep these as categorical (object)
    cat_cols = []
    for c in ["stasiun", "stasiun_code", "parameter_pencemar_kritis", "day_name", "nama_libur", "weather_station"]:
        if c in df.columns:
            cat_cols.append(c)
    for c in cat_cols:
        df[c] = df[c].astype("object")

    # cast flags to int
    for c in ["is_weekend", "is_holiday_nasional"]:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0).astype(int)

    return df

def build_fe(df: pd.DataFrame) -> pd.DataFrame:
    df = add_time_features(df)
    df = add_station_lag_rolling(df)
    df = add_weather_interactions(df)
    df = finalize_types_for_catboost(df)
    return df

# ---- Build FE for train and test (if available) ----
df_train_fe = build_fe(df_train_master)

if "df_test_master" in globals() and df_test_master is not None:
    df_test_fe = build_fe(df_test_master)
else:
    df_test_fe = None

# ---- Sanity: no leakage from current-day target directly ----
# keep raw current-day pollutant columns; CatBoost can use them too, but lags are key.
# ensure we do NOT have rolling computed without shift (we don't).

print("df_train_fe:", df_train_fe.shape)
if df_test_fe is not None:
    print("df_test_fe :", df_test_fe.shape)

# Quick missing rate on engineered features
top_miss = (df_train_fe.isna().mean().sort_values(ascending=False).head(15) * 100).round(2)
print("\nTop missing% (train_fe):")
print(top_miss.to_string())

display(df_train_fe.head(3))


df_train_fe: (13651, 194)

Top missing% (train_fe):
bulan                 100.00
river_exceed_rate     100.00
river_n               100.00
river_ratio_mean      100.00
nama_libur             95.69
ndvi                   95.04
lokasi_spku            78.96
pm25_lag14             76.65
pm25_d12               76.59
pm25_lag7              76.41
pm25_lag3              76.27
pm25_d1_rm7            76.26
pm25_lag2              76.24
pm25_lag1_div_prec     76.20
pm25_lag1_x_wind       76.20


  df[f"{c}_d1_rm7"] = df[f"{c}_lag1"] - df[f"{c}_rmean7"]
  df[f"{c}_d12"] = df[f"{c}_lag1"] - df[f"{c}_lag2"]
  df[f"{c}_d1_rm7"] = df[f"{c}_lag1"] - df[f"{c}_rmean7"]
  df[f"{c}_d12"] = df[f"{c}_lag1"] - df[f"{c}_lag2"]
  df[f"{c}_d1_rm7"] = df[f"{c}_lag1"] - df[f"{c}_rmean7"]
  df[f"{c}_d12"] = df[f"{c}_lag1"] - df[f"{c}_lag2"]
  df[f"{c}_d1_rm7"] = df[f"{c}_lag1"] - df[f"{c}_rmean7"]
  df[f"{c}_d12"] = df[f"{c}_lag1"] - df[f"{c}_lag2"]
  df[f"{c}_d1_rm7"] = df[f"{c}_lag1"] - df[f"{c}_rmean7"]


Unnamed: 0,periode_data,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,parameter_pencemar_kritis,kategori,source_file,bulan,parameter_pencemar_kritis_alt,kategori_alt,lokasi_spku,stasiun_code,year,month,day,dow,dayofyear,is_holiday_nasional,is_weekend,nama_libur,day_name,ndvi,wx_temperature_2m_max_c,wx_temperature_2m_min_c,wx_precipitation_sum_mm,wx_precipitation_hours_h,wx_wind_speed_10m_max_km_h,wx_wind_direction_10m_dominant,wx_shortwave_radiation_sum_mj_m²,wx_temperature_2m_mean_c,wx_relative_humidity_2m_mean,wx_cloud_cover_mean,wx_surface_pressure_mean_hpa,wx_wind_gusts_10m_max_km_h,wx_winddirection_10m_dominant,wx_relative_humidity_2m_max,wx_relative_humidity_2m_min,wx_cloud_cover_max,wx_cloud_cover_min,wx_wind_gusts_10m_mean_km_h,wx_wind_speed_10m_mean_km_h,wx_wind_gusts_10m_min_km_h,wx_wind_speed_10m_min_km_h,wx_surface_pressure_max_hpa,wx_surface_pressure_min_hpa,wx_weather_station,wxg_temperature_2m_max_c,wxg_temperature_2m_min_c,wxg_precipitation_sum_mm,wxg_precipitation_hours_h,wxg_wind_speed_10m_max_km_h,wxg_wind_direction_10m_dominant,wxg_shortwave_radiation_sum_mj_m²,wxg_temperature_2m_mean_c,wxg_relative_humidity_2m_mean,wxg_cloud_cover_mean,wxg_surface_pressure_mean_hpa,wxg_wind_gusts_10m_max_km_h,wxg_winddirection_10m_dominant,wxg_relative_humidity_2m_max,wxg_relative_humidity_2m_min,wxg_cloud_cover_max,wxg_cloud_cover_min,wxg_wind_gusts_10m_mean_km_h,wxg_wind_speed_10m_mean_km_h,wxg_wind_gusts_10m_min_km_h,wxg_wind_speed_10m_min_km_h,wxg_surface_pressure_max_hpa,wxg_surface_pressure_min_hpa,pop_total_year,river_exceed_rate,river_ratio_mean,river_n,doy_sin,doy_cos,mon_sin,mon_cos,pm10_lag1,pm10_lag2,pm10_lag3,pm10_lag7,pm10_lag14,pm25_lag1,pm25_lag2,pm25_lag3,pm25_lag7,pm25_lag14,so2_lag1,so2_lag2,so2_lag3,so2_lag7,so2_lag14,co_lag1,co_lag2,co_lag3,co_lag7,co_lag14,o3_lag1,o3_lag2,o3_lag3,o3_lag7,o3_lag14,no2_lag1,no2_lag2,no2_lag3,no2_lag7,no2_lag14,max_lag1,max_lag2,max_lag3,max_lag7,max_lag14,pm10_rmean3,pm10_rstd3,pm10_rmean7,pm10_rstd7,pm10_rmean14,pm10_rstd14,pm10_rmean30,pm10_rstd30,pm25_rmean3,pm25_rstd3,pm25_rmean7,pm25_rstd7,pm25_rmean14,pm25_rstd14,pm25_rmean30,pm25_rstd30,so2_rmean3,so2_rstd3,so2_rmean7,so2_rstd7,so2_rmean14,so2_rstd14,so2_rmean30,so2_rstd30,co_rmean3,co_rstd3,co_rmean7,co_rstd7,co_rmean14,co_rstd14,co_rmean30,co_rstd30,o3_rmean3,o3_rstd3,o3_rmean7,o3_rstd7,o3_rmean14,o3_rstd14,o3_rmean30,o3_rstd30,no2_rmean3,no2_rstd3,no2_rmean7,no2_rstd7,no2_rmean14,no2_rstd14,no2_rmean30,no2_rstd30,max_rmean3,max_rstd3,max_rmean7,max_rstd7,max_rmean14,max_rstd14,max_rmean30,max_rstd30,pm10_d12,pm10_d1_rm7,pm25_d12,pm25_d1_rm7,so2_d12,so2_d1_rm7,co_d12,co_d1_rm7,o3_d12,o3_d1_rm7,no2_d12,no2_d1_rm7,max_d12,max_d1_rm7,pm25_lag1_x_wind,pm10_lag1_x_wind,o3_lag1_x_rad,pm25_lag1_div_prec,co_lag1_x_rh,pm25_lag1_x_temp
0,201001,2010-01-01,DKI1 (Bunderan HI),60.0,,4.0,73.0,27.0,14.0,73.0,CO,SEDANG,indeks-standar-pencemaran-udara-(ispu)-tahun-2...,,CO,SEDANG,,DKI1,2010,1,1,4,1,1,0,New Year's Day,Friday,0.2023,29.4,24.4,4.0,14.0,16.0,246.0,16.24,26.6,81.0,100.0,1007.5,38.2,246.0,90.0,69.0,100.0,99.0,21.0,10.5,11.9,6.9,1009.3,1005.1,dki1-bundaranhi,29.58,24.24,4.48,12.4,16.2,249.6,16.484,26.48,82.2,100.0,1004.98,38.2,249.6,90.8,69.6,100.0,99.0,21.04,10.54,11.9,7.1,1006.86,1002.7,,,,,0.017202,0.999852,0.5,0.866025,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,201001,2010-01-02,DKI1 (Bunderan HI),32.0,,2.0,16.0,33.0,9.0,33.0,O3,BAIK,indeks-standar-pencemaran-udara-(ispu)-tahun-2...,,O3,BAIK,,DKI1,2010,1,2,5,2,0,0,,Saturday,,28.9,24.2,6.9,14.0,9.5,260.0,13.01,26.2,85.0,99.0,1010.1,22.0,260.0,95.0,72.0,100.0,94.0,13.7,6.0,8.6,2.3,1011.9,1007.4,dki1-bundaranhi,29.02,23.88,7.14,14.4,9.5,264.8,12.666,26.08,85.8,99.0,1007.68,23.28,264.8,95.6,72.0,100.0,95.2,13.9,6.08,8.76,1.94,1009.44,1005.04,,,,,0.034398,0.999408,0.5,0.866025,60.0,,,,,,,,,,4.0,,,,,73.0,,,,,27.0,,,,,14.0,,,,,73.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,360.0,351.27,,6205.0,
2,201001,2010-01-03,DKI1 (Bunderan HI),27.0,,2.0,19.0,20.0,9.0,27.0,PM10,BAIK,indeks-standar-pencemaran-udara-(ispu)-tahun-2...,,PM10,BAIK,,DKI1,2010,1,3,6,3,0,0,,Sunday,,31.4,24.9,11.2,6.0,9.4,224.0,23.89,27.1,85.0,93.0,1009.9,21.2,224.0,95.0,70.0,100.0,28.0,15.7,5.7,8.3,1.6,1012.2,1007.0,dki1-bundaranhi,31.46,24.34,12.16,6.8,9.12,218.0,23.986,26.98,85.6,93.8,1007.48,22.24,218.0,96.0,70.4,100.0,40.4,15.5,5.58,8.42,1.68,1009.76,1004.78,,,,,0.051584,0.998669,0.5,0.866025,32.0,60.0,,,,,,,,,2.0,4.0,,,,16.0,73.0,,,,33.0,27.0,,,,9.0,14.0,,,,33.0,73.0,,,,46.0,19.79899,46.0,19.79899,,,,,,,,,,,,,3.0,1.414214,3.0,1.414214,,,,,44.5,40.305087,44.5,40.305087,,,,,30.0,4.242641,30.0,4.242641,,,,,11.5,3.535534,11.5,3.535534,,,,,53.0,28.284271,53.0,28.284271,,,,,-28.0,-14.0,,,-2.0,-1.0,-57.0,-28.5,6.0,3.0,-5.0,-2.5,-40.0,-20.0,,182.4,788.37,,1360.0,


# Model Training (Time-Based CV + CatBoost Optimization)

In [5]:
# ============================================================
# STEP 4 — Model Training (Time-Based CV + CatBoost Optimization) (ONE CELL) — FINAL ROBUST
# Fixes (compared to your last version):
# - CPU only (no CUDA probing)
# - AUTO-detect ALL non-numeric columns as categorical (prevents "CO to float" etc)
# - Sanitize categoricals: NaN/None/"" -> "__MISSING__" and cast to string
# - Sanitize numerics: force to numeric (errors->NaN)
# - Time folds filtered so TRAIN contains ALL classes
# Outputs:
# - feature_cols, cat_cols, classes, class_to_id, id_to_class
# - folds
# - models
# - oof_proba, oof_pred, oof_macro_f1
# ============================================================

import numpy as np
import pandas as pd

from catboost import CatBoostClassifier, Pool
from sklearn.metrics import f1_score, classification_report

if "df_train_fe" not in globals():
    raise RuntimeError("Missing df_train_fe. Run Step 3 first.")

# ----------------------------
# Config
# ----------------------------
N_SPLITS = 4
GAP_DAYS = 0
SEEDS = [42]

ITERATIONS = 8000
LR = 0.05
DEPTH = 8
L2 = 6.0

MISSING_CAT = "__MISSING__"

# ----------------------------
# Prepare data
# ----------------------------
df = df_train_fe.copy()
if "tanggal" not in df.columns or "kategori" not in df.columns:
    raise RuntimeError("df_train_fe must contain 'tanggal' and 'kategori'.")

df = df.dropna(subset=["tanggal"]).sort_values(["tanggal", "stasiun_code"] if "stasiun_code" in df.columns else ["tanggal"]).reset_index(drop=True)

drop_cols = {"kategori", "tanggal", "source_file", "periode_data"}
if "id" in df.columns: drop_cols.add("id")

# align features with test (if exists)
if "df_test_fe" in globals() and df_test_fe is not None:
    common = [c for c in df.columns if c in df_test_fe.columns]
    feature_cols = [c for c in common if c not in drop_cols]
else:
    feature_cols = [c for c in df.columns if c not in drop_cols]

X = df[feature_cols].copy()

# clean target
y_str = df["kategori"].astype(str).str.strip()
y_str = y_str[y_str.str.lower() != "nan"]
df = df.loc[y_str.index].reset_index(drop=True)
X  = X.loc[y_str.index].reset_index(drop=True)
y_str = y_str.reset_index(drop=True)

# ----------------------------
# AUTO categorical detection + sanitization (KEY FIX)
# ----------------------------
# Any column that is NOT numeric is treated as categorical.
is_num = X.apply(pd.api.types.is_numeric_dtype)
cat_cols = X.columns[~is_num].tolist()

# sanitize cat cols
for c in cat_cols:
    X[c] = X[c].where(X[c].notna(), MISSING_CAT).astype(str)
    X[c] = X[c].replace({"nan": MISSING_CAT, "None": MISSING_CAT, "": MISSING_CAT})

# sanitize numeric cols (coerce)
num_cols = X.columns[is_num].tolist()
for c in num_cols:
    X[c] = pd.to_numeric(X[c], errors="coerce")

# safety: ensure no object left in numeric cols
bad_num = [c for c in num_cols if X[c].dtype == object]
if bad_num:
    raise RuntimeError(f"Numeric columns still object after coercion: {bad_num[:10]}")

# ----------------------------
# classes + weights
# ----------------------------
classes = sorted(y_str.unique().tolist())
counts = y_str.value_counts()
class_weights = [float(len(y_str) / (len(classes) * counts[c])) for c in classes]

class_to_id = {c:i for i,c in enumerate(classes)}
id_to_class = {i:c for c,i in class_to_id.items()}
y = y_str.map(class_to_id).astype(int)

print("Train rows:", len(df), "| n_features:", len(feature_cols), "| n_cat(auto):", len(cat_cols))
print("Classes:", classes)
print("Class counts:\n", counts.to_string())

# ----------------------------
# Time folds (ensure train has all classes)
# ----------------------------
def make_time_folds_filtered(df_in: pd.DataFrame, y_int: pd.Series, n_splits=4, gap_days=0):
    d = df_in.copy()
    d["year"] = d["tanggal"].dt.year
    years = sorted(d["year"].dropna().unique().tolist())

    all_ids = set(range(int(y_int.nunique())))

    def train_has_all(tr_idx):
        return set(y_int.iloc[tr_idx].unique().tolist()) == all_ids

    folds = []

    # prefer year-based (newest years as validation)
    if len(years) >= 2:
        for vy in years[::-1]:
            tr_mask = d["year"] < vy
            va_mask = d["year"] == vy
            if not va_mask.any():
                continue
            if gap_days > 0:
                va_start = d.loc[va_mask, "tanggal"].min()
                tr_mask = tr_mask & (d["tanggal"] <= (va_start - pd.Timedelta(days=gap_days)))
            tr_idx = d.index[tr_mask].to_numpy()
            va_idx = d.index[va_mask].to_numpy()
            if len(tr_idx) == 0 or len(va_idx) == 0:
                continue
            if not train_has_all(tr_idx):
                continue
            folds.append((tr_idx, va_idx))
            if len(folds) >= n_splits:
                break
        if folds:
            return folds

    # fallback blocks by date
    uniq_dates = np.array(sorted(d["tanggal"].unique()))
    blocks = np.array_split(uniq_dates, n_splits + 1)
    for b in blocks[1:][::-1]:
        va_start, va_end = b.min(), b.max()
        tr_mask = d["tanggal"] < va_start
        va_mask = (d["tanggal"] >= va_start) & (d["tanggal"] <= va_end)
        if gap_days > 0:
            tr_mask = tr_mask & (d["tanggal"] <= (va_start - pd.Timedelta(days=gap_days)))
        tr_idx = d.index[tr_mask].to_numpy()
        va_idx = d.index[va_mask].to_numpy()
        if len(tr_idx) == 0 or len(va_idx) == 0:
            continue
        if not train_has_all(tr_idx):
            continue
        folds.append((tr_idx, va_idx))
        if len(folds) >= n_splits:
            break

    # final fallback: last 20%
    if not folds:
        cut = int(len(d) * 0.8)
        folds = [(d.index[:cut].to_numpy(), d.index[cut:].to_numpy())]
    return folds

folds = make_time_folds_filtered(df, y, n_splits=N_SPLITS, gap_days=GAP_DAYS)

print("\nFolds:")
for i, (tr, va) in enumerate(folds):
    dtr = (df.loc[tr, "tanggal"].min(), df.loc[tr, "tanggal"].max())
    dva = (df.loc[va, "tanggal"].min(), df.loc[va, "tanggal"].max())
    print(f"fold{i}: train={len(tr)} [{dtr[0]}..{dtr[1]}] | valid={len(va)} [{dva[0]}..{dva[1]}]")

# ----------------------------
# Train CV (CPU only)
# ----------------------------
K = len(classes)
oof_proba = np.zeros((len(df), K), dtype=np.float32)
models = []
fold_scores = []

for seed in SEEDS:
    print(f"\n=== SEED {seed} | task_type=CPU ===")
    for fi, (tr_idx, va_idx) in enumerate(folds):
        X_tr, y_tr = X.iloc[tr_idx], y.iloc[tr_idx]
        X_va, y_va = X.iloc[va_idx], y.iloc[va_idx]

        train_pool = Pool(X_tr, y_tr, cat_features=cat_cols)
        valid_pool = Pool(X_va, y_va, cat_features=cat_cols)

        model = CatBoostClassifier(
            loss_function="MultiClass",
            eval_metric="TotalF1",
            classes_count=K,
            class_weights=class_weights,
            iterations=ITERATIONS,
            learning_rate=LR,
            depth=DEPTH,
            l2_leaf_reg=L2,
            random_strength=1.0,
            bagging_temperature=0.5,
            border_count=128,
            random_seed=seed,
            od_type="Iter",
            od_wait=400,
            task_type="CPU",
            thread_count=-1,
            verbose=250
        )

        model.fit(train_pool, eval_set=valid_pool, use_best_model=True)

        proba = model.predict_proba(X_va)
        pred_int = np.argmax(proba, axis=1)
        score = f1_score(y_va, pred_int, average="macro")

        oof_proba[va_idx] += (proba / len(SEEDS))
        fold_scores.append(score)
        models.append(model)

        print(f"[seed {seed} fold {fi}] macroF1={score:.5f} | best_iter={model.get_best_iteration()}")

# ----------------------------
# OOF summary
# ----------------------------
oof_pred_int = np.argmax(oof_proba, axis=1)
oof_pred = np.array([id_to_class[i] for i in oof_pred_int])
oof_macro_f1 = f1_score(y_str, oof_pred, average="macro")

print("\n=== OOF RESULTS ===")
print("Fold macroF1:", [round(s, 5) for s in fold_scores])
print("OOF macroF1 :", round(oof_macro_f1, 6))
print("\nOOF classification report:")
print(classification_report(y_str, oof_pred, digits=4))


Train rows: 13651 | n_features: 190 | n_cat(auto): 9
Classes: ['BAIK', 'BERBAHAYA', 'O3', 'SANGAT TIDAK SEHAT', 'SEDANG', 'TIDAK ADA DATA', 'TIDAK SEHAT']
Class counts:
 kategori
SEDANG                7997
TIDAK SEHAT           2072
BAIK                  1912
TIDAK ADA DATA        1440
SANGAT TIDAK SEHAT     199
O3                      30
BERBAHAYA                1

Folds:
fold0: train=11981 [2010-01-01 00:00:00..2022-12-31 00:00:00] | valid=1670 [2023-01-01 00:00:00..2023-11-30 00:00:00]
fold1: train=11490 [2010-01-01 00:00:00..2021-12-31 00:00:00] | valid=491 [2022-01-01 00:00:00..2022-12-31 00:00:00]
fold2: train=9666 [2010-01-01 00:00:00..2020-12-31 00:00:00] | valid=1824 [2021-01-01 00:00:00..2021-12-31 00:00:00]

=== SEED 42 | task_type=CPU ===
0:	learn: 0.8485167	test: 0.1448394	best: 0.1448394 (0)	total: 753ms	remaining: 1h 40m 21s
250:	learn: 0.9998135	test: 0.5368871	best: 0.5368871 (249)	total: 2m 51s	remaining: 1h 28m
500:	learn: 0.9999270	test: 0.5437375	best: 0.5437375 (4

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Inference, Ensembling, Submission & QA

In [6]:
# ============================================================
# STEP 5 — Inference, Ensembling, Submission & QA (ONE CELL) — FAIL-SAFE
# Behavior:
# - If df_test_like + models exist  -> REAL inference
# - Else                           -> DUMMY submission (SEDANG)
# Output:
# - /kaggle/working/submission.csv
# ============================================================

import numpy as np
import pandas as pd
from catboost import Pool

# ----------------------------
# Guards (minimal)
# ----------------------------
if "sub" not in globals():
    raise RuntimeError("Missing `sub` (sample_submission). Jalankan STEP 1.")

ID_COL = "id" if "id" in sub.columns else sub.columns[0]
SUB_TARGET_COL = "category" if "category" in sub.columns else sub.columns[-1]
MISSING_CAT = "__MISSING__"

print("[INFO] STEP 5 starting...")

# ============================================================
# CASE A — FULL PIPELINE AVAILABLE → REAL INFERENCE
# ============================================================
can_infer = all(k in globals() for k in ["models", "df_test_like", "feature_cols", "cat_cols", "id_to_class"])

if can_infer:
    print("[INFO] Found models + df_test_like → running REAL inference")

    df_test_fe = df_test_like.copy()

    # ensure ID exists
    if ID_COL not in df_test_fe.columns:
        raise RuntimeError(f"{ID_COL} not found in df_test_like")

    # ensure all features exist
    for c in feature_cols:
        if c not in df_test_fe.columns:
            df_test_fe[c] = np.nan

    df_test_fe = df_test_fe[[ID_COL] + feature_cols].reset_index(drop=True)
    X_test = df_test_fe[feature_cols].copy()

    # sanitize categoricals
    for c in cat_cols:
        X_test[c] = X_test[c].where(X_test[c].notna(), MISSING_CAT).astype(str)
        X_test[c] = X_test[c].replace({"nan": MISSING_CAT, "None": MISSING_CAT, "": MISSING_CAT})

    # sanitize numerics
    num_cols = [c for c in X_test.columns if c not in cat_cols]
    for c in num_cols:
        X_test[c] = pd.to_numeric(X_test[c], errors="coerce")

    # ensemble
    K = len(id_to_class)
    proba_ens = np.zeros((len(X_test), K), dtype=np.float32)
    pool = Pool(X_test, cat_features=cat_cols)

    for model in models:
        proba_ens += model.predict_proba(pool) / len(models)

    pred_int = np.argmax(proba_ens, axis=1)
    pred_label = [id_to_class[i] for i in pred_int]

    submission = sub.copy()
    submission = submission.merge(
        df_test_fe[[ID_COL]].assign(_row=np.arange(len(df_test_fe))),
        on=ID_COL,
        how="left"
    ).sort_values("_row")

    submission[SUB_TARGET_COL] = pred_label
    submission = submission.drop(columns="_row")

# ============================================================
# CASE B — MISSING OBJECTS → DUMMY SUBMISSION (TEST ONLY)
# ============================================================
else:
    print("[WARN] Missing df_test_like / models → creating DUMMY submission (TEST ONLY)")

    submission = sub.copy()
    submission[SUB_TARGET_COL] = "SEDANG"  # majority class, safe

# ----------------------------
# QA (ALWAYS)
# ----------------------------
qa = {
    "rows_submission": len(submission),
    "rows_sample": len(sub),
    "id_unique": submission[ID_COL].is_unique,
    "missing_pred": int(submission[SUB_TARGET_COL].isna().sum()),
    "label_distribution": submission[SUB_TARGET_COL].value_counts().to_dict(),
}

print("\n=== QA REPORT ===")
for k, v in qa.items():
    print(f"{k}: {v}")

assert qa["rows_submission"] == qa["rows_sample"], "Row count mismatch"
assert qa["id_unique"], "Duplicate IDs in submission"
assert qa["missing_pred"] == 0, "Missing predictions"

# ----------------------------
# Save
# ----------------------------
OUT_PATH = "/kaggle/working/submission.csv"
submission.to_csv(OUT_PATH, index=False)

print(f"\n[OK] submission saved to {OUT_PATH}")
display(submission.head(10))


[INFO] STEP 5 starting...
[WARN] Missing df_test_like / models → creating DUMMY submission (TEST ONLY)

=== QA REPORT ===
rows_submission: 455
rows_sample: 455
id_unique: True
missing_pred: 0
label_distribution: {'SEDANG': 455}

[OK] submission saved to /kaggle/working/submission.csv


Unnamed: 0,id,category
0,2025-09-01_DKI1,SEDANG
1,2025-09-01_DKI2,SEDANG
2,2025-09-01_DKI3,SEDANG
3,2025-09-01_DKI4,SEDANG
4,2025-09-01_DKI5,SEDANG
5,2025-09-02_DKI1,SEDANG
6,2025-09-02_DKI2,SEDANG
7,2025-09-02_DKI3,SEDANG
8,2025-09-02_DKI4,SEDANG
9,2025-09-02_DKI5,SEDANG
