In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/penyisihan-datavidia-10/sample_submission.csv
/kaggle/input/penyisihan-datavidia-10/ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2012-komponen-data.csv
/kaggle/input/penyisihan-datavidia-10/ISPU/data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-2023-komponen-data.csv
/kaggle/input/penyisihan-datavidia-10/ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2010-komponen-data.csv
/kaggle/input/penyisihan-datavidia-10/ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2018-komponen-data.csv
/kaggle/input/penyisihan-datavidia-10/ISPU/data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-komponen-data-2025.csv
/kaggle/input/penyisihan-datavidia-10/ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2014-komponen-data.csv
/kaggle/input/penyisihan-datavidia-10/ISPU/data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-komponen-data-2024.csv
/kaggle/input/penyisihan-datavidia-10/ISPU/indeks-standar-pencemaran-udara-(ispu)-tahun-2011-kompone

# Data Loading & Sanity Checks

In [4]:
# ============================================================
# STAGE 1 — DATA LOADING & SANITY CHECKS (FINAL / VALID)
# Datavidia 10.0 | Forecasting-aware | Audit-safe | ONE CELL
# ============================================================

import os
import glob
import numpy as np
import pandas as pd

# ------------------
# CONFIG
# ------------------
ROOT = "/kaggle/input/penyisihan-datavidia-10"
ISPU_DIR = f"{ROOT}/ISPU"
CUTOFF_DATE = pd.Timestamp("2025-09-01")
SEED = 42
np.random.seed(SEED)

# ------------------
# 1. LOAD ALL ISPU FILES
# ------------------
ispu_files = sorted(glob.glob(f"{ISPU_DIR}/*.csv"))
assert len(ispu_files) > 0, "ISPU files not found"

dfs = []
for fp in ispu_files:
    d = pd.read_csv(fp)
    d["__source_file"] = os.path.basename(fp)
    dfs.append(d)

df_raw = pd.concat(dfs, ignore_index=True)
print(f"[OK] ISPU raw loaded: {df_raw.shape}")

# ------------------
# 2. STANDARDIZE COLUMN NAMES
# ------------------
df = df_raw.copy()
df.columns = (
    df.columns
      .str.strip()
      .str.lower()
      .str.replace(" ", "_")
      .str.replace("__", "_")
)

# ------------------
# 3. PARSE DATE (HARlAN ONLY)
# NOTE:
# Dataset ISPU harian yang valid hanya tersedia mulai 2022.
# File sebelum itu bersifat agregat / non-harian dan tidak
# dapat digunakan untuk forecasting harian.
# ------------------
if "tanggal" not in df.columns:
    raise RuntimeError("Column 'tanggal' not found (required for daily ISPU)")

df["tanggal"] = pd.to_datetime(df["tanggal"], errors="coerce")
df = df.dropna(subset=["tanggal"])

# enforce FAQ cutoff
df = df[df["tanggal"] < CUTOFF_DATE].copy()

# enforce DAILY VALID PERIOD (important & explicit)
df = df[df["tanggal"] >= "2022-01-01"].copy()

# ------------------
# 4. STANDARDIZE STATION NAME
# ------------------
df["stasiun"] = (
    df["stasiun"]
      .astype(str)
      .str.upper()
      .str.strip()
)

STATION_MAP = {
    "DKI5 KEBON JERUK JAKARTA BARAT": "DKI5 KEBON JERUK",
}
df["stasiun"] = df["stasiun"].replace(STATION_MAP)

# ------------------
# 5. NORMALIZE LABEL (3 CLASSES)
# ------------------
df["kategori"] = (
    df["kategori"]
      .astype(str)
      .str.upper()
      .str.strip()
)

LABEL_MAP = {
    "BAIK": "BAIK",
    "SEDANG": "SEDANG",
    "TIDAK SEHAT": "TIDAK SEHAT",
    "SANGAT TIDAK SEHAT": "TIDAK SEHAT",
    "BERBAHAYA": "TIDAK SEHAT",
}

df["kategori"] = df["kategori"].map(LABEL_MAP)
df = df.dropna(subset=["kategori"])

# ------------------
# 6. REMOVE DUPLICATES
# ------------------
df = df.drop_duplicates(subset=["tanggal", "stasiun"])

# ------------------
# 7. FINAL SANITY CHECKS
# ------------------
print("\n[INFO] Date range (daily ISPU):")
print(df["tanggal"].min(), "→", df["tanggal"].max())

print("\n[INFO] Label distribution:")
print(df["kategori"].value_counts())

print("\n[INFO] Stations:")
print(df["stasiun"].value_counts())

assert df["kategori"].nunique() == 3
assert df["tanggal"].min() >= pd.Timestamp("2022-01-01")

# ------------------
# 8. LOAD AUXILIARY DATA (NO MERGE YET)
# ------------------
df_ndvi = pd.read_csv(f"{ROOT}/NDVI (vegetation index)/indeks-ndvi-jakarta.csv")
df_libur = pd.read_csv(f"{ROOT}/libur-nasional/dataset-libur-nasional-dan-weekend.csv")
df_air = pd.read_csv(f"{ROOT}/kualitas-air-sungai/data-kualitas-air-sungai-komponen-data.csv")
df_pop = pd.read_csv(
    f"{ROOT}/jumlah-penduduk/"
    "data-jumlah-penduduk-provinsi-dki-jakarta-berdasarkan-kelompok-usia-dan-jenis-kelamin-tahun-2013-2021-komponen-data.csv"
)

CUACA_DIR = f"{ROOT}/cuaca-harian"
cuaca_files = sorted(glob.glob(f"{CUACA_DIR}/*.csv"))
df_cuaca = []
for fp in cuaca_files:
    d = pd.read_csv(fp)
    d["stasiun_cuaca"] = os.path.basename(fp).replace(".csv", "")
    df_cuaca.append(d)
df_cuaca = pd.concat(df_cuaca, ignore_index=True)

print("\n[OK] Auxiliary datasets loaded")
print("NDVI:", df_ndvi.shape)
print("Libur:", df_libur.shape)
print("Kualitas Air:", df_air.shape)
print("Penduduk:", df_pop.shape)
print("Cuaca:", df_cuaca.shape)

# ------------------
# 9. EXPORT CLEAN ISPU
# ------------------
df_ispu_clean = df.copy()

print("\nSTAGE 1 VALID COMPLETE")
print("df_ispu_clean is SAFE and READY for STAGE 2")


[OK] ISPU raw loaded: (16902, 24)

[INFO] Date range (daily ISPU):
2022-12-01 00:00:00 → 2023-11-30 00:00:00

[INFO] Label distribution:
kategori
SEDANG         1358
BAIK            236
TIDAK SEHAT     210
Name: count, dtype: int64

[INFO] Stations:
stasiun
DKI2 KELAPA GADING    363
DKI3 JAGAKARSA        361
DKI1 BUNDERAN HI      361
DKI4 LUBANG BUAYA     361
DKI5 KEBON JERUK      358
Name: count, dtype: int64

[OK] Auxiliary datasets loaded
NDVI: (1810, 3)
Libur: (5844, 5)
Kualitas Air: (14400, 12)
Penduduk: (34176, 9)
Cuaca: (28610, 25)

STAGE 1 VALID COMPLETE
df_ispu_clean is SAFE and READY for STAGE 2


# Master Table Building (Correct Joins)

In [11]:
# ============================================================
# STAGE 2 — MASTER TABLE BUILDING (FINAL MERGED, STAGE-3 READY)
# Datavidia 10.0 | Leak-safe | Group-wise merge_asof | ONE CELL
# ============================================================

import pandas as pd

# ============================================================
# 1. BASE TABLE (ISPU as anchor)
# ============================================================
df_base = df_ispu_clean.copy()
df_base["tanggal"] = pd.to_datetime(df_base["tanggal"])

# ============================================================
# 2. JOIN LIBUR NASIONAL (EXACT DATE)
# ============================================================
df_libur2 = df_libur.copy()
df_libur2.columns = (
    df_libur2.columns.str.strip().str.lower().str.replace(" ", "_")
)
df_libur2["tanggal"] = pd.to_datetime(df_libur2["tanggal"])

df_base = df_base.merge(
    df_libur2[["tanggal", "is_holiday_nasional", "is_weekend"]],
    on="tanggal",
    how="left"
)

df_base["is_holiday_nasional"] = df_base["is_holiday_nasional"].fillna(0).astype(int)
df_base["is_weekend"] = df_base["is_weekend"].fillna(0).astype(int)

# ============================================================
# 3. PREPARE WEATHER DATA
# ============================================================
df_cuaca2 = df_cuaca.copy()
df_cuaca2.columns = (
    df_cuaca2.columns.str.strip().str.lower().str.replace(" ", "_")
)

if "time" in df_cuaca2.columns:
    df_cuaca2["tanggal"] = pd.to_datetime(df_cuaca2["time"])
elif "tanggal" in df_cuaca2.columns:
    df_cuaca2["tanggal"] = pd.to_datetime(df_cuaca2["tanggal"])
else:
    raise RuntimeError("No date column in weather data")

df_cuaca2 = df_cuaca2.dropna(subset=["tanggal", "stasiun_cuaca"])

CUACA_MAP = {
    "DKI1 BUNDERAN HI": "cuaca-harian-dki1-bundaranhi",
    "DKI2 KELAPA GADING": "cuaca-harian-dki2-kelapagading",
    "DKI3 JAGAKARSA": "cuaca-harian-dki3-jagakarsa",
    "DKI4 LUBANG BUAYA": "cuaca-harian-dki4-lubangbuaya",
    "DKI5 KEBON JERUK": "cuaca-harian-dki5-kebonjeruk",
}

df_base["stasiun_cuaca"] = df_base["stasiun"].map(CUACA_MAP)
df_base = df_base.dropna(subset=["stasiun_cuaca"])

# ============================================================
# 4. MERGE WEATHER (GROUP-WISE ASOF, NO ERROR)
# ============================================================
out = []
for st in df_base["stasiun_cuaca"].unique():
    left = (
        df_base[df_base["stasiun_cuaca"] == st]
        .sort_values("tanggal")
        .reset_index(drop=True)
    )
    right = (
        df_cuaca2[df_cuaca2["stasiun_cuaca"] == st]
        .sort_values("tanggal")
        .reset_index(drop=True)
    )
    merged = pd.merge_asof(
        left, right,
        on="tanggal",
        direction="backward",
        allow_exact_matches=True
    )
    out.append(merged)

df_master = pd.concat(out, ignore_index=True)

# ============================================================
# 5. MERGE NDVI (GROUP-WISE ASOF)
# ============================================================
df_ndvi2 = df_ndvi.copy()
df_ndvi2.columns = (
    df_ndvi2.columns.str.strip().str.lower().str.replace(" ", "_")
)
df_ndvi2["tanggal"] = pd.to_datetime(df_ndvi2["tanggal"])
df_ndvi2 = df_ndvi2.dropna(subset=["tanggal", "stasiun_id"])

out = []
for st in df_master["stasiun"].unique():
    left = (
        df_master[df_master["stasiun"] == st]
        .sort_values("tanggal")
        .reset_index(drop=True)
    )
    right = (
        df_ndvi2[df_ndvi2["stasiun_id"] == st]
        .sort_values("tanggal")
        .reset_index(drop=True)
    )
    merged = pd.merge_asof(
        left, right,
        on="tanggal",
        direction="backward"
    )
    out.append(merged)

df_master = pd.concat(out, ignore_index=True)

# ============================================================
# 6. MERGE POPULATION (YEAR-LEVEL, STATIC)
# ============================================================
df_pop2 = df_pop.copy()
df_pop2.columns = (
    df_pop2.columns.str.strip().str.lower().str.replace(" ", "_")
)

df_pop_year = (
    df_pop2
    .groupby("tahun", as_index=False)
    .agg(jumlah_penduduk=("jumlah_penduduk", "sum"))
)

df_master["tahun"] = df_master["tanggal"].dt.year
df_master = df_master.merge(df_pop_year, on="tahun", how="left")

# ============================================================
# 7. SELECT FINAL STAGE-2 COLUMNS (CLEAN & READY)
# ============================================================
FINAL_COLS = [
    "tanggal",
    "stasiun",
    "kategori",
    "is_holiday_nasional",
    "is_weekend",
    "temperature_2m_mean",
    "relative_humidity_2m_mean",
    "wind_speed_10m_mean",
    "precipitation_sum",
    "ndvi",
    "jumlah_penduduk",
]

FINAL_COLS = [c for c in FINAL_COLS if c in df_master.columns]
df_stage2 = df_master[FINAL_COLS].copy()

# ============================================================
# 8. FINAL SORT & SANITY CHECK
# ============================================================
df_stage2 = (
    df_stage2
    .sort_values(["stasiun", "tanggal"])
    .reset_index(drop=True)
)

print("[INFO] STAGE 2 FINAL TABLE")
print("Shape:", df_stage2.shape)
print("Date range:", df_stage2["tanggal"].min(), "→", df_stage2["tanggal"].max())
print("\nMissing values:")
print(df_stage2.isna().sum())

print("\nSTAGE 2 FINAL COMPLETE")
print("df_stage2 READY for STAGE 3 (Feature Engineering)")

[INFO] STAGE 2 FINAL TABLE
Shape: (1804, 7)
Date range: 2022-12-01 00:00:00 → 2023-11-30 00:00:00

Missing values:
tanggal                   0
stasiun                   0
kategori                  0
is_holiday_nasional       0
is_weekend                0
ndvi                   1804
jumlah_penduduk        1804
dtype: int64

STAGE 2 FINAL COMPLETE
df_stage2 READY for STAGE 3 (Feature Engineering)


# Feature Engineering (Time-Series + Calendar + Robustness)

In [12]:
# ============================================================
# STAGE 3 — FEATURE ENGINEERING (TIME-SERIES + CALENDAR)
# Forecasting-safe | Robust | ONE CELL
# ============================================================

import numpy as np
import pandas as pd

# ------------------
# 0. COPY & SORT (CRITICAL)
# ------------------
df = df_stage2.copy()
df["tanggal"] = pd.to_datetime(df["tanggal"])
df = df.sort_values(["stasiun", "tanggal"]).reset_index(drop=True)

# ------------------
# 1. TARGET ENCODING (LABEL -> ID)
# ------------------
LABELS = ["BAIK", "SEDANG", "TIDAK SEHAT"]
label_to_id = {k:i for i,k in enumerate(LABELS)}
id_to_label = {i:k for k,i in label_to_id.items()}

df["y"] = df["kategori"].map(label_to_id).astype(int)

# ------------------
# 2. CALENDAR FEATURES (SAFE)
# ------------------
df["dow"] = df["tanggal"].dt.weekday        # 0=Mon
df["week"] = df["tanggal"].dt.isocalendar().week.astype(int)
df["month"] = df["tanggal"].dt.month
df["is_month_start"] = df["tanggal"].dt.is_month_start.astype(int)
df["is_month_end"] = df["tanggal"].dt.is_month_end.astype(int)

# Cyclical encoding
df["dow_sin"] = np.sin(2 * np.pi * df["dow"] / 7)
df["dow_cos"] = np.cos(2 * np.pi * df["dow"] / 7)
df["month_sin"] = np.sin(2 * np.pi * df["month"] / 12)
df["month_cos"] = np.cos(2 * np.pi * df["month"] / 12)

# ------------------
# 3. TIME-SERIES FEATURES (PAST-ONLY)
#    - gunakan label historis sebagai proxy dinamika polusi
# ------------------
LAGS = [1, 2, 3, 7, 14]
ROLLS = [3, 7, 14]

# Lag features
for l in LAGS:
    df[f"y_lag_{l}"] = df.groupby("stasiun")["y"].shift(l)

# Rolling statistics on lagged target
for w in ROLLS:
    grp = df.groupby("stasiun")["y"]
    df[f"y_roll_mean_{w}"] = grp.shift(1).rolling(w).mean()
    df[f"y_roll_std_{w}"]  = grp.shift(1).rolling(w).std()
    df[f"y_roll_min_{w}"]  = grp.shift(1).rolling(w).min()
    df[f"y_roll_max_{w}"]  = grp.shift(1).rolling(w).max()

# Trend-like feature (difference)
df["y_diff_1"] = df.groupby("stasiun")["y"].shift(1) - df.groupby("stasiun")["y"].shift(2)

# ------------------
# 4. ROBUSTNESS FEATURES
# ------------------
# Weekend/Holiday interaction
df["holiday_or_weekend"] = ((df["is_holiday_nasional"] == 1) | (df["is_weekend"] == 1)).astype(int)

# Station identity as categorical (for CatBoost / target encoding later)
df["stasiun_cat"] = df["stasiun"].astype("category")

# ------------------
# 5. DROP ROWS WITH INSUFFICIENT HISTORY
#    (IMPORTANT: avoid leakage & NaN explosion)
# ------------------
MIN_HISTORY = max(max(LAGS), max(ROLLS))
df = df.groupby("stasiun").apply(lambda x: x.iloc[MIN_HISTORY:]).reset_index(drop=True)

# ------------------
# 6. FINAL FEATURE SET
# ------------------
DROP_COLS = [
    "kategori", "stasiun", "tanggal"
]

FEATURE_COLS = [c for c in df.columns if c not in DROP_COLS + ["y"]]

# ------------------
# 7. SANITY CHECK
# ------------------
print("[INFO] STAGE 3 FEATURE TABLE")
print("Shape:", df.shape)
print("Target distribution (y):")
print(df["y"].value_counts())
print("\nMissing values (top 10):")
print(df[FEATURE_COLS].isna().sum().sort_values(ascending=False).head(10))

# ------------------
# 8. EXPORT
# ------------------
df_stage3 = df.copy()

print("\nSTAGE 3 COMPLETE")
print("df_stage3 + FEATURE_COLS READY for STAGE 4 (Model Training)")

[INFO] STAGE 3 FEATURE TABLE
Shape: (1734, 37)
Target distribution (y):
y
1    1304
0     222
2     208
Name: count, dtype: int64

Missing values (top 10):
ndvi                   1734
jumlah_penduduk        1734
is_holiday_nasional       0
is_weekend                0
dow                       0
week                      0
month                     0
is_month_start            0
is_month_end              0
dow_sin                   0
dtype: int64

STAGE 3 COMPLETE
df_stage3 + FEATURE_COLS READY for STAGE 4 (Model Training)


  df = df.groupby("stasiun").apply(lambda x: x.iloc[MIN_HISTORY:]).reset_index(drop=True)


# Model Training 

In [15]:
# ============================================================
# STAGE 4 — MODEL TRAINING (CATBOOST, FIXED TIME-CV)
# Forecasting-safe | No empty fold | Macro-F1 oriented
# ============================================================

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import f1_score

# ------------------
# CONFIG
# ------------------
SEED = 42
N_FOLDS = 5
MIN_TRAIN_DAYS = 120   # minimal history sebelum validasi
TARGET_COL = "y"

np.random.seed(SEED)

# ------------------
# 1. PREPARE DATA
# ------------------
df = df_stage3.copy()
df = df.sort_values(["tanggal", "stasiun"]).reset_index(drop=True)

X = df[FEATURE_COLS]
y = df[TARGET_COL].values
dates = df["tanggal"]

# categorical features (CatBoost)
CAT_COLS = []
if "stasiun_cat" in FEATURE_COLS:
    CAT_COLS.append(FEATURE_COLS.index("stasiun_cat"))

# ------------------
# 2. BUILD SAFE TIME-BASED FOLDS
# ------------------
unique_dates = np.sort(dates.unique())

# validasi hanya setelah MIN_TRAIN_DAYS
valid_start = unique_dates[MIN_TRAIN_DAYS:]

# bagi tanggal validasi jadi N_FOLDS
val_date_splits = np.array_split(valid_start, N_FOLDS)

folds = []
for val_dates in val_date_splits:
    tr_idx = dates < val_dates.min()
    va_idx = dates.isin(val_dates)

    # safety check
    if tr_idx.sum() == 0 or va_idx.sum() == 0:
        continue

    folds.append((tr_idx.values, va_idx.values))

print(f"[INFO] Total usable folds: {len(folds)}")

# ------------------
# 3. CLASS WEIGHTS (IMBALANCE)
# ------------------
class_counts = np.bincount(y)
class_weights = class_counts.sum() / (len(class_counts) * class_counts)
class_weights = class_weights.tolist()

print("[INFO] Class weights:", class_weights)

# ------------------
# 4. TRAIN PER FOLD
# ------------------
oof_pred = np.full(len(df), -1, dtype=int)
models_by_fold = []
fold_scores = []

for fold, (tr_idx, va_idx) in enumerate(folds, 1):
    print(f"\n[INFO] Fold {fold}/{len(folds)}")

    X_tr, y_tr = X.iloc[tr_idx], y[tr_idx]
    X_va, y_va = X.iloc[va_idx], y[va_idx]

    train_pool = Pool(X_tr, y_tr, cat_features=CAT_COLS)
    val_pool   = Pool(X_va, y_va, cat_features=CAT_COLS)

    model = CatBoostClassifier(
        loss_function="MultiClass",
        eval_metric="TotalF1",
        iterations=1500,
        learning_rate=0.05,
        depth=8,
        l2_leaf_reg=5,
        class_weights=class_weights,
        random_seed=SEED,
        early_stopping_rounds=150,
        verbose=200
    )

    model.fit(train_pool, eval_set=val_pool, use_best_model=True)

    preds = model.predict(val_pool).astype(int).ravel()
    oof_pred[va_idx] = preds

    f1 = f1_score(y_va, preds, average="macro")
    fold_scores.append(f1)

    print(f"[INFO] Fold {fold} Macro F1:", round(f1, 5))

    models_by_fold.append(model)

# ------------------
# 5. CV RESULT (ONLY VALID OOF)
# ------------------
valid_oof = oof_pred != -1
cv_macro_f1 = f1_score(y[valid_oof], oof_pred[valid_oof], average="macro")

print("\n==============================")
print("[RESULT] CV Macro F1:", round(cv_macro_f1, 5))
print("[RESULT] Fold scores:", [round(s, 5) for s in fold_scores])
print("==============================")

# ------------------
# 6. EXPORT OOF
# ------------------
df_oof = df.loc[valid_oof, ["tanggal", "stasiun"]].copy()
df_oof["y_true"] = y[valid_oof]
df_oof["y_pred"] = oof_pred[valid_oof]

print("\nSTAGE 4 COMPLETE (FIXED)")
print("models_by_fold + df_oof READY for STAGE 5")

[INFO] Total usable folds: 5
[INFO] Class weights: [2.6036036036036037, 0.4432515337423313, 2.7788461538461537]

[INFO] Fold 1/5
0:	learn: 0.7941981	test: 0.2942814	best: 0.2942814 (0)	total: 77.3ms	remaining: 1m 55s
Stopped by overfitting detector  (150 iterations wait)

bestTest = 0.4287574785
bestIteration = 5

Shrink model to first 6 iterations.
[INFO] Fold 1 Macro F1: 0.38703

[INFO] Fold 2/5
0:	learn: 0.7568778	test: 0.5123793	best: 0.5123793 (0)	total: 11.2ms	remaining: 16.8s
Stopped by overfitting detector  (150 iterations wait)

bestTest = 0.8490305943
bestIteration = 7

Shrink model to first 8 iterations.
[INFO] Fold 2 Macro F1: 0.64206

[INFO] Fold 3/5
0:	learn: 0.7451601	test: 0.7831151	best: 0.7831151 (0)	total: 7.69ms	remaining: 11.5s
200:	learn: 0.8989361	test: 0.8381340	best: 0.8498091 (74)	total: 1.45s	remaining: 9.34s
Stopped by overfitting detector  (150 iterations wait)

bestTest = 0.8498091475
bestIteration = 74

Shrink model to first 75 iterations.
[INFO] Fold 3 M

# Inference, Ensembling, Submission & QA

In [23]:
# ============================================================
# STAGE 5 — INFERENCE, RANK-BASED DECISION, SUBMISSION & QA
# Datavidia 10.0 | FINAL ANTI-COLLAPSE VERSION
# ============================================================

import numpy as np
import pandas as pd
from catboost import Pool

# ------------------
# CONFIG
# ------------------
SAMPLE_SUB_PATH = "/kaggle/input/penyisihan-datavidia-10/sample_submission.csv"
OUT_SUB_PATH = "/kaggle/working/submission.csv"

# target quota (SAFE DEFAULT)
TARGET_DIST = {
    "SEDANG": 0.60,
    "TIDAK SEHAT": 0.25,
    "BAIK": 0.15,
}

# ------------------
# 1. LOAD SAMPLE SUBMISSION
# ------------------
sub = pd.read_csv(SAMPLE_SUB_PATH)
sub["tanggal"] = pd.to_datetime(sub["id"].str.split("_").str[0])
sub["stasiun"] = sub["id"].str.split("_").str[1]

# ------------------
# 2. LAST-KNOWN FEATURES PER STASIUN
# ------------------
df_last = (
    df_stage3
    .sort_values(["stasiun", "tanggal"])
    .groupby("stasiun", as_index=False)
    .tail(1)
    .reset_index(drop=True)
)

df_test = sub.merge(
    df_last.drop(columns=["tanggal", "y"], errors="ignore"),
    on="stasiun",
    how="left"
)

# ensure categorical feature
df_test["stasiun_cat"] = df_test.get("stasiun_cat", df_test["stasiun"]).astype(str)

# ------------------
# 3. CALENDAR FEATURES
# ------------------
df_test["dow"] = df_test["tanggal"].dt.weekday
df_test["month"] = df_test["tanggal"].dt.month
df_test["is_weekend"] = df_test["tanggal"].dt.weekday.isin([5, 6]).astype(int)

# ------------------
# 4. MODEL INFERENCE (PROBABILITY)
# ------------------
X_test = df_test[FEATURE_COLS]
CAT_COLS = [FEATURE_COLS.index("stasiun_cat")] if "stasiun_cat" in FEATURE_COLS else []

test_pool = Pool(X_test, cat_features=CAT_COLS)

proba = None
for m in models_by_fold:
    p = m.predict_proba(test_pool)
    proba = p if proba is None else proba + p

proba = proba / len(models_by_fold)

# ------------------
# 5. RANK-BASED CLASS ASSIGNMENT (KEY FIX)
# ------------------
N = len(df_test)

# hitung kuota
quota = {
    k: int(v * N)
    for k, v in TARGET_DIST.items()
}

# pastikan total pas
quota["SEDANG"] = N - quota["BAIK"] - quota["TIDAK SEHAT"]

# mapping
label_to_idx = label_to_id
idx_to_label = id_to_label

# ranking berdasarkan confidence masing-masing kelas
rank_baik = np.argsort(-proba[:, label_to_idx["BAIK"]])
rank_tidak = np.argsort(-proba[:, label_to_idx["TIDAK SEHAT"]])

assigned = np.full(N, -1)

# assign TIDAK SEHAT
assigned[rank_tidak[:quota["TIDAK SEHAT"]]] = label_to_idx["TIDAK SEHAT"]

# assign BAIK (yang belum terisi)
cnt = 0
for i in rank_baik:
    if assigned[i] == -1:
        assigned[i] = label_to_idx["BAIK"]
        cnt += 1
        if cnt >= quota["BAIK"]:
            break

# sisanya SEDANG
assigned[assigned == -1] = label_to_idx["SEDANG"]

y_pred = [idx_to_label[i] for i in assigned]

# ------------------
# 6. BUILD SUBMISSION
# ------------------
sub_out = sub[["id"]].copy()
sub_out["category"] = y_pred
sub_out.to_csv(OUT_SUB_PATH, index=False)

# ------------------
# 7. QA
# ------------------
print("[QA] Distribution:")
print(pd.Series(y_pred).value_counts())

print("\nSTAGE 5 FINAL COMPLETE — ANTI COLLAPSE")
print("submission.csv READY TO UPLOAD")


[QA] Distribution:
SEDANG         274
TIDAK SEHAT    113
BAIK            68
Name: count, dtype: int64

STAGE 5 FINAL COMPLETE — ANTI COLLAPSE
submission.csv READY TO UPLOAD
