In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the read-only Kaggle input directory.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/Data_Klaim.csv
/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/sample_submission.csv
/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/Data_Polis.csv


# DATA FOUNDATION

In [2]:
# ============================================================
# STAGE 1 v3 — FOUNDATION (BUGFIX + DATASET-AWARE)
# Fix: year_month merge + YYYYMMDD date parsing
# Exposure mode: claimant vs inforce
# ============================================================

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

BASE_PATH = "/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/"

klaim = pd.read_csv(BASE_PATH + "Data_Klaim.csv")
polis = pd.read_csv(BASE_PATH + "Data_Polis.csv")

# =============================
# CLEAN COLUMN NAMES
# =============================
def clean_columns(df):
    """Return a copy of ``df`` whose column labels are normalised to snake_case-like names.

    Each label is converted to ``str``, stripped of surrounding whitespace,
    lower-cased, and has every space, ``/`` and ``-`` replaced by ``_``.
    The input frame is left untouched.
    """
    renamed = df.copy()
    labels = renamed.columns.astype(str).str.strip().str.lower()
    # Literal (non-regex) replacements, applied in the same order as before.
    for separator in (" ", "/", "-"):
        labels = labels.str.replace(separator, "_", regex=False)
    renamed.columns = labels
    return renamed

klaim = clean_columns(klaim)
polis = clean_columns(polis)

# =============================
# PARSE DATES (handle YYYYMMDD int)
# =============================
def parse_mixed_date(s: pd.Series) -> pd.Series:
    """Parse a date column stored either as YYYYMMDD numbers or as date-like strings.

    Parameters
    ----------
    s : pd.Series
        Raw column. A numeric dtype is interpreted as ``YYYYMMDD`` integers
        (e.g. ``20140603``); any other dtype is passed to ``pd.to_datetime``.

    Returns
    -------
    pd.Series
        Datetime series carrying the same index as ``s``. Missing values,
        numbers that are not exactly eight digits, and impossible calendar
        dates all become ``NaT``.
    """
    if pd.api.types.is_numeric_dtype(s):
        # Nullable Int64 renders 20140603.0 as "20140603" and NaN as "<NA>",
        # so float columns (floats because of missing values) still parse.
        digits = s.astype("Int64").astype(str)
        is_yyyymmdd = digits.str.fullmatch(r"\d{8}").fillna(False).astype(bool)

        # Mask the input instead of assigning into a freshly built Series:
        # the result keeps the caller's index, so writing it back into a frame
        # aligns correctly even when the index is not 0..n-1.
        return pd.to_datetime(digits.where(is_yyyymmdd), format="%Y%m%d", errors="coerce")

    # Non-numeric input: let pandas infer the format, coercing failures to NaT.
    return pd.to_datetime(s, errors="coerce")

# Convert every column whose name contains "tanggal" to datetime, in both tables.
for frame in (klaim, polis):
    for col in [name for name in frame.columns if "tanggal" in name]:
        frame[col] = parse_mixed_date(frame[col])

# =============================
# SAFE DEDUP
# =============================
# De-duplicate claims on claim_id when that column exists, otherwise on the full row.
claim_key = ["claim_id"] if "claim_id" in klaim.columns else None
klaim = klaim.drop_duplicates(subset=claim_key).reset_index(drop=True)

# One row per policy number.
polis = polis.drop_duplicates(subset=["nomor_polis"]).reset_index(drop=True)

# =============================
# BASIC CLEANING
# =============================
# A claim is unusable without a policy number or an admission date.
klaim = klaim.dropna(subset=["nomor_polis", "tanggal_pasien_masuk_rs"]).copy()

# Approved amount: non-numeric/missing -> 0, negatives floored at 0.
approved_amount = pd.to_numeric(klaim["nominal_klaim_yang_disetujui"], errors="coerce")
klaim["nominal_klaim_yang_disetujui"] = approved_amount.fillna(0).clip(lower=0)

# Winsorise strictly positive amounts to their own 0.5% / 99.5% quantiles;
# zero amounts are left untouched.
pos = klaim["nominal_klaim_yang_disetujui"] > 0
if pos.any():
    positive_amounts = klaim.loc[pos, "nominal_klaim_yang_disetujui"]
    low_q = positive_amounts.quantile(0.005)
    high_q = positive_amounts.quantile(0.995)
    klaim.loc[pos, "nominal_klaim_yang_disetujui"] = positive_amounts.clip(low_q, high_q)

# =============================
# MERGE
# =============================
# Attach policy attributes to each claim. how="left" keeps every claim row,
# including claims whose policy number has no match in polis.
df = klaim.merge(polis, on="nomor_polis", how="left")

# =============================
# SERVICE MONTH
# =============================
# Monthly period derived from tanggal_pasien_masuk_rs; this is the time key
# used by every monthly aggregate below.
df["year_month"] = df["tanggal_pasien_masuk_rs"].dt.to_period("M")

# Continuous range of months from the first to the last observed service month.
min_m = df["year_month"].min()
max_m = df["year_month"].max()
all_months = pd.period_range(min_m, max_m, freq="M")

# ============================================================
# EXPOSURE OPTIONS (choose one)
# ============================================================
EXPOSURE_MODE = "claimant"  # "claimant" or "inforce"

# Fail fast on a typo: an unknown mode used to fall back silently to claimant exposure.
if EXPOSURE_MODE not in ("claimant", "inforce"):
    raise ValueError(
        f"EXPOSURE_MODE must be 'claimant' or 'inforce', got {EXPOSURE_MODE!r}"
    )

# 1) claimant exposure: unique policies that claim in that month
#    (months without any claim are filled with 0).
expo_claimant = (
    df.groupby("year_month")["nomor_polis"].nunique()
      .reindex(all_months, fill_value=0)
      .rename("exposure_claimant")
      .rename_axis("year_month")
      .reset_index()
)

# 2) inforce exposure: policies effective <= month (no end date in dataset)
start_col = "tanggal_efektif_polis" if "tanggal_efektif_polis" in polis.columns else None

if start_col is not None:
    p = polis[["nomor_polis", start_col]].dropna(subset=[start_col]).copy()
    p["start_m"] = p[start_col].dt.to_period("M")

    # base = policies that started before the first service month
    base = p.loc[p["start_m"] < min_m, "nomor_polis"].nunique()

    # new policies per start month, from the first service month onwards
    inc = (
        p.loc[p["start_m"] >= min_m]
         .groupby("start_m")["nomor_polis"].nunique()
    )

    # running count: base + cumulative new policies over the service-month range
    expo_inforce = (
        (base + inc.reindex(all_months, fill_value=0).cumsum())
        .rename("exposure_inforce")
        .rename_axis("year_month")
        .reset_index()
    )
else:
    # No policy start date available: in-force exposure cannot be computed.
    expo_inforce = expo_claimant[["year_month"]].copy()
    expo_inforce["exposure_inforce"] = 0

# merge exposure tables
expo = expo_claimant.merge(expo_inforce, on="year_month", how="left")

# choose exposure: the mode is a scalar, so select the column directly
# instead of broadcasting a scalar condition through np.where.
chosen_exposure_col = "exposure_inforce" if EXPOSURE_MODE == "inforce" else "exposure_claimant"
expo["exposure"] = expo[chosen_exposure_col]

# merge exposure into df (expo always carries a year_month column)
df = df.merge(expo[["year_month", "exposure"]], on="year_month", how="left")

# compatibility alias so later stages that read "active_policies" keep working
df["active_policies"] = df["exposure"]

# ============================================================
# MONTHLY CORE TABLE
# ============================================================
# Count claims on claim_id when present; otherwise count rows via nomor_polis.
freq_col = "claim_id" if "claim_id" in df.columns else "nomor_polis"

# One row per service month: claim count, approved total and that month's exposure.
# Exposure is constant within a month (merged in per year_month), so "first" picks it up.
monthly = (
    df.groupby("year_month")
      .agg(
          frequency=(freq_col, "count"),
          total_claim=("nominal_klaim_yang_disetujui", "sum"),
          exposure=("exposure", "first")
      )
      .reset_index()
      .sort_values("year_month")
      .reset_index(drop=True)
)

# Zero denominators are replaced by NaN so the ratios become NaN instead of inf.
monthly["severity"] = monthly["total_claim"] / monthly["frequency"].replace(0, np.nan)
monthly["claim_rate"] = monthly["frequency"] / monthly["exposure"].replace(0, np.nan)

# ============================================================
# LOG FEATURES
# ============================================================
monthly["log_total"] = np.log1p(monthly["total_claim"])
monthly["log_freq"]  = np.log1p(monthly["frequency"])
monthly["log_sev"]   = np.log1p(monthly["severity"])
monthly["log_rate"]  = np.log1p(monthly["claim_rate"])

# ============================================================
# VOLATILITY
# ============================================================
# 6-month rolling mean/std of the total (needs at least 3 observations),
# their ratio, and a flag for months whose ratio is above the series median.
monthly["roll6"] = monthly["total_claim"].rolling(6, min_periods=3).mean()
monthly["std6"]  = monthly["total_claim"].rolling(6, min_periods=3).std()
monthly["vol_ratio"] = monthly["std6"] / monthly["roll6"]
# NOTE(review): the median is taken over the whole series, so this flag uses
# information from later months — confirm before using it as a model feature.
monthly["high_vol_regime"] = (monthly["vol_ratio"] > monthly["vol_ratio"].median()).astype(int)

# ============================================================
# TIME FEATURES
# ============================================================
monthly["month"] = monthly["year_month"].dt.month
monthly["month_sin"] = np.sin(2*np.pi*monthly["month"]/12)
monthly["month_cos"] = np.cos(2*np.pi*monthly["month"]/12)
monthly["month_index"] = np.arange(len(monthly))

# ============================================================
# SAFE LAGS
# ============================================================
# Lags 1-3 plus a 3-row mean of the previous rows (shift(1) excludes the current row).
for col in ["log_total", "log_freq", "log_sev", "log_rate"]:
    monthly[f"{col}_lag1"] = monthly[col].shift(1)
    monthly[f"{col}_lag2"] = monthly[col].shift(2)
    monthly[f"{col}_lag3"] = monthly[col].shift(3)
    monthly[f"{col}_roll3"] = monthly[col].shift(1).rolling(3).mean()

# Drops every row that has a NaN in ANY column (lags, rolling stats, ratios).
monthly = monthly.dropna().reset_index(drop=True)

# ============================================================
# FINAL CHECK
# ============================================================
print("EXPOSURE_MODE:", EXPOSURE_MODE)
print("Policy start col:", start_col)
print("Frequency source:", freq_col)
print("Monthly shape:", monthly.shape)
print("Unique months:", monthly["year_month"].nunique())
print("Exposure min/max:", float(monthly["exposure"].min()), float(monthly["exposure"].max()))
print("\nSTAGE 1 v3 â€” READY")


EXPOSURE_MODE: claimant
Policy start col: tanggal_efektif_polis
Frequency source: claim_id
Monthly shape: (16, 34)
Unique months: 16
Exposure min/max: 121.0 158.0

STAGE 1 v3 â€” READY


In [3]:
# Sanity check: claims per unit of exposure for the most recent months.
tmp = monthly.assign(freq_per_exposure=monthly["frequency"] / monthly["exposure"])
print(tmp[["year_month","frequency","exposure","freq_per_exposure"]].tail(10))

ratio = tmp["freq_per_exposure"]
print("freq_per_exposure min/max:", ratio.min(), ratio.max())

   year_month  frequency  exposure  freq_per_exposure
6     2024-10        274       158           1.734177
7     2024-11        270       147           1.836735
8     2024-12        238       133           1.789474
9     2025-01        216       132           1.636364
10    2025-02        246       145           1.696552
11    2025-03        230       126           1.825397
12    2025-04        208       123           1.691057
13    2025-05        239       142           1.683099
14    2025-06        234       124           1.887097
15    2025-07        264       147           1.795918
freq_per_exposure min/max: 1.6363636363636365 1.8870967741935485


# TIME-SERIES DATASET ENGINEERING

In [4]:
# ============================================================
# STAGE 2 — ELITE SEGMENT PANEL (SAFE VERSION)
# No KeyError • Auto-create missing columns • Short series safe
# ============================================================

import numpy as np
import pandas as pd

# ============================================================
# ðŸ”¹ ENSURE REQUIRED SEGMENT COLUMNS EXIST
# ============================================================

# Care type: derived from inpatient_outpatient when available.
if "care_type" not in df.columns:
    if "inpatient_outpatient" in df.columns:
        raw_care = df["inpatient_outpatient"]
        df["care_type"] = (
            raw_care
            .astype(str)
            .str.upper()
            .str.strip()
            # astype(str) turns a missing value into the literal text "NAN",
            # which the fillna below can no longer detect; map those rows to
            # "UNKNOWN" here so missing data does not become its own category.
            .where(raw_care.notna(), "UNKNOWN")
        )
    else:
        df["care_type"] = "UNKNOWN"

df["care_type"] = df["care_type"].fillna("UNKNOWN")


# Cashless flag: 1 when reimburse_cashless equals "C" (case/whitespace-insensitive).
# Missing values become the text "NAN" after astype(str) and therefore map to 0.
if "is_cashless" not in df.columns:
    if "reimburse_cashless" in df.columns:
        rc = df["reimburse_cashless"].astype(str).str.upper().str.strip()
        df["is_cashless"] = rc.eq("C").astype(int)
    else:
        df["is_cashless"] = 0


# Hospital-location bucket: three named countries, everything else
# (including missing values) falls into "OTHER".
if "rs_bucket" not in df.columns:
    if "lokasi_rs" in df.columns:
        loc = df["lokasi_rs"].astype(str).str.upper().str.strip()
        df["rs_bucket"] = np.select(
            [
                loc.eq("INDONESIA"),
                loc.eq("SINGAPORE"),
                loc.eq("MALAYSIA")
            ],
            ["ID","SG","MY"],
            default="OTHER"
        )
    else:
        df["rs_bucket"] = "OTHER"

df["rs_bucket"] = df["rs_bucket"].fillna("OTHER")


# Plan code: created as a constant when the column is absent.
if "plan_code" not in df.columns:
    df["plan_code"] = "UNKNOWN"

df["plan_code"] = df["plan_code"].fillna("UNKNOWN")

# ============================================================
# ðŸ”¹ DEFINE SEGMENT COLUMNS
# ============================================================

seg_cols = ["plan_code","care_type","is_cashless","rs_bucket"]

# ============================================================
# BUILD SEGMENT MONTHLY
# ============================================================
# One row per (month, segment): claim count, approved total and the number
# of distinct policies that claimed.
seg_grouped = df.groupby(["year_month"] + seg_cols).agg(
    frequency=("nomor_polis","count"),
    total_claim=("nominal_klaim_yang_disetujui","sum"),
    exposure=("nomor_polis","nunique")
)
seg_monthly = (
    seg_grouped
    .reset_index()
    .sort_values(seg_cols + ["year_month"])
    .reset_index(drop=True)
)

# ============================================================
# TARGETS
# ============================================================
# Average approved amount per claim; a zero count yields NaN rather than inf.
seg_monthly["severity"] = seg_monthly["total_claim"].div(
    seg_monthly["frequency"].replace(0, np.nan)
)

for log_name, raw_name in (
    ("log_total", "total_claim"),
    ("log_freq", "frequency"),
    ("log_sev", "severity"),
):
    seg_monthly[log_name] = np.log1p(seg_monthly[raw_name])

# ============================================================
# CALENDAR
# ============================================================
seg_monthly["month"] = seg_monthly["year_month"].dt.month
month_angle = 2*np.pi*seg_monthly["month"]/12
seg_monthly["month_sin"] = np.sin(month_angle)
seg_monthly["month_cos"] = np.cos(month_angle)

# ============================================================
# ðŸ”¹ LAGS (STRICT NO LEAKAGE)
# ============================================================

# Per-segment lag features. seg_monthly is sorted by segment then year_month,
# so shift(k) inside a segment returns the value k rows earlier in that segment.
# NOTE(review): lags are positional, not calendar-based — if a segment has no
# row for some month, "lag1" is the previous observed month rather than the
# previous calendar month. Confirm whether the panel should be densified first.
for col in ["log_total","log_freq","log_sev"]:
    
    seg_monthly[f"{col}_lag1"] = \
        seg_monthly.groupby(seg_cols)[col].shift(1)
    
    seg_monthly[f"{col}_lag2"] = \
        seg_monthly.groupby(seg_cols)[col].shift(2)
    
    seg_monthly[f"{col}_lag3"] = \
        seg_monthly.groupby(seg_cols)[col].shift(3)

    # Mean of the three previous rows; shift(1) keeps the current row out of the window.
    seg_monthly[f"{col}_roll3"] = \
        seg_monthly.groupby(seg_cols)[col] \
        .transform(lambda x: x.shift(1).rolling(3).mean())

# ============================================================
# MOMENTUM
# ============================================================

# Change in log total between the two previous rows of the segment.
seg_monthly["momentum_total"] = (
    seg_monthly["log_total_lag1"] -
    seg_monthly["log_total_lag2"]
)

# ============================================================
# ðŸ”¹ SEGMENT WEIGHT
# ============================================================

# Share of the month's total claim count that belongs to each segment.
month_claim_count = seg_monthly.groupby("year_month")["frequency"].transform("sum")
seg_monthly["seg_weight"] = (seg_monthly["frequency"] / month_claim_count).fillna(0)

# ============================================================
# SAFE TRAIN WINDOW
# ============================================================
# Keep only rows that have a full three-row history, then zero-fill what is left.
has_lag3 = seg_monthly["log_total_lag3"].notna()
seg_model = seg_monthly[has_lag3].reset_index(drop=True).fillna(0)

# ============================================================
# FINAL CHECK
# ============================================================
n_segments = seg_model[seg_cols].drop_duplicates().shape[0]

print("COMPACT PANEL SHAPE:", seg_model.shape)
print("Unique segments:", n_segments)
print("Columns:", len(seg_model.columns))
print("\nSTAGE 2 â€” ELITE SEGMENT PANEL READY")

COMPACT PANEL SHAPE: (414, 29)
Unique segments: 41
Columns: 29

STAGE 2 â€” ELITE SEGMENT PANEL READY


# MODEL DEVELOPMENT

In [5]:
# ============================================================
# STAGE 3 v15 — KAGGLE-MATCH VALIDATION (HORIZON MATCH + DIRECT FREQ)
# - Horizon = unique months in sample_submission (usually 5)
# - Predict TOTAL and FREQUENCY directly (ETS), then derive SEVERITY
# - True recursive (refit each step on simulated history)
# ============================================================

import numpy as np
import pandas as pd
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import warnings
warnings.filterwarnings("ignore")

BASE_PATH = "/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/"
sample_sub = pd.read_csv(BASE_PATH + "sample_submission.csv")

def mape(y_true, y_pred):
    """Mean absolute percentage error, expressed in percent.

    Positions where the true value is exactly 0 are excluded from the mean.
    Returns ``np.nan`` when no position has a non-zero true value.
    """
    actual = np.array(y_true, dtype=float)
    forecast = np.array(y_pred, dtype=float)

    scorable = actual != 0
    if not scorable.any():
        return np.nan

    relative_error = np.abs((actual[scorable] - forecast[scorable]) / actual[scorable])
    return np.mean(relative_error) * 100

# ==============================
# BUILD MONTHLY (consistent with Stage 1 v3)
# ==============================
# Monthly aggregate: claim count, approved total and the month's exposure.
monthly_agg = df.groupby("year_month").agg(
    frequency=("claim_id","count"),
    total_claim=("nominal_klaim_yang_disetujui","sum"),
    exposure=("active_policies","first")
)
monthly = (
    monthly_agg
    .reset_index()
    .sort_values("year_month")
    .reset_index(drop=True)
)

# Zero denominators become NaN so the ratios are NaN instead of inf.
monthly["severity"]   = monthly["total_claim"].div(monthly["frequency"].replace(0,np.nan))
monthly["claim_rate"] = monthly["frequency"].div(monthly["exposure"].replace(0,np.nan))

# ==============================
# HORIZON = months in sample_submission (Kaggle behavior)
# ==============================
# The first two "_"-separated tokens of each submission id are read as year and month.
id_tokens = sample_sub["id"].str.split("_")
sample_sub["year"]  = id_tokens.str[0]
sample_sub["month"] = id_tokens.str[1]
sample_sub["month_key"] = sample_sub["year"] + "-" + sample_sub["month"]

future_periods = pd.PeriodIndex(sample_sub["month_key"], freq="M").unique().sort_values()

# Horizon = number of distinct submission months, capped so that at least
# 6 months remain for training (and never below 1).
H = min(len(future_periods), max(1, len(monthly) - 6))

# ==============================
# SPLIT (Kaggle-match horizon)
# ==============================
# Hold out the last H months for validation; everything before is training history.
train = monthly.iloc[:-H].copy()
valid = monthly.iloc[-H:].copy()

# sim_df grows by one simulated month per step (recursive forecasting).
sim_df = train.copy()

pred_total = []
pred_freq  = []
pred_sev   = []

# ==============================
# RECURSIVE FORECAST H steps
# ==============================
for step in range(H):

    train_sim = sim_df.copy()

    # -------- TOTAL: ETS on log1p(total_claim) --------
    # Catch Exception (not a bare except) so Ctrl-C / SystemExit still stop
    # the cell; on any fitting error fall back to the last known value.
    try:
        model_total = ExponentialSmoothing(
            np.log1p(train_sim["total_claim"]),
            trend="add",
            damped_trend=True,
            seasonal=None
        ).fit()
        total_pred = float(np.expm1(model_total.forecast(1).iloc[0]))
    except Exception:
        total_pred = float(train_sim["total_claim"].iloc[-1])

    # soft shrink towards the mean of the last 3 months (80% model / 20% anchor)
    total_anchor = float(train_sim["total_claim"].tail(3).mean())
    total_pred = 0.80 * total_pred + 0.20 * total_anchor
    total_pred = max(total_pred, 1.0)

    # -------- FREQ: ETS on log1p(frequency) (DIRECT) --------
    try:
        model_freq = ExponentialSmoothing(
            np.log1p(train_sim["frequency"]),
            trend="add",
            damped_trend=True,
            seasonal=None
        ).fit()
        freq_pred = float(np.expm1(model_freq.forecast(1).iloc[0]))
    except Exception:
        freq_pred = float(train_sim["frequency"].iloc[-1])

    # soft shrink freq too (85% model / 15% anchor) to avoid wild jumps in short series
    freq_anchor = float(train_sim["frequency"].tail(3).mean())
    freq_pred = 0.85 * freq_pred + 0.15 * freq_anchor
    freq_pred = max(freq_pred, 1.0)

    # -------- SEVERITY derived --------
    # freq_pred is floored at 1.0 above, so this division is always defined.
    sev_pred = total_pred / freq_pred

    pred_total.append(total_pred)
    pred_freq.append(freq_pred)
    pred_sev.append(sev_pred)

    # update sim_df (recursive history)
    # exposure: carry the last known value forward (not used for freq anymore)
    exposure_next = float(train_sim["exposure"].iloc[-1]) if "exposure" in train_sim.columns else np.nan
    new_row = {
        "frequency": freq_pred,
        "total_claim": total_pred,
        "exposure": exposure_next,
        "severity": sev_pred,
        "claim_rate": (freq_pred / exposure_next) if (exposure_next and exposure_next > 0) else np.nan
    }

    # The simulated row has no year_month; only the value columns are needed by the next fit.
    sim_df = pd.concat([sim_df, pd.DataFrame([new_row])], ignore_index=True)

# ==============================
# REPORT
# ==============================
# Hold-out MAPE (in percent) for each of the three targets.
m_freq, m_tot, m_sev = (
    mape(valid[target], preds)
    for target, preds in (
        ("frequency", pred_freq),
        ("total_claim", pred_total),
        ("severity", pred_sev),
    )
)

print("\n==============================")
print(f"Horizon months used : {H}")
print("STAGE 3 v15 MAPE Frequency :", round(m_freq, 4))
print("STAGE 3 v15 MAPE Total     :", round(m_tot, 4))
print("STAGE 3 v15 MAPE Severity  :", round(m_sev, 4))
print("Estimated Score            :", round(np.nanmean([m_freq, m_tot, m_sev]), 4))
print("==============================")

# optional: side-by-side preview of actuals vs predictions for the hold-out months
check = valid[["year_month","frequency","total_claim","severity"]].assign(
    pred_frequency=pred_freq,
    pred_total=pred_total,
    pred_severity=pred_sev,
)
print("\nPreview last horizon months:")
print(check)



Horizon months used : 5
STAGE 3 v15 MAPE Frequency : 5.7278
STAGE 3 v15 MAPE Total     : 6.5396
STAGE 3 v15 MAPE Severity  : 4.901
Estimated Score            : 5.7228

Preview last horizon months:
   year_month  frequency   total_claim      severity  pred_frequency  \
14    2025-03        230  1.297738e+10  5.642340e+07      236.301461   
15    2025-04        208  1.094443e+10  5.261747e+07      235.954524   
16    2025-05        239  1.222689e+10  5.115852e+07      236.731912   
17    2025-06        234  1.227084e+10  5.243948e+07      236.181420   
18    2025-07        264  1.289196e+10  4.883318e+07      236.073148   

      pred_total  pred_severity  
14  1.179829e+10   4.992898e+07  
15  1.181956e+10   5.009255e+07  
16  1.195624e+10   5.050539e+07  
17  1.173822e+10   4.970001e+07  
18  1.172368e+10   4.966122e+07  


# TOTAL CLAIM OPTIMIZATION & VALIDATION, OPTUNA

In [6]:
# ============================================================
# STAGE 4 v17 — KAGGLE-MATCH OPTUNA (TOTAL + FREQ) • TRUE RECURSIVE • HORIZON=SUB
# - Horizon is derived automatically from sample_submission (usually 5)
# - Hybrid ETS + LightGBM untuk total & frequency
# - Severity = total/freq (derived)
# ============================================================

!pip install -q optuna lightgbm statsmodels

import optuna
import numpy as np
import pandas as pd
import lightgbm as lgb
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import warnings
warnings.filterwarnings("ignore")

BASE_PATH = "/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/"
sample_sub = pd.read_csv(BASE_PATH + "sample_submission.csv")

# ==============================
# STRICT MAPE (return fraction)
# ==============================
def mape_frac(y_true, y_pred):
    """Mean absolute percentage error as a fraction (0.05 means 5%).

    Positions where the true value is exactly 0 are excluded from the mean.
    Returns ``np.nan`` when no position has a non-zero true value.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)

    scorable = actual != 0
    if not scorable.any():
        return np.nan

    relative_error = np.abs((actual[scorable] - forecast[scorable]) / actual[scorable])
    return np.mean(relative_error)

# ==============================
# BUILD MONTHLY (TOTAL + FREQ)
# ==============================
# Monthly aggregate: claim count and approved total per service month.
monthly_agg = df.groupby("year_month").agg(
    frequency=("claim_id","count"),
    total_claim=("nominal_klaim_yang_disetujui","sum")
)
monthly = (
    monthly_agg
    .reset_index()
    .sort_values("year_month")
    .reset_index(drop=True)
)

# Log-scale targets used by both ETS and LightGBM.
for log_name, raw_name in (("log_total", "total_claim"), ("log_freq", "frequency")):
    monthly[log_name] = np.log1p(monthly[raw_name])

# Cyclical month-of-year encoding.
monthly["month"] = monthly["year_month"].dt.month
month_angle = 2*np.pi*monthly["month"]/12
monthly["month_sin"] = np.sin(month_angle)
monthly["month_cos"] = np.cos(month_angle)

# ==============================
# HORIZON EXACTLY LIKE KAGGLE (from sample_sub)
# ==============================
# The first two "_"-separated tokens of each submission id are read as year and month.
id_tokens = sample_sub["id"].str.split("_")
sample_sub["year"]  = id_tokens.str[0]
sample_sub["month"] = id_tokens.str[1]
sample_sub["month_key"] = sample_sub["year"] + "-" + sample_sub["month"]

future_periods = pd.PeriodIndex(sample_sub["month_key"], freq="M").unique().sort_values()

# Horizon = distinct submission months (usually 5), capped so that at least
# 8 months remain for training (and never below 1).
H = min(len(future_periods), max(1, len(monthly) - 8))

# Last H months are held out for validation.
train_full = monthly.iloc[:-H].copy()
valid_full = monthly.iloc[-H:].copy()

# ==============================
# Feature builder (short series safe)
# ==============================
def build_features(sim_df: pd.DataFrame):
    """Add lag features for ``log_total`` and ``log_freq`` and drop incomplete rows.

    For each of the two series this adds lags 1-3 (``t_lag*`` / ``f_lag*``) and
    the mean of the three previous rows (``t_roll3`` / ``f_roll3``). The input
    frame is not modified.

    Rows containing a missing value in ANY column are removed, so besides the
    first three rows (no lag history) this also removes rows that are missing
    an unrelated column such as ``year_month``. The index of the result is reset.
    """
    feats = sim_df.copy()

    for prefix, source in (("t", "log_total"), ("f", "log_freq")):
        history = feats[source]
        for k in (1, 2, 3):
            feats[f"{prefix}_lag{k}"] = history.shift(k)
        # shift(1) keeps the current row out of its own rolling window
        feats[f"{prefix}_roll3"] = history.shift(1).rolling(3).mean()

    return feats.dropna().reset_index(drop=True)

# Model inputs: cyclical month encoding plus the lag columns produced by
# build_features for the total ("t_*") and the frequency ("f_*") series.
feat_total = ["month_sin","month_cos","t_lag1","t_lag2","t_lag3","t_roll3"]
feat_freq  = ["month_sin","month_cos","f_lag1","f_lag2","f_lag3","f_roll3"]

# ==============================
# OPTUNA OBJECTIVE (TRUE RECURSIVE)
# ==============================
def objective(trial):
    """Optuna objective: average hold-out MAPE of the recursive hybrid forecast.

    Starting from ``train_full``, the function forecasts ``H`` months one step
    at a time. Each step fits a damped-trend ETS model on the log total and the
    log frequency, optionally blends each with a LightGBM prediction built on
    lag features, shrinks the result towards a recent anchor, and appends the
    prediction to the simulated history used by the next step.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies the blend weights, shrink factors and LightGBM settings.

    Returns
    -------
    float
        Mean of the total, frequency and severity MAPE (as fractions, NaNs
        ignored) against ``valid_full``.
    """
    # blend weights: share given to the ETS forecast, the rest goes to LightGBM
    alpha_t  = trial.suggest_float("alpha_total", 0.20, 0.90)
    alpha_f  = trial.suggest_float("alpha_freq",  0.20, 0.90)

    # shrink: share kept from the model forecast, the rest goes to the recent anchor
    shrink_t = trial.suggest_float("shrink_total", 0.80, 0.99)
    shrink_f = trial.suggest_float("shrink_freq",  0.70, 0.98)

    # LGBM params (simple, short series safe)
    lr     = trial.suggest_float("lr", 0.01, 0.06)
    leaves = trial.suggest_int("leaves", 3, 12)
    min_leaf = trial.suggest_int("min_leaf", 2, 8)

    # Both regressors share the same settings.
    lgbm_params = dict(
        n_estimators=350,
        learning_rate=lr,
        num_leaves=leaves,
        min_data_in_leaf=min_leaf,
        random_state=42,
        verbosity=-1,
    )

    sim_raw = train_full.copy()
    pred_tot, pred_freq, pred_sev = [], [], []

    for step in range(H):

        # Calendar month of the step being forecast (wraps December -> January).
        next_month = int(sim_raw["month"].iloc[-1]) % 12 + 1
        next_sin = np.sin(2*np.pi*next_month/12)
        next_cos = np.cos(2*np.pi*next_month/12)

        # ---------- ETS TOTAL ----------
        # Catch Exception (not a bare except) so Ctrl-C / SystemExit still
        # abort the study; on a fitting error fall back to the last value.
        try:
            ets_t = ExponentialSmoothing(
                sim_raw["log_total"],
                trend="add",
                damped_trend=True,
                seasonal=None
            ).fit()
            pred_ets_total = float(np.expm1(ets_t.forecast(1).iloc[0]))
        except Exception:
            pred_ets_total = float(sim_raw["total_claim"].iloc[-1])

        # ---------- ETS FREQ ----------
        try:
            ets_f = ExponentialSmoothing(
                sim_raw["log_freq"],
                trend="add",
                damped_trend=True,
                seasonal=None
            ).fit()
            pred_ets_freq = float(np.expm1(ets_f.forecast(1).iloc[0]))
        except Exception:
            pred_ets_freq = float(sim_raw["frequency"].iloc[-1])

        # ---------- ML TRAIN (needs lags) ----------
        sim_ml = build_features(sim_raw)

        # if too few rows after lagging, fallback to ETS-only
        if len(sim_ml) < 6:
            total_pred = pred_ets_total
            freq_pred  = pred_ets_freq
        else:
            mdl_t = lgb.LGBMRegressor(**lgbm_params)
            mdl_f = lgb.LGBMRegressor(**lgbm_params)

            mdl_t.fit(sim_ml[feat_total], sim_ml["log_total"])
            mdl_f.fit(sim_ml[feat_freq],  sim_ml["log_freq"])

            # ---------- BUILD NEXT FEATURES ----------
            # Lags for the next step are the last three rows of the raw history;
            # their mean matches the shift(1).rolling(3) feature used in training.
            X_t = pd.DataFrame([{
                "month_sin": next_sin, "month_cos": next_cos,
                "t_lag1": float(sim_raw["log_total"].iloc[-1]),
                "t_lag2": float(sim_raw["log_total"].iloc[-2]),
                "t_lag3": float(sim_raw["log_total"].iloc[-3]),
                "t_roll3": float(sim_raw["log_total"].tail(3).mean())
            }])

            X_f = pd.DataFrame([{
                "month_sin": next_sin, "month_cos": next_cos,
                "f_lag1": float(sim_raw["log_freq"].iloc[-1]),
                "f_lag2": float(sim_raw["log_freq"].iloc[-2]),
                "f_lag3": float(sim_raw["log_freq"].iloc[-3]),
                "f_roll3": float(sim_raw["log_freq"].tail(3).mean())
            }])

            pred_ml_total = float(np.expm1(mdl_t.predict(X_t)[0]))
            pred_ml_freq  = float(np.expm1(mdl_f.predict(X_f)[0]))

            # ---------- HYBRID ----------
            total_pred = alpha_t * pred_ets_total + (1 - alpha_t) * pred_ml_total
            freq_pred  = alpha_f * pred_ets_freq  + (1 - alpha_f) * pred_ml_freq

        # ---------- SHRINK (stability anchor) ----------
        # Anchor: median of the last 3 totals, mean of the last 3 frequencies.
        total_anchor = float(sim_raw["total_claim"].tail(3).median())
        freq_anchor  = float(sim_raw["frequency"].tail(3).mean())

        total_pred = shrink_t * total_pred + (1 - shrink_t) * total_anchor
        freq_pred  = shrink_f * freq_pred  + (1 - shrink_f) * freq_anchor

        # Floor at 1 so the severity division below is always defined.
        total_pred = max(float(total_pred), 1.0)
        freq_pred  = max(float(freq_pred),  1.0)

        sev_pred = total_pred / freq_pred

        pred_tot.append(total_pred)
        pred_freq.append(freq_pred)
        pred_sev.append(sev_pred)

        # append recursive: the simulated month becomes history for the next step
        sim_raw = pd.concat([sim_raw, pd.DataFrame([{
            "year_month": None,
            "frequency": freq_pred,
            "total_claim": total_pred,
            "log_total": np.log1p(total_pred),
            "log_freq": np.log1p(freq_pred),
            "month": next_month,
            "month_sin": next_sin,
            "month_cos": next_cos,
        }])], ignore_index=True)

    # targets
    y_tot = valid_full["total_claim"].values
    y_fre = valid_full["frequency"].values
    y_sev = valid_full["total_claim"].values / valid_full["frequency"].replace(0, np.nan).values

    mape_total = mape_frac(y_tot, pred_tot)
    mape_freq  = mape_frac(y_fre, pred_freq)
    mape_sev   = mape_frac(y_sev, pred_sev)

    return float(np.nanmean([mape_total, mape_freq, mape_sev]))

# ==============================
# RUN OPTUNA
# ==============================
# Seed the sampler so that Restart & Run All reproduces the same trials and
# the same best parameters; the default sampler is unseeded.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=150)

# The objective returns a fraction, so it is scaled to percent for display.
print("\n==============================")
print("Horizon used:", H)
print("Best Params :", study.best_params)
print("Best Score  :", round(study.best_value * 100, 4), "% (avg MAPE)")
print("==============================")


[I 2026-02-19 13:55:11,083] A new study created in memory with name: no-name-54e5a016-bc78-437c-9a1d-5a4963756245
[I 2026-02-19 13:55:11,974] Trial 0 finished with value: 0.0672318268855752 and parameters: {'alpha_total': 0.5934877887930567, 'alpha_freq': 0.4030068453322443, 'shrink_total': 0.8336995200059756, 'shrink_freq': 0.9655055428590823, 'lr': 0.04407345641263193, 'leaves': 3, 'min_leaf': 4}. Best is trial 0 with value: 0.0672318268855752.
[I 2026-02-19 13:55:12,870] Trial 1 finished with value: 0.07529689035391057 and parameters: {'alpha_total': 0.2376007801143759, 'alpha_freq': 0.39392949544485567, 'shrink_total': 0.8442670207758197, 'shrink_freq': 0.842471528165669, 'lr': 0.05568038511397685, 'leaves': 7, 'min_leaf': 3}. Best is trial 0 with value: 0.0672318268855752.
[I 2026-02-19 13:55:13,725] Trial 2 finished with value: 0.06587517721024146 and parameters: {'alpha_total': 0.38844133165093997, 'alpha_freq': 0.6344025532926659, 'shrink_total': 0.8823577590681825, 'shrink_fre


Horizon used: 5
Best Params : {'alpha_total': 0.2008107289732955, 'alpha_freq': 0.8852393138219012, 'shrink_total': 0.9894243114577745, 'shrink_freq': 0.7344782821209231, 'lr': 0.04163964069213416, 'leaves': 9, 'min_leaf': 6}
Best Score  : 4.7091 % (avg MAPE)


# TEST PREDICTION & KAGGLE SUBMISSION

In [7]:
# ============================================================
# STAGE 5 v18 — DEFENSIVE (DIRECT TOTAL + DIRECT FREQ) • TRUE RECURSIVE • KAGGLE-MATCH
# - Horizon/future months come from sample_submission
# - Predict TOTAL + FREQUENCY directly (ETS + optional LGBM hybrid)
# - Severity = total/freq (derived)
# - Defensive clamp for public stability
# ============================================================

!pip install -q lightgbm statsmodels

import numpy as np
import pandas as pd
import lightgbm as lgb
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import warnings
warnings.filterwarnings("ignore")

# Dataset root on Kaggle (the same value the Stage 1 cell assigns to BASE_PATH).
BASE_PATH = "/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/"
# Submission template; its `id` column is parsed further down in this cell to
# derive the months to forecast, and it is reused as the frame for the output.
sample_sub = pd.read_csv(BASE_PATH + "sample_submission.csv")

# ==============================
# (OPTIONAL) PASTE BEST PARAMS FROM STAGE 4 v17
# If you haven't run Stage 4, keep defaults below.
# ==============================
# Hyper-parameters consumed by the recursive forecast loop in this cell.
# NOTE(review): the Stage 4 run printed above reports a different best
# parameter set; the values here are the hand-set defaults — confirm whether
# the tuned values were meant to be pasted in.
BEST = dict(
    # Blend weights: share given to the ETS forecast (the rest goes to ML).
    alpha_total=0.55,
    alpha_freq=0.55,
    # Shrink weights: share kept from the blended forecast (the rest goes to
    # the recent-history anchor).
    shrink_total=0.95,
    shrink_freq=0.92,
    # LightGBM settings: learning rate, num_leaves, min_data_in_leaf.
    lr=0.03,
    leaves=6,
    min_leaf=4,
)

# ==============================
# BUILD MONTHLY
# ==============================
# One row per `year_month` of `df` (built in an earlier cell):
#   frequency   = number of `claim_id` entries in the month
#   total_claim = sum of `nominal_klaim_yang_disetujui` in the month
# followed by log1p-transformed targets and the calendar month.
# `.dt.month` is used below, so `year_month` is assumed to be datetime-like
# or period-typed — TODO confirm against the Stage 1 cell.
monthly = (
    df.groupby("year_month")
    .agg(
        frequency=("claim_id", "count"),
        total_claim=("nominal_klaim_yang_disetujui", "sum"),
    )
    .reset_index()
    .sort_values("year_month")
    .reset_index(drop=True)
    .assign(
        log_total=lambda m: np.log1p(m["total_claim"]),
        log_freq=lambda m: np.log1p(m["frequency"]),
        month=lambda m: m["year_month"].dt.month,
    )
)

# Cyclical encoding of the calendar month (period of 12).
month_angle = 2 * np.pi * monthly["month"] / 12
monthly["month_sin"] = np.sin(month_angle)
monthly["month_cos"] = np.cos(month_angle)

# ==============================
# FUTURE PERIODS (from sample_submission)
# ==============================
# Submission ids start with "<year>_<month>_" (e.g. "2025_08_Claim_Frequency"),
# so the first two "_"-separated tokens identify the month to forecast.
id_tokens = sample_sub["id"].str.split("_")
sample_sub["year"] = id_tokens.str[0]
sample_sub["month"] = id_tokens.str[1]
sample_sub["month_key"] = sample_sub["year"] + "-" + sample_sub["month"]

# Distinct months requested by the template, in chronological order.
requested_months = pd.PeriodIndex(sample_sub["month_key"], freq="M")
future_periods = requested_months.unique().sort_values()

# ==============================
# FEATURE BUILDER (lags) — short series safe
# ==============================
def build_features(sim_df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``sim_df`` extended with lag features.

    For ``log_total`` (prefix ``t``) and ``log_freq`` (prefix ``f``) this adds
    ``<prefix>_lag1..3`` (the value 1, 2 and 3 rows back) and
    ``<prefix>_roll3`` (the mean of those three previous rows). Rows that
    contain any NaN — which includes the first three rows, whose lags are
    undefined — are dropped and the index is renumbered from 0.

    The input frame is not modified.
    """
    frame = sim_df.copy()

    # Same recipe for both targets; totals first so the column order is stable.
    for prefix, source_col in (("t", "log_total"), ("f", "log_freq")):
        history = frame[source_col]
        for k in (1, 2, 3):
            frame[f"{prefix}_lag{k}"] = history.shift(k)
        # shift(1) first so the window only ever sees strictly past rows
        frame[f"{prefix}_roll3"] = history.shift(1).rolling(3).mean()

    return frame.dropna().reset_index(drop=True)

# Model inputs: seasonal encoding plus the lag features of the matching target.
feat_total = [
    "month_sin", "month_cos",
    "t_lag1", "t_lag2", "t_lag3", "t_roll3",
]
feat_freq = [
    "month_sin", "month_cos",
    "f_lag1", "f_lag2", "f_lag3", "f_roll3",
]

# Working history for the recursive simulation: a NaN-free copy of `monthly`
# that the forecast loop extends by one row per predicted month.
sim_raw = monthly.copy().dropna().reset_index(drop=True)

# Maps submission id -> predicted value; filled by the forecast loop.
predictions = dict()

# ==============================
# RECURSIVE PREDICTION
# ==============================
# Tunable constants of the recursive forecast (same values as the literals
# previously written inline).
N_ESTIMATORS = 350               # boosting rounds for both LightGBM models
MIN_ML_ROWS = 6                  # lag-complete rows required before ML is used
ANCHOR_WINDOW = 3                # recent months used for the shrink anchors
CLAMP_WINDOW = 6                 # recent months used for the clamp bounds
CLAMP_LOW, CLAMP_HIGH = 0.90, 1.10  # slack applied to the window min / max


def _ets_next(log_series, label):
    """One-step-ahead ETS forecast of a log1p series, returned on the raw scale.

    Fits an additive damped-trend, non-seasonal exponential smoothing model.
    If fitting or forecasting raises, the last observed value is used instead
    (best-effort behaviour kept from the original code), but the failure is
    reported rather than silently swallowed, and KeyboardInterrupt/SystemExit
    are no longer caught.
    """
    try:
        fitted = ExponentialSmoothing(
            log_series,
            trend="add",
            damped_trend=True,
            seasonal=None
        ).fit()
        return float(np.expm1(fitted.forecast(1).iloc[0]))
    except Exception as err:
        # warnings are globally silenced in this notebook, so use print
        print(f"ETS fit failed for {label} ({err!r}); using last observed value")
        return float(np.expm1(log_series.iloc[-1]))


def _ml_next(train, history, target, prefix, features, month_sin, month_cos):
    """Train a LightGBM regressor on `train` and predict the next month.

    `train` is the lag-complete table from build_features; `history` is the raw
    monthly table whose last three `target` values become the lag inputs of
    the new row. The prediction is returned on the raw (expm1) scale.
    """
    model = lgb.LGBMRegressor(
        n_estimators=N_ESTIMATORS,
        learning_rate=float(BEST["lr"]),
        num_leaves=int(BEST["leaves"]),
        min_data_in_leaf=int(BEST["min_leaf"]),
        random_state=42,
        verbosity=-1
    )
    model.fit(train[features], train[target])

    recent = history[target]
    next_row = {
        "month_sin": month_sin,
        "month_cos": month_cos,
        f"{prefix}_lag1": float(recent.iloc[-1]),
        f"{prefix}_lag2": float(recent.iloc[-2]),
        f"{prefix}_lag3": float(recent.iloc[-3]),
        # mirrors build_features: mean of the three rows preceding the new one
        f"{prefix}_roll3": float(recent.tail(3).mean()),
    }
    # select by `features` so the column order always matches training
    X_new = pd.DataFrame([next_row])[features]
    return float(np.expm1(model.predict(X_new)[0]))


for period in future_periods:

    # Calendar month comes from the period being forecast, not from
    # "last history month + 1", so the seasonal encoding stays correct even
    # if the first requested period does not directly follow the history.
    next_month = int(period.month)
    ms = np.sin(2*np.pi*next_month/12)
    mc = np.cos(2*np.pi*next_month/12)

    # ---- ETS (total and frequency) ----
    pred_ets_total = _ets_next(sim_raw["log_total"], "total")
    pred_ets_freq  = _ets_next(sim_raw["log_freq"], "freq")

    # ---- ML (optional; fall back to ETS when too few lag-complete rows) ----
    sim_ml = build_features(sim_raw)
    use_ml = (len(sim_ml) >= MIN_ML_ROWS)
    if use_ml:
        pred_ml_total = _ml_next(sim_ml, sim_raw, "log_total", "t", feat_total, ms, mc)
        pred_ml_freq  = _ml_next(sim_ml, sim_raw, "log_freq",  "f", feat_freq,  ms, mc)
    else:
        pred_ml_total = pred_ets_total
        pred_ml_freq  = pred_ets_freq

    # ---- HYBRID: alpha * ETS + (1 - alpha) * ML ----
    alpha_t = float(BEST["alpha_total"])
    alpha_f = float(BEST["alpha_freq"])
    total_pred = alpha_t * pred_ets_total + (1 - alpha_t) * pred_ml_total
    freq_pred  = alpha_f * pred_ets_freq  + (1 - alpha_f) * pred_ml_freq

    # ---- SHRINK toward recent anchors (median for total, mean for freq) ----
    total_anchor = float(sim_raw["total_claim"].tail(ANCHOR_WINDOW).median())
    freq_anchor  = float(sim_raw["frequency"].tail(ANCHOR_WINDOW).mean())

    shrink_t = float(BEST["shrink_total"])
    shrink_f = float(BEST["shrink_freq"])
    total_pred = shrink_t * total_pred + (1 - shrink_t) * total_anchor
    freq_pred  = shrink_f * freq_pred  + (1 - shrink_f) * freq_anchor

    # ---- DEFENSIVE CLAMP (reduce public gap) ----
    # keep each forecast within [0.90 * min, 1.10 * max] of the recent window
    recent_total = sim_raw["total_claim"].tail(CLAMP_WINDOW)
    recent_freq  = sim_raw["frequency"].tail(CLAMP_WINDOW)

    t_min = float(recent_total.min()) * CLAMP_LOW
    t_max = float(recent_total.max()) * CLAMP_HIGH
    total_pred = float(np.clip(total_pred, t_min, t_max))

    f_min = float(recent_freq.min()) * CLAMP_LOW
    f_max = float(recent_freq.max()) * CLAMP_HIGH
    freq_pred = float(np.clip(freq_pred, f_min, f_max))

    # floor at 1 so the severity ratio below is always well defined
    total_pred = max(total_pred, 1.0)
    freq_pred  = max(freq_pred,  1.0)

    sev_pred = total_pred / freq_pred

    # ---- STORE under the submission id scheme "<YYYY>_<MM>_<metric>" ----
    key = f"{period.year}_{period.month:02d}"
    predictions[f"{key}_Total_Claim"] = total_pred
    predictions[f"{key}_Claim_Frequency"] = freq_pred
    predictions[f"{key}_Claim_Severity"] = sev_pred

    # ---- APPEND to sim_raw (recursive history) ----
    # The forecast becomes the newest observation, so the next iteration's
    # ETS fit, lags, anchors and clamp window all include it.
    sim_raw = pd.concat([sim_raw, pd.DataFrame([{
        "year_month": period,
        "frequency": freq_pred,
        "total_claim": total_pred,
        "log_total": np.log1p(total_pred),
        "log_freq": np.log1p(freq_pred),
        "month": next_month,
        "month_sin": ms,
        "month_cos": mc
    }])], ignore_index=True)

# ==============================
# BUILD SUBMISSION
# ==============================
# Fill the template: every submission id is looked up in `predictions`.
submission = sample_sub.assign(value=sample_sub["id"].map(predictions))

# Ids without a prediction map to NaN; report how many there are.
missing = int(submission["value"].isna().sum())
print("NaN in submission:", missing)

# Keep only the two submission columns (drops the helper columns that were
# added to sample_sub while parsing the ids) and write the file.
submission = submission.loc[:, ["id", "value"]]
submission.to_csv("submission.csv", index=False)

print("Submission created â€” v18 (direct total+freq, derived severity)")
print(submission.head(9))


NaN in submission: 0
Submission created â€” v18 (direct total+freq, derived severity)
                        id         value
0  2025_08_Claim_Frequency  2.322080e+02
1   2025_08_Claim_Severity  5.133559e+07
2      2025_08_Total_Claim  1.192053e+10
3  2025_09_Claim_Frequency  2.300225e+02
4   2025_09_Claim_Severity  5.022248e+07
5      2025_09_Total_Claim  1.155230e+10
6  2025_10_Claim_Frequency  2.446288e+02
7   2025_10_Claim_Severity  5.042825e+07
8      2025_10_Total_Claim  1.233620e+10


In [8]:
print(submission.head(9))  # re-print the first nine rows of the submission built in the previous cell

                        id         value
0  2025_08_Claim_Frequency  2.322080e+02
1   2025_08_Claim_Severity  5.133559e+07
2      2025_08_Total_Claim  1.192053e+10
3  2025_09_Claim_Frequency  2.300225e+02
4   2025_09_Claim_Severity  5.022248e+07
5      2025_09_Total_Claim  1.155230e+10
6  2025_10_Claim_Frequency  2.446288e+02
7   2025_10_Claim_Severity  5.042825e+07
8      2025_10_Total_Claim  1.233620e+10
