In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/Data_Klaim.csv
/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/sample_submission.csv
/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/Data_Polis.csv


# DATA FOUNDATION

In [2]:
# ============================================================
# STAGE 1 v4 — FOUNDATION (DATASET-AWARE + NO TARGET DISTORTION)
# - Fix YYYYMMDD parsing
# - Keep RAW nominal for target (total_claim)
# - Put winsorization into separate column (optional features)
# - Build monthly with complete month range (fill missing months)
# - Exposure: claimant / inforce (optional)
# ============================================================

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

BASE_PATH = "/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/"
klaim = pd.read_csv(BASE_PATH + "Data_Klaim.csv")
polis = pd.read_csv(BASE_PATH + "Data_Polis.csv")

# =============================
# CLEAN COLUMN NAMES
# =============================
def clean_columns(df):
    """Return a copy of ``df`` with normalised column labels.

    Every label is converted to ``str``, stripped of surrounding whitespace,
    lower-cased, and has each space, slash and hyphen replaced by an
    underscore. The input frame itself is not modified.
    """
    # One translation table replaces the three single-character substitutions.
    separators = str.maketrans({" ": "_", "/": "_", "-": "_"})

    renamed = df.copy()
    labels = renamed.columns.astype(str)
    renamed.columns = labels.str.strip().str.lower().str.translate(separators)
    return renamed

klaim = clean_columns(klaim)
polis = clean_columns(polis)

# =============================
# DATE PARSING (handle YYYYMMDD int + dd/mm/yyyy)
# =============================
def parse_mixed_date(s: pd.Series) -> pd.Series:
    """Parse a column that mixes several date representations.

    Handled forms:
      * 8-digit ``YYYYMMDD`` values (stored as integers or as text),
      * text containing ``/`` (parsed day-first, e.g. ``dd/mm/yyyy``),
      * any other text (left to ``pd.to_datetime`` inference).

    Missing markers (``<NA>``, ``nan``, ``None``, ``NaT``) and unparseable
    values become ``NaT``.

    Parameters
    ----------
    s : pd.Series
        Raw date column. It is not modified.

    Returns
    -------
    pd.Series
        ``datetime64[ns]`` series carrying the same index as ``s``.

    Notes
    -----
    Results are written back by position, so the function also works when
    ``s`` has a non-unique index (label-based assignment would not).
    """
    # Normalise everything to text so the pattern checks below are uniform.
    if pd.api.types.is_numeric_dtype(s):
        # Nullable Int64 avoids a trailing ".0" and renders NaN as "<NA>".
        text = s.astype("Int64").astype(str)
    else:
        text = s.astype(str).str.strip()

    text = text.replace({"<NA>": np.nan, "nan": np.nan, "None": np.nan, "NaT": np.nan})

    # Positional boolean masks (plain numpy) — independent of index labels.
    present = text.notna().to_numpy(dtype=bool)
    compact = text.str.fullmatch(r"\d{8}", na=False).to_numpy(dtype=bool)
    slashed = text.str.contains("/", regex=False, na=False).to_numpy(dtype=bool)

    raw = text.to_numpy(dtype=object)
    parsed = np.full(len(raw), np.datetime64("NaT"), dtype="datetime64[ns]")

    def _parse_into(mask, **to_datetime_kwargs):
        """Parse the rows selected by ``mask`` and store them in ``parsed``."""
        if not mask.any():
            return
        converted = pd.to_datetime(
            pd.Series(raw[mask], dtype=object), errors="coerce", **to_datetime_kwargs
        )
        parsed[mask] = converted.to_numpy(dtype="datetime64[ns]")

    remaining = present & ~compact
    _parse_into(compact, format="%Y%m%d")          # YYYYMMDD
    _parse_into(remaining & slashed, dayfirst=True)  # dd/mm/yyyy style
    _parse_into(remaining & ~slashed)                # anything else

    return pd.Series(parsed, index=s.index)

# Parse every column whose name contains "tanggal" in both source tables.
for frame in (klaim, polis):
    for col in [name for name in frame.columns if "tanggal" in name]:
        frame[col] = parse_mixed_date(frame[col])

# =============================
# SAFE DEDUP
# =============================
# First claim-identifier column that exists in klaim, or None when there is none.
claim_id_col = next(
    (name for name in ("claim_id", "id_klaim", "klaim_id") if name in klaim.columns),
    None,
)

# With an identifier, de-duplicate on it; otherwise only drop fully identical rows
# (subset=None is the drop_duplicates default, i.e. all columns).
dedup_keys = None if claim_id_col is None else [claim_id_col]
klaim = klaim.drop_duplicates(subset=dedup_keys).reset_index(drop=True)

# One row per policy number.
polis = polis.drop_duplicates(subset=["nomor_polis"]).reset_index(drop=True)

# =============================
# BASIC CLEANING
# =============================
# choose service date column
service_col = "tanggal_pasien_masuk_rs" if "tanggal_pasien_masuk_rs" in klaim.columns else None
if service_col is None:
    # fallback: first tanggal* column
    tcols = [c for c in klaim.columns if "tanggal" in c]
    service_col = tcols[0] if len(tcols) else None

if service_col is None:
    raise ValueError("No tanggal column found in klaim for building year_month.")

klaim = klaim.dropna(subset=["nomor_polis", service_col]).copy()

# nominal column
nom_col = "nominal_klaim_yang_disetujui"
if nom_col not in klaim.columns:
    # fallback: try find 'nominal' column
    cand = [c for c in klaim.columns if "nominal" in c]
    if len(cand) == 0:
        raise ValueError("No nominal column found in klaim.")
    nom_col = cand[0]

# IMPORTANT: keep RAW nominal for target
raw_nom = pd.to_numeric(klaim[nom_col], errors="coerce").fillna(0).clip(lower=0)
klaim[nom_col] = raw_nom

# OPTIONAL: winsorized copy for feature engineering (NOT for target)
klaim["nominal_klaim_clip"] = raw_nom.copy()
pos = klaim["nominal_klaim_clip"] > 0
if pos.any():
    low_q  = klaim.loc[pos, "nominal_klaim_clip"].quantile(0.005)
    high_q = klaim.loc[pos, "nominal_klaim_clip"].quantile(0.995)
    klaim.loc[pos, "nominal_klaim_clip"] = klaim.loc[pos, "nominal_klaim_clip"].clip(low_q, high_q)

# =============================
# MERGE
# =============================
df = klaim.merge(polis, on="nomor_polis", how="left")

# =============================
# SERVICE MONTH
# =============================
df["year_month"] = df[service_col].dt.to_period("M")

min_m = df["year_month"].min()
max_m = df["year_month"].max()
all_months = pd.period_range(min_m, max_m, freq="M")

# ============================================================
# EXPOSURE OPTIONS
# ============================================================
EXPOSURE_MODE = "inforce"  # "claimant" or "inforce"

# claimant exposure: unique policies that claim in that month
expo_claimant = (
    df.groupby("year_month")["nomor_polis"].nunique()
      .reindex(all_months, fill_value=0)
      .rename("exposure_claimant")
      .rename_axis("year_month")
      .reset_index()
)

# inforce exposure: cumulative started policies (no end date available)
start_col = None
for c in ["tanggal_efektif_polis", "tanggal_mulai_polis", "tanggal_mulai"]:
    if c in polis.columns:
        start_col = c
        break

if start_col is not None:
    p = polis[["nomor_polis", start_col]].dropna(subset=[start_col]).copy()
    p["start_m"] = p[start_col].dt.to_period("M")

    base = p.loc[p["start_m"] < min_m, "nomor_polis"].nunique()
    inc = p.loc[p["start_m"] >= min_m].groupby("start_m")["nomor_polis"].nunique()

    expo_inforce = (
        (base + inc.reindex(all_months, fill_value=0).cumsum())
        .rename("exposure_inforce")
        .rename_axis("year_month")
        .reset_index()
    )
else:
    expo_inforce = expo_claimant[["year_month"]].copy()
    expo_inforce["exposure_inforce"] = 0

expo = expo_claimant.merge(expo_inforce, on="year_month", how="left")

# choose exposure with fallback safety
expo["exposure"] = np.where(EXPOSURE_MODE == "inforce", expo["exposure_inforce"], expo["exposure_claimant"])
# if inforce is mostly 0 (bad parsing / missing), fallback to claimant
if (EXPOSURE_MODE == "inforce") and (expo["exposure"].sum() == 0):
    expo["exposure"] = expo["exposure_claimant"]

# merge exposure into df (keperluan stage lain)
df = df.merge(expo[["year_month", "exposure"]], on="year_month", how="left")
df["active_policies"] = df["exposure"]

# ============================================================
# MONTHLY CORE TABLE (complete months)
# target total_claim MUST be RAW nominal
# ============================================================
freq_col = claim_id_col if claim_id_col is not None else "nomor_polis"

monthly_core = (
    df.groupby("year_month")
      .agg(
          frequency=(freq_col, "count"),
          total_claim=(nom_col, "sum")
      )
      .reindex(all_months, fill_value=0)
      .rename_axis("year_month")
      .reset_index()
)

monthly = monthly_core.merge(expo[["year_month", "exposure"]], on="year_month", how="left")

monthly["severity"] = monthly["total_claim"] / monthly["frequency"].replace(0, np.nan)
monthly["claim_rate"] = monthly["frequency"] / monthly["exposure"].replace(0, np.nan)

# ============================================================
# LOG FEATURES
# ============================================================
monthly["log_total"] = np.log1p(monthly["total_claim"])
monthly["log_freq"]  = np.log1p(monthly["frequency"])
monthly["log_sev"]   = np.log1p(monthly["severity"])
monthly["log_rate"]  = np.log1p(monthly["claim_rate"])

# ============================================================
# VOLATILITY
# ============================================================
monthly["roll6"] = monthly["total_claim"].rolling(6, min_periods=3).mean()
monthly["std6"]  = monthly["total_claim"].rolling(6, min_periods=3).std()
monthly["vol_ratio"] = monthly["std6"] / monthly["roll6"]
monthly["high_vol_regime"] = (monthly["vol_ratio"] > monthly["vol_ratio"].median()).astype(int)

# ============================================================
# TIME FEATURES
# ============================================================
monthly["month"] = monthly["year_month"].dt.month
monthly["month_sin"] = np.sin(2*np.pi*monthly["month"]/12)
monthly["month_cos"] = np.cos(2*np.pi*monthly["month"]/12)
monthly["month_index"] = np.arange(len(monthly))

# ============================================================
# SAFE LAGS
# ============================================================
for col in ["log_total", "log_freq", "log_sev", "log_rate"]:
    monthly[f"{col}_lag1"] = monthly[col].shift(1)
    monthly[f"{col}_lag2"] = monthly[col].shift(2)
    monthly[f"{col}_lag3"] = monthly[col].shift(3)
    monthly[f"{col}_roll3"] = monthly[col].shift(1).rolling(3).mean()

monthly = monthly.dropna().reset_index(drop=True)

# ============================================================
# FINAL CHECK
# ============================================================
print("SERVICE_COL:", service_col)
print("EXPOSURE_MODE:", EXPOSURE_MODE)
print("Policy start col:", start_col)
print("Frequency source:", freq_col)
print("Monthly shape:", monthly.shape)
print("Unique months:", monthly["year_month"].nunique())
print("Exposure min/max:", float(monthly["exposure"].min()), float(monthly["exposure"].max()))
print("Total_claim min/max:", float(monthly["total_claim"].min()), float(monthly["total_claim"].max()))
print("\nSTAGE 1 v4 â€” READY")

SERVICE_COL: tanggal_pasien_masuk_rs
EXPOSURE_MODE: inforce
Policy start col: tanggal_efektif_polis
Frequency source: claim_id
Monthly shape: (16, 34)
Unique months: 16
Exposure min/max: 4096.0 4096.0
Total_claim min/max: 9610379678.55 17480540371.87

STAGE 1 v4 â€” READY


In [3]:
tmp = monthly.copy()
tmp["freq_per_exposure"] = tmp["frequency"] / tmp["exposure"]
print(tmp[["year_month","frequency","exposure","freq_per_exposure"]].tail(10))
print("freq_per_exposure min/max:",
      tmp["freq_per_exposure"].min(),
      tmp["freq_per_exposure"].max())

   year_month  frequency  exposure  freq_per_exposure
6     2024-10        274      4096           0.066895
7     2024-11        270      4096           0.065918
8     2024-12        238      4096           0.058105
9     2025-01        216      4096           0.052734
10    2025-02        246      4096           0.060059
11    2025-03        230      4096           0.056152
12    2025-04        208      4096           0.050781
13    2025-05        239      4096           0.058350
14    2025-06        234      4096           0.057129
15    2025-07        264      4096           0.064453
freq_per_exposure min/max: 0.05078125 0.06689453125


# TIME-SERIES DATASET ENGINEERING

In [4]:
# ============================================================
# STAGE 2 — ELITE SEGMENT PANEL (SAFE VERSION)
# No KeyError â€¢ Auto-create missing columns â€¢ Short series safe
# ============================================================

import numpy as np
import pandas as pd

# ============================================================
# ðŸ”¹ ENSURE REQUIRED SEGMENT COLUMNS EXIST
# ============================================================

# Care Type
# Derive care_type from inpatient_outpatient when it is not already present.
if "care_type" not in df.columns:
    if "inpatient_outpatient" in df.columns:
        raw_care = df["inpatient_outpatient"]
        care = raw_care.astype(str).str.upper().str.strip()
        # astype(str) turns missing values into the literal text "nan"/"None",
        # which a later fillna() cannot see. Map them to "UNKNOWN" explicitly
        # so they do not end up as a bogus "NAN" segment.
        df["care_type"] = care.where(raw_care.notna(), "UNKNOWN")
    else:
        df["care_type"] = "UNKNOWN"

# Covers the case where care_type already existed and contains real NaN values.
df["care_type"] = df["care_type"].fillna("UNKNOWN")


# Cashless
if "is_cashless" not in df.columns:
    if "reimburse_cashless" in df.columns:
        rc = df["reimburse_cashless"].astype(str).str.upper().str.strip()
        df["is_cashless"] = rc.eq("C").astype(int)
    else:
        df["is_cashless"] = 0


# RS Bucket
if "rs_bucket" not in df.columns:
    if "lokasi_rs" in df.columns:
        loc = df["lokasi_rs"].astype(str).str.upper().str.strip()
        df["rs_bucket"] = np.select(
            [
                loc.eq("INDONESIA"),
                loc.eq("SINGAPORE"),
                loc.eq("MALAYSIA")
            ],
            ["ID","SG","MY"],
            default="OTHER"
        )
    else:
        df["rs_bucket"] = "OTHER"

df["rs_bucket"] = df["rs_bucket"].fillna("OTHER")


# Plan Code
if "plan_code" not in df.columns:
    df["plan_code"] = "UNKNOWN"

df["plan_code"] = df["plan_code"].fillna("UNKNOWN")

# ============================================================
# ðŸ”¹ DEFINE SEGMENT COLUMNS
# ============================================================

seg_cols = ["plan_code","care_type","is_cashless","rs_bucket"]

# ============================================================
# ðŸ”¹ BUILD SEGMENT MONTHLY
# ============================================================

seg_monthly = (
    df.groupby(["year_month"] + seg_cols)
      .agg(
          frequency=("nomor_polis","count"),
          total_claim=("nominal_klaim_yang_disetujui","sum"),
          exposure=("nomor_polis","nunique")
      )
      .reset_index()
      .sort_values(seg_cols + ["year_month"])
      .reset_index(drop=True)
)

# ============================================================
# ðŸ”¹ TARGETS
# ============================================================

seg_monthly["severity"] = (
    seg_monthly["total_claim"] /
    seg_monthly["frequency"].replace(0, np.nan)
)

seg_monthly["log_total"] = np.log1p(seg_monthly["total_claim"])
seg_monthly["log_freq"]  = np.log1p(seg_monthly["frequency"])
seg_monthly["log_sev"]   = np.log1p(seg_monthly["severity"])

# ============================================================
# ðŸ”¹ CALENDAR
# ============================================================

seg_monthly["month"] = seg_monthly["year_month"].dt.month
seg_monthly["month_sin"] = np.sin(2*np.pi*seg_monthly["month"]/12)
seg_monthly["month_cos"] = np.cos(2*np.pi*seg_monthly["month"]/12)

# ============================================================
# ðŸ”¹ LAGS (STRICT NO LEAKAGE)
# ============================================================

# Per-segment lag features. Every feature for a row only uses earlier rows
# of the same segment (shift >= 1), so the current month never leaks in.
# NOTE(review): shift() moves by rows, not by calendar months. seg_monthly is
# built from a plain groupby without month completion, so if a segment has no
# claims in some month, "lag1" is the previous *observed* month — confirm
# whether the panel should be reindexed to a full month range first.
for col in ["log_total", "log_freq", "log_sev"]:
    by_segment = seg_monthly.groupby(seg_cols)[col]

    for lag in (1, 2, 3):
        seg_monthly[f"{col}_lag{lag}"] = by_segment.shift(lag)

    # Mean of the three months preceding the current row.
    seg_monthly[f"{col}_roll3"] = by_segment.transform(
        lambda x: x.shift(1).rolling(3).mean()
    )

# ============================================================
# ðŸ”¹ MOMENTUM
# ============================================================

seg_monthly["momentum_total"] = (
    seg_monthly["log_total_lag1"] -
    seg_monthly["log_total_lag2"]
)

# ============================================================
# ðŸ”¹ SEGMENT WEIGHT
# ============================================================

seg_monthly["seg_weight"] = (
    seg_monthly["frequency"] /
    seg_monthly.groupby("year_month")["frequency"].transform("sum")
).fillna(0)

# ============================================================
# ðŸ”¹ SAFE TRAIN WINDOW
# ============================================================

seg_model = seg_monthly[
    seg_monthly["log_total_lag3"].notna()
].reset_index(drop=True)

seg_model = seg_model.fillna(0)

# ============================================================
# FINAL CHECK
# ============================================================

print("COMPACT PANEL SHAPE:", seg_model.shape)
print("Unique segments:", seg_model[seg_cols].drop_duplicates().shape[0])
print("Columns:", len(seg_model.columns))
print("\nSTAGE 2 â€” ELITE SEGMENT PANEL READY")

COMPACT PANEL SHAPE: (414, 29)
Unique segments: 41
Columns: 29

STAGE 2 â€” ELITE SEGMENT PANEL READY


# MODEL DEVELOPMENT

In [5]:
# ============================================================
# STAGE 3 v17 — KAGGLE-MATCH VALIDATION (AUTO-TUNED SHRINK)
# - Horizon = unique months in sample_submission (usually 5)
# - Predict TOTAL & FREQ directly (ETS on log1p), derive SEVERITY
# - True recursive (refit each step on simulated history)
# - Auto grid-search shrink weights + anchor type (mean/median)
# ============================================================

import numpy as np
import pandas as pd
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import warnings
warnings.filterwarnings("ignore")

BASE_PATH = "/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/"
sample_sub = pd.read_csv(BASE_PATH + "sample_submission.csv")

def mape(y_true, y_pred):
    """Mean absolute percentage error, in percent.

    Points are ignored when the actual value is zero (division by zero) or
    when either the actual or the predicted value is NaN/inf. Without the
    finiteness check a single NaN actual (e.g. severity of a month with zero
    claims) would turn the whole score into NaN. This matches the masking used
    by ``mape_frac``.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers with the same length.

    Returns
    -------
    float
        MAPE * 100 over the usable points, or ``np.nan`` when none is usable.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)
    usable = np.isfinite(actual) & np.isfinite(forecast) & (actual != 0)
    if not usable.any():
        return np.nan
    return np.mean(np.abs((actual[usable] - forecast[usable]) / actual[usable])) * 100

# ==============================
# BUILD MONTHLY (same sources and month completion as Stage 1 v4)
# ==============================
# freq_col / nom_col are the columns Stage 1 resolved (with fallbacks), so this
# table counts and sums exactly the same fields instead of hard-coded names.
monthly = (
    df.groupby("year_month")
      .agg(
          frequency=(freq_col, "count"),
          total_claim=(nom_col, "sum"),
          exposure=("active_policies", "first")
      )
      .sort_index()
)

# Complete the month range so that "last H rows" really means "last H calendar
# months"; months without claims get frequency/total 0 and the nearest exposure.
if len(monthly):
    full_range = pd.period_range(monthly.index.min(), monthly.index.max(), freq="M")
    monthly = monthly.reindex(full_range)
    monthly[["frequency", "total_claim"]] = monthly[["frequency", "total_claim"]].fillna(0)
    monthly["exposure"] = monthly["exposure"].ffill().bfill()

monthly = monthly.rename_axis("year_month").reset_index()

monthly["severity"]   = monthly["total_claim"] / monthly["frequency"].replace(0, np.nan)
monthly["claim_rate"] = monthly["frequency"] / monthly["exposure"].replace(0, np.nan)

# ==============================
# HORIZON = months in sample_submission (Kaggle behavior)
# ==============================
sample_sub["year"]  = sample_sub["id"].str.split("_").str[0]
sample_sub["month"] = sample_sub["id"].str.split("_").str[1]
sample_sub["month_key"] = sample_sub["year"] + "-" + sample_sub["month"]

future_periods = (
    pd.PeriodIndex(sample_sub["month_key"], freq="M")
      .unique()
      .sort_values()
)

H = int(len(future_periods))
H = min(H, max(1, len(monthly) - 6))  # safety

# ==============================
# SIMULATOR (true recursive)
# ==============================
def simulate(train_df, H, wt_total, wt_freq, anchor_total="mean", anchor_freq="mean"):
    """Recursive H-step forecast of monthly total claim and claim frequency.

    At every step, and separately for ``total_claim`` and ``frequency``:
      1. fit a damped additive-trend ETS on ``log1p`` of the history and take
         the 1-step forecast (back-transformed with ``expm1``); if the fit
         fails, fall back to the last observed value;
      2. compute an anchor = mean or median of the last 3 history values;
      3. blend ``wt * ets + (1 - wt) * anchor`` and floor the result at 1.0;
      4. append the prediction to the history used by the next step.
    Severity is derived as ``total / frequency`` of the blended predictions.

    Parameters
    ----------
    train_df : pd.DataFrame
        Must contain the columns ``total_claim`` and ``frequency`` in
        chronological order. It is not modified.
    H : int
        Number of steps to forecast.
    wt_total, wt_freq : float
        Weight of the ETS forecast in the blend for total / frequency.
    anchor_total, anchor_freq : str
        ``"median"`` uses the median of the last 3 values; any other value
        uses the mean.

    Returns
    -------
    tuple[list[float], list[float], list[float]]
        ``(pred_total, pred_freq, pred_sev)``, each of length ``H``.
    """

    def _blend_next(history, ets_weight, anchor_kind):
        """One blended 1-step forecast for a single series (list of floats)."""
        level = pd.Series(history, dtype=float)
        try:
            fitted = ExponentialSmoothing(
                np.log1p(level),
                trend="add",
                damped_trend=True,
                seasonal=None
            ).fit()
            ets_fc = float(np.expm1(fitted.forecast(1).iloc[0]))
        except Exception:
            # Deliberate best-effort: any fitting problem falls back to a naive
            # last-value forecast. `Exception` (not a bare except) keeps
            # KeyboardInterrupt / SystemExit working.
            ets_fc = float(level.iloc[-1])

        recent = level.tail(3)
        anchor = float(recent.median()) if anchor_kind == "median" else float(recent.mean())

        blended = ets_weight * ets_fc + (1 - ets_weight) * anchor
        return max(float(blended), 1.0)

    # Only these two series influence the result, so the simulated history is
    # kept as two plain lists instead of re-building a DataFrame every step.
    totals = train_df["total_claim"].astype(float).tolist()
    freqs = train_df["frequency"].astype(float).tolist()

    pred_total, pred_freq, pred_sev = [], [], []

    for _ in range(H):
        total_pred = _blend_next(totals, wt_total, anchor_total)
        freq_pred = _blend_next(freqs, wt_freq, anchor_freq)
        sev_pred = total_pred / freq_pred  # freq_pred >= 1.0, so never divides by zero

        pred_total.append(total_pred)
        pred_freq.append(freq_pred)
        pred_sev.append(sev_pred)

        # True recursion: the next step sees this step's predictions as history.
        totals.append(total_pred)
        freqs.append(freq_pred)

    return pred_total, pred_freq, pred_sev

# ==============================
# SPLIT (Kaggle-match horizon)
# ==============================
train = monthly.iloc[:-H].copy()
valid = monthly.iloc[-H:].copy()

# ==============================
# AUTO SEARCH (small grid, fast)
# ==============================
wt_total_grid = [0.35, 0.45, 0.55, 0.60, 0.65, 0.75, 0.85]
wt_freq_grid  = [0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80]

# Best configuration found so far; "preds" keeps the winning forecasts so the
# simulation does not have to be re-run after the search.
best = {
    "score": 1e18,
    "params": None,
    "detail": None,
    "preds": None
}

for wt_t in wt_total_grid:
    for wt_f in wt_freq_grid:
        for a_t in ["mean", "median"]:
            for a_f in ["mean", "median"]:

                pt, pf, ps = simulate(train, H, wt_t, wt_f, a_t, a_f)

                mf = mape(valid["frequency"], pf)
                mt = mape(valid["total_claim"], pt)
                ms = mape(valid["severity"], ps)
                # Average of the three MAPEs, ignoring any that are NaN.
                avg = float(np.nanmean([mf, mt, ms]))

                # NaN never compares as smaller, so NaN-scored configs are skipped.
                if avg < best["score"]:
                    best["score"] = avg
                    best["params"] = (wt_t, wt_f, a_t, a_f)
                    best["detail"] = (mf, mt, ms)
                    best["preds"] = (pt, pf, ps)

# ==============================
# RUN BEST + REPORT
# ==============================
if best["params"] is None:
    # Every configuration scored NaN; fail with a clear message instead of an
    # opaque "cannot unpack NoneType" error below.
    raise ValueError(
        "Grid search found no configuration with a finite validation score; "
        "check `valid` for zero/NaN actuals and the forecasts for NaN."
    )

wt_t, wt_f, a_t, a_f = best["params"]
pt, pf, ps = best["preds"]

mf, mt, ms = best["detail"]
avg = best["score"]

print("\n==============================")
print(f"Horizon months used : {H}")
print("Best Config:")
print(f"  wt_total={wt_t} (ETS weight), anchor_total={a_t}")
print(f"  wt_freq ={wt_f} (ETS weight), anchor_freq ={a_f}")
print("------------------------------")
print("STAGE 3 v17 MAPE Frequency :", round(mf, 4))
print("STAGE 3 v17 MAPE Total     :", round(mt, 4))
print("STAGE 3 v17 MAPE Severity  :", round(ms, 4))
print("Estimated Score            :", round(avg, 4))
print("==============================")

check = valid[["year_month","frequency","total_claim","severity"]].copy()
check["pred_frequency"] = pf
check["pred_total"] = pt
check["pred_severity"] = ps
print("\nPreview last horizon months:")
print(check)


Horizon months used : 5
Best Config:
  wt_total=0.85 (ETS weight), anchor_total=median
  wt_freq =0.2 (ETS weight), anchor_freq =mean
------------------------------
STAGE 3 v17 MAPE Frequency : 5.1557
STAGE 3 v17 MAPE Total     : 7.9753
STAGE 3 v17 MAPE Severity  : 4.7684
Estimated Score            : 5.9665

Preview last horizon months:
   year_month  frequency   total_claim      severity  pred_frequency  \
14    2025-03        230  1.367924e+10  5.947496e+07      234.031716   
15    2025-04        208  1.116425e+10  5.367427e+07      232.851773   
16    2025-05        239  1.222680e+10  5.115814e+07      237.225688   
17    2025-06        234  1.337312e+10  5.715008e+07      234.888808   
18    2025-07        264  1.369923e+10  5.189101e+07      235.077202   

      pred_total  pred_severity  
14  1.224504e+10   5.232214e+07  
15  1.224868e+10   5.260289e+07  
16  1.222798e+10   5.154577e+07  
17  1.221086e+10   5.198572e+07  
18  1.219531e+10   5.187790e+07  


# TOTAL CLAIM OPTIMIZATION & VALIDATION, OPTUNA

In [6]:
# ============================================================
# STAGE 4 v24 — SEASON-AWARE RATE+SEV (EXPOSURE) + ETS ENSEMBLE + OPTUNA
# - Automatic CV splits: include windows whose months-of-year resemble the forecast months (Aug–Dec)
# - Clamp range loosened (to avoid forecasts that are "too flat")
# - Optional month-of-year seasonal adjustment (train-only, no leakage)
# ============================================================

!pip install -q optuna statsmodels

import optuna
import numpy as np
import pandas as pd
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import warnings
warnings.filterwarnings("ignore")

SEED = 42
np.random.seed(SEED)

BASE_PATH = "/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/"
sample_sub = pd.read_csv(BASE_PATH + "sample_submission.csv")

assert "df" in globals(), "Variabel df belum ada. Jalankan Stage 1 dulu."
assert "year_month" in df.columns, "df['year_month'] belum ada. Buat dulu: df['year_month']=tanggal.dt.to_period('M')"

# ------------------------------
# MAPE (fraction)
# ------------------------------
def mape_frac(y_true, y_pred):
    """Mean absolute percentage error expressed as a fraction (0.1 == 10%).

    Only points where both values are finite and the actual value is non-zero
    take part. Returns ``np.nan`` when no such point exists.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)

    usable = np.isfinite(actual) & np.isfinite(forecast) & (actual != 0)
    if not usable.any():
        return np.nan

    kept_actual = actual[usable]
    rel_err = np.abs((kept_actual - forecast[usable]) / kept_actual)
    return float(np.mean(rel_err))

# ------------------------------
# Horizon from sample_submission (Kaggle)
# ------------------------------
sample_sub["year"]  = sample_sub["id"].str.split("_").str[0]
sample_sub["month"] = sample_sub["id"].str.split("_").str[1]
sample_sub["month_key"] = sample_sub["year"] + "-" + sample_sub["month"]

future_periods = (
    pd.PeriodIndex(sample_sub["month_key"], freq="M")
      .unique()
      .sort_values()
)
H = int(len(future_periods))
future_moy = set([p.month for p in future_periods])

# ------------------------------
# BUILD MONTHLY (reindex full months, robust)
# ------------------------------
has_exposure = "active_policies" in df.columns

agg_dict = {
    "frequency": ("claim_id","count"),
    "total_claim": ("nominal_klaim_yang_disetujui","sum"),
}
if has_exposure:
    agg_dict["exposure"] = ("active_policies","first")

monthly = (
    df.groupby("year_month")
      .agg(**agg_dict)
      .reset_index()
      .sort_values("year_month")
      .reset_index(drop=True)
)

# ensure Period[M]
if len(monthly) == 0:
    raise ValueError("monthly kosong. Cek df/year_month.")
if not isinstance(monthly.loc[0, "year_month"], pd.Period):
    monthly["year_month"] = pd.PeriodIndex(monthly["year_month"], freq="M")

# reindex complete month range
min_m = monthly["year_month"].min()
max_m = monthly["year_month"].max()
all_months = pd.period_range(min_m, max_m, freq="M")

monthly = (
    monthly.set_index("year_month")
           .reindex(all_months)
           .rename_axis("year_month")
           .reset_index()
)

# fill missing frequency/total with 0; exposure forward/back fill
monthly["frequency"]   = pd.to_numeric(monthly["frequency"], errors="coerce").fillna(0.0)
monthly["total_claim"] = pd.to_numeric(monthly["total_claim"], errors="coerce").fillna(0.0)

if not has_exposure:
    monthly["exposure"] = float(np.nanmean(monthly["frequency"])) * 10.0
else:
    monthly["exposure"] = pd.to_numeric(monthly["exposure"], errors="coerce")

monthly["exposure"] = monthly["exposure"].ffill().bfill()
monthly["exposure"] = monthly["exposure"].fillna(float(np.nanmean(monthly["frequency"])) * 10.0)

# safety clip (level series must be >0 for log work)
monthly["frequency"]   = monthly["frequency"].astype(float).clip(lower=1.0)
monthly["total_claim"] = monthly["total_claim"].astype(float).clip(lower=1.0)
monthly["exposure"]    = monthly["exposure"].astype(float).clip(lower=1.0)

# components
monthly["severity"]   = (monthly["total_claim"] / monthly["frequency"]).astype(float).clip(lower=1e-9)
monthly["claim_rate"] = (monthly["frequency"]   / monthly["exposure"]).astype(float).clip(lower=1e-12)
monthly["month"]      = monthly["year_month"].dt.month

N = len(monthly)
H = min(H, max(1, N - 10))   # sedikit lebih longgar supaya split season bisa masuk
print("N months:", N, "| Horizon H:", H, "| Has exposure:", has_exposure, "| Future MOY:", sorted(list(future_moy)))

# ------------------------------
# Choose CV splits (season-aware)
# - kandidat: semua train_end yang memungkinkan valid H bulan
# - scoring: overlap bulan valid dengan future months (Aug-Dec) + recency
# ------------------------------
min_train = 7  # supaya split Aug-Dec 2024 bisa masuk (train Jan-Jul 2024)
cands = []
for te in range(min_train, N - H + 1):
    valid = monthly.iloc[te:te+H]
    if len(valid) < H:
        continue
    overlap = sum([1 for m in valid["month"].tolist() if m in future_moy]) / float(H)
    recency = te / float(N)
    score = 0.65*overlap + 0.35*recency
    cands.append((score, overlap, recency, te))

if len(cands) == 0:
    raise ValueError("Tidak ada split CV yang valid. Cek N/H.")

# Pick the top splits: up to 2 with the best season match + up to 2 most recent.
# Each candidate tuple is (score, overlap, recency, train_end).
cands_sorted = sorted(cands, reverse=True)
score_by_train_end = {cand[3]: cand[0] for cand in cands_sorted}

by_season = sorted(cands, key=lambda cand: (cand[1], cand[0]), reverse=True)
by_recency = sorted(cands, key=lambda cand: cand[2], reverse=True)
top_season = by_season[:2]
top_recent = by_recency[:2]
picked = {cand[3] for cand in (top_season + top_recent)}

train_ends = sorted(picked)
# Split weights proportional to the candidate score (floored, then normalised to sum to 1).
w_raw = np.array([score_by_train_end.get(te, 0.1) for te in train_ends], dtype=float)
w_raw = np.maximum(w_raw, 1e-6)
split_w = w_raw / w_raw.sum()

print("CV train_ends:", train_ends, "| weights:", split_w.round(3).tolist())
for te in train_ends:
    v = monthly.iloc[te:te+H][["year_month","month"]]
    print("  split te=", te, "| valid months:", v["year_month"].astype(str).tolist())

# ------------------------------
# Helpers
# ------------------------------
def ets_1step_log(x_log: pd.Series, trend, damped, init_method):
    """One-step-ahead exponential-smoothing forecast of an (already logged) series.

    Parameters
    ----------
    x_log : pd.Series
        History, in chronological order.
    trend : str or None
        Trend component passed to ``ExponentialSmoothing`` (e.g. ``"add"``).
        Ignored (forced to ``None``) when fewer than 10 observations exist.
    damped : bool
        Whether the trend is damped; only applied when a trend is used.
    init_method : str
        Passed through as ``initialization_method``.

    Returns
    -------
    float
        The 1-step forecast. Falls back to the last observed value when the
        series has fewer than 4 points or when fitting/forecasting fails.
    """
    # Too short to fit anything meaningful: naive last-value forecast.
    if len(x_log) < 4:
        return float(x_log.iloc[-1])
    # An additive trend is often unstable on short series, so drop it.
    if trend is not None and len(x_log) < 10:
        trend = None
        damped = False
    try:
        m = ExponentialSmoothing(
            x_log,
            trend=trend,
            damped_trend=(damped if trend is not None else False),
            seasonal=None,
            initialization_method=init_method
        ).fit()
        return float(m.forecast(1).iloc[0])
    except Exception:
        # Deliberate best-effort fallback. Catching `Exception` rather than
        # using a bare except lets KeyboardInterrupt / SystemExit propagate.
        return float(x_log.iloc[-1])

def anchor_level(x_level: pd.Series, k: int, how: str):
    """Summarise the last ``k`` values of ``x_level`` as a single float.

    ``how == "median"`` uses the median; any other value uses the mean.
    Returns NaN when the selected tail is empty.
    """
    recent = np.asarray(x_level.tail(k), dtype=float)
    if recent.size == 0:
        return float(np.nan)
    reducer = np.median if how == "median" else np.mean
    return float(reducer(recent))

def seasonal_factor(hist: pd.DataFrame, col: str, m_next: int, how: str, cap_low: float, cap_high: float):
    """Multiplicative month-of-year factor for ``col``, clipped to [cap_low, cap_high].

    The factor is (level of ``col`` in rows whose ``month`` equals ``m_next``)
    divided by (level of ``col`` over all rows), where the level is the median
    when ``how == "median"`` and the mean otherwise. Only rows with a finite
    ``col`` value are used. Returns 1.0 (no adjustment) when fewer than 6
    finite rows exist, when the target month has no rows, or when either level
    is non-finite or not strictly positive.
    """
    finite_rows = hist.loc[np.isfinite(hist[col].values), [col, "month"]]
    if len(finite_rows) < 6:
        return 1.0

    level_of = np.median if how == "median" else np.mean
    overall = float(level_of(finite_rows[col].values))
    month_vals = finite_rows.loc[finite_rows["month"] == m_next, col].values
    mm = float(level_of(month_vals)) if len(month_vals) else np.nan

    usable = np.isfinite(overall) and overall > 0 and np.isfinite(mm) and mm > 0
    if not usable:
        return 1.0

    return float(np.clip(mm / overall, cap_low, cap_high))

# ------------------------------
# One split TRUE RECURSIVE
# ------------------------------
def run_split(monthly_all: pd.DataFrame, train_end: int, H: int, P: dict):
    """Backtest one CV split with a fully recursive H-step forecast.

    The first ``train_end`` rows of ``monthly_all`` form the training history
    and the next ``H`` rows are the validation window. Each step forecasts
    claim rate and severity one month ahead, rebuilds frequency and total
    claim from them, and appends the prediction to the simulated history so
    the next step is conditioned on it (validation actuals are never fed back).

    Parameters
    ----------
    monthly_all : pd.DataFrame
        Monthly table; the columns read here are ``year_month``, ``month``,
        ``exposure``, ``claim_rate``, ``severity``, ``frequency`` and
        ``total_claim``.
    train_end : int
        Positional index where training ends and validation starts.
    H : int
        Forecast horizon (number of validation months).
    P : dict
        Model parameters. Keys read: ``k_anchor``, ``anchor_rate``,
        ``anchor_sev``, ``damped``, ``init_method``, ``beta``, ``wt_rate``,
        ``wt_sev``, ``season_w``, ``season_how``, ``season_cap_low``,
        ``season_cap_high``, ``capR_low``, ``capR_high``, ``capS_low``,
        ``capS_high``, ``round_freq``.

    Returns
    -------
    tuple of float
        ``(avg, mt, mf, ms, stab)``: the nan-mean of the three MAPEs, the MAPE
        of total claim, of frequency, of severity, and the mean drift penalty
        versus the anchor-implied forecast. All five are NaN when the
        validation window is shorter than ``H`` or the training set has fewer
        than 4 rows.
    """
    train = monthly_all.iloc[:train_end].copy().reset_index(drop=True)
    valid = monthly_all.iloc[train_end:train_end+H].copy().reset_index(drop=True)
    if len(valid) < H or len(train) < 4:
        return np.nan, np.nan, np.nan, np.nan, np.nan

    # Simulated history: starts as the training data and grows by one predicted row per step.
    sim = train.copy()

    pred_F, pred_T, pred_S = [], [], []
    pen = []

    for step in range(H):
        k = int(P["k_anchor"])

        # Exposure is carried forward from the last (possibly simulated) row, floored at 1.
        exp_next = float(sim["exposure"].iloc[-1])
        exp_next = max(1.0, exp_next)

        # anchors (LEVEL)
        aR = anchor_level(sim["claim_rate"], k, P["anchor_rate"])
        aS = anchor_level(sim["severity"],   k, P["anchor_sev"])

        # build log series for ETS (clip keeps the log finite for zero values)
        lr = np.log(sim["claim_rate"].astype(float).clip(lower=1e-12))
        ls = np.log(sim["severity"].astype(float).clip(lower=1e-12))

        # ETS ensemble for rate: beta-weighted mix of the trend and no-trend fits, in log space
        lr_add  = ets_1step_log(lr, trend="add",  damped=bool(P["damped"]), init_method=P["init_method"])
        lr_none = ets_1step_log(lr, trend=None,   damped=False,             init_method=P["init_method"])
        lr_hat  = float(P["beta"])*lr_add + (1-float(P["beta"]))*lr_none
        r_fc    = float(np.exp(lr_hat))

        # ETS ensemble for severity (same construction as for rate)
        ls_add  = ets_1step_log(ls, trend="add",  damped=bool(P["damped"]), init_method=P["init_method"])
        ls_none = ets_1step_log(ls, trend=None,   damped=False,             init_method=P["init_method"])
        ls_hat  = float(P["beta"])*ls_add + (1-float(P["beta"]))*ls_none
        s_fc    = float(np.exp(ls_hat))

        # shrink to anchor: wt_* is the weight on the ETS forecast, the rest goes to the anchor
        r_pred = float(P["wt_rate"])*r_fc + (1-float(P["wt_rate"]))*aR
        s_pred = float(P["wt_sev"]) *s_fc + (1-float(P["wt_sev"])) *aS

        # seasonal adjustment (train-only)
        # NOTE(review): `+ 1` assumes year_month holds pandas Period values (monthly) — confirm
        next_period = sim["year_month"].iloc[-1] + 1
        m_next = int(next_period.month)
        if float(P["season_w"]) > 0:
            fac_r = seasonal_factor(sim, "claim_rate", m_next, P["season_how"], P["season_cap_low"], P["season_cap_high"])
            fac_s = seasonal_factor(sim, "severity",   m_next, P["season_how"], P["season_cap_low"], P["season_cap_high"])
            # apply softly (power): season_w = 0 disables the factor, 1 applies it fully
            r_pred *= float(fac_r) ** float(P["season_w"])
            s_pred *= float(fac_s) ** float(P["season_w"])

        # clamp ratio vs anchor (loosened bounds)
        r_pred = float(np.clip(r_pred, aR*float(P["capR_low"]), aR*float(P["capR_high"])))
        s_pred = float(np.clip(s_pred, aS*float(P["capS_low"]), aS*float(P["capS_high"])))

        # reconstruct frequency from rate * exposure, floored at 1
        f_pred = float(max(1.0, r_pred * exp_next))
        if bool(P["round_freq"]):
            f_pred = float(max(1.0, np.round(f_pred)))

        t_pred = float(max(1.0, f_pred * s_pred))
        # Re-derive severity so that total == frequency * severity after the floors/rounding.
        s_pred = float(max(1e-9, t_pred / max(1.0, f_pred)))

        pred_F.append(f_pred)
        pred_T.append(t_pred)
        pred_S.append(s_pred)

        # penalty drift vs anchor implied: relative distance of the prediction from
        # what the anchors alone would have produced
        aF = float(max(1.0, (aR * exp_next)))
        if bool(P["round_freq"]):
            aF = float(max(1.0, np.round(aF)))
        aT = float(max(1.0, aF * aS))
        pen_f = abs(f_pred - aF) / (abs(aF) + 1e-9)
        pen_t = abs(t_pred - aT) / (abs(aT) + 1e-9)
        pen.append(0.5*(pen_f + pen_t))

        # append recursive row so the next step is conditioned on this prediction
        sim = pd.concat([sim, pd.DataFrame([{
            "year_month": next_period,
            "month": m_next,
            "exposure": exp_next,
            "claim_rate": float(max(1e-12, f_pred/exp_next)),
            "severity": s_pred,
            "frequency": f_pred,
            "total_claim": t_pred
        }])], ignore_index=True)

    # targets (actual severity is derived as total / frequency, with frequency floored at 1)
    yF = valid["frequency"].astype(float).values
    yT = valid["total_claim"].astype(float).values
    yS = (valid["total_claim"].astype(float).values /
          np.clip(valid["frequency"].astype(float).values, 1.0, np.inf))

    mf = mape_frac(yF, pred_F)
    mt = mape_frac(yT, pred_T)
    ms = mape_frac(yS, pred_S)
    avg = float(np.nanmean([mf, mt, ms]))
    stab = float(np.mean(pen)) if len(pen) else np.nan
    return avg, mt, mf, ms, stab

# ------------------------------
# Baseline params (lebih fleksibel)
# ------------------------------
# Baseline parameter set: scored once as a reference and enqueued as the first Optuna trial.
P0 = dict(
    k_anchor=4,
    anchor_rate="mean",
    anchor_sev="mean",
    wt_rate=0.30,
    wt_sev=0.75,
    beta=0.80,
    damped=True,
    init_method="estimated",

    # clamp (looser than v23)
    capR_low=0.70, capR_high=1.40,
    capS_low=0.60, capS_high=1.60,

    # seasonal
    season_w=0.35,
    season_how="median",
    season_cap_low=0.85,
    season_cap_high=1.15,

    round_freq=False,
)

def cv_score(P, pen_w=0.01):
    """Weighted cross-validation score for parameter set ``P`` (lower is better).

    Runs ``run_split`` on every split in ``train_ends`` and accumulates the
    split-weighted average MAPE plus ``pen_w`` times the split-weighted
    stability penalty. Splits with a non-finite score are skipped; if no split
    yields a finite score the sentinel 1e9 is returned.
    """
    weighted_err = 0.0
    weighted_pen = 0.0
    n_scored = 0
    for split_end, weight in zip(train_ends, split_w):
        split_avg, _mt, _mf, _ms, split_stab = run_split(monthly, split_end, H, P)
        if np.isfinite(split_avg):
            weighted_err += weight * split_avg
            n_scored += 1
        if np.isfinite(split_stab):
            weighted_pen += weight * split_stab
    if not n_scored:
        return 1e9
    return float(weighted_err + pen_w * weighted_pen)

# Reference score of the hand-picked baseline P0, printed as a percentage.
print("Baseline CV %:", round(cv_score(P0)*100, 4))

# ------------------------------
# OPTUNA objective
# ------------------------------
# Weight of the stability (drift-vs-anchor) penalty in the tuning objective.
PEN_W = 0.01

def objective(trial):
    """Optuna objective: sample a parameter set and return its CV score.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample every key of the parameter dict ``P``.

    Returns
    -------
    float
        ``cv_score(P, pen_w=PEN_W)``: the split-weighted MAPE plus ``PEN_W``
        times the split-weighted stability penalty (1e9 when no split could
        be scored). Lower is better.
    """
    P = dict(
        k_anchor=trial.suggest_int("k_anchor", 2, 6),
        anchor_rate=trial.suggest_categorical("anchor_rate", ["mean","median"]),
        anchor_sev=trial.suggest_categorical("anchor_sev", ["mean","median"]),

        wt_rate=trial.suggest_float("wt_rate", 0.05, 0.65),
        wt_sev=trial.suggest_float("wt_sev",  0.35, 0.98),

        beta=trial.suggest_float("beta", 0.25, 0.95),
        damped=trial.suggest_categorical("damped", [False, True]),
        init_method=trial.suggest_categorical("init_method", ["estimated","heuristic"]),

        # clamp (wider, especially for severity)
        capR_low=trial.suggest_float("capR_low", 0.55, 0.90),
        capR_high=trial.suggest_float("capR_high", 1.15, 1.70),
        capS_low=trial.suggest_float("capS_low", 0.45, 0.85),
        capS_high=trial.suggest_float("capS_high", 1.15, 2.00),

        # seasonal (light tuning)
        season_w=trial.suggest_float("season_w", 0.0, 0.75),
        season_how=trial.suggest_categorical("season_how", ["mean","median"]),
        season_cap_low=trial.suggest_float("season_cap_low", 0.80, 0.95),
        season_cap_high=trial.suggest_float("season_cap_high", 1.05, 1.25),

        round_freq=trial.suggest_categorical("round_freq", [False, True]),
    )

    # Score through the shared CV routine instead of a copy of its loop, so the
    # tuned objective and the reported baseline can never drift apart.
    return float(cv_score(P, pen_w=PEN_W))

# Run the Optuna search: seeded TPE sampler, minimizing the CV objective.
optuna.logging.set_verbosity(optuna.logging.WARNING)
sampler = optuna.samplers.TPESampler(seed=SEED)
study = optuna.create_study(direction="minimize", sampler=sampler)

# Evaluate the hand-picked baseline first so the search can only match or beat it.
study.enqueue_trial(P0)
study.optimize(objective, n_trials=500, show_progress_bar=True)

bestP = study.best_params
print("\n==============================")
print("Horizon:", H, "| Splits:", train_ends, "| weights:", split_w.round(3).tolist())
print("Best Params:", bestP)
print("CV Best %  :", round(study.best_value*100, 4))
print("==============================")

# per-split report: re-run every split with the best parameters
rows = []
for te in train_ends:
    avg, mt, mf, ms, stab = run_split(monthly, te, H, bestP)
    rows.append([te, avg, mt, mf, ms, stab])

spl = pd.DataFrame(rows, columns=["train_end","avg","mape_total","mape_freq","mape_sev","stab_pen"])
print("\nPer-split metrics (%):")
print((spl.set_index("train_end")[["avg","mape_total","mape_freq","mape_sev","stab_pen"]]*100).round(4))

# The banner previously contained a mis-encoded em dash; print a proper one.
print("\nSTAGE 4 v24 — READY (season-aware)")

N months: 19 | Horizon H: 5 | Has exposure: True | Future MOY: [8, 9, 10, 11, 12]
CV train_ends: [7, 8, 13, 14] | weights: [0.401, 0.343, 0.123, 0.133]
  split te= 7 | valid months: ['2024-08', '2024-09', '2024-10', '2024-11', '2024-12']
  split te= 8 | valid months: ['2024-09', '2024-10', '2024-11', '2024-12', '2025-01']
  split te= 13 | valid months: ['2025-02', '2025-03', '2025-04', '2025-05', '2025-06']
  split te= 14 | valid months: ['2025-03', '2025-04', '2025-05', '2025-06', '2025-07']
Baseline CV %: 11.5766


  0%|          | 0/500 [00:00<?, ?it/s]


Horizon: 5 | Splits: [7, 8, 13, 14] | weights: [0.401, 0.343, 0.123, 0.133]
Best Params: {'k_anchor': 6, 'anchor_rate': 'mean', 'anchor_sev': 'mean', 'wt_rate': 0.5886366695116942, 'wt_sev': 0.3579173577972926, 'beta': 0.9218093484579944, 'damped': True, 'init_method': 'heuristic', 'capR_low': 0.8718563627174186, 'capR_high': 1.2295914190903159, 'capS_low': 0.8363199994736286, 'capS_high': 1.9567160683078664, 'season_w': 7.233033409505144e-05, 'season_how': 'mean', 'season_cap_low': 0.8414284311003727, 'season_cap_high': 1.224730419283397, 'round_freq': False}
CV Best %  : 9.5159

Per-split metrics (%):
               avg  mape_total  mape_freq  mape_sev  stab_pen
train_end                                                    
7           8.2643      6.4960    10.4590    7.8379    1.7876
8          11.3648     11.3639     9.8619   12.8686    2.1538
13         11.6766     14.6912     4.6199   15.7187    2.3723
14          6.3561      7.4995     6.2268    5.3420    1.7607

STAGE 4 v24 â€”

# TEST PREDICTION & KAGGLE SUBMISSION

In [7]:
# ============================================================
# STAGE 5 v26 — ROBUST ENSEMBLE (Stage3 + Stage4) + SEASON-AWARE CV + MEAN BLEND
# Changes vs v25:
# - CV splits season-aware (overlap months with Aug–Dec) + recent
# - Choose blend weight by MEAN / MEDIAN (not worst-case)
# - Avoid extreme w=1 unless clearly better
# ============================================================

!pip install -q statsmodels

import numpy as np
import pandas as pd
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import warnings
warnings.filterwarnings("ignore")

# Seed NumPy's global RNG for this cell.
SEED = 42
np.random.seed(SEED)

BASE_PATH = "/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/"
sample_sub = pd.read_csv(BASE_PATH + "sample_submission.csv")

# This cell relies on `df` (with a `year_month` column) created by an earlier cell.
assert "df" in globals(), "df belum ada. Jalankan Stage 1 dulu."
assert "year_month" in df.columns, "df['year_month'] belum ada."

# ------------------------------
# Kaggle horizon from sample_sub
# ------------------------------
# The first two "_"-separated fields of each id are taken as year and month
# (ids look like "2025_08_Claim_Frequency" in the printed submission).
sample_sub["year"]  = sample_sub["id"].str.split("_").str[0]
sample_sub["month"] = sample_sub["id"].str.split("_").str[1]
sample_sub["month_key"] = sample_sub["year"] + "-" + sample_sub["month"]

# Distinct months to forecast, in chronological order.
future_periods = (
    pd.PeriodIndex(sample_sub["month_key"], freq="M")
      .unique()
      .sort_values()
)
# Horizon = number of distinct future months; future_moy = their month-of-year numbers.
H = int(len(future_periods))
future_moy = set([p.month for p in future_periods])

# ------------------------------
# BUILD monthly (robust + complete month range)
# ------------------------------
# Aggregate claims to one row per month: claim count, approved amount, and
# (when available) the first `active_policies` value of the month as exposure.
has_exposure = "active_policies" in df.columns
agg_dict = {
    "frequency": ("claim_id","count"),
    "total_claim": ("nominal_klaim_yang_disetujui","sum"),
}
if has_exposure:
    agg_dict["exposure"] = ("active_policies","first")

monthly = (
    df.groupby("year_month")
      .agg(**agg_dict)
      .reset_index()
      .sort_values("year_month")
      .reset_index(drop=True)
)

if len(monthly) == 0:
    raise ValueError("monthly kosong. Cek df/year_month.")

# Normalize year_month to monthly Periods if it is not already stored that way.
if not isinstance(monthly.loc[0, "year_month"], pd.Period):
    monthly["year_month"] = pd.PeriodIndex(monthly["year_month"], freq="M")

# reindex full range so months without any claim still get a row
min_m = monthly["year_month"].min()
max_m = monthly["year_month"].max()
all_months = pd.period_range(min_m, max_m, freq="M")

monthly = (
    monthly.set_index("year_month")
           .reindex(all_months)
           .rename_axis("year_month")
           .reset_index()
)

# Months introduced by the reindex have NaN here; treat them as zero claims.
monthly["frequency"]   = pd.to_numeric(monthly["frequency"], errors="coerce").fillna(0.0)
monthly["total_claim"] = pd.to_numeric(monthly["total_claim"], errors="coerce").fillna(0.0)

# Without a policy count, use a constant proxy exposure of 10x the mean monthly frequency.
if not has_exposure:
    monthly["exposure"] = float(np.nanmean(monthly["frequency"])) * 10.0
else:
    monthly["exposure"] = pd.to_numeric(monthly["exposure"], errors="coerce")

# Fill exposure gaps from neighbouring months; fall back to the proxy if the column is entirely NaN.
monthly["exposure"] = monthly["exposure"].ffill().bfill()
monthly["exposure"] = monthly["exposure"].fillna(float(np.nanmean(monthly["frequency"])) * 10.0)

# safety clip: floors at 1 keep the ratios and logs below well-defined
# (note this also turns zero-claim months into frequency 1 / total_claim 1)
monthly["frequency"]   = monthly["frequency"].astype(float).clip(lower=1.0)
monthly["total_claim"] = monthly["total_claim"].astype(float).clip(lower=1.0)
monthly["exposure"]    = monthly["exposure"].astype(float).clip(lower=1.0)

# Derived series: severity = total / frequency, claim_rate = frequency / exposure.
monthly["severity"]   = (monthly["total_claim"] / monthly["frequency"]).astype(float).clip(lower=1e-9)
monthly["claim_rate"] = (monthly["frequency"]   / monthly["exposure"]).astype(float).clip(lower=1e-12)
monthly["month"]      = monthly["year_month"].dt.month

N = len(monthly)
print("N months:", N, "| H:", H, "| Has exposure:", has_exposure, "| Future MOY:", sorted(list(future_moy)))

# ------------------------------
# Metrics
# ------------------------------
def mape_frac(y_true, y_pred):
    """Mean absolute percentage error as a fraction (0.10 means 10%).

    Pairs where either value is non-finite or the actual is zero are ignored.
    Returns NaN when no usable pair remains.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)
    usable = np.isfinite(actual) & np.isfinite(forecast) & (actual != 0)
    if not usable.any():
        return np.nan
    rel_err = np.abs((actual[usable] - forecast[usable]) / actual[usable])
    return float(np.mean(rel_err))

def avg_mape(yF, pF, yT, pT):
    """Average of the frequency, total-claim and severity MAPEs.

    Severity is derived on both sides as total / frequency, with the
    denominator floored at 1. Returns ``(avg, mape_total, mape_freq,
    mape_sev)`` where ``avg`` is the nan-mean of the three MAPEs.
    """
    freq_true, total_true, freq_pred, total_pred = (
        np.asarray(values, float) for values in (yF, yT, pF, pT)
    )
    sev_true = total_true / np.clip(freq_true, 1.0, np.inf)
    sev_pred = total_pred / np.clip(freq_pred, 1.0, np.inf)

    mf = mape_frac(freq_true, freq_pred)
    mt = mape_frac(total_true, total_pred)
    ms = mape_frac(sev_true, sev_pred)
    overall = float(np.nanmean([mf, mt, ms]))
    return overall, mt, mf, ms

# ============================================================
# Stage 3 v17 (fixed)
# ============================================================
# Fixed Stage 3 parameters read by simulate_stage3:
#   wt_*      weight on the ETS forecast (the remainder goes to the anchor level)
#   anchor_*  how the anchor is computed from the last k_anchor values ("median", else mean)
S3 = dict(
    wt_total=0.85, anchor_total="median",
    wt_freq=0.20,  anchor_freq="mean",
    k_anchor=3
)

def anchor_level(x: pd.Series, k: int, how: str):
    """Summarize the last ``k`` values of ``x`` as a single anchor level.

    Uses the median when ``how == "median"`` and the mean otherwise.
    Returns NaN for an empty tail window; the guard avoids NumPy's
    "Mean of empty slice" RuntimeWarning and matches the Stage 4 version
    of this helper defined earlier in the notebook.
    """
    tail = np.asarray(x.tail(k), dtype=float)
    if len(tail) == 0:
        return float(np.nan)
    return float(np.median(tail)) if how == "median" else float(np.mean(tail))

def ets_1step_log1p(level_series: pd.Series, trend="add", damped=True):
    """One-step-ahead ETS forecast fitted on log1p of a level series.

    The series is clipped at 1e-12, transformed with ``log1p``, fitted with
    ``ExponentialSmoothing`` (no seasonal component) and the forecast is
    mapped back with ``expm1``.

    Parameters
    ----------
    level_series : pd.Series
        Values on the original (level) scale.
    trend : str or None
        Trend component; dropped when the series has fewer than 10 points.
    damped : bool
        Whether to damp the trend; ignored when there is no trend.

    Returns
    -------
    float
        The forecast on the level scale. Falls back to the last observation
        when the series has fewer than 4 points, the fit raises, or the
        forecast is not finite.
    """
    y = np.log1p(level_series.astype(float).clip(lower=1e-12))
    # guard short series
    if len(y) < 4:
        return float(np.expm1(y.iloc[-1]))
    if trend is not None and len(y) < 10:
        trend = None
        damped = False
    try:
        m = ExponentialSmoothing(
            y, trend=trend,
            damped_trend=(damped if trend is not None else False),
            seasonal=None
        ).fit()
        fc = float(np.expm1(m.forecast(1).iloc[0]))
    except Exception:
        # Best-effort fallback for fit/forecast failures only; KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        return float(level_series.iloc[-1])
    # A NaN/inf forecast would silently poison the recursive simulation.
    return fc if np.isfinite(fc) else float(level_series.iloc[-1])

def simulate_stage3(train_df: pd.DataFrame, H: int):
    """Recursive H-step Stage 3 forecast of frequency and total claim.

    At every step, total claim and frequency are each forecast as a weighted
    blend of a damped-trend ETS forecast and an anchor level of the last
    ``S3["k_anchor"]`` values, floored at 1. The predicted month is appended
    to the simulated history (exposure carried forward) before the next step.

    Returns
    -------
    tuple of np.ndarray
        ``(pred_frequency, pred_total_claim)``, each of length ``H``.
    """
    history = train_df.copy().reset_index(drop=True)
    freq_path, total_path = [], []

    for _step in range(H):
        window = int(S3["k_anchor"])
        w_total = float(S3["wt_total"])
        w_freq = float(S3["wt_freq"])

        # Total claim: ETS forecast shrunk toward the recent anchor level.
        total_ets = ets_1step_log1p(history["total_claim"], trend="add", damped=True)
        total_anchor = anchor_level(history["total_claim"], window, S3["anchor_total"])
        total_next = max(1.0, w_total*total_ets + (1-w_total)*total_anchor)

        # Frequency: same construction with its own weight and anchor type.
        freq_ets = ets_1step_log1p(history["frequency"], trend="add", damped=True)
        freq_anchor = anchor_level(history["frequency"], window, S3["anchor_freq"])
        freq_next = max(1.0, w_freq*freq_ets + (1-w_freq)*freq_anchor)

        freq_path.append(freq_next)
        total_path.append(total_next)

        # Exposure is held at its last value (1.0 if the column is absent), floored at 1.
        if "exposure" in history.columns:
            exposure_next = float(history["exposure"].iloc[-1])
        else:
            exposure_next = 1.0
        exposure_next = max(1.0, exposure_next)

        next_row = {
            "year_month": history["year_month"].iloc[-1] + 1,
            "frequency": freq_next,
            "total_claim": total_next,
            "exposure": exposure_next,
            "severity": float(total_next / freq_next),
            "claim_rate": float(freq_next / exposure_next)
        }
        history = pd.concat([history, pd.DataFrame([next_row])], ignore_index=True)

    return np.array(freq_path, float), np.array(total_path, float)

# ============================================================
# Stage 4 (uses the v23 parameters; replace them if you use the v24 best params)
# ============================================================
# Keys are read by simulate_stage4: anchor window/type, ETS blend weights
# (wt_*, beta), ETS options, and clamp ratios relative to the anchor (cap*).
S4 = {
    'k_anchor': 4,
    'anchor_rate': 'mean',
    'anchor_sev': 'mean',
    'wt_rate': 0.3332890181752667,
    'wt_sev': 0.5501072463530577,
    'beta': 0.8005225975816733,
    'damped': True,
    'init_method': 'estimated',
    'capR_low': 0.7818972835090717,
    'capR_high': 1.3058960367487729,
    'capS_low': 0.6976922342421228,
    'capS_high': 1.1536381686265984
}

ROUND_FREQ = False  # the leaderboard score is usually better with False

def ets_1step_log(x_log: pd.Series, trend, damped, init_method):
    """One-step-ahead exponential-smoothing forecast with a naive fallback.

    Parameters
    ----------
    x_log : pd.Series
        Series to forecast (callers pass log-transformed values).
    trend : str or None
        Trend component; dropped when the series has fewer than 10 points.
    damped : bool
        Whether to damp the trend; ignored when there is no trend.
    init_method : str
        Passed through as ``initialization_method``.

    Returns
    -------
    float
        The 1-step forecast, or the last observed value when the series has
        fewer than 4 points, the fit raises, or the forecast is not finite.
    """
    if len(x_log) < 4:
        return float(x_log.iloc[-1])
    if trend is not None and len(x_log) < 10:
        trend = None
        damped = False
    try:
        m = ExponentialSmoothing(
            x_log, trend=trend,
            damped_trend=(damped if trend is not None else False),
            seasonal=None, initialization_method=init_method
        ).fit()
        fc = float(m.forecast(1).iloc[0])
    except Exception:
        # Best-effort fallback for fit/forecast failures only; KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        return float(x_log.iloc[-1])
    # A NaN/inf forecast would silently poison the recursive simulation.
    return fc if np.isfinite(fc) else float(x_log.iloc[-1])

def simulate_stage4(train_df: pd.DataFrame, H: int):
    """Recursive H-step Stage 4 forecast of frequency and total claim.

    Each step forecasts claim rate and severity in log space as a
    ``S4["beta"]``-weighted mix of a trend and a no-trend ETS fit, shrinks
    both toward the anchor level of the last ``S4["k_anchor"]`` values, and
    clamps them to the configured ratios around that anchor. Frequency is
    rate x exposure (exposure carried forward) and total claim is
    frequency x severity, both floored at 1. The predicted row is appended
    to the simulated history before the next step.

    Returns
    -------
    tuple of np.ndarray
        ``(pred_frequency, pred_total_claim)``, each of length ``H``.
    """
    history = train_df.copy().reset_index(drop=True)
    freq_path, total_path = [], []

    for _step in range(H):
        window = int(S4["k_anchor"])
        beta = float(S4["beta"])
        w_rate = float(S4["wt_rate"])
        w_sev = float(S4["wt_sev"])
        use_damping = bool(S4["damped"])
        init = S4["init_method"]

        exposure_next = max(1.0, float(history["exposure"].iloc[-1]))

        rate_anchor = anchor_level(history["claim_rate"], window, S4["anchor_rate"])
        sev_anchor = anchor_level(history["severity"],   window, S4["anchor_sev"])

        log_rate = np.log(history["claim_rate"].astype(float).clip(lower=1e-12))
        log_sev = np.log(history["severity"].astype(float).clip(lower=1e-12))

        # Claim rate: blend of trend / no-trend ETS in log space.
        rate_trend = ets_1step_log(log_rate, trend="add", damped=use_damping, init_method=init)
        rate_flat = ets_1step_log(log_rate, trend=None, damped=False, init_method=init)
        rate_ets = float(np.exp(beta*rate_trend + (1-beta)*rate_flat))

        # Severity: same construction.
        sev_trend = ets_1step_log(log_sev, trend="add", damped=use_damping, init_method=init)
        sev_flat = ets_1step_log(log_sev, trend=None, damped=False, init_method=init)
        sev_ets = float(np.exp(beta*sev_trend + (1-beta)*sev_flat))

        # Shrink toward the anchor, then clamp to a ratio band around it.
        rate_next = w_rate*rate_ets + (1-w_rate)*rate_anchor
        sev_next = w_sev *sev_ets + (1-w_sev) *sev_anchor

        rate_next = float(np.clip(rate_next, rate_anchor*float(S4["capR_low"]), rate_anchor*float(S4["capR_high"])))
        sev_next = float(np.clip(sev_next, sev_anchor*float(S4["capS_low"]), sev_anchor*float(S4["capS_high"])))

        freq_next = float(max(1.0, rate_next * exposure_next))
        if ROUND_FREQ:
            freq_next = float(max(1.0, np.round(freq_next)))

        total_next = float(max(1.0, freq_next * sev_next))

        freq_path.append(freq_next)
        total_path.append(total_next)

        next_row = {
            "year_month": history["year_month"].iloc[-1] + 1,
            "exposure": exposure_next,
            "claim_rate": float(max(1e-12, freq_next/exposure_next)),
            "severity": float(max(1e-9, total_next/np.clip(freq_next, 1.0, np.inf))),
            "frequency": freq_next,
            "total_claim": total_next
        }
        history = pd.concat([history, pd.DataFrame([next_row])], ignore_index=True)

    return np.array(freq_path, float), np.array(total_path, float)

# ============================================================
# Season-aware CV splits for blend selection
# ============================================================
# candidates: te such that valid length H
# Score every admissible split point te (train = rows [0, te), valid = next H rows)
# by how well its validation months match the months to be forecast and by recency.
min_train = 7
cands = []
for te in range(min_train, N - H + 1):
    valid = monthly.iloc[te:te+H]
    if len(valid) < H:
        continue
    # overlap: share of validation months whose month-of-year is also a future month
    overlap = sum([1 for m in valid["month"].tolist() if m in future_moy]) / float(H)
    recency = te / float(N)
    score = 0.65*overlap + 0.35*recency
    cands.append((score, overlap, recency, te))

# Fail early with a clear message (same check as the Stage 4 cell); otherwise an
# empty candidate list only surfaces later as an obscure empty-array error.
if len(cands) == 0:
    raise ValueError("Tidak ada split CV yang valid. Cek N/H.")

# Keep up to 2 best season-matched splits plus up to 2 most recent ones (deduplicated).
cands_sorted = sorted(cands, reverse=True)
top_season = sorted(cands, key=lambda x: (x[1], x[0]), reverse=True)[:2]
top_recent = sorted(cands, key=lambda x: x[2], reverse=True)[:2]
picked = {x[3] for x in (top_season + top_recent)}
train_ends = sorted(picked)

# Weights proportional to split score; the te -> score lookup is built once
# instead of being rebuilt for every selected split.
score_by_te = {x[3]: x[0] for x in cands_sorted}
w_raw = np.array([score_by_te.get(te, 0.1) for te in train_ends], dtype=float)
w_raw = np.maximum(w_raw, 1e-6)
split_w = w_raw / w_raw.sum()

print("CV train_ends:", train_ends, "| weights:", split_w.round(3).tolist())
for te in train_ends:
    v = monthly.iloc[te:te+H][["year_month","month"]]
    print("  split te=", te, "| valid months:", v["year_month"].astype(str).tolist())

# ============================================================
# Blend search (choose by MEAN/MEDIAN, not worst-case)
# ============================================================
w_grid = np.linspace(0.0, 1.0, 21)  # finer grid: step 0.05

# The Stage 3 / Stage 4 simulations do not depend on the blend weight, so run
# them once per split and reuse the forecasts for every w on the grid.
split_forecasts = []
for te in train_ends:
    tr = monthly.iloc[:te].copy().reset_index(drop=True)
    va = monthly.iloc[te:te+H].copy().reset_index(drop=True)

    pF3, pT3 = simulate_stage3(tr, H)
    pF4, pT4 = simulate_stage4(tr, H)
    split_forecasts.append((va["frequency"].values, va["total_claim"].values, pF3, pT3, pF4, pT4))

rows = []
best_mean = None
best_median = None

for w in w_grid:
    split_scores = []
    for yF_va, yT_va, pF3, pT3, pF4, pT4 in split_forecasts:
        # w = weight on Stage 4, (1 - w) on Stage 3
        pF = w*pF4 + (1-w)*pF3
        pT = w*pT4 + (1-w)*pT3

        sc, mt, mf, ms = avg_mape(yF_va, pF, yT_va, pT)
        split_scores.append(sc)

    meanv = float(np.mean(split_scores))
    medv  = float(np.median(split_scores))
    worst = float(np.max(split_scores))
    rows.append([w, meanv, medv, worst] + split_scores)

    # Tuples compare lexicographically: primary criterion first, then tie-breakers, then w.
    cand_mean = (meanv, medv, worst, w)
    cand_med  = (medv, meanv, worst, w)

    if (best_mean is None) or (cand_mean < best_mean):
        best_mean = cand_mean
    if (best_median is None) or (cand_med < best_median):
        best_median = cand_med

df_rows = pd.DataFrame(
    rows,
    columns=["w_stage4","mean_avg","median_avg","worst_avg"] + [f"split_{i}" for i in range(len(train_ends))]
)

print("\nTop candidates by mean_avg:")
print(df_rows.sort_values(["mean_avg","median_avg","worst_avg"]).head(8))

print("\nTop candidates by median_avg:")
print(df_rows.sort_values(["median_avg","mean_avg","worst_avg"]).head(8))

w_mean = float(best_mean[-1])
w_med  = float(best_median[-1])

# Stabilize: when the mean- and median-optimal weights are close, take their
# midpoint so the choice is less extreme.
if abs(w_mean - w_med) <= 0.10:
    w_best = float(0.5*(w_mean + w_med))
else:
    # when they are far apart, prefer the mean-based choice (smoother)
    w_best = w_mean

# Anti-extreme: if w_best is very close to 1.0, require w=1 to be clearly
# better than w=0.8 (lower mean_avg is better).
if w_best >= 0.95:
    s1 = df_rows.loc[df_rows["w_stage4"].sub(1.0).abs().idxmin(), "mean_avg"]
    s8 = df_rows.loc[df_rows["w_stage4"].sub(0.8).abs().idxmin(), "mean_avg"]
    # If the gain of w=1 over w=0.8 is small, fall back to 0.8 for robustness.
    # (The previous test `s1 - s8 <= 0.002` was true whenever w=1 was better,
    # so w=1 could never be kept no matter how large its advantage.)
    if (s8 - s1) <= 0.002:
        w_best = 0.80

print("\nChosen w_stage4:", w_best, "(w=1 pure Stage4, w=0 pure Stage3)")

# ============================================================
# FINAL FORECAST on FULL data
# ============================================================
# Forecast H months ahead with both models on the full history, then blend with w_best.
pF3, pT3 = simulate_stage3(monthly, H)
pF4, pT4 = simulate_stage4(monthly, H)

pF = w_best*pF4 + (1-w_best)*pF3
pT = w_best*pT4 + (1-w_best)*pT3
# Severity is derived from the blended total and frequency (denominator floored at 1).
pS = pT / np.clip(pF, 1.0, np.inf)

pred_map = {}
preview = []

# Build the id -> value lookup; keys follow "<year>_<2-digit month>_<metric>".
for i, period in enumerate(future_periods):
    key = f"{period.year}_{str(period.month).zfill(2)}"
    pred_map[f"{key}_Claim_Frequency"] = float(pF[i])
    pred_map[f"{key}_Total_Claim"]     = float(pT[i])
    pred_map[f"{key}_Claim_Severity"]  = float(pS[i])
    preview.append([str(period), float(pF[i]), float(pT[i]), float(pS[i])])

sub = sample_sub.copy()
sub["value"] = sub["id"].map(pred_map)

# Every submission id must have received a prediction.
missing = int(sub["value"].isna().sum())
print("NaN in submission:", missing)
assert missing == 0, "Ada id belum terisi. Cek key format."

# Drop the helper columns added to sample_sub and write the submission file.
sub = sub[["id","value"]]
sub.to_csv("submission.csv", index=False)

print("\nPreview future predictions:")
print(pd.DataFrame(preview, columns=["period","pred_freq","pred_total","pred_sev"]))
print("\nSaved: submission.csv")
print(sub.head(12))

N months: 19 | H: 5 | Has exposure: True | Future MOY: [8, 9, 10, 11, 12]
CV train_ends: [7, 8, 13, 14] | weights: [0.401, 0.343, 0.123, 0.133]
  split te= 7 | valid months: ['2024-08', '2024-09', '2024-10', '2024-11', '2024-12']
  split te= 8 | valid months: ['2024-09', '2024-10', '2024-11', '2024-12', '2025-01']
  split te= 13 | valid months: ['2025-02', '2025-03', '2025-04', '2025-05', '2025-06']
  split te= 14 | valid months: ['2025-03', '2025-04', '2025-05', '2025-06', '2025-07']

Top candidates by mean_avg:
   w_stage4  mean_avg  median_avg  worst_avg   split_0   split_1   split_2  \
0      0.00  0.090516    0.096506   0.109386  0.086339  0.109386  0.106673   
1      0.05  0.090949    0.096635   0.110987  0.086303  0.110987  0.106967   
2      0.10  0.091390    0.096764   0.112587  0.086267  0.112587  0.107261   
3      0.15  0.091831    0.096892   0.114187  0.086230  0.114187  0.107553   
4      0.20  0.092285    0.097045   0.115785  0.086193  0.115785  0.107897   
5      0.25  

In [8]:
print(sub.head(12)) ##  12 percent — NOTE(review): presumably the score this submission obtained; confirm

                         id         value
0   2025_08_Claim_Frequency  2.435652e+02
1    2025_08_Claim_Severity  5.161099e+07
2       2025_08_Total_Claim  1.257064e+10
3   2025_09_Claim_Frequency  2.449253e+02
4    2025_09_Claim_Severity  5.133448e+07
5       2025_09_Total_Claim  1.257311e+10
6   2025_10_Claim_Frequency  2.479505e+02
7    2025_10_Claim_Severity  5.023752e+07
8       2025_10_Total_Claim  1.245642e+10
9   2025_11_Claim_Frequency  2.438276e+02
10   2025_11_Claim_Severity  5.107072e+07
11      2025_11_Total_Claim  1.245245e+10


In [9]:
import numpy as np
import pandas as pd
from statsmodels.tsa.holtwinters import ExponentialSmoothing

def mape_frac(y_true, y_pred):
    """Mean absolute percentage error as a fraction (0.10 means 10%).

    Pairs where either value is non-finite or the actual is zero are ignored.
    Returns NaN when no usable pair remains; the explicit guard avoids
    NumPy's "Mean of empty slice" RuntimeWarning and matches the earlier
    definition of this helper in the notebook.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mask = np.isfinite(y_true) & np.isfinite(y_pred) & (y_true != 0)
    if mask.sum() == 0:
        return np.nan
    return float(np.mean(np.abs((y_true[mask]-y_pred[mask]) / y_true[mask])))

# Paste your best Stage 4 v23 parameters here
BEST = {
    'k_anchor': 4,
    'anchor_rate': 'mean',
    'anchor_sev': 'mean',
    'wt_rate': 0.3332890181752667,
    'wt_sev': 0.5501072463530577,
    'beta': 0.8005225975816733,
    'damped': True,
    'init_method': 'estimated',
    'capR_low': 0.7818972835090717,
    'capR_high': 1.3058960367487729,
    'capS_low': 0.6976922342421228,
    'capS_high': 1.1536381686265984
}

# `monthly` must be the same frame as in Stage 4 v23 (19 months, with exposure/claim_rate/severity).
# If you already have `monthly` from Stage 4 v23, use that one; do not rebuild it.
# NOTE(review): this picks up whichever `monthly` the most recently run cell left in the kernel.
assert "monthly" in globals(), "monthly belum ada (pakai monthly dari Stage4 v23)"
monthly_stage4 = monthly.copy().reset_index(drop=True)

# horizon is the same as on Kaggle (5); note this rebinds the notebook-level H and N
H = 5
N = len(monthly_stage4)
# Hold out the last H months: train on everything before them.
train_end = N - H
train = monthly_stage4.iloc[:train_end].copy().reset_index(drop=True)
valid = monthly_stage4.iloc[train_end:train_end+H].copy().reset_index(drop=True)

def ets_1step_log(x_log: pd.Series, trend, damped, init_method):
    """One-step-ahead exponential-smoothing forecast with a naive fallback.

    Unlike the earlier definitions of this helper in the notebook, this
    version applies no short-series shortcuts; it always attempts the fit.

    Parameters
    ----------
    x_log : pd.Series
        Series to forecast (callers pass log-transformed values).
    trend : str or None
        Trend component passed to ``ExponentialSmoothing``.
    damped : bool
        Whether to damp the trend; ignored when ``trend`` is None.
    init_method : str
        Passed through as ``initialization_method``.

    Returns
    -------
    float
        The 1-step forecast, or the last observed value when the fit raises
        or the forecast is not finite.
    """
    try:
        m = ExponentialSmoothing(
            x_log,
            trend=trend,
            damped_trend=(damped if trend is not None else False),
            seasonal=None,
            initialization_method=init_method
        ).fit()
        fc = float(m.forecast(1).iloc[0])
    except Exception:
        # Best-effort fallback for fit/forecast failures only; KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        return float(x_log.iloc[-1])
    # A NaN/inf forecast would silently poison the recursive backtest.
    return fc if np.isfinite(fc) else float(x_log.iloc[-1])

def anchor_level(x_level: pd.Series, k: int, how: str):
    """Summarize the last ``k`` values of ``x_level`` as a single anchor level.

    Uses the median when ``how == "median"`` and the mean otherwise.
    Returns NaN for an empty tail window; the guard avoids NumPy's
    "Mean of empty slice" RuntimeWarning and matches the Stage 4 version
    of this helper defined earlier in the notebook.
    """
    tail = np.asarray(x_level.tail(k), dtype=float)
    if len(tail) == 0:
        return float(np.nan)
    return float(np.median(tail)) if how == "median" else float(np.mean(tail))

# Recursive backtest on the last H months using the BEST parameters.
# `sim` starts as the training history and grows by one predicted row per step.
sim = train.copy()
pred_F, pred_T, pred_S = [], [], []

for _ in range(H):
    k = int(BEST["k_anchor"])
    # Exposure is carried forward from the last (possibly simulated) row.
    exp_next = float(sim["exposure"].iloc[-1])

    # Anchor levels from the last k values.
    aR = anchor_level(sim["claim_rate"], k, BEST["anchor_rate"])
    aS = anchor_level(sim["severity"],   k, BEST["anchor_sev"])

    # Log series for ETS (clip keeps the log finite).
    lr = np.log(sim["claim_rate"].astype(float).clip(lower=1e-12))
    ls = np.log(sim["severity"].astype(float).clip(lower=1e-12))

    # Claim rate: beta-weighted mix of trend / no-trend ETS forecasts, in log space.
    lr_add  = ets_1step_log(lr, trend="add",  damped=bool(BEST["damped"]), init_method=BEST["init_method"])
    lr_none = ets_1step_log(lr, trend=None,   damped=False,                init_method=BEST["init_method"])
    r_fc = float(np.exp(float(BEST["beta"])*lr_add + (1-float(BEST["beta"]))*lr_none))

    # Severity: same construction.
    ls_add  = ets_1step_log(ls, trend="add",  damped=bool(BEST["damped"]), init_method=BEST["init_method"])
    ls_none = ets_1step_log(ls, trend=None,   damped=False,                init_method=BEST["init_method"])
    s_fc = float(np.exp(float(BEST["beta"])*ls_add + (1-float(BEST["beta"]))*ls_none))

    # Shrink toward the anchor, then clamp to a ratio band around it.
    r_pred = float(BEST["wt_rate"])*r_fc + (1-float(BEST["wt_rate"]))*aR
    s_pred = float(BEST["wt_sev"]) *s_fc + (1-float(BEST["wt_sev"])) *aS

    r_pred = float(np.clip(r_pred, aR*float(BEST["capR_low"]), aR*float(BEST["capR_high"])))
    s_pred = float(np.clip(s_pred, aS*float(BEST["capS_low"]), aS*float(BEST["capS_high"])))

    # Frequency is always rounded to a whole number here (floored at 1).
    f_pred = float(max(1.0, np.round(r_pred * exp_next)))
    t_pred = float(max(1.0, f_pred * s_pred))
    # Re-derive severity so that total == frequency * severity after rounding.
    s_pred = float(t_pred / f_pred)

    pred_F.append(f_pred)
    pred_T.append(t_pred)
    pred_S.append(s_pred)

    # Append the prediction so the next step is conditioned on it.
    sim = pd.concat([sim, pd.DataFrame([{
        "year_month": sim["year_month"].iloc[-1] + 1,
        "exposure": exp_next,
        "claim_rate": float(max(1e-12, f_pred/exp_next)),
        "severity": s_pred,
        "frequency": f_pred,
        "total_claim": t_pred
    }])], ignore_index=True)

# Actuals for the held-out months (severity derived as total / frequency, frequency floored at 1).
yF = valid["frequency"].values
yT = valid["total_claim"].values
yS = (valid["total_claim"].values / np.clip(valid["frequency"].values, 1.0, np.inf))

mf = mape_frac(yF, pred_F)
mt = mape_frac(yT, pred_T)
ms = mape_frac(yS, pred_S)
avg = float(np.nanmean([mf, mt, ms]))

print("Backtest last-H (%):",
      "avg", round(avg*100,4),
      "| total", round(mt*100,4),
      "| freq", round(mf*100,4),
      "| sev", round(ms*100,4))
print(pd.DataFrame({"ym": valid["year_month"], "yF": yF, "pF": pred_F, "yT": yT, "pT": pred_T}))

Backtest last-H (%): avg 5.7701 | total 6.4797 | freq 6.2969 | sev 4.5335
        ym     yF     pF            yT            pT
0  2025-03  230.0  241.0  1.367924e+10  1.282809e+10
1  2025-04  208.0  236.0  1.116425e+10  1.263251e+10
2  2025-05  239.0  236.0  1.222680e+10  1.272460e+10
3  2025-06  234.0  239.0  1.337312e+10  1.315459e+10
4  2025-07  264.0  238.0  1.369923e+10  1.269653e+10
