In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file in it.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/Data_Klaim.csv
/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/sample_submission.csv
/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/Data_Polis.csv


# DATA FOUNDATION

In [2]:
# ============================================================
# STAGE 1 v4 — FOUNDATION (DATASET-AWARE + NO TARGET DISTORTION)
# - Fix YYYYMMDD parsing
# - Keep RAW nominal for target (total_claim)
# - Put winsorization into separate column (optional features)
# - Build monthly with complete month range (fill missing months)
# - Exposure: claimant / inforce (optional)
# ============================================================

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Folder holding the competition files; both raw tables are read with pandas defaults.
BASE_PATH = "/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/"
klaim, polis = (
    pd.read_csv(f"{BASE_PATH}{csv_name}")
    for csv_name in ("Data_Klaim.csv", "Data_Polis.csv")
)

# =============================
# CLEAN COLUMN NAMES
# =============================
def clean_columns(df):
    """Return a copy of ``df`` whose column labels are normalized.

    Labels are cast to ``str``, stripped of surrounding whitespace, lower-cased,
    and every space, ``/`` and ``-`` is replaced by an underscore.
    The input frame itself is left untouched.
    """
    normalized = df.columns.astype(str).str.strip().str.lower()
    for separator in (" ", "/", "-"):
        normalized = normalized.str.replace(separator, "_", regex=False)

    renamed = df.copy()
    renamed.columns = normalized
    return renamed

# Normalize the headers of both raw tables (lower-case, underscores instead of separators).
klaim, polis = clean_columns(klaim), clean_columns(polis)

# =============================
# DATE PARSING (handle YYYYMMDD int + dd/mm/yyyy)
# =============================
def parse_mixed_date(s: pd.Series) -> pd.Series:
    """Parse a column that may mix ``YYYYMMDD`` values with other date strings.

    Parameters
    ----------
    s : pd.Series
        Raw column. Numeric input is cast to nullable integers first, so
        ``20240115.0`` is read as ``"20240115"``; anything else is converted to
        stripped strings.

    Returns
    -------
    pd.Series
        ``datetime64[ns]`` series carrying the index of ``s``. Missing values and
        values that cannot be parsed become ``NaT``.

    Notes
    -----
    Each row is parsed by exactly one rule:

    * exactly eight digits  -> ``format="%Y%m%d"``
    * contains ``/``        -> ``dayfirst=True``
    * anything else         -> pandas' default inference

    Rows are selected by position rather than by index label, so the result is
    also correct when ``s`` has a non-unique index.
    """
    if pd.api.types.is_numeric_dtype(s):
        text = s.astype("Int64").astype(str)
    else:
        text = s.astype(str).str.strip()

    # The casts above spell missing values as text; turn those back into NaN.
    text = text.replace({"<NA>": np.nan, "nan": np.nan, "None": np.nan, "NaT": np.nan})

    present = text.notna().to_numpy(dtype=bool)
    is_ymd = text.str.fullmatch(r"\d{8}", na=False).to_numpy(dtype=bool)
    has_slash = text.str.contains("/", regex=False, na=False).to_numpy(dtype=bool)

    raw_values = text.to_numpy(dtype=object)
    parsed = np.full(len(text), np.datetime64("NaT"), dtype="datetime64[ns]")

    def _parse_into(mask, **to_datetime_kwargs):
        # Parse only the rows selected by `mask` and write them back in place.
        if mask.any():
            converted = pd.to_datetime(
                pd.Series(raw_values[mask]), errors="coerce", **to_datetime_kwargs
            )
            parsed[mask] = converted.to_numpy(dtype="datetime64[ns]")

    _parse_into(is_ymd, format="%Y%m%d")
    remaining = present & ~is_ymd
    _parse_into(remaining & has_slash, dayfirst=True)
    _parse_into(remaining & ~has_slash)

    return pd.Series(parsed, index=s.index, dtype="datetime64[ns]")

# Convert every column whose name contains "tanggal" to datetime, in both tables.
for table in (klaim, polis):
    date_columns = [name for name in table.columns if "tanggal" in name]
    for date_col in date_columns:
        table[date_col] = parse_mixed_date(table[date_col])

# =============================
# SAFE DEDUP
# =============================
# First claim-id column that exists in `klaim`; None when none of the candidates is present.
claim_id_col = next(
    (name for name in ("claim_id", "id_klaim", "klaim_id") if name in klaim.columns),
    None,
)

# With an id column keep one row per id; without one drop only fully identical rows
# (subset=None compares all columns).
dedup_subset = None if claim_id_col is None else [claim_id_col]
klaim = klaim.drop_duplicates(subset=dedup_subset).reset_index(drop=True)

# One row per policy number.
polis = polis.drop_duplicates(subset=["nomor_polis"]).reset_index(drop=True)

# =============================
# BASIC CLEANING
# =============================
# choose service date column
# Service date: prefer the hospital admission date, otherwise the first "tanggal" column.
if "tanggal_pasien_masuk_rs" in klaim.columns:
    service_col = "tanggal_pasien_masuk_rs"
else:
    service_col = next((name for name in klaim.columns if "tanggal" in name), None)

if service_col is None:
    raise ValueError("No tanggal column found in klaim for building year_month.")

# Claims without a policy number or without a service date cannot be placed in a month.
klaim = klaim.dropna(subset=["nomor_polis", service_col]).copy()

# Nominal column: the approved amount if present, else the first column containing "nominal".
nom_col = "nominal_klaim_yang_disetujui"
if nom_col not in klaim.columns:
    nominal_like = [name for name in klaim.columns if "nominal" in name]
    if not nominal_like:
        raise ValueError("No nominal column found in klaim.")
    nom_col = nominal_like[0]

# Target amount: numeric, unparseable/missing -> 0, negatives floored at 0. No winsorization here.
raw_nom = pd.to_numeric(klaim[nom_col], errors="coerce").fillna(0).clip(lower=0)
klaim[nom_col] = raw_nom

# Winsorized copy for feature engineering only: positive amounts are clipped to their
# 0.5% / 99.5% quantiles; zeros are left as they are.
winsorized = raw_nom.copy()
pos = winsorized > 0
if pos.any():
    low_q = winsorized[pos].quantile(0.005)
    high_q = winsorized[pos].quantile(0.995)
    winsorized[pos] = winsorized[pos].clip(low_q, high_q)
klaim["nominal_klaim_clip"] = winsorized

# =============================
# MERGE
# =============================
# Attach policy attributes to every claim; claims without a matching policy are kept.
df = pd.merge(klaim, polis, how="left", on="nomor_polis")

# Service month of each claim, plus the gap-free month range between first and last claim.
df["year_month"] = df[service_col].dt.to_period("M")
min_m, max_m = df["year_month"].min(), df["year_month"].max()
all_months = pd.period_range(start=min_m, end=max_m, freq="M")

# ============================================================
# EXPOSURE OPTIONS
# ============================================================
EXPOSURE_MODE = "inforce"  # "claimant" or "inforce"

# Claimant exposure: number of distinct policies with at least one claim in the month
# (months without any claim get 0).
policies_with_claims = df.groupby("year_month")["nomor_polis"].nunique()
expo_claimant = (
    policies_with_claims
    .reindex(all_months, fill_value=0)
    .rename("exposure_claimant")
    .rename_axis("year_month")
    .reset_index()
)

# In-force exposure: running count of policies that have started. No end date is used,
# so a policy is never removed once it has started.
start_col = next(
    (
        name
        for name in ("tanggal_efektif_polis", "tanggal_mulai_polis", "tanggal_mulai")
        if name in polis.columns
    ),
    None,
)

if start_col is None:
    # No start-date column available: all-zero placeholder, replaced by the fallback below.
    expo_inforce = expo_claimant[["year_month"]].copy()
    expo_inforce["exposure_inforce"] = 0
else:
    starts = polis[["nomor_polis", start_col]].dropna(subset=[start_col]).copy()
    starts["start_m"] = starts[start_col].dt.to_period("M")

    # Policies that started before the first claim month form the opening level;
    # later starts are added cumulatively month by month.
    already_running = starts.loc[starts["start_m"] < min_m, "nomor_polis"].nunique()
    new_per_month = (
        starts.loc[starts["start_m"] >= min_m]
        .groupby("start_m")["nomor_polis"]
        .nunique()
    )
    cumulative_new = new_per_month.reindex(all_months, fill_value=0).cumsum()

    expo_inforce = (
        (already_running + cumulative_new)
        .rename("exposure_inforce")
        .rename_axis("year_month")
        .reset_index()
    )

expo = expo_claimant.merge(expo_inforce, on="year_month", how="left")

# Select the exposure series for the configured mode; any mode other than "inforce"
# uses the claimant series.
expo["exposure"] = np.where(EXPOSURE_MODE == "inforce", expo["exposure_inforce"], expo["exposure_claimant"])
# Safety net: if the in-force series sums to zero (e.g. start dates failed to parse),
# use the claimant series instead.
inforce_is_all_zero = expo["exposure"].sum() == 0
if EXPOSURE_MODE == "inforce" and inforce_is_all_zero:
    expo["exposure"] = expo["exposure_claimant"]

# Attach the monthly exposure to every claim row (used by later stages).
df = df.merge(expo[["year_month", "exposure"]], on="year_month", how="left")
df["active_policies"] = df["exposure"]

# ============================================================
# MONTHLY CORE TABLE (complete months)
# target total_claim MUST be RAW nominal
# ============================================================
# Column that is counted to get the monthly number of claims.
freq_col = "nomor_polis" if claim_id_col is None else claim_id_col

# One row per month of the complete range; months without claims get 0 / 0.
monthly_core = (
    df.groupby("year_month")
      .agg(frequency=(freq_col, "count"), total_claim=(nom_col, "sum"))
      .reindex(all_months, fill_value=0)
      .rename_axis("year_month")
      .reset_index()
)

monthly = monthly_core.merge(expo[["year_month", "exposure"]], on="year_month", how="left")

# Ratios: a zero denominator yields NaN instead of inf.
monthly["severity"] = monthly["total_claim"] / monthly["frequency"].replace(0, np.nan)
monthly["claim_rate"] = monthly["frequency"] / monthly["exposure"].replace(0, np.nan)

# ============================================================
# LOG FEATURES
# ============================================================
for log_name, level_name in (
    ("log_total", "total_claim"),
    ("log_freq", "frequency"),
    ("log_sev", "severity"),
    ("log_rate", "claim_rate"),
):
    monthly[log_name] = np.log1p(monthly[level_name])

# ============================================================
# VOLATILITY
# ============================================================
# Rolling windows include the current month. The regime flag compares against the
# median of vol_ratio over all months in the table.
six_month_window = monthly["total_claim"].rolling(6, min_periods=3)
monthly["roll6"] = six_month_window.mean()
monthly["std6"] = six_month_window.std()
monthly["vol_ratio"] = monthly["std6"] / monthly["roll6"]
monthly["high_vol_regime"] = monthly["vol_ratio"].gt(monthly["vol_ratio"].median()).astype(int)

# ============================================================
# TIME FEATURES
# ============================================================
monthly["month"] = monthly["year_month"].dt.month
month_angle = 2*np.pi*monthly["month"]/12
monthly["month_sin"] = np.sin(month_angle)
monthly["month_cos"] = np.cos(month_angle)
monthly["month_index"] = np.arange(len(monthly))

# ============================================================
# SAFE LAGS
# ============================================================
# Every lag feature only looks at earlier rows; roll3 is the mean of lags 1-3.
for col in ["log_total", "log_freq", "log_sev", "log_rate"]:
    previous = monthly[col].shift(1)
    monthly[f"{col}_lag1"] = previous
    for lag in (2, 3):
        monthly[f"{col}_lag{lag}"] = monthly[col].shift(lag)
    monthly[f"{col}_roll3"] = previous.rolling(3).mean()

# Drops every row that has a NaN in any column (the first months, which lack lags
# or rolling statistics, and any month with an undefined ratio).
monthly = monthly.dropna().reset_index(drop=True)

# ============================================================
# FINAL CHECK
# ============================================================
print("SERVICE_COL:", service_col)
print("EXPOSURE_MODE:", EXPOSURE_MODE)
print("Policy start col:", start_col)
print("Frequency source:", freq_col)
print("Monthly shape:", monthly.shape)
print("Unique months:", monthly["year_month"].nunique())
print("Exposure min/max:", float(monthly["exposure"].min()), float(monthly["exposure"].max()))
print("Total_claim min/max:", float(monthly["total_claim"].min()), float(monthly["total_claim"].max()))
print("\nSTAGE 1 v4 â€” READY")

SERVICE_COL: tanggal_pasien_masuk_rs
EXPOSURE_MODE: inforce
Policy start col: tanggal_efektif_polis
Frequency source: claim_id
Monthly shape: (16, 34)
Unique months: 16
Exposure min/max: 4096.0 4096.0
Total_claim min/max: 9610379678.55 17480540371.87

STAGE 1 v4 â€” READY


In [3]:
# Sanity check: claims per unit of exposure for the most recent months.
tmp = monthly.assign(freq_per_exposure=monthly["frequency"] / monthly["exposure"])
print(tmp[["year_month","frequency","exposure","freq_per_exposure"]].tail(10))
print("freq_per_exposure min/max:",
      tmp["freq_per_exposure"].min(),
      tmp["freq_per_exposure"].max())

   year_month  frequency  exposure  freq_per_exposure
6     2024-10        274      4096           0.066895
7     2024-11        270      4096           0.065918
8     2024-12        238      4096           0.058105
9     2025-01        216      4096           0.052734
10    2025-02        246      4096           0.060059
11    2025-03        230      4096           0.056152
12    2025-04        208      4096           0.050781
13    2025-05        239      4096           0.058350
14    2025-06        234      4096           0.057129
15    2025-07        264      4096           0.064453
freq_per_exposure min/max: 0.05078125 0.06689453125


# TIME-SERIES DATASET ENGINEERING

In [4]:
# ============================================================
# STAGE 2 — ELITE SEGMENT PANEL (SAFE VERSION)
# No KeyError • Auto-create missing columns • Short series safe
# ============================================================

import numpy as np
import pandas as pd

# ============================================================
# 🔹 ENSURE REQUIRED SEGMENT COLUMNS EXIST
# ============================================================

# Care Type
# Each segment column is created only when `df` does not already have it.

# Care type: upper-cased, stripped copy of `inpatient_outpatient`.
if "care_type" not in df.columns:
    if "inpatient_outpatient" in df.columns:
        care_source = df["inpatient_outpatient"]
        care_text = care_source.astype(str).str.upper().str.strip()
        # astype(str) can turn a missing value into the literal text "NAN", which the
        # fillna below would never catch; send those rows to "UNKNOWN" explicitly.
        df["care_type"] = care_text.where(care_source.notna(), "UNKNOWN")
    else:
        df["care_type"] = "UNKNOWN"

df["care_type"] = df["care_type"].fillna("UNKNOWN")


# Cashless flag: 1 when `reimburse_cashless` is "C" (case/whitespace-insensitive), else 0.
# Missing values therefore count as 0.
if "is_cashless" not in df.columns:
    if "reimburse_cashless" in df.columns:
        rc = df["reimburse_cashless"].astype(str).str.upper().str.strip()
        df["is_cashless"] = rc.eq("C").astype(int)
    else:
        df["is_cashless"] = 0


# Hospital location bucket: three named countries, everything else (including missing
# locations) falls into "OTHER".
if "rs_bucket" not in df.columns:
    if "lokasi_rs" in df.columns:
        loc = df["lokasi_rs"].astype(str).str.upper().str.strip()
        df["rs_bucket"] = np.select(
            [
                loc.eq("INDONESIA"),
                loc.eq("SINGAPORE"),
                loc.eq("MALAYSIA")
            ],
            ["ID","SG","MY"],
            default="OTHER"
        )
    else:
        df["rs_bucket"] = "OTHER"

df["rs_bucket"] = df["rs_bucket"].fillna("OTHER")


# Plan code: used as-is; claims without one are grouped under "UNKNOWN" so that the
# later groupby does not drop them.
if "plan_code" not in df.columns:
    df["plan_code"] = "UNKNOWN"

df["plan_code"] = df["plan_code"].fillna("UNKNOWN")

# ============================================================
# 🔹 DEFINE SEGMENT COLUMNS
# ============================================================

seg_cols = ["plan_code","care_type","is_cashless","rs_bucket"]

# ============================================================
# 🔹 BUILD SEGMENT MONTHLY
# ============================================================

# Use the nominal column that Stage 1 resolved (it may have fallen back to another
# "nominal" column); default to the standard name if that variable is not defined.
seg_nominal_col = globals().get("nom_col", "nominal_klaim_yang_disetujui")

# One row per (month, segment) combination that has at least one claim; combinations
# without claims produce no row.
seg_monthly = (
    df.groupby(["year_month"] + seg_cols)
      .agg(
          frequency=("nomor_polis","count"),
          total_claim=(seg_nominal_col,"sum"),
          exposure=("nomor_polis","nunique")
      )
      .reset_index()
      .sort_values(seg_cols + ["year_month"])
      .reset_index(drop=True)
)

# ============================================================
# 🔹 TARGETS
# ============================================================

# Average claim size; a zero frequency yields NaN rather than inf.
seg_monthly["severity"] = (
    seg_monthly["total_claim"] /
    seg_monthly["frequency"].replace(0, np.nan)
)

seg_monthly["log_total"] = np.log1p(seg_monthly["total_claim"])
seg_monthly["log_freq"]  = np.log1p(seg_monthly["frequency"])
seg_monthly["log_sev"]   = np.log1p(seg_monthly["severity"])

# ============================================================
# 🔹 CALENDAR
# ============================================================

# Month-of-year plus its sine/cosine encoding (period 12).
seg_monthly["month"] = seg_monthly["year_month"].dt.month
seg_monthly["month_sin"] = np.sin(2*np.pi*seg_monthly["month"]/12)
seg_monthly["month_cos"] = np.cos(2*np.pi*seg_monthly["month"]/12)

# ============================================================
# 🔹 LAGS (STRICT NO LEAKAGE)
# ============================================================

# Per-segment lag features. Every feature uses earlier rows of the same segment only.
# NOTE(review): shifts count rows, not calendar months; if a segment has no row for
# some month, "lag1" is its previous observed month - confirm this is intended.
for col in ["log_total","log_freq","log_sev"]:
    for lag in (1, 2, 3):
        seg_monthly[f"{col}_lag{lag}"] = seg_monthly.groupby(seg_cols)[col].shift(lag)

    # Mean of the three preceding rows of the segment (current row excluded).
    seg_monthly[f"{col}_roll3"] = (
        seg_monthly.groupby(seg_cols)[col]
        .transform(lambda history: history.shift(1).rolling(3).mean())
    )

# ============================================================
# 🔹 MOMENTUM
# ============================================================

# Change in log total between the two preceding rows.
seg_monthly["momentum_total"] = seg_monthly["log_total_lag1"].sub(seg_monthly["log_total_lag2"])

# ============================================================
# 🔹 SEGMENT WEIGHT
# ============================================================

# Share of the month's claim count that belongs to the segment (0 where undefined).
month_total_frequency = seg_monthly.groupby("year_month")["frequency"].transform("sum")
seg_monthly["seg_weight"] = seg_monthly["frequency"].div(month_total_frequency).fillna(0)

# ============================================================
# 🔹 SAFE TRAIN WINDOW
# ============================================================

# Keep rows that have a full three-row history, then zero-fill whatever is still missing.
has_three_lags = seg_monthly["log_total_lag3"].notna()
seg_model = seg_monthly.loc[has_three_lags].reset_index(drop=True).fillna(0)

# ============================================================
# FINAL CHECK
# ============================================================

print("COMPACT PANEL SHAPE:", seg_model.shape)
print("Unique segments:", seg_model[seg_cols].drop_duplicates().shape[0])
print("Columns:", len(seg_model.columns))
print("\nSTAGE 2 â€” ELITE SEGMENT PANEL READY")

COMPACT PANEL SHAPE: (414, 29)
Unique segments: 41
Columns: 29

STAGE 2 â€” ELITE SEGMENT PANEL READY


# MODEL DEVELOPMENT

In [5]:
# ============================================================
# STAGE 3 v17 — KAGGLE-MATCH VALIDATION (AUTO-TUNED SHRINK)
# - Horizon = unique months in sample_submission (usually 5)
# - Predict TOTAL & FREQ directly (ETS on log1p), derive SEVERITY
# - True recursive (refit each step on simulated history)
# - Auto grid-search shrink weights + anchor type (mean/median)
# ============================================================

import numpy as np
import pandas as pd
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import warnings
warnings.filterwarnings("ignore")

# The sample submission defines which months have to be forecast.
BASE_PATH = "/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/"
sample_sub = pd.read_csv(f"{BASE_PATH}sample_submission.csv")

def mape(y_true, y_pred):
    """Mean absolute percentage error, in percent.

    Positions where the actual value is exactly 0 are ignored. Returns NaN when
    no position is left to score.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)

    scorable = actual != 0
    if not scorable.any():
        return np.nan

    abs_pct_err = np.abs((actual[scorable] - forecast[scorable]) / actual[scorable])
    return np.mean(abs_pct_err) * 100

# ==============================
# BUILD MONTHLY (same aggregation as Stage 1 v4, but without re-indexing to the full month range)
# ==============================
# Count / nominal columns as resolved by Stage 1 (which may have fallen back to other
# column names); default to the standard names if those variables are not defined.
_count_col = globals().get("freq_col", "claim_id")
_nominal_col = globals().get("nom_col", "nominal_klaim_yang_disetujui")

# Portfolio-level monthly table built from the claim-level frame. Exposure is constant
# within a month, so the first value per month is taken.
monthly = (
    df.groupby("year_month")
      .agg(
          frequency=(_count_col, "count"),
          total_claim=(_nominal_col, "sum"),
          exposure=("active_policies", "first"),
      )
      .reset_index()
      .sort_values("year_month")
      .reset_index(drop=True)
)

# Ratios: a zero denominator yields NaN instead of inf.
monthly["severity"]   = monthly["total_claim"] / monthly["frequency"].replace(0, np.nan)
monthly["claim_rate"] = monthly["frequency"] / monthly["exposure"].replace(0, np.nan)

# ==============================
# HORIZON = months in sample_submission (Kaggle behavior)
# ==============================
# The first two "_"-separated parts of each id are read as year and month.
id_parts = sample_sub["id"].str.split("_")
sample_sub["year"] = id_parts.str[0]
sample_sub["month"] = id_parts.str[1]
sample_sub["month_key"] = sample_sub["year"] + "-" + sample_sub["month"]

future_periods = (
    pd.PeriodIndex(sample_sub["month_key"], freq="M")
      .unique()
      .sort_values()
)

# Validation horizon = number of months to forecast, capped so that at least
# six months remain for training (and never below one month).
H = int(len(future_periods))
H = min(H, max(1, len(monthly) - 6))

# ==============================
# SIMULATOR (true recursive)
# ==============================
def _ets_next_level(history, fallback):
    """One-step-ahead damped-trend ETS forecast fitted on log1p(history), back on the level scale.

    Returns ``fallback`` when the model cannot be fitted or forecast.
    """
    try:
        fitted = ExponentialSmoothing(
            np.log1p(history),
            trend="add",
            damped_trend=True,
            seasonal=None
        ).fit()
        return float(np.expm1(fitted.forecast(1).iloc[0]))
    except Exception:
        # Only ordinary errors trigger the fallback; KeyboardInterrupt/SystemExit
        # still propagate so a long grid search can be interrupted.
        return float(fallback)


def _recent_anchor(history, how):
    """Median (``how == "median"``) or mean (anything else) of the last three values."""
    recent = history.tail(3)
    return float(recent.median()) if how == "median" else float(recent.mean())


def simulate(train_df, H, wt_total, wt_freq, anchor_total="mean", anchor_freq="mean"):
    """Recursive H-step forecast of monthly total claim, frequency and severity.

    For every step the total and the frequency are each predicted as a blend

        pred = wt * ETS_forecast + (1 - wt) * anchor

    where the anchor is the mean/median of the last three values. Each prediction is
    floored at 1.0 and appended to the history, so the next step is fitted on
    observed + simulated values. Severity is derived as total / frequency.

    Parameters
    ----------
    train_df : pd.DataFrame
        Must contain ``total_claim`` and ``frequency``; rows are used in their
        existing order. The frame is not modified.
    H : int
        Number of steps to simulate.
    wt_total, wt_freq : float
        Weight of the ETS forecast for total claim / frequency.
    anchor_total, anchor_freq : str
        ``"median"`` for a median anchor; any other value uses the mean.

    Returns
    -------
    tuple[list[float], list[float], list[float]]
        Predicted totals, frequencies and severities, one entry per step.
    """
    totals = train_df["total_claim"].astype(float).tolist()
    freqs = train_df["frequency"].astype(float).tolist()

    pred_total, pred_freq, pred_sev = [], [], []

    for _ in range(H):
        total_hist = pd.Series(totals, dtype=float)
        freq_hist = pd.Series(freqs, dtype=float)

        # ---- TOTAL: ETS blended with the recent-level anchor ----
        total_fc = _ets_next_level(total_hist, fallback=totals[-1])
        total_anchor = _recent_anchor(total_hist, anchor_total)
        total_pred = max(float(wt_total * total_fc + (1 - wt_total) * total_anchor), 1.0)

        # ---- FREQUENCY: same scheme ----
        freq_fc = _ets_next_level(freq_hist, fallback=freqs[-1])
        freq_anchor = _recent_anchor(freq_hist, anchor_freq)
        freq_pred = max(float(wt_freq * freq_fc + (1 - wt_freq) * freq_anchor), 1.0)

        pred_total.append(total_pred)
        pred_freq.append(freq_pred)
        pred_sev.append(total_pred / freq_pred)

        # Recursive step: the predictions become part of the next step's history.
        totals.append(total_pred)
        freqs.append(freq_pred)

    return pred_total, pred_freq, pred_sev

# ==============================
# SPLIT (Kaggle-match horizon)
# ==============================
# Hold out the last H months for validation; everything before is training history.
train = monthly.iloc[:-H].copy()
valid = monthly.iloc[-H:].copy()

# ==============================
# AUTO SEARCH (small grid, fast)
# ==============================
wt_total_grid = [0.35, 0.45, 0.55, 0.60, 0.65, 0.75, 0.85]
wt_freq_grid  = [0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80]

best = {
    "score": 1e18,
    "params": None,
    "detail": None,
    "preds": None,
}

# Exhaustive search: ETS weights x anchor types, scored by the mean of the three MAPEs.
# NOTE: the winner is selected on the same window that is reported below, so the
# printed score is an optimistic estimate.
for wt_t in wt_total_grid:
    for wt_f in wt_freq_grid:
        for a_t in ["mean", "median"]:
            for a_f in ["mean", "median"]:

                pt, pf, ps = simulate(train, H, wt_t, wt_f, a_t, a_f)

                mf = mape(valid["frequency"], pf)
                mt = mape(valid["total_claim"], pt)
                ms = mape(valid["severity"], ps)
                avg = float(np.nanmean([mf, mt, ms]))

                # A NaN score never compares as smaller, so it can never win.
                if avg < best["score"]:
                    best["score"] = avg
                    best["params"] = (wt_t, wt_f, a_t, a_f)
                    best["detail"] = (mf, mt, ms)
                    best["preds"] = (pt, pf, ps)

# ==============================
# RUN BEST + REPORT
# ==============================
if best["params"] is None:
    raise RuntimeError(
        "Grid search found no configuration with a finite validation score; "
        "check that `valid` contains non-zero targets."
    )

wt_t, wt_f, a_t, a_f = best["params"]
# Reuse the predictions stored during the search instead of simulating again.
pt, pf, ps = best["preds"]

mf, mt, ms = best["detail"]
avg = best["score"]

print("\n==============================")
print(f"Horizon months used : {H}")
print("Best Config:")
print(f"  wt_total={wt_t} (ETS weight), anchor_total={a_t}")
print(f"  wt_freq ={wt_f} (ETS weight), anchor_freq ={a_f}")
print("------------------------------")
print("STAGE 3 v17 MAPE Frequency :", round(mf, 4))
print("STAGE 3 v17 MAPE Total     :", round(mt, 4))
print("STAGE 3 v17 MAPE Severity  :", round(ms, 4))
print("Estimated Score            :", round(avg, 4))
print("==============================")

# Side-by-side view of actuals and predictions for the validation months.
check = valid[["year_month","frequency","total_claim","severity"]].copy()
check["pred_frequency"] = pf
check["pred_total"] = pt
check["pred_severity"] = ps
print("\nPreview last horizon months:")
print(check)


Horizon months used : 5
Best Config:
  wt_total=0.85 (ETS weight), anchor_total=median
  wt_freq =0.2 (ETS weight), anchor_freq =mean
------------------------------
STAGE 3 v17 MAPE Frequency : 5.1557
STAGE 3 v17 MAPE Total     : 7.9753
STAGE 3 v17 MAPE Severity  : 4.7684
Estimated Score            : 5.9665

Preview last horizon months:
   year_month  frequency   total_claim      severity  pred_frequency  \
14    2025-03        230  1.367924e+10  5.947496e+07      234.031716   
15    2025-04        208  1.116425e+10  5.367427e+07      232.851773   
16    2025-05        239  1.222680e+10  5.115814e+07      237.225688   
17    2025-06        234  1.337312e+10  5.715008e+07      234.888808   
18    2025-07        264  1.369923e+10  5.189101e+07      235.077202   

      pred_total  pred_severity  
14  1.224504e+10   5.232214e+07  
15  1.224868e+10   5.260289e+07  
16  1.222798e+10   5.154577e+07  
17  1.221086e+10   5.198572e+07  
18  1.219531e+10   5.187790e+07  


# TOTAL CLAIM OPTIMIZATION & VALIDATION, OPTUNA

In [6]:
# ============================================================
# STAGE 4 v30 — GLOBAL SEASON PRIOR (DESEASON LOG) + ETS + DRIFT + ANCHOR
# Goal: push the error down further (especially Frequency) by adding a month-of-year prior.
#
# IMPORTANT:
# - The season prior is computed from the full history (Jan 2024–Jul 2025).
# - This makes the CV splits at te=7/8 more optimistic (their training windows contain no Aug–Dec),
#   but it is usually more "Kaggle-useful" for forecasting Aug–Dec 2025.
# ============================================================

!pip install -q optuna statsmodels

import numpy as np
import pandas as pd
import optuna
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import warnings
warnings.filterwarnings("ignore")

SEED = 42
np.random.seed(SEED)

BASE_PATH = "/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/"
sample_sub = pd.read_csv(f"{BASE_PATH}sample_submission.csv")

# This stage needs the claim-level frame (with its service month) built by Stage 1.
assert "df" in globals(), "df belum ada. Jalankan Stage 1 dulu."
assert "year_month" in df.columns, "df['year_month'] belum ada."

# ------------------------------
# Horizon + future MOY
# ------------------------------
# The first two "_"-separated parts of each id are read as year and month.
id_parts = sample_sub["id"].str.split("_")
sample_sub["year"] = id_parts.str[0]
sample_sub["month"] = id_parts.str[1]
sample_sub["month_key"] = sample_sub["year"] + "-" + sample_sub["month"]

# Distinct months to forecast, their count, and the set of their month-of-year numbers.
future_periods = pd.PeriodIndex(sample_sub["month_key"], freq="M").unique().sort_values()
H0 = len(future_periods)
future_moy = {period.month for period in future_periods}

# ------------------------------
# Build monthly portfolio (complete)
# ------------------------------
# Count / nominal columns as resolved by Stage 1 (which may have fallen back to other
# column names); default to the standard names if those variables are not defined.
_count_col = globals().get("freq_col", "claim_id")
_nominal_col = globals().get("nom_col", "nominal_klaim_yang_disetujui")

monthly = (
    df.groupby("year_month")
      .agg(
          frequency=(_count_col,"count"),
          total_claim=(_nominal_col,"sum"),
      )
      .reset_index()
      .sort_values("year_month")
      .reset_index(drop=True)
)

# Make sure the month key is a monthly Period before building the full range.
if not isinstance(monthly.loc[0, "year_month"], pd.Period):
    monthly["year_month"] = pd.PeriodIndex(monthly["year_month"], freq="M")

min_m = monthly["year_month"].min()
max_m = monthly["year_month"].max()
all_months = pd.period_range(min_m, max_m, freq="M")

# Insert a zero row for every month of the range that has no claims.
monthly = (
    monthly.set_index("year_month")
           .reindex(all_months, fill_value=0.0)
           .rename_axis("year_month")
           .reset_index()
)

# Both series are floored at 1.0 so that the severity ratio is always defined.
monthly["frequency"]   = pd.to_numeric(monthly["frequency"], errors="coerce").fillna(0.0).clip(lower=1.0)
monthly["total_claim"] = pd.to_numeric(monthly["total_claim"], errors="coerce").fillna(0.0).clip(lower=1.0)
monthly["severity"]    = (monthly["total_claim"] / monthly["frequency"]).astype(float)
monthly["month"]       = monthly["year_month"].dt.month
monthly["t"]           = np.arange(len(monthly), dtype=int)

# Validation horizon: the forecast horizon, capped so that at least ten months of
# history remain (and never below one month).
N = len(monthly)
H = min(H0, max(1, N - 10))
print("N months:", N, "| Horizon H:", H, "| Future MOY:", sorted(list(future_moy)))

# ------------------------------
# CV splits (same pick, but weight Kaggle-like)
# ------------------------------
min_train = 7

def overlap_ratio(te):
    """Share of the H validation months starting at row `te` whose month-of-year is in `future_moy`."""
    window_months = monthly.iloc[te:te+H]["month"].tolist()
    return sum(1 for m in window_months if m in future_moy) / float(H)

# Candidate splits: every training end `te` that leaves a full H-month validation window.
# Each candidate is (blended score, season overlap, recency, te).
cands = []
for te in range(min_train, N - H + 1):
    overlap = overlap_ratio(te)
    recency = te / float(N)
    cands.append((0.65*overlap + 0.35*recency, overlap, recency, te))

cands_sorted = sorted(cands, reverse=True)
# Keep the two splits with the best season overlap and the two most recent ones
# (duplicates collapse, so fewer than four splits may remain).
top_season = sorted(cands, key=lambda cand: (cand[1], cand[0]), reverse=True)[:2]
top_recent = sorted(cands, key=lambda cand: cand[2], reverse=True)[:2]
picked = {cand[3] for cand in top_season + top_recent}
train_ends = sorted(picked)

# Split weights: dominated by season overlap (raised to OVERLAP_POWER), with a small
# recency term; floored at 1e-6 and normalized to sum to 1.
OVERLAP_POWER = 4.0
RECENCY_MIX = 0.10

ov = np.array([overlap_ratio(te) for te in train_ends], float)
rc = np.array([te/float(N) for te in train_ends], float)
w_raw = np.maximum((ov ** OVERLAP_POWER) + RECENCY_MIX * rc, 1e-6)
split_w = w_raw / w_raw.sum()

print("CV train_ends:", train_ends, "| weights:", split_w.round(3).tolist())
for te in train_ends:
    valid_months = monthly.iloc[te:te+H]["year_month"].astype(str).tolist()
    print("  split te=", te, "| valid months:", valid_months)

# ------------------------------
# Metrics
# ------------------------------
def mape_frac(y_true, y_pred):
    """Mean absolute percentage error as a fraction (0.1 == 10%).

    Only positions where both values are finite and the actual value is non-zero
    are scored. Returns NaN when no such position exists.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)

    scorable = np.isfinite(actual) & np.isfinite(forecast) & (actual != 0)
    if not scorable.any():
        return np.nan

    abs_pct_err = np.abs((actual[scorable] - forecast[scorable]) / actual[scorable])
    return float(np.mean(abs_pct_err))

def avg_mape(yF, pF, yT, pT):
    """Average of the frequency, total and severity MAPEs (as fractions).

    Severity is derived on both sides as total / frequency, with the frequency
    floored at 1.0.

    Returns
    -------
    tuple
        ``(average, total_mape, frequency_mape, severity_mape)``; the average
        ignores NaN components.
    """
    actual_freq, pred_freq = (np.asarray(values, float) for values in (yF, pF))
    actual_total, pred_total = (np.asarray(values, float) for values in (yT, pT))

    actual_sev = actual_total / np.clip(actual_freq, 1.0, np.inf)
    pred_sev = pred_total / np.clip(pred_freq, 1.0, np.inf)

    freq_err = mape_frac(actual_freq, pred_freq)
    total_err = mape_frac(actual_total, pred_total)
    sev_err = mape_frac(actual_sev, pred_sev)

    overall = float(np.nanmean([freq_err, total_err, sev_err]))
    return overall, total_err, freq_err, sev_err

# ------------------------------
# GLOBAL season prior (log space)
# ------------------------------
def build_season_log(series_level, months, clip_abs=0.25):
    """Additive month-of-year offsets in log1p space.

    For each month ``m`` in 1..12 the offset is
    ``median(log1p(x) for month m) - median(log1p(x) over everything)``,
    clipped to ``[-clip_abs, +clip_abs]``. Months without observations get 0.
    Negative levels are treated as 0 before the log.

    Returns
    -------
    np.ndarray
        Length-13 array indexed by month number; position 0 is unused and stays 0.
    """
    log_level = np.log1p(np.asarray(series_level, float).clip(min=0))
    month_of = np.asarray(months, int)
    overall_median = float(np.median(log_level))

    season = np.zeros(13, dtype=float)
    for month_no in range(1, 13):
        in_month = log_level[month_of == month_no]
        if in_month.size:
            season[month_no] = float(np.median(in_month) - overall_median)

    limit = float(clip_abs)
    return np.clip(season, -limit, limit)

# Month-of-year offsets estimated once from the complete `monthly` table, i.e. from
# all months, including those that serve as validation windows in the CV splits.
# Frequency offsets are clipped to +/-0.30, total-claim offsets to +/-0.35 (log1p scale).
SEASON_LOG_F_FULL = build_season_log(monthly["frequency"].values, monthly["month"].values, clip_abs=0.30)
SEASON_LOG_T_FULL = build_season_log(monthly["total_claim"].values, monthly["month"].values, clip_abs=0.35)

# ------------------------------
# Helpers
# ------------------------------
def ets_1step_on_log(series_log):
    """One-step-ahead exponential-smoothing forecast of a (log-scale) series.

    * fewer than 4 points: returns the last value (no model is fitted);
    * 4 to 9 points: simple exponential smoothing without trend;
    * 10 or more points: additive damped trend.

    If fitting or forecasting fails, the last value is returned instead.
    Raises ``IndexError`` for an empty series.
    """
    y = np.asarray(series_log, float)
    last_value = float(y[-1])
    if len(y) < 4:
        return last_value

    use_trend = len(y) >= 10
    try:
        fitted = ExponentialSmoothing(
            y,
            trend="add" if use_trend else None,
            damped_trend=use_trend,
            seasonal=None,
            initialization_method="estimated"
        ).fit()
        return float(fitted.forecast(1)[0])
    except Exception:
        # Only ordinary errors trigger the fallback; KeyboardInterrupt/SystemExit
        # still propagate so a long optimization run can be interrupted.
        return last_value

def drift_on_log(series_log, k=3):
    """Naive drift forecast: last value plus the mean of the last ``k`` first differences.

    With fewer than ``k + 2`` points the last value is returned unchanged.
    """
    values = np.asarray(series_log, float)
    enough_history = len(values) >= (k + 2)
    if not enough_history:
        return float(values[-1])

    recent_steps = np.diff(values[-(k+1):])
    return float(values[-1] + np.mean(recent_steps))

def anchor_on_log(series_log, k=3, how="mean"):
    """Median (``how == "median"``) or mean (any other value) of the last ``k`` entries."""
    window = np.asarray(series_log[-k:], float)
    reducer = np.median if how == "median" else np.mean
    return float(reducer(window))

# ------------------------------
# Simulator (deseason log -> forecast -> reseason)
# ------------------------------
def simulate(train_df, H, P):
    """Recursively forecast ``H`` months of claim frequency and total claim.

    Each step:
      1. removes the scaled month-of-year prior (SEASON_LOG_F_FULL /
         SEASON_LOG_T_FULL) from the log1p history,
      2. blends an ETS forecast, a drift forecast and a recent-level anchor
         in that de-seasonalised log space,
      3. adds the prior for the forecast month back and converts to level
         space with expm1,
      4. clamps the result to ``[max(1, anchor*cap_low), anchor*cap_high]``
         where ``anchor`` is the re-seasonalised anchor level,
      5. appends the prediction to the history used by later steps.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Monthly history with columns "year_month" (last value must support
        ``+ 1`` and expose ``.month``), "month", "frequency", "total_claim".
        The frame is not modified.
    H : int
        Number of months to forecast.
    P : mapping
        Parameters: k_anchor, anchor_f, anchor_t, w_ets_f, w_drift_f,
        w_ets_t, w_drift_t, drift_k_f, drift_k_t, season_scale_f,
        season_scale_t, capF_low, capF_high, capT_low, capT_high.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Predicted frequency and predicted total claim, each of length ``H``.
    """
    if H <= 0:
        return np.array([], float), np.array([], float)

    # season strengths
    sf = float(P["season_scale_f"])
    st = float(P["season_scale_t"])
    k_anchor = int(P["k_anchor"])

    # The running history lives in plain lists that are extended in place.
    # Only the month and the two log series feed later steps, so there is no
    # need to rebuild a DataFrame with pd.concat on every iteration.
    months = [int(m) for m in train_df["month"].values]
    logF = np.log1p(train_df["frequency"].values).tolist()
    logT = np.log1p(train_df["total_claim"].values).tolist()
    period = train_df["year_month"].iloc[-1]

    pF, pT = [], []

    for _ in range(H):
        period = period + 1
        m_next = int(period.month)
        month_idx = np.asarray(months, dtype=int)

        # deseason logs
        logF_ds = np.asarray(logF, float) - sf * SEASON_LOG_F_FULL[month_idx]
        logT_ds = np.asarray(logT, float) - st * SEASON_LOG_T_FULL[month_idx]

        # forecast in deseason space; the anchor takes the leftover weight
        f_ets = ets_1step_on_log(logF_ds)
        f_drift = drift_on_log(logF_ds, k=int(P["drift_k_f"]))
        f_an = anchor_on_log(logF_ds, k=k_anchor, how=P["anchor_f"])

        wE = float(P["w_ets_f"]); wD = float(P["w_drift_f"])
        f_ds_pred = wE*f_ets + wD*f_drift + (1.0-wE-wD)*f_an

        t_ets = ets_1step_on_log(logT_ds)
        t_drift = drift_on_log(logT_ds, k=int(P["drift_k_t"]))
        t_an = anchor_on_log(logT_ds, k=k_anchor, how=P["anchor_t"])

        wE = float(P["w_ets_t"]); wD = float(P["w_drift_t"])
        t_ds_pred = wE*t_ets + wD*t_drift + (1.0-wE-wD)*t_an

        # reseason
        season_f_next = sf * SEASON_LOG_F_FULL[m_next]
        season_t_next = st * SEASON_LOG_T_FULL[m_next]
        F_pred = float(np.expm1(float(f_ds_pred + season_f_next)))
        T_pred = float(np.expm1(float(t_ds_pred + season_t_next)))

        # clamp vs anchor in LEVEL space: the band is centred on the anchor
        # level re-seasonalised for the forecast month
        F_an = float(np.expm1(float(f_an + season_f_next)))
        T_an = float(np.expm1(float(t_an + season_t_next)))

        F_pred = float(np.clip(F_pred, max(1.0, F_an*float(P["capF_low"])), F_an*float(P["capF_high"])))
        T_pred = float(np.clip(T_pred, max(1.0, T_an*float(P["capT_low"])), T_an*float(P["capT_high"])))

        pF.append(F_pred)
        pT.append(T_pred)

        # feed the prediction back as if it had been observed
        months.append(m_next)
        logF.append(float(np.log1p(F_pred)))
        logT.append(float(np.log1p(T_pred)))

    return np.array(pF, float), np.array(pT, float)

# ------------------------------
# CV runner
# ------------------------------
def run_cv(P):
    """Score a parameter set on the rolling-origin splits.

    For every ``(train_end, weight)`` pair from ``train_ends`` / ``split_w``,
    the first ``train_end`` rows of ``monthly`` are used as history, the next
    ``H`` rows as validation, and ``simulate`` is scored with ``avg_mape``.

    Returns
    -------
    (float, list)
        The weighted sum of per-split average MAPE, and one tuple
        ``(train_end, avg, total_mape, freq_mape, sev_mape)`` per split.
    """
    weighted_total = 0.0
    per_split = []
    for train_end, weight in zip(train_ends, split_w):
        valid_stop = train_end + H
        history = monthly.iloc[:train_end].copy().reset_index(drop=True)
        holdout = monthly.iloc[train_end:valid_stop].copy().reset_index(drop=True)

        pred_freq, pred_total = simulate(history, H, P)
        avg, m_total, m_freq, m_sev = avg_mape(
            holdout["frequency"].values, pred_freq,
            holdout["total_claim"].values, pred_total,
        )
        weighted_total += weight * avg
        per_split.append((train_end, avg, m_total, m_freq, m_sev))
    return float(weighted_total), per_split

# ------------------------------
# Baseline params
# ------------------------------
# Hand-set starting parameters for simulate(); also the first trial handed to
# the tuner further down.
P0 = dict(
    # number of trailing points summarised by the anchor, and how
    k_anchor=5,
    anchor_f="mean",
    anchor_t="median",

    # blend weights for ETS and drift; the anchor receives 1 - w_ets - w_drift
    w_ets_f=0.70, w_drift_f=0.20,
    w_ets_t=0.25, w_drift_t=0.10,

    # number of recent differences averaged by the drift forecast
    drift_k_f=3,
    drift_k_t=4,

    # season prior strength (multiplier applied to the seasonal log offset)
    season_scale_f=1.0,
    season_scale_t=0.8,

    # clamps: allowed band around the anchor level, as multipliers
    capF_low=0.70, capF_high=1.45,
    capT_low=0.70, capT_high=1.40,
)

# Score the baseline on the CV splits and report overall and per-split MAPE in %.
base_score, base_rows = run_cv(P0)
print("Baseline CV %:", round(base_score*100, 4))
print("Baseline per-split (%):")
for te, sc, mt, mf, ms in base_rows:
    print(f"  te={te} | avg={sc*100:.3f} | total={mt*100:.3f} | freq={mf*100:.3f} | sev={ms*100:.3f}")

# ------------------------------
# Optuna tuning
# ------------------------------
def objective(trial):
    """Optuna objective: sample a parameter set and return its weighted CV score.

    Trials whose ETS + drift weights exceed 0.95 for either target (leaving
    the anchor less than 0.05) are not simulated and return the penalty
    value 1e9.
    """
    P = {
        "k_anchor": trial.suggest_int("k_anchor", 3, 8),
        "anchor_f": trial.suggest_categorical("anchor_f", ["mean","median"]),
        "anchor_t": trial.suggest_categorical("anchor_t", ["mean","median"]),

        "w_ets_f": trial.suggest_float("w_ets_f", 0.30, 0.90),
        "w_drift_f": trial.suggest_float("w_drift_f", 0.00, 0.40),
        "w_ets_t": trial.suggest_float("w_ets_t", 0.10, 0.85),
        "w_drift_t": trial.suggest_float("w_drift_t", 0.00, 0.35),

        "drift_k_f": trial.suggest_int("drift_k_f", 2, 5),
        "drift_k_t": trial.suggest_int("drift_k_t", 2, 6),

        "season_scale_f": trial.suggest_float("season_scale_f", 0.0, 1.2),
        "season_scale_t": trial.suggest_float("season_scale_t", 0.0, 1.2),

        "capF_low": trial.suggest_float("capF_low", 0.55, 0.90),
        "capF_high": trial.suggest_float("capF_high", 1.05, 1.80),
        "capT_low": trial.suggest_float("capT_low", 0.55, 0.90),
        "capT_high": trial.suggest_float("capT_high", 1.05, 1.80),
    }

    # convex constraints (anchor weight >= 0.05), frequency first, then total
    for ets_key, drift_key in (("w_ets_f", "w_drift_f"), ("w_ets_t", "w_drift_t")):
        if (P[ets_key] + P[drift_key]) > 0.95:
            return 1e9

    score, _rows = run_cv(P)
    return score

# Only warnings and above are logged, so per-trial messages are suppressed.
optuna.logging.set_verbosity(optuna.logging.WARNING)
# Minimise the weighted CV score with a seeded TPE sampler.
study = optuna.create_study(direction="minimize", sampler=optuna.samplers.TPESampler(seed=SEED))
# Queue the hand-set baseline so it is evaluated as one of the trials.
study.enqueue_trial(P0)

N_TRIALS = 200
study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=True)

# Re-score the winning parameters to recover the per-split breakdown.
bestP = study.best_params
best_score, best_rows = run_cv(bestP)

print("\n==============================")
# `ov` is defined earlier in this cell, outside the lines visible here; it is printed as the per-split "overlap".
print("Splits:", train_ends, "| weights:", split_w.round(3).tolist(), "| overlap:", ov.round(3).tolist())
print("Best Params:", bestP)
print("CV Best %  :", round(best_score*100, 4))
print("==============================")
print("Best per-split (%):")
for te, sc, mt, mf, ms in best_rows:
    print(f"  te={te} | avg={sc*100:.3f} | total={mt*100:.3f} | freq={mf*100:.3f} | sev={ms*100:.3f}")

print("\nSTAGE 4 v30 â€” READY (global season prior)")

N months: 19 | Horizon H: 5 | Future MOY: [8, 9, 10, 11, 12]
CV train_ends: [7, 8, 13, 14] | weights: [0.636, 0.277, 0.042, 0.045]
  split te= 7 | valid months: ['2024-08', '2024-09', '2024-10', '2024-11', '2024-12']
  split te= 8 | valid months: ['2024-09', '2024-10', '2024-11', '2024-12', '2025-01']
  split te= 13 | valid months: ['2025-02', '2025-03', '2025-04', '2025-05', '2025-06']
  split te= 14 | valid months: ['2025-03', '2025-04', '2025-05', '2025-06', '2025-07']
Baseline CV %: 3.2595
Baseline per-split (%):
  te=7 | avg=1.588 | total=1.175 | freq=2.059 | sev=1.530
  te=8 | avg=6.439 | total=9.479 | freq=4.847 | sev=4.992
  te=13 | avg=6.772 | total=8.802 | freq=8.416 | sev=3.098
  te=14 | avg=4.022 | total=2.204 | freq=5.524 | sev=4.338


  0%|          | 0/200 [00:00<?, ?it/s]


Splits: [7, 8, 13, 14] | weights: [0.636, 0.277, 0.042, 0.045] | overlap: [1.0, 0.8, 0.0, 0.0]
Best Params: {'k_anchor': 4, 'anchor_f': 'mean', 'anchor_t': 'median', 'w_ets_f': 0.4140902012641963, 'w_drift_f': 0.23427959162969844, 'w_ets_t': 0.1253855830888168, 'w_drift_t': 0.0557421860395439, 'drift_k_f': 5, 'drift_k_t': 5, 'season_scale_f': 1.007823739519456, 'season_scale_t': 0.9473922819889735, 'capF_low': 0.8025030597351724, 'capF_high': 1.4449152050189749, 'capT_low': 0.5583495487333925, 'capT_high': 1.7549317144514707}
CV Best %  : 2.7153
Best per-split (%):
  te=7 | avg=1.006 | total=0.853 | freq=1.516 | sev=0.651
  te=8 | avg=5.992 | total=9.223 | freq=4.088 | sev=4.666
  te=13 | avg=5.574 | total=6.910 | freq=7.950 | sev=1.862
  te=14 | avg=4.018 | total=2.471 | freq=4.810 | sev=4.773

STAGE 4 v30 â€” READY (global season prior)


# TEST PREDICTION & KAGGLE SUBMISSION

In [7]:
# ============================================================
# STAGE 5 v31 — FINAL SUBMISSION (Stage3 + Stage4 v30) 2D BLEND (wF, wT)
# - Stage3: ETS log1p (freq & total) + anchor shrink
# - Stage4 v30: GLOBAL SEASON PRIOR (deseason log) + ETS + drift + anchor + clamp
# - Blend separately:
#     F = (1-wF)*F3 + wF*F4
#     T = (1-wT)*T3 + wT*T4
#   Severity = T/F
# - CV splits & weights MATCH Stage4 v30 (Kaggle-like): [7,8,13,14] weights [0.636,0.277,0.042,0.045]
# ============================================================

!pip install -q statsmodels

import numpy as np
import pandas as pd
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import warnings
warnings.filterwarnings("ignore")

SEED = 42
np.random.seed(SEED)

BASE_PATH = "/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/"
sample_sub = pd.read_csv(BASE_PATH + "sample_submission.csv")

# This cell relies on `df` left in the kernel by an earlier cell.
# Messages (Indonesian): "df does not exist yet. Run Stage 1 first." /
# "df['year_month'] does not exist yet."
assert "df" in globals(), "df belum ada. Jalankan Stage 1 dulu."
assert "year_month" in df.columns, "df['year_month'] belum ada."

# ------------------------------
# Horizon + future periods
# ------------------------------
# Submission ids look like "2025_08_Claim_Frequency": the first two
# underscore-separated tokens are the year and the month.
sample_sub["year"]  = sample_sub["id"].str.split("_").str[0]
sample_sub["month"] = sample_sub["id"].str.split("_").str[1]
sample_sub["month_key"] = sample_sub["year"] + "-" + sample_sub["month"]

# Distinct months requested by the submission file, in chronological order.
future_periods = (
    pd.PeriodIndex(sample_sub["month_key"], freq="M")
      .unique()
      .sort_values()
)
# Forecast horizon = number of distinct months to predict.
H = int(len(future_periods))
# Calendar months (1..12) that appear in the forecast window.
future_moy = set([p.month for p in future_periods])

# ------------------------------
# Build monthly portfolio (complete range)
# ------------------------------
# Aggregate the claim-level frame to one row per month: number of claims and
# summed approved claim amount.
monthly = (
    df.groupby("year_month")
      .agg(
          frequency=("claim_id","count"),
          total_claim=("nominal_klaim_yang_disetujui","sum"),
      )
      .reset_index()
      .sort_values("year_month")
      .reset_index(drop=True)
)
# Message (Indonesian): "monthly is empty. Check df/year_month."
if len(monthly) == 0:
    raise ValueError("monthly kosong. Cek df/year_month.")

# Make sure the month key is a monthly Period so that `+ 1` and `.dt.month` work.
if not isinstance(monthly.loc[0, "year_month"], pd.Period):
    monthly["year_month"] = pd.PeriodIndex(monthly["year_month"], freq="M")

# Reindex onto the complete month range so gaps in the data become explicit rows.
min_m = monthly["year_month"].min()
max_m = monthly["year_month"].max()
all_months = pd.period_range(min_m, max_m, freq="M")

monthly = (
    monthly.set_index("year_month")
           .reindex(all_months, fill_value=0.0)
           .rename_axis("year_month")
           .reset_index()
)
# Both series are floored at 1.0 (this includes months filled in above), which
# keeps the severity ratio below free of division by zero.
monthly["frequency"]   = pd.to_numeric(monthly["frequency"], errors="coerce").fillna(0.0).clip(lower=1.0)
monthly["total_claim"] = pd.to_numeric(monthly["total_claim"], errors="coerce").fillna(0.0).clip(lower=1.0)
monthly["severity"]    = (monthly["total_claim"] / monthly["frequency"]).astype(float)
monthly["month"]       = monthly["year_month"].dt.month
monthly["t"]           = np.arange(len(monthly), dtype=int)

N = len(monthly)
# Cap the horizon at N - 10 (but never below 1).
H = min(H, max(1, N - 10))
print("N months:", N, "| H:", H, "| Future MOY:", sorted(list(future_moy)))

# ============================================================
# Metrics
# ============================================================
def mape_frac(y_true, y_pred):
    """Mean absolute percentage error as a fraction (0.05 means 5%).

    Pairs where either value is non-finite, or where the true value is 0,
    are ignored. Returns NaN when no usable pair remains.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)

    usable = np.isfinite(actual) & np.isfinite(forecast) & (actual != 0)
    if not usable.any():
        return np.nan

    kept_actual = actual[usable]
    rel_err = np.abs((kept_actual - forecast[usable]) / kept_actual)
    return float(np.mean(rel_err))

def avg_mape(yF, pF, yT, pT):
    """Average of the frequency, total-claim and severity MAPE.

    Severity is derived as total / frequency, with the frequency floored at
    1.0 on both the true and the predicted side.

    Returns
    -------
    tuple
        ``(average, total_mape, freq_mape, severity_mape)``; the average
        ignores NaN components.
    """
    true_f, pred_f, true_t, pred_t = (np.asarray(v, float) for v in (yF, pF, yT, pT))

    true_sev = true_t / np.clip(true_f, 1.0, np.inf)
    pred_sev = pred_t / np.clip(pred_f, 1.0, np.inf)

    mf = mape_frac(true_f, pred_f)
    mt = mape_frac(true_t, pred_t)
    ms = mape_frac(true_sev, pred_sev)

    overall = float(np.nanmean([mf, mt, ms]))
    return overall, mt, mf, ms

# ============================================================
# Stage 3 v17 (your fixed)
# ============================================================
# Fixed Stage 3 settings: per target, the weight given to the ETS forecast
# (the remainder goes to the recent-level anchor) and the anchor statistic;
# k_anchor is the number of trailing months the anchor summarises.
S3 = {
    "wt_total": 0.85,
    "anchor_total": "median",
    "wt_freq": 0.20,
    "anchor_freq": "mean",
    "k_anchor": 3,
}

def anchor_level_series(x: pd.Series, k: int, how: str):
    """Return the median (``how == "median"``) or otherwise the mean of the last ``k`` values of ``x``."""
    recent = np.asarray(x.tail(k), dtype=float)
    if how == "median":
        return float(np.median(recent))
    return float(np.mean(recent))

def ets_1step_log1p_series(level_series: pd.Series, trend="add", damped=True):
    """One-step-ahead ETS forecast of a level series, fitted in log1p space.

    Parameters
    ----------
    level_series : pandas.Series
        History in level (un-logged) units; values are floored at 1e-12
        before the log1p transform.
    trend : {"add", None}
        Trend component requested from ExponentialSmoothing. Dropped (together
        with damping) when fewer than 10 points are available.
    damped : bool
        Whether the trend is damped; ignored when there is no trend.

    Returns
    -------
    float
        Forecast in level units. With fewer than 4 points the (floored) last
        value is returned. If fitting/forecasting raises, or the forecast is
        not finite, the raw last value of ``level_series`` is returned.
    """
    y = np.log1p(level_series.astype(float).clip(lower=1e-12))
    if len(y) < 4:
        return float(np.expm1(y.iloc[-1]))
    if trend is not None and len(y) < 10:
        trend = None
        damped = False
    try:
        m = ExponentialSmoothing(
            y, trend=trend,
            damped_trend=(damped if trend is not None else False),
            seasonal=None
        ).fit()
        forecast = float(np.expm1(m.forecast(1).iloc[0]))
    except Exception:
        # Best-effort fallback for fit failures only. `Exception` (rather than a
        # bare except) lets KeyboardInterrupt / SystemExit propagate.
        return float(level_series.iloc[-1])
    # A non-finite forecast would be fed back into the recursive simulation,
    # so treat it like a failed fit.
    if not np.isfinite(forecast):
        return float(level_series.iloc[-1])
    return forecast

def simulate_stage3(train_df: pd.DataFrame, H: int):
    """Recursive Stage 3 forecast: ETS (log1p) shrunk towards a recent-level anchor.

    For each of the ``H`` steps, total claim and frequency are each predicted
    as ``w * ets_forecast + (1 - w) * anchor`` using the weights and anchor
    kinds in ``S3``, floored at 1.0, and appended to the history used by the
    next step.

    Parameters
    ----------
    train_df : pandas.DataFrame
        History with "frequency" and "total_claim" columns. Not modified.
    H : int
        Number of steps to forecast.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Predicted frequency and predicted total claim, each of length ``H``.
    """
    k = int(S3["k_anchor"])
    w_total = float(S3["wt_total"])
    w_freq = float(S3["wt_freq"])

    # Histories are kept as plain lists and extended in place; rebuilding the
    # whole DataFrame with pd.concat on every step is unnecessary because only
    # these two series feed the next step.
    total_hist = train_df["total_claim"].astype(float).tolist()
    freq_hist = train_df["frequency"].astype(float).tolist()

    pF, pT = [], []
    for _ in range(H):
        total_series = pd.Series(total_hist, dtype=float, name="total_claim")
        freq_series = pd.Series(freq_hist, dtype=float, name="frequency")

        tot_fc = ets_1step_log1p_series(total_series, trend="add", damped=True)
        tot_anchor = anchor_level_series(total_series, k, S3["anchor_total"])
        tot_pred = max(1.0, w_total*tot_fc + (1-w_total)*tot_anchor)

        fre_fc = ets_1step_log1p_series(freq_series, trend="add", damped=True)
        fre_anchor = anchor_level_series(freq_series, k, S3["anchor_freq"])
        fre_pred = max(1.0, w_freq*fre_fc + (1-w_freq)*fre_anchor)

        pF.append(fre_pred)
        pT.append(tot_pred)

        # feed the prediction back as if it had been observed
        total_hist.append(tot_pred)
        freq_hist.append(fre_pred)

    return np.array(pF, float), np.array(pT, float)

# ============================================================
# Stage 4 v30 BEST params (PASTE FROM YOUR OUTPUT)
# ============================================================
# Tuned Stage 4 v30 parameters, hard-coded from the "Best Params" line printed
# by the tuning cell. They must be re-pasted by hand whenever that cell is
# re-run and produces different values.
BEST4 = {
    'k_anchor': 4, 'anchor_f': 'mean', 'anchor_t': 'median',
    'w_ets_f': 0.4140902012641963, 'w_drift_f': 0.23427959162969844,
    'w_ets_t': 0.1253855830888168, 'w_drift_t': 0.0557421860395439,
    'drift_k_f': 5, 'drift_k_t': 5,
    'season_scale_f': 1.007823739519456, 'season_scale_t': 0.9473922819889735,
    'capF_low': 0.8025030597351724, 'capF_high': 1.4449152050189749,
    'capT_low': 0.5583495487333925, 'capT_high': 1.7549317144514707
}

def build_season_log(series_level, months, clip_abs=0.30):
    """Month-of-year offsets in log1p space, relative to the overall median.

    Levels are floored at 0, transformed with log1p, and for each calendar
    month 1..12 the offset is the month's median minus the median of the
    whole series, clipped to ``[-clip_abs, clip_abs]``. Months without data
    keep an offset of 0.

    Returns a length-13 float array indexed by month number (index 0 unused).
    """
    log_values = np.log1p(np.clip(np.asarray(series_level, float), 0, None))
    moy = np.asarray(months, int)
    center = float(np.median(log_values))
    limit = float(clip_abs)

    season = np.zeros(13, dtype=float)
    for month in range(1, 13):
        members = log_values[moy == month]
        if len(members):
            season[month] = float(np.median(members) - center)
    return np.clip(season, -limit, limit)

# season prior from FULL history (exact Stage4 v30 behavior)
# NOTE(review): the blend CV in this cell slices its validation months out of
# the same `monthly` frame, so those months already inform these priors —
# confirm this is intended.
SEASON_LOG_F_FULL = build_season_log(monthly["frequency"].values, monthly["month"].values, clip_abs=0.30)
SEASON_LOG_T_FULL = build_season_log(monthly["total_claim"].values, monthly["month"].values, clip_abs=0.35)

def ets_1step_on_log(series_log):
    """One-step-ahead exponential-smoothing forecast of an already-logged series.

    A damped additive trend is used once at least 10 points are available;
    with fewer points the model has no trend component.

    Parameters
    ----------
    series_log : array-like of float
        History in log space.

    Returns
    -------
    float
        Forecast for the next step, in the same log space. The last observed
        value is returned instead when the history has fewer than 4 points,
        when fitting/forecasting raises, or when the forecast is not finite.
    """
    y = np.asarray(series_log, float)
    if len(y) < 4:
        return float(y[-1])

    has_trend = len(y) >= 10
    try:
        model = ExponentialSmoothing(
            y,
            trend="add" if has_trend else None,
            damped_trend=has_trend,
            seasonal=None,
            initialization_method="estimated",
        ).fit()
        forecast = float(model.forecast(1)[0])
    except Exception:
        # Best-effort fallback for fit failures only. `Exception` (rather than a
        # bare except) lets KeyboardInterrupt / SystemExit propagate.
        return float(y[-1])

    # A non-finite forecast would be appended to the simulated history and
    # contaminate every later step, so treat it like a failed fit.
    if not np.isfinite(forecast):
        return float(y[-1])
    return forecast

def drift_on_log(series_log, k=3):
    """Last value plus the average of the ``k`` most recent first differences.

    When the series has fewer than ``k + 2`` points the last value is
    returned unchanged.
    """
    values = np.asarray(series_log, float)
    enough_history = not (len(values) < (k + 2))
    if not enough_history:
        return float(values[-1])
    recent = values[-(k + 1):]
    mean_step = np.mean(recent[1:] - recent[:-1])
    return float(values[-1] + mean_step)

def anchor_on_log(series_log, k=3, how="mean"):
    """Median (``how == "median"``) or otherwise mean of the last ``k`` entries, as a float."""
    window = np.asarray(series_log[-k:], float)
    if how == "median":
        level = np.median(window)
    else:
        level = np.mean(window)
    return float(level)

def simulate_stage4v30(train_df: pd.DataFrame, H: int):
    """Recursive Stage 4 v30 forecast using the fixed ``BEST4`` parameters.

    Each step removes the scaled month-of-year prior (SEASON_LOG_F_FULL /
    SEASON_LOG_T_FULL) from the log1p history, blends an ETS forecast, a
    drift forecast and a recent-level anchor in that de-seasonalised space,
    adds the prior for the forecast month back, converts to level space and
    clamps the result to ``[max(1, anchor*cap_low), anchor*cap_high]`` around
    the re-seasonalised anchor. The prediction is then appended to the
    history used by later steps.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Monthly history with columns "year_month" (last value must support
        ``+ 1`` and expose ``.month``), "month", "frequency", "total_claim".
        The frame is not modified.
    H : int
        Number of months to forecast.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Predicted frequency and predicted total claim, each of length ``H``.
    """
    if H <= 0:
        return np.array([], float), np.array([], float)

    P = BEST4
    sf = float(P["season_scale_f"])
    st = float(P["season_scale_t"])
    k_anchor = int(P["k_anchor"])
    wE_f = float(P["w_ets_f"]); wD_f = float(P["w_drift_f"])
    wE_t = float(P["w_ets_t"]); wD_t = float(P["w_drift_t"])

    # The running history lives in plain lists that are extended in place.
    # Only the month and the two log series feed later steps, so there is no
    # need to rebuild a DataFrame with pd.concat on every iteration.
    months = [int(m) for m in train_df["month"].values]
    logF = np.log1p(train_df["frequency"].values).tolist()
    logT = np.log1p(train_df["total_claim"].values).tolist()
    period = train_df["year_month"].iloc[-1]

    pF, pT = [], []

    for _ in range(H):
        period = period + 1
        m_next = int(period.month)
        month_idx = np.asarray(months, dtype=int)

        # de-seasonalised log histories
        logF_ds = np.asarray(logF, float) - sf * SEASON_LOG_F_FULL[month_idx]
        logT_ds = np.asarray(logT, float) - st * SEASON_LOG_T_FULL[month_idx]

        # blend in de-seasonalised space; the anchor takes the leftover weight
        f_ets = ets_1step_on_log(logF_ds)
        f_drift = drift_on_log(logF_ds, k=int(P["drift_k_f"]))
        f_an = anchor_on_log(logF_ds, k=k_anchor, how=P["anchor_f"])
        f_ds_pred = wE_f*f_ets + wD_f*f_drift + (1.0-wE_f-wD_f)*f_an

        t_ets = ets_1step_on_log(logT_ds)
        t_drift = drift_on_log(logT_ds, k=int(P["drift_k_t"]))
        t_an = anchor_on_log(logT_ds, k=k_anchor, how=P["anchor_t"])
        t_ds_pred = wE_t*t_ets + wD_t*t_drift + (1.0-wE_t-wD_t)*t_an

        # re-seasonalise and return to level space
        season_f_next = sf * SEASON_LOG_F_FULL[m_next]
        season_t_next = st * SEASON_LOG_T_FULL[m_next]
        F_pred = float(np.expm1(float(f_ds_pred + season_f_next)))
        T_pred = float(np.expm1(float(t_ds_pred + season_t_next)))

        # clamp vs anchor (level)
        F_an = float(np.expm1(float(f_an + season_f_next)))
        T_an = float(np.expm1(float(t_an + season_t_next)))

        F_pred = float(np.clip(F_pred, max(1.0, F_an*float(P["capF_low"])), F_an*float(P["capF_high"])))
        T_pred = float(np.clip(T_pred, max(1.0, T_an*float(P["capT_low"])), T_an*float(P["capT_high"])))

        pF.append(F_pred)
        pT.append(T_pred)

        # feed the prediction back as if it had been observed
        months.append(m_next)
        logF.append(float(np.log1p(F_pred)))
        logT.append(float(np.log1p(T_pred)))

    return np.array(pF, float), np.array(pT, float)

# ============================================================
# CV splits & weights MUST match Stage4 v30 run
# ============================================================
# Hard-coded copies of the split positions and weights printed by the Stage 4
# tuning cell ("CV train_ends: [7, 8, 13, 14] | weights: [...]"). Each entry of
# train_ends is the number of leading rows of `monthly` used as history.
train_ends = [7, 8, 13, 14]
split_w = np.array([0.636, 0.277, 0.042, 0.045], float)
# Re-normalise because the pasted weights are rounded to three decimals.
split_w = split_w / split_w.sum()
print("Blend train_ends:", train_ends, "| weights:", split_w.round(3).tolist())

# ============================================================
# 2D blend search: choose wF and wT separately
# ============================================================
grid = np.linspace(0.0, 1.0, 21)  # step 0.05

# The Stage 3 / Stage 4 forecasts of a split do not depend on the blend
# weights, so each split is simulated exactly once here instead of once per
# (wF, wT) grid point (21 x 21 = 441 times), which re-fitted every ETS model.
split_forecasts = []
for te, w in zip(train_ends, split_w):
    tr = monthly.iloc[:te].copy().reset_index(drop=True)
    va = monthly.iloc[te:te+H].copy().reset_index(drop=True)

    F3, T3 = simulate_stage3(tr, H)
    F4, T4 = simulate_stage4v30(tr, H)
    split_forecasts.append((va["frequency"].values, va["total_claim"].values, F3, T3, F4, T4))

best = None
rows = []

for wF in grid:
    for wT in grid:
        split_scores = []
        for yF, yT, F3, T3, F4, T4 in split_forecasts:
            # w = 0 is pure Stage 3, w = 1 is pure Stage 4
            F = (1-wF)*F3 + wF*F4
            T = (1-wT)*T3 + wT*T4

            sc, mt, mf, ms = avg_mape(yF, F, yT, T)
            split_scores.append(sc)

        meanw = float(np.sum(split_w * np.array(split_scores)))
        worst = float(np.max(split_scores))
        rows.append([wF, wT, meanw, worst] + split_scores)

        # Tuple comparison: lowest weighted mean first, then lowest worst-split
        # score, then the smaller weights as the final tie-break.
        cand = (meanw, worst, wF, wT)
        if (best is None) or (cand < best):
            best = cand

best_mean, best_worst, wF_best, wT_best = best
print("\nBest blend (2D):")
print("  wF_best =", wF_best, "(Stage4 weight for Frequency)")
print("  wT_best =", wT_best, "(Stage4 weight for Total)")
print("  CV mean =", round(best_mean*100, 4), "% | worst =", round(best_worst*100, 4), "%")

dfb = pd.DataFrame(rows, columns=["wF","wT","mean_avg","worst_avg"] + [f"split_{i}" for i in range(len(train_ends))])
print("\nTop 10 by mean_avg:")
print(dfb.sort_values(["mean_avg","worst_avg"]).head(10))

# ============================================================
# FINAL forecast on FULL data
# ============================================================
# Forecast one step per requested future month. `H` may have been capped below
# len(future_periods) for cross-validation on a short history; using it here
# would leave the arrays shorter than future_periods and make F[i] below raise
# IndexError.
H_final = int(len(future_periods))
F3, T3 = simulate_stage3(monthly, H_final)
F4, T4 = simulate_stage4v30(monthly, H_final)

# Blend with the weights chosen by the CV grid search; severity is derived
# from the blended total and frequency (frequency floored at 1.0).
F = (1-wF_best)*F3 + wF_best*F4
T = (1-wT_best)*T3 + wT_best*T4
S = T / np.clip(F, 1.0, np.inf)

# Map every submission id ("YYYY_MM_<Target>") to its predicted value.
pred_map = {}
preview = []
for i, period in enumerate(future_periods):
    key = f"{period.year}_{str(period.month).zfill(2)}"
    pred_map[f"{key}_Claim_Frequency"] = float(F[i])
    pred_map[f"{key}_Total_Claim"]     = float(T[i])
    pred_map[f"{key}_Claim_Severity"]  = float(S[i])
    preview.append([str(period), float(F[i]), float(T[i]), float(S[i])])

sub = sample_sub.copy()
sub["value"] = sub["id"].map(pred_map)
missing = int(sub["value"].isna().sum())
print("NaN in submission:", missing)
# Message (Indonesian): "Some ids are not filled in. Check the key format."
assert missing == 0, "Ada id belum terisi. Cek key format."

sub = sub[["id","value"]]
sub.to_csv("submission.csv", index=False)

print("\nPreview future predictions:")
print(pd.DataFrame(preview, columns=["period","pred_freq","pred_total","pred_sev"]))
print("\nSaved: submission.csv")
print(sub.head(12))

N months: 19 | H: 5 | Future MOY: [8, 9, 10, 11, 12]
Blend train_ends: [7, 8, 13, 14] | weights: [0.636, 0.277, 0.042, 0.045]

Best blend (2D):
  wF_best = 1.0 (Stage4 weight for Frequency)
  wT_best = 1.0 (Stage4 weight for Total)
  CV mean = 2.7148 % | worst = 5.9924 %

Top 10 by mean_avg:
       wF    wT  mean_avg  worst_avg   split_0   split_1   split_2   split_3
440  1.00  1.00  0.027148   0.059924  0.010063  0.059924  0.055742  0.040181
419  0.95  1.00  0.027608   0.060917  0.010501  0.060917  0.053238  0.040423
439  1.00  0.95  0.027650   0.060254  0.010683  0.060254  0.056984  0.039366
418  0.95  0.95  0.028299   0.060834  0.011568  0.060834  0.054961  0.039608
398  0.90  1.00  0.029244   0.062815  0.012304  0.062815  0.052115  0.040664
417  0.95  0.90  0.029478   0.061349  0.013117  0.061349  0.057053  0.038792
438  1.00  0.90  0.029507   0.061363  0.013037  0.061363  0.059136  0.038551
397  0.90  0.95  0.029595   0.062734  0.012836  0.062734  0.053834  0.039848
396  0.90  0.9

In [8]:
print(sub.head(12))  # NOTE(review): the original trailing note here was just "19,23433" — presumably a recorded score for this submission; confirm and label it

                         id         value
0   2025_08_Claim_Frequency  2.232146e+02
1    2025_08_Claim_Severity  5.929055e+07
2       2025_08_Total_Claim  1.323452e+10
3   2025_09_Claim_Frequency  2.041827e+02
4    2025_09_Claim_Severity  5.933691e+07
5       2025_09_Total_Claim  1.211557e+10
6   2025_10_Claim_Frequency  2.703015e+02
7    2025_10_Claim_Severity  4.604846e+07
8       2025_10_Total_Claim  1.244697e+10
9   2025_11_Claim_Frequency  2.655660e+02
10   2025_11_Claim_Severity  5.043058e+07
11      2025_11_Total_Claim  1.339265e+10
