In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/Data_Klaim.csv
/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/sample_submission.csv
/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/Data_Polis.csv


# DATA FOUNDATION

In [2]:
# ============================================================
# STAGE 1 v2 â€” MAPE OPTIMIZED FOUNDATION
# Short Series Safe â€¢ No Target Distortion â€¢ Trend Ready
# ============================================================

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

BASE_PATH = "/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/"

klaim = pd.read_csv(BASE_PATH + "Data_Klaim.csv")
polis = pd.read_csv(BASE_PATH + "Data_Polis.csv")

# ============================================================
# CLEAN COLUMN NAMES
# ============================================================

def clean_columns(df):
    df.columns = (
        df.columns
        .str.strip()
        .str.lower()
        .str.replace(" ", "_", regex=False)
        .str.replace("/", "_", regex=False)
        .str.replace("-", "_", regex=False)
    )
    return df

klaim = clean_columns(klaim)
polis = clean_columns(polis)

klaim = klaim.drop_duplicates().reset_index(drop=True)
polis = polis.drop_duplicates().reset_index(drop=True)

# ============================================================
# DATE PARSING
# ============================================================

for col in klaim.columns:
    if "tanggal" in col:
        klaim[col] = pd.to_datetime(klaim[col], errors="coerce")

for col in polis.columns:
    if "tanggal" in col:
        polis[col] = pd.to_datetime(polis[col], errors="coerce")

# ============================================================
# BASIC CLEANING
# ============================================================

klaim = klaim.dropna(subset=["nomor_polis", "tanggal_pasien_masuk_rs"])
klaim["nominal_klaim_yang_disetujui"] = klaim["nominal_klaim_yang_disetujui"].fillna(0)

# Light winsorization (too aggressive smoothing hurts MAPE)
low_q = klaim["nominal_klaim_yang_disetujui"].quantile(0.005)
high_q = klaim["nominal_klaim_yang_disetujui"].quantile(0.995)

klaim["nominal_klaim_yang_disetujui"] = \
    klaim["nominal_klaim_yang_disetujui"].clip(low_q, high_q)

# ============================================================
# MERGE
# ============================================================

df = klaim.merge(polis, on="nomor_polis", how="left")

# ============================================================
# SERVICE MONTH
# ============================================================

df["year_month"] = df["tanggal_pasien_masuk_rs"].dt.to_period("M")

# ============================================================
# TRUE EXPOSURE (NO OVER-SMOOTH)
# ============================================================

exposure_monthly = (
    df.groupby("year_month")
      .agg(active_policies=("nomor_polis","nunique"))
      .reset_index()
      .sort_values("year_month")
)

df = df.merge(exposure_monthly, on="year_month", how="left")

# ============================================================
# MONTHLY CORE TABLE (DO NOT MODIFY RAW TOTAL)
# ============================================================

monthly = (
    df.groupby("year_month")
      .agg(
          frequency=("nomor_polis","count"),
          total_claim=("nominal_klaim_yang_disetujui","sum"),
          exposure=("active_policies","mean")
      )
      .reset_index()
      .sort_values("year_month")
      .reset_index(drop=True)
)

# Keep raw total_claim untouched

monthly["severity"] = (
    monthly["total_claim"] /
    monthly["frequency"].replace(0,np.nan)
)

monthly["claim_rate"] = (
    monthly["frequency"] /
    monthly["exposure"].replace(0,np.nan)
)

# ============================================================
# LOG DOMAIN FEATURES (IMPORTANT FOR MAPE)
# ============================================================

monthly["log_total"] = np.log1p(monthly["total_claim"])
monthly["log_freq"]  = np.log1p(monthly["frequency"])
monthly["log_sev"]   = np.log1p(monthly["severity"])
monthly["log_rate"]  = np.log1p(monthly["claim_rate"])

# ============================================================
# VOLATILITY (SHORT SERIES SAFE)
# ============================================================

monthly["roll6"] = monthly["total_claim"].rolling(6, min_periods=3).mean()
monthly["std6"]  = monthly["total_claim"].rolling(6, min_periods=3).std()

monthly["vol_ratio"] = monthly["std6"] / monthly["roll6"]

monthly["high_vol_regime"] = (
    monthly["vol_ratio"] > monthly["vol_ratio"].median()
).astype(int)

# ============================================================
# TIME FEATURES
# ============================================================

monthly["month"] = monthly["year_month"].dt.month
monthly["month_sin"] = np.sin(2*np.pi*monthly["month"]/12)
monthly["month_cos"] = np.cos(2*np.pi*monthly["month"]/12)
monthly["month_index"] = np.arange(len(monthly))

# ============================================================
# SAFE LAGS (NO ZERO FILL)
# ============================================================

for col in ["log_total","log_freq","log_sev","log_rate"]:
    monthly[f"{col}_lag1"] = monthly[col].shift(1)
    monthly[f"{col}_lag2"] = monthly[col].shift(2)
    monthly[f"{col}_lag3"] = monthly[col].shift(3)

    monthly[f"{col}_roll3"] = monthly[col].shift(1).rolling(3).mean()

# ============================================================
# DROP EARLY MONTHS (IMPORTANT FOR MAPE STABILITY)
# ============================================================

monthly = monthly.dropna().reset_index(drop=True)

# ============================================================
# FINAL CHECK
# ============================================================

print("Monthly shape:", monthly.shape)
print("Unique months:", monthly["year_month"].nunique())
print("Vol regime ratio:", round(monthly["high_vol_regime"].mean(),3))
print("\nSTAGE 1 v2 â€” MAPE OPTIMIZED FOUNDATION READY")


Monthly shape: (16, 34)
Unique months: 16
Vol regime ratio: 0.438

STAGE 1 v2 â€” MAPE OPTIMIZED FOUNDATION READY


# TIME-SERIES DATASET ENGINEERING

In [3]:
# ============================================================
# STAGE 2 â€” ELITE SEGMENT PANEL (SAFE VERSION)
# No KeyError â€¢ Auto-create missing columns â€¢ Short series safe
# ============================================================

import numpy as np
import pandas as pd

# ============================================================
# ðŸ”¹ ENSURE REQUIRED SEGMENT COLUMNS EXIST
# ============================================================

# Care Type
if "care_type" not in df.columns:
    if "inpatient_outpatient" in df.columns:
        df["care_type"] = (
            df["inpatient_outpatient"]
            .astype(str)
            .str.upper()
            .str.strip()
        )
    else:
        df["care_type"] = "UNKNOWN"

df["care_type"] = df["care_type"].fillna("UNKNOWN")


# Cashless
if "is_cashless" not in df.columns:
    if "reimburse_cashless" in df.columns:
        rc = df["reimburse_cashless"].astype(str).str.upper().str.strip()
        df["is_cashless"] = rc.eq("C").astype(int)
    else:
        df["is_cashless"] = 0


# RS Bucket
if "rs_bucket" not in df.columns:
    if "lokasi_rs" in df.columns:
        loc = df["lokasi_rs"].astype(str).str.upper().str.strip()
        df["rs_bucket"] = np.select(
            [
                loc.eq("INDONESIA"),
                loc.eq("SINGAPORE"),
                loc.eq("MALAYSIA")
            ],
            ["ID","SG","MY"],
            default="OTHER"
        )
    else:
        df["rs_bucket"] = "OTHER"

df["rs_bucket"] = df["rs_bucket"].fillna("OTHER")


# Plan Code
if "plan_code" not in df.columns:
    df["plan_code"] = "UNKNOWN"

df["plan_code"] = df["plan_code"].fillna("UNKNOWN")

# ============================================================
# ðŸ”¹ DEFINE SEGMENT COLUMNS
# ============================================================

seg_cols = ["plan_code","care_type","is_cashless","rs_bucket"]

# ============================================================
# ðŸ”¹ BUILD SEGMENT MONTHLY
# ============================================================

seg_monthly = (
    df.groupby(["year_month"] + seg_cols)
      .agg(
          frequency=("nomor_polis","count"),
          total_claim=("nominal_klaim_yang_disetujui","sum"),
          exposure=("nomor_polis","nunique")
      )
      .reset_index()
      .sort_values(seg_cols + ["year_month"])
      .reset_index(drop=True)
)

# ============================================================
# ðŸ”¹ TARGETS
# ============================================================

seg_monthly["severity"] = (
    seg_monthly["total_claim"] /
    seg_monthly["frequency"].replace(0, np.nan)
)

seg_monthly["log_total"] = np.log1p(seg_monthly["total_claim"])
seg_monthly["log_freq"]  = np.log1p(seg_monthly["frequency"])
seg_monthly["log_sev"]   = np.log1p(seg_monthly["severity"])

# ============================================================
# ðŸ”¹ CALENDAR
# ============================================================

seg_monthly["month"] = seg_monthly["year_month"].dt.month
seg_monthly["month_sin"] = np.sin(2*np.pi*seg_monthly["month"]/12)
seg_monthly["month_cos"] = np.cos(2*np.pi*seg_monthly["month"]/12)

# ============================================================
# ðŸ”¹ LAGS (STRICT NO LEAKAGE)
# ============================================================

for col in ["log_total","log_freq","log_sev"]:
    
    seg_monthly[f"{col}_lag1"] = \
        seg_monthly.groupby(seg_cols)[col].shift(1)
    
    seg_monthly[f"{col}_lag2"] = \
        seg_monthly.groupby(seg_cols)[col].shift(2)
    
    seg_monthly[f"{col}_lag3"] = \
        seg_monthly.groupby(seg_cols)[col].shift(3)

    seg_monthly[f"{col}_roll3"] = \
        seg_monthly.groupby(seg_cols)[col] \
        .transform(lambda x: x.shift(1).rolling(3).mean())

# ============================================================
# ðŸ”¹ MOMENTUM
# ============================================================

seg_monthly["momentum_total"] = (
    seg_monthly["log_total_lag1"] -
    seg_monthly["log_total_lag2"]
)

# ============================================================
# ðŸ”¹ SEGMENT WEIGHT
# ============================================================

seg_monthly["seg_weight"] = (
    seg_monthly["frequency"] /
    seg_monthly.groupby("year_month")["frequency"].transform("sum")
).fillna(0)

# ============================================================
# ðŸ”¹ SAFE TRAIN WINDOW
# ============================================================

seg_model = seg_monthly[
    seg_monthly["log_total_lag3"].notna()
].reset_index(drop=True)

seg_model = seg_model.fillna(0)

# ============================================================
# FINAL CHECK
# ============================================================

print("COMPACT PANEL SHAPE:", seg_model.shape)
print("Unique segments:", seg_model[seg_cols].drop_duplicates().shape[0])
print("Columns:", len(seg_model.columns))
print("\nSTAGE 2 â€” ELITE SEGMENT PANEL READY")

COMPACT PANEL SHAPE: (414, 29)
Unique segments: 41
Columns: 29

STAGE 2 â€” ELITE SEGMENT PANEL READY


# MODEL DEVELOPMENT

In [4]:
# ============================================================
# STRUCTURAL v14 â€” KAGGLE-MATCH VALIDATION
# Train full history â€¢ Predict last 4 months only
# ============================================================

import numpy as np
import pandas as pd
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import warnings
warnings.filterwarnings("ignore")

def mape(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    mask = y_true != 0
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100

# ==============================
# BUILD MONTHLY
# ==============================

monthly = (
    df.groupby("year_month")
      .agg(
          frequency=("claim_id","count"),
          total_claim=("nominal_klaim_yang_disetujui","sum"),
          exposure=("active_policies","first")
      )
      .reset_index()
      .sort_values("year_month")
      .reset_index(drop=True)
)

monthly["severity"]   = monthly["total_claim"] / monthly["frequency"].replace(0,np.nan)
monthly["claim_rate"] = monthly["frequency"] / monthly["exposure"].replace(0,np.nan)

# ==============================
# SPLIT IDENTICAL TO KAGGLE
# ==============================

train = monthly.iloc[:-4].copy()
valid = monthly.iloc[-4:].copy()

sim_df = train.copy()

pred_total = []
pred_freq  = []
pred_sev   = []

for i in range(4):

    train_sim = sim_df.copy()

    # DIRECT TOTAL MODEL
    try:
        model_total = ExponentialSmoothing(
            np.log1p(train_sim["total_claim"]),
            trend="add",
            damped_trend=True,
            seasonal=None
        ).fit()

        total_pred = np.expm1(model_total.forecast(1).iloc[0])
    except:
        total_pred = train_sim["total_claim"].iloc[-1]

    # soft shrink
    total_pred = 0.8 * total_pred + 0.2 * train_sim["total_claim"].tail(3).mean()

    # derive frequency from recent stable rate
    exposure_next = train_sim["exposure"].iloc[-1]
    rate_recent = train_sim["claim_rate"].tail(3).mean()
    freq_pred_i = max(rate_recent * exposure_next, 1)

    sev_pred_i = total_pred / freq_pred_i

    pred_total.append(total_pred)
    pred_freq.append(freq_pred_i)
    pred_sev.append(sev_pred_i)

    new_row = {
        "frequency": freq_pred_i,
        "total_claim": total_pred,
        "exposure": exposure_next,
        "severity": sev_pred_i,
        "claim_rate": rate_recent
    }

    sim_df = pd.concat([sim_df, pd.DataFrame([new_row])], ignore_index=True)

print("\n==============================")
print("STRUCT v14 MAPE Frequency :", round(mape(valid["frequency"], pred_freq),4))
print("STRUCT v14 MAPE Total     :", round(mape(valid["total_claim"], pred_total),4))
print("STRUCT v14 MAPE Severity  :", round(mape(valid["severity"], pred_sev),4))
print("Estimated Score           :", round(np.mean([
    mape(valid["frequency"], pred_freq),
    mape(valid["total_claim"], pred_total),
    mape(valid["severity"], pred_sev)
]),4))
print("==============================")



STRUCT v14 MAPE Frequency : 8.4632
STRUCT v14 MAPE Total     : 5.0886
STRUCT v14 MAPE Severity  : 6.7861
Estimated Score           : 6.7793


# TOTAL CLAIM OPTIMIZATION & VALIDATION, OPTUNA

In [5]:
# ============================================================
# STAGE 4 â€” TRUE KAGGLE MATCH v2 (STABLE SHORT SERIES)
# No feature drift â€¢ Conservative ML â€¢ Clamp safe
# ============================================================

!pip install -q optuna lightgbm

import optuna
import numpy as np
import pandas as pd
import lightgbm as lgb
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import warnings
warnings.filterwarnings("ignore")

def mape(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    mask = y_true != 0
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask]))

# ==============================
# BUILD MONTHLY (NO GLOBAL DROP)
# ==============================

monthly = (
    df.groupby("year_month")
      .agg(total_claim=("nominal_klaim_yang_disetujui","sum"))
      .reset_index()
      .sort_values("year_month")
      .reset_index(drop=True)
)

monthly["log_total"] = np.log1p(monthly["total_claim"])
monthly["month"] = monthly["year_month"].dt.month
monthly["month_sin"] = np.sin(2*np.pi*monthly["month"]/12)
monthly["month_cos"] = np.cos(2*np.pi*monthly["month"]/12)

# Kaggle split
train_full = monthly.iloc[:-4].copy()
valid_full = monthly.iloc[-4:].copy()

# ==============================
# OBJECTIVE
# ==============================

def objective(trial):

    alpha  = trial.suggest_float("alpha", 0.6, 0.9)
    lr     = trial.suggest_float("lr", 0.005, 0.015)
    leaves = trial.suggest_int("leaves", 3, 6)
    shrink = trial.suggest_float("shrink", 0.85, 0.98)

    sim_df = train_full.copy()
    preds = []

    for step in range(4):

        sub_train = sim_df.copy()

        # ===== BUILD FEATURES INSIDE LOOP =====
        sub_train["log_lag1"] = sub_train["log_total"].shift(1)
        sub_train["log_lag2"] = sub_train["log_total"].shift(2)
        sub_train["log_lag3"] = sub_train["log_total"].shift(3)
        sub_train["log_roll3"] = sub_train["log_total"].shift(1).rolling(3).mean()

        sub_train = sub_train.dropna().reset_index(drop=True)

        features = [
            "month_sin","month_cos",
            "log_lag1","log_lag2","log_lag3","log_roll3"
        ]

        # ================= ETS =================
        try:
            ets = ExponentialSmoothing(
                sub_train["log_total"],
                trend="add",
                damped_trend=True,
                seasonal=None
            ).fit()

            pred_ets = np.expm1(ets.forecast(1).iloc[0])
        except:
            pred_ets = sub_train["total_claim"].iloc[-1]

        # ================= LOW VAR LIGHTGBM =================
        model = lgb.LGBMRegressor(
            n_estimators=120,
            learning_rate=lr,
            num_leaves=leaves,
            min_data_in_leaf=5,
            feature_fraction=0.7,
            bagging_fraction=0.7,
            bagging_freq=1,
            verbosity=-1,
            random_state=42
        )

        model.fit(sub_train[features], sub_train["log_total"])

        # ===== BUILD NEXT ROW =====
        last_row = sim_df.iloc[-1]
        next_month = (last_row["month"] % 12) + 1

        log_lag1 = sim_df["log_total"].iloc[-1]
        log_lag2 = sim_df["log_total"].iloc[-2]
        log_lag3 = sim_df["log_total"].iloc[-3]
        log_roll3 = sim_df["log_total"].iloc[-3:].mean()

        X_new = pd.DataFrame([{
            "month_sin": np.sin(2*np.pi*next_month/12),
            "month_cos": np.cos(2*np.pi*next_month/12),
            "log_lag1": log_lag1,
            "log_lag2": log_lag2,
            "log_lag3": log_lag3,
            "log_roll3": log_roll3
        }])

        pred_ml = np.expm1(model.predict(X_new)[0])

        # ================= HYBRID =================
        pred = alpha * pred_ets + (1-alpha) * pred_ml

        # ===== STRONG SHRINK =====
        median_anchor = sim_df["total_claim"].tail(3).median()
        pred = shrink * pred + (1-shrink) * median_anchor

        # ===== CLAMP (VERY IMPORTANT FOR PUBLIC) =====
        lower = sim_df["total_claim"].tail(6).min() * 0.9
        upper = sim_df["total_claim"].tail(6).max() * 1.1
        pred = np.clip(pred, lower, upper)

        preds.append(pred)

        # ===== UPDATE =====
        new_row = {
            "year_month": None,
            "total_claim": pred,
            "log_total": np.log1p(pred),
            "month": next_month,
            "month_sin": np.sin(2*np.pi*next_month/12),
            "month_cos": np.cos(2*np.pi*next_month/12)
        }

        sim_df = pd.concat([sim_df, pd.DataFrame([new_row])], ignore_index=True)

    return mape(valid_full["total_claim"], preds)

# ==============================
# RUN OPTUNA
# ==============================

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=120)

print("\nBest Params:", study.best_params)
print("Best 4M MAPE:", round(study.best_value*100,4), "%")


[I 2026-02-17 02:08:11,126] A new study created in memory with name: no-name-2b0124bd-712e-41a7-8c0b-70c384e6db54
[I 2026-02-17 02:08:11,452] Trial 0 finished with value: 0.05578777658993996 and parameters: {'alpha': 0.8147725082905524, 'lr': 0.009324211448132913, 'leaves': 6, 'shrink': 0.9069721138844941}. Best is trial 0 with value: 0.05578777658993996.
[I 2026-02-17 02:08:11,657] Trial 1 finished with value: 0.05153320477157752 and parameters: {'alpha': 0.6260829395379682, 'lr': 0.008964468794301158, 'leaves': 5, 'shrink': 0.9385097236863122}. Best is trial 1 with value: 0.05153320477157752.
[I 2026-02-17 02:08:11,875] Trial 2 finished with value: 0.05288967745489815 and parameters: {'alpha': 0.7044475220703637, 'lr': 0.013924201515301878, 'leaves': 3, 'shrink': 0.8952940137799288}. Best is trial 1 with value: 0.05153320477157752.
[I 2026-02-17 02:08:12,076] Trial 3 finished with value: 0.05115438296668548 and parameters: {'alpha': 0.665400123752443, 'lr': 0.009610923836699904, 'lea


Best Params: {'alpha': 0.6000342276087071, 'lr': 0.014569460482219097, 'leaves': 5, 'shrink': 0.9735873357556633}
Best 4M MAPE: 4.838 %


# TEST PREDICTION & KAGGLE SUBMISSION

In [6]:
# ============================================================
# STAGE 5 â€” TRUE MIRROR STAGE 4 (NO DRIFT Â· NO EXTRA CLAMP)
# ============================================================

import numpy as np
import pandas as pd
import lightgbm as lgb
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import warnings
warnings.filterwarnings("ignore")

BASE_PATH = "/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/"
sample_sub = pd.read_csv(BASE_PATH + "sample_submission.csv")

# ==============================
# USE REAL BEST PARAMS
# ==============================
BEST = {
    "alpha": 0.600033096789066,
    "lr": 0.014745202277965204,
    "leaves": 4,
    "shrink": 0.976030431306347
}

def safe_expm1(x):
    return np.expm1(np.clip(x, -50, 50))

# ==============================
# BUILD MONTHLY (NO GLOBAL DROP)
# ==============================
monthly = (
    df.groupby("year_month")
      .agg(
          frequency=("claim_id","count"),
          total_claim=("nominal_klaim_yang_disetujui","sum"),
          exposure=("active_policies","first"),
      )
      .reset_index()
      .sort_values("year_month")
      .reset_index(drop=True)
)

monthly["log_total"] = np.log1p(monthly["total_claim"])
monthly["log_freq"]  = np.log1p(np.maximum(monthly["frequency"],1))
monthly["month"] = monthly["year_month"].dt.month
monthly["month_sin"] = np.sin(2*np.pi*monthly["month"]/12)
monthly["month_cos"] = np.cos(2*np.pi*monthly["month"]/12)

sim_df = monthly.copy()

# ==============================
# FUTURE PERIODS
# ==============================
sample_sub["year"]  = sample_sub["id"].str.split("_").str[0]
sample_sub["month"] = sample_sub["id"].str.split("_").str[1]
sample_sub["month_key"] = sample_sub["year"] + "-" + sample_sub["month"]

future_periods = (
    pd.PeriodIndex(sample_sub["month_key"], freq="M")
      .unique()
      .sort_values()
)

predictions = {}

for period in future_periods:

    # =============================
    # BUILD TRAIN FEATURES DYNAMIC
    # =============================
    train_feat = sim_df.copy()
    train_feat["log_lag1"] = train_feat["log_total"].shift(1)
    train_feat["log_lag2"] = train_feat["log_total"].shift(2)
    train_feat["log_lag3"] = train_feat["log_total"].shift(3)
    train_feat["log_roll3"] = train_feat["log_total"].shift(1).rolling(3).mean()
    train_feat = train_feat.dropna().reset_index(drop=True)

    features = [
        "month_sin","month_cos",
        "log_lag1","log_lag2","log_lag3","log_roll3"
    ]

    # =============================
    # NEXT MONTH FEATURES
    # =============================
    last_month = int(sim_df["month"].iloc[-1])
    next_month = (last_month % 12) + 1

    log_lag1 = float(sim_df["log_total"].iloc[-1])
    log_lag2 = float(sim_df["log_total"].iloc[-2])
    log_lag3 = float(sim_df["log_total"].iloc[-3])
    log_roll3 = float(sim_df["log_total"].iloc[-3:].mean())

    X_new = pd.DataFrame([{
        "month_sin": np.sin(2*np.pi*next_month/12),
        "month_cos": np.cos(2*np.pi*next_month/12),
        "log_lag1": log_lag1,
        "log_lag2": log_lag2,
        "log_lag3": log_lag3,
        "log_roll3": log_roll3
    }])

    # =============================
    # ETS TOTAL
    # =============================
    try:
        ets = ExponentialSmoothing(
            sim_df["log_total"],
            trend="add",
            damped_trend=True,
            seasonal=None
        ).fit()
        pred_ets = safe_expm1(float(ets.forecast(1).iloc[0]))
    except:
        pred_ets = float(sim_df["total_claim"].iloc[-1])

    # =============================
    # LGB TOTAL
    # =============================
    model = lgb.LGBMRegressor(
        n_estimators=120,
        learning_rate=BEST["lr"],
        num_leaves=BEST["leaves"],
        min_data_in_leaf=5,
        feature_fraction=0.7,
        bagging_fraction=0.7,
        bagging_freq=1,
        verbosity=-1,
        random_state=42
    )

    model.fit(train_feat[features], train_feat["log_total"])
    pred_ml = safe_expm1(float(model.predict(X_new)[0]))

    # =============================
    # HYBRID + SHRINK (IDENTICAL)
    # =============================
    pred_total = BEST["alpha"] * pred_ets + (1 - BEST["alpha"]) * pred_ml

    median_anchor = float(sim_df["total_claim"].tail(3).median())
    pred_total = BEST["shrink"] * pred_total + (1 - BEST["shrink"]) * median_anchor

    # CLAMP SAME AS STAGE 4
    lo = float(sim_df["total_claim"].tail(6).min()) * 0.9
    hi = float(sim_df["total_claim"].tail(6).max()) * 1.1
    pred_total = float(np.clip(pred_total, lo, hi))

    # =============================
    # FREQUENCY (SIMPLE ETS)
    # =============================
    try:
        ets_f = ExponentialSmoothing(
            sim_df["log_freq"],
            trend="add",
            damped_trend=True,
            seasonal=None
        ).fit()
        pred_freq = safe_expm1(float(ets_f.forecast(1).iloc[0]))
    except:
        pred_freq = float(sim_df["frequency"].iloc[-1])

    pred_freq = max(pred_freq, 1.0)

    # =============================
    # SEVERITY
    # =============================
    pred_sev = pred_total / pred_freq

    # =============================
    # UPDATE
    # =============================
    new_row = {
        "year_month": period,
        "month": next_month,
        "month_sin": np.sin(2*np.pi*next_month/12),
        "month_cos": np.cos(2*np.pi*next_month/12),
        "total_claim": pred_total,
        "log_total": np.log1p(pred_total),
        "frequency": pred_freq,
        "log_freq": np.log1p(pred_freq),
        "exposure": sim_df["exposure"].iloc[-1]
    }

    sim_df = pd.concat([sim_df, pd.DataFrame([new_row])], ignore_index=True)

    key = f"{period.year}_{str(period.month).zfill(2)}"
    predictions[f"{key}_Total_Claim"] = pred_total
    predictions[f"{key}_Claim_Frequency"] = pred_freq
    predictions[f"{key}_Claim_Severity"] = pred_sev

# ==============================
# SUBMISSION
# ==============================
submission = sample_sub.copy()
submission["value"] = submission["id"].map(predictions)
submission = submission[["id","value"]]
submission.to_csv("submission.csv", index=False)

print("Submission created â€” PERFECT MIRROR")
print("NaN count:", submission["value"].isna().sum())

Submission created â€” PERFECT MIRROR
NaN count: 0


In [7]:
print(monthly[["year_month","exposure"]])

   year_month  exposure
0     2024-01       179
1     2024-02       120
2     2024-03       174
3     2024-04       138
4     2024-05       152
5     2024-06       130
6     2024-07       142
7     2024-08       138
8     2024-09       121
9     2024-10       158
10    2024-11       147
11    2024-12       133
12    2025-01       132
13    2025-02       145
14    2025-03       126
15    2025-04       123
16    2025-05       142
17    2025-06       124
18    2025-07       147


In [8]:
print(submission.head(9))

                        id         value
0  2025_08_Claim_Frequency  2.351593e+02
1   2025_08_Claim_Severity  5.154749e+07
2      2025_08_Total_Claim  1.212187e+10
3  2025_09_Claim_Frequency  2.350778e+02
4   2025_09_Claim_Severity  5.058006e+07
5      2025_09_Total_Claim  1.189025e+10
6  2025_10_Claim_Frequency  2.350126e+02
7   2025_10_Claim_Severity  5.138587e+07
8      2025_10_Total_Claim  1.207633e+10
