In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/Data_Klaim.csv
/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/sample_submission.csv
/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/Data_Polis.csv


# DATA FOUNDATION

In [2]:
# ============================================================
# STAGE 1 v2 ‚Äî MAPE OPTIMIZED FOUNDATION
# Short Series Safe ‚Ä¢ No Target Distortion ‚Ä¢ Trend Ready
# ============================================================

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

BASE_PATH = "/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/"

klaim = pd.read_csv(BASE_PATH + "Data_Klaim.csv")
polis = pd.read_csv(BASE_PATH + "Data_Polis.csv")

# ============================================================
# CLEAN COLUMN NAMES
# ============================================================

def clean_columns(df):
    df.columns = (
        df.columns
        .str.strip()
        .str.lower()
        .str.replace(" ", "_", regex=False)
        .str.replace("/", "_", regex=False)
        .str.replace("-", "_", regex=False)
    )
    return df

klaim = clean_columns(klaim)
polis = clean_columns(polis)

klaim = klaim.drop_duplicates().reset_index(drop=True)
polis = polis.drop_duplicates().reset_index(drop=True)

# ============================================================
# DATE PARSING
# ============================================================

for col in klaim.columns:
    if "tanggal" in col:
        klaim[col] = pd.to_datetime(klaim[col], errors="coerce")

for col in polis.columns:
    if "tanggal" in col:
        polis[col] = pd.to_datetime(polis[col], errors="coerce")

# ============================================================
# BASIC CLEANING
# ============================================================

klaim = klaim.dropna(subset=["nomor_polis", "tanggal_pasien_masuk_rs"])
klaim["nominal_klaim_yang_disetujui"] = klaim["nominal_klaim_yang_disetujui"].fillna(0)

# Light winsorization (too aggressive smoothing hurts MAPE)
low_q = klaim["nominal_klaim_yang_disetujui"].quantile(0.005)
high_q = klaim["nominal_klaim_yang_disetujui"].quantile(0.995)

klaim["nominal_klaim_yang_disetujui"] = \
    klaim["nominal_klaim_yang_disetujui"].clip(low_q, high_q)

# ============================================================
# MERGE
# ============================================================

df = klaim.merge(polis, on="nomor_polis", how="left")

# ============================================================
# SERVICE MONTH
# ============================================================

df["year_month"] = df["tanggal_pasien_masuk_rs"].dt.to_period("M")

# ============================================================
# TRUE EXPOSURE (NO OVER-SMOOTH)
# ============================================================

exposure_monthly = (
    df.groupby("year_month")
      .agg(active_policies=("nomor_polis","nunique"))
      .reset_index()
      .sort_values("year_month")
)

df = df.merge(exposure_monthly, on="year_month", how="left")

# ============================================================
# MONTHLY CORE TABLE (DO NOT MODIFY RAW TOTAL)
# ============================================================

monthly = (
    df.groupby("year_month")
      .agg(
          frequency=("nomor_polis","count"),
          total_claim=("nominal_klaim_yang_disetujui","sum"),
          exposure=("active_policies","mean")
      )
      .reset_index()
      .sort_values("year_month")
      .reset_index(drop=True)
)

# Keep raw total_claim untouched

monthly["severity"] = (
    monthly["total_claim"] /
    monthly["frequency"].replace(0,np.nan)
)

monthly["claim_rate"] = (
    monthly["frequency"] /
    monthly["exposure"].replace(0,np.nan)
)

# ============================================================
# LOG DOMAIN FEATURES (IMPORTANT FOR MAPE)
# ============================================================

monthly["log_total"] = np.log1p(monthly["total_claim"])
monthly["log_freq"]  = np.log1p(monthly["frequency"])
monthly["log_sev"]   = np.log1p(monthly["severity"])
monthly["log_rate"]  = np.log1p(monthly["claim_rate"])

# ============================================================
# VOLATILITY (SHORT SERIES SAFE)
# ============================================================

monthly["roll6"] = monthly["total_claim"].rolling(6, min_periods=3).mean()
monthly["std6"]  = monthly["total_claim"].rolling(6, min_periods=3).std()

monthly["vol_ratio"] = monthly["std6"] / monthly["roll6"]

monthly["high_vol_regime"] = (
    monthly["vol_ratio"] > monthly["vol_ratio"].median()
).astype(int)

# ============================================================
# TIME FEATURES
# ============================================================

monthly["month"] = monthly["year_month"].dt.month
monthly["month_sin"] = np.sin(2*np.pi*monthly["month"]/12)
monthly["month_cos"] = np.cos(2*np.pi*monthly["month"]/12)
monthly["month_index"] = np.arange(len(monthly))

# ============================================================
# SAFE LAGS (NO ZERO FILL)
# ============================================================

for col in ["log_total","log_freq","log_sev","log_rate"]:
    monthly[f"{col}_lag1"] = monthly[col].shift(1)
    monthly[f"{col}_lag2"] = monthly[col].shift(2)
    monthly[f"{col}_lag3"] = monthly[col].shift(3)

    monthly[f"{col}_roll3"] = monthly[col].shift(1).rolling(3).mean()

# ============================================================
# DROP EARLY MONTHS (IMPORTANT FOR MAPE STABILITY)
# ============================================================

monthly = monthly.dropna().reset_index(drop=True)

# ============================================================
# FINAL CHECK
# ============================================================

print("Monthly shape:", monthly.shape)
print("Unique months:", monthly["year_month"].nunique())
print("Vol regime ratio:", round(monthly["high_vol_regime"].mean(),3))
print("\nSTAGE 1 v2 ‚Äî MAPE OPTIMIZED FOUNDATION READY")


Monthly shape: (16, 34)
Unique months: 16
Vol regime ratio: 0.438

STAGE 1 v2 ‚Äî MAPE OPTIMIZED FOUNDATION READY


# TIME-SERIES DATASET ENGINEERING

In [3]:
# ============================================================
# STAGE 2 ‚Äî ELITE SEGMENT PANEL (SAFE VERSION)
# No KeyError ‚Ä¢ Auto-create missing columns ‚Ä¢ Short series safe
# ============================================================

import numpy as np
import pandas as pd

# ============================================================
# üîπ ENSURE REQUIRED SEGMENT COLUMNS EXIST
# ============================================================

# Care Type
if "care_type" not in df.columns:
    if "inpatient_outpatient" in df.columns:
        df["care_type"] = (
            df["inpatient_outpatient"]
            .astype(str)
            .str.upper()
            .str.strip()
        )
    else:
        df["care_type"] = "UNKNOWN"

df["care_type"] = df["care_type"].fillna("UNKNOWN")


# Cashless
if "is_cashless" not in df.columns:
    if "reimburse_cashless" in df.columns:
        rc = df["reimburse_cashless"].astype(str).str.upper().str.strip()
        df["is_cashless"] = rc.eq("C").astype(int)
    else:
        df["is_cashless"] = 0


# RS Bucket
if "rs_bucket" not in df.columns:
    if "lokasi_rs" in df.columns:
        loc = df["lokasi_rs"].astype(str).str.upper().str.strip()
        df["rs_bucket"] = np.select(
            [
                loc.eq("INDONESIA"),
                loc.eq("SINGAPORE"),
                loc.eq("MALAYSIA")
            ],
            ["ID","SG","MY"],
            default="OTHER"
        )
    else:
        df["rs_bucket"] = "OTHER"

df["rs_bucket"] = df["rs_bucket"].fillna("OTHER")


# Plan Code
if "plan_code" not in df.columns:
    df["plan_code"] = "UNKNOWN"

df["plan_code"] = df["plan_code"].fillna("UNKNOWN")

# ============================================================
# üîπ DEFINE SEGMENT COLUMNS
# ============================================================

seg_cols = ["plan_code","care_type","is_cashless","rs_bucket"]

# ============================================================
# üîπ BUILD SEGMENT MONTHLY
# ============================================================

seg_monthly = (
    df.groupby(["year_month"] + seg_cols)
      .agg(
          frequency=("nomor_polis","count"),
          total_claim=("nominal_klaim_yang_disetujui","sum"),
          exposure=("nomor_polis","nunique")
      )
      .reset_index()
      .sort_values(seg_cols + ["year_month"])
      .reset_index(drop=True)
)

# ============================================================
# üîπ TARGETS
# ============================================================

seg_monthly["severity"] = (
    seg_monthly["total_claim"] /
    seg_monthly["frequency"].replace(0, np.nan)
)

seg_monthly["log_total"] = np.log1p(seg_monthly["total_claim"])
seg_monthly["log_freq"]  = np.log1p(seg_monthly["frequency"])
seg_monthly["log_sev"]   = np.log1p(seg_monthly["severity"])

# ============================================================
# üîπ CALENDAR
# ============================================================

seg_monthly["month"] = seg_monthly["year_month"].dt.month
seg_monthly["month_sin"] = np.sin(2*np.pi*seg_monthly["month"]/12)
seg_monthly["month_cos"] = np.cos(2*np.pi*seg_monthly["month"]/12)

# ============================================================
# üîπ LAGS (STRICT NO LEAKAGE)
# ============================================================

for col in ["log_total","log_freq","log_sev"]:
    
    seg_monthly[f"{col}_lag1"] = \
        seg_monthly.groupby(seg_cols)[col].shift(1)
    
    seg_monthly[f"{col}_lag2"] = \
        seg_monthly.groupby(seg_cols)[col].shift(2)
    
    seg_monthly[f"{col}_lag3"] = \
        seg_monthly.groupby(seg_cols)[col].shift(3)

    seg_monthly[f"{col}_roll3"] = \
        seg_monthly.groupby(seg_cols)[col] \
        .transform(lambda x: x.shift(1).rolling(3).mean())

# ============================================================
# üîπ MOMENTUM
# ============================================================

seg_monthly["momentum_total"] = (
    seg_monthly["log_total_lag1"] -
    seg_monthly["log_total_lag2"]
)

# ============================================================
# üîπ SEGMENT WEIGHT
# ============================================================

seg_monthly["seg_weight"] = (
    seg_monthly["frequency"] /
    seg_monthly.groupby("year_month")["frequency"].transform("sum")
).fillna(0)

# ============================================================
# üîπ SAFE TRAIN WINDOW
# ============================================================

seg_model = seg_monthly[
    seg_monthly["log_total_lag3"].notna()
].reset_index(drop=True)

seg_model = seg_model.fillna(0)

# ============================================================
# FINAL CHECK
# ============================================================

print("COMPACT PANEL SHAPE:", seg_model.shape)
print("Unique segments:", seg_model[seg_cols].drop_duplicates().shape[0])
print("Columns:", len(seg_model.columns))
print("\nSTAGE 2 ‚Äî ELITE SEGMENT PANEL READY")

COMPACT PANEL SHAPE: (414, 29)
Unique segments: 41
Columns: 29

STAGE 2 ‚Äî ELITE SEGMENT PANEL READY


# MODEL DEVELOPMENT

In [4]:
# ============================================================
# STRUCTURAL v8 ‚Äî DUAL ANCHOR SAFE (MAPE VERSION)
# Short series optimized ‚Ä¢ No shape error
# ============================================================

import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import warnings
warnings.filterwarnings("ignore")

def mape(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    mask = y_true != 0
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100

# ============================================================
# BUILD MONTHLY
# ============================================================

monthly = (
    df.groupby("year_month")
      .agg(
          frequency=("claim_id","count"),
          total_claim=("nominal_klaim_yang_disetujui","sum"),
          exposure=("active_policies","first")
      )
      .reset_index()
      .sort_values("year_month")
      .reset_index(drop=True)
)

monthly["severity"]   = monthly["total_claim"] / monthly["frequency"].replace(0,np.nan)
monthly["claim_rate"] = monthly["frequency"] / monthly["exposure"].replace(0,np.nan)

monthly["log_total"] = np.log1p(monthly["total_claim"])
monthly["log_rate"]  = np.log1p(monthly["claim_rate"])
monthly["log_sev"]   = np.log1p(monthly["severity"])

monthly["t"] = np.arange(len(monthly))

long_run_rate = monthly["claim_rate"].median()
long_run_sev  = monthly["severity"].median()

freq_err  = []
sev_err   = []
total_err = []

# ============================================================
# ROLLING BACKTEST
# ============================================================

for i in range(8, len(monthly)-1):

    train = monthly.iloc[:i]
    valid = monthly.iloc[i]

    # =========================
    # RATE MODEL (OLS SAFE SHAPE)
    # =========================

    X_train = sm.add_constant(train["t"].values)
    model_rate = sm.OLS(train["log_rate"], X_train).fit()

    future_t = np.array([[1, train["t"].iloc[-1] + 1]])  # FIXED SHAPE
    trend_rate = model_rate.predict(future_t)[0]

    resid_rate = (train["log_rate"] - model_rate.predict(X_train)).tail(3).mean()
    rate_pred = np.expm1(trend_rate + 0.6 * resid_rate)

    rate_pred = 0.85 * rate_pred + 0.15 * long_run_rate
    freq_pred = rate_pred * valid["exposure"]

    # =========================
    # SEVERITY MODEL (ETS + SHRINK)
    # =========================

    try:
        model_sev = ExponentialSmoothing(
            train["log_sev"],
            trend="add",
            damped_trend=True,
            seasonal=None
        ).fit()

        sev_raw = np.expm1(model_sev.forecast(1).iloc[0])
    except:
        sev_raw = train["severity"].iloc[-1]

    recent_std = train["severity"].tail(6).std()
    global_std = monthly["severity"].std()

    vol_ratio = recent_std / global_std if global_std != 0 else 1
    w = np.clip(1 / (1 + vol_ratio), 0.65, 0.9)

    sev_pred = w * sev_raw + (1 - w) * long_run_sev

    # =========================
    # TOTAL ANCHOR MODEL
    # =========================

    X_train_total = sm.add_constant(train["t"].values)
    model_total = sm.OLS(train["log_total"], X_train_total).fit()

    trend_total = model_total.predict(future_t)[0]
    resid_total = (train["log_total"] - model_total.predict(X_train_total)).tail(3).mean()

    total_anchor = np.expm1(trend_total + 0.6 * resid_total)

    # =========================
    # STRUCTURAL TOTAL
    # =========================

    total_struct = freq_pred * sev_pred

    total_pred = 0.55 * total_anchor + 0.45 * total_struct

    # anti drift
    lower = train["total_claim"].tail(6).min() * 0.85
    upper = train["total_claim"].tail(6).max() * 1.15
    total_pred = np.clip(total_pred, lower, upper)

    # =========================
    # METRICS
    # =========================

    freq_err.append(mape([valid["frequency"]],[freq_pred]))
    sev_err.append(mape([valid["severity"]],[sev_pred]))
    total_err.append(mape([valid["total_claim"]],[total_pred]))

print("\n==============================")
print("STRUCT v8 MAPE Frequency :", round(np.mean(freq_err),4))
print("STRUCT v8 MAPE Total     :", round(np.mean(total_err),4))
print("STRUCT v8 MAPE Severity  :", round(np.mean(sev_err),4))
print("Estimated Score          :", round(np.mean([
    np.mean(freq_err),
    np.mean(total_err),
    np.mean(sev_err)
]),4))
print("==============================")



STRUCT v8 MAPE Frequency : 4.1204
STRUCT v8 MAPE Total     : 9.7044
STRUCT v8 MAPE Severity  : 7.3284
Estimated Score          : 7.0511


# TOTAL CLAIM OPTIMIZATION & VALIDATION, OPTUNA

In [5]:
# ============================================================
# STAGE 4 ‚Äî TOTAL CLAIM OPTIMIZATION (OPTUNA ¬∑ MAPE TARGET)
# Short Series Safe ‚Ä¢ Structural + ML Blend
# ============================================================

!pip install -q optuna lightgbm

import optuna
import numpy as np
import pandas as pd
import lightgbm as lgb
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import warnings
warnings.filterwarnings("ignore")

# ============================================================
# MAPE
# ============================================================

def mape(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    mask = y_true != 0
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask]))

# ============================================================
# BUILD MONTHLY
# ============================================================

monthly = (
    df.groupby("year_month")
      .agg(
          total_claim=("nominal_klaim_yang_disetujui","sum")
      )
      .reset_index()
      .sort_values("year_month")
      .reset_index(drop=True)
)

monthly["log_total"] = np.log1p(monthly["total_claim"])

monthly["month"] = monthly["year_month"].dt.month
monthly["month_sin"] = np.sin(2*np.pi*monthly["month"]/12)
monthly["month_cos"] = np.cos(2*np.pi*monthly["month"]/12)

# lags
for lag in [1,2,3]:
    monthly[f"log_lag{lag}"] = monthly["log_total"].shift(lag)

monthly["log_roll3"] = monthly["log_total"].shift(1).rolling(3).mean()

monthly = monthly.dropna().reset_index(drop=True)

features = ["month_sin","month_cos",
            "log_lag1","log_lag2","log_lag3","log_roll3"]

# ============================================================
# OPTUNA OBJECTIVE
# ============================================================

def objective(trial):

    alpha = trial.suggest_float("alpha", 0.4, 0.9)
    lr    = trial.suggest_float("lr", 0.005, 0.03)
    leaves = trial.suggest_int("leaves", 3, 12)
    shrink = trial.suggest_float("shrink", 0.7, 0.98)

    errors = []

    # rolling CV (last 6 folds)
    for i in range(len(monthly)-6, len(monthly)-1):

        train = monthly.iloc[:i]
        valid = monthly.iloc[i:i+1]

        # ======================
        # 1Ô∏è‚É£ ETS STRUCTURAL
        # ======================

        try:
            ets = ExponentialSmoothing(
                train["log_total"],
                trend="add",
                damped_trend=True,
                seasonal=None
            ).fit()

            pred_ets = np.expm1(ets.forecast(1).iloc[0])
        except:
            pred_ets = train["total_claim"].iloc[-1]

        # ======================
        # 2Ô∏è‚É£ LIGHTGBM LOW VAR
        # ======================

        model = lgb.LGBMRegressor(
            n_estimators=300,
            learning_rate=lr,
            num_leaves=leaves,
            min_data_in_leaf=4,
            feature_fraction=0.8,
            bagging_fraction=0.8,
            bagging_freq=1,
            verbosity=-1,
            random_state=42
        )

        model.fit(train[features], train["log_total"])
        pred_ml = np.expm1(model.predict(valid[features])[0])

        # ======================
        # 3Ô∏è‚É£ HYBRID
        # ======================

        pred = alpha * pred_ets + (1-alpha) * pred_ml

        # shrink to rolling median
        median_anchor = train["total_claim"].tail(3).median()
        pred = shrink * pred + (1-shrink) * median_anchor

        errors.append(
            mape(valid["total_claim"], [pred])
        )

    return np.mean(errors)

# ============================================================
# RUN OPTUNA
# ============================================================

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=150)

print("\nBest Params:", study.best_params)
print("Best CV MAPE:", round(study.best_value*100,4), "%")

# ============================================================
# FINAL VALIDATION (LAST 4 MONTH)
# ============================================================

best = study.best_params
valid = monthly.iloc[-4:]
final_preds = []

for i in range(4):

    sub_train = monthly.iloc[:len(monthly)-4+i]
    sub_valid = monthly.iloc[len(monthly)-4+i:len(monthly)-4+i+1]

    try:
        ets = ExponentialSmoothing(
            sub_train["log_total"],
            trend="add",
            damped_trend=True,
            seasonal=None
        ).fit()

        pred_ets = np.expm1(ets.forecast(1).iloc[0])
    except:
        pred_ets = sub_train["total_claim"].iloc[-1]

    model = lgb.LGBMRegressor(
        n_estimators=300,
        learning_rate=best["lr"],
        num_leaves=best["leaves"],
        min_data_in_leaf=4,
        feature_fraction=0.8,
        bagging_fraction=0.8,
        bagging_freq=1,
        verbosity=-1,
        random_state=42
    )

    model.fit(sub_train[features], sub_train["log_total"])
    pred_ml = np.expm1(model.predict(sub_valid[features])[0])

    pred = best["alpha"] * pred_ets + (1-best["alpha"]) * pred_ml

    median_anchor = sub_train["total_claim"].tail(3).median()
    pred = best["shrink"] * pred + (1-best["shrink"]) * median_anchor

    final_preds.append(pred)

final_mape = mape(valid["total_claim"], final_preds)

print("\n==============================")
print("Final 4M MAPE:", round(final_mape*100,4), "%")
print("==============================")
print("STAGE 4 ‚Äî TOTAL CLAIM OPTIMIZED")

[I 2026-02-16 14:19:32,171] A new study created in memory with name: no-name-a34ed126-e016-435e-b21e-4969f97717e6
[I 2026-02-16 14:19:32,652] Trial 0 finished with value: 0.08522255029906169 and parameters: {'alpha': 0.760886980723803, 'lr': 0.008619561149354905, 'leaves': 9, 'shrink': 0.9170383810642054}. Best is trial 0 with value: 0.08522255029906169.
[I 2026-02-16 14:19:33,132] Trial 1 finished with value: 0.0903277466928074 and parameters: {'alpha': 0.7315166174779959, 'lr': 0.01380930511982912, 'leaves': 9, 'shrink': 0.7965225628798089}. Best is trial 0 with value: 0.08522255029906169.
[I 2026-02-16 14:19:33,599] Trial 2 finished with value: 0.08336909942557716 and parameters: {'alpha': 0.7883823468997287, 'lr': 0.018313253690579415, 'leaves': 9, 'shrink': 0.9593485435617202}. Best is trial 2 with value: 0.08336909942557716.
[I 2026-02-16 14:19:34,048] Trial 3 finished with value: 0.08384488873238287 and parameters: {'alpha': 0.7669031228800081, 'lr': 0.018761683603268645, 'leave


Best Params: {'alpha': 0.400822162331984, 'lr': 0.029946310999476873, 'leaves': 10, 'shrink': 0.9798757642876355}
Best CV MAPE: 6.7382 %

Final 4M MAPE: 2.4365 %
STAGE 4 ‚Äî TOTAL CLAIM OPTIMIZED


# TEST PREDICTION & KAGGLE SUBMISSION

In [6]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import warnings
warnings.filterwarnings("ignore")

BASE_PATH = "/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/"
sample_sub = pd.read_csv(BASE_PATH + "sample_submission.csv")

# ============================================================
# BUILD MONTHLY (IDENTICAL TO STAGE 3/4)
# ============================================================

monthly = (
    df.groupby("year_month")
      .agg(
          frequency=("claim_id","count"),
          total_claim=("nominal_klaim_yang_disetujui","sum"),
          exposure=("active_policies","first")
      )
      .reset_index()
      .sort_values("year_month")
      .reset_index(drop=True)
)

monthly["severity"]   = monthly["total_claim"] / monthly["frequency"].replace(0,np.nan)
monthly["claim_rate"] = monthly["frequency"] / monthly["exposure"].replace(0,np.nan)

monthly["log_rate"]  = np.log1p(monthly["claim_rate"])
monthly["log_sev"]   = np.log1p(monthly["severity"])
monthly["log_total"] = np.log1p(monthly["total_claim"])

monthly["t"] = np.arange(len(monthly))

# ============================================================
# üîí FREEZE GLOBAL PARAMETERS (IMPORTANT)
# ============================================================

LONG_RUN_RATE = monthly["claim_rate"].median()
LONG_RUN_SEV  = monthly["severity"].median()
GLOBAL_STD_SEV = monthly["severity"].std()

# ============================================================
# PREPARE FUTURE PERIODS
# ============================================================

sample_sub["year"]  = sample_sub["id"].str.split("_").str[0]
sample_sub["month"] = sample_sub["id"].str.split("_").str[1]
sample_sub["month_key"] = sample_sub["year"] + "-" + sample_sub["month"]

future_periods = (
    pd.PeriodIndex(sample_sub["month_key"], freq="M")
      .unique()
      .sort_values()
)

sim_df = monthly.copy()
predictions = {}

# ============================================================
# TRUE RECURSIVE FORECAST
# ============================================================

for period in future_periods:

    train = sim_df.copy()

    # ===== OLS SETUP =====
    X_train = sm.add_constant(train["t"].values)
    future_t = train["t"].iloc[-1] + 1
    future_X = np.array([[1, future_t]])

    # =========================
    # 1Ô∏è‚É£ CLAIM RATE (STRUCT v8)
    # =========================
    model_rate = sm.OLS(train["log_rate"], X_train).fit()
    trend_rate = model_rate.predict(future_X)[0]

    resid_rate = (train["log_rate"] - model_rate.predict(X_train)).tail(3).mean()
    rate_pred = np.expm1(trend_rate + 0.6 * resid_rate)

    rate_pred = 0.85 * rate_pred + 0.15 * LONG_RUN_RATE

    exposure_next = train["exposure"].iloc[-1]
    freq_pred = max(rate_pred * exposure_next, 1)

    # =========================
    # 2Ô∏è‚É£ SEVERITY (ETS + SHRINK LOCKED)
    # =========================
    try:
        from statsmodels.tsa.holtwinters import ExponentialSmoothing
        model_sev = ExponentialSmoothing(
            train["log_sev"],
            trend="add",
            damped_trend=True,
            seasonal=None
        ).fit()

        sev_raw = np.expm1(model_sev.forecast(1).iloc[0])
    except:
        sev_raw = train["severity"].iloc[-1]

    recent_std = train["severity"].tail(6).std()
    vol_ratio = recent_std / GLOBAL_STD_SEV if GLOBAL_STD_SEV != 0 else 1

    w = np.clip(1 / (1 + vol_ratio), 0.65, 0.9)

    sev_pred = w * sev_raw + (1 - w) * LONG_RUN_SEV

    # =========================
    # 3Ô∏è‚É£ TOTAL ANCHOR (OLS)
    # =========================
    model_total = sm.OLS(train["log_total"], X_train).fit()
    trend_total = model_total.predict(future_X)[0]

    resid_total = (train["log_total"] - model_total.predict(X_train)).tail(3).mean()
    total_anchor = np.expm1(trend_total + 0.6 * resid_total)

    # =========================
    # 4Ô∏è‚É£ STRUCTURAL TOTAL
    # =========================
    total_struct = freq_pred * sev_pred

    total_pred = 0.55 * total_anchor + 0.45 * total_struct

    # anti drift clamp (same as training)
    lower = train["total_claim"].tail(6).min() * 0.85
    upper = train["total_claim"].tail(6).max() * 1.15
    total_pred = np.clip(total_pred, lower, upper)

    # =========================
    # UPDATE RECURSIVE DATASET
    # =========================
    new_row = {
        "year_month": period,
        "frequency": freq_pred,
        "total_claim": total_pred,
        "exposure": exposure_next,
        "severity": sev_pred,
        "claim_rate": rate_pred,
        "log_rate": np.log1p(rate_pred),
        "log_sev": np.log1p(sev_pred),
        "log_total": np.log1p(total_pred),
        "t": future_t
    }

    sim_df = pd.concat([sim_df, pd.DataFrame([new_row])], ignore_index=True)

    key = f"{period.year}_{str(period.month).zfill(2)}"

    predictions[f"{key}_Total_Claim"] = total_pred
    predictions[f"{key}_Claim_Frequency"] = freq_pred
    predictions[f"{key}_Claim_Severity"] = sev_pred

# ============================================================
# BUILD SUBMISSION
# ============================================================

submission = sample_sub.copy()
submission["value"] = submission["id"].map(predictions)
submission = submission[["id","value"]]

submission.to_csv("submission.csv", index=False)

print("Submission created ‚Äî LOCKED MATCH TO STAGE 3/4")
print(submission.head(9))

Submission created ‚Äî LOCKED MATCH TO STAGE 3/4
                        id         value
0  2025_08_Claim_Frequency  2.620096e+02
1   2025_08_Claim_Severity  5.117496e+07
2      2025_08_Total_Claim  1.260867e+10
3  2025_09_Claim_Frequency  2.647959e+02
4   2025_09_Claim_Severity  5.109508e+07
5      2025_09_Total_Claim  1.273460e+10
6  2025_10_Claim_Frequency  2.630853e+02
7   2025_10_Claim_Severity  5.090586e+07
8      2025_10_Total_Claim  1.275515e+10
