In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/Data_Klaim.csv
/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/sample_submission.csv
/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/Data_Polis.csv


# DATA FOUNDATION

In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

BASE_PATH = "/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/"

klaim = pd.read_csv(BASE_PATH + "Data_Klaim.csv")
polis = pd.read_csv(BASE_PATH + "Data_Polis.csv")

# ============================================================
# CLEAN COLUMN NAMES
# ============================================================

def clean_columns(df):
    df.columns = (
        df.columns
        .str.strip()
        .str.lower()
        .str.replace(" ", "_", regex=False)
        .str.replace("/", "_", regex=False)
        .str.replace("-", "_", regex=False)
    )
    return df

klaim = clean_columns(klaim)
polis = clean_columns(polis)

klaim = klaim.drop_duplicates().reset_index(drop=True)
polis = polis.drop_duplicates().reset_index(drop=True)

# ============================================================
# DATE PARSING
# ============================================================

for col in klaim.columns:
    if "tanggal" in col:
        klaim[col] = pd.to_datetime(klaim[col], errors="coerce")

for col in polis.columns:
    if "tanggal" in col:
        polis[col] = pd.to_datetime(polis[col], errors="coerce")

# ============================================================
# BASIC CLEANING
# ============================================================

klaim = klaim.dropna(subset=["nomor_polis", "tanggal_pasien_masuk_rs"])
klaim["nominal_klaim_yang_disetujui"] = klaim["nominal_klaim_yang_disetujui"].fillna(0)

# ============================================================
# MERGE
# ============================================================

df = klaim.merge(polis, on="nomor_polis", how="left")

for col in ["plan_code", "gender", "domisili"]:
    if col in df.columns:
        df[col] = df[col].fillna("UNKNOWN")

# ============================================================
# SERVICE MONTH (NO LEAKAGE)
# ============================================================

df["year_month"] = df["tanggal_pasien_masuk_rs"].dt.to_period("M")

# ============================================================
# CORE DEMOGRAPHIC FEATURES
# ============================================================

if "tanggal_lahir" in df.columns:
    df["age"] = (
        (df["tanggal_pasien_masuk_rs"] - df["tanggal_lahir"]).dt.days / 365
    ).clip(0, 120)

if "tanggal_efektif_polis" in df.columns:
    df["tenure_days"] = (
        (df["tanggal_pasien_masuk_rs"] -
         df["tanggal_efektif_polis"]).dt.days
    ).clip(lower=0)

if "tanggal_pasien_keluar_rs" in df.columns:
    df["los"] = (
        df["tanggal_pasien_keluar_rs"] -
        df["tanggal_pasien_masuk_rs"]
    ).dt.days.clip(lower=0)

# ============================================================
# SEGMENT FEATURES
# ============================================================

df["care_type"] = (
    df["inpatient_outpatient"]
    .astype(str).str.upper().str.strip()
)
df["care_type"] = df["care_type"].replace(["NAN", "NONE"], "UNKNOWN")

df["is_inpatient"] = df["care_type"].eq("IP").astype(int)

rc = df["reimburse_cashless"].astype(str).str.upper().str.strip()
df["is_cashless"] = rc.eq("C").astype(int)

loc = df["lokasi_rs"].astype(str).str.upper().str.strip()
df["rs_bucket"] = np.select(
    [
        loc.eq("INDONESIA"),
        loc.eq("SINGAPORE"),
        loc.eq("MALAYSIA")
    ],
    ["ID", "SG", "MY"],
    default="OTHER"
)

# ICD grouping (VERY IMPORTANT FOR SEVERITY DRIVER)
df["icd_group"] = (
    df["icd_diagnosis"]
    .astype(str)
    .str.split(".").str[0]
    .str[:3]
)

# ============================================================
# PORTFOLIO EXPOSURE (MONTHLY ACTIVE POLICIES)
# ============================================================

all_months = pd.period_range(
    df["year_month"].min(),
    df["year_month"].max(),
    freq="M"
)

polis_month = polis["tanggal_efektif_polis"].dt.to_period("M")

exposure_list = []

for m in all_months:
    active = (polis_month <= m).sum()
    exposure_list.append({
        "year_month": m,
        "exposure": active
    })

exposure_df = pd.DataFrame(exposure_list)
df = df.merge(exposure_df, on="year_month", how="left")

# ============================================================
# SEGMENT EXPOSURE (NEW – IMPORTANT FOR 5%)
# ============================================================

seg_exposure_list = []

for m in all_months:
    active_seg = (
        polis[polis["tanggal_efektif_polis"].dt.to_period("M") <= m]
        .groupby("plan_code")
        .size()
        .reset_index(name="segment_exposure")
    )
    active_seg["year_month"] = m
    seg_exposure_list.append(active_seg)

seg_exposure = pd.concat(seg_exposure_list, ignore_index=True)

df = df.merge(
    seg_exposure,
    on=["year_month","plan_code"],
    how="left"
)

df["segment_exposure"] = df["segment_exposure"].fillna(0)


# ============================================================
# SEVERITY SHOCK FEATURE
# ============================================================

monthly_tmp = (
    df.groupby("year_month")["nominal_klaim_yang_disetujui"]
    .sum()
    .reset_index()
)

monthly_tmp["rolling_mean"] = (
    monthly_tmp["nominal_klaim_yang_disetujui"]
    .rolling(3).mean()
)

monthly_tmp["rolling_std"] = (
    monthly_tmp["nominal_klaim_yang_disetujui"]
    .rolling(3).std()
)

monthly_tmp["is_spike"] = (
    monthly_tmp["nominal_klaim_yang_disetujui"] >
    monthly_tmp["rolling_mean"] + 2 * monthly_tmp["rolling_std"]
).astype(int)

df = df.merge(
    monthly_tmp[["year_month","is_spike"]],
    on="year_month",
    how="left"
)

df["is_spike"] = df["is_spike"].fillna(0)

# ============================================================
# SANITY CHECK
# ============================================================

print("Final shape:", df.shape)
print("Unique months:", df["year_month"].nunique())
print("Exposure unique:", df["exposure"].nunique())
print("Segment exposure unique:", df["segment_exposure"].nunique())
print("ICD groups:", df["icd_group"].nunique())
print("\nSTAGE 1 — COMPETITION LEVEL READY")


Final shape: (4627, 30)
Unique months: 19
Exposure unique: 1
Segment exposure unique: 3
ICD groups: 397

STAGE 1 — COMPETITION LEVEL READY


# TIME-SERIES DATASET ENGINEERING

In [3]:
import numpy as np
import pandas as pd

# ============================================================
# CONFIG
# ============================================================

seg_cols = [
    "plan_code",
    "care_type",
    "is_cashless",
    "rs_bucket"
]

# ============================================================
# 1. BUILD SEGMENT MONTHLY BASE
# ============================================================

seg_monthly = (
    df.groupby(["year_month"] + seg_cols)
      .agg(
          frequency=("claim_id","count"),
          total_claim=("nominal_klaim_yang_disetujui","sum"),
          exposure=("exposure","max")
      )
      .reset_index()
)

seg_monthly = seg_monthly.sort_values(seg_cols + ["year_month"]).reset_index(drop=True)

# ============================================================
# 2. CORE INSURANCE METRICS (STABLE VERSION)
# ============================================================

# claim rate
seg_monthly["claim_rate"] = (
    seg_monthly["frequency"] /
    seg_monthly["exposure"].replace(0,np.nan)
).fillna(0)

# raw severity
seg_monthly["raw_severity"] = (
    seg_monthly["total_claim"] /
    seg_monthly["frequency"].replace(0,np.nan)
).fillna(0)

# shrink severity (Bayesian smoothing)
global_sev = (
    seg_monthly["total_claim"].sum() /
    seg_monthly["frequency"].sum()
)

alpha = 5  # smoothing strength (tune later)

seg_monthly["severity"] = (
    (seg_monthly["frequency"] * seg_monthly["raw_severity"] +
     alpha * global_sev)
    /
    (seg_monthly["frequency"] + alpha)
)

# ============================================================
# 3. TARGET TRANSFORM (FOR TREE STABILITY)
# ============================================================

seg_monthly["log_rate"] = np.log1p(seg_monthly["claim_rate"])
seg_monthly["log_sev"]  = np.log1p(seg_monthly["severity"])

# ============================================================
# 4. CALENDAR FEATURES
# ============================================================

seg_monthly["year_month_dt"] = seg_monthly["year_month"].dt.to_timestamp()
seg_monthly["month"] = seg_monthly["year_month_dt"].dt.month

seg_monthly["month_sin"] = np.sin(2*np.pi*seg_monthly["month"]/12)
seg_monthly["month_cos"] = np.cos(2*np.pi*seg_monthly["month"]/12)

unique_months = sorted(seg_monthly["year_month"].unique())
month_index_map = {m:i for i,m in enumerate(unique_months)}
seg_monthly["month_index"] = seg_monthly["year_month"].map(month_index_map)

# ============================================================
# 5. PORTFOLIO LEVEL SIGNAL (LOW NOISE ANCHOR)
# ============================================================

portfolio = (
    seg_monthly.groupby("year_month")
      .agg(
          portfolio_rate=("claim_rate","mean"),
          portfolio_total=("total_claim","sum"),
          portfolio_freq=("frequency","sum")
      )
      .reset_index()
)

portfolio["log_portfolio_rate"]  = np.log1p(portfolio["portfolio_rate"])
portfolio["log_portfolio_total"] = np.log1p(portfolio["portfolio_total"])

# lag portfolio
for lag in [1,2]:
    portfolio[f"log_portfolio_rate_lag{lag}"] = portfolio["log_portfolio_rate"].shift(lag)
    portfolio[f"log_portfolio_total_lag{lag}"] = portfolio["log_portfolio_total"].shift(lag)

seg_monthly = seg_monthly.merge(
    portfolio[[
        "year_month",
        "log_portfolio_rate_lag1",
        "log_portfolio_rate_lag2",
        "log_portfolio_total_lag1",
        "log_portfolio_total_lag2"
    ]],
    on="year_month",
    how="left"
)

# ============================================================
# 6. SEGMENT SHARE (FOR RECONCILIATION)
# ============================================================

portfolio_total = seg_monthly.groupby("year_month")["total_claim"].transform("sum")

seg_monthly["segment_weight"] = (
    seg_monthly["total_claim"] /
    portfolio_total.replace(0,np.nan)
).fillna(0)

# ============================================================
# 7. LAG + MOMENTUM FEATURES (ANTI LEAKAGE)
# ============================================================

seg_monthly = seg_monthly.sort_values(seg_cols + ["year_month"])

for col in ["log_rate","log_sev"]:

    seg_monthly[f"{col}_lag1"] = seg_monthly.groupby(seg_cols)[col].shift(1)
    seg_monthly[f"{col}_lag2"] = seg_monthly.groupby(seg_cols)[col].shift(2)
    seg_monthly[f"{col}_lag3"] = seg_monthly.groupby(seg_cols)[col].shift(3)

    seg_monthly[f"{col}_roll3"] = (
        seg_monthly.groupby(seg_cols)[col]
        .transform(lambda x: x.shift(1).rolling(3).mean())
    )

    seg_monthly[f"{col}_ema3"] = (
        seg_monthly.groupby(seg_cols)[col]
        .transform(lambda x: x.shift(1).ewm(span=3).mean())
    )

# momentum
seg_monthly["rate_momentum"] = (
    seg_monthly["log_rate_lag1"] -
    seg_monthly["log_rate_lag3"]
)

seg_monthly["sev_momentum"] = (
    seg_monthly["log_sev_lag1"] -
    seg_monthly["log_sev_lag3"]
)

# volatility
seg_monthly["rate_vol"] = (
    seg_monthly.groupby(seg_cols)["claim_rate"]
    .transform(lambda x: x.shift(1).rolling(3).std())
)

# ============================================================
# 8. DROP EARLY MONTHS
# ============================================================

seg_model = seg_monthly[
    seg_monthly["log_rate_lag3"].notna()
].reset_index(drop=True)

print("CHAMPIONSHIP PANEL SHAPE:", seg_model.shape)
print(seg_model.head())

print("\nSTAGE 2 — 5% READY FOUNDATION")


CHAMPIONSHIP PANEL SHAPE: (414, 36)
  year_month plan_code care_type  is_cashless rs_bucket  frequency  \
0    2024-04     M-001        IP            0        ID          2   
1    2024-05     M-001        IP            0        ID         12   
2    2024-06     M-001        IP            0        ID          2   
3    2024-07     M-001        IP            0        ID          1   
4    2024-08     M-001        IP            0        ID          2   

   total_claim  exposure  claim_rate  raw_severity  ...  log_rate_roll3  \
0    4740000.0      4096    0.000488  2.370000e+06  ...        0.002113   
1  211597118.0      4096    0.002930  1.763309e+07  ...        0.001544   
2   14095300.0      4096    0.000488  7.047650e+06  ...        0.002275   
3   24080631.0      4096    0.000244  2.408063e+07  ...        0.001301   
4  123137551.8      4096    0.000488  6.156878e+07  ...        0.001219   

   log_rate_ema3  log_sev_lag1 log_sev_lag2  log_sev_lag3  log_sev_roll3  \
0       0.002473

# MODEL DEVELOPMENT

In [4]:
# ============================================================
# STAGE 3 — CHAMPIONSHIP STABLE GROWTH FRAMEWORK
# ============================================================

import numpy as np
import pandas as pd
import lightgbm as lgb
import warnings
warnings.filterwarnings("ignore")

def mape(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# ===============================
# BUILD PORTFOLIO MONTHLY
# ===============================

monthly = (
    df.groupby("year_month")
      .agg(
          frequency=("claim_id","count"),
          total_claim=("nominal_klaim_yang_disetujui","sum"),
          exposure=("exposure","max")
      )
      .reset_index()
      .sort_values("year_month")
      .reset_index(drop=True)
)

monthly["severity"] = monthly["total_claim"] / monthly["frequency"]

monthly["log_total"] = np.log1p(monthly["total_claim"])
monthly["log_freq"]  = np.log1p(monthly["frequency"])
monthly["log_sev"]   = np.log1p(monthly["severity"])

# ===============================
# GROWTH FEATURES (KEY CHANGE)
# ===============================

monthly["g_total"] = monthly["log_total"].diff()
monthly["g_freq"]  = monthly["log_freq"].diff()
monthly["g_sev"]   = monthly["log_sev"].diff()

# ===============================
# CALENDAR
# ===============================

monthly["month_index"] = np.arange(len(monthly))
monthly["month"] = monthly["year_month"].dt.month
monthly["month_sin"] = np.sin(2*np.pi*monthly["month"]/12)
monthly["month_cos"] = np.cos(2*np.pi*monthly["month"]/12)

# ===============================
# LAGS ON GROWTH
# ===============================

for col in ["g_total","g_freq","g_sev"]:
    monthly[f"{col}_lag1"] = monthly[col].shift(1)
    monthly[f"{col}_lag2"] = monthly[col].shift(2)
    monthly[f"{col}_roll3"] = monthly[col].shift(1).rolling(3).mean()

monthly = monthly.dropna().reset_index(drop=True)

# ===============================
# SPLIT
# ===============================

train_cut = pd.Period("2025-01", freq="M")

train = monthly[monthly["year_month"] < train_cut]
valid = monthly[monthly["year_month"] >= train_cut].iloc[:4]

feature_cols = [
    "month_index","month_sin","month_cos",
    "g_total_lag1","g_total_lag2","g_total_roll3",
    "g_freq_lag1","g_freq_lag2","g_freq_roll3",
    "g_sev_lag1","g_sev_lag2","g_sev_roll3"
]

X_train = train[feature_cols]

# ===============================
# LIGHTGBM (LOW VARIANCE SETUP)
# ===============================

params = dict(
    n_estimators=600,
    learning_rate=0.03,
    num_leaves=7,
    min_child_samples=5,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42
)

model_g_total = lgb.LGBMRegressor(**params)
model_g_freq  = lgb.LGBMRegressor(**params)
model_g_sev   = lgb.LGBMRegressor(**params)

model_g_total.fit(X_train, train["g_total"])
model_g_freq.fit(X_train, train["g_freq"])
model_g_sev.fit(X_train, train["g_sev"])

# ===============================
# PREDICT GROWTH (NON RECURSIVE)
# ===============================

X_valid = valid[feature_cols]

pred_g_total = model_g_total.predict(X_valid)
pred_g_freq  = model_g_freq.predict(X_valid)
pred_g_sev   = model_g_sev.predict(X_valid)

# ===============================
# RECONSTRUCT FROM LAST OBSERVED
# ===============================

last_log_total = train["log_total"].iloc[-1]
last_log_freq  = train["log_freq"].iloc[-1]
last_log_sev   = train["log_sev"].iloc[-1]

pred_log_total = last_log_total + np.cumsum(pred_g_total)
pred_log_freq  = last_log_freq  + np.cumsum(pred_g_freq)
pred_log_sev   = last_log_sev   + np.cumsum(pred_g_sev)

pred_total = np.expm1(pred_log_total)
pred_freq  = np.expm1(pred_log_freq)
pred_sev   = np.expm1(pred_log_sev)

# ===============================
# STABILITY CONTROLS
# ===============================

# Cap extreme change (anti explosion)
pred_total = np.clip(pred_total,
                     0.7 * train["total_claim"].iloc[-1],
                     1.3 * train["total_claim"].iloc[-1])

# Anchor shrinkage
alpha = 0.85
pred_total = alpha * pred_total + (1 - alpha) * train["total_claim"].iloc[-1]

pred_sev_final = pred_total / pred_freq

# ===============================
# EVALUATION
# ===============================

freq_mape = mape(valid["frequency"], pred_freq)
total_mape = mape(valid["total_claim"], pred_total)
sev_mape = mape(valid["severity"], pred_sev_final)

print("\n==============================")
print("MAPE Frequency :", round(freq_mape,2))
print("MAPE Total     :", round(total_mape,2))
print("MAPE Severity  :", round(sev_mape,2))
print("Estimated Score:",
      round((freq_mape+total_mape+sev_mape)/3,2))
print("==============================")


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 8, number of used features: 0
[LightGBM] [Info] Start training from score 0.006107
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 8, number of used features: 0
[LightGBM] [Info] Start training from score -0.000522
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 8, number of used features: 0
[LightGBM] [Info] Start training from score 0.006631

MAPE Frequency : 7.74
MAPE Total     : 19.22
MAPE Severity  : 14.92
Estimated Score: 13.96


# TOTAL CLAIM OPTIMIZATION & VALIDATION

In [5]:
# ============================================================
# STAGE 4 — DIRECT TOTAL ONLY (ULTRA STABLE)
# ============================================================

import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import warnings
warnings.filterwarnings("ignore")

def safe_mape(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    mask = y_true != 0
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask]))

# ===============================
# BUILD MONTHLY
# ===============================

monthly = (
    df.groupby("year_month")
      .agg(
          frequency=("claim_id","count"),
          total_claim=("nominal_klaim_yang_disetujui","sum")
      )
      .reset_index()
      .sort_values("year_month")
      .reset_index(drop=True)
)

monthly["severity"] = monthly["total_claim"] / monthly["frequency"]

monthly["log_total"] = np.log1p(monthly["total_claim"])

# ===============================
# CALENDAR FEATURES
# ===============================

monthly["month_index"] = np.arange(len(monthly))
monthly["month"] = monthly["year_month"].dt.month
monthly["month_sin"] = np.sin(2*np.pi*monthly["month"]/12)
monthly["month_cos"] = np.cos(2*np.pi*monthly["month"]/12)

# ===============================
# LAGS + ROLLING + GROWTH
# ===============================

monthly["lag1"] = monthly["log_total"].shift(1)
monthly["lag2"] = monthly["log_total"].shift(2)
monthly["lag3"] = monthly["log_total"].shift(3)

monthly["roll3"] = monthly["log_total"].shift(1).rolling(3).mean()

monthly["growth1"] = monthly["log_total"].diff().shift(1)
monthly["growth2"] = monthly["log_total"].diff().shift(2)

monthly = monthly.dropna().reset_index(drop=True)

# ===============================
# SPLIT
# ===============================

train_cut = pd.Period("2025-01", freq="M")

train = monthly[monthly["year_month"] < train_cut]
valid = monthly[monthly["year_month"] >= train_cut].iloc[:4]

feature_cols = [
    "month_index","month_sin","month_cos",
    "lag1","lag2","lag3",
    "roll3","growth1","growth2"
]

X_train = train[feature_cols]
X_valid = valid[feature_cols]

y_train = train["log_total"]

# ===============================
# 1️⃣ RIDGE DIRECT MULTI-HORIZON
# ===============================

ridge = Ridge(alpha=150)
ridge.fit(X_train, y_train)

pred_log_ridge = ridge.predict(X_valid)

# ===============================
# 2️⃣ HOLT LOG TOTAL
# ===============================

holt_model = ExponentialSmoothing(
    train["log_total"],
    trend="add",
    damped_trend=True,
    seasonal=None
).fit()

pred_log_holt = holt_model.forecast(len(valid))

# ===============================
# 3️⃣ ADAPTIVE BLEND
# ===============================

err_ridge = safe_mape(
    np.expm1(train["log_total"].iloc[-4:]),
    np.expm1(ridge.predict(X_train.iloc[-4:]))
)

err_holt = safe_mape(
    np.expm1(train["log_total"].iloc[-4:]),
    np.expm1(holt_model.fittedvalues.iloc[-4:])
)

w_ridge = 1/(err_ridge+1e-6)
w_holt  = 1/(err_holt+1e-6)

w_sum = w_ridge + w_holt
w_ridge /= w_sum
w_holt  /= w_sum

pred_log_blend = w_ridge * pred_log_ridge + w_holt * pred_log_holt

# ===============================
# 4️⃣ VOLATILITY ADAPTIVE SHRINKAGE
# ===============================

last_log = train["log_total"].iloc[-1]
hist_std = train["log_total"].diff().std()

beta = 1/(1+hist_std)

pred_log_final = last_log + beta * (pred_log_blend - last_log)

pred_total = np.expm1(pred_log_final)

# ===============================
# 5️⃣ DERIVE FREQ & SEVERITY SMOOTH
# ===============================

# Frequency baseline via Holt (stable)
model_freq = ExponentialSmoothing(
    train["frequency"],
    trend="add",
    damped_trend=True,
    seasonal=None
).fit()

freq_pred = model_freq.forecast(len(valid))
freq_pred = np.clip(freq_pred, 1, None)

sev_pred = pred_total / freq_pred

# ===============================
# 6️⃣ EVALUATION
# ===============================

freq_mape = safe_mape(valid["frequency"], freq_pred)
total_mape = safe_mape(valid["total_claim"], pred_total)
sev_mape = safe_mape(valid["severity"], sev_pred)

final_score = (freq_mape + sev_mape + total_mape) / 3

print("\n==============================")
print("MAPE Frequency :", round(freq_mape,4))
print("MAPE Severity  :", round(sev_mape,4))
print("MAPE Total     :", round(total_mape,4))
print("Final Score    :", round(final_score,4))
print("==============================")
print("STAGE 4 — DIRECT TOTAL COMPLETE")


MAPE Frequency : 0.1231
MAPE Severity  : 0.154
MAPE Total     : 0.2048
Final Score    : 0.1606
STAGE 4 — DIRECT TOTAL COMPLETE


# TEST PREDICTION & KAGGLE SUBMISSION

In [6]:
# ============================================================
# STAGE 5 — FINAL SUBMISSION (STABLE + CONSISTENT)
# ============================================================

import pandas as pd
import numpy as np
from statsmodels.tsa.holtwinters import ExponentialSmoothing

BASE_PATH = "/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/"
sample_sub = pd.read_csv(BASE_PATH + "sample_submission.csv")

# ============================================================
# REFIT ON FULL HISTORY
# ============================================================

monthly = monthly.sort_values("year_month").reset_index(drop=True)

# Frequency (additive damped)
model_freq_full = ExponentialSmoothing(
    monthly["frequency"],
    trend="add",
    damped_trend=True,
    seasonal=None
).fit()

# Severity (log scale)
model_sev_full = ExponentialSmoothing(
    np.log1p(monthly["severity"]),
    trend="add",
    damped_trend=True,
    seasonal=None
).fit()

# Direct total
model_total_full = ExponentialSmoothing(
    monthly["total_claim"],
    trend="add",
    damped_trend=True,
    seasonal=None
).fit()

# ============================================================
# EXTRACT FUTURE MONTHS
# ============================================================

sample_sub["year"]  = sample_sub["id"].str.split("_").str[0]
sample_sub["month"] = sample_sub["id"].str.split("_").str[1]
sample_sub["month_key"] = sample_sub["year"] + "-" + sample_sub["month"]

future_periods = (
    pd.PeriodIndex(sample_sub["month_key"], freq="M")
      .unique()
      .sort_values()
)

steps = len(future_periods)

# ============================================================
# FORECAST
# ============================================================

freq_forecast = np.array(model_freq_full.forecast(steps))
freq_forecast = np.clip(freq_forecast, 1, None)

sev_forecast = np.array(np.expm1(model_sev_full.forecast(steps)))
sev_forecast = np.clip(sev_forecast, 1, None)

total_actuarial = freq_forecast * sev_forecast

total_holt = np.array(model_total_full.forecast(steps))
total_holt = np.clip(total_holt, 1, None)

# ============================================================
# ADAPTIVE BLEND (BASED ON TRAIN ERROR)
# ============================================================

def safe_mape(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    mask = y_true != 0
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask]))

# last 4 months performance check
backtest_steps = min(4, len(monthly)-1)

holt_back = model_total_full.fittedvalues[-backtest_steps:]
act_back = (
    model_freq_full.fittedvalues[-backtest_steps:] *
    np.expm1(model_sev_full.fittedvalues[-backtest_steps:])
)

true_back = monthly["total_claim"].iloc[-backtest_steps:]

err_holt = safe_mape(true_back, holt_back)
err_act  = safe_mape(true_back, act_back)

w_holt = 1/(err_holt + 1e-6)
w_act  = 1/(err_act  + 1e-6)

w_sum = w_holt + w_act
w_holt /= w_sum
w_act  /= w_sum

total_blend = w_holt * total_holt + w_act * total_actuarial

# ============================================================
# VOLATILITY SHRINKAGE
# ============================================================

last_total = monthly["total_claim"].iloc[-1]
hist_vol = monthly["total_claim"].pct_change().std()

beta = 1 / (1 + hist_vol)

total_final = last_total + beta * (total_blend - last_total)

# Ensure actuarial consistency
sev_final = total_final / freq_forecast

# ============================================================
# BUILD SUBMISSION
# ============================================================

predictions = {}

for i, period in enumerate(future_periods):
    key = f"{period.year}_{str(period.month).zfill(2)}"
    predictions[f"{key}_Claim_Frequency"] = freq_forecast[i]
    predictions[f"{key}_Claim_Severity"]  = sev_final[i]
    predictions[f"{key}_Total_Claim"]     = total_final[i]

submission = sample_sub.copy()
submission["value"] = submission["id"].map(predictions)

if submission["value"].isna().sum() > 0:
    print("ERROR: Some IDs not matched")
else:
    print("All IDs matched successfully.")

submission = submission[["id", "value"]]
submission.to_csv("submission.csv", index=False)

print("\nSubmission file created.")
print(submission.head(9))


All IDs matched successfully.

Submission file created.
                        id         value
0  2025_08_Claim_Frequency  2.373871e+02
1   2025_08_Claim_Severity  5.513039e+07
2      2025_08_Total_Claim  1.308724e+10
3  2025_09_Claim_Frequency  2.373191e+02
4   2025_09_Claim_Severity  5.515007e+07
5      2025_09_Total_Claim  1.308816e+10
6  2025_10_Claim_Frequency  2.372631e+02
7   2025_10_Claim_Severity  5.516604e+07
8      2025_10_Total_Claim  1.308887e+10
