In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/Data_Klaim.csv
/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/sample_submission.csv
/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/Data_Polis.csv


# DATA FOUNDATION

In [2]:
# ============================================================
# STAGE 1 v3 — CLEAN FOUNDATION (NO TARGET DISTORTION)
# True Exposure • No Clipping • Forecast Ready
# ============================================================

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

BASE_PATH = "/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/"

klaim = pd.read_csv(BASE_PATH + "Data_Klaim.csv")
polis = pd.read_csv(BASE_PATH + "Data_Polis.csv")

# ============================================================
# CLEAN COLUMN NAMES
# ============================================================

def clean_columns(df):
    df = df.copy()
    df.columns = (
        df.columns
        .str.strip()
        .str.lower()
        .str.replace(" ", "_", regex=False)
        .str.replace("/", "_", regex=False)
        .str.replace("-", "_", regex=False)
    )
    return df

klaim = clean_columns(klaim)
polis = clean_columns(polis)

klaim = klaim.drop_duplicates().reset_index(drop=True)
polis = polis.drop_duplicates().reset_index(drop=True)

# ============================================================
# DATE PARSING
# ============================================================

for col in klaim.columns:
    if "tanggal" in col:
        klaim[col] = pd.to_datetime(klaim[col], errors="coerce")

for col in polis.columns:
    if "tanggal" in col:
        polis[col] = pd.to_datetime(polis[col], errors="coerce")

# ============================================================
# BASIC CLEANING (NO CLIPPING)
# ============================================================

klaim = klaim.dropna(subset=["nomor_polis", "tanggal_pasien_masuk_rs"])
klaim["nominal_klaim_yang_disetujui"] = klaim["nominal_klaim_yang_disetujui"].fillna(0)

# ============================================================
# MERGE
# ============================================================

df = klaim.merge(polis, on="nomor_polis", how="left")

# ============================================================
# MONTH DEFINITION (SERVICE MONTH DEFAULT)
# ============================================================

MONTH_COL = "tanggal_pasien_masuk_rs"
# Alternative test:
# MONTH_COL = "tanggal_pembayaran_klaim"

df["year_month"] = df[MONTH_COL].dt.to_period("M")

# ============================================================
# TRUE EXPOSURE (TOTAL POLICIES)
# ============================================================

EXPOSURE_TRUE = polis["nomor_polis"].nunique()

# ============================================================
# MONTHLY CORE TABLE (RAW TOTAL PRESERVED)
# ============================================================

monthly = (
    df.groupby("year_month")
      .agg(
          frequency=("claim_id","count"),
          total_claim=("nominal_klaim_yang_disetujui","sum")
      )
      .reset_index()
      .sort_values("year_month")
      .reset_index(drop=True)
)

monthly["exposure"] = EXPOSURE_TRUE

monthly["severity"] = (
    monthly["total_claim"] /
    monthly["frequency"].replace(0,np.nan)
)

monthly["claim_rate"] = (
    monthly["frequency"] /
    monthly["exposure"]
)

# ============================================================
# LOG DOMAIN (SAFE FOR MAPE)
# ============================================================

monthly["log_total"] = np.log1p(monthly["total_claim"])
monthly["log_freq"]  = np.log1p(monthly["frequency"])
monthly["log_sev"]   = np.log1p(monthly["severity"])
monthly["log_rate"]  = np.log1p(monthly["claim_rate"])

# ============================================================
# TIME FEATURES
# ============================================================

monthly["month"] = monthly["year_month"].dt.month
monthly["month_sin"] = np.sin(2*np.pi*monthly["month"]/12)
monthly["month_cos"] = np.cos(2*np.pi*monthly["month"]/12)
monthly["month_index"] = np.arange(len(monthly))

# ============================================================
# SAFE LAGS
# ============================================================

for col in ["log_total","log_freq","log_sev","log_rate"]:
    monthly[f"{col}_lag1"] = monthly[col].shift(1)
    monthly[f"{col}_lag2"] = monthly[col].shift(2)
    monthly[f"{col}_lag3"] = monthly[col].shift(3)
    monthly[f"{col}_roll3"] = monthly[col].shift(1).rolling(3).mean()

# ============================================================
# DROP ONLY STRICT LAG NA (KEEP MAX HISTORY)
# ============================================================

monthly = monthly.dropna().reset_index(drop=True)

# ============================================================
# FINAL CHECK
# ============================================================

print("Monthly shape:", monthly.shape)
print("Unique months:", monthly['year_month'].nunique())
print("Exposure (constant):", EXPOSURE_TRUE)
print("\nSTAGE 1 v3 — CLEAN FOUNDATION READY")


Monthly shape: (16, 30)
Unique months: 16
Exposure (constant): 4096

STAGE 1 v3 — CLEAN FOUNDATION READY


In [3]:
# ============================================================
# STAGE 1 v4 — USE PAYMENT DATE (CRITICAL TEST)
# ============================================================

df["year_month"] = df["tanggal_pembayaran_klaim"].dt.to_period("M")

monthly = (
    df.groupby("year_month")
      .agg(
          frequency=("claim_id","count"),
          total_claim=("nominal_klaim_yang_disetujui","sum")
      )
      .reset_index()
      .sort_values("year_month")
      .reset_index(drop=True)
)

print(monthly)

   year_month  frequency   total_claim
0     2024-01          8  1.283162e+08
1     2024-02         92  2.684171e+09
2     2024-03         97  3.809944e+09
3     2024-04        221  9.281203e+09
4     2024-05        233  1.103847e+10
5     2024-06        221  1.127720e+10
6     2024-07        205  1.159773e+10
7     2024-08        285  1.895989e+10
8     2024-09        250  1.484250e+10
9     2024-10        242  1.114198e+10
10    2024-11        365  1.740396e+10
11    2024-12        295  1.409901e+10
12    2025-01        293  1.697253e+10
13    2025-02        183  9.559585e+09
14    2025-03        234  1.494105e+10
15    2025-04        184  7.538943e+09
16    2025-05        201  9.628068e+09
17    2025-06        204  1.617766e+10
18    2025-07        272  1.862361e+10
19    2025-08        245  1.546896e+10
20    2025-09        197  1.041073e+10
21    2025-10         58  4.900102e+09
22    2025-11          3  1.356322e+08
23    2025-12          2  1.366003e+08


# TIME-SERIES DATASET ENGINEERING

In [4]:
# ============================================================
# STAGE 2 v3 — STABLE SEGMENT SHARE PANEL
# Plan-Level Only • Share-Based • Short Series Safe
# ============================================================

import numpy as np
import pandas as pd

# ============================================================
# ENSURE PLAN CODE EXISTS
# ============================================================

if "plan_code" not in df.columns:
    df["plan_code"] = "UNKNOWN"

df["plan_code"] = df["plan_code"].fillna("UNKNOWN")

# ============================================================
# BUILD MONTHLY PLAN-LEVEL PANEL
# ============================================================

seg_monthly = (
    df.groupby(["year_month", "plan_code"])
      .agg(
          frequency=("claim_id","count"),
          total_claim=("nominal_klaim_yang_disetujui","sum")
      )
      .reset_index()
      .sort_values(["plan_code","year_month"])
      .reset_index(drop=True)
)

# ============================================================
# MERGE GLOBAL TOTAL (UNTUK SHARE)
# ============================================================

global_monthly = (
    df.groupby("year_month")
      .agg(total_global=("nominal_klaim_yang_disetujui","sum"))
      .reset_index()
)

seg_monthly = seg_monthly.merge(global_monthly, on="year_month", how="left")

# ============================================================
# SHARE OF TOTAL (STABLE TARGET)
# ============================================================

seg_monthly["share_total"] = (
    seg_monthly["total_claim"] /
    seg_monthly["total_global"].replace(0,np.nan)
)

# ============================================================
# SMOOTH SHARE (SHORT SERIES SAFE)
# ============================================================

seg_monthly["share_roll3"] = (
    seg_monthly.groupby("plan_code")["share_total"]
    .transform(lambda x: x.rolling(3, min_periods=1).mean())
)

# ============================================================
# LOG DOMAIN (OPTIONAL)
# ============================================================

seg_monthly["log_total"] = np.log1p(seg_monthly["total_claim"])
seg_monthly["log_freq"]  = np.log1p(seg_monthly["frequency"])

# ============================================================
# LAG FEATURES (SAFE)
# ============================================================

for col in ["log_total","log_freq","share_roll3"]:
    seg_monthly[f"{col}_lag1"] = \
        seg_monthly.groupby("plan_code")[col].shift(1)
    
    seg_monthly[f"{col}_lag2"] = \
        seg_monthly.groupby("plan_code")[col].shift(2)

# ============================================================
# DROP EARLY NA (MINIMAL LOSS)
# ============================================================

seg_model = seg_monthly.dropna().reset_index(drop=True)

# ============================================================
# FINAL CHECK
# ============================================================

print("SEGMENT PANEL SHAPE:", seg_model.shape)
print("Unique plan codes:", seg_model["plan_code"].nunique())
print("Months per plan (min):", 
      seg_model.groupby("plan_code")["year_month"].nunique().min())

print("\nSTAGE 2 v3 — STABLE SEGMENT SHARE PANEL READY")


SEGMENT PANEL SHAPE: (62, 15)
Unique plan codes: 3
Months per plan (min): 19

STAGE 2 v3 — STABLE SEGMENT SHARE PANEL READY


# MODEL DEVELOPMENT

In [5]:
# ============================================================
# STAGE 3 FINAL — PURE ETS TOTAL (CLEAN VERSION)
# 5-STEP RECURSIVE • SERVICE DATE BASED
# ============================================================

import numpy as np
import pandas as pd
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import warnings
warnings.filterwarnings("ignore")

def mape(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    mask = y_true != 0
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100

# ============================================================
# REBUILD MONTHLY USING SERVICE DATE
# ============================================================

df["year_month"] = df["tanggal_pasien_masuk_rs"].dt.to_period("M")

monthly = (
    df.groupby("year_month")
      .agg(
          frequency=("claim_id","count"),
          total_claim=("nominal_klaim_yang_disetujui","sum")
      )
      .reset_index()
      .sort_values("year_month")
      .reset_index(drop=True)
)

monthly["severity"] = (
    monthly["total_claim"] /
    monthly["frequency"].replace(0, np.nan)
)

# ============================================================
# 5-STEP VALIDATION SPLIT
# ============================================================

train = monthly.iloc[:-5].copy()
valid = monthly.iloc[-5:].copy()

sim_df = train.copy()

pred_total = []
pred_freq  = []
pred_sev   = []

# ============================================================
# RECURSIVE FORECAST
# ============================================================

for step in range(5):

    train_sim = sim_df.copy()

    # -------- TOTAL MODEL --------
    try:
        model_total = ExponentialSmoothing(
            np.log1p(train_sim["total_claim"]),
            trend="add",
            damped_trend=True,
            seasonal=None
        ).fit(optimized=True)

        total_pred = np.expm1(model_total.forecast(1).iloc[0])
    except:
        total_pred = train_sim["total_claim"].iloc[-1]

    # -------- FREQUENCY MODEL --------
    try:
        model_freq = ExponentialSmoothing(
            np.log1p(train_sim["frequency"]),
            trend="add",
            damped_trend=True,
            seasonal=None
        ).fit(optimized=True)

        freq_pred = np.expm1(model_freq.forecast(1).iloc[0])
    except:
        freq_pred = train_sim["frequency"].iloc[-1]

    freq_pred = max(freq_pred, 1)

    sev_pred = total_pred / freq_pred

    pred_total.append(total_pred)
    pred_freq.append(freq_pred)
    pred_sev.append(sev_pred)

    new_row = {
        "year_month": None,
        "frequency": freq_pred,
        "total_claim": total_pred,
        "severity": sev_pred
    }

    sim_df = pd.concat([sim_df, pd.DataFrame([new_row])], ignore_index=True)

# ============================================================
# RESULTS
# ============================================================

print("\n==============================")
print("STAGE 3 FINAL MAPE Frequency :", round(mape(valid["frequency"], pred_freq),4))
print("STAGE 3 FINAL MAPE Total     :", round(mape(valid["total_claim"], pred_total),4))
print("STAGE 3 FINAL MAPE Severity  :", round(mape(valid["severity"], pred_sev),4))
print("Estimated Score              :", round(np.mean([
    mape(valid["frequency"], pred_freq),
    mape(valid["total_claim"], pred_total),
    mape(valid["severity"], pred_sev)
]),4))
print("==============================")



STAGE 3 FINAL MAPE Frequency : 5.8639
STAGE 3 FINAL MAPE Total     : 7.9326
STAGE 3 FINAL MAPE Severity  : 5.4876
Estimated Score              : 6.428


# TOTAL CLAIM OPTIMIZATION & VALIDATION, OPTUNA

In [6]:
# ============================================================
# STAGE 4 v2 — ADVANCED ETS OPTIMIZATION (TOTAL ONLY)
# Tune smoothing parameters directly
# ============================================================

!pip install -q optuna

import optuna
import numpy as np
import pandas as pd
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import warnings
warnings.filterwarnings("ignore")

def mape(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    mask = y_true != 0
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask]))

# ==============================
# BUILD MONTHLY
# ==============================

df["year_month"] = df["tanggal_pasien_masuk_rs"].dt.to_period("M")

monthly = (
    df.groupby("year_month")
      .agg(total_claim=("nominal_klaim_yang_disetujui","sum"))
      .reset_index()
      .sort_values("year_month")
      .reset_index(drop=True)
)

monthly["log_total"] = np.log1p(monthly["total_claim"])

train_full = monthly.iloc[:-4].copy()
valid_full = monthly.iloc[-4:].copy()

# ==============================
# OBJECTIVE
# ==============================

def objective(trial):

    trend_type = trial.suggest_categorical("trend", ["add", None])
    damped = trial.suggest_categorical("damped", [True, False])

    alpha = trial.suggest_float("alpha", 0.2, 0.95)
    beta  = trial.suggest_float("beta", 0.01, 0.5)
    phi   = trial.suggest_float("phi", 0.7, 0.99)

    sim_df = train_full.copy()
    preds = []

    for step in range(4):

        sub_train = sim_df.copy()

        try:
            model = ExponentialSmoothing(
                sub_train["log_total"],
                trend=trend_type,
                damped_trend=damped,
                seasonal=None,
                initialization_method="estimated"
            ).fit(
                smoothing_level=alpha,
                smoothing_trend=beta if trend_type=="add" else None,
                damping_trend=phi if damped else None,
                optimized=False
            )

            pred = np.expm1(model.forecast(1).iloc[0])

        except:
            pred = sub_train["total_claim"].iloc[-1]

        preds.append(pred)

        new_row = {
            "year_month": None,
            "total_claim": pred,
            "log_total": np.log1p(pred)
        }

        sim_df = pd.concat([sim_df, pd.DataFrame([new_row])], ignore_index=True)

    return mape(valid_full["total_claim"], preds)

# ==============================
# RUN OPTUNA
# ==============================

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=120)

print("\nBest Params:", study.best_params)
print("Best 4M TOTAL MAPE:", round(study.best_value*100,4), "%")

[I 2026-02-17 13:43:38,084] A new study created in memory with name: no-name-62d02ce2-e4a4-4834-bf34-e3b2aad6d8ce
[I 2026-02-17 13:43:38,100] Trial 0 finished with value: 0.09576210396959155 and parameters: {'trend': 'add', 'damped': False, 'alpha': 0.4956318279793044, 'beta': 0.032989565370572405, 'phi': 0.9160269635013812}. Best is trial 0 with value: 0.09576210396959155.
[I 2026-02-17 13:43:38,114] Trial 1 finished with value: 0.10292274758273408 and parameters: {'trend': None, 'damped': False, 'alpha': 0.5254114159001175, 'beta': 0.4377144199247941, 'phi': 0.9349599505724282}. Best is trial 0 with value: 0.09576210396959155.
[I 2026-02-17 13:43:38,129] Trial 2 finished with value: 0.15573100442062646 and parameters: {'trend': 'add', 'damped': False, 'alpha': 0.9498162615239718, 'beta': 0.3215191645703494, 'phi': 0.9844332500210311}. Best is trial 0 with value: 0.09576210396959155.
[I 2026-02-17 13:43:38,144] Trial 3 finished with value: 0.12201337028600724 and parameters: {'trend':


Best Params: {'trend': 'add', 'damped': False, 'alpha': 0.2051142172913708, 'beta': 0.35086618590849417, 'phi': 0.7318611253157359}
Best 4M TOTAL MAPE: 5.9603 %


# TEST PREDICTION & KAGGLE SUBMISSION

In [7]:
# ============================================================
# STAGE 5 FINAL — OPTIMIZED TOTAL + ETS FREQ/SEV
# ============================================================

import numpy as np
import pandas as pd
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import warnings
warnings.filterwarnings("ignore")

BASE_PATH = "/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/"
sample_sub = pd.read_csv(BASE_PATH + "sample_submission.csv")

# ==============================
# BUILD MONTHLY (SERVICE DATE)
# ==============================

df["year_month"] = df["tanggal_pasien_masuk_rs"].dt.to_period("M")

monthly = (
    df.groupby("year_month")
      .agg(
          frequency=("claim_id","count"),
          total_claim=("nominal_klaim_yang_disetujui","sum")
      )
      .reset_index()
      .sort_values("year_month")
      .reset_index(drop=True)
)

monthly["severity"] = (
    monthly["total_claim"] /
    monthly["frequency"].replace(0,np.nan)
)

monthly["log_total"] = np.log1p(monthly["total_claim"])

# ==============================
# PREPARE FUTURE MONTHS
# ==============================

sample_sub["year"]  = sample_sub["id"].str.split("_").str[0]
sample_sub["month"] = sample_sub["id"].str.split("_").str[1]
sample_sub["month_key"] = sample_sub["year"] + "-" + sample_sub["month"]

future_periods = (
    pd.PeriodIndex(sample_sub["month_key"], freq="M")
      .unique()
      .sort_values()
)

sim_df = monthly.copy()
predictions = {}

# ==============================
# RECURSIVE FORECAST
# ==============================

for period in future_periods:

    train_sim = sim_df.copy()

    # ----- TOTAL (LOCKED PARAMS) -----
    model_total = ExponentialSmoothing(
        train_sim["log_total"],
        trend="add",
        damped_trend=False,
        seasonal=None,
        initialization_method="estimated"
    ).fit(
        smoothing_level=0.20106970125157483,
        smoothing_trend=0.3759840708860101,
        optimized=False
    )

    total_pred = np.expm1(model_total.forecast(1).iloc[0])

    # ----- FREQUENCY -----
    model_freq = ExponentialSmoothing(
        np.log1p(train_sim["frequency"]),
        trend="add",
        damped_trend=True,
        seasonal=None
    ).fit(optimized=True)

    freq_pred = np.expm1(model_freq.forecast(1).iloc[0])
    freq_pred = max(freq_pred, 1)

    # ----- SEVERITY -----
    model_sev = ExponentialSmoothing(
        np.log1p(train_sim["severity"]),
        trend="add",
        damped_trend=True,
        seasonal=None
    ).fit(optimized=True)

    sev_pred = np.expm1(model_sev.forecast(1).iloc[0])

    # ----- UPDATE -----
    new_row = {
        "year_month": period,
        "frequency": freq_pred,
        "total_claim": total_pred,
        "severity": sev_pred,
        "log_total": np.log1p(total_pred)
    }

    sim_df = pd.concat([sim_df, pd.DataFrame([new_row])], ignore_index=True)

    # save predictions
    key_total = f"{period.year}_{str(period.month).zfill(2)}_Total_Claim"
    key_freq  = f"{period.year}_{str(period.month).zfill(2)}_Claim_Frequency"
    key_sev   = f"{period.year}_{str(period.month).zfill(2)}_Claim_Severity"

    predictions[key_total] = total_pred
    predictions[key_freq]  = freq_pred
    predictions[key_sev]   = sev_pred

submission = sample_sub.copy()
submission["value"] = submission["id"].map(predictions)
submission = submission[["id","value"]]

submission.to_csv("submission.csv", index=False)

print("Submission created — OPTIMIZED TOTAL")

Submission created — OPTIMIZED TOTAL


In [8]:
print(submission.head(9)) 

                        id         value
0  2025_08_Claim_Frequency  2.351593e+02
1   2025_08_Claim_Severity  5.285407e+07
2      2025_08_Total_Claim  1.324990e+10
3  2025_09_Claim_Frequency  2.350778e+02
4   2025_09_Claim_Severity  5.283369e+07
5      2025_09_Total_Claim  1.345157e+10
6  2025_10_Claim_Frequency  2.350126e+02
7   2025_10_Claim_Severity  5.281739e+07
8      2025_10_Total_Claim  1.365632e+10
