In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/Data_Klaim.csv
/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/sample_submission.csv
/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/Data_Polis.csv


# DATA FOUNDATION

In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

BASE_PATH = "/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/"

klaim = pd.read_csv(BASE_PATH + "Data_Klaim.csv")
polis = pd.read_csv(BASE_PATH + "Data_Polis.csv")

print("Initial Klaim :", klaim.shape)
print("Initial Polis :", polis.shape)

# ============================================================
# CLEAN COLUMN NAMES
# ============================================================

def clean_columns(df):
    df.columns = (
        df.columns
        .str.strip()
        .str.lower()
        .str.replace(" ", "_", regex=False)
        .str.replace("/", "_", regex=False)
        .str.replace("-", "_", regex=False)
    )
    return df

klaim = clean_columns(klaim)
polis = clean_columns(polis)

klaim = klaim.drop_duplicates().reset_index(drop=True)
polis = polis.drop_duplicates().reset_index(drop=True)

# ============================================================
# DATE PARSING
# ============================================================

for col in klaim.columns:
    if "tanggal" in col:
        klaim[col] = pd.to_datetime(klaim[col], errors="coerce")

for col in polis.columns:
    if "tanggal" in col:
        try:
            polis[col] = pd.to_datetime(
                polis[col].astype(str),
                format="%Y%m%d",
                errors="raise"
            )
        except:
            polis[col] = pd.to_datetime(polis[col], errors="coerce")

# ============================================================
# BASIC CLEANING (SAFE)
# ============================================================

klaim = klaim.dropna(subset=["nomor_polis"])
klaim = klaim.dropna(subset=["tanggal_pasien_masuk_rs"])

klaim = klaim[
    klaim["nominal_klaim_yang_disetujui"] > 0
].copy()

# FIX claim_ratio safely
if "nominal_biaya_rs_yang_terjadi" in klaim.columns:
    denom = klaim["nominal_biaya_rs_yang_terjadi"].replace(0, np.nan)
    klaim["claim_ratio"] = (
        klaim["nominal_klaim_yang_disetujui"] / denom
    )
    klaim["claim_ratio"] = (
        klaim["claim_ratio"]
        .replace([np.inf, -np.inf], np.nan)
        .clip(0, 5)
        .fillna(klaim["claim_ratio"].median())
    )

# ============================================================
# MERGE
# ============================================================

df = klaim.merge(polis, on="nomor_polis", how="left")

print("After merge :", df.shape)

# ============================================================
# USE SERVICE MONTH (CRITICAL FIX)
# ============================================================

df["year_month"] = df["tanggal_pasien_masuk_rs"].dt.to_period("M")

print("Month range:",
      df["year_month"].min(),
      "→",
      df["year_month"].max())

# ============================================================
# FEATURE ENGINEERING
# ============================================================

# Age
if "tanggal_lahir" in df.columns:
    df["age"] = (
        (df["tanggal_pasien_masuk_rs"] - df["tanggal_lahir"]).dt.days / 365
    )

# Tenure
if "tanggal_efektif_polis" in df.columns:
    df["tenure_days"] = (
        (df["tanggal_pasien_masuk_rs"] -
         df["tanggal_efektif_polis"]).dt.days
    )

# Length of Stay (handle NaT safely)
if "tanggal_pasien_keluar_rs" in df.columns:
    df["los"] = (
        df["tanggal_pasien_keluar_rs"] -
        df["tanggal_pasien_masuk_rs"]
    ).dt.days

# ============================================================
# SEGMENT FEATURES (FOR SEASONAL-YOY MODEL)
# ============================================================

df["is_inpatient"] = (
    df["inpatient_outpatient"]
    .astype(str)
    .str.upper()
    .str.startswith("IP")
    .astype(int)
)

df["is_cashless"] = (
    df["reimburse_cashless"]
    .astype(str)
    .str.lower()
    .eq("cashless")
    .astype(int)
)

df["rs_bucket"] = np.where(
    df["lokasi_rs"].astype(str).str.lower() == "indonesia",
    "ID",
    "NONID"
)

# ============================================================
# EXPOSURE (CORRECT RANGE)
# ============================================================

all_months = pd.period_range(
    df["year_month"].min(),
    df["year_month"].max(),
    freq="M"
)

exposure_list = []

for m in all_months:
    active = polis[
        polis["tanggal_efektif_polis"].dt.to_period("M") <= m
    ]["nomor_polis"].nunique()

    exposure_list.append({
        "year_month": m,
        "exposure": active
    })

exposure_df = pd.DataFrame(exposure_list)

df = df.merge(exposure_df, on="year_month", how="left")

# ============================================================
# FINAL CHECK
# ============================================================

print("\nFinal shape:", df.shape)
print("Unique months:", df["year_month"].nunique())
print("\nSample:")
print(df.head())

print("\nSTAGE 1 COMPLETE — TOP VERSION READY")


Initial Klaim : (4627, 13)
Initial Polis : (4096, 6)
After merge : (4614, 19)
Month range: 2024-01 → 2025-07

Final shape: (4614, 27)
Unique months: 19

Sample:
   claim_id nomor_polis reimburse_cashless inpatient_outpatient icd_diagnosis  \
0  C-0001-M    POL-0176                  R                   OP           C50   
1  C-0002-M    POL-3288                  R                   OP           C34   
2  C-0003-M    POL-1786                  R                   OP         C18.9   
3  C-0004-M    POL-1786                  R                   OP           C34   
4  C-0005-M    POL-2778                  R                   OP           C50   

                           icd_description status_klaim  \
0             MALIGNANT NEOPLASM OF BREAST         PAID   
1  MALIGNANT NEOPLASM OF BRONCHUS AND LUNG         PAID   
2   MALIGNANT NEOPLASM, COLON, UNSPECIFIED         PAID   
3  MALIGNANT NEOPLASM OF BRONCHUS AND LUNG         PAID   
4             MALIGNANT NEOPLASM OF BREAST         PAID  

# TIME-SERIES DATASET ENGINEERING

In [3]:
import numpy as np
import pandas as pd

# ============================================================
# 1. SEGMENT MONTHLY DATASET (MAIN MODEL)
# ============================================================

seg_cols = [
    "plan_code",
    "is_inpatient",
    "is_cashless",
    "rs_bucket"
]

seg_monthly = (
    df.groupby(["year_month"] + seg_cols)
      .agg(
          frequency=("claim_id","count"),
          total_claim=("nominal_klaim_yang_disetujui","sum")
      )
      .reset_index()
)

print("Segment Monthly Shape:", seg_monthly.shape)

# ============================================================
# 2. PORTFOLIO MONTHLY (FOR ML BLEND)
# ============================================================

monthly = (
    df.groupby("year_month")
      .agg(
          frequency=("claim_id","count"),
          total_claim=("nominal_klaim_yang_disetujui","sum"),
          exposure=("exposure","max"),
          avg_age=("age","mean"),
          avg_tenure=("tenure_days","mean"),
          avg_los=("los","mean"),
          avg_claim_ratio=("claim_ratio","mean")
      )
      .reset_index()
)

monthly = monthly.sort_values("year_month").reset_index(drop=True)

monthly["severity"] = monthly["total_claim"] / monthly["frequency"]

# ============================================================
# 3. CALENDAR FEATURES
# ============================================================

monthly["year_month_dt"] = monthly["year_month"].dt.to_timestamp()
monthly["month"] = monthly["year_month_dt"].dt.month
monthly["month_sin"] = np.sin(2*np.pi*monthly["month"]/12)
monthly["month_cos"] = np.cos(2*np.pi*monthly["month"]/12)

monthly["month_index"] = np.arange(len(monthly))

# ============================================================
# 4. CORE LAGS (MINIMAL BUT STRONG)
# ============================================================

for col in ["frequency","total_claim","severity"]:
    monthly[f"{col}_lag1"] = monthly[col].shift(1)
    monthly[f"{col}_roll3"] = monthly[col].shift(1).rolling(3).mean()
    monthly[f"{col}_expanding"] = monthly[col].shift(1).expanding().mean()

# ============================================================
# 5. MIX FEATURES (IMPORTANT FOR BLEND)
# ============================================================

mix = (
    df.groupby("year_month")
      .agg(
          share_inpatient=("is_inpatient","mean"),
          share_cashless=("is_cashless","mean"),
      )
      .reset_index()
)

monthly = monthly.merge(mix, on="year_month", how="left")

for col in ["share_inpatient","share_cashless"]:
    monthly[f"{col}_lag1"] = monthly[col].shift(1)

# ============================================================
# 6. DO NOT DROP MONTHS (FILL SMART)
# ============================================================

for col in monthly.columns:
    if col not in ["year_month","year_month_dt"]:
        monthly[col] = monthly[col].fillna(method="bfill")

print("\nPortfolio Monthly Shape:", monthly.shape)
print("Time Range:",
      monthly["year_month"].min(),
      "→",
      monthly["year_month"].max())

print("\nColumns:")
print(monthly.columns.tolist())

print("\nPreview:")
print(monthly.head())

print("\nSTAGE 2 COMPLETE — SEGMENT MODEL READY")


Segment Monthly Shape: (189, 7)

Portfolio Monthly Shape: (19, 27)
Time Range: 2024-01 → 2025-07

Columns:
['year_month', 'frequency', 'total_claim', 'exposure', 'avg_age', 'avg_tenure', 'avg_los', 'avg_claim_ratio', 'severity', 'year_month_dt', 'month', 'month_sin', 'month_cos', 'month_index', 'frequency_lag1', 'frequency_roll3', 'frequency_expanding', 'total_claim_lag1', 'total_claim_roll3', 'total_claim_expanding', 'severity_lag1', 'severity_roll3', 'severity_expanding', 'share_inpatient', 'share_cashless', 'share_inpatient_lag1', 'share_cashless_lag1']

Preview:
  year_month  frequency   total_claim  exposure    avg_age   avg_tenure  \
0    2024-01        299  2.026098e+10      4096  59.930627  3345.050167   
1    2024-02        208  1.385965e+10      4096  58.807337  3348.644231   
2    2024-03        278  1.431126e+10      4096  57.909796  3309.902878   
3    2024-04        238  1.144106e+10      4096  56.926154  3380.004202   
4    2024-05        263  1.221146e+10      4096  57.

# MODEL DEVELOPMENT

In [4]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import numpy as np
import pandas as pd

def mape(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

monthly = monthly.sort_values("year_month").reset_index(drop=True)

# =========================
# CREATE RATE
# =========================

monthly["claim_rate"] = monthly["frequency"] / monthly["exposure"]

train_cut = pd.Period("2025-01", freq="M")

train_data = monthly[monthly["year_month"] < train_cut]
valid_data = monthly[monthly["year_month"] >= train_cut].iloc[:4]

steps = 4

# =========================
# MODEL CLAIM RATE
# =========================

model_rate = ExponentialSmoothing(
    train_data["claim_rate"],
    trend="add",
    damped_trend=True,
    seasonal=None
).fit()

pred_rate = model_rate.forecast(steps)

# =========================
# MODEL SEVERITY (LOG)
# =========================

model_sev = ExponentialSmoothing(
    np.log1p(train_data["severity"]),
    trend="add",
    damped_trend=True,
    seasonal=None
).fit()

pred_sev = np.expm1(model_sev.forecast(steps))

# =========================
# RECONSTRUCT
# =========================

exposure_future = valid_data["exposure"].values

pred_freq = pred_rate * exposure_future
pred_freq = np.clip(pred_freq, 1, None)

pred_total = pred_freq * pred_sev

# =========================
# TRUE VALUES
# =========================

true_freq = valid_data["frequency"].values
true_total = valid_data["total_claim"].values
true_sev = valid_data["severity"].values

print("MAPE Frequency :", round(mape(true_freq,pred_freq),2))
print("MAPE Total     :", round(mape(true_total,pred_total),2))
print("MAPE Severity  :", round(mape(true_sev,pred_sev),2))
print("Estimated Score:",
      round((mape(true_freq,pred_freq)+
             mape(true_total,pred_total)+
             mape(true_sev,pred_sev))/3,2))


MAPE Frequency : 8.12
MAPE Total     : 19.19
MAPE Severity  : 16.1
Estimated Score: 14.47


# TOTAL CLAIM OPTIMIZATION & VALIDATION

In [5]:
# ============================================================
# IMPORT
# ============================================================

import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# ============================================================
# SAFE MAPE
# ============================================================

def safe_mape(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    mask = y_true != 0
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask]))

# ============================================================
# SPLIT
# ============================================================

monthly = monthly.sort_values("year_month").reset_index(drop=True)

train_cut = pd.Period("2025-01", freq="M")

train = monthly[monthly["year_month"] < train_cut].copy()
valid = monthly[monthly["year_month"] >= train_cut].iloc[:4].copy()

# ============================================================
# 1️⃣ FREQUENCY MODEL (HOLT DAMPED)
# ============================================================

model_freq = ExponentialSmoothing(
    train["frequency"],
    trend="add",
    damped_trend=True,
    seasonal=None
).fit()

freq_pred_valid = model_freq.forecast(len(valid))
freq_pred_valid = np.clip(freq_pred_valid, 1, None)

# ============================================================
# 2️⃣ SEVERITY MODEL (LOG HOLT)
# ============================================================

model_sev = ExponentialSmoothing(
    np.log1p(train["severity"]),
    trend="add",
    damped_trend=True,
    seasonal=None
).fit()

sev_pred_valid = np.expm1(model_sev.forecast(len(valid)))
sev_pred_valid = np.clip(sev_pred_valid, 1, None)

# ============================================================
# 3️⃣ ACTUARIAL TOTAL
# ============================================================

total_pred_valid_actuarial = freq_pred_valid * sev_pred_valid

# ============================================================
# 4️⃣ RIDGE TOTAL
# ============================================================

features_total = [
    "month_index",
    "month_sin",
    "month_cos",
    "total_lag1",
    "frequency_lag1",
    "severity_lag1"
]

features_total = [f for f in features_total if f in monthly.columns]

X_train_total = train[features_total]
X_valid_total = valid[features_total]

y_train_total = np.log1p(train["total_claim"])
y_valid_total = valid["total_claim"]

ridge_total = Ridge(alpha=50)
ridge_total.fit(X_train_total, y_train_total)

total_pred_valid_ridge = np.expm1(ridge_total.predict(X_valid_total))
total_pred_valid_ridge = np.clip(total_pred_valid_ridge, 1, None)

# ============================================================
# 5️⃣ HOLT TOTAL
# ============================================================

holt_model = ExponentialSmoothing(
    train["total_claim"],
    trend="add",
    damped_trend=True,
    seasonal=None
).fit()

total_pred_valid_holt = holt_model.forecast(len(valid))
total_pred_valid_holt = np.clip(total_pred_valid_holt, 1, None)

# ============================================================
# 6️⃣ EVALUATE INDIVIDUAL
# ============================================================

models = {
    "Actuarial": total_pred_valid_actuarial,
    "Ridge": total_pred_valid_ridge,
    "Holt": total_pred_valid_holt
}

print("\nIndividual Total MAPE:")
for k, v in models.items():
    print(k, ":", round(safe_mape(y_valid_total, v), 4))

# ============================================================
# 7️⃣ STABLE FIXED BLEND
# ============================================================

weights = {
    "Actuarial": 0.5,
    "Ridge": 0.25,
    "Holt": 0.25
}

total_pred_valid_blend = sum(
    weights[k] * models[k] for k in models
)

mape_total_final = safe_mape(y_valid_total, total_pred_valid_blend)

# ============================================================
# 8️⃣ FINAL SCORE
# ============================================================

mape_freq_final = safe_mape(valid["frequency"], freq_pred_valid)
mape_sev_final  = safe_mape(valid["severity"], sev_pred_valid)

final_score = (mape_freq_final + mape_sev_final + mape_total_final) / 3

print("\nFINAL COMPETITION SCORE")
print("MAPE Frequency :", round(mape_freq_final, 4))
print("MAPE Severity  :", round(mape_sev_final, 4))
print("MAPE Total     :", round(mape_total_final, 4))
print("Final Score    :", round(final_score, 4))

print("\nSTAGE 4 COMPLETE — FULL SELF CONTAINED")



Individual Total MAPE:
Actuarial : 0.1919
Ridge : 0.2227
Holt : 0.188

FINAL COMPETITION SCORE
MAPE Frequency : 0.0812
MAPE Severity  : 0.161
MAPE Total     : 0.1853
Final Score    : 0.1425

STAGE 4 COMPLETE — FULL SELF CONTAINED


# TEST PREDICTION & KAGGLE SUBMISSION

In [6]:
# ============================================================
# STAGE 5 — FINAL SUBMISSION CLEAN
# ============================================================

import pandas as pd
import numpy as np
from statsmodels.tsa.holtwinters import ExponentialSmoothing

BASE_PATH = "/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/"
sample_sub = pd.read_csv(BASE_PATH + "sample_submission.csv")

# ============================================================
# PREPARE FULL DATA (TRAIN ALL HISTORY)
# ============================================================

monthly = monthly.sort_values("year_month").reset_index(drop=True)

# Refit models on FULL data (sampai Jul 2025)

model_freq_full = ExponentialSmoothing(
    monthly["frequency"],
    trend="add",
    damped_trend=True,
    seasonal=None
).fit()

model_sev_full = ExponentialSmoothing(
    np.log1p(monthly["severity"]),
    trend="add",
    damped_trend=True,
    seasonal=None
).fit()

model_total_full = ExponentialSmoothing(
    monthly["total_claim"],
    trend="add",
    damped_trend=True,
    seasonal=None
).fit()

# ============================================================
# EXTRACT FUTURE MONTHS
# ============================================================

sample_sub["year"]  = sample_sub["id"].str.split("_").str[0]
sample_sub["month"] = sample_sub["id"].str.split("_").str[1]
sample_sub["month_key"] = sample_sub["year"] + "-" + sample_sub["month"]

future_periods = (
    pd.PeriodIndex(sample_sub["month_key"], freq="M")
      .unique()
      .sort_values()
)

steps = len(future_periods)

# ============================================================
# FORECAST (CONVERT TO NUMPY)
# ============================================================

freq_forecast = np.array(model_freq_full.forecast(steps))
freq_forecast = np.clip(freq_forecast, 1, None)

sev_forecast = np.array(np.expm1(model_sev_full.forecast(steps)))
sev_forecast = np.clip(sev_forecast, 1, None)

total_actuarial = freq_forecast * sev_forecast

total_holt = np.array(model_total_full.forecast(steps))
total_holt = np.clip(total_holt, 1, None)

# Stable blend
total_forecast = 0.6 * total_actuarial + 0.4 * total_holt

# ============================================================
# STABLE BLEND (FIXED)
# ============================================================

# 60% actuarial + 40% holt (lebih stabil)
total_forecast = 0.6 * total_actuarial + 0.4 * total_holt

# ============================================================
# BUILD SUBMISSION
# ============================================================

predictions = {}

for i, period in enumerate(future_periods):

    key = f"{period.year}_{str(period.month).zfill(2)}"

    predictions[f"{key}_Claim_Frequency"] = freq_forecast[i]
    predictions[f"{key}_Claim_Severity"]  = sev_forecast[i]
    predictions[f"{key}_Total_Claim"]     = total_forecast[i]

submission = sample_sub.copy()
submission["value"] = submission["id"].map(predictions)

if submission["value"].isna().sum() > 0:
    print("ERROR: Some IDs not matched")
else:
    print("All IDs matched successfully.")

submission = submission[["id", "value"]]
submission.to_csv("submission.csv", index=False)

print("\nSubmission file created.")
print(submission.head(9))


All IDs matched successfully.

Submission file created.
                        id         value
0  2025_08_Claim_Frequency  2.353129e+02
1   2025_08_Claim_Severity  5.291305e+07
2      2025_08_Total_Claim  1.252789e+10
3  2025_09_Claim_Frequency  2.352243e+02
4   2025_09_Claim_Severity  5.289162e+07
5      2025_09_Total_Claim  1.251231e+10
6  2025_10_Claim_Frequency  2.351535e+02
7   2025_10_Claim_Severity  5.287449e+07
8      2025_10_Total_Claim  1.249910e+10
