In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/Data_Klaim.csv
/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/sample_submission.csv
/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/Data_Polis.csv


# DATA FOUNDATION

In [2]:
# ============================================================
# STAGE 1 â€” DATA FOUNDATION (FINAL SAFE VERSION)
# ============================================================

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

BASE_PATH = "/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/"

klaim = pd.read_csv(BASE_PATH + "Data_Klaim.csv")
polis = pd.read_csv(BASE_PATH + "Data_Polis.csv")

print("Initial Klaim shape :", klaim.shape)
print("Initial Polis shape :", polis.shape)

# ============================================================
# CLEAN COLUMN NAMES
# ============================================================

def clean_columns(df):
    df.columns = (
        df.columns
        .str.strip()
        .str.lower()
        .str.replace(" ", "_", regex=False)
        .str.replace("/", "_", regex=False)
        .str.replace("-", "_", regex=False)
    )
    return df

klaim = clean_columns(klaim)
polis = clean_columns(polis)

klaim = klaim.drop_duplicates().reset_index(drop=True)
polis = polis.drop_duplicates().reset_index(drop=True)

# ============================================================
# DATE PARSING (SAFE & FLEXIBLE)
# ============================================================

# Parse all possible date columns safely
for col in klaim.columns:
    if "tanggal" in col:
        klaim[col] = pd.to_datetime(klaim[col], errors="coerce")

for col in polis.columns:
    if "tanggal" in col:
        # Try YYYYMMDD first
        try:
            polis[col] = pd.to_datetime(
                polis[col].astype(str),
                format="%Y%m%d",
                errors="raise"
            )
        except:
            polis[col] = pd.to_datetime(polis[col], errors="coerce")

# Check tanggal efektif polis range
if "tanggal_efektif_polis" in polis.columns:
    print("\nTanggal efektif polis range:")
    print(polis["tanggal_efektif_polis"].min())
    print(polis["tanggal_efektif_polis"].max())

# ============================================================
# CLEANING
# ============================================================

klaim = klaim.dropna(subset=["nomor_polis"])

if "tanggal_pembayaran_klaim" in klaim.columns:
    klaim = klaim.dropna(subset=["tanggal_pembayaran_klaim"])

if "nominal_klaim_yang_disetujui" in klaim.columns:
    klaim = klaim[klaim["nominal_klaim_yang_disetujui"] > 0]

if (
    "tanggal_pasien_masuk_rs" in klaim.columns and
    "tanggal_pasien_keluar_rs" in klaim.columns
):
    klaim = klaim[
        klaim["tanggal_pasien_keluar_rs"] >=
        klaim["tanggal_pasien_masuk_rs"]
    ]

print("Klaim shape after cleaning :", klaim.shape)

# ============================================================
# MERGE
# ============================================================

df = klaim.merge(polis, on="nomor_polis", how="left")
print("Merged shape :", df.shape)

# ============================================================
# FEATURE ENGINEERING (SAFE)
# ============================================================

if "tanggal_pasien_masuk_rs" in df.columns and "tanggal_lahir" in df.columns:
    df["age"] = (
        (df["tanggal_pasien_masuk_rs"] - df["tanggal_lahir"]).dt.days / 365
    )

if "tanggal_pasien_masuk_rs" in df.columns and "tanggal_efektif_polis" in df.columns:
    df["tenure_days"] = (
        (df["tanggal_pasien_masuk_rs"] - df["tanggal_efektif_polis"]).dt.days
    )

if "tanggal_pasien_keluar_rs" in df.columns and "tanggal_pasien_masuk_rs" in df.columns:
    df["los"] = (
        (df["tanggal_pasien_keluar_rs"] - df["tanggal_pasien_masuk_rs"]).dt.days
    )

if (
    "nominal_klaim_yang_disetujui" in df.columns and
    "nominal_biaya_rs_yang_terjadi" in df.columns
):
    df["claim_ratio"] = (
        df["nominal_klaim_yang_disetujui"] /
        df["nominal_biaya_rs_yang_terjadi"]
    )

if "tanggal_pembayaran_klaim" in df.columns:
    df["year_month"] = df["tanggal_pembayaran_klaim"].dt.to_period("M")

# ============================================================
# TRUE EXPOSURE CALCULATION
# ============================================================

if "tanggal_efektif_polis" in polis.columns:

    all_months = pd.period_range(
        df["year_month"].min(),
        df["year_month"].max(),
        freq="M"
    )

    exposure_list = []

    for month in all_months:
        active_policies = polis[
            polis["tanggal_efektif_polis"].dt.to_period("M") <= month
        ]["nomor_polis"].nunique()

        exposure_list.append({
            "year_month": month,
            "exposure": active_policies
        })

    exposure_monthly = pd.DataFrame(exposure_list)

    df = df.merge(exposure_monthly, on="year_month", how="left")

# ============================================================
# STABILITY FEATURES
# ============================================================

if "age" in df.columns:
    df["age_bucket"] = pd.cut(
        df["age"],
        bins=[0,30,45,60,100],
        labels=["young","adult","mature","senior"]
    )

if "inpatient_outpatient" in df.columns:
    df["is_inpatient"] = (
        df["inpatient_outpatient"]
        .astype(str)
        .str.lower()
        .str.contains("in")
        .astype(int)
    )

# ============================================================
# FINAL CHECK
# ============================================================

print("\nFinal Data Info:")
print(df.info())

if "year_month" in df.columns:
    print("\nUnique Months:", df["year_month"].nunique())

if "exposure" in df.columns:
    print("\nExposure Sample:")
    print(df[["year_month","exposure"]].drop_duplicates().head())

print("\nSTAGE 1 COMPLETE â€” SAFE & FULLY FIXED")


Initial Klaim shape : (4627, 13)
Initial Polis shape : (4096, 6)

Tanggal efektif polis range:
2011-12-05 00:00:00
2018-02-20 00:00:00
Klaim shape after cleaning : (4579, 13)
Merged shape : (4579, 18)

Final Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4579 entries, 0 to 4578
Data columns (total 26 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   claim_id                       4579 non-null   object        
 1   nomor_polis                    4579 non-null   object        
 2   reimburse_cashless             4579 non-null   object        
 3   inpatient_outpatient           4544 non-null   object        
 4   icd_diagnosis                  4575 non-null   object        
 5   icd_description                4575 non-null   object        
 6   status_klaim                   4579 non-null   object        
 7   tanggal_pembayaran_klaim       4579 non-null   datetime64[ns]
 8  

# TIME-SERIES DATASET ENGINEERING

In [3]:
# ============================================================
# STAGE 2 â€” TIME SERIES ENGINEERING (SMALL DATA OPTIMIZED)
# ============================================================

import pandas as pd
import numpy as np

# ============================================================
# 1. MONTHLY AGGREGATION
# ============================================================

agg_dict = {
    "nominal_klaim_yang_disetujui": "sum",
    "claim_id": "count",
    "age": "mean",
    "tenure_days": "mean",
    "los": "mean",
    "claim_ratio": "mean"
}

monthly = (
    df.groupby("year_month")
      .agg(agg_dict)
      .reset_index()
)

monthly = monthly.rename(columns={
    "nominal_klaim_yang_disetujui": "total_claim",
    "claim_id": "frequency",
    "age": "avg_age",
    "tenure_days": "avg_tenure",
    "los": "avg_los",
    "claim_ratio": "avg_claim_ratio"
})

# ============================================================
# 2. BASIC METRICS
# ============================================================

monthly["severity"] = monthly["total_claim"] / monthly["frequency"]

monthly = monthly.sort_values("year_month").reset_index(drop=True)
monthly["year_month_dt"] = monthly["year_month"].dt.to_timestamp()

monthly["month_index"] = np.arange(len(monthly))
monthly["month"] = monthly["year_month_dt"].dt.month

monthly["month_sin"] = np.sin(2 * np.pi * monthly["month"] / 12)
monthly["month_cos"] = np.cos(2 * np.pi * monthly["month"] / 12)

# ============================================================
# 3. CORE LAG FEATURES (MINIMAL & STRONG)
# ============================================================

monthly["freq_lag1"] = monthly["frequency"].shift(1)
monthly["sev_lag1"] = monthly["severity"].shift(1)
monthly["total_lag1"] = monthly["total_claim"].shift(1)

# ============================================================
# 4. EXPANDING MEAN (VERY IMPORTANT FOR SMALL DATA)
# ============================================================

monthly["freq_expanding"] = (
    monthly["frequency"].shift(1).expanding().mean()
)

monthly["sev_expanding"] = (
    monthly["severity"].shift(1).expanding().mean()
)

monthly["total_expanding"] = (
    monthly["total_claim"].shift(1).expanding().mean()
)

# ============================================================
# 5. STABLE ROLLING (ONLY 3)
# ============================================================

monthly["freq_roll3"] = (
    monthly["frequency"].shift(1).rolling(3).mean()
)

monthly["sev_roll3"] = (
    monthly["severity"].shift(1).rolling(3).mean()
)

# ============================================================
# 6. STABILIZED GROWTH (CLIPPED)
# ============================================================

monthly["freq_growth"] = (
    monthly["frequency"].pct_change().shift(1)
)

monthly["sev_growth"] = (
    monthly["severity"].pct_change().shift(1)
)

monthly["freq_growth"] = monthly["freq_growth"].clip(-1, 1)
monthly["sev_growth"] = monthly["sev_growth"].clip(-1, 1)

# ============================================================
# 7. SHRINKAGE FEATURE (ANTI-OVERFIT)
# ============================================================

global_freq_mean = monthly["frequency"].mean()
global_sev_mean = monthly["severity"].mean()

monthly["freq_shrink"] = (
    0.7 * monthly["freq_expanding"] +
    0.3 * global_freq_mean
)

monthly["sev_shrink"] = (
    0.7 * monthly["sev_expanding"] +
    0.3 * global_sev_mean
)

# ============================================================
# 8. DROP NA
# ============================================================

monthly = monthly.dropna().reset_index(drop=True)

# ============================================================
# FINAL CHECK
# ============================================================

print("Final Monthly Dataset Shape:", monthly.shape)
print("Time Range:",
      monthly["year_month"].min(),
      "to",
      monthly["year_month"].max())

print("\nColumns:")
print(monthly.columns.tolist())

print("\nPreview:")
print(monthly.head())

print("\nSTAGE 2 COMPLETE â€” SMALL DATA OPTIMIZED")


Final Monthly Dataset Shape: (21, 25)
Time Range: 2024-04 to 2025-12

Columns:
['year_month', 'total_claim', 'frequency', 'avg_age', 'avg_tenure', 'avg_los', 'avg_claim_ratio', 'severity', 'year_month_dt', 'month_index', 'month', 'month_sin', 'month_cos', 'freq_lag1', 'sev_lag1', 'total_lag1', 'freq_expanding', 'sev_expanding', 'total_expanding', 'freq_roll3', 'sev_roll3', 'freq_growth', 'sev_growth', 'freq_shrink', 'sev_shrink']

Preview:
  year_month   total_claim  frequency    avg_age   avg_tenure   avg_los  \
0    2024-04  9.281203e+09        218  57.657471  3361.160550  1.050459   
1    2024-05  1.103847e+10        233  57.683685  3327.296137  1.158798   
2    2024-06  1.127720e+10        221  56.691502  3417.877828  1.389140   
3    2024-07  1.159773e+10        202  58.999864  3434.153465  1.608911   
4    2024-08  1.895989e+10        283  58.629827  3470.325088  1.572438   

   avg_claim_ratio      severity year_month_dt  month_index  ...  \
0         0.914381  4.257433e+07    2

# MODEL DEVELOPMENT

In [4]:
# ============================================================
# STAGE 3 â€” SMALL DATA OPTIMIZED MODEL (FINAL SAFE VERSION)
# ============================================================

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.linear_model import Ridge

# ============================================================
# 1. TIME-BASED SPLIT
# ============================================================

train = monthly[monthly["year_month"] < "2025-04"].copy()
valid = monthly[(monthly["year_month"] >= "2025-04") & 
                (monthly["year_month"] < "2025-08")].copy()

print("Train size:", len(train))
print("Valid size:", len(valid))

if len(valid) == 0:
    raise ValueError("Validation set kosong.")

# ============================================================
# 2. FEATURE SET (SIMPLIFIED â€” SMALL DATA FRIENDLY)
# ============================================================

features = [
    "month_index",
    "month_sin",
    "month_cos",
    "freq_lag1",
    "sev_lag1",
    "total_lag1",
    "freq_roll3",
    "sev_roll3"
]

features = [f for f in features if f in monthly.columns]

X_train = train[features]
X_valid = valid[features]

# ============================================================
# 3. FREQUENCY MODEL â€” POISSON LIGHTGBM (SMALL TREE)
# ============================================================

y_train_freq = train["frequency"]
y_valid_freq = valid["frequency"]

model_freq = lgb.LGBMRegressor(
    objective="poisson",
    n_estimators=40,        # ðŸ”¥ kecil
    learning_rate=0.1,
    num_leaves=4,
    max_depth=2,
    min_child_samples=3,
    reg_lambda=5,
    random_state=42
)

model_freq.fit(X_train, y_train_freq)

freq_pred_valid = model_freq.predict(X_valid)
freq_pred_valid = np.clip(freq_pred_valid, 1, None)

# ============================================================
# 4. SEVERITY MODEL â€” RIDGE REGRESSION (STABLE)
# ============================================================

y_train_sev = np.log1p(train["severity"])
y_valid_sev = valid["severity"]

model_sev = Ridge(alpha=10)   # ðŸ”¥ strong regularization
model_sev.fit(X_train, y_train_sev)

sev_pred_valid = np.expm1(model_sev.predict(X_valid))
sev_pred_valid = np.clip(sev_pred_valid, 1, None)

# ============================================================
# 5. TOTAL CLAIM â€” ACTUARIAL
# ============================================================

total_pred_valid = freq_pred_valid * sev_pred_valid

# ============================================================
# 6. SAFE MAPE
# ============================================================

def safe_mape(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    mask = y_true != 0
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask]))

mape_freq = safe_mape(valid["frequency"], freq_pred_valid)
mape_sev = safe_mape(valid["severity"], sev_pred_valid)
mape_total = safe_mape(valid["total_claim"], total_pred_valid)

final_score = (mape_freq + mape_sev + mape_total) / 3

print("\nMAPE Frequency :", round(mape_freq, 4))
print("MAPE Severity  :", round(mape_sev, 4))
print("MAPE Total     :", round(mape_total, 4))
print("Final Score    :", round(final_score, 4))

print("\nSTAGE 3 COMPLETE â€” SMALL DATA OPTIMIZED")

Train size: 12
Valid size: 4
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001156 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 40
[LightGBM] [Info] Number of data points in the train set: 12, number of used features: 8
[LightGBM] [Info] Start training from score 5.526780

MAPE Frequency : 0.1738
MAPE Severity  : 0.4136
MAPE Total     : 0.4532
Final Score    : 0.3469

STAGE 3 COMPLETE â€” SMALL DATA OPTIMIZED


# TOTAL CLAIM OPTIMIZATION & VALIDATION

In [5]:
# ============================================================
# STAGE 4 â€” TOTAL CLAIM OPTIMIZATION (SMALL DATA MASTER v2)
# ============================================================

import numpy as np
from sklearn.linear_model import Ridge
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# ============================================================
# 1. SAFE MAPE
# ============================================================

def safe_mape(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    mask = y_true != 0
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask]))

# ============================================================
# 2. ACTUARIAL (FROM STAGE 3)
# ============================================================

total_pred_valid_actuarial = freq_pred_valid * sev_pred_valid

# ============================================================
# 3. DIRECT RIDGE TOTAL (LOG STABLE)
# ============================================================

features_total = [
    "month_index",
    "month_sin",
    "month_cos",
    "total_lag1",
    "freq_lag1",
    "sev_lag1"
]

features_total = [f for f in features_total if f in monthly.columns]

X_train_total = train[features_total]
X_valid_total = valid[features_total]

y_train_total = np.log1p(train["total_claim"])
y_valid_total = valid["total_claim"]

ridge_total = Ridge(alpha=50)   # sedikit lebih regularized
ridge_total.fit(X_train_total, y_train_total)

total_pred_valid_ridge = np.expm1(ridge_total.predict(X_valid_total))
total_pred_valid_ridge = np.clip(total_pred_valid_ridge, 1, None)

ridge_features = features_total.copy()

# ============================================================
# 4. HOLT TREND MODEL (DAMPED)
# ============================================================

holt_model = ExponentialSmoothing(
    train["total_claim"],
    trend="add",
    damped_trend=True,
    seasonal=None
).fit(optimized=True)

total_pred_valid_holt = holt_model.forecast(len(valid))
total_pred_valid_holt = np.clip(total_pred_valid_holt, 1, None)

# ============================================================
# 5. EVALUATE INDIVIDUAL MODELS
# ============================================================

models = {
    "Actuarial": total_pred_valid_actuarial,
    "Ridge": total_pred_valid_ridge,
    "Holt": total_pred_valid_holt
}

mape_scores = {k: safe_mape(y_valid_total, v) for k, v in models.items()}

print("\nIndividual MAPE:")
for k, v in mape_scores.items():
    print(k, ":", round(v, 4))

# ============================================================
# 6. STABLE BLEND (SOFT INVERSE ERROR)
# ============================================================

# gunakan soft weighting agar tidak overfit valid
epsilon = 0.02
weights = {k: 1 / (v + epsilon) for k, v in mape_scores.items()}

total_weight = sum(weights.values())
weights = {k: v / total_weight for k, v in weights.items()}

print("\nBlend Weights:")
for k, v in weights.items():
    print(k, ":", round(v, 3))

# ============================================================
# 7. FINAL BLEND
# ============================================================

total_pred_valid_blend = sum(
    weights[k] * models[k] for k in models
)

mape_total_blend = safe_mape(y_valid_total, total_pred_valid_blend)

print("\nMAPE Total (Final Blend):", round(mape_total_blend, 4))

# ============================================================
# 8. FINAL COMPETITION SCORE
# ============================================================

mape_freq_final = safe_mape(valid["frequency"], freq_pred_valid)
mape_sev_final  = safe_mape(valid["severity"], sev_pred_valid)
mape_total_final = mape_total_blend

final_score = (mape_freq_final + mape_sev_final + mape_total_final) / 3

print("\nFinal Competition Score")
print("MAPE Frequency :", round(mape_freq_final, 4))
print("MAPE Severity  :", round(mape_sev_final, 4))
print("MAPE Total     :", round(mape_total_final, 4))
print("Final Score    :", round(final_score, 4))

print("\nSTAGE 4 COMPLETE â€” STABLE SMALL DATA BLEND")



Individual MAPE:
Actuarial : 0.4532
Ridge : 0.411
Holt : 0.4557

Blend Weights:
Actuarial : 0.323
Ridge : 0.355
Holt : 0.322

MAPE Total (Final Blend): 0.417

Final Competition Score
MAPE Frequency : 0.1738
MAPE Severity  : 0.4136
MAPE Total     : 0.417
Final Score    : 0.3348

STAGE 4 COMPLETE â€” STABLE SMALL DATA BLEND


# TEST PREDICTION & KAGGLE SUBMISSION

In [6]:
# ============================================================
# STAGE 5 â€” FINAL SUBMISSION GENERATOR (MASTER SAFE VERSION)
# ============================================================

import pandas as pd
import numpy as np

BASE_PATH = "/kaggle/input/datasets/dimaspashaakrilian/dsc-itb/"
sample_sub = pd.read_csv(BASE_PATH + "sample_submission.csv")

# ============================================================
# EXTRACT FUTURE MONTHS (SORTED)
# ============================================================

sample_sub["year"]  = sample_sub["id"].str.split("_").str[0]
sample_sub["month"] = sample_sub["id"].str.split("_").str[1]

sample_sub["month_key"] = sample_sub["year"] + "-" + sample_sub["month"]

future_periods = (
    pd.PeriodIndex(sample_sub["month_key"], freq="M")
      .unique()
      .sort_values()
)

# ============================================================
# COPY HISTORICAL DATA
# ============================================================

future_df = monthly.copy().sort_values("year_month").reset_index(drop=True)
predictions = {}

# ============================================================
# FEATURE LIST (SAME AS STAGE 3)
# ============================================================

features = [
    "month_index",
    "month_sin",
    "month_cos",
    "freq_lag1",
    "sev_lag1",
    "total_lag1",
    "freq_roll3",
    "sev_roll3"
]

features = [f for f in features if f in monthly.columns]

# ============================================================
# RECURSIVE FORECASTING LOOP
# ============================================================

for period in future_periods:

    last_row = future_df.iloc[-1]

    new_row = {}

    # ---- TIME FEATURES ----
    new_row["year_month"] = period
    new_row["year_month_dt"] = period.to_timestamp()
    new_row["month_index"] = last_row["month_index"] + 1
    new_row["month"] = period.month

    new_row["month_sin"] = np.sin(2 * np.pi * period.month / 12)
    new_row["month_cos"] = np.cos(2 * np.pi * period.month / 12)

    # ---- LAG FEATURES ----
    new_row["freq_lag1"] = last_row["frequency"]
    new_row["sev_lag1"] = last_row["severity"]
    new_row["total_lag1"] = last_row["total_claim"]

    new_row["freq_roll3"] = future_df["frequency"].tail(3).mean()
    new_row["sev_roll3"]  = future_df["severity"].tail(3).mean()

    # Convert to DataFrame
    temp = pd.DataFrame([new_row])

    # Ensure no missing columns
    for col in features:
        if col not in temp.columns:
            temp[col] = 0

    temp = temp.fillna(0)

    X_temp = temp[features]

    # =====================================================
    # PREDICT FREQUENCY (POISSON â†’ NO EXP)
    # =====================================================

    freq_pred = model_freq.predict(X_temp)[0]
    freq_pred = max(freq_pred, 1)

    # =====================================================
    # PREDICT SEVERITY (RIDGE LOG1P)
    # =====================================================

    sev_pred = np.expm1(model_sev.predict(X_temp))[0]
    sev_pred = max(sev_pred, 1)

    # =====================================================
    # TOTAL CLAIM
    # =====================================================

    total_actuarial = freq_pred * sev_pred

    # If Stage 4 ridge_total exists
    if "ridge_total" in globals():

        # Ensure feature order EXACT SAME
        for col in ridge_features:
            if col not in temp.columns:
                temp[col] = 0

        X_ridge = temp[ridge_features]
        X_ridge = X_ridge.fillna(0)

        total_ridge = np.expm1(
            ridge_total.predict(X_ridge)
        )[0]

        total_ridge = max(total_ridge, 1)
        total_pred = total_ridge

    else:
        total_pred = total_actuarial

    # =====================================================
    # STORE BACK FOR NEXT ITERATION
    # =====================================================

    new_row["frequency"] = freq_pred
    new_row["severity"] = sev_pred
    new_row["total_claim"] = total_pred

    future_df = pd.concat(
        [future_df, pd.DataFrame([new_row])],
        ignore_index=True
    )

    # =====================================================
    # SAVE USING EXACT FORMAT YYYY_MM
    # =====================================================

    month_str = f"{period.year}_{str(period.month).zfill(2)}"

    predictions[f"{month_str}_Claim_Frequency"] = freq_pred
    predictions[f"{month_str}_Claim_Severity"]  = sev_pred
    predictions[f"{month_str}_Total_Claim"]     = total_pred


# ============================================================
# BUILD FINAL SUBMISSION
# ============================================================

submission = sample_sub.copy()
submission["value"] = submission["id"].map(predictions)

missing = submission["value"].isna().sum()

if missing > 0:
    print("ERROR: Ada ID tidak cocok sebanyak:", missing)
else:
    print("All IDs matched perfectly.")

submission = submission[["id", "value"]]
submission.to_csv("submission.csv", index=False)

print("\nSubmission file created successfully.")
print(submission.head(9))

All IDs matched perfectly.

Submission file created successfully.
                        id         value
0  2025_08_Claim_Frequency  2.519411e+02
1   2025_08_Claim_Severity  1.670376e+08
2      2025_08_Total_Claim  5.855885e+10
3  2025_09_Claim_Frequency  2.509336e+02
4   2025_09_Claim_Severity  7.313531e+06
5      2025_09_Total_Claim  2.619107e+08
6  2025_10_Claim_Frequency  2.606578e+02
7   2025_10_Claim_Severity  2.283074e+07
8      2025_10_Total_Claim  2.186332e+10
