# In [None]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from xgboost import XGBRegressor

# ------------------------------------------------------------
# Step 1: build the synthetic member and claim tables
# ------------------------------------------------------------
# NOTE: the order of the np.random calls below determines the generated
# data for the fixed seed; keep it stable when editing.


def _draw_dates(origin, count, max_offset_days):
    """Return `count` datetimes, each `origin` plus a random whole-day offset.

    Offsets are drawn one at a time from np.random.randint(0, max_offset_days),
    so the upper bound is exclusive.
    """
    return [
        origin + timedelta(days=np.random.randint(0, max_offset_days))
        for _ in range(count)
    ]


np.random.seed(42)

n_members = 2000
start_date = datetime(2015, 1, 1)
end_date = datetime(2025, 1, 1)

# Member attributes, drawn column by column.
member_ages = np.random.randint(20, 75, n_members)
member_genders = np.random.choice(["M", "F"], n_members)
member_plans = np.random.choice(
    ["Silver", "Gold", "Platinum"], n_members, p=[0.5, 0.3, 0.2]
)
# Join dates fall within 365*5 days of start_date.
member_join_dates = _draw_dates(start_date, n_members, 365 * 5)

members = pd.DataFrame({
    "member_id": range(1, n_members + 1),
    "age": member_ages,
    "gender": member_genders,
    "plan_type": member_plans,
    "join_date": member_join_dates,
})

# Claims: dates are spread over [start_date, end_date); each claim is
# assigned to a member id sampled with replacement.
n_claims = 10000
observation_days = (end_date - start_date).days
claim_dates = _draw_dates(start_date, n_claims, observation_days)
claim_member_ids = np.random.choice(members["member_id"], n_claims)
# Exponentially distributed amounts, truncated to whole units.
claim_amounts = np.random.exponential(scale=15000, size=n_claims).astype(int)

claims = pd.DataFrame({
    "claim_id": range(1, n_claims + 1),
    "member_id": claim_member_ids,
    "claim_date": claim_dates,
    "claim_amount": claim_amounts,
})

# ------------------------------------------------------------
# Step 2: flag high-cost claims
# ------------------------------------------------------------
# A claim counts as "high" when its amount is strictly greater than the
# 90th percentile of all claim amounts in `claims`.
threshold = claims["claim_amount"].quantile(0.9)
exceeds_threshold = claims["claim_amount"].gt(threshold)
claims["is_high_claim"] = exceeds_threshold.astype(int)

# --------------------------
# Step 3: historical (past 2 years) features & future (next 2 years) targets
# --------------------------

cutoff_date = datetime(2021, 1, 1)  # pretend we are training as of Jan 2021

# Use calendar-aware offsets: a fixed timedelta(days=365*2) is one day short
# whenever the window contains a leap year (2020 falls in the past window),
# which would silently drop claims dated exactly two years before the cutoff.
past_window_start = pd.Timestamp(cutoff_date) - pd.DateOffset(years=2)
future_window_end = pd.Timestamp(cutoff_date) + pd.DateOffset(years=2)

# Past 2 years (feature window): [past_window_start, cutoff_date)
past_claims = claims[(claims["claim_date"] >= past_window_start) & (claims["claim_date"] < cutoff_date)]
# Future 2 years (target window): [cutoff_date, future_window_end)
future_claims = claims[(claims["claim_date"] >= cutoff_date) & (claims["claim_date"] < future_window_end)]

# Aggregate past claims per member. Only members with at least one claim in
# the feature window get a row here.
past_features = past_claims.groupby("member_id").agg(
    past_claim_count=("claim_id", "count"),
    past_total_amount=("claim_amount", "sum"),
    past_high_claims=("is_high_claim", "sum"),
    avg_claim_amount=("claim_amount", "mean"),
    last_claim_date=("claim_date", "max")
).reset_index()

# Recency feature: whole days between the member's latest past claim and the cutoff.
past_features["days_since_last_claim"] = (cutoff_date - past_features["last_claim_date"]).dt.days
past_features = past_features.drop(columns="last_claim_date")

# Target: future high claim count & amount (sum over the target window)
future_targets = future_claims[future_claims["is_high_claim"] == 1].groupby("member_id").agg(
    future_high_claim_count=("claim_id", "count"),
    future_high_claim_amount=("claim_amount", "sum")
).reset_index()

# Merge features + targets + member info. The left joins keep every member,
# so members absent from either aggregate get NaN in those columns.
data = members.merge(past_features, on="member_id", how="left").merge(future_targets, on="member_id", how="left")

# Fill missing values AFTER the merge: this is the first point at which
# members with no past claims / no future high claims actually have NaNs.
# (Filling `past_features` before the merge has no effect, because the
# groupby result contains no rows for those members.)
data = data.fillna({
    # Features for members with no claims in the past window.
    "days_since_last_claim": 9999,  # sentinel for "no recent claim"
    "avg_claim_amount": 0,
    "past_claim_count": 0,
    "past_total_amount": 0,
    "past_high_claims": 0,
    # Targets for members with no high claims in the future window.
    "future_high_claim_count": 0,
    "future_high_claim_amount": 0,
})

# ------------------------------------------------------------
# Step 4: one-hot encode the categorical member attributes
# ------------------------------------------------------------
# drop_first=True omits the first level of each category.
data = pd.get_dummies(data, columns=["gender", "plan_type"], drop_first=True)

# ------------------------------------------------------------
# Step 5: separate features from targets and hold out a test set
# ------------------------------------------------------------
target_columns = ["future_high_claim_count", "future_high_claim_amount"]

# The identifier, both targets and the raw join date are excluded from the features.
X = data.drop(columns=["member_id", *target_columns, "join_date"])
y_count = data[target_columns[0]]
y_amount = data[target_columns[1]]

# A single call splits features and both targets on the same rows (80/20).
split_parts = train_test_split(X, y_count, y_amount, test_size=0.2, random_state=42)
X_train, X_test = split_parts[0], split_parts[1]
y_count_train, y_count_test = split_parts[2], split_parts[3]
y_amount_train, y_amount_test = split_parts[4], split_parts[5]

# ------------------------------------------------------------
# Step 6: fit one regressor per target
# ------------------------------------------------------------
# Both models share every hyperparameter except the objective.
shared_params = {"n_estimators": 200, "learning_rate": 0.05, "random_state": 42}

# Number of future high claims, using the Poisson count objective.
count_model = XGBRegressor(objective="count:poisson", **shared_params)
count_model.fit(X_train, y_count_train)

# Summed future high-claim amount, using the squared-error objective.
amount_model = XGBRegressor(objective="reg:squarederror", **shared_params)
amount_model.fit(X_train, y_amount_train)

# ------------------------------------------------------------
# Step 7: score both models on the held-out rows
# ------------------------------------------------------------
count_pred = count_model.predict(X_test)
amount_pred = amount_model.predict(X_test)

# Print MAE and R² for each model; the second header starts with a blank line.
evaluation_runs = (
    ("=== Claim COUNT Model ===", y_count_test, count_pred),
    ("\n=== Claim AMOUNT Model ===", y_amount_test, amount_pred),
)
for report_header, actual_values, predicted_values in evaluation_runs:
    print(report_header)
    print("MAE:", mean_absolute_error(actual_values, predicted_values))
    print("R² :", r2_score(actual_values, predicted_values))

# --------------------------
# Step 8: predict future high-claim count and cost for all members
# --------------------------
# Predictions are made on the full feature matrix X, which includes the rows
# the models were trained on.
data["pred_future_high_claim_count"] = count_model.predict(X)
data["pred_future_high_claim_amount"] = amount_model.predict(X)

# amount_model is fit on `future_high_claim_amount`, which is already the
# per-member SUM of high-claim amounts over the target window. Its prediction
# is therefore the expected total cost on its own; multiplying it by the
# predicted count would double-count claim frequency. The squared-error
# objective can produce negative values, so clip at 0 to keep the cost valid.
data["predicted_total_high_claim_cost"] = data["pred_future_high_claim_amount"].clip(lower=0)

# Display sample output
print("\nSample predictions:")
print(data[["member_id", "pred_future_high_claim_count", "pred_future_high_claim_amount", "predicted_total_high_claim_cost"]].head(10))
