In [7]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import joblib

from sklearn.linear_model import PoissonRegressor, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_poisson_deviance
import warnings
# Installs a global "ignore" filter: every warning raised after this point is
# silenced for the rest of the session, whichever library emits it.
# NOTE(review): this also hides warnings that may matter when fitting models;
# consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [2]:
# 1) Load dataset
# DATA_PATH is resolved relative to the notebook's working directory;
# point it at the actual CSV before running.
DATA_PATH = "cleaned_medical_insurance.csv"
df = pd.read_csv(DATA_PATH)

# Shape is reported for the file as loaded, i.e. before the age filter below.
print("Shape:", df.shape)

# Keep only rows with 18 <= age < 90 (rows with a missing age fail both tests
# and are dropped as well).
age_in_range = df["age"].ge(18) & df["age"].lt(90)
df = df.loc[age_in_range]
df.head()


Shape: (100000, 54)


Unnamed: 0.1,Unnamed: 0,age,sex,region,urban_rural,income,education,marital_status,employment_status,household_size,...,liver_disease,arthritis,mental_health,proc_imaging_count,proc_surgery_count,proc_physio_count,proc_consult_count,proc_lab_count,is_high_risk,had_major_procedure
0,0,52,Female,North,Suburban,22700.0,Doctorate,Married,Retired,3,...,0,1,0,1,0,2,0,1,0,0
1,1,79,Female,North,Urban,12800.0,No HS,Married,Employed,3,...,0,1,1,0,0,1,0,1,1,0
2,2,68,Male,North,Rural,40700.0,HS,Married,Retired,5,...,0,0,1,1,0,2,1,0,1,0
4,4,53,Male,Central,Suburban,89600.0,Doctorate,Married,Self-employed,2,...,0,1,0,2,0,1,1,0,1,0
5,5,63,Female,North,Rural,305000.0,HS,Single,Employed,3,...,0,0,0,0,0,0,0,1,1,0


In [3]:
# 2) Locate the claims-count target column
# Candidate names in priority order; the first one present in df wins.
claim_targets = [
    "claims_count", "num_claims", "total_claims", "claim_frequency",
    "claim_count", "n_claims", "claims"
]

target = next((name for name in claim_targets if name in df.columns), None)

# Fail loudly rather than continue without a target.
if target is None:
    raise ValueError(
        "No claims-count column found. "
        "Check df.columns and update claim_targets."
    )

# Separate features from the target; both are copies, not views of df.
X = df.drop(columns=[target]).copy()
y = df[target].copy()

print("Target column:", target)
print(y.describe())


Target column: claims_count
count    96531.000000
mean         1.623748
std          2.033106
min          0.000000
25%          0.000000
50%          1.000000
75%          2.000000
max         23.000000
Name: claims_count, dtype: float64


In [4]:
# 3) Remove ID-like columns
# A column counts as an identifier only when "id" is a whole
# underscore-delimited token of its name ("id", "person_id", "id_number").
# A plain substring test ('"id" in name') would also discard genuine features
# whose names merely contain those two letters (names containing e.g.
# "kidney", "provider", "paid", "valid").
# Columns auto-named "Unnamed: N" by pandas are dropped too: in the df.head()
# preview their values mirror the row index, so they look like saved row
# indexes rather than predictors (NOTE(review): confirm against the raw file).
id_like = [
    c for c in X.columns
    if "id" in str(c).lower().split("_")
    or str(c).lower().startswith("unnamed:")
]
X = X.drop(columns=id_like, errors="ignore")

# 3.5) Remove highly correlated premium variables
# Keep only ONE premium variable to reduce redundancy.
cols_to_drop = ["monthly_premium"]  # keeping only annual_premium
# errors="ignore" already skips names that are absent from X, so no separate
# membership filter is needed.
X = X.drop(columns=cols_to_drop, errors="ignore")

In [5]:
# 4) Train-test split
# 80/20 hold-out split; the fixed random_state makes the partition repeatable.
split_options = {"test_size": 0.20, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [6]:
# 5) Preprocessing
# Column groups are derived from the training frame's dtypes.
numeric_dtypes = ["int64", "float64"]
categorical_dtypes = ["object", "category", "bool"]
num_cols = list(X_train.select_dtypes(include=numeric_dtypes).columns)
cat_cols = list(X_train.select_dtypes(include=categorical_dtypes).columns)

# Numeric branch: fill gaps with the median, then standardise.
numeric_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: fill gaps with the mode, then one-hot encode.
# handle_unknown="ignore" keeps transform from failing on unseen categories.
categorical_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group to its branch. The transformer is only constructed
# here; nothing is fitted in this cell.
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_pipe, num_cols),
        ("cat", categorical_pipe, cat_cols),
    ]
)

In [11]:
# Persist the raw (untransformed) splits and the preprocessing pipeline to the
# current working directory. File name -> object, written in this order.
artifacts = {
    "X_train.pkl": X_train,
    "X_test.pkl": X_test,
    "y_train.pkl": y_train,
    "y_test.pkl": y_test,
    "preprocess_pipeline.pkl": preprocess,
}

for file_name, payload in artifacts.items():
    joblib.dump(payload, file_name)

print("Saved: X_train.pkl, X_test.pkl, y_train.pkl, y_test.pkl, preprocess_pipeline.pkl")



Saved: X_train.pkl, X_test.pkl, y_train.pkl, y_test.pkl, preprocess_pipeline.pkl
