In [1]:
%pip install pandas numpy scikit-learn xgboost

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
import xgboost as xgb

In [3]:
# Load the dataset
df = pd.read_csv('insurance_dataset.csv')

In [4]:
# Create a copy of the dataframe for feature engineering
df_work = df.copy()

In [5]:
df_work.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,premium
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [6]:
df_work['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [7]:
# Feature 1: BMI Category
def bmi_category(bmi):
    if bmi < 18.5:
        return "underweight"
    elif bmi < 25:
        return "normal"
    elif bmi < 30:
        return "overweight"
    else:
        return "obese"

df_work["bmi_category"] = df_work["bmi"].apply(bmi_category)

In [8]:
# Feature 2: Smoking Risk
def smoking_risk(row):
    if row["smoker"] == "yes" and row["bmi"] > 30:
        return "high"
    elif row["smoker"] == "yes" or row["bmi"] > 27:
        return "medium"
    else:
        return "low"

df_work["smoking_risk"] = df_work.apply(smoking_risk, axis=1)

In [9]:
df_work.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,premium,bmi_category,smoking_risk
0,19,female,27.9,0,yes,southwest,16884.92,overweight,medium
1,18,male,33.8,1,no,southeast,1725.55,obese,medium
2,28,male,33.0,3,no,southeast,4449.46,obese,medium
3,33,male,22.7,0,no,northwest,21984.47,normal,low
4,32,male,28.9,0,no,northwest,3866.86,overweight,medium


In [10]:
# Select features and target
X = df_work[["age", "sex", "children", "region", "bmi_category", "smoking_risk"]]
y = df_work["premium"]

In [11]:
# Define categorical and numeric features
categorical_features = ["sex", "region", "bmi_category", "smoking_risk"]
numeric_features = ["age", "children"]

In [12]:
# Create column transformer for OHE
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features),
        ("num", "passthrough", numeric_features)
    ]
)

In [13]:
# Create a pipeline with preprocessing and XGBoost regressor
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", xgb.XGBRegressor(random_state=42))
])

In [14]:
# Split data with 88% train and 12% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.9, random_state=1)

In [15]:
# Train the model
pipeline.fit(X_train, y_train)


0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [16]:
# Predict on test set
y_pred = pipeline.predict(X_test)



In [17]:
 # Calculate MAPE and RMSE
mape = mean_absolute_percentage_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))


In [18]:
# Print evaluation metrics
print(f"MAPE: {mape:.4f}")
print(f"RMSE: {rmse:.2f}")

MAPE: 0.4355
RMSE: 6937.83


In [19]:
# Save the trained pipeline
import pickle
with open("xgboost_insurance_model.pkl", "wb") as f:
    pickle.dump(pipeline, f)