# üè• Healthcare Insurance Cost Analysis  
## üìä Notebook 11 ‚Äì XGBoost Regression

| Feild | Description |
|-------|-------------|
|**Author:** |Robert Steven Elliott  |
|**Course:** |Code Institute ‚Äì Data Analytics with AI Bootcamp |  
|**Project Type:** |Individual Formative Project  | 
|**Date:** |October 2025  |

---

## Change Working Directory

In [15]:
import sys
from pathlib import Path
PROJECT_ROOT = Path.cwd().parent
sys.path.insert(0, str(PROJECT_ROOT))
print("‚úÖ Working directory set to project root:", PROJECT_ROOT)

‚úÖ Working directory set to project root: /home/robert/Projects/health-insurance-cost-analysis


## Import Libraries and Dataset

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import seaborn as sns
from scipy import stats
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
import xgboost as xgb

from utils.data_handler import load_data, data_overview, clean_data

pd.set_option('display.max_columns', None)
sns.set_theme(style="whitegrid")

input_path = PROJECT_ROOT / "data" / "final" / "insurance_final.csv"
figure_path = PROJECT_ROOT / "figures"

if not figure_path.exists():
    figure_path.mkdir(parents=True, exist_ok=True)
    print(f"‚úÖ Created figure directory at: {figure_path}")

df = load_data(input_path)
df = clean_data(df, categorical_cols=['sex', 'smoker', 'region', 'bmi_category', 'age_group', 'family_size_category'])
data_overview(df)
print("‚úÖ Data loaded successfully.")
df.head()

DataFrame Shape: (1337, 10)

Data Types:
 age                        int64
sex                     category
bmi                      float64
children                   int64
smoker                  category
region                  category
charges                  float64
bmi_category            category
age_group               category
family_size_category    category
dtype: object

Missing Values:
 age                     0
sex                     0
bmi                     0
children                0
smoker                  0
region                  0
charges                 0
bmi_category            0
age_group               0
family_size_category    0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1337 entries, 0 to 1336
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   age                   1337 non-null   int64   
 1   sex                   1337 non-null   category
 2   bmi

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,bmi_category,age_group,family_size_category
0,19,female,27.9,0,yes,southwest,16884.924,Overweight,18-25,No Children
1,18,male,33.77,1,no,southeast,1725.5523,Obese,18-25,Small Family
2,28,male,33.0,3,no,southeast,4449.462,Obese,26-35,Medium Family
3,33,male,22.705,0,no,northwest,21984.47061,Normal,26-35,No Children
4,32,male,28.88,0,no,northwest,3866.8552,Overweight,26-35,No Children


## Split + preprocess (scaling numeric + one-hot categoricals)

In [17]:
NUM_COLS = ["age", "bmi", "children"]
CAT_COLS = ["sex", "smoker", "region"]
TARGET = "charges"

X = df[NUM_COLS + CAT_COLS].copy()
y = df[TARGET].astype("float32").copy()
y_log = np.log1p(y).astype("float32")

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y_log, test_size=0.2, random_state=42
)

# Train/validation split (for early stopping)
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline([("scaler", StandardScaler())]), NUM_COLS),
        ("cat", OneHotEncoder(handle_unknown="ignore"), CAT_COLS),
    ],
    remainder="drop",
)

X_tr_p = preprocess.fit_transform(X_tr)
X_val_p = preprocess.transform(X_val)
X_test_p = preprocess.transform(X_test)

print("Train:", X_tr_p.shape, "Valid:", X_val_p.shape, "Test:", X_test_p.shape)


Train: (855, 11) Valid: (214, 11) Test: (268, 11)


## Train XGBoost (with early stopping)

In [18]:
xgb_model = xgb.XGBRegressor(
    n_estimators=5000,
    learning_rate=0.03,
    max_depth=4,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=0.0,
    reg_lambda=1.0,
    objective="reg:squarederror",
    random_state=42,
    n_jobs=-1,
)

xgb_model.fit(
    X_tr_p, y_tr,
    eval_set=[(X_val_p, y_val)],
    verbose=200
)


[0]	validation_0-rmse:0.87874
[200]	validation_0-rmse:0.38023
[400]	validation_0-rmse:0.39721
[600]	validation_0-rmse:0.41171
[800]	validation_0-rmse:0.42264
[1000]	validation_0-rmse:0.42777
[1200]	validation_0-rmse:0.43422
[1400]	validation_0-rmse:0.43908
[1600]	validation_0-rmse:0.44472
[1800]	validation_0-rmse:0.44721
[2000]	validation_0-rmse:0.45034
[2200]	validation_0-rmse:0.45334
[2400]	validation_0-rmse:0.45565
[2600]	validation_0-rmse:0.45815
[2800]	validation_0-rmse:0.45984
[3000]	validation_0-rmse:0.46126
[3200]	validation_0-rmse:0.46264
[3400]	validation_0-rmse:0.46388
[3600]	validation_0-rmse:0.46496
[3800]	validation_0-rmse:0.46609
[4000]	validation_0-rmse:0.46685
[4200]	validation_0-rmse:0.46773
[4400]	validation_0-rmse:0.46846
[4600]	validation_0-rmse:0.46908
[4800]	validation_0-rmse:0.46985
[4999]	validation_0-rmse:0.47060


0,1,2
,"objective  objective: typing.Union[str, xgboost.sklearn._SklObjWProto, typing.Callable[[typing.Any, typing.Any], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType] Specify the learning task and the corresponding learning objective or a custom objective function to be used. For custom objective, see :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more information, along with the end note for function signatures.",'reg:squarederror'
,"base_score  base_score: typing.Union[float, typing.List[float], NoneType] The initial prediction score of all instances, global bias.",
,booster,
,"callbacks  callbacks: typing.Optional[typing.List[xgboost.callback.TrainingCallback]] List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using :ref:`Callback API `. .. note::  States in callback are not preserved during training, which means callback  objects can not be reused for multiple training sessions without  reinitialization or deepcopy. .. code-block:: python  for params in parameters_grid:  # be sure to (re)initialize the callbacks before each run  callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]  reg = xgboost.XGBRegressor(**params, callbacks=callbacks)  reg.fit(X, y)",
,colsample_bylevel  colsample_bylevel: typing.Optional[float] Subsample ratio of columns for each level.,
,colsample_bynode  colsample_bynode: typing.Optional[float] Subsample ratio of columns for each split.,
,colsample_bytree  colsample_bytree: typing.Optional[float] Subsample ratio of columns when constructing each tree.,0.9
,"device  device: typing.Optional[str] .. versionadded:: 2.0.0 Device ordinal, available options are `cpu`, `cuda`, and `gpu`.",
,"early_stopping_rounds  early_stopping_rounds: typing.Optional[int] .. versionadded:: 1.6.0 - Activates early stopping. Validation metric needs to improve at least once in  every **early_stopping_rounds** round(s) to continue training. Requires at  least one item in **eval_set** in :py:meth:`fit`. - If early stopping occurs, the model will have two additional attributes:  :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the  :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal  number of trees during inference. If users want to access the full model  (including trees built after early stopping), they can specify the  `iteration_range` in these inference methods. In addition, other utilities  like model plotting can also use the entire model. - If you prefer to discard the trees after `best_iteration`, consider using the  callback function :py:class:`xgboost.callback.EarlyStopping`. - If there's more than one item in **eval_set**, the last entry will be used for  early stopping. If there's more than one metric in **eval_metric**, the last  metric will be used for early stopping.",
,enable_categorical  enable_categorical: bool See the same parameter of :py:class:`DMatrix` for details.,False


## Evaluate (log scale + original ¬£ scale)

In [19]:
def evaluate_log_target(y_true_log, y_pred_log, label=""):
    mae_log = mean_absolute_error(y_true_log, y_pred_log)
    rmse_log = root_mean_squared_error(y_true_log, y_pred_log)
    r2 = r2_score(y_true_log, y_pred_log)

    y_true = np.expm1(y_true_log)
    y_pred = np.expm1(y_pred_log)

    mae = mean_absolute_error(y_true, y_pred)
    rmse = root_mean_squared_error(y_true, y_pred)

    print(f"== {label} ==")
    print(f"MAE (log):  {mae_log:,.4f}")
    print(f"RMSE (log): {rmse_log:,.4f}")
    print(f"R¬≤ (log):   {r2:,.4f}")
    print(f"MAE:        ¬£{mae:,.2f}")
    print(f"RMSE:       ¬£{rmse:,.2f}")

pred_test = xgb_model.predict(X_test_p)
evaluate_log_target(y_test, pred_test, label="XGBoost")

== XGBoost ==
MAE (log):  0.2587
RMSE (log): 0.4539
R¬≤ (log):   0.7780
MAE:        ¬£3,639.97
RMSE:       ¬£12,061.14


## Feature importance (gain + top features)

In [20]:
ohe = preprocess.named_transformers_["cat"]
num_names = NUM_COLS
cat_names = list(ohe.get_feature_names_out(CAT_COLS))
feature_names = num_names + cat_names

importances = xgb_model.feature_importances_
imp = pd.DataFrame({"feature": feature_names, "importance": importances})
imp = imp.sort_values("importance", ascending=False)

display(imp.head(20))
fig = px.bar(
    imp.head(20),
    x="importance",
    y="feature",
    orientation="h",
    title="Top 20 Feature Importances - XGBoost Regressor",
)
fig.update_layout(yaxis={"categoryorder": "total ascending"})
fig.write_image(figure_path / "xgboost_feature_importances.png", scale=2)
fig.show()


Unnamed: 0,feature,importance
5,smoker_no,0.543724
6,smoker_yes,0.291962
0,age,0.053058
2,children,0.026457
7,region_northeast,0.015133
4,sex_male,0.01452
1,bmi,0.013034
9,region_southeast,0.012125
8,region_northwest,0.010832
3,sex_female,0.009916


## Save model + preprocessing

In [21]:


model_foler = PROJECT_ROOT / "models"
Path(model_foler).mkdir(exist_ok=True)

# Save model in JSON (portable)
xgb_model.get_booster().save_model(model_foler / "xgboost_charges.json")

# Save preprocessing pipeline
joblib.dump(preprocess, model_foler / "xgb_preprocess.joblib")

print("Saved: models/xgboost_charges.json and models/xgb_preprocess.joblib")


Saved: models/xgboost_charges.json and models/xgb_preprocess.joblib
