# üè• Healthcare Insurance Cost Analysis  
## üìä Notebook 12 - PyTorch Tabular Regression

| Feild | Description |
|-------|-------------|
|**Author:** | Robert Steven Elliott  |
|**Course:** | Code Institute ‚Äì Data Analytics with AI Bootcamp |  
|**Project Type:** | Individual Formative Project  | 
|**Date:** | December 2025  |

---

## Surpress Warnings

In [1]:
import warnings
warnings.filterwarnings("ignore", message=".*cuda capability.*")
warnings.filterwarnings("ignore", message=".*not compatible with the current PyTorch installation.*")
warnings.filterwarnings("ignore", message=".*Please install PyTorch with a following CUDA configurations.*")

## Change Working Directory

In [2]:
import sys
from pathlib import Path
PROJECT_ROOT = Path.cwd().parent
sys.path.insert(0, str(PROJECT_ROOT))
print("‚úÖ Working directory set to project root:", PROJECT_ROOT)

‚úÖ Working directory set to project root: /home/robert/Projects/health-insurance-cost-analysis


## Load Libraries

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from utils.data_handler import load_data, data_overview, clean_data
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from math import inf
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
import joblib


## Load Data

In [4]:
input_path = PROJECT_ROOT / "data" / "final" / "insurance_final.csv"

df = load_data(input_path)
data_overview(df)
df = clean_data(df, categorical_cols=['sex', 'smoker', 'region'])
selected_features = [
    "age", "bmi", "children", "charges",
    "sex", "smoker", "region"
]
df_selected = df[selected_features]

DataFrame Shape: (1337, 10)

Data Types:
 age                       int64
sex                      object
bmi                     float64
children                  int64
smoker                   object
region                   object
charges                 float64
bmi_category             object
age_group                object
family_size_category     object
dtype: object

Missing Values:
 age                     0
sex                     0
bmi                     0
children                0
smoker                  0
region                  0
charges                 0
bmi_category            0
age_group               0
family_size_category    0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1337 entries, 0 to 1336
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   age                   1337 non-null   int64  
 1   sex                   1337 non-null   object 
 2   bmi              

## Train/validation/test split + preprocessing for embeddings

In [5]:

NUM_COLS = ["age", "bmi", "children"]
CAT_COLS = ["sex", "smoker", "region"]
TARGET = "charges"

df_model = df[NUM_COLS + CAT_COLS + [TARGET]].copy()

cat_maps = {}
for c in CAT_COLS:
    df_model[c] = df_model[c].astype("category")
    cat_maps[c] = dict(enumerate(df_model[c].cat.categories))
    df_model[c] = df_model[c].cat.codes.astype("int64")

X_num = df_model[NUM_COLS].values.astype("float32")
X_cat = df_model[CAT_COLS].values.astype("int64")
y = df_model[TARGET].values.astype("float32")
y_log = np.log1p(y).astype("float32")

Xn_train, Xn_temp, Xc_train, Xc_temp, y_train, y_temp = train_test_split(
    X_num, X_cat, y_log, test_size=0.3, random_state=42
)
Xn_valid, Xn_test, Xc_valid, Xc_test, y_valid, y_test = train_test_split(
    Xn_temp, Xc_temp, y_temp, test_size=0.5, random_state=42
)

scaler = StandardScaler()
Xn_train = scaler.fit_transform(Xn_train).astype("float32")
Xn_valid = scaler.transform(Xn_valid).astype("float32")
Xn_test  = scaler.transform(Xn_test).astype("float32")

cardinalities = {c: int(df_model[c].nunique()) for c in CAT_COLS}
print("Cardinalities:", cardinalities)


Cardinalities: {'sex': 2, 'smoker': 2, 'region': 4}


## 3. PyTorch Dataset/DataLoader

In [6]:
def cuda_is_compatible():
    if not torch.cuda.is_available():
        return False, "CUDA not available"

    try:
        cap = torch.cuda.get_device_capability(0)   # (6,1) for GTX 1070
        major = cap[0]

        # Modern PyTorch typically needs sm_70+
        if major < 7:
            return False, f"GPU capability {cap} not supported by this torch build"

        return True, f"GPU capability {cap} compatible"

    except Exception as e:
        return False, f"CUDA query failed: {e}"
    


ok, msg = cuda_is_compatible()
print(ok, msg)

DEVICE = "cuda" if ok else "cpu"
    
print("Device:", DEVICE)

class TabularDataset(Dataset):
    def __init__(self, X_num, X_cat, y):
        self.X_num = torch.tensor(X_num, dtype=torch.float32)
        self.X_cat = torch.tensor(X_cat, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.float32).view(-1, 1)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X_num[idx], self.X_cat[idx], self.y[idx]

train_ds = TabularDataset(Xn_train, Xc_train, y_train)
valid_ds = TabularDataset(Xn_valid, Xc_valid, y_valid)
test_ds  = TabularDataset(Xn_test,  Xc_test,  y_test)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
valid_loader = DataLoader(valid_ds, batch_size=256, shuffle=False)
test_loader  = DataLoader(test_ds,  batch_size=256, shuffle=False)


False GPU capability (6, 1) not supported by this torch build
Device: cpu


    Found GPU0 NVIDIA GeForce GTX 1070 which is of cuda capability 6.1.
    Minimum and Maximum cuda capability supported by this version of PyTorch is
    (7.0) - (12.0)
    
    Please install PyTorch with a following CUDA
    configurations:  12.6 following instructions at
    https://pytorch.org/get-started/locally/
    
NVIDIA GeForce GTX 1070 with CUDA capability sm_61 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_70 sm_75 sm_80 sm_86 sm_90 sm_100 sm_120.
If you want to use the NVIDIA GeForce GTX 1070 GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



## Model: MLP + categorical embeddings

In [7]:

def emb_dim(n):
    # Simple heuristic for embedding size
    return min(16, max(2, (n + 1) // 2))

class TabularRegressor(nn.Module):
    def __init__(self, num_features, cardinalities):
        super().__init__()
        self.cat_cols = list(cardinalities.keys())
        self.embs = nn.ModuleList([
            nn.Embedding(cardinalities[c], emb_dim(cardinalities[c]))
            for c in self.cat_cols
        ])
        emb_out = sum(e.embedding_dim for e in self.embs)

        self.mlp = nn.Sequential(
            nn.Linear(num_features + emb_out, 64),
            nn.ReLU(),
            nn.Dropout(0.15),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.10),
            nn.Linear(32, 1),
        )

    def forward(self, x_num, x_cat):
        embs = []
        for i, e in enumerate(self.embs):
            embs.append(e(x_cat[:, i]))
        x = torch.cat([x_num] + embs, dim=1)
        return self.mlp(x)

model = TabularRegressor(num_features=len(NUM_COLS), cardinalities=cardinalities).to(DEVICE)
loss_fn = nn.MSELoss()
optim = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)

print(model)


TabularRegressor(
  (embs): ModuleList(
    (0-1): 2 x Embedding(2, 2)
    (2): Embedding(4, 2)
  )
  (mlp): Sequential(
    (0): Linear(in_features=9, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.15, inplace=False)
    (3): Linear(in_features=64, out_features=32, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.1, inplace=False)
    (6): Linear(in_features=32, out_features=1, bias=True)
  )
)


## Train with early stopping

In [8]:
def run_epoch(loader, train=False):
    model.train(train)
    total_loss = 0.0
    n = 0
    for x_num, x_cat, yb in loader:
        x_num, x_cat, yb = x_num.to(DEVICE), x_cat.to(DEVICE), yb.to(DEVICE)
        if train:
            optim.zero_grad(set_to_none=True)
        pred = model(x_num, x_cat)
        loss = loss_fn(pred, yb)
        if train:
            loss.backward()
            optim.step()
        bs = yb.size(0)
        total_loss += loss.item() * bs
        n += bs
    return total_loss / max(1, n)

best = inf
patience = 10
wait = 0
best_state = None

for epoch in range(1, 201):
    tr = run_epoch(train_loader, train=True)
    va = run_epoch(valid_loader, train=False)

    if va < best - 1e-5:
        best = va
        wait = 0
        best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
    else:
        wait += 1

    if epoch % 10 == 0 or epoch == 1:
        print(f"Epoch {epoch:03d} | train MSE: {tr:.5f} | valid MSE: {va:.5f}")

    if wait >= patience:
        print("Early stopping.")
        break

model.load_state_dict(best_state)


Epoch 001 | train MSE: 78.26109 | valid MSE: 75.19976
Epoch 010 | train MSE: 1.74905 | valid MSE: 0.74695
Epoch 020 | train MSE: 1.50261 | valid MSE: 0.51671
Epoch 030 | train MSE: 1.34116 | valid MSE: 0.42877
Epoch 040 | train MSE: 1.20718 | valid MSE: 0.32378
Epoch 050 | train MSE: 1.23254 | valid MSE: 0.30376
Epoch 060 | train MSE: 1.22669 | valid MSE: 0.29230
Epoch 070 | train MSE: 1.08921 | valid MSE: 0.26641
Epoch 080 | train MSE: 0.99717 | valid MSE: 0.24197
Epoch 090 | train MSE: 1.01451 | valid MSE: 0.22896
Epoch 100 | train MSE: 1.02343 | valid MSE: 0.22926
Epoch 110 | train MSE: 1.00292 | valid MSE: 0.24202
Epoch 120 | train MSE: 0.92133 | valid MSE: 0.21790
Epoch 130 | train MSE: 1.03104 | valid MSE: 0.20847
Early stopping.


<All keys matched successfully>

## Evaluate on the test set (log scale + original scale)

In [9]:

model.eval()
preds = []
trues = []

with torch.no_grad():
    for x_num, x_cat, yb in test_loader:
        x_num, x_cat = x_num.to(DEVICE), x_cat.to(DEVICE)
        pred = model(x_num, x_cat).cpu().numpy().reshape(-1)
        preds.append(pred)
        trues.append(yb.numpy().reshape(-1))

y_pred_log = np.concatenate(preds)
y_true_log = np.concatenate(trues)

# Log-scale metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
print("MAE (log): ", mean_absolute_error(y_true_log, y_pred_log))
print("RMSE (log):", root_mean_squared_error(y_true_log, y_pred_log))
print("R¬≤ (log):  ", r2_score(y_true_log, y_pred_log))

# Original scale metrics
y_true = np.expm1(y_true_log)
y_pred = np.expm1(y_pred_log)

print("\nMAE:  ¬£", mean_absolute_error(y_true, y_pred))
print("RMSE: ¬£", root_mean_squared_error(y_true, y_pred))


MAE (log):  0.26491236686706543
RMSE (log): 0.40614697337150574
R¬≤ (log):   0.8155003190040588

MAE:  ¬£ 3897.201416015625
RMSE: ¬£ 7287.44384765625


## Save model + scaler

In [10]:
model_path = PROJECT_ROOT / "models"

model_path.mkdir(exist_ok=True)

torch.save(model.state_dict(), model_path / "torch_tabular_mlp.pt")
joblib.dump(scaler, model_path / "numeric_scaler.joblib")
joblib.dump(cat_maps, model_path / "cat_maps.joblib")
print("Saved: models/torch_tabular_mlp.pt, numeric_scaler.joblib, cat_maps.joblib")

Saved: models/torch_tabular_mlp.pt, numeric_scaler.joblib, cat_maps.joblib
