In [None]:
# 1. Data gathering
# 2. Data preprocessing
# 3. Feature engineering
# 4. Model training
# 5. Testing

In [None]:
## Dataset used
# https://www.kaggle.com/datasets/mirichoi0218/insurance

In [None]:
!pip install kaggle



In [None]:
import kagglehub

# Download the latest version of the dataset; `path` is the local directory
# that holds the extracted files.
path = kagglehub.dataset_download("mirichoi0218/insurance")

print(f"Path of dataset files : {path}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/mirichoi0218/insurance?dataset_version_number=1...


100%|██████████| 16.0k/16.0k [00:00<00:00, 3.46MB/s]

Extracting files...
Path of dataset files : /root/.cache/kagglehub/datasets/mirichoi0218/insurance/versions/1





In [None]:
import os
# Print the kernel's current working directory.
print(os.getcwd())

/content


In [None]:
import pandas as pd


In [None]:
# List the downloaded files through the `path` returned by kagglehub rather
# than a hard-coded cache location, so the cell keeps working when the dataset
# version or the cache directory changes.
os.listdir(path)


['insurance.csv']

In [None]:
# Load `insurance.csv` from the download directory into a DataFrame.
df = pd.read_csv(os.path.join(path, 'insurance.csv'))

In [None]:
# Preview the first rows to check the columns and value formats.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [None]:
# Show column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [None]:
# Summary statistics of the numeric columns.
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split


In [None]:
# Split dataset before encoding so that the encoders and the scaler are fitted
# on the training rows only (80/20 split, fixed seed for a repeatable split).
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)


In [None]:
# Encode categorical variables as integer codes.
# `train_test_split` returns slices of `df`; take explicit copies so the column
# assignments below do not trigger pandas' SettingWithCopyWarning.
train_df = train_df.copy()
test_df = test_df.copy()

# Keep every fitted encoder so the same mapping can be reused at prediction time.
label_encoder = {}
for col in ['sex', 'smoker', 'region']:
  le = LabelEncoder()
  # Fit on the training split only, then apply the learned mapping to the test split.
  train_df[col] = le.fit_transform(train_df[col])
  test_df[col] = le.transform(test_df[col])
  label_encoder[col] = le



In [None]:
# Separate the predictors (every column except the target) from the target
# column `charges`, for both splits.
X_train, y_train = train_df.drop('charges', axis=1), train_df['charges']
X_test, y_test = test_df.drop('charges', axis=1), test_df['charges']

In [None]:
# Inspect the encoded training features and the matching targets.
print(X_train.head())
print(y_train.head())

      age  sex    bmi  children  smoker  region
560    46    0  19.95         2       0       1
1285   47    0  24.32         0       0       0
1142   52    0  24.86         0       0       2
969    39    0  34.32         5       0       2
486    54    0  21.47         3       0       1
560      9193.83850
1285     8534.67180
1142    27117.99378
969      8596.82780
486     12475.35130
Name: charges, dtype: float64


In [None]:
# Normalize features: fit the scaler on the training split only and apply the
# same fitted scaler to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Inspect the scaled training features.
print(X_train)

[[ 0.47222651 -1.0246016  -1.75652513  0.73433626 -0.50874702 -0.45611589]
 [ 0.54331294 -1.0246016  -1.03308239 -0.91119211 -0.50874702 -1.35325561]
 [ 0.8987451  -1.0246016  -0.94368672 -0.91119211 -0.50874702  0.44102382]
 ...
 [ 1.3252637   0.97598911 -0.89153925 -0.91119211 -0.50874702 -1.35325561]
 [-0.16755139 -1.0246016   2.82086429  0.73433626  1.96561348  1.33816354]
 [ 1.1120044   0.97598911 -0.10932713 -0.91119211 -0.50874702  1.33816354]]


In [None]:
# Build float32 tensors. Features keep their 2-D layout; the 1-D targets get a
# trailing dimension so they become column vectors of shape (n, 1).
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.float32).unsqueeze(1)
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.float32).unsqueeze(1)

In [None]:
# Check the target tensor's values and its (n, 1) shape.
print(y_train_tensor)
print(y_train_tensor.shape)

tensor([[ 9193.8389],
        [ 8534.6719],
        [27117.9941],
        ...,
        [11931.1250],
        [46113.5117],
        [10214.6357]])
torch.Size([1070, 1])


In [None]:
# Check the test feature tensor's values and shape.
print(X_test_tensor)
print(X_test_tensor.shape)

tensor([[ 0.4011, -1.0246, -0.8915,  0.7343, -0.5087, -1.3533],
        [-0.2386, -1.0246, -0.0895, -0.9112, -0.5087, -0.4561],
        [ 1.7518, -1.0246, -0.6085, -0.9112,  1.9656, -0.4561],
        ...,
        [-0.0965,  0.9760, -0.4197, -0.0884, -0.5087, -1.3533],
        [ 1.0409, -1.0246,  2.7894, -0.9112,  1.9656,  0.4410],
        [ 0.8277, -1.0246,  0.6025, -0.0884, -0.5087,  1.3382]])
torch.Size([268, 6])


In [None]:
# Define Neural network model

class SimpleNNRegressionModel(nn.Module):
  """Fully connected feed-forward network for single-target regression.

  The network applies ``Linear -> ReLU`` once per hidden width, followed by a
  final ``Linear`` layer with a single output and no activation.

  Args:
    input_dim: Number of input features per sample.
    hidden_dims: Widths of the hidden layers, in order. The default
      ``(64, 128)`` reproduces the original fixed architecture.
  """

  def __init__(self, input_dim, hidden_dims=(64, 128)):
    super().__init__()
    layers = []
    in_features = input_dim
    for width in hidden_dims:
      layers.append(nn.Linear(in_features, width))
      layers.append(nn.ReLU())
      in_features = width
    # Output layer: one unbounded value per sample (regression target).
    layers.append(nn.Linear(in_features, 1))
    self.network = nn.Sequential(*layers)

  def forward(self, x):
    """Run ``x`` through the network and return one prediction per row."""
    return self.network(x)

In [None]:
# Check the (samples, features) shape of the training tensor.
X_train_tensor.shape

torch.Size([1070, 6])

In [None]:
# Seed PyTorch before building the model so the random weight initialisation
# (and therefore the training run) is repeatable across kernel restarts.
torch.manual_seed(42)

# The number of feature columns determines the width of the input layer.
input_dim = X_train_tensor.shape[1]
model = SimpleNNRegressionModel(input_dim)

In [None]:
# Print the layer structure of the model.
print(model)

SimpleNNRegressionModel(
  (network): Sequential(
    (0): Linear(in_features=6, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=1, bias=True)
  )
)


In [None]:
# Loss and optimiser

# Mean-squared-error loss for the regression target; Adam updates all model
# parameters with a learning rate of 0.01.
criterion = nn.MSELoss()
optimiser = optim.Adam(model.parameters(), lr=0.01)

In [None]:
# Training loop (full batch) with early stopping.
#
# Running a fixed 30000 epochs with no validation check overfits: the recorded
# run reaches a training loss below 19M by epoch 2400 while the test MSE ends
# at about 64M. A slice of the training data is therefore held out for
# validation; training stops once the validation loss has not improved for
# `patience` epochs, and the best weights seen are restored.
epochs = 30000       # upper bound on the number of epochs
log_every = 100      # print the training loss every `log_every` epochs
patience = 500       # epochs without validation improvement before stopping
val_fraction = 0.1   # share of the training rows held out for validation

# Deterministic train/validation split of the training tensors. The test set is
# never used for model selection. A dedicated generator keeps the split
# independent of the global RNG state.
split_generator = torch.Generator().manual_seed(42)
shuffled_idx = torch.randperm(X_train_tensor.shape[0], generator=split_generator)
n_val = max(1, int(val_fraction * shuffled_idx.numel()))
val_idx, fit_idx = shuffled_idx[:n_val], shuffled_idx[n_val:]
X_fit, y_fit = X_train_tensor[fit_idx], y_train_tensor[fit_idx]
X_val, y_val = X_train_tensor[val_idx], y_train_tensor[val_idx]

best_val_loss = float("inf")
best_state = None
epochs_without_improvement = 0

for epoch in range(epochs):
  model.train()
  optimiser.zero_grad()
  predictions = model(X_fit)
  loss = criterion(predictions, y_fit)
  loss.backward()

  optimiser.step()

  # Validation pass: evaluation mode, no gradients needed.
  model.eval()
  with torch.no_grad():
    val_loss = criterion(model(X_val), y_val).item()

  if val_loss < best_val_loss:
    best_val_loss = val_loss
    # Clone the weights so later optimiser steps cannot overwrite the snapshot.
    best_state = {name: tensor.detach().clone()
                  for name, tensor in model.state_dict().items()}
    epochs_without_improvement = 0
  else:
    epochs_without_improvement += 1

  if (epoch+1) % log_every == 0:
    print(f"Epoch [{epoch+1}/{epochs}], Loss : {loss.item():.4f}")

  if epochs_without_improvement >= patience:
    print(f"Early stopping at epoch {epoch+1}; best validation loss : {best_val_loss:.4f}")
    break

# Restore the weights that performed best on the validation slice.
if best_state is not None:
  model.load_state_dict(best_state)

Epoch [100/30000], Loss : 45702688.0000
Epoch [200/30000], Loss : 32253306.0000
Epoch [300/30000], Loss : 30073744.0000
Epoch [400/30000], Loss : 27901882.0000
Epoch [500/30000], Loss : 26460680.0000
Epoch [600/30000], Loss : 25507344.0000
Epoch [700/30000], Loss : 24747198.0000
Epoch [800/30000], Loss : 24136398.0000
Epoch [900/30000], Loss : 23693618.0000
Epoch [1000/30000], Loss : 23380548.0000
Epoch [1100/30000], Loss : 23087808.0000
Epoch [1200/30000], Loss : 22771090.0000
Epoch [1300/30000], Loss : 22460362.0000
Epoch [1400/30000], Loss : 22175922.0000
Epoch [1500/30000], Loss : 21902986.0000
Epoch [1600/30000], Loss : 21621392.0000
Epoch [1700/30000], Loss : 21310200.0000
Epoch [1800/30000], Loss : 21082658.0000
Epoch [1900/30000], Loss : 20882306.0000
Epoch [2000/30000], Loss : 20683656.0000
Epoch [2100/30000], Loss : 20402342.0000
Epoch [2200/30000], Loss : 20001754.0000
Epoch [2300/30000], Loss : 19465622.0000
Epoch [2400/30000], Loss : 18907132.0000
Epoch [2500/30000], Loss 

In [None]:
# Model Evaluation

# Switch to evaluation mode and disable autograd: prediction needs no
# gradients, so the computation graph is not built at all.
model.eval()
with torch.no_grad():
  y_pred = model(X_test_tensor).numpy()

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# True test targets as a NumPy array, to compare against `y_pred`
y_test_numpy = y_test_tensor.numpy()

# Regression metrics on the test split; RMSE is derived from MSE
mse = mean_squared_error(y_test_numpy, y_pred)
rmse = mse ** 0.5
mae = mean_absolute_error(y_test_numpy, y_pred)
r2 = r2_score(y_test_numpy, y_pred)

# Report the metrics in a fixed order, one per line
for metric_name, metric_value in (("MSE", mse), ("RMSE", rmse), ("MAE", mae), ("R2-Score", r2)):
  print(f"{metric_name} : {metric_value}")


MSE : 64239584.0
RMSE : 8014.960012376855
MAE : 5253.6044921875
R2-Score : 0.586215078830719


In [None]:
def predict_charges(age, sex, bmi, children, smoker,region):
  """Predict the insurance charge for one person with the trained model.

  Uses the notebook-level `label_encoder`, `scaler` and `model` objects that
  were fitted / trained in earlier cells, applying the same preprocessing as
  the training data: label-encode the categorical fields, then scale.

  Args:
    age: Value for the `age` feature.
    sex: Raw category value for `sex`, as it appears in the dataset (e.g. 'female').
    bmi: Value for the `bmi` feature.
    children: Value for the `children` feature.
    smoker: Raw category value for `smoker` (e.g. 'no').
    region: Raw category value for `region` (e.g. 'southwest').

  Returns:
    The model output for this input as a Python float.

  Raises:
    ValueError: expected from the fitted encoder when a category value was not
      seen during fitting (scikit-learn behaviour; not enforced here).
  """
  input_data = pd.DataFrame([[age, sex, bmi, children, smoker, region]],
                            columns=['age', 'sex', 'bmi', 'children', 'smoker', 'region'])

  # Reuse the encoders fitted on the training split so codes match training.
  for col in ['sex', 'smoker', 'region']:
    input_data[col] = label_encoder[col].transform(input_data[col])

  scaled_features = scaler.transform(input_data)
  input_tensor = torch.tensor(scaled_features, dtype=torch.float32)

  # Predict in evaluation mode without autograd, then put the model back into
  # whatever mode the caller had it in.
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      predicted_charge = model(input_tensor).item()
  finally:
    model.train(was_training)
  return predicted_charge


In [None]:
# Example call: age 45, sex 'female', BMI 27.9, 0 children, smoker 'no',
# region 'southwest'.
predicted = predict_charges(45, 'female', 27.9, 0, 'no', 'southwest')

In [None]:
# Show the prediction as a dollar amount with two decimals.
print(f"Predicted insurance charge: ${predicted:.2f}")

Predicted insurance charge: $7492.10
