#### 1. Data gathering
#### 2. Data preprocessing
#### 3. feature engineering
#### 4. Model training
#### 5. Testing

In [54]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mirichoi0218/insurance")

print(f"Path to dataset files: {path}")

Path to dataset files: /home/td/.cache/kagglehub/datasets/mirichoi0218/insurance/versions/1


In [55]:
import os
print(os.getcwd())

/home/td/Doc/Prog/Ai|Ml/pytorch


In [56]:
import pandas as pd

In [57]:
os.listdir('/home/td/.cache/kagglehub/datasets/mirichoi0218/insurance/versions/1')

['insurance.csv']

In [58]:
df = pd.read_csv(os.path.join(path, 'insurance.csv'))

In [59]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [60]:
df['region'].value_counts()

region
southeast    364
southwest    325
northwest    325
northeast    324
Name: count, dtype: int64

In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [62]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [63]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [64]:
# Split dataset before encoding

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [65]:
# Encode catagorical variable

label_encoder = {}
for col in ['sex', 'smoker', 'region']:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.transform(test_df[col])
    label_encoder[col] = le

In [66]:
# Features and target

X_train = train_df.drop(columns=['charges'])
y_train = train_df['charges']

X_test = test_df.drop(columns=['charges'])
y_test = test_df['charges']

In [67]:
X_train.shape

(1070, 6)

In [68]:
print(X_train.head())
print(y_train.head())

      age  sex    bmi  children  smoker  region
560    46    0  19.95         2       0       1
1285   47    0  24.32         0       0       0
1142   52    0  24.86         0       0       2
969    39    0  34.32         5       0       2
486    54    0  21.47         3       0       1
560      9193.83850
1285     8534.67180
1142    27117.99378
969      8596.82780
486     12475.35130
Name: charges, dtype: float64


In [69]:
# Normalize features 
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [70]:
# sc1=StandardScaler()
# y_train=sc1.fit_transform(y_train)

In [71]:
print(X_train)

[[ 0.47222651 -1.0246016  -1.75652513  0.73433626 -0.50874702 -0.45611589]
 [ 0.54331294 -1.0246016  -1.03308239 -0.91119211 -0.50874702 -1.35325561]
 [ 0.8987451  -1.0246016  -0.94368672 -0.91119211 -0.50874702  0.44102382]
 ...
 [ 1.3252637   0.97598911 -0.89153925 -0.91119211 -0.50874702 -1.35325561]
 [-0.16755139 -1.0246016   2.82086429  0.73433626  1.96561348  1.33816354]
 [ 1.1120044   0.97598911 -0.10932713 -0.91119211 -0.50874702  1.33816354]]


In [72]:
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(1, -1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

In [73]:
print(y_test_tensor)
print(y_test_tensor.shape)

tensor([[ 9095.0684],
        [ 5272.1758],
        [29330.9824],
        [ 9301.8936],
        [33750.2930],
        [ 4536.2588],
        [ 2117.3389],
        [14210.5361],
        [ 3732.6250],
        [10264.4424],
        [18259.2168],
        [ 7256.7231],
        [ 3947.4131],
        [46151.1250],
        [48673.5586],
        [44202.6523],
        [ 9800.8887],
        [42969.8516],
        [ 8233.0977],
        [21774.3223],
        [ 5080.0962],
        [ 7441.5010],
        [ 1256.2990],
        [ 2755.0210],
        [11085.5869],
        [10923.9336],
        [12644.5889],
        [18804.7520],
        [ 9715.8408],
        [ 1131.5066],
        [15828.8213],
        [11842.6240],
        [ 2020.5522],
        [ 5693.4307],
        [ 2904.0879],
        [ 7448.4038],
        [ 2597.7791],
        [ 7337.7480],
        [23887.6621],
        [38709.1758],
        [ 4687.7969],
        [ 2643.2686],
        [11674.1299],
        [12124.9922],
        [ 4889.9995],
        [1

In [74]:
# Define Neural Network model

class SimpleNNRegressionModel(nn.Module):
    def __init__(self, input_dim):
        super(SimpleNNRegressionModel, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
            # nn.Sigmoid()
        )
    def forward(self, x):
        return self.network(x)

In [75]:
X_train_tensor.shape

torch.Size([1070, 6])

In [76]:
device=torch.device('cpu')

In [77]:
input_dim = X_train_tensor.shape[1]
model = SimpleNNRegressionModel(input_dim).to(device)

In [78]:
print(model)

SimpleNNRegressionModel(
  (network): Sequential(
    (0): Linear(in_features=6, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=256, bias=True)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=1, bias=True)
  )
)


In [79]:
# Loss and Optmiser

criterion = nn.L1Loss()
optimiser = optim.Adam(model.parameters(),lr=0.0001)

In [80]:
###
# x_train_tensor = 1000000 --> 10 gb -- out of memory
# 1000000 --> weight and bias
# we are teaching human : A book of 1000 pages 
# 100 epoch
# 1 epoch --> 1070 rows
# ###

In [81]:
# Training Loop
epochs = 100

for epoch in range(epochs):
    model.train()

    predictions = model(X_train_tensor)
    loss = criterion(predictions, y_train_tensor)
    optimiser.zero_grad()
    loss.backward()


    optimiser.step()

    if(epoch+1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{epochs}],Loss : {loss.item()/len(df)}")

Epoch [10/100],Loss : 9.974407201747011
Epoch [20/100],Loss : 9.97428020482997


  return F.l1_loss(input, target, reduction=self.reduction)


Epoch [30/100],Loss : 9.974145909239537
Epoch [40/100],Loss : 9.974001395506352
Epoch [50/100],Loss : 9.973840094824364
Epoch [60/100],Loss : 9.973659817591555
Epoch [70/100],Loss : 9.973454724869208
Epoch [80/100],Loss : 9.973227006259343
Epoch [90/100],Loss : 9.972971552690582
Epoch [100/100],Loss : 9.97268763429559


### Understanding Component of custom daataLoader in Pytorch
1. Dataset (torch.utils.data.dataset)
2. DataLoader (torch.utils.data.dataloader)

In [82]:
# Creating our custom Dataset in pytorch
# init() - initialised the datset, loads data, applied preprocessing
# len() - return the total numbers of samples in the dataset
# getitem() - Define how to retrieve a single data sample when is provided 

In [83]:
import torch
from torch.utils.data import Dataset, DataLoader

In [84]:
class InsuranceDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    
    def __len__(self):
        return len(self.X)
    def __getitem__(self,idx):
        features = torch.tensor(self.X[idx], dtype = torch.float32)
        target = torch.tensor(self.y.values[idx], dtype = torch.float32)
        return features, target

In [85]:
dataset = InsuranceDataset(X_train, y_train)

In [86]:
dataLoader = DataLoader(dataset, batch_size=32, shuffle=True)

In [87]:
for batch_idx, (features, targets)in enumerate(dataLoader):
    print(f'Batch {batch_idx+1} :')
    print("Features :",features.shape)
    print("Target :",targets.shape)
    break

Batch 1 :
Features : torch.Size([32, 6])
Target : torch.Size([32])


In [88]:
# Training Loop
epochs = 1000

for epoch in range(epochs):
    model.train()

    for batch_idx,(batch_X, batch_y) in enumerate(dataLoader):
        print("Current Batch : ",{batch_idx})
        optimiser.zero_grad()
        predictions = model(batch_X)
        loss = criterion(predictions, batch_y)
        loss.backward()
        optimiser.step()
        print(f"Epoch [{batch_idx+1}/{epochs}],Loss : {loss.item()/len(df)}")

        # torch.nn.util.clip_grad_norm_(model.parameter(), clip_value)
        optimiser.step()

    if(epoch+1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{epochs}],Loss : {loss.item()/len(df)}")

Current Batch :  {0}
Epoch [1/1000],Loss : 11.95974781623692
Current Batch :  {1}
Epoch [2/1000],Loss : 13.304446643778027
Current Batch :  {2}
Epoch [3/1000],Loss : 10.740164307735427
Current Batch :  {3}
Epoch [4/1000],Loss : 11.941787240751122
Current Batch :  {4}
Epoch [5/1000],Loss : 10.718120854353513
Current Batch :  {5}
Epoch [6/1000],Loss : 7.968333975616592
Current Batch :  {6}
Epoch [7/1000],Loss : 8.07638426639574
Current Batch :  {7}
Epoch [8/1000],Loss : 9.208441353699552
Current Batch :  {8}
Epoch [9/1000],Loss : 10.443749124159194
Current Batch :  {9}
Epoch [10/1000],Loss : 9.21265560771674
Current Batch :  {10}
Epoch [11/1000],Loss : 11.188282417787743
Current Batch :  {11}
Epoch [12/1000],Loss : 9.572904258921898
Current Batch :  {12}
Epoch [13/1000],Loss : 11.883528499859866
Current Batch :  {13}
Epoch [14/1000],Loss : 10.395312208053065
Current Batch :  {14}
Epoch [15/1000],Loss : 8.049266775270926
Current Batch :  {15}
Epoch [16/1000],Loss : 8.400648268170777
Curre

  return F.l1_loss(input, target, reduction=self.reduction)
  return F.l1_loss(input, target, reduction=self.reduction)


Epoch [17/1000],Loss : 8.233148822869955
Current Batch :  {17}
Epoch [18/1000],Loss : 11.274567334641256
Current Batch :  {18}
Epoch [19/1000],Loss : 10.325135463378176
Current Batch :  {19}
Epoch [20/1000],Loss : 13.872946153307176
Current Batch :  {20}
Epoch [21/1000],Loss : 6.938648811192078
Current Batch :  {21}
Epoch [22/1000],Loss : 7.7929016022047835
Current Batch :  {22}
Epoch [23/1000],Loss : 9.438256142563528
Current Batch :  {23}
Epoch [24/1000],Loss : 11.131430861126681
Current Batch :  {24}
Epoch [25/1000],Loss : 13.226092465433483
Current Batch :  {25}
Epoch [26/1000],Loss : 9.869736196748878
Current Batch :  {26}
Epoch [27/1000],Loss : 9.553518982389761
Current Batch :  {27}
Epoch [28/1000],Loss : 11.551380471085576
Current Batch :  {28}
Epoch [29/1000],Loss : 10.632026432875561
Current Batch :  {29}
Epoch [30/1000],Loss : 9.080749602952167
Current Batch :  {30}
Epoch [31/1000],Loss : 10.480207457492526
Current Batch :  {31}
Epoch [32/1000],Loss : 9.40609015905269
Curren

In [89]:
# Model Evaluation

model.eval()
y_pred = model(X_test_tensor).detach().numpy()

In [90]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

y_test_numpy = y_test_tensor.numpy()

# Calculate Metrics

mes = mean_squared_error(y_test_numpy,y_pred)
rmse = mes ** 0.5
mae = mean_absolute_error(y_test_numpy,y_pred)
r2 = r2_score(y_test_numpy,y_pred)

print(f"MSE : {mes}")
print(f"RMSE : {r2}")
print(f"MAE : {mae}")
print(f"R2-Score : {r2}")

MSE : 161873504.0
RMSE : -0.042672038078308105
MAE : 8370.5166015625
R2-Score : -0.042672038078308105


In [91]:
def predict_charges(age, sex, bmi, children, smoker, region):
    input_data = pd.DataFrame([[age, sex, bmi, children, smoker, region]],
                         columns=['age', 'sex', 'bmi', 'children', 'smoker', 'region'])
    
    for col in ['sex', 'smoker', 'region']:
        input_data[col] = label_encoder[col].transform(input_data[col])
    input_data = scaler.transform(input_data)
    input_tensor = torch.tensor(input_data, dtype=torch.float32)
    predict_charge = model(input_tensor).item()
    return predict_charge

In [92]:
df.tail()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1333,50,male,30.97,3,no,northwest,10600.5483
1334,18,female,31.92,0,no,northeast,2205.9808
1335,18,female,36.85,0,no,southeast,1629.8335
1336,21,female,25.8,0,no,southwest,2007.945
1337,61,female,29.07,0,yes,northwest,29141.3603


In [93]:
predicted = predict_charges(61, 'female', 29.07, 0, 'no','northwest')
print(f"Predicted insurance charge : Rs {predicted}")

Predicted insurance charge : Rs 9904.2685546875
