<a href="https://colab.research.google.com/github/Timmmtech/Pytorch-Basics/blob/main/insurance_cost_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Insurance cost prediction using linear regression
In this project, we're going to use information like a person's age, sex, BMI, no. of children and smoking habit to predict the price of yearly medical bills. This kind of model is useful for insurance companies to determine the yearly insurance premium for a person. The dataset for this problem is taken from: https://www.kaggle.com/mirichoi0218/insurance

We will create a model with the following steps:

Download and explore the dataset
Prepare the dataset for training
Create a linear regression model
Train the model to fit the data
Make predictions using the trained model

In [6]:
!pip install jovian --quiet

import torch
import jovian

import torchvision
import torch.nn as nn
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch.nn.functional as F
from torchvision.datasets.utils import download_url
from torch.utils.data import DataLoader, TensorDataset, random_split

import warnings
warnings.filterwarnings('ignore')

#Some styling
sns.set_style("darkgrid")
plt.style.use("fivethirtyeight")
pd.pandas.set_option('display.max_columns', None)

%matplotlib inline

In [11]:
## Download the dataset
DATASET_URL = "https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/insurance.csv"
DATA_FILENAME = "insurance.csv"
download_url(DATASET_URL, '.')

100%|██████████| 54.3k/54.3k [00:00<00:00, 23.1MB/s]


In [13]:
dataframe = pd.read_csv(DATA_FILENAME)
dataframe.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [14]:
dataframe.shape

(1338, 7)

### Q: How many rows does the dataset have?

In [15]:
num_rows = dataframe.shape[0]
print(num_rows)

1338


### Q: How many columns does the dataset have

In [19]:
num_cols = dataframe.shape[1]
print(num_cols)

7


### Q: What are the column titles of the input variables?

In [20]:
input_cols = dataframe.columns[:-1]
input_cols

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region'], dtype='object')

### Q: Which of the input columns are non-numeric or categorial variables ?

In [21]:
categorical_cols = ['sex', 'smoker', 'region']
print(categorical_cols)

['sex', 'smoker', 'region']


### Q: What are the column titles of output/target variable(s)?

In [22]:
output_cols = ['charges']

### Prepare the dataset for training

In [23]:
## Convert Pandas df to a Pytorch tensors for training
def dataframe_to_arrays(dataframe):
     # Make a copy of the original dataframe
     dataframe1 = dataframe.copy(deep=True)
     # Convert non-numeric categorical columns to numbers
     for col in categorical_cols:
         dataframe1[col] = dataframe1[col].astype('category').cat.codes
     # Extract input & outupts as numpy arrays
     inputs_array = dataframe1[input_cols].to_numpy()
     targets_array = dataframe1[output_cols].to_numpy()
     return inputs_array, targets_array

In [24]:
inputs_array, targets_array = dataframe_to_arrays(dataframe)
inputs_array.shape, targets_array.shape

((1338, 6), (1338, 1))


### Q: Convert the numpy arrays inputs_array and targets_array into PyTorch tensors. Make sure that the data type is torch.float32.

In [25]:
inputs = torch.from_numpy(inputs_array).type(torch.float32)
targets = torch.from_numpy(targets_array).type(torch.float32)

In [26]:
inputs.dtype, targets.dtype

(torch.float32, torch.float32)

In [27]:
## create tensor dataset
dataset = TensorDataset(inputs, targets)

### Q: Pick a number between 0.1 and 0.2 to determine the fraction of data that will be used for creating the validation set. Then use random_split to create training & validation datasets.

In [28]:
val_percent = 0.2 # between 0.1 and 0.2
val_size = int(num_rows * val_percent)
train_size = num_rows - val_size


train_ds, val_ds = random_split(dataset, [train_size, val_size]) # Use the random_split function to split dataset into 2 parts of the desired length

### Q: Pick a batch size for the data loader.

In [30]:
batch_size = 32

In [31]:
train_loader = DataLoader(train_ds, batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size)

In [32]:
## check if batch data and others are working perfectly fine
for xb, yb in train_loader:
     print("inputs:", xb)
     print("targets:", yb)
     break

inputs: tensor([[19.0000,  0.0000, 25.7450,  1.0000,  0.0000,  1.0000],
        [43.0000,  1.0000, 30.1000,  1.0000,  0.0000,  3.0000],
        [35.0000,  1.0000, 24.1300,  1.0000,  0.0000,  1.0000],
        [57.0000,  0.0000, 29.8100,  0.0000,  1.0000,  2.0000],
        [52.0000,  0.0000, 31.7300,  2.0000,  0.0000,  1.0000],
        [53.0000,  0.0000, 26.7000,  2.0000,  0.0000,  3.0000],
        [32.0000,  0.0000, 20.5200,  0.0000,  0.0000,  0.0000],
        [18.0000,  0.0000, 28.2150,  0.0000,  0.0000,  0.0000],
        [30.0000,  1.0000, 27.6450,  1.0000,  0.0000,  0.0000],
        [59.0000,  0.0000, 36.7650,  1.0000,  1.0000,  0.0000],
        [35.0000,  0.0000, 43.3400,  2.0000,  0.0000,  2.0000],
        [20.0000,  1.0000, 22.0000,  1.0000,  0.0000,  3.0000],
        [41.0000,  1.0000, 28.8000,  1.0000,  0.0000,  3.0000],
        [60.0000,  1.0000, 28.9000,  0.0000,  0.0000,  3.0000],
        [57.0000,  1.0000, 34.0100,  0.0000,  0.0000,  1.0000],
        [52.0000,  0.0000, 18.33

### Create a Linear Regression Model

In [34]:
input_size = len(input_cols)
output_size = len(output_cols)

In [51]:
class InsuranceModel(nn.Module):
  def __init__(self):
      super().__init__()
      self.linear = nn.Linear(input_size, output_size)

  def forward(self, xb):
      out = self.linear(xb)
      return out

  def training_step(self, batch):
    inputs, targets = batch
    # Generate predictions
    out = self(inputs)
    # Calculate loss
    loss = F.mse_loss(out, targets)
    return loss

  def validation_step(self, batch):
      inputs, targets = batch
      out = self(inputs)
      loss = F.mse_loss(out, targets)
      return {'val_loss': loss.detach()}

  def validation_epoch_end(self, outputs):
      batch_losses = [x['val_loss'] for x in outputs]
      epoch_loss = torch.stack(batch_losses).mean()
      return {'val_loss': epoch_loss.item()}

  def epoch_end(self, epoch, result, num_epochs):
      # Print result every 20th epoch
      if (epoch+1) % 100 == 0 or epoch == num_epochs-1:
          print("Epoch [{}], val_loss: {:.4f}".format(epoch+1, result['val_loss']))




In [36]:
## creating a model using InsuranceModel class
model = InsuranceModel()

In [37]:
## check out weight and biases of the model
list(model.parameters())

[Parameter containing:
 tensor([[-0.0421, -0.1904, -0.0428, -0.1493,  0.0132,  0.3404]],
        requires_grad=True),
 Parameter containing:
 tensor([-0.3395], requires_grad=True)]

### Train the model to fit the data

In [52]:
def evaluate (model, val_loader):
    outputs = [model.validation_step(batch) for batch in val_loader]
    return model.validation_epoch_end(outputs)

def fit(epochs, lr, model, train_loader, val_loader, opt_func=torch.optim.SGD):
    history = []
    optimizer = opt_func(model.parameters(), lr)
    for epoch in range(epochs):
      # Training Phase
         for batch in train_loader:
             loss = model.training_step(batch)
             loss.backward()
             optimizer.step()
             optimizer.zero_grad()
             # Validation Phase
             result = evaluate(model, val_loader)
             model.epoch_end(epoch, result, epochs)
             history.append(result)
    return history

### Q: Use the evaluate function to calculate the loss on the validation set before training.

In [39]:
result = evaluate(model, val_loader) # Use the the evaluate function
print(result)

{'val_loss': 327191552.0}


### Q: Train the model 4-5 times with different learning rates & for different number of epochs.

In [70]:
model = InsuranceModel()

In [71]:
epochs = 2000
lr = 1e-4
history1 = fit(epochs, lr, model, train_loader, val_loader)

Epoch [100], val_loss: 128164896.0000
Epoch [100], val_loss: 129131976.0000
Epoch [100], val_loss: 128748832.0000
Epoch [100], val_loss: 127451960.0000
Epoch [100], val_loss: 126559760.0000
Epoch [100], val_loss: 126888160.0000
Epoch [100], val_loss: 126496424.0000
Epoch [100], val_loss: 128489160.0000
Epoch [100], val_loss: 127684296.0000
Epoch [100], val_loss: 126941736.0000
Epoch [100], val_loss: 126415032.0000
Epoch [100], val_loss: 127697152.0000
Epoch [100], val_loss: 130667480.0000
Epoch [100], val_loss: 126620248.0000
Epoch [100], val_loss: 128090864.0000
Epoch [100], val_loss: 126376160.0000
Epoch [100], val_loss: 126374744.0000
Epoch [100], val_loss: 135075728.0000
Epoch [100], val_loss: 129035304.0000
Epoch [100], val_loss: 126792872.0000
Epoch [100], val_loss: 128696928.0000
Epoch [100], val_loss: 128871240.0000
Epoch [100], val_loss: 127240376.0000
Epoch [100], val_loss: 126513264.0000
Epoch [100], val_loss: 130435360.0000
Epoch [100], val_loss: 126763024.0000
Epoch [100],

In [72]:
epochs = 1000
lr = 1e-4
history2 = fit(epochs, lr, model, train_loader, val_loader)

Epoch [100], val_loss: 40958672.0000
Epoch [100], val_loss: 40552200.0000
Epoch [100], val_loss: 40108060.0000
Epoch [100], val_loss: 41278384.0000
Epoch [100], val_loss: 41091120.0000
Epoch [100], val_loss: 43532304.0000
Epoch [100], val_loss: 41445048.0000
Epoch [100], val_loss: 41076384.0000
Epoch [100], val_loss: 48601200.0000
Epoch [100], val_loss: 42416884.0000
Epoch [100], val_loss: 40659196.0000
Epoch [100], val_loss: 40411832.0000
Epoch [100], val_loss: 40900928.0000
Epoch [100], val_loss: 40516008.0000
Epoch [100], val_loss: 40092836.0000
Epoch [100], val_loss: 40527636.0000
Epoch [100], val_loss: 40742144.0000
Epoch [100], val_loss: 41501696.0000
Epoch [100], val_loss: 40263236.0000
Epoch [100], val_loss: 40025604.0000
Epoch [100], val_loss: 40748768.0000
Epoch [100], val_loss: 40969980.0000
Epoch [100], val_loss: 40375384.0000
Epoch [100], val_loss: 40012088.0000
Epoch [100], val_loss: 41584632.0000
Epoch [100], val_loss: 41028904.0000
Epoch [100], val_loss: 40088236.0000
E

### Q: What is the final validation loss of your model?

In [48]:
val_loss = 38290980.0000

### Make predictions using the trained model

In [73]:
def predict_single(input, target, model):
    inputs = input.unsqueeze(0)
    predictions = model(inputs)

    prediction = predictions[0].detach()
    print("Input:", input)
    print("Target:", target)
    print("Prediction:", prediction)

In [74]:
input, target = val_ds[0]
predict_single(input, target, model)

Input: tensor([25.0000,  1.0000, 30.5900,  0.0000,  0.0000,  0.0000])
Target: tensor([2727.3950])
Prediction: tensor([5998.5039])


In [75]:
input, target = val_ds[10]
predict_single(input, target, model)

Input: tensor([22.0000,  1.0000, 52.5800,  1.0000,  1.0000,  2.0000])
Target: tensor([44501.3984])
Prediction: tensor([31312.2109])
