# Insurance cost prediction using linear regression


In [None]:
import torch
import jovian
import torchvision
import torch.nn as nn
import pandas as pd
import matplotlib.pyplot as plt
import torch.nn.functional as F
from torchvision.datasets.utils import download_url
from torch.utils.data import DataLoader, TensorDataset, random_split

In [None]:
project_name='02-insurance-linear-regression' # will be used by jovian.commit

## Step 1: Download and explore the data

Let us begin by downloading the data. We'll use the `download_url` function from PyTorch to get the data as a CSV (comma-separated values) file. 

In [None]:
DATASET_URL = "https://hub.jovian.ml/wp-content/uploads/2020/05/insurance.csv"
DATA_FILENAME = "insurance.csv"
download_url(DATASET_URL, '.')

Downloading https://hub.jovian.ml/wp-content/uploads/2020/05/insurance.csv to ./insurance.csv


HBox(children=(FloatProgress(value=0.0, max=55628.0), HTML(value='')))




To load the dataset into memory, we'll use the `read_csv` function from the `pandas` library. The data will be loaded as a Pandas dataframe. See this short tutorial to learn more: https://data36.com/pandas-tutorial-1-basics-reading-data-files-dataframes-data-selection/

In [None]:
dataframe_raw = pd.read_csv(DATA_FILENAME)
dataframe_raw.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


We're going to do a slight customization of the data, so that every participant receives a slightly different version of the dataset. Fill in your name below as a string (enter at least 5 characters).

In [None]:
your_name = 'nouman' # at least 5 characters

The `customize_dataset` function will customize the dataset slightly using your name as a source of random numbers.

In [None]:
def customize_dataset(dataframe_raw, rand_str):
    """Build a personalised variant of the dataset driven by ``rand_str``.

    The first four characters of ``rand_str`` control the result:

    * character 0 seeds the random sample that keeps 95% of the rows,
    * character 1 scales the ``bmi`` column by ``ord(char) / 100``,
    * character 2 scales the ``charges`` column by ``ord(char) / 100``,
    * character 3 drops the ``region`` column when its code point is odd.

    The input dataframe is not modified; a new dataframe is returned.
    """
    keep_count = int(0.95 * len(dataframe_raw))
    # Work on a deep copy so the caller's dataframe stays untouched.
    customized = dataframe_raw.copy(deep=True).sample(
        keep_count, random_state=ord(rand_str[0])
    )
    # Scale one input column and the target column.
    customized.bmi = customized.bmi * ord(rand_str[1]) / 100.
    customized.charges = customized.charges * ord(rand_str[2]) / 100.
    # An even code point keeps every column.
    if ord(rand_str[3]) % 2 != 1:
        return customized
    return customized.drop(columns=['region'])

In [None]:
dataframe = customize_dataset(dataframe_raw, your_name)
dataframe.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges
538,46,female,31.1355,1,no,9632.724075
1217,29,male,41.3919,2,no,4747.995837
837,56,female,31.4241,0,no,13639.531113
1082,38,male,22.1445,1,no,6851.405925
563,50,male,49.6947,1,no,10598.714451


Let us answer some basic questions about the dataset. 


**Q: How many rows does the dataset have?**

In [None]:
num_rows = dataframe.shape[0]
print(num_rows)

1271


**Q: How many columns does the dataset have?**

In [None]:
num_cols = dataframe.shape[1]
print(num_cols)

6


**Q: What are the column titles of the input variables?**

In [None]:
input_cols = dataframe.columns[0:5]
(input_cols)

Index(['age', 'sex', 'bmi', 'children', 'smoker'], dtype='object')

**Q: Which of the input columns are non-numeric or categorical variables?**

Hint: `sex` is one of them. List the columns that are not numbers.

In [None]:
categorical_cols = ['smoker', 'sex']
categorical_cols

['smoker', 'sex']

**Q: What are the column titles of output/target variable(s)?**

In [None]:
output_cols = ['charges']

Remember to commit your notebook to Jovian after every step, so that you don't lose your work.

In [None]:
!pip install jovian --upgrade -q

In [None]:
import jovian

In [None]:
jovian.commit()

[jovian] Detected Colab notebook...[0m
[jovian] Please enter your API key ( from https://jovian.ai/ ):[0m
API KEY:

## Step 2: Prepare the dataset for training

We need to convert the data from the Pandas dataframe into PyTorch tensors for training. To do this, the first step is to convert it to numpy arrays. If you've filled out `input_cols`, `categorical_cols` and `output_cols` correctly, the following function will perform the conversion to numpy arrays.

In [None]:
def dataframe_to_arrays(dataframe):
    """Convert a dataframe into ``(inputs, targets)`` numpy arrays.

    Columns named in the module-level ``categorical_cols`` are replaced by
    their integer category codes. The columns named in ``input_cols`` and
    ``output_cols`` are then extracted as two separate arrays. The dataframe
    passed in is left unchanged.
    """
    # Integer codes for every non-numeric categorical column
    encoded = {
        col: dataframe[col].astype('category').cat.codes
        for col in categorical_cols
    }
    # Apply the codes to a deep copy rather than to the caller's dataframe
    numeric_frame = dataframe.copy(deep=True).assign(**encoded)
    # Extract inputs & outputs as numpy arrays
    return numeric_frame[input_cols].to_numpy(), numeric_frame[output_cols].to_numpy()

Read through the [Pandas documentation](https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html) to understand how we're converting categorical variables into numbers.

In [None]:
inputs_array, targets_array = dataframe_to_arrays(dataframe)
inputs_array.shape, inputs_array, targets_array

((1271, 5), array([[46.     ,  0.     , 31.1355 ,  1.     ,  0.     ],
        [29.     ,  1.     , 41.3919 ,  2.     ,  0.     ],
        [56.     ,  0.     , 31.4241 ,  0.     ,  0.     ],
        ...,
        [46.     ,  0.     , 22.1445 ,  2.     ,  0.     ],
        [63.     ,  1.     , 45.87075,  3.     ,  0.     ],
        [60.     ,  0.     , 31.857  ,  1.     ,  0.     ]]), array([[ 9632.724075 ],
        [ 4747.995837 ],
        [13639.531113 ],
        ...,
        [10756.791045 ],
        [18199.5708375],
        [15472.89081  ]]))

**Q: Convert the numpy arrays `inputs_array` and `targets_array` into PyTorch tensors. Make sure that the data type is `torch.float32`.**

In [None]:
inputs = torch.from_numpy(inputs_array).type(torch.float32)
targets = torch.from_numpy(targets_array).type(torch.float32)

In [None]:
inputs.dtype, targets.dtype

(torch.float32, torch.float32)

Next, we need to create PyTorch datasets & data loaders for training & validation. We'll start by creating a `TensorDataset`.

In [None]:
dataset = TensorDataset(inputs, targets)

**Q: Pick a number between `0.1` and `0.2` to determine the fraction of data that will be used for creating the validation set. Then use `random_split` to create training & validation datasets.**

In [None]:
val_percent = 0.2 # between 0.1 and 0.2
val_size = int(num_rows * val_percent)
train_size = num_rows - val_size


train_ds, val_ds = random_split(dataset, [train_size,val_size]) # Use the random_split function to split dataset into 2 parts of the desired length

Finally, we can create data loaders for training & validation.

**Q: Pick a batch size for the data loader.**

In [None]:
batch_size = 100

In [None]:
train_loader = DataLoader(train_ds, batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size)

Let's look at a batch of data to verify everything is working fine so far.

In [None]:
for xb, yb in train_loader:
    print("inputs:", xb)
    print("targets:", yb)
    break

inputs: tensor([[34.0000,  1.0000, 37.9731,  0.0000,  0.0000],
        [18.0000,  1.0000, 25.7631,  0.0000,  0.0000],
        [55.0000,  1.0000, 36.3802,  0.0000,  0.0000],
        [20.0000,  1.0000, 31.1077,  1.0000,  1.0000],
        [18.0000,  0.0000, 34.5543,  0.0000,  0.0000],
        [61.0000,  1.0000, 37.2239,  0.0000,  0.0000],
        [35.0000,  1.0000, 19.8246,  1.0000,  0.0000],
        [49.0000,  0.0000, 47.3748,  2.0000,  0.0000],
        [44.0000,  0.0000, 42.2466,  0.0000,  1.0000],
        [19.0000,  1.0000, 37.8510,  0.0000,  0.0000],
        [55.0000,  1.0000, 42.4908,  0.0000,  0.0000],
        [33.0000,  1.0000, 30.4750,  2.0000,  0.0000],
        [55.0000,  1.0000, 39.1219,  1.0000,  0.0000],
        [29.0000,  1.0000, 32.1623,  1.0000,  0.0000],
        [44.0000,  1.0000, 34.0659,  2.0000,  0.0000],
        [29.0000,  1.0000, 41.3919,  2.0000,  0.0000],
        [40.0000,  1.0000, 27.8388,  0.0000,  0.0000],
        [57.0000,  1.0000, 46.7643,  1.0000,  1.0000],
  

Let's save our work by committing to Jovian.

In [None]:
jovian.commit(project=project_name, environment=None)

[jovian] Detected Colab notebook...[0m
[jovian] Uploading colab notebook to Jovian...[0m
[jovian] Committed successfully! https://jovian.ai/noumanamir453/02-insurance-linear-regression[0m


'https://jovian.ai/noumanamir453/02-insurance-linear-regression'

## Step 3: Create a Linear Regression Model

Our model itself is a fairly straightforward linear regression (we'll build more complex models in the next assignment). 


In [None]:
input_size = len(input_cols)
output_size = len(output_cols)

**Q: Complete the class definition below by filling out the constructor (`__init__`), `forward`, `training_step` and `validation_step` methods.**

Hint: Think carefully about picking a good loss function (it's not cross entropy). Maybe try 2-3 of them and see which one works best. See https://pytorch.org/docs/stable/nn.functional.html#loss-functions

In [None]:
class InsuranceModel(nn.Module):
    """Linear regression model with the hooks used by the training loop.

    A single ``nn.Linear`` layer maps ``input_size`` features to
    ``output_size`` targets (both are module-level values that must be
    defined before an instance is created). Training and validation both
    use the L1 loss (mean absolute error).
    """

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(input_size, output_size)

    def forward(self, xb):
        """Return the linear layer's predictions for the batch ``xb``."""
        return self.linear(xb)

    def _batch_loss(self, batch):
        """Return the L1 loss for an ``(inputs, targets)`` batch."""
        inputs, targets = batch
        return F.l1_loss(self(inputs), targets)

    def training_step(self, batch):
        """Return the loss for one training batch, still attached to the graph."""
        return self._batch_loss(batch)

    def validation_step(self, batch):
        """Return ``{'val_loss': loss}`` for one validation batch.

        The loss tensor is detached, so it carries no gradient information.
        """
        # Validation never backpropagates, so skip building the autograd graph.
        with torch.no_grad():
            loss = self._batch_loss(batch)
        return {'val_loss': loss.detach()}

    def validation_epoch_end(self, outputs):
        """Average the per-batch validation losses into one Python float.

        ``outputs`` is the list of dicts produced by ``validation_step``.
        """
        batch_losses = [x['val_loss'] for x in outputs]
        epoch_loss = torch.stack(batch_losses).mean()   # Combine losses
        return {'val_loss': epoch_loss.item()}

    def epoch_end(self, epoch, result, num_epochs):
        """Print the validation loss every 20th epoch and on the last epoch.

        ``epoch`` is zero-based; the printed epoch number is one-based.
        """
        if (epoch + 1) % 20 == 0 or epoch == num_epochs - 1:
            print(f"Epoch [{epoch + 1}], val_loss: {result['val_loss']:.4f}")

Let us create a model using the `InsuranceModel` class. You may need to come back later and re-run the next cell to reinitialize the model, in case the loss becomes `nan` or `infinity`.

In [None]:
model = InsuranceModel()

Let's check out the weights and biases of the model using `model.parameters`.

In [None]:
list(model.parameters())

[Parameter containing:
 tensor([[-0.1089, -0.3771,  0.2057,  0.4306, -0.1588]], requires_grad=True),
 Parameter containing:
 tensor([0.4260], requires_grad=True)]

One final commit before we train the model.

In [None]:
jovian.commit(project=project_name, environment=None)

[jovian] Detected Colab notebook...[0m
[jovian] Uploading colab notebook to Jovian...[0m
[jovian] Committed successfully! https://jovian.ai/noumanamir453/02-insurance-linear-regression[0m


'https://jovian.ai/noumanamir453/02-insurance-linear-regression'

## Step 4: Train the model to fit the data

To train our model, we'll use the same `fit` function explained in the lecture. That's the benefit of defining a generic training loop - you can use it for any problem.

In [None]:
def evaluate(model, val_loader):
    """Run the model's validation step on every batch and aggregate the results.

    Each batch from ``val_loader`` goes through ``model.validation_step``;
    the collected per-batch results are handed to
    ``model.validation_epoch_end``, whose return value is returned.
    """
    batch_results = list(map(model.validation_step, val_loader))
    return model.validation_epoch_end(batch_results)

def fit(epochs, lr, model, train_loader, val_loader, opt_func=torch.optim.SGD):
    """Train ``model`` for ``epochs`` epochs and return the validation history.

    An optimizer is created from ``opt_func`` over the model's parameters
    with learning rate ``lr``. Every epoch performs one optimizer step per
    batch of ``train_loader``, then evaluates on ``val_loader``, reports the
    result through ``model.epoch_end`` and appends it to the returned list.
    """
    optimizer = opt_func(model.parameters(), lr)
    history = []
    for epoch_idx in range(epochs):
        # Training phase: backpropagate, update the weights, reset gradients
        for batch in train_loader:
            model.training_step(batch).backward()
            optimizer.step()
            optimizer.zero_grad()
        # Validation phase
        metrics = evaluate(model, val_loader)
        model.epoch_end(epoch_idx, metrics, epochs)
        history.append(metrics)
    return history

**Q: Use the `evaluate` function to calculate the loss on the validation set before training.**

In [None]:
result = evaluate(model, val_loader) # Use the the evaluate function
print(result)

{'val_loss': 14452.15625}



We are now ready to train the model. You may need to run the training loop many times, for different number of epochs and with different learning rates, to get a good result. Also, if your loss becomes too large (or `nan`), you may have to re-initialize the model by running the cell `model = InsuranceModel()`. Experiment with this for a while, and try to get to as low a loss as possible.

**Q: Train the model 4-5 times with different learning rates & for different number of epochs.**

Hint: Vary learning rates by orders of 10 (e.g. `1e-2`, `1e-3`, `1e-4`, `1e-5`, `1e-6`) to figure out what works.

In [None]:
epochs = 1000
lr = 2e-1
history1 = fit(epochs, lr, model, train_loader, val_loader)

Epoch [20], val_loss: 7319.7539
Epoch [40], val_loss: 7129.5098
Epoch [60], val_loss: 7121.5366
Epoch [80], val_loss: 7105.4390
Epoch [100], val_loss: 7098.9023
Epoch [120], val_loss: 7094.6851
Epoch [140], val_loss: 7094.0078
Epoch [160], val_loss: 7103.9043
Epoch [180], val_loss: 7081.8970
Epoch [200], val_loss: 7074.4917
Epoch [220], val_loss: 7067.9937
Epoch [240], val_loss: 7085.2788
Epoch [260], val_loss: 7059.6562
Epoch [280], val_loss: 7055.6714
Epoch [300], val_loss: 7056.5195
Epoch [320], val_loss: 7048.4727
Epoch [340], val_loss: 7051.7578
Epoch [360], val_loss: 7039.9937
Epoch [380], val_loss: 7042.7754
Epoch [400], val_loss: 7062.8335
Epoch [420], val_loss: 7029.9136
Epoch [440], val_loss: 7027.0757
Epoch [460], val_loss: 7026.2671
Epoch [480], val_loss: 7035.1479
Epoch [500], val_loss: 7016.8267
Epoch [520], val_loss: 7023.6831
Epoch [540], val_loss: 7010.2104
Epoch [560], val_loss: 7008.4585
Epoch [580], val_loss: 7021.5034
Epoch [600], val_loss: 7002.5640
Epoch [620], v

In [None]:
epochs = 1000
lr = 0.1
history2 = fit(epochs, lr, model, train_loader, val_loader)

Epoch [20], val_loss: 6967.9136
Epoch [40], val_loss: 6958.7695
Epoch [60], val_loss: 6967.5210
Epoch [80], val_loss: 6952.6152
Epoch [100], val_loss: 6959.7070
Epoch [120], val_loss: 6953.4819
Epoch [140], val_loss: 6948.9609
Epoch [160], val_loss: 6958.2603
Epoch [180], val_loss: 6946.0015
Epoch [200], val_loss: 6948.1992
Epoch [220], val_loss: 6952.7778
Epoch [240], val_loss: 6943.4478
Epoch [260], val_loss: 6952.5190
Epoch [280], val_loss: 6941.8950
Epoch [300], val_loss: 6945.8647
Epoch [320], val_loss: 6944.9712
Epoch [340], val_loss: 6937.9868
Epoch [360], val_loss: 6948.8354
Epoch [380], val_loss: 6939.1030
Epoch [400], val_loss: 6937.8613
Epoch [420], val_loss: 6935.1392
Epoch [440], val_loss: 6935.9922
Epoch [460], val_loss: 6937.5215
Epoch [480], val_loss: 6933.0415
Epoch [500], val_loss: 6930.1548
Epoch [520], val_loss: 6932.1660
Epoch [540], val_loss: 6926.7632
Epoch [560], val_loss: 6932.7515
Epoch [580], val_loss: 6928.8276
Epoch [600], val_loss: 6928.4751
Epoch [620], v

In [None]:
epochs = 1000
lr = 0.6
history3 = fit(epochs, lr, model, train_loader, val_loader)

Epoch [20], val_loss: 6892.8384
Epoch [40], val_loss: 6890.4844
Epoch [60], val_loss: 6884.3867
Epoch [80], val_loss: 6892.7739
Epoch [100], val_loss: 7073.9263
Epoch [120], val_loss: 6864.5161
Epoch [140], val_loss: 6864.6968
Epoch [160], val_loss: 6851.6958
Epoch [180], val_loss: 6846.0103
Epoch [200], val_loss: 6850.2734
Epoch [220], val_loss: 7046.2441
Epoch [240], val_loss: 6953.2671
Epoch [260], val_loss: 6907.4761
Epoch [280], val_loss: 6850.9409
Epoch [300], val_loss: 6996.0938
Epoch [320], val_loss: 6843.9395
Epoch [340], val_loss: 6874.0112
Epoch [360], val_loss: 6801.4453
Epoch [380], val_loss: 6821.5391
Epoch [400], val_loss: 6859.4492
Epoch [420], val_loss: 6775.2563
Epoch [440], val_loss: 6803.1621
Epoch [460], val_loss: 6845.9233
Epoch [480], val_loss: 6758.1440
Epoch [500], val_loss: 6848.4507
Epoch [520], val_loss: 6942.0259
Epoch [540], val_loss: 6877.0117
Epoch [560], val_loss: 6757.2407
Epoch [580], val_loss: 6731.1421
Epoch [600], val_loss: 6722.2349
Epoch [620], v

In [None]:
epochs = 1000
lr = 0.2
history4 = fit(epochs, lr, model, train_loader, val_loader)

Epoch [20], val_loss: 6615.6133
Epoch [40], val_loss: 6610.6758
Epoch [60], val_loss: 6659.9380
Epoch [80], val_loss: 6611.9800
Epoch [100], val_loss: 6623.4116
Epoch [120], val_loss: 6611.2886
Epoch [140], val_loss: 6626.7637
Epoch [160], val_loss: 6612.8237
Epoch [180], val_loss: 6612.6055
Epoch [200], val_loss: 6612.4595
Epoch [220], val_loss: 6597.4082
Epoch [240], val_loss: 6605.5195
Epoch [260], val_loss: 6613.7124
Epoch [280], val_loss: 6599.7070
Epoch [300], val_loss: 6613.1157
Epoch [320], val_loss: 6604.4224
Epoch [340], val_loss: 6600.5767
Epoch [360], val_loss: 6580.5532
Epoch [380], val_loss: 6579.5039
Epoch [400], val_loss: 6596.0757
Epoch [420], val_loss: 6585.1758
Epoch [440], val_loss: 6621.8823
Epoch [460], val_loss: 6575.4204
Epoch [480], val_loss: 6591.8110
Epoch [500], val_loss: 6634.9453
Epoch [520], val_loss: 6599.5776
Epoch [540], val_loss: 6587.7754
Epoch [560], val_loss: 6558.9995
Epoch [580], val_loss: 6581.0337
Epoch [600], val_loss: 6566.0874
Epoch [620], v

In [None]:
epochs = 50000
lr = 7e-2
history5 = fit(epochs, lr, model, train_loader, val_loader)

Epoch [20], val_loss: 3964.5769
Epoch [40], val_loss: 3964.1238
Epoch [60], val_loss: 3957.9539
Epoch [80], val_loss: 3956.2927
Epoch [100], val_loss: 3953.0156
Epoch [120], val_loss: 3965.8542
Epoch [140], val_loss: 3962.2546
Epoch [160], val_loss: 3957.1047
Epoch [180], val_loss: 3956.5320
Epoch [200], val_loss: 3969.0959
Epoch [220], val_loss: 3954.2288
Epoch [240], val_loss: 3954.3164
Epoch [260], val_loss: 3962.9160
Epoch [280], val_loss: 3968.5039
Epoch [300], val_loss: 3956.1797
Epoch [320], val_loss: 3963.5930
Epoch [340], val_loss: 3962.6162
Epoch [360], val_loss: 3952.1648
Epoch [380], val_loss: 3948.6331
Epoch [400], val_loss: 3963.4512
Epoch [420], val_loss: 3968.0847
Epoch [440], val_loss: 3960.9006
Epoch [460], val_loss: 3950.8367
Epoch [480], val_loss: 3958.4836
Epoch [500], val_loss: 3956.7852
Epoch [520], val_loss: 3956.8464
Epoch [540], val_loss: 3951.2317
Epoch [560], val_loss: 3946.8740
Epoch [580], val_loss: 3957.2500
Epoch [600], val_loss: 3959.8516
Epoch [620], v

**Q: What is the final validation loss of your model?**

In [None]:
val_loss = evaluate(model, val_loader)
val_loss

{'val_loss': 3851.38671875}

Let's log the final validation loss to Jovian and commit the notebook

In [None]:
# `val_loss` holds the dict returned by `evaluate`, so log the number inside it
# rather than the whole dict.
jovian.log_metrics(val_loss=val_loss['val_loss'])

[jovian] Metrics logged.[0m


In [None]:
jovian.commit(project=project_name, environment=None)

[jovian] Detected Colab notebook...[0m
[jovian] Uploading colab notebook to Jovian...[0m
[jovian] Attaching records (metrics, hyperparameters, dataset etc.)[0m
[jovian] Committed successfully! https://jovian.ai/noumanamir453/02-insurance-linear-regression[0m


'https://jovian.ai/noumanamir453/02-insurance-linear-regression'

## Step 5: Make predictions using the trained model

**Q: Complete the following function definition to make predictions on a single input**

In [None]:
def predict_single(input, target, model):
    """Print the model's prediction for one sample next to its target.

    Args:
        input: Feature tensor for a single sample, without a batch dimension.
        target: Expected value for that sample; it is only printed.
        model: Callable applied to the batched input to get predictions.
    """
    # Add a leading batch dimension of size 1 and feed that batch to the model.
    inputs = input.unsqueeze(0)
    predictions = model(inputs)
    # Take the only row of the batch and detach it from the autograd graph.
    prediction = predictions[0].detach()
    print("Input:", input)
    print("Target:", target)
    print("Prediction:", prediction)

In [None]:
input, target = val_ds[0]
predict_single(input, target, model)

Input: tensor([58.0000,  0.0000, 25.2747,  0.0000,  0.0000])
Target: tensor([13845.5254])
Prediction: tensor(13698.7568)


In [None]:
input, target = val_ds[10]
predict_single(input, target, model)

Input: tensor([49.0000,  1.0000, 40.9035,  0.0000,  0.0000])
Target: tensor([9507.1680])
Prediction: tensor(10641.6943)


In [None]:
input, target = val_ds[23]
predict_single(input, target, model)

Input: tensor([55.0000,  1.0000, 23.8650,  1.0000,  0.0000])
Target: tensor([12626.5928])
Prediction: tensor(12735.9404)


In [None]:
jovian.commit(project=project_name, environment=None)
jovian.commit(project=project_name, environment=None) # try again, kaggle fails sometimes

[jovian] Detected Colab notebook...[0m
[jovian] Uploading colab notebook to Jovian...[0m
[jovian] Attaching records (metrics, hyperparameters, dataset etc.)[0m
[jovian] Committed successfully! https://jovian.ai/noumanamir453/02-insurance-linear-regression[0m
[jovian] Detected Colab notebook...[0m
[jovian] Uploading colab notebook to Jovian...[0m
[jovian] Attaching records (metrics, hyperparameters, dataset etc.)[0m
[jovian] Committed successfully! https://jovian.ai/noumanamir453/02-insurance-linear-regression[0m


'https://jovian.ai/noumanamir453/02-insurance-linear-regression'

In [None]:
torch.save(model.state_dict(), 'insurance-regression.pth')