Import Data

In [1]:
# Import libraries. You may or may not use all of these.
!pip install -q git+https://github.com/tensorflow/docs
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for tensorflow-docs (setup.py) ... [?25l[?25hdone
Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.


In [2]:
# Import data
!wget https://cdn.freecodecamp.org/project-data/health-costs/insurance.csv
dataset = pd.read_csv('insurance.csv')
dataset.tail()

--2025-06-14 10:58:56--  https://cdn.freecodecamp.org/project-data/health-costs/insurance.csv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.2.33, 172.67.70.149, 104.26.3.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.2.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 50264 (49K) [text/csv]
Saving to: ‘insurance.csv’


2025-06-14 10:58:56 (4.54 MB/s) - ‘insurance.csv’ saved [50264/50264]



Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
1333,50,male,31.0,3,no,northwest,10600.55
1334,18,female,31.9,0,no,northeast,2205.98
1335,18,female,36.9,0,no,southeast,1629.83
1336,21,female,25.8,0,no,southwest,2007.95
1337,61,female,29.1,0,yes,northwest,29141.36


Data Pre-Processing

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [6]:
# Non-null row count per region, shown for every remaining column.
dataset.groupby(by='region').count()

Unnamed: 0_level_0,age,sex,bmi,children,smoker,expenses
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
northeast,324,324,324,324,324,324
northwest,325,325,325,325,325,325
southeast,364,364,364,364,364,364
southwest,325,325,325,325,325,325


In [19]:
# Work on an independent (deep) copy so the raw `dataset` frame stays untouched.
new_data = dataset.copy(deep=True)

In [20]:
# Show the first five rows of the working copy.
new_data.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [21]:
# Count missing values per column (isna is the same check as isnull).
new_data.isna().sum()

Unnamed: 0,0
age,0
sex,0
bmi,0
children,0
smoker,0
region,0
expenses,0


In [22]:
# Show the first five rows again before encoding the categorical columns.
new_data.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [23]:
# Non-null row count per sex, shown for every remaining column.
new_data.groupby(by='sex').count()

Unnamed: 0_level_0,age,bmi,children,smoker,region,expenses
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
female,662,662,662,662,662,662
male,676,676,676,676,676,676


In [28]:
# Encode sex as an integer flag: male -> 0, female -> 1.
# The explicit astype makes the integer dtype a stated requirement instead of
# relying on replace()'s implicit object -> int downcast (deprecated in
# pandas 2.2). It also fails loudly if an unmapped label is left in the column.
# Re-running the cell is safe: replace() finds nothing to map the second time.
new_data = (
    new_data
    .replace({'sex': {'male': 0, 'female': 1}})
    .astype({'sex': 'int64'})
)
new_data.tail()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
1333,50,0,31.0,3,no,northwest,10600.55
1334,18,1,31.9,0,no,northeast,2205.98
1335,18,1,36.9,0,no,southeast,1629.83
1336,21,1,25.8,0,no,southwest,2007.95
1337,61,1,29.1,0,yes,northwest,29141.36


In [29]:
# Encode smoker as an integer flag: yes -> 1, no -> 0.
# The explicit astype makes the integer dtype a stated requirement instead of
# relying on replace()'s implicit object -> int downcast (deprecated in
# pandas 2.2). It also fails loudly if an unmapped label is left in the column.
# Re-running the cell is safe: replace() finds nothing to map the second time.
new_data = (
    new_data
    .replace({'smoker': {'yes': 1, 'no': 0}})
    .astype({'smoker': 'int64'})
)
new_data.tail()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
1333,50,0,31.0,3,0,northwest,10600.55
1334,18,1,31.9,0,0,northeast,2205.98
1335,18,1,36.9,0,0,southeast,1629.83
1336,21,1,25.8,0,0,southwest,2007.95
1337,61,1,29.1,0,1,northwest,29141.36


In [30]:
# Encode region as integer codes:
# southwest -> 0, southeast -> 1, northwest -> 2, northeast -> 3.
# NOTE(review): these codes impose an arbitrary order on a nominal feature,
# which a linear model will treat as a numeric scale. One-hot encoding would be
# the usual alternative, but it changes the feature layout that later cells
# rely on, so it is deliberately not applied here.
# The explicit astype makes the integer dtype a stated requirement instead of
# relying on replace()'s implicit object -> int downcast (deprecated in
# pandas 2.2). It also fails loudly if an unmapped label is left in the column.
region_codes = {'southwest': 0, 'southeast': 1, 'northwest': 2, 'northeast': 3}
new_data = (
    new_data
    .replace({'region': region_codes})
    .astype({'region': 'int64'})
)
new_data.tail()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
1333,50,0,31.0,3,0,2,10600.55
1334,18,1,31.9,0,0,3,2205.98
1335,18,1,36.9,0,0,1,1629.83
1336,21,1,25.8,0,0,0,2007.95
1337,61,1,29.1,0,1,2,29141.36


Splitting data for training and testing

In [34]:
# Separate the target (expenses) from the model inputs (every other column).
y = new_data['expenses']
x = new_data.drop(columns=['expenses'])

In [52]:
# Hold out 15% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.15,
    random_state=12,
)

In [53]:
# Print the shape of each split, one per line, as a sanity check.
print(
    *(part.shape for part in (x_train, x_test, y_train, y_test)),
    sep="\n",
)


(1137, 6)
(201, 6)
(1137,)
(201,)


Selecting Model

In [54]:
# Ordinary least-squares linear regression with scikit-learn's default settings.
model = LinearRegression()

In [55]:
# Fit on the training split only; x_test / y_test are used later for evaluation.
model.fit(x_train, y_train)

Evaluating Model

In [56]:
# Predict expenses for the training split and the held-out test split.
y_train_pred, y_test_pred = (
    model.predict(features) for features in (x_train, x_test)
)

In [57]:
# Compute MSE and R2 for each split: training first, then the held-out test set.
mse_train, r2_train = (
    metric(y_train, y_train_pred) for metric in (mean_squared_error, r2_score)
)
mse_test, r2_test = (
    metric(y_test, y_test_pred) for metric in (mean_squared_error, r2_score)
)

In [58]:
# Report each metric on its own line, labelled by metric and split.
score_report = (
    ("MSE Train:", mse_train),
    ("MSE Test:", mse_test),
    ("R2 Train:", r2_train),
    ("R2 Test:", r2_test),
)
for label, value in score_report:
    print(label, value)

MSE Train: 35289284.04955867
MSE Test: 43702493.01704442
R2 Train: 0.7587751490930728
R2 Test: 0.704366731572654


Input Predictions

In [66]:
# Predict expenses for one hand-entered record.
# Values must follow the training column order (x.columns) and the integer
# encodings applied above: age, sex, bmi, children, smoker, region.
input_data = (18, 1, 31.9, 0, 0, 3)
input_data_arr = np.array(input_data)
# reshape(1, -1): a single sample with as many features as values supplied.
input_data_reshape = input_data_arr.reshape(1, -1)

# The model was fitted on a DataFrame, so pass a DataFrame with the same column
# labels. A bare ndarray triggers scikit-learn's "X does not have valid feature
# names" warning and bypasses its column-name consistency check.
input_frame = pd.DataFrame(input_data_reshape, columns=x.columns)

prediction = model.predict(input_frame)
prediction

array([3380.32657387])