# The purpose of this project was to leverage Machine Learning models to build an Insurance prediction model.

The different statistical models used here are:-

 1. Random Forest Regressor
 2. XG Boost Regressor
 3. Gradient Boost Regressor

In [1]:
#Importing the required libraries
import pandas as pd
import numpy as np

#Importing the plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#Importing the statistical libraries
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.metrics import mean_squared_error, r2_score 
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor

import warnings

In [2]:
df = pd.read_csv('insurance.csv')

In [3]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
df.tail()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1333,50,male,30.97,3,no,northwest,10600.5483
1334,18,female,31.92,0,no,northeast,2205.9808
1335,18,female,36.85,0,no,southeast,1629.8335
1336,21,female,25.8,0,no,southwest,2007.945
1337,61,female,29.07,0,yes,northwest,29141.3603


In [5]:
df.shape

(1338, 7)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [7]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [8]:
df['sex'].value_counts()

male      676
female    662
Name: sex, dtype: int64

In [9]:
df['region'].value_counts()

southeast    364
southwest    325
northwest    325
northeast    324
Name: region, dtype: int64

# Encoding the dataset

In [10]:
encoder = LabelEncoder()
labels = encoder.fit_transform(df['sex'])
df['sex'] = labels

In [11]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,yes,southwest,16884.924
1,18,1,33.77,1,no,southeast,1725.5523
2,28,1,33.0,3,no,southeast,4449.462
3,33,1,22.705,0,no,northwest,21984.47061
4,32,1,28.88,0,no,northwest,3866.8552


LABEL ENCODER:

     1 -> Male,
     
     0 -> Female

In [12]:
labels1 = encoder.fit_transform(df['smoker'])
df['smoker'] = labels1

In [13]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,southwest,16884.924
1,18,1,33.77,1,0,southeast,1725.5523
2,28,1,33.0,3,0,southeast,4449.462
3,33,1,22.705,0,0,northwest,21984.47061
4,32,1,28.88,0,0,northwest,3866.8552


LABEL ENCODER:

    1 -> Yes (smoker),
    
    0 -> No (smoker)

In [14]:
labels2 = encoder.fit_transform(df['region'])
df['region'] = labels2

In [15]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


LABEL ENCODER:

0 -> Northeast,

1 -> Northwest,

2 -> Southeast,

3 -> Southwest,

In [16]:
X = df.drop('charges', axis=1)
X.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,0,27.9,0,1,3
1,18,1,33.77,1,0,2
2,28,1,33.0,3,0,2
3,33,1,22.705,0,0,1
4,32,1,28.88,0,0,1


In [17]:
y = df['charges']
y.head()

0    16884.92400
1     1725.55230
2     4449.46200
3    21984.47061
4     3866.85520
Name: charges, dtype: float64

In [18]:
# Splitting the data for analysis
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Random Forest Regressor Model

In [19]:
# Initialising the model
Rf = RandomForestRegressor()

# fitting the model
Rf.fit(X_train, y_train)

# Predict the model
pred = Rf.predict(X_test)

In [20]:
rmse = np.sqrt(mean_squared_error(y_test, pred))
r2 = r2_score(y_test, pred)

print('The rmse score for the Random forest model is:', rmse)
print('The r2 score for the Random forest model is:', r2)

The rmse score for the Random forest model is: 4594.921176267573
The r2 score for the Random forest model is: 0.8640033936945182


In [21]:
results = pd.DataFrame({'Actual': y_test, 'Predicted': pred})
results.head()

Unnamed: 0,Actual,Predicted
764,9095.06825,10520.595923
887,5272.1758,5089.068032
890,29330.98315,27998.923759
1293,9301.89355,10540.631099
259,33750.2918,34673.964175


# XG Boost regressor

In [22]:
# Initialising the model
Xgboost = XGBRegressor()

# fitting the model
Xgboost.fit(X_train, y_train)

# Predict the model
pred1 = Xgboost.predict(X_test)

In [23]:
rmse2 = np.sqrt(mean_squared_error(y_test, pred1))
r2 = r2_score(y_test, pred1)

print('The rmse score for XGBoost Regressor is:', rmse2)
print('The r2 score for XGBoost Regressor is:', r2)

The rmse score for XGBoost Regressor is: 4736.27039640462
The r2 score for XGBoost Regressor is: 0.855507629258488


In [24]:
results2 = pd.DataFrame({'Actual': y_test, 'Predicted': pred1})
results.head()

Unnamed: 0,Actual,Predicted
764,9095.06825,10520.595923
887,5272.1758,5089.068032
890,29330.98315,27998.923759
1293,9301.89355,10540.631099
259,33750.2918,34673.964175


# Gradient Boost Regressor

In [25]:
# Initialising the model
gbr = GradientBoostingRegressor()

# Fitting the model
gbr.fit(X_train, y_train)

# Predicting the results
pred2 = gbr.predict(X_test)

In [26]:
rmse3 = np.sqrt(mean_squared_error(y_test, pred2))
r2 = r2_score(y_test, pred2)

print('The rmse score for Gradient boost regressor is:', rmse3)
print('The r2 score for Gradient boost regressor is:', r2)

The rmse score for Gradient boost regressor is: 4352.538932159728
The r2 score for Gradient boost regressor is: 0.8779726251291786


In [27]:
results3 = pd.DataFrame({'Actual': y_test, 'Predicted': pred2})
results3.head()

Unnamed: 0,Actual,Predicted
764,9095.06825,11001.128629
887,5272.1758,5840.174656
890,29330.98315,28001.980112
1293,9301.89355,9745.291602
259,33750.2918,33639.100981


In [28]:
# Buidling a prediction model
input_data = (30,1,28,2,1,2)
input_data_array = np.asarray(input_data)

# reshaping the data so that it works for only one instance at a time
input_data_reshaped = input_data_array.reshape(1,-1)

prediction = Rf.predict(input_data_reshaped)
prediction2 = Xgboost.predict(input_data_reshaped)
prediction3 = gbr.predict(input_data_reshaped)

print('Predicted Medical Insurance Cost using Random Forest is : ',str(prediction))
print('Predicted Medical Insurance Cost using XGboost is:', str(prediction2))
print('Predicted Medical Insurance Cost using GBR is:', str(prediction3))

Predicted Medical Insurance Cost using Random Forest is :  [19296.186596]
Predicted Medical Insurance Cost using XGboost is: [18855.555]
Predicted Medical Insurance Cost using GBR is: [19129.68695118]


