In [183]:
# import all the libraries that are required to build the model
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from  sklearn.metrics import r2_score

In [184]:
# Read the insurance CSV into a pandas DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs
# elsewhere once it is pointed at a local copy of insurance_pre.csv.
insurance_csv = r"C:\Users\Vinoth\Desktop\HOPE AI\Machine Learning\ML_Regression_Assignment_Insurance_Data\DataSet\insurance_pre.csv"
Data = pd.read_csv(insurance_csv)

In [185]:
# Dimensions of the loaded frame as (rows, columns); this run reports 1338 rows and 6 columns
Data.shape

(1338, 6)

In [186]:
# Preview the first five rows of the raw data
Data.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.9,0,yes,16884.924
1,18,male,33.77,1,no,1725.5523
2,28,male,33.0,3,no,4449.462
3,33,male,22.705,0,no,21984.47061
4,32,male,28.88,0,no,3866.8552


# Data Pre-processing

In [187]:
# One-hot encode the two nominal text columns (sex, smoker) with pandas get_dummies.
# drop_first=True keeps a single indicator per column, yielding sex_male and smoker_yes.
Data = pd.get_dummies(data=Data, columns=['sex', 'smoker'], drop_first=True)

In [188]:
# Preview the frame after one-hot encoding (sex/smoker replaced by sex_male/smoker_yes)
Data.head()

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.9,0,16884.924,0,1
1,18,33.77,1,1725.5523,1,0
2,28,33.0,3,4449.462,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.88,0,3866.8552,1,0


In [189]:
# List the column labels after encoding
Data.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [190]:
# Put the target column (charges) last so the feature columns sit together on the left.
ordered_columns = ['age', 'bmi', 'children', 'sex_male', 'smoker_yes', 'charges']
Data = Data[ordered_columns]

In [191]:
# Confirm the new column order, with charges as the last column
Data.head()

Unnamed: 0,age,bmi,children,sex_male,smoker_yes,charges
0,19,27.9,0,0,1,16884.924
1,18,33.77,1,1,0,1725.5523
2,28,33.0,3,1,0,4449.462
3,33,22.705,0,1,0,21984.47061
4,32,28.88,0,1,0,3866.8552


In [192]:
# Show every row that has at least one missing value; an empty result means none are missing.
rows_with_missing = Data.isnull().any(axis='columns')
Data[rows_with_missing]

Unnamed: 0,age,bmi,children,sex_male,smoker_yes,charges


In [193]:
# Same missing-value check written with isna(); pandas documents isnull() as an alias of isna(),
# so this repeats the isnull() check and an empty result again means no row has a missing value.
Data[Data.isna().any(axis=1)]

Unnamed: 0,age,bmi,children,sex_male,smoker_yes,charges


In [194]:
# Separate the predictor columns from the target column (charges).
# NOTE(review): the variable names are inverted relative to the usual convention.
# `dependent` holds the input features (conventionally the independent variables) and
# `independent` holds the target (conventionally the dependent variable). The names are
# kept because the following cells refer to them.
feature_columns = ['age', 'bmi', 'children', 'sex_male', 'smoker_yes']
dependent = Data.loc[:, feature_columns]
independent = Data.loc[:, ['charges']]

In [195]:
# Preview the feature columns that will be fed to the model
dependent.head()

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.9,0,0,1
1,18,33.77,1,1,0
2,28,33.0,3,1,0
3,33,22.705,0,1,0
4,32,28.88,0,1,0


In [196]:
# Standardize the feature columns so they share a common scale, which can help the model.
# NOTE(review): the scaler is fitted on the full data set before the train/test split,
# so statistics of the test rows leak into the training features. Fitting the scaler on
# the training split only (and transforming the test split with it) would avoid this.
Std = StandardScaler()
Std.fit(dependent)
dependent = Std.transform(dependent)


In [197]:
# Inspect the scaled features; fit_transform returned a NumPy array, so the column labels are gone
dependent

array([[-1.43876426, -0.45332   , -0.90861367, -1.0105187 ,  1.97058663],
       [-1.50996545,  0.5096211 , -0.07876719,  0.98959079, -0.5074631 ],
       [-0.79795355,  0.38330685,  1.58092576,  0.98959079, -0.5074631 ],
       ...,
       [-1.50996545,  1.0148781 , -0.90861367, -1.0105187 , -0.5074631 ],
       [-1.29636188, -0.79781341, -0.90861367, -1.0105187 , -0.5074631 ],
       [ 1.55168573, -0.26138796, -0.90861367, -1.0105187 ,  1.97058663]])

In [198]:
# Wrap the scaled NumPy array in a DataFrame again so the columns carry their labels.
scaled_columns = ['age', 'bmi', 'children', 'sex_male', 'smoker_yes']
dependent = pd.DataFrame(dependent, columns=scaled_columns)

In [199]:
# Preview the standardized features with their column labels restored
dependent.head()

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,-1.438764,-0.45332,-0.908614,-1.010519,1.970587
1,-1.509965,0.509621,-0.078767,0.989591,-0.507463
2,-0.797954,0.383307,1.580926,0.989591,-0.507463
3,-0.441948,-1.305531,-0.908614,0.989591,-0.507463
4,-0.513149,-0.292556,-0.908614,0.989591,-0.507463


# Let's create and train the Multiple Linear Regression model

In [200]:
# Hold out 30% of the rows for testing; random_state=0 makes the split repeatable.
X_Train, X_Test, Y_Train, Y_Test = train_test_split(
    dependent,
    independent,
    test_size=0.30,
    random_state=0,
)

In [201]:
# Inspect the training features; the row labels are the original row positions kept by the split
X_Train

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
1163,-1.509965,-0.401646,-0.908614,-1.010519,-0.507463
196,-0.014740,0.350498,-0.908614,-1.010519,-0.507463
438,0.910875,2.638918,3.240619,-1.010519,-0.507463
183,0.341265,-0.697746,-0.908614,-1.010519,-0.507463
1298,-0.441948,-0.526320,0.751079,0.989591,-0.507463
...,...,...,...,...,...
763,-0.869155,-0.760083,-0.908614,0.989591,-0.507463
835,0.198863,0.870519,0.751079,0.989591,-0.507463
1216,0.056461,-0.915925,-0.908614,0.989591,-0.507463
559,-1.438764,0.798339,-0.908614,0.989591,-0.507463


In [202]:
# Build a linear regression estimator and fit it on the training split.
Linear_Regression_Model = LinearRegression()
Linear_Regression_Model.fit(X=X_Train, y=Y_Train)

In [203]:
# Intercept (the constant term c) of the fitted equation y = m1*x1 + ... + m5*x5 + c
Linear_Regression_Model.intercept_

array([13183.95527221])

In [204]:
# Slopes m1..m5 of the fitted equation, one per feature in training-column order
# (age, bmi, children, sex_male, smoker_yes)
Linear_Regression_Model.coef_

array([[3620.73540717, 1957.15236774,  565.86506996,  -20.87298589,
        9450.44277595]])

In [205]:
# Predict charges for the held-out test features.
Predicted_Y = Linear_Regression_Model.predict(X=X_Test)

In [207]:
# Score the test-set predictions against the true targets with the r2_score metric.
R2_Score = r2_score(y_true=Y_Test, y_pred=Predicted_Y)

In [208]:
# R^2 on the test set is about 0.79, i.e. the model explains roughly 79% of the variance in charges.
# (R^2 is a goodness-of-fit measure for regression, not a classification-style accuracy.)
R2_Score

0.7894790349867009

## Conclusion: the multiple linear regression model reaches an R² of about 0.79 on the test set, i.e. it explains roughly 79% of the variance in charges. Let's try a few other regression models to see whether this score can be improved.

In [209]:
# Persist the trained model to disk with pickle.
# NOTE(review): only the regression model is saved. The fitted StandardScaler (Std) is
# not, so new raw inputs cannot be scaled consistently from this file alone.

import pickle

# The with-block closes the file handle (flushing the write) even if dump() raises.
with open(r"C:\Users\Vinoth\Desktop\HOPE AI\Machine Learning\ML_Regression_Assignment_Insurance_Data\Final Models\MLR_Model_Final.sav", 'wb') as model_file:
    pickle.dump(Linear_Regression_Model, model_file)

In [210]:
# Reload the saved model so it can be tried on a single input.
# This reads MLR_Model_Final.sav, the file the save cell writes, so the notebook runs
# end-to-end on a fresh kernel instead of depending on a file from an earlier session.
# The with-block closes the file handle once the model is loaded.
with open(r"C:\Users\Vinoth\Desktop\HOPE AI\Machine Learning\ML_Regression_Assignment_Insurance_Data\Final Models\MLR_Model_Final.sav", 'rb') as model_file:
    Linear_Regression_Model = pickle.load(model_file)

In [211]:
# Predict for one known sample: the first training row.
# iloc[[0]] (double brackets) gives a one-row DataFrame, so the model receives the same
# column names it was fitted with and no scaled values have to be typed in by hand.
Linear_Regression_Model.predict(X_Train.iloc[[0]])



array([[1641.87632063]])

In [212]:
# First row of the training features, for comparison with the single-row prediction above
X_Train.iloc[0]

age          -1.509965
bmi          -0.401646
children     -0.908614
sex_male     -1.010519
smoker_yes   -0.507463
Name: 1163, dtype: float64

In [213]:
# Actual charges for the first training row, to compare against the predicted value
Y_Train.iloc[0]

charges    2200.83085
Name: 1163, dtype: float64