Step - 1 : Business Problem Understanding

In [83]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split

Step -2 : Data Understanding



In [84]:
# Load the insurance dataset into a DataFrame; the path is relative to the
# notebook's working directory.
df=pd.read_csv("insurance.csv")

Data Understanding

In [85]:
# (rows, columns) of the freshly loaded data.
df.shape

(1338, 7)

In [86]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   expenses  1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [87]:
# How many records there are for each number of children.
df["children"].value_counts()


children
0    574
1    324
2    240
3    157
4     25
5     18
Name: count, dtype: int64

In [88]:
# Number of records per region.
df["region"].value_counts()

region
southeast    364
southwest    325
northwest    325
northeast    324
Name: count, dtype: int64

In [89]:
# Number of smokers vs. non-smokers.
df["smoker"].value_counts()

smoker
no     1064
yes     274
Name: count, dtype: int64

Exploratory Data Analysis

In [90]:
# Column groups used by the exploratory analysis, split by measurement type.
continuous_features = [
    "age",
    "bmi",
    "expenses",
]
discreate_categorical = [
    "sex",
    "smoker",
    "region",
]
discrete_count = ["children"]

In [91]:
# Summary statistics for the continuous columns.
df[continuous_features].describe()

Unnamed: 0,age,bmi,expenses
count,1338.0,1338.0,1338.0
mean,39.207025,30.665471,13270.422414
std,14.04996,6.098382,12110.01124
min,18.0,16.0,1121.87
25%,27.0,26.3,4740.2875
50%,39.0,30.4,9382.03
75%,51.0,34.7,16639.915
max,64.0,53.1,63770.43


In [92]:
# Count, number of distinct levels, and most frequent level for each categorical column.
df[discreate_categorical].describe()


Unnamed: 0,sex,smoker,region
count,1338,1338,1338
unique,2,2,4
top,male,no,southeast
freq,676,1064,364


In [93]:
# Pairwise correlations between the continuous columns (pandas' default method, Pearson).
df[continuous_features].corr()

Unnamed: 0,age,bmi,expenses
age,1.0,0.109341,0.299008
bmi,0.109341,1.0,0.198576
expenses,0.299008,0.198576,1.0


Step -3 : Data Preprocessing

Data cleaning


In [94]:

# Missing values per column.
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
expenses    0
dtype: int64

In [95]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

1

In [96]:
# Remove exact duplicate rows. Rebinding the result (rather than inplace=True)
# keeps the step explicit and chainable.
df = df.drop_duplicates()

In [97]:
# Re-check the shape after dropping duplicates.
df.shape



(1337, 7)

In [98]:
# Exclude "region" from the modelling data.
# errors="ignore" makes the cell idempotent: re-running it after the column is
# already gone is a no-op instead of a KeyError.
df = df.drop(columns="region", errors="ignore")

Encoding

In [99]:
# Binary-encode the two remaining categorical columns.
# The mapped Series is assigned back to the column: the chained form
# `df[col].replace(..., inplace=True)` operates on a temporary Series, which
# raises a FutureWarning on pandas 2.x and leaves `df` unchanged under
# Copy-on-Write.
df["sex"] = df["sex"].map({"female": 0, "male": 1})
df["smoker"] = df["smoker"].map({"no": 0, "yes": 1})

# `map` yields NaN for any label missing from the dictionaries (including an
# accidental second run on already-encoded data), so fail loudly here.
assert df[["sex", "smoker"]].notna().all().all(), "unmapped label in sex/smoker"

Features (X) and target (Y)

In [100]:
# Separate the regression target from the predictor columns.
Y = df["expenses"]
X = df.drop(columns="expenses")

Train Test Split

In [101]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=9,
)

Step - 4,5 : Modelling & Evaluation

Applying Hyperparameter tuning for Lasso Regression

In [102]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

# Base estimator whose regularisation strength is tuned.
estimator = Lasso()

# Candidate values: every integer alpha from 1 to 99.
param_grid = {"alpha": list(range(1, 100))}

# 5-fold cross-validated grid search, ranking candidates by R^2 on the training set.
model_hp = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    cv=5,
    scoring="r2",
)
model_hp.fit(X_train, Y_train)

# Show the winning alpha.
model_hp.best_params_


{'alpha': 60}

Build the Lasso model using the best hyperparameters

In [103]:
# Modelling: refit Lasso with the alpha chosen by the grid search rather than a
# hand-copied constant, so this cell stays correct if the search result changes.
best_alpha = model_hp.best_params_["alpha"]
lasso_best = Lasso(alpha=best_alpha)
lasso_best.fit(X_train, Y_train)

print("Intercept", lasso_best.intercept_)
print("Coefficients:", lasso_best.coef_)

# Prediction & evaluation on train data
Ypred_train = lasso_best.predict(X_train)
print("Train R2 :", r2_score(Y_train, Ypred_train))

# 5-fold cross-validation on the training set only; the test set stays untouched.
print("CV score: ", cross_val_score(lasso_best, X_train, Y_train, cv=5).mean())

# Prediction & evaluation on test data
Ypred_test = lasso_best.predict(X_test)
print("Test R2: ", r2_score(Y_test, Ypred_test))

Intercept -12045.192067679147
Coefficients: [  264.37213484    -0.           317.0408608    373.19599922
 23621.90440143]
Train R2 : 0.7592042059376203
CV score:  0.753731844759959
Test R2:  0.700892897068879


Final Model (Lasso shrank the `sex` coefficient to zero above, so `sex` is dropped and the model is refit)

In [104]:

# Rebuild the feature matrix from `df` instead of mutating `X`, so the cell can
# be re-run safely (dropping "sex" from an X that already lacks it raises KeyError).
X = df.drop(columns=["expenses", "sex"])
Y = df["expenses"]

# Same split settings as before, so the rows in train/test are unchanged.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=9)

# Modelling: reuse the alpha selected by the earlier grid search (which was run
# on the feature set that still included "sex") instead of a hard-coded value.
lasso_best = Lasso(alpha=model_hp.best_params_["alpha"])
lasso_best.fit(X_train, Y_train)

print("Intercept :", lasso_best.intercept_)
print("Coefficients:", lasso_best.coef_)

# Prediction & evaluation on train data
Ypred_train = lasso_best.predict(X_train)
print("Train R2:", r2_score(Y_train, Ypred_train))
print("CV score :", cross_val_score(lasso_best, X_train, Y_train, cv=5).mean())

# Prediction & evaluation on test data
Ypred_test = lasso_best.predict(X_test)
print("Test R2 :", r2_score(Y_test, Ypred_test))




Intercept : -12045.187463841938
Coefficients: [  264.37194096   317.04095573   373.19607238 23621.90427308]
Train R2: 0.7592042058163877
CV score : 0.7538402453637711
Test R2 : 0.700892917983346


Prediction on New Data

Data

In [105]:
# A single new record, keyed by the raw column names of the training data
# (everything except the target).
input_data = dict(
    age=35,
    sex="male",
    bmi=31.4,
    children=5,
    smoker="yes",
    region="southeast",
)

In [106]:
# Wrap the record in a one-row DataFrame; an explicit index is needed because
# every value in the dict is a scalar.
df_test =pd.DataFrame(input_data,index=[0])
df_test

Unnamed: 0,age,sex,bmi,children,smoker,region
0,35,male,31.4,5,yes,southeast


preprocessing the data

In [107]:
# Apply the training-time preprocessing to the new record.
# The frame is rebuilt from `input_data` so the cell is safe to re-run, and the
# encoded column is assigned rather than using
# `df_test[col].replace(..., inplace=True)`, which modifies a temporary Series
# (not `df_test`) under pandas Copy-on-Write and would hand "yes" to the model.
df_test = (
    pd.DataFrame(input_data, index=[0])
    # The final model was fitted without "region" and "sex".
    .drop(columns=["region", "sex"])
    .assign(smoker=lambda frame: frame["smoker"].map({"no": 0, "yes": 1}))
)

# An unknown smoker label maps to NaN, which predict() cannot handle; fail early.
assert df_test["smoker"].notna().all(), "smoker must be 'yes' or 'no'"

predict

In [108]:
# Predicted expenses for the preprocessed record, using the final Lasso model.
lasso_best.predict(df_test)

array([32650.80111484])

In [109]:
# Sanity check: reproduce the prediction by hand as intercept + coef · features.
# The parameters are read from the fitted model instead of being pasted from an
# earlier printout, which would go stale (and lose precision) whenever the
# model is refit.
features = df_test.iloc[0].to_numpy(dtype=float)
float(features @ lasso_best.coef_ + lasso_best.intercept_)

32650.801114660062