*Predicting the Insurance cost based on several features related to patient like physical and health metrics*

**Lasso**

**Linear Regression**

**Ridge**

In [1]:
import pandas as pd #Dataframe Manipulation library
import numpy as np #Data Manipulation library

#sklearn modules for Linear Regression
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler,MinMaxScaler,OneHotEncoder,OrdinalEncoder,LabelEncoder

#Libraries for preprocessing via Pipeline and HyperParameter Tuning
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

#Libraries for Plotting 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(color_codes=True)
import plotly.graph_objects as go

In [2]:
# Loading the dataset
df = pd.read_csv("../input/insurance/insurance.csv")

In [3]:
df = df.reset_index()
df = df.dropna()
df.head()

Unnamed: 0,index,age,sex,bmi,children,smoker,region,charges
0,0,19,female,27.9,0,yes,southwest,16884.924
1,1,18,male,33.77,1,no,southeast,1725.5523
2,2,28,male,33.0,3,no,southeast,4449.462
3,3,33,male,22.705,0,no,northwest,21984.47061
4,4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
df.columns

Index(['index', 'age', 'sex', 'bmi', 'children', 'smoker', 'region',
       'charges'],
      dtype='object')

In [5]:
df.describe(include = 'all')

Unnamed: 0,index,age,sex,bmi,children,smoker,region,charges
count,1338.0,1338.0,1338,1338.0,1338.0,1338,1338,1338.0
unique,,,2,,,2,4,
top,,,male,,,no,southeast,
freq,,,676,,,1064,364,
mean,668.5,39.207025,,30.663397,1.094918,,,13270.422265
std,386.391641,14.04996,,6.098187,1.205493,,,12110.011237
min,0.0,18.0,,15.96,0.0,,,1121.8739
25%,334.25,27.0,,26.29625,0.0,,,4740.28715
50%,668.5,39.0,,30.4,1.0,,,9382.033
75%,1002.75,51.0,,34.69375,2.0,,,16639.912515


#### Pipeline for continuous and categorical features

In [6]:
df.dtypes

index         int64
age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

In [7]:
x_train,x_test,y_train,y_test = train_test_split(df.iloc[:,:7],df.iloc[:,-1:],
                                                test_size=0.2,
                                                random_state=41)

*Selecting Numeric and categorical features using List Comprehensions and dtypes*

In [8]:
df_new = df.iloc[:,:7]

#df_new.columns
print("The dataframe information is as below: \n" )
df_new.info()

The dataframe information is as below: 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   index     1338 non-null   int64  
 1   age       1338 non-null   int64  
 2   sex       1338 non-null   object 
 3   bmi       1338 non-null   float64
 4   children  1338 non-null   int64  
 5   smoker    1338 non-null   object 
 6   region    1338 non-null   object 
dtypes: float64(1), int64(3), object(3)
memory usage: 83.6+ KB


In [9]:
df.iloc[:5,-1:]

Unnamed: 0,charges
0,16884.924
1,1725.5523
2,4449.462
3,21984.47061
4,3866.8552


In [10]:
numerical_features = [col for col in df_new.columns if df_new[col].dtypes in ['int64','float64']]
print(f"The Numerical Features are {numerical_features}")

#categorical_features = [col for col in df.columns if df[col].dtypes in ['object']]  OR
categorical_features = [col for col in df_new.columns if col not in numerical_features]
print(f"The Categorical Features are {categorical_features}")

The Numerical Features are ['index', 'age', 'bmi', 'children']
The Categorical Features are ['sex', 'smoker', 'region']


**LASSO**

In [11]:
numerical_transformer = Pipeline(steps=
                                 [('imputer',SimpleImputer(strategy='mean')),
                                  ('scaler',StandardScaler())
                                 ])
categorical_transformer = Pipeline(steps=
                                 [('imputer',SimpleImputer(strategy='constant',fill_value='missing')),
                                  ('scaler',OneHotEncoder(sparse=False,handle_unknown='error'))
                                 ])

preprocessor = ColumnTransformer(transformers=[
    ('num_trans',numerical_transformer,numerical_features),
    ('cat_trans',categorical_transformer,categorical_features)
])

lasso_mdl = Pipeline(steps=
                [
                    ('preprocessor',preprocessor),
                    ('model',Lasso(max_iter=500,random_state=41))
                ])

In [12]:
lasso_mdl.fit(x_train,y_train)
lasso_score = lasso_mdl.score(x_test,y_test)*100
print(f"The score of the Lasso model is: {lasso_score:0.3f}%")
lasso_y_predict = lasso_mdl.predict(x_test)
#y_test = y_test.reset_index()
print(f"\n The test values are: ")
print(y_test[:5])
print(f"\n The predicted values of the Lasso model are: ")
print(lasso_y_predict[:5])

The score of the Lasso model is: 68.482%

 The test values are: 
          charges
1083   4076.49700
1237  12224.35085
519    3857.75925
79     6571.02435
930    2927.06470

 The predicted values of the Lasso model are: 
[ 6401.69901139 12233.68886916  6613.41226134  9447.63852663
 10074.78571932]


**Linear Regression**

In [13]:
numerical_transformer = Pipeline(steps=
                                 [('imputer',SimpleImputer(strategy='median')),
                                  ('scaler',StandardScaler())
                                 ])
categorical_transformer = Pipeline(steps=
                                 [('imputer',SimpleImputer(strategy='most_frequent',fill_value='missing')),
                                  ('scaler',OrdinalEncoder(handle_unknown='error'))
                                 ])

preprocessor = ColumnTransformer(transformers=[
    ('num_trans',numerical_transformer,numerical_features),
    ('cat_trans',categorical_transformer,categorical_features)
])

linear_mdl = Pipeline(steps=
                [
                    ('preprocessor',preprocessor),
                    ('model',LinearRegression())
                ])

In [14]:
linear_mdl.fit(x_train,y_train)
linear_score = linear_mdl.score(x_test,y_test)*100
print(f"The score of the Linear Regression model is: {linear_score:0.3f}%")
linear_y_predict = linear_mdl.predict(x_test)
#y_test = y_test.reset_index()
print(f"\n The test values are: ")
print(y_test[:5])
print(f"\n The predicted values of the Linear model are: ")
print(linear_y_predict[:5])

The score of the Linear Regression model is: 68.494%

 The test values are: 
          charges
1083   4076.49700
1237  12224.35085
519    3857.75925
79     6571.02435
930    2927.06470

 The predicted values of the Linear model are: 
[[ 6221.97192053]
 [12129.61796578]
 [ 6578.68748637]
 [ 9279.67288513]
 [10246.44266928]]


**Ridge**

In [15]:
numerical_transformer = Pipeline(steps=
                                 [('imputer',SimpleImputer(strategy='mean')),
                                  ('scaler',MinMaxScaler())
                                 ])
categorical_transformer = Pipeline(steps=
                                 [('imputer',SimpleImputer(strategy='most_frequent',fill_value='missing')),
                                  ('encoder',OrdinalEncoder(handle_unknown='error'))
#('encoder',LabelEncoder()) This transformer should be used to encode target values, *i.e.* `y`, and
#not the input `X`)
                                 ])

preprocessor = ColumnTransformer(transformers=[
    ('num_trans',numerical_transformer,numerical_features),
    ('cat_trans',categorical_transformer,categorical_features)
])

ridge_mdl = Pipeline(steps=
                [
                    ('preprocessor',preprocessor),
                    ('model',Ridge(alpha=1.0))
                ])

In [16]:
ridge_mdl.fit(x_train,y_train)
ridge_score = ridge_mdl.score(x_test,y_test)*100
print(f"The score of the Ridge model is: {ridge_score:0.3f}%")
ridge_y_predict = ridge_mdl.predict(x_test)
#y_test = y_test.reset_index()
print(f"\n The test values are: ")
print(y_test[:5])
print(f"\n The predicted values of the Ridge Model are: ")
print(ridge_y_predict[:5])

The score of the Ridge model is: 68.615%

 The test values are: 
          charges
1083   4076.49700
1237  12224.35085
519    3857.75925
79     6571.02435
930    2927.06470

 The predicted values of the Ridge Model are: 
[[ 6277.36083609]
 [12132.57119328]
 [ 6618.37752966]
 [ 9276.38323781]
 [10140.01103767]]
