# Importing Libraries<hr>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression

import statsmodels.api as sm

# pipeline and deployment 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Reading Dataset<hr>

In [2]:
# Load the insurance dataset from the working directory and preview the first rows.
df = pd.read_csv('insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
# (rows, columns) of the frame as loaded, before any filtering.
df.shape

(1338, 7)

# Dropping outliers from 'bmi' column <hr>

In [4]:
# Defining a function to check for outliers in a given column using iqr method.

def outliers(df, col, factor=1.5):
    """Report IQR-based outlier bounds for one column of a DataFrame.

    The whiskers sit ``factor`` interquartile ranges below the first quartile
    and above the third quartile. Values strictly outside the whiskers are
    counted as outliers. The bounds and the count are printed and returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to inspect.
    col : str
        Label of a numeric column in ``df``.
    factor : float, default 1.5
        Whisker length expressed as a multiple of the IQR.

    Returns
    -------
    tuple
        ``(lower_whisker, upper_whisker, num_of_outliers)``.
    """
    values = df[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_whisker = q1 - (factor * iqr)
    upper_whisker = q3 + (factor * iqr)
    # NaN compares False on both sides, so missing values are never counted.
    outside = (values < lower_whisker) | (values > upper_whisker)
    num_of_outliers = int(outside.sum())
    print(f'Lower Whisker = {lower_whisker}')
    print(f'Upper Whisker = {upper_whisker}')
    print(f'Number of Outliers = {num_of_outliers}')
    return (lower_whisker, upper_whisker, num_of_outliers)

In [5]:
# Inspect the 'bmi' whiskers and outlier count; the returned tuple is echoed as the cell output.
outliers(df, 'bmi')

Lower Whisker = 13.7
Upper Whisker = 47.290000000000006
Number of Outliers = 9


(13.7, 47.290000000000006, 9)

In [6]:
# Keep the whisker bounds so the following cell can filter 'bmi' on them.
lw, uw, n_outliers = outliers(df, 'bmi')

Lower Whisker = 13.7
Upper Whisker = 47.290000000000006
Number of Outliers = 9


In [7]:
# Keep only the rows whose bmi lies within the whiskers (both bounds inclusive).
within_whiskers = df['bmi'].between(lw, uw)
df = df[within_whiskers]
df.shape

(1329, 7)

# Dropping Duplicate Rows<hr>

In [8]:
# Reassign instead of using `inplace=True`: the result is the same, but the
# mutation is explicit and the frame's lineage stays easy to follow.
df = df.drop_duplicates()

In [9]:
# (rows, columns) after duplicate rows have been removed.
df.shape

(1328, 7)

# Separating Input and output columns<hr>

In [10]:
# The target is the 'charges' column; every other column is a feature.
y = df['charges']
X = df.drop(columns=['charges'])

# Splitting the dataset into train and test<hr>

In [11]:
# Hold out 20% of the rows for testing; the fixed seed keeps the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=24,
)

In [12]:
# Report the shapes of both splits, one pair per print call
# (the '\n' after y_train leaves a blank line between the two groups).
print(f'Shape of X_train: {X_train.shape}', f'Shape of y_train: {y_train.shape}\n', sep='\n')
print(f'Shape of X_test: {X_test.shape}', f'Shape of y_test: {y_test.shape}', sep='\n')

Shape of X_train: (1062, 6)
Shape of y_train: (1062,)

Shape of X_test: (266, 6)
Shape of y_test: (266,)


# Removing and Adjusting y column <hr>

In [13]:
# Locate the rows whose charges exceed 50,000 in each split;
# their index labels are dropped from X and y in the next cell.
train_is_extreme = y_train > 50000
test_is_extreme = y_test > 50000
train_out_index = y_train.loc[train_is_extreme].index
test_out_index = y_test.loc[test_is_extreme].index

In [14]:
# Remove the flagged rows by reassignment rather than `inplace=True`.
# The split objects come straight from train_test_split, and mutating such
# derived frames in place can trigger pandas' SettingWithCopyWarning.
X_train = X_train.drop(index=train_out_index)
y_train = y_train.drop(index=train_out_index)

X_test = X_test.drop(index=test_out_index)
y_test = y_test.drop(index=test_out_index)

In [15]:
# Natural log of the targets; the pipeline is fit on y_train_log and its
# predictions are mapped back with np.exp before scoring.
y_train_log, y_test_log = (np.log(target) for target in (y_train, y_test))

In [16]:
# Preview the training features (the index keeps the original row labels).
X_train.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
569,48,male,40.565,2,yes,northwest
133,19,male,25.555,0,no,northwest
867,57,male,43.7,1,no,southwest
564,18,female,32.12,2,no,southeast
335,64,male,34.5,0,no,southwest


# Steps for running in pipeline<hr>

## Step 1) Simple Imputation

In [17]:
# Columns grouped by the imputation strategy applied to them.
# The order inside each list matters: later steps address columns by position.
mode_col = [
    'smoker',
    'sex',
    'region',
]
median_col = [
    'children',
    'age',
]
mean_col = ['bmi']

In [18]:
# One imputer per column group. The transformed output follows the order of
# this list (mode columns, then median, then mean), not the input frame's order.
imputers = [
    ('mode_imputer', SimpleImputer(strategy='most_frequent'), mode_col),
    ('median_imputer', SimpleImputer(strategy='median'), median_col),
    ('mean_imputer', SimpleImputer(strategy='mean'), mean_col),
]
ct_imputation = ColumnTransformer(transformers=imputers, remainder='passthrough')

In [19]:
# Sanity check: run the imputation step alone to inspect the resulting column order.
ct_imputation.fit_transform(X_train)

array([['yes', 'male', 'northwest', 2.0, 48.0, 40.565],
       ['no', 'male', 'northwest', 0.0, 19.0, 25.555],
       ['no', 'male', 'southwest', 1.0, 57.0, 43.7],
       ...,
       ['no', 'female', 'northwest', 1.0, 56.0, 26.6],
       ['no', 'male', 'northeast', 3.0, 27.0, 32.585],
       ['yes', 'male', 'southeast', 0.0, 61.0, 35.86]], dtype=object)

## Step 2) Encoding

In [20]:
# Positions refer to the imputation output: 0 = smoker, 1 = sex, 2 = region.
encoders = [
    ('ord_enc', OrdinalEncoder(), [0, 1]),
    ('ohe_enc', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), [2]),
]
ct_encoding = ColumnTransformer(transformers=encoders, remainder='passthrough')

In [21]:
# Sanity check: chain imputation and encoding by hand to inspect the encoded matrix.
ct_encoding.fit_transform(ct_imputation.fit_transform(X_train))

array([[1.0, 1.0, 0.0, ..., 2.0, 48.0, 40.565],
       [0.0, 1.0, 0.0, ..., 0.0, 19.0, 25.555],
       [0.0, 1.0, 0.0, ..., 1.0, 57.0, 43.7],
       ...,
       [0.0, 0.0, 0.0, ..., 1.0, 56.0, 26.6],
       [0.0, 1.0, 1.0, ..., 3.0, 27.0, 32.585],
       [1.0, 1.0, 0.0, ..., 0.0, 61.0, 35.86]], dtype=object)

## Step 3) Polynomial Features

In [22]:
# Columns required, as 0-based positions in the encoding output:
# smoker (0), age (7), bmi (last, i.e. 8).
# Positions 1-6 (sex, the one-hot region columns, children) pass through unchanged.

ct_poly = ColumnTransformer(transformers=[
    ('poly_features', PolynomialFeatures(degree=(1,4),
                                         include_bias=False), [0,7,8]),
], remainder='passthrough')

In [23]:
# Sanity check: run the three preprocessing steps by hand and confirm the final feature count.
ct_poly.fit_transform(ct_encoding.fit_transform(ct_imputation.fit_transform(X_train))).shape

(1057, 40)

## Step 4) Defining the Model

In [24]:
# Final estimator of the pipeline; it is fit on the log-transformed charges.
model = LinearRegression()

# Pipeline <hr>

In [25]:
# The same pipeline could be built with make_pipeline; the only difference is
# that make_pipeline does not need the step names to be specified.
pipeline_steps = [
    ('imputation', ct_imputation),
    ('encoding', ct_encoding),
    ('polynom', ct_poly),
    ('scaling', StandardScaler()),
    ('model', model),
]
pipe = Pipeline(steps=pipeline_steps)

In [26]:
# Fit every step on the training split; the target is the log of the charges.
pipe.fit(X_train, y_train_log)

In [27]:
# Predictions are on the log scale because the pipeline was fit on y_train_log.
y_pred_log = pipe.predict(X_test)

In [28]:
# Score on the original scale: undo the log transform before comparing with y_test.
r2_score(y_test, np.exp(y_pred_log))

0.829925864344089

# Exporting the model

In [29]:
import pickle

In [30]:
# Write through a context manager so the file handle is flushed and closed
# as soon as the dump finishes, rather than left open.
with open('insurance_model.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)

<center>- - - - - DONE - - - - - </center>

# Loading the model

In [31]:
# NOTE: only unpickle files you trust - pickle.load can execute arbitrary code.
# The context manager closes the file handle once the model has been read.
with open('insurance_model.pkl', 'rb') as model_file:
    new_model = pickle.load(model_file)

In [32]:
# The reloaded pipeline should reproduce the score obtained before pickling.
r2_score(y_test, np.exp(new_model.predict(X_test)))

0.829925864344089

In [33]:
# Column names (and order) that the pipeline expects in any new input frame.
X_train.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region'], dtype='object')

In [34]:
# Create a single-row frame to check a prediction; bmi and region are left
# missing (np.nan) so the pipeline's imputers have to fill them in.
temp = pd.DataFrame(data=[[44, 'male', np.nan, 2, 'no', np.nan]], columns = X_train.columns)
temp

Unnamed: 0,age,sex,bmi,children,smoker,region
0,44,male,,2,no,


In [35]:
# The pipeline predicts on the log scale, so take the exponent to recover the amount.
# Predict once and reuse the result instead of running the pipeline twice.
temp_pred_log = new_model.predict(temp)
temp_pred_log, np.exp(temp_pred_log)

(array([9.02231611]), array([8285.9460087]))

- From the above, we can predict that the insurance amount for the entered data would be approximately 8,285.95 Rs.