In [1]:
import sqlalchemy as sa
import pandas as pd
import numpy as np
import sklearn

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score



import pickle
import json

### 2. Load data

In [2]:
# Read the insurance dataset from a CSV file in the working directory
# and display the resulting frame.
df = pd.read_csv("medical_insurance.csv")
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [3]:
# Column dtypes, non-null counts and memory usage of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [4]:
# Cast the column labels to str.
# NOTE(review): the info() output before and after this cell is identical,
# so this looks like a safeguard only; confirm it is still needed.
df.columns = df.columns.astype(str)

In [5]:
# Re-check the schema after the column-label cast.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [6]:
# Summary statistics of the numeric columns.
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [7]:
# Element-wise missing-value mask.
# NOTE(review): the displayed frame is truncated, so it cannot by itself
# confirm there are no nulls; the non-null counts from df.info() do that.
df.isnull()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...
1333,False,False,False,False,False,False,False
1334,False,False,False,False,False,False,False
1335,False,False,False,False,False,False,False
1336,False,False,False,False,False,False,False


### 3. EDA & Feature Engineering

### 3.2 sex column

In [8]:
# Inspect the raw 'sex' values before encoding.
df['sex']

0       female
1         male
2         male
3         male
4         male
         ...  
1333      male
1334    female
1335    female
1336    female
1337    female
Name: sex, Length: 1338, dtype: object

In [9]:
# Encode 'sex' as a binary integer feature: male -> 1, female -> 0.
# The result is assigned back to the frame instead of calling
# replace(..., inplace=True) on the column selection: that chained in-place
# form warns on pandas 2.x and leaves `df` untouched under Copy-on-Write.
# The explicit astype(int) makes the dtype independent of replace()'s
# downcasting and fails loudly if an unmapped or missing value is present.
# Re-running the cell is safe: already-encoded values are left as they are.
df['sex'] = df['sex'].replace({'male': 1, 'female': 0}).astype(int)


In [10]:
# Display the frame to verify the 'sex' encoding.
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.900,0,yes,southwest,16884.92400
1,18,1,33.770,1,no,southeast,1725.55230
2,28,1,33.000,3,no,southeast,4449.46200
3,33,1,22.705,0,no,northwest,21984.47061
4,32,1,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,1,30.970,3,no,northwest,10600.54830
1334,18,0,31.920,0,no,northeast,2205.98080
1335,18,0,36.850,0,no,southeast,1629.83350
1336,21,0,25.800,0,no,southwest,2007.94500


### 3.5 smoker column

In [11]:
# Inspect the raw 'smoker' values before encoding.
df['smoker']

0       yes
1        no
2        no
3        no
4        no
       ... 
1333     no
1334     no
1335     no
1336     no
1337    yes
Name: smoker, Length: 1338, dtype: object

In [12]:
# Encode 'smoker' as a binary integer feature: yes -> 1, no -> 0.
# The result is assigned back to the frame instead of calling
# replace(..., inplace=True) on the column selection: that chained in-place
# form warns on pandas 2.x and leaves `df` untouched under Copy-on-Write.
# The explicit astype(int) makes the dtype independent of replace()'s
# downcasting and fails loudly if an unmapped or missing value is present.
# Re-running the cell is safe: already-encoded values are left as they are.
df['smoker'] = df['smoker'].replace({'yes': 1, 'no': 0}).astype(int)

In [13]:
# Display the frame to verify the 'smoker' encoding.
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.900,0,1,southwest,16884.92400
1,18,1,33.770,1,0,southeast,1725.55230
2,28,1,33.000,3,0,southeast,4449.46200
3,33,1,22.705,0,0,northwest,21984.47061
4,32,1,28.880,0,0,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,northwest,10600.54830
1334,18,0,31.920,0,0,northeast,2205.98080
1335,18,0,36.850,0,0,southeast,1629.83350
1336,21,0,25.800,0,0,southwest,2007.94500


### 3.6 region column

In [14]:
# Inspect the raw 'region' values before encoding.
df['region']

0       southwest
1       southeast
2       southeast
3       northwest
4       northwest
          ...    
1333    northwest
1334    northeast
1335    southeast
1336    southwest
1337    northwest
Name: region, Length: 1338, dtype: object

In [15]:
# Distinct region categories (these become the one-hot columns).
df['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [16]:
# One-hot encode 'region': the column is replaced by one 0/1 integer
# indicator column per category, and the widened frame is displayed.
df = pd.get_dummies(data=df, columns=['region'], dtype=int)
df


Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.900,0,1,16884.92400,0,0,0,1
1,18,1,33.770,1,0,1725.55230,0,0,1,0
2,28,1,33.000,3,0,4449.46200,0,0,1,0
3,33,1,22.705,0,0,21984.47061,0,1,0,0
4,32,1,28.880,0,0,3866.85520,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,10600.54830,0,1,0,0
1334,18,0,31.920,0,0,2205.98080,1,0,0,0
1335,18,0,36.850,0,0,1629.83350,0,0,1,0
1336,21,0,25.800,0,0,2007.94500,0,0,0,1


In [17]:
# Pairwise correlation matrix over all (now numeric) columns.
df.corr()

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
age,1.0,-0.020856,0.109272,0.042469,-0.025019,0.299008,0.002475,-0.000407,-0.011642,0.010016
sex,-0.020856,1.0,0.046371,0.017163,0.076185,0.057292,-0.002425,-0.011156,0.017117,-0.004184
bmi,0.109272,0.046371,1.0,0.012759,0.00375,0.198341,-0.138156,-0.135996,0.270025,-0.006205
children,0.042469,0.017163,0.012759,1.0,0.007673,0.067998,-0.022808,0.024806,-0.023066,0.021914
smoker,-0.025019,0.076185,0.00375,0.007673,1.0,0.787251,0.002811,-0.036945,0.068498,-0.036945
charges,0.299008,0.057292,0.198341,0.067998,0.787251,1.0,0.006349,-0.039905,0.073982,-0.04321
region_northeast,0.002475,-0.002425,-0.138156,-0.022808,0.002811,0.006349,1.0,-0.320177,-0.345561,-0.320177
region_northwest,-0.000407,-0.011156,-0.135996,0.024806,-0.036945,-0.039905,-0.320177,1.0,-0.346265,-0.320829
region_southeast,-0.011642,0.017117,0.270025,-0.023066,0.068498,0.073982,-0.345561,-0.346265,1.0,-0.346265
region_southwest,0.010016,-0.004184,-0.006205,0.021914,-0.036945,-0.04321,-0.320177,-0.320829,-0.346265,1.0


In [18]:
# Only the 'charges' row of the correlation matrix, kept as a one-row frame.
df.corr().loc[['charges']]

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
charges,0.299008,0.057292,0.198341,0.067998,0.787251,1.0,0.006349,-0.039905,0.073982,-0.04321


### VIF (variance inflation factor)

In [19]:
from statsmodels.stats.outliers_influence import variance_inflation_factor


In [20]:
# Variance inflation factor of every predictor column (target excluded),
# collected in the same order as the columns of `x`.
x = df.drop(columns='charges')
vif_values = [
    variance_inflation_factor(x.values, col_idx)
    for col_idx in range(x.shape[1])
]

In [21]:
# Label each VIF with its feature name and display the result.
s1 = pd.Series(vif_values, index = x.columns)
s1

age                  1.016822
sex                  1.008900
bmi                  1.106630
children             1.004011
smoker               1.012074
region_northeast     8.603069
region_northwest     8.636205
region_southeast    11.535195
region_southwest     9.218449
dtype: float64

### MODEL BUILDING

In [22]:
# Features are every column except the target; 'charges' is the target.
x = df.drop(columns='charges')
y = df['charges']

# Hold out 20% of the rows for testing; the fixed seed keeps the shuffled
# split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
    shuffle=True,
)

In [23]:
# Fit a linear regression on the training split.
lr_model = LinearRegression()
lr_model.fit(x_train, y_train)

### On testing data

In [24]:
# Predict charges for the held-out test rows and preview the first five.
y_pred=lr_model.predict(x_test)
y_pred[:5]

array([11169.92711879,  9486.70908541, 38181.12305256, 16266.31328948,
        6914.64800729])

In [25]:
# First five actual test targets, to compare with the predictions.
y_test.iloc[:5]

578      9724.53000
610      8547.69130
569     45702.02235
1034    12950.07120
198      9644.25250
Name: charges, dtype: float64

In [26]:
# Error metrics on the held-out test split.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2score = r2_score(y_test, y_pred)

print("mean squared error:", mse)
print("Root mean squared error:", rmse)
print("mean absolute error:", mae)
print("R2 score :", r2score)

mean squared error: 31827950.22952384
Root mean squared error: 5641.62655885019
mean absolute error: 3933.2726494052376
R2 score : 0.7999876970680433


### On training data

In [27]:
# Predict charges for the training rows and preview the first five.
y_pred_train = lr_model.predict(x_train)
y_pred_train[:5]

array([33590.42222974,  3376.99258877, 30805.63186119,  8212.64131876,
       14206.98092806])

In [28]:
# First five actual training targets, to compare with the predictions.
y_train.iloc[:5]

621     40182.24600
194      1137.46970
240     38511.62830
1168     4670.64000
1192    13019.16105
Name: charges, dtype: float64

In [29]:
# Error metrics on the training split.
# Note: this reuses the metric variable names of the test-split cell, so
# those variables hold the training values after this cell runs.
mse = mean_squared_error(y_train, y_pred_train)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_train, y_pred_train)
r2score = r2_score(y_train, y_pred_train)

print("mean squared error:", mse)
print("Root mean squared error:", rmse)
print("mean absolute error:", mae)
print("R2 score :", r2score)

mean squared error: 37701533.128629126
Root mean squared error: 6140.157418880165
mean absolute error: 4234.551143314701
R2 score : 0.7370262574551634


### Save the trained model

In [30]:
# Serialize the fitted regression model to a pickle file in the working
# directory; the context manager closes the file once the dump is done.
with open('Build_new_model_Medical_insurance.pkl', 'wb') as f:
    pickle.dump(lr_model, f)