In [1]:
# Importing the libraries 

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

# Ignore harmless warnings 

import warnings 
warnings.filterwarnings("ignore")

# Set to display all the columns in dataset

pd.set_option("display.max_columns", None)

# Import psql to run queries 

import pandasql as psql

In [2]:

# load the Health Insurance dataset 

HealthIns = pd.read_csv(r"C:\Users\badda\Downloads\Health_Ins_Expenses.csv", header=0)

# Copy to back-up file

HealthIns_BK = HealthIns.copy()

# Display the first 5 records

HealthIns.head()

Unnamed: 0,Record_ID,Age,Gender,BMI,Children,Smoker,Region,Expenses
0,QK-136276906,43,male,36.25,1,yes,southeast,40293.04
1,NR-126120553,40,male,34.56,2,no,southeast,23569.63
2,HY-182936067,48,male,26.04,5,no,southwest,10115.35
3,HF-142445422,50,male,31.09,3,yes,northwest,40736.57
4,NM-183693148,42,male,33.04,1,yes,northeast,39144.85


In [3]:
# Display the dataset information

HealthIns.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6695 entries, 0 to 6694
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Record_ID  6695 non-null   object 
 1   Age        6695 non-null   int64  
 2   Gender     6695 non-null   object 
 3   BMI        6695 non-null   float64
 4   Children   6695 non-null   int64  
 5   Smoker     6695 non-null   object 
 6   Region     6695 non-null   object 
 7   Expenses   6695 non-null   float64
dtypes: float64(2), int64(2), object(4)
memory usage: 418.6+ KB


In [4]:
# Displaying Duplicate values with in dataset

HealthIns_dup = HealthIns[HealthIns.duplicated(keep='last')]

# Display the duplicate records

HealthIns_dup

Unnamed: 0,Record_ID,Age,Gender,BMI,Children,Smoker,Region,Expenses
240,ZZ-131148293,22,female,24.73,1,no,southeast,2216.02
241,BQ-160063838,21,female,37.4,0,no,southeast,1634.08
242,UO-189044287,57,male,43.7,1,no,southwest,11576.13
246,OT-130753432,41,female,30.18,1,no,southeast,6515.36
247,BH-153144132,50,male,36.2,0,no,southwest,8457.82


In [5]:
# Remove the identified duplicate records 

HealthIns = HealthIns.drop_duplicates()

# Display the shape of the dataset

HealthIns.shape

(6690, 8)

In [6]:
# Re-setting the row index

HealthIns = HealthIns.reset_index(drop=True)

# Copy file to back-up file after deletion of duplicate records

HealthIns_BK2 = HealthIns.copy()

In [7]:
HealthIns.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6690 entries, 0 to 6689
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Record_ID  6690 non-null   object 
 1   Age        6690 non-null   int64  
 2   Gender     6690 non-null   object 
 3   BMI        6690 non-null   float64
 4   Children   6690 non-null   int64  
 5   Smoker     6690 non-null   object 
 6   Region     6690 non-null   object 
 7   Expenses   6690 non-null   float64
dtypes: float64(2), int64(2), object(4)
memory usage: 418.3+ KB


In [8]:
HealthIns.nunique()

Record_ID    6690
Age            51
Gender          2
BMI          2227
Children        6
Smoker          2
Region          4
Expenses     6673
dtype: int64

In [9]:
HealthIns.isnull().sum()

Record_ID    0
Age          0
Gender       0
BMI          0
Children     0
Smoker       0
Region       0
Expenses     0
dtype: int64

In [10]:
HealthIns['Gender'].value_counts()

Gender
male      3401
female    3289
Name: count, dtype: int64

In [11]:
HealthIns['Gender'] = HealthIns['Gender'].str.replace('female','0')
HealthIns['Gender'] = HealthIns['Gender'].str.replace('male','1')
HealthIns['Gender'] = HealthIns['Gender'].astype(int)


In [12]:
HealthIns

Unnamed: 0,Record_ID,Age,Gender,BMI,Children,Smoker,Region,Expenses
0,QK-136276906,43,1,36.25,1,yes,southeast,40293.04
1,NR-126120553,40,1,34.56,2,no,southeast,23569.63
2,HY-182936067,48,1,26.04,5,no,southwest,10115.35
3,HF-142445422,50,1,31.09,3,yes,northwest,40736.57
4,NM-183693148,42,1,33.04,1,yes,northeast,39144.85
...,...,...,...,...,...,...,...,...
6685,TS-123508014,64,1,26.40,0,no,northeast,14394.56
6686,CY-192619047,33,0,26.27,0,no,southeast,3763.99
6687,EU-143129392,45,0,28.60,2,no,southeast,8516.83
6688,VG-170585902,58,1,32.53,1,no,northeast,11772.37


In [13]:
HealthIns.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6690 entries, 0 to 6689
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Record_ID  6690 non-null   object 
 1   Age        6690 non-null   int64  
 2   Gender     6690 non-null   int32  
 3   BMI        6690 non-null   float64
 4   Children   6690 non-null   int64  
 5   Smoker     6690 non-null   object 
 6   Region     6690 non-null   object 
 7   Expenses   6690 non-null   float64
dtypes: float64(2), int32(1), int64(2), object(3)
memory usage: 392.1+ KB


In [14]:
HealthIns['Smoker'].value_counts()

Smoker
no     5320
yes    1370
Name: count, dtype: int64

In [16]:
HealthIns['Smoker'] = HealthIns['Smoker'].str.replace('no','0')
HealthIns['Smoker'] = HealthIns['Smoker'].str.replace('yes','1')
HealthIns['Smoker'] = HealthIns['Smoker'].astype(int)
HealthIns['Smoker'].value_counts()

Smoker
0    5320
1    1370
Name: count, dtype: int64

In [17]:
HealthIns['Region'].value_counts()

Region
southeast    1820
southwest    1625
northwest    1625
northeast    1620
Name: count, dtype: int64

In [18]:
HealthIns['Region'] = HealthIns['Region'].str.replace('southeast','0')
HealthIns['Region'] = HealthIns['Region'].str.replace('southwest','1')
HealthIns['Region'] = HealthIns['Region'].str.replace('northwest','2')
HealthIns['Region'] = HealthIns['Region'].str.replace('northeast','3')
HealthIns['Region'] = HealthIns['Region'].astype(int)
HealthIns['Region'].value_counts()

Region
0    1820
1    1625
2    1625
3    1620
Name: count, dtype: int64

In [19]:
HealthIns.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6690 entries, 0 to 6689
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Record_ID  6690 non-null   object 
 1   Age        6690 non-null   int64  
 2   Gender     6690 non-null   int32  
 3   BMI        6690 non-null   float64
 4   Children   6690 non-null   int64  
 5   Smoker     6690 non-null   int32  
 6   Region     6690 non-null   int32  
 7   Expenses   6690 non-null   float64
dtypes: float64(2), int32(3), int64(2), object(1)
memory usage: 339.9+ KB


In [21]:
HealthIns.columns

Index(['Record_ID', 'Age', 'Gender', 'BMI', 'Children', 'Smoker', 'Region',
       'Expenses'],
      dtype='object')

In [22]:
cols = ['Age','BMI','Children','Region']

In [23]:
del HealthIns['Record_ID']

In [24]:
HealthIns.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6690 entries, 0 to 6689
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       6690 non-null   int64  
 1   Gender    6690 non-null   int32  
 2   BMI       6690 non-null   float64
 3   Children  6690 non-null   int64  
 4   Smoker    6690 non-null   int32  
 5   Region    6690 non-null   int32  
 6   Expenses  6690 non-null   float64
dtypes: float64(2), int32(3), int64(2)
memory usage: 287.6 KB


In [25]:
HealthIns.head()

Unnamed: 0,Age,Gender,BMI,Children,Smoker,Region,Expenses
0,43,1,36.25,1,1,0,40293.04
1,40,1,34.56,2,0,0,23569.63
2,48,1,26.04,5,0,1,10115.35
3,50,1,31.09,3,1,2,40736.57
4,42,1,33.04,1,1,3,39144.85


In [26]:
IndepVar = []

for col in HealthIns.columns:
    if col!='Expenses':
        IndepVar.append(col)
TargetVar = 'Expenses'

x = HealthIns[IndepVar]
y = HealthIns[TargetVar]

In [27]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.3,random_state = 42)

x_train.shape,x_test.shape,y_train.shape,y_test.shape

((4683, 6), (2007, 6), (4683,), (2007,))

In [30]:
from sklearn.preprocessing import MinMaxScaler

mmscaler = MinMaxScaler(feature_range=(0,1))

x_train[cols] = mmscaler.fit_transform(x_train[cols])
x_train = pd.DataFrame(x_train)

x_test[cols] = mmscaler.fit_transform(x_test[cols])
x_test = pd.DataFrame(x_test)

In [31]:
from sklearn.linear_model import LinearRegression

ModelRGR = LinearRegression()

ModelRGR.fit(x_train,y_train)

y_pred = ModelRGR.predict(x_test)


from sklearn import metrics

print('Mean Absolute Error(MAE):  ',round(metrics.mean_absolute_error(y_test,y_pred),3))   #here 3 is to end up with 3 decimal values
print('Mean Squared error(MSE):  ',round(metrics.mean_squared_error(y_test,y_pred),3))
print('Root Mean Squared error(RMSE):  ',round(np.sqrt(metrics.mean_squared_error(y_test,y_pred)),3))
print('R2_score:', round(metrics.r2_score(y_test, y_pred),6)) 
print('Mean Absolute Percentage Error (MAPE):', round(metrics.mean_absolute_percentage_error(y_test, y_pred)*100,3), '%')
print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),3))



#def MAPE (y_test, y_pred): 
#    y_test, y_pred = np.array(y_test), np.array(y_pred)
#    return np.mean(np.abs((y_test - y_pred) / y_test)) * 100



r_squared = round(metrics.r2_score(y_test, y_pred),6)
adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
print('Adj R Square: ', adjusted_r_squared)

Mean Absolute Error(MAE):   4267.845
Mean Squared error(MSE):   36595775.897
Root Mean Squared error(RMSE):   6049.444
R2_score: 0.757142
Mean Absolute Percentage Error (MAPE): 44.832 %
Root Mean Squared Log Error (RMSLE): 8.708
Adj R Square:  0.756924


In [35]:
Results = pd.DataFrame({'Expenses_A':y_test,'Expenses_P':y_pred})

ResultsFinal = HealthIns_BK2.merge(Results,left_index = True,right_index = True)

ResultsFinal.head()

Unnamed: 0,Record_ID,Age,Gender,BMI,Children,Smoker,Region,Expenses,Expenses_A,Expenses_P
8,UI-128730277,33,female,29.57,1,no,southeast,4361.26,4361.26,5345.596078
12,YJ-150953209,33,male,28.0,1,no,northwest,4460.67,4460.67,5237.655028
14,VR-165967234,37,female,27.26,1,no,southeast,5018.02,5018.02,5517.141078
15,ZI-160704168,33,male,42.4,5,no,southwest,6666.24,6666.24,12036.513954
17,GV-197150137,61,male,36.1,3,no,southwest,27941.29,27941.29,15720.567756


In [36]:
ResultsFinal['%Error'] = round(((ResultsFinal['Expenses_A']-ResultsFinal['Expenses_P'])/ResultsFinal['Expenses_A'])*100,3)

ResultsFinal.sample(10)

Unnamed: 0,Record_ID,Age,Gender,BMI,Children,Smoker,Region,Expenses,Expenses_A,Expenses_P,%Error
4129,WW-190551843,45,male,38.87,2,yes,southeast,42565.81,42565.81,35393.441212,16.85
5488,KM-153604722,26,male,30.9,2,no,northwest,3877.3,3877.3,5044.831373,-30.112
3113,ZB-157539323,44,female,25.62,2,no,northwest,8032.41,8032.41,7942.341594,1.121
3786,BY-169614822,41,female,32.97,1,no,southwest,6242.41,6242.41,8919.742098,-42.889
2417,GU-143203283,59,female,31.17,0,no,southwest,10717.79,10717.79,12235.182992,-14.158
4196,YL-161189935,58,female,31.04,0,no,northwest,11847.69,11847.69,12318.964338,-3.978
4868,KD-151556096,19,female,17.8,0,no,southwest,1727.79,1727.79,-2452.715685,241.957
4885,AT-160472567,26,male,34.18,4,no,southeast,4522.6,4522.6,6482.4193,-43.334
654,FE-149936235,54,male,42.02,2,yes,southeast,47278.65,47278.65,38748.63916,18.042
3784,YO-179904158,61,male,28.3,1,yes,northwest,28868.66,28868.66,35849.421541,-24.181


In [37]:
del ResultsFinal['Expenses_A']

ResultsFinal.sample(10)

Unnamed: 0,Record_ID,Age,Gender,BMI,Children,Smoker,Region,Expenses,Expenses_P,%Error
2943,WK-145294113,42,male,21.43,3,no,southeast,6672.01,5406.316069,18.97
295,HO-129243456,49,female,38.94,0,no,northeast,7943.45,13272.599198,-67.089
800,RT-162007742,55,male,37.77,0,no,southwest,8815.11,13283.45674,-50.69
422,RU-152531520,30,male,30.83,3,no,southwest,4270.98,6148.591483,-43.962
2304,UP-184098408,60,male,25.56,0,no,northeast,11943.16,10938.510328,8.412
6025,KB-145621200,58,female,33.4,0,no,northwest,12231.61,13158.420847,-7.577
4221,MW-157772081,51,male,36.9,1,no,southwest,8069.73,12494.549193,-54.832
1438,HH-192685014,54,female,31.53,0,no,southwest,9640.26,11121.715163,-15.367
6009,KM-184408110,46,male,26.6,1,no,southeast,7742.11,7210.972301,6.86
4128,PY-139809622,28,female,25.87,0,no,northeast,3052.01,3409.189058,-11.703
