# Baseline - Decision Support System for Health Insurance

## Import Libraries

In [113]:
import numpy as np 
import pandas as pd 
import os
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

## Import Data

In [114]:
df = pd.read_csv('../Project/insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## Understand Data

In [115]:
df.shape

(1338, 7)

In [116]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


# ...

# 

# Data Preprocessing

# ...

# 

In [117]:
##Converting objects labels into categorical
df[['sex', 'smoker', 'region']] = df[['sex', 'smoker', 'region']].astype('category')
df.dtypes

age            int64
sex         category
bmi          float64
children       int64
smoker      category
region      category
charges      float64
dtype: object

In [118]:
##Converting category labels into numerical using LabelEncoder
from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()
label.fit(df.sex.drop_duplicates())
df.sex = label.transform(df.sex)
label.fit(df.smoker.drop_duplicates())
df.smoker = label.transform(df.smoker)
label.fit(df.region.drop_duplicates())
df.region = label.transform(df.region)
df.dtypes

age           int64
sex           int32
bmi         float64
children      int64
smoker        int32
region        int32
charges     float64
dtype: object

# ...

## Exploratory Data Analysis

# ...

# 

# Model Building

In [127]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [128]:
x = df.drop('charges', axis = 1)
y = df['charges']
x_train, x_test, y_train, y_test= train_test_split(x, y, test_size=0.3, random_state=101)

In [130]:
scale= StandardScaler()
scale.fit(x_train)
x_train= scale.transform(x_train)
x_test= scale.transform(x_test)

## Linear Regression

In [135]:
linearReg = LinearRegression()
linearReg.fit(x_train, y_train)

LinearRegression()

In [136]:
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

In [137]:
y_pred = linearReg.predict(X_test)
y_pred = pd.DataFrame(y_pred)
meanAbsoluteError= metrics.mean_absolute_error(y_test, y_pred)
meanSquareError = metrics.mean_squared_error(y_test, y_pred)
rootMeanSquareError =np.sqrt(meanSquareError)
pd.DataFrame([meanAbsoluteError, meanSquareError, rootMeanSquareError], index=['meanAbsoluteError', 'meanSquareError', 'rootMeanSquareError'], columns=['Metrics'])

Unnamed: 0,Metrics
meanAbsoluteError,205041.5
meanSquareError,44314070000.0
rootMeanSquareError,210509.1


In [140]:
print(linearReg.intercept_)
print(linearReg.coef_)
print(linearReg.score(x_test, y_test))

13463.722554539529
[3381.87452209   32.57376286 2236.97326776  594.42725466 9660.36255796
 -373.9007914 ]
0.7613126015198816


In [139]:
r2_score(y_test, linearReg.predict(x_test))

0.7613126015198816