# **Analysis and prediction of Health Insurance Data**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the full path of every file below the Kaggle input directory,
# then print the paths one per line.
input_paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt

# Importing dataset

In [None]:
# Load the raw health-insurance CSV from the Kaggle input directory; `dataset`
# keeps the original (un-encoded) columns for the rest of the notebook.
dataset = pd.read_csv('/kaggle/input/health-insurance-dataset/Health_insurance.csv')

# Data description

In [None]:
# Preview the first rows to see which columns are available.
dataset.head()

In [None]:
# Summary statistics of the dataset's columns.
dataset.describe()

In [None]:
# Column dtypes and non-null counts.
dataset.info()

In [None]:
# Distinct region values, in order of first appearance.
dataset['region'].unique()

# Preprocessing

We need to convert the categorical features into numerical ones; we will use a label encoder to do the job.

In [None]:
# Encode on a copy so that `dataset` keeps its original string categories
# (the raw frame is still used for plotting below).
df = dataset.copy()
categorical_cols = dataset.select_dtypes('object').columns
for col_name in categorical_cols:
    # A fresh encoder per column: each column gets its own integer codes.
    encoder = LabelEncoder()
    df[col_name] = encoder.fit_transform(dataset[col_name])

df.head()

# EDA

Let's start with basic visualization, first we will look at age vs charges graph

In [None]:
# Mean charges per age (seaborn aggregates repeated x values).
age_ax = sns.lineplot(data=df, x='age', y='charges')
age_ax.set(title='Age vs Charges')

Charges increase with age, but the graph doesn't look linear; some other feature is affecting the charges.

Let us try to identify them

First let us look at the smoker feature

In [None]:
# Fraction of rows per smoker class, most frequent class first.
smoker_fractions = df.smoker.value_counts() / len(df)
val = list(smoker_fractions)
print(val)

In [None]:
# Half-donut chart of the smoker / non-smoker split.
#
# Labels and percentages are derived from the data so each legend entry is
# guaranteed to describe its own wedge: value_counts() orders classes by
# frequency, so a hard-coded label list can silently end up swapped.
# NOTE(review): the narrative below quotes the previously hard-coded
# percentages; re-check it against this legend.
smoker_share = dataset['smoker'].value_counts() / len(dataset)
labels = [f"{str(status).capitalize()} ({share:.1%})" for status, share in smoker_share.items()]
# One colour per class plus white for the filler wedge (assumes two classes).
colors = ['red', 'blue', 'white']

# A white filler wedge as large as all real wedges together hides half of the
# circle, which turns the pie into a half-donut. It is built as a NEW list so
# that `val` from the previous cell is not mutated and this cell can be re-run.
wedge_sizes = list(smoker_share) + [smoker_share.sum()]

fig = plt.figure(figsize=(4,4),dpi=100)
ax = fig.add_subplot(1,1,1)
ax.pie(wedge_sizes, colors=colors)
ax.add_artist(plt.Circle((0, 0), 0.6, color='white'))  # punch the donut hole
plt.legend(labels, loc="best")  # only the real classes get a legend entry
plt.axis('equal')
plt.tight_layout()
plt.show()

dataset is of 79.5% smokers and 20.5% non smokers, let'see if this feature has influence over charges 

In [None]:
# Distribution of charges for each smoker class.
smoker_violin_ax = sns.violinplot(x='smoker', y='charges', data=dataset)
smoker_violin_ax.set(title='Smoker vs Charges')

There's huge variation in charges between smoker and non smoker, charges density for non smoker ends at 20,000 while charges for smoker starts at 20,000

Let's see charges over age for smoker and non smoker

In [None]:
# Age vs charges, one line per smoker class. Columns are referenced by name on
# the raw frame; `age` and `charges` are numeric and identical in `df`.
smoker_age_ax = sns.lineplot(data=dataset, x='age', y='charges', hue='smoker')
smoker_age_ax.set(title='Age vs Charges')

Charges for smoker is very high compared to non smoker

Let's look at smokers region-wise and check its effect on charges.

In [None]:
# Pie chart: how the dataset's smokers are distributed over the regions.
#
# Sizes and labels come from the same value_counts() result, so every slice is
# paired with its own region and the chart follows the data (the sizes used to
# be hard-coded and matched to `unique()` order by hand).
# NOTE(review): assumes smokers are encoded as 'yes' in the raw `smoker`
# column -- confirm with dataset['smoker'].unique().
# NOTE(review): the narrative below quotes a figure read off the previous
# hard-coded chart; re-check it against this output.
smokers_by_region = dataset.loc[dataset['smoker'] == 'yes', 'region'].value_counts()
labels = smokers_by_region.index
sizes = smokers_by_region.values
# value_counts() sorts descending, so the first slice is the largest: pull it
# out of the pie to highlight it.
explode = [0.1 if position == 0 else 0 for position in range(len(sizes))]

fig1, ax1 = plt.subplots()
ax1.pie(sizes, explode = explode,
        labels = labels, autopct ='% 1.1f %%',
        shadow = True, startangle = 180)
ax1.axis('equal')

ax1.set_title('smoker v region')
plt.show()

smokers are 3% more in southeast region

In [None]:
# Individual charges per region, coloured by smoker class.
region_ax = sns.stripplot(x='region', y='charges', hue='smoker', data=dataset)
region_ax.set(title='Region vs Charges')

Smokers in the southeast region pay more than smokers in other regions, but the difference is still not very large.

So let us check bmi vs smokers

In [None]:
# BMI distribution per smoker class, split by sex.
bmi_sex_ax = sns.boxenplot(x='smoker', y='bmi', hue='sex', data=dataset)
bmi_sex_ax.set(title='Sex vs BMI')

There is no significant difference in bmi for smokers and non smokers

Let's take next feature (i.e.,) Children

In [None]:
# Mean charges by number of children, one line per smoker class.
children_charges_ax = sns.lineplot(x='children', y='charges', hue='smoker', data=dataset)
children_charges_ax.set(title='Children vs Charges')

Smokers with more than 3 children pay less than smokers with 3 or fewer children, whereas non-smokers with more than 3 children pay more than their counterparts.

The number of children might affect charges, but there might be another factor contributing to this, so let us look at BMI vs children.

In [None]:
# Mean BMI by number of children, one line per smoker class.
children_bmi_ax = sns.lineplot(x='children', y='bmi', hue='smoker', data=dataset)
children_bmi_ax.set(title='Children vs BMI')

In [None]:
# BMI spread per smoker class, with one box per number of children.
children_bmi_box_ax = sns.boxplot(x='smoker', y='bmi', hue='children', data=dataset)
children_bmi_box_ax.set(title='Children vs BMI')

BMI of smokers with more than 3 children is optimal (18.5 - 24.9) thus the charges are less and not just because of children. 

We can check this by plotting charges vs age for people with at most 3 children.

In [None]:
# Rows for people with at most 3 children.
has_up_to_3_children = dataset['children'] <= 3
children3 = dataset.loc[has_up_to_3_children]
children3_ax = sns.lineplot(data=children3, x='age', y='charges', hue='smoker')
children3_ax.set(title='Age vs Charges for people with 0-3 children')


In [None]:
# Restrict the plot to the group named in the title. The call used to plot the
# whole dataset, so the figure did not show the 3-5 children group at all.
# NOTE(review): the "3-5" in the title implies 5 is the maximum number of
# children in the data; the filter itself has no upper bound.
children3plus = dataset[dataset['children'] >= 3]
sns.lineplot(x='age', y='charges', hue='smoker', data=children3plus).set(title='Age vs Charges for people with 3-5 children')

In [None]:
# Overlay the two children groups on the same axes. The second layer used to be
# the whole dataset, which made the comparison between the groups meaningless.
# The 3-5 subset is built here so this cell does not depend on the previous one.
children3plus = dataset[dataset['children'] >= 3]
sns.lineplot(x='age', y='charges', hue='smoker', data= children3).set(title='Age vs Charges for people with 0-3 & 3-5 children merged');
sns.lineplot(x='age', y='charges', hue='smoker', data= children3plus);

As we can see, charges do not vary much between people, irrespective of the number of children.

Thus we can conclude that the number of children doesn't affect the charges.

Now that we know BMI is the reason for the decrease in charges for smoker, we can plot BMI vs charges

In [None]:
# Charges as a function of BMI, one line per smoker class.
bmi_charges_ax = sns.lineplot(x='bmi', y='charges', hue='smoker', data=dataset)
bmi_charges_ax.set(title='BMI vs charges')

BMI plays a very crucial role in deciding the charges.

Value 30 seems to be the threshold, let us split the dataset with the threshold to verify this

In [None]:
# Split the data at the BMI threshold of 30 into two DISJOINT groups.
# The previous `<= 30` / `>= 30` pair put rows with BMI exactly 30 into both
# groups; they now belong only to the upper group, which also matches the
# "less than 30" title of the first plot.
BMI_THRESHOLD = 30
bmi1 = dataset[dataset['bmi'] < BMI_THRESHOLD]
bmi2 = dataset[dataset['bmi'] >= BMI_THRESHOLD]
sns.catplot(x='age', y='charges', hue='smoker', data= bmi1, kind='point', aspect=3).set(title='Age vs charges for people with BMI less than 30')


In [None]:
# Point plot of charges by age for the upper BMI group, one series per smoker class.
bmi_high_grid = sns.catplot(data=bmi2, x='age', y='charges', hue='smoker', kind='point', aspect=3)
bmi_high_grid.set(title='Age vs charges for people with BMI more than 30')

Both the graphs are almost flat 

We can clearly see that charges for persons with BMI less than 30 ends at 25,000 whereas the charges for persons with BMI more than 30 starts at 35,000 

We have identified the features that influence the charges the most, they are:
1. Smoker
2. Age
3. BMI

We can verify this with Correlation plots

In [None]:
# Pairwise relationships between the raw columns, coloured by smoker class.
sns.pairplot(data=dataset,hue='smoker')

In [None]:
# Correlation matrix of the label-encoded frame.
corr_matrix = df.corr()
corr_ax = sns.heatmap(corr_matrix, cmap='Blues')
corr_ax.set(title='Correlation Heatmap')

From the correlation heatmap we can infer that smoker, age and bmi are the features most strongly correlated with charges.

## Feature Selection

The prediction accuracy can be improved (sometimes) by shrinking or setting some parameters to zero.

In this notebook we will use Best subset selection.

Best subset regression finds for each  𝑚∈{0,1,2,…,𝑘}  the subset of size  𝑚  that gives smallest residual sum of squares (RSS). The question of how to choose  𝑚  involves the tradeoff between bias and variance. This is infeasible for  𝑘>>40.

In [None]:
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error

In [None]:
#Best selection method based on RSS
#some helper function 
def fit_linear_reg(X,y):
    """Fit a linear regression of `y` on `X` and report its in-sample fit.

    All metrics are evaluated on the same data the model was fitted on.

    Returns:
        tuple: (RSS, R_squared, adj_R_squared) -- the residual sum of squares,
        the R^2 score, and R^2 adjusted for the number of columns in `X`.
    """
    n_obs = len(y)
    n_predictors = X.shape[1]

    regressor = LinearRegression()
    regressor.fit(X, y)

    # Mean squared error times the number of observations is the RSS.
    rss = mean_squared_error(y, regressor.predict(X)) * n_obs
    r2 = regressor.score(X, y)
    adj_r2 = 1 - (1 - r2) * (n_obs - 1) / (n_obs - n_predictors - 1)
    return rss, r2, adj_r2

In [None]:
#import tqdm for progress bar 
from tqdm import tqdm_notebook
from tqdm.notebook import tnrange
import itertools

# Target and candidate predictors for best-subset selection.
y = df.charges
X = df.drop(columns='charges')

# Column order of the result table.
subset_columns = ['numb_features', 'RSS', 'R_squared', 'Adj_R_squared', 'features']

# Fit one model for every non-empty subset of predictors: subset sizes run
# from 1 up to the number of columns in X. One record per fitted model.
subset_records = []
for k in tnrange(1,len(X.columns) + 1, desc = 'Loop...'):

    # All combinations of exactly k predictors
    for combo in itertools.combinations(X.columns,k):
        rss, r_squared, adj_r_squared = fit_linear_reg(X[list(combo)],y)
        subset_records.append({
            'numb_features': len(combo),
            'RSS': rss,
            'R_squared': r_squared,
            'Adj_R_squared': adj_r_squared,
            'features': combo,
        })

# Store in DataFrame (explicit columns keep the schema even with no records)
dfs = pd.DataFrame(subset_records, columns=subset_columns)

In [None]:
# For every subset size keep the row(s) with the best score. The aggregations
# are named by string: passing the builtins `min` / `max` to transform() is
# deprecated in recent pandas and raises a FutureWarning.
by_size = dfs.groupby('numb_features')
df_min = dfs[by_size['RSS'].transform('min') == dfs['RSS']]
df_max = dfs[by_size['R_squared'].transform('max') == dfs['R_squared']]
df_max_adj = dfs[by_size['Adj_R_squared'].transform('max') == dfs['Adj_R_squared']]
display(df_min.head())
display(df_max_adj.head(10))

In [None]:
# Attach the best score of each subset size to every row, used below to draw
# the "best subset" line. String aggregation names replace the builtins
# `min` / `max`, whose use in transform() is deprecated in recent pandas.
dfs['min_RSS'] = dfs.groupby('numb_features')['RSS'].transform('min')
dfs['max_R_squared'] = dfs.groupby('numb_features')['R_squared'].transform('max')
dfs['max_Adj_R_squared'] = dfs.groupby('numb_features')['Adj_R_squared'].transform('max')
dfs.head()

In [None]:
# Two panels: every fitted subset as a point, with the best score per subset
# size overlaid as a red line. Left panel: RSS; right panel: adjusted R^2.
fig, (ax_rss, ax_adj) = plt.subplots(1, 2, figsize = (16,6))

ax_rss.scatter(dfs.numb_features,dfs.RSS, alpha = .2, color = 'darkblue' )
ax_rss.plot(dfs.numb_features,dfs.min_RSS,color = 'r', label = 'Best subset')
ax_rss.set_xlabel('# Features')
ax_rss.set_ylabel('RSS')
ax_rss.set_title('RSS - Best subset selection')
ax_rss.legend()

ax_adj.scatter(dfs.numb_features,dfs.Adj_R_squared, alpha = .2, color = 'darkblue' )
ax_adj.plot(dfs.numb_features,dfs.max_Adj_R_squared,color = 'r', label = 'Best subset')
ax_adj.set_xlabel('# Features')
ax_adj.set_ylabel('Adj R squared')  # was misspelled 'Aj R squared'
ax_adj.set_title('Adj_R_squared - Best subset selection')
ax_adj.legend()

plt.show()

From the plots we can select 3 features from the data, which are:
1. smoker
2. age
3. bmi

In [None]:
# Pairwise relationships between the three selected (encoded) features.
sns.pairplot(data=df[['smoker','age','bmi']])

## Model building

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = df[['smoker','age','bmi']]
y = df.charges

# Split first, with a fixed seed so the reported scores are reproducible on
# "Restart & Run All".
# NOTE(review): the scores quoted at the end of the notebook came from an
# unseeded split and will differ from this one.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only and reuse its statistics for the
# test rows, so that no information from the test set leaks into preprocessing.
scaler = StandardScaler()
Xtrain = scaler.fit_transform(Xtrain)
Xtest = scaler.transform(Xtest)

In [None]:
# Fit a linear regression on the training split.
model = LinearRegression() 
model.fit(Xtrain,ytrain)  

## Evaluation

In [None]:
# For a regressor, score() returns the coefficient of determination (R^2), not
# a classification accuracy, so the output is labelled accordingly.
print("R^2 score:")
print(f"\tTrain data: {round(model.score(Xtrain,ytrain)*100,2)}%")
print(f"\tTest data: {round(model.score(Xtest,ytest)*100,2)}%")

In [None]:
# Predicted charges for the held-out rows, side by side with the true values
# (the frame keeps the index of `ytest`).
yhat = model.predict(Xtest)
result_columns = {"Ground Truth": ytest, "Charges Predicted": yhat}
results = pd.DataFrame(result_columns)
results.head()

In [None]:
# Compare the distributions of true and predicted charges.
# `ci` is not a violinplot parameter (older seaborn silently ignores it, newer
# versions forward unknown keywords to matplotlib and fail), so it is dropped.
sns.violinplot(data=results)

In [None]:
# Individual true vs predicted charge values, one strip per column of `results`.
sns.stripplot(data=results)

Our model is able to predict charges up to about 20,000 accurately.

### **Train R² score: 74%**
### **Test R² score: 77%**