# Imports

In [312]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

from tqdm import tqdm_notebook
from tqdm.notebook import tnrange
import itertools

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

import warnings


warnings.filterwarnings("ignore")

### Loading the Dataset

In [313]:
data = pd.read_csv('../input/health-insurance-dataset/Health_insurance.csv')

data.head()

# We can see that this dataset has 7 columns, in which it has 3 nominal categorical data and the rest are numbers.

### Shape of the Data

In [314]:
data.shape

# We can see the shape of the dataset, i.e. 1338 rows and 7 columns

### Features present in the Dataset

In [315]:
data.columns

# Here, in this dataset, "Charges" is the target variable.

### Checking for NULL values

In [316]:
data.isnull().sum()

# Here, we can see that the dataset has "no" NULL values.

### Information regarding the Dataset

In [317]:
data.info()

### Descriptive statistics of the Data

In [318]:
data.describe().T

### Data Types of the Data

In [319]:
data.dtypes

In [321]:
for i in ['sex','smoker', 'region']:
    print(f"{data[i].value_counts()}\n\n")

In [322]:
data_1 = data.copy()

# EDA

In [324]:
numerical = ['age', 'bmi', 'children', 'charges']
categorical = ['sex', 'smoker', 'region']

### Age

In [402]:
sns.set(style='whitegrid', palette="deep", font_scale=1.1, rc={"figure.figsize": [15, 5]})
sns.distplot(data['age']).set(title="Density plot of Age.")
plt.show()

#### **Inference**

* The data is slightly skewed
* The figure has 2 peaks

In [403]:
sns.set(style='whitegrid', palette="deep", font_scale=1.1, rc={"figure.figsize": [15, 5]})
sns.boxplot(data.age).set(title="Box plot of Age.")
plt.show()

#### **Inference**

* There are no outliers in this data

### BMI

In [404]:
sns.set(style='whitegrid', palette="deep", font_scale=1.1, rc={"figure.figsize": [15, 5]})
sns.distplot(x = data.bmi).set(title="Density plot of BMI.")
plt.axvline(data.bmi.mean(),color='b')
plt.axvline(data.bmi.median(),color='r')
plt.show()

#### **Inference**

* This is normally distributed

In [405]:
sns.set(style='whitegrid', palette="deep", font_scale=1.1, rc={"figure.figsize": [15, 5]})
sns.boxplot(x = data.bmi).set(title="Box plot of BMI.")
plt.show()

##### **Inference**:
*    BMI contains outliers on the higher range.  
*    Its density plot will be skewed on the right.

### Number of Children

In [400]:
sns.set(style='whitegrid', palette="deep", font_scale=1.1, rc={"figure.figsize": [15, 5]})
sns.histplot(data['children'], bins=20).set(xlabel='Number of children', title="Histogram plot of Number of children of the Insuree.")
plt.show()

In [394]:
sns.set(style='whitegrid', palette="deep", font_scale=1.1, rc={"figure.figsize": [15, 5]})
sns.boxplot(data.children).set(title="Box plot of Children.")
plt.show()

#### **Inference**

* We can see most of the people have no children

### Charges

In [408]:
sns.set(style='whitegrid', font_scale=1.1, rc={"figure.figsize": [15, 5]})
sns.distplot(
    data['charges'], kde=True, bins=20, hist_kws={"alpha": 1}
).set(xlabel='Charges', title="Density plot of Charges.")
plt.show()

#### **Inference**

* This plot is heavily skewed to the left

In [387]:
sns.set(style='whitegrid', palette="deep", font_scale=1.1, rc={"figure.figsize": [15, 5]})
sns.boxplot(data.charges).set(title="Box plot for Charges.")
plt.show()

#### **Inference**

* Charges contains outliers

### Gender

In [392]:
sns.set(style='whitegrid', palette="deep", font_scale=1.1, rc={"figure.figsize": [15, 5]})
sns.countplot(data['sex']).set(title="Count plot of Males and Females.")
plt.show()

In [386]:
sns.set(style='whitegrid', palette="deep", font_scale=1.1, rc={"figure.figsize": [12, 12]})
plt.pie(data['sex'].value_counts().tolist(), labels = ['male', 'female'], explode= [0.1,0], autopct= '%.2f', shadow= True)
plt.title("Pie chart of Males and Females.")
plt.show()

#### **Inference**

* The dataset is almost evenly distributed

### Smokers

In [410]:
sns.set(style='whitegrid', palette="deep", font_scale=1.1, rc={"figure.figsize": [15, 5]})
sns.countplot(data['smoker'], hue = data.sex).set(title="Count plot of Smokers and Non-Smokers(Gender-wise).")
plt.show()

In [385]:
sns.set(style='whitegrid', palette="deep", font_scale=1.1, rc={"figure.figsize": [12, 12]})
plt.pie(data['smoker'].value_counts().tolist(), labels = ['Smokers', 'Non-Smokers'], explode= [0.1,0], autopct= '%.2f', shadow= True)
plt.title("Pie chart of Smokers and Non-Smokers.")
plt.show()

In [337]:
data.groupby('smoker')['charges'].max()

#### **Inference**

* Charges is high for people who smoke
* The dataset contains more number of Non Smokers

### Regions

In [380]:
sns.set(style='whitegrid', palette="deep", font_scale=1.1, rc={"figure.figsize": [15, 5]})
sns.countplot(data['region']).set(title="Count plot of all the regions in the dataset.")
plt.show()

In [384]:
sns.set(style='whitegrid', palette="deep", font_scale=1.1, rc={"figure.figsize": [12, 12]})
plt.pie(data['region'].value_counts().tolist(), labels = ['southwest','southeast','northwest','northeast'], explode= [0.1,0,0,0], autopct= '%.2f', shadow= True)
plt.title("Pie chart of all the regions.")
plt.show()

#### **Inference**

* All the regions are replresented evenly

### Scatter Plots

In [434]:
sns.set(style='whitegrid', palette="deep", font_scale=1.1, rc={"figure.figsize": [12, 12]})
sns.scatterplot(x = data.bmi,y = data.charges, hue = data.smoker).set(title="Scatter plot of BMI vs Charges.")
plt.show()

# Scatter plot of BMI vs Charges

#### **Inference**

* We can see that Smokers have high BMI and have high charges and Non-Smokers have less BMI and have less charges.

In [436]:
sns.set(style='whitegrid', palette="deep", font_scale=1.1, rc={"figure.figsize": [12, 12]})
sns.scatterplot(x = data.age,y = data.charges,hue=data.region).set(title="Scatter plot of Charges vs Age (Region-wise).")
plt.show()

# Here we can see the scatter plot of Charges vs Age, region wise. 

#### **Inference**

* We can't see anything strange.

### Pair plot

In [426]:
sns.set(palette="deep", font_scale=1.1, rc={"figure.figsize": [15, 15]})
t = sns.pairplot(data, hue = 'smoker')
t.fig.suptitle("Pair plot of all the numerical columns.", y = 1.01)
plt.show()

# This is the pair plot of all the numerical columns in our dataset

#### **Inference**

* Charges and Age are positively correlated.
* 

### Correlation plot

In [423]:
sns.set(style='whitegrid', palette="deep", font_scale=1.1, rc={"figure.figsize": [7, 7]})
sns.heatmap(data.corr(), annot= True).set(title="Heat map of the correlation matrix.")
plt.show()

#### **Inference**

* Age and Charges are comparitively weakly correlated to each other
* After Age, BMI is correlated to Charges

# Data wrangling

In [347]:
# As we saw earlier in the dataset, it had three nominal data, we will use one hot encoding to convert the string data to numbers.

data_1 = pd.get_dummies(data_1)

In [348]:
data_1

# Now we have got all the columns as quantitative data.

In [349]:
(data_1.columns )

# Feature Selection

In [350]:
# Using Best-Subset Selection

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

def fit_linear_reg(X,y):
    # fit linear regression model and return RSS and r squared
    model_k = LinearRegression()
    model_k.fit(X,y)
    RSS = mean_squared_error(y,model_k.predict(X))*len(y) 
    R_squared = model_k.score(X,y) 
    adj_R_squared = 1 - (1-R_squared)*(len(y)-1)/(len(y)-X.shape[1]-1)
    return RSS, R_squared, adj_R_squared 



In [351]:
# tqdm for progress bar 
from tqdm import tqdm_notebook
from tqdm.notebook import tnrange
import itertools


m = 11

RSS_list, R_squared_list, feature_list = [], [], []
adj_R_squared_list = []
numb_features = []

#looping over m=1 to m=11 features in X
#Looping over k = 1 to k = 11 features in X
for k in tnrange(1,len(X.columns) + 1, desc = 'Loop...'):

    #Looping over all possible combinations: from 11 choose k
    for combo in itertools.combinations(X.columns,k):
        tmp_result = fit_linear_reg(X[list(combo)],y)   #Store temp result 
        RSS_list.append(tmp_result[0])                  #Append lists
        R_squared_list.append(tmp_result[1])
        adj_R_squared_list.append(tmp_result[2])
        feature_list.append(combo)
        numb_features.append(len(combo))   

# Store in DataFrame
df = pd.DataFrame({'numb_features': numb_features,'RSS': RSS_list, 'R_squared':R_squared_list,'Adj_R_squared': adj_R_squared_list, 'features':feature_list})

In [352]:
df_min = df[df.groupby('numb_features')['RSS'].transform(min) == df['RSS']]
df_max = df[df.groupby('numb_features')['R_squared'].transform(max) == df['R_squared']]
df_max_adj = df[df.groupby('numb_features')['Adj_R_squared'].transform(max) == df['Adj_R_squared']]
display(df_min.head())
display(df_max_adj.head())

In [353]:
df = df.sort_values(by=['Adj_R_squared'], ascending= False)
display(df)

In [354]:
selected_features = list(df.iloc[0][4])

print(selected_features)

# Modelling

In [355]:
from sklearn.model_selection import train_test_split

y = data_1['charges']
X = data_1[selected_features]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=26)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [356]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [357]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)

print(f"Training dataset R2 score:\t{round(model.score(X_train, y_train)*100, 3)}%")
print(f"Test dataset R2 score:\t\t{round(model.score(X_test, y_test)*100, 3)}%")



##### We can see that
* *R^2* value of the *Training data* is **75.526%** 
* *R^2* value of the *Test data* is **73.156%**