In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


from plotly.offline import init_notebook_mode, iplot, plot 
import plotly.graph_objs as go 
import plotly.express as px
import seaborn as sns
from matplotlib import pyplot as plt
from plotly.subplots import make_subplots
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Objective:**
1. Perform EDA
1. Data Cleanup And Preparation
1. Build Model


In [None]:
# Load the medical insurance premium CSV from the Kaggle input directory
df = pd.read_csv('/kaggle/input/medical-insurance-premium-prediction/Medicalpremium.csv')
# Last expression of the cell: shows (rows, columns) of the loaded frame
df.shape

**Basic Stats about the data:**

In [None]:
# Summary statistics, transposed so each described column becomes a row
df.describe().T

In [None]:
# Number of missing values in each column
df.isna().sum()

# Feature Engineering:

In [None]:
def conditions(s, bmi_normal_range=(19, 25)):
    """Return 1 if the customer row shows any medical condition, else 0.

    A row is flagged (1) when any of the following holds:

    * any of ``Diabetes``, ``BloodPressureProblems``, ``AnyTransplants``,
      ``AnyChronicDiseases``, ``KnownAllergies`` or
      ``HistoryOfCancerInFamily`` is non-zero;
    * ``NumberOfMajorSurgeries`` is greater than 1;
    * ``BMI`` lies outside the half-open interval ``[low, high)`` given by
      ``bmi_normal_range``.

    Parameters
    ----------
    s : mapping or pandas.Series
        One customer record, indexable by the column names listed above.
        Designed to be used with ``DataFrame.apply(conditions, axis=1)``.
    bmi_normal_range : tuple of (low, high), default (19, 25)
        BMI values with ``low <= BMI < high`` are treated as normal.

    Returns
    -------
    int
        1 when the row is flagged, 0 otherwise.
    """
    low, high = bmi_normal_range
    # The BMI test stays last in the `or` chain so it is only evaluated when
    # no other condition has already flagged the row.
    # A NaN BMI fails the range comparison and is therefore flagged as 1.
    flagged = (
        s['Diabetes'] != 0
        or s['BloodPressureProblems'] != 0
        or s['AnyTransplants'] != 0
        or s['AnyChronicDiseases'] != 0
        or s['KnownAllergies'] != 0
        or s['HistoryOfCancerInFamily'] != 0
        or s['NumberOfMajorSurgeries'] > 1
        or not (low <= s['BMI'] < high)
    )
    return 1 if flagged else 0

Adding a few other features that can be derived from the existing data:
1. **Body Mass Index (BMI)**
1. **Overall health status (`anymedicalcondition`):** The customer's overall health status, including BMI. Value 1 is assigned to customers having any medical condition or a BMI outside the normal range; value 0 is assigned in all other cases.

In [None]:
# BMI = Weight / Height**2, scaled by 10000.
# NOTE(review): the 10000 factor suggests Height is in centimetres and Weight in kilograms — confirm against the dataset description.
df['BMI']=(df['Weight']/df['Height']**2)*10000
# Apply conditions() to every row (axis=1) and store the result as the
# 'anymedicalcondition' column, to compare fully fit customers against the rest
df['anymedicalcondition']= df.apply(conditions, axis=1)

# EDA

**Checking if BMI has any impact on Premium Price:**

In [None]:
# Marker-only scatter with premium price on the x axis and BMI on the y axis
bmi_scatter = go.Scatter(x=df['PremiumPrice'], y=df['BMI'], mode='markers')
fig = go.Figure(data=[bmi_scatter])
fig.update_layout(
    title='Premium Price (INR) Vs BMI',
    xaxis_title='Premium Price (INR)',
    yaxis_title='BMI',
)
fig.show()

It's quite clear from the chart that there is no specific pattern indicating either a positive or a negative impact of BMI on Premium Price.

**Looking at Age distribution of Customer by Diabetes condition**

In [None]:
# One row, two panels: age histograms by diabetes status (left) and an age
# box plot per Diabetes value (right)
this_figure = make_subplots(rows=1, cols=2)

# Right panel: reuse the traces of a plotly-express box plot.
# add_trace(..., row=, col=) is used because append_trace is deprecated.
fig1 = px.box(df, x="Diabetes", y="Age")
for box_trace in fig1.data:
    this_figure.add_trace(box_trace, row=1, col=2)

# Left panel: one histogram of Age for each diabetes group
this_figure.add_trace(go.Histogram(x=df.loc[df['Diabetes']!=1,'Age'],name='Age Dist (without Diabetes)'),row=1,col=1)
this_figure.add_trace(go.Histogram(x=df.loc[df['Diabetes']==1,'Age'],name='Age Dist (with Diabetes)'),row=1,col=1)

# Figure title and per-panel axis titles
this_figure.update_layout(title='Age Distribution of Diabetes and Non Diabetic customers')
this_figure.update_xaxes(title='Age (in years)',row=1,col=1)
this_figure.update_xaxes(title='Diabetic?',row=1,col=2)
this_figure.update_yaxes(title='Age',row=1,col=2)
this_figure.update_yaxes(title='Count',row=1,col=1)

this_figure.show()

The histogram above clearly shows that our dataset contains a high number of older customers with diabetes.

In [None]:
def CorrMtx(df, dropDuplicates = True):
    """Draw an annotated seaborn heatmap of the pairwise correlations of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose correlation matrix is plotted. The matrix is computed
        inside this function via ``df.corr()``, so pass the data itself
        rather than an already computed correlation matrix.
    dropDuplicates : bool, default True
        When True, the diagonal and the upper triangle of the matrix are
        masked so each pair of columns is shown only once.

    Returns
    -------
    None
        The heatmap is drawn on a newly created 11x9 matplotlib figure.
    """
    corr = df.corr()

    # Mask the diagonal and upper-right values to exclude duplicate correlations.
    # The builtin `bool` is used as dtype: the `np.bool` alias no longer exists
    # in current NumPy releases.
    mask = None
    if dropDuplicates:
        mask = np.zeros_like(corr, dtype=bool)
        mask[np.triu_indices_from(mask)] = True

    # Set background color / chart style
    sns.set_style(style = 'white')

    # Set up matplotlib figure
    f, ax = plt.subplots(figsize=(11, 9))

    # Diverging colormap built from hues 250 and 10
    cmap = sns.diverging_palette(250, 10, as_cmap=True)

    # Draw the correlation plot; mask=None draws the full matrix
    sns.heatmap(corr, mask=mask, cmap=cmap, annot=True,
                square=True,
                linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

In [None]:
# CorrMtx computes the correlation matrix itself, so it must receive the raw
# data frame; passing df.corr() would plot the correlation of the correlation
# matrix instead of the correlation of the features.
# NOTE(review): re-read the coefficients quoted in the notes below from the regenerated chart.
CorrMtx(df)

**Few key insights that we can observe from above chart are:**
1. Premium Price is highly influenced by the Age of the customer: the correlation coefficient of **+0.85** indicates a very strong positive relationship.
1. Premium Price is also slightly influenced by the number of surgeries the customer has had in the past: the correlation coefficient is **+0.43**.
1. Our engineered feature `anymedicalcondition` also shows a positive correlation of **+0.4** (which is expected, as we derived it from a combination of other features).


*On a side note: Premium Price shows a very slight negative correlation with a few variables, which can be ignored but seems interesting to explore.*

In [None]:
# Age against premium price, coloured by age, with a LOWESS trendline
fig = px.scatter(
    df,
    x="Age",
    y="PremiumPrice",
    color="Age",
    trendline="lowess",
    title='Age Vs Premium Price',
)
fig.show()

In [None]:
# Source figures: scatter with an OLS trendline (left panel) and box plot (right panel)
fig = px.scatter(df, x="NumberOfMajorSurgeries", y="PremiumPrice" , color="NumberOfMajorSurgeries" ,trendline="ols")
fig1 = px.box(df, x="NumberOfMajorSurgeries", y="PremiumPrice")

this_figure = make_subplots(rows=1, cols=2)

# Copy every trace of each source figure into its own panel.
# add_trace(..., row=, col=) is used because append_trace is deprecated.
for panel, source in enumerate((fig, fig1), start=1):
    for trace in source.data:
        this_figure.add_trace(trace, row=1, col=panel)

# The axis titles are applied to both panels
this_figure.update_layout(title='Number of Major Surgeries Vs Premium Price')
this_figure.update_xaxes(title='Number of Major Surgeries')
this_figure.update_yaxes(title='Premium Price (INR)')
this_figure.show()

In [None]:
# Source figures: scatter with an OLS trendline (left panel) and box plot (right panel)
fig = px.scatter(df, x="anymedicalcondition", y="PremiumPrice" , color="NumberOfMajorSurgeries" ,trendline="ols")
fig1 = px.box(df, x="anymedicalcondition", y="PremiumPrice")

this_figure = make_subplots(rows=1, cols=2)

# Copy every trace of each source figure into its own panel.
# add_trace(..., row=, col=) is used because append_trace is deprecated.
for panel, source in enumerate((fig, fig1), start=1):
    for trace in source.data:
        this_figure.add_trace(trace, row=1, col=panel)

# The axis titles are applied to both panels
this_figure.update_layout(title='Medical Condition Vs Premium Price')
this_figure.update_xaxes(title='Medical Condition')
this_figure.update_yaxes(title='Premium Price (INR)')
this_figure.show()

# Model Building

In [None]:
!pip install pycaret
df.drop('anymedicalcondition',axis = 1,inplace=True)
from pycaret.regression import *

In [None]:
random_seed = 69
def data_sampling(dataset, frac: float, random_seed: int):
    """Split ``dataset`` into a random sample and the remaining rows.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data to split.
    frac : float
        Fraction of rows drawn into the first part.
    random_seed : int
        Seed passed to ``DataFrame.sample`` as ``random_state``.

    Returns
    -------
    tuple
        ``(sampled, remainder)``: the sampled rows and the rows whose index
        labels were not sampled, both with a fresh 0..n-1 index.
    """
    picked = dataset.sample(frac=frac, random_state=random_seed)
    # Everything whose index label was not drawn forms the second part
    remainder = dataset.drop(index=picked.index)
    return picked.reset_index(drop=True), remainder.reset_index(drop=True)

In [None]:
# Sample 90% of the rows into df_seen (used for modelling); the remaining rows
# form df_unseen, kept aside for the final prediction check
df_seen, df_unseen = data_sampling(df, 0.9, random_seed)

In [None]:
import inspect

# session_id seeds the experiment so repeated runs are comparable.
setup_kwargs = dict(target='PremiumPrice', feature_selection=True, session_id=random_seed)
# `silent` is only accepted by older PyCaret releases; pass it only when the
# installed setup() still has that parameter, so the cell runs on either version.
if 'silent' in inspect.signature(setup).parameters:
    setup_kwargs['silent'] = True
exp1 = setup(df_seen, **setup_kwargs)

In [None]:
# Estimator IDs to include in the comparison below
models = ['svm','knn','dt','rf','et','ada','gbr','mlp','xgboost','lightgbm']
# Compare only the listed models, with fold=10 and metrics rounded to 4 decimals
compare_models(fold = 10, round = 4, include = models) 

In [None]:
# Re-run the comparison without an include list and keep the three selected models
top3 = compare_models(n_select = 3)
# Tune each of the three models with tune_model's default settings
tuned_top3 = [tune_model(i) for i in top3]
# Stack the tuned models into a single ensemble
stacker = stack_models(tuned_top3)
# NOTE(review): automl presumably picks the best model trained in this session by RMSE,
# which would include `stacker` — confirm against the PyCaret documentation
best_rmse = automl(optimize = 'RMSE')

In [None]:
# Show PyCaret's evaluation output for the model selected above
evaluate_model(best_rmse)

In [None]:
# Score the held-out rows (df_unseen) with the selected model
pred_unseen = predict_model(best_rmse, data = df_unseen)
# Preview the first 10 rows of the prediction output
pred_unseen.head(10)