In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
# Read the insurance CSV from the Kaggle input directory and preview the first rows.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/ushealthinsurancedataset/insurance.csv'
)
df.head(n=5)

# EDA of Insurance Data

In [5]:
from pandas_profiling import ProfileReport

In [6]:
# Build the pandas-profiling report for df; as the cell's last expression it is
# shown as the cell output.
ProfileReport(df)

# From the EDA we get a general idea of the data and its correlations
 - age and charges are highly correlated, which is understandable: as age increases, the insurance premium also increases.
 - Being a smoker increases the insurance charges, since smoking can cause health issues.
 - The data has no missing values, which is great.
 - The data has only one duplicate row; we can take care of that during data preprocessing.
 - The smoker feature is not evenly distributed: there are 1064 non-smokers and only 274 smokers. This imbalance could bias the model; we can revisit it after seeing how the model performs and act as needed.
 - Now we can prepare the data for modeling.

# Data Preparation

In [9]:
# Summarize column dtypes and non-null counts before any type conversion.
df.info()

In [10]:
# Drop the duplicated row reported by the profiling step, then store the three
# text columns as pandas categoricals. The chain builds a new frame instead of
# mutating columns one by one, so re-running this cell gives the same result.
df = (
    df
    .drop_duplicates()
    .reset_index(drop=True)  # keep a contiguous 0..n-1 index after the drop
    .astype({'sex': 'category', 'smoker': 'category', 'region': 'category'})
)

In [11]:
# Re-check the dtypes to confirm sex, smoker and region are now 'category'.
df.info()

In [14]:
# One-hot encode the categorical columns (numeric columns pass through) and
# rebind df to the encoded frame, then preview it.
df = pd.get_dummies(df)
df.head()

In [15]:
# Feature matrix: every column except the 'charges' target.
X = df.drop(columns='charges')
X.head(n=5)

In [16]:
# Target vector: the 'charges' column.
y = df.loc[:, 'charges']
y.head(n=5)

In [17]:
from sklearn.model_selection import train_test_split

In [19]:
# (Leftover interactive help lookup `train_test_split?` removed: it is IPython-only
# syntax and opens a pager on every Run All. Use help(train_test_split) when needed.)

In [20]:
# Hold out 33% of the rows as a test set; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Now we need to try many regression models and check which one fits the data best. There's a quick and effective way to do this:

In [22]:
!pip install lazypredict

In [24]:
from lazypredict.Supervised import LazyRegressor

In [25]:
# (Leftover interactive help lookup `LazyRegressor?` removed: it is IPython-only
# syntax and opens a pager on every Run All. Use help(LazyRegressor) when needed.)

In [26]:
# Configure LazyRegressor: verbosity off, warnings ignored, no custom metric.
lazy_model = LazyRegressor(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
)

In [27]:
# Fit LazyRegressor with the train and test splits; the first result is the
# table that is sorted by RMSE further down.
all_models, all_predictions = lazy_model.fit(
    X_train, X_test, y_train, y_test
)

In [28]:
# Collect the models from LazyRegressor in a mapping keyed by model name
# (looked up by 'GradientBoostingRegressor' below).
all_models_dictionary = lazy_model.provide_models(
    X_train, X_test, y_train, y_test
)

In [33]:
# Rank the candidate models from lowest to highest RMSE (ascending is the default).
pd.DataFrame(all_models).sort_values('RMSE')

# GradientBoostingRegressor has the lowest RMSE score (it’s able to fit the dataset the best out of all the potential models.)

In [38]:
# Show the entry that LazyRegressor stored under 'GradientBoostingRegressor'.
all_models_dictionary['GradientBoostingRegressor']

In [42]:
# NOTE(review): this cell repeats the 'GradientBoostingRegressor' lookup from the
# cell before it; one of the two can be deleted.
all_models_dictionary['GradientBoostingRegressor']

In [43]:
from sklearn.ensemble import GradientBoostingRegressor

In [44]:
# Standalone gradient-boosting regressor with a fixed random_state for repeatable fits.
gbr_model = GradientBoostingRegressor(random_state=42)

In [45]:
# Train the gradient-boosting regressor on the training split.
gbr_model.fit(X=X_train, y=y_train)

In [46]:
# Predict charges for the held-out test features.
y_pred = gbr_model.predict(X_test)

In [51]:
# Checking Model metrics
from sklearn.metrics import explained_variance_score

In [52]:
# Explained variance of the test-set predictions against the true charges.
explained_variance_score(y_true=y_test, y_pred=y_pred)

# The best possible explained_variance_score is 1.0; here we got 0.86, which is pretty good for a start.

# Now Let's Dive Deep into Deep Learning

In [53]:
from fastai.tabular.all import *

In [54]:
# Preview the one-hot encoded frame that the fastai section reuses.
df.head()

In [55]:
# Check the dtypes of the encoded frame before handing it to fastai.
df.info()

In [57]:
# List the column names (used to fill in cont_names for TabularPandas).
df.columns.tolist()

In [58]:
# Hold out 20% of the rows for validation. The fixed seed makes the split
# reproducible across kernel restarts (it was unseeded before, so the
# validation rows changed on every run).
splits = RandomSplitter(valid_pct=0.2, seed=42)(range_of(df))

In [59]:
# Wrap the encoded frame for fastai: every feature column is passed as a
# continuous input, 'charges' is the target, and the rows are divided
# according to `splits`.
to = TabularPandas(
    df,
    procs=[Categorify, FillMissing, Normalize],
    cont_names=[
        'age', 'bmi', 'children',
        'sex_female', 'sex_male',
        'smoker_no', 'smoker_yes',
        'region_northeast', 'region_northwest',
        'region_southeast', 'region_southwest',
    ],
    y_names='charges',
    splits=splits,
)

In [61]:
# Build the dataloaders from the TabularPandas object with a batch size of 64.
dls = to.dataloaders(bs=64)

In [62]:
# Display a sample batch from the dataloaders as a sanity check.
dls.show_batch()

In [65]:
# 'charges' is a continuous target, so track RMSE during training. The previous
# `accuracy` metric is a classification metric (it takes an argmax over the
# outputs) and gives meaningless numbers for a regression model.
learn = tabular_learner(
    dls,
    metrics=rmse,
)

In [66]:
# Echo the learner object so its repr is shown as the cell output.
learn

In [70]:
# Run fastai's learning-rate finder to help choose a learning rate for training.
learn.lr_find()

In [71]:
# Train for 10 epochs with the one-cycle policy and a peak learning rate of 0.1.
learn.fit_one_cycle(n_epoch=10, lr_max=0.1)

In [69]:
# Display a sample of the learner's results.
# NOTE(review): this cell's execution count (69) is lower than the training
# cell's (71), so any saved output may predate the final fit; re-run in order.
learn.show_results()

In [77]:
# Look at the second row of the test features (position 1).
X_test.iloc[1]

In [78]:
# Ask the fastai learner for a prediction on the second test row (position 1).
learn.predict(X_test.iloc[1])

# TODO: the deep learning model still needs debugging; will look into that.