# Experimentation

## Data Ingestion

In [33]:
import pandas as pd
import numpy as np
import os

import warnings
warnings.simplefilter(action='ignore', category=RuntimeWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.options.display.max_colwidth = None
pd.set_option("display.float_format", lambda x: '%.2f' % x)

from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [34]:
pwd

'C:\\Users\\User\\Documents\\workspace_datahackermen\\insurance_premium_prediction'

In [35]:
FILE_NAME = "insurance.csv"
DATA_FOLDER = "data"

In [36]:
main_path = os.getcwd()
main_path

'C:\\Users\\User\\Documents\\workspace_datahackermen\\insurance_premium_prediction'

In [37]:
file_path = os.path.join(os.path.join(main_path, DATA_FOLDER), FILE_NAME)

In [38]:
file_path

'C:\\Users\\User\\Documents\\workspace_datahackermen\\insurance_premium_prediction\\data\\insurance.csv'

In [39]:
%%time

insurance_data = pd.read_csv(file_path)

CPU times: total: 0 ns
Wall time: 0 ns


In [40]:
insurance_data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.77,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.88,0,no,northwest,3866.86


In [41]:
insurance_data.shape

(1338, 7)

## Data Inspection

In [42]:
insurance_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


## Exploration Data Analysis

* Statistical Analysis
* Plots
* Correlation

## Model Building

In [43]:
insurance_data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.77,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.88,0,no,northwest,3866.86


## Train Test Split

In [44]:
# 0.3, 0.7

In [45]:
from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularDataset, TabularPredictor

In [46]:
# y = mX + c

In [47]:
X = insurance_data[["age", "sex", "bmi", "children", "smoker", "region"]]

In [48]:
type(X)

pandas.core.frame.DataFrame

In [49]:
X.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.9,0,yes,southwest
1,18,male,33.77,1,no,southeast
2,28,male,33.0,3,no,southeast
3,33,male,22.7,0,no,northwest
4,32,male,28.88,0,no,northwest


In [50]:
y = insurance_data["charges"]

In [51]:
type(y)

pandas.core.series.Series

In [52]:
y[0:5]

0   16884.92
1    1725.55
2    4449.46
3   21984.47
4    3866.86
Name: charges, dtype: float64

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [54]:
X_train.shape, X_test.shape

((896, 6), (442, 6))

In [55]:
train_data, test_data = train_test_split(insurance_data, test_size=0.33, random_state=42)

In [56]:
train_data.shape, test_data.shape

((896, 7), (442, 7))

In [57]:
train_data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1046,43,female,25.08,0,no,northeast,7325.05
682,39,male,35.3,2,yes,southwest,40103.89
1037,45,female,30.5,1,yes,northwest,39725.52
490,19,female,32.9,0,no,southwest,1748.77
39,60,male,39.9,0,yes,southwest,48173.36


In [58]:
test_data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
764,45,female,25.18,2,no,northeast,9095.07
887,36,female,30.02,0,no,northwest,5272.18
890,64,female,26.89,0,yes,northwest,29330.98
1293,46,male,25.75,3,no,northwest,9301.89
259,19,male,31.92,0,yes,northwest,33750.29


In [59]:
label = 'charges'
print("Summary of class variable: \n", train_data[label].describe())

Summary of class variable: 
 count     896.00
mean    13379.69
std     12110.71
min      1121.87
25%      4819.84
50%      9556.30
75%     17092.92
max     62592.87
Name: charges, dtype: float64


In [60]:
%%time

save_path = 'models'
predictor = TabularPredictor(label=label, path=save_path).fit(train_data, presets="best_quality")

Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "models\"
AutoGluon Version:  0.6.2
Python Version:     3.9.16
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
Train Data Rows:    896
Train Data Columns: 6
Label Column: charges
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (62592.87309, 1121.8739, 13379.68825, 12110.71371)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    20439.73

CPU times: total: 2.02 s
Wall time: 2min 5s


In [79]:
predictor.leaderboard(silent=True)

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L3,-4498.22,0.89,70.22,0.0,0.22,3,True,22
1,WeightedEnsemble_L2,-4526.3,0.33,29.42,0.0,0.27,2,True,12
2,CatBoost_BAG_L2,-4557.96,0.55,56.01,0.03,23.14,2,True,16
3,CatBoost_BAG_L1,-4571.29,0.02,16.63,0.02,16.63,1,True,6
4,LightGBMXT_BAG_L2,-4589.46,0.59,34.21,0.07,1.34,2,True,13
5,NeuralNetFastAI_BAG_L2,-4593.29,0.61,35.58,0.09,2.71,2,True,18
6,NeuralNetTorch_BAG_L2,-4603.23,0.6,42.53,0.08,9.66,2,True,20
7,LightGBMXT_BAG_L1,-4654.5,0.03,1.1,0.03,1.1,1,True,3
8,NeuralNetTorch_BAG_L1,-4666.16,0.06,8.21,0.06,8.21,1,True,10
9,ExtraTreesMSE_BAG_L2,-4675.72,0.61,33.14,0.09,0.27,2,True,17


## Model Evaluation

In [62]:
test_data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
764,45,female,25.18,2,no,northeast,9095.07
887,36,female,30.02,0,no,northwest,5272.18
890,64,female,26.89,0,yes,northwest,29330.98
1293,46,male,25.75,3,no,northwest,9301.89
259,19,male,31.92,0,yes,northwest,33750.29


In [63]:
y_test = test_data[label]  # values to predict

In [64]:
y_test[0:5]

764     9095.07
887     5272.18
890    29330.98
1293    9301.89
259    33750.29
Name: charges, dtype: float64

In [65]:
test_data_nolab = test_data.drop(columns=[label])  # delete label column to prove we're not cheating
test_data_nolab.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
764,45,female,25.18,2,no,northeast
887,36,female,30.02,0,no,northwest
890,64,female,26.89,0,yes,northwest
1293,46,male,25.75,3,no,northwest
259,19,male,31.92,0,yes,northwest


In [66]:
save_path

'models'

Now, load in the stored models.

In [67]:
predictor

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x21fbd019100>

## Load the save model

In [68]:
save_model_predictor = TabularPredictor.load(save_path)

In [69]:
save_model_predictor

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x21fb57743a0>

## Predict the target

In [70]:
y_pred = save_model_predictor.predict(test_data_nolab)

In [71]:
y_pred[0:5]

764    10723.30
887     5583.99
890    27826.00
1293   10634.95
259    33329.56
Name: charges, dtype: float32

In [80]:
# print("Predictions:  \n", y_pred)
perf = save_model_predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)

Evaluation: root_mean_squared_error on test data: -4386.483042941255
	Note: Scores are always higher_is_better. This metric score can be multiplied by -1 to get the metric value.
Evaluations on test data:
{
    "root_mean_squared_error": -4386.483042941255,
    "mean_squared_error": -19241233.48601117,
    "mean_absolute_error": -2342.560326017958,
    "r2": 0.8687005447840972,
    "pearsonr": 0.9321100214542465,
    "median_absolute_error": -1380.4939927734379
}


In [81]:
save_model_predictor.leaderboard(test_data, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-4368.44,-4526.3,0.77,0.33,29.42,0.0,0.0,0.27,2,True,12
1,NeuralNetTorch_BAG_L2,-4385.51,-4603.23,1.56,0.6,42.53,0.35,0.08,9.66,2,True,20
2,WeightedEnsemble_L3,-4386.48,-4498.22,2.2,0.89,70.22,0.0,0.0,0.22,3,True,22
3,CatBoost_BAG_L1,-4406.31,-4571.29,0.02,0.02,16.63,0.02,0.02,16.63,1,True,6
4,NeuralNetFastAI_BAG_L2,-4432.41,-4593.29,1.72,0.61,35.58,0.5,0.09,2.71,2,True,18
5,LightGBMXT_BAG_L1,-4447.07,-4654.5,0.03,0.03,1.1,0.03,0.03,1.1,1,True,3
6,NeuralNetTorch_BAG_L1,-4471.38,-4666.16,0.13,0.06,8.21,0.13,0.06,8.21,1,True,10
7,XGBoost_BAG_L1,-4471.93,-4814.51,0.19,0.02,0.68,0.19,0.02,0.68,1,True,9
8,NeuralNetFastAI_BAG_L1,-4483.06,-4780.13,0.47,0.06,2.74,0.47,0.06,2.74,1,True,8
9,LightGBM_BAG_L1,-4485.48,-4702.5,0.03,0.02,1.14,0.03,0.02,1.14,1,True,4


## Features Importance

In [83]:
%%time

save_model_predictor.feature_importance(train_data)

Computing feature importance via permutation shuffling for 6 features using 896 rows with 5 shuffle sets...
	92.4s	= Expected runtime (18.48s per shuffle set)
	30.34s	= Actual runtime (Completed 5 of 5 shuffle sets)


CPU times: total: 2min 30s
Wall time: 30.4 s


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
smoker,11240.68,564.38,0.0,5,12402.74,10078.62
bmi,3800.24,279.57,0.0,5,4375.89,3224.59
age,2986.26,97.4,0.0,5,3186.8,2785.72
children,329.18,24.5,0.0,5,379.63,278.73
region,211.48,13.68,0.0,5,239.65,183.31
sex,159.7,21.05,0.0,5,203.05,116.36


In [84]:
%%time

save_model_predictor.feature_importance(test_data)

These features in provided data are not utilized by the predictor and will be ignored: ['predicted_charges', 'error_values']
Computing feature importance via permutation shuffling for 6 features using 442 rows with 5 shuffle sets...
	84.56s	= Expected runtime (16.91s per shuffle set)
	21.19s	= Actual runtime (Completed 5 of 5 shuffle sets)


CPU times: total: 1min 58s
Wall time: 21.2 s


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
smoker,10933.66,306.87,0.0,5,11565.5,10301.82
bmi,3255.05,190.69,0.0,5,3647.68,2862.42
age,2201.95,118.09,0.0,5,2445.1,1958.81
children,185.54,23.15,0.0,5,233.21,137.88
region,29.21,24.45,0.03,5,79.55,-21.14
sex,-31.8,20.49,0.99,5,10.38,-73.98


## Bringing it all together

In [90]:
test_data["predicted_charges"] = y_pred

In [91]:
test_data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,predicted_charges,error_values
764,45,female,25.18,2,no,northeast,9095.07,10723.3,1628.24
887,36,female,30.02,0,no,northwest,5272.18,5583.99,311.81
890,64,female,26.89,0,yes,northwest,29330.98,27826.0,1504.99
1293,46,male,25.75,3,no,northwest,9301.89,10634.95,1333.05
259,19,male,31.92,0,yes,northwest,33750.29,33329.56,420.73


In [92]:
test_data["error_values"] = abs(test_data["charges"] - test_data["predicted_charges"])

In [93]:
test_data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,predicted_charges,error_values
764,45,female,25.18,2,no,northeast,9095.07,10723.3,1628.24
887,36,female,30.02,0,no,northwest,5272.18,5583.99,311.81
890,64,female,26.89,0,yes,northwest,29330.98,27826.0,1504.99
1293,46,male,25.75,3,no,northwest,9301.89,10634.95,1333.05
259,19,male,31.92,0,yes,northwest,33750.29,33329.56,420.73


## Using input for prediction

In [94]:
test_data["sex"].unique()

array(['female', 'male'], dtype=object)

In [95]:
test_data["smoker"].unique()

array(['no', 'yes'], dtype=object)

In [87]:
test_data["region"].unique()

array(['northeast', 'northwest', 'southwest', 'southeast'], dtype=object)

In [96]:
input_data_dict = {
    "age": 5,
    "sex": "female",
    "bmi": 25.175,
    "children": 2,
    "smoker": "no",
    "region": "northeast"
}

In [97]:
input_data_dict

{'age': 5,
 'sex': 'female',
 'bmi': 25.175,
 'children': 2,
 'smoker': 'no',
 'region': 'northeast'}

In [98]:
input_data = pd.DataFrame([input_data_dict])

In [99]:
input_data

Unnamed: 0,age,sex,bmi,children,smoker,region
0,5,female,25.18,2,no,northeast


In [100]:
save_model_predictor.predict(input_data)

0   4871.29
Name: charges, dtype: float32

In [101]:
save_model_predictor.predict(input_data)[0]

4871.2876

## Streamlit

```python
# Display interactive widgets
st.button('Click me')
st.experimental_data_editor('Edit data', data)
st.checkbox('I agree')
st.radio('Pick one', ['cats', 'dogs'])
st.selectbox('Pick one', ['cats', 'dogs'])
st.multiselect('Buy', ['milk', 'apples', 'potatoes'])
st.slider('Pick a number', 0, 100)
st.select_slider('Pick a size', ['S', 'M', 'L'])
st.text_input('First name')
st.number_input('Pick a number', 0, 10)
st.text_area('Text to translate')
st.date_input('Your birthday')
st.time_input('Meeting time')
st.file_uploader('Upload a CSV')
st.download_button('Download file', data)
st.camera_input("Take a picture")
st.color_picker('Pick a color')

# Use widgets' returned values in variables:
>>> for i in range(int(st.number_input('Num:'))):
>>>   foo()
>>> if st.sidebar.selectbox('I:',['f']) == 'f':
>>>   b()
>>> my_slider_val = st.slider('Quinn Mallory', 1, 88)
>>> st.write(slider_val)

# Disable widgets to remove interactivity:
>>> st.slider('Pick a number', 0, 100, disabled=True)
```

In [104]:
input_data_dict

{'age': 5,
 'sex': 'female',
 'bmi': 25.175,
 'children': 2,
 'smoker': 'no',
 'region': 'northeast'}

```python
## This is going to be a Streamlit App

import streamlit as st

st.title('Insurance Premium Charge Prediction')

age = st.slider('age', 16, 100)
sex = st.selectbox("sex", options=["male", "female"])
bmi = st.slider('bmi', 15, 100)
children = st.number_input('children', 0, 20)
smoker = st.selectbox('smoker', options=["yes"])
region = st.selectbox('region', options=['northeast', 'northwest', 'southwest', 'southeast'])
```