In this notebook, the model is trained on the DataFrame that was encoded in /notebook.

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the encoded insurance dataset from disk and preview its first rows
data = "../data/insurance_encoded.csv"
df = pd.read_csv(filepath_or_buffer=data)
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,True,27.9,0,True,16884.924,False,False,False,True
1,18,False,33.77,1,False,1725.5523,False,False,True,False
2,28,False,33.0,3,False,4449.462,False,False,True,False
3,33,False,22.705,0,False,21984.47061,False,True,False,False
4,32,False,28.88,0,False,3866.8552,False,True,False,False


In [4]:
# Separate the regression target ("charges") from the feature columns
target = "charges"
y = df[target]
X = df.drop(target, axis=1)

X.shape, y.shape

((1338, 9), (1338,))

In [5]:
# Hold out 20 % of the rows as a test set; the fixed seed keeps the split reproducible
from sklearn.model_selection import train_test_split

split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((1070, 9), (1070,), (268, 9), (268,))

In [6]:
# create model and search parameters
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

# Base estimator handed to the randomized search below; its seed is fixed at 42
gbr = GradientBoostingRegressor(random_state=42)

# Candidate values the randomized hyper-parameter search samples from
param_dist = dict(
    n_estimators=np.arange(50, 300, 50),  # number of trees: 50, 100, ..., 250
    max_depth=np.arange(3, 10),  # maximum depth of each tree: 3..9
    learning_rate=np.logspace(-3, 0, num=10),  # 10 log-spaced values from 0.001 to 1
    subsample=np.linspace(0.5, 1.0, num=6),  # fraction of rows used per tree: 0.5..1.0
    min_samples_split=np.arange(2, 20, 2),  # min. number of samples to split a node
    min_samples_leaf=np.arange(1, 20, 2),  # min. number of samples in a leaf
)

In [7]:
# Randomized search over param_dist: 100 sampled candidates, each scored with
# 5-fold cross-validation on negative MSE, using all available cores
search_settings = dict(
    n_iter=100,
    scoring="neg_mean_squared_error",
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)
random_search = RandomizedSearchCV(gbr, param_dist, **search_settings)

random_search.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [8]:
# Show the hyper-parameter combination the randomized search selected
random_search.best_params_

{'subsample': np.float64(0.7),
 'n_estimators': np.int64(50),
 'min_samples_split': np.int64(2),
 'min_samples_leaf': np.int64(11),
 'max_depth': np.int64(3),
 'learning_rate': np.float64(0.1)}

In [9]:
# choose the best model found by the search and predict on the held-out test set
gbr_best = random_search.best_estimator_
y_pred = gbr_best.predict(X_test)

In [10]:
# Score the test-set predictions: root of the mean squared error
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
rmse

np.float64(4334.830634093009)

In [11]:
# Build a result frame: ground truth, prediction and signed error per test row.
# "dif" is prediction minus truth, so negative values mean the model
# underestimated the charges. The values are subtracted directly: taking abs()
# of each side first would give a wrong error whenever a prediction is negative.
errors = pd.DataFrame(
    data={"true": y_test, "pred": y_pred, "dif": y_pred - y_test}
)
# Append the test features (X_test and y_test come from the same split, so their
# indices line up) to allow analysing the errors per attribute.
results = pd.concat([errors, X_test], axis=1)
results.head()

Unnamed: 0,true,pred,dif,age,sex,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
764,9095.06825,10889.792154,1794.723904,45,True,25.175,2,False,True,False,False,False
887,5272.1758,5960.476736,688.300936,36,True,30.02,0,False,False,True,False,False
890,29330.98315,27183.819782,-2147.163368,64,True,26.885,0,True,False,True,False,False
1293,9301.89355,10050.722415,748.828865,46,False,25.745,3,False,False,True,False,False
259,33750.2918,34601.056899,850.765099,19,False,31.92,0,True,False,True,False,False


In [13]:
# Distribution of the "dif" column over the whole test set, median marked by a line
import plotly.express as px

median_dif = results["dif"].median()

fig = px.histogram(results, x="dif", title="Histogram of modell errors")
fig.add_vline(x=median_dif, annotation_text="median diff")
fig.update_xaxes(title="Insurance Cost diff in $")
fig.show()

Most of the errors fall in the range of -2000 $ to +2000 $ (where negative values represent an underestimation of the insurance cost), with a median error of about 1300 $.

In [14]:
# Predicted vs. true cost for every test row, coloured by smoker status
scatter = (
    px.scatter(
        results,
        x="true",
        y="pred",
        color="smoker",
        title="predictet insurance Cost in realtion to real insurance Cost and smoker",
    )
    .update_xaxes(title="True Insurance Cost in $")
    .update_yaxes(title="Predicted Insurance Cost in $")
)
scatter.show()

The points that are underestimated are mostly people who don't smoke but still have high insurance costs. From our exploratory data analysis (in /notebook/health_insurance.ipynb) we already observed that the attribute smoker has the highest correlation with a person's insurance cost. Assuming the data is correct, this means there has to be an unknown attribute in play that drives up the insurance cost for those people.

In [15]:
# Keep only the smokers; copy so later changes do not touch `results`
is_smoker = results["smoker"].eq(True)
results_smoker = results.loc[is_smoker].copy()
results_smoker.head()

Unnamed: 0,true,pred,dif,age,sex,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
890,29330.98315,27183.819782,-2147.163368,64,True,26.885,0,True,False,True,False,False
259,33750.2918,34601.056899,850.765099,19,False,31.92,0,True,False,True,False,False
780,18259.216,17929.45804,-329.75796,30,False,24.4,3,True,False,False,False,True
265,46151.1245,45635.159435,-515.965065,46,False,42.35,3,True,False,False,True,False
901,48673.5588,48012.254949,-661.303851,60,False,40.92,0,True,False,False,True,False


In [16]:
# Same error histogram as for the full test set, restricted to smokers
smoker_median = results_smoker["dif"].median()

smoker_hist = px.histogram(
    results_smoker,
    x="dif",
    title="Histogramm of errors for only Smokers",
)
smoker_hist.update_xaxes(title="Difference in $")
smoker_hist.add_vline(x=smoker_median)
smoker_hist.show()

Looking at the histogram of errors for smokers only, we can see that the magnitude of the errors decreases drastically compared to the whole data sample. The median error also decreases to about 1100 $.

In [None]:
# Persist the tuned model so the Streamlit app can load it for deployment
import joblib

model_path = "gbr_insurance_model.pkl"
joblib.dump(gbr_best, model_path)

['gbr_insurance_model.pkl']