In [42]:
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import root_mean_squared_error as rmse

import warnings
warnings.filterwarnings("ignore")

In [12]:
#import clean data
ML_data = pd.read_csv('ML_insurance_data_clean.csv')
ML_data = ML_data.sample(frac=1)
print(ML_data.head())

      age     bmi  children smoker      charges
1290   19  34.900         0    yes  34828.65400
6      46  33.440         1     no   8240.58960
284    52  31.200         0     no   9625.92000
1043   55  35.245         1     no  11394.06555
94     64  31.300         2    yes  47291.05500


In [14]:
#train processor to convert sex and smoker to boolean
cat_col = ML_data[['smoker']]
text_processor = OrdinalEncoder()
result = text_processor.fit_transform(cat_col)
pickle.dump(text_processor,open('text_processor.pkl','wb'))

In [22]:
#run processor on data
smoker = text_processor.transform(ML_data[['smoker']])
ML_data['smoker'] = smoker

In [23]:
#split train and test
X = ML_data.drop(columns=['charges'])
y = ML_data.charges

X_train, X_test, y_train, y_test = train_test_split(X, y,train_size=0.8)

In [43]:
#train model using grid search to optimise parameters
parameters = {'n_estimators': [100, 300, 500],
              'max_depth': [None, 10, 20],
              'min_samples_split': [None, 2, 5],
              'min_samples_leaf': [2, 3],
              'bootstrap': [True, False]}

grid_search = GridSearchCV(RandomForestRegressor(), param_grid=parameters, cv=5, verbose=1)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Estimator:", grid_search.best_estimator_)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 3, 'min_samples_split': 5, 'n_estimators': 500}
Best Estimator: RandomForestRegressor(max_depth=10, min_samples_leaf=3, min_samples_split=5,
                      n_estimators=500)


In [44]:
#Calculate unbiased performance metric
pred = grid_search.best_estimator_.predict(X_test)
print(rmse(pred, y_test))

5064.897785853082


In [46]:
#export model
import pickle
pickle.dump(grid_search.best_estimator_,open('model.pkl','wb'))