In [1]:
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import root_mean_squared_error as rmse

import warnings
warnings.filterwarnings("ignore")

In [2]:
#import clean data
ML_data = pd.read_csv('ML_insurance_data_clean.csv')
ML_data = ML_data.sample(frac=1)
print(ML_data.head())

      age     sex    bmi  children smoker     charges
734    49  female  34.77         1     no   9583.8933
932    45  female  35.30         0     no   7348.1420
858    57    male  28.10         0     no  10965.4460
425    45    male  24.31         5     no   9788.8659
1296   28  female  26.51         2     no   4340.4409


In [3]:
#train processor to convert sex and smoker to boolean
cat_col = ML_data[['sex','smoker']]
text_processor = OrdinalEncoder()
result = text_processor.fit_transform(cat_col)
pickle.dump(text_processor,open('text_processor.pkl','wb'))

In [4]:
#run processor on data
sex, smoker = text_processor.transform(ML_data[['sex','smoker']]).transpose()
ML_data['sex'] = sex
ML_data['smoker'] = smoker

In [5]:
#split train and test
X = ML_data.drop(columns=['charges'])
y = ML_data.charges

X_train, X_test, y_train, y_test = train_test_split(X, y,train_size=0.8)

In [6]:
#train model using grid search to optimise parameters
parameters = {'n_estimators': [100, 300, 500],
    'max_depth': [None, 10, 20],
    'min_samples_split': [None, 2, 5],
    'min_samples_leaf': [2, 3],
    'bootstrap': [True, False]}

grid_search = GridSearchCV(RandomForestRegressor(), param_grid=parameters, cv=5, verbose=1)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Estimator:", grid_search.best_estimator_)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 300}
Best Estimator: RandomForestRegressor(max_depth=10, min_samples_leaf=3, n_estimators=300)


In [7]:
#Calculate unbiased performance metric
pred = grid_search.best_estimator_.predict(X_test)
print(rmse(pred, y_test))

4731.006954709864


In [8]:
#export model
import pickle
pickle.dump(grid_search.best_estimator_,open('model.pkl','wb'))