In [8]:
import pandas as pd
import pickle
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import root_mean_squared_error as rmse

import warnings
warnings.filterwarnings("ignore")

In [4]:
#import clean data
ML_data = pd.read_csv('ML_insurance_data_clean.csv')
ML_data = ML_data.sample(frac=1)
print(ML_data.head())

      age     sex     bmi  children smoker      charges
696    41    male  35.750         1    yes  40273.64550
1134   50  female  28.120         3     no  11085.58680
255    55  female  25.365         3     no  13047.33235
36     62  female  32.965         3     no  15612.19335
710    50  female  23.540         2     no  10107.22060


In [5]:
#train processor to convert sex and smoker to boolean
cat_col = ML_data[['sex','smoker']]
text_processor = OrdinalEncoder()
result = text_processor.fit_transform(cat_col)
pickle.dump(text_processor,open('text_processor.pkl','wb'))

In [6]:
#run processor on data
sex, smoker = text_processor.transform(ML_data[['sex','smoker']]).transpose()
ML_data['sex'] = sex
ML_data['smoker'] = smoker

In [7]:
#split train and test
X = ML_data.drop(columns=['charges'])
y = ML_data.charges

X_train, X_test, y_train, y_test = train_test_split(X, y,train_size=0.8)

In [13]:
#train model using grid search to optimise parameters
parameters = {'fit_intercept':[True,False],
    'positive':[True,False],
    'tol':[1e-16,1e-15,1e-14,1e-13,1e-12,1e-11,1e-10,1e-9,1e-8,1e-7,1e-6,1e-5,1e-4]}

grid_search = GridSearchCV(LinearRegression(), param_grid=parameters, cv=5, verbose=1)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Estimator:", grid_search.best_estimator_)

Fitting 5 folds for each of 52 candidates, totalling 260 fits
Best Parameters: {'fit_intercept': True, 'positive': True, 'tol': 1e-16}
Best Estimator: LinearRegression(positive=True, tol=1e-16)


In [14]:
#Calculate unbiased performance metric
pred = grid_search.best_estimator_.predict(X_test)
print(rmse(pred, y_test))

5721.630698959106


In [8]:
#export model
import pickle
pickle.dump(grid_search.best_estimator_,open('model.pkl','wb'))