In [18]:
#importing the Libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [19]:
#Reading the Dataset
dataset=pd.read_csv("insurance_pre.csv")

In [20]:
#Convert categorical(nominal or ordinal) data into numerical data and delete first column
dataset=pd.get_dummies(dataset,drop_first=True)

In [21]:
#Get the column names
dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [22]:
independent=dataset[['age', 'bmi', 'children', 'sex_male', 'smoker_yes']]

In [23]:
dependent=dataset[['charges']]

In [24]:
#split into training set and test set
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(independent,dependent,test_size=0.30,random_state=0)

In [25]:
#Convert the data into standardized data
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
X_train=sc.fit_transform(X_train)
X_test-sc.transform(X_test)

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
578,51.105407,30.288630,1.067464,-0.023787,0.504670
610,46.462420,29.591808,1.067464,0.976766,0.504670
569,47.391018,38.990508,1.236590,-0.023787,-0.981493
1034,59.462783,37.156104,0.898339,-0.023787,0.504670
198,50.176810,20.088165,0.898339,0.976766,0.504670
...,...,...,...,...,...
1261,28.819071,36.081487,1.067464,-0.023787,0.504670
494,22.318889,26.510680,1.574841,-0.023787,-0.981493
97,53.891199,37.072149,0.898339,-0.023787,0.504670
418,62.248575,37.810949,1.067464,-0.023787,0.504670


In [26]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
param_grid = {'criterion':['mse','mae'],'max_features': ['auto','sqrt','log2'],'n_estimators':[10,100]}
grid = GridSearchCV(RandomForestRegressor(), param_grid, refit=True, verbose=3, n_jobs=-1)
#fitting the model for grid search
grid.fit(X_train,Y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   38.0s
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:   40.9s finished
  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'criterion': ['mse', 'mae'],

In [27]:
# print best parameter after tuning. Format is print(grid.best_params_)
result=grid.cv_results_
#print(result)
grid_predictions = grid.predict(X_test)

In [28]:
# print classification report as well as find R Value
from sklearn.metrics import r2_score
r_score=r2_score(Y_test,grid_predictions)
print("The R_score value for best parameter {}:".format(grid.best_params_),r_score)

The R_score value for best parameter {'criterion': 'mse', 'max_features': 'sqrt', 'n_estimators': 100}: 0.11642151185294936


In [29]:
#to view entire result table
table=pd.DataFrame.from_dict(result)
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.059296,0.002623,0.006336,0.001253077,mse,auto,10,"{'criterion': 'mse', 'max_features': 'auto', '...",0.780554,0.800759,0.782148,0.78782,0.009172,9
1,0.322801,0.014157,0.019989,0.0008164377,mse,auto,100,"{'criterion': 'mse', 'max_features': 'auto', '...",0.801331,0.807898,0.791786,0.800338,0.006615,5
2,0.028654,0.000475,0.005658,0.0004674207,mse,sqrt,10,"{'criterion': 'mse', 'max_features': 'sqrt', '...",0.80457,0.782154,0.769806,0.78551,0.014389,11
3,0.262503,0.003857,0.019655,0.0009428531,mse,sqrt,100,"{'criterion': 'mse', 'max_features': 'sqrt', '...",0.814218,0.819277,0.799347,0.810948,0.008459,1
4,0.029647,0.001246,0.00533,0.0004711459,mse,log2,10,"{'criterion': 'mse', 'max_features': 'log2', '...",0.784268,0.773702,0.774493,0.777487,0.004805,12
5,0.250512,0.00377,0.018988,0.0008153667,mse,log2,100,"{'criterion': 'mse', 'max_features': 'log2', '...",0.81261,0.804459,0.797552,0.804874,0.006154,4
6,0.180221,0.004494,0.005331,0.001247617,mae,auto,10,"{'criterion': 'mae', 'max_features': 'auto', '...",0.810023,0.777779,0.784764,0.790855,0.013851,7
7,1.693291,0.016569,0.018655,0.000941392,mae,auto,100,"{'criterion': 'mae', 'max_features': 'auto', '...",0.800299,0.796782,0.793798,0.79696,0.002657,6
8,0.106267,0.001246,0.004997,0.0008159507,mae,sqrt,10,"{'criterion': 'mae', 'max_features': 'sqrt', '...",0.802586,0.790115,0.768637,0.787113,0.014021,10
9,1.055683,0.013662,0.017989,1.94668e-07,mae,sqrt,100,"{'criterion': 'mae', 'max_features': 'sqrt', '...",0.814729,0.813518,0.802906,0.810384,0.005311,2


In [30]:
age=float(input("Age:"))
bmi=float(input("BMI:"))
children=float(input("Children:"))
sex_male=int(input("Sex Male 0 or 1:"))
smoker_yes=int(input("Smoker 0 or 1:"))

Age:44
BMI:70
Children:1
Sex Male 0 or 1:0
Smoker 0 or 1:1


In [33]:
y_pred=grid.predict([[age,bmi,children,sex_male,smoker_yes]])

In [34]:
print("Future_Prediction={}".format(y_pred))

Future_Prediction=[48183.0554289]
