In [1]:
#importing the Libraries
import pandas as pd

In [2]:
#Reading the Dataset
dataset=pd.read_csv("insurance_pre.csv")

In [3]:
#Convert categorical(nominal or ordinal) data into numerical data and delete first column
dataset=pd.get_dummies(dataset,drop_first=True)

In [4]:
#Get the column names
dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [5]:
independent=dataset[['age', 'bmi', 'children', 'sex_male', 'smoker_yes']]

In [6]:
dependent=dataset[['charges']]

In [7]:
#split into training set and test set
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(independent,dependent,test_size=0.30,random_state=0)

In [8]:
#Convert the data into standardized data
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
X_train=sc.fit_transform(X_train)
X_test-sc.transform(X_test)

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
578,51.105407,30.288630,1.067464,-0.023787,0.504670
610,46.462420,29.591808,1.067464,0.976766,0.504670
569,47.391018,38.990508,1.236590,-0.023787,-0.981493
1034,59.462783,37.156104,0.898339,-0.023787,0.504670
198,50.176810,20.088165,0.898339,0.976766,0.504670
...,...,...,...,...,...
1261,28.819071,36.081487,1.067464,-0.023787,0.504670
494,22.318889,26.510680,1.574841,-0.023787,-0.981493
97,53.891199,37.072149,0.898339,-0.023787,0.504670
418,62.248575,37.810949,1.067464,-0.023787,0.504670


In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
param_grid = {'criterion':['mse','mae','friedman_mse'],'max_features': ['auto','sqrt','log2'],'splitter':['best','random']}
grid = GridSearchCV(DecisionTreeRegressor(), param_grid, refit=True, verbose=3, n_jobs=-1)
#fitting the model for grid search
grid.fit(X_train,Y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done  47 out of  54 | elapsed:    4.9s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:    4.9s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=DecisionTreeRegressor(criterion='mse', max_depth=None,
                                             max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             presort=False, random_state=None,
                                             splitter='best'),
             iid='warn', n_jobs=-1,
             param_grid={'criterion': ['mse', 'mae', 'friedman_mse'],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'splitter': ['best', 'random']},
             p

In [16]:
# print best parameter after tuning. Format is print(grid.best_params_)
result=grid.cv_results_
#print(result)
grid_predictions = grid.predict(X_test)

In [17]:
# print classification report as well as find R Value
from sklearn.metrics import r2_score
r_score=r2_score(Y_test,grid_predictions)
print("The R_score value for best parameter {}:".format(grid.best_params_),r_score)

The R_score value for best parameter {'criterion': 'mae', 'max_features': 'log2', 'splitter': 'best'}: -0.5556686880748127


In [12]:
#to view entire result table
table=pd.DataFrame.from_dict(result)
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.06398,0.048029,0.004002,0.0,mse,auto,best,"{'criterion': 'mse', 'max_features': 'auto', '...",0.570609,0.70534,0.640119,0.638689,0.055013,6
1,0.017218,0.016582,0.004664,0.0009420102,mse,auto,random,"{'criterion': 'mse', 'max_features': 'auto', '...",0.609237,0.630168,0.53696,0.592122,0.03993,13
2,0.007993,0.004237,0.005996,0.00215803,mse,sqrt,best,"{'criterion': 'mse', 'max_features': 'sqrt', '...",0.599126,0.597951,0.548313,0.581796,0.023682,14
3,0.004663,0.00047,0.003998,0.0008149776,mse,sqrt,random,"{'criterion': 'mse', 'max_features': 'sqrt', '...",0.530107,0.583219,0.608928,0.574085,0.03282,15
4,0.005661,0.001699,0.004664,0.001699051,mse,log2,best,"{'criterion': 'mse', 'max_features': 'log2', '...",0.580806,0.679538,0.695,0.651781,0.050582,3
5,0.006995,0.002942,0.003332,0.0004713147,mse,log2,random,"{'criterion': 'mse', 'max_features': 'log2', '...",0.61827,0.701542,0.376648,0.565486,0.137789,17
6,0.033311,0.005903,0.002999,8.991328e-07,mae,auto,best,"{'criterion': 'mae', 'max_features': 'auto', '...",0.620984,0.653622,0.677762,0.650789,0.023266,4
7,0.030314,0.000935,0.005991,6.788304e-06,mae,auto,random,"{'criterion': 'mae', 'max_features': 'auto', '...",0.675373,0.644088,0.658497,0.659319,0.012785,2
8,0.025651,0.008726,0.003999,0.001413326,mae,sqrt,best,"{'criterion': 'mae', 'max_features': 'sqrt', '...",0.623628,0.597655,0.496669,0.572651,0.054763,16
9,0.02032,0.00047,0.003331,0.0004704713,mae,sqrt,random,"{'criterion': 'mae', 'max_features': 'sqrt', '...",0.563122,0.729642,0.614548,0.635771,0.069618,7


In [13]:
age=float(input("Age:"))
bmi=float(input("BMI:"))
children=float(input("Children:"))
sex_male=int(input("Sex Male 0 or 1:"))
smoker_yes=int(input("Smoker 0 or 1:"))

Age:45
BMI:5
Children:2
Sex Male 0 or 1:1
Smoker 0 or 1:1


In [14]:
y_pred=grid.predict([[age,bmi,children,sex_male,smoker_yes]])

In [15]:
print("Future_Prediction={}".format(y_pred))

Future_Prediction=[63770.42801]
