In [2]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 

In [4]:
dataset = pd.read_csv("insurance_pre.csv")
print(dataset)


      age     sex     bmi  children smoker      charges
0      19  female  27.900         0    yes  16884.92400
1      18    male  33.770         1     no   1725.55230
2      28    male  33.000         3     no   4449.46200
3      33    male  22.705         0     no  21984.47061
4      32    male  28.880         0     no   3866.85520
...   ...     ...     ...       ...    ...          ...
1333   50    male  30.970         3     no  10600.54830
1334   18  female  31.920         0     no   2205.98080
1335   18  female  36.850         0     no   1629.83350
1336   21  female  25.800         0     no   2007.94500
1337   61  female  29.070         0    yes  29141.36030

[1338 rows x 6 columns]


In [6]:
dataset=pd.get_dummies(dataset,drop_first=True)
print(dataset)

      age     bmi  children      charges  sex_male  smoker_yes
0      19  27.900         0  16884.92400     False        True
1      18  33.770         1   1725.55230      True       False
2      28  33.000         3   4449.46200      True       False
3      33  22.705         0  21984.47061      True       False
4      32  28.880         0   3866.85520      True       False
...   ...     ...       ...          ...       ...         ...
1333   50  30.970         3  10600.54830      True       False
1334   18  31.920         0   2205.98080     False       False
1335   18  36.850         0   1629.83350     False       False
1336   21  25.800         0   2007.94500     False       False
1337   61  29.070         0  29141.36030     False        True

[1338 rows x 6 columns]


In [8]:
dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [10]:
independent_input=dataset[['age', 'bmi', 'children','sex_male', 'smoker_yes']]
print(independent_input)

      age     bmi  children  sex_male  smoker_yes
0      19  27.900         0     False        True
1      18  33.770         1      True       False
2      28  33.000         3      True       False
3      33  22.705         0      True       False
4      32  28.880         0      True       False
...   ...     ...       ...       ...         ...
1333   50  30.970         3      True       False
1334   18  31.920         0     False       False
1335   18  36.850         0     False       False
1336   21  25.800         0     False       False
1337   61  29.070         0     False        True

[1338 rows x 5 columns]


In [14]:
dependent_output=dataset[["charges"]]
print(dependent_output)

          charges
0     16884.92400
1      1725.55230
2      4449.46200
3     21984.47061
4      3866.85520
...           ...
1333  10600.54830
1334   2205.98080
1335   1629.83350
1336   2007.94500
1337  29141.36030

[1338 rows x 1 columns]


In [18]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(independent_input,dependent_output,test_size=1/3,random_state=0)
print(x_train)
print(x_test)
print(y_train)
print(y_test)

      age    bmi  children  sex_male  smoker_yes
482    18  31.35         0     False       False
338    50  32.30         1      True        True
356    46  43.89         3      True       False
869    25  24.30         3     False       False
182    22  19.95         3      True       False
...   ...    ...       ...       ...         ...
763    27  26.03         0      True       False
835    42  35.97         2      True       False
1216   40  25.08         0      True       False
559    19  35.53         0      True       False
684    33  18.50         1     False       False

[892 rows x 5 columns]
      age     bmi  children  sex_male  smoker_yes
578    52  30.200         1      True       False
610    47  29.370         1     False       False
569    48  40.565         2      True        True
1034   61  38.380         0      True       False
198    51  18.050         0     False       False
...   ...     ...       ...       ...         ...
261    20  26.840         1     False 

In [27]:
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
x_train=sc.fit_transform(x_train)
x_test=sc.transform(x_test)

In [37]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
parameter_grid={'criterion':['squared_error', 'absolute_error','friedman_mse'],
               'max_features':[None,'sqrt','log2'],
               'splitter':['best','random']}
grid=GridSearchCV(DecisionTreeRegressor(), parameter_grid, refit=True, verbose=3, n_jobs=-1)
grid.fit(x_train,y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [39]:
re=grid.cv_results_

In [41]:
print("The R_score value for best model {}".format(grid.best_params_))

The R_score value for best model {'criterion': 'squared_error', 'max_features': None, 'splitter': 'best'}


In [43]:
table=pd.DataFrame.from_dict(re)
print(table)

    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0        0.005558      0.000633         0.002308        0.000253   
1        0.003603      0.000557         0.002198        0.000321   
2        0.003887      0.000469         0.002063        0.000484   
3        0.016238      0.017227         0.003301        0.002911   
4        0.010204      0.010995         0.002628        0.003176   
5        0.004033      0.003918         0.002427        0.003938   
6        0.027799      0.014366         0.002260        0.000695   
7        0.029391      0.011463         0.003315        0.003474   
8        0.015792      0.007430         0.001632        0.002215   
9        0.019138      0.011235         0.002183        0.002675   
10       0.013523      0.003088         0.002261        0.002670   
11       0.010848      0.002261         0.003514        0.001360   
12       0.005693      0.001862         0.002826        0.002207   
13       0.003201      0.000655         0.001357

In [45]:
age_input=float(input("Age"))
bmi_input=float(input("BMI"))
children_input=float(input("children"))
sex_male_input=int(input("Sex male 0 or 1"))
smoker_yes_input=int(input("smoker yes 0 or 1"))

Age 20
BMI 35
children 2
Sex male 0 or 1 0
smoker yes 0 or 1 1


In [47]:
Future_prediction=grid.predict([[age_input,bmi_input,children_input,sex_male_input,smoker_yes_input]])
print("Future prediction {}".format(Future_prediction))

Future prediction [63770.42801]


In [49]:
import pickle
filename="Finalised_grid_search.sav"
pickle.dump(grid,open(filename,'wb'))

In [51]:
loaded_model=pickle.load(open("Finalised_grid_search.sav",'rb'))

In [53]:
result=loaded_model.predict([[age_input,bmi_input,children_input,sex_male_input,smoker_yes_input]])
print(result)

[63770.42801]
