In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import r2_score

In [2]:
data = pd.read_csv("C:\\Users\\sushr\\DataSets\\insurance.csv", encoding='utf-8')
print(data.head(20))

    age     sex     bmi  children smoker     region      charges
0    19  female  27.900         0    yes  southwest  16884.92400
1    18    male  33.770         1     no  southeast   1725.55230
2    28    male  33.000         3     no  southeast   4449.46200
3    33    male  22.705         0     no  northwest  21984.47061
4    32    male  28.880         0     no  northwest   3866.85520
5    31  female  25.740         0     no  southeast   3756.62160
6    46  female  33.440         1     no  southeast   8240.58960
7    37  female  27.740         3     no  northwest   7281.50560
8    37    male  29.830         2     no  northeast   6406.41070
9    60  female  25.840         0     no  northwest  28923.13692
10   25    male  26.220         0     no  northeast   2721.32080
11   62  female  26.290         0    yes  southeast  27808.72510
12   23    male  34.400         0     no  southwest   1826.84300
13   56  female  39.820         0     no  southeast  11090.71780
14   27    male     NaN  

## Data Preparation

### Data Cleaning-removing nulls

In [3]:
count_nan = data.isnull().sum()
count_nan[count_nan > 0]

bmi    5
dtype: int64

In [4]:
data['bmi'].fillna(data['bmi'].mean(), inplace = True)

In [5]:
count_nan = data.isnull().sum()
count_nan[count_nan > 0]

Series([], dtype: int64)

### Convert categorical data to numerical data using sklearn

In [6]:
#Create ndarray for label encoding (sklearn)
sex = data.iloc[:,1:2].values

In [7]:
smoker = data.iloc[:,4:5].values
smoker

array([['yes'],
       ['no'],
       ['no'],
       ...,
       ['no'],
       ['no'],
       ['yes']], dtype=object)

In [8]:
le = LabelEncoder()
sex[:,0] = le.fit_transform(sex[:,0])
sex =  pd.DataFrame(sex)
sex.columns = ['sex']
le_sex_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("sklearn label encoder results for sex: ")
print(le_sex_mapping)
print(sex[:10])

sklearn label encoder results for sex: 
{'female': 0, 'male': 1}
  sex
0   0
1   1
2   1
3   1
4   1
5   0
6   0
7   0
8   1
9   0


In [9]:
le = LabelEncoder()
smoker[:,0] = le.fit_transform(smoker[:,0])
smoker =  pd.DataFrame(smoker)
smoker.columns = ['smoker']
le_smoker_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("sklearn label encoder results for smoker: ")
print(le_smoker_mapping)
print(smoker[:10])

sklearn label encoder results for smoker: 
{'no': 0, 'yes': 1}
  smoker
0      1
1      0
2      0
3      0
4      0
5      0
6      0
7      0
8      0
9      0


In [10]:
#sklearn one hot encoding: maps each category to 0
region = data.iloc[:,5:6].values
ohe = OneHotEncoder()

In [11]:
#create ndarray for ine hot encoding (sklearn)
region = ohe.fit_transform(region).toarray()
region = pd.DataFrame(region)
region.columns = ['northeast','northwest','southeast','southwest']
print("Sklearn One Hot Endcoder results for region: ")
print(region[:10])

Sklearn One Hot Endcoder results for region: 
   northeast  northwest  southeast  southwest
0        0.0        0.0        0.0        1.0
1        0.0        0.0        1.0        0.0
2        0.0        0.0        1.0        0.0
3        0.0        1.0        0.0        0.0
4        0.0        1.0        0.0        0.0
5        0.0        0.0        1.0        0.0
6        0.0        0.0        1.0        0.0
7        0.0        1.0        0.0        0.0
8        1.0        0.0        0.0        0.0
9        0.0        1.0        0.0        0.0


### Divide data into train and test

In [12]:
#putting the data together

#taking numerical data from the original data
X_num = data[['age', 'bmi', 'children']]

#take the encoded data and data and add to numerical data
X_final = pd.concat([X_num, sex, smoker, region], axis = 1)

#define y as being the "charges column" from the original data
y_final = data[['charges']].copy()

#Test train split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size = 0.33, random_state = 0)

### Feature Scaling

In [13]:
#Normalized scaler (fit transform on train, fit only on test)
n_scaler = MinMaxScaler()
X_train = n_scaler.fit_transform(X_train.astype(np.float))
X_test = n_scaler.transform(X_test.astype(np.float))
X_train
X_test

array([[0.73913043, 0.38310465, 0.2       , ..., 0.        , 0.        ,
        1.        ],
       [0.63043478, 0.36077482, 0.2       , ..., 0.        , 1.        ,
        0.        ],
       [0.65217391, 0.66195857, 0.4       , ..., 1.        , 0.        ,
        0.        ],
       ...,
       [0.47826087, 0.45238095, 0.2       , ..., 0.        , 0.        ,
        0.        ],
       [0.15217391, 0.29163304, 0.6       , ..., 0.        , 0.        ,
        1.        ],
       [0.04347826, 0.29270917, 0.2       , ..., 0.        , 1.        ,
        0.        ]])

In [14]:
#Standard scaler (fit transform on train, fit only on test)
s_scaler = StandardScaler()
X_train = s_scaler.fit_transform(X_train.astype(np.float))
X_test = s_scaler.transform(X_test.astype(np.float))
X_test

array([[ 0.89345423, -0.08075516, -0.07800765, ..., -0.56360186,
        -0.61002347,  1.70166159],
       [ 0.53613634, -0.21546103, -0.07800765, ..., -0.56360186,
         1.63928118, -0.58766091],
       [ 0.60759992,  1.60144516,  0.7540739 , ...,  1.77430216,
        -0.61002347, -0.58766091],
       ...,
       [ 0.03589131,  0.3371576 , -0.07800765, ..., -0.56360186,
        -0.61002347, -0.58766091],
       [-1.03606235, -0.63256231,  1.58615545, ..., -0.56360186,
        -0.61002347,  1.70166159],
       [-1.39338023, -0.62607047, -0.07800765, ..., -0.56360186,
         1.63928118, -0.58766091]])

## Modelling

### Linear Regression

In [15]:
lr = LinearRegression().fit(X_train, y_train)
y_train_pred = lr.predict(X_train)
y_test_predict = lr.predict(X_test)

In [16]:
#print score
print("lr.coef_: {}".format(lr.coef_))
print("lr.intercept_: {}".format(lr.intercept_))
print("lr train score %.3f, lr test score: %3f" % (lr.score(X_train, y_train), lr.score(X_test, y_test)))

lr.coef_: [[3.62226019e+03 2.10071048e+03 4.10266447e+02 3.05360391e+02
  9.17372937e+03 4.90633733e+17 4.97902094e+17 5.17514292e+17
  5.08468032e+17]]
lr.intercept_: [13160.41071543]
lr train score 0.726, lr test score: 0.781803


### Polynomial Regression

In [17]:
poly = PolynomialFeatures (degree = 2)
X_poly = poly.fit_transform(X_final)

#fit model
poly_lr = LinearRegression().fit(X_train, y_train)
y_train_pred = poly_lr.predict(X_train)
y_test_pred = poly_lr.predict(X_test)

#print score
print('poly train score %.3f, poly test score: %.3f' % (
poly_lr.score(X_train,y_train),
poly_lr.score(X_test, y_test)))

poly train score 0.726, poly test score: 0.782


### Support Vector Regression

In [18]:
svr = SVR(kernel='linear', C = 300)

#fit model
svr = svr.fit(X_train,y_train.values.ravel())
y_train_pred = svr.predict(X_train)
y_test_pred = svr.predict(X_test)

#print score
print('svr train score %.3f, svr test score: %.3f' % (
svr.score(X_train,y_train),
svr.score(X_test, y_test)))

svr train score 0.598, svr test score: 0.628


In [19]:
#Decision Tree Regressor

In [20]:
dt = DecisionTreeRegressor(random_state = 0)

#fit model
dt = dt.fit(X_train, y_train.values.ravel())
y_train_pred = dt.predict(X_train)
y_test_pred = dt.predict(X_test)

#print score
print('dt train score %.3f, dt test score: %.3f' % (
dt.score(X_train,y_train),
dt.score(X_test, y_test)))

dt train score 0.999, dt test score: 0.717


In [21]:
#Random Forest Regressor

forest = RandomForestRegressor(n_estimators = 100,
                              criterion = 'mse',
                              random_state = 1,
                              n_jobs = -1)
#Fit model
forest.fit(X_train,y_train.values.ravel())
y_train_pred = forest.predict(X_train)
y_test_pred = forest.predict(X_test)

#print score
print('forest train score %.3f, forest test score: %.3f' % (
forest.score(X_train,y_train),
forest.score(X_test, y_test)))

forest train score 0.973, forest test score: 0.858


### Hyperparameter Optimization

In [22]:
#Function to print best hyperparamaters: 
def print_best_params(gd_model):
    param_dict = gd_model.best_estimator_.get_params()
    model_str = str(gd_model.estimator).split('(')[0]
    print("\n*** {} Best Parameters ***".format(model_str))
    for k in param_dict:
        print("{}: {}".format(k, param_dict[k]))
    print()

#test train split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size = 0.33, random_state = 0 )

#standard scaler (fit transform on train, fit only on test)
sc = StandardScaler()
X_train = sc.fit_transform(X_train.astype(np.float))
X_test= sc.transform(X_test.astype(np.float))

In [23]:
###SVR parameter grid###
param_grid_svr = dict(kernel=[ 'linear', 'poly'],
                     degree=[2],
                     C=[600, 700, 800, 900],
                     epsilon=[0.0001, 0.00001, 0.000001])
svr = GridSearchCV(SVR(), param_grid=param_grid_svr, cv=5, verbose=3)

#fit model
svr = svr.fit(X_train,y_train.values.ravel())

#print score
print('\n\nsvr train score %.3f, svr test score: %.3f' % (
svr.score(X_train,y_train),
svr.score(X_test, y_test)))
#print(svr.best_estimator_.get_params())

print_best_params(svr)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] C=600, degree=2, epsilon=0.0001, kernel=linear ..................
[CV]  C=600, degree=2, epsilon=0.0001, kernel=linear, score=0.67094463651947, total=   0.0s
[CV] C=600, degree=2, epsilon=0.0001, kernel=linear ..................
[CV]  C=600, degree=2, epsilon=0.0001, kernel=linear, score=0.6625292914357526, total=   0.0s
[CV] C=600, degree=2, epsilon=0.0001, kernel=linear ..................
[CV]  C=600, degree=2, epsilon=0.0001, kernel=linear, score=0.5712061595045999, total=   0.0s
[CV] C=600, degree=2, epsilon=0.0001, kernel=linear ..................
[CV]  C=600, degree=2, epsilon=0.0001, kernel=linear, score=0.6362750162212835, total=   0.0s
[CV] C=600, degree=2, epsilon=0.0001, kernel=linear ..................
[CV]  C=600, degree=2, epsilon=0.0001, kernel=linear, score=0.5548173603340135, total=   0.0s
[CV] C=600, degree=2, epsilon=0.0001, kernel=poly ....................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV]  C=600, degree=2, epsilon=0.0001, kernel=poly, score=0.4881094371831467, total=   0.0s
[CV] C=600, degree=2, epsilon=0.0001, kernel=poly ....................
[CV]  C=600, degree=2, epsilon=0.0001, kernel=poly, score=0.33499997468312026, total=   0.0s
[CV] C=600, degree=2, epsilon=0.0001, kernel=poly ....................
[CV]  C=600, degree=2, epsilon=0.0001, kernel=poly, score=0.2772703710878426, total=   0.0s
[CV] C=600, degree=2, epsilon=0.0001, kernel=poly ....................
[CV]  C=600, degree=2, epsilon=0.0001, kernel=poly, score=0.3723043840409911, total=   0.0s
[CV] C=600, degree=2, epsilon=0.0001, kernel=poly ....................
[CV]  C=600, degree=2, epsilon=0.0001, kernel=poly, score=0.22153363729023146, total=   0.0s
[CV] C=600, degree=2, epsilon=1e-05, kernel=linear ...................
[CV]  C=600, degree=2, epsilon=1e-05, kernel=linear, score=0.6709446346193868, total=   0.0s
[CV] C=600, degree=2, epsilon=1e-05, kernel=linear ...................
[CV]  C=600, degree



[CV]  C=600, degree=2, epsilon=1e-05, kernel=linear, score=0.5712061607198453, total=   0.0s
[CV] C=600, degree=2, epsilon=1e-05, kernel=linear ...................
[CV]  C=600, degree=2, epsilon=1e-05, kernel=linear, score=0.6362750123822793, total=   0.0s
[CV] C=600, degree=2, epsilon=1e-05, kernel=linear ...................
[CV]  C=600, degree=2, epsilon=1e-05, kernel=linear, score=0.5548173529480436, total=   0.0s
[CV] C=600, degree=2, epsilon=1e-05, kernel=poly .....................
[CV]  C=600, degree=2, epsilon=1e-05, kernel=poly, score=0.48810943653189487, total=   0.0s
[CV] C=600, degree=2, epsilon=1e-05, kernel=poly .....................
[CV]  C=600, degree=2, epsilon=1e-05, kernel=poly, score=0.33499997689403516, total=   0.0s
[CV] C=600, degree=2, epsilon=1e-05, kernel=poly .....................
[CV]  C=600, degree=2, epsilon=1e-05, kernel=poly, score=0.2772703810520277, total=   0.0s
[CV] C=600, degree=2, epsilon=1e-05, kernel=poly .....................
[CV]  C=600, degree=



[CV]  C=600, degree=2, epsilon=1e-06, kernel=linear, score=0.6709446389835759, total=   0.0s
[CV] C=600, degree=2, epsilon=1e-06, kernel=linear ...................
[CV]  C=600, degree=2, epsilon=1e-06, kernel=linear, score=0.6625293013672464, total=   0.0s
[CV] C=600, degree=2, epsilon=1e-06, kernel=linear ...................
[CV]  C=600, degree=2, epsilon=1e-06, kernel=linear, score=0.5712061595164191, total=   0.0s
[CV] C=600, degree=2, epsilon=1e-06, kernel=linear ...................
[CV]  C=600, degree=2, epsilon=1e-06, kernel=linear, score=0.6362750123814565, total=   0.0s
[CV] C=600, degree=2, epsilon=1e-06, kernel=linear ...................
[CV]  C=600, degree=2, epsilon=1e-06, kernel=linear, score=0.5548173627037023, total=   0.0s
[CV] C=600, degree=2, epsilon=1e-06, kernel=poly .....................
[CV]  C=600, degree=2, epsilon=1e-06, kernel=poly, score=0.4881094364722293, total=   0.0s
[CV] C=600, degree=2, epsilon=1e-06, kernel=poly .....................
[CV]  C=600, degre



[CV]  C=600, degree=2, epsilon=1e-06, kernel=poly, score=0.37230438654613307, total=   0.0s
[CV] C=600, degree=2, epsilon=1e-06, kernel=poly .....................
[CV]  C=600, degree=2, epsilon=1e-06, kernel=poly, score=0.22153362971579116, total=   0.0s
[CV] C=700, degree=2, epsilon=0.0001, kernel=linear ..................
[CV]  C=700, degree=2, epsilon=0.0001, kernel=linear, score=0.6709747693026937, total=   0.0s
[CV] C=700, degree=2, epsilon=0.0001, kernel=linear ..................
[CV]  C=700, degree=2, epsilon=0.0001, kernel=linear, score=0.6624561858555376, total=   0.0s
[CV] C=700, degree=2, epsilon=0.0001, kernel=linear ..................
[CV]  C=700, degree=2, epsilon=0.0001, kernel=linear, score=0.5724144868494866, total=   0.0s
[CV] C=700, degree=2, epsilon=0.0001, kernel=linear ..................
[CV]  C=700, degree=2, epsilon=0.0001, kernel=linear, score=0.6363731442538463, total=   0.0s
[CV] C=700, degree=2, epsilon=0.0001, kernel=linear ..................
[CV]  C=700, d



[CV]  C=700, degree=2, epsilon=0.0001, kernel=poly, score=0.3602429735034349, total=   0.0s
[CV] C=700, degree=2, epsilon=0.0001, kernel=poly ....................
[CV]  C=700, degree=2, epsilon=0.0001, kernel=poly, score=0.3091155696011279, total=   0.0s
[CV] C=700, degree=2, epsilon=0.0001, kernel=poly ....................
[CV]  C=700, degree=2, epsilon=0.0001, kernel=poly, score=0.40559133119411916, total=   0.0s
[CV] C=700, degree=2, epsilon=0.0001, kernel=poly ....................
[CV]  C=700, degree=2, epsilon=0.0001, kernel=poly, score=0.2484504619727486, total=   0.0s
[CV] C=700, degree=2, epsilon=1e-05, kernel=linear ...................
[CV]  C=700, degree=2, epsilon=1e-05, kernel=linear, score=0.6709747345453811, total=   0.0s
[CV] C=700, degree=2, epsilon=1e-05, kernel=linear ...................
[CV]  C=700, degree=2, epsilon=1e-05, kernel=linear, score=0.6624561880297739, total=   0.0s
[CV] C=700, degree=2, epsilon=1e-05, kernel=linear ...................
[CV]  C=700, degree



[CV]  C=700, degree=2, epsilon=1e-06, kernel=linear, score=0.6363731443307941, total=   0.0s
[CV] C=700, degree=2, epsilon=1e-06, kernel=linear ...................
[CV]  C=700, degree=2, epsilon=1e-06, kernel=linear, score=0.5548173698965086, total=   0.0s
[CV] C=700, degree=2, epsilon=1e-06, kernel=poly .....................
[CV]  C=700, degree=2, epsilon=1e-06, kernel=poly, score=0.5349347513790828, total=   0.0s
[CV] C=700, degree=2, epsilon=1e-06, kernel=poly .....................
[CV]  C=700, degree=2, epsilon=1e-06, kernel=poly, score=0.36024297587149634, total=   0.0s
[CV] C=700, degree=2, epsilon=1e-06, kernel=poly .....................
[CV]  C=700, degree=2, epsilon=1e-06, kernel=poly, score=0.30911557130230394, total=   0.0s
[CV] C=700, degree=2, epsilon=1e-06, kernel=poly .....................
[CV]  C=700, degree=2, epsilon=1e-06, kernel=poly, score=0.4055913429417422, total=   0.0s
[CV] C=700, degree=2, epsilon=1e-06, kernel=poly .....................
[CV]  C=700, degree=2,



[CV]  C=800, degree=2, epsilon=0.0001, kernel=linear, score=0.6213656589030019, total=   0.0s
[CV] C=800, degree=2, epsilon=0.0001, kernel=linear ..................
[CV]  C=800, degree=2, epsilon=0.0001, kernel=linear, score=0.5724145025212162, total=   0.0s
[CV] C=800, degree=2, epsilon=0.0001, kernel=linear ..................
[CV]  C=800, degree=2, epsilon=0.0001, kernel=linear, score=0.5882541640507608, total=   0.0s
[CV] C=800, degree=2, epsilon=0.0001, kernel=linear ..................
[CV]  C=800, degree=2, epsilon=0.0001, kernel=linear, score=0.5552533046948135, total=   0.0s
[CV] C=800, degree=2, epsilon=0.0001, kernel=poly ....................
[CV]  C=800, degree=2, epsilon=0.0001, kernel=poly, score=0.5711591884358298, total=   0.0s
[CV] C=800, degree=2, epsilon=0.0001, kernel=poly ....................
[CV]  C=800, degree=2, epsilon=0.0001, kernel=poly, score=0.37722844864373384, total=   0.0s
[CV] C=800, degree=2, epsilon=0.0001, kernel=poly ....................
[CV]  C=800, 



[CV]  C=800, degree=2, epsilon=0.0001, kernel=poly, score=0.26736572306686857, total=   0.0s
[CV] C=800, degree=2, epsilon=1e-05, kernel=linear ...................
[CV]  C=800, degree=2, epsilon=1e-05, kernel=linear, score=0.6855041210135858, total=   0.0s
[CV] C=800, degree=2, epsilon=1e-05, kernel=linear ...................
[CV]  C=800, degree=2, epsilon=1e-05, kernel=linear, score=0.6213656506868842, total=   0.0s
[CV] C=800, degree=2, epsilon=1e-05, kernel=linear ...................
[CV]  C=800, degree=2, epsilon=1e-05, kernel=linear, score=0.5724144958694981, total=   0.0s
[CV] C=800, degree=2, epsilon=1e-05, kernel=linear ...................
[CV]  C=800, degree=2, epsilon=1e-05, kernel=linear, score=0.5882541612746357, total=   0.0s
[CV] C=800, degree=2, epsilon=1e-05, kernel=linear ...................
[CV]  C=800, degree=2, epsilon=1e-05, kernel=linear, score=0.5552533049106408, total=   0.0s
[CV] C=800, degree=2, epsilon=1e-05, kernel=poly .....................
[CV]  C=800, deg



[CV]  C=800, degree=2, epsilon=1e-05, kernel=poly, score=0.34323473416351835, total=   0.0s
[CV] C=800, degree=2, epsilon=1e-05, kernel=poly .....................
[CV]  C=800, degree=2, epsilon=1e-05, kernel=poly, score=0.44224319233339693, total=   0.0s
[CV] C=800, degree=2, epsilon=1e-05, kernel=poly .....................
[CV]  C=800, degree=2, epsilon=1e-05, kernel=poly, score=0.2673657204416662, total=   0.0s
[CV] C=800, degree=2, epsilon=1e-06, kernel=linear ...................
[CV]  C=800, degree=2, epsilon=1e-06, kernel=linear, score=0.6855041245799787, total=   0.0s
[CV] C=800, degree=2, epsilon=1e-06, kernel=linear ...................
[CV]  C=800, degree=2, epsilon=1e-06, kernel=linear, score=0.6213656497496609, total=   0.0s
[CV] C=800, degree=2, epsilon=1e-06, kernel=linear ...................
[CV]  C=800, degree=2, epsilon=1e-06, kernel=linear, score=0.5724144859421205, total=   0.0s
[CV] C=800, degree=2, epsilon=1e-06, kernel=linear ...................
[CV]  C=800, degree=



[CV]  C=900, degree=2, epsilon=0.0001, kernel=linear, score=0.6096395958779766, total=   0.0s
[CV] C=900, degree=2, epsilon=0.0001, kernel=linear ..................
[CV]  C=900, degree=2, epsilon=0.0001, kernel=linear, score=0.5729075042584972, total=   0.0s
[CV] C=900, degree=2, epsilon=0.0001, kernel=linear ..................
[CV]  C=900, degree=2, epsilon=0.0001, kernel=linear, score=0.5709755630929456, total=   0.0s
[CV] C=900, degree=2, epsilon=0.0001, kernel=linear ..................
[CV]  C=900, degree=2, epsilon=0.0001, kernel=linear, score=0.5554505802042341, total=   0.0s
[CV] C=900, degree=2, epsilon=0.0001, kernel=poly ....................
[CV]  C=900, degree=2, epsilon=0.0001, kernel=poly, score=0.5839199139987796, total=   0.0s
[CV] C=900, degree=2, epsilon=0.0001, kernel=poly ....................
[CV]  C=900, degree=2, epsilon=0.0001, kernel=poly, score=0.38599288128145026, total=   0.0s
[CV] C=900, degree=2, epsilon=0.0001, kernel=poly ....................
[CV]  C=900, 



[CV]  C=900, degree=2, epsilon=1e-05, kernel=linear, score=0.7087079476575762, total=   0.0s
[CV] C=900, degree=2, epsilon=1e-05, kernel=linear ...................
[CV]  C=900, degree=2, epsilon=1e-05, kernel=linear, score=0.6096396156859728, total=   0.0s
[CV] C=900, degree=2, epsilon=1e-05, kernel=linear ...................
[CV]  C=900, degree=2, epsilon=1e-05, kernel=linear, score=0.5729075031838491, total=   0.0s
[CV] C=900, degree=2, epsilon=1e-05, kernel=linear ...................
[CV]  C=900, degree=2, epsilon=1e-05, kernel=linear, score=0.5709755569821123, total=   0.0s
[CV] C=900, degree=2, epsilon=1e-05, kernel=linear ...................
[CV]  C=900, degree=2, epsilon=1e-05, kernel=linear, score=0.5554505853811699, total=   0.0s
[CV] C=900, degree=2, epsilon=1e-05, kernel=poly .....................
[CV]  C=900, degree=2, epsilon=1e-05, kernel=poly, score=0.5839199124719683, total=   0.0s
[CV] C=900, degree=2, epsilon=1e-05, kernel=poly .....................
[CV]  C=900, degre



[CV]  C=900, degree=2, epsilon=1e-05, kernel=poly, score=0.461022477874403, total=   0.0s
[CV] C=900, degree=2, epsilon=1e-05, kernel=poly .....................
[CV]  C=900, degree=2, epsilon=1e-05, kernel=poly, score=0.2939124275982824, total=   0.0s
[CV] C=900, degree=2, epsilon=1e-06, kernel=linear ...................
[CV]  C=900, degree=2, epsilon=1e-06, kernel=linear, score=0.7087079467836674, total=   0.0s
[CV] C=900, degree=2, epsilon=1e-06, kernel=linear ...................
[CV]  C=900, degree=2, epsilon=1e-06, kernel=linear, score=0.6096396088064401, total=   0.0s
[CV] C=900, degree=2, epsilon=1e-06, kernel=linear ...................
[CV]  C=900, degree=2, epsilon=1e-06, kernel=linear, score=0.5729074935397986, total=   0.0s
[CV] C=900, degree=2, epsilon=1e-06, kernel=linear ...................
[CV]  C=900, degree=2, epsilon=1e-06, kernel=linear, score=0.570975554742604, total=   0.0s
[CV] C=900, degree=2, epsilon=1e-06, kernel=linear ...................
[CV]  C=900, degree=2,

[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:    3.3s finished


In [24]:
###Decision Tree parameter grid
param_grid_dt = dict(min_samples_leaf=np.arange(9, 13, 1, int), 
                  max_depth = np.arange(4,7,1, int),
                  min_impurity_decrease = [0, 1, 2],
                 )
dt = GridSearchCV(DecisionTreeRegressor(random_state=0), param_grid=param_grid_dt, cv=5,  verbose=3)



#fit model
dt = dt.fit(X_train,y_train.values.ravel())


#print score
print('\n\ndt train score %.3f, dt test score: %.3f' % (
dt.score(X_train,y_train),
dt.score(X_test, y_test)))
print_best_params(dt)


Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] max_depth=4, min_impurity_decrease=0, min_samples_leaf=9 ........
[CV]  max_depth=4, min_impurity_decrease=0, min_samples_leaf=9, score=0.8174556867332768, total=   0.0s
[CV] max_depth=4, min_impurity_decrease=0, min_samples_leaf=9 ........
[CV]  max_depth=4, min_impurity_decrease=0, min_samples_leaf=9, score=0.8186735560816233, total=   0.0s
[CV] max_depth=4, min_impurity_decrease=0, min_samples_leaf=9 ........
[CV]  max_depth=4, min_impurity_decrease=0, min_samples_leaf=9, score=0.845892337032524, total=   0.0s
[CV] max_depth=4, min_impurity_decrease=0, min_samples_leaf=9 ........


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV]  max_depth=4, min_impurity_decrease=0, min_samples_leaf=9, score=0.8119115903518325, total=   0.0s
[CV] max_depth=4, min_impurity_decrease=0, min_samples_leaf=9 ........
[CV]  max_depth=4, min_impurity_decrease=0, min_samples_leaf=9, score=0.7957717631864334, total=   0.0s
[CV] max_depth=4, min_impurity_decrease=0, min_samples_leaf=10 .......
[CV]  max_depth=4, min_impurity_decrease=0, min_samples_leaf=10, score=0.8175820488474479, total=   0.0s
[CV] max_depth=4, min_impurity_decrease=0, min_samples_leaf=10 .......
[CV]  max_depth=4, min_impurity_decrease=0, min_samples_leaf=10, score=0.8186735560816233, total=   0.0s
[CV] max_depth=4, min_impurity_decrease=0, min_samples_leaf=10 .......
[CV]  max_depth=4, min_impurity_decrease=0, min_samples_leaf=10, score=0.845892337032524, total=   0.0s
[CV] max_depth=4, min_impurity_decrease=0, min_samples_leaf=10 .......
[CV]  max_depth=4, min_impurity_decrease=0, min_samples_leaf=10, score=0.814855308828605, total=   0.0s
[CV] max_depth=4, m

[CV]  max_depth=5, min_impurity_decrease=1, min_samples_leaf=12, score=0.7870699566205422, total=   0.0s
[CV] max_depth=5, min_impurity_decrease=2, min_samples_leaf=9 ........
[CV]  max_depth=5, min_impurity_decrease=2, min_samples_leaf=9, score=0.8270930350937251, total=   0.0s
[CV] max_depth=5, min_impurity_decrease=2, min_samples_leaf=9 ........
[CV]  max_depth=5, min_impurity_decrease=2, min_samples_leaf=9, score=0.8067246877367681, total=   0.0s
[CV] max_depth=5, min_impurity_decrease=2, min_samples_leaf=9 ........
[CV]  max_depth=5, min_impurity_decrease=2, min_samples_leaf=9, score=0.8533870721467597, total=   0.0s
[CV] max_depth=5, min_impurity_decrease=2, min_samples_leaf=9 ........
[CV]  max_depth=5, min_impurity_decrease=2, min_samples_leaf=9, score=0.8210767357529519, total=   0.0s
[CV] max_depth=5, min_impurity_decrease=2, min_samples_leaf=9 ........
[CV]  max_depth=5, min_impurity_decrease=2, min_samples_leaf=9, score=0.7872605325189177, total=   0.0s
[CV] max_depth=5, mi

[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed:    0.2s finished


In [25]:
###Random Forest parameter grid###
param_grid_rf = dict(n_estimators=[20],
                     max_depth=np.arange(1, 13, 2),
                     min_samples_split=[2],
                     min_samples_leaf= np.arange(1, 15, 2, int),
                     bootstrap=[True, False],
                     oob_score=[False, ])


forest = GridSearchCV(RandomForestRegressor(random_state=0), param_grid=param_grid_rf, cv=5, verbose=3)

#fit model
forest.fit(X_train,y_train.values.ravel())


#print score
print('\n\nforest train score %.3f, forest test score: %.3f' % (
forest.score(X_train,y_train),
forest.score(X_test, y_test)))

print_best_params(forest)

Fitting 5 folds for each of 84 candidates, totalling 420 fits
[CV] bootstrap=True, max_depth=1, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=True, max_depth=1, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False, score=0.6157562073868407, total=   0.0s
[CV] bootstrap=True, max_depth=1, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=True, max_depth=1, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False, score=0.45448510589474655, total=   0.0s
[CV] bootstrap=True, max_depth=1, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV]  bootstrap=True, max_depth=1, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False, score=0.6461987587113623, total=   0.0s
[CV] bootstrap=True, max_depth=1, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=True, max_depth=1, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False, score=0.5951820393497897, total=   0.0s
[CV] bootstrap=True, max_depth=1, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=True, max_depth=1, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False, score=0.5820985074413877, total=   0.0s
[CV] bootstrap=True, max_depth=1, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=True, max_depth=1, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False, score=0.6157562073868407, total=   0.0s
[CV] bootstrap=True, max_depth=1, min_samples_leaf=3, min_samples_split=2

[CV]  bootstrap=True, max_depth=3, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False, score=0.8345392836771851, total=   0.0s
[CV] bootstrap=True, max_depth=3, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=True, max_depth=3, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False, score=0.814223257398913, total=   0.0s
[CV] bootstrap=True, max_depth=3, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=True, max_depth=3, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False, score=0.8580084637868451, total=   0.0s
[CV] bootstrap=True, max_depth=3, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=True, max_depth=3, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False, score=0.8269932574355883, total=   0.0s
[CV] bootstrap=True, max_depth=3, min_samples_leaf=3, min_samples_split=2,

[CV]  bootstrap=True, max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False, score=0.7898178181636132, total=   0.0s
[CV] bootstrap=True, max_depth=5, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=True, max_depth=5, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False, score=0.8375514425811351, total=   0.0s
[CV] bootstrap=True, max_depth=5, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=True, max_depth=5, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False, score=0.8141723275679593, total=   0.0s
[CV] bootstrap=True, max_depth=5, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=True, max_depth=5, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False, score=0.8533337287056237, total=   0.0s
[CV] bootstrap=True, max_depth=5, min_samples_leaf=3, min_samples_split=2

[CV]  bootstrap=True, max_depth=7, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False, score=0.7788876381724702, total=   0.0s
[CV] bootstrap=True, max_depth=7, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=True, max_depth=7, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False, score=0.8373176579639524, total=   0.0s
[CV] bootstrap=True, max_depth=7, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=True, max_depth=7, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False, score=0.8105135067929906, total=   0.0s
[CV] bootstrap=True, max_depth=7, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=True, max_depth=7, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False, score=0.7786320137545486, total=   0.0s
[CV] bootstrap=True, max_depth=7, min_samples_leaf=3, min_samples_split=2

[CV]  bootstrap=True, max_depth=7, min_samples_leaf=13, min_samples_split=2, n_estimators=20, oob_score=False, score=0.8348977537240649, total=   0.0s
[CV] bootstrap=True, max_depth=7, min_samples_leaf=13, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=True, max_depth=7, min_samples_leaf=13, min_samples_split=2, n_estimators=20, oob_score=False, score=0.793080370562495, total=   0.0s
[CV] bootstrap=True, max_depth=9, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=True, max_depth=9, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False, score=0.8150343735717724, total=   0.0s
[CV] bootstrap=True, max_depth=9, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=True, max_depth=9, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False, score=0.7636001235941536, total=   0.0s
[CV] bootstrap=True, max_depth=9, min_samples_leaf=1, min_samples_split

[CV]  bootstrap=True, max_depth=11, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False, score=0.7608243101105114, total=   0.0s
[CV] bootstrap=True, max_depth=11, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=True, max_depth=11, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False, score=0.8238974165471219, total=   0.0s
[CV] bootstrap=True, max_depth=11, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=True, max_depth=11, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False, score=0.7949406042281539, total=   0.0s
[CV] bootstrap=True, max_depth=11, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=True, max_depth=11, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False, score=0.7621662233610347, total=   0.0s
[CV] bootstrap=True, max_depth=11, min_samples_leaf=3, min_samples

[CV]  bootstrap=False, max_depth=1, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False, score=0.4510068688489669, total=   0.0s
[CV] bootstrap=False, max_depth=1, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=False, max_depth=1, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False, score=0.6472611250118853, total=   0.0s
[CV] bootstrap=False, max_depth=1, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=False, max_depth=1, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False, score=0.5946410570834861, total=   0.0s
[CV] bootstrap=False, max_depth=1, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=False, max_depth=1, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False, score=0.5839674705308063, total=   0.0s
[CV] bootstrap=False, max_depth=1, min_samples_leaf=3, min_samples

[CV]  bootstrap=False, max_depth=3, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False, score=0.8061637349235868, total=   0.0s
[CV] bootstrap=False, max_depth=3, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=False, max_depth=3, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False, score=0.8425266759699168, total=   0.0s
[CV] bootstrap=False, max_depth=3, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=False, max_depth=3, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False, score=0.8042150434366917, total=   0.0s
[CV] bootstrap=False, max_depth=3, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=False, max_depth=3, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False, score=0.7865239823742289, total=   0.0s
[CV] bootstrap=False, max_depth=3, min_samples_leaf=5, min_samples

[CV]  bootstrap=False, max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False, score=0.7266837165077519, total=   0.0s
[CV] bootstrap=False, max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=False, max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False, score=0.7753660718324636, total=   0.0s
[CV] bootstrap=False, max_depth=5, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=False, max_depth=5, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False, score=0.8280186274776042, total=   0.0s
[CV] bootstrap=False, max_depth=5, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=False, max_depth=5, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False, score=0.7979654670605356, total=   0.0s
[CV] bootstrap=False, max_depth=5, min_samples_leaf=3, min_samples

[CV]  bootstrap=False, max_depth=7, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False, score=0.7679670578234237, total=   0.0s
[CV] bootstrap=False, max_depth=7, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=False, max_depth=7, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False, score=0.6567392910162, total=   0.0s
[CV] bootstrap=False, max_depth=7, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=False, max_depth=7, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False, score=0.7691576940836282, total=   0.0s
[CV] bootstrap=False, max_depth=7, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=False, max_depth=7, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False, score=0.7885595577135569, total=   0.0s
[CV] bootstrap=False, max_depth=7, min_samples_leaf=3, min_samples_sp

[CV]  bootstrap=False, max_depth=9, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False, score=0.7451473272326024, total=   0.0s
[CV] bootstrap=False, max_depth=9, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=False, max_depth=9, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False, score=0.7644778171617034, total=   0.0s
[CV] bootstrap=False, max_depth=9, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=False, max_depth=9, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False, score=0.7102434950363952, total=   0.0s
[CV] bootstrap=False, max_depth=9, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=False, max_depth=9, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False, score=0.7757774848226835, total=   0.0s
[CV] bootstrap=False, max_depth=9, min_samples_leaf=3, min_samples

[CV]  bootstrap=False, max_depth=11, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False, score=0.7542397894009193, total=   0.0s
[CV] bootstrap=False, max_depth=11, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=False, max_depth=11, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False, score=0.6997669736142875, total=   0.0s
[CV] bootstrap=False, max_depth=11, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=False, max_depth=11, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False, score=0.7749179854450254, total=   0.0s
[CV] bootstrap=False, max_depth=11, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=False, max_depth=11, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False, score=0.7702907641786806, total=   0.0s
[CV] bootstrap=False, max_depth=11, min_samples_leaf=3, min

[Parallel(n_jobs=1)]: Done 420 out of 420 | elapsed:    9.9s finished
