In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.datasets import load_boston

In [3]:
boston=load_boston()

In [4]:
boston.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])

In [5]:
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [6]:
boston_df=pd.DataFrame(boston.data,columns=boston.feature_names)

In [7]:
boston_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [8]:
X=boston_df
y=boston.target

In [9]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.33,random_state=42)

In [10]:
from sklearn.tree import DecisionTreeRegressor
regressor=DecisionTreeRegressor()

In [11]:
parameter={'criterion':[ 'friedman_mse'],
           'splitter':['best','random'],
           'max_depth':[int(x) for x in np.linspace(1, 5, num = 5)],
           'max_features': ['auto','sqrt', 'log2'],
          }

In [12]:
## first way
# hyperparameter tuning 

from sklearn.model_selection import GridSearchCV

ghregressor=GridSearchCV(regressor,param_grid=parameter,scoring='neg_mean_squared_error',cv=10)

ghregressor.fit(X_train,y_train)



GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=DecisionTreeRegressor(criterion='mse', max_depth=None,
                                             max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             presort=False, random_state=None,
                                             splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'criterion': ['friedman_mse'],
                         'max_depth': [1, 2, 3, 4, 5],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'splitte

In [13]:
print(ghregressor.best_params_)

{'criterion': 'friedman_mse', 'max_depth': 5, 'max_features': 'auto', 'splitter': 'random'}


In [14]:
# to select the scoring parameter
sorted(sklearn.metrics.SCORERS.keys()) 

<IPython.core.display.Javascript object>

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'brier_score_loss',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'v_measure_score']

In [15]:
y_pred=ghregressor.predict(X_test)

In [16]:
from sklearn.metrics import r2_score

In [17]:
round(r2_score(y_pred,y_test)*100,2)

53.21

In [18]:
## second way
# hyperparameter tuning 

from sklearn.model_selection import RandomizedSearchCV
rsregressor=RandomizedSearchCV(regressor,param_distributions=parameter,scoring='neg_mean_squared_error',cv=10)
rsregressor.fit(X_train,y_train)



RandomizedSearchCV(cv=10, error_score='raise-deprecating',
                   estimator=DecisionTreeRegressor(criterion='mse',
                                                   max_depth=None,
                                                   max_features=None,
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   presort=False,
                                                   random_state=None,
                                                   splitter='best'),
                   iid='warn', n_iter=10, n_jobs=None,
                   param_d

In [19]:
print(rsregressor.best_params_)

{'splitter': 'random', 'max_features': 'auto', 'max_depth': 5, 'criterion': 'friedman_mse'}


In [20]:
y_pred=rsregressor.predict(X_test)

In [21]:
from sklearn.metrics import r2_score

In [22]:
round(r2_score(y_pred,y_test)*100,2)

65.35

In [23]:
## NOTE : The only difference between both the approaches is 
## in grid search we define the combinations and do training of the model 
## whereas in RandomizedSearchCV the model selects the combinations randomly.