In [1]:

import numpy as np
import pandas as pd
from pandas_datareader import data

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn import metrics
from sklearn.metrics import r2_score


In [2]:
data_url = "http://lib.stat.cmu.edu/datasets/boston"

# Read the dataset
raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)

# Process the raw data
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

# Convert to a pandas DataFrame
columns = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
    'TAX', 'PTRATIO', 'B', 'LSTAT'
]
df = pd.DataFrame(data, columns=columns)


In [3]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [4]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
rt = DecisionTreeRegressor(criterion='absolute_error', max_depth=4)

In [7]:
rt.fit(X_train, y_train)

In [8]:
y_pred = rt.predict(X_test)

In [9]:
r2_score(y_test, y_pred)

0.6590098519498115

## HyperParameter Tuning 

In [10]:
param_grid = {
    'max_depth':[2,4,8,10, None],
    'criterion':['mse', 'mae'],
    'max_features':[0.25, 0.5, 1.0],
    'min_samples_split':[0.25, 0.5, 1.0]
}

In [11]:
reg = GridSearchCV(DecisionTreeRegressor(), param_grid=param_grid)

In [12]:
reg.fit(X_train, y_train)

ValueError: 
All the 450 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
225 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\susha\anaconda3\envs\machine_learning\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\susha\anaconda3\envs\machine_learning\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "C:\Users\susha\anaconda3\envs\machine_learning\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\susha\anaconda3\envs\machine_learning\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'criterion' parameter of DecisionTreeRegressor must be a str among {'poisson', 'friedman_mse', 'absolute_error', 'squared_error'}. Got 'mse' instead.

--------------------------------------------------------------------------------
225 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\susha\anaconda3\envs\machine_learning\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\susha\anaconda3\envs\machine_learning\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "C:\Users\susha\anaconda3\envs\machine_learning\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\susha\anaconda3\envs\machine_learning\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'criterion' parameter of DecisionTreeRegressor must be a str among {'poisson', 'friedman_mse', 'absolute_error', 'squared_error'}. Got 'mae' instead.


In [13]:
# code is not runnning 

# reg.best_score_
# reg.best_params_

## Feature Importance 

In [15]:
for importance, name in sorted(zip(rt.feature_importances_, X_train.columns), reverse=True):
    print(name, " : ", importance)

INDUS  :  0.4798787481566447
RM  :  0.21624610847124354
CRIM  :  0.13467966573816118
DIS  :  0.10733245944617402
AGE  :  0.06186301818777645
ZN  :  0.0
TAX  :  0.0
RAD  :  0.0
PTRATIO  :  0.0
NOX  :  0.0
CHAS  :  0.0
B  :  0.0


In [16]:
# this technique helpful for dimensionality Reduction