In [4]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

##### Loading boston dataset.

In [8]:

data_url = "http://lib.stat.cmu.edu/datasets/boston"
df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
data = np.hstack([df.values[::2, :], df.values[1::2, :2]])
target = df.values[1::2, 2]

In [9]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3
1,396.9,4.98,24.0,,,,,,,,
2,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8
3,396.9,9.14,21.6,,,,,,,,
4,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8


In [22]:
df.dropna(axis=0,inplace=True)

In [23]:
X = df.iloc[:,0:10]
y = df.iloc[:,10]

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

In [25]:
rt = DecisionTreeRegressor(criterion ='squared_error' , max_depth=5)
     

In [26]:
rt.fit(X_train,y_train)

In [27]:
y_pred = rt.predict(X_test)


In [28]:

r2_score(y_test,y_pred)

0.5755777082884603

##### Hyperparameter tunning.

In [30]:
param_grid = {
    'max_depth':[2,4,8,10,None],
    'criterion':['squared_error','absolute_error','friedman_mse'],
    'max_features':[0.25,0.5,1.0],
    'min_samples_split':[0.25,0.5,1.0]
}

In [31]:

reg = GridSearchCV(DecisionTreeRegressor(),param_grid=param_grid)

In [32]:

reg.fit(X_train,y_train)

In [33]:
reg.best_score_

0.7318352353747176

In [34]:
reg.best_params_

{'criterion': 'friedman_mse',
 'max_depth': 8,
 'max_features': 1.0,
 'min_samples_split': 0.25}

### Feature Importance

##### Decision tree calculates the imp of each column in the dataset,so this can be useful when we have a large dataset and we need to drop some columns.We can access the importance of each column by attribute feature_importance.

In [35]:
for importance, name in sorted(zip(rt.feature_importances_, X_train.columns),reverse=True):
    print(name, importance)

9 0.39967562904819975
4 0.3469266002676566
2 0.15502867555619165
0 0.06375871991263338
7 0.034610375215318415
8 0.0
6 0.0
5 0.0
3 0.0
1 0.0
