In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Decision Trees

### Exercises

1. If the features are exactly equivalent of all the instances, then depth = 0. In the worst case, where each instance is in a leaf node, then it would be ~20.
2. Based on the CART algorithm, the node must not split if the Gini impurity cannot be lowered. So always.
3. Yes, reducing "max_" hyperparameters regularize the model. Same as with increasing "min_" hyperparameters.
4. DTs don't require scaled features, unless the axis orientation is having an effect (which requires standardizing + PCA)
5. ~11.7 hours
6. ~2 hours i.e, double

### 7

In [2]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

X, y = make_moons(10000, noise=0.4, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

param_grid = {
    'max_leaf_nodes': [2, 4, 6, 8, 10]
}

grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid=param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

In [9]:
print(grid_search.best_params_)
cv_res = pd.DataFrame(grid_search.cv_results_)
cv_res.sort_values(by="mean_test_score", ascending=False, inplace=True)

cv_res.head(5)

{'max_leaf_nodes': 4}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_leaf_nodes,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
1,0.004643,0.000394,0.000663,0.0001,4,{'max_leaf_nodes': 4},0.842145,0.859393,0.855964,0.8525,0.007455,1
2,0.004599,0.000267,0.000581,5e-05,6,{'max_leaf_nodes': 6},0.842145,0.859393,0.855964,0.8525,0.007455,1
3,0.004589,0.00011,0.000484,7e-05,8,{'max_leaf_nodes': 8},0.842145,0.859393,0.855964,0.8525,0.007455,1
4,0.004638,0.000171,0.000444,6.7e-05,10,{'max_leaf_nodes': 10},0.842145,0.856393,0.855964,0.851501,0.006618,4
0,0.004253,0.000609,0.000663,7.7e-05,2,{'max_leaf_nodes': 2},0.753656,0.782527,0.777194,0.771126,0.012544,5


In [10]:
grid_search.score(X_test, y_test)

0.863

### 8

In [12]:
from sklearn.model_selection import ShuffleSplit

rs = ShuffleSplit(n_splits=1000, test_size=0.01, train_size=0.01, random_state=42)
trees = []
test_set_scores = []

for train_index, test_index in rs.split(X_train):
    dt = DecisionTreeClassifier(max_leaf_nodes=4)
    
    # Train
    dt.fit(X[train_index], y[train_index])
    
    # Check test value
    test_set_scores.append(dt.score(X_test, y_test))
    
    trees.append(dt)
    

In [13]:
test_set_scores[:5]

[0.854, 0.8095, 0.873, 0.849, 0.859]

In [17]:
from scipy import stats

# majority-vote
forest_pred = [stats.mode([dt.predict([test_instance]) for dt in trees]) for test_instance in X_test]

In [22]:
from sklearn.metrics import accuracy_score

y_pred = np.concatenate([pred.mode for pred in forest_pred])
accuracy_score(y_test, y_pred)

0.869