# Hands-on Machine Learning Chapter 06 - Decision Trees - Exercises

# Environment Setup

In [0]:
import numpy as np
# Seed NumPy's global random number generator so repeated runs start from the same state
np.random.seed(42)

# Matplotlib for generating figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Default font sizes for axis labels and tick labels on every figure
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Scikit Imports
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split, ShuffleSplit, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.base import clone

# For finding most frequently occurring label in predictions
from scipy.stats import mode

# Exercise 7 - `moons` Dataset

## Importing Data

In [0]:
from sklearn.datasets import make_moons

In [0]:
# Generate the noisy `moons` dataset; the fixed seed keeps it repeatable
X, y = make_moons(
    n_samples=10000,
    noise=0.4,
    random_state=42,
)

In [28]:
# Report the dimensions of the feature matrix and of the label vector
for caption, arr in (("Shape of training samples: ", X),
                     ("Shape of training labels: ", y)):
  print(caption, arr.shape)

Shape of training samples:  (10000, 2)
Shape of training labels:  (10000,)


In [29]:
# Is this a binary classification problem?
# (list the distinct label values present in y)
np.unique(y)

array([0, 1])

## Split into training and test sets

In [0]:
# Split the data into training and test sets with a fixed seed so the split is repeatable.
# NOTE(review): train_size=0.2 keeps only 20% of the samples for training and leaves
# 80% for testing — confirm this is intended rather than test_size=0.2.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.2, random_state=42)

In [31]:
# Show the shape of each of the four arrays produced by the split
for caption, arr in (("X train:\t", X_train),
                     ("X_test:\t\t", X_test),
                     ("y_train:\t", y_train),
                     ("y_test:\t\t", y_test)):
  print(caption, arr.shape)

X train:	 (2000, 2)
X_test:		 (8000, 2)
y_train:	 (2000,)
y_test:		 (8000,)


## Grid Search for Best Hyperparameters

### Defining Parameter Grid

In [0]:
# Candidate values for the grid search; both hyperparameters must be greater than 1
param_grid = dict(
    max_leaf_nodes=[n for n in range(2, 100)],     # 2 .. 99
    min_samples_split=[n for n in range(2, 5)],    # 2 .. 4
)

### Creating a `GridSearchCV` Object

We create this object with a `DecisionTreeClassifier` as the estimator and the `param_grid` defined above. Each candidate is evaluated with 3-fold cross-validation. The classifier's random state is fixed so that differences between candidates are not caused by the stochastic selection of features.

In [0]:
# 3-fold cross-validated grid search over `param_grid`, using a decision tree
# with a fixed random state as the base estimator
decision_tree_cv = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid,
    verbose=1,
    cv=3,
)

### Fitting Folds

In [42]:
# Run the cross-validated search over every parameter combination on the training split
decision_tree_cv.fit(X_train, y_train)

Fitting 3 folds for each of 294 candidates, totalling 882 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 882 out of 882 | elapsed:    2.5s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=42,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'max_leaf_nodes': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,


In [43]:
# Inspect the estimator that the search selected as best
decision_tree_cv.best_estimator_

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=17,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

## Training Best Estimator

### Extracting Best Estimator

In [0]:
# Keep a handle on the best estimator found by the grid search
best_decision_tree = decision_tree_cv.best_estimator_

In [45]:
# List every hyperparameter of the selected tree
best_decision_tree.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': 17,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': 'deprecated',
 'random_state': 42,
 'splitter': 'best'}

### Training on Entire Training Set

In [0]:
# Rebuild a fresh tree from the hyperparameters chosen by the grid search rather than
# hard-coding them, so this cell stays in sync if the grid or the data changes.
# `best_params_` holds the winning value for each key of `param_grid`
# (max_leaf_nodes and min_samples_split); random_state matches the searched estimator.
decision_tree_clf_best = DecisionTreeClassifier(random_state=42,
                                                **decision_tree_cv.best_params_)

In [48]:
# Fit the hand-built tree on the full training split.
# NOTE(review): GridSearchCV's default refit behaviour presumably already refits
# best_estimator_ on all of X_train, which would make this fit redundant — confirm.
decision_tree_clf_best.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=17,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

In [0]:
# Predict the test labels with the grid search's best estimator ...
y_pred_cv = decision_tree_cv.best_estimator_.predict(X_test)
# ... and with the tree that was rebuilt and fitted by hand
y_pred_new = decision_tree_clf_best.predict(X_test)

In [52]:
# Compare the test-set accuracy of the two fitted trees
for caption, predictions in (("Accuracy Score of CV's Best Estimator: ", y_pred_cv),
                             ("Accuracy Score of My Best Estimator: ", y_pred_new)):
  print(caption, accuracy_score(y_test, predictions))

Accuracy Score of CV's Best Estimator:  0.852875
Accuracy Score of My Best Estimator:  0.852875


Both models give the same accuracy score, although it is on the lower side compared to Aurelien's results. This is presumably because only 20% of the data (2,000 samples) was used for training here.

# Exercise 8 - Grow a Forest

We will be training 1,000 `DecisionTreeClassifiers` with the same hyperparameters as derived above on different subsets of the training data, and evaluating each of them on the test set. 

We will then find the predictions of all 1,000 trees on each instance of the test set. The final prediction for this instance of the test set will be the class predicted by the majority of the Decision Trees. These are called **majority vote predictions**.

In [0]:
# n_trees:     number of trees to train
# n_instances: number of samples per training set
n_trees, n_instances = 1000, 100

In [0]:
# Container for the (samples, labels) training subsets, filled in later
mini_sets = list()

In [0]:
# Splitter that yields one (train indices, test indices) pair per tree.
# Every sample except the `n_instances` kept for training goes to the test side.
rs = ShuffleSplit(
    random_state=42,
    n_splits=n_trees,
    test_size=len(X_train) - n_instances,
)

In [0]:
# Split the training set into one small training subset per tree.
# The list is rebuilt in a single assignment, so re-running this cell replaces
# the subsets instead of appending a second copy to the existing list.
mini_sets = [
  (X_train[mini_train_index], y_train[mini_train_index])   # (subset samples, subset labels)
  for mini_train_index, _ in rs.split(X_train)             # the test-side indices are not needed
]

In [0]:
# One unfitted copy of the grid search's best estimator per tree,
# each carrying the same hyperparameters
forest = list(map(clone, [decision_tree_cv.best_estimator_] * n_trees))

In [0]:
# Collects the test-set accuracy score of each individual decision tree
accuracy_scores = list()

In [0]:
# Fit every tree to a separate subset and record its accuracy on the test set.
# The score list is reset here so that re-running this cell does not append a
# second batch of scores to the ones already collected.
accuracy_scores = []

for tree, (X_mini_train, y_mini_train) in zip(forest, mini_sets):
  # Fit this tree to its corresponding subset of the training data
  tree.fit(X_mini_train, y_mini_train)

  # Evaluate it on the entire dataset's test data
  y_pred = tree.predict(X_test)
  accuracy_scores.append(accuracy_score(y_test, y_pred))

In [71]:
# What is the average accuracy score?
# (mean of the per-tree test-set accuracies collected above)
np.mean(accuracy_scores)

0.79613325

This is understandably worse than the original `DecisionTreeClassifier`, because each tree has been trained on 20 times fewer samples (100 instead of 2,000) than the original.

In [0]:
# Uninitialised prediction matrix: rows are trained trees, columns are test samples
y_pred_test = np.empty(shape=(n_trees, len(X_test)), dtype=np.uint8)

In [0]:
# Store each tree's predictions for the whole test set in that tree's row
for row, fitted_tree in enumerate(forest):
  y_pred_test[row, :] = fitted_tree.predict(X_test)

In [0]:
# Use the `scipy.stats.mode` method to find the most frequently occurring prediction
# for each test instance (axis=0 reduces over the trees, i.e. the rows of y_pred_test)
y_pred_majority_votes, n_votes = mode(y_pred_test, axis=0)

In [77]:
# Does it improve accuracy?
# Flatten the majority-vote predictions to one label per test sample before scoring
accuracy_score(y_test, y_pred_majority_votes.ravel())

0.866125

It did. Accuracy increased from 85.3% to 86.6%!