# Training and Visualizing a Decision Tree

In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# Seed NumPy's global random number generator so that this notebook's output
# is stable across runs.
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
# Default font sizes for axis labels and tick labels on every figure.
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

# Where to save the figures: image_path() below joins these as
# <PROJECT_ROOT_DIR>/images/<CHAPTER_ID>/<fig_id>.
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "decision_trees"

def image_path(fig_id):
    """Return the path of ``fig_id`` inside this chapter's image directory.

    The result is ``<PROJECT_ROOT_DIR>/images/<CHAPTER_ID>/<fig_id>``; no
    extension is appended and nothing is created on disk.
    """
    chapter_dir = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
    return os.path.join(chapter_dir, fig_id)

def save_fig(fig_id, tight_layout=True):
    """Save the current matplotlib figure as ``<image_path(fig_id)>.png``.

    Parameters
    ----------
    fig_id : str
        Figure identifier; it is passed to ``image_path`` and ``".png"`` is
        appended to the result.
    tight_layout : bool, default True
        When true, ``plt.tight_layout()`` is called before saving.

    The target directory is created if it does not exist yet, so the call
    does not fail on a fresh checkout that has no ``images/`` tree.
    """
    path = image_path(fig_id) + ".png"
    target_dir = os.path.dirname(path)
    # isdir() + makedirs() instead of makedirs(exist_ok=True): this notebook
    # declares Python 2 support, where exist_ok is not available.
    if target_dir and not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)


In [2]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

# Load iris and keep only the feature columns from index 2 onward.
iris = load_iris()
X = iris.data[:, 2:]
y = iris.target

# Depth-limited tree; fit() returns the estimator itself, so the fitted
# classifier is bound directly and then displayed as the cell output.
tree_clf = DecisionTreeClassifier(max_depth=2).fit(X, y)
tree_clf

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [3]:
from sklearn.tree import export_graphviz

# Export the fitted iris tree as a Graphviz .dot file.
dot_path = image_path("iris_tree.dot")
dot_dir = os.path.dirname(dot_path)
# export_graphviz writes to out_file but does not create missing directories,
# so make sure the chapter's image directory exists first. isdir() + makedirs()
# is used (rather than exist_ok=True) to stay Python 2 compatible.
if dot_dir and not os.path.isdir(dot_dir):
    os.makedirs(dot_dir)

export_graphviz(
    tree_clf,
    out_file=dot_path,
    feature_names=iris.feature_names[2:],  # same columns as X = iris["data"][:, 2:]
    class_names=iris.target_names,
    rounded=True,
    filled=True,
)

# Estimating Class Probabilities

In [4]:
# Per-class probability estimates for one sample whose two feature values
# (the columns kept by X = iris["data"][:, 2:]) are 5 and 1.5.
tree_clf.predict_proba([[5, 1.5]])

array([[0.        , 0.90740741, 0.09259259]])

In [7]:
# Predicted class index for the sample [5, 1.5].
tree_clf.predict([[5, 1.5]])

array([1])

# Exercises

## Exercise 7

In [1]:
from sklearn.datasets import make_moons

In [5]:
# Generate a 10,000-sample moons dataset with noise=0.4 (seeded for
# reproducibility). Note: this rebinds X and y, which held the iris data in
# the first part of the notebook.
X, y = make_moons(n_samples=10000, noise=0.4, random_state=42)

In [7]:
# Display the generated labels (NumPy abbreviates long arrays in the repr).
y

array([1, 0, 0, ..., 1, 0, 1], dtype=int64)

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Hold out 20% of the samples as the test set; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

In [26]:
# Candidate hyperparameters: 198 values of max_leaf_nodes (2..199) times
# 3 values of min_samples_split = 594 combinations.
param_grid = {
    "max_leaf_nodes": list(range(2, 200)),
    "min_samples_split": [2, 3, 4],
}

tree_clf = DecisionTreeClassifier(random_state=40)
# cv is pinned explicitly: the library default changed from 3 to 5 folds in
# scikit-learn 0.22, and the results recorded in this notebook come from a
# 3-fold search. Pinning it keeps re-runs comparable across versions.
grid_search = GridSearchCV(tree_clf, param_grid, cv=3, verbose=3)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 594 candidates, totalling 1782 fits
[CV] max_leaf_nodes=2, min_samples_split=2 ...........................
[CV]  max_leaf_nodes=2, min_samples_split=2, score=0.7536557930258717, total=   0.0s
[CV] max_leaf_nodes=2, min_samples_split=2 ...........................
[CV]  max_leaf_nodes=2, min_samples_split=2, score=0.7825271841019873, total=   0.0s
[CV] max_leaf_nodes=2, min_samples_split=2 ...........................
[CV]  max_leaf_nodes=2, min_samples_split=2, score=0.7771942985746436, total=   0.0s
[CV] max_leaf_nodes=2, min_samples_split=3 ...........................
[CV]  max_leaf_nodes=2, min_samples_split=3, score=0.7536557930258717, total=   0.0s
[CV] max_leaf_nodes=2, min_samples_split=3 ...........................
[CV]  max_leaf_nodes=2, min_samples_split=3, score=0.7825271841019873, total=   0.0s
[CV] max_leaf_nodes=2, min_samples_split=3 ...........................
[CV]  max_leaf_nodes=2, min_samples_split=3, score=0.7771942985746436, total=   0.0s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV]  max_leaf_nodes=6, min_samples_split=4, score=0.8421447319085115, total=   0.0s
[CV] max_leaf_nodes=6, min_samples_split=4 ...........................
[CV]  max_leaf_nodes=6, min_samples_split=4, score=0.859392575928009, total=   0.0s
[CV] max_leaf_nodes=6, min_samples_split=4 ...........................
[CV]  max_leaf_nodes=6, min_samples_split=4, score=0.8559639909977494, total=   0.0s
[CV] max_leaf_nodes=7, min_samples_split=2 ...........................
[CV]  max_leaf_nodes=7, min_samples_split=2, score=0.8421447319085115, total=   0.0s
[CV] max_leaf_nodes=7, min_samples_split=2 ...........................
[CV]  max_leaf_nodes=7, min_samples_split=2, score=0.859392575928009, total=   0.0s
[CV] max_leaf_nodes=7, min_samples_split=2 ...........................
[CV]  max_leaf_nodes=7, min_samples_split=2, score=0.8559639909977494, total=   0.0s
[CV] max_leaf_nodes=7, min_samples_split=3 ...........................
[CV]  max_leaf_nodes=7, min_samples_split=3, score=0.8421447319085

[CV]  max_leaf_nodes=14, min_samples_split=2, score=0.8548931383577053, total=   0.0s
[CV] max_leaf_nodes=14, min_samples_split=2 ..........................
[CV]  max_leaf_nodes=14, min_samples_split=2, score=0.8488372093023255, total=   0.0s
[CV] max_leaf_nodes=14, min_samples_split=3 ..........................
[CV]  max_leaf_nodes=14, min_samples_split=3, score=0.8410198725159355, total=   0.0s
[CV] max_leaf_nodes=14, min_samples_split=3 ..........................
[CV]  max_leaf_nodes=14, min_samples_split=3, score=0.8548931383577053, total=   0.0s
[CV] max_leaf_nodes=14, min_samples_split=3 ..........................
[CV]  max_leaf_nodes=14, min_samples_split=3, score=0.8488372093023255, total=   0.0s
[CV] max_leaf_nodes=14, min_samples_split=4 ..........................
[CV]  max_leaf_nodes=14, min_samples_split=4, score=0.8410198725159355, total=   0.0s
[CV] max_leaf_nodes=14, min_samples_split=4 ..........................
[CV]  max_leaf_nodes=14, min_samples_split=4, score=0.8548

[CV]  max_leaf_nodes=20, min_samples_split=2, score=0.8563390847711928, total=   0.0s
[CV] max_leaf_nodes=20, min_samples_split=3 ..........................
[CV]  max_leaf_nodes=20, min_samples_split=3, score=0.8511436070491188, total=   0.0s
[CV] max_leaf_nodes=20, min_samples_split=3 ..........................
[CV]  max_leaf_nodes=20, min_samples_split=3, score=0.8575178102737158, total=   0.0s
[CV] max_leaf_nodes=20, min_samples_split=3 ..........................
[CV]  max_leaf_nodes=20, min_samples_split=3, score=0.8563390847711928, total=   0.0s
[CV] max_leaf_nodes=20, min_samples_split=4 ..........................
[CV]  max_leaf_nodes=20, min_samples_split=4, score=0.8511436070491188, total=   0.0s
[CV] max_leaf_nodes=20, min_samples_split=4 ..........................
[CV]  max_leaf_nodes=20, min_samples_split=4, score=0.8575178102737158, total=   0.0s
[CV] max_leaf_nodes=20, min_samples_split=4 ..........................
[CV]  max_leaf_nodes=20, min_samples_split=4, score=0.8563

[CV]  max_leaf_nodes=26, min_samples_split=3, score=0.8563390847711928, total=   0.0s
[CV] max_leaf_nodes=26, min_samples_split=4 ..........................
[CV]  max_leaf_nodes=26, min_samples_split=4, score=0.8466441694788152, total=   0.0s
[CV] max_leaf_nodes=26, min_samples_split=4 ..........................
[CV]  max_leaf_nodes=26, min_samples_split=4, score=0.8571428571428571, total=   0.0s
[CV] max_leaf_nodes=26, min_samples_split=4 ..........................
[CV]  max_leaf_nodes=26, min_samples_split=4, score=0.8563390847711928, total=   0.0s
[CV] max_leaf_nodes=27, min_samples_split=2 ..........................
[CV]  max_leaf_nodes=27, min_samples_split=2, score=0.8466441694788152, total=   0.0s
[CV] max_leaf_nodes=27, min_samples_split=2 ..........................
[CV]  max_leaf_nodes=27, min_samples_split=2, score=0.854143232095988, total=   0.0s
[CV] max_leaf_nodes=27, min_samples_split=2 ..........................
[CV]  max_leaf_nodes=27, min_samples_split=2, score=0.85483

[CV]  max_leaf_nodes=32, min_samples_split=3, score=0.8522684664416947, total=   0.0s
[CV] max_leaf_nodes=32, min_samples_split=3 ..........................
[CV]  max_leaf_nodes=32, min_samples_split=3, score=0.8559639909977494, total=   0.0s
[CV] max_leaf_nodes=32, min_samples_split=4 ..........................
[CV]  max_leaf_nodes=32, min_samples_split=4, score=0.8485189351331084, total=   0.0s
[CV] max_leaf_nodes=32, min_samples_split=4 ..........................
[CV]  max_leaf_nodes=32, min_samples_split=4, score=0.8522684664416947, total=   0.0s
[CV] max_leaf_nodes=32, min_samples_split=4 ..........................
[CV]  max_leaf_nodes=32, min_samples_split=4, score=0.8559639909977494, total=   0.0s
[CV] max_leaf_nodes=33, min_samples_split=2 ..........................
[CV]  max_leaf_nodes=33, min_samples_split=2, score=0.8485189351331084, total=   0.0s
[CV] max_leaf_nodes=33, min_samples_split=2 ..........................
[CV]  max_leaf_nodes=33, min_samples_split=2, score=0.8526

[CV]  max_leaf_nodes=41, min_samples_split=2, score=0.854143232095988, total=   0.0s
[CV] max_leaf_nodes=41, min_samples_split=2 ..........................
[CV]  max_leaf_nodes=41, min_samples_split=2, score=0.8548387096774194, total=   0.0s
[CV] max_leaf_nodes=41, min_samples_split=3 ..........................
[CV]  max_leaf_nodes=41, min_samples_split=3, score=0.8511436070491188, total=   0.0s
[CV] max_leaf_nodes=41, min_samples_split=3 ..........................
[CV]  max_leaf_nodes=41, min_samples_split=3, score=0.854143232095988, total=   0.0s
[CV] max_leaf_nodes=41, min_samples_split=3 ..........................
[CV]  max_leaf_nodes=41, min_samples_split=3, score=0.8548387096774194, total=   0.0s
[CV] max_leaf_nodes=41, min_samples_split=4 ..........................
[CV]  max_leaf_nodes=41, min_samples_split=4, score=0.8511436070491188, total=   0.0s
[CV] max_leaf_nodes=41, min_samples_split=4 ..........................
[CV]  max_leaf_nodes=41, min_samples_split=4, score=0.854143

[CV]  max_leaf_nodes=50, min_samples_split=2, score=0.8488938882639671, total=   0.0s
[CV] max_leaf_nodes=50, min_samples_split=2 ..........................
[CV]  max_leaf_nodes=50, min_samples_split=2, score=0.8530183727034121, total=   0.0s
[CV] max_leaf_nodes=50, min_samples_split=2 ..........................
[CV]  max_leaf_nodes=50, min_samples_split=2, score=0.8567141785446362, total=   0.0s
[CV] max_leaf_nodes=50, min_samples_split=3 ..........................
[CV]  max_leaf_nodes=50, min_samples_split=3, score=0.8488938882639671, total=   0.0s
[CV] max_leaf_nodes=50, min_samples_split=3 ..........................
[CV]  max_leaf_nodes=50, min_samples_split=3, score=0.8530183727034121, total=   0.0s
[CV] max_leaf_nodes=50, min_samples_split=3 ..........................
[CV]  max_leaf_nodes=50, min_samples_split=3, score=0.8567141785446362, total=   0.0s
[CV] max_leaf_nodes=50, min_samples_split=4 ..........................
[CV]  max_leaf_nodes=50, min_samples_split=4, score=0.8488

[CV]  max_leaf_nodes=58, min_samples_split=3, score=0.8503937007874016, total=   0.0s
[CV] max_leaf_nodes=58, min_samples_split=3 ..........................
[CV]  max_leaf_nodes=58, min_samples_split=3, score=0.8567141785446362, total=   0.0s
[CV] max_leaf_nodes=58, min_samples_split=4 ..........................
[CV]  max_leaf_nodes=58, min_samples_split=4, score=0.8481439820022497, total=   0.0s
[CV] max_leaf_nodes=58, min_samples_split=4 ..........................
[CV]  max_leaf_nodes=58, min_samples_split=4, score=0.8503937007874016, total=   0.0s
[CV] max_leaf_nodes=58, min_samples_split=4 ..........................
[CV]  max_leaf_nodes=58, min_samples_split=4, score=0.8567141785446362, total=   0.0s
[CV] max_leaf_nodes=59, min_samples_split=2 ..........................
[CV]  max_leaf_nodes=59, min_samples_split=2, score=0.8470191226096738, total=   0.0s
[CV] max_leaf_nodes=59, min_samples_split=2 ..........................
[CV]  max_leaf_nodes=59, min_samples_split=2, score=0.8503

[CV]  max_leaf_nodes=66, min_samples_split=3, score=0.8552138034508627, total=   0.0s
[CV] max_leaf_nodes=66, min_samples_split=4 ..........................
[CV]  max_leaf_nodes=66, min_samples_split=4, score=0.8440194975628047, total=   0.0s
[CV] max_leaf_nodes=66, min_samples_split=4 ..........................
[CV]  max_leaf_nodes=66, min_samples_split=4, score=0.8492688413948256, total=   0.0s
[CV] max_leaf_nodes=66, min_samples_split=4 ..........................
[CV]  max_leaf_nodes=66, min_samples_split=4, score=0.8552138034508627, total=   0.0s
[CV] max_leaf_nodes=67, min_samples_split=2 ..........................
[CV]  max_leaf_nodes=67, min_samples_split=2, score=0.8440194975628047, total=   0.0s
[CV] max_leaf_nodes=67, min_samples_split=2 ..........................
[CV]  max_leaf_nodes=67, min_samples_split=2, score=0.8492688413948256, total=   0.0s
[CV] max_leaf_nodes=67, min_samples_split=2 ..........................
[CV]  max_leaf_nodes=67, min_samples_split=2, score=0.8548

[CV]  max_leaf_nodes=74, min_samples_split=4, score=0.8485189351331084, total=   0.0s
[CV] max_leaf_nodes=74, min_samples_split=4 ..........................
[CV]  max_leaf_nodes=74, min_samples_split=4, score=0.8537134283570893, total=   0.0s
[CV] max_leaf_nodes=75, min_samples_split=2 ..........................
[CV]  max_leaf_nodes=75, min_samples_split=2, score=0.8428946381702287, total=   0.0s
[CV] max_leaf_nodes=75, min_samples_split=2 ..........................
[CV]  max_leaf_nodes=75, min_samples_split=2, score=0.8485189351331084, total=   0.0s
[CV] max_leaf_nodes=75, min_samples_split=2 ..........................
[CV]  max_leaf_nodes=75, min_samples_split=2, score=0.8537134283570893, total=   0.0s
[CV] max_leaf_nodes=75, min_samples_split=3 ..........................
[CV]  max_leaf_nodes=75, min_samples_split=3, score=0.8428946381702287, total=   0.0s
[CV] max_leaf_nodes=75, min_samples_split=3 ..........................
[CV]  max_leaf_nodes=75, min_samples_split=3, score=0.8485

[CV]  max_leaf_nodes=83, min_samples_split=2, score=0.8492688413948256, total=   0.0s
[CV] max_leaf_nodes=83, min_samples_split=2 ..........................
[CV]  max_leaf_nodes=83, min_samples_split=2, score=0.8529632408102026, total=   0.0s
[CV] max_leaf_nodes=83, min_samples_split=3 ..........................
[CV]  max_leaf_nodes=83, min_samples_split=3, score=0.84251968503937, total=   0.0s
[CV] max_leaf_nodes=83, min_samples_split=3 ..........................
[CV]  max_leaf_nodes=83, min_samples_split=3, score=0.8492688413948256, total=   0.0s
[CV] max_leaf_nodes=83, min_samples_split=3 ..........................
[CV]  max_leaf_nodes=83, min_samples_split=3, score=0.8529632408102026, total=   0.0s
[CV] max_leaf_nodes=83, min_samples_split=4 ..........................
[CV]  max_leaf_nodes=83, min_samples_split=4, score=0.84251968503937, total=   0.0s
[CV] max_leaf_nodes=83, min_samples_split=4 ..........................
[CV]  max_leaf_nodes=83, min_samples_split=4, score=0.84926884

[CV]  max_leaf_nodes=91, min_samples_split=3, score=0.8522130532633159, total=   0.0s
[CV] max_leaf_nodes=91, min_samples_split=4 ..........................
[CV]  max_leaf_nodes=91, min_samples_split=4, score=0.8402699662542182, total=   0.0s
[CV] max_leaf_nodes=91, min_samples_split=4 ..........................
[CV]  max_leaf_nodes=91, min_samples_split=4, score=0.8503937007874016, total=   0.0s
[CV] max_leaf_nodes=91, min_samples_split=4 ..........................
[CV]  max_leaf_nodes=91, min_samples_split=4, score=0.8522130532633159, total=   0.0s
[CV] max_leaf_nodes=92, min_samples_split=2 ..........................
[CV]  max_leaf_nodes=92, min_samples_split=2, score=0.8402699662542182, total=   0.0s
[CV] max_leaf_nodes=92, min_samples_split=2 ..........................
[CV]  max_leaf_nodes=92, min_samples_split=2, score=0.8496437945256843, total=   0.0s
[CV] max_leaf_nodes=92, min_samples_split=2 ..........................
[CV]  max_leaf_nodes=92, min_samples_split=2, score=0.8522

[CV]  max_leaf_nodes=99, min_samples_split=3, score=0.8507126781695424, total=   0.0s
[CV] max_leaf_nodes=99, min_samples_split=4 ..........................
[CV]  max_leaf_nodes=99, min_samples_split=4, score=0.8391451068616423, total=   0.0s
[CV] max_leaf_nodes=99, min_samples_split=4 ..........................
[CV]  max_leaf_nodes=99, min_samples_split=4, score=0.847769028871391, total=   0.0s
[CV] max_leaf_nodes=99, min_samples_split=4 ..........................
[CV]  max_leaf_nodes=99, min_samples_split=4, score=0.8507126781695424, total=   0.0s
[CV] max_leaf_nodes=100, min_samples_split=2 .........................
[CV]  max_leaf_nodes=100, min_samples_split=2, score=0.8391451068616423, total=   0.0s
[CV] max_leaf_nodes=100, min_samples_split=2 .........................
[CV]  max_leaf_nodes=100, min_samples_split=2, score=0.847769028871391, total=   0.0s
[CV] max_leaf_nodes=100, min_samples_split=2 .........................
[CV]  max_leaf_nodes=100, min_samples_split=2, score=0.850

[CV]  max_leaf_nodes=107, min_samples_split=2, score=0.8499624906226556, total=   0.0s
[CV] max_leaf_nodes=107, min_samples_split=3 .........................
[CV]  max_leaf_nodes=107, min_samples_split=3, score=0.8387701537307837, total=   0.0s
[CV] max_leaf_nodes=107, min_samples_split=3 .........................
[CV]  max_leaf_nodes=107, min_samples_split=3, score=0.8455193100862393, total=   0.0s
[CV] max_leaf_nodes=107, min_samples_split=3 .........................
[CV]  max_leaf_nodes=107, min_samples_split=3, score=0.8499624906226556, total=   0.0s
[CV] max_leaf_nodes=107, min_samples_split=4 .........................
[CV]  max_leaf_nodes=107, min_samples_split=4, score=0.8387701537307837, total=   0.0s
[CV] max_leaf_nodes=107, min_samples_split=4 .........................
[CV]  max_leaf_nodes=107, min_samples_split=4, score=0.8455193100862393, total=   0.0s
[CV] max_leaf_nodes=107, min_samples_split=4 .........................
[CV]  max_leaf_nodes=107, min_samples_split=4, score

[CV]  max_leaf_nodes=114, min_samples_split=3, score=0.8495873968492123, total=   0.0s
[CV] max_leaf_nodes=114, min_samples_split=4 .........................
[CV]  max_leaf_nodes=114, min_samples_split=4, score=0.8391451068616423, total=   0.0s
[CV] max_leaf_nodes=114, min_samples_split=4 .........................
[CV]  max_leaf_nodes=114, min_samples_split=4, score=0.8428946381702287, total=   0.0s
[CV] max_leaf_nodes=114, min_samples_split=4 .........................
[CV]  max_leaf_nodes=114, min_samples_split=4, score=0.8484621155288822, total=   0.0s
[CV] max_leaf_nodes=115, min_samples_split=2 .........................
[CV]  max_leaf_nodes=115, min_samples_split=2, score=0.8395200599925009, total=   0.0s
[CV] max_leaf_nodes=115, min_samples_split=2 .........................
[CV]  max_leaf_nodes=115, min_samples_split=2, score=0.8428946381702287, total=   0.0s
[CV] max_leaf_nodes=115, min_samples_split=2 .........................
[CV]  max_leaf_nodes=115, min_samples_split=2, score

[CV]  max_leaf_nodes=122, min_samples_split=3, score=0.8372703412073491, total=   0.0s
[CV] max_leaf_nodes=122, min_samples_split=3 .........................
[CV]  max_leaf_nodes=122, min_samples_split=3, score=0.8417697787776528, total=   0.0s
[CV] max_leaf_nodes=122, min_samples_split=3 .........................
[CV]  max_leaf_nodes=122, min_samples_split=3, score=0.8480870217554388, total=   0.0s
[CV] max_leaf_nodes=122, min_samples_split=4 .........................
[CV]  max_leaf_nodes=122, min_samples_split=4, score=0.838395200599925, total=   0.0s
[CV] max_leaf_nodes=122, min_samples_split=4 .........................
[CV]  max_leaf_nodes=122, min_samples_split=4, score=0.8428946381702287, total=   0.0s
[CV] max_leaf_nodes=122, min_samples_split=4 .........................
[CV]  max_leaf_nodes=122, min_samples_split=4, score=0.8477119279819955, total=   0.0s
[CV] max_leaf_nodes=123, min_samples_split=2 .........................
[CV]  max_leaf_nodes=123, min_samples_split=2, score=

[CV]  max_leaf_nodes=130, min_samples_split=2, score=0.8428946381702287, total=   0.0s
[CV] max_leaf_nodes=130, min_samples_split=2 .........................
[CV]  max_leaf_nodes=130, min_samples_split=2, score=0.8477119279819955, total=   0.0s
[CV] max_leaf_nodes=130, min_samples_split=3 .........................
[CV]  max_leaf_nodes=130, min_samples_split=3, score=0.838395200599925, total=   0.0s
[CV] max_leaf_nodes=130, min_samples_split=3 .........................
[CV]  max_leaf_nodes=130, min_samples_split=3, score=0.8428946381702287, total=   0.0s
[CV] max_leaf_nodes=130, min_samples_split=3 .........................
[CV]  max_leaf_nodes=130, min_samples_split=3, score=0.8477119279819955, total=   0.0s
[CV] max_leaf_nodes=130, min_samples_split=4 .........................
[CV]  max_leaf_nodes=130, min_samples_split=4, score=0.838395200599925, total=   0.0s
[CV] max_leaf_nodes=130, min_samples_split=4 .........................
[CV]  max_leaf_nodes=130, min_samples_split=4, score=0

[CV]  max_leaf_nodes=137, min_samples_split=4, score=0.8480870217554388, total=   0.0s
[CV] max_leaf_nodes=138, min_samples_split=2 .........................
[CV]  max_leaf_nodes=138, min_samples_split=2, score=0.8387701537307837, total=   0.0s
[CV] max_leaf_nodes=138, min_samples_split=2 .........................
[CV]  max_leaf_nodes=138, min_samples_split=2, score=0.8380202474690663, total=   0.0s
[CV] max_leaf_nodes=138, min_samples_split=2 .........................
[CV]  max_leaf_nodes=138, min_samples_split=2, score=0.8480870217554388, total=   0.0s
[CV] max_leaf_nodes=138, min_samples_split=3 .........................
[CV]  max_leaf_nodes=138, min_samples_split=3, score=0.8387701537307837, total=   0.0s
[CV] max_leaf_nodes=138, min_samples_split=3 .........................
[CV]  max_leaf_nodes=138, min_samples_split=3, score=0.8380202474690663, total=   0.0s
[CV] max_leaf_nodes=138, min_samples_split=3 .........................
[CV]  max_leaf_nodes=138, min_samples_split=3, score

[CV]  max_leaf_nodes=145, min_samples_split=3, score=0.8477119279819955, total=   0.0s
[CV] max_leaf_nodes=145, min_samples_split=4 .........................
[CV]  max_leaf_nodes=145, min_samples_split=4, score=0.8380202474690663, total=   0.0s
[CV] max_leaf_nodes=145, min_samples_split=4 .........................
[CV]  max_leaf_nodes=145, min_samples_split=4, score=0.8353955755530559, total=   0.0s
[CV] max_leaf_nodes=145, min_samples_split=4 .........................
[CV]  max_leaf_nodes=145, min_samples_split=4, score=0.8477119279819955, total=   0.0s
[CV] max_leaf_nodes=146, min_samples_split=2 .........................
[CV]  max_leaf_nodes=146, min_samples_split=2, score=0.838395200599925, total=   0.0s
[CV] max_leaf_nodes=146, min_samples_split=2 .........................
[CV]  max_leaf_nodes=146, min_samples_split=2, score=0.8368953880764904, total=   0.0s
[CV] max_leaf_nodes=146, min_samples_split=2 .........................
[CV]  max_leaf_nodes=146, min_samples_split=2, score=

[CV]  max_leaf_nodes=153, min_samples_split=2, score=0.8353955755530559, total=   0.0s
[CV] max_leaf_nodes=153, min_samples_split=2 .........................
[CV]  max_leaf_nodes=153, min_samples_split=2, score=0.8477119279819955, total=   0.0s
[CV] max_leaf_nodes=153, min_samples_split=3 .........................
[CV]  max_leaf_nodes=153, min_samples_split=3, score=0.8376452943382077, total=   0.0s
[CV] max_leaf_nodes=153, min_samples_split=3 .........................
[CV]  max_leaf_nodes=153, min_samples_split=3, score=0.8353955755530559, total=   0.0s
[CV] max_leaf_nodes=153, min_samples_split=3 .........................
[CV]  max_leaf_nodes=153, min_samples_split=3, score=0.8477119279819955, total=   0.0s
[CV] max_leaf_nodes=153, min_samples_split=4 .........................
[CV]  max_leaf_nodes=153, min_samples_split=4, score=0.8380202474690663, total=   0.0s
[CV] max_leaf_nodes=153, min_samples_split=4 .........................
[CV]  max_leaf_nodes=153, min_samples_split=4, score

[CV]  max_leaf_nodes=160, min_samples_split=3, score=0.8350206224221972, total=   0.0s
[CV] max_leaf_nodes=160, min_samples_split=3 .........................
[CV]  max_leaf_nodes=160, min_samples_split=3, score=0.8473368342085521, total=   0.0s
[CV] max_leaf_nodes=160, min_samples_split=4 .........................
[CV]  max_leaf_nodes=160, min_samples_split=4, score=0.8372703412073491, total=   0.0s
[CV] max_leaf_nodes=160, min_samples_split=4 .........................
[CV]  max_leaf_nodes=160, min_samples_split=4, score=0.8353955755530559, total=   0.0s
[CV] max_leaf_nodes=160, min_samples_split=4 .........................
[CV]  max_leaf_nodes=160, min_samples_split=4, score=0.8473368342085521, total=   0.0s
[CV] max_leaf_nodes=161, min_samples_split=2 .........................
[CV]  max_leaf_nodes=161, min_samples_split=2, score=0.8376452943382077, total=   0.0s
[CV] max_leaf_nodes=161, min_samples_split=2 .........................
[CV]  max_leaf_nodes=161, min_samples_split=2, score

[CV]  max_leaf_nodes=168, min_samples_split=2, score=0.8469617404351087, total=   0.0s
[CV] max_leaf_nodes=168, min_samples_split=3 .........................
[CV]  max_leaf_nodes=168, min_samples_split=3, score=0.8376452943382077, total=   0.0s
[CV] max_leaf_nodes=168, min_samples_split=3 .........................
[CV]  max_leaf_nodes=168, min_samples_split=3, score=0.8353955755530559, total=   0.0s
[CV] max_leaf_nodes=168, min_samples_split=3 .........................
[CV]  max_leaf_nodes=168, min_samples_split=3, score=0.8469617404351087, total=   0.0s
[CV] max_leaf_nodes=168, min_samples_split=4 .........................
[CV]  max_leaf_nodes=168, min_samples_split=4, score=0.8372703412073491, total=   0.0s
[CV] max_leaf_nodes=168, min_samples_split=4 .........................
[CV]  max_leaf_nodes=168, min_samples_split=4, score=0.8361454818147731, total=   0.0s
[CV] max_leaf_nodes=168, min_samples_split=4 .........................
[CV]  max_leaf_nodes=168, min_samples_split=4, score

[CV]  max_leaf_nodes=175, min_samples_split=4, score=0.8357705286839145, total=   0.0s
[CV] max_leaf_nodes=175, min_samples_split=4 .........................
[CV]  max_leaf_nodes=175, min_samples_split=4, score=0.8458364591147787, total=   0.0s
[CV] max_leaf_nodes=176, min_samples_split=2 .........................
[CV]  max_leaf_nodes=176, min_samples_split=2, score=0.8361454818147731, total=   0.0s
[CV] max_leaf_nodes=176, min_samples_split=2 .........................
[CV]  max_leaf_nodes=176, min_samples_split=2, score=0.8357705286839145, total=   0.0s
[CV] max_leaf_nodes=176, min_samples_split=2 .........................
[CV]  max_leaf_nodes=176, min_samples_split=2, score=0.845086271567892, total=   0.0s
[CV] max_leaf_nodes=176, min_samples_split=3 .........................
[CV]  max_leaf_nodes=176, min_samples_split=3, score=0.8361454818147731, total=   0.0s
[CV] max_leaf_nodes=176, min_samples_split=3 .........................
[CV]  max_leaf_nodes=176, min_samples_split=3, score=

[CV]  max_leaf_nodes=183, min_samples_split=3, score=0.8350206224221972, total=   0.0s
[CV] max_leaf_nodes=183, min_samples_split=3 .........................
[CV]  max_leaf_nodes=183, min_samples_split=3, score=0.8357705286839145, total=   0.0s
[CV] max_leaf_nodes=183, min_samples_split=3 .........................
[CV]  max_leaf_nodes=183, min_samples_split=3, score=0.8432108027006752, total=   0.0s
[CV] max_leaf_nodes=183, min_samples_split=4 .........................
[CV]  max_leaf_nodes=183, min_samples_split=4, score=0.8353955755530559, total=   0.0s
[CV] max_leaf_nodes=183, min_samples_split=4 .........................
[CV]  max_leaf_nodes=183, min_samples_split=4, score=0.8357705286839145, total=   0.0s
[CV] max_leaf_nodes=183, min_samples_split=4 .........................
[CV]  max_leaf_nodes=183, min_samples_split=4, score=0.8432108027006752, total=   0.0s
[CV] max_leaf_nodes=184, min_samples_split=2 .........................
[CV]  max_leaf_nodes=184, min_samples_split=2, score

[CV]  max_leaf_nodes=191, min_samples_split=2, score=0.8338957630296213, total=   0.0s
[CV] max_leaf_nodes=191, min_samples_split=2 .........................
[CV]  max_leaf_nodes=191, min_samples_split=2, score=0.8353955755530559, total=   0.0s
[CV] max_leaf_nodes=191, min_samples_split=2 .........................
[CV]  max_leaf_nodes=191, min_samples_split=2, score=0.8420855213803451, total=   0.0s
[CV] max_leaf_nodes=191, min_samples_split=3 .........................
[CV]  max_leaf_nodes=191, min_samples_split=3, score=0.8338957630296213, total=   0.0s
[CV] max_leaf_nodes=191, min_samples_split=3 .........................
[CV]  max_leaf_nodes=191, min_samples_split=3, score=0.8353955755530559, total=   0.0s
[CV] max_leaf_nodes=191, min_samples_split=3 .........................
[CV]  max_leaf_nodes=191, min_samples_split=3, score=0.8420855213803451, total=   0.0s
[CV] max_leaf_nodes=191, min_samples_split=4 .........................
[CV]  max_leaf_nodes=191, min_samples_split=4, score

[CV]  max_leaf_nodes=198, min_samples_split=2, score=0.83427071616048, total=   0.0s
[CV] max_leaf_nodes=198, min_samples_split=2 .........................
[CV]  max_leaf_nodes=198, min_samples_split=2, score=0.8357705286839145, total=   0.0s
[CV] max_leaf_nodes=198, min_samples_split=2 .........................
[CV]  max_leaf_nodes=198, min_samples_split=2, score=0.8405851462865717, total=   0.0s
[CV] max_leaf_nodes=198, min_samples_split=3 .........................
[CV]  max_leaf_nodes=198, min_samples_split=3, score=0.83427071616048, total=   0.0s
[CV] max_leaf_nodes=198, min_samples_split=3 .........................
[CV]  max_leaf_nodes=198, min_samples_split=3, score=0.8357705286839145, total=   0.0s
[CV] max_leaf_nodes=198, min_samples_split=3 .........................
[CV]  max_leaf_nodes=198, min_samples_split=3, score=0.8405851462865717, total=   0.0s
[CV] max_leaf_nodes=198, min_samples_split=4 .........................
[CV]  max_leaf_nodes=198, min_samples_split=4, score=0.8

[Parallel(n_jobs=1)]: Done 1782 out of 1782 | elapsed:   15.1s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=40,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_leaf_nodes': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 7...6, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199], 'min_samples_split': [2, 3, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [27]:
# The estimator configured with the best parameter combination found by the search.
grid_search.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=17,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=40,
            splitter='best')

In [28]:
# Mean cross-validated score of the best parameter combination.
grid_search.best_score_

0.8555

In [29]:
# NOTE(review): this looks redundant. The GridSearchCV repr printed after the
# search shows refit=True, which per the scikit-learn docs already refits
# best_estimator_ on the whole data passed to fit(). Confirm before removing.
grid_search.best_estimator_.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=17,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=40,
            splitter='best')

In [30]:
from sklearn.metrics import accuracy_score

# Score the tuned tree on the held-out test set.
best_tree = grid_search.best_estimator_
y_pred = best_tree.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)


0.8695

## Exercise 8

In [31]:
from sklearn.model_selection import ShuffleSplit

In [34]:
# Number of mini training sets to draw (one tree is trained per set).
n_trees = 1000
# Number of training instances in each mini set.
n_instances = 100

In [38]:
# One random split per tree. test_size is given as an absolute count of
# len(X_train) - n_instances, so the remaining "train" side of every split
# contains exactly n_instances samples.
rs = ShuffleSplit(n_splits=n_trees, test_size=len(X_train) - n_instances, random_state=42)

In [39]:
# Build one (X, y) mini training set per split, using only the train-side
# indices; the test-side indices of each split are not needed.
mini_sets = [
    (X_train[subset_index], y_train[subset_index])
    for subset_index, _rest_index in rs.split(X_train)
]

In [42]:
from sklearn.base import clone
import numpy as np

# Start from unfitted copies of the tuned tree, one per mini training set.
template_tree = grid_search.best_estimator_
forest = []
for _ in range(n_trees):
    forest.append(clone(template_tree))

# Fit each copy on its own mini set and record its accuracy on the full test set.
accuracy_scores = []
for tree, mini_set in zip(forest, mini_sets):
    X_mini_train, y_mini_train = mini_set
    y_pred = tree.fit(X_mini_train, y_mini_train).predict(X_test)
    accuracy_scores.append(accuracy_score(y_test, y_pred))

# Average single-tree accuracy across the forest.
np.mean(accuracy_scores)

0.805546

In [43]:
# Prediction matrix: row i holds tree i's predictions for every test instance,
# giving shape (n_trees, len(X_test)) stored as uint8.
Y_pred = np.array([tree.predict(X_test) for tree in forest], dtype=np.uint8)

In [44]:
from scipy.stats import mode

# Majority vote: Y_pred has one row per tree and one column per test instance,
# so the mode along axis 0 is the most frequent prediction for each instance.
y_pred_majority_vote = mode(Y_pred, axis=0)

In [53]:
# Flatten the vote result to a 1-D label vector before scoring.
accuracy_score(y_test, y_pred_majority_vote.mode.ravel())

0.872