In [27]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5), "Python 3.5 or newer is required"

# Scikit-Learn ≥0.20 is required.
# The numeric (major, minor) components are compared: a plain string
# comparison is lexicographic, so e.g. "0.9" >= "0.20" would wrongly pass.
import re
import sklearn
_sklearn_version = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert _sklearn_version is not None, "Unrecognised Scikit-Learn version string"
assert tuple(map(int, _sklearn_version.groups())) >= (0, 20), "Scikit-Learn 0.20 or newer is required"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "decision_trees"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current Matplotlib figure under IMAGES_PATH.

    fig_id        -- base file name without extension; also printed to stdout
    tight_layout  -- when true, plt.tight_layout() is called before saving
    fig_extension -- file extension, also passed to savefig as the format
    resolution    -- passed to savefig as the dpi
    """
    file_name = fig_id + "." + fig_extension
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

In [30]:
from sklearn.datasets import make_moons

# Generate 10,000 noisy two-moons samples with a fixed seed
X, y = make_moons(
    n_samples=10000,
    noise=0.4,
    random_state=42,
)

# Preview the first five feature rows and labels
X[:5], y[:5]

(array([[ 0.9402914 ,  0.12230559],
        [ 0.12454026, -0.42477546],
        [ 0.26198823,  0.50841438],
        [-0.49523824,  0.07258876],
        [-0.87941281,  0.54937303]]),
 array([1, 0, 0, 0, 0]))

In [31]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test set (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [32]:
# Number of rows in the training and test splits
X_train.shape[0], X_test.shape[0]

(8000, 2000)

In [33]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Search grid: 98 values of max_leaf_nodes x 3 values of min_samples_split
params = {
    "max_leaf_nodes": list(range(2, 100)),
    "min_samples_split": [2, 3, 4],
}
grid_search_cv = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    cv=3,
    verbose=1,
)

# 3-fold cross-validated search over the grid on the training split
grid_search_cv.fit(X_train, y_train)

Fitting 3 folds for each of 294 candidates, totalling 882 fits


  _data = np.array(data, dtype=dtype, copy=copy,


In [34]:
# Display the estimator selected by the grid search
grid_search_cv.best_estimator_

In [35]:
from sklearn.metrics import accuracy_score

# Accuracy of the grid-search model's predictions on the test split
y_pred = grid_search_cv.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8695

In [36]:
from sklearn.model_selection import ShuffleSplit

n_trees = 1000
n_instances = 100

# Draw n_trees random subsets of the training split with n_instances rows
# each. Only the "train" side of every ShuffleSplit split is kept; the
# complementary "test" indices are ignored.
rs = ShuffleSplit(
    n_splits=n_trees,
    test_size=len(X_train) - n_instances,
    random_state=42,
)

mini_sets = []
for mini_train_index, mini_test_index in rs.split(X_train):
    X_mini_train, y_mini_train = X_train[mini_train_index], y_train[mini_train_index]
    mini_sets.append((X_mini_train, y_mini_train))

In [37]:
# Number of (X, y) mini training sets collected
len(mini_sets)

1000

In [42]:
# Row counts of the features and labels in the first mini training set
tuple(len(part) for part in mini_sets[0])

(100, 100)

In [45]:
from sklearn.base import clone

# One unfitted copy of the grid search's best estimator per mini training set
forest = [clone(grid_search_cv.best_estimator_) for _ in range(n_trees)]

accuracy_scores = []

# Fit each tree on its own mini set and score it on the full test split
for tree, (X_mini_train, y_mini_train) in zip(forest, mini_sets):
    tree.fit(X_mini_train, y_mini_train)
    y_pred = tree.predict(X_test)
    # accuracy_score takes (y_true, y_pred): ground truth first, as in the
    # earlier test-accuracy cell
    accuracy_scores.append(accuracy_score(y_test, y_pred))

# Mean test accuracy across the individual trees
np.mean(accuracy_scores)

np.float64(0.805471)

In [49]:
# Prediction matrix: one row per tree, one column per test instance.
# Allocated with np.zeros rather than np.empty because np.empty leaves the
# buffer uninitialized, so displaying it here could show arbitrary values.
y_pred = np.zeros([n_trees, len(X_test)], dtype=np.uint8)
y_pred

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [51]:
# Number of rows in the prediction matrix
y_pred.shape[0]

1000

In [52]:
# Store each tree's test-set predictions in its own row of y_pred
for tree_index, tree in enumerate(forest):
    y_pred[tree_index, :] = tree.predict(X_test)

In [54]:
from scipy.stats import mode

# Along axis 0 (across trees): the most frequent prediction for each test
# instance, together with how many times it occurs
y_pred_majority_votes, n_votes = mode(a=y_pred, axis=0)

In [55]:
# First five majority-vote predictions and their vote counts
y_pred_majority_votes[:5], n_votes[:5]

(array([1, 1, 0, 0, 0]), array([951, 912, 963, 951, 739]))

In [56]:
# Test accuracy of the majority vote. accuracy_score takes (y_true, y_pred),
# so the ground truth goes first; reshape([-1]) flattens the votes to 1-D.
accuracy_score(y_test, y_pred_majority_votes.reshape([-1]))

0.872

In [64]:
# Shape of the majority votes after flattening to one dimension
y_pred_majority_votes.reshape(-1).shape

(2000,)

In [65]:
# Shape of the test labels, for comparison with the majority votes
y_test.shape

(2000,)

In [66]:
# Shape of the majority votes as returned by mode (before any reshape)
y_pred_majority_votes.shape

(2000,)