In [1]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
X = iris.data[:, 2:] # petal length and width
y = iris.target

tree_clf = DecisionTreeClassifier(max_depth=2)
tree_clf.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [13]:
tree_clf2 = DecisionTreeClassifier(max_depth=3)
tree_clf2.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [8]:
from sklearn.tree import export_graphviz

export_graphviz(tree_clf, out_file="iris_tree.dot", feature_names=iris.feature_names[2:],
        class_names=iris.target_names, rounded=True, filled=True)

In [16]:
export_graphviz(tree_clf2, out_file="iris_tree2.dot", feature_names=iris.feature_names[2:],
        class_names=iris.target_names, rounded=True, filled=True)

In [12]:
print(tree_clf.predict_proba([[5, 1.5]]))
print(tree_clf.predict([[5, 1.5]]))

[[0.         0.90740741 0.09259259]]
[1]


In [15]:
print(tree_clf.predict_proba([[6, 1.5]]))
print(tree_clf2.predict_proba([[6, 1.5]]))

[[0.         0.90740741 0.09259259]]
[[0.         0.33333333 0.66666667]]


In [17]:
from sklearn.datasets import make_moons
Xm, ym = make_moons(n_samples=100, noise=0.25, random_state=53)

deep_tree_clf1 = DecisionTreeClassifier(random_state=42)
deep_tree_clf2 = DecisionTreeClassifier(min_samples_leaf=4, random_state=42)
deep_tree_clf1.fit(Xm, ym)
deep_tree_clf2.fit(Xm, ym)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')

In [20]:
export_graphviz(deep_tree_clf1, out_file="moon_tree1.dot",
        class_names=str(ym), rounded=True, filled=True)
export_graphviz(deep_tree_clf2, out_file="moon_tree2.dot",
        class_names=str(ym), rounded=True, filled=True)

In [22]:
# Quadratic training set + noise
import numpy as np
np.random.seed(42)
m = 200
X = np.random.rand(m, 1)
y = 4 * (X - 0.5) ** 2
y = y + np.random.randn(m, 1) / 10

In [23]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg.fit(X, y)

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=42, splitter='best')

# Exercises

1) depth = log(1000000)/log(2) = 20 

2) Always lower since it splits at a Gini that provides lower value than parent otherwise it stops splitting.

3) Yes, this will regularise the model.

4) No need to scale features, but we should have less regularisation such as increasing `max_` hyperparameters or decreasing `min_` hyperparameters.

5) Training time: O(n x mlogm) so for m = 1 000 000, 1000000(log1000000/log2) = 1 hour. So 19 931 569 steps took 1 hour. New steps is 10000000(log10000000/log2) = 232 534 967 steps. This is 1 hour * (232534967/19931569) = 11.67 hours.

6) No, presort only speeds up training for a few thousand instances, not a hundred thousand.

## 7)

In [27]:
from sklearn.model_selection import train_test_split

X, y = make_moons(n_samples=10000, noise=0.4)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
from sklearn.model_selection import GridSearchCV
# 14:57
param_grid = {'max_leaf_nodes':[None, 4, 8, 16, 32], 'min_samples_leaf':[1, 3, 8, 15]}
grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=3)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=42,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'max_leaf_nodes': [None, 4, 8, 16, 32],
                         'min_samples_leaf': [1, 3, 8, 15]},
            

In [30]:
params = grid_search.best_params_
tree_clf = DecisionTreeClassifier(**params)
tree_clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=16,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [31]:
from sklearn.metrics import accuracy_score

y_pred = tree_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.843


## 8)

In [39]:
from sklearn.model_selection import ShuffleSplit

decision_trees = []
split_indices = ShuffleSplit(n_splits=1000, test_size=0.001, train_size=0.999, random_state=42)
print(split_indices.get_n_splits(X_train))
for indices, test_indices in split_indices.split(X_train):
    tree_clf = DecisionTreeClassifier(**params)
    tree_clf.fit(X_train[indices], y_train[indices])
    decision_trees.append(tree_clf)
    y_pred = tree_clf.predict(X_test)
    print(accuracy_score(y_test, y_pred), end=', ')

1000
0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.844, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.8445, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.844, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.841, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.8435, 0.844, 0.8445, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.8445, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.843, 0.84

In [41]:
from scipy.stats import mode # 15:50
y_pred = [] 
for instance in X_test:
    preds = []
    for clf in decision_trees:
        pred = clf.predict([instance])
        preds.append(pred)
    modal = mode(preds, axis=None)
    y_pred.append(float(modal[0]))    

In [42]:
print(accuracy_score(y_test, y_pred))

0.843


### Solution

In [43]:
n_trees = 1000
n_instances = 100

mini_sets = []

rs = ShuffleSplit(n_splits=n_trees, test_size=len(X_train) - n_instances, random_state=42)
for mini_train_index, mini_test_index in rs.split(X_train):
    X_mini_train = X_train[mini_train_index]
    y_mini_train = y_train[mini_train_index]
    mini_sets.append((X_mini_train, y_mini_train))

In [45]:
from sklearn.base import clone

forest = [clone(grid_search.best_estimator_) for _ in range(n_trees)]

accuracy_scores = []

for tree, (X_mini_train, y_mini_train) in zip(forest, mini_sets):
    tree.fit(X_mini_train, y_mini_train)
    
    y_pred = tree.predict(X_test)
    accuracy_scores.append(accuracy_score(y_test, y_pred))

np.mean(accuracy_scores)

0.7912994999999999

In [46]:
Y_pred = np.empty([n_trees, len(X_test)], dtype=np.uint8)

for tree_index, tree in enumerate(forest):
    Y_pred[tree_index] = tree.predict(X_test)

In [47]:
from scipy.stats import mode

y_pred_majority_votes, n_votes = mode(Y_pred, axis=0)

In [48]:
accuracy_score(y_test, y_pred_majority_votes.reshape([-1]))

0.845