# 7.

In [2]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

X, y = datasets.make_moons(n_samples=10000, noise=0.4)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.2,
                                                    random_state=42, stratify=y)

In [39]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score


tree_clf = DecisionTreeClassifier()
param_grid = {
    'random_state': [42],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter': ['best', 'random'],
    'max_leaf_nodes': [None, 3, 4, 5],
    'max_depth': [None, 2, 3],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3],
}
grid_search = GridSearchCV(tree_clf, param_grid, cv=5, verbose=1)
grid_search.fit(X_train, y_train)
# {'criterion': 'gini', 'max_depth': None, 'max_leaf_nodes': 4, 'min_samples_leaf': 1, 'min_samples_split': 2, 'random_state': 42, 'splitter': 'best'}
print(grid_search.best_params_)

y_pred = grid_search.best_estimator_.predict(X_test)
print(accuracy_score(y_test, y_pred))

Fitting 5 folds for each of 648 candidates, totalling 3240 fits
{'criterion': 'gini', 'max_depth': None, 'max_leaf_nodes': 4, 'min_samples_leaf': 1, 'min_samples_split': 2, 'random_state': 42, 'splitter': 'best'}
0.848


# 8.

In [35]:
from sklearn.model_selection import ShuffleSplit
from scipy.stats import mode
import numpy as np


shuffle_split = ShuffleSplit(n_splits=1000, train_size=100, random_state=42)

trees = []
for train_index, _ in shuffle_split.split(X_train):
    X_train_split = X_train[train_index]
    y_train_split = y_train[train_index]
    tree_clf = DecisionTreeClassifier(**grid_search.best_params_)
    tree_clf.fit(X_train_split, y_train_split)
    trees.append(tree_clf)

predictions = []
for tree in trees:
    predictions.append(tree.predict(X_test))

mode_predictions = mode(predictions, axis=0).mode

print(accuracy_score(y_test, mode_predictions))

0.8515
