### Setup 

In [1]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit
from sklearn.datasets import make_moons
from sklearn.metrics import accuracy_score
from scipy.stats import mode

random_seed = 43

### Decision Tree with Moon Dataset

In [2]:
X_raw, y_raw = make_moons(n_samples=10000, noise=0.4)
X_train, X_test, y_train, y_test = train_test_split(X_raw, y_raw, test_size=0.2, random_state=random_seed)

In [3]:
dt_params = {
    'max_depth': [None, 5, 8, 10],
    'min_samples_leaf': [5, 10, 15, 20],
    'max_leaf_nodes': [None, 15, 20, 25, 30]
}
dt_clf = GridSearchCV(DecisionTreeClassifier(random_state=random_seed), param_grid=dt_params, cv=5, n_jobs=-1)

dt_clf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=43,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'min_samples_leaf': [5, 10, 15, 20], 'max_leaf_nodes': [None, 15, 20, 25, 30], 'max_depth': [None, 5, 8, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [4]:
def show_top_k_cv_results(clf, k):
    params = dt_clf.cv_results_['params']
    scores = dt_clf.cv_results_['mean_test_score']
    for params, score in sorted(list(zip(scores, params)), key=lambda t: t[0], reverse=True)[:k]:
        print(params, score)

show_top_k_cv_results(dt_clf, 5)

0.858875 {'min_samples_leaf': 10, 'max_leaf_nodes': 25, 'max_depth': None}
0.858875 {'min_samples_leaf': 10, 'max_leaf_nodes': 25, 'max_depth': 8}
0.858875 {'min_samples_leaf': 10, 'max_leaf_nodes': 25, 'max_depth': 10}
0.858625 {'min_samples_leaf': 20, 'max_leaf_nodes': 25, 'max_depth': None}
0.858625 {'min_samples_leaf': 20, 'max_leaf_nodes': 25, 'max_depth': 8}


In [5]:
y_predict = dt_clf.predict(X_test)
accuracy_score(y_test, y_predict)

0.8505

### Grow a Forest

In [6]:
sets = 1000
sample_per_set = 100
train_size = sample_per_set / len(X_raw)

spliter = ShuffleSplit(n_splits=sets, random_state=random_seed, train_size=train_size, test_size=1-train_size)
accuracy_scores = []
forest = []

for train_index, test_index in spliter.split(X_raw):
    dt_best_cls = DecisionTreeClassifier(**dt_clf.best_params_)
    dt_best_cls.fit(X_raw[train_index], y_raw[train_index])
    forest.append(dt_best_cls)
    y_pred = dt_best_cls.predict(X_test)
    accuracy_scores.append(accuracy_score(y_test, y_pred))

print(np.mean(accuracy_scores))

0.7917204999999999


#### Get Majority Vote

In [7]:
y_forest = np.empty([len(forest), len(X_test)])
for y_forest_index, decision_tree in enumerate(forest):
    y_forest[y_forest_index] = decision_tree.predict(X_test)

y_pred_majority_votes, _ = mode(y_forest, axis=0)
majority_vote_predictions = y_pred_majority_votes.reshape([-1])

#### Compare to test result

In [8]:
print(accuracy_score(y_test, majority_vote_predictions))

0.8315
