In [166]:
from sklearn.datasets import make_moons

In [167]:
# Noisy two-class "moons" toy dataset; the fixed seed makes every run draw the same points.
X, y = make_moons(random_state=42, noise=0.4, n_samples=10_000)

In [168]:
# Sanity check: feature matrix and label vector must have the same number of rows.
tuple(arr.shape for arr in (X, y))


((10000, 2), (10000,))

In [169]:
import numpy as np
from sklearn.model_selection import train_test_split
# Hold out a quarter of the samples for final evaluation; seeded for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.25,
)

In [170]:
# Confirm the split sizes (features and labels, train and test).
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((7500, 2), (2500, 2), (7500,), (2500,))

In [171]:
# Peek at the first five rows of each split to eyeball values and label encoding.
tuple(part[:5] for part in (X_train, X_test, y_train, y_test))

(array([[ 0.13454082, -0.06983736],
        [-0.16927771,  0.18242721],
        [ 0.42640832,  0.88862737],
        [-1.04820507,  0.92397042],
        [-0.97001816,  1.3529998 ]]),
 array([[ 0.69945888, -0.8734481 ],
        [ 1.7764418 ,  0.13222334],
        [-1.14450821,  0.24446319],
        [-0.13106113,  1.14018203],
        [ 0.01229165,  0.573495  ]]),
 array([1, 1, 0, 0, 0]),
 array([1, 1, 0, 1, 0]))

In [172]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
# Search space for the decision-tree grid search: every combination of the
# three regularisation hyperparameters below is evaluated.
params_grid = dict(
    max_leaf_nodes=[2, 5, 10, 20, 50, 100, None],  # None = no cap on leaf count
    min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 2, 5, 10],
)


In [173]:
# Base estimator handed to the grid search below; the fixed seed keeps tree
# construction repeatable between runs.
clf = DecisionTreeClassifier(random_state=42)

In [174]:
# Exhaustive 5-fold cross-validated search over params_grid; n_jobs=-1 runs
# the candidate fits in parallel on all available processors.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=params_grid,
    n_jobs=-1,
    cv=5,
)
grid_search.fit(X_train, y_train)

0,1,2
,estimator,DecisionTreeC...ndom_state=42)
,param_grid,"{'max_leaf_nodes': [2, 5, ...], 'min_samples_leaf': [1, 2, ...], 'min_samples_split': [2, 5, ...]}"
,scoring,
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,50
,min_impurity_decrease,0.0


In [175]:
# Hyperparameter combination that achieved the best mean cross-validated score.
grid_search.best_params_

{'max_leaf_nodes': 50, 'min_samples_leaf': 1, 'min_samples_split': 2}

In [176]:
# Re-display the fitted search object (same summary that was shown right after fitting).
grid_search

0,1,2
,estimator,DecisionTreeC...ndom_state=42)
,param_grid,"{'max_leaf_nodes': [2, 5, ...], 'min_samples_leaf': [1, 2, ...], 'min_samples_split': [2, 5, ...]}"
,scoring,
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,50
,min_impurity_decrease,0.0


In [177]:
# The search above ran with GridSearchCV's default refit=True, so it has
# already retrained the winning configuration on the whole of X_train and
# exposed it as `best_estimator_`. Reuse that model instead of fitting a
# second tree with the same hyperparameters, seed (42) and training data.
final_clf = grid_search.best_estimator_
final_clf

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,50
,min_impurity_decrease,0.0


In [178]:
# Accuracy of the tuned tree on the held-out test split (never seen during the search).
tst_accuracy = final_clf.score(X=X_test, y=y_test)
print(
    f"Test set accuracy: {tst_accuracy:.3f}"
)

Test set accuracy: 0.855


In [179]:
from sklearn.model_selection import ShuffleSplit
# Plan for the small-subset experiment: 1,000 random draws from the training
# set, each containing 100 instances. Seeded so the draws are repeatable.
n_splits, subset = 1000, 100
rs = ShuffleSplit(
    train_size=subset,
    n_splits=n_splits,
    random_state=42,
)

In [180]:
# Materialise every random draw as an (X, y) pair. Only the "train" indices
# from each split are used; the complementary indices are discarded.
subset_list = [
    (X_train[picked], y_train[picked])
    for picked, _ in rs.split(X_train)
]

In [181]:
# Fit one tree per small subset (best hyperparameters, tree index as seed)
# and record its accuracy on the shared held-out test data.
test_acr = [
    DecisionTreeClassifier(random_state=seed, **grid_search.best_params_)
    .fit(X_sub, y_sub)
    .score(X_test, y_test)
    for seed, (X_sub, y_sub) in enumerate(subset_list)
]
print(f"Mean test set 2 accuracy of 1,000 trees: {np.mean(test_acr):.3f}")
print(f"First 10 tree accuracies: {test_acr[:10]}")

Mean test set 2 accuracy of 1,000 trees: 0.793
First 10 tree accuracies: [0.7908, 0.8364, 0.7924, 0.812, 0.7764, 0.7984, 0.762, 0.7908, 0.8248, 0.8212]


In [182]:
from scipy import stats

# Refit one tree per subset and collect its test-set predictions:
# one row per tree, one column per test instance.
prd = np.array([
    DecisionTreeClassifier(random_state=seed, **grid_search.best_params_)
    .fit(X_sub, y_sub)
    .predict(X_test)
    for seed, (X_sub, y_sub) in enumerate(subset_list)
])

# Majority vote: the most frequent predicted label in each column.
votes = stats.mode(prd, axis=0, keepdims=True)
# keepdims=True leaves a leading axis of length 1; take that single row to get a 1-D vector.
y_pred = votes.mode[0]

# Fraction of test instances where the voted label matches the true label.
ensemble_accuracy = np.mean(y_pred == y_test)
print(f"Ensemble accuracy: {ensemble_accuracy:.3f}")


Ensemble accuracy: 0.868


In [183]:
# Side-by-side summary: the tuned tree fitted on the full training set versus
# the majority vote of the trees fitted on the small random subsets.
print(
    f"Single tree accuracy: {tst_accuracy:.3f}",
    f"Ensemble accuracy: {ensemble_accuracy:.3f}",
    sep="\n",
)

Single tree accuracy: 0.855
Ensemble accuracy: 0.868
