In [2]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

# Generate the noisy two-moons dataset. make_moons is seeded as well as the split,
# so the data (and every score derived from it) is the same after a kernel restart.
X, y = make_moons(n_samples=10000, noise=0.4, random_state=42)
# Hold out 20% of the samples as the test set used for all evaluations.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
# Hyperparameter grid: every max_leaf_nodes value is combined with every max_depth value.
param_grid = [
    {"max_leaf_nodes": [4, 5, 6, 7, 10, 100, 1000], "max_depth": [1, 2, 3, 6, 9, 12]},
]

# Candidates are ranked with accuracy, the same classification metric that is used
# for the test-set evaluation; the base tree is seeded so that repeated runs of the
# search pick the same tree.
grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=3,
                           scoring="accuracy",
                           return_train_score=True)

grid_search.fit(X_train, y_train)
print(grid_search.best_params_)

{'max_depth': 2, 'max_leaf_nodes': 4}


In [4]:
from sklearn.metrics import accuracy_score
# GridSearchCV was created with its default refit behaviour, so best_estimator_
# has already been retrained on the whole training set with the best parameters;
# fitting it a second time here would only repeat that work.
y_pred = grid_search.best_estimator_.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.869


In [10]:
from sklearn.model_selection import ShuffleSplit
import numpy as np
from scipy.stats import mode
from sklearn.base import clone

# Number of trees in the ensemble and number of training samples given to each one.
n_trees = 1000
n_instances = 100

# Each split yields a random subset of n_instances training indices.
rs = ShuffleSplit(n_splits=n_trees, train_size=n_instances, random_state=42)
accuracy_scores = []
y_pred_arr = []
for train_index, _ in rs.split(X_train):
    X_train_small_tree = X_train[train_index]
    y_train_small_tree = y_train[train_index]
    # Fit an unfitted copy with the best hyperparameters instead of refitting
    # grid_search.best_estimator_ in place, which would leave the tuned model
    # trained on only the last 100-sample subset after this cell has run.
    small_tree = clone(grid_search.best_estimator_)
    small_tree.fit(X_train_small_tree, y_train_small_tree)
    y_pred = small_tree.predict(X_test)
    y_pred_arr.append(y_pred)
    accuracy_scores.append(accuracy_score(y_test, y_pred))
# Rows are trees, columns are test samples.
y_pred_arr = np.array(y_pred_arr)
# Average over however many trees were actually trained (no duplicated constant).
avg_accuracy = float(np.mean(accuracy_scores))
print(f"個々の決定木の平均正解率は{avg_accuracy}")
# Majority vote: the most frequent predicted label per test sample across all trees.
result = mode(y_pred_arr, axis=0)
# Flatten the vote: some SciPy releases keep the reduced axis and return a
# (1, n_test) array, which accuracy_score does not accept as a 1-D label vector.
y_pred_all = np.ravel(result[0])
print(f"多数決での平均正解率は{accuracy_score(y_test, y_pred_all)}")

個々の決定木の平均正解率は0.8330289999999997
多数決での平均正解率は0.867
