In [3]:
# 1-a. Moons dataset: 10,000 samples generated with noise=0.4
from sklearn.datasets import make_moons

X_moons, y_moons = make_moons(
    n_samples=10_000,
    noise=0.4,
    random_state=42,
)

# 1-b. Train/test split: hold out 20% of the samples as the test set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_moons,
    y_moons,
    test_size=0.2,
    random_state=42,
)

# 1-c. Run GridSearchCV: try every max_leaf_nodes value from 2 to 99 on a
#      decision tree, scored by accuracy with 3-fold cross-validation
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {"max_leaf_nodes": [*range(2, 100)]}
dt_clf = DecisionTreeClassifier(random_state=42)

grid_search = GridSearchCV(
    estimator=dt_clf,
    param_grid=param_grid,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,  # run the candidate fits in parallel on all processors
)
grid_search.fit(X_train, y_train)

# Show the best hyperparameters and the estimator built from them
print("best params:", grid_search.best_params_)
print("best estimator:", grid_search.best_estimator_)

# 1-d. Test accuracy of the best model found by the grid search
from sklearn.metrics import accuracy_score

y_pred = grid_search.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Test accuracy (best Decision Tree):", acc)


best params: {'max_leaf_nodes': 17}
best estimator: DecisionTreeClassifier(max_leaf_nodes=17, random_state=42)
Test accuracy (best Decision Tree): 0.8695


In [4]:
# 2-a. Use ShuffleSplit to build 1,000 mini training sets of 100 samples each
import numpy as np
from sklearn.model_selection import ShuffleSplit

n_trees = 1000
n_instances = 100

ss = ShuffleSplit(n_splits=n_trees, train_size=n_instances, random_state=42)

# Only the "train" indices of each split are used; the remaining indices
# produced by the splitter are ignored.
mini_sets = [
    (X_train[subset_idx], y_train[subset_idx])
    for subset_idx, _ in ss.split(X_train)
]

# 2-b. Clone the best model, train one clone per mini set, and compute each
#      tree's individual accuracy on the test set
from sklearn.base import clone
from sklearn.metrics import accuracy_score

forest = []
for X_mini, y_mini in mini_sets:
    dt = clone(grid_search.best_estimator_)  # unfitted copy, same hyperparameters
    dt.fit(X_mini, y_mini)
    forest.append(dt)

# Predict the test set exactly once per tree; row i holds tree i's predictions.
# This single matrix feeds both the per-tree accuracies and the majority vote,
# so no tree has to predict the test set a second time.
Y_pred = np.asarray([tree.predict(X_test) for tree in forest])

individual_accuracies = [
    accuracy_score(y_test, tree_pred) for tree_pred in Y_pred
]
print("평균 개별 트리 정확도:", np.mean(individual_accuracies))

# 2-c. Majority vote: most frequent predicted label for each test sample
from scipy.stats import mode

y_pred_majority_votes, _ = mode(Y_pred, axis=0)
# Flatten so the votes are 1-D whether or not mode() keeps the reduced axis.
y_pred_majority_votes = y_pred_majority_votes.reshape([-1])

# 2-d. Accuracy of the majority-vote predictions on the test set
acc_forest = accuracy_score(y_true=y_test, y_pred=y_pred_majority_votes)
print("Random forest (majority vote) accuracy:", acc_forest)


평균 개별 트리 정확도: 0.805471
Random forest (majority vote) accuracy: 0.872
