In [1]:
import numpy as np
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# --- Data: 10,000 noisy two-moons samples ---
X, y = make_moons(n_samples=10000, noise=0.4, random_state=42)

# --- Hold out 20% of the samples as the test set ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Hyperparameter search: 3-fold cross-validated grid, scored by accuracy ---
param_grid = dict(
    max_leaf_nodes=list(range(2, 100)),  # candidate leaf budgets 2..99
    min_samples_split=[2, 3, 4],
)
dt_clf = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=dt_clf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
)
grid_search.fit(X_train, y_train)

print("Best hyperparameters:", grid_search.best_params_)

# --- Final model ---
# No explicit retraining step is needed here: with GridSearchCV's default
# refit behaviour, best_estimator_ is the winning configuration already
# fitted on all of X_train.
best_dt_clf = grid_search.best_estimator_
y_pred = best_dt_clf.predict(X_test)

# --- Accuracy on the held-out test set ---
accuracy = accuracy_score(y_test, y_pred)
print(f"Test set accuracy: {accuracy:.2%}")

Best hyperparameters: {'max_leaf_nodes': 17, 'min_samples_split': 2}
Test set accuracy: 86.95%


In [2]:
from sklearn.model_selection import ShuffleSplit
from scipy.stats import mode

# Step 1: Draw 1,000 random subsets of the training set, 100 instances each
n_trees = 1000
n_instances = 100
subsets = []

# Only the "train" side of each split is used; the complementary indices are discarded.
shuffle_split = ShuffleSplit(n_splits=n_trees, train_size=n_instances, random_state=42)
for subset_train_index, _ in shuffle_split.split(X_train):
    X_subset = X_train[subset_train_index]
    y_subset = y_train[subset_train_index]
    subsets.append((X_subset, y_subset))

# Step 2: Train one Decision Tree on each subset, using the hyperparameters
# selected by the grid search
forest = [DecisionTreeClassifier(**grid_search.best_params_, random_state=42) for _ in range(n_trees)]
accuracy_scores = []
tree_predictions = []  # per-tree test-set predictions, reused for the vote below

for i, (X_subset, y_subset) in enumerate(subsets):
    forest[i].fit(X_subset, y_subset)
    y_pred = forest[i].predict(X_test)
    # Keep the predictions so each tree only has to predict X_test once.
    tree_predictions.append(y_pred)
    accuracy_scores.append(accuracy_score(y_test, y_pred))

# Evaluate the performance of individual trees
print(f"Average accuracy of individual trees: {np.mean(accuracy_scores):.2%}")

# Step 3: Generate majority-vote predictions
# Rows are trees, columns are test instances; the mode down axis 0 is the
# most frequent predicted class for each test instance.
y_pred_all_trees = np.array(tree_predictions)
y_pred_majority_votes, _ = mode(y_pred_all_trees, axis=0)

# Step 4: Evaluate the majority-vote predictions
# ravel() flattens the mode result to 1-D regardless of whether the SciPy
# version in use keeps the reduced axis.
forest_accuracy = accuracy_score(y_test, y_pred_majority_votes.ravel())
print(f"Random Forest accuracy: {forest_accuracy:.2%}")

Average accuracy of individual trees: 80.55%
Random Forest accuracy: 87.20%
