Train and fine-tune a decision tree for the moons dataset by following the given steps

Step 1: Use make_moons to generate a moons dataset

In [1]:
from sklearn.datasets import make_moons

# Seed the generator: without random_state, make_moons draws fresh noise on
# every run, so the dataset and every score derived from it would change
# after Restart Kernel -> Run All.
X_moons, y_moons = make_moons(n_samples=10000, noise=0.4, random_state=42)

Step 2: Use train_test_split to split the dataset into a training set and a test set

In [2]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set; the fixed seed keeps the split
# reproducible between runs.
moons_split = train_test_split(X_moons, y_moons, test_size=0.2, random_state=42)
X_moons_train, X_moons_test, y_moons_train, y_moons_test = moons_split

Step 3: Use grid search with cross-validation to find good hyperparameter values for a DecisionTreeClassifier (the cells below use a randomized search over the candidate values)

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Base classifier whose hyperparameters are tuned by the search further down.
tree_clf = DecisionTreeClassifier(random_state=42)

# Standardize the features, then project them onto their principal components.
# The pipeline is fitted on the training split only and then applied as-is to
# the test split.
preprocessing_steps = [StandardScaler(), PCA()]
pca_pipeline = make_pipeline(*preprocessing_steps)
X_rotated_moons_train = pca_pipeline.fit_transform(X_moons_train)
X_rotated_moons_test = pca_pipeline.transform(X_moons_test)

In [4]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# Candidate values for each tuned hyperparameter.
# min_samples_leaf and max_features are not part of this search.
param_distrib = {
    "max_depth": [*range(1, 10)],
    "min_samples_split": [2, 3, 4],
    "max_leaf_nodes": [*range(2, 100)],
}

# Try 100 sampled combinations, each scored with 5-fold cross-validation on
# the PCA-rotated training data.
rnd_search_cv = RandomizedSearchCV(
    estimator=tree_clf,
    param_distributions=param_distrib,
    n_iter=100,
    cv=5,
    random_state=42,
)
rnd_search_cv.fit(X_rotated_moons_train, y_moons_train)
rnd_search_cv.best_estimator_

DecisionTreeClassifier(max_depth=8, max_leaf_nodes=26, min_samples_split=3,
                       random_state=42)

In [5]:
rnd_search_cv.best_score_

0.860125

In [6]:
from sklearn.metrics import accuracy_score

# Score the tuned tree on the held-out (PCA-rotated) test split.
y_pred = rnd_search_cv.best_estimator_.predict(X_rotated_moons_test)
# accuracy_score is documented as (y_true, y_pred): ground truth goes first,
# matching the other accuracy_score calls in this notebook.
accuracy_score(y_moons_test, y_pred)

0.854

6_8: Grow a forest by following the given steps

Step 1: Generate 1_000 subsets of the training set, each containing 100 instances selected randomly

In [10]:
from sklearn.model_selection import ShuffleSplit

n_trees = 1000
n_instances = 100

# Each split keeps n_instances rows on its "train" side; all remaining rows
# fall on the "test" side of the split, which is ignored here.
rs = ShuffleSplit(
    n_splits=n_trees,
    test_size=len(X_moons_train) - n_instances,
    random_state=42,
)

# One (features, labels) pair per split, built from the train-side indices.
mini_sets = [
    (X_moons_train[mini_train_index], y_moons_train[mini_train_index])
    for mini_train_index, _ in rs.split(X_moons_train)
]

Step 2: Train one decision tree on each subset, using the best hyperparameter values found above. Evaluate these 1_000 decision trees on the test set.

In [12]:
from sklearn.base import clone
import numpy as np

# Unfitted copies of the tuned tree, one per mini training set.
tuned_tree = rnd_search_cv.best_estimator_
forest = [clone(tuned_tree) for _ in range(n_trees)]

# Fit every tree on its own mini training set.
for tree, (X_mini_train, y_mini_train) in zip(forest, mini_sets):
    tree.fit(X_mini_train, y_mini_train)

# Test-set accuracy of each individual tree.
accuracy_scores = [
    accuracy_score(y_moons_test, tree.predict(X_moons_test)) for tree in forest
]

np.mean(accuracy_scores)

0.7905115

In [13]:
# Collect every tree's test-set predictions: one row per tree, one column per
# test instance, stored as uint8.
Y_pred = np.array(
    [tree.predict(X_moons_test) for tree in forest],
    dtype=np.uint8,
)

In [14]:
from scipy.stats import mode

# Majority vote across trees (axis 0): the most frequent prediction for each
# test instance, plus how many trees voted for it.
vote_result = mode(Y_pred, axis=0)
y_pred_majority_votes = vote_result.mode
n_votes = vote_result.count

Step 4: Evaluate these predictions on the test set

In [17]:
# Flatten the vote array to 1-D before comparing it with the true labels.
accuracy_score(y_moons_test, y_pred_majority_votes.ravel())

0.868