In [11]:
import numpy as np
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, RandomizedSearchCV, ShuffleSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Fetch the wine dataset bundled with scikit-learn and separate the
# feature matrix from the class labels.
wine = load_wine()
X = wine.data
y = wine.target

# Hold out 20% of the samples as a test set; the fixed seed keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Hyperparameter search space for the decision tree. It contains
# 2 * 2 * 6 * 3 * 3 = 216 combinations, so the 100 candidates sampled
# below cover a bit less than half of it.
param_dist = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Base estimator for the search. It is seeded explicitly: the search's own
# random_state only fixes which candidates are sampled, while an unseeded
# tree still permutes features at every split (and draws random thresholds
# when splitter='random'), so scores and the selected parameters would
# otherwise vary between runs.
dt_classifier = DecisionTreeClassifier(random_state=42)

# Randomized search: 100 sampled candidates, each scored by 5-fold
# cross-validated accuracy on the training set, using all available cores.
random_search = RandomizedSearchCV(
    dt_classifier,
    param_distributions=param_dist,
    n_iter=100,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Best hyperparameter combination found by the search
best_params = random_search.best_params_
print("Best Hyperparameters:", best_params)

# Score the held-out test set. predict() is served by the best estimator,
# which the search refits on the full training set (refit=True by default).
y_pred = random_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test Set Accuracy:", accuracy)
# Hand-built ensemble: 10 decision trees, each trained on a different random
# 80% subsample of the training set and combined by majority vote. Note that
# this is not scikit-learn's RandomForestClassifier; the label printed below
# is kept as-is for continuity with earlier output.

# ShuffleSplit yields 10 (train, held-out) index pairs over X_train; only the
# train indices are used, so every subset is drawn without replacement.
shuffle_split = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)

# Fitted trees, one per subset
trees = []

# Train one decision tree on each of the 10 subsets
for seed, (train_index, _) in enumerate(shuffle_split.split(X_train)):
    subset_X_train, subset_y_train = X_train[train_index], y_train[train_index]

    # Use the best hyperparameters found by the search. Each tree gets its
    # own fixed seed so the ensemble is reproducible from run to run while
    # the trees still differ from one another.
    tree = DecisionTreeClassifier(random_state=seed, **best_params)
    tree.fit(subset_X_train, subset_y_train)
    trees.append(tree)

# Collect every tree's test-set predictions, then transpose so that each row
# holds the 10 votes cast for a single test sample.
ensemble_predictions = np.array([tree.predict(X_test) for tree in trees])
ensemble_predictions = np.transpose(ensemble_predictions)

# Majority vote per test sample. np.bincount requires non-negative integer
# class labels (true for this dataset's targets); argmax returns the first
# maximum, so ties are resolved in favour of the smallest label.
final_predictions = np.apply_along_axis(
    lambda votes: np.bincount(votes).argmax(),
    axis=1,
    arr=ensemble_predictions,
)

# Evaluate the accuracy of the ensemble on the test set
ensemble_accuracy = accuracy_score(y_test, final_predictions)
print("Random Forest Ensemble Test Set Accuracy:", ensemble_accuracy)


Best Hyperparameters: {'splitter': 'random', 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 50, 'criterion': 'gini'}
Test Set Accuracy: 0.9722222222222222
Random Forest Ensemble Test Set Accuracy: 0.9722222222222222
