In [None]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, RandomizedSearchCV, ShuffleSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint

# Step 1: Load the wine dataset and pull out its data and target arrays
wine_data = load_wine()
X = wine_data.data
y = wine_data.target

# Step 2: Hold out 20% of the samples as a test set (seeded for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 3: Hyperparameter tuning using RandomizedSearchCV
#
# Search space for the decision tree. List entries are sampled uniformly;
# scipy's randint(low, high) draws integers from [low, high - 1].
#
# 'auto' is deliberately NOT offered for max_features: it was deprecated in
# scikit-learn 1.1 and removed in 1.3. On current versions every candidate that
# samples it fails to fit (wasting search iterations and emitting warnings), and
# on older versions it was just an alias of 'sqrt' for DecisionTreeClassifier.
param_dist = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': randint(1, 20),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'max_features': ['sqrt', 'log2', None]
}

# Seeded base estimator; the search clones it for each sampled candidate.
tree_classifier = DecisionTreeClassifier(random_state=42)

# 100 sampled candidates, each scored with 5-fold cross-validation, run in parallel.
random_search = RandomizedSearchCV(tree_classifier, param_distributions=param_dist, n_iter=100, cv=5, random_state=42, n_jobs=-1)

random_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best Hyperparameters:", random_search.best_params_)

# Step 4: Score the best tree found by the search on the held-out test split
best_tree = random_search.best_estimator_

# score() returns a fraction; scale it to a percentage for display
accuracy_on_test_set = 100 * best_tree.score(X_test, y_test)
print(f"Accuracy on the test set: {accuracy_on_test_set:.2f}%")

# Now, let's move on to building the Random Forest

# ShuffleSplit configured for 10 random splits with a 20% held-out share.
# NOTE(review): `cv` is not consumed by any code in this cell -- the forest below
# is fit directly on the full training set. It is kept only in case another cell
# relies on it; confirm and remove if not.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)

# Reuse the tuned decision-tree hyperparameters for a 10-tree forest.
# 'splitter' is part of the tree search space but is not a RandomForestClassifier
# constructor argument, so forwarding best_params_ unchanged raises
# "TypeError: __init__() got an unexpected keyword argument 'splitter'".
# Drop it before unpacking.
forest_params = {
    name: value
    for name, value in random_search.best_params_.items()
    if name != 'splitter'
}
forest = RandomForestClassifier(**forest_params, n_estimators=10, random_state=42)

# Fit the Random Forest on the entire training set
forest.fit(X_train, y_train)

# Evaluate the forest on the test dataset (score() is a fraction; scale to percent)
accuracy_on_test_set_forest = forest.score(X_test, y_test) * 100

print(f"Accuracy on the test set using Random Forest: {accuracy_on_test_set_forest:.2f}%")


In [1]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, RandomizedSearchCV, ShuffleSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint

In [5]:
# Load the wine dataset and unpack its data and target arrays
wine_data = load_wine()
X = wine_data.data
y = wine_data.target

# 80/20 train/test split with a fixed seed so the partition is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Search space for the decision tree used by RandomizedSearchCV.
# List entries are sampled uniformly; scipy's randint(low, high) draws integers
# from [low, high - 1].
#
# 'auto' is deliberately NOT offered for max_features: it was deprecated in
# scikit-learn 1.1 and removed in 1.3. On current versions every candidate that
# samples it fails to fit, and on older versions it was just an alias of 'sqrt'
# for DecisionTreeClassifier, so nothing is lost by removing it.
param_dist = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': randint(1, 20),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'max_features': ['sqrt', 'log2', None]
}


In [8]:
# Seeded base estimator handed to the search
tree_classifier = DecisionTreeClassifier(random_state=42)

# Randomized search: 100 sampled candidates, 5-fold cross-validation, parallel jobs
random_search = RandomizedSearchCV(
    estimator=tree_classifier,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

# Report the winning hyperparameter combination
print("Best Hyperparameters:", random_search.best_params_)

Best Hyperparameters: {'criterion': 'gini', 'max_depth': 6, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 5, 'splitter': 'best'}


In [16]:
# Take the best estimator found by the search and score it on the held-out
# test split; score() returns a fraction, so scale it to a percentage.
best_tree = random_search.best_estimator_
accuracy_on_test_set = 100 * best_tree.score(X_test, y_test)
print(f"Accuracy on the test set: {accuracy_on_test_set:.2f}%")

Accuracy on the test set: 94.44%
