In [1]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import randint
from sklearn.metrics import accuracy_score
from sklearn.model_selection import ShuffleSplit

In [2]:
#Loading the wine dataset
wine = load_wine()
X = wine.data
y = wine.target

In [3]:
#Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
#Hyperparameter tuning using RandomizedSearchCV
param_dist = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': randint(1, 20),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'max_features': ['auto', 'sqrt', 'log2', None]
}

In [5]:
dt_classifier = DecisionTreeClassifier()
random_search = RandomizedSearchCV(dt_classifier, param_distributions=param_dist, n_iter=100, cv=5, scoring='accuracy')
random_search.fit(X_train, y_train)

RandomizedSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_iter=100,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x0000020F7538A550>,
                                        'max_features': ['auto', 'sqrt', 'log2',
                                                         None],
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x0000020F78CF9460>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x0000020F78D394F0>,
                                        'splitter': ['best', 'random']},
                   scoring='accuracy')

In [6]:
# Best parameters
best_params = random_search.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'criterion': 'entropy', 'max_depth': 14, 'max_features': None, 'min_samples_leaf': 5, 'min_samples_split': 9, 'splitter': 'best'}


In [8]:
#Evaluate the model
best_dt_model = random_search.best_estimator_
y_pred = best_dt_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of Decision Tree:", accuracy)

Accuracy of Decision Tree: 0.9444444444444444


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Step 1: Create 10 subsets of the training dataset using ShuffleSplit
shuffle_split = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)

# Step 2: Train 1 decision tree on each subset
forest = []
for train_index, _ in shuffle_split.split(X_train):
    X_train_subset, y_train_subset = X_train[train_index], y_train[train_index]
    dt_classifier = DecisionTreeClassifier(**best_params)  # Using the best parameters found earlier
    dt_classifier.fit(X_train_subset, y_train_subset)
    forest.append(dt_classifier)

# Step 3: Evaluate all the trees on the test dataset
forest_predictions = [dt.predict(X_test) for dt in forest]

# Calculate accuracy for each tree
forest_accuracies = [accuracy_score(y_test, pred) for pred in forest_predictions]

# Calculate average accuracy of the forest
average_accuracy = sum(forest_accuracies) / len(forest_accuracies)
print("Average accuracy of Random Forest:", average_accuracy)


RANDOM FOREST

In [9]:
from sklearn.ensemble import RandomForestClassifier

In [10]:
#Create 10 subsets of the training dataset using ShuffleSplit
shuffle_split = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)

In [11]:
#Train 1 decision tree on each subset
forest = []
for train_index, _ in shuffle_split.split(X_train):
    X_train_subset, y_train_subset = X_train[train_index], y_train[train_index]
    dt_classifier = DecisionTreeClassifier(**best_params)  # Using the best parameters found earlier
    dt_classifier.fit(X_train_subset, y_train_subset)
    forest.append(dt_classifier)

In [12]:
#Evaluate all the trees on the test dataset
forest_predictions = [dt.predict(X_test) for dt in forest]

In [13]:
# Calculate accuracy for each tree
forest_accuracies = [accuracy_score(y_test, pred) for pred in forest_predictions]

In [14]:
# Calculate average accuracy of the forest
average_accuracy = sum(forest_accuracies) / len(forest_accuracies)
print("Average accuracy of Random Forest:", average_accuracy)

Average accuracy of Random Forest: 0.9138888888888888
