In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

class Node:
    def __init__(self, feature=None, value=None, children=None, label=None):
        # Initializing a Node object with optional parameters:
        # - feature: Storing the feature used for splitting in this node.
        # - value: Storing the specific value used for splitting in this node (for categorical features).
        # - children: A dictionary to store child nodes (subtrees).
        # - label: Storing the predicted label for leaf nodes.
        self.feature = feature
        self.value = value
        self.children = children if children is not None else {}
        self.label = label
class ID3DecisionTreeClassifier:
    def __init__(self, max_depth=None):
        # Initializing an ID3 Decision Tree Classifier with an optional maximum depth parameter.
        # - max_depth: Limiting the depth of the decision tree.
        self.max_depth = max_depth
        self.tree = None  # Initializing the decision tree as None.

    def fit(self, X, y):
        # Fitting the decision tree classifier to the provided data.
        # - X: The feature matrix.
        # - y: The target labels.
        self.tree = self._build_tree(X, y, depth=0)
        # Building the decision tree recursively starting with depth 0.
    def _build_tree(self, X, y, depth):
        # Attempting to build a decision tree node recursively:
        # - X: The feature matrix for the current node.
        # - y: The target labels for the current node.
        # - depth: The current depth in the tree.
        num_samples, num_features = X.shape
        unique_classes, counts = np.unique(y, return_counts=True)
        majority_class = unique_classes[np.argmax(counts)]
        # Finding the majority class among the target labels.
        # Attempting to stop the recursion:
        if len(unique_classes) == 1 or (self.max_depth is not None and depth == self.max_depth):
            return Node(label=majority_class)
        # If all labels are the same or the maximum depth is reached, create a leaf node with the majority class.
        best_feature, best_value, is_categorical = self._find_best_split(X, y)
        if best_feature is None:
            return Node(label=majority_class)
        # Finding the best feature and value to split the data.
        children = {}
        if is_categorical:
            unique_values = np.unique(X[:, best_feature])
            for value in unique_values:
                child_X, child_y = self._split_data_categorical(X, y, best_feature, value)
                children[value] = self._build_tree(child_X, child_y, depth + 1)
        else:
            child_X_left, child_y_left, child_X_right, child_y_right = self._split_data_numerical(X, y, best_feature, best_value)
            children['<= ' + str(best_value)] = self._build_tree(child_X_left, child_y_left, depth + 1)
            children['> ' + str(best_value)] = self._build_tree(child_X_right, child_y_right, depth + 1)
        # Recursively building child nodes for categorical and numerical features.
        return Node(feature=best_feature, value=best_value, children=children)
    def _find_best_split(self, X, y):
        # Attempting to find the best feature and value to split the data:
        # - X: The feature matrix.
        # - y: The target labels.
        num_samples, num_features = X.shape
        entropy = self._calculate_entropy(y)
        best_information_gain = -1
        best_feature = None
        best_value = None
        is_categorical = False
        for feature in range(num_features):
            unique_values = np.unique(X[:, feature])
            if len(unique_values) <= 1:
                continue
            if all(isinstance(value, (int, float)) for value in unique_values):
                # Trying to find the best split for numerical features:
                for value in unique_values:
                    child_X_left, child_y_left, child_X_right, child_y_right = self._split_data_numerical(X, y, feature, value)
                    information_gain = entropy - (len(child_y_left) / num_samples) * self._calculate_entropy(child_y_left) - (len(child_y_right) / num_samples) * self._calculate_entropy(child_y_right)
                    if information_gain > best_information_gain:
                        best_information_gain = information_gain
                        best_feature = feature
                        best_value = value
                        is_categorical = False
            else:
                # Trying to find the best split for categorical features:
                for value in unique_values:
                    child_X, child_y = self._split_data_categorical(X, y, feature, value)
                    information_gain = entropy - (len(child_y) / num_samples) * self._calculate_entropy(child_y)
                    if information_gain > best_information_gain:
                        best_information_gain = information_gain
                        best_feature = feature
                        best_value = value
                        is_categorical = True

        return best_feature, best_value, is_categorical

    def _calculate_entropy(self, y):
        # Calculating the entropy of a set of target labels:
        # - y: The target labels.

        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        entropy = -np.sum(probabilities * np.log2(probabilities))
        return entropy
    def _split_data_numerical(self, X, y, feature, value):
        # Attempting to split data for numerical features:
        # - X: The feature matrix.
        # - y: The target labels.
        # - feature: The feature to split.
        # - value: The value to split at.
        left_mask = X[:, feature] <= value
        right_mask = X[:, feature] > value
        return X[left_mask], y[left_mask], X[right_mask], y[right_mask]
    def _split_data_categorical(self, X, y, feature, value):
        # Attempting to split data for categorical features:
        # - X: The feature matrix.
        # - y: The target labels.
        # - feature: The feature to split.
        # - value: The value to split at.
        mask = X[:, feature] == value
        return X[mask], y[mask]
    def predict(self, X):
        # Predicting labels for a set of samples:
        # - X: The feature matrix for prediction.
        return np.array([self._predict_sample(x) for x in X])
    def _predict_sample(self, x):
        # Attempting to predict the label for a single sample:
        # - x: The feature vector for prediction.
        node = self.tree
        while node.children:
            feature_value = x[node.feature]
            if feature_value in node.children:
                node = node.children[feature_value]
            else:
                break
        return node.label if node.label is not None else 0


In [2]:
import os
path = r"C:\Users\User\Downloads"
os.chdir(path)
print("Current Working Directory: ", os.getcwd())

Current Working Directory:  C:\Users\User\Downloads


In [8]:
# Attempting to read the Mushroom dataset from an Excel file named 'agricus.xlsx'.
# The pandas library is being utilized for this operation, with 'pd' as its alias.
data = pd.read_excel("agricus.xlsx")

# Putting effort to handle any missing values that might be present in the dataset.
# It is known that the Mushroom dataset may contain missing values denoted as '?'.
# Replacing these '?' symbols with None (which is treated as NaN in pandas) to facilitate further processing.
# This step is crucial as handling missing data is essential for accurate analysis.
data = data.replace("?", None)

# Trying to impute the missing values with the mode (most frequent value) for each column.
# This approach is one of the standard methods for dealing with missing data,
# especially when deleting the missing data is not a viable option.
data = data.fillna(data.mode().iloc[0])

# Attempting to convert all categorical features in the dataset into numerical values.
# In the Mushroom dataset, all columns are considered categorical.
# This conversion is necessary as many machine learning algorithms require numerical input.
categorical_columns = data.columns
for column in categorical_columns:
    data[column] = data[column].astype('category').cat.codes

# Splitting the dataset into features (X) and labels (y).
# In the Mushroom dataset, the first column is typically used as the class label.
# This separation is essential for supervised learning, where the algorithm learns from the features to predict the labels.
X = data.iloc[:, 1:].values  # Extracting features: all columns except the first one
y = data.iloc[:, 0].values   # Extracting labels: the first column

# Dividing the data into training and testing sets.
# This split is crucial for evaluating the model's performance on unseen data.
# The 'train_test_split' function from sklearn is being used here,
# setting aside 20% of the data for testing (test_size=0.2).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initializing a list of max_depth values to evaluate.
# These values determine the maximum depth of the decision tree.
# Experimenting with different depths can help in understanding the effect of depth on model performance.
max_depth_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# Initializing a list to store the accuracy for each max_depth value.
# Accuracy is a common metric for evaluating classification models.
accuracies = []

# Looping over the different max_depth values.
# For each value, a decision tree classifier is being trained and its accuracy is evaluated.
for max_depth in max_depth_values:
    classifier = ID3DecisionTreeClassifier(max_depth=max_depth)
    classifier.fit(X_train, y_train)  # Training the classifier with the training set
    y_pred = classifier.predict(X_test)  # Predicting labels for the test set
    accuracy = accuracy_score(y_test, y_pred)  # Calculating the accuracy of the predictions
    accuracies.append(accuracy)  # Appending the calculated accuracy to the list

# Printing the max_depth values alongside their corresponding accuracies.
# This output is helpful in determining the optimal depth for the decision tree.
for max_depth, accuracy in zip(max_depth_values, accuracies):
    print(f"Max Depth: {max_depth}, Accuracy: {accuracy}")



Max Depth: 1, Accuracy: 0.5612307692307692
Max Depth: 2, Accuracy: 0.6184615384615385
Max Depth: 3, Accuracy: 0.8061538461538461
Max Depth: 4, Accuracy: 0.9766153846153847
Max Depth: 5, Accuracy: 0.9969230769230769
Max Depth: 6, Accuracy: 0.9969230769230769
Max Depth: 7, Accuracy: 0.9987692307692307
Max Depth: 8, Accuracy: 0.9993846153846154
Max Depth: 9, Accuracy: 0.9993846153846154
Max Depth: 10, Accuracy: 0.9993846153846154


In [10]:
# Importing necessary classes from sklearn for decision tree classification and various performance metrics.
# The DecisionTreeClassifier is essential for building the tree model,
# while accuracy_score, precision_score, recall_score, and f1_score are for evaluating its performance.
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Including the time library for tracking the training duration of the classifiers.
import time
# Splitting the dataset into features (X) and labels (y) for the machine learning process.
# The features (predictors) are all columns except the first one, and the label (response variable) is the first column.
X = data.iloc[:, 1:].values
y = data.iloc[:, 0].values

# Dividing the data into training and testing sets to evaluate the model's performance on unseen data.
# This is a critical step in validating the effectiveness of the model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initializing an instance of a custom ID3DecisionTreeClassifier with a specified maximum depth of 3.
# This classifier is likely a custom implementation of the ID3 algorithm.
the_classifier = ID3DecisionTreeClassifier(max_depth=3)

# Initializing an instance of the scikit-learn DecisionTreeClassifier,
# specifying the maximum depth and using 'entropy' as the criterion for splitting.
# This classifier represents a standard implementation of decision trees in Python.
sklearn_classifier = DecisionTreeClassifier(max_depth=3, criterion='entropy')

# Measuring and recording the training time for the custom ID3DecisionTreeClassifier.
# Timing starts before training and ends after, calculating the time difference to get the training duration.
start_time = time.time()
the_classifier.fit(X_train, y_train)
the_training_time = time.time() - start_time

# Similarly, measuring and recording the training time for the scikit-learn DecisionTreeClassifier.
start_time = time.time()
sklearn_classifier.fit(X_train, y_train)
sklearn_training_time = time.time() - start_time

# Generating predictions with the custom classifier for the test data.
the_predictions = the_classifier.predict(X_test)

# Generating predictions with the sklearn classifier for the test data.
sklearn_predictions = sklearn_classifier.predict(X_test)

# Evaluating the performance of the custom classifier using accuracy, precision, recall, and F1 score.
# These metrics give a comprehensive view of the model's performance, considering aspects like false positives and negatives.
the_accuracy = accuracy_score(y_test, the_predictions)
the_precision = precision_score(y_test, the_predictions, average='weighted')
the_recall = recall_score(y_test, the_predictions, average='weighted')
the_f1 = f1_score(y_test, the_predictions, average='weighted')

# Similarly, evaluating the sklearn classifier with the same set of metrics.
sklearn_accuracy = accuracy_score(y_test, sklearn_predictions)
sklearn_precision = precision_score(y_test, sklearn_predictions, average='weighted')
sklearn_recall = recall_score(y_test, sklearn_predictions, average='weighted')
sklearn_f1 = f1_score(y_test, sklearn_predictions, average='weighted')

# Compiling all results into a pandas DataFrame for a consolidated view.
# This includes both classifiers' performance metrics and their training times for comparison.
results = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'Training Time'],
    'the Classifier': [the_accuracy, the_precision, the_recall, the_f1, the_training_time],
    'Sklearn Classifier': [sklearn_accuracy, sklearn_precision, sklearn_recall, sklearn_f1, sklearn_training_time]
})

# Printing the results for a quick inspection and comparison between the custom and sklearn classifiers.
print(results)



          Metric  the Classifier  Sklearn Classifier
0       Accuracy        0.806154            0.948923
1      Precision        0.806111            0.950407
2         Recall        0.806154            0.948923
3       F1 Score        0.806108            0.948937
4  Training Time        0.096362            0.005984


In [12]:
# Initializing a DataFrame named 'results_df' with specific columns. 
# This DataFrame is intended to store various metrics for performance evaluation of classifiers.
# These metrics include Accuracy, Precision, Recall, F1 Score, and Training Time, among others.
results_df = pd.DataFrame(columns=['Classifier', 'Max_Depth', 'Training_Size', 'Accuracy', 'Precision', 'Recall', 'F1_Score', 'Training_Time'])

# Setting up arrays for max_depths and training_sizes to iterate through.
# These represent different configurations to test the classifiers under various conditions.
max_depths = [3, 5, 10]  # Example max_depth values like 3, 5, and 10 to explore varying tree depths.
training_sizes = [0.6, 0.7, 0.8]  # Example training sizes (60%, 70%, 80%) to analyze different training set sizes.

# Beginning a nested loop: The outer loop iterates over max_depths, and the inner loop over training_sizes.
# This double loop is effectively testing all combinations of depths and training sizes.
for depth in max_depths:
    for size in training_sizes:
        # Splitting the dataset into training and testing sets based on the current training size.
        # The 'train_test_split' function is being used from scikit-learn, 
        # ensuring each split has the same random state for consistency.
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=size, random_state=42)

        # Initializing and training the custom ID3DecisionTreeClassifier with the current depth.
        # Also measuring the time taken to train the classifier.
        the_classifier = ID3DecisionTreeClassifier(max_depth=depth)
        start_time = time.time()
        the_classifier.fit(X_train, y_train)
        the_training_time = time.time() - start_time

        # Predicting and evaluating the performance of the custom classifier on the test set.
        # Calculating key metrics such as accuracy, precision, recall, and F1 score.
        the_predictions = the_classifier.predict(X_test)
        the_accuracy = accuracy_score(y_test, the_predictions)
        the_precision = precision_score(y_test, the_predictions, average='weighted')
        the_recall = recall_score(y_test, the_predictions, average='weighted')
        the_f1 = f1_score(y_test, the_predictions, average='weighted')

        # Constructing a new DataFrame row with the results of the custom classifier
        # and appending it to the 'results_df' DataFrame.
        the_new_row = pd.DataFrame([['theClassifier', depth, size, the_accuracy, the_precision, the_recall, the_f1, the_training_time]], 
                                   columns=results_df.columns)
        results_df = pd.concat([results_df, the_new_row], ignore_index=True)

        # Repeating the training and evaluation process for the sklearn DecisionTreeClassifier.
        # This classifier uses 'entropy' as the criterion and the same depth as the custom classifier.
        sklearn_classifier = DecisionTreeClassifier(max_depth=depth, criterion='entropy')
        start_time = time.time()
        sklearn_classifier.fit(X_train, y_train)
        sklearn_training_time = time.time() - start_time
        sklearn_predictions = sklearn_classifier.predict(X_test)
        sklearn_accuracy = accuracy_score(y_test, sklearn_predictions)
        sklearn_precision = precision_score(y_test, sklearn_predictions, average='weighted')
        sklearn_recall = recall_score(y_test, sklearn_predictions, average='weighted')
        sklearn_f1 = f1_score(y_test, sklearn_predictions, average='weighted')

        # Constructing a new DataFrame row with the results of the sklearn classifier
        # and appending it to the 'results_df' DataFrame.
        sklearn_new_row = pd.DataFrame([['Sklearn', depth, size, sklearn_accuracy, sklearn_precision, sklearn_recall, sklearn_f1, sklearn_training_time]], 
                                       columns=results_df.columns)
        results_df = pd.concat([results_df, sklearn_new_row], ignore_index=True)

# Finally, saving the accumulated results in 'results_df' to a CSV file named 'mushroom_comparison_results.csv'.
# The 'index=False' parameter ensures that the DataFrame index is not included in the CSV file.
results_df.to_csv('mushroom_comparison_results.csv', index=False)



  results_df = pd.concat([results_df, the_new_row], ignore_index=True)


In [13]:
results_df

Unnamed: 0,Classifier,Max_Depth,Training_Size,Accuracy,Precision,Recall,F1_Score,Training_Time
0,theClassifier,3,0.6,0.908308,0.911338,0.908308,0.908275,0.140143
1,Sklearn,3,0.6,0.957846,0.958543,0.957846,0.957857,0.003991
2,theClassifier,3,0.7,0.906071,0.908881,0.906071,0.906038,0.081295
3,Sklearn,3,0.7,0.95324,0.954195,0.95324,0.95325,0.003991
4,theClassifier,3,0.8,0.806154,0.806111,0.806154,0.806108,0.0793
5,Sklearn,3,0.8,0.948923,0.950407,0.948923,0.948937,0.006014
6,theClassifier,5,0.6,0.996,0.996031,0.996,0.995999,0.210542
7,Sklearn,5,0.6,0.984615,0.984647,0.984615,0.984613,0.003986
8,theClassifier,5,0.7,0.995898,0.995931,0.995898,0.995898,0.19861
9,Sklearn,5,0.7,0.983593,0.983602,0.983593,0.983592,0.003989
