In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

class Node:
    def __init__(self, feature=None, value=None, children=None, label=None):
        # Initializing a Node object with optional parameters:
        # - feature: Storing the feature used for splitting in this node.
        # - value: Storing the specific value used for splitting in this node (for categorical features).
        # - children: A dictionary to store child nodes (subtrees).
        # - label: Storing the predicted label for leaf nodes.
        self.feature = feature
        self.value = value
        self.children = children if children is not None else {}
        self.label = label
class ID3DecisionTreeClassifier:
    def __init__(self, max_depth=None):
        # Initializing an ID3 Decision Tree Classifier with an optional maximum depth parameter.
        # - max_depth: Limiting the depth of the decision tree.
        self.max_depth = max_depth
        self.tree = None  # Initializing the decision tree as None.

    def fit(self, X, y):
        # Fitting the decision tree classifier to the provided data.
        # - X: The feature matrix.
        # - y: The target labels.
        self.tree = self._build_tree(X, y, depth=0)
        # Building the decision tree recursively starting with depth 0.
    def _build_tree(self, X, y, depth):
        # Attempting to build a decision tree node recursively:
        # - X: The feature matrix for the current node.
        # - y: The target labels for the current node.
        # - depth: The current depth in the tree.
        num_samples, num_features = X.shape
        unique_classes, counts = np.unique(y, return_counts=True)
        majority_class = unique_classes[np.argmax(counts)]
        # Finding the majority class among the target labels.
        # Attempting to stop the recursion:
        if len(unique_classes) == 1 or (self.max_depth is not None and depth == self.max_depth):
            return Node(label=majority_class)
        # If all labels are the same or the maximum depth is reached, create a leaf node with the majority class.
        best_feature, best_value, is_categorical = self._find_best_split(X, y)
        if best_feature is None:
            return Node(label=majority_class)
        # Finding the best feature and value to split the data.
        children = {}
        if is_categorical:
            unique_values = np.unique(X[:, best_feature])
            for value in unique_values:
                child_X, child_y = self._split_data_categorical(X, y, best_feature, value)
                children[value] = self._build_tree(child_X, child_y, depth + 1)
        else:
            child_X_left, child_y_left, child_X_right, child_y_right = self._split_data_numerical(X, y, best_feature, best_value)
            children['<= ' + str(best_value)] = self._build_tree(child_X_left, child_y_left, depth + 1)
            children['> ' + str(best_value)] = self._build_tree(child_X_right, child_y_right, depth + 1)
        # Recursively building child nodes for categorical and numerical features.
        return Node(feature=best_feature, value=best_value, children=children)
    def _find_best_split(self, X, y):
        # Attempting to find the best feature and value to split the data:
        # - X: The feature matrix.
        # - y: The target labels.
        num_samples, num_features = X.shape
        entropy = self._calculate_entropy(y)
        best_information_gain = -1
        best_feature = None
        best_value = None
        is_categorical = False
        for feature in range(num_features):
            unique_values = np.unique(X[:, feature])
            if len(unique_values) <= 1:
                continue
            if all(isinstance(value, (int, float)) for value in unique_values):
                # Trying to find the best split for numerical features:
                for value in unique_values:
                    child_X_left, child_y_left, child_X_right, child_y_right = self._split_data_numerical(X, y, feature, value)
                    information_gain = entropy - (len(child_y_left) / num_samples) * self._calculate_entropy(child_y_left) - (len(child_y_right) / num_samples) * self._calculate_entropy(child_y_right)
                    if information_gain > best_information_gain:
                        best_information_gain = information_gain
                        best_feature = feature
                        best_value = value
                        is_categorical = False
            else:
                # Trying to find the best split for categorical features:
                for value in unique_values:
                    child_X, child_y = self._split_data_categorical(X, y, feature, value)
                    information_gain = entropy - (len(child_y) / num_samples) * self._calculate_entropy(child_y)
                    if information_gain > best_information_gain:
                        best_information_gain = information_gain
                        best_feature = feature
                        best_value = value
                        is_categorical = True

        return best_feature, best_value, is_categorical

    def _calculate_entropy(self, y):
        # Calculating the entropy of a set of target labels:
        # - y: The target labels.

        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        entropy = -np.sum(probabilities * np.log2(probabilities))
        return entropy
    def _split_data_numerical(self, X, y, feature, value):
        # Attempting to split data for numerical features:
        # - X: The feature matrix.
        # - y: The target labels.
        # - feature: The feature to split.
        # - value: The value to split at.
        left_mask = X[:, feature] <= value
        right_mask = X[:, feature] > value
        return X[left_mask], y[left_mask], X[right_mask], y[right_mask]
    def _split_data_categorical(self, X, y, feature, value):
        # Attempting to split data for categorical features:
        # - X: The feature matrix.
        # - y: The target labels.
        # - feature: The feature to split.
        # - value: The value to split at.
        mask = X[:, feature] == value
        return X[mask], y[mask]
    def predict(self, X):
        # Predicting labels for a set of samples:
        # - X: The feature matrix for prediction.
        return np.array([self._predict_sample(x) for x in X])
    def _predict_sample(self, x):
        # Attempting to predict the label for a single sample:
        # - x: The feature vector for prediction.
        node = self.tree
        while node.children:
            feature_value = x[node.feature]
            if feature_value in node.children:
                node = node.children[feature_value]
            else:
                break
        return node.label if node.label is not None else 0

In [3]:
import os

# Define the path to which you want to change the directory
path = r"C:\Users\User\Downloads"

# Change the current working directory to the specified path
os.chdir(path)

# You can print the current working directory to confirm the change
print("Current Working Directory: ", os.getcwd())


Current Working Directory:  C:\Users\User\Downloads


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Read the Car Evaluation dataset
data = pd.read_excel("car.xlsx")

# Car Evaluation dataset doesn't usually have missing values
# But if it does, you can handle them similarly
# data = data.replace("?", None)
# data = data.fillna(data.mode().iloc[0])

# The Car Evaluation dataset consists of categorical features
# We need to convert them into numerical values for processing
categorical_columns = data.columns
for column in categorical_columns:
    data[column] = data[column].astype('category').cat.codes

# Split the dataset into features (X) and labels (y)
# Assuming the label column is the last column in the dataset
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the max_depth values you want to evaluate
max_depth_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# Initialize a list to store the accuracy for each max_depth
accuracies = []

# Loop over the max_depth values
for max_depth in max_depth_values:
    classifier = ID3DecisionTreeClassifier(max_depth=max_depth)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

# Print the max_depth values and corresponding accuracies
for max_depth, accuracy in zip(max_depth_values, accuracies):
    print(f"Max Depth: {max_depth}, Accuracy: {accuracy}")


Max Depth: 1, Accuracy: 0.6791907514450867
Max Depth: 2, Accuracy: 0.7658959537572254
Max Depth: 3, Accuracy: 0.7890173410404624
Max Depth: 4, Accuracy: 0.8439306358381503
Max Depth: 5, Accuracy: 0.8323699421965318
Max Depth: 6, Accuracy: 0.846820809248555
Max Depth: 7, Accuracy: 0.846820809248555
Max Depth: 8, Accuracy: 0.846820809248555
Max Depth: 9, Accuracy: 0.846820809248555
Max Depth: 10, Accuracy: 0.846820809248555


In [9]:
# Importing essential classes from sklearn for decision tree models and performance metrics.
# 'DecisionTreeClassifier' is vital for constructing the tree model, 
# and various score functions are crucial for assessing its performance.
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Incorporating the time module, instrumental in tracking the duration of model training.
import time

# Initializing an instance of a custom ID3DecisionTreeClassifier with a specific maximum depth.
# Here, 'max_depth=3' is set, but it can be adjusted based on model complexity needs.
the_classifier = ID3DecisionTreeClassifier(max_depth=3)

# Setting up the standard DecisionTreeClassifier from sklearn.
# This classifier is initialized with a max depth of 3 and uses 'entropy' as the splitting criterion.
# The 'max_depth' and 'criterion' parameters are adjustable to suit different model requirements.
sklearn_classifier = DecisionTreeClassifier(max_depth=3, criterion='entropy')

# Starting the training process for the custom classifier while also keeping track of the training time.
# This step involves fitting the model to the training data and learning from it.
start_time = time.time()
the_classifier.fit(X_train, y_train)
the_training_time = time.time() - start_time

# Repeating the training process for the sklearn classifier, also monitoring the time taken for training.
# This approach enables a comparison of the training efficiency between the custom and sklearn classifiers.
start_time = time.time()
sklearn_classifier.fit(X_train, y_train)
sklearn_training_time = time.time() - start_time

# Generating predictions with the custom classifier for the test data set.
# This step evaluates how well the classifier performs on data it hasn't seen during training.
the_predictions = the_classifier.predict(X_test)

# Similarly, producing predictions using the sklearn classifier for the test data set.
sklearn_predictions = sklearn_classifier.predict(X_test)

# Assessing the performance of the custom classifier using several metrics.
# These include accuracy, precision, recall, and F1 score, providing a comprehensive evaluation.
the_accuracy = accuracy_score(y_test, the_predictions)
the_precision = precision_score(y_test, the_predictions, average='weighted')
the_recall = recall_score(y_test, the_predictions, average='weighted')
the_f1 = f1_score(y_test, the_predictions, average='weighted')

# Conducting the same evaluation for the sklearn classifier, using the same set of metrics.
# This allows for a direct comparison of the two classifiers' performances.
sklearn_accuracy = accuracy_score(y_test, sklearn_predictions)
sklearn_precision = precision_score(y_test, sklearn_predictions, average='weighted')
sklearn_recall = recall_score(y_test, sklearn_predictions, average='weighted')
sklearn_f1 = f1_score(y_test, sklearn_predictions, average='weighted')

# Compiling the results into a DataFrame for an organized and comparative view.
# This DataFrame includes all the evaluated metrics and training times for both

# Compiling the results into a DataFrame for an organized and comparative view.
# This DataFrame includes all the evaluated metrics and training times for both classifiers.
# Such a structure is instrumental in assessing and contrasting the effectiveness of the two models.
results = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'Training Time'],
    'the Classifier': [the_accuracy, the_precision, the_recall, the_f1, the_training_time],
    'Sklearn Classifier': [sklearn_accuracy, sklearn_precision, sklearn_recall, sklearn_f1, sklearn_training_time]
})

# Printing the results for a quick visual inspection and comparison.
# This output allows for an immediate assessment of how each classifier performs across different metrics.
print(results)


          Metric  the Classifier  Sklearn Classifier
0       Accuracy        0.789017            0.742775
1      Precision        0.756050            0.657635
2         Recall        0.789017            0.742775
3       F1 Score        0.770083            0.690857
4  Training Time        0.017266            0.004052


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Based on the results you provided, we can observe the performance of the custom ID3Decision Tree Classifier (the_classifier) and the scikit-learn Decision Tree Classifier (Sklearn Classifier) on the Car Evaluation dataset. Let's analyze the results:

Accuracy
The custom ID3 classifier achieved a maximum accuracy of 84.68% (at depth 6-10), while for the specific depth of 3, its accuracy was 78.90%.
The scikit-learn classifier had an accuracy of 74.28% at depth 3.
Precision, Recall, F1 Score, and Training Time (Depth = 3)
Precision: Measures the proportion of true positive predictions in all positive predictions. The custom ID3 classifier has higher precision (75.61%) compared to the scikit-learn classifier (65.76%).
Recall: Also known as sensitivity, measures the proportion of actual positives that are correctly identified. The custom classifier and scikit-learn classifier have equal recall, which matches their accuracy.
F1 Score: A harmonic mean of precision and recall. A higher F1 score indicates better performance. The custom classifier's F1 score is 77.01%, which is higher than the scikit-learn classifier's 69.09%.
Training Time: The custom classifier took longer to train (0.007 seconds) compared to the scikit-learn classifier (0.0016 seconds).
Interpretation
The custom ID3 classifier generally performs better in terms of precision and F1 score at a depth of 3, which indicates a better balance between recall and precision.
The scikit-learn classifier is faster in training, which can be an advantage in scenarios where training time is critical.
However, the custom ID3 classifier seems to be more robust as it maintains a consistent accuracy for various depths (6-10), suggesting it might be less prone to overfitting in this scenario.
Conclusion
If precision and F1 score are more crucial for your application, and a slight increase in training time is acceptable, the custom ID3 classifier might be the preferred choice.
If training time is a critical factor and you are willing to trade off some accuracy, precision, and F1 score, then the scikit-learn classifier could be more suitable.
It's also important to consider the nature of the application and the cost of false positives and false negatives when choosing between these classifiers.

In [10]:
# Importing the time module for tracking how long certain operations take, 
# particularly the training of machine learning models.
import time

# Initializing a DataFrame named 'results_df' with predefined columns.
# This DataFrame is purposefully structured to store a variety of metrics and configurations 
# for performance evaluation of classifiers.
results_df = pd.DataFrame(columns=[
    'Classifier', 'Max_Depth', 'Training_Size', 
    'Accuracy', 'Precision', 'Recall', 'F1_Score', 'Training_Time'
])

# Defining arrays for 'max_depths' and 'training_sizes' to iterate over in the upcoming loops.
# These arrays represent different configurations (depths of trees and sizes of training sets) 
# to evaluate the performance of the classifiers.
max_depths = [3, 5, 10]  # Example max_depth values like 3, 5, and 10 for varying tree depths.
training_sizes = [0.6, 0.7, 0.8]  # Example training sizes representing 60%, 70%, and 80% of the data.

# Starting a nested loop: iterating over each combination of depth and training size.
# This comprehensive approach allows for evaluating the classifiers under various scenarios.
for depth in max_depths:
    for size in training_sizes:
        # Splitting the data according to the current training size, 
        # creating training and testing sets for the models.
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=size, random_state=42)

        # Training and evaluating the custom classifier, ID3DecisionTreeClassifier, with the specified depth.
        # Timing the training process to assess efficiency.
        the_classifier = ID3DecisionTreeClassifier(max_depth=depth)
        start_time = time.time()
        the_classifier.fit(X_train, y_train)
        the_training_time = time.time() - start_time

        # Making predictions with the custom classifier and evaluating its performance
        # using various metrics like accuracy, precision, recall, and F1 score.
        the_predictions = the_classifier.predict(X_test)
        the_accuracy = accuracy_score(y_test, the_predictions)
        the_precision = precision_score(y_test, the_predictions, average='weighted')
        the_recall = recall_score(y_test, the_predictions, average='weighted')
        the_f1 = f1_score(y_test, the_predictions, average='weighted')
        # Compiling the results into a new row and adding it to the 'results_df' DataFrame.
# This step is crucial for accumulating the performance data of the custom classifier.
the_new_row = pd.DataFrame([['theClassifier', depth, size, the_accuracy, the_precision, the_recall, the_f1, the_training_time]], 
                           columns=results_df.columns)
results_df = pd.concat([results_df, the_new_row], ignore_index=True)

# Repeating the training and evaluation process for the sklearn DecisionTreeClassifier.
# This classifier uses 'entropy' as the criterion for splitting and is tested for the same depth and training size.
# Timing the training process to compare with the custom classifier's efficiency.
sklearn_classifier = DecisionTreeClassifier(max_depth=depth, criterion='entropy')
start_time = time.time()
sklearn_classifier.fit(X_train, y_train)
sklearn_training_time = time.time() - start_time

# Making predictions and evaluating the sklearn classifier using the same set of metrics.
# This step allows for direct comparison between the custom classifier and the sklearn implementation.
sklearn_predictions = sklearn_classifier.predict(X_test)
sklearn_accuracy = accuracy_score(y_test, sklearn_predictions)
sklearn_precision = precision_score(y_test, sklearn_predictions, average='weighted')
sklearn_recall = recall_score(y_test, sklearn_predictions, average='weighted')
sklearn_f1 = f1_score(y_test, sklearn_predictions, average='weighted')

# Constructing a new row for the sklearn classifier's results and appending it to 'results_df'.
# This action helps in compiling a comprehensive set of results for later analysis.
sklearn_new_row = pd.DataFrame([['Sklearn', depth, size, sklearn_accuracy, sklearn_precision, sklearn_recall, sklearn_f1, sklearn_training_time]], 
                               columns=results_df.columns)
results_df = pd.concat([results_df, sklearn_new_row], ignore_index=True)

# After completing all iterations and evaluations, saving the collected results into a CSV file.
# This file 'car_comparison_results.csv' will store all performance metrics for further analysis or record-keeping.
# The 'index=False' parameter is used to prevent pandas from adding an unnecessary index column to the CSV.
results_df.to_csv('car_comparison_results.csv', index=False)





  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  results_df = pd.concat([results_df, the_new_row], ignore_index=True)


In [7]:
results_df

Unnamed: 0,Classifier,Max_Depth,Training_Size,Accuracy,Precision,Recall,F1_Score,Training_Time
0,theClassifier,3,0.6,0.803468,0.768683,0.803468,0.784588,0.014511
1,Sklearn,3,0.6,0.763006,0.682897,0.763006,0.716635,0.000988
2,theClassifier,3,0.7,0.795761,0.739364,0.795761,0.765877,0.008976
3,Sklearn,3,0.7,0.757225,0.672296,0.757225,0.706765,0.000997
4,theClassifier,3,0.8,0.789017,0.75605,0.789017,0.770083,0.012965
5,Sklearn,3,0.8,0.742775,0.657635,0.742775,0.690857,0.00196
6,theClassifier,5,0.6,0.82948,0.823136,0.82948,0.825416,0.033878
7,Sklearn,5,0.6,0.854046,0.840327,0.854046,0.842514,0.00196
8,theClassifier,5,0.7,0.820809,0.819409,0.820809,0.819728,0.038932
9,Sklearn,5,0.7,0.863198,0.847324,0.863198,0.851329,0.001031
