In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

class Node:
    def __init__(self, feature=None, value=None, children=None, label=None):
        # Initializing a Node object with optional parameters:
        # - feature: Storing the feature used for splitting in this node.
        # - value: Storing the specific value used for splitting in this node (for categorical features).
        # - children: A dictionary to store child nodes (subtrees).
        # - label: Storing the predicted label for leaf nodes.
        self.feature = feature
        self.value = value
        self.children = children if children is not None else {}
        self.label = label
class ID3DecisionTreeClassifier:
    def __init__(self, max_depth=None):
        # Initializing an ID3 Decision Tree Classifier with an optional maximum depth parameter.
        # - max_depth: Limiting the depth of the decision tree.
        self.max_depth = max_depth
        self.tree = None  # Initializing the decision tree as None.

    def fit(self, X, y):
        # Fitting the decision tree classifier to the provided data.
        # - X: The feature matrix.
        # - y: The target labels.
        self.tree = self._build_tree(X, y, depth=0)
        # Building the decision tree recursively starting with depth 0.
    def _build_tree(self, X, y, depth):
        # Attempting to build a decision tree node recursively:
        # - X: The feature matrix for the current node.
        # - y: The target labels for the current node.
        # - depth: The current depth in the tree.
        num_samples, num_features = X.shape
        unique_classes, counts = np.unique(y, return_counts=True)
        majority_class = unique_classes[np.argmax(counts)]
        # Finding the majority class among the target labels.
        # Attempting to stop the recursion:
        if len(unique_classes) == 1 or (self.max_depth is not None and depth == self.max_depth):
            return Node(label=majority_class)
        # If all labels are the same or the maximum depth is reached, create a leaf node with the majority class.
        best_feature, best_value, is_categorical = self._find_best_split(X, y)
        if best_feature is None:
            return Node(label=majority_class)
        # Finding the best feature and value to split the data.
        children = {}
        if is_categorical:
            unique_values = np.unique(X[:, best_feature])
            for value in unique_values:
                child_X, child_y = self._split_data_categorical(X, y, best_feature, value)
                children[value] = self._build_tree(child_X, child_y, depth + 1)
        else:
            child_X_left, child_y_left, child_X_right, child_y_right = self._split_data_numerical(X, y, best_feature, best_value)
            children['<= ' + str(best_value)] = self._build_tree(child_X_left, child_y_left, depth + 1)
            children['> ' + str(best_value)] = self._build_tree(child_X_right, child_y_right, depth + 1)
        # Recursively building child nodes for categorical and numerical features.
        return Node(feature=best_feature, value=best_value, children=children)
    def _find_best_split(self, X, y):
        # Attempting to find the best feature and value to split the data:
        # - X: The feature matrix.
        # - y: The target labels.
        num_samples, num_features = X.shape
        entropy = self._calculate_entropy(y)
        best_information_gain = -1
        best_feature = None
        best_value = None
        is_categorical = False
        for feature in range(num_features):
            unique_values = np.unique(X[:, feature])
            if len(unique_values) <= 1:
                continue
            if all(isinstance(value, (int, float)) for value in unique_values):
                # Trying to find the best split for numerical features:
                for value in unique_values:
                    child_X_left, child_y_left, child_X_right, child_y_right = self._split_data_numerical(X, y, feature, value)
                    information_gain = entropy - (len(child_y_left) / num_samples) * self._calculate_entropy(child_y_left) - (len(child_y_right) / num_samples) * self._calculate_entropy(child_y_right)
                    if information_gain > best_information_gain:
                        best_information_gain = information_gain
                        best_feature = feature
                        best_value = value
                        is_categorical = False
            else:
                # Trying to find the best split for categorical features:
                for value in unique_values:
                    child_X, child_y = self._split_data_categorical(X, y, feature, value)
                    information_gain = entropy - (len(child_y) / num_samples) * self._calculate_entropy(child_y)
                    if information_gain > best_information_gain:
                        best_information_gain = information_gain
                        best_feature = feature
                        best_value = value
                        is_categorical = True

        return best_feature, best_value, is_categorical

    def _calculate_entropy(self, y):
        # Calculating the entropy of a set of target labels:
        # - y: The target labels.

        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        entropy = -np.sum(probabilities * np.log2(probabilities))
        return entropy
    def _split_data_numerical(self, X, y, feature, value):
        # Attempting to split data for numerical features:
        # - X: The feature matrix.
        # - y: The target labels.
        # - feature: The feature to split.
        # - value: The value to split at.
        left_mask = X[:, feature] <= value
        right_mask = X[:, feature] > value
        return X[left_mask], y[left_mask], X[right_mask], y[right_mask]
    def _split_data_categorical(self, X, y, feature, value):
        # Attempting to split data for categorical features:
        # - X: The feature matrix.
        # - y: The target labels.
        # - feature: The feature to split.
        # - value: The value to split at.
        mask = X[:, feature] == value
        return X[mask], y[mask]
    def predict(self, X):
        # Predicting labels for a set of samples:
        # - X: The feature matrix for prediction.
        return np.array([self._predict_sample(x) for x in X])
    def _predict_sample(self, x):
        # Attempting to predict the label for a single sample:
        # - x: The feature vector for prediction.
        node = self.tree
        while node.children:
            feature_value = x[node.feature]
            if feature_value in node.children:
                node = node.children[feature_value]
            else:
                break
        return node.label if node.label is not None else 0

In [18]:
# Attempting to read the dataset from the 'house_votes.xlsx' file.
# This step is crucial for loading the voting data which will be used for analysis and model training.
data = pd.read_excel("house_votes.xlsx")

# Replacing any '?' symbols, which denote missing values in the dataset, with None (NaN).
# This replacement is essential for proper handling of missing data in subsequent steps.
data = data.replace("?", None)

# Imputing the missing values with the mode (most frequent value) of each column.
# This approach helps in maintaining the dataset integrity by filling in missing data with reasonable estimates.
data = data.fillna(data.mode().iloc[0])

# Data preprocessing steps begin here.
# Converting the "party" column to numerical values to facilitate machine learning algorithms.
# 'democrat' is mapped to 0 and 'republican' to 1, turning a categorical variable into a numerical one.
data["party"] = data["party"].map({"democrat": 0, "republican": 1})

# Mapping "y" (yes) to 1 and "n" (no) to 0 for binary vote columns.
# This transformation is necessary as machine learning models operate on numerical data.
binary_columns = ["vote_1", "vote_2", "vote_3", "vote_4", "vote_5", "vote_6", "vote_7", "vote_8", "vote_9", "vote_10", "vote_11", "vote_12", "vote_13", "vote_14", "vote_15", "vote_16"]
data[binary_columns] = data[binary_columns].replace({"y": 1, "n": 0})

# Splitting the dataset into features (X) and labels (y).
# Features include all columns except "party", and labels are the "party" column.
# This separation is a standard procedure in preparing data for supervised learning.
X = data.drop("party", axis=1).values
y = data["party"].values

# Dividing the dataset into training and testing sets to evaluate model performance on unseen data.
# A test size of 20% is chosen, and a random state is set for reproducibility of results.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initializing a list of different max_depth values to evaluate.
# These depths will help in understanding how the depth of the tree affects the model's performance.
max_depth_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# Preparing a list to record the accuracy of the classifier at each max_depth.
# Accuracy is a fundamental metric to assess the performance of classification models.
accuracies = []

# Looping over the different max_depth values.
# For each depth, the classifier will be trained and its performance evaluated.
for max_depth in max_depth_values:
    # Initializing the ID3DecisionTreeClassifier with the current max_depth.
    # This classifier follows the ID3 algorithm for decision tree construction.
    classifier = ID3DecisionTreeClassifier(max_depth=max_depth)
    
    # Training the classifier with the training data and fitting it to the data.
    classifier.fit(X_train, y_train)
    
    # Making predictions on the test set using the trained classifier.
    y_pred = classifier.predict(X_test)
    
    # Calculating the accuracy of the classifier's predictions on the test set.
    accuracy = accuracy_score(y_test, y_pred)
    
    # Appending the calculated accuracy to the list of accuracies for later analysis.
    accuracies.append(accuracy)

# Printing each max_depth value along with its corresponding accuracy.
# This output helps in comparing the effectiveness of the classifier at different tree depths.
for max_depth, accuracy in zip(max_depth_values, accuracies):
    print(f"Max Depth: {max_depth}, Accuracy: {accuracy}")


Max Depth: 1, Accuracy: 0.7586206896551724
Max Depth: 2, Accuracy: 0.8620689655172413
Max Depth: 3, Accuracy: 0.8620689655172413
Max Depth: 4, Accuracy: 0.9425287356321839
Max Depth: 5, Accuracy: 0.9310344827586207
Max Depth: 6, Accuracy: 0.9310344827586207
Max Depth: 7, Accuracy: 0.9195402298850575
Max Depth: 8, Accuracy: 0.9195402298850575
Max Depth: 9, Accuracy: 0.9195402298850575
Max Depth: 10, Accuracy: 0.9195402298850575


In [21]:
# Importing necessary classes from sklearn for decision tree models and various performance metrics.
# The DecisionTreeClassifier is vital for constructing tree models, while metrics like accuracy, precision, recall, and f1_score are crucial for evaluating their performance.
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Including the time library for tracking the duration of classifier training processes.
import time

# Initializing an instance of a custom ID3DecisionTreeClassifier with a specific maximum depth.
# The max_depth parameter is adjustable, allowing for experimenting with different tree complexities.
the_classifier = ID3DecisionTreeClassifier(max_depth=3)

# Setting up the standard DecisionTreeClassifier from sklearn.
# This classifier is also initialized with a max depth of 3 and uses 'entropy' as the criterion for splitting.
# Both max_depth and criterion are adjustable to suit different modeling needs.
sklearn_classifier = DecisionTreeClassifier(max_depth=3, criterion='entropy')

# Starting the training process for the custom classifier and measuring the time taken.
# This step involves fitting the classifier to the training data, a key process in machine learning.
start_time = time.time()
the_classifier.fit(X_train, y_train)
the_training_time = time.time() - start_time

# Repeating the training process for the sklearn classifier and timing it for efficiency comparison.
start_time = time.time()
sklearn_classifier.fit(X_train, y_train)
sklearn_training_time = time.time() - start_time

# Making predictions with the custom classifier on the test data to assess its performance.
the_predictions = the_classifier.predict(X_test)

# Similarly, making predictions with the sklearn classifier on the test data for performance evaluation.
sklearn_predictions = sklearn_classifier.predict(X_test)

# Evaluating the performance of the custom classifier using metrics like accuracy, precision, recall, and F1 score.
# These metrics provide a comprehensive view of the classifier's effectiveness.
the_accuracy = accuracy_score(y_test, the_predictions)
the_precision = precision_score(y_test, the_predictions)
the_recall = recall_score(y_test, the_predictions)
the_f1 = f1_score(y_test, the_predictions)

# Conducting the same evaluation for the sklearn classifier to compare its performance against the custom one.
sklearn_accuracy = accuracy_score(y_test, sklearn_predictions)
sklearn_precision = precision_score(y_test, sklearn_predictions)
sklearn_recall = recall_score(y_test, sklearn_predictions)
sklearn_f1 = f1_score(y_test, sklearn_predictions)

# Compiling all results into a DataFrame for an organized and comparative view.
# This structure is instrumental in assessing the effectiveness of both classifiers side by side.
results = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'Training Time'],
    'the Classifier': [the_accuracy, the_precision, the_recall, the_f1, the_training_time],
    'Sklearn Classifier': [sklearn_accuracy, sklearn_precision, sklearn_recall, sklearn_f1, sklearn_training_time]
})

# Printing the results for quick inspection and comparison between the custom and sklearn classifiers.
#



In [22]:
# Initializing a DataFrame named 'results_df' with specific columns.
# This DataFrame is intended to systematically store and organize various performance metrics of classifiers.
results_df = pd.DataFrame(columns=[
    'Classifier', 'Max_Depth', 'Training_Size', 
    'Accuracy', 'Precision', 'Recall', 'F1_Score', 'Training_Time'
])

# Defining arrays for 'max_depths' and 'training_sizes' to iterate through.
# These values represent various configurations for the decision tree classifiers being tested.
max_depths = [3, 5, 10]  # Example max_depth values like 3, 5, and 10 to explore varying tree depths.
training_sizes = [0.6, 0.7, 0.8]  # Example training sizes representing 60%, 70%, and 80% of the data.

# Beginning a nested loop: iterating over each combination of depth and training size.
# This comprehensive approach allows for evaluating classifiers under various scenarios.
for depth in max_depths:
    for size in training_sizes:
        # Splitting the dataset according to the current training size.
        # This action creates training and testing datasets for the models.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-size)

        # The code for training and evaluating the custom classifier (the_classifier) goes here.
        # This would typically involve fitting the classifier to the training data and evaluating its performance.

        # Creating a new row for the custom classifier's results and appending it to 'results_df'.
        # This step is crucial for compiling and organizing the performance data of the classifier.
        the_new_row = pd.DataFrame([['theClassifier', depth, size, the_accuracy, the_precision, the_recall, the_f1, the_training_time]], 
                                   columns=results_df.columns)
        results_df = pd.concat([results_df, the_new_row], ignore_index=True)

        # The code for training and evaluating the sklearn classifier should be included here.
        # Similar to the custom classifier, this involves fitting the sklearn model to the training data and then evaluating its performance.

        # Creating a new row for the sklearn classifier's results and appending it to 'results_df'.
        # This step helps in accumulating and organizing the performance data of the sklearn classifier for analysis.
        sklearn_new_row = pd.DataFrame([['Sklearn', depth, size, sklearn_accuracy, sklearn_precision, sklearn_recall, sklearn_f1, sklearn_training_time]], 
                                       columns=results_df.columns)
        results_df = pd.concat([results_df, sklearn_new_row], ignore_index=True)

# Finally, saving the collected results from both classifiers into a CSV file.
# The file 'model_comparison_results.csv' will hold all the performance metrics, serving as a record for comparison and analysis.
# The 'index=False' option ensures that the DataFrame index is not included in the saved CSV file, making the data cleaner.
results_df.to_csv('model_comparison_results.csv', index=False)


  results_df = pd.concat([results_df, the_new_row], ignore_index=True)


In [20]:
results_df

Unnamed: 0,Classifier,Max_Depth,Training_Size,Accuracy,Precision,Recall,F1_Score,Training_Time
0,theClassifier,3,0.6,0.862069,0.756757,0.903226,0.823529,0.015989
1,Sklearn,3,0.6,0.931034,0.931034,0.870968,0.9,0.001965
2,theClassifier,3,0.7,0.862069,0.756757,0.903226,0.823529,0.015989
3,Sklearn,3,0.7,0.931034,0.931034,0.870968,0.9,0.001965
4,theClassifier,3,0.8,0.862069,0.756757,0.903226,0.823529,0.015989
5,Sklearn,3,0.8,0.931034,0.931034,0.870968,0.9,0.001965
6,theClassifier,5,0.6,0.862069,0.756757,0.903226,0.823529,0.015989
7,Sklearn,5,0.6,0.931034,0.931034,0.870968,0.9,0.001965
8,theClassifier,5,0.7,0.862069,0.756757,0.903226,0.823529,0.015989
9,Sklearn,5,0.7,0.931034,0.931034,0.870968,0.9,0.001965
