# Implementation of a decision tree classifier

Imports

In [14]:
import numpy as np
from collections import Counter
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
import time

from src.calculations import find_best_split
from src.tree_node import TreeNode
from src.util import tree_to_json

## Learning Algorithm

In [15]:
""""
    Recursively implemented algorithm for "learning" / creating a decision tree

    Parameters:
    X (2D Array): Feature matrix
    y (Array): Class labels
    impurity_measure (String): Type of impurity measure to use ('gini' / 'entropy')
    prune (bool, optional): True or false regarding the question to prune the tree. By default False is set.
    prune_data (tuple, optional): Validation data for pruning. By default None is set.

    Return:
    Root node of the decision tree (Type: TreeNode)
"""


def learn(X, y, impurity_measure, prune=False, prune_data=None):
    # If all labels in the current node are the same (pure node)
    if len(np.unique(y)) == 1:
        # Returning leaf node with the class label
        leaf = TreeNode()
        leaf.label = y[0]
        return leaf
    # if all features have the same value
    if np.all(X[:, 0] == X[:, 0][0]):
        # Returning leaf node with the class label being the most frequent class in the current node.
        leaf = TreeNode()
        leaf.label = Counter(y).most_common(1)[0][0]
        return leaf
    # If not stopped, Calculating best split for decision tree based on information gain
    best_feature_index, best_threshold = find_best_split(X, y, impurity_measure)

    if best_feature_index is None:
        leaf = TreeNode()
        leaf.label = Counter(y).most_common(1)[0][0]
        return leaf
    # Creating tree node on the basis of calculated best feature and threshold
    node = TreeNode()
    node.feature_index = best_feature_index
    node.threshold = best_threshold
    # Splitting data in left and right subtree
    left_mask = X[:, best_feature_index] <= best_threshold
    right_mask = ~left_mask
    # Recursive call of functions for left and right subtrees for creating child nodes
    node.left = learn(X[left_mask], y[left_mask], impurity_measure, prune, prune_data)
    node.right = learn(
        X[right_mask], y[right_mask], impurity_measure, prune, prune_data
    )
    # When pruning is enabled
    if prune:
        # Comparing accuracy of node before pruning
        majority_class = Counter(y).most_common(1)[0][0]
        accuracy_before_prune = np.mean(predict(X, node) == y)

        node_label = node.label if node.label is not None else majority_class
        # Comparing accuracy of node after pruning
        node_accuracy = np.mean(predict(prune_data[0], node) == prune_data[1])
        #  If pruning improves accuracy or maintains the same accuracy, the node is pruned
        if node_accuracy >= accuracy_before_prune:
            #  Making it like a leaf note
            node.left = None
            node.right = None
            node.label = node_label
    # returns the constructed tree with the root node (-> which recursively defines the decision tree)
    return node

In [16]:
"""
    Predicting the class label for a single sample using the decision tree.
    The decision tree is recursively traversated, until leaf node is reached.
    Parameters:
    node (TreeNode): Current node in the decision tree.
    x (Array): Feature values for a single sample.

    Return:
    Predicted class label (Type: float)
"""


def predict_single(node, x):
    # When leaf node is reached
    if node.label is not None:
        # Return class label
        return node.label
    # Recursive Traversation of tree based on feature value in comparison with the threshold of the node
    if x[node.feature_index] <= node.threshold:
        return predict_single(node.left, x)
    else:
        return predict_single(node.right, x)


"""
    Predicting  class labels for multiple samples using the decision tree.

    Parameters:
    X (2D Array): Feature matrix for multiple samples.
    tree (TreeNode): Root node of the decision tree.

    Return:
    Predicted class labels for the input samples (Type: Array)
"""


def predict(X, tree):
    # Recursive Traversation of tree till leaf is reached and return of array of predicted class labels
    return np.array([predict_single(tree, x) for x in X])



In [17]:

# Load the dataset
wine_data = pd.read_csv("./data/wine_dataset.csv")
# X = wine_data[['citric acid', 'residual sugar', 'pH', 'sulphates', 'alcohol']].values
X = wine_data.iloc[:, :-1].values
# y = wine_data['type'].values
y = wine_data.iloc[:, -1].values


# Split the data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Train and evaluate our custom decision tree (entropy with pruning)
start_time = time.time()
# tree_custom = learn(X_train, y_train, impurity_measure='gini', prune=True, prune_data=(X_val, y_val))
# tree_custom = learn(X_train, y_train, impurity_measure='gini', prune=False, prune_data=None)
tree_custom = learn(
    X_train, y_train, impurity_measure="entropy", prune=True, prune_data=(X_val, y_val)
)
# tree_custom = learn(X_train, y_train, impurity_measure='entropy', prune=False, prune_data=None)
y_val_pred_custom = predict(X_val, tree_custom)
custom_accuracy = accuracy_score(y_val, y_val_pred_custom)
custom_time = time.time() - start_time

# Train and evaluate scikit-learn's DecisionTreeClassifier
start_time = time.time()
tree_sklearn = DecisionTreeClassifier(criterion="gini", random_state=42)
tree_sklearn.fit(X_train, y_train)
y_val_pred_sklearn = tree_sklearn.predict(X_val)
# Accuracy calculation realized by percentage-wise match of input arrays
sklearn_accuracy = accuracy_score(y_val, y_val_pred_sklearn)
sklearn_time = time.time() - start_time

# Print the results
print("Custom Decision Tree:")
print("Accuracy:", custom_accuracy)
print("Time:", custom_time)

print("\nScikit-Learn Decision Tree:")
print("Accuracy:", sklearn_accuracy)
print("Time:", sklearn_time)


# Retrieve feature names from the dataset
# feature_names = wine_data[['citric acid', 'residual sugar', 'pH', 'sulphates', 'alcohol']].columns.tolist()
feature_names = wine_data.columns[:-1].tolist()

# Visualize the custom decision tree as JSON
custom_tree_json = tree_to_json(tree_custom, feature_names)
# print("Custom Decision Tree (JSON format):")
# print(custom_tree_json)


Custom Decision Tree:
Accuracy: 0.875
Time: 1.101304054260254

Scikit-Learn Decision Tree:
Accuracy: 0.8729166666666667
Time: 0.003976106643676758
