The code below builds a decision tree from the training data and computes its accuracy on the test data.

File Paths: 
- training data: data/mush_train.data
- test data: data/mush_test.data

In [2]:
import numpy as np

#Tree Node Class
class TreeNode:
    """A single node of the decision tree.

    The same class represents both kinds of node; the constructor simply
    stores its five arguments as attributes of the same name:

    attribute_index -- column index the node splits on, or None
    value_path      -- attribute value on the branch leading to this node
    children        -- list of child TreeNode objects, or None
    info_gain       -- information gain of the split, or None
    class_value     -- class label attached to the node, or None
    """

    def __init__(self, attribute_index, value_path, children, info_gain, class_value):
        #fields describing a split
        self.attribute_index = attribute_index
        self.value_path = value_path  #set on every node except the root
        self.children = children
        self.info_gain = info_gain

        #field describing a classification
        self.class_value = class_value

#Decision Tree Class
class DecisionTree():
    """Decision tree over categorical attributes, grown by information gain.

    create_decision_tree repeatedly splits on the attribute with the highest
    information gain and creates one child per distinct value of that
    attribute. The node produced by the top-level call (counter == 0) is
    kept in self.root.
    """

    def __init__(self):
        self.root = None

    #check if label set is pure
    def is_pure(self, label):
        """Return True when label contains exactly one distinct class."""
        return len(np.unique(label)) == 1

    #classify label
    def classify(self, label):
        """Return the most frequent class in label.

        np.unique returns the classes sorted and argmax picks the first
        maximum, so ties go to the class that sorts first.
        """
        classes, class_count = np.unique(label, return_counts=True)
        return classes[class_count.argmax()]

    #split dataset by attribute value
    def split(self, dataset, label, best_attribute_index, value):
        """Return the rows of dataset (and their labels) whose attribute
        in column best_attribute_index equals value."""
        row_mask = dataset[:, best_attribute_index] == value
        return dataset[row_mask], label[row_mask]

    #compute entropy
    def calculate_entropy(self, label):
        """Return the Shannon entropy (base 2) of the class distribution in label.

        An empty label yields 0.
        """
        _, class_count = np.unique(label, return_counts=True)
        total_count = np.sum(class_count)

        entropy_value = 0
        for count in class_count:
            proportion = count / total_count
            entropy_value += (-proportion) * np.log2(proportion)

        return entropy_value

    #compute conditional entropy
    def calculate_conditional_entropy(self, attribute_set, label):
        """Return the entropy of label given the attribute column attribute_set.

        This is the sum, over each distinct attribute value, of the fraction
        of rows having that value times the entropy of those rows' labels.
        """
        values, value_count = np.unique(attribute_set, return_counts=True)
        total_count = np.sum(value_count)

        conditional_entropy_value = 0
        #iterate through each value of the attribute
        for attribute_value, count in zip(values, value_count):
            #labels of the rows that carry this attribute value
            condition_set = label[attribute_set == attribute_value]
            conditional_entropy_value += (count / total_count) * self.calculate_entropy(condition_set)

        return conditional_entropy_value

    #remember the node of the top-level call as the tree root
    def _register(self, node, counter):
        """Store node in self.root when counter is 0, then return node."""
        if counter == 0:
            self.root = node
        return node

    #create decision tree
    def create_decision_tree(self, dataset, label, value, counter):
        """Recursively build the (sub)tree for dataset/label and return its node.

        Parameters:
            dataset: 2-D array, one row per sample, one column per attribute.
            label: 1-D array of class labels aligned with the rows of dataset.
            value: attribute value on the branch leading to this node
                (None for the top-level call).
            counter: recursion depth; 0 marks the top-level call, whose
                resulting node is also stored in self.root.

        Returns:
            A TreeNode. Leaves have attribute_index None and carry the class
            in class_value. Decision nodes carry the split column in
            attribute_index, their children, the information gain of the
            split, and the majority class of the rows that reached them in
            class_value.

        Raises:
            ValueError: if label is empty.

        Side effects:
            Prints the chosen attribute index and its information gain for
            every non-pure node visited.
        """
        if len(label) == 0:
            raise ValueError("cannot build a decision tree from an empty label set")

        #base case: all remaining samples share one class
        if self.is_pure(label):
            return self._register(TreeNode(None, value, None, None, self.classify(label)), counter)

        #recursive step
        max_info_gain = 0
        best_attribute_index = 0

        current_entropy = self.calculate_entropy(label)

        #iterate through all attributes to find the one with most information gain
        for i in range(dataset.shape[1]):
            info_gain = current_entropy - self.calculate_conditional_entropy(dataset[:, i], label)

            #">=" means that among equally good attributes the last one wins
            if info_gain >= max_info_gain:
                max_info_gain = info_gain
                best_attribute_index = i

        print("best_attribute_index:", best_attribute_index)
        print("max info gain:", max_info_gain)

        majority_class = self.classify(label)

        #terminating criteria: no attribute separates the classes any further
        if max_info_gain == 0:
            return self._register(TreeNode(None, value, None, None, majority_class), counter)

        #decision node; the majority class is kept so that a consumer has an
        #answer available when none of the child branches applies
        current_node = self._register(
            TreeNode(best_attribute_index, value, [], max_info_gain, majority_class),
            counter,
        )

        #one child per distinct value of the chosen attribute
        for attribute_value in np.unique(dataset[:, best_attribute_index]):
            split_dataset, split_label = self.split(dataset, label, best_attribute_index, attribute_value)

            #compute children on current node recursively
            if len(split_label) == 0:
                continue
            child = self.create_decision_tree(split_dataset, split_label, attribute_value, counter + 1)
            current_node.children.append(child)

        return current_node


#read file and store data
def read_data(filename):
    """Read a comma-separated data file into attribute and label arrays.

    Each non-blank line holds the class label in its first field followed by
    the attribute values; fields 1 through 22 are kept as attributes. Blank
    lines are skipped.

    Parameters:
        filename: path of the file to read (decoded as UTF-8; a leading
            byte-order mark is dropped by the 'utf-8-sig' codec).

    Returns:
        (X, Y): X is an array with one row of attribute values per data line,
        Y is the array of class labels in the same order.
    """
    attribute_rows = []
    class_labels = []

    #the context manager closes the file even if parsing raises
    with open(filename, 'r', encoding='utf-8-sig') as file:
        for line in file:
            stripped = line.strip()
            if not stripped:
                #a blank line would otherwise add an empty, ragged row
                continue
            fields = stripped.split(',')
            class_labels.append(fields[0])
            attribute_rows.append(fields[1:23])

    X = np.array(attribute_rows)
    Y = np.array(class_labels)

    return X, Y

#print tree
def print_tree(node, level=0):
    """Print the subtree rooted at node, indented two spaces per level.

    A decision node prints the index of the attribute it splits on, followed
    by each of its children one level deeper. A leaf (attribute_index is
    None) prints its class value.

    Parameters:
        node: the tree node to print.
        level: current depth, used only to compute the indentation.
    """
    indent = "  " * level

    #leaf node
    if node.attribute_index is None:
        print(indent, node.class_value)
        return

    #decision node
    print(indent, node.attribute_index)
    for child in node.children:
        print_tree(child, level + 1)

#prediction 
def prediction(node, x):
    """Classify sample x by walking the tree from node downwards.

    At every decision node the child whose value_path equals the sample's
    value for the node's attribute is followed. The walk stops at a leaf, or
    at a decision node that has no child for the sample's value (for example
    a value that never reached that node while the tree was built).

    Parameters:
        node: the node to start from, normally the root of the tree.
        x: the sample, indexable by attribute index.

    Returns:
        The class_value of the node where the walk stopped. This is None if
        the walk stopped at a node that carries no class value.
    """
    current = node

    #descend while the current node is a decision node
    while current.attribute_index is not None:
        observed = x[current.attribute_index]
        matching_child = next(
            (child for child in current.children if observed == child.value_path),
            None,
        )
        if matching_child is None:
            #no branch for this value: answer with what this node itself holds
            break
        current = matching_child

    return current.class_value


#main(): train a tree, print it, then score it on the test file
#train data: read_data returns (attribute rows, class labels)
train_X, train_Y = read_data("data/mush_train.data")

#create decision tree using training data
#value=None and counter=0 mark this as the top-level call
training_tree = DecisionTree()
tree = training_tree.create_decision_tree(train_X, train_Y, None, 0)

#print tree (attribute indices for decision nodes, class values for leaves)
print()
print("Decision tree:")
print_tree(tree)

#test data, read with the same layout as the training file
X, Y = read_data("data/mush_test.data")
correct_predictions = 0

#compute accuracy: count the test rows whose predicted class equals the true class
for x, y in zip(X, Y): 
    class_label = prediction(tree, x)

    if class_label == y: #correct classification
        correct_predictions += 1
    
#accuracy as a percentage of all test rows
#NOTE: X.shape[0] is 0 for an empty test file, which makes this division raise ZeroDivisionError
accuracy = (correct_predictions / X.shape[0]) * 100
print("\naccuracy:", accuracy, "%")

best_attribute_index: 4
max info gain: 0.8593408948415395
best_attribute_index: 19
max info gain: 0.028289904575436464
best_attribute_index: 21
max info gain: 0.5042359749194675
best_attribute_index: 20
max info gain: 0.7219280948873623

Decision tree:
 4
   e
   p
   p
   e
   19
     e
     e
     e
     p
     21
       p
       20
         p
         e
       e
   p
   p
   p

accuracy: 84.0937114673243 %
