In [2]:
import pandas as pd
import numpy as np
# Toy training set, one tuple per example. Every value is a string (including
# the 'True'/'False' entries of a1); 'Classification' is the target column.
df = pd.DataFrame(
    [
        ('True',  'Hot',  'High',   'No'),
        ('True',  'Hot',  'High',   'No'),
        ('False', 'Hot',  'High',   'Yes'),
        ('False', 'Cool', 'Normal', 'Yes'),
        ('False', 'Cool', 'Normal', 'Yes'),
        ('True',  'Cool', 'High',   'No'),
        ('True',  'Hot',  'High',   'No'),
        ('True',  'Hot',  'Normal', 'Yes'),
        ('False', 'Cool', 'Normal', 'Yes'),
        ('False', 'Cool', 'High',   'Yes'),
    ],
    columns=['a1', 'a2', 'a3', 'Classification'],
)
print(df)


      a1    a2      a3 Classification
0   True   Hot    High             No
1   True   Hot    High             No
2  False   Hot    High            Yes
3  False  Cool  Normal            Yes
4  False  Cool  Normal            Yes
5   True  Cool    High             No
6   True   Hot    High             No
7   True   Hot  Normal            Yes
8  False  Cool  Normal            Yes
9  False  Cool    High            Yes


In [3]:
# Base node class that represents a node of the tree
class Node:
    """Internal (decision) node of a decision tree.

    A node tests a single attribute, named by ``label``. ``children`` maps
    each value of that attribute to either another ``Node`` (a subtree) or a
    plain class label (a leaf).
    """

    def __init__(self, label):
        """Create a node that tests the attribute called ``label``."""
        self.label = label   # name of the attribute tested at this node
        self.children = {}   # attribute value -> Node (subtree) or leaf label

    def display(self, space=''):
        """Print the subtree rooted at this node as an indented outline.

        Args:
            space: Indentation prefix for this node's own line. Branch values
                are printed three spaces deeper, and their subtree or leaf
                three spaces deeper again.
        """
        # str() lets non-string labels (e.g. integer column names) be shown
        # instead of failing with AttributeError on ``.upper()``.
        print(f"{space}{str(self.label).upper()}:")
        indented = space + ' ' * 3

        for val, child in self.children.items():
            print(f"{indented}{val}:")
            if isinstance(child, Node):
                child.display(indented + ' ' * 3)
            else:
                print(f"{indented + ' ' * 3}{child}")

    def add_child(self, attr, child):
        """Attach ``child`` (a ``Node`` or a leaf label) under branch value ``attr``.

        An existing branch with the same value is replaced.
        """
        self.children[attr] = child

    def predict(self, instance):
        """Classify ``instance`` by following matching branches down to a leaf.

        Args:
            instance: Object indexable by attribute name (for example a dict)
                that yields the instance's value for each tested attribute.

        Returns:
            The leaf label reached.

        Raises:
            KeyError: If ``instance`` has no entry for this node's attribute,
                or if its value has no branch at this node.
        """
        value = instance[self.label]
        try:
            child = self.children[value]
        except KeyError:
            # Same exception type as before, but say which attribute and which
            # value could not be routed instead of only echoing the value.
            raise KeyError(
                f"attribute {self.label!r} has no branch for value {value!r}; "
                f"known values: {list(self.children)}"
            ) from None

        if isinstance(child, Node):
            return child.predict(instance)
        return child

In [4]:
# Base Decision Tree Classifier Class
class DecisionTree:
    """Decision tree classifier that splits on the attribute with the highest
    information gain.

    The tree is built immediately on construction and stored in ``self.tree``
    as a ``Node``. ``data`` is used as a pandas DataFrame (column selection,
    ``value_counts`` and ``unique`` are called on it).

    Args:
        data: Training table containing the attribute columns and the target.
        xlabels: Names of the attribute columns that may be split on.
        ylabel: Name of the target (class) column.
        verbose: When True (the default), print the information gains, each
            sub-table and its entropy while the tree is being built.

    Raises:
        ValueError: If ``xlabels`` is empty.
    """

    def __init__(self, data, xlabels, ylabel, verbose=True):
        self.data = data
        self.xlabels = xlabels
        self.ylabel = ylabel
        # Must be set before build_tree runs, because build_tree reads it.
        self.verbose = verbose
        self.tree = self.build_tree(self.data, self.xlabels)

    # Function to get the entropy of a table
    def get_entropy(self, data):
        """Return the entropy (in bits) of the target column of ``data``.

        An empty table has entropy 0.
        """
        entropy = 0
        n = len(data)
        frequencies = list(data[self.ylabel].value_counts())

        for freq in frequencies:
            # value_counts() reports unused categories of a categorical column
            # with a count of 0; 0 * log2(0) would be nan, so skip them.
            if freq == 0:
                continue
            entropy -= freq / n * np.log2(freq / n)

        return entropy

    # Function to get information gain of an attribute in a table
    def get_info_gain(self, data, label):
        """Return the information gain obtained by splitting ``data`` on ``label``.

        This is the entropy of ``data`` minus the size-weighted entropy of the
        sub-tables produced by each value of ``label``.
        """
        entropy = self.get_entropy(data)
        sub_entropy = 0
        n = len(data)

        for sublabel, freq in data[label].value_counts().to_dict().items():
            if freq == 0:
                # Unused category: contributes nothing to the weighted sum.
                continue
            ent = self.get_entropy(data[data[label] == sublabel])
            sub_entropy += (freq / n * ent)

        return entropy - sub_entropy

    def _majority_label(self, data):
        """Return the most frequent target value in ``data``.

        Ties are resolved by taking the first entry of ``Series.mode()``.
        """
        return data[self.ylabel].mode().iloc[0]

    # Function to build the decision tree
    def build_tree(self, data, xlabels):
        """Recursively build the (sub)tree for ``data`` using ``xlabels``.

        Args:
            data: The (sub-)table to split.
            xlabels: Attribute names still available for splitting.

        Returns:
            The ``Node`` at the root of the built subtree.

        Raises:
            ValueError: If ``xlabels`` is empty.
        """
        if len(xlabels) == 0:
            raise ValueError("xlabels must contain at least one attribute to split on")

        # Fall back to True so instances created without __init__ keep the
        # original (printing) behaviour.
        verbose = getattr(self, 'verbose', True)

        # Select the attribute with the highest info gain
        info_gain = [(xlabel, self.get_info_gain(data, xlabel)) for xlabel in xlabels]

        if verbose:
            print(info_gain)
        this_attr = max(info_gain, key=lambda e: e[1])[0]

        node = Node(this_attr)
        # Attributes left for the subtrees; invariant across branches.
        remaining = [l for l in xlabels if l != this_attr]

        # Build the subtrees with the highest info gain attribute as the root node
        # If entropy of any sub-table is 0, then the tree is terminated at that sub-table
        # value
        for val in data[this_attr].unique():
            trunc_data = data[data[this_attr] == val]
            if verbose:
                print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
                print(trunc_data)
                print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
            entropy = self.get_entropy(trunc_data)
            if verbose:
                print("------------------------------------------")
                print("Entropy of Above Set:",entropy)
                print("------------------------------------------")

            if entropy == 0:
                node.add_child(val, trunc_data[self.ylabel].iloc[0])
            elif not remaining:
                # Still impure but no attribute is left to split on (rows with
                # identical attributes and different classes): stop with a
                # majority-class leaf instead of recursing into max([]).
                node.add_child(val, self._majority_label(trunc_data))
            else:
                subtree = self.build_tree(trunc_data, remaining)
                node.add_child(val, subtree)

        return node

    def predict(self, instance):
        """Return the class label the tree assigns to ``instance``.

        ``instance`` is passed unchanged to the root node's ``predict``.
        """
        return self.tree.predict(instance)


In [5]:
import numpy as np
# Fit the tree on the toy dataset: split on a1/a2/a3 to predict 'Classification'.
# The tree is built inside the constructor, which also prints the build trace.
clf = DecisionTree(
    data=df,
    xlabels=['a1', 'a2', 'a3'],
    ylabel='Classification',
)

[('a1', 0.6099865470109874), ('a2', 0.12451124978365313), ('a3', 0.4199730940219749)]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     a1    a2      a3 Classification
0  True   Hot    High             No
1  True   Hot    High             No
5  True  Cool    High             No
6  True   Hot    High             No
7  True   Hot  Normal            Yes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
------------------------------------------
Entropy of Above Set: 0.7219280948873623
------------------------------------------
[('a2', 0.07290559532005603), ('a3', 0.7219280948873623)]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     a1    a2    a3 Classification
0  True   Hot  High             No
1  True   Hot  High             No
5  True  Cool  High             No
6  True   Hot  High             No
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
------------------------------------------
Entropy of Above Set: 0.0
------------------------------------------
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     

In [7]:
# Print the learned tree as an indented outline: attribute names in upper case,
# with each branch value and its subtree or leaf label indented beneath it.
clf.tree.display()

A1:
   True:
      A3:
         High:
            No
         Normal:
            Yes
   False:
      Yes
