In [None]:
import pandas as pd
import numpy as np
# Toy drug-prescription dataset: one row per patient, 'Drug' is the target.
data = {
    'Age': [34, 53, 46, 32, 39, 15, 73, 58, 50, 23, 50, 66, 37, 68, 23],
    'Sex': ['M', 'M', 'M', 'M', 'F', 'M', 'F', 'F', 'M', 'M', 'F', 'F', 'F', 'F', 'M'],
    'BP': ['NORMAL', 'NORMAL', 'HIGH', 'LOW', 'NORMAL', 'NORMAL', 'NORMAL', 'HIGH', 'NORMAL', 'NORMAL', 'NORMAL', 'NORMAL', 'HIGH', 'LOW', 'NORMAL'],
    'Cholesterol': ['HIGH', 'NORMAL', 'NORMAL', 'NORMAL', 'NORMAL', 'HIGH', 'HIGH', 'NORMAL', 'NORMAL', 'HIGH', 'NORMAL', 'NORMAL', 'HIGH', 'HIGH', 'NORMAL'],
    'Na_to_K': [14.133, 7.285, 9.445, 13.938, 9.709, 9.084, 19.221, 14.239, 15.79, 12.26, 12.295, 8.107, 13.091, 10.291, 31.686],
    'Drug': ['drugX', 'drugX', 'drugY', 'drugX', 'drugX', 'drugX', 'drugY', 'drugY', 'drugY', 'drugX', 'drugX', 'drugX', 'drugY', 'drugY', 'drugY']
}

df = pd.DataFrame(data)

# Relabel 'drugX' as 'drugC'. The result is assigned back to the column rather
# than calling replace(..., inplace=True) on df['Drug']: the in-place form acts
# on a temporary Series, which warns on pandas 2.x and leaves df unchanged once
# Copy-on-Write is enabled.
df['Drug'] = df['Drug'].replace('drugX', 'drugC')

# Discretise the two numeric columns. With right=False every bin is
# left-closed, i.e. [0, 30), [30, 50), [50, inf) for age.
age_bins = [0, 30, 50, float('inf')]
age_labels = ['Young', 'Adult', 'Elder']

na_to_k_bins = [0, 15, 20, float('inf')]
na_to_k_labels = ['A', 'B', 'C']

df['Age_Range'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels, right=False)
df['Na_to_K_Range'] = pd.cut(df['Na_to_K'], bins=na_to_k_bins, labels=na_to_k_labels, right=False)

# The raw numeric columns are superseded by their binned versions.
df = df.drop(columns=['Age', 'Na_to_K'])

# Keep the target column last so the features read left to right.
df = df[[col for col in df.columns if col != 'Drug'] + ['Drug']]

print(df)


   Sex      BP Cholesterol Age_Range Na_to_K_Range   Drug
0    M  NORMAL        HIGH     Adult             A  drugC
1    M  NORMAL      NORMAL     Elder             A  drugC
2    M    HIGH      NORMAL     Adult             A  drugY
3    M     LOW      NORMAL     Adult             A  drugC
4    F  NORMAL      NORMAL     Adult             A  drugC
5    M  NORMAL        HIGH     Young             A  drugC
6    F  NORMAL        HIGH     Elder             B  drugY
7    F    HIGH      NORMAL     Elder             A  drugY
8    M  NORMAL      NORMAL     Elder             B  drugY
9    M  NORMAL        HIGH     Young             A  drugC
10   F  NORMAL      NORMAL     Elder             A  drugC
11   F  NORMAL      NORMAL     Elder             A  drugC
12   F    HIGH        HIGH     Adult             A  drugY
13   F     LOW        HIGH     Elder             A  drugY
14   M  NORMAL      NORMAL     Young             C  drugY


In [None]:
# Base node class that represents a node of the tree
class Node:
    """Internal node of a decision tree that splits on a single attribute.

    Attributes:
        label: Name of the attribute this node tests.
        children: Mapping from an attribute value to either another ``Node``
            (a subtree) or a leaf value (the predicted class).
    """

    def __init__(self, label):
        self.label = label
        self.children = {}

    def display(self, space=''):
        """Print the subtree rooted at this node as an indented outline.

        Args:
            space: Indentation prefix for this node's own line; every nested
                level is indented by three further spaces.
        """
        # str() lets non-string labels (e.g. integer column names) be printed.
        print(f"{space}{str(self.label).upper()}:")
        indented = space + ' ' * 3

        for val, child in self.children.items():
            print(f"{indented}{val}:")
            if isinstance(child, Node):
                child.display(indented + ' ' * 3)
            else:
                print(f"{indented + ' ' * 3}{child}")

    def add_child(self, attr, child):
        """Attach ``child`` (a ``Node`` or a leaf value) under attribute value ``attr``."""
        self.children[attr] = child

    def predict(self, instance):
        """Follow the branches matching ``instance`` and return the leaf reached.

        Args:
            instance: Mapping from attribute name to value (anything indexable
                by attribute name).

        Returns:
            The leaf value stored at the end of the matching path.

        Raises:
            KeyError: If ``instance`` lacks this node's attribute, or its value
                has no branch under this node.
        """
        try:
            value = instance[self.label]
        except KeyError:
            raise KeyError(
                f"instance is missing attribute {self.label!r}"
            ) from None

        if value not in self.children:
            raise KeyError(
                f"no branch for {self.label}={value!r}; "
                f"known values: {list(self.children)}"
            )

        child = self.children[value]
        if isinstance(child, Node):
            return child.predict(instance)
        return child

In [None]:
# Base Decision Tree Classifier Class
class DecisionTree:
    """Information-gain (ID3-style) decision tree over categorical attributes.

    The tree is built eagerly in the constructor and stored in ``self.tree``.

    Args:
        data: DataFrame holding both the attribute columns and the target.
        xlabels: Names of the attribute columns that may be split on.
        ylabel: Name of the target column.
        verbose: When True (the default) the gains, partitions and entropies
            examined while building the tree are printed.
    """

    def __init__(self, data, xlabels, ylabel, verbose=True):
        self.data = data
        self.xlabels = xlabels
        self.ylabel = ylabel
        self.verbose = verbose
        self.tree = self.build_tree(self.data, self.xlabels)

    # Function to get the entropy of a table
    def get_entropy(self, data):
        """Return the Shannon entropy (base 2) of the target column in ``data``.

        An empty table has entropy 0.
        """
        entropy = 0
        n = len(data)
        if n == 0:
            return entropy

        frequencies = list(data[self.ylabel].value_counts())

        for freq in frequencies:
            # A categorical target reports unobserved categories with a count
            # of 0; they contribute nothing, and 0 * log2(0) would be NaN.
            if freq == 0:
                continue
            entropy -= freq / n * np.log2(freq / n)

        return entropy

    # Function to get information gain of an attribute in a table
    def get_info_gain(self, data, label):
        """Return the information gain from splitting ``data`` on ``label``.

        This is the entropy of ``data`` minus the size-weighted entropy of the
        partitions induced by each value of ``label``.
        """
        entropy = self.get_entropy(data)
        sub_entropy = 0
        n = len(data)

        for sublabel, freq in data[label].value_counts().to_dict().items():
            # Unobserved categories of a categorical attribute have weight 0.
            if freq == 0:
                continue
            ent = self.get_entropy(data[data[label] == sublabel])
            sub_entropy += (freq / n * ent)

        return entropy - sub_entropy

    # Function to build the decision tree
    def build_tree(self, data, xlabels):
        """Recursively build and return the subtree for ``data``.

        Args:
            data: Rows belonging to the current subtree.
            xlabels: Attributes still available for splitting.

        Returns:
            A ``Node`` splitting on the attribute with the highest information
            gain. Each child is a leaf value or a nested ``Node``.

        Raises:
            ValueError: If ``xlabels`` is empty.
        """
        xlabels = list(xlabels)
        if not xlabels:
            raise ValueError("build_tree needs at least one attribute in xlabels")

        # Select the attribute with the highest info gain
        info_gain = [(xlabel, self.get_info_gain(data, xlabel)) for xlabel in xlabels]

        if self.verbose:
            print(info_gain)
        this_attr = max(info_gain, key=lambda e: e[1])[0]

        node = Node(this_attr)
        remaining = [l for l in xlabels if l != this_attr]

        # Build the subtrees with the highest info gain attribute as the root node
        # If entropy of any sub-table is 0, then the tree is terminated at that sub-table
        # value
        for val in data[this_attr].unique():
            trunc_data = data[data[this_attr] == val]
            if trunc_data.empty:
                # e.g. a NaN attribute value never equals itself; nothing to learn.
                continue

            entropy = self.get_entropy(trunc_data)
            if self.verbose:
                print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
                print(trunc_data)
                print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
                print("------------------------------------------")
                print("Entropy of Above Set:",entropy)
                print("------------------------------------------")

            if entropy == 0:
                node.add_child(val, trunc_data[self.ylabel].iloc[0])
            elif not remaining:
                # Attributes are exhausted but the rows still disagree
                # (identical features, different targets): fall back to the
                # most frequent class instead of recursing with nothing to
                # split on.
                node.add_child(val, trunc_data[self.ylabel].value_counts().idxmax())
            else:
                subtree = self.build_tree(trunc_data, remaining)
                node.add_child(val, subtree)

        return node

    def predict(self, instance):
        """Return the class predicted for ``instance`` (attribute name -> value).

        Lookup errors raised while walking the tree (e.g. ``KeyError`` for an
        attribute value with no branch) propagate to the caller.
        """
        return self.tree.predict(instance)


In [None]:
import numpy as np
# Fit the tree on every engineered feature column, with 'Drug' as the target.
feature_columns = ['Sex', 'BP', 'Cholesterol', 'Age_Range', 'Na_to_K_Range']
clf = DecisionTree(df, feature_columns, 'Drug')

[('Sex', 0.02798703360567112), ('BP', 0.2759310324945081), ('Cholesterol', 0.0021459960787032495), ('Age_Range', 0.029709136869865094), ('Na_to_K_Range', 0.2621549647380449)]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   Sex      BP Cholesterol Age_Range Na_to_K_Range   Drug
0    M  NORMAL        HIGH     Adult             A  drugC
1    M  NORMAL      NORMAL     Elder             A  drugC
4    F  NORMAL      NORMAL     Adult             A  drugC
5    M  NORMAL        HIGH     Young             A  drugC
6    F  NORMAL        HIGH     Elder             B  drugY
8    M  NORMAL      NORMAL     Elder             B  drugY
9    M  NORMAL        HIGH     Young             A  drugC
10   F  NORMAL      NORMAL     Elder             A  drugC
11   F  NORMAL      NORMAL     Elder             A  drugC
14   M  NORMAL      NORMAL     Young             C  drugY
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
------------------------------------------
Entropy of Above Set: 0.8812908992306927
-----------------

In [None]:
# Print the fitted tree as an indented outline: attribute -> value -> subtree or leaf.
clf.tree.display()

BP:
   NORMAL:
      NA_TO_K_RANGE:
         A:
            drugC
         B:
            drugY
         C:
            drugY
   HIGH:
      drugY
   LOW:
      SEX:
         M:
            drugC
         F:
            drugY


In [None]:
# Classify a single patient described by the same binned features used for fitting.
patient = {
    'Sex': 'F',
    'BP': 'HIGH',
    'Cholesterol': 'HIGH',
    'Age_Range': 'Young',
    'Na_to_K_Range': 'C',
}
clf.predict(patient)

'drugY'