In [1]:
import numpy as np
import pandas as pd

class ID3DecisionTree:
    """ID3 decision-tree classifier for categorical features.

    The fitted tree is stored in ``self.tree`` as nested dictionaries of the
    form ``{feature: {value: subtree_or_label}}``. A leaf is a bare class
    label, so a tree trained on single-class data is just that label.
    """

    def __init__(self):
        # Nested-dict tree (or a bare label); stays None until fit() is called.
        self.tree = None

    @staticmethod
    def _majority_class(target_col):
        """Return the most frequent value in ``target_col``.

        Ties are resolved in favour of the first value in ``np.unique``'s
        sorted order, because ``np.argmax`` returns the first maximum.
        """
        labels, counts = np.unique(target_col, return_counts=True)
        return labels[np.argmax(counts)]

    def entropy(self, target_col):
        """
        Calculates the entropy (in bits) of the target column.

        Returns 0.0 for an empty column.
        """
        _, counts = np.unique(target_col, return_counts=True)
        if counts.size == 0:
            return 0.0
        probs = counts / counts.sum()
        # np.unique only reports values that occur, so every p > 0 and
        # log2(p) is finite. Adding 0.0 turns a -0.0 result (pure column)
        # into a plain 0.0.
        return float(-np.sum(probs * np.log2(probs))) + 0.0

    def information_gain(self, data, split_attribute_name, target_name):
        """
        Calculates the Information Gain of a specific attribute.
        IG = Total_Entropy - Weighted_Average_Entropy_of_Children

        Args:
            data: DataFrame holding both the attribute and the target column.
            split_attribute_name: column whose values define the children.
            target_name: column holding the class labels.

        Returns:
            The information gain as a float.
        """
        total_entropy = self.entropy(data[target_name])
        vals, counts = np.unique(data[split_attribute_name], return_counts=True)
        weights = counts / counts.sum()

        # Each child is the set of rows sharing one attribute value; its
        # entropy is weighted by the fraction of rows that fall into it.
        weighted_entropy = sum(
            weight * self.entropy(data[data[split_attribute_name] == val][target_name])
            for val, weight in zip(vals, weights)
        )
        return total_entropy - weighted_entropy

    def id3(self, data, original_data, features, target_attribute_name, parent_node_class=None):
        """
        The recursive ID3 algorithm.

        Args:
            data: DataFrame with the rows that reach the current node.
            original_data: the full training DataFrame (last-resort fallback
                label for an empty node).
            features: list of feature column names still available for splits.
            target_attribute_name: name of the class-label column.
            parent_node_class: majority class of the parent node; used as the
                label when ``data`` is empty.

        Returns:
            A class label (leaf) or a ``{feature: {value: subtree}}`` dict.
        """
        target = target_attribute_name

        # The empty check must come first: np.unique of an empty column has
        # no element [0], so the purity test below would raise IndexError.
        if len(data) == 0:
            if parent_node_class is not None:
                return parent_node_class
            return self._majority_class(original_data[target])

        labels = np.unique(data[target])
        if len(labels) <= 1:
            # Pure node: every remaining row carries the same label.
            return labels[0]

        node_majority = self._majority_class(data[target])
        if len(features) == 0:
            # No attribute left to split on: label the leaf with the most
            # common class among the rows that actually reached this node.
            return node_majority

        gains = [self.information_gain(data, feature, target) for feature in features]
        best_feature = features[int(np.argmax(gains))]
        remaining = [feature for feature in features if feature != best_feature]

        branches = {}
        for value in np.unique(data[best_feature]):
            # Plain boolean indexing keeps the column dtypes intact and only
            # filters on the split column (where().dropna() would upcast
            # integers to float and drop rows with NaN in unrelated columns).
            sub_data = data[data[best_feature] == value]
            branches[value] = self.id3(sub_data, original_data, remaining, target, node_majority)

        return {best_feature: branches}

    def fit(self, data, target_attribute_name):
        """
        Wrapper to start the ID3 algorithm; stores the result in ``self.tree``.

        Args:
            data: training DataFrame containing feature columns and the target.
            target_attribute_name: name of the class-label column.

        Raises:
            ValueError: if the target column is missing or ``data`` has no rows.
        """
        if target_attribute_name not in data.columns:
            raise ValueError(f"target column {target_attribute_name!r} not found in data")
        if len(data) == 0:
            raise ValueError("cannot fit a decision tree on an empty dataset")

        features = data.columns.tolist()
        features.remove(target_attribute_name)
        self.tree = self.id3(data, data, features, target_attribute_name)

    def predict(self, query, tree=None):
        """
        Predicts the class for a new query (dictionary format).

        Args:
            query: mapping of feature name -> value for the sample.
            tree: optional (sub)tree to evaluate; defaults to the fitted tree.

        Returns:
            The predicted class label, or the string
            "Unseen Value/Key Error" when the query lacks a needed feature or
            holds a value that has no branch in the tree.

        Raises:
            AttributeError: if no tree is given and the model is not fitted.
        """
        if tree is None:
            tree = self.tree
            if tree is None:
                raise AttributeError("ID3DecisionTree is not fitted yet; call fit() first")

        node = tree
        # Walk down until a leaf is reached. The root itself may already be
        # a leaf when the training data held a single class.
        while isinstance(node, dict):
            feature_name = next(iter(node))
            try:
                node = node[feature_name][query[feature_name]]
            except KeyError:
                return "Unseen Value/Key Error"
        return node

In [2]:
import pprint

# --- Training data: 14 observations, one list per column ---
dataset = {
    'Outlook': [
        'Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Rain', 'Overcast',
        'Sunny', 'Sunny', 'Rain', 'Sunny', 'Overcast', 'Overcast', 'Rain',
    ],
    'Temperature': [
        'Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Cool',
        'Mild', 'Cool', 'Mild', 'Mild', 'Mild', 'Hot', 'Mild',
    ],
    'Humidity': [
        'High', 'High', 'High', 'High', 'Normal', 'Normal', 'Normal',
        'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'High',
    ],
    'Wind': [
        'Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong',
        'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Strong',
    ],
    'PlayTennis': [
        'No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes',
        'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No',
    ],
}
df = pd.DataFrame(dataset)

# --- Build the tree, using 'PlayTennis' as the class label ---
model = ID3DecisionTree()
print("Training Model...")
model.fit(df, target_attribute_name='PlayTennis')

# --- Show the learned tree in its nested-dictionary form ---
print("\nGenerated Decision Tree:")
pprint.pprint(model.tree)

# --- Classify one unseen day: sunny, cool, humid, strong wind ---
new_sample = {'Outlook': 'Sunny', 'Temperature': 'Cool', 'Humidity': 'High', 'Wind': 'Strong'}
prediction = model.predict(new_sample)

print(f"\nPrediction for {new_sample}: {prediction}")

Training Model...

Generated Decision Tree:
{'Outlook': {'Overcast': 'Yes',
             'Rain': {'Wind': {'Strong': 'No', 'Weak': 'Yes'}},
             'Sunny': {'Humidity': {'High': 'No', 'Normal': 'Yes'}}}}

Prediction for {'Outlook': 'Sunny', 'Temperature': 'Cool', 'Humidity': 'High', 'Wind': 'Strong'}: No
