In [1]:
import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


In [2]:
# Training set written one record per line, then transposed into the
# column-oriented dict of lists that the DataFrame constructor consumes.
COLUMNS = ("RID", "age", "income", "student", "credit_rating", "buys_computer")
RECORDS = [
    (1, "youth", "high", "no", "fair", "no"),
    (2, "youth", "high", "no", "excellent", "no"),
    (3, "middle_aged", "high", "no", "fair", "yes"),
    (4, "senior", "medium", "no", "fair", "yes"),
    (5, "senior", "low", "no", "fair", "yes"),
    (6, "senior", "low", "yes", "excellent", "no"),
    (7, "middle_aged", "low", "yes", "excellent", "yes"),
    (8, "youth", "medium", "yes", "fair", "yes"),
    (9, "youth", "low", "yes", "fair", "yes"),
    (10, "senior", "medium", "yes", "fair", "yes"),
    (11, "youth", "medium", "yes", "excellent", "yes"),
    (12, "middle_aged", "medium", "no", "excellent", "yes"),
    (13, "middle_aged", "high", "yes", "fair", "yes"),
    (14, "senior", "medium", "no", "excellent", "no"),
]
data = {column: list(values) for column, values in zip(COLUMNS, zip(*RECORDS))}

In [3]:
# Turn the column dict into a DataFrame and show every row under a heading.
df = pd.DataFrame(data=data)
print("Dataset:", df, sep="\n")

Dataset:
    RID          age  income student credit_rating buys_computer
0     1        youth    high      no          fair            no
1     2        youth    high      no     excellent            no
2     3  middle_aged    high      no          fair           yes
3     4       senior  medium      no          fair           yes
4     5       senior     low      no          fair           yes
5     6       senior     low     yes     excellent            no
6     7  middle_aged     low     yes     excellent           yes
7     8        youth  medium     yes          fair           yes
8     9        youth     low     yes          fair           yes
9    10       senior  medium     yes          fair           yes
10   11        youth  medium     yes     excellent           yes
11   12  middle_aged  medium      no     excellent           yes
12   13  middle_aged    high     yes          fair           yes
13   14       senior  medium      no     excellent            no


In [4]:
def entropy(target_col):
    """Return the Shannon entropy (in bits) of the labels in ``target_col``.

    The distinct labels are counted with ``np.unique``, the counts are turned
    into relative frequencies ``p``, and ``-sum(p * log2(p))`` is returned.
    """
    _, class_counts = np.unique(target_col, return_counts=True)
    probabilities = class_counts / class_counts.sum()
    return -np.sum(probabilities * np.log2(probabilities))

In [5]:
def info_gain(data, split_attr, target_attr):
    """Calculate the Information Gain for a splitting attribute.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to evaluate; must contain ``split_attr`` and ``target_attr``.
    split_attr : str
        Column whose distinct values partition ``data``.
    target_attr : str
        Column holding the class label.

    Returns
    -------
    float
        Entropy of ``target_attr`` over all rows minus the size-weighted
        entropy of ``target_attr`` within each ``split_attr`` partition.
    """
    total_entropy = entropy(data[target_attr])
    vals, counts = np.unique(data[split_attr], return_counts=True)
    weights = counts / counts.sum()
    # Select each partition with a boolean mask on the target column only.
    # The previous `data.where(mask).dropna()` also discarded matching rows
    # that had a NaN in *any* other column, so the partition entropy was
    # computed on fewer rows than its weight accounted for.
    weighted_entropy = np.sum([
        weight * entropy(data.loc[data[split_attr] == val, target_attr])
        for val, weight in zip(vals, weights)
    ])
    return total_entropy - weighted_entropy


In [6]:
# Impurity of the class label before any split is applied.
target_column = df["buys_computer"]
target_entropy = entropy(target_column)
print(f"Entropy of the target attribute: {target_entropy:.4f}")


Entropy of the target attribute: 0.8631


In [7]:
# Candidate splitting attributes (every column except the row id and the label).
attributes = ["age", "income", "student", "credit_rating"]
print("\nInformation Gain for each attribute:")
gain_by_attribute = {
    name: info_gain(df, name, "buys_computer") for name in attributes
}
for attr, ig in gain_by_attribute.items():
    print(f"{attr}: {ig:.4f}")



Information Gain for each attribute:
age: 0.1696
income: 0.0670
student: 0.0747
credit_rating: 0.1239


In [8]:
def gini_index(data, split_attr, target_attr):
    """Calculate the Gini Index for a splitting attribute.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to evaluate; must contain ``split_attr`` and ``target_attr``.
    split_attr : str
        Column whose distinct values partition ``data``.
    target_attr : str
        Column holding the class label.

    Returns
    -------
    float
        Sum over the partitions of ``(partition size / total rows) *
        (1 - sum(p ** 2))``, where ``p`` are the class frequencies of
        ``target_attr`` inside the partition. Lower means purer partitions.
    """
    vals, counts = np.unique(data[split_attr], return_counts=True)
    total_rows = counts.sum()
    gini = 0.0
    for val, count in zip(vals, counts):
        # Boolean-mask the target column directly. The previous
        # `data.where(mask).dropna()` also removed matching rows with a NaN in
        # any unrelated column, which skewed the class frequencies.
        subset = data.loc[data[split_attr] == val, target_attr]
        _, subset_counts = np.unique(subset, return_counts=True)
        prob = subset_counts / subset_counts.sum()
        gini += (count / total_rows) * (1 - np.sum(prob ** 2))
    return gini


In [9]:
# Weighted Gini impurity of the label after splitting on each attribute.
print("\nGini Index for each attribute:")
gini_by_attribute = {
    name: gini_index(df, name, "buys_computer") for name in attributes
}
for attr, gini in gini_by_attribute.items():
    print(f"{attr}: {gini:.4f}")



Gini Index for each attribute:
age: 0.3429
income: 0.3690
student: 0.3673
credit_rating: 0.3393


In [10]:
class DecisionTree:
    """Decision tree over categorical attributes with multiway splits.

    ``method="id3"`` picks, at every node, the attribute with the highest
    ``info_gain``; ``method="cart"`` picks the attribute with the lowest
    ``gini_index``. Both scoring functions are module-level functions defined
    elsewhere in this notebook.

    The fitted tree is stored in ``self.tree`` as nested dicts of the form
    ``{attribute: {value: subtree_or_label}}``. A leaf is the bare class
    label, so a tree fitted on single-class data is just that label.
    """

    _METHODS = ("id3", "cart")

    def __init__(self, method="id3"):
        self.method = method
        self.tree = {}

    def fit(self, data, attributes, target):
        """Grow the tree, store it in ``self.tree`` and return it.

        Parameters
        ----------
        data : pandas.DataFrame
            Training rows.
        attributes : sequence of str
            Columns that may be used for splitting.
        target : str
            Column holding the class label.

        Raises
        ------
        ValueError
            If ``self.method`` is neither ``"id3"`` nor ``"cart"``.
        """
        if self.method not in self._METHODS:
            raise ValueError(
                f"unknown method {self.method!r}; expected one of {self._METHODS}"
            )
        # Assign once, after the recursion, so that a root that is a leaf is
        # stored too and intermediate subtrees never overwrite self.tree.
        self.tree = self._build(data, list(attributes), target)
        return self.tree

    def _build(self, data, attributes, target):
        """Recursively build the subtree for ``data``; returns a dict or a label."""
        labels = np.unique(data[target])
        if len(labels) == 1:
            # Pure node: every remaining row carries the same label.
            return labels[0]
        if len(attributes) == 0:
            # Nothing left to split on: fall back to the most frequent label.
            return data[target].mode()[0]

        if self.method == "id3":
            scores = [info_gain(data, attr, target) for attr in attributes]
        else:
            # Negated so that argmax selects the lowest Gini index.
            scores = [-gini_index(data, attr, target) for attr in attributes]

        best_attr = attributes[np.argmax(scores)]
        remaining = [attr for attr in attributes if attr != best_attr]
        return {
            best_attr: {
                value: self._build(data[data[best_attr] == value], remaining, target)
                for value in np.unique(data[best_attr])
            }
        }

    def pretty_print(self, tree=None, name=""):
        """Print the tree as an indented outline.

        Each attribute is printed on its own line; below it, every branch is
        printed as ``value: label`` for a leaf or ``value:`` followed by the
        nested subtree. ``name`` is the indentation prefix for this level and
        ``tree`` defaults to ``self.tree``.
        """
        if tree is None:
            tree = self.tree
        if not isinstance(tree, dict):
            # The whole (sub)tree is a single leaf label.
            print(f"{name}{tree}")
            return
        for branch, children in tree.items():
            print(f"{name}{branch}")
            for value, subtree in children.items():
                if isinstance(subtree, dict):
                    print(f"{name}  {value}:")
                    self.pretty_print(subtree, name + "    ")
                else:
                    print(f"{name}  {value}: {subtree}")

    def predict(self, query):
        """Return the label the fitted tree assigns to ``query``.

        ``query`` maps attribute names to values (for example a dict).

        Raises
        ------
        ValueError
            If the tree has not been fitted yet.
        KeyError
            If ``query`` lacks an attribute the tree tests, or holds a value
            for which the tree has no branch.
        """
        node = self.tree
        while isinstance(node, dict):
            if not node:
                raise ValueError("the tree is empty; call fit() before predict()")
            attr = next(iter(node))
            branches = node[attr]
            value = query[attr]
            if value not in branches:
                raise KeyError(
                    f"no branch for {attr}={value!r}; known values: {list(branches)}"
                )
            node = branches[value]
        return node

    def __str__(self):
        return pprint.pformat(self.tree)

    def __repr__(self):
        return self.__str__()


In [11]:
# Grow a tree that chooses splits by information gain and show the nested dict.
decision_tree_id3 = DecisionTree("id3")
print("\nID3 Decision Tree:")
id3_tree = decision_tree_id3.fit(
    data=df,
    attributes=attributes,
    target="buys_computer",
)
print(id3_tree)



ID3 Decision Tree:
{'age': {'middle_aged': 'yes', 'senior': {'credit_rating': {'excellent': 'no', 'fair': 'yes'}}, 'youth': {'income': {'high': 'no', 'low': 'yes', 'medium': 'yes'}}}}


In [12]:
# Grow a tree that chooses splits by the lowest Gini index and show the nested dict.
decision_tree_cart = DecisionTree("cart")
print("\nCART Decision Tree:")
cart_tree = decision_tree_cart.fit(
    data=df,
    attributes=attributes,
    target="buys_computer",
)
print(cart_tree)


CART Decision Tree:
{'credit_rating': {'excellent': {'age': {'middle_aged': 'yes', 'senior': 'no', 'youth': {'income': {'high': 'no', 'medium': 'yes'}}}}, 'fair': {'age': {'middle_aged': 'yes', 'senior': 'yes', 'youth': {'income': {'high': 'no', 'low': 'yes', 'medium': 'yes'}}}}}}


In [13]:
# Query: a high-income youth who is not a student and has a fair credit rating.
data_point = {
    "age": "youth",
    "income": "high",
    "student": "no",
    "credit_rating": "fair",
}
print(
    "\nPredictions:",
    f"ID3: {decision_tree_id3.predict(data_point)}",
    f"CART: {decision_tree_cart.predict(data_point)}",
    sep="\n",
)


Predictions:
ID3: no
CART: no


In [14]:
# Query: a medium-income youth who is not a student and has a fair credit rating.
data_point = {
    "age": "youth",
    "income": "medium",
    "student": "no",
    "credit_rating": "fair",
}
print(
    "\nPredictions:",
    f"ID3: {decision_tree_id3.predict(data_point)}",
    f"CART: {decision_tree_cart.predict(data_point)}",
    sep="\n",
)


Predictions:
ID3: yes
CART: yes
