In [3]:
import pandas as pd
import numpy as np
from math import log2

# ===== 1. Load Data =====
# Relative path to the training CSV; edit this to point at another dataset.
file_path = "Color Balls_Train.csv"  # Change to your dataset
# Load the whole file into a DataFrame. The code below treats the "Color"
# column as the class label and every other column as a candidate feature.
df = pd.read_csv(file_path)

# ===== 2. Entropy =====
def entropy(target_col):
    """Return the Shannon entropy (in bits) of the values in *target_col*.

    Each distinct value's relative frequency ``p`` contributes
    ``-p * log2(p)`` to the total.

    Args:
        target_col: Any array-like accepted by ``np.unique`` (list, ndarray,
            pandas Series, ...).

    Returns:
        The entropy of the empirical value distribution, or ``0`` when
        *target_col* is empty.
    """
    _, freqs = np.unique(target_col, return_counts=True)
    n_samples = np.sum(freqs)
    # Every count reported by np.unique is >= 1, so log2 never sees zero.
    return sum(-(f / n_samples) * log2(f / n_samples) for f in freqs)

# ===== 3. Information Gain & Gain Ratio =====
def info_gain(data, split_attr, target_attr):
    """Score a candidate split by information gain and gain ratio.

    Args:
        data: DataFrame holding both the attribute and the target column.
        split_attr: Name of the column to partition on.
        target_attr: Name of the class-label column.

    Returns:
        A ``(information_gain, gain_ratio)`` tuple. The gain ratio is
        reported as ``0`` when the split information is zero, which avoids
        a division by zero.
    """
    total_entropy = entropy(data[target_attr])

    column = data[split_attr]
    levels, level_counts = np.unique(column, return_counts=True)
    n_rows = np.sum(level_counts)

    # Expected target entropy once the rows are partitioned by split_attr:
    # each partition's entropy weighted by the share of rows it holds.
    conditional_entropy = 0
    for level, size in zip(levels, level_counts):
        partition = data[column == level]
        conditional_entropy += (size / n_rows) * entropy(partition[target_attr])
    gain = total_entropy - conditional_entropy

    # Split information is the entropy of the split attribute's own
    # value distribution.
    split_info = entropy(column)

    ratio = gain / split_info if split_info != 0 else 0
    return gain, ratio

# ===== 4. C4.5 Algorithm =====
def c45(data, features, target_attr):
    """Recursively grow a decision tree, choosing splits by gain ratio.

    Args:
        data: DataFrame with the remaining feature columns and the target.
        features: Names of the columns still available for splitting.
        target_attr: Name of the class-label column.

    Returns:
        A leaf label when the node is pure or no features remain (the most
        frequent label in the latter case); otherwise a nested dict of the
        form ``{feature: {value: subtree, ...}}``.
    """
    labels = np.unique(data[target_attr])
    if len(labels) == 1:
        # Pure node: every row carries the same class.
        return labels[0]
    if len(features) == 0:
        # Nothing left to split on: fall back to the majority class.
        return data[target_attr].mode()[0]

    # Rank candidates by gain ratio; np.argmax keeps the first feature on ties.
    ratio_per_feature = [info_gain(data, name, target_attr)[1] for name in features]
    best_feature = features[np.argmax(ratio_per_feature)]
    remaining = [name for name in features if name != best_feature]

    branches = {}
    for value in np.unique(data[best_feature]):
        partition = data[data[best_feature] == value]
        # The chosen column is constant inside the partition, so drop it.
        branches[value] = c45(
            partition.drop(columns=[best_feature]), remaining, target_attr
        )
    return {best_feature: branches}

# ===== 5. Build Tree =====
# Every column except the class label "Color" is a candidate split attribute.
features = list(df.columns)
features.remove("Color")  # raises ValueError if the data has no "Color" column
# Grow the tree on the full training set. The result is a nested dict
# {feature: {value: subtree}}, or a bare label if the data is already pure.
tree = c45(df, features, "Color")

# ===== 6. Pretty Print =====
def print_tree(tree, indent=""):
    """Print a decision tree to stdout as an indented outline.

    Args:
        tree: Either a leaf (any non-dict value) or a nested dict of the
            form ``{feature: {value: subtree, ...}}``.
        indent: Prefix prepended to each printed line; grows by four spaces
            per level of recursion.
    """
    if isinstance(tree, dict):
        child_indent = indent + "    "
        for feature, branches in tree.items():
            for value, subtree in branches.items():
                # One line per branch, with its subtree nested one level deeper.
                print(indent + f"{feature} = {value}")
                print_tree(subtree, child_indent)
    else:
        # Leaf: show the predicted label.
        print(indent + "→ " + str(tree))

# Write the learned tree to stdout: one "feature = value" line per branch,
# with leaves shown as "→ label".
print("Decision Tree (C4.5):")
print_tree(tree)

# ===== 7. Show IG & GR for each feature =====
# These scores are computed on the full dataset, so they describe the
# root-level split only, not the choices made deeper in the tree.
print("\nIG and GR for each feature:")
for f in features:
    # info_gain returns (information gain, gain ratio) for splitting df on f.
    IG, GR = info_gain(df, f, "Color")
    print(f"{f}: IG={IG:.4f}, GR={GR:.4f}")


Decision Tree (C4.5):
Room = 1
    Partitions = 1
        Rows = 1
            Columns = 1
                → Green
            Columns = 2
                → Blue
        Rows = 2
            → Red
    Partitions = 2
        → Yellow
    Partitions = 3
        Rows = 1
            Columns = 1
                → Green
            Columns = 2
                → Blue
        Rows = 2
            → Green
Room = 2
    Columns = 1
        Partitions = 1
            → Blue
        Partitions = 2
            Rows = 1
                → Yellow
            Rows = 2
                → Red
        Partitions = 3
            → Blue
    Columns = 2
        Partitions = 1
            → Green
        Partitions = 2
            Rows = 1
                → Green
            Rows = 2
                → Blue
        Partitions = 3
            → Blue
Room = 3
    → Red
Room = 4
    Partitions = 1
        → Blue
    Partitions = 2
        Rows = 1
            → Yellow
        Rows = 2
            Columns = 1
     