In [1]:
import numpy as np
from collections import Counter
import pandas as pd

In [2]:
def entropy(data):
    """Return the Shannon entropy (in bits) of the class labels in ``data``.

    Parameters
    ----------
    data : sequence of sequences
        Rows whose last element is the class label.

    Returns
    -------
    number
        ``-sum(p * log2(p))`` over the label frequencies; ``0`` when
        ``data`` is empty because there is nothing to sum.
    """
    n_rows = len(data)
    class_sizes = Counter(row[-1] for row in data)

    weighted_logs = 0
    for size in class_sizes.values():
        proportion = size / n_rows
        weighted_logs += proportion * np.log2(proportion)

    return -weighted_logs


In [3]:
def info_gain(data, split_feature):
    """Return the information gain from splitting ``data`` on one feature.

    Parameters
    ----------
    data : sequence of sequences
        Rows whose last element is the class label.
    split_feature : int
        Index of the column to split on.

    Returns
    -------
    number
        Entropy of ``data`` minus the size-weighted entropy of the subsets
        produced by grouping rows on ``row[split_feature]``.
    """
    total = len(data)

    # Partition the rows in a single pass instead of rescanning the whole
    # dataset once per distinct value. A dict keeps groups in first-appearance
    # order, so the floating-point summation order is the same on every run.
    groups = {}
    for row in data:
        groups.setdefault(row[split_feature], []).append(row)

    subset_entropy = 0
    for subset in groups.values():
        weight = len(subset) / total
        subset_entropy += weight * entropy(subset)

    return entropy(data) - subset_entropy


In [4]:
def best_feature(data, features=None):
    """Return the index of the feature with the highest information gain.

    Parameters
    ----------
    data : sequence of sequences
        Rows whose last element is the class label.
    features : iterable of int, optional
        Column indices to choose from. When omitted, every column except
        the last (label) column of the first row is considered.

    Returns
    -------
    int
        The candidate with the largest ``info_gain``; ties go to the lowest
        index because candidates are examined in ascending order.

    Raises
    ------
    ValueError
        If there are no candidate features (raised by ``max``).
    IndexError
        If ``features`` is omitted and ``data`` is empty.
    """
    if features is None:
        candidates = range(len(data[0]) - 1)
    else:
        # Sort so tie-breaking does not depend on the container's iteration order.
        candidates = sorted(features)
    return max(candidates, key=lambda f: info_gain(data, f))


In [5]:
def id3(data, features):
    """Build an ID3 decision tree from categorical training rows.

    Parameters
    ----------
    data : sequence of sequences
        Training rows; the last element of each row is the class label and
        the other elements are feature values addressed by column index.
    features : set of int
        Column indices that may still be used for splitting.

    Returns
    -------
    object or dict
        A class label for a leaf, otherwise a nested dict of the form
        ``{feature_index: {feature_value: subtree, ...}}``.

    Raises
    ------
    ValueError
        If ``data`` is empty.
    """
    if not data:
        raise ValueError("id3() requires at least one training row")

    labels = [row[-1] for row in data]

    # Pure node: every example carries the same label.
    if len(set(labels)) == 1:
        return labels[0]

    # Nothing left to split on: fall back to the majority label.
    if not features:
        return Counter(labels).most_common(1)[0][0]

    # Rank only the features that are still available. Ranking every column
    # can re-select one that was already consumed higher up (all gains are 0
    # on contradictory or XOR-like rows), which leaves ``features`` unchanged
    # and recurses forever. Sorting keeps ties on the lowest index.
    best = max(sorted(features), key=lambda f: info_gain(data, f))
    remaining_features = set(features) - {best}

    # Partition the rows by their value of the chosen feature in one pass.
    branches = {}
    for row in data:
        branches.setdefault(row[best], []).append(row)

    return {
        best: {
            value: id3(subset, remaining_features)
            for value, subset in branches.items()
        }
    }


In [9]:
# Toy training set: 14 rows, each holding four categorical attribute values
# followed by a 'yes'/'no' class label in the last position.
# NOTE(review): the values look like the classic "play tennis" weather data
# (outlook, temperature, humidity, wind -> play); the column names are not
# stated anywhere in this notebook, so confirm before relying on them.
data = [
    ['sunny', 'hot', 'high', 'weak', 'no'],
    ['sunny', 'hot', 'high', 'strong', 'no'],
    ['overcast', 'hot', 'high', 'weak', 'yes'],
    ['rain', 'mild', 'high', 'weak', 'yes'],
    ['rain', 'cool', 'normal', 'weak', 'yes'],
    ['rain', 'cool', 'normal', 'strong', 'no'],
    ['overcast', 'cool', 'normal', 'strong', 'yes'],
    ['sunny', 'mild', 'high', 'weak', 'no'],
    ['sunny', 'cool', 'normal', 'weak', 'yes'],
    ['rain', 'mild', 'normal', 'weak', 'yes'],
    ['sunny', 'mild', 'normal', 'strong', 'yes'],
    ['overcast', 'mild', 'high', 'strong', 'yes'],
    ['overcast', 'hot', 'normal', 'weak', 'yes'],
    ['rain', 'mild', 'high', 'strong', 'no']
]


In [15]:
# Every column except the trailing label is offered as a split candidate.
tree = id3(
    data,
    {column for column in range(len(data[0]) - 1)},
)
print(tree)

{0: {'sunny': {2: {'normal': 'yes', 'high': 'no'}}, 'overcast': 'yes', 'rain': {3: {'strong': 'no', 'weak': 'yes'}}}}
