In [10]:
import math

# Data for shirt size and class (C0, C1)
data = {
    "Extra Large": [2, 2],
    "Large": [2, 2],
    "Medium": [3, 4],
    "Small": [3, 2]
}

# total number of instances
total_instances = sum(sum(value) for value in data.values())

# entropy for the entire dataset
def entropy(data):
    total_positive = sum(value[1] for value in data.values())
    total_negative = sum(value[0] for value in data.values())
    
    if total_positive == 0 or total_negative == 0:
        return 0
    
    p_positive = total_positive / total_instances
    p_negative = total_negative / total_instances
    return - (p_positive * math.log2(p_positive) + p_negative * math.log2(p_negative))

# information gain for a split on a particular shirt size
def information_gain(data, shirt_size):
    positive_values = data[shirt_size][1]
    negative_values = data[shirt_size][0]
    total_size_instances = sum(data[shirt_size])
    
    # entropy before the split
    entropy_before = entropy(data)
    
    # weighted entropy after the split (for each size category)
    weighted_entropy = 0
    for size, values in data.items():
        if size != shirt_size:
            size_prob = sum(values) / total_instances
            weighted_entropy += size_prob * entropy({size: values})
    
    return entropy_before - weighted_entropy

# gain ratio for a split on a particular shirt size
def gain_ratio(data, shirt_size):
    information_gain_value = information_gain(data.copy(), shirt_size)
    intrinsic_value = entropy({shirt_size: data[shirt_size]})
    
    if intrinsic_value == 0:
        return 0
    return information_gain_value / intrinsic_value

# information gain and gain ratio for each shirt size split
for shirt_size, values in data.items():
    information_gain_value = information_gain(data.copy(), shirt_size)
    gain_ratio_value = gain_ratio(data.copy(), shirt_size)
    
    print(f"Shirt Size: {shirt_size}")
    print(f"Information Gain: {information_gain_value:.4f}")
    print(f"Gain Ratio: {gain_ratio_value:.4f}")
    print("-" * 20)

Shirt Size: Extra Large
Information Gain: 0.3752
Gain Ratio: 0.5648
--------------------
Shirt Size: Large
Information Gain: 0.3752
Gain Ratio: 0.5648
--------------------
Shirt Size: Medium
Information Gain: 0.5486
Gain Ratio: 0.6270
--------------------
Shirt Size: Small
Information Gain: 0.4280
Gain Ratio: 0.5763
--------------------
