In [5]:
from math import log2

def calculate_gain_ratio(data):
  """
  Calculates the gain ratio for a given dataset.

  Args:
      data: A list of lists representing the dataset. The first element in each sublist is the class label (C0 or C1), and the remaining elements are the feature values.

  Returns:
      A dictionary containing the gain ratio for each feature (shirt size in this case).
  """

  # Calculate total entropy
  total_entropy = 0
  class_counts = {cls: 0 for cls in set(row[0] for row in data)}
  for row in data:
    class_counts[row[0]] += 1
  for count in class_counts.values():
    if count > 0:
      total_entropy -= (count / len(data)) * log2(count / len(data))

  # Calculate gain ratio for each shirt size
  gain_ratios = {}
  for size in set(row[1] for row in data):
    subset_data = [row for row in data if row[1] == size]
    subset_entropy = 0
    subset_class_counts = {cls: 0 for cls in set(row[0] for row in subset_data)}
    for row in subset_data:
      subset_class_counts[row[0]] += 1
    for count in subset_class_counts.values():
      if count > 0:
        subset_entropy -= (count / len(subset_data)) * log2(count / len(subset_data))

    # Calculate intrinsic information (split information)
    intrinsic_info = total_entropy - (len(subset_data) / len(data)) * subset_entropy
    split_info = - (len(set(row[1] for row in data)) / len(data)) * log2(len(set(row[1] for row in data)) / len(data))

    # Avoid division by zero (when a size has only one class)
    if split_info > 0:
      gain_ratios[size] = intrinsic_info / split_info
    else:
      gain_ratios[size] = 0

  return gain_ratios

# Q18 data
data = [
    ["C0", "Extra Large"],
    ["C1", "Extra Large"],
    ["C0", "Large"],
    ["C1", "Large"],
    ["C0", "Medium"],
    ["C1", "Medium"],
    ["C0", "Medium"],
    ["C1", "Small"],
    ["C0", "Small"],
    ["C1", "Small"],
]

# Calculate and print gain ratios
gain_ratios = calculate_gain_ratio(data)
print("Gain Ratios:")
for size, ratio in gain_ratios.items():
  print(f"{size}: {ratio:.4f}")

Gain Ratios:
Small: 1.3702
Extra Large: 1.5129
Medium: 1.3702
Large: 1.5129
