In [1]:
import pandas as pd
import numpy as np

data = {'X1': [0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        'X2': [-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8],
        'Y': [0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1]}
df = pd.DataFrame(data)

def entropy(y):
    p = y.value_counts(normalize=True)
    return -np.sum(p * np.log2(p))

def gain_ratio_and_information_gain(df, feature, label):
    initial_entropy = entropy(df[label])
    unique_values = sorted(df[feature].unique())
    
    results = []
    
    for value in unique_values:
        left = df[df[feature] >= value]
        right = df[df[feature] < value]
        
        left_entropy = entropy(left[label])
        right_entropy = entropy(right[label])
        
        weighted_avg_entropy = (len(left) / len(df)) * left_entropy + (len(right) / len(df)) * right_entropy
        information_gain = initial_entropy - weighted_avg_entropy
        
        split_info = - (len(left) / len(df)) * np.log2(len(left) / len(df)) - (len(right) / len(df)) * np.log2(len(right) / len(df))
        if split_info == 0:
            results.append((value, 'Information Gain', information_gain))
            continue
        
        gain_ratio = information_gain / split_info
        results.append((value, 'Gain Ratio', gain_ratio))
    
    return results

features = ['X1', 'X2']
for feature in features:
    results = gain_ratio_and_information_gain(df, feature, 'Y')
    print(f"For feature {feature}:")
    for value, metric, result in results:
        print(f"  - Split on {feature} >= {value} has {metric}: {result}")


For feature X1:
  - Split on X1 >= 0.0 has Gain Ratio: nan
  - Split on X1 >= 0.1 has Gain Ratio: 0.10051807676021828
For feature X2:
  - Split on X2 >= -2 has Gain Ratio: nan
  - Split on X2 >= -1 has Gain Ratio: 0.10051807676021828
  - Split on X2 >= 0 has Gain Ratio: 0.055953759631263526
  - Split on X2 >= 1 has Gain Ratio: 0.00578004220515232
  - Split on X2 >= 2 has Gain Ratio: 0.0011443495172767494
  - Split on X2 >= 3 has Gain Ratio: 0.016411136842102134
  - Split on X2 >= 4 has Gain Ratio: 0.049749064181778546
  - Split on X2 >= 5 has Gain Ratio: 0.11124029586339801
  - Split on X2 >= 6 has Gain Ratio: 0.23609960614360798
  - Split on X2 >= 7 has Gain Ratio: 0.055953759631263526
  - Split on X2 >= 8 has Gain Ratio: 0.4301569161309807


  split_info = - (len(left) / len(df)) * np.log2(len(left) / len(df)) - (len(right) / len(df)) * np.log2(len(right) / len(df))
  split_info = - (len(left) / len(df)) * np.log2(len(left) / len(df)) - (len(right) / len(df)) * np.log2(len(right) / len(df))
  split_info = - (len(left) / len(df)) * np.log2(len(left) / len(df)) - (len(right) / len(df)) * np.log2(len(right) / len(df))
  split_info = - (len(left) / len(df)) * np.log2(len(left) / len(df)) - (len(right) / len(df)) * np.log2(len(right) / len(df))
