In [2]:
import numpy as np
import math


y = np.array([-1,-1,-1,-1,-1,1,1,1,1,-1])

features = {
    "know author": np.array([0,1,0,1,0,1,0,1,1,1]),
    "is long": np.array([0,1,1,1,1,0,0,0,0,1]),
    "has research": np.array([1,0,1,1,0,1,1,0,1,1]),
    "has grade": np.array([1,1,1,1,0,1,0,0,1,1]),
    "has lottery": np.array([0,0,1,0,0,1,0,0,0,1])
}


# Create a 2D array for all the data
email_data = np.zeros((len(features["know author"]),len(features)+1))

# populate feature data
email_data[:,0] = features["know author"]
email_data[:,1] = features["is long"]    
email_data[:,2] = features["has research"]
email_data[:,3] = features["has grade"]
email_data[:,4] = features["has lottery"]

# populate class data
email_data[:,-1] = y


assert len(y) == 10


p_read = np.count_nonzero(y == 1) / len(y)
p_not_read = np.count_nonzero(y== -1) / len(y)

entropy_y = - p_read * np.log2(p_read) - p_not_read * np.log2(p_not_read)

print(f"entropy = {entropy_y}")

entropy = 0.9709505944546686


In [30]:
class InfoGains():
    def __init__(self, data: np.array):
        self.features = data[:,:-1]
        self.classes = data[:,-1]
        
        self.entropy = self.calc_entropy(self.classes)
        self.p_class_a = np.count_nonzero(self.classes == 1) / len(self.classes)
        self.p_class_b = np.count_nonzero(self.classes == -1) / len(self.classes)
        
        
    def calc_entropy(self, classes: np.array) -> float:
        p_yes = np.count_nonzero(classes == 1) / len(classes)
        p_no = np.count_nonzero(classes == -1) / len(classes)
        
        entropy = - p_yes * np.log2(p_yes) - p_no * np.log2(p_no)
        
        return entropy
    
    
    def calc_entropy_of_feature(self, feature: np.array, feature_val: int) -> float:
        p_yes = np.count_nonzero(self.classes[feature == feature_val] == 1) / len(self.classes[feature == feature_val] == 1)
        p_no = np.count_nonzero(self.classes[feature == feature_val] == -1) / len(self.classes[feature == feature_val] == -1)
        
        # print(self.classes[feature == feature_val])
        if (p_yes == 0 or p_yes == 1):
            return 0
        
        entropy = - p_yes * np.log2(p_yes) - p_no * np.log2(p_no)
        
        print('entropy:', entropy)
        # print(feature)
        # print(self.classes)
        

        return entropy

    
    def calc_info_gain_of_feature(self, feature: np.array) -> float:
        #print("--- info gain of feature ---")
        entropy_left = self.calc_entropy_of_feature(feature,0)
        entropy_right = self.calc_entropy_of_feature(feature,1)
        
        
        weighted_entropy = entropy_left*(1 - feature.mean()) + entropy_right*(feature.mean())
        
        info_gain = self.entropy - weighted_entropy
        
        return info_gain
        
   

    def calc_all_info_gain(self) -> np.array:
        gains = np.array(np.zeros(len(self.features.T)))

        for index,feature in enumerate(self.features.T):
            info_gain = self.calc_info_gain_of_feature(feature)
            
            gains[index] = info_gain

        return gains

In [31]:
import mltools as ml
my_gains = InfoGains(email_data)

info_gains = my_gains.calc_all_info_gain()

for index,info_gain in enumerate(info_gains):
    print(f"Info gain for feature {index+1} = {info_gain}")

feature: [0. 1. 0. 1. 0. 1. 0. 1. 1. 1.]
[False False False  True]
1
4
p_yes:   0.25
p_no:    0.75
entropy: 0.8112781244591328
feature: [0. 1. 0. 1. 0. 1. 0. 1. 1. 1.]
[False False  True  True  True False]
3
6
p_yes:   0.5
p_no:    0.5
entropy: 1.0
left: 0.8112781244591328
right: 1.0
feature.mean(): 0.6
feature: [0. 1. 1. 1. 1. 0. 0. 0. 0. 1.]
[False  True  True  True  True]
4
5
p_yes:   0.8
p_no:    0.2
entropy: 0.7219280948873623
feature: [0. 1. 1. 1. 1. 0. 0. 0. 0. 1.]
[False False False False False]
0
5
p_yes:   0.0
p_no:    1.0
left: 0.7219280948873623
right: 0
feature.mean(): 0.5
feature: [1. 0. 1. 1. 0. 1. 1. 0. 1. 1.]
[False False  True]
1
3
p_yes:   0.3333333333333333
p_no:    0.6666666666666666
entropy: 0.9182958340544896
feature: [1. 0. 1. 1. 0. 1. 1. 0. 1. 1.]
[False False False  True  True  True False]
3
7
p_yes:   0.42857142857142855
p_no:    0.5714285714285714
entropy: 0.9852281360342515
left: 0.9182958340544896
right: 0.9852281360342515
feature.mean(): 0.7
feature: [1. 