In [1]:
import numpy as np
import math


# Class label of each of the ten emails; the entropy below treats +1 as
# "read" and -1 as "not read".
y = np.array([-1, -1, -1, -1, -1, 1, 1, 1, 1, -1])

# One 0/1 indicator vector per feature, aligned with `y` row by row.
features = {
    "know author":  np.array([0, 1, 0, 1, 0, 1, 0, 1, 1, 1]),
    "is long":      np.array([0, 1, 1, 1, 1, 0, 0, 0, 0, 1]),
    "has research": np.array([1, 0, 1, 1, 0, 1, 1, 0, 1, 1]),
    "has grade":    np.array([1, 1, 1, 1, 0, 1, 0, 0, 1, 1]),
    "has lottery":  np.array([0, 0, 1, 0, 0, 1, 0, 0, 0, 1]),
}

# Assemble the full data matrix in one step: the feature columns in dict
# (insertion) order, followed by the class label as the last column.
# The cast keeps the float dtype of a zero-initialised matrix.
email_data = np.column_stack([*features.values(), y]).astype(float)

# Sanity check on the number of observations.
assert y.shape == (10,)

# Empirical class probabilities.
n_emails = y.size
p_read, p_not_read = (np.count_nonzero(y == label) / n_emails for label in (1, -1))

# Shannon entropy (in bits) of the class label.
entropy_y = -(p_read * np.log2(p_read) + p_not_read * np.log2(p_not_read))

print(f"entropy = {entropy_y}")

entropy = 0.9709505944546686


In [6]:
class InfoGains():
    """Information gain of binary features with respect to a two-class label.

    The input is a 2-D array whose last column holds the class labels
    (``1`` / ``-1``) and whose remaining columns are the features. The two
    branches of a split are weighted with ``feature.mean()``, so features are
    expected to be coded as 0/1.
    """

    def __init__(self, data: np.ndarray, verbose: bool = True):
        """Split ``data`` into features and labels and cache class statistics.

        Args:
            data: 2-D array; every column but the last is a feature, the last
                column is the class label (1 or -1).
            verbose: When True (the default) intermediate values are printed
                while computing; pass False to compute silently.
        """
        self.verbose = verbose
        self.features = data[:,:-1]
        self.classes = data[:,-1]

        self.entropy = self.calc_entropy(self.classes)
        if self.verbose:
            print("entropy of class", self.entropy)
        self.p_class_a, self.p_class_b = self._class_fractions(self.classes)

    @staticmethod
    def _class_fractions(labels: np.ndarray) -> tuple:
        """Return the fractions of ``labels`` equal to 1 and to -1."""
        total = len(labels)
        if total == 0:
            # No samples: avoid dividing by zero, report zero mass for both classes.
            return 0.0, 0.0
        return (np.count_nonzero(labels == 1) / total,
                np.count_nonzero(labels == -1) / total)

    @staticmethod
    def _entropy_from_fractions(p_yes: float, p_no: float) -> float:
        """Return ``-p_yes*log2(p_yes) - p_no*log2(p_no)`` with ``0*log2(0) = 0``."""
        entropy = 0.0
        for p in (p_yes, p_no):
            # Skip zero-probability terms: log2(0) is -inf and 0 * -inf is nan.
            if p > 0:
                entropy -= p * np.log2(p)
        return entropy

    def calc_entropy(self, classes: np.ndarray) -> float:
        """Return the entropy (in bits) of a vector of 1 / -1 class labels.

        A single-class or empty vector has entropy 0.
        """
        p_yes, p_no = self._class_fractions(classes)
        return self._entropy_from_fractions(p_yes, p_no)

    def calc_entropy_of_feature(self, feature: np.ndarray, feature_val: int) -> float:
        """Return the class entropy of the rows where ``feature == feature_val``.

        Args:
            feature: One feature column, aligned row by row with the labels.
            feature_val: The feature value selecting the subset of rows.

        Returns:
            Entropy in bits of the labels in the selected rows; 0 when the
            subset is empty or contains a single class.
        """
        subset = self.classes[feature == feature_val]
        p_yes, p_no = self._class_fractions(subset)

        # An empty subset yields (0, 0) and a pure one yields 0 or 1: no uncertainty.
        if p_yes == 0 or p_yes == 1:
            return 0.0

        entropy = self._entropy_from_fractions(p_yes, p_no)
        if self.verbose:
            print('entropy:', entropy)
        return entropy

    def calc_info_gain_of_feature(self, feature: np.ndarray) -> float:
        """Return the information gain obtained by splitting on a 0/1 feature.

        The gain is the class entropy minus the entropy of the two branches
        (``feature == 0`` and ``feature == 1``) weighted by their share of rows.
        """
        entropy_left = self.calc_entropy_of_feature(feature, 0)
        entropy_right = self.calc_entropy_of_feature(feature, 1)

        # For a 0/1 feature the mean is the fraction of rows in the "1" branch.
        p_right = feature.mean() if len(feature) else 0.0
        weighted_entropy = entropy_left*(1 - p_right) + entropy_right*p_right
        info_gain = self.entropy - weighted_entropy

        if self.verbose:
            print("weighted_entropy:", weighted_entropy)
            print("info gain", info_gain)
        return info_gain

    def calc_all_info_gain(self) -> np.ndarray:
        """Return a 1-D array with the information gain of every feature column."""
        return np.array(
            [self.calc_info_gain_of_feature(feature) for feature in self.features.T],
            dtype=float,
        )

In [7]:
import mltools as ml
# Score every feature column of the email data set against the class label.
my_gains = InfoGains(email_data)
info_gains = my_gains.calc_all_info_gain()

# Report the gains in column order, numbering the features from 1.
for index in range(len(info_gains)):
    info_gain = info_gains[index]
    print(f"Info gain for feature {index+1} = {info_gain}")

entropy of class 0.9709505944546686
entropy: 0.8112781244591328
entropy: 1.0
weighted_entropy: 0.9245112497836532
info gain 0.0464393446710154
entropy: 0.7219280948873623
weighted_entropy: 0.36096404744368116
info gain 0.6099865470109874
entropy: 0.9182958340544896
entropy: 0.9852281360342515
weighted_entropy: 0.965148445440323
info gain 0.0058021490143456145
entropy: 0.9182958340544896
entropy: 0.863120568566631
weighted_entropy: 0.8796731482129886
info gain 0.09127744624168
entropy: 0.9852281360342515
entropy: 0.9182958340544896
weighted_entropy: 0.965148445440323
info gain 0.0058021490143456145
Info gain for feature 1 = 0.0464393446710154
Info gain for feature 2 = 0.6099865470109874
Info gain for feature 3 = 0.0058021490143456145
Info gain for feature 4 = 0.09127744624168
Info gain for feature 5 = 0.0058021490143456145
