In [2]:
import numpy as np
import pandas as pd
from collections import Counter 

# Counter (collections.Counter) is typically useful for:
#   - counting the occurrences of each class in a dataset
#   - calculating class probabilities
#   - summarizing categorical features


In [5]:
def gini_index(groups, classes):
    """Return the weighted Gini impurity of a candidate split.

    Parameters
    ----------
    groups : sequence of groups
        Each group is a sequence of rows; the last element of a row
        (``row[-1]``) is read as its class label.
    classes : list (or other re-iterable collection) of class labels
        The labels whose proportions are measured in every group.

    Returns
    -------
    float
        Sum over the non-empty groups of
        ``(1 - sum_c p_c ** 2) * (len(group) / total_rows)``.
        Returns 0.0 when the groups contain no rows at all.
    """
    # count all samples at the split point
    n_instances = float(sum(len(group) for group in groups))
    if n_instances == 0:
        # nothing to measure; also prevents dividing by zero below
        return 0.0

    # sum the weighted Gini index of each group
    gini = 0.0
    for group in groups:
        size = float(len(group))
        if size == 0:
            # an empty group contributes nothing and would divide by zero
            continue

        # extract the labels once instead of once per class
        labels = [row[-1] for row in group]

        # score the group: sum of squared class proportions
        score = 0.0
        for class_val in classes:
            p = labels.count(class_val) / size
            score += p * p

        # weight the group's impurity by its relative size
        gini += (1.0 - score) * (size / n_instances)
    return gini

In [7]:
def entropy(groups, classes):
    """Return the size-weighted Shannon entropy (in bits) of a split.

    Parameters
    ----------
    groups : sequence of groups
        Each group is a sequence of rows; the last element of a row
        (``row[-1]``) is read as its class label.
    classes : list (or other re-iterable collection) of class labels
        The labels whose proportions are measured in every group.

    Returns
    -------
    float
        Sum over the non-empty groups of
        ``(-sum_c p_c * log2(p_c)) * (len(group) / total_rows)``,
        where classes with ``p_c == 0`` are left out of the inner sum.
        Returns 0.0 when the groups contain no rows at all.
    """
    # count all samples at the split point
    n_instances = float(sum(len(group) for group in groups))
    if n_instances == 0:
        # nothing to measure; also prevents dividing by zero below
        return 0.0

    # sum the weighted entropy of each group
    ent = 0.0
    for group in groups:
        size = float(len(group))
        if size == 0:
            # an empty group contributes nothing and would divide by zero
            continue

        # extract the labels once instead of once per class
        labels = [row[-1] for row in group]

        score = 0.0
        for class_val in classes:
            p = labels.count(class_val) / size
            if p > 0:
                # log2(0) is undefined, so absent classes are skipped
                score += p * np.log2(p)

        # weight the group's entropy by its relative size
        ent += (-score) * (size / n_instances)
    return ent

In [8]:
def information_gain(groups, classes):
    """Return the information gain obtained by splitting rows into ``groups``.

    The gain is the entropy of all rows taken together minus the
    size-weighted sum of the entropies of the individual groups.

    Parameters
    ----------
    groups : sequence of groups
        Each group is a sequence of rows; the last element of a row is
        read as its class label by ``entropy``.
    classes : list (or other re-iterable collection) of class labels

    Returns
    -------
    float
        ``entropy(all rows) - sum(weight_g * entropy(group_g))``.
        Returns 0.0 when the groups contain no rows at all.
    """
    # total row count, computed once rather than once per group
    total = float(sum(len(group) for group in groups))
    if total == 0:
        # no rows: nothing to gain, and the weights below would divide by zero
        return 0.0

    # flatten the groups into a single "parent" group
    all_rows = [row for group in groups for row in group]
    total_entropy = entropy([all_rows], classes)

    # Empty groups have zero weight, so they are skipped instead of being
    # handed to entropy(); every call receives exactly one non-empty group.
    weighted_entropy = sum(
        (len(group) / total) * entropy([group], classes)
        for group in groups
        if len(group) > 0
    )
    return total_entropy - weighted_entropy

In [9]:
# Example dataset: ten rows, each a list of three numbers.
# gini_index, entropy and information_gain read only the last element of a
# row (row[-1]) as its class label; the first two numbers are presumably
# feature values and are not read by those functions.
# Rows 0-4 carry label 0 and rows 5-9 carry label 1.
dataset = [
    [2.771244718, 1.784783929, 0],
    [1.728571309, 1.169761413, 0],
    [3.678319846, 2.81281357, 0],
    [3.961043357, 2.61995032, 0],
    [2.999208922, 2.209014212, 0],
    [7.497545867, 3.162953546, 1],
    [9.00220326, 3.339047188, 1],
    [7.444542326, 0.476683375, 1],
    [10.12493903, 3.234550982, 1],
    [6.642287351, 3.319983761, 1]
]


In [10]:
# Partition the ten rows into two candidate groups:
# rows 0-4 form the first group, rows 5-9 the second.
group1 = dataset[0:5]
group2 = dataset[5:10]

# Inputs for the metric functions: the label set and the proposed split.
classes = [0, 1]
groups = [group1, group2]


In [13]:
# Evaluate each split metric on the example split and print it next to its
# label, one metric per line, in the order listed.
for metric_name, metric_fn in (
    ("gini index", gini_index),
    ("entropy", entropy),
    ("information gain", information_gain),
):
    print(metric_name, metric_fn(groups, classes))

gini index 0.0
entropy 0.0
information gain 1.0
