In [2]:
import math
from collections import Counter

import numpy as np
import pandas as pd

In [112]:
def entropy(partial_dataset):
    """Return the Shannon entropy, in bits, of the labels in ``partial_dataset``.

    Args:
        partial_dataset: Iterable of hashable labels, e.g. one column of an
            object-dtype NumPy array or a plain list. Any iterable works;
            it does not need to support ``len()``.

    Returns:
        float: ``-sum(p * log2(p))`` over the relative frequency ``p`` of
        every distinct label. An empty input, or an input containing a
        single distinct label, yields ``0.0``.
    """
    label_counts = Counter(partial_dataset)
    total = sum(label_counts.values())

    # Each ``p * log2(p)`` term is <= 0. Subtracting the terms (instead of
    # summing them and negating at the end) keeps the magnitude identical
    # while making a pure input return 0.0 rather than -0.0.
    res = 0.0
    for occurrences in label_counts.values():
        frac = occurrences / total
        res -= frac * math.log2(frac)

    return res

In [113]:
def gini(partial_dataset):
    """Return the Gini impurity of the labels in ``partial_dataset``.

    Args:
        partial_dataset: Iterable of hashable labels, e.g. one column of an
            object-dtype NumPy array or a plain list.

    Returns:
        ``1 - sum(p ** 2)`` over the relative frequency ``p`` of every
        distinct label. An empty input has nothing to misclassify and
        yields ``0.0``.
    """
    label_counts = Counter(partial_dataset)
    total = sum(label_counts.values())

    # Without this guard the sum below would stay 0 and an empty input
    # would be reported as maximally impure (1).
    if total == 0:
        return 0.0

    res = 0
    for occurrences in label_counts.values():
        frac = occurrences / total
        res += frac * frac

    return 1 - res

In [114]:
def remainder(dataset, col_idx, target_idx, impurity=None):
    """Return the weighted impurity of the target after splitting on a column.

    The rows are grouped by their value in column ``col_idx``; the impurity
    of the target labels in each group is weighted by the share of rows
    that fall into the group, and the weighted values are summed.

    Args:
        dataset: 2-D NumPy array (it is sliced as ``dataset[:, idx]``).
        col_idx: Index of the column to split on; its values must be hashable.
        target_idx: Index of the column holding the target labels.
        impurity: Optional callable that takes a list of labels and returns
            a number. Defaults to ``gini`` when omitted, which matches the
            previous hard-coded behaviour; pass ``entropy`` for an
            entropy-based remainder.

    Returns:
        The weighted sum of per-group impurities (``0`` for a dataset with
        no rows).
    """
    if impurity is None:
        # Looked up at call time so this definition does not require
        # ``gini`` to exist yet when the cell is executed.
        impurity = gini

    # Group the target labels by the value of the split column.
    partitions = {}
    for feature_value, label in zip(dataset[:, col_idx], dataset[:, target_idx]):
        partitions.setdefault(feature_value, []).append(label)

    total_rows = len(dataset)
    res = 0
    for labels in partitions.values():
        frac = float(len(labels)) / total_rows
        res += frac * impurity(labels)

    return res

In [115]:
def information_gain(dataset, col_idx, target_idx):
    """Return the drop in target impurity obtained by splitting on a column.

    The impurity before the split is the Gini impurity of the whole target
    column (computed with ``gini``); the impurity after the split is the
    value returned by ``remainder`` for column ``col_idx``.

    Args:
        dataset: 2-D NumPy array (it is sliced as ``dataset[:, target_idx]``).
        col_idx: Index of the column to split on.
        target_idx: Index of the column holding the target labels.

    Returns:
        Impurity before the split minus impurity after the split.
    """
    impurity_after_split = remainder(dataset, col_idx, target_idx)
    impurity_before_split = gini(dataset[:, target_idx])
    return impurity_before_split - impurity_after_split

In [107]:
def information_gain_ratio(dataset, col_idx, target_idx):
    """Return the gain of a split divided by the entropy of the split column.

    Args:
        dataset: 2-D NumPy array (it is sliced as ``dataset[:, col_idx]``).
        col_idx: Index of the column to split on.
        target_idx: Index of the column holding the target labels.

    Returns:
        ``information_gain(...) / entropy(dataset[:, col_idx])``. When the
        split column's entropy is zero (a single distinct value, or no rows)
        the ratio is undefined; ``0.0`` is returned instead of raising
        ``ZeroDivisionError``.
    """
    info_entropy = entropy(dataset[:, col_idx])

    # A constant column cannot separate the rows, so report "no gain"
    # rather than dividing by zero. (``-0.0 == 0`` is also True.)
    if info_entropy == 0:
        return 0.0

    info_gain = information_gain(dataset, col_idx, target_idx)
    return info_gain / info_entropy

In [3]:
# Load the example table from CSV; the bare name on the last line makes the
# notebook render the DataFrame as this cell's output.
dataset = pd.read_csv('Datasets/example_dataset.csv')
dataset

Unnamed: 0,id,stream,slope,elevation,vegetation
0,1,False,steep,high,chaparral
1,2,True,moderate,low,riparian
2,3,True,steep,medium,riparian
3,4,False,steep,medium,chaparral
4,5,False,flat,high,conifer
5,6,True,steep,highest,conifer
6,7,True,steep,high,chaparral


In [49]:
# Convert the DataFrame to a NumPy array: the helper functions slice columns
# with ``dataset[:, idx]``.
np_dataset = np.array(dataset)

In [83]:
# Inspect column 4, which is used as the target column further down.
np_dataset[:,4]

array(['chaparral', 'riparian', 'riparian', 'chaparral', 'conifer',
       'conifer', 'chaparral'], dtype=object)

In [98]:
# Print the number of rows, then display the whole array for inspection.
print(len(np_dataset))
np_dataset

7


array([[1, False, 'steep', 'high', 'chaparral'],
       [2, True, 'moderate', 'low', 'riparian'],
       [3, True, 'steep', 'medium', 'riparian'],
       [4, False, 'steep', 'medium', 'chaparral'],
       [5, False, 'flat', 'high', 'conifer'],
       [6, True, 'steep', 'highest', 'conifer'],
       [7, True, 'steep', 'high', 'chaparral']], dtype=object)

In [86]:
# Entropy (log base 2) of the target column before any split.
entropy(np_dataset[:,4])

1.5566567074628228

In [116]:
# Index of the target column; also read by the cells below.
target = 4

# Report the remainder for every candidate column. The range starts at 1,
# which skips column 0 (looks like the row id in the array dump - confirm),
# and the target column itself is skipped explicitly.
for idx in range(1, len(np_dataset[0])):
    if idx == target:
        continue
    print(f"idx {idx} : {remainder(np_dataset, idx, target)}")

idx 1 : 0.5476190476190476
idx 2 : 0.39999999999999997
idx 3 : 0.3333333333333333


In [117]:
# Report the information gain for every candidate column (column 0 and the
# target column are skipped). Relies on ``target`` set in an earlier cell.
for idx in range(1, len(np_dataset[0])):
    if idx == target:
        continue
    print(f"idx {idx} : {information_gain(np_dataset, idx, target)}")

idx 1 : 0.10544217687074842
idx 2 : 0.253061224489796
idx 3 : 0.31972789115646266


In [108]:
# Report the information gain ratio for every candidate column (column 0 and
# the target column are skipped). Relies on ``target`` set in an earlier cell.
# NOTE(review): this cell's execution count (108) is lower than those of the
# cells defining remainder/information_gain (114/115), so the recorded output
# may come from earlier definitions - re-run after Restart & Run All to confirm.
for idx in range(1, len(np_dataset[0])):
    if idx == target:
        continue
    print(f"idx {idx} : {information_gain_ratio(np_dataset, idx, target)}")

idx 1 : 0.31054583367826694
idx 2 : 0.5026016408718359
idx 3 : 0.4762271375015451


In [110]:
# Gini impurity of the target column before any split.
gini(np_dataset[:,target])

0.653061224489796