In [1]:
import numpy as np

In [2]:
from collections import Counter

**Build function to calculate entropy**

In [3]:
def entropy(elements):
    counter = Counter(elements)
    probabilities = [counter[e] / len(elements) for e in counter.keys()]
    return -sum(p * np.log10(p) for p in probabilities)

Test

In [4]:
entropy([1,1,1,1])

-0.0

In [5]:
entropy([1,1,1,0])

0.24421905028821556

In [6]:
entropy([1,1,0,0])

0.3010299956639812

**Generate mock data**

In [7]:
import pandas as pd

In [8]:
mock_data = {
    'gender': ['F', 'F', 'F', 'F', 'M', 'M', 'M'],
    'income': ['+10', '-10', '+10', '+10', '+10', '+10', '-10'],
    'family_number': [1, 1, 2, 1, 1, 1, 2],
    'bought': [1, 1, 1, 0, 0, 0, 1]
}

In [9]:
dataset = pd.DataFrame.from_dict(mock_data)

In [10]:
dataset

Unnamed: 0,gender,income,family_number,bought
0,F,10,1,1
1,F,-10,1,1
2,F,10,2,1
3,F,10,1,0
4,M,10,1,0
5,M,10,1,0
6,M,-10,2,1


In [11]:
set(mock_data['family_number'])

{1, 2}

In [12]:
sub_split_1 = dataset[dataset['family_number'] == 1]['bought'].tolist()
sub_split_1

[1, 1, 0, 0, 0]

In [13]:
sub_split_2 = dataset[dataset['family_number'] != 1]['bought'].tolist()
sub_split_2

[1, 1]

In [14]:
entropy(sub_split_1) + entropy(sub_split_2)

0.29228525323862886

In [15]:
dataset.columns.tolist()

['gender', 'income', 'family_number', 'bought']

**Build function to get the Minimum entropy spliter**

In [16]:
def find_min_spliter(training_data:pd.DataFrame, target: str) -> str:
    x_fields = set(training_data.columns.tolist()) - {target}
    
    spliter = None
    min_entropy = float('inf')
    
    for f in x_fields:
        elements = set(training_data[f])
        for e in elements:
            sub_spliter_1 = training_data[training_data[f] == e][target].tolist()
            entropy_1 = entropy(sub_spliter_1)
            
            sub_spliter_2 = training_data[training_data[f] != e][target].tolist()
            entropy_2 = entropy(sub_spliter_2)
            
            entropy_v = entropy_1 + entropy_2
            
            if entropy_v < min_entropy:
                min_entropy = entropy_v
                spliter = (f, e)
    
    print('spliter is: {}'.format(spliter))
    print('the min entropy is: {}'.format(min_entropy))
    
    return spliter

Test

In [17]:
find_min_spliter(dataset, 'bought')

spliter is: ('family_number', 1)
the min entropy is: 0.29228525323862886


('family_number', 1)

In [18]:
dataset[dataset['family_number'] == 1]

Unnamed: 0,gender,income,family_number,bought
0,F,10,1,1
1,F,-10,1,1
3,F,10,1,0
4,M,10,1,0
5,M,10,1,0


In [19]:
dataset[dataset['family_number'] != 1]

Unnamed: 0,gender,income,family_number,bought
2,F,10,2,1
6,M,-10,2,1
