In [220]:
# Imports and get the dataframe
import pandas as pd
import numpy as np

df = pd.read_csv('ml-bugs.csv')

In [221]:
df.head()

Unnamed: 0,Species,Color,Length (mm)
0,Mobug,Brown,11.6
1,Mobug,Blue,16.3
2,Lobug,Blue,15.1
3,Lobug,Green,23.7
4,Lobug,Blue,18.4


## Fixing columns

In [222]:
df.rename(columns=lambda x: x.strip().lower().replace(" ", "_"), inplace=True)

In [223]:
df.head()

Unnamed: 0,species,color,length_(mm)
0,Mobug,Brown,11.6
1,Mobug,Blue,16.3
2,Lobug,Blue,15.1
3,Lobug,Green,23.7
4,Lobug,Blue,18.4


## Formula
$$entropy = -\frac{m}{m+n}\log_2\bigg(\frac{m}{m+n}\bigg)-\frac{n}{m+n}\log_2\bigg(\frac{n}{m+n}\bigg)$$


This entropy equation can be extended to the multi-class case, where we have three or more possible values:

$$p_1 = \frac{m}{m+n}$$

$$p_2 = \frac{n}{m+n}$$

$$entropy = -p_1\log_2(p_1)-p_2\log_2(p_2)- ... -p_n\log_2(p_n)= -\sum_{i=1}^{n}p_i\log_2(p_i)$$


In [224]:
species = df['species'].value_counts()
species

Lobug    14
Mobug    10
Name: species, dtype: int64

In [225]:
ps = [(specie/df.shape[0]) for specie in species]
ps

[0.5833333333333334, 0.4166666666666667]

In [226]:
entropy = 0
for p in ps:
    entropy += p*np.log2(p)
    
entropy = -1*entropy
entropy

0.9798687566511528

## Color = Blue

In [227]:
blues = df.query('color == "Blue"')

In [228]:
blues_spec = blues['species'].value_counts()
blues_spec

Lobug    6
Mobug    4
Name: species, dtype: int64

In [229]:
ps = [(blue/blues.shape[0]) for blue in blues_spec]
ps

[0.6, 0.4]

In [230]:
blue_ent = 0
for p in ps:
    blue_ent += p*np.log2(p)
    
blue_ent = -1*blue_ent
blue_ent

0.9709505944546686

In [231]:
entropy - blue_ent

0.008918162196484225

## Color = Green

In [232]:
greens = df.query('color == "Green"')

In [233]:
green_spec = greens['species'].value_counts()
green_spec

Lobug    6
Mobug    2
Name: species, dtype: int64

In [234]:
ps = [(green/greens.shape[0]) for green in green_spec]
ps

[0.75, 0.25]

In [235]:
green_ent = 0
for p in ps:
    green_ent += p*np.log2(p)
    
green_ent = -1*green_ent
green_ent

0.8112781244591328

In [236]:
entropy - green_ent

0.16859063219201997

## Color = Brown

In [237]:
browns = df.query('color == "Brown"')

In [238]:
brown_spec = browns['species'].value_counts()
brown_spec

Mobug    4
Lobug    2
Name: species, dtype: int64

In [239]:
ps = [(brown/browns.shape[0]) for brown in brown_spec]
ps

[0.6666666666666666, 0.3333333333333333]

In [240]:
brown_ent = 0
for p in ps:
    brown_ent += p*np.log2(p)
    
brown_ent = -1*brown_ent
brown_ent

0.9182958340544896

In [241]:
mean = (blue_ent+green_ent+brown_ent)/3
entropy - mean

0.07969390566172252

## Length < 17.0mm

In [242]:
less17s = df[df['length_(mm)'] < 17.0]

In [243]:
less17_spec = less17s['species'].value_counts()
less17_spec

Mobug    6
Lobug    3
Name: species, dtype: int64

In [244]:
ps = [(less17/less17s.shape[0]) for less17 in less17_spec]
ps

[0.6666666666666666, 0.3333333333333333]

In [245]:
less17_ent = 0
for p in ps:
    less17_ent += p*np.log2(p)
    
less17_ent = -1*less17_ent
less17_ent

0.9182958340544896

In [246]:
entropy - less17_ent

0.06157292259666325

## Length < 20.0mm

In [247]:
less20s = df[df['length_(mm)'] < 20.0]


In [248]:
less20_spec = less20s['species'].value_counts()
less20_spec

Mobug    9
Lobug    8
Name: species, dtype: int64

In [249]:
ps = [(less20/less20s.shape[0]) for less20 in less20_spec]
ps

[0.5294117647058824, 0.47058823529411764]

In [250]:
less20_ent = 0
for p in ps:
    less20_ent += p*np.log2(p)
    
less20_ent = -1*less20_ent
less20_ent

0.9975025463691153

In [251]:
mean = (less17_ent+less20_ent)/2
entropy - mean

0.02196956643935044