In [8]:
import numpy as pd
import pandas as pd
from sklearn import datasets
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.express as px
from scipy.stats import entropy
import numpy as np

In [4]:
df = pd.read_csv('adult.csv', encoding='ISO-8859-1')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   education       32561 non-null  object
 3   education-num   32561 non-null  int64 
 4   marital-status  32561 non-null  object
 5   occupation      32561 non-null  object
 6   relationship    32561 non-null  object
 7   race            32561 non-null  object
 8   sex             32561 non-null  object
 9   hours-per-week  32561 non-null  int64 
 10  native-country  32561 non-null  object
 11  income          32561 non-null  object
dtypes: int64(3), object(9)
memory usage: 3.0+ MB


### Entropy
$$ H(x) = -\sum_x P(x)log_2P(x) $$


In [9]:
ent = entropy(df['education'].value_counts(), base=2)
print(ent)

2.9313508978037115


In [10]:
def my_entropy(df):
    val, cont = np.unique(df, return_counts=True)
    p = cont/np.sum(cont)
    plogp = np.log2(p)*p
    h = -np.sum(plogp)
    return h



### Mutual Information
$$ IM(X,Y) = \sum_x \sum_y P(x,y) \left| log_2 \left  (\frac{P(x,y)}{P(x)P(y)} \right) \right |$$

In [13]:
def my_mutual_information(X, Y):
    n = len(X)
    valx, contX = np.unique(X, return_counts=True)
    valy, contY = np.unique(Y, return_counts=True)
    px = contX/np.sum(contX)
    py = contY/np.sum(contY)
    im = 0
    for ix, x in enumerate(valx):
        for iy, y in enumerate(valy):
            pxy = np.sum(np.logical_and(X==x, Y==y))
            if pxy == 0: continue
            pxy = pxy/n
            im += pxy*np.log2(pxy/(px[ix]*py[iy]))
            return im



In [14]:
df = pd.read_csv('adult.csv', encoding='ISO-8859-1')
h = my_entropy(df['education'].values)
h2 = my_entropy(df['relationship'].values)
im = my_mutual_information(df['education'].values, df['relationship'].values)
print('H(education) = ', h)
print('H(relationship) = ', h2)
print('IM(education, relationship) = ', im)


H(education) =  2.9313508978037115
H(relationship) =  2.1544237955049743
IM(education, relationship) =  -0.002617950007341875


In [16]:
ent1 = entropy(df['education'].value_counts(), base=2)
ent2 = entropy(df['relationship'].value_counts(), base=2)

entropyjoint = entropy(pd.crosstab(df['education'], df['relationship']).values.flatten(), base=2)
im = ent1 + ent2 - entropyjoint
print('H(education) = ', ent1)
print('H(relationship) = ', ent2)
print('IM(education, relationship) = ', im)


H(education) =  2.9313508978037115
H(relationship) =  2.154423795504974
IM(education, relationship) =  0.054353269720570196
