In [1]:
from sklearn.metrics import mutual_info_score
#from sklearn.feature_selection import chi2
from scipy.stats import chi2_contingency
from scipy.stats import chi2
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde, pearsonr
import seaborn as sns
import pandas as pd

# Mathematical Underpinnings - Lab 4

Tests to verify hipoteses of independence (from Lab 4):

In [2]:
def indep_test_asymptotic(X, Y, stat):
    if stat == "mi":
        stat_value = 2*len(X)*mutual_info_score(X, Y)
    if stat == "chi2":
        test_res = (chi2_contingency(pd.crosstab(X, Y)))
        stat_value = test_res.statistic
    df = (len(np.unique(X)) - 1)*(len(np.unique(Y)) - 1)
    p_value = 1 - chi2.cdf(stat_value, df=df)
    return stat_value, p_value

def indep_test_permutation(X, Y, B, stat="mi"):
    stat_value = mutual_info_score(X, Y)
    condition_p_value = 0
    for b in range(B):
        X_b = np.random.permutation(X)
        stat_value_b = mutual_info_score(X_b, Y)
        if stat_value <= stat_value_b:
            condition_p_value += 1
    p_value = (1 + condition_p_value)/(1 + B)
    return 2*len(X)*stat_value, p_value

## Task 1

In [3]:
def CMI(X, Y, Z):
    stat_value = 0
    for z in np.unique(Z):
        X_z = X[Z == z]
        Y_z = Y[Z == z]
        stat_value += mutual_info_score(X_z, Y_z) * len(X_z) / len(Z)
    return stat_value

### a)

In [4]:
# CI test based on CMI and asymptotics
def cond_indep_test_asymptotic(X, Y, Z):
    stat_value = CMI(X, Y, Z)
    stat_value = 2 * len(X) * stat_value
    df = (len(np.unique(X)) - 1)*(len(np.unique(Y)) - 1)*(len(np.unique(Z)))
    p_value = 1 - chi2.cdf(stat_value, df=df)
    return stat_value, p_value

### b)

In [14]:
# CI test based on CMI and permutations
def cond_indep_test_permutation(X, Y, Z, B):
    stat_value = CMI(X, Y, Z)
    condition_p_value = 0
    for b in range(B):
        stat_value_b = 0
        for z in np.unique(Z):
            X_b = np.random.permutation(X[Z == z])
            p_z = (Z == z).sum() / len(Z)
            stat_value_b += mutual_info_score(X_b, Y[Z == z]) * p_z


        if stat_value <= stat_value_b:
            condition_p_value += 1
    p_value = (1 + condition_p_value)/(1 + B)
    return 2*len(X)*stat_value, p_value

### c)

conditionaly independent

In [15]:
n = 1000
X = np.random.normal(0, 1, n)
Y = np.random.normal(0, 1, n)
Z = np.random.normal(0, 1, n)
X = pd.cut(X, bins=10, labels=False)
Y = pd.cut(Y, bins=10, labels=False)
Z = pd.cut(Z, bins=10, labels=False)

In [16]:
print(cond_indep_test_asymptotic(X, Y, Z))
print(cond_indep_test_permutation(X, Y, Z, 100))

(437.55559451925757, 1.0)


(437.55559451925757, 0.6237623762376238)


conditionaly dependent

In [17]:
data_2d = np.random.multivariate_normal(np.zeros(2), np.array([[1, 0.93], [0.93, 1]]), n).T
X, Y = data_2d[0], data_2d[1]
Z = np.random.normal(0, 1, n)
X = pd.cut(X, bins=10, labels=False)
Y = pd.cut(Y, bins=10, labels=False)
Z = pd.cut(Z, bins=10, labels=False)

In [18]:
print(cond_indep_test_asymptotic(X, Y, Z))
print(cond_indep_test_permutation(X, Y, Z, 100))

(1756.6286607736904, 0.0)


(1756.6286607736904, 0.009900990099009901)


## Task 2

In [20]:
def sample_from_model1():
    n = 1000
    Z = np.random.normal(0, 1, n)
    Z_disc = np.where(Z < 0, -1, 1)
    X = np.random.normal(Z / 2, 1)
    X_disc = np.where(X < 0, -1, 1)
    Y = np.random.normal(Z / 2, 1)
    Y_disc = np.where(Y < 0, -1, 1)
    return X_disc, Y_disc, Z_disc

def sample_from_model2():
    n = 1000
    X = np.random.normal(0, 1, n)
    X_disc = np.where(X < 0, -1, 1)
    Z = np.random.normal(X / 2, 1)
    Z_disc = np.where(Z < 0, -1, 1)
    Y = np.random.normal(Z / 2, 1)
    Y_disc = np.where(Y < 0, -1, 1)
    return X_disc, Y_disc, Z_disc

def sample_from_model3():
    X = np.random.normal(0, 1, n)
    X_disc = np.where(X < 0, -1, 1)
    Y = np.random.normal(0, 1, n)
    Y_disc = np.where(Y < 0, -1, 1)
    Z = np.random.normal((X+Y) / 2, 1)
    Z_disc = np.where(Z < 0, -1, 1)
    return X_disc, Y_disc, Z_disc

### a)

Model 1 : X and Y are dependent but X and Y are independent given Z

Model 2 : X and Y are dependent but X and Y are independent given Z

Model 3 : X and Y are independent but X and Y are dependent given Z

### b)

In [24]:
X, Y, Z = sample_from_model1()

print("MI:")
print(mutual_info_score(X, Y))
print("CMI:")
print(CMI(X, Y, Z))

MI:
0.00802203511600591
CMI:
0.002213018871598088


In [25]:
X, Y, Z = sample_from_model2()

print("MI:")
print(mutual_info_score(X, Y))
print("CMI:")
print(CMI(X, Y, Z))

MI:
0.008616346741386838
CMI:
0.0007124620448937685


In [26]:
X, Y, Z = sample_from_model3()

print("MI:")
print(mutual_info_score(X, Y))
print("CMI:")
print(CMI(X, Y, Z))

MI:
0.0004745795908426831
CMI:
0.004341929964913568


### c)

In [30]:
B = 100
X, Y, Z = sample_from_model1()

print('MI test:', indep_test_asymptotic(X, Y, 'mi'))
print('Permutation test:', indep_test_permutation(X, Y, B))
print('CMI test:', cond_indep_test_asymptotic(X, Y, Z))
print('Cond permutation test:', cond_indep_test_permutation(X, Y, Z, B))

MI test: (14.371411624198638, 0.00015006342142187545)
Permutation test: (14.371411624198638, 0.009900990099009901)
CMI test: (2.0736842041485155, 0.35457262000386236)
Cond permutation test: (2.0736842041485155, 0.36633663366336633)


In [34]:
X, Y, Z = sample_from_model2()

print('MI test:', indep_test_asymptotic(X, Y, 'mi'))
print('Permutation test:', indep_test_permutation(X, Y, B))
print('CMI test:', cond_indep_test_asymptotic(X, Y, Z))
print('Cond permutation test:', cond_indep_test_permutation(X, Y, Z, B))

MI test: (18.553855017263476, 1.6517084729272824e-05)
Permutation test: (18.553855017263476, 0.009900990099009901)
CMI test: (1.7111259458589545, 0.42504383554)
Cond permutation test: (1.7111259458589545, 0.3564356435643564)


In [32]:
X, Y, Z = sample_from_model3()

print('MI test:', indep_test_asymptotic(X, Y, 'mi'))
print('Permutation test:', indep_test_permutation(X, Y, B))
print('CMI test:', cond_indep_test_asymptotic(X, Y, Z))
print('Cond permutation test:', cond_indep_test_permutation(X, Y, Z, B))

MI test: (0.8030438463001444, 0.3701848648823415)
Permutation test: (0.8030438463001444, 0.43564356435643564)
CMI test: (6.097485400227894, 0.04741850620963939)
Cond permutation test: (6.097485400227894, 0.0891089108910891)
