In [1]:
from sklearn.metrics import mutual_info_score
#from sklearn.feature_selection import chi2
from scipy.stats import chi2_contingency
from scipy.stats import chi2
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde, pearsonr
import seaborn as sns
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# Mathematical Underpinnings - Lab 5

Tests to verify hipoteses of independence (from Lab 4):

In [2]:
def indep_test_asymptotic(X, Y, stat):
    assert stat in ["mi", "chi2"]
    if stat == "mi":
        stat_value = 2*len(X)*mutual_info_score(X, Y)
    if stat == "chi2":
        test_res, _, _, _ = (chi2_contingency(pd.crosstab(X, Y)))
        stat_value = test_res
    
    df = (len(np.unique(X)) - 1)*(len(np.unique(Y)) - 1)
    p_value = 1 - chi2.cdf(stat_value, df=df)
    return stat_value, p_value

def indep_test_permutation(X, Y, B, stat="mi"):
    stat_value = mutual_info_score(X, Y)
    condition_p_value = 0
    
    for b in range(B):
        X_b = np.random.permutation(X)
        stat_value_b = mutual_info_score(X_b, Y)
        if stat_value <= stat_value_b:
            condition_p_value += 1

    p_value = (1 + condition_p_value)/(1 + B)
    return 2*len(X)*stat_value, p_value

## Task 1

### a)

In [3]:
# a function which computes CMI
def cond_mutual_info_score(X, Y, Z):
    stat_value = 0
    for Z_val in np.unique(Z):
        stat_value += 2 * len(Z[Z==Z_val]) * mutual_info_score(X[Z==Z_val], Y[Z==Z_val])
    return stat_value

# CI test based on CMI and asymptotics
def cond_indep_test_asymptotic(X, Y, Z):
    stat_value = cond_mutual_info_score(X, Y, Z)
    df = (len(np.unique(X)) - 1) * (len(np.unique(Y)) - 1) * (len(np.unique(Z)))
    p_value = 1 - chi2.cdf(stat_value, df=df)
    return stat_value, p_value

### b)

In [4]:
def cond_permutation(X, Z):
    X_b = np.copy(X)
    for Z_val in np.unique(Z):
        X_b[Z==Z_val] = np.random.permutation(X[Z==Z_val])
    return X_b

# CI test based on CMI and permutations
def cond_indep_test_permutation(X, Y, Z, B):
    stat_value = cond_mutual_info_score(X, Y, Z)
    condition_p_value = 0
    
    for b in range(B):
        X_b = cond_permutation(X, Z)
        stat_value_b = cond_mutual_info_score(X_b, Y, Z)
        if stat_value <= stat_value_b:
            condition_p_value += 1

    p_value = (1 + condition_p_value)/(1 + B)
    return 2*len(X)*stat_value, p_value

### c)

In [31]:
n = 1000
B = 20

def discretized(X):
    return np.where(X<0, -1, 1)

conditionally independent

In [32]:
Z = np.random.normal(size=n)
Z_d = discretized(Z)

X = np.random.normal(size=n) + Z_d/2
Y = np.random.normal(size=n) + Z_d/2

X_d = discretized(X)
Y_d = discretized(Y)

In [33]:
cond_indep_test_asymptotic(X_d, Y_d, Z_d)

(3.182062897051034, 0.20371538140219647)

In [34]:
cond_indep_test_permutation(X_d, Y_d, Z_d, B)

(6364.1257941020685, 0.19047619047619047)

conditionally dependent

In [35]:
X = np.random.normal(size=n)
Y = np.random.normal(size=n)

X_d = discretized(X)
Y_d = discretized(Y)

Z = np.random.normal(size=n) + X_d/2 + Y_d/2
Z_d = discretized(Z)

In [36]:
cond_indep_test_asymptotic(X_d, Y_d, Z_d)

(18.78446912694103, 8.336895471938721e-05)

In [37]:
cond_indep_test_permutation(X_d, Y_d, Z_d, B)

(37568.93825388206, 0.047619047619047616)

## Task 2

In [38]:
n = 1000

def sample_from_model1():
    Z = np.random.normal(size=n)
    Z_d = discretized(Z)
    
    X = np.random.normal(size=n) + Z_d/2
    Y = np.random.normal(size=n) + Z_d/2
    
    X_d = discretized(X)
    Y_d = discretized(Y)
    return X_d, Y_d, Z_d

def sample_from_model2():
    X = np.random.normal(size=n)
    X_d = discretized(X)
    
    Z = np.random.normal(size=n) + X_d/2
    Z_d = discretized(Z)
    
    Y = np.random.normal(size=n) + Z_d/2
    Y_d = discretized(Y)
    return X_d, Y_d, Z_d

def sample_from_model3():
    X = np.random.normal(size=n)
    Y = np.random.normal(size=n)
    
    X_d = discretized(X)
    Y_d = discretized(Y)
    
    Z = np.random.normal(size=n) + X_d/2 + Y_d/2
    Z_d = discretized(Z)
    return X_d, Y_d, Z_d

### a)

answer: A and B conditionally independent, C only **un**conditionally independent

### b) / c)

In [39]:
def uncond_mutual_info_score(X, Y):
    return 2 * len(X) * mutual_info_score(X, Y)

In [40]:
def stats(X, Y, Z):
    print('Mutual information:', uncond_mutual_info_score(X, Y))
    print('Conditional mutual information:', cond_mutual_info_score(X, Y, Z))
    print('Asymptotic independence test (MI):', indep_test_asymptotic(X, Y, "mi"))
    print('Asymptotic independence test (chi^2):', indep_test_asymptotic(X, Y, "chi2"))
    print('Permutation-based independence test:', indep_test_permutation(X, Y, 20))
    print('Asymptotic conditional independence test:', cond_indep_test_asymptotic(X, Y, Z))
    print('Permutation-based conditional independence test:', cond_indep_test_permutation(X, Y, Z, 20))

In [41]:
X, Y, Z = sample_from_model1()
stats(X, Y, Z)

Mutual information: 20.54861099593608
Conditional mutual information: 0.8303374053630881
Asymptotic independence test (MI): (20.54861099593608, 5.813595993831022e-06)
Asymptotic independence test (chi^2): (19.911734891201323, 8.110098651759046e-06)
Permutation-based independence test: (20.54861099593608, 0.047619047619047616)
Asymptotic conditional independence test: (0.8303374053630881, 0.6602288889252066)
Permutation-based conditional independence test: (1660.6748107261762, 0.7619047619047619)


In [42]:
X, Y, Z = sample_from_model2()
stats(X, Y, Z)

Mutual information: 27.016540342340356
Conditional mutual information: 1.7343962217897246
Asymptotic independence test (MI): (27.016540342340356, 2.0172191228517988e-07)
Asymptotic independence test (chi^2): (26.242552250692754, 3.0111385440623195e-07)
Permutation-based independence test: (27.016540342340356, 0.047619047619047616)
Asymptotic conditional independence test: (1.7343962217897246, 0.42012705107290393)
Permutation-based conditional independence test: (3468.7924435794494, 0.3333333333333333)


In [43]:
X, Y, Z = sample_from_model3()
stats(X, Y, Z)

Mutual information: 0.289005302558043
Conditional mutual information: 15.352533538990869
Asymptotic independence test (MI): (0.289005302558043, 0.59085866192837)
Asymptotic independence test (chi^2): (0.22488921257701913, 0.6353395713799781)
Permutation-based independence test: (0.289005302558043, 0.5238095238095238)
Asymptotic conditional independence test: (15.352533538990869, 0.0004637027808503946)
Permutation-based conditional independence test: (30705.067077981737, 0.047619047619047616)
