In [1]:
from sklearn.metrics import mutual_info_score
#from sklearn.feature_selection import chi2
from scipy.stats import chi2_contingency
from scipy.stats import chi2
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde, pearsonr
import seaborn as sns
import pandas as pd

# Mathematical Underpinnings - Lab 4

Tests to verify hipoteses of independence (from Lab 4):

In [2]:
def indep_test_asymptotic(X, Y, stat):

    if stat == "mi":

        stat_value = 2*len(X)*mutual_info_score(X, Y)

    if stat == "chi2":

        test_res = (chi2_contingency(pd.crosstab(X, Y)))
        stat_value = test_res.statistic

    df = (len(np.unique(X)) - 1)*(len(np.unique(Y)) - 1)

    p_value = 1 - chi2.cdf(stat_value, df=df)

    return stat_value, p_value

def indep_test_permutation(X, Y, B=100, stat="mi"):
    X = X.copy()
    Y = Y.copy()

    stat_value = mutual_info_score(X, Y)

    condition_p_value = 0
    for b in range(B):
        X_b = np.random.permutation(X)

        stat_value_b = mutual_info_score(X_b, Y)

        if stat_value <= stat_value_b:
            condition_p_value += 1

    p_value = (1 + condition_p_value)/(1 + B)

    return 2*len(X)*stat_value, p_value

## Task 1

In [4]:
# a function which computes CMI

def CMI(X, Y, Z):
    mi = 0
    for z in np.unique(Z):
        X_z = X[Z == z]
        Y_z = Y[Z == z]
        mi += mutual_info_score(X_z, Y_z) * len(X_z) / len(X)
    return mi

### a)

In [5]:
def cond_indep_test_asymptotic(X, Y, Z):
    X = X.copy()
    Y = Y.copy()
    Z = Z.copy()
    stat_value = 2 * len(X) * CMI(X, Y, Z)

    df = (len(np.unique(X)) - 1) * (len(np.unique(Y)) - 1) * len(np.unique(Z))

    p_value = 1 - chi2.cdf(stat_value, df=df)

    return stat_value, p_value

### b)

In [6]:
def cond_indep_test_permutation(X, Y, Z, B = 1000):
    X = X.copy()
    Y = Y.copy()
    Z = Z.copy()
    stat_value = CMI(X, Y, Z)

    condition_p_value = 0
    for b in range(B):
        for z in np.unique(Z):
            X[Z == z] = np.random.permutation(X[Z == z])

        stat_value_b = CMI(X, Y, Z)
        if stat_value <= stat_value_b:
            condition_p_value += 1

    p_value = (1 + condition_p_value)/(1 + B)
           
    return stat_value, p_value

### c)

conditionaly independent

In [7]:
N = 10000
rng = np.random.default_rng(2137)

Z = rng.binomial(1, 0.5, N)
X = rng.normal(0, 1, N) + Z/2
X = np.where(X>=0, 1, -1)
Y = rng.normal(0, 1, N) - Z/2
Y = np.where(Y>=0, 1, -1)

print("Asymptotic test ", cond_indep_test_asymptotic(X, Y, Z))
print("Permutation test ", cond_indep_test_permutation(X, Y, Z))

Asymptotic test  (0.07491517046116947, 0.9632352722563632)
Permutation test  (3.7457585230584734e-06, 0.971028971028971)


conditionaly dependent

In [9]:
X = rng.normal(0, 1, N)
X = np.where(X>=0, 1, 0)
Z = rng.normal(0, 1, N)
Z = np.where(Z>=0, 1, 0)

Y = np.logical_xor(X, Z).astype(int)

print("Asymptotic test ", cond_indep_test_asymptotic(X, Y, Z))
print("Permutation test ", cond_indep_test_permutation(X, Y, Z))

Asymptotic test  (13862.757041752793, 0.0)
Permutation test  (0.6931378520876397, 0.000999000999000999)


## Task 2

In [11]:
def sample_from_model1():
    n = 10000
    Z_tilde = np.random.normal(0,1,n)
    Z = (Z_tilde>0)*2-1

    X_tilde = np.random.normal(0,1,n)+Z/2
    X = (X_tilde>0)*2-1

    Y_tilde = np.random.normal(0,1,n)+Z/2
    Y = (Y_tilde>0)*2-1
    return X, Y, Z

def sample_from_model2():
    n = 10000
    X_tilde = np.random.normal(0,1,n)
    X = (X_tilde>0)*2-1

    Z_tilde = np.random.normal(0,1,n)+X/2
    Z = (Z_tilde>0)*2-1

    Y_tilde = np.random.normal(0,1,n)+X/2
    Y = (Y_tilde>0)*2-1
    return X, Y, Z
    

def sample_from_model3():
    n = 10000
    X_tilde = np.random.normal(0,1,n)
    X = (X_tilde>0)*2-1

    Y_tilde = np.random.normal(0,1,n)
    Y = (Y_tilde>0)*2-1

    Z_tilde = np.random.normal(0,1,n)+(X+Y)/2
    Z = (Z_tilde>0)*2-1
    return X, Y, Z

### a)

answer:

model1 - conditionally independent through Z, dependent

model2 - conditionally independent through Z, dependent

model3 - conditionally dependent through Z, independent

### b)

In [16]:
X1, Y1, Z1 = sample_from_model1()
X2, Y2, Z2 = sample_from_model2()
X3, Y3, Z3 = sample_from_model3()

MI(X,Y):

In [17]:
print("Model 1:", mutual_info_score(X1, Y1))
print("Model 2:", mutual_info_score(X2, Y2))
print("Model 3:", mutual_info_score(X3, Y3))

Model 1: 0.01057618077409661
Model 2: 0.08051402009271016
Model 3: 2.8786361938037963e-05


CMI(X,Y|Z)

In [25]:
print("Model 1:", CMI(X1, Y1, Z1))
print("Model 2:", CMI(X2, Y2, Z2))
print("Model 3:", CMI(X3, Y3, Z3))

Model 1: 9.527194210050589e-05
Model 2: 0.06564770345830997
Model 3: 0.007790428419281891


### c)

permutation test CMI(X,Y|Z):

In [21]:
print("Model 1:", cond_indep_test_permutation(X1, Y1, Z1))
print("Model 2:", cond_indep_test_permutation(X2, Y2, Z2))
print("Model 3:", cond_indep_test_permutation(X3, Y3, Z3))

Model 1: (9.527194210050589e-05, 0.3866133866133866)
Model 2: (0.06564770345830997, 0.000999000999000999)
Model 3: (0.007790428419281891, 0.000999000999000999)


asymptotic test CMI(X,Y|Z):

In [22]:
print("Model 1:", cond_indep_test_asymptotic(X1, Y1, Z1))
print("Model 2:", cond_indep_test_asymptotic(X2, Y2, Z2))
print("Model 3:", cond_indep_test_asymptotic(X3, Y3, Z3))

Model 1: (1.9054388420101178, 0.3856907405197967)
Model 2: (1312.9540691661996, 0.0)
Model 3: (155.8085683856378, 0.0)


permutation test MI(X,Y):

In [24]:
print("Model 1:", indep_test_permutation(X1, Y1))
print("Model 2:", indep_test_permutation(X2, Y2))
print("Model 3:", indep_test_permutation(X3, Y3))

Model 1: (211.5236154819322, 0.009900990099009901)
Model 2: (1610.2804018542033, 0.009900990099009901)
Model 3: (0.5757272387607593, 0.45544554455445546)


asymptotic test MI(X,Y):

In [None]:
print("Model 1:", indep_test_asymptotic(X1, Y1, "mi"))
print("Model 2:", indep_test_asymptotic(X2, Y2, "mi"))
print("Model 3:", indep_test_asymptotic(X3, Y3, "mi"))

asymptotic test CHI^2:

In [26]:
print("Model 1:", indep_test_asymptotic(X1, Y1, "chi2"))
print("Model 2:", indep_test_asymptotic(X2, Y2, "chi2"))
print("Model 3:", indep_test_asymptotic(X3, Y3, "chi2"))

Model 1: (210.19723533339362, 0.0)
Model 2: (1564.99328003871, 0.0)
Model 3: (0.5457670728269652, 0.46005244265441125)
