In [1]:
from sklearn.metrics import mutual_info_score
#from sklearn.feature_selection import chi2
from scipy.stats import chi2_contingency
from scipy.stats import chi2
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde, pearsonr
import seaborn as sns
import pandas as pd

# Mathematical Underpinnings - Lab 4

Tests to verify hipoteses of independence (from Lab 4):

In [2]:
def indep_test_asymptotic(X, Y, stat):

    if stat == "mi":

        stat_value = 2*len(X)*mutual_info_score(X, Y)

    if stat == "chi2":

        test_res = (chi2_contingency(pd.crosstab(X, Y)))
        stat_value = test_res.statistic

    df = (len(np.unique(X)) - 1)*(len(np.unique(Y)) - 1)

    p_value = 1 - chi2.cdf(stat_value, df=df)

    return stat_value, p_value

def indep_test_permutation(X, Y, B, stat="mi"):

    stat_value = mutual_info_score(X, Y)

    condition_p_value = 0
    for b in range(B):
        X_b = np.random.permutation(X)

        stat_value_b = mutual_info_score(X_b, Y)

        if stat_value <= stat_value_b:
            condition_p_value += 1

    p_value = (1 + condition_p_value)/(1 + B)

    return 2*len(X)*stat_value, p_value

## Task 1

In [3]:
# a function which computes CMI

def cond_mutual_info_score(X, Y, Z):
    return np.sum([
        (Z == z).sum() / Z.shape[0] * mutual_info_score(X[Z == z], Y[Z == z])
        for z in np.unique(Z)
    ])

### a)

In [4]:
# CI test based on CMI and asymptotics
def cond_indep_test_asymptotic(X, Y, Z, stat="mi"):
    stat_value = 2*len(X)*cond_mutual_info_score(X, Y, Z)
    df = (len(np.unique(X)) - 1)*(len(np.unique(Y)) - 1)* len(np.unique(Z))
    p_value = 1 - chi2.cdf(stat_value, df=df)
    return stat_value, p_value

### b)

In [5]:
# CI test based on CMI and permutations

def cond_indep_test_permutation(X, Y, Z, B, stat="mi"):
    stat_value = cond_mutual_info_score(X, Y, Z)

    condition_p_value = 0
    for b in range(B):
        # X_b = np.random.permutation(X)
        X_b = np.empty_like(X)
        for z in np.unique(Z):
            X_b[Z == z] = np.random.permutation(X[Z == z])
        stat_value_b = cond_mutual_info_score(X_b, Y, Z)

        if stat_value <= stat_value_b:
            condition_p_value += 1

    p_value = (1 + condition_p_value)/(1 + B)

    return 2*len(X)*stat_value, p_value

### c)

conditionaly independent

In [6]:
n = 1000

Z = np.random.binomial(1, 0.5, n)

X = np.random.normal(0, 1, n) + Z/2
X = np.where(X > 0, 1, -1)

Y = np.random.normal(0, 1, n) - Z/2
Y = np.where(Y > 0, 1, -1)

print("Asymptotic test ", cond_indep_test_asymptotic(X, Y, Z))
print("Permutation test ", cond_indep_test_permutation(X, Y, Z, 100))

Asymptotic test  (0.4041905804631041, 0.8170170704869395)
Permutation test  (0.4041905804631041, 0.7920792079207921)


conditionaly dependent

In [7]:
X = np.random.normal(0, 1, n)
X = np.where(X > 0, 1, 0)

Z = np.random.normal(0, 1, n)
Z = np.where(Z > 0, 1, 0)

Y = np.logical_xor(X, Z).astype(int)

print("Asymptotic test ", cond_indep_test_asymptotic(X, Y, Z))
print("Permutation test ", cond_indep_test_permutation(X, Y, Z, 100))

Asymptotic test  (1385.3311248026, 0.0)
Permutation test  (1385.3311248026, 0.009900990099009901)


## Task 2

In [8]:
n = 1000

def sample_from_model1():
    """
     (Y) <- (Z) -> (X)
    """
    Z_hat = np.random.standard_normal(n)
    Z = np.where(Z_hat > 0, 1, -1)

    X_hat = np.random.standard_normal(n) + Z / 2
    X = np.where(X_hat > 0, 1, -1)

    Y_hat = np.random.standard_normal(n) + Z / 2
    Y = np.where(Y_hat > 0, 1, -1)

    return X, Y, Z

def sample_from_model2():
    """
     (X) -> (Z) -> (Y)
    """
    X_hat = np.random.standard_normal(n)
    X = np.where(X_hat > 0, 1, -1)

    Z_hat = np.random.standard_normal(n) + X / 2
    Z = np.where(Z_hat > 0, 1, -1)

    Y_hat = np.random.standard_normal(n) + Z / 2
    Y = np.where(Y_hat > 0, 1, -1)

    return X, Y, Z

def sample_from_model3():
    """
     (X) -> (Z) <- (Y)
    """
    X_hat = np.random.standard_normal(n)
    X = np.where(X_hat > 0, 1, -1)

    Y_hat = np.random.standard_normal(n)
    Y = np.where(Y_hat > 0, 1, -1)

    Z_hat = np.random.standard_normal(n) + (X + Y) / 2
    Z = np.where(Z_hat > 0, 1, -1)

    return X, Y, Z

### a)

**answer**:

- Model 1
    - conditionally independent through Z
    - dependent
- Model 2
    - conditionally independent through Z
    - dependent
- Model 3
    - conditionally dependent through Z
    - independent

### b)

In [12]:
model1 = sample_from_model1()
model2 = sample_from_model2()
model3 = sample_from_model3()

In [14]:
pd.DataFrame([
    {
        'model': i+1,
        'MI(X,Y)': mutual_info_score(X, Y),
        'MI(X,Y|Z)': cond_mutual_info_score(X, Y, Z),
    }
    for i, (X, Y, Z) in enumerate([model1, model2, model3])
])

Unnamed: 0,model,"MI(X,Y)","MI(X,Y|Z)"
0,1,0.019766,0.001401
1,2,0.009073,0.001316
2,3,0.000438,0.007096


### c)

In [15]:
data = []
for i, (X, Y, Z) in enumerate([model1, model2, model3]):
    a_test_stat, a_p_value = cond_indep_test_asymptotic(X, Y, Z)
    p_test_stat, p_p_value = cond_indep_test_permutation(X, Y, Z, 100)
    data.append({
        ('','model'): i+1,
        ('asymptotic test', 'stat value'): a_test_stat,
        ('asymptotic test', 'p value'): a_p_value,
        ('permutation test', 'stat value'): p_test_stat,
        ('permutation test', 'p value'): p_p_value,
    })

pd.DataFrame(data, columns=pd.MultiIndex.from_tuples(data[0].keys()))

Unnamed: 0_level_0,Unnamed: 1_level_0,asymptotic test,asymptotic test,permutation test,permutation test
Unnamed: 0_level_1,model,stat value,p value,stat value,p value
0,1,2.801502,0.246412,2.801502,0.188119
1,2,2.632994,0.268073,2.632994,0.257426
2,3,14.192128,0.000828,14.192128,0.009901
