In [1]:
from sklearn.metrics import mutual_info_score
#from sklearn.feature_selection import chi2
from scipy.stats import chi2_contingency
from scipy.stats import chi2
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde, pearsonr
import seaborn as sns
import pandas as pd

# Mathematical Underpinnings - Lab 4

Tests to verify hipoteses of independence (from Lab 4):

In [2]:
def indep_test_asymptotic(X, Y, stat):

    if stat == "mi":
        stat_value = 2*len(X)*mutual_info_score(X, Y)

    if stat == "chi2":
        test_res = (chi2_contingency(pd.crosstab(X, Y)))
        stat_value = test_res.statistic

    df = (len(np.unique(X)) - 1)*(len(np.unique(Y)) - 1)
    p_value = 1 - chi2.cdf(stat_value, df=df)
    return stat_value, round(p_value, 4)

def indep_test_permutation(X, Y, B, stat="mi"):

    stat_value = mutual_info_score(X, Y)
    condition_p_value = 0
    for b in range(B):
        X_b = np.random.permutation(X)
        stat_value_b = mutual_info_score(X_b, Y)
        if stat_value <= stat_value_b:
            condition_p_value += 1
    p_value = (1 + condition_p_value)/(1 + B)
    return 2*len(X)*stat_value, round(p_value, 4)

## Task 1

### a)

In [3]:
# CI test based on CMI and asymptotics

def cond_indep_test_asymptotic(X, Y, Z):

    stat_value = 0
    for z in np.unique(Z):
        p_z = (Z == z).sum() / Z.shape[0]
        stat_value += 2*len(X[Z == z])*mutual_info_score(X[Z == z], Y[Z == z]) * p_z

    df = (len(np.unique(X)) - 1)*(len(np.unique(Y)) - 1) * len(np.unique(Z))

    p_value = 1 - chi2.cdf(stat_value, df=df)

    return stat_value, p_value

### b)

In [4]:
# CI test based on CMI and permutations

def cond_indep_test_permutation(X, Y, Z, B):

    stat_value = 0
    for z in np.unique(Z):
        p_z = (Z == z).sum() / Z.shape[0]
        stat_value += mutual_info_score(X[Z == z], Y[Z == z]) * p_z

    condition_p_value = 0
    for b in range(B):
        stat_value_b = 0
        
        for z in np.unique(Z):
            X_b = np.random.permutation(X[Z == z])
            p_z = (Z == z).sum() / Z.shape[0]
            stat_value_b += mutual_info_score(X_b, Y[Z == z]) * p_z

        if stat_value <= stat_value_b:
            condition_p_value += 1

    p_value = (1 + condition_p_value)/(1 + B)

    return 2*len(X)*stat_value, round(p_value, 4)

### c)

conditionaly independent

In [5]:
# conditionaly independent r.v. - we expect that our tests don't reject the null hypothesis

n = 5000
X = np.random.normal(0, 1, n)
Y = np.random.normal(0, 1, n)
Z = np.random.normal(0, 1, n)

X = pd.qcut(X, q=10, labels=range(10), retbins=False).to_numpy()
Y = pd.qcut(Y, q=10, labels=range(10), retbins=False).to_numpy()
Z = pd.qcut(Z, q=10, labels=range(10), retbins=False).to_numpy()

In [6]:
cond_indep_test_asymptotic(X, Y, Z)

(84.81203917225527, 1.0)

In [7]:
cond_indep_test_permutation(X, Y, Z, B=100)

(848.1203917225528, 0.7426)

p-values large -> the null hypothesis is not rejected ($\alpha = 0.05$) -> conditionaly independent

conditionaly dependent

In [8]:
# conditionaly dependent r.v. - we expect that our tests reject the null hypothesis

n = 5000

data = np.random.multivariate_normal(np.zeros(2), np.array([[1, 0.93], [0.93, 1]]), n)
X = data[:, 0]
Y = data[:, 1]
Z = np.random.normal(0, 1, n)

X = pd.qcut(X, q=10, labels=range(10), retbins=False).to_numpy()
Y = pd.qcut(Y, q=10, labels=range(10), retbins=False).to_numpy()
Z = pd.qcut(Z, q=10, labels=range(10), retbins=False).to_numpy()

In [9]:
cond_indep_test_asymptotic(X, Y, Z)

(915.4315592529199, 0.005701075691179169)

In [10]:
cond_indep_test_permutation(X, Y, Z, B=100)

(9154.315592529198, 0.0099)

p-values small -> the null hypothesis is rejected ($\alpha = 0.05$) -> conditionaly dependent

## Task 2

In [11]:
n = 1000

def sample_from_model1():
    Z = np.random.normal(0, 1, n)
    Z_ = np.where(Z < 0, -1, 1)
    X = np.random.normal(Z / 2, 1)
    X_ = np.where(X < 0, -1, 1)
    Y = np.random.normal(Z / 2, 1)
    Y_ = np.where(Y < 0, -1, 1)
    return X_, Y_, Z_

def sample_from_model2():
    X = np.random.normal(0, 1, n)
    X_ = np.where(X < 0, -1, 1)
    Z = np.random.normal(X / 2, 1)
    Z_ = np.where(Z < 0, -1, 1)
    Y = np.random.normal(Z / 2, 1)
    Y_ = np.where(Y < 0, -1, 1)
    return X_, Y_, Z_

def sample_from_model3():
    X = np.random.normal(0, 1, n)
    X_ = np.where(X < 0, -1, 1)
    Y = np.random.normal(0, 1, n)
    Y_ = np.where(Y < 0, -1, 1)
    Z = np.random.normal((X+Y) / 2, 1)
    Z_ = np.where(Z < 0, -1, 1)
    return X_, Y_, Z_

### a)

answer:

1.
dependent

conditionaly independent

2.
dependent

conditionaly independent

3. 
independent

conditionaly dependent

### b)

In [12]:
# model 1

X, Y, Z = sample_from_model1()

mi = mutual_info_score(X, Y)

cmi = 0
for z in np.unique(Z):
    p_z = (Z == z).sum() / Z.shape[0]
    cmi += mutual_info_score(X[Z == z], Y[Z == z]) * p_z

print("MI", mi)
print("CMI", cmi)

MI 0.011433460208992274
CMI 0.004516327405334752


In [13]:
# model 2

X, Y, Z = sample_from_model2()

mi = mutual_info_score(X, Y)

cmi = 0
for z in np.unique(Z):
    p_z = (Z == z).sum() / Z.shape[0]
    cmi += mutual_info_score(X[Z == z], Y[Z == z]) * p_z

print("MI", mi)
print("CMI", cmi)

MI 0.018037540092452398
CMI 0.0036070271550888563


In [14]:
# model 3

X, Y, Z = sample_from_model3()

mi = mutual_info_score(X, Y)

cmi = 0
for z in np.unique(Z):
    p_z = (Z == z).sum() / Z.shape[0]
    cmi += mutual_info_score(X[Z == z], Y[Z == z]) * p_z

print("MI", mi)
print("CMI", cmi)

MI 0.00020276859358631105
CMI 0.003320728296923187


### c)

In [15]:
B = 100

In [16]:
# model 1

X, Y, Z = sample_from_model1()

print('MI test:', indep_test_asymptotic(X, Y, 'mi'))
print('Permutation test:', indep_test_permutation(X, Y, B))
print('CMI test:', cond_indep_test_asymptotic(X, Y, Z))
print('Cond permutation test:', cond_indep_test_permutation(X, Y, Z, B))

# we expect:
# dependent
# conditionaly independent

# we get:
# MI test -> small p-value -> reject the null hypothesis -> dependent
# CMI test -> large p-value -> don't reject the null hypothesis -> conditionaly independent

MI test: (18.847355801097642, 0.0)
Permutation test: (18.847355801097642, 0.0099)
CMI test: (1.6038890604936178, 0.44845607930623255)
Cond permutation test: (3.208600015615626, 0.2079)


In [17]:
# model 2

X, Y, Z = sample_from_model2()

print('MI test:', indep_test_asymptotic(X, Y, 'mi'))
print('Permutation test:', indep_test_permutation(X, Y, B))
print('CMI test:', cond_indep_test_asymptotic(X, Y, Z))
print('Cond permutation test:', cond_indep_test_permutation(X, Y, Z, B))

# we expect:
# dependent
# conditionaly independent

# we get:
# MI test -> small p-value -> reject the null hypothesis -> dependent
# CMI test -> large p-value -> don't reject the null hypothesis -> conditionaly independent

MI test: (14.192425985901536, 0.0002)
Permutation test: (14.192425985901536, 0.0099)
CMI test: (1.4825839903793, 0.47649788467740195)
Cond permutation test: (2.9536338284350494, 0.2376)


In [18]:
# model 3

X, Y, Z = sample_from_model3()

print('MI test:', indep_test_asymptotic(X, Y, 'mi'))
print('Permutation test:', indep_test_permutation(X, Y, B))
print('CMI test:', cond_indep_test_asymptotic(X, Y, Z))
print('Cond permutation test:', cond_indep_test_permutation(X, Y, Z, B))

# we expect:
# independent
# conditionaly dependent

# we get:
# MI test -> large p-value -> don't reject the null hypothesis -> independent
# CMI test ->  small p-value -> reject the null hypothesis -> conditionaly dependent

MI test: (0.013334578485957316, 0.9081)
Permutation test: (0.013334578485957316, 0.9406)
CMI test: (4.935663427905271, 0.0847684621512963)
Cond permutation test: (9.700549549572054, 0.0198)
