In [1]:
from sklearn.metrics import mutual_info_score
#from sklearn.feature_selection import chi2
from scipy.stats import chi2_contingency
from scipy.stats import chi2
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde, pearsonr
import seaborn as sns
import pandas as pd

In [2]:
# Seed NumPy's global RNG; every sampler and permutation test below draws from it,
# so the printed results depend on running the cells in order from a fresh kernel.
np.random.seed(997)

# Mathematical Underpinnings - Lab 5

Tests to verify hypotheses of independence (from Lab 4):

In [3]:
def indep_test_asymptotic(X, Y, stat="mi"):
    """Asymptotic test of independence between two discrete samples.

    Parameters
    ----------
    X, Y : array-like
        Paired discrete observations of equal length.
    stat : {"mi", "chi2"}, default "mi"
        "mi" uses ``2 * len(X) * mutual_info_score(X, Y)``;
        "chi2" uses the statistic returned by ``chi2_contingency`` (called
        with its default options) on the contingency table of X and Y.

    Returns
    -------
    stat_value : float
        Value of the selected test statistic.
    p_value : float
        Upper-tail probability of a chi-square distribution with
        ``(#unique(X) - 1) * (#unique(Y) - 1)`` degrees of freedom.

    Raises
    ------
    ValueError
        If ``stat`` is neither "mi" nor "chi2".
    """
    if stat == "mi":
        stat_value = 2*len(X)*mutual_info_score(X, Y)
    elif stat == "chi2":
        # Index the result instead of using `.statistic`: the statistic is the
        # first element both for the plain tuple returned by older SciPy and
        # for the result object returned by newer releases.
        stat_value = chi2_contingency(pd.crosstab(X, Y))[0]
    else:
        # Previously an unknown name fell through and crashed later with an
        # UnboundLocalError on `stat_value`.
        raise ValueError(f"stat must be 'mi' or 'chi2', got {stat!r}")

    df = (len(np.unique(X)) - 1)*(len(np.unique(Y)) - 1)

    # Survival function instead of 1 - cdf: avoids cancellation for very small
    # p-values and matches asymptotic_conditional_independence_test.
    p_value = chi2.sf(stat_value, df=df)

    return stat_value, p_value

def indep_test_permutation(X, Y, B=1000, stat="mi"):
    """Permutation test of independence between two discrete samples.

    X is shuffled ``B`` times with ``np.random.permutation`` (global NumPy
    RNG) and the statistic is recomputed against the unchanged Y.

    Parameters
    ----------
    X, Y : array-like
        Paired discrete observations of equal length.
    B : int, default 1000
        Number of permutations.
    stat : {"mi", "chi2"}, default "mi"
        Statistic compared across permutations. "mi" uses
        ``mutual_info_score``; "chi2" uses the statistic returned by
        ``chi2_contingency`` on the contingency table. Previously this
        argument was accepted but ignored (MI was always used).

    Returns
    -------
    stat_value : float
        For "mi": ``2 * len(X) * MI`` of the unpermuted data (same scaling as
        indep_test_asymptotic). For "chi2": the chi-square statistic.
    p_value : float
        ``(1 + #{permuted statistic >= observed}) / (1 + B)``.

    Raises
    ------
    ValueError
        If ``stat`` is neither "mi" nor "chi2".
    """
    if stat == "mi":
        compute_stat = mutual_info_score
        # Only the reported value is rescaled; comparisons use the raw MI.
        scale = 2*len(X)
    elif stat == "chi2":
        def compute_stat(a, b):
            # First element is the statistic for both the tuple (older SciPy)
            # and the result object (newer SciPy).
            return chi2_contingency(pd.crosstab(a, b))[0]
        scale = 1
    else:
        raise ValueError(f"stat must be 'mi' or 'chi2', got {stat!r}")

    stat_value = compute_stat(X, Y)

    condition_p_value = 0
    for _ in range(B):
        X_b = np.random.permutation(X)

        if stat_value <= compute_stat(X_b, Y):
            condition_p_value += 1

    # The +1 terms count the observed sample as one of the permutations, so
    # the p-value can never be exactly zero.
    p_value = (1 + condition_p_value)/(1 + B)

    return scale*stat_value, p_value

## Task 1

In [4]:
# a function which computes CMI

In [5]:
def conditional_mutual_info_score(x, y, z):
    """Estimate the conditional mutual information of x and y given z.

    For every distinct value of ``z`` the mutual information of ``x`` and
    ``y`` restricted to that stratum is computed with ``mutual_info_score``
    and weighted by the fraction of samples in the stratum; the weighted
    terms are summed.

    The inputs are indexed with boolean masks (``x[z == level]``), so they
    must support that kind of indexing (e.g. NumPy arrays).
    """
    n_total = len(z)

    def stratum_term(level):
        in_stratum = z == level
        within_mi = mutual_info_score(x[in_stratum], y[in_stratum])
        return within_mi * np.sum(in_stratum) / n_total

    return sum(stratum_term(level) for level in np.unique(z))

### a)

In [6]:
# CI test based on CMI and asymptotics

In [7]:
def asymptotic_conditional_independence_test(x, y, z):
    """Asymptotic conditional-independence test of x and y given z.

    The statistic is ``2 * len(z) * CMI(x, y | z)`` and the p-value is the
    upper tail of a chi-square distribution with
    ``(#unique(x) - 1) * (#unique(y) - 1) * #unique(z)`` degrees of freedom.

    Returns
    -------
    (statistic_value, p_value)
    """
    cmi = conditional_mutual_info_score(x, y, z)
    scaled_statistic = 2 * len(z) * cmi

    n_x, n_y, n_z = (len(np.unique(values)) for values in (x, y, z))
    dof = (n_x - 1) * (n_y - 1) * n_z

    return scaled_statistic, chi2.sf(scaled_statistic, dof)

### b)

In [8]:
# CI test based on CMI and permutations

In [9]:
def conditional_permuation(x, y):
    """Return a copy of ``x`` shuffled separately within each level of ``y``.

    For every distinct value of ``y`` the entries of ``x`` at the matching
    positions are permuted among themselves with ``np.random.permutation``
    (global NumPy RNG). The input ``x`` is left untouched.
    """
    shuffled = x.copy()
    for level in np.unique(y):
        stratum = y == level
        # Strata are disjoint, so the values read here are still the originals.
        shuffled[stratum] = np.random.permutation(shuffled[stratum])
    return shuffled


def permutation_conditional_independence_test(x, y, z, B=1000):
    """Permutation conditional-independence test of x and y given z.

    ``x`` is shuffled within the levels of ``z`` ``B`` times and the CMI is
    recomputed each time against the unchanged ``y`` and ``z``.

    Returns
    -------
    statistic_value : float
        The CMI of the unpermuted data. Note it is the raw CMI, not multiplied
        by ``2 * len(z)`` as in the asymptotic test.
    p_value : float
        ``(1 + #{permuted CMI >= observed CMI}) / (1 + B)``.
    """
    observed = conditional_mutual_info_score(x, y, z)

    exceedances = 0
    for _ in range(B):
        x_shuffled = conditional_permuation(x, z)
        if conditional_mutual_info_score(x_shuffled, y, z) >= observed:
            exceedances += 1

    return observed, (1 + exceedances) / (1 + B)

### c)

In [10]:
def binarize(x):
    """Map values to {-1, 1}: 1 where ``x >= 0`` and -1 elsewhere.

    Parameters
    ----------
    x : array-like or scalar
        Numeric input. It is converted with ``np.asarray`` so that plain
        lists and tuples are compared elementwise instead of raising.

    Returns
    -------
    numpy.ndarray
        Integer array of the same shape as ``x`` holding 1 or -1.
        NaN entries map to -1 because ``NaN >= 0`` is false.
    """
    return np.where(np.asarray(x) >= 0, 1, -1)

conditionally independent

In [11]:
# z is drawn first; x and y are then each built from their own Gaussian noise
# shifted by z / 2 and binarized to {-1, 1}.
n_samples = 1000
z = binarize(np.random.standard_normal(n_samples))
x = binarize(np.random.standard_normal(n_samples) + z / 2)
y = binarize(np.random.standard_normal(n_samples) + z / 2)

In [12]:
# Asymptotic CMI-based test of x and y given z on the current sample.
statistic_value, p_value = asymptotic_conditional_independence_test(x, y, z)
print(f"Statistic value: {statistic_value:.4f}, P value: {p_value:.4f}")

Statistic value: 0.1618, P value: 0.9223


In [13]:
# Permutation CMI-based test on the same sample; the printed statistic is the
# raw CMI (not scaled by 2 * n as in the asymptotic test).
statistic_value, p_value = permutation_conditional_independence_test(x, y, z)
print(f"Statistic value: {statistic_value:.4f}, P value: {p_value:.4f}")

Statistic value: 0.0001, P value: 0.9311


conditionally dependent

In [14]:
# x and y are built from separate Gaussian draws; z is Gaussian noise shifted by
# (x + y) / 2 and binarized, so z depends on both x and y.
n_samples = 1000
x = binarize(np.random.standard_normal(n_samples))
y = binarize(np.random.standard_normal(n_samples))
z = binarize(np.random.standard_normal(n_samples) + (x + y) / 2)

In [15]:
# Asymptotic CMI-based test of x and y given z on the current sample.
statistic_value, p_value = asymptotic_conditional_independence_test(x, y, z)
print(f"Statistic value: {statistic_value:.4f}, P value: {p_value:.4f}")

Statistic value: 13.1731, P value: 0.0014


In [16]:
# Permutation CMI-based test on the same sample; the printed statistic is the
# raw CMI (not scaled by 2 * n as in the asymptotic test).
statistic_value, p_value = permutation_conditional_independence_test(x, y, z)
print(f"Statistic value: {statistic_value:.4f}, P value: {p_value:.4f}")

Statistic value: 0.0066, P value: 0.0030


The results of all tests are as expected.

## Task 2

In [17]:
def sample_from_model_1(n_samples=1000):
    """Draw ``n_samples`` triples (x, y, z) from model 1.

    z is binarized Gaussian noise; x and y are each binarized Gaussian noise
    shifted by ``z / 2``. Uses the global NumPy RNG; the noise for z, x and y
    is drawn in that order.

    Returns
    -------
    (x, y, z) : tuple of arrays with values in {-1, 1}
    """
    z = binarize(np.random.standard_normal(n_samples))
    shift = z / 2
    noise_x = np.random.standard_normal(n_samples)
    noise_y = np.random.standard_normal(n_samples)
    return binarize(noise_x + shift), binarize(noise_y + shift), z


def sample_from_model_2(n_samples=1000):
    """Draw ``n_samples`` triples (x, y, z) from model 2.

    x is binarized Gaussian noise; z is binarized Gaussian noise shifted by
    ``x / 2``; y is binarized Gaussian noise shifted by ``z / 2``. Uses the
    global NumPy RNG; the noise for x, z and y is drawn in that order.

    Returns
    -------
    (x, y, z) : tuple of arrays with values in {-1, 1}
    """
    x = binarize(np.random.standard_normal(n_samples))
    noise_z = np.random.standard_normal(n_samples)
    z = binarize(noise_z + x / 2)
    noise_y = np.random.standard_normal(n_samples)
    y = binarize(noise_y + z / 2)
    return x, y, z


def sample_from_model_3(n_samples=1000):
    """Draw ``n_samples`` triples (x, y, z) from model 3.

    x and y are binarized Gaussian noise from separate draws; z is binarized
    Gaussian noise shifted by ``(x + y) / 2``. Uses the global NumPy RNG; the
    noise for x, y and z is drawn in that order.

    Returns
    -------
    (x, y, z) : tuple of arrays with values in {-1, 1}
    """
    x = binarize(np.random.standard_normal(n_samples))
    y = binarize(np.random.standard_normal(n_samples))
    noise_z = np.random.standard_normal(n_samples)
    parent_shift = (x + y) / 2
    z = binarize(noise_z + parent_shift)
    return x, y, z

### a)

answer:

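- model 1 (common cause, X ← Z → Y) - X and Y are unconditionally dependent, conditionally independent given Z
- model 2 (chain, X → Z → Y) - X and Y are unconditionally dependent, conditionally independent given Z
- model 3 (collider, X → Z ← Y) - X and Y are unconditionally independent, conditionally dependent given Z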

### b)

model 1

In [18]:
# Draw one sample from model 1 (kept as x1, y1, z1 for the tests in part c) and
# compare the estimated MI of (x, y) with the estimated CMI given z.
x1, y1, z1 = sample_from_model_1()
mi = mutual_info_score(x1, y1)
cmi = conditional_mutual_info_score(x1, y1, z1)
print(f"MI: {mi:.4f}, CMI: {cmi:.4f}")

MI: 0.0063, CMI: 0.0004


model 2

In [19]:
# Draw one sample from model 2 (kept as x2, y2, z2 for the tests in part c) and
# compare the estimated MI of (x, y) with the estimated CMI given z.
x2, y2, z2 = sample_from_model_2()
mi = mutual_info_score(x2, y2)
cmi = conditional_mutual_info_score(x2, y2, z2)
print(f"MI: {mi:.4f}, CMI: {cmi:.4f}")

MI: 0.0142, CMI: 0.0010


model 3

In [20]:
# Draw one sample from model 3 (kept as x3, y3, z3 for the tests in part c) and
# compare the estimated MI of (x, y) with the estimated CMI given z.
x3, y3, z3 = sample_from_model_3()
mi = mutual_info_score(x3, y3)
cmi = conditional_mutual_info_score(x3, y3, z3)
print(f"MI: {mi:.4f}, CMI: {cmi:.4f}")

MI: 0.0000, CMI: 0.0087


The results are in line with the above answer.

### c)

model 1

In [21]:
# Model 1: unconditional asymptotic test of x1 vs y1 (default stat="mi").
statistic_value, p_value = indep_test_asymptotic(x1, y1)
print(f"Statistic value: {statistic_value:.4f}, P value: {p_value:.4f}")

Statistic value: 12.6514, P value: 0.0004


In [22]:
# Model 1: unconditional permutation test of x1 vs y1 (default B=1000).
statistic_value, p_value = indep_test_permutation(x1, y1)
print(f"Statistic value: {statistic_value:.4f}, P value: {p_value:.4f}")

Statistic value: 12.6514, P value: 0.0010


In [23]:
# Model 1: asymptotic conditional test of x1 vs y1 given z1.
statistic_value, p_value = asymptotic_conditional_independence_test(x1, y1, z1)
print(f"Statistic value: {statistic_value:.4f}, P value: {p_value:.4f}")

Statistic value: 0.7532, P value: 0.6862


In [24]:
# Model 1: permutation conditional test of x1 vs y1 given z1 (statistic is the raw CMI).
statistic_value, p_value = permutation_conditional_independence_test(x1, y1, z1)
print(f"Statistic value: {statistic_value:.4f}, P value: {p_value:.4f}")

Statistic value: 0.0004, P value: 0.6983


model 2

In [25]:
# Model 2: unconditional asymptotic test of x2 vs y2 (default stat="mi").
statistic_value, p_value = indep_test_asymptotic(x2, y2)
print(f"Statistic value: {statistic_value:.4f}, P value: {p_value:.4f}")

Statistic value: 28.4686, P value: 0.0000


In [26]:
# Model 2: unconditional permutation test of x2 vs y2 (default B=1000).
statistic_value, p_value = indep_test_permutation(x2, y2)
print(f"Statistic value: {statistic_value:.4f}, P value: {p_value:.4f}")

Statistic value: 28.4686, P value: 0.0010


In [27]:
# Model 2: asymptotic conditional test of x2 vs y2 given z2.
statistic_value, p_value = asymptotic_conditional_independence_test(x2, y2, z2)
print(f"Statistic value: {statistic_value:.4f}, P value: {p_value:.4f}")

Statistic value: 1.9691, P value: 0.3736


In [28]:
# Model 2: permutation conditional test of x2 vs y2 given z2 (statistic is the raw CMI).
statistic_value, p_value = permutation_conditional_independence_test(x2, y2, z2)
print(f"Statistic value: {statistic_value:.4f}, P value: {p_value:.4f}")

Statistic value: 0.0010, P value: 0.4006


model 3

In [29]:
# Model 3: unconditional asymptotic test of x3 vs y3 (default stat="mi").
statistic_value, p_value = indep_test_asymptotic(x3, y3)
print(f"Statistic value: {statistic_value:.4f}, P value: {p_value:.4f}")

Statistic value: 0.0153, P value: 0.9017


In [30]:
# Model 3: unconditional permutation test of x3 vs y3 (default B=1000).
statistic_value, p_value = indep_test_permutation(x3, y3)
print(f"Statistic value: {statistic_value:.4f}, P value: {p_value:.4f}")

Statistic value: 0.0153, P value: 0.9341


In [31]:
# Model 3: asymptotic conditional test of x3 vs y3 given z3.
statistic_value, p_value = asymptotic_conditional_independence_test(x3, y3, z3)
print(f"Statistic value: {statistic_value:.4f}, P value: {p_value:.4f}")

Statistic value: 17.3554, P value: 0.0002


In [32]:
# Model 3: permutation conditional test of x3 vs y3 given z3 (statistic is the raw CMI).
statistic_value, p_value = permutation_conditional_independence_test(x3, y3, z3)
print(f"Statistic value: {statistic_value:.4f}, P value: {p_value:.4f}")

Statistic value: 0.0087, P value: 0.0020


The results of all tests are as expected.