# Mathematical Underpinnings - Lab 6

In [1]:
from sklearn.metrics import mutual_info_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from tqdm import tqdm

In [2]:
# Seed NumPy's global RNG: the data generators and the conditional permutation
# below all draw from np.random, so this fixes the results of a full top-to-bottom run.
np.random.seed(123)

## Useful functions

In [3]:
def discetize_2bins(X):
    """Binarise `X` by sign: entries that are >= 0 map to 1, all others to 0."""
    return (X >= 0) * 1

In [4]:
def conditional_permutation(X, Z):
    """Shuffle `X` separately inside every stratum of `Z`.

    For each distinct value of `Z` (visited in the sorted order returned by
    `np.unique`), the entries of `X` at the positions where `Z` takes that
    value are randomly permuted among those same positions, using NumPy's
    global RNG.

    Returns a new float array of length `len(Z)`; `X` itself is not modified.
    """
    X_b = np.zeros(len(Z))

    for stratum_value in np.unique(Z):
        in_stratum = (Z == stratum_value)
        # Permute only within the stratum, so the pairing of X with Z is kept.
        X_b[in_stratum] = np.random.permutation(X[in_stratum])

    return X_b

In [5]:
def conditional_mutual_information(X, Y, Z):
    """Estimate the conditional mutual information I(X; Y | Z).

    The estimate is the weighted sum over the distinct values z of `Z` of
    `mutual_info_score(X, Y)` computed on the observations with Z == z, each
    weighted by the fraction of observations that fall in that stratum.

    `X`, `Y` and `Z` are indexed with a boolean mask built from `Z`, so they
    are expected to be 1-D arrays of equal length.
    """
    n = len(Z)
    cmi = 0

    for stratum_value in np.unique(Z):
        in_stratum = (Z == stratum_value)

        # Empirical P(Z = z) times I(X; Y | Z = z).
        weight = np.sum(in_stratum) / n
        cmi += weight * mutual_info_score(X[in_stratum], Y[in_stratum])

    return cmi

In [6]:
# II(X;Y;Z)
def interaction_information(X, Y, Z):
    """Interaction information II(X; Y; Z) = I(X; Y | Z) - I(X; Y)."""
    mi_given_z = conditional_mutual_information(X, Y, Z)
    mi_marginal = mutual_info_score(X, Y)
    return mi_given_z - mi_marginal

In [7]:
# II(X;Y;Z1;Z2)
def interaction_information2(X, Y, Z1, Z2):
    """Interaction information II(X; Y; Z1; Z2).

    Computed as
        II(X; Y; (Z1, Z2)) - II(X; Y; Z1) - II(X; Y; Z2),
    where (Z1, Z2) is the pair treated as a single discrete variable.

    Parameters
    ----------
    X, Y, Z1, Z2 : 1-D arrays of discrete labels with equal length.

    Returns
    -------
    The estimated interaction information (a float).
    """
    # Re-code each variable as 0..k-1 before combining. The plain `2*Z2 + Z1`
    # code is injective only when Z1 takes the values 0/1; with more levels,
    # different (Z1, Z2) pairs would collapse into the same stratum.
    # For 0/1 inputs this yields the same strata as `2*Z2 + Z1`.
    z1_levels, z1_codes = np.unique(Z1, return_inverse=True)
    _, z2_codes = np.unique(Z2, return_inverse=True)
    Z_1_and_2 = z2_codes * len(z1_levels) + z1_codes

    return (
        interaction_information(X, Y, Z_1_and_2)
        - interaction_information(X, Y, Z1)
        - interaction_information(X, Y, Z2)
    )

## Task 1

In [8]:
def secmi2(X, Y, Z):
    """SECMI2 statistic: I(X; Y) plus the sum over columns i of II(X; Y; Z_i).

    `Z` is a 2-D array whose columns are the conditioning variables.
    """
    score = mutual_info_score(X, Y)
    n_conditioning = Z.shape[1]
    for col in range(n_conditioning):
        score = score + interaction_information(X, Y, Z[:, col])
    return score

def secmi3(X, Y, Z):
    """SECMI3 statistic: SECMI2 plus II(X; Y; Z_i; Z_j) for every pair j < i.

    `Z` is a 2-D array whose columns are the conditioning variables.
    """
    score = secmi2(X, Y, Z)
    n_conditioning = Z.shape[1]
    # Column 0 has no lower-indexed partner, so the outer loop can start at 1.
    for hi in range(1, n_conditioning):
        z_hi = Z[:, hi]
        for lo in range(hi):
            score = score + interaction_information2(X, Y, z_hi, Z[:, lo])
    return score

### a)

In [9]:
def cond_indep_test_permutation(X, Y, Z, stat, B=50):
    """Conditional-permutation test of "X independent of Y given Z".

    The chosen statistic is computed on the observed data and on `B` copies in
    which `X` has been permuted within the strata of `Z`. The p-value is the
    (add-one corrected) share of permuted statistics that are at least as
    large as the observed one.

    Parameters
    ----------
    X, Y : 1-D arrays of discrete labels, one entry per observation.
    Z : 2-D array, one column per conditioning variable. Rows are collapsed
        into one stratum label using weights 2**column_index, so distinct rows
        are guaranteed distinct labels only when the entries are 0/1.
    stat : {"cmi", "secmi2", "secmi3"}
        Which test statistic to use.
    B : int, default 50
        Number of conditional permutations.

    Returns
    -------
    (2 * len(X) * observed_statistic, p_value)

    Raises
    ------
    ValueError
        If `stat` is not one of the supported names.
    """
    n_col_Z = Z.shape[1]
    Z_1dim = np.dot(Z, 2.0 ** np.arange(n_col_Z))

    # One entry per supported statistic; each maps a (possibly permuted) X to
    # the statistic's value, so the observed and permuted cases share one path.
    statistics = {
        "cmi": lambda x: conditional_mutual_information(x, Y, Z_1dim),
        "secmi2": lambda x: secmi2(x, Y, Z),
        "secmi3": lambda x: secmi3(x, Y, Z),
    }
    if stat not in statistics:
        raise ValueError(
            f"Unknown stat {stat!r}; expected one of {sorted(statistics)}"
        )
    compute_stat = statistics[stat]

    stat_value = compute_stat(X)

    n_at_least_as_large = 0
    for _ in range(B):
        X_b = conditional_permutation(X, Z_1dim)
        if stat_value <= compute_stat(X_b):
            n_at_least_as_large += 1

    # The +1 in numerator and denominator counts the observed sample itself,
    # so the p-value can never be exactly 0.
    p_value = (1 + n_at_least_as_large)/(1 + B)

    return 2*len(X)*stat_value, p_value

### b)

In [10]:
def sample_data1(n_samples=1000):
    """Draw one binary data set in which X depends on Y only through Z[:, 0].

    Y is the sign indicator of a standard normal draw; each of the three
    columns of Z is the sign indicator of (standard normal + Y); X is the sign
    indicator of (standard normal + Z[:, 0]).

    Returns
    -------
    X : array of shape (n_samples,)
    Y : array of shape (n_samples,)
    Z : array of shape (n_samples, 3)
    """
    Y = discetize_2bins(np.random.standard_normal(n_samples))

    z_noise = np.random.standard_normal((n_samples, 3))
    Z = discetize_2bins(z_noise + Y[:, None])

    x_noise = np.random.standard_normal(n_samples)
    X = discetize_2bins(x_noise + Z[:, 0])

    return X, Y, Z

In [11]:
# Each scenario names the columns of Z that form the conditioning set.
scenarios1 = {"z1+z2": [0, 1], "z2+z3": [1, 2]}

records1 = []
for rep in tqdm(range(100)):
    X, Y, Z = sample_data1()
    for scenario, indices in scenarios1.items():
        Z_cond = Z[:, indices]
        for stat in ("cmi", "secmi2", "secmi3"):
            p_value = cond_indep_test_permutation(X, Y, Z_cond, stat)[1]
            records1.append(
                dict(rep=rep, scenario=scenario, stat=stat, p_value=p_value)
            )

# One row per (repetition, scenario, statistic).
results1 = pd.DataFrame(records1)

100%|██████████| 100/100 [05:51<00:00,  3.51s/it]


In [12]:
# A test rejects conditional independence when its p-value is below 0.05.
results1["rejected"] = results1["p_value"].lt(0.05)
# Number of rejections per scenario and statistic.
results1.groupby(["scenario", "stat"])[["rejected"]].sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,rejected
scenario,stat,Unnamed: 2_level_1
z1+z2,cmi,4
z1+z2,secmi2,6
z1+z2,secmi3,5
z2+z3,cmi,84
z2+z3,secmi2,86
z2+z3,secmi3,76


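Conditional independence of $X$ and $Y$ given the chosen columns of $Z$ holds in the first scenario (`z1+z2`) but not in the second (`z2+z3`): in `sample_data1`, $X$ depends on $Y$ only through $Z_1$, so conditioning on a set that contains $Z_1$ removes the dependence, whereas conditioning on $(Z_2, Z_3)$ does not. The results of all three tests are mostly consistent with this fact (assuming a significance level of 0.05): the number of rejections stays close to the nominal 5% in the first scenario and is high (76–86 out of 100) in the second.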

### c)

In [13]:
def sample_data2(n_samples=1000):
    """Draw one binary data set in which Y depends on the parity of X + Z1 + Z2.

    X and the three columns of Z are independent Bernoulli(0.5) draws. Y is
    Bernoulli with success probability 0.8 when X + Z[:, 0] + Z[:, 1] is odd
    and 0.2 when it is even; Z[:, 2] does not enter Y.

    Returns
    -------
    X : array of shape (n_samples,)
    Y : array of shape (n_samples,)
    Z : array of shape (n_samples, 3)
    """
    X = np.random.binomial(1, 0.5, n_samples)
    Z = np.random.binomial(1, 0.5, (n_samples, 3))

    odd_parity = (X + Z[:, 0] + Z[:, 1]) % 2 == 1
    p_success = np.where(odd_parity, 0.8, 0.2)
    Y = np.random.binomial(1, p_success, n_samples)

    return X, Y, Z

In [14]:
# Each scenario names the columns of Z that form the conditioning set.
scenarios2 = {"z1+z2": [0, 1], "z2+z3": [1, 2]}

records2 = []
for rep in tqdm(range(100)):
    X, Y, Z = sample_data2()
    for scenario, indices in scenarios2.items():
        Z_cond = Z[:, indices]
        for stat in ("cmi", "secmi2", "secmi3"):
            p_value = cond_indep_test_permutation(X, Y, Z_cond, stat)[1]
            records2.append(
                dict(rep=rep, scenario=scenario, stat=stat, p_value=p_value)
            )

# One row per (repetition, scenario, statistic).
results2 = pd.DataFrame(records2)

100%|██████████| 100/100 [05:53<00:00,  3.54s/it]


In [15]:
# A test rejects conditional independence when its p-value is below 0.05.
results2["rejected"] = results2["p_value"].lt(0.05)
# Number of rejections per scenario and statistic.
results2.groupby(["scenario", "stat"])[["rejected"]].sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,rejected
scenario,stat,Unnamed: 2_level_1
z1+z2,cmi,100
z1+z2,secmi2,4
z1+z2,secmi3,100
z2+z3,cmi,5
z2+z3,secmi2,5
z2+z3,secmi3,7


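Conditional independence of $X$ and $Y$ given the chosen columns of $Z$ holds in the second scenario (`z2+z3`) but not in the first (`z1+z2`): in `sample_data2`, $Y$ depends on $X$ only through the parity of $X + Z_1 + Z_2$, so once $Z_1$ is left out of the conditioning set, $X$ carries no information about $Y$. The results of all tests except the SECMI2-based one are mostly consistent with this fact (assuming a significance level of 0.05). The reason SECMI2 fails to reject in the first scenario is that the statistic does not take into account third-order interactions, i.e. the terms $II(X;Y;Z_i;Z_j)$ that SECMI3 adds; here the dependence between $X$ and $Y$ is visible only when $Z_1$ and $Z_2$ are considered jointly.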

## Task 2
 
in R