# Mathematical Underpinnings - Lab 6

In [1]:
import numpy as np
from sklearn.metrics import mutual_info_score

## Useful functions

In [2]:
def discetize_2bins(X):
    X_discrete = np.where(X<0, -1, 1)
    return X_discrete

In [3]:
def conditional_permutation(X, Z):
    z_values = np.unique(Z)
    n_z_values = len(z_values)
    n = len(Z)

    X_b = np.zeros(n)

    for i in range(n_z_values):
        z_value_tmp = z_values[i]

        X_b[Z == z_value_tmp] = np.random.permutation(X[Z == z_value_tmp])

    return X_b

In [4]:
def conditional_mutual_information(X, Y, Z):
    z_values = np.unique(Z)
    n_z_values = len(z_values)
    n = len(Z)

    cmi = 0

    for i in range(n_z_values):
        z_value_tmp = z_values[i]
        z_condition = (Z == z_value_tmp)

        X_z = X[z_condition]
        Y_z = Y[z_condition]

        mi_XY_z = mutual_info_score(X_z, Y_z)
        p_z = np.sum(z_condition) / n

        cmi += p_z * mi_XY_z

    return cmi

In [5]:
# II(X;Y;Z)
def interaction_information(X, Y, Z):
    return conditional_mutual_information(X, Y, Z) - mutual_info_score(X, Y)

In [6]:
# II(X;Y;Z1;Z2)
def interaction_information2(X, Y, Z1, Z2):
    Z_1_and_2 = 2 * Z2 + Z1
    return interaction_information(X, Y, Z_1_and_2) - interaction_information(X, Y, Z1) - interaction_information(X, Y,
                                                                                                                  Z2)

## Task 1

In [7]:

def secmi2(x, y, z):
    return mutual_info_score(x, y) + sum(interaction_information(x, y, z[:, i]) for i in range(z.shape[1]))


def secmi3(x, y, z):
    return (secmi2(x, y, z) + 
                sum(
                    sum(
                        interaction_information2(x, y, z[:, i], z[:, j])
                        for j in range(i, z.shape[1])
                    )
                    for i in range(z.shape[1])
                )
            )
            

### a)

In [8]:
def cond_indep_test_permutation(X, Y, Z, B, stat):
    n_col_Z = Z.shape[1]
    Z_1dim = np.dot(Z, 2 ** np.linspace(0, n_col_Z - 1, n_col_Z))

    if stat == "cmi":
        stat_value = conditional_mutual_information(X, Y, Z_1dim)
    if stat == "secmi2":
        stat_value = secmi2(X, Y, Z)
    if stat == "secmi3":
        stat_value = secmi3(X, Y, Z)

    condition_p_value = 0
    for b in range(B):
        X_b = conditional_permutation(X, Z_1dim)

        if stat == "cmi":
            stat_value_b = conditional_mutual_information(X_b, Y, Z_1dim)
        if stat == "secmi2":
            stat_value_b = secmi2(X_b, Y, Z)
        if stat == "secmi3":
            stat_value_b = secmi3(X_b, Y, Z)

        if stat_value <= stat_value_b:
            condition_p_value += 1

    p_value = (1 + condition_p_value) / (1 + B)

    return 2 * len(X) * stat_value, p_value

### b)

In [35]:
def gen_data_1(n):
    y = np.random.normal(size=n)
    disc_y = discetize_2bins(y)
    
    z = np.random.normal(size=(n, 3)) + disc_y[:, np.newaxis] / 2 
    disc_z = discetize_2bins(z)
    
    x = np.random.normal(size=n) + disc_z[:, 0] / 2
    disc_x = discetize_2bins(x)
    
    return disc_x, disc_y, disc_z


In [36]:
x, y, z = gen_data_1(1000)
print(cond_indep_test_permutation(x, y, z[:, [0, 1]], 20, stat="cmi"))
print(cond_indep_test_permutation(x, y, z[:, [0, 1]], 20, stat="secmi2"))
print(cond_indep_test_permutation(x, y, z[:, [0, 1]], 20, stat="secmi3"))


(3.6330323551453088, 0.5714285714285714)
(2.5986321733707274, 0.23809523809523808)
(29.97080441618416, 0.14285714285714285)


In [37]:
print(cond_indep_test_permutation(x, y, z[:, [1, 2]], 20, stat="cmi"))
print(cond_indep_test_permutation(x, y, z[:, [1, 2]], 20, stat="secmi2"))
print(cond_indep_test_permutation(x, y, z[:, [1, 2]], 20, stat="secmi3"))


(28.65905638849497, 0.047619047619047616)
(28.26896495575873, 0.047619047619047616)
(29.326495667145814, 0.047619047619047616)


In [38]:
from tqdm import tqdm

results = []
for i in tqdm(range(100)):
    x, y, z = gen_data_1(1000)
    for stat_ in ["cmi", "secmi2", "secmi3"]:
        for cols in [(0,1), (1,2)]:
            stat, p = cond_indep_test_permutation(x, y, z[:, cols], 20, stat=stat_)
            results.append({
                "hipo": cols,
                "type": stat_,
                "it": i,
                "stat": stat,
                "p": p
            })
    

100%|██████████| 100/100 [02:04<00:00,  1.24s/it]


In [39]:
import pandas as pd

res = pd.DataFrame(results)
res["rejected"] = res["p"] < .05
res.groupby(["hipo", "type"])["rejected"].sum()

hipo    type  
(0, 1)  cmi        3
        secmi2     4
        secmi3     7
(1, 2)  cmi       86
        secmi2    90
        secmi3    92
Name: rejected, dtype: int64

### c)

In [42]:
def gen_data_2(n):
    x = np.random.binomial(1, 0.5, size=n)
    z = np.random.binomial(1, 0.5, size=(n, 3))
    cond = x + np.sum(z[:, :2], axis=1)
    y = np.random.binomial(1, np.where(cond % 2, .8, .2), n)
    return x, y, z


In [43]:
results = []
for i in tqdm(range(100)):
    x, y, z = gen_data_2(1000)
    for stat_ in ["cmi", "secmi2", "secmi3"]:
        for cols in [(0,1), (1,2)]:
            stat, p = cond_indep_test_permutation(x, y, z[:, cols], 20, stat=stat_)
            results.append({
                "hipo": cols,
                "type": stat_,
                "it": i,
                "stat": stat,
                "p": p
            })

res = pd.DataFrame(results)
res["rejected"] = res["p"] < .05
res.groupby(["hipo", "type"])["rejected"].sum()

100%|██████████| 100/100 [02:05<00:00,  1.26s/it]


hipo    type  
(0, 1)  cmi       100
        secmi2      4
        secmi3    100
(1, 2)  cmi         4
        secmi2      4
        secmi3      3
Name: rejected, dtype: int64

## Task 2
 
in R