# Mathematical Underpinnings - Lab 6

In [166]:
import pandas as pd
from sklearn.metrics import mutual_info_score
import numpy as np
from tqdm import tqdm

## Useful functions

In [196]:
def discetize_2bins(X, false_value=-1):
    X_discrete = np.where(X >= 0, 1, false_value)
    return X_discrete

In [168]:
def conditional_permutation(X, Z):

    z_values = np.unique(Z)
    n_z_values = len(z_values)
    n = len(Z)

    X_b = np.zeros(n)

    for i in range(n_z_values):

        z_value_tmp = z_values[i]

        X_b[Z == z_value_tmp] = np.random.permutation(X[Z == z_value_tmp])

    return X_b

In [169]:
def conditional_mutual_information(X, Y, Z):

    z_values = np.unique(Z)
    n_z_values = len(z_values)
    n = len(Z)

    cmi = 0

    for i in range(n_z_values):

        z_value_tmp = z_values[i]
        z_condition = (Z == z_value_tmp)

        X_z = X[z_condition]
        Y_z = Y[z_condition]

        mi_XY_z = mutual_info_score(X_z, Y_z)
        p_z = np.sum(z_condition)/n

        cmi += p_z*mi_XY_z

    return cmi

In [170]:
# II(X;Y;Z)
def interaction_information(X, Y, Z):
    return conditional_mutual_information(X, Y, Z) - mutual_info_score(X, Y)

In [171]:
# II(X;Y;Z1;Z2)
def interaction_information2(X, Y, Z1, Z2):
    Z_1_and_2 = 2*Z2 + Z1
    return interaction_information(X, Y, Z_1_and_2) - interaction_information(X, Y, Z1) - interaction_information(X, Y, Z2)

## Task 1

In [172]:
def secmi2(X, Y, Z):
    mi = mutual_info_score(X, Y)
    sum_ii_2 = sum([conditional_mutual_information(X, Y, Z[:, i]) for i in range(Z.shape[1])])
    return mi + sum_ii_2


def secmi3(X, Y, Z):
    result = secmi2(X, Y, Z)
    for i in range(Z.shape[1]):
        for j in range(i):
            result += interaction_information2(X, Y, Z[:, i], Z[:, j])
    return result

### a)

In [173]:
def cond_indep_test_permutation(X, Y, Z, B, stat):

    n_col_Z = Z.shape[1]
    Z_1dim = np.dot(Z, 2**np.linspace(0, n_col_Z-1, n_col_Z))

    if stat == "cmi":
        stat_value = conditional_mutual_information(X, Y, Z_1dim)
    if stat == "secmi2":
        stat_value = secmi2(X, Y, Z)
    if stat == "secmi3":
        stat_value = secmi3(X, Y, Z)

    condition_p_value = 0
    for b in range(B):
        X_b = conditional_permutation(X, Z_1dim)

        if stat == "cmi":
            stat_value_b = conditional_mutual_information(X_b, Y, Z_1dim)
        if stat == "secmi2":
            stat_value_b = secmi2(X_b, Y, Z)
        if stat == "secmi3":
            stat_value_b = secmi3(X_b, Y, Z)

        if stat_value <= stat_value_b:
            condition_p_value += 1

    p_value = (1 + condition_p_value)/(1 + B)

    return 2*len(X)*stat_value, p_value

### b)

In [174]:
def sample_1(n = 100, seed=8):
    np.random.seed(seed)
    Y = discetize_2bins(np.random.normal(0, 1, n))
    Z = discetize_2bins(np.random.normal(0, 1, n * 3).reshape((n, 3)) + Y.reshape((n, 1)))
    X = discetize_2bins(np.random.normal(0, 1, n) + Z[:, 0])
    return X, Y, Z

The first conditional independence is true.

### Running test 100 times

In [175]:
num_of_repetition = 100
rejected = [0] * 6
p_value_thresh, B = 0.05, 100
for i in tqdm(range(num_of_repetition)):
    X, Y, Z = sample_1(n = 100, seed=i)
    rejected[0] += int(cond_indep_test_permutation(X, Y, Z[:, [0, 1]], B, "cmi")[1] < p_value_thresh)
    rejected[1] += int(cond_indep_test_permutation(X, Y, Z[:, [0, 1]], B, "secmi2")[1] < p_value_thresh)
    rejected[2] += int(cond_indep_test_permutation(X, Y, Z[:, [0, 1]], B, "secmi3")[1] < p_value_thresh)
    rejected[3] += int(cond_indep_test_permutation(X, Y, Z[:, [1, 2]], B, "cmi")[1] < p_value_thresh)
    rejected[4] += int(cond_indep_test_permutation(X, Y, Z[:, [1, 2]], B, "secmi2")[1] < p_value_thresh)
    rejected[5] += int(cond_indep_test_permutation(X, Y, Z[:, [1, 2]], B, "secmi3")[1] < p_value_thresh)

100%|██████████| 100/100 [10:49<00:00,  6.49s/it]


In [176]:
print(f"The null hypotheses (with Z1, Z2) was rejected {rejected[0]} times when CMI was used")
print(f"The null hypotheses (with Z1, Z2) was rejected {rejected[1]} times when SECMI2 was used")
print(f"The null hypotheses (with Z1, Z2) was rejected {rejected[2]} times when SECMI3 was used")
print(f"The null hypotheses (with Z2, Z3) was rejected {rejected[3]} times when CMI was used")
print(f"The null hypotheses (with Z2, Z3) was rejected {rejected[4]} times when SECMI2 was used")
print(f"The null hypotheses (with Z2, Z3) was rejected {rejected[5]} times when SECMI3 was used")

The null hypotheses (with Z1, Z2) was rejected 5 times when CMI was used
The null hypotheses (with Z1, Z2) was rejected 3 times when SECMI2 was used
The null hypotheses (with Z1, Z2) was rejected 5 times when SECMI3 was used
The null hypotheses (with Z2, Z3) was rejected 64 times when CMI was used
The null hypotheses (with Z2, Z3) was rejected 83 times when SECMI2 was used
The null hypotheses (with Z2, Z3) was rejected 83 times when SECMI3 was used


### c)

The second conditional independence is true.

In [202]:
def sample_2(n = 100, seed=8):
    np.random.seed(seed)
    X = discetize_2bins(np.random.normal(0, 1, n), 0)
    Z1 = discetize_2bins(np.random.normal(0, 1, n), 0)
    Z2 = discetize_2bins(np.random.normal(0, 1, n), 0)
    Z3 = discetize_2bins(np.random.normal(0, 1, n), 0)
    Y = np.random.random(100)
    sum_mod = (X + Z1 + Z2) % 2
    for i in range(n):
        tresh = 0.2 if sum_mod[i] == 1 else 0.8
        Y[i] = 0 if Y[i] < tresh else 1  
    return X, Y, np.transpose([Z1, Z2, Z3])

In [203]:
num_of_repetition = 100
rejected = [0] * 6
p_value_thresh, B = 0.05, 100
for i in tqdm(range(num_of_repetition)):
    X, Y, Z = sample_2(n = 100, seed=i)
    rejected[0] += int(cond_indep_test_permutation(X, Y, Z[:, [0, 1]], B, "cmi")[1] < p_value_thresh)
    rejected[1] += int(cond_indep_test_permutation(X, Y, Z[:, [0, 1]], B, "secmi2")[1] < p_value_thresh)
    rejected[2] += int(cond_indep_test_permutation(X, Y, Z[:, [0, 1]], B, "secmi3")[1] < p_value_thresh)
    rejected[3] += int(cond_indep_test_permutation(X, Y, Z[:, [1, 2]], B, "cmi")[1] < p_value_thresh)
    rejected[4] += int(cond_indep_test_permutation(X, Y, Z[:, [1, 2]], B, "secmi2")[1] < p_value_thresh)
    rejected[5] += int(cond_indep_test_permutation(X, Y, Z[:, [1, 2]], B, "secmi3")[1] < p_value_thresh)

100%|██████████| 100/100 [12:52<00:00,  7.72s/it]


In [204]:
print(f"The null hypotheses (with Z1, Z2) was rejected {rejected[0]} times when CMI was used")
print(f"The null hypotheses (with Z1, Z2) was rejected {rejected[1]} times when SECMI2 was used")
print(f"The null hypotheses (with Z1, Z2) was rejected {rejected[2]} times when SECMI3 was used")
print(f"The null hypotheses (with Z2, Z3) was rejected {rejected[3]} times when CMI was used")
print(f"The null hypotheses (with Z2, Z3) was rejected {rejected[4]} times when SECMI2 was used")
print(f"The null hypotheses (with Z2, Z3) was rejected {rejected[5]} times when SECMI3 was used")

The null hypotheses (with Z1, Z2) was rejected 100 times when CMI was used
The null hypotheses (with Z1, Z2) was rejected 4 times when SECMI2 was used
The null hypotheses (with Z1, Z2) was rejected 100 times when SECMI3 was used
The null hypotheses (with Z2, Z3) was rejected 2 times when CMI was used
The null hypotheses (with Z2, Z3) was rejected 3 times when SECMI2 was used
The null hypotheses (with Z2, Z3) was rejected 3 times when SECMI3 was used
