# Mathematical Underpinnings - Lab 6

In [2]:
from sklearn.metrics import mutual_info_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from tqdm import tqdm, trange

## Useful functions

In [3]:
def discetize_2bins(X):
    """Binarise X by sign: 1 where X >= 0 and 0 where X < 0.

    The comparison and the multiplication are applied to X as a whole, so a
    NumPy array input yields an integer array of the same shape.
    """
    is_non_negative = X >= 0
    # Multiplying the boolean mask by 1 converts True/False to 1/0.
    return is_non_negative * 1

In [4]:
def conditional_permutation(X, Z):
    """Shuffle X separately inside every group of equal Z values.

    Parameters
    ----------
    X : array
        Values to permute; indexed with boolean masks built from Z.
    Z : array
        Group labels, one per element of X.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(Z)`` (it is allocated with ``np.zeros``)
        in which the entries belonging to each distinct Z value are a random
        permutation of the X entries of that same group.
    """
    permuted = np.zeros(len(Z))

    # np.unique returns the distinct labels in sorted order; groups are
    # processed in that order, one np.random.permutation call per group.
    for level in np.unique(Z):
        in_level = Z == level
        permuted[in_level] = np.random.permutation(X[in_level])

    return permuted

In [5]:
def conditional_mutual_information(X, Y, Z):
    """Estimate I(X; Y | Z) for a discrete conditioning variable Z.

    For every distinct value z of Z, ``mutual_info_score`` is evaluated on the
    samples of X and Y where ``Z == z``; the results are combined as a weighted
    sum whose weights are the empirical frequencies of the z values.

    Parameters
    ----------
    X, Y : array
        Samples indexed with boolean masks derived from Z.
    Z : array
        One-dimensional conditioning labels.

    Returns
    -------
    The weighted sum described above.
    """
    n_samples = len(Z)
    total = 0

    for level in np.unique(Z):
        in_level = Z == level
        # Empirical probability of this stratum.
        weight = np.sum(in_level) / n_samples
        total += weight * mutual_info_score(X[in_level], Y[in_level])

    return total

In [6]:
# II(X;Y;Z) = I(X;Y|Z) - I(X;Y)
def interaction_information(X, Y, Z):
    """Interaction information of X, Y and Z.

    Computed as the conditional mutual information of X and Y given Z minus
    the (unconditional) mutual information of X and Y.
    """
    conditional_part = conditional_mutual_information(X, Y, Z)
    marginal_part = mutual_info_score(X, Y)
    return conditional_part - marginal_part

In [7]:
# II(X;Y;Z1;Z2)
def interaction_information2(X, Y, Z1, Z2):
    """Second-order interaction term of X, Y with the pair (Z1, Z2).

    Computed as II(X;Y;(Z1,Z2)) - II(X;Y;Z1) - II(X;Y;Z2), where (Z1,Z2) is the
    joint variable obtained by giving every distinct pair of values its own
    integer code.

    Parameters
    ----------
    X, Y : array
        Samples passed through to ``interaction_information``.
    Z1, Z2 : array
        One-dimensional discrete conditioning variables of equal length.

    Returns
    -------
    The difference described above.
    """
    # Map each variable to codes 0..k-1 first.  The plain formula 2*Z2 + Z1 is
    # only injective when Z1 takes values in {0, 1}; e.g. (Z1=2, Z2=0) and
    # (Z1=0, Z2=1) would both be encoded as 2.  For 0/1 inputs in which both
    # values occur, the codes below coincide with 2*Z2 + Z1.
    Z1_levels, Z1_codes = np.unique(Z1, return_inverse=True)
    Z2_codes = np.unique(Z2, return_inverse=True)[1]
    Z_1_and_2 = len(Z1_levels) * Z2_codes + Z1_codes

    return (
        interaction_information(X, Y, Z_1_and_2)
        - interaction_information(X, Y, Z1)
        - interaction_information(X, Y, Z2)
    )

## Task 1

In [8]:
def secmi2(X, Y, Z):
    """SECMI2 statistic: I(X;Y) plus the sum of II(Y;X;Z_i) over the columns of Z.

    Parameters
    ----------
    X, Y : array
        Samples passed to ``mutual_info_score`` / ``interaction_information``.
    Z : 2-D array
        Conditioning variables, one per column (the last axis is iterated).
    """
    total = mutual_info_score(X, Y)
    n_columns = Z.shape[-1]
    for column_index in range(n_columns):
        z_column = Z[:, column_index]
        total = total + interaction_information(Y, X, z_column)
    return total
    
def secmi3(X, Y, Z):
    """SECMI3 statistic: SECMI2 plus second-order interaction terms.

    Adds ``interaction_information2(Y, X, Z_i, Z_j)`` for every pair of
    columns i < j of Z, and then the value of ``secmi2(X, Y, Z)``.
    """
    n_columns = Z.shape[-1]
    pair_terms = 0
    for first in range(n_columns):
        for second in range(first + 1, n_columns):
            pair_terms += interaction_information2(Y, X, Z[:, first], Z[:, second])
    pair_terms += secmi2(X, Y, Z)
    return pair_terms

### a)

In [9]:
def cond_indep_test_permutation(X, Y, Z, B, stat):
    """Conditional permutation test based on CMI / SECMI2 / SECMI3.

    Parameters
    ----------
    X, Y : array
        One-dimensional discrete samples.
    Z : 2-D array
        Conditioning variables, one per column.
    B : int
        Number of conditional permutations of X.
    stat : {"cmi", "secmi2", "secmi3"}
        Which statistic to use.

    Returns
    -------
    tuple
        ``(2 * len(X) * statistic, p_value)`` where
        ``p_value = (1 + #{b : statistic <= statistic_b}) / (1 + B)``.

    Raises
    ------
    ValueError
        If ``stat`` is not one of the supported names.
    """
    # Fail early with a clear message; previously an unknown name surfaced
    # later as an unbound ``stat_value`` variable.
    if stat not in ("cmi", "secmi2", "secmi3"):
        raise ValueError(
            "stat must be one of 'cmi', 'secmi2', 'secmi3', got %r" % (stat,)
        )

    n_col_Z = Z.shape[1]
    # Collapse each row of Z into a single number using powers of two as
    # weights.  Distinct rows are guaranteed distinct codes only when every
    # column of Z takes values in {0, 1}.
    Z_1dim = np.dot(Z, 2.0 ** np.arange(n_col_Z))

    def statistic(X_sample):
        # Single place that maps the statistic name to its computation.
        if stat == "cmi":
            return conditional_mutual_information(X_sample, Y, Z_1dim)
        if stat == "secmi2":
            return secmi2(X_sample, Y, Z)
        return secmi3(X_sample, Y, Z)

    stat_value = statistic(X)

    # Count permutations whose statistic is at least as large as the observed one.
    condition_p_value = 0
    for _ in range(B):
        X_b = conditional_permutation(X, Z_1dim)
        if stat_value <= statistic(X_b):
            condition_p_value += 1

    p_value = (1 + condition_p_value) / (1 + B)

    return 2 * len(X) * stat_value, p_value

### b)

In [10]:
def get_data(n, rng=None):
    """Simulate one discretised data set (X, Y, Z1, Z2, Z3) of size n.

    Model: Y ~ N(0, 1); Z1, Z2, Z3 ~ N(Y / 2, 1) drawn independently given Y;
    X ~ N(Z1 / 2, 1).  Every variable is then binarised with
    ``discetize_2bins``.

    NOTE(review): X is generated from the continuous Z1, while only the
    binarised Z1 is returned, so independence of X and Y given the returned
    Z1 column presumably holds only approximately - confirm this is intended.

    Parameters
    ----------
    n : int
        Number of samples.
    rng : optional
        Object with a ``normal(loc, scale, size)`` method, e.g. a
        ``numpy.random.Generator``.  When omitted, the global ``np.random``
        state is used, as before.

    Returns
    -------
    X_disc : array of shape (n,)
    Y_disc : array of shape (n,)
    Z : array of shape (n, 3)
        Columns are the binarised Z1, Z2, Z3, in that order.
    """
    gen = np.random if rng is None else rng

    # The order of the draws is kept fixed so that a seeded run is stable.
    Y = gen.normal(0, 1, n)
    Z1 = gen.normal(Y / 2, 1, n)
    Z2 = gen.normal(Y / 2, 1, n)
    Z3 = gen.normal(Y / 2, 1, n)
    X = gen.normal(Z1 / 2, 1, n)

    Z = np.column_stack(
        (discetize_2bins(Z1), discetize_2bins(Z2), discetize_2bins(Z3))
    )

    return discetize_2bins(X), discetize_2bins(Y), Z

In [22]:
N = 100  # number of simulated data sets per experiment (loop count and rejection-rate denominator)
n = 5000  # sample size passed to get_data for every simulated data set
B = 50  # number of conditional permutations used inside each test
th = 0.05  # significance level: a test counts as a rejection when its p-value is below th

In [23]:
# Task 1 b), conditioning on the first two columns of Z (discretised Z1, Z2).
# Each row of `ans` holds (statistic, p-value) for cmi, secmi2, secmi3 in turn.
ans = []
for _ in trange(N):
    X, Y, Z = get_data(n)
    Z = Z[:, :-1]  # drop the last column (discretised Z3)
    row = []
    for stat_name in ("cmi", "secmi2", "secmi3"):
        row.extend(cond_indep_test_permutation(X, Y, Z, B, stat_name))
    ans.append(row)

100%|██████████| 100/100 [05:49<00:00,  3.49s/it]


In [24]:
# One row per simulated data set; columns pair each statistic with its p-value.
df = pd.DataFrame(
    ans,
    columns=[
        "cmi_test", "cmi_p_val",
        "secmi2_test", "secmi2_p_val",
        "secmi3_test", "secmi3_p_val",
    ],
)

In [25]:
# Share of the N simulated data sets in which each test has a p-value below th.
for label, p_value_column in (
    ("Rejection rate - cmi:", "cmi_p_val"),
    ("Rejection rate - secmi2:", "secmi2_p_val"),
    ("Rejection rate  - secmi3:", "secmi3_p_val"),
):
    print(label, np.sum(df[p_value_column] < th) / N)

Rejection rate - cmi: 0.57
Rejection rate - secmi2: 0.55
Rejection rate  - secmi3: 0.61


In [26]:
# Task 1 b), conditioning on the last two columns of Z (discretised Z2, Z3);
# the Z1 column, from which X is generated, is left out of the conditioning set.
ans = []
for _ in trange(N):
    X, Y, Z = get_data(n)
    Z = Z[:, 1:]  # drop the first column (discretised Z1)
    row = []
    for stat_name in ("cmi", "secmi2", "secmi3"):
        row.extend(cond_indep_test_permutation(X, Y, Z, B, stat_name))
    ans.append(row)

 12%|█▏        | 12/100 [00:50<05:14,  3.58s/it]

100%|██████████| 100/100 [05:44<00:00,  3.45s/it]


In [27]:
# One row per simulated data set; columns pair each statistic with its p-value.
df = pd.DataFrame(
    ans,
    columns=[
        "cmi_test", "cmi_p_val",
        "secmi2_test", "secmi2_p_val",
        "secmi3_test", "secmi3_p_val",
    ],
)

In [28]:
# Share of the N simulated data sets in which each test has a p-value below th.
for label, p_value_column in (
    ("Rejection rate - cmi:", "cmi_p_val"),
    ("Rejection rate - secmi2:", "secmi2_p_val"),
    ("Rejection rate  - secmi3:", "secmi3_p_val"),
):
    print(label, np.sum(df[p_value_column] < th) / N)

Rejection rate - cmi: 1.0
Rejection rate - secmi2: 1.0
Rejection rate  - secmi3: 1.0


### c)

In [29]:
def get_data(n, rng=None):
    """Simulate one discretised data set (X, Y, Z1, Z2, Z3) of size n.

    Model: Y ~ N(0, 1); Z1, Z2, Z3 ~ N(Y / 2, 1) drawn independently given Y;
    X ~ N(Z1 / 2, 1).  Every variable is then binarised with
    ``discetize_2bins``.

    NOTE(review): this cell redefines ``get_data`` with the same model as the
    definition used for task b); if task c) was meant to use a different
    data-generating model, this body still needs to be changed.

    Parameters
    ----------
    n : int
        Number of samples.
    rng : optional
        Object with a ``normal(loc, scale, size)`` method, e.g. a
        ``numpy.random.Generator``.  When omitted, the global ``np.random``
        state is used, as before.

    Returns
    -------
    X_disc : array of shape (n,)
    Y_disc : array of shape (n,)
    Z : array of shape (n, 3)
        Columns are the binarised Z1, Z2, Z3, in that order.
    """
    gen = np.random if rng is None else rng

    # The order of the draws is kept fixed so that a seeded run is stable.
    Y = gen.normal(0, 1, n)
    Z1 = gen.normal(Y / 2, 1, n)
    Z2 = gen.normal(Y / 2, 1, n)
    Z3 = gen.normal(Y / 2, 1, n)
    X = gen.normal(Z1 / 2, 1, n)

    Z = np.column_stack(
        (discetize_2bins(Z1), discetize_2bins(Z2), discetize_2bins(Z3))
    )

    return discetize_2bins(X), discetize_2bins(Y), Z

In [30]:
# Task 1 c), conditioning on the first two columns of Z (discretised Z1, Z2).
# Each row of `ans` holds (statistic, p-value) for cmi, secmi2, secmi3 in turn.
ans = []
for _ in trange(N):
    X, Y, Z = get_data(n)
    Z = Z[:, :-1]  # drop the last column (discretised Z3)
    row = []
    for stat_name in ("cmi", "secmi2", "secmi3"):
        row.extend(cond_indep_test_permutation(X, Y, Z, B, stat_name))
    ans.append(row)

100%|██████████| 100/100 [05:37<00:00,  3.38s/it]


In [31]:
# One row per simulated data set; columns pair each statistic with its p-value.
df = pd.DataFrame(
    ans,
    columns=[
        "cmi_test", "cmi_p_val",
        "secmi2_test", "secmi2_p_val",
        "secmi3_test", "secmi3_p_val",
    ],
)

In [32]:
# Share of the N simulated data sets in which each test has a p-value below th.
for label, p_value_column in (
    ("Rejection rate - cmi:", "cmi_p_val"),
    ("Rejection rate - secmi2:", "secmi2_p_val"),
    ("Rejection rate  - secmi3:", "secmi3_p_val"),
):
    print(label, np.sum(df[p_value_column] < th) / N)

Rejection rate - cmi: 0.62
Rejection rate - secmi2: 0.55
Rejection rate  - secmi3: 0.64


In [33]:
# Task 1 c), conditioning on the last two columns of Z (discretised Z2, Z3);
# the Z1 column, from which X is generated, is left out of the conditioning set.
ans = []
for _ in trange(N):
    X, Y, Z = get_data(n)
    Z = Z[:, 1:]  # drop the first column (discretised Z1)
    row = []
    for stat_name in ("cmi", "secmi2", "secmi3"):
        row.extend(cond_indep_test_permutation(X, Y, Z, B, stat_name))
    ans.append(row)

100%|██████████| 100/100 [05:31<00:00,  3.32s/it]


In [34]:
# One row per simulated data set; columns pair each statistic with its p-value.
df = pd.DataFrame(
    ans,
    columns=[
        "cmi_test", "cmi_p_val",
        "secmi2_test", "secmi2_p_val",
        "secmi3_test", "secmi3_p_val",
    ],
)

In [35]:
# Share of the N simulated data sets in which each test has a p-value below th.
for label, p_value_column in (
    ("Rejection rate - cmi:", "cmi_p_val"),
    ("Rejection rate - secmi2:", "secmi2_p_val"),
    ("Rejection rate  - secmi3:", "secmi3_p_val"),
):
    print(label, np.sum(df[p_value_column] < th) / N)

Rejection rate - cmi: 1.0
Rejection rate - secmi2: 1.0
Rejection rate  - secmi3: 1.0


## Task 2
 
This task is solved in R.