# Mathematical Underpinnings - Lab 6

In [24]:
from sklearn.metrics import mutual_info_score
import itertools
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from tqdm import tqdm

## Useful functions

In [25]:
def discetize_2bins(X):
    X_discrete = 1*(X >= 0)
    return X_discrete

In [26]:
def conditional_permutation(X, Z):

    z_values = np.unique(Z)
    n_z_values = len(z_values)
    n = len(Z)

    X_b = np.zeros(n)

    for i in range(n_z_values):

        z_value_tmp = z_values[i]

        X_b[Z == z_value_tmp] = np.random.permutation(X[Z == z_value_tmp])

    return X_b

In [27]:
def conditional_mutual_information(X, Y, Z):

    z_values = np.unique(Z)
    n_z_values = len(z_values)
    n = len(Z)

    cmi = 0

    for i in range(n_z_values):

        z_value_tmp = z_values[i]
        z_condition = (Z == z_value_tmp)

        X_z = X[z_condition]
        Y_z = Y[z_condition]

        mi_XY_z = mutual_info_score(X_z, Y_z)
        p_z = np.sum(z_condition)/n

        cmi += p_z*mi_XY_z

    return cmi

In [28]:
# II(X;Y;Z)
def interaction_information(X, Y, Z):
    return conditional_mutual_information(X, Y, Z) - mutual_info_score(X, Y)

In [29]:
# II(X;Y;Z1;Z2)
def interaction_information2(X, Y, Z1, Z2):
    Z_1_and_2 = 2*Z2 + Z1
    return interaction_information(X, Y, Z_1_and_2) - interaction_information(X, Y, Z1) - interaction_information(X, Y, Z2)

## Task 1

In [30]:
def secmi2(X, Y, Z):
    secmi = mutual_info_score(X, Y)
    for i in range(Z.shape[1]):
        secmi += interaction_information(X, Y, Z[:,i])
    return secmi
def secmi3(X, Y, Z):
    secmi = secmi2(X, Y, Z)
    for combination in itertools.combinations(list(range(Z.shape[1])), 2):
        secmi += interaction_information2(X, Y, Z[:,combination[0]], Z[:,combination[1]])
    return secmi

In [31]:
mean = (1, 2, 1)

cov = [[1, 0, 0], [0, 1, 0], [0, 0, 1]]
n = 100
X = discetize_2bins(np.random.normal(0, 1, n))
Y = discetize_2bins(np.random.normal(0, 1, n))
Z = discetize_2bins(np.random.multivariate_normal(mean, cov, n))
# Z = discetize_2bins(np.random.normal(0, 1, n))

In [32]:
secmi3(X, Y, Z)

0.014553565377989896

### a)

In [33]:
def cond_indep_test_permutation(X, Y, Z, B, stat):

    n_col_Z = Z.shape[1]
    Z_1dim = np.dot(Z, 2**np.linspace(0, n_col_Z-1, n_col_Z))

    if stat == "cmi":
        stat_value = conditional_mutual_information(X, Y, Z_1dim)
    if stat == "secmi2":
        stat_value = secmi2(X, Y, Z)
    if stat == "secmi3":
        stat_value = secmi3(X, Y, Z)

    condition_p_value = 0
    for b in range(B):
        X_b = conditional_permutation(X, Z_1dim)

        if stat == "cmi":
            stat_value_b = conditional_mutual_information(X_b, Y, Z_1dim)
        if stat == "secmi2":
            stat_value_b = secmi2(X_b, Y, Z)
        if stat == "secmi3":
            stat_value_b = secmi3(X_b, Y, Z)

        if stat_value <= stat_value_b:
            condition_p_value += 1

    p_value = (1 + condition_p_value)/(1 + B)

    return 2*len(X)*stat_value, p_value

### b)

In [34]:
def sample_from_model(n):
    Y_tilde = np.random.normal(size=n)
    Y = (Y_tilde > 0) * 2 - 1
    Z1 = (np.random.normal(loc=Y, size=n) > 0) * 2 - 1
    Z2 = (np.random.normal(loc=Y, size=n) > 0) * 2 - 1
    Z3 = (np.random.normal(loc=Y, size=n) > 0) * 2 - 1
    X = (np.random.normal(loc=Z1, size=n) > 0) * 2 - 1
    return X, Y, Z1, Z2, Z3

In [35]:
n = 100
X, Y, Z1, Z2, Z3 = sample_from_model(n)

In [36]:
tries = 100

for stat in ["cmi", "secmi2", "secmi3"]:
    results = []
    for _ in range(tries):
        X, Y, Z1, Z2, Z3 = sample_from_model(n)
        pval = cond_indep_test_permutation(X, Y, np.array([[z1, z2] for z1, z2 in zip(Z1, Z2)]), 50, stat)[1]
        results = [*results, pval]
    print(f"For first case, {stat}:\nMean p-value: {np.mean(results)}\nRejected in {(np.sum(np.array(results) < 0.05))/tries} cases\n\n")
    results = []
    for _ in range(tries):
        X, Y, Z1, Z2, Z3 = sample_from_model(n)
        pval = cond_indep_test_permutation(X, Y, np.array([[z1, z2] for z1, z2 in zip(Z3, Z2)]), 50, stat)[1]
        results = [*results, pval]
    print(f"For second case, {stat}:\nMean p-value: {np.mean(results)}\nRejected in {(np.sum(np.array(results) < 0.05))/tries} cases\n\n")

For first case, cmi:
Mean p-value: 0.5858823529411764
Rejected in 0.05 cases


For second case, cmi:
Mean p-value: 0.09313725490196077
Rejected in 0.64 cases


For first case, secmi2:
Mean p-value: 0.5719607843137254
Rejected in 0.03 cases


For second case, secmi2:
Mean p-value: 0.12137254901960784
Rejected in 0.61 cases


For first case, secmi3:
Mean p-value: 0.5809803921568627
Rejected in 0.07 cases


For second case, secmi3:
Mean p-value: 0.10098039215686275
Rejected in 0.57 cases




### c)

In [37]:
n = 100
X = np.random.binomial(1, 0.5, n)
Z1 = np.random.binomial(1, 0.5, n)
Z2 = np.random.binomial(1, 0.5, n)
Z3 = np.random.binomial(1, 0.5, n)

In [38]:
Y08 = np.random.binomial(1, 0.8, n)
Y02 = np.random.binomial(1, 0.2, n)
Y = np.repeat(0,n)

In [39]:
var_sum = (X + Z1 + Z2) % 2

In [40]:
def c_scenario(n=100):
    X = np.random.binomial(1, 0.5, n)
    Z1 = np.random.binomial(1, 0.5, n)
    Z2 = np.random.binomial(1, 0.5, n)
    Z3 = np.random.binomial(1, 0.5, n)
    Y08 = np.random.binomial(1, 0.8, n)
    Y02 = np.random.binomial(1, 0.2, n)
    Y = np.repeat(0,n)
    var_sum = (X + Z1 + Z2) % 2
    Y[var_sum == 1] = Y08[var_sum == 1]
    Y[var_sum == 0] = Y02[var_sum == 0]
    return X, Z1, Z2, Z3, Y

In [41]:
tries = 100

for stat in ["cmi", "secmi2", "secmi3"]:
    results = []
    for _ in range(tries):
        X, Y, Z1, Z2, Z3 = c_scenario(n)
        pval = cond_indep_test_permutation(X, Y, np.array([[z1, z2] for z1, z2 in zip(Z1, Z2)]), 50, stat)[1]
        results = [*results, pval]
    print(f"For first case, {stat}:\nMean p-value: {np.mean(results)}\nRejected in {(np.sum(np.array(results) < 0.05))/tries} cases\n\n")
    results = []
    for _ in range(tries):
        X, Y, Z1, Z2, Z3 = c_scenario(n)
        pval = cond_indep_test_permutation(X, Y, np.array([[z1, z2] for z1, z2 in zip(Z3, Z2)]), 50, stat)[1]
        results = [*results, pval]
    print(f"For second case, {stat}:\nMean p-value: {np.mean(results)}\nRejected in {(np.sum(np.array(results) < 0.05))/tries} cases\n\n")

For first case, cmi:
Mean p-value: 0.515686274509804
Rejected in 0.08 cases


For second case, cmi:
Mean p-value: 0.5566666666666666
Rejected in 0.05 cases


For first case, secmi2:
Mean p-value: 0.49490196078431375
Rejected in 0.04 cases


For second case, secmi2:
Mean p-value: 0.5598039215686275
Rejected in 0.04 cases


For first case, secmi3:
Mean p-value: 0.47
Rejected in 0.04 cases


For second case, secmi3:
Mean p-value: 0.5311764705882352
Rejected in 0.03 cases




## Task 2
 
in R