# Mathematical Underpinnings - Lab 6

In [1]:
from sklearn.metrics import mutual_info_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from tqdm import tqdm

## Useful functions

In [2]:
def discetize_2bins(X):
    """Binarise ``X`` at zero.

    Returns 1 wherever ``X >= 0`` and 0 elsewhere, with the same shape as ``X``.
    """
    is_non_negative = X >= 0
    # Multiplying the boolean mask by 1 turns True/False into 1/0.
    return is_non_negative * 1

In [3]:
def conditional_permutation(X, Z):
    """Shuffle ``X`` separately inside every group of equal ``Z`` values.

    Parameters
    ----------
    X : array
        Values to permute, one per observation.
    Z : array
        Group label of each observation; entries of ``X`` are only exchanged
        between positions that share the same label.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(Z)`` holding the group-wise shuffled
        values. Randomness comes from NumPy's global ``np.random`` state.
    """
    permuted = np.zeros(len(Z))

    # np.unique returns the labels sorted, so groups are visited in sorted order.
    for label in np.unique(Z):
        in_group = Z == label
        permuted[in_group] = np.random.permutation(X[in_group])

    return permuted

In [4]:
def conditional_mutual_information(X, Y, Z):
    """Estimate the conditional mutual information I(X; Y | Z) from samples.

    For every distinct value ``z`` of ``Z`` the mutual information between
    ``X`` and ``Y`` is computed on the observations with ``Z == z`` and the
    results are averaged with weights equal to the empirical frequency of ``z``.

    Parameters
    ----------
    X, Y, Z : array
        Discrete observations of equal length; ``Z`` is a single
        (one-dimensional) conditioning variable.

    Returns
    -------
    The weighted sum of the per-group mutual informations
    (``0`` when ``Z`` is empty).
    """
    n_samples = len(Z)
    total = 0

    for label in np.unique(Z):
        in_group = Z == label
        # Empirical P(Z = label).
        weight = np.sum(in_group) / n_samples
        total += weight * mutual_info_score(X[in_group], Y[in_group])

    return total

In [5]:
def interaction_information(X, Y, Z):
    """Interaction information II(X;Y;Z) = I(X;Y|Z) - I(X;Y)."""
    conditional_part = conditional_mutual_information(X, Y, Z)
    marginal_part = mutual_info_score(X, Y)
    return conditional_part - marginal_part

In [6]:
def interaction_information2(X, Y, Z1, Z2):
    """Second-order interaction information II(X;Y;Z1;Z2).

    Computed as II(X;Y;(Z1,Z2)) - II(X;Y;Z1) - II(X;Y;Z2), where the pair
    (Z1, Z2) is collapsed into the single label ``Z1 + 2*Z2``.

    NOTE: that label separates all (Z1, Z2) combinations for the two-valued
    codings generated in this notebook ({0, 1} and {-1, 1}); other codings
    may map different pairs to the same label.
    """
    joint_Z = Z1 + 2*Z2

    joint_term = interaction_information(X, Y, joint_Z)
    first_term = interaction_information(X, Y, Z1)
    second_term = interaction_information(X, Y, Z2)

    return joint_term - first_term - second_term

## Task 1

In [7]:
def secmi2(X, Y, Z):
    """SECMI2 statistic: I(X;Y) plus the sum of II(X;Y;Z_i) over the columns of Z.

    Parameters
    ----------
    X, Y : array
        Discrete observations, one entry per sample.
    Z : 2-D array
        Conditioning variables, one column per variable.
    """
    first_order_terms = [
        interaction_information(X, Y, column) for column in Z.T
    ]
    return mutual_info_score(X, Y) + np.sum(first_order_terms)

def secmi3(X, Y, Z):
    """SECMI3 statistic: SECMI2 plus all second-order interaction terms.

    Adds II(X;Y;Z_i;Z_j) for every unordered pair of distinct columns
    (i < j) of ``Z`` to ``secmi2(X, Y, Z)``.

    Parameters
    ----------
    X, Y : array
        Discrete observations, one entry per sample.
    Z : 2-D array
        Conditioning variables, one column per variable.
    """
    columns = Z.T
    n_columns = len(columns)

    # Each pair is visited once, in the order (0,1), (0,2), ..., (1,2), ...
    second_order_terms = [
        interaction_information2(X, Y, columns[i], columns[j])
        for i in range(n_columns)
        for j in range(i + 1, n_columns)
    ]
    return secmi2(X, Y, Z) + np.sum(second_order_terms)

### a)

In [8]:
def cond_indep_test_permutation(X, Y, Z, B, stat):
    """Permutation test of the hypothesis that X and Y are independent given Z.

    The columns of ``Z`` are collapsed into one group label per observation.
    The chosen statistic is computed on the observed data and then ``B`` more
    times after shuffling ``X`` within each group (``conditional_permutation``).

    Parameters
    ----------
    X, Y : array
        Discrete observations, one entry per sample.
    Z : 2-D array
        Conditioning variables, one column per variable.
    B : int
        Number of permutations.
    stat : {"cmi", "secmi2", "secmi3"}
        Test statistic to use.

    Returns
    -------
    (scaled_statistic, p_value)
        ``scaled_statistic`` is ``2 * len(X)`` times the observed statistic;
        ``p_value`` is ``(1 + #{b : stat_b >= stat_observed}) / (1 + B)``.

    Raises
    ------
    ValueError
        If ``stat`` is not one of the supported names.
    """
    # Fail fast with a clear message instead of hitting an unbound local later.
    if stat not in ("cmi", "secmi2", "secmi3"):
        raise ValueError(
            f"Unknown stat {stat!r}; expected one of 'cmi', 'secmi2', 'secmi3'."
        )

    n_col_Z = Z.shape[1]
    # Power-of-two weights (1, 2, 4, ...) give every distinct row its own label
    # when all columns share one two-valued coding such as {0, 1} or {-1, 1};
    # other codings may make different rows collide.
    Z_1dim = np.dot(Z, 2.0 ** np.arange(n_col_Z))

    def compute_stat(X_sample):
        # Single place that maps the statistic name to its implementation.
        if stat == "cmi":
            return conditional_mutual_information(X_sample, Y, Z_1dim)
        if stat == "secmi2":
            return secmi2(X_sample, Y, Z)
        return secmi3(X_sample, Y, Z)

    stat_value = compute_stat(X)

    # Count permuted statistics that are at least as large as the observed one.
    condition_p_value = 0
    for _ in range(B):
        X_b = conditional_permutation(X, Z_1dim)
        if stat_value <= compute_stat(X_b):
            condition_p_value += 1

    # The "+1" terms count the observed sample itself, so p_value is never 0.
    p_value = (1 + condition_p_value)/(1 + B)

    return 2*len(X)*stat_value, p_value

### b)

In [11]:
def generate_data(n):
    """Draw ``n`` observations of (X, Y, Z1, Z2, Z3), each coded as -1.0 / 1.0.

    Y is the sign of standard normal noise; Z1, Z2 and Z3 are each the sign of
    (fresh standard normal noise + Y); X is the sign of (fresh standard normal
    noise + Z1).
    """
    def to_signed(latent):
        # Threshold at zero to get {0, 1}, then map 0 -> -1 and 1 -> +1.
        return (discetize_2bins(latent) - 0.5) * 2

    Y = to_signed(np.random.randn(n))
    Z1 = to_signed(np.random.randn(n) + Y)
    Z2 = to_signed(np.random.randn(n) + Y)
    Z3 = to_signed(np.random.randn(n) + Y)
    X = to_signed(np.random.randn(n) + Z1)
    return X, Y, Z1, Z2, Z3

In [34]:
# Simulation settings.
B = 50   # permutations per test
n = 100  # observations per simulated data set
N = 10   # independent repetitions

results = []
for _ in tqdm(range(N)):
    X, Y, Z1, Z2, Z3 = generate_data(n)

    # Conditioning sets under test, in the order their rows are recorded.
    conditioning_sets = (
        ('Z1,Z2', np.array([Z1, Z2]).T),
        ('Z2,Z3', np.array([Z2, Z3]).T),
    )

    for stat in ('cmi', 'secmi2', 'secmi3'):
        for condition, Z_cond in conditioning_sets:
            stat_value, p_value = cond_indep_test_permutation(X, Y, Z_cond, B, stat)
            results.append({
                'condition': condition,
                'stat': stat,
                'stat_value': stat_value,
                'p_value': p_value
            })

100%|██████████| 10/10 [00:28<00:00,  2.83s/it]


In [35]:
# Index the per-test records by (condition, stat) and flag p-values below 0.05.
results_df = (
    pd.DataFrame(results)
    .set_index(['condition', 'stat'])
    .assign(rejected_null_hypothesis=lambda frame: frame["p_value"] < 0.05)
)
results_df.head(6)

Unnamed: 0_level_0,Unnamed: 1_level_0,stat_value,p_value,rejected_null_hypothesis
condition,stat,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Z1,Z2",cmi,3.590291,0.509804,False
"Z2,Z3",cmi,19.742496,0.019608,True
"Z1,Z2",secmi2,-6.490278,0.372549,False
"Z2,Z3",secmi2,14.975211,0.019608,True
"Z1,Z2",secmi3,3.590291,0.72549,False
"Z2,Z3",secmi3,19.742496,0.019608,True


In [37]:
# Number of rejections for every (condition, stat) pair across all repetitions.
print(f"Parameters: n={n}, N={N}, B={B}")
results_df.groupby(['condition', 'stat']).sum()['rejected_null_hypothesis'].to_frame()

Parameters: n=100, N=10, B=50


Unnamed: 0_level_0,Unnamed: 1_level_0,rejected_null_hypothesis
condition,stat,Unnamed: 2_level_1
"Z1,Z2",cmi,1
"Z1,Z2",secmi2,2
"Z1,Z2",secmi3,1
"Z2,Z3",cmi,6
"Z2,Z3",secmi2,6
"Z2,Z3",secmi3,7


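When conditioning on (Z2, Z3), the null hypothesis of conditional independence is rejected most of the time, whereas when conditioning on (Z1, Z2) it is rejected only rarely. This agrees with how the data are generated: X depends on Y only through Z1, so X and Y are conditionally independent given any set containing Z1, but not given (Z2, Z3).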

### c)

In [77]:
def generate_data(n):
    """Draw ``n`` observations of binary (0/1) variables (X, Y, Z1, Z2, Z3).

    Z1, Z2, Z3 and X are independent fair coin flips. Y equals 1 with
    probability 0.8 when X + Z1 + Z2 is odd and with probability 0.2 otherwise.
    """
    def fair_bits():
        return np.random.binomial(1, 0.5, n)

    # Draw order (Z1, Z2, Z3, X, then Y) fixes how the random stream is consumed.
    Z1, Z2, Z3 = fair_bits(), fair_bits(), fair_bits()
    X = fair_bits()

    odd_parity = (X + Z1 + Z2) % 2 == 1
    Y = np.random.binomial(1, np.where(odd_parity, 0.8, 0.2))
    return X, Y, Z1, Z2, Z3

In [78]:
# Smoke-test the generator on a tiny sample; the tuple is ordered (X, Y, Z1, Z2, Z3).
generate_data(4)

(array([0, 1, 0, 1]),
 array([0, 1, 0, 0]),
 array([0, 1, 1, 0]),
 array([0, 0, 0, 1]),
 array([1, 0, 0, 0]))

In [80]:
# Simulation settings.
B = 50   # permutations per test
n = 100  # observations per simulated data set
N = 10   # independent repetitions

results = []
for _ in tqdm(range(N)):
    X, Y, Z1, Z2, Z3 = generate_data(n)

    # Conditioning sets under test, in the order their rows are recorded.
    conditioning_sets = (
        ('Z1,Z2', np.array([Z1, Z2]).T),
        ('Z2,Z3', np.array([Z2, Z3]).T),
    )

    for stat in ('cmi', 'secmi2', 'secmi3'):
        for condition, Z_cond in conditioning_sets:
            stat_value, p_value = cond_indep_test_permutation(X, Y, Z_cond, B, stat)
            results.append({
                'condition': condition,
                'stat': stat,
                'stat_value': stat_value,
                'p_value': p_value
            })

100%|██████████| 10/10 [00:28<00:00,  2.89s/it]


In [81]:
# Index the per-test records by (condition, stat) and flag p-values below 0.05.
results_df = (
    pd.DataFrame(results)
    .set_index(['condition', 'stat'])
    .assign(rejected_null_hypothesis=lambda frame: frame["p_value"] < 0.05)
)
results_df.head(6)

Unnamed: 0_level_0,Unnamed: 1_level_0,stat_value,p_value,rejected_null_hypothesis
condition,stat,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Z1,Z2",cmi,48.640608,0.019608,True
"Z2,Z3",cmi,5.526978,0.27451,False
"Z1,Z2",secmi2,0.46858,1.0,False
"Z2,Z3",secmi2,4.261104,0.254902,False
"Z1,Z2",secmi3,48.640608,0.019608,True
"Z2,Z3",secmi3,5.526978,0.313725,False


In [82]:
# Number of rejections for every (condition, stat) pair across all repetitions.
print(f"Parameters: n={n}, N={N}, B={B}")
results_df.groupby(['condition', 'stat']).sum()['rejected_null_hypothesis'].to_frame()

Parameters: n=100, N=10, B=50


Unnamed: 0_level_0,Unnamed: 1_level_0,rejected_null_hypothesis
condition,stat,Unnamed: 2_level_1
"Z1,Z2",cmi,10
"Z1,Z2",secmi2,0
"Z1,Z2",secmi3,10
"Z2,Z3",cmi,0
"Z2,Z3",secmi2,0
"Z2,Z3",secmi3,0


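Based on a small sample of 10 repetitions, the null hypothesis is always rejected when conditioning on (Z1, Z2) with the cmi and secmi3 statistics. In all other cases the null hypothesis was not rejected even once. In particular, secmi2 never detects the dependence given (Z1, Z2): Y depends on X only through the parity of X + Z1 + Z2, which is captured by the second-order term II(X;Y;Z1;Z2) that secmi3 includes and secmi2 omits.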

## Task 2
 
This task is solved in R.