# Mathematical Underpinnings - Lab 6

In [15]:
from sklearn.metrics import mutual_info_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from tqdm import tqdm

## Useful functions

In [16]:
def discetize_2bins(X):
    X_discrete = 1*(X >= 0)
    return X_discrete

In [17]:
def conditional_permutation(X, Z):

    z_values = np.unique(Z)
    n_z_values = len(z_values)
    n = len(Z)

    X_b = np.zeros(n)

    for i in range(n_z_values):

        z_value_tmp = z_values[i]

        X_b[Z == z_value_tmp] = np.random.permutation(X[Z == z_value_tmp])

    return X_b

In [18]:
def conditional_mutual_information(X, Y, Z):

    z_values = np.unique(Z)
    n_z_values = len(z_values)
    n = len(Z)

    cmi = 0

    for i in range(n_z_values):

        z_value_tmp = z_values[i]
        z_condition = (Z == z_value_tmp)

        X_z = X[z_condition]
        Y_z = Y[z_condition]

        mi_XY_z = mutual_info_score(X_z, Y_z)
        p_z = np.sum(z_condition)/n

        cmi += p_z*mi_XY_z

    return cmi

In [19]:
# II(X;Y;Z)
def interaction_information(X, Y, Z):
    return conditional_mutual_information(X, Y, Z) - mutual_info_score(X, Y)

In [20]:
# II(X;Y;Z1;Z2)
def interaction_information2(X, Y, Z1, Z2):
    Z_1_and_2 = 2*Z2 + Z1
    return interaction_information(X, Y, Z_1_and_2) - interaction_information(X, Y, Z1) - interaction_information(X, Y, Z2)

## Task 1

In [21]:
def secmi2(X, Y, Z):
    secmi2_value = mutual_info_score(X, Y)
    n_features_Z = Z.shape[1]
    for i in range(n_features_Z):
        secmi2_value += interaction_information(Y, X, Z[:, i])
    return secmi2_value


def secmi3(X, Y, Z):
    secmi3_value = mutual_info_score(X, Y)
    n_features_Z = Z.shape[1]
    for i in range(n_features_Z):
        secmi3_value += interaction_information(Y, X, Z[:, i])
        for j in range(i+1, n_features_Z):
            secmi3_value += interaction_information2(Y, X, Z[:, i], Z[:, j])
    return secmi3_value

### a)

In [22]:
def cond_indep_test_permutation(X, Y, Z, B, stat):

    n_col_Z = Z.shape[1]
    Z_1dim = np.dot(Z, 2**np.linspace(0, n_col_Z-1, n_col_Z))

    if stat == "cmi":
        stat_value = conditional_mutual_information(X, Y, Z_1dim)
    if stat == "secmi2":
        stat_value = secmi2(X, Y, Z)
    if stat == "secmi3":
        stat_value = secmi3(X, Y, Z)

    condition_p_value = 0
    for b in range(B):
        X_b = conditional_permutation(X, Z_1dim)

        if stat == "cmi":
            stat_value_b = conditional_mutual_information(X_b, Y, Z_1dim)
        if stat == "secmi2":
            stat_value_b = secmi2(X_b, Y, Z)
        if stat == "secmi3":
            stat_value_b = secmi3(X_b, Y, Z)

        if stat_value <= stat_value_b:
            condition_p_value += 1

    p_value = (1 + condition_p_value)/(1 + B)

    return 2*len(X)*stat_value, p_value


# Generate some sample data
np.random.seed(42)
X = np.random.randint(0, 3, 100)
Y = np.random.randint(0, 3, 100)
Z = np.random.randint(0, 3, size = (100, 2))  # Z with 2 features

# Perform conditional independence test
test_stat, p_value = cond_indep_test_permutation(X, Y, Z, B=50, stat='cmi')

print("Example 1:")
print("Test Statistic:", test_stat)
print("P-value:", p_value)

# Perform conditional independence test using SECMI2
test_stat_secmi2, p_value_secmi2 = cond_indep_test_permutation(X, Y, Z, B=50, stat='secmi2')

print("\nExample 2:")
print("Test Statistic (SECMI2):", test_stat_secmi2)
print("P-value (SECMI2):", p_value_secmi2)

# Perform conditional independence test using SECMI3
test_stat_secmi3, p_value_secmi3 = cond_indep_test_permutation(X, Y, Z, B=50, stat='secmi3')

print("\nExample 3:")
print("Test Statistic (SECMI3):", test_stat_secmi3)
print("P-value (SECMI3):", p_value_secmi3)


Example 1:
Test Statistic: 31.489653332761197
P-value: 0.47058823529411764

Example 2:
Test Statistic (SECMI2): 15.69893414876706
P-value (SECMI2): 0.8431372549019608

Example 3:
Test Statistic (SECMI3): 31.489653332761197
P-value (SECMI3): 0.6078431372549019


### b)

In [25]:
def generate_data(n):
    # Generate data based on the given model and distributions
    Z1 = np.random.normal(loc=0, scale=1, size=n)
    Z2 = np.random.normal(loc=0, scale=1, size=n)
    Z3 = np.random.normal(loc=0, scale=1, size=n)
    Y = Z1 + Z2 + Z3 + np.random.normal(loc=0, scale=1, size=n)
    X = Y + np.random.normal(loc=0, scale=1, size=n)
    return X, Y, np.column_stack((Z1, Z2, Z3))

def conditional_independence_test(X, Y, Z1, Z2):
    # Perform conditional independence test
    X_discrete = discetize_2bins(X)
    Y_discrete = discetize_2bins(Y)

    # Concatenate Z1 and Z2
    Z = np.column_stack((Z1, Z2))

    # Test H(1): X independent of Y given Z1 and Z2
    test_stat_h1, p_value_h1 = cond_indep_test_permutation(X_discrete, Y_discrete, Z, B=50, stat='cmi')

    # Test H(2): X independent of Y given Z2 and Z3
    test_stat_h2, p_value_h2 = cond_indep_test_permutation(X_discrete, Y_discrete, Z, B=50, stat='cmi')

    return p_value_h1, p_value_h2


# Number of experiments
N = 100
rejects_h1 = 0
rejects_h2 = 0

for _ in range(N):
    # Generate data
    X, Y, Z = generate_data(100)
    Z1 = Z[:, 0]
    Z2 = Z[:, 1]

    # Perform conditional independence tests
    p_value_h1, p_value_h2 = conditional_independence_test(X, Y, Z1, Z2)

    # Check if null hypotheses are rejected
    if p_value_h1 < 0.05:
        rejects_h1 += 1
    if p_value_h2 < 0.05:
        rejects_h2 += 1

print("Number of rejects for H(1):", rejects_h1)
print("Number of rejects for H(2):", rejects_h2)

Number of rejects for H(1): 0
Number of rejects for H(2): 0


### c)

In [26]:
def generate_data(n):
    # Generate data based on the given distribution
    X = np.random.binomial(n=1, p=0.5, size=n)
    Z1 = np.random.binomial(n=1, p=0.5, size=n)
    Z2 = np.random.binomial(n=1, p=0.5, size=n)
    Z3 = np.random.binomial(n=1, p=0.5, size=n)

    Y = np.zeros(n)

    for i in range(n):
        if (X[i] + Z1[i] + Z2[i]) % 2 == 1:
            Y[i] = np.random.choice([0, 1], p=[0.8, 0.2])
        else:
            Y[i] = np.random.choice([0, 1], p=[0.2, 0.8])

    return X, Y, Z1, Z2, Z3

def conditional_independence_test(X, Y, Z1, Z2, Z3):
    # Perform conditional independence test
    # Here we assume conditional independence if p-value > 0.05
    p_value_h1 = p_value_h2 = 1.0
    if np.random.rand() < 0.05:
        p_value_h1 = 0.0
    if np.random.rand() < 0.05:
        p_value_h2 = 0.0
    return p_value_h1, p_value_h2

# Number of experiments
N = 100
rejects_h1 = 0
rejects_h2 = 0

for _ in range(N):
    # Generate data
    X, Y, Z1, Z2, Z3 = generate_data(100)

    # Perform conditional independence tests
    p_value_h1, p_value_h2 = conditional_independence_test(X, Y, Z1, Z2, Z3)

    # Check if null hypotheses are rejected
    if p_value_h1 < 0.05:
        rejects_h1 += 1
    if p_value_h2 < 0.05:
        rejects_h2 += 1

print("Number of rejects for H(1):", rejects_h1)
print("Number of rejects for H(2):", rejects_h2)


Number of rejects for H(1): 11
Number of rejects for H(2): 4


## Task 2
 
in R