# Mathematical Underpinnings - Lab 6

In [1]:
from sklearn.metrics import mutual_info_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from tqdm import tqdm

## Useful functions

In [2]:
def discetize_2bins(X):
    """Binarise X by sign: 1 where ``X >= 0``, otherwise 0.

    The comparison is elementwise for array inputs; multiplying the
    boolean result by 1 turns it into integers.
    """
    is_non_negative = (X >= 0)
    return is_non_negative * 1

In [3]:
def conditional_permutation(X, Z):
    """Shuffle X separately inside every stratum defined by the values of Z.

    For each distinct value of ``Z`` the entries of ``X`` at the matching
    positions are randomly permuted among themselves (using the global
    ``np.random`` state), so the pairing between X and Z is preserved while
    any additional structure of X within a stratum is destroyed.

    Parameters
    ----------
    X : array-like, shape (n,)
        Values to permute.
    Z : array-like, shape (n,)
        Stratum label of every observation.

    Returns
    -------
    numpy.ndarray, shape (n,)
        Permuted copy of ``X`` with the same dtype as ``X``.
    """
    X = np.asarray(X)
    Z = np.asarray(Z)

    # Start from a copy of X rather than a float zero buffer: this keeps the
    # dtype of the labels (ints stay ints, strings are supported) and leaves
    # any position that matches no stratum untouched.
    X_b = X.copy()

    for z_value in np.unique(Z):
        in_stratum = (Z == z_value)
        X_b[in_stratum] = np.random.permutation(X[in_stratum])

    return X_b

In [4]:
def conditional_mutual_information(X, Y, Z):
    """Plug-in estimate of the conditional mutual information I(X;Y|Z).

    The estimate is the average of the within-stratum mutual information
    of X and Y, weighted by the empirical frequency of each value of Z.
    """
    n_samples = len(Z)
    total = 0

    for level in np.unique(Z):
        in_stratum = (Z == level)

        # Empirical probability of this stratum.
        weight = np.sum(in_stratum)/n_samples

        # Mutual information of X and Y restricted to the stratum.
        mi_in_stratum = mutual_info_score(X[in_stratum], Y[in_stratum])

        total += weight*mi_in_stratum

    return total

In [5]:
# II(X;Y;Z)
def interaction_information(X, Y, Z):
    """Interaction information II(X;Y;Z) = I(X;Y|Z) - I(X;Y)."""
    conditional_term = conditional_mutual_information(X, Y, Z)
    marginal_term = mutual_info_score(X, Y)
    return conditional_term - marginal_term

In [6]:
# II(X;Y;Z1;Z2)
def interaction_information2(X, Y, Z1, Z2):
    """Four-way interaction information II(X;Y;Z1;Z2).

    Computed as II(X;Y;(Z1,Z2)) - II(X;Y;Z1) - II(X;Y;Z2), where (Z1,Z2)
    is the joint variable obtained by giving every observed pair of values
    its own integer code.

    Parameters
    ----------
    X, Y, Z1, Z2 : 1-D arrays of equal length holding discrete labels.

    Returns
    -------
    float
    """
    # Encode each variable by the rank of its value, then combine the ranks
    # with a radix equal to the number of Z1 levels. Unlike the fixed
    # ``2*Z2 + Z1`` this cannot map two different (Z1, Z2) pairs to the same
    # code when a variable has more than two levels; for 0/1 inputs the
    # codes are the same as before.
    z1_levels, z1_codes = np.unique(Z1, return_inverse=True)
    _, z2_codes = np.unique(Z2, return_inverse=True)
    Z_1_and_2 = len(z1_levels)*z2_codes + z1_codes

    return (interaction_information(X, Y, Z_1_and_2)
            - interaction_information(X, Y, Z1)
            - interaction_information(X, Y, Z2))

## Task 1

In [21]:
def secmi2(X, Y, Z):
    """Second-order statistic: I(X;Y) plus II(X;Y;Z_j) for every column Z_j.

    ``Z`` is indexed as a 2-D array with one conditioning variable per column.
    """
    total = mutual_info_score(X, Y)
    n_columns = Z.shape[1]
    column = 0
    while column < n_columns:
        total += interaction_information(X, Y, Z[:, column])
        column += 1
    return total

def secmi3(X, Y, Z):
    """Third-order statistic: ``secmi2`` plus II(X;Y;Z_i;Z_j) over all i < j.

    ``Z`` is indexed as a 2-D array with one conditioning variable per column.
    """
    n_columns = Z.shape[1]

    # Every unordered pair of distinct columns, in row-major (i, then j) order.
    column_pairs = [(first, second)
                    for first in range(n_columns)
                    for second in range(first + 1, n_columns)]

    total = secmi2(X, Y, Z)
    for first, second in column_pairs:
        total += interaction_information2(X, Y, Z[:, first], Z[:, second])
    return total

### a)

In [8]:
def cond_indep_test_permutation(X, Y, Z, B, stat):
    """Permutation test of conditional independence of X and Y given Z.

    The observed statistic is compared with its value on ``B`` samples in
    which X has been permuted within the strata of Z.

    Parameters
    ----------
    X, Y : 1-D arrays of discrete labels.
    Z : 2-D array, one conditioning variable per column.
        The columns are collapsed into a single stratum label by a base-2
        encoding, which assumes 0/1 entries.
    B : int
        Number of conditional permutations.
    stat : {"cmi", "secmi2", "secmi3"}
        Which test statistic to use.

    Returns
    -------
    (float, float)
        ``2 * len(X) * statistic`` on the observed data, and the permutation
        p-value ``(1 + #{b : stat <= stat_b}) / (1 + B)``.

    Raises
    ------
    ValueError
        If ``stat`` is not one of the supported names.
    """
    # Fail early and clearly; previously an unknown name only surfaced later
    # as an unbound local variable.
    if stat not in ("cmi", "secmi2", "secmi3"):
        raise ValueError(
            f"unknown stat {stat!r}; expected 'cmi', 'secmi2' or 'secmi3'"
        )

    n_col_Z = Z.shape[1]
    # Base-2 encoding of a row of Z into one stratum label.
    Z_1dim = np.dot(Z, 2**np.linspace(0, n_col_Z-1, n_col_Z))

    def statistic(X_sample):
        # Single place that maps the statistic name to its computation.
        if stat == "cmi":
            return conditional_mutual_information(X_sample, Y, Z_1dim)
        if stat == "secmi2":
            return secmi2(X_sample, Y, Z)
        return secmi3(X_sample, Y, Z)

    stat_value = statistic(X)

    condition_p_value = 0
    for _ in range(B):
        X_b = conditional_permutation(X, Z_1dim)
        if stat_value <= statistic(X_b):
            condition_p_value += 1

    # The +1 terms count the observed sample among the permutations.
    p_value = (1 + condition_p_value)/(1 + B)

    return 2*len(X)*stat_value, p_value

### b)

In [53]:
def generate_sample_1b(n):
    """Draw n observations of binary (X, Y, Z1, Z2, Z3) for task 1b.

    Each variable is a unit-variance Gaussian thresholded with
    ``discetize_2bins``. Y has mean 0; Z1, Z2 and Z3 have mean ``2*Y - 1``;
    X has mean ``2*Z1 - 1``.
    """
    def binarised_normal(mean):
        # One Gaussian draw per observation, then thresholded.
        return discetize_2bins(np.random.normal(mean, 1, size=n))

    Y = binarised_normal(0)

    # Three noisy copies of Y.
    Z1 = binarised_normal(2*Y-1)
    Z2 = binarised_normal(2*Y-1)
    Z3 = binarised_normal(2*Y-1)

    # X is generated from Z1 alone.
    X = binarised_normal(2*Z1-1)

    return X, Y, Z1, Z2, Z3

In [54]:
def run_tests_1b(n, N=100, B=50, alpha=0.05):
    """Count rejections of the conditional independence test on 1b data.

    Repeats ``N`` times: draw a sample with ``generate_sample_1b(n)`` and run
    ``cond_indep_test_permutation`` for every statistic, once conditioning on
    (Z1, Z2) and once on (Z2, Z3).

    Parameters
    ----------
    n : int
        Sample size of every simulated data set.
    N : int, default 100
        Number of simulated data sets.
    B : int, default 50
        Number of permutations per test.
    alpha : float, default 0.05
        A test counts as a rejection when its p-value is below ``alpha``.

    Returns
    -------
    pandas.DataFrame
        One row per (conditioning set, statistic) and a single column
        ``'rejected'`` holding the number of rejections.
    """
    stats = ["cmi", "secmi2", "secmi3"]
    index = pd.MultiIndex.from_product([["Z1+Z2", "Z2+Z3"], stats])
    df = pd.DataFrame(data=np.zeros((len(index), 1)), columns=['rejected'], index=index)

    for _ in tqdm(range(N)):
        X, Y, Z1, Z2, Z3 = generate_sample_1b(n)
        Z12 = np.hstack([Z1.reshape(-1, 1), Z2.reshape(-1, 1)])
        Z23 = np.hstack([Z2.reshape(-1, 1), Z3.reshape(-1, 1)])

        for Z_name, Z in [("Z1+Z2", Z12), ("Z2+Z3", Z23)]:
            for stat in stats:
                _, p_value = cond_indep_test_permutation(X, Y, Z, B, stat)
                if p_value < alpha:
                    # Address the row by its full MultiIndex key and name the
                    # column explicitly; ``df.loc[Z_name, stat]`` could be
                    # read as (row, column) instead.
                    df.loc[(Z_name, stat), 'rejected'] += 1

    return df

In [55]:
# Task 1b experiment: 100 simulated data sets of 100 observations each.
n = 100
results_1b = run_tests_1b(n=n, N=100)

100%|██████████| 100/100 [05:47<00:00,  3.48s/it]


In [52]:
# Number of rejections per conditioning set and statistic from run_tests_1b.
results_1b

Unnamed: 0,Unnamed: 1,rejected
Z1+Z2,cmi,3.0
Z1+Z2,secmi2,1.0
Z1+Z2,secmi3,3.0
Z2+Z3,cmi,54.0
Z2+Z3,secmi2,50.0
Z2+Z3,secmi3,54.0


### c)

In [26]:
def generate_sample_1c(n):
    """Draw n observations of binary (X, Y, Z1, Z2, Z3) for task 1c.

    X, Z1, Z2 and Z3 are independent fair coin flips. Y is Bernoulli with
    success probability 0.8 when X + Z1 + Z2 is odd and 0.2 when it is even;
    Z3 does not enter the distribution of Y.
    """
    def fair_bits():
        return np.random.binomial(1, 0.5, n)

    X = fair_bits()
    Z1 = fair_bits()
    Z2 = fair_bits()
    Z3 = fair_bits()

    # Parity (0 or 1) of X + Z1 + Z2 selects the success probability of Y.
    parity = (X + Z1 + Z2) % 2
    success_probability = np.array([0.2, 0.8])[parity]

    Y = np.random.binomial(1, success_probability, n)
    return X, Y, Z1, Z2, Z3

In [41]:
def run_tests_1c(n, N=100, B=50, alpha=0.05):
    """Count rejections of the conditional independence test on 1c data.

    Repeats ``N`` times: draw a sample with ``generate_sample_1c(n)`` and run
    ``cond_indep_test_permutation`` for every statistic, once conditioning on
    (Z1, Z2) and once on (Z2, Z3).

    Parameters
    ----------
    n : int
        Sample size of every simulated data set.
    N : int, default 100
        Number of simulated data sets.
    B : int, default 50
        Number of permutations per test.
    alpha : float, default 0.05
        A test counts as a rejection when its p-value is below ``alpha``.

    Returns
    -------
    pandas.DataFrame
        One row per (conditioning set, statistic) and a single column
        ``'rejected'`` holding the number of rejections.
    """
    stats = ["cmi", "secmi2", "secmi3"]
    index = pd.MultiIndex.from_product([["Z1+Z2", "Z2+Z3"], stats])
    df = pd.DataFrame(data=np.zeros((len(index), 1)), columns=['rejected'], index=index)

    for _ in tqdm(range(N)):
        X, Y, Z1, Z2, Z3 = generate_sample_1c(n)
        Z12 = np.hstack([Z1.reshape(-1, 1), Z2.reshape(-1, 1)])
        Z23 = np.hstack([Z2.reshape(-1, 1), Z3.reshape(-1, 1)])

        for Z_name, Z in [("Z1+Z2", Z12), ("Z2+Z3", Z23)]:
            for stat in stats:
                _, p_value = cond_indep_test_permutation(X, Y, Z, B, stat)
                if p_value < alpha:
                    # Address the row by its full MultiIndex key and name the
                    # column explicitly; ``df.loc[Z_name, stat]`` could be
                    # read as (row, column) instead.
                    df.loc[(Z_name, stat), 'rejected'] += 1

    return df

In [42]:
# Task 1c experiment with samples of 100 observations (N keeps its default).
results = run_tests_1c(n=100)

100%|██████████| 100/100 [05:49<00:00,  3.50s/it]


In [49]:
# Lay the six rejection counts out as a 2x3 table: one row per conditioning
# set, one column per statistic.
rejection_counts = results.to_numpy().reshape(2,3)
table_columns = pd.MultiIndex.from_product([['rejected'],['cmi','secmi2','secmi3']])
results = pd.DataFrame(rejection_counts,
                       index=['Z1+Z2','Z2+Z3'],
                       columns=table_columns)
results = results.astype(int)
results

Unnamed: 0_level_0,rejected,rejected,rejected
Unnamed: 0_level_1,cmi,secmi2,secmi3
Z1+Z2,100,4,100
Z2+Z3,2,6,1


## Task 2

in R