In [1]:
from sklearn.metrics import mutual_info_score
#from sklearn.feature_selection import chi2
from scipy.stats import chi2_contingency
from scipy.stats import chi2
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde, pearsonr
import seaborn as sns
import pandas as pd

# Mathematical Underpinnings - Lab 4

Tests to verify hipoteses of independence (from Lab 4):

In [2]:
def indep_test_asymptotic(X, Y, stat):

    if stat == "mi":

        stat_value = 2*len(X)*mutual_info_score(X, Y)

    if stat == "chi2":

        test_res = (chi2_contingency(pd.crosstab(X, Y)))
        stat_value = test_res.statistic

    df = (len(np.unique(X)) - 1)*(len(np.unique(Y)) - 1)

    p_value = 1 - chi2.cdf(stat_value, df=df)

    return stat_value, p_value

def indep_test_permutation(X, Y, B, stat="mi"):

    stat_value = mutual_info_score(X, Y)

    condition_p_value = 0
    for b in range(B):
        X_b = np.random.permutation(X)

        stat_value_b = mutual_info_score(X_b, Y)

        if stat_value <= stat_value_b:
            condition_p_value += 1

    p_value = (1 + condition_p_value)/(1 + B)

    return 2*len(X)*stat_value, p_value

## Task 1

In [3]:
# a function which computes CMI

def conditional_mutual_information(X, Y, Z):

    z_values = np.unique(Z)
    n_z_values = len(z_values)
    n = len(Z)

    cmi = 0

    for i in range(n_z_values):

        z_value_tmp = z_values[i]
        z_condition = (Z == z_value_tmp)

        X_z = X[z_condition]
        Y_z = Y[z_condition]

        mi_XY_z = mutual_info_score(X_z, Y_z)
        p_z = np.sum(z_condition)/n

        cmi += p_z*mi_XY_z

    return cmi

### a)

In [4]:
# CI test based on CMI and asymptotics

def cond_indep_test_asymptotic(X, Y, Z, stat):

    if stat == "cmi":

        stat_value = 2*len(X)*conditional_mutual_information(X, Y, Z)

    if stat == "chi2":
        pass

    df = (len(np.unique(X)) - 1)*(len(np.unique(Y)) - 1)*len(np.unique(Z))

    p_value = 1 - chi2.cdf(stat_value, df=df)

    return stat_value, p_value

### b)

In [5]:
# CI test based on CMI and permutations

def conditional_permutation(X, Z):

    z_values = np.unique(Z)
    n_z_values = len(z_values)
    n = len(Z)

    X_b = np.zeros(n)

    for i in range(n_z_values):

        z_value_tmp = z_values[i]

        X_b[Z == z_value_tmp] = np.random.permutation(X[Z == z_value_tmp])

    return X_b

def cond_indep_test_permutation(X, Y, Z, B, stat="cmi"):

    stat_value = conditional_mutual_information(X, Y, Z)

    condition_p_value = 0
    for b in range(B):
        X_b = conditional_permutation(X, Z)

        stat_value_b = conditional_mutual_information(X_b, Y, Z)

        if stat_value <= stat_value_b:
            condition_p_value += 1

    p_value = (1 + condition_p_value)/(1 + B)

    return 2*len(X)*stat_value, p_value

### c)

conditionaly independent

In [6]:
X = np.random.choice(5, size=1000)
Y = np.random.choice(5, size=1000)
Z = np.random.choice(5, size=1000)

In [7]:
cond_indep_test_asymptotic(X, Y, Z, "cmi")

(94.26895991202782, 0.13152682401038773)

In [8]:
cond_indep_test_permutation(X, Y, Z, 100)

(94.26895991202782, 0.2079207920792079)

conditionaly dependent

In [9]:
X = np.random.choice(5, size=1000)
Y = np.random.choice(5, size=1000)
where_equal = np.random.choice(2, size=1000)
Y[where_equal == 0] = X[where_equal == 0]
Z = np.random.choice(5, size=1000)

In [10]:
cond_indep_test_asymptotic(X, Y, Z, "cmi")

(876.6782303060543, 0.0)

In [11]:
cond_indep_test_permutation(X, Y, Z, 100)

(876.6782303060543, 0.009900990099009901)

## Task 2

In [12]:
def discetize_2bins(X):
    X_discrete = 1*(X >= 0) - 1*(X < 0)
    return X_discrete

In [13]:
def sample_from_model1(n):
    Z_tilde = np.random.normal(size=n)
    Z = discetize_2bins(Z_tilde)

    X_tilde = np.random.normal(loc=Z/2, size=n)
    X = discetize_2bins(X_tilde)

    Y_tilde = np.random.normal(loc=Z/2, size=n)
    Y = discetize_2bins(Y_tilde)

    return X, Y, Z

In [14]:
def sample_from_model2(n):
    X_tilde = np.random.normal(size=n)
    X = discetize_2bins(X_tilde)

    Z_tilde = np.random.normal(loc=X/2, size=n)
    Z = discetize_2bins(Z_tilde)

    Y_tilde = np.random.normal(loc=Z/2, size=n)
    Y = discetize_2bins(Y_tilde)

    return X, Y, Z

In [43]:
def sample_from_model3(n):
    X_tilde = np.random.normal(size=n)
    X = discetize_2bins(X_tilde)

    Y_tilde = np.random.normal(size=n)
    Y = discetize_2bins(Y_tilde)

    Z_tilde = np.random.normal(loc=(X+Y)/2, size=n)
    Z = discetize_2bins(Z_tilde)

    return X, Y, Z

### a)

answer:

model 1: X and Y dependent, conditionally independent given Z

model 2: X and Y dependent, conditionally independent given Z

model 3: X and Y independent, conditionally dependent given Z

### c)

independence tests, conditional independence tests

In [58]:
results_independence = pd.DataFrame(np.zeros((3, 3)), columns=['model1', 'model2', 'model3'], index=['mi_as', 'chi_as', 'mi_perm'])
results_conditional_independence = pd.DataFrame(np.zeros((2, 3)), columns=['model1', 'model2', 'model3'], index=['cmi_as', 'cmi_perm'])

In [59]:
n = 1000
B = 100

In [63]:
X, Y, Z = sample_from_model1(n)
results_independence['model1'] = [indep_test_asymptotic(X, Y, 'mi')[1], indep_test_asymptotic(X, Y, 'chi2')[1], indep_test_permutation(X, Y, B)[1]]
results_conditional_independence['model1'] = [cond_indep_test_asymptotic(X, Y, Z, 'cmi')[1], cond_indep_test_permutation(X, Y, Z, B)[1]]

In [64]:
X, Y, Z = sample_from_model2(n)
results_independence['model2'] = [indep_test_asymptotic(X, Y, 'mi')[1], indep_test_asymptotic(X, Y, 'chi2')[1], indep_test_permutation(X, Y, B)[1]]
results_conditional_independence['model2'] = [cond_indep_test_asymptotic(X, Y, Z, 'cmi')[1], cond_indep_test_permutation(X, Y, Z, B)[1]]

In [65]:
X, Y, Z = sample_from_model3(n)
results_independence['model3'] = [indep_test_asymptotic(X, Y, 'mi')[1], indep_test_asymptotic(X, Y, 'chi2')[1], indep_test_permutation(X, Y, B)[1]]
results_conditional_independence['model3'] = [cond_indep_test_asymptotic(X, Y, Z, 'cmi')[1], cond_indep_test_permutation(X, Y, Z, B)[1]]

In [69]:
results_independence.round(2) 

Unnamed: 0,model1,model2,model3
mi_as,0.0,0.0,0.72
chi_as,0.0,0.0,0.76
mi_perm,0.01,0.01,0.73


In [68]:
results_conditional_independence.round(2)

Unnamed: 0,model1,model2,model3
cmi_as,0.87,0.65,0.0
cmi_perm,0.84,0.63,0.02
