In [2]:
from sklearn.metrics import mutual_info_score
#from sklearn.feature_selection import chi2
from scipy.stats import chi2_contingency
from scipy.stats import chi2
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde, pearsonr
import seaborn as sns
import pandas as pd

# Mathematical Underpinnings - Lab 4

Tests to verify hipoteses of independence (from Lab 4):

In [3]:
def indep_test_asymptotic(X, Y, stat):

    if stat == "mi":

        stat_value = 2*len(X)*mutual_info_score(X, Y)

    if stat == "chi2":

        test_res = (chi2_contingency(pd.crosstab(X, Y)))
        stat_value = test_res.statistic

    df = (len(np.unique(X)) - 1)*(len(np.unique(Y)) - 1)

    p_value = 1 - chi2.cdf(stat_value, df=df)

    return stat_value, p_value

def indep_test_permutation(X, Y, B, stat="mi"):

    stat_value = mutual_info_score(X, Y)

    condition_p_value = 0
    for b in range(B):
        X_b = np.random.permutation(X)

        stat_value_b = mutual_info_score(X_b, Y)

        if stat_value <= stat_value_b:
            condition_p_value += 1

    p_value = (1 + condition_p_value)/(1 + B)

    return 2*len(X)*stat_value, p_value

## Task 1

In [4]:
# a function which computes CMI
def conditional_mutual_information(X, Y, Z):

    z_values = np.unique(Z)
    n_z_values = len(z_values)
    n = len(Z)

    cmi = 0

    for i in range(n_z_values):

        z_value_tmp = z_values[i]
        z_condition = (Z == z_value_tmp)

        X_z = X[z_condition]
        Y_z = Y[z_condition]

        mi_XY_z = mutual_info_score(X_z, Y_z)
        p_z = np.sum(z_condition)/n

        cmi += p_z*mi_XY_z

    return cmi

### a)

In [5]:
# CI test based on CMI and asymptotics
def cond_indep_test_asymptotic(X, Y, Z, stat):

    if stat == "cmi":

        stat_value = 2*len(X)*conditional_mutual_information(X, Y, Z)

    if stat == "chi2":
        pass

    df = (len(np.unique(X)) - 1)*(len(np.unique(Y)) - 1)*len(np.unique(Z))

    p_value = 1 - chi2.cdf(stat_value, df=df)

    return stat_value, p_value

### b)

In [6]:
# CI test based on CMI and permutations
def conditional_permutation(X, Z):

    z_values = np.unique(Z)
    n_z_values = len(z_values)
    n = len(Z)

    X_b = np.zeros(n)

    for i in range(n_z_values):

        z_value_tmp = z_values[i]

        X_b[Z == z_value_tmp] = np.random.permutation(X[Z == z_value_tmp])

    return X_b

def cond_indep_test_permutation(X, Y, Z, B, stat="cmi"):

    stat_value = conditional_mutual_information(X, Y, Z)

    condition_p_value = 0
    for b in range(B):
        X_b = conditional_permutation(X, Z)

        stat_value_b = conditional_mutual_information(X_b, Y, Z)

        if stat_value <= stat_value_b:
            condition_p_value += 1

    p_value = (1 + condition_p_value)/(1 + B)

    return 2*len(X)*stat_value, p_value

### c)

conditionaly independent

In [7]:
X = np.random.choice(5, size=1000)
Y = np.random.choice(5, size=1000)
Z = np.random.choice(5, size=1000)

In [8]:
print("asymptotic test of conditional independence")
stat_value, p_value = cond_indep_test_asymptotic(X, Y, Z, "cmi")
print(stat_value, p_value)

asymptotic test of conditional independence
71.00896340885502 0.7536690344865403


In [9]:
print("permutation test of conditional independence")
stat_value, p_value = cond_indep_test_permutation(X, Y, Z, 100)
print(stat_value, p_value)

permutation test of conditional independence


71.00896340885502 0.8712871287128713


conditionaly dependent

In [10]:
X = np.random.choice(5, size=1000)
Y = np.random.choice(5, size=1000)
where_equal = np.random.choice(2, size=1000)
Y[where_equal == 0] = X[where_equal == 0]
Z = np.random.choice(5, size=1000)

In [11]:
print("asymptotic test of conditional dependence")
cond_indep_test_asymptotic(X, Y, Z, "cmi")

asymptotic test of conditional dependence


(998.4643003355621, 0.0)

In [12]:
print("permutation test of conditional dependence")
stat_value, p_value = cond_indep_test_permutation(X, Y, Z, 100)

permutation test of conditional dependence


## Task 2

In [13]:
def discetize_2bins(X):
    X_discrete = 1*(X >= 0) - 1*(X < 0)
    return X_discrete
def sample_from_model1(n):
    Z_tile = np.random.normal(size=n)
    Z = discetize_2bins(Z_tile)

    X_tile = np.random.normal(loc=Z/2, size=n)
    X = discetize_2bins(X_tile)

    Y_tile = np.random.normal(loc=Z/2, size=n)
    Y = discetize_2bins(Y_tile)

    return X, Y, Z


def sample_from_model2(n):
    X_tile = np.random.normal(size=n)
    X = discetize_2bins(X_tile)

    Z_tile = np.random.normal(loc=X/2, size=n)
    Z = discetize_2bins(Z_tile)

    Y_tile = np.random.normal(loc=Z/2, size=n)
    Y = discetize_2bins(Y_tile)
    return X, Y, Z

def sample_from_model3(n):
    X_tile = np.random.normal(size=n)
    X = discetize_2bins(X_tile)

    Y_tile = np.random.normal(size=n)
    Y = discetize_2bins(Y_tile)

    Z_tile = np.random.normal(loc=(X+Y)/2, size=n)
    Z = discetize_2bins(Z_tile)
    return X, Y, Z

### a)

answer:
model 1: X and Y dependent, conditionally independent given Z

model 2: X and Y dependent, conditionally independent given Z

model 3: X and Y independent, conditionally dependent given Z


### b)

In [14]:
model1_x, model1_y, model1_z = sample_from_model1(1000)
mi = mutual_info_score(model1_x, model1_y)
print("mutual information for model 1:", mi)
stat_value = conditional_mutual_information(model1_x, model1_y, model1_z)
print("conditional mutual information for model 1:", stat_value)

mutual information for model 1: 0.011636375281762257
conditional mutual information for model 1: 9.113066449126827e-05


In [15]:
model2_x, model2_y, model2_z = sample_from_model2(1000)
mi = mutual_info_score(model2_x, model2_y)
print("mutual information for model 2:", mi)
stat_value = conditional_mutual_information(model2_x, model2_y, model2_z)
print("conditional mutual information for model 2:", stat_value)

mutual information for model 2: 0.00443631974765285
conditional mutual information for model 2: 0.0030218100641348075


In [16]:
model3_x, model3_y, model3_z = sample_from_model3(1000)
mi = mutual_info_score(model3_x, model3_y)
print("mutual information for model 3:", mi)
stat_value = conditional_mutual_information(model3_x, model3_y, model3_z)
print("conditional mutual information for model 3:", stat_value)

mutual information for model 3: 0.0015726197449853485
conditional mutual information for model 3: 0.016428708024691333


### c)

Independence

In [17]:
print("Independence test for model 1")
stat_value, p_value = indep_test_asymptotic(model1_x, model1_y, "mi")
print(stat_value, p_value)
print("conditional Independence test for model 1")
stat_value, p_value = cond_indep_test_asymptotic(model1_x, model1_y, model1_z, "cmi")
print(stat_value, p_value)
print("permutation test for model 1")
stat_value, p_value = cond_indep_test_permutation(model1_x, model1_y, model1_z, 100)
print(stat_value, p_value)

Independence test for model 1
23.272750563524514 1.405776494212141e-06
conditional Independence test for model 1
0.18226132898253655 0.912898419698284
permutation test for model 1


0.18226132898253655 0.900990099009901


In [18]:
print("Independence test for model 2")
stat_value, p_value = indep_test_asymptotic(model2_x, model2_y, "mi")
print(stat_value, p_value)
print("conditional Independence test for model 2")
stat_value, p_value = cond_indep_test_asymptotic(model2_x, model2_y, model2_z, "cmi")
print(stat_value, p_value)
print("permutation test for model 2")
stat_value, p_value = cond_indep_test_permutation(model2_x, model2_y, model2_z, 100)
print(stat_value, p_value)

Independence test for model 2
8.872639495305702 0.0028947629546270948
conditional Independence test for model 2
6.043620128269615 0.04871296492320876
permutation test for model 2
6.043620128269615 0.039603960396039604


In [19]:
print("Independence test for model 3")
stat_value, p_value = indep_test_asymptotic(model3_x, model3_y, "mi")
print(stat_value, p_value)
print("conditional Independence test for model 3")
stat_value, p_value = cond_indep_test_asymptotic(model3_x, model3_y, model3_z, "cmi")
print(stat_value, p_value)
print("permutation test for model 3")
stat_value, p_value = cond_indep_test_permutation(model3_x, model3_y, model3_z, 100)
print(stat_value, p_value)

Independence test for model 3
3.145239489970697 0.07614882127972067
conditional Independence test for model 3
32.857416049382664 7.329979501147221e-08
permutation test for model 3
32.857416049382664 0.009900990099009901
