In [1]:
from sklearn.metrics import mutual_info_score
#from sklearn.feature_selection import chi2
from scipy.stats import chi2_contingency
from scipy.stats import chi2
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde, pearsonr
import seaborn as sns
import pandas as pd

# Mathematical Underpinnings - Lab 4

Tests to verify hipoteses of independence (from Lab 4):

In [2]:
def indep_test_asymptotic(X, Y, stat):

    if stat == "mi":

        stat_value = 2 * len(X) * mutual_info_score(X, Y)

    if stat == "chi2":

        test_res = chi2_contingency(pd.crosstab(X, Y))
        stat_value = test_res.statistic

    df = (len(np.unique(X)) - 1) * (len(np.unique(Y)) - 1)

    p_value = 1 - chi2.cdf(stat_value, df=df)

    return stat_value, p_value


def indep_test_permutation(X, Y, B, stat="mi"):

    stat_value = mutual_info_score(X, Y)

    condition_p_value = 0
    for b in range(B):
        X_b = np.random.permutation(X)

        stat_value_b = mutual_info_score(X_b, Y)

        if stat_value <= stat_value_b:
            condition_p_value += 1

    p_value = (1 + condition_p_value) / (1 + B)

    return 2 * len(X) * stat_value, p_value

## Task 1

this task is homework

In [3]:
# a function which computes CMI


def conditional_mutual_information(X, Y, Z):

    z_values = np.unique(Z)
    n = len(Z)

    cmi = 0

    for i in range(len(z_values)):

        curr_z_value = z_values[i]
        mask = Z == curr_z_value

        mi_XY_z = mutual_info_score(X[mask], Y[mask])
        p_z = np.sum(mask) / n

        cmi += p_z * mi_XY_z

    return cmi

### a)

In [4]:
# CI test based on CMI and asymptotics
def cond_indep_test_asymptotic(X, Y, Z):

    stat_value = 2 * len(X) * conditional_mutual_information(X, Y, Z)

    df = (len(np.unique(X)) - 1) * (len(np.unique(Y)) - 1) * len(np.unique(Z))

    p_value = 1 - chi2.cdf(stat_value, df=df)

    return stat_value, p_value

### b)

In [5]:
# CI test based on CMI and permutations


def conditional_permutation(X, Z):
    rng = np.random.default_rng()

    z_values = np.unique(Z)
    n = len(Z)
    X_b = np.zeros(n)

    for i in range(len(z_values)):

        curr_z_value = z_values[i]
        X_b[Z == curr_z_value] = rng.permutation(X[Z == curr_z_value])

    return X_b


def cond_indep_test_permutation(X, Y, Z, B):

    stat_value = conditional_mutual_information(X, Y, Z)

    condition_p_value = 0
    for b in range(B):
        X_b = conditional_permutation(X, Z)

        stat_value_b = conditional_mutual_information(X_b, Y, Z)

        if stat_value <= stat_value_b:
            condition_p_value += 1

    p_value = (1 + condition_p_value) / (1 + B)

    return 2 * len(X) * stat_value, p_value

### c)

conditionaly independent

In [93]:
rng = np.random.default_rng()

X = rng.choice(6, size=1000)
Y = rng.choice(6, size=1000)
Z = rng.choice(6, size=1000)

In [94]:
cond_indep_test_asymptotic(X, Y, Z)

(150.12885210414842, 0.48168035338447124)

In [95]:
cond_indep_test_permutation(X, Y, Z, 100)

(150.12885210414842, 0.7128712871287128)

conditionaly dependent

In [96]:
Y = rng.choice(10, size=1000)
X = rng.choice(10, size=1000)

eq_zero = rng.choice(2, size=1000)
Y[eq_zero == 0] = X[eq_zero == 0]
Z = rng.choice(5, size=1000)

In [97]:
cond_indep_test_asymptotic(X, Y, Z)

(1897.758550621121, 0.0)

In [98]:
cond_indep_test_permutation(X, Y, Z, 100)

(1897.758550621121, 0.009900990099009901)

## Task 2

In [135]:
rng = np.random.default_rng()


def discretize(arr):
    r = np.zeros_like(arr)
    r[arr >= 0] = 1
    r[arr < 0] = -1
    return r


def sample_from_model1(n):
    Z_tilde = rng.normal(size=n)
    Z = discretize(Z_tilde)

    X_tilde = rng.normal(loc=Z / 2, scale=1, size=n)
    X = discretize(X_tilde)

    Y_tilde = rng.normal(loc=Z / 2, scale=1, size=n)
    Y = discretize(Y_tilde)

    return X, Y, Z


def sample_from_model2(n):
    X_tilde = rng.normal(size=n)
    X = discretize(X_tilde)

    Z_tilde = rng.normal(loc=X / 2, scale=1, size=n)
    Z = discretize(Z_tilde)

    Y_tilde = rng.normal(loc=Z / 2, scale=1, size=n)
    Y = discretize(Y_tilde)

    return X, Y, Z


def sample_from_model3(n):
    X_tilde = rng.normal(size=n)
    Y_tilde = rng.normal(size=n)

    X = discretize(X_tilde)
    Y = discretize(Y_tilde)

    Z_tilde = rng.normal(loc=(X + Y) / 2, scale=1, size=n)
    Z = discretize(Z_tilde)

    return X, Y, Z

### a)

answer:

model 1:
- independence: no
- conditional independence: yes


model 2:
- independence: no
- conditional independence: yes (Markov chain)


model 3:
- independence: yes
- conditional independence: no

### b)

In [150]:
def mi_and_cmi(X, Y, Z):
    print(f"{' mutual information between X and Y' :#^60}")
    print(mutual_info_score(X, Y))

    print(f"{' conditional mutual information between X and Y given Z ' :#^60}")
    print(conditional_mutual_information(X, Y, Z))

In [146]:
n = 1000

In [154]:
mi_and_cmi(*sample_from_model1(n))

############ mutual information between X and Y#############
0.022823037493722442
## conditional mutual information between X and Y given Z ##
0.0032749728604355515


In [155]:
mi_and_cmi(*sample_from_model2(n))

############ mutual information between X and Y#############
0.017025738563786808
## conditional mutual information between X and Y given Z ##
0.00043250830244191853


In [157]:
mi_and_cmi(*sample_from_model3(n))

############ mutual information between X and Y#############
0.0007121882149421777
## conditional mutual information between X and Y given Z ##
0.006909107075214898


### c)

In [144]:
def indep_or_cond_indep(X, Y, Z, B=100):
    print(f"{' independence (asymptotic mi) ' :#^60}")
    print(f"{' between X and Y ' :*>30}")
    print(indep_test_asymptotic(X, Y, "mi"))
    print(f"{' between X and Z ' :*>30}")
    print(indep_test_asymptotic(X, Z, "mi"))
    print(f"{' between Y and Z ' :*>30}")
    print(indep_test_asymptotic(Y, Z, "mi"))

    print()

    print(f"{' independence (asymptotic chi2) ' :#^60}")
    print(f"{' between X and Y ' :*>30}")
    print(indep_test_asymptotic(X, Y, "chi2"))
    print(f"{' between X and Z ' :*>30}")
    print(indep_test_asymptotic(X, Z, "chi2"))
    print(f"{' between Y and Z ' :*>30}")
    print(indep_test_asymptotic(Y, Z, "chi2"))

    print()

    print(f"{' independence (permutation mi) ' :#^60}")
    print(f"{' between X and Y ' :*>30}")
    print(indep_test_permutation(X, Y, B=B))
    print(f"{' between X and Z ' :*>30}")
    print(indep_test_permutation(X, Z, B=B))
    print(f"{' between Y and Z ' :*>30}")
    print(indep_test_permutation(Y, Z, B=B))

    print()

    print(f"{' conditional indempendence (asymptotic cmi) ' :#^60}")
    print(cond_indep_test_asymptotic(X, Y, Z))

    print(f"{' conditional indempendence (permutation cmi) ' :#^60}")
    print(cond_indep_test_permutation(X, Y, Z, B=B))

In [147]:
indep_or_cond_indep(*sample_from_model1(n))

############### independence (asymptotic mi) ###############
************* between X and Y 
(7.4283487921561475, 0.006420422846351692)
************* between X and Z 
(173.75771994784682, 0.0)
************* between Y and Z 
(115.1085572236173, 0.0)

############## independence (asymptotic chi2) ##############
************* between X and Y 
(7.078380979077759, 0.007801947048665658)
************* between X and Z 
(166.95920614283276, 0.0)
************* between Y and Z 
(111.5435914960044, 0.0)

############## independence (permutation mi) ###############
************* between X and Y 
(7.4283487921561475, 0.009900990099009901)
************* between X and Z 
(173.75771994784682, 0.009900990099009901)
************* between Y and Z 
(115.1085572236173, 0.009900990099009901)

######## conditional indempendence (asymptotic cmi) ########
(3.835676940534005, 0.14692420017427454)
####### conditional indempendence (permutation cmi) ########
(3.835676940534005, 0.1782178217821782)


In [148]:
indep_or_cond_indep(*sample_from_model2(n))

############### independence (asymptotic mi) ###############
************* between X and Y 
(18.093783871338086, 2.1028682169577095e-05)
************* between X and Z 
(134.99922990307212, 0.0)
************* between Y and Z 
(171.28520225473287, 0.0)

############## independence (asymptotic chi2) ##############
************* between X and Y 
(17.504757463675606, 2.8658965222971844e-05)
************* between X and Z 
(130.40290263364156, 0.0)
************* between Y and Z 
(164.72257238705197, 0.0)

############## independence (permutation mi) ###############
************* between X and Y 
(18.093783871338086, 0.009900990099009901)
************* between X and Z 
(134.99922990307212, 0.009900990099009901)
************* between Y and Z 
(171.28520225473287, 0.009900990099009901)

######## conditional indempendence (asymptotic cmi) ########
(0.5842717221646839, 0.7466670860835029)
####### conditional indempendence (permutation cmi) ########
(0.5842717221646839, 0.7128712871287128)


In [149]:
indep_or_cond_indep(*sample_from_model3(n))

############### independence (asymptotic mi) ###############
************* between X and Y 
(0.2915942656678272, 0.5892006941230905)
************* between X and Z 
(136.9658090614056, 0.0)
************* between Y and Z 
(112.10430860605037, 0.0)

############## independence (asymptotic chi2) ##############
************* between X and Y 
(0.2271594524714508, 0.6336380976116874)
************* between X and Z 
(132.36317860886166, 0.0)
************* between Y and Z 
(108.58661923256281, 0.0)

############## independence (permutation mi) ###############
************* between X and Y 
(0.2915942656678272, 0.5742574257425742)
************* between X and Z 
(136.9658090614056, 0.009900990099009901)
************* between Y and Z 
(112.10430860605037, 0.009900990099009901)

######## conditional indempendence (asymptotic cmi) ########
(16.736832200933886, 0.00023208285870579548)
####### conditional indempendence (permutation cmi) ########
(16.736832200933886, 0.009900990099009901)
