In [6]:
from sklearn.metrics import mutual_info_score
#from sklearn.feature_selection import chi2
from scipy.stats import chi2_contingency
from scipy.stats import chi2
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde, pearsonr
import seaborn as sns
import pandas as pd
import copy

# Mathematical Underpinnings - Lab 4

Tests to verify hipoteses of independence (from Lab 4):

In [69]:
def indep_test_asymptotic(X, Y, stat='mi'):

    if stat == "mi":

        stat_value = 2*len(X)*mutual_info_score(X, Y)

    if stat == "chi2":

        test_res = (chi2_contingency(pd.crosstab(X, Y)))
        stat_value = test_res.statistic

    df = (len(np.unique(X)) - 1)*(len(np.unique(Y)) - 1)

    p_value = 1 - chi2.cdf(stat_value, df=df)

    return stat_value, p_value

def indep_test_permutation(X, Y, B, stat="mi"):

    stat_value = mutual_info_score(X, Y)

    condition_p_value = 0
    for b in range(B):
        X_b = np.random.permutation(X)

        stat_value_b = mutual_info_score(X_b, Y)

        if stat_value <= stat_value_b:
            condition_p_value += 1

    p_value = (1 + condition_p_value)/(1 + B)

    return 2*len(X)*stat_value, p_value

## Task 1

In [8]:
def calc_CMI(X, Y, Z):
    cmi_value = 0
    len_Z = len(Z)
    for z in np.unique(Z):
        cmi_value += mutual_info_score(X[Z==z], Y[Z==z]) * (len(Z[Z==z]) / len_Z)
    return cmi_value

### a)

In [12]:
def conditional_indepedence_asymptotic_test(X, Y, Z):
    stat_value = 2*len(X)*calc_CMI(X, Y, Z)
    df = (len(np.unique(X)) - 1) * (len(np.unique(Y)) - 1) * len(np.unique(Z))
    p_value = 1 - chi2.cdf(stat_value, df=df)
    return stat_value, p_value

### b)

In [13]:
# CI test based on CMI and permutations
def conditional_permutation(X, Z):
    X = copy.deepcopy(X)
    for z in np.unique(Z):
        X_z = X[Z==z]
        X_z = np.random.permutation(X_z)
        X[Z==z] = X_z
    return X

def conditional_independence_permutation_test(X, Y, Z, B):
    stat_value = calc_CMI(X, Y, Z)
    condition_p_value = 0
    for b in range(B):
        X_b = conditional_permutation(X, Z)
        stat_value_b = calc_CMI(X_b, Y, Z)
        if stat_value <=stat_value_b:
            condition_p_value += 1
    p_value = (1 + condition_p_value)/(1+B)
    return 2*len(X)*stat_value, p_value

### c)

conditionaly independent

In [18]:
n = 1000
X = pd.cut(np.random.normal(0, 1, n), bins=10, labels=False)
Y = pd.cut(np.random.normal(0, 2, n), bins=10, labels=False)
Z = np.random.randint(0,3,n)

print('Asymptotic test of conditional independence with mutual information')
stat_value, p_value = conditional_indepedence_asymptotic_test(X, Y, Z)
print(f'Stat value: {stat_value}, p-value: {p_value}')
print('Permutation test of conditional independence')
stat_value, p_value = conditional_independence_permutation_test(X, Y, Z, 50)
print(f'Stat value: {stat_value}, p-value: {p_value}')

Asymptotic test of conditional independence with mutual information
Stat value: 177.54170567047782, p-value: 0.9994363580066558
Permutation test of conditional independence
Stat value: 177.54170567047782, p-value: 0.7254901960784313


conditionaly dependent

In [50]:
n = 500
X1, Y1 = np.random.multivariate_normal(np.zeros(2), np.array([[1, 0.5], [0.5, 1]]), n).T
Z1 = np.ones(n)
X2, Y2 = np.random.multivariate_normal(np.zeros(2), np.array([[1, -0.5], [-0.5, 1]]), n).T
Z2 = np.ones(n) * 2
X = pd.cut(np.concatenate([X1, X2]), bins=10, labels=False)
Y = pd.cut(np.concatenate([Y1, Y2]), bins=10, labels=False)
Z = np.concatenate([Z1, Z2])

print('Asymptotic test of conditional independence with mutual information')
stat_value, p_value = conditional_indepedence_asymptotic_test(X, Y, Z)
print(f'Stat value: {stat_value}, p-value: {p_value}')
print('Permutation test of conditional independence')
stat_value, p_value = conditional_independence_permutation_test(X, Y, Z, 50)
print(f'Stat value: {stat_value}, p-value: {p_value}')

Asymptotic test of conditional independence with mutual information
Stat value: 358.1106144190442, p-value: 1.1102230246251565e-16
Permutation test of conditional independence
Stat value: 358.1106144190442, p-value: 0.0196078431372549


## Task 2

In [54]:
def discretize(Q):
    Q[Q < 0] = -1
    Q[Q >= 0] = 1
    return Q

def sample_from_model1(n=1000):
    Z = discretize(np.random.normal(0, 1, n))
    X = discretize(np.random.normal(Z/2, 1))
    Y = discretize(np.random.normal(Z/2, 1))
    return X, Y, Z

def sample_from_model2(n=1000):
    X = discretize(np.random.normal(0, 1, n))
    Z = discretize(np.random.normal(X/2, 1))
    Y = discretize(np.random.normal(Z/2, 1))
    return X, Y, Z

def sample_from_model3(n=1000):
    X = discretize(np.random.normal(0, 1, n))
    Y = discretize(np.random.normal(0, 1, n))
    Z = discretize(np.random.normal((X+Y)/2, 1))
    return X, Y, Z

### a)

Model 1: X and Y dependent, but conditionally independent;


Model 2: X and Y dependent, but conditionally independent;


Model 3: X and Y independent, but conditionally dependent.

### b)

In [60]:
#Model1
X, Y, Z = sample_from_model1()
print(f'Mutual information: {mutual_info_score(X, Y)}')
print(f'Conditional mutual information: {calc_CMI(X, Y, Z)}')

Mutual information: 0.011891689444400555
Conditional mutual information: 0.00010055999396948617


In [64]:
#Model2
X, Y, Z = sample_from_model2()
print(f'Mutual information: {mutual_info_score(X, Y)}')
print(f'Conditional mutual information: {calc_CMI(X, Y, Z)}')

Mutual information: 0.01317202982810034
Conditional mutual information: 0.00011355964710381205


In [62]:
#Model3
X, Y, Z = sample_from_model3()
print(f'Mutual information: {mutual_info_score(X, Y)}')
print(f'Conditional mutual information: {calc_CMI(X, Y, Z)}')

Mutual information: 8.10397914918215e-05
Conditional mutual information: 0.010781476819218191


### c)

In [73]:
X1, Y1, Z1 = sample_from_model1()
X2, Y2, Z2 = sample_from_model2()
X3, Y3, Z3 = sample_from_model3()
# Model 1
cta_stat_1, cta_p_1 = conditional_indepedence_asymptotic_test(X1, Y1, Z1)
ctp_stat_1, ctp_p_1 = indep_test_asymptotic(X1, Y1)

print(f"Conditional independance test for Model 1 stat value: {cta_stat_1}, p value {cta_p_1}")
print(f"Independence test for Model 1 stat value: {ctp_stat_1}, p value {ctp_p_1}")

# Model 2
cta_stat_2, cta_p_2 = conditional_indepedence_asymptotic_test(X2, Y2, Z2)
ctp_stat_2, ctp_p_2 = indep_test_asymptotic(X2, Y2)

print(f"Conditional independance test for Model 2 stat value: {cta_stat_2}, p value {cta_p_2}")
print(f"Independence test for Model 2 stat value: {ctp_stat_2}, p value {ctp_p_2}")

# Model 1
cta_stat_3, cta_p_3 = conditional_indepedence_asymptotic_test(X3, Y3, Z3)
ctp_stat_3, ctp_p_3 = indep_test_asymptotic(X3, Y3)

print(f"Conditional independance test for Model 3 stat value: {cta_stat_3}, p value {cta_p_3}")
print(f"Independence test for Model 3 stat value: {ctp_stat_3}, p value {ctp_p_3}")

Conditional independance test for Model 1 stat value: 1.171134703676019, p value 0.5567898764234305
Independence test for Model 1 stat value: 23.385019517736083, p value 1.3260721574148704e-06
Conditional independance test for Model 2 stat value: 6.593957004016993, p value 0.036994778348677704
Independence test for Model 2 stat value: 18.606580325979948, p value 1.6066501696498925e-05
Conditional independance test for Model 3 stat value: 12.51411399374058, p value 0.0019168788840249196
Independence test for Model 3 stat value: 2.1287546009043368, p value 0.1445581656205558
