In [1]:
from sklearn.metrics import mutual_info_score
#from sklearn.feature_selection import chi2
from scipy.stats import chi2_contingency
from scipy.stats import chi2
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde, pearsonr
import seaborn as sns
import pandas as pd

# Mathematical Underpinnings - Lab 5

Tests to verify hipoteses of independence (from Lab 4):

In [2]:
def indep_test_asymptotic(X, Y, stat):
    if stat == "mi":

        stat_value = 2*len(X)*mutual_info_score(X, Y)

    if stat == "chi2":

        test_res = (chi2_contingency(pd.crosstab(X, Y)))
        stat_value = test_res.statistic

    df = (len(np.unique(X)) - 1)*(len(np.unique(Y)) - 1)

    p_value = 1 - chi2.cdf(stat_value, df=df)

    return stat_value, p_value

def indep_test_permutation(X, Y, B, stat="mi"):

    stat_value = mutual_info_score(X, Y)

    condition_p_value = 0
    for b in range(B):
        X_b = np.random.permutation(X)

        stat_value_b = mutual_info_score(X_b, Y)

        if stat_value <= stat_value_b:
            condition_p_value += 1

    p_value = (1 + condition_p_value)/(1 + B)

    return 2*len(X)*stat_value, p_value

## Task 1

### a)

In [3]:
# a function which computes CMI
def compute_cmi(X, Y, Z):
    CMI = 0
    for unique_z in np.unique(Z):
        count = np.sum(Z == unique_z)
        total = len(Z)
        p = count / total
        CMI +=  p * mutual_info_score(X[Z==unique_z], Y[Z==unique_z])
    return CMI

In [4]:
# CI test based on CMI and asymptotics

### b)

In [5]:
# CI test based on CMI and permutations

def conditional_permutations(X, Z):
    X_permuted = np.empty_like(X)
    for z_value in np.unique(Z):
        mask = Z == z_value
        X_permuted[mask] = np.random.permutation(X[mask])
    return X_permuted

In [9]:
def cond_indep_test_asymptotic(X, Y, Z, stat="mi"):

    if stat == "mi":
        CMI = compute_cmi(X, Y, Z)
        stat_value = 2*len(X)*CMI

    if stat == "chi2":

        test_res = (chi2_contingency(pd.crosstab(X, Y)))
        stat_value = test_res.statistic

    df = (len(np.unique(X)) - 1)*(len(np.unique(Y)) - 1) * len(np.unique(Z))

    p_value = 1 - chi2.cdf(stat_value, df=df)

    return stat_value, p_value

def cond_indep_test_permutation(X, Y, Z, B, stat="mi"):

    stat_value = compute_cmi(X, Y, Z)

    condition_p_value = 0
    for b in range(B):
        X_b = conditional_permutations(X, Z)

        stat_value_b = compute_cmi(X_b, Y, Z)

        if stat_value <= stat_value_b:
            condition_p_value += 1

    p_value = (1 + condition_p_value)/(1 + B)

    return 2*len(X)*stat_value, p_value

### c)

In [6]:
# for the permutations we permutate X for each separate value of Z

conditionaly independent

In [13]:
# Conditionally independent
n = 10000
ro = 0.
data = np.random.multivariate_normal(np.zeros(2), np.array([[1, ro], [ro, 1]]), n)
x = data[:, 0]
y = data[:, 1]
z = np.random.randint(0,2,n)
cond_indep_test_asymptotic(data[:,0], data[:,1], z)



(170344.15542974195, 1.0)

In [14]:
cond_indep_test_permutation(x, y, z, B=100)



(170344.15542974195, 1.0)

conditionaly dependent

In [20]:
n = 500
ro = 0.8
data1 = np.random.multivariate_normal(np.zeros(2), np.array([[1, ro], [ro,  1]]), n)
x1, y1 = data1[:, 0], data1[:, 1]
z1 = np.ones(n)
data2 = np.random.multivariate_normal(np.zeros(2), np.array([[1, -ro], [-ro, 1]]), n)
z2 = np.ones(n) * -1
x2, y2 = data2[:, 0], data2[:, 1]
x = pd.cut(np.concatenate([x1, x2]), bins=10, labels=False)
y = pd.cut(np.concatenate([y1, y2]), bins=10, labels=False)
z = np.concatenate([z1, z2])

In [21]:
cond_indep_test_asymptotic(x, y, z)

(950.3082893980958, 0.0)

In [22]:
cond_indep_test_permutation(x, y, z, B=100)

(950.3082893980958, 0.009900990099009901)

## Task 2

In [95]:
n = 1_000
def sample_from_model1():
    z_tilda = np.random.normal(0, 1, size=n)
    z = np.where(z_tilda > 0, 1, -1)
    x_tilda = np.random.normal(z/2, 1)
    y_tilda = np.random.normal(z/2, 1)
    x = np.where(x_tilda > 0, 1, -1)
    y = np.where(y_tilda > 0, 1, -1)
    return x,y,z

def sample_from_model2():
    x_tilda = np.random.normal(0, 1, size=n)
    x = np.where(x_tilda > 0, 1, -1)
    
    z_tilda = np.random.normal(x/2, 1)
    z = np.where(z_tilda > 0, 1, -1)
    
    y_tilda = np.random.normal(z/2, 1)
    y = np.where(y_tilda > 0, 1, -1)
    return x,y,z

def sample_from_model3():
    x_tilda = np.random.normal(0, 1, size=n)
    x = np.where(x_tilda > 0, 1, -1)
    
    y_tilda = np.random.normal(0, 1, size=n)
    y = np.where(y_tilda > 0, 1, -1)
    
    z_tilda = np.random.normal((x+y)/2, 1)
    z = np.where(z_tilda > 0, 1, -1)
    

    return x,y,z

### a)

In [97]:
x,y,z = sample_from_model1()
print("Model 1")
print("CMI")
print(compute_cmi(x,y,z))
print("indep_test_asymptotic")
print(indep_test_asymptotic(x,y,stat="mi"))
print("cond_indep_test_asymptotic")
print(cond_indep_test_asymptotic(x,y,z,stat="mi"))
print("cond_indep_test_permutation")
print(cond_indep_test_permutation(x,y,z, 100))

Model 1
CMI
0.0005815666884880494
indep_test_asymptotic
(29.728861107963468, 4.96895111723461e-08)
cond_indep_test_asymptotic
(1.1631333769760988, 0.5590218670201816)
cond_indep_test_permutation
(1.1631333769760988, 0.6534653465346535)


answer:Dependent, Conditionally Independent

### b)

In [98]:
x,y,z = sample_from_model2()
print("Model 2")
print("CMI")
print(compute_cmi(x,y,z))
print("indep_test_asymptotic")
print(indep_test_asymptotic(x,y,stat="mi"))
print("cond_indep_test_asymptotic")
print(cond_indep_test_asymptotic(x,y,z,stat="mi"))
print("cond_indep_test_permutation")
print(cond_indep_test_permutation(x,y,z, 100))

Model 2
CMI
5.4755893402921534e-05
indep_test_asymptotic
(24.964452485768284, 5.839712771127381e-07)
cond_indep_test_asymptotic
(0.10951178680584307, 0.9467162194240238)
cond_indep_test_permutation
(0.10951178680584307, 0.9306930693069307)


answer:Dependent, Conditionally Independent

### c)

In [100]:
x,y,z = sample_from_model3()
print("Model 3")
print("CMI")
print(compute_cmi(x,y,z))
print("indep_test_asymptotic")
print(indep_test_asymptotic(x,y,stat="mi"))
print("cond_indep_test_asymptotic")
print(cond_indep_test_asymptotic(x,y,z,stat="mi"))
print("cond_indep_test_permutation")
print(cond_indep_test_permutation(x,y,z, 100))

Model 3
CMI
0.009477834884453541
indep_test_asymptotic
(0.2646061050316506, 0.6069732119248707)
cond_indep_test_asymptotic
(18.955669768907082, 7.65294530182814e-05)
cond_indep_test_permutation
(18.955669768907082, 0.009900990099009901)


answer: Independent; Conditionally Dependent