In [4]:
from sklearn.metrics import mutual_info_score
#from sklearn.feature_selection import chi2
from scipy.stats import chi2_contingency
from scipy.stats import chi2
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde, pearsonr
import seaborn as sns
import pandas as pd

# Mathematical Underpinnings - Lab 4

Tests to verify hipoteses of independence (from Lab 4):

In [101]:
def indep_test_asymptotic(X, Y, stat):

    if stat == "mi":

        stat_value = 2*len(X)*mutual_info_score(X, Y)

    if stat == "chi2":

        test_res = (chi2_contingency(pd.crosstab(X, Y)))
        stat_value = test_res.statistic

    df = (len(np.unique(X)) - 1)*(len(np.unique(Y)) - 1)

    p_value = 1 - chi2.cdf(stat_value, df=df)

    return stat_value, p_value

def indep_test_permutation(X, Y, B, stat="mi"):

    stat_value = mutual_info_score(X, Y)

    condition_p_value = 0
    for b in range(B):
        X_b = np.random.permutation(X)

        stat_value_b = mutual_info_score(X_b, Y)

        if stat_value <= stat_value_b:
            condition_p_value += 1

    p_value = (1 + condition_p_value)/(1 + B)

    return 2*len(X)*stat_value, p_value

## Task 1

In [41]:
# a function which computes CMI
def compute_cmi(x, y, z):
    z_vals, counts = np.unique(z, return_counts=True)
    probs = counts / np.sum(counts)
    stat_value = 0
    for p, z_ in zip(probs, z_vals):
        index = ( z_ == z)
        score = mutual_info_score(x[index], y[index])
        stat_value += p * score
    return stat_value



### a)

In [None]:
# CI test based on CMI and asymptotics

def asymptotic_conditional_independence_test(x, y, z, stat="cmi"):
    assert stat in ["cmi", "chi2"]
    if stat == "cmi":
        stat_value = compute_cmi(x, y, z)
        stat_value *= 2*x.shape[0]
        df = (len(np.unique(x)) - 1)*(len(np.unique(y)) - 1) * len(np.unique(z))
    if stat == "chi2":
        test_res = (chi2_contingency(pd.crosstab(x, y)))
        stat_value = test_res.statistic

        df = (len(np.unique(x)) - 1)*(len(np.unique(y)) - 1)
    
    p_value = 1 - chi2.cdf(stat_value, df=df) # noqa

    return stat_value, p_value # noqa

In [74]:
data = np.random.multivariate_normal(np.zeros(2), np.array([[1, 0], [0, 1]]), 1000)
disc_data_x = np.digitize(data[:, 0], np.linspace(data[:,0].min(), data[:,0].max(), 10))
disc_data_y = np.digitize(data[:, 1], np.linspace(data[:,1].min(), data[:,1].max(), 10))
z = np.random.randint(0, 2, size=disc_data_x.shape)
print("pearson", asymptotic_conditional_independence_test(disc_data_x, disc_data_y, z, "chi2"))
print("cmi", asymptotic_conditional_independence_test(disc_data_x, disc_data_y, z, "cmi"))

pearson (97.06414913234596, 0.10771490195555178)
cmi (132.06883999117926, 0.9590956896146202)


### b)

In [61]:
# CI test based on CMI and permutations

def in_class_permutations(x, z):
    y = x.copy()
    z_vals = np.unique(z)
    for z_ in z_vals:
        y[z == z_] = np.random.permutation(y[z==z_])
    return y



def conditional_permutations_independence_test(x, y, z, b=100):
    stat_value = compute_cmi(x, y, z)
    ctx = 1
    for b_ in range(b):
        ctx += compute_cmi(in_class_permutations(x, z), y, z) >= stat_value
    return 2* len(x)*stat_value, ctx / (1 + b)

In [67]:
data = np.random.multivariate_normal(np.zeros(2), np.array([[1, 0], [0, 1]]), 1000)
disc_data_x = np.digitize(data[:, 0], np.linspace(data[:,0].min(), data[:,0].max(), 10))
disc_data_y = np.digitize(data[:, 1], np.linspace(data[:,1].min(), data[:,1].max(), 10))
z = np.random.randint(0, 2, size=disc_data_x.shape)
conditional_permutations_independence_test(disc_data_x, disc_data_y, z)

(140.64183215470132, 0.19801980198019803)

### c)

conditionaly independent

In [70]:
data = np.random.multivariate_normal(np.zeros(2), np.array([[1, 0], [0, 1]]), 1000)
disc_data_x = np.digitize(data[:, 0], np.linspace(data[:,0].min(), data[:,0].max(), 10))
disc_data_y = np.digitize(data[:, 1], np.linspace(data[:,1].min(), data[:,1].max(), 10))
z = np.random.randint(0, 2, size=disc_data_x.shape)
print("perm", conditional_permutations_independence_test(disc_data_x, disc_data_y, z))
print("asymp", asymptotic_conditional_independence_test(disc_data_x, disc_data_y, z, "cmi"))

perm (116.06752250114444, 0.8712871287128713)
asymp (116.06752250114444, 0.9974835371385276)


conditionaly dependent

In [69]:
data = np.random.multivariate_normal(np.zeros(2), np.array([[1, 0], [0, 1]]), 1000)
data[500:, :] = np.random.multivariate_normal(np.zeros(2), np.array([[1,1], [1,1]]), 500)
disc_data_x = np.digitize(data[:, 0], np.linspace(data[:,0].min(), data[:,0].max(), 10))
disc_data_y = np.digitize(data[:, 1], np.linspace(data[:,1].min(), data[:,1].max(), 10))
z = np.zeros(1000)
z[500:] = np.ones(500)
print("perm", conditional_permutations_independence_test(disc_data_x, disc_data_y, z))
print("asymp", asymptotic_conditional_independence_test(disc_data_x, disc_data_y, z, "cmi"))

perm (1347.7498732479448, 0.009900990099009901)
asymp (1347.7498732479448, 0.0)


## Task 2

In [80]:
def sample_from_model1():
    z_tilde = np.random.normal(0, 1, 1000)
    z = np.where(z_tilde, -1, 1)
    x_tilde = np.zeros(1000, float)
    x_tilde[z==-1] = np.random.normal(-.5, 1, sum(z==-1))
    x_tilde[z==1] = np.random.normal(.5, 1, sum(z==1))
    y_tilde = np.zeros(1000, float)
    y_tilde[z==-1] = np.random.normal(-.5, 1, sum(z==-1))
    y_tilde[z==1] = np.random.normal(.5, 1, sum(z==1))
    x = np.where(x_tilde < 0, -1, 1)
    y = np.where(y_tilde < 0, -1, 1)
    return x, y, z


def sample_from_model2():
    x_tilde = np.random.normal(0, 1, 1000)
    x = np.where(x_tilde < 0, -1, 1)
    z_tilde = np.zeros(1000, float)
    z_tilde[x == -1] = np.random.normal(-.5, 1, sum(x==-1))
    z_tilde[ x == 1] = np.random.normal(.5, 1, sum(x==1))
    z = np.where(z_tilde < 0, -1, 1)
    y_tilde = np.zeros(1000, float)
    y_tilde[z == -1] = np.random.normal(-.5, 1, sum(z==-1))
    y_tilde[ z == 1] = np.random.normal(.5, 1, sum(z==1))
    y = np.where(y_tilde < 0, -1, 1)
    return x, y, z


def sample_from_model3():
    x_tilde = np.random.normal(0, 1, 1000)
    y_tilde = np.random.normal(0, 1, 1000)
    x = np.where(x_tilde < 0, -1, 1)
    y = np.where(y_tilde < 0, -1, 1)
    z_tilde = np.zeros(1000, float)
    z_tilde[x+y == -2] = np.random.normal(-1, 1, sum(x+y==-2))
    z_tilde[x+y == 0] = np.random.normal(0, 1, sum(x+y==0))
    z_tilde[x+y == 2] = np.random.normal(1, 1, sum(x+y==2))
    z = np.where(z_tilde < 0, -1, 1)
    return x, y, z

### a)


In [99]:
x, y, z = sample_from_model1()
print("perm1", conditional_permutations_independence_test(x, y, z, b=100))
print("asymp1", asymptotic_conditional_independence_test(x, y, z, "cmi"))

x, y, z = sample_from_model2()
print("perm2", conditional_permutations_independence_test(x, y, z, b=100))
print("asymp2", asymptotic_conditional_independence_test(x, y, z, "cmi"))

x, y, z = sample_from_model3()
print("perm3", conditional_permutations_independence_test(x, y, z, b=100))
print("asymp3", asymptotic_conditional_independence_test(x, y, z, "cmi"))

perm1 (0.26093554671091646, 0.7029702970297029)
asymp1 (0.26093554671091646, 0.6094781490819117)
perm2 (1.207727952417956, 0.5742574257425742)
asymp2 (1.207727952417956, 0.5466951326817019)
perm3 (14.839995871596656, 0.009900990099009901)
asymp3 (14.839995871596656, 0.0005991503822802002)


In [102]:
x, y, z = sample_from_model1()
print("perm1", indep_test_permutation(x, y, 100))
print("asymp1", indep_test_asymptotic(x, y, "cmi"))

x, y, z = sample_from_model2()
print("perm1", indep_test_permutation(x, y, 100))
print("asymp1", indep_test_asymptotic(x, y, "cmi"))

x, y, z = sample_from_model3()
print("perm1", indep_test_permutation(x, y, 100))
print("asymp1", indep_test_asymptotic(x, y, "cmi"))


perm1 (0.1869848076032632, 0.7227722772277227)


UnboundLocalError: cannot access local variable 'stat_value' where it is not associated with a value

answer:

### b)

### c)