In [2]:
from sklearn.metrics import mutual_info_score
#from sklearn.feature_selection import chi2
from scipy.stats import chi2_contingency
from scipy.stats import chi2
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde, pearsonr
import seaborn as sns
import pandas as pd

# Mathematical Underpinnings - Lab 4

Tests to verify hipoteses of independence (from Lab 4):

In [237]:
def indep_test_asymptotic(X, Y, stat):

    if stat == "mi":

        stat_value = 2*len(X)*mutual_info_score(X, Y)

    if stat == "chi2":

        test_res = (chi2_contingency(pd.crosstab(X, Y)))
        stat_value = test_res.statistic

    df = (len(np.unique(X)) - 1)*(len(np.unique(Y)) - 1)

    p_value = 1 - chi2.cdf(stat_value, df=df)

    return stat_value, p_value

def indep_test_permutation(X, Y, B, stat="mi"):

    stat_value = mutual_info_score(X, Y)

    condition_p_value = 0
    for b in range(B):
        X_b = np.random.permutation(X)

        stat_value_b = mutual_info_score(X_b, Y)

        if stat_value <= stat_value_b:
            condition_p_value += 1

    p_value = (1 + condition_p_value)/(1 + B)

    return 2*len(X)*stat_value, p_value

## Task 1

In [33]:
# a function which computes CMI
def generate_data(rho1, rho2, n=1000):
    return np.random.multivariate_normal(np.zeros(3), np.array([[1, rho1, 0], 
                                                                [rho1, 1, rho2],
                                                                [0, rho2, 1]]), n)

In [148]:
def digitize_var(var, bins=10):
    bins = np.linspace(min(var), max(var), bins)
    digitized = np.digitize(var, bins)
    return digitized

In [119]:
def cmi_score(X,Y,Z):
    cond_mut_info = 0
    for z_val in list(set(Z)):
        cond = Z==z_val
        X_filt_z = X[cond]
        Y_cond_z = Y[cond]
        p_z = np.mean([cond])
        cond_mut_info += p_z * mutual_info_score(X_filt_z, Y_cond_z)
    return cond_mut_info

### a)

In [151]:
# CI test based on CMI and asymptotics
def cond_indep_test_asymptotic(X, Y, Z, stat="cmi"):

    if stat == "cmi":

        stat_value = 2*len(X)*cmi_score(X,Y,Z)

    df = (len(set(X)) - 1)*(len(set(Y)) - 1)*len(set(Z))

    p_value = 1 - chi2.cdf(stat_value, df=df)

    return stat_value, p_value

### b)

In [212]:
# CI test based on CMI and permutations
def cond_indep_test_permutation(X, Y, Z, B=100):

    stat_value = cmi_score(X, Y, Z)

    condition_p_value = 0
    for b in range(B):
        X_bs = np.array([])
        Y_bs = np.array([])
        Z_bs = np.array([])
        for z_val in list(set(Z)):
            cond = Z == z_val
            X_b = np.random.permutation(X[cond])
            X_bs = np.append(X_bs, X_b)
            # just to reorder
            Y_b = Y[cond]
            Y_bs = np.append(Y_bs, Y_b)
            Z_b = Z[cond]
            Z_bs = np.append(Z_bs, Z_b)

        stat_value_b = cmi_score(X_bs, Y_bs, Z_bs)

        if stat_value <= stat_value_b:
            condition_p_value += 1

    p_value = (1 + condition_p_value)/(1 + B)

    return 2*len(X)*stat_value, p_value

### c)

conditionaly independent

In [186]:
data = generate_data(0, 0)
X = digitize_var(data[:,0])
Y = digitize_var(data[:,1])
Z = digitize_var(data[:,2])
cond_indep_test_asymptotic(X, Y, Z)

(372.0814133334087, 1.0)

In [214]:
data = generate_data(0, 0)
X = digitize_var(data[:,0])
Y = digitize_var(data[:,1])
Z = digitize_var(data[:,2])
cond_indep_test_permutation(X, Y, Z)

(320.3049320343419, 0.8811881188118812)

conditionaly dependent

In [187]:
data = generate_data(0.7, 0.7)
X = digitize_var(data[:,0])
Y = digitize_var(data[:,1])
Z = digitize_var(data[:,2])
cond_indep_test_asymptotic(X, Y, Z)

(1763.7738824744677, 0.0)

In [213]:
data = generate_data(0.7, 0.7)
X = digitize_var(data[:,0])
Y = digitize_var(data[:,1])
Z = digitize_var(data[:,2])
cond_indep_test_permutation(X, Y, Z)

(1764.902905674606, 0.009900990099009901)

## Task 2

In [189]:
def sample_from_model1(n=1000):
    Z_tilde = np.random.normal(0, 1, n)
    Z = (Z_tilde >=0)*2-1
    X_tilde = np.random.normal(Z, 1, n)
    X = (X_tilde>=0)*2-1
    Y_tilde = np.random.normal(Z, 1, n)
    Y = (Y_tilde>=0)*2-1
    return X, Y, Z

def sample_from_model2(n=1000):
    X_tilde = np.random.normal(0, 1, n)
    X = (X_tilde>=0)*2-1
    Z_tilde = np.random.normal(X, 1, n)
    Z = (Z_tilde >=0)*2-1
    Y_tilde = np.random.normal(Z, 1, n)
    Y = (Y_tilde>=0)*2-1
    return X, Y, Z

def sample_from_model3(n=1000):
    X_tilde = np.random.normal(0, 1, n)
    X = (X_tilde>=0)*2-1
    Y_tilde = np.random.normal(0, 1, n)
    Y = (Y_tilde>=0)*2-1
    Z_tilde = np.random.normal((X+Y)/2, 1, n)
    Z = (Z_tilde >=0)*2-1
    return X, Y, Z

### a)

answer:

In a) and b) X and Y are conditionally independent, in c) X and Y are independent.

### b)

In [190]:
X, Y, Z = sample_from_model1()

In [195]:
mutual_info_score(X, Y)

0.14275425544942086

In [196]:
cmi_score(X, Y, Z)

0.0004607830349713014

In [197]:
X, Y, Z = sample_from_model2()

In [198]:
mutual_info_score(X, Y)

0.10373500368263833

In [199]:
cmi_score(X, Y, Z)

0.0008287240566436041

In [200]:
X, Y, Z = sample_from_model3()

In [201]:
mutual_info_score(X, Y)

4.909911058786909e-05

In [202]:
cmi_score(X, Y, Z)

0.007874223135890474

### c)

In [246]:
X, Y, Z = sample_from_model1()

In [247]:
indep_test_asymptotic(X, Y, stat="mi")

(238.1096195925134, 0.0)

In [248]:
indep_test_permutation(X, Y, B=100)

(238.1096195925134, 0.009900990099009901)

In [249]:
cond_indep_test_asymptotic(X,Y,Z)

(1.6651859525495005, 0.43492008549564154)

In [250]:
cond_indep_test_permutation(X,Y,Z)

(1.6651859525495005, 0.48514851485148514)

In [243]:
X, Y, Z = sample_from_model2()

In [244]:
indep_test_asymptotic(X, Y, stat="mi")

(211.76426199370945, 0.0)

In [245]:
indep_test_permutation(X, Y, B=100)

(211.76426199370945, 0.009900990099009901)

In [227]:
cond_indep_test_asymptotic(X,Y,Z)

(0.16548010445929318, 0.9205904215468532)

In [228]:
cond_indep_test_permutation(X,Y,Z)

(0.16548010445929318, 0.9306930693069307)

In [251]:
X, Y, Z = sample_from_model3()

In [252]:
indep_test_asymptotic(X, Y, stat="mi")

(4.248784867477262, 0.03927842530093828)

In [254]:
indep_test_permutation(X, Y, B=100)

(4.248784867477262, 0.0297029702970297)

In [255]:
cond_indep_test_asymptotic(X,Y,Z)

(35.393173005290016, 2.0628618035978263e-08)

In [256]:
cond_indep_test_permutation(X,Y,Z)

(35.393173005290016, 0.009900990099009901)