In [1]:
from sklearn.metrics import mutual_info_score
#from sklearn.feature_selection import chi2
from scipy.stats import chi2_contingency
from scipy.stats import chi2
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde, pearsonr
import seaborn as sns
import pandas as pd

# Mathematical Underpinnings - Lab 4

Tests to verify hipoteses of independence (from Lab 4):

In [125]:
def indep_test_asymptotic(X, Y, stat="mi"):

    if stat == "mi":

        stat_value = 2*len(X)*mutual_info_score(X, Y)

    if stat == "chi2":

        test_res = (chi2_contingency(pd.crosstab(X, Y)))
        stat_value = test_res.statistic

    df = (len(np.unique(X)) - 1)*(len(np.unique(Y)) - 1)

    p_value = 1 - chi2.cdf(stat_value, df=df)

    return stat_value, p_value

def indep_test_permutation(X, Y, B, stat="mi"):

    stat_value = mutual_info_score(X, Y)

    condition_p_value = 0
    for b in range(B):
        X_b = np.random.permutation(X)

        stat_value_b = mutual_info_score(X_b, Y)

        if stat_value <= stat_value_b:
            condition_p_value += 1

    p_value = (1 + condition_p_value)/(1 + B)

    return 2*len(X)*stat_value, p_value

## Task 1

In [3]:
def CMI(X, Y, Z):
  cmi = 0
  for z in np.unique(Z):
    cmi += mutual_info_score(X[Z == z], Y[Z == z]) * (len(Z[Z == z]) / len(Z))
  return cmi

### a)

In [4]:
def cond_indep_test_asymptotic(X, Y, Z):
    stat_value = 2*len(X)*CMI(X, Y, Z)
    df = (len(np.unique(X)) - 1)*(len(np.unique(Y)) - 1)*len(np.unique(Z))
    p_value = 1 - chi2.cdf(stat_value, df=df)
    return stat_value, p_value

### b)

In [5]:
def perm_X_on_Z(X, Z):
  for z in np.unique(Z):
    X[Z == z] = np.random.permutation(X[Z == z])
  return X

In [10]:
def cond_test_permutation(X, Y, Z, B=100):
    stat_value = CMI(X, Y, Z)
    condition_p_value = 0
    for b in range(B):
        X_b = perm_X_on_Z(X, Z)
        stat_value_b = CMI(X_b, Y, Z)
        if stat_value <= stat_value_b:
            condition_p_value += 1
    p_value = (1 + condition_p_value)/(1 + B)
    return 2*len(X)*stat_value, p_value

### c)

In [94]:
def generate_data(n, p):
  X, Y = np.random.multivariate_normal(np.array([0, 0]), np.array([[1, p], [p, 1]]), size=n).T
  Z = np.ones(n)
  for z in range(2, 5):
    X_new, Y_new = np.random.multivariate_normal(np.array([0, 0]), np.array([[1, p], [p, 1]]), size=n).T
    Z_new = np.ones(n) * z

    X = np.concatenate((X, X_new), axis=0)
    Y = np.concatenate((Y, Y_new), axis=0)
    Z = np.concatenate((Z, Z_new), axis=0)
  return pd.cut(X, bins=10, labels=False), pd.cut(Y, bins=10, labels=False), Z

conditionaly independent

In [95]:
np.random.seed(222)
n = 1000
X_indep = pd.cut(np.random.normal(0, 1, n), bins=10, labels=False)
Y_indep = pd.cut(np.random.normal(0, 1, n), bins=10, labels=False)
Z_indep = np.random.randint(0, 3, n)

cta_stat, cta_p = cond_indep_test_asymptotic(X_indep, Y_indep, Z_indep)
ctp_stat, ctp_p = cond_test_permutation(X_indep, Y_indep, Z_indep)

print(f"Conditional independance test stat value: {cta_stat}, p value {cta_p}")
print(f"Conditional permutation test stat value: {ctp_stat}, p value {ctp_p}")

Conditional independance test stat value: 178.57416869163853, p value 0.9993102413070204
Conditional permutation test stat value: 178.57416869163853, p value 0.8811881188118812


conditionaly dependent

In [97]:
X_dep, Y_dep, Z_dep = generate_data(n=1000, p=0.5)
cta_stat_dep, cta_p_dep = cond_indep_test_asymptotic(X_dep, Y_dep, Z_dep)
ctp_stat_dep, ctp_p_dep = cond_test_permutation(X_dep, Y_dep, Z_dep)

print(f"Conditional independance test stat value: {cta_stat_dep}, p value {cta_p_dep}")
print(f"Conditional permutation test stat value: {ctp_stat_dep}, p value {ctp_p_dep}")

Conditional independance test stat value: 1282.2806875050526, p value 0.0
Conditional permutation test stat value: 1282.2806875050526, p value 0.009900990099009901


## Task 2

In [139]:
def sample_from_model1(n=1000):
    Z = np.random.normal(0, 1, n)
    Z[Z<0] = -1
    Z[Z>=0] = 1

    X = np.random.normal(Z/2, 1, n)
    X[X<0] = -1
    X[X>=0] = 1

    Y = np.random.normal(Z/2, 1, n)
    Y[Y<0] = -1
    Y[Y>=0] = 1

    return X, Y, Z

def sample_from_model2(n=1000):
    X = np.random.normal(0, 1, n)
    X[X<0] = -1
    X[X>=0] = 1

    Z = np.random.normal(X/2, 1, n)
    Z[Z<0] = -1
    Z[Z>=0] = 1

    Y = np.random.normal(Z/2, 1, n)
    Y[Y<0] = -1
    Y[Y>=0] = 1

    return X, Y, Z

def sample_from_model3(n=1000):
    X = np.random.normal(0, 1, n)
    Y = np.random.normal(0, 1, n)

    Y[Y<0] = -1
    Y[Y>=0] = 1

    X[X<0] = -1
    X[X>=0] = 1

    Z = np.random.normal((X+Y)/2, 1, n)
    Z[Z<0] = -1
    Z[Z>=0] = 1


    return X, Y, Z

### a)

Model 1: X and Y dependent, but conditionally independent;


Model 2: X and Y dependent, but conditionally independent;


Model 3: X and Y independent, but conditionally dependent.

### b)

In [134]:
X1, Y1, Z1 = sample_from_model1()
X2, Y2, Z2 = sample_from_model2()
X3, Y3, Z3 = sample_from_model3()

print(f"Mutual information for Model 1 {mutual_info_score(X1, Y1)}")
print(f"Mutual information for Model 2 {mutual_info_score(X2, Y2)}")
print(f"Mutual information for Model 3 {mutual_info_score(X3, Y3)}")

print(f"Conditional mutual information for Model 1 {CMI(X1, Y1, Z1)}")
print(f"Conditional mutual information for Model 2 {CMI(X2, Y2, Z2)}")
print(f"Conditional mutual information for Model 3 {CMI(X3, Y3, Z3)}")

Mutual information for Model 1 0.008151594349316282
Mutual information for Model 2 0.013854544164044413
Mutual information for Model 3 0.00019849850284847648
Conditional mutual information for Model 1 0.0007804655484022727
Conditional mutual information for Model 2 0.00022244382926404687
Conditional mutual information for Model 3 0.006677010310498424


### c)

In [138]:
# Model 1
cta_stat_1, cta_p_1 = cond_indep_test_asymptotic(X1, Y1, Z1)
ctp_stat_1, ctp_p_1 = indep_test_asymptotic(X1, Y1)

print(f"Conditional independance test for Model 1 stat value: {cta_stat_1}, p value {cta_p_1}")
print(f"Independence test for Model 1 stat value: {ctp_stat_1}, p value {ctp_p_1}")

# Model 2
cta_stat_2, cta_p_2 = cond_indep_test_asymptotic(X2, Y2, Z2)
ctp_stat_2, ctp_p_2 = indep_test_asymptotic(X2, Y2)

print(f"Conditional independance test for Model 2 stat value: {cta_stat_2}, p value {cta_p_2}")
print(f"Independence test for Model 2 stat value: {ctp_stat_2}, p value {ctp_p_2}")

# Model 1
cta_stat_3, cta_p_3 = cond_indep_test_asymptotic(X3, Y3, Z3)
ctp_stat_3, ctp_p_3 = indep_test_asymptotic(X3, Y3)

print(f"Conditional independance test for Model 3 stat value: {cta_stat_3}, p value {cta_p_3}")
print(f"Independence test for Model 3 stat value: {ctp_stat_3}, p value {ctp_p_3}")

Conditional independance test for Model 1 stat value: 1.5609310968045456, p value 0.45819265078774585
Independence test for Model 1 stat value: 16.303188698632564, p value 5.397305182530676e-05
Conditional independance test for Model 2 stat value: 0.44488765852809375, p value 0.8005599735301961
Independence test for Model 2 stat value: 27.709088328088825, p value 1.4099873313355715e-07
Conditional independance test for Model 3 stat value: 13.354020620996849, p value 0.0012595379766049408
Independence test for Model 3 stat value: 0.39699700569695295, p value 0.5286442174787276
