In [2]:
from sklearn.metrics import mutual_info_score
#from sklearn.feature_selection import chi2
from scipy.stats import chi2_contingency
from scipy.stats import chi2
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde, pearsonr
import seaborn as sns
import pandas as pd

# Mathematical Underpinnings - Lab 4

Tests to verify hipoteses of independence (from Lab 4):

In [3]:
def indep_test_asymptotic(X, Y, stat):

    if stat == "mi":

        stat_value = 2*len(X)*mutual_info_score(X, Y)

    if stat == "chi2":

        test_res = (chi2_contingency(pd.crosstab(X, Y)))
        stat_value = test_res.statistic

    df = (len(np.unique(X)) - 1)*(len(np.unique(Y)) - 1)

    p_value = 1 - chi2.cdf(stat_value, df=df)

    return stat_value, p_value

def indep_test_permutation(X, Y, B, stat="mi"):

    stat_value = mutual_info_score(X, Y)

    condition_p_value = 0
    for b in range(B):
        X_b = np.random.permutation(X)

        stat_value_b = mutual_info_score(X_b, Y)

        if stat_value <= stat_value_b:
            condition_p_value += 1

    p_value = (1 + condition_p_value)/(1 + B)

    return 2*len(X)*stat_value, p_value

In [4]:
def gen_data(n=1000, rho = 0.5):
    data_2d = np.random.multivariate_normal(np.zeros(2), np.array([[1, rho], [rho, 1]]), n)

    x_bins = np.linspace(data_2d[:,0].min(), data_2d[:,0].max(), 10)
    y_bins = np.linspace(data_2d[:,1].min(), data_2d[:,1].max(), 10)

    x_discrete = np.digitize(data_2d[:,0], bins=x_bins)
    y_discrete = np.digitize(data_2d[:,1], bins=y_bins)
    return x_discrete, y_discrete

In [5]:
np.random.binomial(size=10, n=5, p=0.5)

array([3, 2, 3, 4, 1, 3, 2, 4, 4, 3])

## Task 1

In [6]:
# a function which computes CMI

In [7]:
X, Y = gen_data(rho = 0.1)
Z = np.random.binomial(size=len(X), n=4, p=0.5)

In [8]:
def cmi(X, Y, Z):
    cmi_value = 0
    for uniq_Z in np.unique(Z):
        p = np.sum(Z == uniq_Z)/len(Z)
        cmi_value += p * mutual_info_score(X[Z == uniq_Z],Y[Z == uniq_Z])
    return cmi_value

In [9]:
cmi(X,Y,Z)

0.13038258884571582

### a)

In [10]:
# CI test based on CMI and asymptotics

In [11]:
def cond_indep_test_asymptotic(X, Y, Z):

    stat_value = 2*len(X)*cmi(X, Y, Z)

    df = (len(np.unique(X)) - 1)*(len(np.unique(Y)) - 1)*len(np.unique(Z))

    p_value = chi2.sf(stat_value, df=df)

    return stat_value, p_value

In [12]:
X, Y = gen_data(rho = 0.2)
Z = np.random.binomial(size=len(X), n=4, p=0.5)
cond_indep_test_asymptotic(X, Y, Z)

(246.83711055598982, 0.9999999999557527)

### b)

In [13]:
# CI test based on CMI and permutations

In [14]:
def cond_indep_test_permutation(X, Y, Z, B = 500, stat="mi"):

    sorting_strat = np.argsort(Z)
    Z_sorted = np.sort(Z)
    X_sorted = X[sorting_strat]
    Y_sorted = Y[sorting_strat]
    stat_value = cmi(X, Y, Z)

    condition_p_value = 0
    for b in range(B):
        X_b = np.concatenate([np.random.permutation(X_sorted[Z_sorted == z_val]) for z_val in np.unique(Z)])

        stat_value_b = cmi(X_b, Y_sorted, Z_sorted)

        if stat_value <= stat_value_b:
            condition_p_value += 1

    p_value = (1 + condition_p_value)/(1 + B)

    return 2*len(X)*stat_value, p_value

In [15]:
X, Y = gen_data(rho = 0.5)
Z = np.random.binomial(size=len(X), n=4, p=0.5)
cond_indep_test_permutation(X, Y, Z)


(435.93432906878576, 0.001996007984031936)

### c)

conditionaly independent

In [16]:
X, Y = gen_data(rho = 0)
Z = np.random.binomial(size=len(X), n=4, p=0.5)
s,pv = cond_indep_test_asymptotic(X, Y, Z)
print(f"Asymptotic - Statistic: {s}, p-value: {pv}")
s,pv = cond_indep_test_permutation(X, Y, Z)
print(f"Permutation - Statistic: {s}, p-value: {pv}")

Asymptotic - Statistic: 248.25148058457708, p-value: 0.9999999999300225
Permutation - Statistic: 248.25148058457708, p-value: 0.6906187624750499


conditionaly dependent

In [19]:
X, Y = gen_data(rho = 0.5)
Z = np.random.binomial(size=len(X), n=4, p=0.5)
s,pv = cond_indep_test_asymptotic(X, Y, Z)
print(f"Asymptotic - Statistic: {s}, p-value: {pv}")
s,pv = cond_indep_test_permutation(X, Y, Z)
print(f"Permutation - Statistic: {s}, p-value: {pv}")

Asymptotic - Statistic: 499.10977527613585, p-value: 0.0009562191238579951
Permutation - Statistic: 499.10977527613585, p-value: 0.001996007984031936


## Task 2

In [20]:
def binary_discrete(x):
    return (x > 0)*2-1

In [21]:
def sample_from_model1(n=1000):
    Z = np.random.normal(0, 1, n)
    Z = binary_discrete(Z)
    X = binary_discrete(np.random.normal(Z/2, 1, n))
    Y = binary_discrete(np.random.normal(Z/2, 1, n))
    return X,Y,Z

def sample_from_model2(n=1000):
    X = np.random.normal(0, 1, n)
    X = binary_discrete(X)
    Z = binary_discrete(np.random.normal(X/2, 1, n))
    Y = binary_discrete(np.random.normal(Z/2, 1, n))
    return X,Y,Z

def sample_from_model3(n=1000):
    X = binary_discrete(np.random.normal(0, 1, n))
    Y = binary_discrete(np.random.normal(0, 1, n))
    Z = binary_discrete(np.random.normal((X+Y)/2, 1, n))
    return X,Y,Z

### a)

Unconditionally dependent, conditionally independent - model 1

UD, CI - model 2

UI, CD - model 3

### b) c)

In [22]:
def test_dependence(pval, th=0.05):
    return 'Dependent' if pval < th else 'Independent*'

In [57]:
X,Y,Z = sample_from_model1()
print(f"Model 1\nMI:{mutual_info_score(X, Y)}\nCMI:{cmi(X,Y,Z)}")
print(f"Unconditionally: {test_dependence(indep_test_asymptotic(X,Y,stat='mi')[1])}")
print(f"Conditionally: {test_dependence(cond_indep_test_asymptotic(X,Y,Z)[1])}")

X,Y,Z = sample_from_model2()
print(f"Model 2\nMI:{mutual_info_score(X, Y)}\nCMI:{cmi(X,Y,Z)}")
print(f"Unconditionally: {test_dependence(indep_test_asymptotic(X,Y,stat='mi')[1])}")
print(f"Conditionally: {test_dependence(cond_indep_test_asymptotic(X,Y,Z)[1])}")

X,Y,Z = sample_from_model3()
print(f"Model 3\nMI:{mutual_info_score(X, Y)}\nCMI:{cmi(X,Y,Z)}")
print(f"Unconditionally: {test_dependence(indep_test_asymptotic(X,Y,stat='mi')[1])}")
print(f"Conditionally: {test_dependence(cond_indep_test_asymptotic(X,Y,Z)[1])}")

Model 1
MI:0.007007438047019587
CMI:0.00188159921097073
Unconditionally: Dependent
Conditionally: Independent*
Model 2
MI:0.011892716942830839
CMI:0.0003923372126279079
Unconditionally: Dependent
Conditionally: Independent*
Model 3
MI:6.0233859839886605e-06
CMI:0.007018702850808585
Unconditionally: Independent*
Conditionally: Dependent
