In [1]:
from sklearn.metrics import mutual_info_score
#from sklearn.feature_selection import chi2
from scipy.stats import chi2_contingency
from scipy.stats import chi2
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde, pearsonr
import seaborn as sns
import pandas as pd

# Mathematical Underpinnings - Lab 4

Tests to verify hipoteses of independence (from Lab 4):

In [2]:
def indep_test_asymptotic(X, Y, stat):

    if stat == "mi":

        stat_value = 2*len(X)*mutual_info_score(X, Y)

    if stat == "chi2":

        test_res = (chi2_contingency(pd.crosstab(X, Y)))
        stat_value = test_res.statistic

    df = (len(np.unique(X)) - 1)*(len(np.unique(Y)) - 1)

    p_value = 1 - chi2.cdf(stat_value, df=df)

    return stat_value, p_value

def indep_test_permutation(X, Y, B, stat="mi"):

    stat_value = mutual_info_score(X, Y)

    condition_p_value = 0
    for b in range(B):
        X_b = np.random.permutation(X)

        stat_value_b = mutual_info_score(X_b, Y)

        if stat_value <= stat_value_b:
            condition_p_value += 1

    p_value = (1 + condition_p_value)/(1 + B)

    return 2*len(X)*stat_value, p_value

## Task 1

In [3]:
# a function which computes CMI

In [4]:
def cond_mutual_info(X, Y, Z):
    mutual_info = 0
    for i in np.unique(Z):
        ind = np.where(Z==i)[0]
        X_cond = X[ind]
        Y_cond = Y[ind]
        mutual_info += (len(ind)/len(Z)) * mutual_info_score(X_cond, Y_cond)
    return mutual_info     

### a)

In [5]:
# CI test based on CMI and asymptotics

In [6]:
def cond_indep_test_asymptotic(X, Y, Z, stat="mi"):
                          
    stat_value = 2*len(X)*cond_mutual_info(X, Y, Z)

    df = (len(np.unique(X)) - 1)*(len(np.unique(Y)) - 1)*(len(np.unique(Z)))

    p_value = 1 - chi2.cdf(stat_value, df=df)

    return stat_value, p_value

### b)

In [7]:
# CI test based on CMI and permutations

In [8]:
def cond_indep_test_permutation(X, Y, Z, B, stat="mi"):

    stat_value = cond_mutual_info(X, Y, Z)

    condition_p_value = 0
    for b in range(B):
        X_b = np.random.permutation(X)

        stat_value_b = cond_mutual_info(X_b, Y, Z)

        if stat_value <= stat_value_b:
            condition_p_value += 1

    p_value = (1 + condition_p_value)/(1 + B)

    return 2*len(X)*stat_value, p_value

### c)

conditionaly independent

In [9]:
sample_size = 1000

X = np.random.normal(0, 1, sample_size)
Y = np.random.normal(0, 1, sample_size)
Z = np.random.normal(0, 1, sample_size)

X_bins = np.linspace(X.min(), X.max(), 3)
Y_bins = np.linspace(Y.min(), Y.max(), 3)
Z_bins = np.linspace(Z.min(), Z.max(), 3)

X_discrete = np.digitize(X, bins=X_bins)
Y_discrete = np.digitize(Y, bins=Y_bins)
Z_discrete = np.digitize(Z, bins=Z_bins)

In [10]:
print("Asymptotic test based on conditional mutual information:")
pvalue = cond_indep_test_asymptotic(X_discrete, Y_discrete, Z_discrete, "mi")[1]
print(f'p-value: {pvalue}')
if pvalue < 0.05:
    print("Reject the null hypothesis")
else:
    print("Don't reject the null hypothesis")

Asymptotic test based on conditional mutual information:
p-value: 0.9968858581205507
Don't reject the null hypothesis


In [11]:
print("Permutation test:")
pvalue = cond_indep_test_permutation(X_discrete, Y_discrete, Z_discrete, 100, "mi")[1]
print(f'p-value: {pvalue}')
if pvalue < 0.05:
    print("Reject the null hypothesis")
else:
    print("Don't reject the null hypothesis")

Permutation test:
p-value: 0.801980198019802
Don't reject the null hypothesis


conditionaly dependent

In [12]:
sample_size = 1000

X_discrete = np.zeros(sample_size)
Y_discrete = np.zeros(sample_size)

Z = np.random.normal(0, 1, sample_size)
Z_bins = np.linspace(Z.min(), Z.max(), 3)
Z_discrete = np.digitize(Z, bins=Z_bins)


for z in np.unique(Z_discrete):
    mean = [0, 0]
    covariance_matrix = [[1, 0.5], [0.5, 1]]

    ind = np.where(Z_discrete==z)[0]
    
    sample = np.random.multivariate_normal(mean, covariance_matrix, len(ind))
    x = sample[:, 0]
    y = sample[:, 1]
    
    x_bins = np.linspace(x.min(), x.max(), 3)
    y_bins = np.linspace(y.min(), y.max(), 3)
    
    x_discrete = np.digitize(x, bins=x_bins)
    y_discrete = np.digitize(y, bins=y_bins)

    X_discrete[ind] = x_discrete
    Y_discrete[ind] = y_discrete

In [13]:
print("Asymptotic test based on conditional mutual information:")
pvalue = cond_indep_test_asymptotic(X_discrete, Y_discrete, Z_discrete, "mi")[1]
print(f'p-value: {pvalue}')
if pvalue < 0.05:
    print("Reject the null hypothesis")
else:
    print("Don't reject the null hypothesis")

Asymptotic test based on conditional mutual information:
p-value: 1.5765166949677223e-14
Reject the null hypothesis


In [14]:
print("Permutation test:")
pvalue = cond_indep_test_permutation(X_discrete, Y_discrete, Z_discrete, 100, "mi")[1]
print(f'p-value: {pvalue}')
if pvalue < 0.05:
    print("Reject the null hypothesis")
else:
    print("Don't reject the null hypothesis")

Permutation test:


p-value: 0.009900990099009901
Reject the null hypothesis


## Task 2

In [15]:
def sample_from_model1():
    sample_size = 1000
    
    Z = np.random.normal(0, 1, sample_size)
    Z_discrete = np.digitize(Z, bins=[0])
    Z_discrete[Z_discrete == 0] = -1

    X = np.random.normal(Z_discrete/2, 1, sample_size)
    X_discrete = np.digitize(X, bins=[0])
    X_discrete[X_discrete == 0] = -1

    Y = np.random.normal(Z_discrete/2, 1, sample_size)
    Y_discrete = np.digitize(Y, bins=[0])
    Y_discrete[Y_discrete == 0] = -1

    return X_discrete, Y_discrete, Z_discrete

In [16]:
def sample_from_model2():
    sample_size = 1000

    X = np.random.normal(0, 1, sample_size)
    X_discrete = np.digitize(X, bins=[0])
    X_discrete[X_discrete == 0] = -1
    
    Z = np.random.normal(X_discrete/2, 1, sample_size)
    Z_discrete = np.digitize(Z, bins=[0])
    Z_discrete[Z_discrete == 0] = -1

    Y = np.random.normal(Z_discrete/2, 1, sample_size)
    Y_discrete = np.digitize(Y, bins=[0])
    Y_discrete[Y_discrete == 0] = -1
    
    return X_discrete, Y_discrete, Z_discrete

In [17]:
def sample_from_model3():
    sample_size = 1000

    X = np.random.normal(0, 1, sample_size)
    X_discrete = np.digitize(X, bins=[0])
    X_discrete[X_discrete == 0] = -1

    Y = np.random.normal(0, 1, sample_size)
    Y_discrete = np.digitize(Y, bins=[0])
    Y_discrete[Y_discrete == 0] = -1

    
    Z = np.random.normal((X_discrete+Y_discrete)/2, 1, sample_size)
    Z_discrete = np.digitize(Z, bins=[0])
    Z_discrete[Z_discrete == 0] = -1
    
    return X_discrete, Y_discrete, Z_discrete

### a)

answer: \
Model 1 : X and Y are dependent but X and Y are independent given Z \
Model 2 : X and Y are dependent but X and Y are independent given Z \
Model 3 : X and Y are independent but X and Y are dependent given Z

### b)

#### Model 1

In [18]:
X, Y, Z = sample_from_model1()

print("Mutual information between X and Y:")
print(mutual_info_score(X, Y))
print("Conditional mutual information between X and Y given Z:")
print(cond_mutual_info(X, Y, Z))

Mutual information between X and Y:
0.003610246588255428
Conditional mutual information between X and Y given Z:
0.0021709402328269124


#### Model 2

In [19]:
X, Y, Z = sample_from_model2()

print("Mutual information between X and Y:")
print(mutual_info_score(X, Y))
print("Conditional mutual information between X and Y given Z:")
print(cond_mutual_info(X, Y, Z))

Mutual information between X and Y:
0.013187009104361824
Conditional mutual information between X and Y given Z:
0.0011512107199705857


#### Model 3

In [20]:
X, Y, Z = sample_from_model3()

print("Mutual information between X and Y:")
print(mutual_info_score(X, Y))
print("Conditional mutual information between X and Y given Z:")
print(cond_mutual_info(X, Y, Z))

Mutual information between X and Y:
0.0005346946033590871
Conditional mutual information between X and Y given Z:
0.011311682778773157


### c)

#### Model 1

In [21]:
X, Y, Z = sample_from_model1()

##### Independence

In [22]:
print("Asymptotic test based on mutual information:")
pval_1 = indep_test_asymptotic(X, Y, "mi")[1]
print(f'p-value: {pval_1}')
if pval_1 < 0.05:
    print("X and Y are dependent")
else:
    print("X and Y are independent")

print("Permutation test:")
pval_2 = indep_test_permutation(X, Y, 100, "mi")[1]
print(f'p-value: {pval_2}')
if pval_2 < 0.05:
    print("X and Y are dependent")
else:
    print("X and Y are independent")

Asymptotic test based on mutual information:
p-value: 5.319489388866394e-06
X and Y are dependent
Permutation test:
p-value: 0.009900990099009901
X and Y are dependent


##### Conditional independence

In [23]:
print("Asymptotic test based on conditional mutual information:")
pval_3 = cond_indep_test_asymptotic(X, Y, Z, "mi")[1]
print(f'p-value: {pval_3}')
if pval_3 < 0.05:
    print("X and Y are dependent given Z")
else:
    print("X and Y are independent given Z")

print("Permutation test:")
pval_4 = cond_indep_test_permutation(X, Y, Z, 100, "mi")[1]
print(f'p-value: {pval_4}')
if pval_4 < 0.05:
    print("X and Y are dependent given Z")
else:
    print("X and Y are independent given Z")

Asymptotic test based on conditional mutual information:
p-value: 0.5870807077745189
X and Y are independent given Z
Permutation test:
p-value: 0.5445544554455446
X and Y are independent given Z


#### Model 2

In [24]:
X, Y, Z = sample_from_model2()

##### Independence

In [25]:
print("Asymptotic test based on mutual information:")
pval_1 = indep_test_asymptotic(X, Y, "mi")[1]
print(f'p-value: {pval_1}')
if pval_1 < 0.05:
    print("X and Y are dependent")
else:
    print("X and Y are independent")

print("Permutation test:")
pval_2 = indep_test_permutation(X, Y, 100, "mi")[1]
print(f'p-value: {pval_2}')
if pval_2 < 0.05:
    print("X and Y are dependent")
else:
    print("X and Y are independent")

Asymptotic test based on mutual information:
p-value: 2.1245263289726424e-06
X and Y are dependent
Permutation test:


p-value: 0.009900990099009901
X and Y are dependent


##### Conditional independence

In [26]:
print("Asymptotic test based on conditional mutual information:")
pval_3 = cond_indep_test_asymptotic(X, Y, Z, "mi")[1]
print(f'p-value: {pval_3}')
if pval_3 < 0.05:
    print("X and Y are dependent given Z")
else:
    print("X and Y are independent given Z")

print("Permutation test:")
pval_4 = cond_indep_test_permutation(X, Y, Z, 100, "mi")[1]
print(f'p-value: {pval_4}')
if pval_4 < 0.05:
    print("X and Y are dependent given Z")
else:
    print("X and Y are independent given Z")

Asymptotic test based on conditional mutual information:
p-value: 0.10192753924619047
X and Y are independent given Z
Permutation test:
p-value: 0.12871287128712872
X and Y are independent given Z


#### Model 3

In [27]:
X, Y, Z = sample_from_model3()

##### Independence

In [28]:
print("Asymptotic test based on mutual information:")
pval_1 = indep_test_asymptotic(X, Y, "mi")[1]
print(f'p-value: {pval_1}')
if pval_1 < 0.05:
    print("X and Y are dependent")
else:
    print("X and Y are independent")

print("Permutation test:")
pval_2 = indep_test_permutation(X, Y, 100, "mi")[1]
print(f'p-value: {pval_2}')
if pval_2 < 0.05:
    print("X and Y are dependent")
else:
    print("X and Y are independent")

Asymptotic test based on mutual information:
p-value: 0.8193933122883362
X and Y are independent
Permutation test:
p-value: 0.8613861386138614
X and Y are independent


##### Conditional independence

In [29]:
print("Asymptotic test based on conditional mutual information:")
pval_3 = cond_indep_test_asymptotic(X, Y, Z, "mi")[1]
print(f'p-value: {pval_3}')
if pval_3 < 0.05:
    print("X and Y are dependent given Z")
else:
    print("X and Y are independent given Z")

print("Permutation test:")
pval_4 = cond_indep_test_permutation(X, Y, Z, 100, "mi")[1]
print(f'p-value: {pval_4}')
if pval_4 < 0.05:
    print("X and Y are dependent given Z")
else:
    print("X and Y are independent given Z")

Asymptotic test based on conditional mutual information:
p-value: 0.0013173353545314903
X and Y are dependent given Z
Permutation test:
p-value: 0.009900990099009901
X and Y are dependent given Z
