In [26]:
import warnings

warnings.filterwarnings("ignore")

In [27]:
from sklearn.metrics import mutual_info_score
#from sklearn.feature_selection import chi2
from scipy.stats import chi2_contingency
from scipy.stats import chi2
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde, pearsonr
import seaborn as sns
import pandas as pd

# Mathematical Underpinnings - Lab 4

Tests to verify hipoteses of independence (from Lab 4):

In [28]:
def indep_test_asymptotic(X, Y, stat = 'mi'):

    if stat == "mi":

        stat_value = 2*len(X)*mutual_info_score(X, Y)

    if stat == "chi2":

        test_res = (chi2_contingency(pd.crosstab(X, Y)))
        stat_value = test_res.statistic

    df = (len(np.unique(X)) - 1)*(len(np.unique(Y)) - 1)

    p_value = 1 - chi2.cdf(stat_value, df=df)

    return stat_value, p_value

def indep_test_permutation(X, Y, B, stat="mi"):

    stat_value = mutual_info_score(X, Y)

    condition_p_value = 0
    for b in range(B):
        X_b = np.random.permutation(X)

        stat_value_b = mutual_info_score(X_b, Y)

        if stat_value <= stat_value_b:
            condition_p_value += 1

    p_value = (1 + condition_p_value)/(1 + B)

    return 2*len(X)*stat_value, p_value



## Task 1

### a)

In [29]:
# a function which computes CMI

In [30]:
 from sklearn.preprocessing import KBinsDiscretizer

In [31]:
def conditional_i_test(X,Y,Z):
    df = np.stack((X,Y),axis = 1)
    dis = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy = 'uniform')
    dis.fit(df)
    new_X = dis.transform(df)
    n = X.shape[0]
    MI = 0
    for z in np.unique(Z):
        MI += np.count_nonzero(Z == z)/n * mutual_info_score(new_X[Z==z,0], new_X[Z==z,1])
    return MI, chi2.sf(2*n*MI, 81*np.shape(np.unique(Z))[0])

In [32]:
n = 1000
ro = 0.15
data = np.random.multivariate_normal(np.zeros(2), np.array([[1,ro], [ro,1]]), n)

In [33]:
conditional_i_test(data[:,0], data[:,1], np.random.randint(0,2,n))

(0.08195836664158211, 0.443113603917279)

In [34]:
# teraz drugi przypadek
n = 1000
ro = 0.15
X = np.random.randn(n)
Z = np.random.randint(0,2,n)
Y = Z * X + 1/10*np.random.randn(n) # noise added
conditional_i_test(X,Y,Z)

(0.6908642777328198, 2.048815995301209e-192)

### b)

In [35]:
def conditional_permutation_test(X, Y, Z, B):
    benchmark = np.zeros(B)
    data = np.stack((X,Y),axis=1)
    discx = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
    discx.fit(data)
    new_X = discx.transform(data)
    MI = 0
    for z in np.unique(Z):
        MI += np.count_nonzero(Z == z)/n * mutual_info_score(new_X[Z == z,0], new_X[Z == z,1])
    for b in range(B):
        for z in np.unique(Z):
            MI += np.count_nonzero(Z == z)/n * mutual_info_score(np.random.permutation(new_X[Z == z,0]), new_X[Z == z,1])
        benchmark[b] = mutual_info_score(new_X[:,0], new_X[:,1])
    return (1 + sum(benchmark >= MI)) / (B + 1)

### c)

In [36]:

n = 1000
ro = 0.15
data = np.random.multivariate_normal(np.zeros(2), np.array([[1, ro], [ro, 1]]), n)
conditional_i_test(data[:,0], data[:,1], np.random.randint(0,2,n))

(0.07614297065876405, 0.6962391188332047)

In [37]:

n = 1000
ro = 0.15
X = np.random.randn(n)
Z = np.random.randint(0,2,n)
Y = Z * X + 1/10*np.random.randn(n) # noise added
conditional_permutation_test(X,Y,Z, 100)


0.009900990099009901

## Task 2

In [38]:
def sample_from_model1():
    pass

def sample_from_model2():
    pass

def sample_from_model3():
    pass

In [39]:
def discretize(Q):
    Q[Q < 0] = -1
    Q[Q >= 0] = 1
    return Q

def sample_from_model1(n=1000):
    Z = discretize(np.random.normal(0, 1, n))
    X = discretize(np.random.normal(Z/2, 1))
    Y = discretize(np.random.normal(Z/2, 1))
    return X, Y, Z

def sample_from_model2(n=1000):
    X = discretize(np.random.normal(0, 1, n))
    Z = discretize(np.random.normal(X/2, 1))
    Y = discretize(np.random.normal(Z/2, 1))
    return X, Y, Z

def sample_from_model3(n=1000):
    X = discretize(np.random.normal(0, 1, n))
    Y = discretize(np.random.normal(0, 1, n))
    Z = discretize(np.random.normal((X+Y)/2, 1))
    return X, Y, Z



### a)

Model 1: dependent, conditionally independent

Model 2: dependent, conditionally independent

Model 3: independent, conditionally dependent

### b)

In [40]:
X, Y, Z = sample_from_model1()

print("MI:")
print(mutual_info_score(X, Y))
print("CMI:")
print(conditional_i_test(X, Y, Z)[0])

MI:
0.01452524086812318
CMI:
0.0006296366411007807


In [41]:
X, Y, Z = sample_from_model2()

print("MI:")
print(mutual_info_score(X, Y))
print("CMI:")
print(conditional_i_test(X, Y, Z)[0])

MI:
0.015690823716691726
CMI:
0.0018604522542880638


In [42]:
X, Y, Z = sample_from_model3()

print("MI:")
print(mutual_info_score(X, Y))
print("CMI:")
print(conditional_i_test(X, Y, Z)[0])

MI:
7.390300710286013e-08
CMI:
0.007440045380275112


### c)

Model 1

In [52]:
X, Y, Z = sample_from_model1()

print("Independence test p-value: "+ str(indep_test_asymptotic(X,Y, 'mi')))
print("Conditional permutation test p-value: "+ str(conditional_permutation_test(X,Y, Z, 100)))

print("Chi2 independence test p-value: "+ str(indep_test_asymptotic(X,Y,"chi2")[1]))
print("Conditional chi2 test p-value: "+ str(conditional_i_test(X,Y, Z)[1]))


Independence test p-value: (10.407009682625091, 0.0012553786531661615)
Conditional permutation test p-value: 0.009900990099009901
Chi2 independence test p-value: 0.001578067856151888
Conditional chi2 test p-value: 1.0


Model 2

In [54]:
X, Y, Z = sample_from_model2()

print("Independence test p-value: "+ str(indep_test_asymptotic(X,Y, 'mi')))
print("Conditional permutation test p-value: "+ str(conditional_permutation_test(X,Y, Z, 100)))

print("Chi2 independence test p-value: "+ str(indep_test_asymptotic(X,Y,"chi2")[1]))
print("Conditional chi2 test p-value: "+ str(conditional_i_test(X,Y, Z)[1]))

Independence test p-value: (24.012473989735494, 9.571359249616052e-07)
Conditional permutation test p-value: 0.009900990099009901
Chi2 independence test p-value: 1.3952628016289381e-06
Conditional chi2 test p-value: 1.0


Model 3

In [55]:
X, Y, Z = sample_from_model3()

print("Independence test p-value: "+ str(indep_test_asymptotic(X,Y, 'mi')))
print("Conditional permutation test p-value: "+ str(conditional_permutation_test(X,Y, Z, 100)))

print("Chi2 independence test p-value: "+ str(indep_test_asymptotic(X,Y,"chi2")[1]))
print("Conditional chi2 test p-value: "+ str(conditional_i_test(X,Y, Z)[1]))

Independence test p-value: (1.273135188640473, 0.2591793436316202)
Conditional permutation test p-value: 0.009900990099009901
Chi2 independence test p-value: 0.2870042376422688
Conditional chi2 test p-value: 1.0
