In [1]:
from sklearn.metrics import mutual_info_score
#from sklearn.feature_selection import chi2
from scipy.stats import chi2_contingency
from scipy.stats import chi2
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde, pearsonr
import seaborn as sns
import pandas as pd

# Mathematical Underpinnings - Lab 4

Tests to verify hypotheses of independence (from Lab 4):

In [11]:
def indep_test_asymptotic(X, Y, stat):
    """Asymptotic test of independence between two discrete samples.

    Parameters
    ----------
    X, Y : array-like
        Paired observations of two discrete variables.
    stat : {"mi", "chi2"}
        "mi"   -> statistic is 2 * n * mutual_info_score(X, Y);
        "chi2" -> statistic is the one returned by scipy.stats.chi2_contingency
                  (called with its default settings) on pd.crosstab(X, Y).

    Returns
    -------
    (stat_value, p_value)
        p_value is the upper tail of a chi-square distribution with
        (#levels of X - 1) * (#levels of Y - 1) degrees of freedom.

    Raises
    ------
    ValueError
        If `stat` is not one of the supported names.
    """
    if stat == "mi":
        stat_value = 2 * len(X) * mutual_info_score(X, Y)
    elif stat == "chi2":
        stat_value = chi2_contingency(pd.crosstab(X, Y)).statistic
    else:
        # Without this branch an unknown name would surface later as an
        # unrelated "local variable referenced before assignment" error.
        raise ValueError(f"unknown stat {stat!r}; expected 'mi' or 'chi2'")

    df = (len(np.unique(X)) - 1) * (len(np.unique(Y)) - 1)

    # sf is the survival function (1 - cdf) evaluated without the cancellation
    # error that `1 - cdf` suffers for very small p-values.
    p_value = chi2.sf(stat_value, df=df)

    return stat_value, p_value

def indep_test_permutation(X, Y, B, stat="mi"):
    """Permutation test of independence based on mutual information.

    X is shuffled B times with np.random.permutation; the p-value is the share
    of shuffles (plus one, for the observed sample) whose mutual information
    with Y is at least the observed one.

    Note: `stat` is accepted but never consulted; the statistic is always
    mutual information.

    Returns
    -------
    (2 * n * MI(X, Y), p_value)
    """
    observed_mi = mutual_info_score(X, Y)

    # One shuffle of X per iteration, compared against the observed value.
    n_at_least_observed = sum(
        1
        for _ in range(B)
        if mutual_info_score(np.random.permutation(X), Y) >= observed_mi
    )

    return 2 * len(X) * observed_mi, (1 + n_at_least_observed) / (1 + B)

## Task 1

In [12]:
# a function which computes CMI
def calculate_cmi(X, Y, Z):
    """Plug-in estimate of the conditional mutual information I(X; Y | Z).

    The estimate is the average of mutual_info_score(X, Y) computed inside each
    stratum Z == z, weighted by the stratum's share of the sample.

    Parameters
    ----------
    X, Y, Z : array-like of equal length
        Discrete observations. Any sequence accepted by np.asarray works
        (lists, numpy arrays, pandas Series).

    Returns
    -------
    float
        The weighted sum over strata; 0.0 when the input is empty.
    """
    # Coerce once so that boolean-mask indexing below works for plain lists too.
    X = np.asarray(X)
    Y = np.asarray(Y)
    Z = np.asarray(Z)

    n = len(Z)
    cmi = 0.0
    for z in np.unique(Z):
        in_stratum = Z == z  # computed once and reused for X, Y and the weight
        cmi += mutual_info_score(X[in_stratum], Y[in_stratum]) * in_stratum.sum() / n
    return cmi

### a)

In [13]:
# CI test based on CMI and asymptotics
def cond_indep_test_asymptotic(X, Y, Z):
    """Asymptotic test of conditional independence of X and Y given Z.

    Parameters
    ----------
    X, Y, Z : array-like of equal length
        Discrete observations; Z is the conditioning variable and is the third
        positional argument.

    Returns
    -------
    (stat_value, p_value)
        stat_value is 2 * n * CMI(X, Y | Z); p_value is the upper tail of a
        chi-square distribution with
        (#levels of X - 1) * (#levels of Y - 1) * (#levels of Z) degrees of freedom.
    """
    # Z comes from the argument list, so the result depends only on the inputs
    # and not on whatever `Z` happens to exist at module level.
    stat_value = 2 * len(X) * calculate_cmi(X, Y, Z)
    df = (len(np.unique(X)) - 1) * (len(np.unique(Y)) - 1) * len(np.unique(Z))
    # Survival function: 1 - cdf without cancellation error for tiny p-values.
    p_value = chi2.sf(stat_value, df=df)
    return stat_value, p_value

### b)

In [14]:
# CI test based on CMI and permutations

def cond_permutation(X, Z):
    """Return a copy of X whose values are shuffled separately within each level of Z.

    Positions with Z == z only exchange values among themselves, so the
    pairing between X and Z is preserved. The input X is not modified.
    Shuffling uses np.random.permutation, one call per level of Z in the
    order given by np.unique(Z).
    """
    permuted = X.copy()
    for z in np.unique(Z):
        in_stratum = Z == z  # mask computed once per level
        permuted[in_stratum] = np.random.permutation(permuted[in_stratum])
    return permuted
        

def cond_indep_test_permutation(X, Y, Z, B):
    """Permutation test of conditional independence of X and Y given Z.

    The observed CMI is compared with the CMI of B resamples in which X is
    shuffled within the levels of Z (via cond_permutation). The p-value is
    (1 + number of resamples whose CMI is at least the observed one) / (1 + B).

    Returns
    -------
    (2 * n * CMI(X, Y | Z), p_value)
    """
    observed_cmi = calculate_cmi(X, Y, Z)

    # Each term draws one within-stratum shuffle of X and scores it.
    n_at_least_observed = sum(
        1
        for _ in range(B)
        if observed_cmi <= calculate_cmi(cond_permutation(X, Z), Y, Z)
    )

    return 2 * len(X) * observed_cmi, (1 + n_at_least_observed) / (1 + B)

### c)

Conditionally independent

In [68]:
# Example in which X, Y and Z are each drawn separately from their own
# generator call, so no dependence between them is built in.
n = 1000
# Normal draws are discretized with pd.cut into 10 bins; labels=False returns
# integer bin codes instead of interval labels.
X = pd.cut(np.random.normal(0, 1, n), bins=10, labels=False) 
Y = pd.cut(np.random.normal(0, 2, n), bins=10, labels=False) 
# Conditioning variable: integers drawn from {0, 1, 2}.
Z = np.random.randint(0, 3, n)

# No seed is set in this cell, so the printed numbers change from run to run.
print(f'Asymptotic test of conditional independence with mutual information:')
stat_value, p_value = cond_indep_test_asymptotic(X, Y, Z)
print(f'statistic value: {stat_value}, p-value: {p_value}')
print(f'Permutation test of conditional independence:')
# 50 is the number of permutations (argument B).
stat_value, p_value = cond_indep_test_permutation(X, Y, Z, 50)
print(f'statistic value: {stat_value}, p-value: {p_value}')

Asymptotic test of conditional independence with mutual information:
statistic value: 189.4052743717044, p-value: 0.9954227788843352
Permutation test of conditional independence:
statistic value: 189.4052743717044, p-value: 0.7450980392156863


Both p-values are well above 0.05 (the asymptotic one is close to 1), so we fail to reject the hypothesis that X and Y are conditionally independent given Z.

Conditionally dependent

In [26]:
# Example with dependence inside each level of Z: two groups of n pairs each,
# with covariance +0.5 in the first group and -0.5 in the second.
n = 300
X1, Y1 = np.random.multivariate_normal(np.zeros(2), np.array([[1, 0.5], [0.5, 1]]), n).T
Z1 = np.ones(n)
X2, Y2 = np.random.multivariate_normal(np.zeros(2), np.array([[1, -0.5], [-0.5, 1]]), n).T
Z2 = np.ones(n) * 2
# Both groups are pooled first and then cut into 10 bins, so the bin edges
# are shared between the groups; labels=False returns integer bin codes.
X = pd.cut(np.concatenate([X1, X2]), bins=10, labels=False)
Y = pd.cut(np.concatenate([Y1, Y2]), bins=10, labels=False)
# Z marks group membership (1.0 for the first group, 2.0 for the second).
Z = np.concatenate([Z1, Z2])

# No seed is set in this cell, so the printed numbers change from run to run.
print(f'Asymptotic test of conditional independence with mutual information:')
stat_value, p_value = cond_indep_test_asymptotic(X, Y, Z)
print(f'statistic value: {stat_value}, p-value: {p_value}')
print(f'Permutation test of conditional independence:')
# 50 is the number of permutations (argument B).
stat_value, p_value = cond_indep_test_permutation(X, Y, Z, 50)
print(f'statistic value: {stat_value}, p-value: {p_value}')

Asymptotic test of conditional independence with mutual information:
statistic value: 245.49512259358036, p-value: 2.506976507576919e-05
Permutation test of conditional independence:
statistic value: 245.49512259358036, p-value: 0.0196078431372549


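Both p-values are below 0.05, so we reject the hypothesis that X and Y are conditionally independent given Z. (With B = 50 permutations, 1/51 ≈ 0.0196 is the smallest p-value the permutation test can return.)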

## Task 2

In [15]:
def discretize(v):
    """Binarize values by sign: negatives become -1, everything >= 0 becomes 1.

    The result is a new array with the same dtype as np.array(v); the input is
    left untouched. Entries for which neither comparison holds (NaN) are kept
    as they are.
    """
    # Work on a copy so the caller's array is not overwritten as a side effect.
    out = np.array(v, copy=True)
    # Order matters: entries set to -1 here stay negative and are therefore
    # not touched by the second assignment.
    out[out < 0] = -1
    out[out >= 0] = 1
    return out

def sample_from_model1(n=1000):
    """Draw n observations from model 1.

    Z is generated first from a standard normal; X and Y are then generated
    separately, each from a normal centred at Z / 2. All three are passed
    through discretize.

    Returns
    -------
    (X, Y, Z)
    """
    common = discretize(np.random.normal(loc=0, scale=1, size=n))
    from_common_x = discretize(np.random.normal(loc=common / 2, scale=1))
    from_common_y = discretize(np.random.normal(loc=common / 2, scale=1))
    return from_common_x, from_common_y, common

def sample_from_model2(n=1000):
    """Draw n observations from model 2.

    The variables are generated in a chain: X from a standard normal, Z from a
    normal centred at X / 2, and Y from a normal centred at Z / 2. All three
    are passed through discretize.

    Returns
    -------
    (X, Y, Z)
    """
    first = discretize(np.random.normal(loc=0, scale=1, size=n))
    middle = discretize(np.random.normal(loc=first / 2, scale=1))
    last = discretize(np.random.normal(loc=middle / 2, scale=1))
    return first, last, middle

def sample_from_model3(n=1000):
    """Draw n observations from model 3.

    X and Y are generated separately from standard normals; Z is then
    generated from a normal centred at (X + Y) / 2. All three are passed
    through discretize.

    Returns
    -------
    (X, Y, Z)
    """
    source_x = discretize(np.random.normal(loc=0, scale=1, size=n))
    source_y = discretize(np.random.normal(loc=0, scale=1, size=n))
    joint_effect = discretize(np.random.normal(loc=(source_x + source_y) / 2, scale=1))
    return source_x, source_y, joint_effect

### a)

answer:

Model 1: $X \not\!\perp\!\!\!\perp Y$ and $X \!\perp\!\!\!\perp Y | Z$

Model 2: $X \not\!\perp\!\!\!\perp Y$ and $X \!\perp\!\!\!\perp Y | Z$

Model 3: $X \!\perp\!\!\!\perp Y$ and $X \not\!\perp\!\!\!\perp Y | Z$

### b)

#### Model 1

In [27]:
# Draw a sample from model 1 (default n=1000) and print the estimated
# mutual information of (X, Y) next to the conditional one given Z.
X, Y, Z = sample_from_model1()
print(f'Mutual information: {mutual_info_score(X, Y)}')
print(f'Conditional mutual information: {calculate_cmi(X, Y, Z)}')

Mutual information: 0.011286357599670616
Conditional mutual information: 0.00022561836562851804


#### Model 2

In [28]:
# Draw a sample from model 2 (default n=1000) and print the estimated
# mutual information of (X, Y) next to the conditional one given Z.
X, Y, Z = sample_from_model2()
print(f'Mutual information: {mutual_info_score(X, Y)}')
print(f'Conditional mutual information: {calculate_cmi(X, Y, Z)}')

Mutual information: 0.006712262263487978
Conditional mutual information: 0.00083837904191386


#### Model 3

In [29]:
# Draw a sample from model 3 (default n=1000) and print the estimated
# mutual information of (X, Y) next to the conditional one given Z.
X, Y, Z = sample_from_model3()
print(f'Mutual information: {mutual_info_score(X, Y)}')
print(f'Conditional mutual information: {calculate_cmi(X, Y, Z)}')

Mutual information: 2.048396046228085e-06
Conditional mutual information: 0.01386560976362487


### c)

In [32]:
def calc_tests(X, Y, Z, B=50):
    """Run and print all four tests for one sample.

    Prints, in order: the asymptotic and permutation tests of independence of
    X and Y, a separator line, then the asymptotic and permutation tests of
    conditional independence of X and Y given Z.

    Parameters
    ----------
    X, Y, Z : array-like of equal length
        Discrete observations; Z is the conditioning variable.
    B : int, default 50
        Number of permutations used by both permutation tests.

    Returns
    -------
    None
        Results are only printed.
    """
    print('Independence tests:')
    print('Asymptotic test of independence with mutual information:')
    stat_value, p_value = indep_test_asymptotic(X, Y, 'mi')
    print(f'statistic value: {stat_value}, p-value: {p_value}')
    print('Permutation test of independence:')
    stat_value, p_value = indep_test_permutation(X, Y, B)
    print(f'statistic value: {stat_value}, p-value: {p_value}')
    print('----------------------------------------------------')
    print('Conditional independence tests:')
    print('Asymptotic test of conditional independence with mutual information:')
    stat_value, p_value = cond_indep_test_asymptotic(X, Y, Z)
    print(f'statistic value: {stat_value}, p-value: {p_value}')
    print('Permutation test of conditional independence:')
    stat_value, p_value = cond_indep_test_permutation(X, Y, Z, B)
    print(f'statistic value: {stat_value}, p-value: {p_value}')

#### Model 1

In [31]:
# Fresh sample from model 1, then all four tests printed by calc_tests.
X, Y, Z = sample_from_model1()
calc_tests(X, Y, Z)

Independence tests:
Asymptotic test of independence with mutual information:
statistic value: 38.63028295789772, p-value: 5.12180520217953e-10
Permutation test of independence:
statistic value: 38.63028295789772, p-value: 0.0196078431372549
----------------------------------------------------
Conditional independence tests:
Asymptotic test of conditional independence with mutual information:
statistic value: 0.9798267143903748, p-value: 0.6126794761530625
Permutation test of conditional independence:
statistic value: 0.9798267143903748, p-value: 0.6470588235294118


The p-values in the independence tests are below 0.05, so we reject the null hypothesis that X and Y are independent. In the conditional independence tests the p-values are above 0.05, so we fail to reject the hypothesis that X and Y are conditionally independent given Z.

#### Model 2

In [21]:
# Fresh sample from model 2, then all four tests printed by calc_tests.
X, Y, Z = sample_from_model2()
calc_tests(X, Y, Z)

Independence tests:
Asymptotic test of independence with mutual information:
statistic value: 41.17574006745994, p-value: 1.391388115834502e-10
Permutation test of independence:
statistic value: 41.17574006745994, p-value: 0.0196078431372549
----------------------------------------------------
Conditional independence tests:
Asymptotic test of conditional independence with mutual information:
statistic value: 1.0786226938582484, p-value: 0.5831497019614147
Permutation test of conditional independence:
statistic value: 1.0786226938582484, p-value: 0.5686274509803921


As with the sample from model 1, we reject the hypothesis that X and Y are independent, but fail to reject the hypothesis that X and Y are conditionally independent given Z.

#### Model 3

In [23]:
# Fresh sample from model 3, then all four tests printed by calc_tests.
X, Y, Z = sample_from_model3()
calc_tests(X, Y, Z)

Independence tests:
Asymptotic test of independence with mutual information:
statistic value: 1.1973852178203126, p-value: 0.2738449159862093
Permutation test of independence:
statistic value: 1.1973852178203126, p-value: 0.21568627450980393
----------------------------------------------------
Conditional independence tests:
Asymptotic test of conditional independence with mutual information:
statistic value: 16.559253034787933, p-value: 0.000253631909453933
Permutation test of conditional independence:
statistic value: 16.559253034787933, p-value: 0.0196078431372549


For the sample from model 3, we fail to reject the hypothesis that X and Y are independent, and we reject the hypothesis that they are conditionally independent given Z.