# Mathematical Underpinnings - Lab 6

In [1]:
from sklearn.metrics import mutual_info_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from tqdm import tqdm

In [44]:
np.random.seed(123)

## Useful functions

In [2]:
def discetize_2bins(X):
    X_discrete = 1*(X >= 0)
    return X_discrete

In [3]:
def conditional_permutation(X, Z):

    z_values = np.unique(Z)
    n_z_values = len(z_values)
    n = len(Z)

    X_b = np.zeros(n)

    for i in range(n_z_values):

        z_value_tmp = z_values[i]

        X_b[Z == z_value_tmp] = np.random.permutation(X[Z == z_value_tmp])

    return X_b

In [4]:
def conditional_mutual_information(X, Y, Z):

    z_values = np.unique(Z)
    n_z_values = len(z_values)
    n = len(Z)

    cmi = 0

    for i in range(n_z_values):

        z_value_tmp = z_values[i]
        z_condition = (Z == z_value_tmp)

        X_z = X[z_condition]
        Y_z = Y[z_condition]

        mi_XY_z = mutual_info_score(X_z, Y_z)
        p_z = np.sum(z_condition)/n

        cmi += p_z*mi_XY_z

    return cmi

In [5]:
# II(X;Y;Z)
def interaction_information(X, Y, Z):
    return conditional_mutual_information(X, Y, Z) - mutual_info_score(X, Y)

In [6]:
# II(X;Y;Z1;Z2)
def interaction_information2(X, Y, Z1, Z2):
    Z_1_and_2 = 2*Z2 + Z1
    return interaction_information(X, Y, Z_1_and_2) - interaction_information(X, Y, Z1) - interaction_information(X, Y, Z2)

## Task 1

In [15]:
def secmi2(X, Y, Z):
    # I(X;Y)
    first = mutual_info_score(X, Y)
    second_list=[0 for _ in range(Z.shape[1])]
    for i in range(Z.shape[1]):
        second_list[i]=interaction_information(X, Y, Z[:,i])
    return sum(second_list)+first
def secmi3(X, Y, Z):
    first_and_second=secmi2(X, Y, Z)
    third_list=[0 for _ in range(Z.shape[1])]
    for i in range(Z.shape[1]):
        for j in range(i+1, Z.shape[1]):
            third_list[i]=interaction_information2(X, Y, Z[:,i], Z[:,j])
    return sum(third_list)+first_and_second

### a)

In [13]:
def cond_indep_test_permutation(X, Y, Z, B=50, stat='cmi'):

    n_col_Z = Z.shape[1]
    Z_1dim = np.dot(Z, 2**np.linspace(0, n_col_Z-1, n_col_Z))

    if stat == "cmi":
        stat_value = conditional_mutual_information(X, Y, Z_1dim)
    if stat == "secmi2":
        stat_value = secmi2(X, Y, Z)
    if stat == "secmi3":
        stat_value = secmi3(X, Y, Z)

    condition_p_value = 0
    for b in range(B):
        X_b = conditional_permutation(X, Z_1dim)

        if stat == "cmi":
            stat_value_b = conditional_mutual_information(X_b, Y, Z_1dim)
        if stat == "secmi2":
            stat_value_b = secmi2(X_b, Y, Z)
        if stat == "secmi3":
            stat_value_b = secmi3(X_b, Y, Z)

        if stat_value <= stat_value_b:
            condition_p_value += 1

    p_value = (1 + condition_p_value)/(1 + B)

    return 2*len(X)*stat_value, p_value

### b)

In [9]:
def generate_data(n):
    Y_first = np.random.normal(size=n)
    Y = discetize_2bins(Y_first)

    Z1_first = np.random.normal(loc=Y/2,size=n)
    Z1 = discetize_2bins(Z1_first)

    Z2_first = np.random.normal(loc=Y/2,size=n)
    Z2 = discetize_2bins(Z2_first)

    Z3_first = np.random.normal(loc=Y/2,size=n)
    Z3 = discetize_2bins(Z3_first)

    X_first = np.random.normal(loc=Z1/2,size=n)
    X = discetize_2bins(X_first)

    return Y, Z1, Z2, Z3, X

In [40]:
def run_tests_b(n):
    Y, Z1, Z2, Z3, X = generate_data(n)
    results=pd.DataFrame(columns=['stat', 'stat_val_1', 'p_value_1', 'stat_val_2', 'p_value_2'], index=range(3))
    results['stat']=['cmi', 'secmi2', 'secmi3']
    Z_test_1=np.column_stack((Z1, Z2))
    Z_test_2=np.column_stack((Z2, Z3))
    for stats in ['cmi', 'secmi2', 'secmi3']:
        results.loc[results['stat']==stats,'stat_val_1'], results.loc[results['stat']==stats,'p_value_1']=cond_indep_test_permutation(X, Y, Z_test_1, B=50, stat=stats)
        results.loc[results['stat']==stats,'stat_val_2'], results.loc[results['stat']==stats,'p_value_2']=cond_indep_test_permutation(X, Y, Z_test_2, B=50, stat=stats)
    return results    

In [23]:
Y, Z1, Z2, Z3, X = generate_data(10)
Z1

array([1, 1, 0, 1, 1, 0, 0, 1, 1, 1])

In [45]:
N=100
results_test=run_tests_b(N)
results_test

Unnamed: 0,stat,stat_val_1,p_value_1,stat_val_2,p_value_2
0,cmi,2.481897,0.784314,1.373166,0.862745
1,secmi2,1.306004,0.764706,1.089497,0.823529
2,secmi3,2.481897,0.666667,1.373166,0.823529


In this test there was no test true.

In [59]:
cmi_1=0
cmi_2=0
secmi2_1=0
secmi2_2=0
secmi3_1=0
secmi3_2=0

for i in range(100):
    print(i)
    results_test=run_tests_b(N)
    if int(results_test[results_test['stat']=='cmi']['p_value_1']<0.05):
        cmi_1+=1
    elif int(results_test[results_test['stat']=='cmi']['p_value_2']<0.05):
        cmi_2+=1
    elif int(results_test[results_test['stat']=='secmi2']['p_value_1']<0.05):
        secmi2_1+=1
    elif int(results_test[results_test['stat']=='secmi2']['p_value_2']<0.05):
        secmi2_2+=1
    elif int(results_test[results_test['stat']=='secmi3']['p_value_1']<0.05):
        secmi3_1+=1
    elif int(results_test[results_test['stat']=='secmi3']['p_value_2']<0.05):
        secmi3_1+=1
    else:
        pass


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [60]:
print(f"For cmi hypothesis 1 was rejected: {100-cmi_1} times.")
print(f"For cmi hypothesis 2 was rejected: {100-cmi_2} times.")
print(f"For secmi2 hypothesis 1 was rejected: {100-secmi2_1} times.")
print(f"For secmi2 hypothesis 2 was rejected: {100-secmi2_2} times.")
print(f"For secmi3 hypothesis 1 was rejected: {100-secmi3_1} times.")
print(f"For secmi3 hypothesis 2 was rejected: {100-secmi3_2} times.")

For cmi hypothesis 1 was rejected: 91 times.
For cmi hypothesis 1 was rejected: 97 times.
For secmi2 hypothesis 1 was rejected: 98 times.
For secmi2 hypothesis 1 was rejected: 99 times.
For secmi3 hypothesis 1 was rejected: 98 times.
For secmi3 hypothesis 1 was rejected: 100 times.


### c)

## Task 2
 
in R