In [1]:
from sklearn.metrics import mutual_info_score
#from sklearn.feature_selection import chi2
from scipy.stats import chi2_contingency
from scipy.stats import chi2
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde, pearsonr
import seaborn as sns
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Mathematical Underpinnings - Lab 4

Tests to verify hipoteses of independence (from Lab 4):

In [2]:
def indep_test_asymptotic_dep(X, Y, stat):

    if stat == "mi":

        stat_value = 2*len(X)*mutual_info_score(X, Y)

    if stat == "chi2":

        test_res = (chi2_contingency(pd.crosstab(X, Y)))
        stat_value = test_res.statistic

    df = (len(np.unique(X)) - 1)*(len(np.unique(Y)) - 1)

    p_value = 1 - chi2.cdf(stat_value, df=df)

    return stat_value, p_value

def indep_test_permutation_dep(X, Y, B, stat="mi"):

    stat_value = mutual_info_score(X, Y)

    condition_p_value = 0
    for b in range(B):
        X_b = np.random.permutation(X)

        stat_value_b = mutual_info_score(X_b, Y)

        if stat_value <= stat_value_b:
            condition_p_value += 1

    p_value = (1 + condition_p_value)/(1 + B)

    return 2*len(X)*stat_value, p_value

## Task 1

In [3]:
# a function which computes CMI
n = 100
X = np.random.uniform(0,1,n)
Z = [0 for i in range(n//4)] + [1 for i in range(n//4+5)] + [2 for i in range(n//4-5)] + [3 for i in range(n//4)]
Y = X + Z

xbins = np.linspace(X.min(), X.max(), 10 + 1)
ybins = np.linspace(Y.min(), Y.max(), 10 + 1)
def get_bucket(x, bins):
    for i in range(1,len(bins)):
      if x<bins[i]:
          return i
    return i+1
X_bin = [get_bucket(x, xbins) for x in X]
Y_bin = [get_bucket(y, ybins) for y in Y]

def CMI(X, Y, Z):
  CMI = 0
  for z in np.unique(Z):
    X_filtered = X[Z==z]
    Y_filtered = Y[Z==z]
    xbins = np.linspace(X_filtered.min(), X_filtered.max(), 10 + 1)
    ybins = np.linspace(Y_filtered.min(), Y_filtered.max(), 10 + 1)
    prob = np.mean(Z==z)
    MI = mutual_info_score([get_bucket(x, xbins) for x in X], [get_bucket(y, ybins) for y in Y])
    CMI += MI*prob
  return CMI
CMI(X,Y,Z)

0.5684937554306104

### a)

In [4]:
# CI test based on CMI and asymptotics
def indep_test_asymptotic(X, Y, Z):

    stat_value = 2*len(X)*CMI(X, Y, Z)



    df = (9)*(9) * len(np.unique(Z))

    p_value = 1 - chi2.cdf(stat_value, df=df)

    return stat_value, p_value

indep_test_asymptotic(X, Y, Z)

(113.69875108612209, 1.0)

### b)

In [5]:
# CI test based on CMI and permutations
def indep_test_permutation(X, Y, Z, B):

    stat_value = CMI(X, Y, Z)

    condition_p_value = 0

    for b in range(B):
      X_b = []
      for z in np.unique(Z):
        X_b += list(np.random.permutation(X[Z==z]))

      stat_value_b = CMI(np.array(X_b), Y, Z)

      if stat_value <= stat_value_b:
          condition_p_value += 1

    p_value = (1 + condition_p_value)/(1 + B)

    return 2*len(X)*stat_value, p_value
indep_test_permutation(X, Y, Z, B=100)

(113.69875108612209, 0.009900990099009901)

### c)

conditionaly independent

In [6]:
n = 1000
X = np.random.uniform(0,1,n)
Z = [0 for i in range(n//4)] + [1 for i in range(n//4+5)] + [2 for i in range(n//4-5)] + [3 for i in range(n//4)]
Y = np.random.normal(0,1,n)
print(f"CMI: {CMI(X,Y,Z)}")
print(f"Permutation test: {indep_test_permutation(X, Y, Z, B=100)}")
print(f"Aymptotic test: {indep_test_asymptotic(X, Y, Z)}")

CMI: 0.03780889732325577
Permutation test: (75.61779464651154, 0.9900990099009901)
Aymptotic test: (75.61779464651154, 1.0)


conditionaly dependent

In [7]:

X = np.random.uniform(0,1,n)
Z = [1 for i in range(n//4)] + [2 for i in range(n//4+5)] + [3 for i in range(n//4-5)] + [4 for i in range(n//4)]
Y = X + np.array(Z)
print(f"CMI: {CMI(X,Y,Z)}")
print(f"Permutation test: {indep_test_permutation(X, Y, Z, B=100)}")
print(f"Aymptotic test: {indep_test_asymptotic(X, Y, Z)}")


CMI: 0.5379881763872735
Permutation test: (1075.976352774547, 0.009900990099009901)
Aymptotic test: (1075.976352774547, 0.0)


## Task 2

In [8]:
def sample_from_model1(n=100):
    Z = np.random.normal(0,1,n)
    Z = Z/np.abs(Z)
    X = [np.random.normal(z/2,1) for z in Z]
    Y = [np.random.normal(z/2,1) for z in Z]
    Y = Y/np.abs(Y)
    X = X/np.abs(X)
    return X, Y, Z

def sample_from_model2(n=100):
  X = np.random.normal(0,1,n)
  X = X/np.abs(X)
  Z = [np.random.normal(x/2,1) for x in X]
  Y = [np.random.normal(z/2,1) for z in Z]
  Y = Y/np.abs(Y)
  Z = Z/np.abs(Z)
  return X, Y, Z

def sample_from_model3(n=100):
  X = np.random.normal(0,1,n)
  Y = np.random.normal(0,1,n)
  Y = Y/np.abs(Y)
  X = X/np.abs(X)
  Z = [np.random.normal((X[i]+Y[i])/2,1) for i in range(len(X))]
  Z = Z/np.abs(Z)
  return X, Y, Z


### a)

* Independent models: 3
* Conditionally independent models: 1, 2


### b)

In [9]:
#CMI for discrete X and Y
def CMI_d(X, Y, Z):
  CMI = 0
  for z in np.unique(Z):
    X_filtered = X[Z==z]
    Y_filtered = Y[Z==z]
    prob = np.mean(Z==z)
    MI = mutual_info_score(X_filtered, Y_filtered)
    CMI += MI*prob
  return CMI


for model in [sample_from_model1, sample_from_model2, sample_from_model3]:
  X, Y, Z = model(n=1000)
  print(f"____________________________{model.__name__}____________________________")
  print(f"Mutual information: {mutual_info_score(X, Y)}")
  print(f"Conditional mutual information: {CMI_d(X, Y, Z)}")
  print("                     ")

____________________________sample_from_model1____________________________
Mutual information: 0.015570393197281707
Conditional mutual information: 0.0008420563685527838
                     
____________________________sample_from_model2____________________________
Mutual information: 0.03009052210962676
Conditional mutual information: 0.00834095623694324
                     
____________________________sample_from_model3____________________________
Mutual information: 0.0016415720008549628
Conditional mutual information: 0.018486657925638528
                     


### c)

In [11]:
# CI test based on CMI and asymptotics
def indep_test_asymptotic_d(X, Y, Z):

    stat_value = 2*len(X)*CMI_d(X, Y, Z)



    df =  len(np.unique(Z))

    p_value = 1 - chi2.cdf(stat_value, df=df)

    return stat_value, p_value
    # CI test based on CMI and permutations
def indep_test_permutation_d(X, Y, Z, B):

    stat_value = CMI_d(X, Y, Z)

    condition_p_value = 0

    for b in range(B):
      X_b = []
      for z in np.unique(Z):
        X_b += list(np.random.permutation(X[Z==z]))

      stat_value_b = CMI_d(np.array(X_b), Y, Z)

      if stat_value <= stat_value_b:
          condition_p_value += 1

    p_value = (1 + condition_p_value)/(1 + B)

    return 2*len(X)*stat_value, p_value


for model in [sample_from_model1, sample_from_model2, sample_from_model3]:
  X, Y, Z = model(n=1000)
  print(f"____________________________{model.__name__}____________________________")
  print(f"CMI: {CMI_d(X,Y,Z)}")
  print(f"Permutation test conditional: {indep_test_permutation_d(X, Y, Z, B=100)[1]}")
  print(f"Aymptotic test conditional: {indep_test_asymptotic_d(X, Y, Z)[1]}")
  print(f"MI: {mutual_info_score(X,Y)}")
  print(f"Permutation test: {indep_test_permutation_dep(X, Y, B=100)[1]}")
  print(f"Aymptotic test: {indep_test_asymptotic_dep(X, Y, stat='mi')[1]}")
  print("                     ")

____________________________sample_from_model1____________________________
CMI: 8.931900842347145e-05
Permutation test conditional: 0.9405940594059405
Aymptotic test conditional: 0.9145537766756151
MI: 0.016063013281034044
Permutation test: 0.009900990099009901
Aymptotic test: 1.4448864882332657e-08
                     
____________________________sample_from_model2____________________________
CMI: 0.0021422514592079633
Permutation test conditional: 0.12871287128712872
Aymptotic test conditional: 0.11739024591866021
MI: 0.014180073279656369
Permutation test: 0.009900990099009901
Aymptotic test: 1.007175212119904e-07
                     
____________________________sample_from_model3____________________________
CMI: 0.007715601592188533
Permutation test conditional: 0.009900990099009901
Aymptotic test conditional: 0.0004458171836136948
MI: 0.00020794332689916262
Permutation test: 0.6039603960396039
Aymptotic test: 0.5189966929491698
                     
