In [1]:
from sklearn.metrics import mutual_info_score
#from sklearn.feature_selection import chi2
from scipy.stats import chi2_contingency
from scipy.stats import chi2
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde, pearsonr
import seaborn as sns
import pandas as pd

# Mathematical Underpinnings - Lab 4

Tests to verify hipoteses of independence (from Lab 4):

In [2]:
def indep_test_asymptotic(X, Y, stat):

    if stat == "mi":

        stat_value = 2*len(X)*mutual_info_score(X, Y)

    if stat == "chi2":

        test_res = (chi2_contingency(pd.crosstab(X, Y)))
        stat_value = test_res.statistic

    df = (len(np.unique(X)) - 1)*(len(np.unique(Y)) - 1)

    p_value = 1 - chi2.cdf(stat_value, df=df)

    return stat_value, p_value

def indep_test_permutation(X, Y, B, stat="mi"):

    stat_value = mutual_info_score(X, Y)

    condition_p_value = 0
    for b in range(B):
        X_b = np.random.permutation(X)

        stat_value_b = mutual_info_score(X_b, Y)

        if stat_value <= stat_value_b:
            condition_p_value += 1

    p_value = (1 + condition_p_value)/(1 + B)

    return 2*len(X)*stat_value, p_value

## Task 1

In [32]:
from sklearn.metrics import mutual_info_score

def cmi(x, y, z):
    """
    Computes the Conditional Mutual Information (CMI) between three random variables X, Y, and Z.
    
    Parameters:
        x: list or array-like object containing values of random variable X
        y: list or array-like object containing values of random variable Y
        z: list or array-like object containing values of random variable Z
    
    Returns:
        cmi_value: the value of Conditional Mutual Information (CMI) between X, Y, and Z
    """
    # Combine x, y, and z into tuples (x_i, y_i, z_i)
    xyz = list(zip(x, y, z))

    # Count occurrences of each z_i value
    z_counts = {}
    for _, _, z_i in xyz:
        z_counts[z_i] = z_counts.get(z_i, 0) + 1

    # Compute total number of samples
    total_samples = len(x)

    # Compute conditional mutual information
    cmi_value = 0
    for z_i, count in z_counts.items():
        # Compute conditional probability p(z_i)
        p_z = count / total_samples
        # Filter x, y pairs where z equals z_i
        xy_given_z = [(x_i, y_i) for x_i, y_i, z_i_temp in xyz if z_i_temp == z_i]
        # Extract x and y from filtered pairs
        x_given_z, y_given_z = zip(*xy_given_z)
        # Compute mutual information I(x, y | z_i)
        mi = mutual_info_score(x_given_z, y_given_z)
        # Accumulate CMI
        cmi_value += p_z * mi
        
    return cmi_value

# Example usage:
X = [1, 0, 1, 0, 1]
Y = [0, 1, 1, 0, 1]
Z = [1, 0, 1, 0, 0]

result = cmi(X, Y, Z)
print("CMI(X;Y|Z) =", result)

CMI(X;Y|Z) = 0.10464962875290966


### a)

In [34]:
# CI test based on CMI and asymptotics
def asymptotic_test_cmi(x, y, z):
    """
    Performs an asymptotic test of conditional independence between random variables X and Y given Z.
    
    Parameters:
        x: list or array-like object containing values of random variable X
        y: list or array-like object containing values of random variable Y
        z: list or array-like object containing values of random variable Z
    
    Returns:
        test_statistic: the test statistic value
        p_value: the p-value for the test
    """
    # Compute the CMI between X and Y given Z
    cmi_value = cmi(x, y, z)
    
    # Compute the number of samples
    n = len(x)
    
    # Determine the degrees of freedom for the chi-squared distribution
    df = (len(set(x)) - 1) * (len(set(y)) - 1) * len(set(z))
    
    # Compute the test statistic
    test_statistic = 2 * n * cmi_value
    
    # Compute the p-value
    p_value = 1 - chi2.cdf(test_statistic, df)
    
    return test_statistic, p_value

# Example usage:
X = [1, 0, 1, 0, 1]
Y = [0, 1, 1, 1, 0]
Z = [1, 0, 1, 0, 0]

test_statistic, p_value = asymptotic_test_cmi(X, Y, Z)
print("Test Statistic:", test_statistic)
print("p-value:", p_value)

Test Statistic: 3.819085009768877
p-value: 0.14814814814814814


### b)

In [89]:
# CI test based on CMI and permutations
def conditional_permutation_test(x, y, z, B=100):
    """
    Performs a conditional permutation test for conditional independence between random variables X and Y given Z.
    
    Parameters:
        x: list or array-like object containing values of random variable X
        y: list or array-like object containing values of random variable Y
        z: list or array-like object containing values of random variable Z
        B: number of permutations (default is 100)
    
    Returns:
        p_value: the p-value for the test
    """
    # Compute the observed CMI
    observed_cmi = cmi(x, y, z)
    
    # Initialize count of permuted CMIs less than or equal to observed CMI
    count_permuted_cmi_bi_observed = 0
    
    # Iterate over permutations
    for _ in range(B):
        # Shuffle the values of x while preserving the grouping by z
        permuted_x = np.zeros(len(x))
        unique_z_values = np.unique(z)
        for z_value in unique_z_values:
            indices = np.where(np.array(z) == z_value)[0]
            permuted_indices = np.random.permutation(indices)
            permuted_x[indices] = np.array(x)[permuted_indices]
        
        # Compute CMI for permuted sample
        permuted_cmi = cmi(permuted_x, y, z)
        
        # Check if permuted CMI is bigger than or equal to observed CMI
        if observed_cmi <= permuted_cmi:
            count_permuted_cmi_bi_observed += 1
    
    # Compute p-value
    p_value = (1 + count_permuted_cmi_bi_observed) / (1 + B)
    
    return observed_cmi, p_value

# Example usage:
X = [1, 0, 1, 0, 1]
Y = [0, 1, 1, 1, 0]
Z = [1, 0, 1, 0, 0]

observed_cmi, p_value = conditional_permutation_test(X, Y, Z)
print("Test Statistic:", observed_cmi)
print("p-value:", p_value)


Test Statistic: 0.3819085009768877
p-value: 0.36633663366336633


### c)

conditionaly independent

In [90]:
def generate_independent_samples(sample_size):
    """
    Generate synthetic samples for variables X, Y, and Z.
    
    Parameters:
        sample_size: number of samples to generate
    
    Returns:
        x: list containing values of random variable X
        y: list containing values of random variable Y
        z: list containing values of random variable Z
    """
    x = np.random.randint(0, 2, size=sample_size)
    y = np.random.randint(0, 2, size=sample_size)
    z = np.random.randint(0, 2, size=sample_size)
    
    return x.tolist(), y.tolist(), z.tolist()
    
# Generate larger samples
sample_size = 1000
X, Y, Z = generate_samples(sample_size)

# Perform asymptotic permutation test
test_statistic, p_value = asymptotic_test_cmi(X, Y, Z)
print("Asymptotic Test Statistic:", test_statistic)
print("Asymptotic p-value:", p_value)

# Perform conditional permutation test
observed_cmi, p_value = conditional_permutation_test(X, Y, Z)
print("Conditional Test Statistic:", observed_cmi)
print("Conditional p-value:", p_value)

Asymptotic Test Statistic: 0.050627169180975096
Asymptotic p-value: 0.9750041178176528
Conditional Test Statistic: 2.531358459048755e-05
Conditional p-value: 0.9900990099009901


conditionaly dependent

In [91]:
def sample_dependent_distribution(n_samples):
    """
    Draw a sample from a distribution where X and Y are conditionally dependent given Z.
    
    Parameters:
        n_samples: number of samples to draw
        
    Returns:
        x: list containing values of random variable X
        y: list containing values of random variable Y
        z: list containing values of random variable Z
    """
    x = np.random.randint(0, 2, size=sample_size)
    y = 1 - x # negative of x
    z = np.random.randint(0, 2, size=sample_size)
    
    return x.tolist(), y.tolist(), z.tolist()

# Example usage:
n_samples = 1000
X, Y, Z = sample_dependent_distribution(n_samples)

# Perform asymptotic permutation test
test_statistic, p_value = asymptotic_test_cmi(X, Y, Z)
print("Asymptotic Test Statistic:", test_statistic)
print("Asymptotic p-value:", p_value)

# Perform conditional permutation test
observed_cmi, p_value = conditional_permutation_test(X, Y, Z)
print("Conditional Test Statistic:", observed_cmi)
print("Conditional p-value:", p_value)

Asymptotic Test Statistic: 1386.0945747775522
Asymptotic p-value: 0.0
Conditional Test Statistic: 0.6930472873887761
Conditional p-value: 0.009900990099009901


## Task 2

In [76]:
def sample_from_model1():
    pass

def sample_from_model2():
    pass

def sample_from_model3():
    pass

### a)

answer:

### b)

### c)