In [None]:
import numpy as np
import math
import matplotlib.pyplot as plt
from math import log2
import pandas as pd
from matplotlib.collections import LineCollection
import matplotlib.patches as mpatches
from itertools import product



# Seed handed to the dataset generators below (they call np.random.seed(s)).
s=1985

# CREATING METRICS

In [None]:
def entropy(X):
  """Return the empirical Shannon entropy (in bits) of a discrete sample.

  Args:
      X: Array-like of observations; every distinct value is one outcome.
         Multi-dimensional input is flattened by np.unique.

  Returns:
      The plug-in estimate -sum(p * log2(p)), where p holds the relative
      frequency of each distinct value in X.
  """
  # np.unique only reports values that occur at least once, so every
  # probability is strictly positive and log2 needs no zero guard.
  _, counts = np.unique(X, return_counts=True)

  # Normalise by the number of counted elements so the frequencies always
  # sum to one, even when X is not one-dimensional.
  p = counts / counts.sum()

  return -np.sum(p * np.log2(p))

# Example usage: entropy of `size` integers drawn uniformly from [0, r).
# Seed the draw so the printed value is the same on every Restart & Run All.
np.random.seed(s)
size=1000
r=1000

X1 = np.random.randint(0, r, size)
entropy_value = entropy(X1)
print(f"The entropy of the sample is: {entropy_value}")


The entropy of the sample is: 9.116178169653605


In [None]:
def entropy_continuous(X, num_bins=10):
    """Estimates the entropy (in bits) of a continuous variable using equal-width binning.

    The sample range [min(X), max(X)] is split into ``num_bins`` equal-width
    bins and the Shannon entropy of the bin frequencies is returned. No
    bin-width correction is applied, so the value is the entropy of the
    discretized variable rather than a differential entropy.

    Args:
        X: An array-like object containing the data samples.
        num_bins: Number of bins to use for discretization.

    Returns:
        float: The entropy of the binned sample; 0.0 when all samples are equal.

    Raises:
        ValueError: If num_bins is not positive.
    """

    if num_bins <= 0:
       raise ValueError("num_bins must be a positive integer")

    X = np.asarray(X)  # accept lists as well as arrays

    # 1. Find value range and create bins
    data_min, data_max = X.min(), X.max()
    if data_max - data_min == 0:  # All data has the same value
        return 0.0

    # linspace always yields exactly num_bins + 1 edges with the last edge equal
    # to data_max; arange with a float step can produce one edge too many.
    bins = np.linspace(data_min, data_max, num_bins + 1)

    # 2. Calculate bin frequencies
    bin_counts = np.histogram(X, bins=bins)[0]
    p = bin_counts / X.size

    # 3. Empty bins contribute nothing (0 * log 0 is taken as 0), so drop them
    #    instead of substituting a tiny probability that skews the sum.
    p = p[p > 0]
    return -np.sum(p * np.log2(p))


In [None]:
def probability(X):
  """Return the relative frequency of every distinct value in X.

  Args:
      X: Sized iterable of hashable observations.

  Returns:
      dict mapping each observed value to (occurrences of value) / len(X).
  """
  n_samples = len(X)
  occurrences = {}
  for value in X:
    if value in occurrences:
      occurrences[value] += 1
    else:
      occurrences[value] = 1

  frequencies = {}
  for value, n_seen in occurrences.items():
    frequencies[value] = n_seen / n_samples
  return frequencies



In [None]:
def joint_probability(X, Y):
    """Empirical joint distribution of the paired samples (X[i], Y[i]).

    Args:
        X: Sized iterable of hashable observations.
        Y: Iterable of hashable observations paired position-wise with X.

    Returns:
        dict mapping each observed (x, y) pair to its count divided by len(X).
    """
    n_samples = len(X)

    # Tally how often each pair shows up.
    tally = {}
    for observed in zip(X, Y):
        tally.setdefault(observed, 0)
        tally[observed] += 1

    # Turn the tallies into relative frequencies.
    return {observed: hits / n_samples for observed, hits in tally.items()}


In [None]:
def joint_probability3(X, Y, Z):
    """Empirical joint distribution of the position-wise triples (X[i], Y[i], Z[i]).

    Args:
        X: Sized iterable of hashable observations.
        Y, Z: Iterables paired position-wise with X.

    Returns:
        dict mapping each observed (x, y, z) triple to its count divided by len(X).
    """
    n_samples = len(X)

    tally = {}
    for observed in zip(X, Y, Z):
        if observed in tally:
            tally[observed] += 1
        else:
            tally[observed] = 1

    frequencies = {}
    for observed, hits in tally.items():
        frequencies[observed] = hits / n_samples
    return frequencies

In [None]:
def probability_of_x_given_y(X, Y):
    """Empirical conditional distribution p(x | y) from paired samples.

    Args:
        X: Iterable of hashable observations.
        Y: Iterable of hashable observations paired position-wise with X.

    Returns:
        Nested dict ``{y: {x: p(x | y)}}`` where p(x | y) is the number of
        pairs (x, y) divided by the number of pairs whose second element is y.
    """
    # tallies[y][x] = number of times x was seen together with y
    tallies = {}
    for x_val, y_val in zip(X, Y):
        per_y = tallies.setdefault(y_val, {})
        per_y[x_val] = per_y.get(x_val, 0) + 1

    # Normalise every inner tally by the number of samples sharing that y.
    conditional = {}
    for y_val, per_y in tallies.items():
        n_given_y = sum(per_y.values())
        conditional[y_val] = {x_val: hits / n_given_y for x_val, hits in per_y.items()}
    return conditional



In [None]:

def conditional_entropy(X, Y):
    """Conditional entropy H(X | Y) in bits, from empirical frequencies.

    Accumulates -p(x, y) * log2(p(x, y) / p(y)) over every observed (x, y) pair.
    """
    p_xy = joint_probability(X, Y)
    p_y = probability(Y)
    h_x_given_y = 0.0
    for (_, y_val), p_pair in p_xy.items():
      ratio = p_pair / p_y[y_val]  # equals the empirical p(x | y)
      h_x_given_y -= p_pair * math.log(ratio, 2)
    return h_x_given_y


In [None]:
def joint_entropy(X, Y):
    """Joint entropy H(X, Y) in bits of the paired samples.

    Accumulates -p * log2(p) over the empirical joint probabilities of (x, y).
    """
    h_xy = 0.0
    for p_pair in joint_probability(X, Y).values():
      h_xy = h_xy - p_pair * math.log(p_pair, 2)
    return h_xy


In [None]:
def joint_entropy3(X,Y,Z):
    """Joint entropy H(X, Y, Z) in bits of the position-wise triples.

    Accumulates -p * log2(p) over the empirical joint probabilities of (x, y, z).
    """
    h_xyz = 0.0
    for p_triple in joint_probability3(X, Y, Z).values():
        log_p = math.log2(p_triple)
        h_xyz -= p_triple * log_p
    return h_xyz

In [None]:
def mutual_information(X,Y):
  """Mutual information I(X; Y) = H(X) - H(X | Y), in bits."""
  h_x = entropy(X)
  h_x_given_y = conditional_entropy(X, Y)
  return h_x - h_x_given_y


In [None]:
def mutual_information3(X,Y,Z):
  """Three-way interaction term built from marginal, pairwise and triple entropies.

  Returns the negative of
      H(X) + H(Y) + H(Z) - H(X,Y) - H(X,Z) - H(Y,Z) + H(X,Y,Z).

  NOTE(review): the sign convention for three-variable interaction measures
  differs between authors; confirm the leading minus is the intended one.
  """
  h_x = entropy(X)
  h_y = entropy(Y)
  h_z = entropy(Z)
  h_xy = joint_entropy(X, Y)
  h_xz = joint_entropy(X, Z)
  h_yz = joint_entropy(Y, Z)
  h_xyz = joint_entropy3(X, Y, Z)
  inclusion_exclusion = h_x + h_y + h_z - h_xy - h_xz - h_yz + h_xyz
  return -inclusion_exclusion

In [None]:
def conditional_mutual_information(X,Y,Z):
  """Conditional mutual information I(X; Y | Z) in bits.

  Uses the entropy identity
      I(X; Y | Z) = H(X, Z) + H(Y, Z) - H(X, Y, Z) - H(Z),
  which is non-negative for any empirical distribution. For example, with
  Y identical to X and Z independent of both, the result is H(X).

  Args:
      X, Y, Z: Equal-length sequences of discrete observations.

  Returns:
      The plug-in estimate of I(X; Y | Z).
  """
  return joint_entropy(X,Z)+joint_entropy(Y,Z)-joint_entropy3(X,Y,Z)-entropy(Z)

In [None]:
def mutual_informationX1X2Y(X1, X2, Y):
    """Mutual information I((X1, X2); Y) in bits between the input pair and Y.

    Accumulates p(x1, x2, y) * log2(p(x1, x2, y) / (p(y) * p(x1, x2))) over
    the observed (x1, x2, y) triples.

    Args:
        X1, X2, Y: Equal-length sequences of discrete observations.

    Returns:
        The plug-in mutual information estimate (0 for empty input).
    """
    PJ = joint_probability_List(X1, X2, Y)
    PX1X2 = joint_probability_List(X1, X2)
    PY = probability(Y)

    res = 0
    # Only observed triples have a non-zero joint probability, so iterating
    # over them gives the same terms as scanning every value combination,
    # without materialising len(X1) * len(X2) * len(Y) tuples.
    for (x1, x2, y), pj in PJ.items():
        res += pj * log2(pj/(PY[y] * PX1X2[(x1, x2)]))
    return res

In [None]:
def local_mutual_informationX1X2Y(X1,X2,Y, x1,x2,y):
  """Local (pointwise) mutual information between the pair (x1, x2) and y, in bits.

  Args:
      X1, X2, Y: Equal-length sequences of discrete observations used to
          estimate the probabilities.
      x1, x2, y: The specific realisation to evaluate.

  Returns:
      log2(p(x1, x2, y) / (p(y) * p(x1, x2))), or 0 when any of the three
      probabilities is zero (the combination was not observed).
  """
  PJ=joint_probability_List(X1,X2,Y)
  PX1X2=joint_probability_List(X1,X2)
  PY=probability(Y)
  pj=PJ.get((x1,x2,y), 0.0)
  py=PY.get(y, 0.0)
  px1x2=PX1X2.get((x1,x2), 0.0)
  if pj == 0 or py == 0 or px1x2 == 0:
    return 0
  return log2(pj/(py*px1x2))


In [None]:
def joint_probability_List(*args):
    """Empirical joint distribution of any number of position-wise paired variables.

    Args:
        *args: One or more sequences of hashable observations; the first must
            support len().

    Returns:
        dict mapping each observed tuple (a[i], b[i], ...) to its count
        divided by len(args[0]).
    """
    n_samples = len(args[0])

    tally = {}
    for combo in zip(*args):
        if combo not in tally:
            tally[combo] = 0
        tally[combo] += 1

    return {combo: hits / n_samples for combo, hits in tally.items()}

In [None]:
def joint_probability_List_continuous(*args, num_bins=10):
    """Calculates the approximate joint probability for continuous variables using binning.

    Every variable is discretized independently into ``num_bins`` equal-width
    bins spanning [min, max]. The bin edges follow the np.histogram
    convention: every bin is half-open except the last, which also contains
    the maximum. A variable whose values are all equal is placed in a single
    bin labelled 0; otherwise bins are labelled 1 .. num_bins.

    Args:
        *args:  Arrays of continuous values representing the different variables.
        num_bins: Number of bins to use for each variable.

    Returns:
        dict: A dictionary where keys are tuples representing bin combinations
              and values are the approximate joint probabilities.
    """

    # 1. Assign every value to a bin, one variable at a time
    binned_data = []
    for data in args:
        min_val, max_val = np.min(data), np.max(data)
        if max_val - min_val == 0:
            # Zero range: all values are identical, so they share one bin.
            binned_data.append(np.zeros_like(data))
        else:
            # linspace yields exactly num_bins + 1 edges ending at max_val.
            # Digitizing against the interior edges only keeps the maximum
            # inside the last bin instead of opening an extra one for it.
            edges = np.linspace(min_val, max_val, num_bins + 1)
            binned_data.append(np.digitize(data, edges[1:-1]) + 1)

    # 2. Count the bin combinations of the (now discrete) data
    counts = {}
    total_combos = len(binned_data[0])

    for key_tuple in zip(*binned_data):
        counts[key_tuple] = counts.get(key_tuple, 0) + 1
    joint_probs = {tup: count / total_combos for tup, count in counts.items()}

    return joint_probs


In [None]:
def joint_entropy_List(*args):
  """Joint entropy (in bits) of any number of position-wise paired discrete variables.

  Accumulates -p * log2(p) over the empirical joint probabilities returned by
  joint_probability_List.
  """
  h_joint = 0.0
  for p_combo in joint_probability_List(*args).values():
    log_p = math.log2(p_combo)
    h_joint -= p_combo * log_p
  return h_joint

In [None]:
def joint_entropy_List_continuous(*args):
  """Joint entropy (in bits) of continuous variables after binning.

  Accumulates -p * log2(p) over the binned joint probabilities returned by
  joint_probability_List_continuous with its default number of bins.
  """
  h_joint = 0.0
  for p_combo in joint_probability_List_continuous(*args).values():
    log_p = math.log2(p_combo)
    h_joint -= p_combo * log_p
  return h_joint

In [None]:
def conditional_entropy_List(X, *args):
    """Conditional entropy H(X | args...) in bits for discrete variables.

    Accumulates -p(x, rest) * log2(p(x, rest) / p(rest)) over every observed
    combination, where ``rest`` is the tuple of conditioning values.
    """
    p_full = joint_probability_List(X, *args)
    p_rest = joint_probability_List(*args)
    h_conditional = 0.0
    for combo, p_combo in p_full.items():
      conditioning = combo[1:]  # drop the leading x to get the conditioning tuple
      h_conditional -= p_combo * math.log(p_combo / p_rest[conditioning], 2)
    return h_conditional

In [None]:
def conditional_entropy_List_continuous(X, *args):
    """Conditional entropy H(X | args...) in bits for binned continuous variables.

    Same accumulation as the discrete variant, but both probability tables
    come from joint_probability_List_continuous.
    """
    p_full = joint_probability_List_continuous(X, *args)
    p_rest = joint_probability_List_continuous(*args)
    h_conditional = 0.0
    for combo, p_combo in p_full.items():
      conditioning = combo[1:]  # bin labels of the conditioning variables
      h_conditional -= p_combo * math.log(p_combo / p_rest[conditioning], 2)
    return h_conditional

In [None]:
def TC_List(*args):
  """Total correlation: sum of the marginal entropies minus the joint entropy (bits)."""
  marginal_total = 0
  for variable in args:
    marginal_total = marginal_total + entropy(variable)
  return marginal_total - joint_entropy_List(*args)

In [None]:
def TC_List_continuous(*args):
  """Total correlation of binned continuous variables.

  Sum of the binned marginal entropies minus the binned joint entropy (bits).
  """
  marginal_total = 0
  for variable in args:
    marginal_total = marginal_total + entropy_continuous(variable)
  return marginal_total - joint_entropy_List_continuous(*args)

In [None]:
def DTC_List(*args):
  """Dual total correlation of discrete variables.

  Joint entropy minus the sum, over every variable, of its entropy conditioned
  on all the other variables (bits).
  """
  residual_total = 0
  for position in range(len(args)):
        others = args[:position] + args[position+1:]
        residual_total = residual_total + conditional_entropy_List(args[position], *others)
  return joint_entropy_List(*args) - residual_total

In [None]:
def DTC_List_continuous(*args):
  """Dual total correlation of binned continuous variables.

  Binned joint entropy minus the sum, over every variable, of its binned
  entropy conditioned on all the other variables (bits).
  """
  residual_total = 0
  for position in range(len(args)):
        others = args[:position] + args[position+1:]
        residual_total = residual_total + conditional_entropy_List_continuous(args[position], *others)
  return joint_entropy_List_continuous(*args) - residual_total

In [None]:
def O_information(*args):
  """O-information of discrete variables: total correlation minus dual total correlation."""
  total_corr = TC_List(*args)
  dual_total_corr = DTC_List(*args)
  return total_corr - dual_total_corr

In [None]:
def O_information_continuous(*args):
  """O-information of binned continuous variables: TC minus DTC."""
  total_corr = TC_List_continuous(*args)
  dual_total_corr = DTC_List_continuous(*args)
  return total_corr - dual_total_corr

# IMPACT OF CORRELATION ON TC and DTC

In [None]:
def create_correlated_datasets(n, r, s, c):
    """
    Creates two integer datasets aiming at a given correlation.

    The global NumPy random generator is re-seeded with ``s`` on every call,
    so the same arguments always produce the same data.

    Args:
        n (int): Number of elements in each dataset.
        r (int): Range of the numbers in the dataset (0 to r, inclusive).
        s (int): Seed for the random number generator.
        c (float): The desired correlation coefficient (-1 to 1).

    Returns:
        tuple: A tuple containing the two datasets as distinct NumPy arrays.

    Raises:
        ValueError: If c is outside [-1, 1].

    Notes:
        For 0 < |c| < 1 the second dataset is ``c * x`` plus Gaussian noise,
        rounded and clipped to [0, r]; rounding and clipping mean the realised
        correlation can differ from c, so callers should measure it.
        NOTE(review): for negative c the values ``c * x`` are non-positive
        before clipping to [0, r], so strong negative correlations are
        presumably not reachable - confirm before using c < 0.
    """

    np.random.seed(s)

    # Ensure correlation coefficient is within valid range
    if not -1 <= c <= 1:
        raise ValueError("Correlation coefficient 'c' must be between -1 and 1.")

    # Generate a base dataset
    x = np.random.randint(0, r + 1, size=n)

    # Create a correlated dataset based on the desired correlation
    if c == 0:
        # No correlation, just generate another independent dataset
        y = np.random.randint(0, r + 1, size=n)
    elif c == 1:
        # Return a copy so that in-place edits of one output can never leak
        # into the other.
        y = x.copy()
    else:
        # Noise scale that gives corr(x, c*x + noise) == c before rounding
        sigma_y = np.std(x) * np.sqrt(1 - c**2)

        # Generate the noise
        noise = np.random.normal(scale=sigma_y, size=n)

        # Create the correlated dataset and map it back onto the integer range
        y = c * x + noise
        y = np.round(y).astype(int)
        y = np.clip(y, 0, r)

    return x, y

In [None]:
def create_correlated_binary_datasets(n, s, c):
    """Create two binary (0/1) datasets of length n from seed s.

    The global NumPy generator is re-seeded with ``s``. Two fair-coin vectors
    are always drawn first. For c == 0 they define the result: x is the first
    vector and y copies or flips x according to the second. For any other c
    both outputs are replaced by the signs of a bivariate normal sample with
    unit variances and covariance c.

    Args:
        n: Number of samples.
        s: Seed for np.random.seed.
        c: Covariance used for the latent normal sample (0 selects the
           coin-flip construction).

    Returns:
        tuple: (x, y) arrays of zeros and ones.
    """
    np.random.seed(s)

    # These two draws happen for every c so the random stream stays aligned.
    coin = np.random.binomial(1, 0.5, n)
    keep = np.random.binomial(1, 0.5, n)

    if c == 0:
        return coin, np.where(keep, coin, 1 - coin)

    covariance = np.array([[1, c], [c, 1]])
    latent = np.random.multivariate_normal([0, 0], covariance, size=n).T
    signs = np.where(latent > 0, 1, 0)
    return signs[0], signs[1]

# GENERATING DATA

In [None]:
def generate_correlation_table_weights(n, t_c, t_w, r,s):
    """
    Generates a table of TC, DTC and O-information for AND/XOR blends over a grid of correlations.

    For every weight w_AND and every target correlation, X1 and X2 come from
    create_correlated_datasets and Y is the rounded blend
    w_AND * (X1 AND X2) + (1 - w_AND) * (X1 XOR X2).

    Args:
        n (int): Number of samples in X1 and X2.
        t_c (int): Number of correlation values (evenly spaced in [0, 1]).
        t_w (int): Number of w_AND weights (evenly spaced in [0, 1]).
        r (int): Range for the values in X1 and X2 (0 to r).
        s (int): Seed passed to the dataset generator.

    Returns:
        pandas.DataFrame: One row per (w_AND, correlation) pair with the
                          measured X1-X2, X1-Y and X2-Y correlations and the
                          DTC, TC and OINF (= TC - DTC) values.
    """
    corrs = np.linspace(0, 1, t_c)  # Uniformly spaced correlation coefficients
    weights= np.linspace(0, 1, t_w)
    results = []
    for w_AND in weights:
      for corr in corrs:
        X1, X2 = create_correlated_datasets(n, r, s, corr)
        Y=w_AND*np.logical_and(X1,X2)+(1-w_AND)*np.logical_xor(X1,X2)
        Y=Y.round()

        true_corr = np.corrcoef(X1, X2)[0, 1]  # Realised correlation between X1 and X2
        X1_Y_corr=np.corrcoef(X1, Y)[0, 1]
        X2_Y_corr=np.corrcoef(X2, Y)[0, 1]
        dtc = DTC_List_continuous(X1, X2, Y)
        tc= TC_List_continuous(X1,X2,Y)

        results.append({'w_AND': w_AND, 'X1-X2 Correlation': true_corr, 'X1-Y Correlation': X1_Y_corr,'X2-Y Correlation': X2_Y_corr,'DTC': dtc, 'TC': tc, 'OINF': tc-dtc})

    # Build the table once from the collected rows.
    return pd.DataFrame(results)

In [None]:
def generate_correlation_table_function(n, t_c, r,s):
    """
    Generates a table of TC, DTC and O-information for a random target Y over a grid of correlations.

    NOTE: a later cell of this notebook defines a function with the same name
    and an extra ``f`` parameter; once that cell runs it replaces this one.

    Args:
        n (int): Number of samples in X1 and X2.
        t_c (int): Number of correlation values (evenly spaced in [0, 1]).
        r (int): Range for the values in X1, X2 and Y (0 to r).
        s (int): Seed passed to the dataset generator.

    Returns:
        pandas.DataFrame: One row per correlation value with the measured
                          X1-X2, X1-Y and X2-Y correlations and the DTC, TC
                          and OINF (= TC - DTC) values.
    """
    corrs = np.linspace(0, 1, t_c)  # Uniformly spaced correlation coefficients
    results = []
    for corr in corrs:
        X1, X2 = create_correlated_datasets(n, r, s, corr)
        # Y is drawn independently of X1 and X2: uniform integers in [0, r].
        Y=np.random.randint(0, r + 1, size=n)

        true_corr = np.corrcoef(X1, X2)[0, 1]  # Realised correlation between X1 and X2
        X1_Y_corr=np.corrcoef(X1, Y)[0, 1]
        X2_Y_corr=np.corrcoef(X2, Y)[0, 1]
        dtc = DTC_List_continuous(X1, X2, Y)
        tc= TC_List_continuous(X1,X2,Y)
        results.append({'X1-X2 Correlation': true_corr, 'X1-Y Correlation': X1_Y_corr,'X2-Y Correlation': X2_Y_corr,'DTC': dtc, 'TC': tc, 'OINF': tc-dtc})

    return pd.DataFrame(results)

In [None]:
def plot_correlation_vs_dtc_tc_oinf(df, figsize=(15, 4)):
    """
    Draws one figure per w_AND value with three scatter panels:
    'X1-X2 Correlation' against 'DTC', 'TC' and 'OINF'. All panels share
    the fixed y-axis range [-1, 3].

    Args:
        df (pandas.DataFrame): Results with columns 'w_AND',
            'X1-X2 Correlation', 'DTC', 'TC' and 'OINF'.
        figsize (tuple, optional): The size of each figure. Defaults to (15, 4).
    """
    y_low, y_high = -1, 3
    measures = ('DTC', 'TC', 'OINF')

    for _, subset in df.groupby('w_AND'):
        weight = subset['w_AND'].iloc[0]

        # One row of three panels, one per measure
        fig, axes = plt.subplots(1, 3, figsize=figsize)
        for axis, measure in zip(axes, measures):
            axis.scatter(subset['X1-X2 Correlation'], subset[measure])
            axis.set_xlabel('X1-X2 Correlation')
            axis.set_ylabel(measure)
            axis.set_title(f'{measure}: w_AND = {weight:.2f}')
            axis.set_ylim(y_low, y_high)

        plt.show()

In [None]:
def plot_correlation_vs_dtc_tc_oinf_function(df, figsize=(15, 4)):
    """
    Draws a single figure with three scatter panels: 'X1-X2 Correlation'
    against 'DTC', 'TC' and 'OINF'. All panels share the fixed y-axis
    range [-1, 3].

    The input table has no w_AND grouping, so the panel titles name the
    measure only.

    Args:
        df (pandas.DataFrame): Results with columns 'X1-X2 Correlation',
            'DTC', 'TC' and 'OINF'.
        figsize (tuple, optional): The size of the figure. Defaults to (15, 4).
    """
    ylim_min = -1
    ylim_max = 3
    # Create figure and subplots
    fig, axes = plt.subplots(1, 3, figsize=figsize)

    for ax, measure in zip(axes, ('DTC', 'TC', 'OINF')):
        ax.scatter(df['X1-X2 Correlation'], df[measure])
        ax.set_xlabel('X1-X2 Correlation')
        ax.set_ylabel(measure)
        ax.set_title(measure)
        # Same y-axis range on every panel so they can be compared directly
        ax.set_ylim(ylim_min, ylim_max)

    plt.show()

In [None]:
#generate_correlation_table_function(1000, 20, 1,s)

# LOCAL MEASUREMENTS


In [None]:
def local_TC(X,Y,Z):
  """Local (per-sample) total correlation of three discrete variables, in bits.

  For every sample (x, y, z) the value is
      -(log2 p(x) + log2 p(y) + log2 p(z)) + log2 p(x, y, z),
  using empirical probabilities.

  Returns:
      list with one value per sample, in sample order.
  """
  p_x = probability(X)
  p_y = probability(Y)
  p_z = probability(Z)
  p_xyz = joint_probability_List(X, Y, Z)
  return [
      -(np.log2(p_x[x]) + np.log2(p_y[y]) + np.log2(p_z[z])) + np.log2(p_xyz[(x, y, z)])
      for x, y, z in zip(X, Y, Z)
  ]


In [None]:
def local_DTC(X,Y,Z):
  """Local (per-sample) dual total correlation of three discrete variables, in bits.

  For every sample (x, y, z) the value is
      -log2 p(x, y, z) + log2 p(x | y, z) + log2 p(y | z, x) + log2 p(z | x, y),
  using empirical probabilities.

  All counts are gathered in one pass over the data, so each sample is
  evaluated with dictionary lookups instead of a fresh table scan.

  Args:
      X, Y, Z: Equal-length sequences of hashable discrete observations.

  Returns:
      list with one value per sample, in sample order.
  """
  samples = list(zip(X, Y, Z))
  total = len(X)

  # Occurrence counts of the full triples and of each pair of conditioning values
  n_xyz, n_yz, n_xz, n_xy = {}, {}, {}, {}
  for x, y, z in samples:
    n_xyz[(x, y, z)] = n_xyz.get((x, y, z), 0) + 1
    n_yz[(y, z)] = n_yz.get((y, z), 0) + 1
    n_xz[(x, z)] = n_xz.get((x, z), 0) + 1
    n_xy[(x, y)] = n_xy.get((x, y), 0) + 1

  result=[]
  for x, y, z in samples:
    joint = n_xyz[(x, y, z)]
    # p(x | y, z) = count(x, y, z) / count(y, z), and likewise for y and z
    i = (-np.log2(joint / total)
         + np.log2(joint / n_yz[(y, z)])
         + np.log2(joint / n_xz[(x, z)])
         + np.log2(joint / n_xy[(x, y)]))
    result.append(i)
  return result

In [None]:
def local_mutual_informationX1X2Y(X1,X2,Y, x1,x2,y):
  """Local mutual information between the pair (x1, x2) and y, in bits.

  NOTE: a function with this name is also defined in an earlier cell of the
  notebook; this definition replaces it when the cell runs.

  Returns:
      log2(p(x1, x2, y) / (p(y) * p(x1, x2))) from empirical probabilities,
      or 0 when any of the three probabilities is zero.
  """
  p_joint = joint_probability_List(X1, X2, Y).get((x1, x2, y), 0.0)
  p_inputs = joint_probability_List(X1, X2).get((x1, x2), 0.0)
  p_target = probability(Y).get(y, 0.0)
  if p_joint == 0 or p_target == 0 or p_inputs == 0:
    return 0
  return log2(p_joint / (p_target * p_inputs))


In [None]:
def local_mutual_X1X2Y(X1,X2,Y):
  """Local mutual information between (x1, x2) and y for every sample, in bits.

  Each sample contributes log2(p(x1, x2, y) / (p(y) * p(x1, x2))), using
  empirical probabilities. A sample is skipped if any of the three
  probabilities is zero.

  Returns:
      list of local values in sample order.
  """
  p_joint_table = joint_probability_List(X1, X2, Y)
  p_inputs_table = joint_probability_List(X1, X2)
  p_target_table = probability(Y)

  local_values = []
  for triple in zip(X1, X2, Y):
    p_joint = p_joint_table.get(triple, 0.0)
    p_target = p_target_table.get(triple[2], 0.0)
    p_inputs = p_inputs_table.get(triple[:2], 0.0)
    if p_joint == 0 or p_target == 0 or p_inputs == 0:
      continue
    local_values.append(log2(p_joint / (p_target * p_inputs)))
  return local_values

In [None]:
def conditional_probability(X,Y,Z, x_value, y_value, z_value):
  """
  Empirical conditional probability P(X = x_value | Y = y_value, Z = z_value).

  Args:
      X, Y, Z: Equal-length sequences of observations.
      x_value: The value of X for which to calculate the probability.
      y_value: The value of Y to condition on.
      z_value: The value of Z to condition on.

  Returns:
      The fraction of samples with Y == y_value and Z == z_value whose X
      equals x_value, or 0 when no sample matches the condition.
  """
  table = pd.DataFrame({'X':X, 'Y': Y, 'Z': Z})

  # Rows that satisfy the conditioning event
  matches_condition = table['Y'].eq(y_value) & table['Z'].eq(z_value)
  if not matches_condition.any():
    return 0

  # Share of those rows in which X takes the requested value
  return table.loc[matches_condition, 'X'].eq(x_value).mean()


In [None]:
def generate_correlation_table_weights_local(n, t_c, t_w, r,s):
    """
    Generates a table of global and local TC/DTC statistics for AND/XOR blends.

    For every weight w_AND and every target correlation, X1 and X2 come from
    create_correlated_datasets and Y is the rounded blend
    w_AND * (X1 AND X2) + (1 - w_AND) * (X1 XOR X2). Each w_AND value is
    printed as a progress indicator.

    Args:
        n (int): Number of samples in X1 and X2.
        t_c (int): Number of correlation values (evenly spaced in [0, 1]).
        t_w (int): Number of w_AND weights (evenly spaced in [0, 1]).
        r (int): Range for the values in X1 and X2 (0 to r).
        s (int): Seed passed to the dataset generator.

    Returns:
        pandas.DataFrame: One row per (w_AND, correlation) pair holding the
                          measured correlations, the global DTC/TC/OINF
                          values, summary statistics of the local DTC/TC
                          values, the local value lists and their value sets.
    """
    corrs = np.linspace(0, 1, t_c)  # Uniformly spaced correlation coefficients
    weights= np.linspace(0, 1, t_w)
    results = []
    for w_AND in weights:
      print(w_AND)
      for corr in corrs:
        X1, X2 = create_correlated_datasets(n, r, s, corr)
        Y=w_AND*np.logical_and(X1,X2)+(1-w_AND)*np.logical_xor(X1,X2)
        Y=Y.round()

        true_corr = np.corrcoef(X1, X2)[0, 1]  # Realised correlation between X1 and X2
        X1_Y_corr=np.corrcoef(X1, Y)[0, 1]
        X2_Y_corr=np.corrcoef(X2, Y)[0, 1]
        dtc = DTC_List_continuous(X1, X2, Y)
        tc= TC_List_continuous(X1,X2,Y)
        tc_local=local_TC(X1,X2,Y)
        dtc_local=local_DTC(X1,X2,Y)

        results.append({
            'w_AND': w_AND,
            'X1-X2 Correlation': true_corr,
            'X1-Y Correlation': X1_Y_corr,
            'X2-Y Correlation': X2_Y_corr,
            'DTC': dtc,
            'DTC_AVG': np.mean(dtc_local),
            'DTC_MED': np.median(dtc_local),
            'DTC_Q1': np.percentile(dtc_local,25),
            'DTC_Q3': np.percentile(dtc_local,75),
            'TC': tc,
            'TC_AVG': np.mean(tc_local),
            'TC_MED': np.median(tc_local),
            'TC_Q1': np.percentile(tc_local,25),
            'TC_Q3': np.percentile(tc_local,75),
            'OINF': tc-dtc,
            'TC_STDV': np.std(tc_local),
            'DTC_STDV': np.std(dtc_local),
            'DTC_LOCAL': dtc_local,
            'TC_LOCAL': tc_local,
            'TC_SET': set(tc_local),
            'DTC_SET': set(dtc_local),
        })

    # Build the table once from the collected rows.
    return pd.DataFrame(results)

In [None]:
def generate_correlation_table_function(n, t_c, r,s,f):
    """
    Generates a table of global and local information measures for a chosen target function.

    For every target correlation, binary X1 and X2 come from
    create_correlated_binary_datasets and Y is built according to ``f``.
    Progress, a per-combination overview and the global measures are printed
    for every experiment.

    Args:
        n (int): Number of samples in X1 and X2.
        t_c (int): Number of correlation values (evenly spaced in [0, 1]).
        r (int): Upper bound of the random integers used when f == 3.
        s (int): Seed passed to the dataset generator.
        f (int): Target selector: 1 -> X1 AND X2, 2 -> X1 XOR X2,
            3 -> random integers in [0, r], 4 -> copy of X1. Y is clipped to
            [0, 1] in every case.

    Returns:
        pandas.DataFrame: One row per correlation value holding the data, the
                          measured correlations, the global TC/DTC/OINF/MI
                          values and the local values with their statistics.

    Raises:
        ValueError: If f is not one of 1, 2, 3, 4.
    """
    # Fail early with a clear message instead of hitting an unbound Y later.
    if f not in (1, 2, 3, 4):
        raise ValueError("f must be 1 (AND), 2 (XOR), 3 (random) or 4 (copy of X1)")

    print(f)
    corrs = np.linspace(0, 1, t_c)  # Uniformly spaced correlation coefficients
    results = []
    for corr in corrs:
        print("NEXT EXPERIMENT")
        print(f"corr X1_X2={corr:.2f}")
        X1, X2 = create_correlated_binary_datasets(n, s, corr)
        true_corr = np.corrcoef(X1, X2)[0, 1]  # Realised correlation between X1 and X2
        print(f"true_corr X1_X2={true_corr:.2f}")
        if f==1:
          Y=np.logical_and(X1,X2)
        elif f==2:
          Y=np.logical_xor(X1,X2)
        elif f==3:
          Y=np.random.randint(0, r + 1, size=n)
        else:
          Y=X1
        Y=np.clip(Y, 0, 1).round()

        X1_Y_corr=np.corrcoef(X1, Y)[0, 1]
        X2_Y_corr=np.corrcoef(X2, Y)[0, 1]
        dtc = DTC_List_continuous(X1, X2, Y)
        tc= TC_List_continuous(X1,X2,Y)
        mi=mutual_informationX1X2Y(X1, X2, Y)
        tc_local=local_TC(X1,X2,Y)
        dtc_local=local_DTC(X1,X2,Y)
        o_local=[a - b for a, b in zip(tc_local, dtc_local)]
        mi_local=local_mutual_X1X2Y(X1,X2,Y)

        results.append({
            'X1': X1,
            'X2': X2,
            'Y': Y,
            'X1-X2 Correlation': true_corr,
            'X1-Y Correlation': X1_Y_corr,
            'X2-Y Correlation': X2_Y_corr,
            'DTC': dtc,
            'DTC_AVG': np.mean(dtc_local),
            'DTC_MED': np.median(dtc_local),
            'DTC_Q1': np.percentile(dtc_local,25),
            'DTC_Q3': np.percentile(dtc_local,75),
            'TC': tc,
            'TC_AVG': np.mean(tc_local),
            'TC_MED': np.median(tc_local),
            'TC_Q1': np.percentile(tc_local,25),
            'TC_Q3': np.percentile(tc_local,75),
            'OINF': tc-dtc,
            'OINF_LOCAL': o_local,
            'TC_STDV': np.std(tc_local),
            'DTC_STDV': np.std(dtc_local),
            'DTC_LOCAL': dtc_local,
            'TC_LOCAL': tc_local,
            'TC_SET': set(tc_local),
            'DTC_SET': set(dtc_local),
            'OINF_SET': set(o_local),
            'MUTUAL_INF': mi,
            'MI_LOCAL': mi_local,
            'MI_SET': set(mi_local),
        })
        data = {
          'X1': X1,
          'X2': X2,
          'Y': Y,
          'tc_local': tc_local,
          'dtc_local': dtc_local,
          'mi_local': mi_local
        }
        df_overview=analyze_dataframe(pd.DataFrame(data))
        print(df_overview)
        print("TC=", f"{tc:.2f}")
        print("DTC=", f"{dtc:.2f}")
        print("O_INF=", f"{tc-dtc:.2f}")
        print("MUTUAL INF=", f"{mi:.2f}")
        print("\n\n")

    return pd.DataFrame(results)

In [None]:
def plot_correlation_vs_dtc_tc_spread(df, figsize=(10, 4)):
  """
  Creates one figure per w_AND value with two panels showing the spread of
  the local DTC and local TC values against 'X1-X2 Correlation'.

  Every distinct local value is drawn in blue; the global DTC / TC value of
  the same row is drawn in red. All figures are shown at the end.

  Args:
      df (pandas.DataFrame): Results with columns 'w_AND',
          'X1-X2 Correlation', 'DTC', 'TC', 'DTC_SET' and 'TC_SET'.
      figsize (tuple, optional): The size of each figure. Defaults to (10, 4).
  """
  for _, subset in df.groupby('w_AND'):
    heading = f"w_AND = {subset['w_AND'].iloc[0]}"

    fig, axes = plt.subplots(1, 2, figsize=figsize)

    for axis, measure in zip(axes, ('DTC', 'TC')):
      axis.set_title(heading)

      for _, record in subset.iterrows():
        corr_value = record['X1-X2 Correlation']
        # Local values first (blue), then the global value on top (red)
        for local_value in record[measure + '_SET']:
          axis.scatter(corr_value, local_value, color='blue')
        axis.scatter(corr_value, record[measure], color='red')

      axis.set_xlabel('X1-X2 Correlation')
      axis.set_ylabel(measure)

  plt.show()

In [None]:
def plot_correlation_vs_dtc_tc_oinf_mi_spread_function(df, figsize=(25, 4)):
  """
  Creates one figure with four panels showing the spread of the local DTC,
  TC, O-information and mutual-information values against
  'X1-X2 Correlation'.

  Every distinct local value is drawn in blue; the global value of the same
  row is drawn in red.

  Args:
      df (pandas.DataFrame): Results with columns 'X1-X2 Correlation',
          'DTC', 'TC', 'OINF', 'MUTUAL_INF' and the value sets 'DTC_SET',
          'TC_SET', 'OINF_SET', 'MI_SET'.
      figsize (tuple, optional): The size of the figure. Defaults to (25, 4).
  """
  # (panel title, column with local value set, column with global value, y label)
  panels = (
      ("DTC", 'DTC_SET', 'DTC', 'DTC'),
      ("TC", 'TC_SET', 'TC', 'TC'),
      ("OINF", 'OINF_SET', 'OINF', 'O-INF'),
      ("MUTUAL INF", 'MI_SET', 'MUTUAL_INF', 'MUTUAL INF'),
  )

  # Create figure and subplots
  fig, axes = plt.subplots(1, 4, figsize=figsize)

  for ax, (title, set_column, value_column, y_label) in zip(axes, panels):
    ax.set_title(title)

    for index, row in df.iterrows():
      x_value = row['X1-X2 Correlation']
      for y in row[set_column]:
        ax.scatter(x_value, y, color='blue')
      ax.scatter(x_value, row[value_column], color='red')

    # Label every panel exactly once, independent of the number of rows
    ax.set_xlabel('X1-X2 Correlation')
    ax.set_ylabel(y_label)

  plt.show()

In [None]:
def analyze_dataframe(df_data):
  """
  Summarises the local measures for every unique (X1, X2, Y) combination.

  Prints the header line "Analysis Results:" and returns the summary table.

  Args:
      df_data (pandas.DataFrame): Table with columns 'X1', 'X2', 'Y',
          'tc_local', 'dtc_local' and 'mi_local'.

  Returns:
      pandas.DataFrame: One row per unique combination with the columns
          - X1/X2/Y: The combination of values.
          - Count (%): Share of rows with that combination, in percent.
          - tc_local, dtc_local, mi_local: Mean of the respective column for
            the combination, formatted as a string with 2 decimals.
  """

  # Group by X1, X2, and Y for efficient calculations
  grouped_data = df_data.groupby(['X1', 'X2', 'Y'])

  # 'size' counts rows per group regardless of the column it is attached to,
  # so a value column is used rather than one of the grouping keys.
  results = grouped_data.agg(
      Count=('tc_local', 'size'),
      tc_local=('tc_local', 'mean'),
      dtc_local=('dtc_local', 'mean'),
      mi_local=('mi_local', 'mean')
  ).reset_index()

  # Normalize Count to percentage
  results['Count (%)'] = (results['Count'] / len(df_data)) * 100

  # Format the averaged columns as 2-decimal strings. Series.map is used
  # column by column because DataFrame.applymap is deprecated in recent pandas.
  for column in ('tc_local', 'dtc_local', 'mi_local'):
      results[column] = results[column].map("{:.2f}".format)

  # Reorder columns as requested
  results = results[['X1', 'X2', 'Y', 'Count (%)', 'tc_local', 'dtc_local', 'mi_local']]

  print("Analysis Results:")

  return results

In [None]:
def bubble_plot(results):
  """
  Creates a bubble chart of average tc_local against average dtc_local with
  bubble sizes proportional to Count (%).

  The tc_local / dtc_local columns may hold formatted strings; they are
  converted to floats before plotting so both axes are numeric.

  Args:
      results (pandas.DataFrame): Table with columns 'tc_local',
          'dtc_local' and 'Count (%)'.

  Returns:
      pandas.DataFrame: The unmodified input table.
  """

  # Build a numeric plotting table; bubble area is ten times the percentage
  plot_data = pd.DataFrame({
      'tc_local': results['tc_local'].astype(float),
      'dtc_local': results['dtc_local'].astype(float),
      'size': results['Count (%)'] * 10,
  }).sort_values(by=['tc_local', 'dtc_local'])

  # Create the bubble chart
  plt.figure(figsize=(8, 6))
  plt.scatter(
      plot_data['tc_local'].to_numpy(),
      plot_data['dtc_local'].to_numpy(),
      s=plot_data['size'].to_numpy(),
      alpha=0.7,
      edgecolors='k',
  )

  # Add labels and title
  plt.xlabel('tc_local (average)')
  plt.ylabel('dtc_local (average)')
  plt.title('Bubble Chart of tc_local vs. dtc_local (Bubble Size Proportional to Count %)')

  # Rotate x-axis tick labels for readability
  plt.xticks(rotation=45)

  # Show the plot
  plt.show()

  return results

In [None]:
def plot_local_distributions(df_overview, w_AND, corr):
  """
  Plots the distributions of dtc_local (left panel) and tc_local (right panel).

  Each panel shows a bar per row (height = 'occurence') and a red vertical
  line at the occurence-weighted sum of the local values.

  Args:
      df_overview (pandas.DataFrame): Table with columns 'tc_local',
          'dtc_local' and 'occurence'.
      w_AND: Weight shown in the panel titles (formatted with 2 decimals).
      corr: Correlation shown in the panel titles.
  """
  bar_width = 0.01
  occurence = df_overview["occurence"]

  # Weighted averages are computed before any figure is created
  weighted_avg = {
      "tc_local": (df_overview["tc_local"] * occurence).sum(),
      "dtc_local": (df_overview["dtc_local"] * occurence).sum(),
  }

  fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

  # (axis, value column, bar label, average label, title, x label)
  panels = (
      (ax2, "tc_local", 'TC_local', 'TC=Average',
       f'TC:  w_AND = {w_AND:.2f}, X1_X2_Corr={corr}', 'TC_LOCAL'),
      (ax1, "dtc_local", 'DTC_local', 'DTC=Average',
       f'DTC: w_AND = {w_AND:.2f}, X1_X2_Corr={corr}', 'DTC_LOCAL'),
  )
  for axis, column, bar_label, avg_label, title, x_label in panels:
    avg = weighted_avg[column]
    axis.bar(df_overview[column], df_overview["occurence"], width=bar_width, color='green', label=bar_label)
    axis.plot([avg, avg], [0, 0.5], color='red', label=avg_label)
    axis.set_title(title)
    axis.set_xlabel(x_label)
    axis.set_ylabel('OCCURENCE')
    axis.legend()

  plt.tight_layout()
  plt.show()

In [None]:
# Settings for the experiment cells below: n samples per dataset, t_c
# correlation steps, and r as the upper bound of the random target (f == 3).
# Note that r replaces the value assigned in the entropy example cell.
n= 400
t_c=20
r=1

In [None]:
# Experiment with f=1: the target is Y = X1 AND X2.
df=generate_correlation_table_function(n, t_c, r,s,1)
plot_correlation_vs_dtc_tc_oinf_mi_spread_function(df)

1
NEXT EXPERIMENT
corr X1_X2=0.00
true_corr X1_X2=0.02
Analysis Results:
   X1  X2  Y  Count (%) tc_local dtc_local mi_local
0   0   0  0      28.00     0.41      0.03     0.38
1   0   1  0      22.50     0.34      0.99     0.38
2   1   0  0      26.25     0.35      0.88     0.38
3   1   1  1      23.25     2.14      2.10     2.10
TC= 0.78
DTC= 0.95
O_INF= -0.17
MUTUAL INF= 0.78



NEXT EXPERIMENT
corr X1_X2=0.05
true_corr X1_X2=0.11
Analysis Results:
   X1  X2  Y  Count (%) tc_local dtc_local mi_local
0   0   0  0       27.0     0.64      0.16     0.48
1   0   1  0       24.5     0.33      0.96     0.48
2   1   0  0       20.0     0.30      1.09     0.48
3   1   1  1       28.5     1.96      1.81     1.81
TC= 0.87
DTC= 1.01
O_INF= -0.14
MUTUAL INF= 0.86



NEXT EXPERIMENT
corr X1_X2=0.11
true_corr X1_X2=0.15
Analysis Results:
   X1  X2  Y  Count (%) tc_local dtc_local mi_local
0   0   0  0      27.75     0.72      0.21     0.51
1   0   1  0      23.25     0.29      0.97     0.51
2   1

In [None]:
# Experiment with f=2: the target is Y = X1 XOR X2.
df=generate_correlation_table_function(n, t_c, r,s,2)
plot_correlation_vs_dtc_tc_oinf_mi_spread_function(df)

In [None]:
# Experiment with f=3: the target is random integers in [0, r], clipped to [0, 1].
df=generate_correlation_table_function(n, t_c, r,s,3)
plot_correlation_vs_dtc_tc_oinf_mi_spread_function(df)

In [None]:
# Experiment with f=4: the target is a copy of X1.
df=generate_correlation_table_function(n, t_c, r,s,4)
plot_correlation_vs_dtc_tc_oinf_mi_spread_function(df)