# Exercises for probability polytopes


In [1]:
import os
from pathlib import Path

from itertools import product
from math import floor

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

%matplotlib inline

## Probability polytopes exercise: probability simplex

* Describe in words and mathematical notation what the method `fake_data_for_learning.utils.get_simplex_sample` does. Difficulty: *

* Test if the method `fake_data_for_learning.utils.get_simplex_sample` generates uniformly distributed samples from the probability simplex. Difficulty: **

* Test if projecting samples to individual coordinate axes is uniformly distributed. Difficulty: **

Answer the same questions for the following two versions. Be sure to consider more than one choice of ambient dimension.

In [3]:
def get_simplex_sample_v1(ambient_dimension):
    """
    Get random element of the simplex of given ambient dimension

    Parameters
    ----------
    ambient_dimension : int

    Returns
    -------
    res : np.array
    """

    res = np.random.uniform(size=ambient_dimension)
    res = res / res.sum()

    return res


def get_simplex_sample_v2(ambient_dimension):
    """
    Get random element of the probability simplex

    Parameters
    ----------
    ambient_dimension : int

    Returns
    -------
    res : np.array
    """

    res = np.random.uniform(size=ambient_dimension-1)
    res = np.sort(res)
    res = np.insert(res, 0, 0)
    res = np.append(res, 1)
    res = np.diff(res)
    
    return res

1. So basically it calculates a random element in a simplex( its vertices are of a type (0,0,0,...,0,1,0,...,0), where length of this vector is the ambient_dimension)

In [16]:
from scipy.stats import chi2_contingency

ambient_dimension = 4
n_samples = 1000
bins_per_dimension = 50

samples = np.array([get_simplex_sample_v1(ambient_dimension) for _ in range(n_samples)])

def test_uniformity_high_dim(samples, bins_per_dimension):
    """
    Test if the samples are uniformly distributed over the simplex without reducing dimensions.

    Parameters
    ----------
    samples : np.array
    bins_per_dimension : int
    
    Returns
    -------
    chi2 : float
    p_value : float
    """
    n_samples, ambient_dimension = samples.shape
    
    # Create multidimensional bins
    hist, edges = np.histogramdd(samples, bins=bins_per_dimension)
    
    # Flatten the histogram
    hist = hist.flatten()
    
    # Calculate the expected counts assuming uniform distribution
    expected_count = n_samples / len(hist)
    expected = np.full_like(hist, expected_count)
    
    # Perform the Chi-Square test
    chi2, p_value = chi2_contingency([hist, expected])[:2]
    
    return chi2, p_value

chi2, p_value = test_uniformity_high_dim(samples, bins_per_dimension)

print(f"Chi-Square Statistic: {chi2}")
print(f"P-Value: {p_value}")


Chi-Square Statistic: 1999.363941462163
P-Value: 1.0


2.For the values calculated above one can conclude that it really is uniformly distributed ( failed to reject the null hypothesis from the chi^2 test that it is not uniformly distributed)