In [1]:
import numpy as np
import sys
import scipy.stats as st
import matplotlib.pyplot as plt
from pathlib import Path
# setting path
sys.path.append('../topotests/')
from topotests import TopoTest
from distributions import MultivariateDistribution, GaussianMixture, AbsoluteDistribution
import pandas as pd

In [2]:
def normality_tests_1d(samples, significance_level=None):
    """Run four classical normality tests on every sample and report acceptance rates.

    Each sample is flattened to 1-D once, so that all four tests see exactly
    the same data regardless of how the sampler shapes its output (previously
    only the Anderson-Darling and Cramer-von Mises tests were flattened).

    Parameters
    ----------
    samples : iterable of array-like
        Collection of samples; every element is tested independently.
    significance_level : float, optional
        Level at which the null hypothesis is rejected. When omitted, the
        notebook-level variable ``significance_level`` is used, or 0.05 if no
        such variable is defined.

    Returns
    -------
    tuple of float
        ``(shapiro, ks, ad, cvm)`` -- for each test, the fraction of samples
        for which normality was NOT rejected. Shapiro-Wilk and Anderson-Darling
        test against a normal with estimated parameters; KS and
        Cramer-von Mises test against the standard normal ``'norm'``.
    """
    if significance_level is None:
        # keep backward compatibility with callers relying on the notebook global
        significance_level = globals().get('significance_level', 0.05)

    def anderson(values):
        # Anderson-Darling exposes critical values instead of a p-value, so the
        # critical value tabulated for the requested level has to be looked up.
        anderson_out = st.anderson(values, 'norm')
        tabulated_percent = np.asarray(anderson_out.significance_level, dtype=float)
        hits = np.flatnonzero(np.isclose(tabulated_percent, 100 * significance_level))
        if hits.size:
            idx = int(hits[0])
        else:
            import warnings
            warnings.warn(
                f'Anderson-Darling has no tabulated critical value for level '
                f'{significance_level}; falling back to the 5% critical value.'
            )
            idx = 2  # position of the 5% level, as assumed by the original code
        return anderson_out.statistic < anderson_out.critical_values[idx]

    # flatten once; also lets `samples` be a one-shot iterable
    flat_samples = [np.ravel(sample) for sample in samples]

    shapiro = [st.shapiro(x).pvalue > significance_level for x in flat_samples]
    ks = [st.kstest(x, 'norm').pvalue > significance_level for x in flat_samples]
    cvm = [st.cramervonmises(x, 'norm').pvalue > significance_level for x in flat_samples]
    ad = [anderson(x) for x in flat_samples]

    # mean of booleans == fraction of samples that passed the test
    return np.mean(shapiro), np.mean(ks), np.mean(ad), np.mean(cvm)

In [3]:
def run_mc(N, rv_true, rvs):
    # generate representation for standard normal distribution
    topo_test = TopoTest(n=N, dim=dim, method=method, 
                         wasserstein_p=wasserstein_p, wasserstein_order=wasserstein_order)
    topo_test.fit(rv=rv_true, n_signature=n_signature, n_test=n_test)
    # write distance matrix
    topo_test.save_distance_matrix(outputfile_basename+f'_N={N}_distance_matrix.txt')
    
    for rv in rvs:
        # generate samples
        samples = [rv.rvs(N) for i in range(mc_samples)]
        # perform topo tests
        topo_out = topo_test.predict(samples)
        # aggregate results of topo tests
        topo_min = np.mean(topo_out.min)
        topo_mean = np.mean(topo_out.mean)
        topo_max = np.mean(topo_out.max)
        topo_quantile = np.mean(topo_out.quantile)
        # perform standard normality tests
        shapiro, ks, ad, cvm = normality_tests_1d(samples)
        # collect results of topo tests and standard normality tests
        result = [rv.label, method, significance_level, wasserstein_p, wasserstein_order, 
                  mc_samples, n_signature, n_test, 
                  topo_min, topo_mean, topo_max, topo_quantile,
                  shapiro, ks, ad, cvm]
        return result

In [4]:
# generate distributions
# combine this into one list?
rv_true = MultivariateDistribution([st.norm()], label='True')

rvs = [MultivariateDistribution([st.norm()], label='N_0_1'),
       MultivariateDistribution([st.norm(1, 2)], label='N_1_2'),
       MultivariateDistribution([st.beta(2, 2)], label='beta_2_2'),
       MultivariateDistribution([st.beta(5, 5)], label='beta_5_5'),
       MultivariateDistribution([st.beta(10, 10)], label='beta_10_10'),
       MultivariateDistribution([st.laplace()], label='laplace'),
       MultivariateDistribution([st.uniform()], label='U_0_1'),
       MultivariateDistribution([st.t(df=3)], label='T_3'),
       MultivariateDistribution([st.t(df=5)], label='T_5'),
       MultivariateDistribution([st.t(df=7)], label='T_7'),
       MultivariateDistribution([st.t(df=10)], label='T_10'),
       MultivariateDistribution([st.t(df=30)], label='T_30'),
       MultivariateDistribution([st.gamma(5,1)], label='G_5_1'),
       MultivariateDistribution([st.gamma(10,1)], label='G_10_1'),
       MultivariateDistribution([st.gamma(20,1)], label='G_20_1'),
       MultivariateDistribution([st.gamma(4,5)], label='G_4_5'),
       MultivariateDistribution([st.chi2(df=4)], label='ChiSq_4'),
       MultivariateDistribution([st.chi2(df=10)], label='ChiSq_10'),
       MultivariateDistribution([st.cauchy()], label='Cauchy'),
       MultivariateDistribution([st.logistic()], label='Logistic'),
       MultivariateDistribution([st.lognorm(s=1)], label='LN_0_1'),
       MultivariateDistribution([st.lognorm(s=0.5)], label='LN_0_0.5'),
       MultivariateDistribution([AbsoluteDistribution(rv=st.norm())], label='HalfNormal'),
       MultivariateDistribution([GaussianMixture([0, 2], [1, 1], [0.5, 0.5])], label='GM_0.5_2_1'),
       MultivariateDistribution([GaussianMixture([0, 4], [1, 1], [0.5, 0.5])], label='GM_0.5_4_1'),
       MultivariateDistribution([GaussianMixture([0, 6], [1, 3], [0.5, 0.5])], label='GM_0.5_6_3'),
       MultivariateDistribution([GaussianMixture([0, 6], [1, 3], [0.9, 0.1])], label='GM_0.5_6_3')
      ]

In [7]:
# set random numbers generator seed to have reproducibale results
np.random.seed(1)

# set simulation parameters
Ns = [20, 50, 100, 200, 300, 500]
mc_samples = 1000
n_signature = n_test = 750
method = 'mergegram'

dim = 1
significance_level = 0.05
wasserstein_p=1
wasserstein_order=1

outputfile_basename = f'results.{dim}d/{method}_{wasserstein_p}_{wasserstein_order}'

results = []
result_labels = ['distrib', 'method', 'sign_level', 'wasserstein_p', 'wasserstein_order',
                 'mc_loops', 'n_signature', 'n_test', 
                 'topo_min', 'topo_mean', 'topo_max', 'topo_quantile',
                 'shapiro', 'ks', 'ad', 'cvm']
for N in Ns:
    result = run_mc(N=N, rv_true=rv_true, rvs=rvs)
    results.append(result)
    results_df = pd.DataFrame(results, columns=result_labels)
    results_df.to_csv(f'{outputfile_basename}.csv')