# Statistical analysis

**Collect and process all the data from the experiments**

In [1]:
import numpy as np
import pandas as pd
import json

In [2]:
from collections import defaultdict

In [3]:
import os
from pathlib import Path

# Resolve the data root to an absolute path.
path = Path("../../data/").resolve()
# Directory passed to gather_data below; each of its sub-directories is read
# as one experiment record.
experiments_path = path / "embeddings/MNIST/2"

In [4]:
# Names of the JSON files that gather_data reads from each experiment directory.
file_names = ["config.json", "averaged_values.json"]

In [5]:
from collections.abc import MutableMapping

def flatten_dict(dictionary: dict, parent_key='', separator='/') -> dict:
    """Collapse a nested mapping into a single-level dict.

    Keys of nested mappings are joined to the key of their parent with
    ``separator`` (e.g. ``{"a": {"b": 1}}`` becomes ``{"a/b": 1}``).
    Values that are not ``MutableMapping`` instances are copied as-is.
    Empty nested mappings contribute no entries.

    Parameters
    ----------
    dictionary : dict
        Mapping to flatten.
    parent_key : str
        Prefix prepended to every key; an empty prefix leaves top-level
        keys unchanged.
    separator : str
        String placed between a parent key and a child key.

    Returns
    -------
    dict
        Flat mapping from joined key to leaf value.
    """
    flat = {}
    for name, entry in dictionary.items():
        joined = name if not parent_key else parent_key + separator + name

        # Leaves are stored directly; only mappings are descended into.
        if not isinstance(entry, MutableMapping):
            flat[joined] = entry
            continue

        flat.update(flatten_dict(entry, joined, separator=separator))

    return flat

In [6]:
def gather_data(path, file_names):
    """Load per-experiment JSON files into one flat DataFrame.

    Every immediate sub-directory of ``path`` is treated as one experiment.
    For each sub-directory, the files named in ``file_names`` that exist
    there are parsed as JSON and merged into a single dict. That dict is
    flattened with ``flatten_dict`` and becomes one row of the result.
    Sub-directories that yield no data are skipped.

    Parameters
    ----------
    path : pathlib.Path
        Directory whose sub-directories are scanned.
    file_names : sequence of str
        Names of the JSON files to read. Files are merged in this order, so
        when two files share a top-level key the one listed later wins.
        Each file is expected to hold a JSON object.

    Returns
    -------
    pandas.DataFrame
        One row per sub-directory that produced data, ordered by
        sub-directory path; empty if nothing was found.
    """
    records = []

    # iterdir() yields entries in arbitrary, filesystem-dependent order;
    # sorting keeps the row order stable between runs and machines.
    for node in sorted(path.iterdir()):
        if not node.is_dir():
            continue

        record = {}
        # Walk the requested names rather than the directory listing so the
        # merge order (and thus key precedence) is controlled by the caller.
        for file_name in file_names:
            candidate = node / file_name
            if not candidate.is_file():
                continue

            # Decode explicitly as UTF-8 instead of the locale default.
            with open(candidate, encoding="utf-8") as file:
                record |= json.load(file)

        if record:
            records.append(flatten_dict(record))

    return pd.DataFrame(records)

In [7]:
# Build the experiment table: one row per sub-directory of experiments_path
# that contains at least one of the files in file_names.
data = gather_data(experiments_path, file_names)

In [8]:
# Show the flattened column names available in the experiment table.
data.columns

Index(['n_classes', 'batch_size_train', 'batch_size_test', 'embedding_dim',
       'discriminator_network_inner_dim', 'distribution/henze_zirkler_(train)',
       'distribution/henze_zirkler_(train)_std',
       'distribution/henze_zirkler_(test)',
       'distribution/henze_zirkler_(test)_std',
       'distribution/shapiro_wilk_(train)',
       'distribution/shapiro_wilk_(train)_std',
       'distribution/shapiro_wilk_(test)',
       'distribution/shapiro_wilk_(test)_std',
       'distribution/dagostino_pearson_(train)',
       'distribution/dagostino_pearson_(train)_std',
       'distribution/dagostino_pearson_(test)',
       'distribution/dagostino_pearson_(test)_std', 'input_p', 'output_p',
       'capacity', 'min_capacity_for_classification', 'n_epochs',
       'embedder_network_lr', 'discriminator_network_lr', 'training/loss',
       'training/loss_std', 'training/mutual_information',
       'training/mutual_information_std',
       'training/kullback_leibler_upper_bound',
      

In [9]:
# Mean of the selected metrics for each distinct capacity value.
(
    data
    .groupby(["capacity"])[
        [
            "output_p",
            "training/mutual_information",
            "training/kullback_leibler_upper_bound",
            "clustering/silhouette_score",
            "distribution/henze_zirkler_(test)",
            "classification_mlp/accuracy",
        ]
    ]
    .mean()
)

Unnamed: 0_level_0,output_p,training/mutual_information,training/kullback_leibler_upper_bound,clustering/silhouette_score,distribution/henze_zirkler_(test),classification_mlp/accuracy
capacity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.575646,0.749894,0.561946,0.013701,0.241132,0.00691,0.780173
1.151293,0.562341,1.11267,0.038623,0.192042,0.295867,0.795673
2.302585,0.316228,2.140525,0.16206,0.109758,0.451149,0.783053
3.453878,0.177828,3.018134,0.435743,0.103048,0.436809,0.785967
4.60517,0.1,3.644869,0.960301,0.133835,0.39683,0.830053
6.907755,0.031623,4.185015,2.72274,0.1556,0.266976,0.87
9.21034,0.01,4.287558,4.922783,0.142566,0.052591,0.873167
11.512925,0.003162,4.309556,7.203369,0.146359,0.009867,0.867993
13.815511,0.001,4.310678,9.504833,0.152633,0.005998,0.88182
