In [1]:
!git clone https://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@github.com/Snover98/icmla-sampling-intrinsic-dimension-research.git

fatal: destination path 'icmla-sampling-intrinsic-dimension-research' already exists and is not an empty directory.


In [2]:
import os
# Work from inside the cloned repository (presumably so the project-local imports in the
# following cells resolve -- confirm). The guard keeps the cell re-runnable: once the
# working directory is already the checkout, a second chdir would raise FileNotFoundError.
if os.path.basename(os.getcwd()) != 'icmla-sampling-intrinsic-dimension-research':
    os.chdir('icmla-sampling-intrinsic-dimension-research')

In [3]:
from functools import partial
from typing import Callable, Any, Final

In [4]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from pprint import pprint

In [5]:
from dataset_subset import sample_dataset_random, sample_dataset_jls_kmeans
from train_dataset import train_cfar, train_svhn, train_mnist

In [6]:
# Root seed for every experiment in this notebook; per-run seeds are derived from it.
BASE_SEED: Final[int] = 206_783_441

In [7]:
# Dimension value `d` associated with each dataset. The keys become the row labels of the
# dimension table built from this mapping, and are later used as dataset names.
# NOTE(review): the provenance of these numbers is not shown in this notebook.
DATASET_IMPLICIT_DIMS: dict[str, int] = dict(
    [
        ("MNIST", 11),
        ("SVHN", 14),
        ("CIFAR-10", 21),
    ]
)

Calculate the values of several functions of each dataset's $d_{ID}$ (the `d` column below).

In [8]:
# One row per dataset, one column per candidate function of d.
# The callables passed to `assign` are evaluated in order, so 'd^1.5*ln(d)' can reuse the
# 'd*ln(d)' column created just before it.
dims_df = (
    pd.Series(DATASET_IMPLICIT_DIMS, name='d')
    .to_frame()
    .assign(**{
        '2d': lambda df: 2 * df['d'],
        '5d': lambda df: 5 * df['d'],
        'd*ln(d)': lambda df: df['d'] * np.log(df['d']),
        'd*log2(d)': lambda df: df['d'] * np.log2(df['d']),
        'd^1.5*ln(d)': lambda df: df['d*ln(d)'] * np.sqrt(df['d']),
        'd^2': lambda df: np.power(df['d'], 2),
    })
)
dims_df

Unnamed: 0,d,2d,5d,d*ln(d),d*log2(d),d^1.5*ln(d),d^2
MNIST,11,22,55,26.376848,38.053748,87.482108,121
SVHN,14,28,70,36.946803,53.302969,138.242277,196
CIFAR-10,21,42,105,63.934971,92.238666,292.986845,441


Round the values to the nearest integer, since a dimension must be a positive integer.

In [9]:
# Round every entry to zero decimals, then cast the whole table to an integer dtype
# so each cell can be used directly as a dimension.
dims_df = dims_df.round(decimals=0).astype(int)
dims_df

Unnamed: 0,d,2d,5d,d*ln(d),d*log2(d),d^1.5*ln(d),d^2
MNIST,11,22,55,26,38,87,121
SVHN,14,28,70,37,53,138,196
CIFAR-10,21,42,105,64,92,293,441


In [10]:
def train_model(model_name: str, sample_func: Callable[[np.ndarray[float]], list[int]], base_seed: int, num_runs: int = 10) -> list[float]:
    """Train on one dataset `num_runs` times and collect one score per run.

    Args:
        model_name: Dataset name, matched case-insensitively. 'MNIST' and 'SVHN' select
            their trainers; any name starting with 'CIFAR' (or 'CFAR') selects the CIFAR
            trainer.
        sample_func: Sampler handed to the trainer. It must accept a `random_seed`
            keyword, which is bound to the per-run seed before the trainer sees it.
        base_seed: Seed of run 0; run `i` uses `base_seed + 13 * i`.
        num_runs: Number of independent runs.

    Returns:
        A list with one entry per run: the maximum of element 1 of the trainer's return
        value (presumably the per-epoch test accuracies -- confirm against `train_dataset`).

    Raises:
        ValueError: If `model_name` does not match a known dataset. Previously every
            unrecognised name silently fell through to the CIFAR trainer.
    """
    dataset = model_name.upper()

    # The trainer is called with the sampler positionally plus a `seed` keyword, and
    # element 1 of its result is passed to max(), so it must be an iterable of scores.
    train_func: Callable[..., Any]
    if dataset == 'MNIST':
        train_func = train_mnist
    elif dataset == 'SVHN':
        train_func = train_svhn
    elif dataset.startswith(('CIFAR', 'CFAR')):
        train_func = train_cfar
    else:
        raise ValueError(f"Unknown dataset {model_name!r}; expected 'MNIST', 'SVHN' or a name starting with 'CIFAR'")

    seed_stride = 13  # spacing between the seeds of consecutive runs

    best_scores: list[float] = []
    for run_idx in tqdm(range(num_runs)):
        # The same seed drives both the sampler and the trainer within a run.
        run_seed = base_seed + run_idx * seed_stride
        run_output = train_func(partial(sample_func, random_seed=run_seed), seed=run_seed)
        best_scores.append(max(run_output[1]))
    return best_scores


In [11]:
# Pre-filled results, keyed by dataset name and then by configuration name
# ('baseline' or a column of the dimension table). The training loop skips every
# dataset/configuration pair that is already present here.
train_results: dict[str, dict[str, list[float]]] = {
    "MNIST": {
        "baseline": [52.33, 54.83, 56.95, 54.12, 52.56, 52.34, 50.64, 49.53, 45.2, 50.77],
        "d": [54.32, 46.77, 55.82, 57.36, 48.16, 63.0, 58.47, 52.8, 56.6, 59.6],
        "2d": [53.34, 57.55, 59.89, 55.04, 59.28, 57.08, 56.91, 51.06, 57.56, 62.95],
        "5d": [57.18, 53.42, 57.06, 56.22, 60.05, 49.58, 56.41, 55.89, 55.95, 62.38],
        "d*ln(d)": [59.7, 47.25, 54.86, 54.26, 52.56, 50.85, 59.24, 58.05, 56.8, 55.18],
        "d*log2(d)": [46.79, 56.5, 62.31, 60.16, 51.88, 57.12, 59.3, 57.36, 56.35, 62.19],
        "d^1.5*ln(d)": [56.43, 60.5, 60.4, 61.43, 60.61, 68.87, 57.85, 64.31, 52.51, 57.9],
        "d^2": [53.35, 51.51, 61.52, 53.84, 60.17, 57.92, 66.15, 57.77, 59.33, 55.77],
    },
}


# train_results = {
#     'SVHN': {   '2d': [   18.10464044253227,
#               31.353718500307313,
#               25.349569760295022,
#               27.527658266748617,
#               25.188229870928087,
#               19.960049170252,
#               21.162415488629378,
#               23.901352181929933,
#               30.958051628764597,
#               19.03042409342348],
#     '5d': [   22.360940381069454,
#               13.210663798401967,
#               23.92824216349109,
#               24.33159188690842,
#               18.846035648432697,
#               23.620928088506453,
#               18.481100184388445,
#               23.69775660725261,
#               23.836047940995698,
#               24.988475722188078],
#     'baseline': [   24.097264904732636,
#                     27.324062692071298,
#                     21.38521819299324,
#                     19.587430854333128,
#                     32.49078057775046,
#                     15.938076213890596,
#                     23.982022126613398,
#                     19.587430854333128,
#                     20.467117393976643,
#                     20.394130301167795],
#     'd': [   21.90381069452981,
#              16.014904732636754,
#              19.587430854333128,
#              24.327750460971114,
#              15.938076213890596,
#              21.339121081745542,
#              20.367240319606637,
#              19.587430854333128,
#              21.93838352796558,
#              19.068838352796558],
#     'd*ln(d)': [   15.80362630608482,
#                    15.669176398279042,
#                    22.622157344806393,
#                    25.994929317762754,
#                    18.365857406269207,
#                    21.05101413644745,
#                    20.966502765826675,
#                    19.587430854333128,
#                    27.105101413644746,
#                    20.182851874615857]}
# }

In [None]:
# For every dataset, train the random-sampling baseline and one JLS + k-means configuration
# per column of `dims_df`, skipping anything already stored in `train_results`.
for model in dims_df.index:
    print('='*50)
    print(model.center(50, '='))
    print('='*50)

    # Number of samples requested from the sampler; datasets not listed here use 100.
    n_samples = {'SVHN': 250, 'MNIST': 25}.get(model.upper(), 100)

    # setdefault creates the per-dataset dict only when it is missing. The previous code
    # replaced the whole dict whenever 'baseline' was absent, which discarded any other
    # cached results for that dataset.
    train_results.setdefault(model, dict())

    if 'baseline' not in train_results[model]:
        # Announce the baseline only when it is actually trained.
        print(f'{model} baseline:')
        train_results[model]['baseline'] = train_model(model,
                                                       partial(sample_dataset_random, n_samples=n_samples),
                                                       base_seed=BASE_SEED)
    for jls_dim in dims_df.columns:
        if jls_dim not in train_results[model]:
            print(f'{model} with {jls_dim=}:')
            # Pass a plain Python int rather than the NumPy scalar returned by `.loc`.
            train_results[model][jls_dim] = train_model(model,
                                                        partial(sample_dataset_jls_kmeans, n_samples=n_samples, jls_dim=int(dims_df.loc[model, jls_dim])),
                                                        base_seed=BASE_SEED)
    print(f"Results for {model=}:")
    pprint(train_results[model], indent=4)
    print()

MNIST baseline:
Results for model='MNIST':
{   '2d': [   53.34,
              57.55,
              59.89,
              55.04,
              59.28,
              57.08,
              56.91,
              51.06,
              57.56,
              62.95],
    '5d': [   57.18,
              53.42,
              57.06,
              56.22,
              60.05,
              49.58,
              56.41,
              55.89,
              55.95,
              62.38],
    'baseline': [   52.33,
                    54.83,
                    56.95,
                    54.12,
                    52.56,
                    52.34,
                    50.64,
                    49.53,
                    45.2,
                    50.77],
    'd': [54.32, 46.77, 55.82, 57.36, 48.16, 63.0, 58.47, 52.8, 56.6, 59.6],
    'd*ln(d)': [   59.7,
                   47.25,
                   54.86,
                   54.26,
                   52.56,
                   50.85,
                   59.24,
       

  0%|          | 0/10 [00:00<?, ?it/s]

Building SVHN data loader with 1 workers
Using downloaded and verified file: /tmp/public_dataset/pytorch/svhn-data/train_32x32.mat
Using downloaded and verified file: /tmp/public_dataset/pytorch/svhn-data/train_32x32.mat
Using downloaded and verified file: /tmp/public_dataset/pytorch/svhn-data/test_32x32.mat
Epoch #1 Elapsed 2.44s, 2.44 s/epoch, 0.15 s/batch, ets 241.54s
	Test set: Average loss: 2.2671, Accuracy: 4149/26032 (16%)
Epoch #2 Elapsed 64.92s, 32.46 s/epoch, 2.03 s/batch, ets 3180.85s


In [None]:
# Display the full nested results mapping: dataset -> configuration -> list of per-run scores.
train_results

In [None]:
# Mean over runs for every dataset / configuration pair.
{
    dataset: {config: np.mean(scores) for config, scores in per_config.items()}
    for dataset, per_config in train_results.items()
}

In [None]:
# Mean over runs of every configuration, divided by the mean of the same dataset's
# 'baseline' entry (so 'baseline' itself maps to 1.0).
{
    dataset: {
        config: np.mean(scores) / np.mean(per_config['baseline'])
        for config, scores in per_config.items()
    }
    for dataset, per_config in train_results.items()
}