In [30]:
from functools import partial
from typing import Callable, Any, Final

In [31]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from pprint import pprint

In [32]:
from dataset_subset import sample_dataset_random, sample_dataset_jls_kmeans
from train_dataset import train_cfar, train_svhn, train_mnist

In [33]:
# Root seed shared by all experiments in this notebook; it is passed to train_model as base_seed.
BASE_SEED: Final[int] = 206_783_441

In [34]:
# Implicit dimension d_ID of each dataset, hard-coded here
# (NOTE(review): how these values were estimated is not shown in this notebook).
# Insertion order is significant: it becomes the row order of the table built from this mapping.
DATASET_IMPLICIT_DIMS: dict[str, int] = {"MNIST": 11, "SVHN": 14, "CIFAR-10": 21}

Calculate the different values for functions of $d_{ID}$

In [35]:
# One row per dataset; column 'd' holds the implicit dimension and every other
# column is a candidate function of d. Callables passed to assign() see the
# columns created before them, which 'd^1.5*ln(d)' relies on.
dims_df = pd.DataFrame.from_dict(
    DATASET_IMPLICIT_DIMS, orient='index', columns=['d']
).assign(**{
    '2d': lambda df: 2 * df['d'],
    '5d': lambda df: 5 * df['d'],
    'd*ln(d)': lambda df: df['d'] * np.log(df['d']),
    'd*log2(d)': lambda df: df['d'] * np.log2(df['d']),
    # d^1.5 * ln(d) == (d * ln(d)) * sqrt(d): reuse the column computed above
    'd^1.5*ln(d)': lambda df: df['d*ln(d)'] * np.sqrt(df['d']),
    'd^2': lambda df: np.power(df['d'], 2),
})
dims_df

Unnamed: 0,d,2d,5d,d*ln(d),d*log2(d),d^1.5*ln(d),d^2
MNIST,11,22,55,26.376848,38.053748,87.482108,121
SVHN,14,28,70,36.946803,53.302969,138.242277,196
CIFAR-10,21,42,105,63.934971,92.238666,292.986845,441


Round the dimensions since we can only use positive integers

In [36]:
# The values are used as dimensions, so they must be whole numbers:
# round every entry to the nearest integer and cast the table to int.
dims_df = dims_df.round(decimals=0).astype(int)
dims_df

Unnamed: 0,d,2d,5d,d*ln(d),d*log2(d),d^1.5*ln(d),d^2
MNIST,11,22,55,26,38,87,121
SVHN,14,28,70,37,53,138,196
CIFAR-10,21,42,105,64,92,293,441


In [37]:
def train_model(model_name: str, sample_func: Callable[[np.ndarray[float]], list[int]], base_seed: int, num_runs: int = 10) -> list[float]:
    """Train one dataset's model ``num_runs`` times and collect one score per run.

    Args:
        model_name: Dataset name, matched case-insensitively. 'MNIST' and 'SVHN'
            select their dedicated trainers; any other value falls back to the
            CIFAR trainer.
        sample_func: Subset sampler. A per-run ``random_seed`` keyword is bound
            to it with ``partial`` before it is handed to the trainer.
        base_seed: Seed of the first run; run ``i`` uses ``base_seed + i * 13``
            for both the sampler and the trainer.
        num_runs: Number of independent training runs.

    Returns:
        One value per run: the maximum of element ``[1]`` of the trainer's
        return value (presumably the per-epoch test accuracies — TODO confirm
        against ``train_dataset``).
    """
    dedicated_trainers = {'MNIST': train_mnist, 'SVHN': train_svhn}
    # Anything that is not MNIST/SVHN is trained with the CIFAR routine.
    train_func = dedicated_trainers.get(model_name.upper(), train_cfar)

    best_scores = []
    for run_idx in tqdm(range(num_runs)):
        # The same derived seed drives both the sampling and the training of a run.
        run_seed = base_seed + run_idx * 13
        seeded_sampler = partial(sample_func, random_seed=run_seed)
        run_output = train_func(seeded_sampler, seed=run_seed)
        best_scores.append(max(run_output[1]))
    return best_scores


In [38]:
# Results container: dataset name -> dimension label (or 'baseline') -> list of per-run scores.
train_results: dict[str, dict[str, list[float]]] = {}

In [39]:
import json
from pathlib import Path


def _load_cached_results(path: str) -> dict:
    """Return the parsed JSON content of ``path``, or an empty dict if the file is absent.

    A missing cache file is not an error: the training cell computes every
    result that is not already present in ``train_results``.
    """
    cache_file = Path(path)
    if not cache_file.exists():
        return {}
    with cache_file.open() as f:
        return json.load(f)


# Reuse previously computed results so that only missing configurations are retrained.
train_results = _load_cached_results('results.json')
train_results_diff_samples = _load_cached_results('results_diff_sample_size.json')

# Runs made with a different sample size are kept under their own keys so they
# do not collide with the main entries of the same dataset.
for target_key, source_key in (('MNIST_s_25', 'MNIST'), ('SVHN_s_250', 'SVHN')):
    if source_key in train_results_diff_samples:
        train_results[target_key] = train_results_diff_samples[source_key]

train_results

{'CIFAR-10': {'baseline': [25.07,
   27.58,
   25.11,
   25.1,
   25.95,
   26.42,
   23.25,
   26.71,
   24.16,
   26.73],
  'd': [28.41, 25.55, 26.24, 28.37, 22.51, 25.84, 26.76, 24.75, 27.2, 25.78],
  '2d': [25.5, 25.04, 28.8, 27.42, 26.6, 27.22, 22.72, 25.38, 28.8, 29.65],
  '5d': [26.74, 27.16, 30.17, 23.13, 26.11, 24.9, 28.39, 27.26, 25.03, 27.08],
  'd*ln(d)': [27.97,
   25.75,
   26.69,
   29.71,
   26.24,
   25.03,
   25.91,
   25.11,
   24.28,
   26.12],
  'd*log2(d)': [27.55,
   28.1,
   28.71,
   26.52,
   24.0,
   25.38,
   26.91,
   30.12,
   27.51,
   24.3],
  'd^1.5*ln(d)': [28.21,
   24.47,
   27.69,
   26.68,
   26.3,
   26.76,
   27.7,
   28.62,
   29.24,
   27.61],
  'd^2': [27.84,
   29.4,
   25.94,
   24.53,
   24.47,
   26.06,
   27.41,
   24.72,
   24.58,
   28.55]},
 'MNIST': {'baseline': [72.77,
   67.73,
   72.91,
   76.14,
   75.61,
   70.54,
   73.81,
   73.36,
   74.07,
   75.86],
  'd': [77.17, 78.09, 75.23, 79.53, 78.27, 79.85, 78.3, 79.7, 74.18, 76.77],

In [40]:
# Number of samples requested from the sampler for every run in this cell.
n_samples = 100

for model in dims_df.index:
    print('='*50)
    print(model.center(50, '='))
    print('='*50)
    print(f'{model} baseline:')

    # Fetch (or create) this dataset's result dict WITHOUT replacing it: results
    # already cached for some dimensions must survive when only the baseline is
    # missing, otherwise they would all be retrained.
    model_runs = train_results.setdefault(model, {})

    if 'baseline' not in model_runs:
        model_runs['baseline'] = train_model(model, partial(sample_dataset_random, n_samples=n_samples), base_seed=BASE_SEED)
    # `jls_dim` is the column label (e.g. 'd*ln(d)'); the integer actually passed
    # to the sampler is looked up in dims_df for the current dataset.
    for jls_dim in dims_df.columns:
        if jls_dim not in model_runs:
            print(f'{model} with {jls_dim=}:')
            # Progress dump of what has been collected so far for this dataset.
            pprint(model_runs, indent=4)
            model_runs[jls_dim] = train_model(model,
                                              partial(sample_dataset_jls_kmeans, n_samples=n_samples, jls_dim=dims_df.loc[model, jls_dim]),
                                              base_seed=BASE_SEED)
    print(f"Results for {model=}:")
    pprint(model_runs, indent=4)
    print()

MNIST baseline:
Results for model='MNIST':
{   '2d': [78.22, 81.03, 73.8, 78.51, 78.82, 77.48, 79.54, 77.85, 78.93, 78.65],
    '5d': [   78.55,
              76.74,
              78.04,
              80.91,
              80.43,
              79.06,
              78.56,
              76.68,
              80.89,
              77.23],
    'baseline': [   72.77,
                    67.73,
                    72.91,
                    76.14,
                    75.61,
                    70.54,
                    73.81,
                    73.36,
                    74.07,
                    75.86],
    'd': [77.17, 78.09, 75.23, 79.53, 78.27, 79.85, 78.3, 79.7, 74.18, 76.77],
    'd*ln(d)': [   78.41,
                   77.87,
                   77.88,
                   77.87,
                   79.12,
                   76.16,
                   76.67,
                   77.5,
                   78.72,
                   79.04],
    'd*log2(d)': [   78.44,
                     79.85,

# Results
## Test Accuracy percentage
Here we'll show a table of the test accuracy for each dataset for the different implicit dimension functions + random sample (baseline)

In [41]:
 # Average the per-run scores of every (dataset, configuration) pair.
 mean_results = {}
 for dataset_name, runs_by_label in train_results.items():
     mean_results[dataset_name] = {label: np.mean(runs) for label, runs in runs_by_label.items()}
 # One row per dataset, one column per configuration.
 mean_res_df = pd.DataFrame.from_dict(mean_results, orient='index')
 mean_res_df

Unnamed: 0,baseline,d,2d,5d,d*ln(d),d*log2(d),d^1.5*ln(d),d^2
CIFAR-10,25.608,26.141,26.713,26.597,26.281,26.91,27.328,26.35
MNIST,73.28,77.709,78.283,78.709,77.924,78.681,79.236,79.255
SVHN,22.290642,23.212969,20.061079,21.820068,20.885065,20.641902,20.800553,20.823602
MNIST_s_25,51.927,55.29,57.066,56.414,54.875,56.996,60.081,57.733
SVHN_s_250,49.774892,44.492548,41.661801,43.702572,42.695144,47.202674,44.817148,46.64413


We can see that only MNIST ever reaches an average over 50% (both for $s=100$ and $s=25$), while the others never really become accurate

If we normalize the accuracy by the baseline, we can see some interesting phenomena:

In [42]:
 # Relative change of each configuration's mean score with respect to the
 # same dataset's baseline mean (0.05 means 5% better than the baseline).
 normalized_results = {}
 for dataset_name, runs_by_label in train_results.items():
     normalized_results[dataset_name] = {
         label: np.mean(runs) / np.mean(runs_by_label['baseline']) - 1
         for label, runs in runs_by_label.items()
         if label != 'baseline'
     }
 norm_res_df = pd.DataFrame.from_dict(normalized_results, orient='index')
 # Displayed as percentages.
 norm_res_df * 100

Unnamed: 0,d,2d,5d,d*ln(d),d*log2(d),d^1.5*ln(d),d^2
CIFAR-10,2.081381,4.315058,3.862074,2.628085,5.084349,6.716651,2.897532
MNIST,6.043941,6.827238,7.40857,6.337336,7.37036,8.127729,8.153657
SVHN,4.137729,-10.00224,-2.111086,-6.305685,-7.396557,-6.684819,-6.581419
MNIST_s_25,6.4764,9.896586,8.640977,5.677201,9.761781,15.702814,11.181081
SVHN_s_250,-10.612469,-16.299566,-12.199565,-14.223532,-5.167703,-9.960332,-6.289842


1) SVHN seems to mostly become **worse** when we use our sampling method (besides the case of $s=100$ & $d_{JL} = d_{ID}$)
2) The best performance is seen on MNIST - especially when $s=25$
3) In general $d_{ID}^{1.5}\ln\left(d_{ID}\right)$ is the dimension with the best performance, except for SVHN

## Speculations as for why these results make sense
1) SVHN is a very varied dataset with noisy digits that look similar - perhaps Euclidean distance is less useful when working with it.\
Another option is that due to a larger test set (caused by the multi-digit images) the test evaluation is just better (I don't buy that)
2) MNIST is the dataset with the smallest dimension, and does not require convolutions to solve.\
Perhaps due to its simplicity k-means manages to choose very informative samples, and JL manages to focus k-means on useful features
3) I can't really think of an explanation as to why $d_{ID}^{1.5}\ln\left(d_{ID}\right)$ had the best performance - my guess would be that it is big enough to keep many features, but not so big as to include too much noise.

## Ideas for further research
### The relationship between JL and the implicit dimension estimation
Since the implicit dimension is computed via MLE and k-means - it would be interesting to look at how the JL transformation changes the dimension estimation
### Usage of JL with classic ML classifiers for MNIST
Since we saw that JL is the most helpful with MNIST, it would be interesting to see how classic ML classifiers like SVMs would work on the result of the JL transformation into these functions of the implicit dimension
### Why SVHN was so bad
Another interesting vector would be to find out why SVHN worked so badly with JL - maybe we can find out for which datasets Euclidean distance is useful
### Using the estimated dimension as a feature dimension in the model
Out of scope for this course - but it would be interesting to see if using these functions of the implicit dim as the feature dimension before a classifier has any merit.

# Conclusion
This work had several fun challenges:
* reproducing the original work (ended up finding out that PyTorch is *very* different from TensorFlow)
* reaching Google Colab's resource limit (I had to optimize the code several times, and to borrow my GF's Google account for the final leg)

While I didn't reach any results that were too interesting, I think the directions with MNIST could have *some* potential, and at least it was a very educational assignment that caused me to do deep-dives into both papers and Python libraries