## Imports

In [2]:
import sys
import os
import pathlib
from pathlib import Path
import pickle

import pandas as pd
import numpy as np
from tqdm import tqdm

from tensorflow.keras.datasets import cifar100, cifar10, mnist

from src.models.baseline.helper import *

## Paths

In [3]:
# Project directories, resolved relative to the kernel's working directory.
cwd = Path.cwd()
root = cwd.parent.parent  # two levels above the working directory
# Pure (non-I/O) path objects for the data folders used by later cells.
data = pathlib.PurePath(root) / 'data'
interim = data / 'interim'

## Parameters & Datasets

In [4]:
# Two experiment grids:
#   grid 1 sweeps K (the cluster counts used by the loops below) at a fixed epsilon,
#   grid 2 sweeps epsilon at a fixed K.
test_param_grid = {
    1: {'K': [50, 100, 200, 500, 1000], 'epsilon': 0.01},
    2: {'K': 1000, 'epsilon': [0.01, 0.05, 0.1, 1.0]},
}

# Per-dataset metadata keyed by dataset name; iteration order is the
# processing order of the later cells.
datasets = {
    'MNIST': {'shape': (28, 28, 1)},
    'CIFAR10': {'shape': (32, 32, 3)},
    'CIFAR100': {'shape': (32, 32, 3)},
}

In [5]:
# Seed forwarded to partition() as SEED.
seed = 8008
# Directory the partition .tsv files are read from.
partition_dir = interim

## Partitions

In [None]:
# Keras dataset module for each key of `datasets`; each exposes load_data(),
# which returns ((x_train, y_train), (x_test, y_test)).
dataset_loaders = {
    'MNIST' : mnist,
    'CIFAR10' : cifar10,
    'CIFAR100' : cifar100,
}

# For every dataset and every cluster count in grid 1, partition the flattened
# training images and write the result to <interim>/<dataset><K>_partitions.tsv.
for key, v in datasets.items():
    # `datasets` as defined in this notebook only carries 'shape', so indexing
    # v['data'] unconditionally raises KeyError on a fresh kernel. Use data that
    # has already been attached under v['data'] if present, otherwise load the
    # training images from Keras. Only x_train is needed by this cell.
    if 'data' in v:
        x_train = v['data']['x_train']
    else:
        (x_train, _), _ = dataset_loaders[key].load_data()

    for k in tqdm(test_param_grid[1]['K']):
        print("Generating Partitions for {} dataset with {} clusters".format(key, k))
        x_vecs = flatten(x_train)
        # The partitions are persisted through write_path; the return value is
        # not kept so large arrays do not linger in the kernel namespace.
        partition(x_vecs, k, SEED=seed, write_path=pathlib.PurePath(interim, key + str(k) + '_partitions.tsv'))

## Mu & Sigma

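This section assumes the partition files (`<dataset><K>_partitions.tsv`) already exist in `partition_dir`; run the Partitions section first if they do not.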

In [8]:
# For every dataset, load each saved partition file, compute per-cluster
# Gaussian parameters, and pickle a {k: (mu, sigma)} dict per dataset.
for key in datasets:

    # Maps cluster count k -> (mu, sigma) for the current dataset.
    storage = {}

    for k in tqdm(test_param_grid[1]['K']):
        partition_path = Path(partition_dir, key + str(k) + '_partitions.tsv')
        if not partition_path.exists():
            # Without this guard the code below would run with x / y undefined
            # (NameError) or, worse, with stale values left by an earlier cell.
            print("No partition file at {}; skipping {} clusters for {} dataset".format(partition_path, k, key))
            continue

        print("Loading Partitions for {} dataset with {} clusters".format(key, k))
        x = []
        y = []
        # Each non-blank line is whitespace-separated: float features followed
        # by an integer label in the last column.
        with open(partition_path) as f:
            for line in f:
                tokens = line.split()
                if not tokens:
                    # Tolerate blank / trailing empty lines.
                    continue
                x.append(np.array([float(t) for t in tokens[:-1]]))
                y.append(int(tokens[-1]))

        print("Generating Mean & Variance for {} dataset with {} clusters".format(key, k))
        members = groupClusters(x, y)
        mu, sigma = computeMultivariateGaussianParameters(members)

        storage[k] = (mu, sigma)

        # Drop references to the large intermediates before the next iteration
        # so their memory can be reclaimed (mu / sigma stay alive via storage).
        del x
        del y
        del members
        del mu
        del sigma

    print("Saving Mean & Variance for {} dataset".format(key))
    with open(pathlib.PurePath(interim, "meanvar{}.pkl".format(key)), 'wb') as f:
        pickle.dump(storage, f)

  0%|          | 0/5 [00:00<?, ?it/s]

Loading Partitions for MNIST dataset with 50 clusters
Generating Mean & Variance for MNIST dataset with 50 clusters


 20%|██        | 1/5 [00:28<01:54, 28.60s/it]

Loading Partitions for MNIST dataset with 100 clusters
Generating Mean & Variance for MNIST dataset with 100 clusters


 40%|████      | 2/5 [00:57<01:26, 28.67s/it]

Loading Partitions for MNIST dataset with 200 clusters
Generating Mean & Variance for MNIST dataset with 200 clusters


 60%|██████    | 3/5 [01:24<00:56, 28.10s/it]

Loading Partitions for MNIST dataset with 500 clusters
Generating Mean & Variance for MNIST dataset with 500 clusters


 80%|████████  | 4/5 [01:53<00:28, 28.29s/it]

Loading Partitions for MNIST dataset with 1000 clusters
Generating Mean & Variance for MNIST dataset with 1000 clusters


100%|██████████| 5/5 [02:26<00:00, 29.24s/it]


Saving Mean & Variance for MNIST dataset


  0%|          | 0/5 [00:00<?, ?it/s]

Loading Partitions for CIFAR10 dataset with 50 clusters
Generating Mean & Variance for CIFAR10 dataset with 50 clusters


 20%|██        | 1/5 [02:12<08:48, 132.18s/it]

Loading Partitions for CIFAR10 dataset with 100 clusters
Generating Mean & Variance for CIFAR10 dataset with 100 clusters


 40%|████      | 2/5 [04:53<07:28, 149.54s/it]

Loading Partitions for CIFAR10 dataset with 200 clusters
Generating Mean & Variance for CIFAR10 dataset with 200 clusters


 60%|██████    | 3/5 [07:42<05:16, 158.26s/it]

Loading Partitions for CIFAR10 dataset with 500 clusters
Generating Mean & Variance for CIFAR10 dataset with 500 clusters


 80%|████████  | 4/5 [10:20<02:38, 158.30s/it]

Loading Partitions for CIFAR10 dataset with 1000 clusters
Generating Mean & Variance for CIFAR10 dataset with 1000 clusters


100%|██████████| 5/5 [13:03<00:00, 156.76s/it]


Saving Mean & Variance for CIFAR10 dataset


  0%|          | 0/5 [00:00<?, ?it/s]

Loading Partitions for CIFAR100 dataset with 50 clusters
Generating Mean & Variance for CIFAR100 dataset with 50 clusters


 20%|██        | 1/5 [02:42<10:51, 162.82s/it]

Loading Partitions for CIFAR100 dataset with 100 clusters
Generating Mean & Variance for CIFAR100 dataset with 100 clusters


 40%|████      | 2/5 [06:00<09:09, 183.16s/it]

Loading Partitions for CIFAR100 dataset with 200 clusters
Generating Mean & Variance for CIFAR100 dataset with 200 clusters


 60%|██████    | 3/5 [08:43<05:48, 174.15s/it]

Loading Partitions for CIFAR100 dataset with 500 clusters
Generating Mean & Variance for CIFAR100 dataset with 500 clusters


 80%|████████  | 4/5 [11:12<02:44, 164.16s/it]

Loading Partitions for CIFAR100 dataset with 1000 clusters
Generating Mean & Variance for CIFAR100 dataset with 1000 clusters


100%|██████████| 5/5 [13:29<00:00, 161.99s/it]


Saving Mean & Variance for CIFAR100 dataset
