# Verification of Thesis Results

## Imports 

In [1]:
import sys
import os
import logging
import pathlib
from pathlib import Path

import sklearn
import pandas as pd
import numpy as np
from tqdm import tqdm

from tensorflow import keras
from src.models.baseline.kmeans import runKmeans
from tensorflow.keras import layers
from tensorflow.keras.datasets import cifar100, cifar10, mnist

from src.models.baseline.helper import *

## Required Paths

In [2]:
cwd = Path(os.getcwd())
root = cwd.parent.parent
data = pathlib.PurePath(root, 'data')
history = pathlib.PurePath(data, 'history')
interim = pathlib.PurePath(data, 'interim')
results = pathlib.PurePath(data, 'results', 'thesis')

## Parameters and Datasets

In [3]:
test_param_grid ={
    1 : {
        'K' : [50, 100, 200, 500, 1000],
        'epsilon' : 0.01
    },
    2 : {
        'K' : 1000,
        'epsilon' : [0.01, 0.05, 0.1, 1.0]
    }
}

model_param_grid = {
    'MNIST' : {
        'batch_size' : 1000,
        'epochs' : 15,
        'save_history' : False,
        'path' : history
    },
    'CIFAR10' : {
        'batch_size' : 64,
        'epochs' : 30,
        'save_history' : False,
        'path' : history
    },
    'CIFAR100' : {
        'batch_size' : 64,
        'epochs' : 30,
        'save_history' : False,
        'path' : history
    }
}

In [4]:
datasets = {
    'MNIST' : {
        'data' : dataset_normalize(mnist.load_data()),
        'shape' : (28, 28, 1)
        },
    'CIFAR10' : {
        'data' : dataset_normalize(cifar10.load_data()),
        'shape' : (32, 32, 3) 
    },
    'CIFAR100' : {
        'data' : dataset_normalize(cifar100.load_data()),
        'shape' : (32, 32, 3) 
    }
}

In [5]:
x_train = datasets['MNIST']['data']['x_train']

In [6]:
results1 = pd.DataFrame()

In [7]:
partition_dir = interim
seed = 8008

## Strategy 1 (Variable K)

In [8]:
for key, v in datasets.items():
    x_train = v['data']['x_train']
    x_test = v['data']['x_test']
    y_train = v['data']['y_train']
    y_test = v['data']['y_test']

    print("Running Test 1 on {}...".format(key))

    for k in tqdm(test_param_grid[1]['K']):
        dir = pathlib.PurePath(partition_dir, key + str(k) + '_partitions.tsv')
        if Path(dir.as_posix()).exists():
            print("Loading Partitions for {} dataset with {} clusters".format(key, k))
            with open(dir) as f:
                lines = f.readlines()
            lines = [line.rstrip() for line in lines]
            x = []
            y = []
            for line in lines:
                tokens = line.split()
                x_vec = np.zeros(len(tokens)-1)
                for i in range(len(tokens)-1):
                    x_vec[i] = float(tokens[i])

                x.append(x_vec)
                y.append(int(tokens[-1]))
        else:
            print("Generating Partitions for {} dataset with {} clusters".format(key, k))
            x_vecs = flatten(x_train)
            x, y = partition(x_vecs, k, SEED=seed, write_path=pathlib.PurePath(interim, key + str(k) + '_partitions.tsv'))

        kmeans = runKmeans(k,  (x_train, x_test), (y_train, y_test), v['shape'], model_param_grid[key])
        results = runTest(k, test_param_grid[1]['epsilon'], (x_train, x_test), (y_train, y_test), (x, y), v['shape'], model_param_grid[key], (True, True, True))
        
        sets = [kmeans, results['gaussian'], results['epsilon'], results['complete']]
        for set in sets:
            set['dataset'] = key
            set['K'] = k
            results1.append(set, ignore_index=True)

    print("Test 1 Completed Successfully")

Running Test 1 on MNIST...


  0%|          | 0/5 [00:00<?, ?it/s]

Loading Partitions for MNIST dataset with 50 clusters
Epoch 1/15
1/1 - 1s - loss: 2.3030 - accuracy: 0.1250 - val_loss: 2.2816 - val_accuracy: 0.1000
Epoch 2/15
1/1 - 0s - loss: 2.2684 - accuracy: 0.2000 - val_loss: 2.2509 - val_accuracy: 0.1000
Epoch 3/15
1/1 - 0s - loss: 2.2312 - accuracy: 0.1750 - val_loss: 2.2121 - val_accuracy: 0.1000
Epoch 4/15
1/1 - 0s - loss: 2.1995 - accuracy: 0.1250 - val_loss: 2.1804 - val_accuracy: 0.1000
Epoch 5/15
1/1 - 0s - loss: 2.1311 - accuracy: 0.2250 - val_loss: 2.1232 - val_accuracy: 0.3000
Epoch 6/15
1/1 - 0s - loss: 2.0592 - accuracy: 0.3000 - val_loss: 2.0466 - val_accuracy: 0.5000
Epoch 7/15
1/1 - 0s - loss: 1.9116 - accuracy: 0.4250 - val_loss: 1.9548 - val_accuracy: 0.5000
Epoch 8/15
1/1 - 0s - loss: 1.8705 - accuracy: 0.4750 - val_loss: 1.8274 - val_accuracy: 0.5000
Epoch 9/15
1/1 - 0s - loss: 1.5236 - accuracy: 0.5500 - val_loss: 1.6276 - val_accuracy: 0.8000
Epoch 10/15
1/1 - 0s - loss: 1.3654 - accuracy: 0.5500 - val_loss: 1.5321 - val_ac

  results1.append(set)
  0%|          | 0/5 [31:35<?, ?it/s]


TypeError: Can only append a dict if ignore_index=True

## Strategy 2 (Variable $\epsilon$)

In [None]:
results2 = pd.DataFrame()

In [None]:
print("Running Test 2 on {}...".format(key))

for e in tqdm(test_param_grid[2]['epsilon']): # TODO: Make function run test 1 or test 2
    results = runTest(test_param_grid[2]['K'], e, (x_train, x_test), (y_train, y_test), (x, y), v['shape'], model_param_grid[key], (False, True, False))
    results = results['epsilon']
    results['dataset'] = key
    results['Epsilon'] = e

    results2.append(results)

print("Test 2 Completed Successfully")

In [None]:
print("Saving Results...")

results1.to_csv(pathlib.PurePath(results, 'test1.csv'))
results2.to_csv(pathlib.PurePath(results, 'test2.csv'))