# Verification of Thesis Results

## Imports 

In [1]:
import sys
import os
import pathlib
from pathlib import Path

import pandas as pd
import numpy as np
from tqdm import tqdm

from src.models.baseline.kmeans import runKmeans
from tensorflow.keras.datasets import cifar100, cifar10, mnist

from src.models.baseline.helper import *

## Required Paths

In [2]:
# Project layout, resolved relative to the directory the notebook is run from:
# the project root is taken to be two levels above the working directory.
cwd = Path.cwd()
root = cwd.parent.parent

# Data folders are kept as pure (filesystem-agnostic) paths under <root>/data.
data = pathlib.PurePath(root) / 'data'
history = data / 'history'
interim = data / 'interim'
res = data / 'results' / 'thesis'

## Parameters and Datasets

In [3]:
# Experiment grid, keyed by strategy number.
test_param_grid = {
    # Strategy 1: several values of K with a single epsilon.
    1: dict(K=[50, 100, 200, 500, 1000], epsilon=0.01),
    # Strategy 2: a single K with several values of epsilon.
    2: dict(K=1000, epsilon=[0.01, 0.05, 0.1, 1.0]),
}

# Per-dataset training settings. Only batch size and epoch count differ between
# datasets; every entry saves its history under the shared `history` folder.
# Each dataset gets its own dict object, so entries can be edited independently.
model_param_grid = {
    dataset_name: {
        'batch_size': batch_size,
        'epochs': epochs,
        'save_history': True,
        'path': history,
    }
    for dataset_name, batch_size, epochs in (
        ('MNIST', 1000, 15),
        ('CIFAR10', 64, 30),
        ('CIFAR100', 64, 30),
    )
}

In [4]:
# Datasets to evaluate: each entry holds the normalised data returned by
# `dataset_normalize` and the per-sample input shape.
# The MNIST row is disabled; uncomment it to include MNIST in the runs.
# NOTE(review): the saved cell outputs below show MNIST runs, so they were
# presumably produced with the MNIST row enabled -- confirm before relying on them.
datasets = {
    dataset_name: {
        'data': dataset_normalize(loader.load_data()),
        'shape': input_shape,
    }
    for dataset_name, loader, input_shape in (
        # ('MNIST', mnist, (28, 28, 1)),
        ('CIFAR10', cifar10, (32, 32, 3)),
        ('CIFAR100', cifar100, (32, 32, 3)),
    )
}

In [5]:
# Seed handed to `partition` when partitions have to be generated.
seed = 8008
# Folder searched for previously written '<dataset><K>_partitions.tsv' files.
partition_dir = interim

## Strategy 1 (Variable K)

In [6]:
# Strategy 1: for every dataset, sweep the number of clusters K at a fixed
# epsilon and write one CSV of collected results per dataset.

# Create the output folder up front so a long run is not lost to a missing
# directory when the CSV is finally written.
Path(res).mkdir(parents=True, exist_ok=True)

for key, v in datasets.items():

    # One record per (K, method) pair; converted to a DataFrame at the end.
    intermediate = []

    x_train = v['data']['x_train']
    x_test = v['data']['x_test']
    y_train = v['data']['y_train']
    y_test = v['data']['y_test']

    print("Running Test 1 on {}...".format(key))

    for k in tqdm(test_param_grid[1]['K']):
        # The same path is used for the cache lookup and for writing a freshly
        # generated partition, so the two locations cannot drift apart.
        # (Named `partition_path` rather than `dir` to avoid clobbering the builtin.)
        partition_path = pathlib.PurePath(partition_dir, key + str(k) + '_partitions.tsv')
        if Path(partition_path).exists():
            print("Loading Partitions for {} dataset with {} clusters".format(key, k))
            with open(partition_path) as f:
                # Whitespace-separated rows; blank lines (e.g. a trailing
                # newline at end of file) are skipped instead of crashing.
                rows = [tokens for tokens in (line.split() for line in f) if tokens]
            # All tokens but the last are parsed as floats into x; the last
            # token is parsed as an int into y.
            x = [np.array([float(t) for t in tokens[:-1]], dtype=float) for tokens in rows]
            y = [int(tokens[-1]) for tokens in rows]
        else:
            print("Generating Partitions for {} dataset with {} clusters".format(key, k))
            x_vecs = flatten(x_train)
            x, y = partition(x_vecs, k, SEED=seed, write_path=partition_path)

        # NOTE(review): the k-means baseline is run but its result is not
        # written to the CSV. The original cell built a list that included
        # `kmeans` and then immediately overwrote it with the three-entry list
        # below; that effective behaviour is kept here -- confirm whether the
        # baseline should be recorded as well.
        kmeans = runKmeans(k, (x_train, x_test), (y_train, y_test), v['shape'], key, model_param_grid[key])

        results = runTest(k, test_param_grid[1]['epsilon'], (x_train, x_test), (y_train, y_test), (x, y), v['shape'], model_param_grid[key], key, (False, True, True))

        # Tag each method's result with the K it was produced for.
        # (Named `record` rather than `set` to avoid clobbering the builtin.)
        for method in ('gaussian', 'epsilon', 'complete'):
            record = results[method]
            record['K'] = k
            intermediate.append(record)

    metrics = pd.DataFrame(intermediate)
    metrics.to_csv(pathlib.PurePath(res, 'strategy1_{}.csv'.format(key)))
    print("Test 1 Completed Successfully for {}".format(key))

Running Test 1 on MNIST...


  0%|          | 0/5 [00:00<?, ?it/s]

Loading Partitions for MNIST dataset with 50 clusters
Accuracy on K-Means : 0.7265
Accuracy on Gaussian_Neighbourhood : 0.8513
Accuracy on Epsilon_Neighbourhood : 0.8782


 20%|██        | 1/5 [06:10<24:40, 370.11s/it]

Accuracy on Complete_Information : 0.9937
Loading Partitions for MNIST dataset with 100 clusters
Accuracy on K-Means : 0.7632
Accuracy on Gaussian_Neighbourhood : 0.9009
Accuracy on Epsilon_Neighbourhood : 0.9212


 40%|████      | 2/5 [12:12<18:16, 365.62s/it]

Accuracy on Complete_Information : 0.9928
Loading Partitions for MNIST dataset with 200 clusters
Accuracy on K-Means : 0.794
Accuracy on Gaussian_Neighbourhood : 0.9336
Accuracy on Epsilon_Neighbourhood : 0.9402


 60%|██████    | 3/5 [18:35<12:27, 373.72s/it]

Accuracy on Complete_Information : 0.9932
Loading Partitions for MNIST dataset with 500 clusters
Accuracy on K-Means : 0.8401
Accuracy on Gaussian_Neighbourhood : 0.9452
Accuracy on Epsilon_Neighbourhood : 0.9501


 80%|████████  | 4/5 [25:54<06:39, 399.46s/it]

Accuracy on Complete_Information : 0.994
Loading Partitions for MNIST dataset with 1000 clusters
Accuracy on K-Means : 0.8548
Accuracy on Gaussian_Neighbourhood : 0.9626
Accuracy on Epsilon_Neighbourhood : 0.9615


100%|██████████| 5/5 [34:28<00:00, 413.66s/it]

Accuracy on Complete_Information : 0.9936
Test 1 Completed Successfully for MNIST





## Strategy 2 (Variable $\epsilon$)

In [7]:
# Strategy 2: for every dataset, fix the number of clusters K and sweep
# epsilon, writing one CSV of collected results per dataset.

# Create the output folder up front so a long run is not lost to a missing
# directory when the CSV is finally written.
Path(res).mkdir(parents=True, exist_ok=True)

for key, v in datasets.items():

    # One record per epsilon value; converted to a DataFrame at the end.
    intermediate = []

    x_train = v['data']['x_train']
    x_test = v['data']['x_test']
    y_train = v['data']['y_train']
    y_test = v['data']['y_test']

    k = test_param_grid[2]['K']
    # The same path is used for the cache lookup and for writing a freshly
    # generated partition, so the two locations cannot drift apart.
    # (Named `partition_path` rather than `dir` to avoid clobbering the builtin.)
    partition_path = pathlib.PurePath(partition_dir, key + str(k) + '_partitions.tsv')

    if Path(partition_path).exists():
        print("Loading Partitions for {} dataset with {} clusters".format(key, k))
        with open(partition_path) as f:
            # Whitespace-separated rows; blank lines (e.g. a trailing newline
            # at end of file) are skipped instead of crashing.
            rows = [tokens for tokens in (line.split() for line in f) if tokens]
        # All tokens but the last are parsed as floats into x; the last token
        # is parsed as an int into y.
        x = [np.array([float(t) for t in tokens[:-1]], dtype=float) for tokens in rows]
        y = [int(tokens[-1]) for tokens in rows]
    else:
        print("Generating Partitions for {} dataset with {} clusters".format(key, k))
        x_vecs = flatten(x_train)
        x, y = partition(x_vecs, k, SEED=seed, write_path=partition_path)

    print("Running Test 2 on {}...".format(key))

    for e in tqdm(test_param_grid[2]['epsilon']):
        results = runTest(k, e, (x_train, x_test), (y_train, y_test), (x, y), v['shape'], model_param_grid[key], key, (False, True, False))
        # Only the 'epsilon' entry of the returned results is recorded, tagged
        # with the dataset name and the epsilon value used.
        record = results['epsilon']
        record['dataset'] = key
        record['Epsilon'] = e

        intermediate.append(record)

    metrics = pd.DataFrame(intermediate)
    metrics.to_csv(pathlib.PurePath(res, 'strategy2_{}.csv'.format(key)))
    print("Test 2 Completed Successfully for {}".format(key))

Loading Partitions for MNIST dataset with 1000 clusters
Running Test 2 on MNIST...


 25%|██▌       | 1/4 [03:57<11:51, 237.02s/it]

Accuracy on Epsilon_Neighbourhood : 0.9673


 50%|█████     | 2/4 [07:57<07:57, 238.79s/it]

Accuracy on Epsilon_Neighbourhood : 0.9671


 75%|███████▌  | 3/4 [11:53<03:57, 237.88s/it]

Accuracy on Epsilon_Neighbourhood : 0.9631


100%|██████████| 4/4 [15:50<00:00, 237.69s/it]

Accuracy on Epsilon_Neighbourhood : 0.9569
Test 2 Completed Successfully for MNIST



