# CS 6080 - Time Series Taxonomy

In [None]:
import gc
import os

import pandas as pd
import matplotlib as plt
import numpy as np

from pyts.classification import KNeighborsClassifier, BOSSVS

from sklearn.model_selection import StratifiedShuffleSplit

from data_types import Breed, Species

## Load the dataset's metadata and produce training and testing splits of the data

In [None]:
def load_samples_into_ram(sample_names, data_dir):
    """Load every ``<name>.npy`` sample under *data_dir* into one float32 matrix.

    Args:
        sample_names: Sequence of sample file stems (without the ``.npy``
            extension), one per row of the result.
        data_dir: Dataset directory. Must be exactly ``"data/a1"`` (rows of
            ``64*64`` values) or ``"data/a2"`` (rows of ``64*3`` values).

    Returns:
        A ``numpy.ndarray`` of dtype ``float32`` with shape
        ``(len(sample_names), row_width)``; row ``i`` holds the contents of
        ``<data_dir>/<sample_names[i]>.npy``.

    Raises:
        ValueError: If *data_dir* is not one of the supported directories.
    """
    if data_dir == "data/a1":
        row_width = 64 * 64
    elif data_dir == "data/a2":
        row_width = 64 * 3
    else:
        # The previous `assert(False, "...")` asserted a non-empty tuple, which
        # is always truthy, so it never fired and the function later crashed on
        # an unbound buffer. Raise explicitly (also survives `python -O`).
        raise ValueError(f"Must be data/a1, data/a2 (got {data_dir!r})")

    # TRICKY: Preallocate based on how much we will need. Makes memory usage
    # and performance MUCH better than growing a list of arrays.
    prealloc = np.zeros((len(sample_names), row_width), dtype="float32")

    for i, sample_name in enumerate(sample_names):
        if i % 10000 == 0:
            # Periodic progress report; also collect garbage so temporaries
            # from np.load do not pile up during a long load.
            print(f"Loading sample {i}")
            gc.collect()
        sample_path = os.path.join(data_dir, f"{sample_name}.npy")
        prealloc[i, :] = np.load(sample_path)
    gc.collect()
    print(f"Loaded {prealloc.shape[0]} samples")
    return prealloc

def load_dataset(data_dir, n_splits=1, test_size=0.2):
    """Read a dataset's labels and samples and build stratified train/test splits.

    The metadata is read from ``<data_dir>/labels.pkl`` (columns ``img``,
    ``species`` and ``breed`` are used) and the samples named in ``img`` are
    loaded with ``load_samples_into_ram``. Two independent splits are drawn
    from the same splitter: one stratified on ``species`` and one on ``breed``.

    Args:
        data_dir: Directory containing ``labels.pkl`` and the sample files.
        n_splits: Passed to ``StratifiedShuffleSplit``. Every split is
            generated, but only the last one produced is kept.
        test_size: Passed to ``StratifiedShuffleSplit``.

    Returns:
        A dict with the keys ``train_samples_*``, ``train_labels_*``,
        ``test_samples_*`` and ``test_labels_*`` for both ``species`` and
        ``breed``.
    """
    # NOTE(review): read_pickle deserializes arbitrary objects; only point this
    # at trusted dataset directories.
    meta = pd.read_pickle(os.path.join(data_dir, "labels.pkl"))

    samples = load_samples_into_ram(meta["img"].to_numpy(), data_dir)
    species = meta["species"].to_numpy()
    breed = meta["breed"].to_numpy()

    splitter = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size)

    def _last_split(labels):
        # Exhaust the splitter; after the loop the index variables hold the
        # final split, which is the one that gets used.
        for train_idx, test_idx in splitter.split(samples, labels):
            pass
        return (
            samples[train_idx],
            samples[test_idx],
            labels[train_idx],
            labels[test_idx],
        )

    species_train_x, species_test_x, species_train_y, species_test_y = _last_split(species)
    breed_train_x, breed_test_x, breed_train_y, breed_test_y = _last_split(breed)

    return {
        "train_samples_species": species_train_x,
        "train_labels_species": species_train_y,
        "train_samples_breed": breed_train_x,
        "train_labels_breed": breed_train_y,
        "test_samples_species": species_test_x,
        "test_labels_species": species_test_y,
        "test_samples_breed": breed_test_x,
        "test_labels_breed": breed_test_y,
    }

## Load Datasets into RAM and Perform learning on the Species label (Cat, Dog)

In [None]:
# Load the "data/a1" labels and samples into memory and build the species and
# breed train/test splits (see load_dataset for the returned dict keys).
dataset_a1 = load_dataset("data/a1")

In [None]:
# Nearest-neighbour baseline on the species label: fit on the species training
# split, then print the value returned by score() on the species test split.
a1_knn = KNeighborsClassifier(metric="euclidean")
a1_knn.fit(dataset_a1["train_samples_species"], dataset_a1["train_labels_species"])
a1_knn_score = a1_knn.score(dataset_a1["test_samples_species"], dataset_a1["test_labels_species"])
print(a1_knn_score)
# Run a garbage-collection pass once the cell's work is done.
gc.collect()

In [None]:
# BOSSVS classifier (word_size=2, window_size=16) on the species label: fit on
# the species training split, then print the value returned by score() on the
# species test split.
a1_boss = BOSSVS(word_size=2, window_size=16)
a1_boss.fit(dataset_a1["train_samples_species"], dataset_a1["train_labels_species"])
a1_boss_score = a1_boss.score(dataset_a1["test_samples_species"], dataset_a1["test_labels_species"])
print(a1_boss_score)
# Run a garbage-collection pass once the cell's work is done.
gc.collect()

In [None]:
# Drop the reference to the loaded dataset and collect garbage so its arrays
# can be released.
del dataset_a1
gc.collect()

## Load Datasets into RAM and Perform learning on the Breed label (Cat - Russian Blue, Dog - Standard Poodle, etc.)

In [None]:
# Load "data/a1" into memory and build its train/test splits; the breed
# experiments read the "*_breed" entries of the returned dict.
dataset_a1 = load_dataset("data/a1")

In [None]:
# Nearest-neighbour baseline on the breed label: fit on the breed training
# split, then print the value returned by score() on the breed test split.
# Rebinds a1_knn and a1_knn_score.
a1_knn = KNeighborsClassifier(metric="euclidean")
a1_knn.fit(dataset_a1["train_samples_breed"], dataset_a1["train_labels_breed"])
a1_knn_score = a1_knn.score(dataset_a1["test_samples_breed"], dataset_a1["test_labels_breed"])
print(a1_knn_score)
# Run a garbage-collection pass once the cell's work is done.
gc.collect()

In [None]:
# BOSSVS classifier (word_size=2, window_size=16) on the breed label: fit on
# the breed training split, then print the value returned by score() on the
# breed test split. Rebinds a1_boss and a1_boss_score.
a1_boss = BOSSVS(word_size=2, window_size=16)
a1_boss.fit(dataset_a1["train_samples_breed"], dataset_a1["train_labels_breed"])
a1_boss_score = a1_boss.score(dataset_a1["test_samples_breed"], dataset_a1["test_labels_breed"])
print(a1_boss_score)
# Run a garbage-collection pass once the cell's work is done.
gc.collect()

## Report metrics and create plots (Accuracy, Precision, Recall, ROC)