# Notebook setup

In [2]:
%load_ext autoreload
%autoreload 2

# Importing relevant things

In [4]:
import numpy as np

# Loading the dataset

In [19]:
# Open the pre-split archive once; each array is then looked up by its key
# and its shape is echoed so the split sizes can be checked at a glance.
splitted_dataset = np.load('splitted_cifar10_dataset.npz')

# --- training split ---
x_train = splitted_dataset['x_train']
print("x_train shape :", x_train.shape, sep="")

y_train = splitted_dataset['y_train']
print("y_train shape :", y_train.shape, sep="")

# --- validation split ---
x_val = splitted_dataset['x_val']
print("x_val shape :", x_val.shape, sep="")

y_val = splitted_dataset['y_val']
print("y_val shape :", y_val.shape, sep="")

# --- test split ---
x_test = splitted_dataset['x_test']
print("x_test shape :", x_test.shape, sep="")

y_test = splitted_dataset['y_test']
print("y_test shape: ", y_test.shape, sep="")


x_train shape :(40000, 32, 32, 3)
y_train shape :(40000, 1)
x_val shape :(10000, 32, 32, 3)
y_val shape :(10000, 1)
x_test shape :(10000, 32, 32, 3)
y_test shape: (10000, 1)


# Define the class names

In [20]:
# Human-readable names of the ten classes.
# NOTE(review): presumably indexed by the integer label stored in y_* — confirm
# against the dataset before relying on the ordering.
class_names = [
    "airplane",
    "automobile",
    "bird",
    "cat",
    "deer",
    "dog",
    "frog",
    "horse",
    "ship",
    "truck",
]


# Creating a custom dataset loader to calculate UCC and RCC (TODO)

In [None]:
class Dataset:
    """Turn image/label splits into per-batch "bags" with count-based targets.

    Every batch yielded by a loader is treated as one bag. Two targets can be
    derived from the labels of a bag:

    * UCC - the number of distinct labels in the bag, one-hot encoded
      (see ``get_ucc_from_labels_of_batch``);
    * RCC - how often each class id occurs in the bag
      (see ``get_rcc_from_labels_of_batch``).
    """

    def __init__(self, x_train, y_train, x_val, y_val, x_test, y_test,
                 batch_size=16, num_classes=10):
        '''
        Wrap the three splits in tensors, ``TensorDataset``s and loaders.

        Note these are numpy arrays; ``torch.from_numpy`` is used, so the
        resulting tensors share memory with the arrays passed in.

        :param x_train: training images
        :param y_train: training labels (integer class ids, not one-hot)
        :param x_val: validation images
        :param y_val: validation labels
        :param x_test: test images
        :param y_test: test labels
        :param batch_size: number of samples per batch, i.e. per bag
        :param num_classes: number of distinct class ids; also the length of
            the UCC one-hot vector and of the RCC vector
        '''
        self.num_classes = num_classes
        self.batch_size = batch_size

        # Convert everything to tensors; labels are NOT one-hot encoded here.
        self.x_train = torch.from_numpy(x_train)
        self.y_train = torch.from_numpy(y_train)
        self.x_val = torch.from_numpy(x_val)
        self.y_val = torch.from_numpy(y_val)
        self.x_test = torch.from_numpy(x_test)
        self.y_test = torch.from_numpy(y_test)

        # Pair images with labels for each split.
        self.train_dataset = TensorDataset(self.x_train, self.y_train)
        self.val_dataset = TensorDataset(self.x_val, self.y_val)
        self.test_dataset = TensorDataset(self.x_test, self.y_test)

        # Loaders used to walk each split batch by batch when building the
        # final bag-level datasets.
        self.train_loader = DeviceDataLoader(self.train_dataset, batch_size=self.batch_size)
        self.val_loader = DeviceDataLoader(self.val_dataset, batch_size=self.batch_size)
        self.test_loader = DeviceDataLoader(self.test_dataset, batch_size=self.batch_size)

    # get UCC
    def construct_datasets_with_ucc(self):
        """Return ``(train, val, test)`` bag datasets carrying the UCC target."""
        train_dataset_with_ucc = self.construct_dataset_with_ucc(self.train_loader)
        val_dataset_with_ucc = self.construct_dataset_with_ucc(self.val_loader)
        test_dataset_with_ucc = self.construct_dataset_with_ucc(self.test_loader)

        return train_dataset_with_ucc, val_dataset_with_ucc, test_dataset_with_ucc

    def construct_dataset_with_ucc(self, dataloader):
        """Build a ``TensorDataset`` of ``(bag_images, ucc)`` from ``dataloader``.

        :param dataloader: iterable yielding ``(images, labels)`` batches
        :return: ``TensorDataset`` with one entry per (full-size) batch
        """
        return self._collect_bags(tqdm(dataloader), include_rcc=False)

    def construct_datasets_with_ucc_and_rcc(self):
        """Return ``(train, val, test)`` bag datasets carrying UCC and RCC targets."""
        train_dataset_with_ucc_and_rcc = self.construct_dataset_with_ucc_and_rcc(self.train_loader)
        val_dataset_with_ucc_and_rcc = self.construct_dataset_with_ucc_and_rcc(self.val_loader)
        test_dataset_with_ucc_and_rcc = self.construct_dataset_with_ucc_and_rcc(self.test_loader)

        return train_dataset_with_ucc_and_rcc, val_dataset_with_ucc_and_rcc, test_dataset_with_ucc_and_rcc

    # get both UCC and RCC
    def construct_dataset_with_ucc_and_rcc(self, dataloader):
        """Build a ``TensorDataset`` of ``(bag_images, ucc, rcc)`` from ``dataloader``.

        :param dataloader: iterable yielding ``(images, labels)`` batches
        :return: ``TensorDataset`` with one entry per (full-size) batch
        """
        return self._collect_bags(tqdm(dataloader), include_rcc=True)

    def _collect_bags(self, batches, include_rcc):
        """Stack batches into bag tensors together with their targets.

        The size of the first batch defines the bag size. ``torch.stack``
        needs equally sized tensors, so any batch of a different size
        (typically a short final batch when the split size is not a multiple
        of ``batch_size``) is skipped and a warning is emitted.

        :param batches: iterable yielding ``(images, labels)`` pairs
        :param include_rcc: when true, also compute the RCC target per bag
        :return: ``TensorDataset(images, ucc)`` or
            ``TensorDataset(images, ucc, rcc)``
        :raises RuntimeError: from ``torch.stack`` if ``batches`` is empty
        """
        import warnings

        image_tensors = []
        ucc_tensors = []
        rcc_tensors = []
        bag_size = None
        skipped = 0

        for images, labels in batches:
            if bag_size is None:
                bag_size = len(images)
            elif len(images) != bag_size:
                # Cannot be stacked with the other bags; leave it out.
                skipped += 1
                continue

            image_tensors.append(images)
            ucc_tensors.append(self.get_ucc_from_labels_of_batch(labels))
            if include_rcc:
                rcc_tensors.append(self.get_rcc_from_labels_of_batch(labels))

        if skipped:
            warnings.warn(
                f"skipped {skipped} batch(es) whose size differs from the bag size {bag_size}",
                stacklevel=3,
            )

        columns = [image_tensors, ucc_tensors]
        if include_rcc:
            columns.append(rcc_tensors)
        return TensorDataset(*(torch.stack(column) for column in columns))

    def get_ucc_from_labels_of_batch(self, labels):
        """Return the one-hot encoded number of distinct labels in ``labels``.

        :param labels: tensor of class ids for one batch
        :return: float tensor of length ``num_classes``; slot ``k - 1`` is 1
            when the batch contains ``k`` distinct labels
        :raises IndexError: if the distinct-label count is outside
            ``[1, num_classes]`` (e.g. an empty batch)
        """
        unique_count = torch.unique(labels).size(0)
        return self.one_hot(unique_count)

    def get_rcc_from_labels_of_batch(self, labels):
        """Count how many times each class id occurs in ``labels``.

        :param labels: tensor of class ids for one batch (any shape)
        :return: int32 tensor of length ``num_classes``; entry ``i`` is the
            number of elements equal to ``i``. Values outside
            ``[0, num_classes)`` are not counted.
        """
        # Compare every label against every class id in one broadcasted
        # operation, then sum matches per class.
        flat_labels = labels.reshape(-1, 1)
        class_ids = torch.arange(self.num_classes, device=labels.device)
        counts = (flat_labels == class_ids).sum(dim=0)

        # Allocate the result exactly as before (int32, default device) and
        # copy the counts in; copy_ handles the dtype/device conversion.
        rcc = torch.zeros(self.num_classes, dtype=torch.int32)
        rcc.copy_(counts)
        return rcc

    # util
    def one_hot(self, label):
        """One-hot encode a 1-based ``label`` into a vector of length ``num_classes``.

        :param label: int or integer tensor with values in ``[1, num_classes]``
        :return: float tensor of zeros with a 1 at position ``label - 1``
        :raises IndexError: if any value lies outside ``[1, num_classes]``;
            without this check a value of 0 would wrap around to the last slot
        """
        positions = torch.as_tensor(label, dtype=torch.long)
        if ((positions < 1) | (positions > self.num_classes)).any():
            raise IndexError(
                f"label {label} is outside the valid range [1, {self.num_classes}]"
            )

        encoded = torch.zeros(self.num_classes)
        # Labels are in [1, num_classes]; shift to zero-based positions.
        encoded[positions - 1] = 1
        return encoded



# Things left to do

1. Set up the model architecture for both approaches (#1 with only ac + ucc, #2 with ac + ucc + rcc)
2. Set up the training module code
3. Find out how to use JS divergence and KDE
4. Write the clustering code
5. Train it