## Exercise 1

*In this exercise, we use the EMNIST dataset to create a balanced dataset of uppercase letters.*

In [1]:
import emnist
import numpy as np
import matplotlib.pyplot as plt
import pickle


In [2]:
def filter_dataset(img, label, x, y):
    """
    Keeps the samples whose label lies in the interval [x, y] and re-bases the
    kept labels so that label x becomes 0, x + 1 becomes 1, and so on.
    Parameters:
        img (ndarray): 3D array containing images (first axis indexes samples)
        label (ndarray): 1D array containing labels
        x (int): Smallest value of labels to keep
        y (int): Highest value of labels to keep
    Returns:
        img_filtered (ndarray): 3D array containing images which had labels in [x, y]
        label_filtered (ndarray): 1D array containing the kept labels shifted by -x,
            i.e. values in [0, y - x]
    """
    # Boolean mask of the samples whose label falls inside the closed interval [x, y].
    keep = np.logical_and(label >= x, label <= y)
    img_filtered = img[keep]
    # Shift by the lower bound (not a fixed constant) so the smallest kept label
    # maps to 0 whatever interval was requested.
    label_filtered = label[keep] - x
    return img_filtered, label_filtered

In [3]:
def find_smallest_class_size(label):
    """
    Given the labels of the dataset, returns the size of the least populated
    class together with the label of that class.
    Parameters:
        label (ndarray): 1D array containing labels
    Returns:
        (int): size of the class which has the least samples
        (scalar): label of that class (the smallest such label on ties)
    Raises:
        ValueError: if label contains no samples
    """
    # np.unique counts every distinct label exactly once. Passing the unique
    # values to np.histogram as `bins` would treat them as bin *edges*, which
    # merges the last two classes into a single bin.
    classes, counts = np.unique(label, return_counts=True)
    if classes.size == 0:
        raise ValueError("label must contain at least one sample")
    smallest = counts.argmin()
    return counts[smallest], classes[smallest]

In [4]:
def resize_dataset(img, label, N):
    """
    Builds a class-balanced train/test split that keeps N samples per class.
    For every distinct label, the first N - int(N / 6) samples (in input order)
    go to the training set and the last int(N / 6) samples go to the test set.
    Parameters:
        img (ndarray): 3D array containing images (first axis indexes samples)
        label (ndarray): 1D array containing labels
        N (int): total number of samples to keep from a class
    Returns:
        img_train (ndarray): 3D array containing training images
        lbl_train (ndarray): 1D array containing training labels
        img_test (ndarray): 3D array containing test images
        lbl_test (ndarray): 1D array containing test labels
    Raises:
        ValueError: if N is negative or a class has fewer than N samples
    """
    img = np.asarray(img)
    label = np.asarray(label)
    if N < 0:
        raise ValueError(f"N must be non-negative, got {N}")

    test_size = int(N / 6)
    training_size = N - test_size

    # Collect per-class pieces and concatenate once at the end; stacking onto a
    # growing array inside the loop copies all accumulated data every iteration.
    train_imgs, train_lbls, test_imgs, test_lbls = [], [], [], []
    for l in np.unique(label):
        class_img = img[label == l]
        available = len(class_img)
        if available < N:
            # With fewer than N samples the slices below would come up short (and
            # overlap) while the label arrays would still have the full length.
            raise ValueError(
                f"class {l} has only {available} samples, cannot keep {N}"
            )
        train_imgs.append(class_img[:training_size])
        # An explicit start index is used because class_img[-0:] would select the
        # whole class when test_size is 0.
        test_imgs.append(class_img[available - test_size:])
        train_lbls.append(np.full(training_size, l, dtype=label.dtype))
        test_lbls.append(np.full(test_size, l, dtype=label.dtype))

    if not train_imgs:
        # No classes at all: return correctly shaped empty arrays.
        empty_img = np.empty((0,) + img.shape[1:], dtype=img.dtype)
        empty_lbl = np.empty(0, dtype=label.dtype)
        return empty_img, empty_lbl, empty_img.copy(), empty_lbl.copy()

    img_train = np.concatenate(train_imgs)
    lbl_train = np.concatenate(train_lbls)
    img_test = np.concatenate(test_imgs)
    lbl_test = np.concatenate(test_lbls)
    return img_train, lbl_train, img_test, lbl_test

In [5]:
def shuffle_dataset(img, label, seed=None):
    """
    Shuffles images and labels with one shared random permutation, so every
    image stays paired with its label.
    Parameters:
        img (ndarray): 3D array containing images (first axis indexes samples)
        label (ndarray): 1D array containing labels
        seed (int, optional): seed for a reproducible shuffle; when None (default)
            NumPy's global random state is used
    Returns:
        (ndarray), (ndarray): Shuffled arrays of images and labels
    Raises:
        ValueError: if img and label do not contain the same number of samples
    """
    n_samples = img.shape[0]
    if label.shape[0] != n_samples:
        # A longer label array would otherwise be silently truncated.
        raise ValueError(
            f"img has {n_samples} samples but label has {label.shape[0]}"
        )

    if seed is None:
        shuffler = np.arange(n_samples)
        np.random.shuffle(shuffler)  # shuffles the array in-place, returns None
    else:
        shuffler = np.random.default_rng(seed).permutation(n_samples)
    return img[shuffler], label[shuffler]


In [6]:
if __name__ == '__main__':
    # Load the official 'byclass' train and test splits.
    img_train, label_train = emnist.extract_training_samples('byclass')
    img_test, label_test = emnist.extract_test_samples('byclass')
    print(f'Byclass - training:{len(label_train)}, test:{len(label_test)}, total:{len(label_train)+len(label_test)}')

    # Pool both splits; a new balanced train/test split is built below.
    img_all = np.append(img_train, img_test, axis=0)
    label_all = np.append(label_train, label_test)

    # Keep only labels 10..35 (the uppercase samples); filter_dataset also
    # re-bases the kept labels.
    img_uc, label_uc = filter_dataset(img_all, label_all, 10, 35)
    print(f'Initially, {len(label_uc)}/{len(label_all)} uppercase samples...')

    # Every class is cut down to the size of the least populated one.
    N, label_N = find_smallest_class_size(label_uc)
    print(f'Smallest number of samples {N} from label {label_N}')

    img_train, label_train, img_test, label_test = resize_dataset(img_uc, label_uc, N)
    print(f'Total number of samples after balancing: {len(label_train)+len(label_test)}')
    print(f'Balanced training samples: {len(label_train)}')
    print(f'Balanced test samples: {len(label_test)}')

    # resize_dataset returns the samples grouped by class, so shuffle them.
    img_train, label_train = shuffle_dataset(img_train, label_train)
    img_test, label_test = shuffle_dataset(img_test, label_test)

    # Context managers make sure the files are flushed and closed.
    with open("train", "wb") as train_file:
        pickle.dump((img_train, label_train), train_file)
    with open("test", "wb") as test_file:
        pickle.dump((img_test, label_test), test_file)

Byclass - training:697932, test:116323, total:814255
Initially, 220304/814255 uppercase samples...
Smallest number of samples 2850 from label 10
Total number of samples after balancing: 74100
Balanced training samples: 61750
Balanced test samples: 12350
