In [13]:
import numpy as np
import os
import constants
import pickle

In [219]:
def normalize_data(inp):
    """
    TODO
    Normalizes inputs (on per channel basis of every image) here to have 0 mean and unit variance.
    This will require reshaping to seprate the channels and then undoing it while returning

    args:
        inp : N X d 2D array where N is the number of examples and d is the number of dimensions

    returns:
        normalized inp: N X d 2D array

    """

    # N X (32 * 32 * 3) to N X 32 * 32 X 3
    d = int(inp.shape[1] / 3) # only works for square images
    N = inp.shape[0]
    per_channel = inp.reshape((N, d, 3))
    

    # normalize per channel per image
    mu_per_channel_per_image = np.mean(per_channel, axis=1)
    std_per_channel_per_image = np.std(per_channel, axis=1)
    
  
    mu_2d = np.column_stack([np.tile(mu_per_channel_per_image[:, i].reshape((N, 1)), d) for i in range(3)])
    std_2d = np.column_stack([np.tile(std_per_channel_per_image[:, i].reshape((N, 1)), d) for i in range(3)])


    normalized = (inp - mu_2d) / std_2d
    

    return normalized

In [220]:
t_x, t_y, v_x, v_y, ts_x, ts_y = load_data('./data/')

size of train (40000, 3072)


In [217]:
v_y.shape

(10000, 10)

In [186]:

def one_hot_encoding(labels, num_classes=10):
    """
    TODO
    Encodes labels using one hot encoding.

    args:
        labels : N dimensional 1D array where N is the number of examples
        num_classes: Number of distinct labels that we have (10 for CIFAR-10)

    returns:
        oneHot : N X num_classes 2D array

    """
    
    n = labels.size
    k = num_classes

    matrix = np.zeros((n, k))
    
    # for each row, change the value specified at index y to 1
    matrix[np.arange(n), labels] = 1

    return matrix


In [187]:
def createTrainValSplit(x_train, y_train):

    """
    TODO
    Creates the train-validation split (80-20 split for train-val). Please shuffle the data before creating the train-val split.
    """

    # x_train is N X d
    # y_train is N X 1

    N = x_train.shape[0]
    
    # combine then shuffle
    combined = np.column_stack((x_train, y_train))
    np.random.shuffle(combined) # shuffles in place

    
    train_prop = np.floor(N*0.8).astype(int)

    x_train_sh = combined[:train_prop, :-1]
    y_train_sh = combined[:train_prop, -1]

    x_valid_sh = combined[train_prop:, :-1]
    y_valid_sh = combined[train_prop:, -1]

    return x_train_sh, y_train_sh, x_valid_sh, y_valid_sh

In [204]:
def load_data(path):
    """
    Loads, splits our dataset- CIFAR-10 into train, val and test sets and normalizes them

    args:
        path: Path to cifar-10 dataset
    returns:
        train_normalized_images, train_one_hot_labels, val_normalized_images, val_one_hot_labels,  test_normalized_images, test_one_hot_labels

    """
    def unpickle(file):
        with open(file, 'rb') as fo:
            dict = pickle.load(fo, encoding='bytes')
        return dict

    cifar_path = os.path.join(path, constants.cifar10_directory)

    train_images = []
    train_labels = []
    val_images = []
    val_labels = []
    for i in range(1,constants.cifar10_trainBatchFiles+1):
        images_dict = unpickle(os.path.join(cifar_path, f"data_batch_{i}"))
        data = images_dict[b'data']
        label = images_dict[b'labels']
        train_labels.extend(label)
        train_images.extend(data)
    train_images = np.array(train_images)
    train_labels = np.array(train_labels).reshape((len(train_labels),-1))
    train_images, train_labels, val_images, val_labels = createTrainValSplit(train_images,train_labels)
    
    print('size of train', train_images.shape)

    train_normalized_images = normalize_data(train_images)
    train_one_hot_labels = one_hot_encoding(train_labels)

    val_normalized_images = normalize_data(val_images)
    val_one_hot_labels = one_hot_encoding(val_labels)

    test_images_dict = unpickle(os.path.join(cifar_path, f"test_batch"))
    test_data = test_images_dict[b'data']
    test_labels = test_images_dict[b'labels']
    test_images = np.array(test_data)
    test_labels = np.array(test_labels).reshape((len(test_labels),-1))
    test_normalized_images = normalize_data(test_images)
    test_one_hot_labels = one_hot_encoding(test_labels)
    
    return train_normalized_images, train_one_hot_labels, val_normalized_images, val_one_hot_labels,  test_normalized_images, test_one_hot_labels


In [205]:
t_x, t_y, v_x, v_y, t_x, t_y = load_data('./data/')

size of train (40000, 3072)
now normalizing (40000, 3072)
reshaping into (40000, 1024, 3)
per channel (40000, 3)
mean and std shape (40000, 3072)
normalized shape (40000, 3072)
now normalizing (10000, 3072)
reshaping into (10000, 1024, 3)
per channel (10000, 3)
mean and std shape (10000, 3072)
normalized shape (10000, 3072)
now normalizing (10000, 3072)
reshaping into (10000, 1024, 3)
per channel (10000, 3)
mean and std shape (10000, 3072)
normalized shape (10000, 3072)


In [206]:
t_x.shape

(10000, 3072)

In [160]:
t_x.shape

(10000, 3072)

In [125]:
out = normalize_data(test)

50000 1024


In [126]:
out.shape

(50000, 3072)

In [152]:
one_hot_encoding(np.arange(2))

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [133]:
np.random.shuffle(a)

In [134]:
a

array([[ 6,  7,  8],
       [ 3,  4,  5],
       [ 9, 10, 11],
       [ 0,  1,  2]])

In [137]:
np.floor(N*0.8).astype(int)

40000