In [None]:
# Widen the notebook page container to 80% of the browser width.
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
import time
import importlib
import pandas as pd
import numpy as np
import pickle
# Alternative import (disabled); the module actually bound to `cnn` is imported on the next line.
#import cnn_p2 as cnn  
import cnn_utils_solutions as cnn
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
# Re-import the module bound to `cnn` so that edits to its source file are
# picked up when this cell is re-run, without restarting the kernel.
importlib.reload(cnn)

# Part 1

### You will implement the following CNN network from scratch
- The following **optional** code is from the [CNN tutorial at PyTorch](https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html).  It illustrates how the network is specified in PyTorch, and counts the number of parameters.  You will have to install PyTorch for it to work.

In [None]:
#
# This is OPTIONAL code.  It will not run unless
# PyTorch is installed.
#

import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """Reference CNN from the PyTorch CIFAR-10 tutorial.

    Two convolution + max-pool stages followed by three fully connected
    layers that produce 10 output scores per sample.
    """

    def __init__(self):
        super().__init__()
        # Convolutional stages (5x5 kernels); one pooling layer is shared by both.
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        # Fully connected head operating on the flattened 16 * 5 * 5 features.
        self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Run the network on ``x`` and return the output of the last linear layer."""
        stage1 = self.pool(F.relu(self.conv1(x)))
        stage2 = self.pool(F.relu(self.conv2(stage1)))
        # Collapse everything except the batch dimension into 16 * 5 * 5 features.
        flat = stage2.view(-1, 16 * 5 * 5)
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)
    
# Instantiate the optional PyTorch reference network so its parameters can be inspected.
net = Net()

def count_parameters(model):
    """Print the element count of each trainable parameter of ``model``.

    Parameters whose ``requires_grad`` flag is false are skipped. A final
    line with the grand total is printed, and that total is returned.
    """
    print("The number of trainable params in each layer:")
    sizes = [
        (layer_name, tensor.numel())
        for layer_name, tensor in model.named_parameters()
        if tensor.requires_grad
    ]
    for layer_name, size in sizes:
        print('  ', layer_name, f'{size:,d}')
    grand_total = sum(size for _, size in sizes)
    print(f"Total trainable params: {grand_total:,d}")
    return grand_total
    
# Print the per-parameter and total trainable parameter counts of the reference network.
count_parameters(net)

### Below are test cases to help you debug the Conv, MaxPool, and Flatten operations
- For HW5, you will also rely on other Operation Classes (VDot, Softmax, Log, etc) that you have implemented in HW4.

In [None]:
class FailTestError(Exception):
    """Raised by the test cells when a computed value or gradient differs from the expected one."""

# Shared fixtures for the test cells; each wraps an array of consecutive
# integers (np.arange) in a cnn.InputValue node.
# Input array of shape (4, 4, 2) — presumably (height, width, channels); confirm against cnn.Conv.
input_tensor = cnn.InputValue(np.arange(4*4*2).reshape((4,4,2)))
# Kernel array of shape (3, 3, 2, 4).
conv1 = cnn.InputValue(np.arange(3*3*2*4).reshape((3,3,2,4)))
# Length-4 vector that the tests add to the Conv output.
bias1 = cnn.InputValue(np.arange(4))
# Length-16 vector that the Flatten test multiplies with the flattened output.
v1 = cnn.InputValue(np.arange(16))

In [None]:
# Test Conv
# Build the graph z = Conv(input_tensor + input_tensor, conv1) + bias1.
# NOTE(review): the last two Conv arguments appear to be (stride, padding),
# judging by the messages printed in the Conv test cells — confirm against cnn.Conv.
x = cnn.Add(input_tensor, input_tensor)
y = cnn.Conv(x, conv1, 1, 0)
z = cnn.Add(y, bias1)
# Forward pass, from the inputs towards the output.
for component in [x,y,z]:
    component.forward()
# Seed the output gradient with ones and reset every other gradient before
# running the backward pass in reverse order.
z.grad = np.ones_like(z.value)
for component in [x, y, conv1, bias1]:
    component.grad = 0
for component in [z,y,x]:
    component.backward()

# Expected Conv output and expected gradients w.r.t. the kernel and the Conv input.
yvalue = np.array([[[17880., 18258., 18636., 19014.],
                    [20328., 20778., 21228., 21678.]],

                   [[27672., 28338., 29004., 29670.],
                    [30120., 30858., 31596., 32334.]]])
y_kernel_grad = np.array([[[[ 40.,  40.,  40.,  40.],
                             [ 48.,  48.,  48.,  48.]],

                            [[ 56.,  56.,  56.,  56.],
                             [ 64.,  64.,  64.,  64.]],

                            [[ 72.,  72.,  72.,  72.],
                             [ 80.,  80.,  80.,  80.]]],


                           [[[104., 104., 104., 104.],
                             [112., 112., 112., 112.]],

                            [[120., 120., 120., 120.],
                             [128., 128., 128., 128.]],

                            [[136., 136., 136., 136.],
                             [144., 144., 144., 144.]]],


                           [[[168., 168., 168., 168.],
                             [176., 176., 176., 176.]],

                            [[184., 184., 184., 184.],
                             [192., 192., 192., 192.]],

                            [[200., 200., 200., 200.],
                             [208., 208., 208., 208.]]]])
y_inputtensor_grad = np.array([[[  6.,  22.],
                                [ 44.,  76.],
                                [108., 140.],
                                [ 70.,  86.]],

                               [[108., 140.],
                                [280., 344.],
                                [408., 472.],
                                [236., 268.]],

                               [[300., 332.],
                                [664., 728.],
                                [792., 856.],
                                [428., 460.]],

                               [[198., 214.],
                                [428., 460.],
                                [492., 524.],
                                [262., 278.]]])

# Exact comparison against the expected values; any mismatch aborts the cell.
if not np.array_equal(y.value, yvalue):
    raise FailTestError("The output of Conv is incorrect")
if not np.array_equal(y.kernel.grad, y_kernel_grad):
    raise FailTestError("The gradient of kernel in Conv is incorrect")
if not np.array_equal(y.input_tensor.grad, y_inputtensor_grad):
    raise FailTestError("The gradient of input_tensor in Conv is incorrect")
print("Passed test on Conv with default settings")

In [None]:
# For Full Credit: Conv with padding 1
# Same graph as the default Conv test, but the last Conv argument is 1
# instead of 0 (non-zero padding, per the message printed on success).
x = cnn.Add(input_tensor, input_tensor)
y = cnn.Conv(x, conv1, 1, 1)
z = cnn.Add(y, bias1)
# Forward pass, from the inputs towards the output.
for component in [x,y,z]:
    component.forward()
# Seed the output gradient with ones and reset every other gradient before
# running the backward pass in reverse order.
z.grad = np.ones_like(z.value)
for component in [x, y, conv1, bias1]:
    component.grad = 0
for component in [z,y,x]:
    component.backward()

# Expected Conv output and expected gradients w.r.t. the kernel and the Conv input.
yvalue = np.array([[[ 5248.,  5336.,  5424.,  5512.],
                    [ 8608.,  8764.,  8920.,  9076.],
                    [10816., 11020., 11224., 11428.],
                    [ 7232.,  7384.,  7536.,  7688.]],

                   [[11856., 12084., 12312., 12540.],
                    [17880., 18258., 18636., 19014.],
                    [20328., 20778., 21228., 21678.],
                    [12912., 13236., 13560., 13884.]],

                   [[19152., 19572., 19992., 20412.],
                    [27672., 28338., 29004., 29670.],
                    [30120., 30858., 31596., 32334.],
                    [18672., 19188., 19704., 20220.]],

                   [[ 9792., 10136., 10480., 10824.],
                    [13312., 13852., 14392., 14932.],
                    [14368., 14956., 15544., 16132.],
                    [ 8192.,  8600.,  9008.,  9416.]]])
y_kernel_grad = np.array([[[[180., 180., 180., 180.],
                         [198., 198., 198., 198.]],

                        [[264., 264., 264., 264.],
                         [288., 288., 288., 288.]],

                        [[216., 216., 216., 216.],
                         [234., 234., 234., 234.]]],


                       [[[336., 336., 336., 336.],
                         [360., 360., 360., 360.]],

                        [[480., 480., 480., 480.],
                         [512., 512., 512., 512.]],

                        [[384., 384., 384., 384.],
                         [408., 408., 408., 408.]]],


                       [[[324., 324., 324., 324.],
                         [342., 342., 342., 342.]],

                        [[456., 456., 456., 456.],
                         [480., 480., 480., 480.]],

                        [[360., 360., 360., 360.],
                         [378., 378., 378., 378.]]]])
y_inputtensor_grad = np.array([[[ 280.,  344.],
                                [ 516.,  612.],
                                [ 516.,  612.],
                                [ 408.,  472.]],

                               [[ 708.,  804.],
                                [1206., 1350.],
                                [1206., 1350.],
                                [ 900.,  996.]],

                               [[ 708.,  804.],
                                [1206., 1350.],
                                [1206., 1350.],
                                [ 900.,  996.]],

                               [[ 664.,  728.],
                                [1092., 1188.],
                                [1092., 1188.],
                                [ 792.,  856.]]])

# Exact comparison against the expected values; any mismatch aborts the cell.
if not np.array_equal(y.value, yvalue):
    raise FailTestError("The output of Conv is incorrect")
if not np.array_equal(y.kernel.grad, y_kernel_grad):
    raise FailTestError("The gradient of kernel in Conv is incorrect")
if not np.array_equal(y.input_tensor.grad, y_inputtensor_grad):
    raise FailTestError("The gradient of input_tensor in Conv is incorrect")
print("Passed Test on Conv with non-zero padding")

In [None]:
# For Full Credit: Conv with stride 2 and padding 1
# Same graph as the default Conv test, but with Conv arguments (2, 1).
x = cnn.Add(input_tensor, input_tensor)
y = cnn.Conv(x, conv1, 2, 1)
z = cnn.Add(y, bias1)
# Forward pass, from the inputs towards the output.
for component in [x,y,z]:
    component.forward()
# Seed the output gradient with ones and reset every other gradient before
# running the backward pass in reverse order.
z.grad = np.ones_like(z.value)
for component in [x, y, conv1, bias1]:
    component.grad = 0
for component in [z,y,x]:
    component.backward()

# Expected Conv output and expected gradients w.r.t. the kernel and the Conv input.
yvalue = np.array([[[ 5248.,  5336.,  5424.,  5512.],
                    [10816., 11020., 11224., 11428.]],

                   [[19152., 19572., 19992., 20412.],
                    [30120., 30858., 31596., 32334.]]])
y_kernel_grad = np.array([[[[ 20.,  20.,  20.,  20.],
                             [ 22.,  22.,  22.,  22.]],

                            [[ 40.,  40.,  40.,  40.],
                             [ 44.,  44.,  44.,  44.]],

                            [[ 48.,  48.,  48.,  48.],
                             [ 52.,  52.,  52.,  52.]]],


                           [[[ 40.,  40.,  40.,  40.],
                             [ 44.,  44.,  44.,  44.]],

                            [[ 80.,  80.,  80.,  80.],
                             [ 88.,  88.,  88.,  88.]],

                            [[ 96.,  96.,  96.,  96.],
                             [104., 104., 104., 104.]]],


                           [[[ 72.,  72.,  72.,  72.],
                             [ 76.,  76.,  76.,  76.]],

                            [[144., 144., 144., 144.],
                             [152., 152., 152., 152.]],

                            [[160., 160., 160., 160.],
                             [168., 168., 168., 168.]]]])
y_inputtensor_grad = np.array([[[134., 150.],
                                [268., 300.],
                                [134., 150.],
                                [166., 182.]],

                               [[268., 300.],
                                [536., 600.],
                                [268., 300.],
                                [332., 364.]],

                               [[134., 150.],
                                [268., 300.],
                                [134., 150.],
                                [166., 182.]],

                               [[230., 246.],
                                [460., 492.],
                                [230., 246.],
                                [262., 278.]]])

# Exact comparison against the expected values; any mismatch aborts the cell.
if not np.array_equal(y.value, yvalue):
    raise FailTestError("The output of Conv is incorrect")
if not np.array_equal(y.kernel.grad, y_kernel_grad):
    raise FailTestError("The gradient of kernel in Conv is incorrect")
if not np.array_equal(y.input_tensor.grad, y_inputtensor_grad):
    raise FailTestError("The gradient of input_tensor in Conv is incorrect")
print("Passed Test on Conv with stride 2 and padding 1")

In [None]:
# Test MaxPool
# Build the graph v = MaxPool(RELU(Conv(input_tensor + input_tensor, conv1) + bias1), 2).
x = cnn.Add(input_tensor, input_tensor)
y = cnn.Conv(x, conv1, 1, 1)
z = cnn.Add(y, bias1)
u = cnn.RELU(z)
v = cnn.MaxPool(u, 2)
# Forward pass, from the inputs towards the output.
for component in [x,y,z,u,v]:
    component.forward()
# Seed the output gradient with ones and reset every other gradient before
# running the backward pass in reverse order.
v.grad = np.ones_like(v.value)
for component in [x,y,z,u, conv1, bias1]:
    component.grad = 0
for component in [v,u,z,y,x]:
    component.backward()

# Expected MaxPool output and expected gradient w.r.t. the MaxPool input.
vvalue = np.array([[[17880., 18259., 18638., 19017.],
                    [20328., 20779., 21230., 21681.]],
                   [[27672., 28339., 29006., 29673.],
                    [30120., 30859., 31598., 32337.]]])
v_inputtensor_grad = np.array([[[0., 0., 0., 0.],
                                [0., 0., 0., 0.],
                                [0., 0., 0., 0.],
                                [0., 0., 0., 0.]],

                               [[0., 0., 0., 0.],
                                [1., 1., 1., 1.],
                                [1., 1., 1., 1.],
                                [0., 0., 0., 0.]],

                               [[0., 0., 0., 0.],
                                [1., 1., 1., 1.],
                                [1., 1., 1., 1.],
                                [0., 0., 0., 0.]],

                               [[0., 0., 0., 0.],
                                [0., 0., 0., 0.],
                                [0., 0., 0., 0.],
                                [0., 0., 0., 0.]]])

# Exact comparison against the expected values; any mismatch aborts the cell.
if not np.array_equal(v.value, vvalue):
    raise FailTestError("The output of MaxPool is incorrect")
if not np.array_equal(v.input_tensor.grad, v_inputtensor_grad):
    raise FailTestError("The gradient of input_tensor in MaxPool is incorrect")
print("Passed Test on MaxPool")

In [None]:
# For Full Credit: Test MaxPool with non-default stride
# Same graph as the MaxPool test, but MaxPool is given an explicit stride=4.
x = cnn.Add(input_tensor, input_tensor)
y = cnn.Conv(x, conv1, 1, 1)
z = cnn.Add(y, bias1)
u = cnn.RELU(z)
v = cnn.MaxPool(u, 2, stride=4)
# Forward pass, from the inputs towards the output.
for component in [x,y,z,u,v]:
    component.forward()
# Seed the output gradient with ones and reset every other gradient before
# running the backward pass in reverse order.
v.grad = np.ones_like(v.value)
for component in [x,y,z,u, conv1, bias1]:
    component.grad = 0
for component in [v,u,z,y,x]:
    component.backward()

# Expected MaxPool output and expected gradient w.r.t. the MaxPool input.
vvalue = np.array([[[30120., 30859., 31598., 32337.]]])
v_inputtensor_grad = np.array([[[0., 0., 0., 0.],
                                [0., 0., 0., 0.],
                                [0., 0., 0., 0.],
                                [0., 0., 0., 0.]],

                               [[0., 0., 0., 0.],
                                [0., 0., 0., 0.],
                                [0., 0., 0., 0.],
                                [0., 0., 0., 0.]],

                               [[0., 0., 0., 0.],
                                [0., 0., 0., 0.],
                                [1., 1., 1., 1.],
                                [0., 0., 0., 0.]],

                               [[0., 0., 0., 0.],
                                [0., 0., 0., 0.],
                                [0., 0., 0., 0.],
                                [0., 0., 0., 0.]]])

# Exact comparison against the expected values; any mismatch aborts the cell.
if not np.array_equal(v.value, vvalue):
    raise FailTestError("The output of MaxPool is incorrect")
if not np.array_equal(v.input_tensor.grad, v_inputtensor_grad):
    raise FailTestError("The gradient of input_tensor in MaxPool is incorrect")
print("Passed Test on MaxPool with non-default stride")

In [None]:
# Test Flatten
# Extend the MaxPool graph with w = Flatten(v) and o = Mul(w, v1).
x = cnn.Add(input_tensor, input_tensor)
y = cnn.Conv(x, conv1, 1, 1)
z = cnn.Add(y, bias1)
u = cnn.RELU(z)
v = cnn.MaxPool(u, 2)
w = cnn.Flatten(v)
o = cnn.Mul(w, v1)
# Forward pass, from the inputs towards the output.
for component in [x,y,z,u,v,w,o]:
    component.forward()
# Seed the output gradient with ones and reset every other gradient before
# running the backward pass in reverse order.
o.grad = np.ones_like(o.value)
for component in [x,y,z,u,v,w,conv1, bias1]:
    component.grad = 0
for component in [o,w,v,u,z,y,x]:
    component.backward()

# Expected Flatten output (a 1-D array of 16 values) and expected gradient
# w.r.t. the Flatten input, which has the (2, 2, 4) shape of the MaxPool output.
wvalue = np.array([17880., 18259., 18638., 19017., 20328., 20779., 21230., 21681.,
       27672., 28339., 29006., 29673., 30120., 30859., 31598., 32337.])
w_inputtensor_grad = np.array([[[ 0.,  1.,  2.,  3.],
        [ 4.,  5.,  6.,  7.]],

       [[ 8.,  9., 10., 11.],
        [12., 13., 14., 15.]]])

# Exact comparison against the expected values; any mismatch aborts the cell.
if not np.array_equal(w.value, wvalue):
    raise FailTestError("The output of Flatten is incorrect")
if not np.array_equal(w.input_tensor.grad, w_inputtensor_grad):
    raise FailTestError("The gradient of input_tensor in Flatten is incorrect")
print("Passed Test on Flatten")

# Part 2

### Applying to the CIFAR10 dataset
- You can refer to https://www.cs.toronto.edu/~kriz/cifar.html for details
- Labels 0 to 9 refer to the following --- 0:airplane, 1:automobile, 2:bird, 3:cat, 4:deer, 5:dog, 6:frog, 7:horse, 8:ship, 9:truck
- We will only use a subsample of 10000 images with 1000 of each class.


In [None]:
# Load the CIFAR-10 subsample. The context manager closes the .npz archive
# as soon as the arrays have been read out of it.
with np.load('./cifar10_data/sub_data.npz') as data:
    # Scale the pixel values by 1/255 and store them as float32.
    X = np.float32(data['imgs'])/255.
    y = np.float32(data['labels'])
# Reshape the valid image data to (idx, h, w, channel); -1 lets NumPy infer
# the number of images instead of hard-coding it.
X = X.reshape(-1, 32, 32, 3)

# for simplicity, let's focus on the first four classes
# there are 4000 images in total
sub_idx = np.where(y<=3)[0]
X = X[sub_idx]
y = y[sub_idx]

# split into train and test sets: the first 3000 samples are used for
# training, the remaining ones for testing
train_x, test_x = X[:3000], X[3000:]
train_y, test_y = y[:3000], y[3000:]

In [None]:
# below is what an image looks like
# Print the label of the first training sample, then render its image.
print(train_y[0])  # 3:cat
imgplot = plt.imshow(train_x[0])

### Debugging the fit function

In [None]:
# Important: reload the module bound to `cnn` so that changes made to its
# source file are reflected without restarting the kernel.
importlib.reload(cnn)
model = cnn.CNN(num_labels=4)

# # Used to generate sample_params, don't uncomment the codes below
# model.init_params_with_xavier()
# params = model.get_param_dict()
# with open("./cifar10_data/sample_params.pkl", 'wb') as f:
#     pickle.dump(params, f, protocol=pickle.HIGHEST_PROTOCOL)

# You can use the provided sample weights for initialization to help debug
# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# the pickle files that were distributed with this assignment.
with open("./cifar10_data/sample_params.pkl", 'rb') as f:
    params = pickle.load(f)
model.set_params_by_dict(params)

# You can use the first 2 samples to test if the gradients are correct
X = train_x[:2]
y = train_y[:2]

# when calling fit, a computational graph will be built first, you should expect the exact lines printed
model.fit(X, y, alpha=0.01, t=1)

# # Used to generate sample_grad, don't uncomment the codes below
# sample_grad = {}
# for k in params.keys():
#     sample_grad[k] = model.params[k].grad
# with open("./cifar10_data/sample_grad.pkl", 'wb') as f:
#     pickle.dump(sample_grad, f, protocol=pickle.HIGHEST_PROTOCOL)

# Load the sample gradient for debugging
with open("./cifar10_data/sample_grad.pkl", 'rb') as f:
    sample_grad = pickle.load(f)

# Compare every parameter gradient with the reference using an absolute
# tolerance of 1e-3. Comparing values rounded to 3 decimals (as was done
# before) can reject a correct gradient whose entries sit next to a rounding
# boundary; every gradient accepted by the rounded comparison is also accepted
# here. The explicit shape check keeps broadcasting from hiding a shape mismatch.
for k in params.keys():
    expected_grad = np.asarray(sample_grad[k])
    actual_grad = np.asarray(model.params[k].grad)
    if expected_grad.shape != actual_grad.shape or not np.allclose(
            actual_grad, expected_grad, rtol=0, atol=1e-3):
        raise FailTestError(f"gradient of param {k} is incorrect")
print("Congrats! You have passed the test of your fit function, your CNN model should be good to go!")


### Now train your CNN on the whole training dataset

In [None]:
# Reload the module bound to `cnn` so the latest edits to its source are used,
# then build a fresh 4-class model.
importlib.reload(cnn)
model = cnn.CNN(num_labels=4)
# Initialize the parameters through the model's own initializer.
model.init_params_with_xavier()
# It could take as much as 1 hour to train
# NOTE(review): the positional 0.01 and 10 presumably map to `alpha` and `t`,
# the keyword names used by the fit call in the debugging cell — confirm
# against cnn.CNN.fit.
model.fit(train_x, train_y, 0.01, 10)

In [None]:
# with 10 epochs, you should be able to achieve an accuracy of over 60%, 
# which is quite good compared with 25% of random guess
# Evaluate on the held-out test split; eval's result is unpacked as (accuracy, loss).
accy, loss = model.eval(test_x, test_y)
print("Test accuracy = %.4f, loss = %.4f" % (accy, loss))