Question 1: 
Write a pseudo-code for how you would implement this with a set of nested
for loops. The convolution is defined by a set of weights/parameters which we will learn.
How do you represent these weights?


# Part 1

In [1]:
# for sample in batch_size:
#     padding_width = input_width + 2*padding
#     padding_height = input_height + 2*padding
#     # Given that output_width and output_height are the same:
#     output_width = output_height = ((input_width - kernel_size + 2*padding)/stride) + 1
#     # Create empty output for this layer
#     output = zeros([total_kernels, output_width, output_height])
#     for layer in input_channels:
#         layer_pad = [sample, layer, :, :]
#         # Add padding
#         for pad in padding:
#             add column of zeros to right to layer_pad
#             add column of zeros to left to layer_pad
#             add column of zeros to top to layer_pad
#             add column of zeros to bottom to layer_pad
#         # Move over layer:
#         for y in range(output_height-1):
#             for x in range(output_width-1):
#                 # If multiple kernels exist, save in different slices of output
#                 for kernel in total_kernels:
#                     patch = layer_pad[y*stride : (y*stride)+kernel_size, x*stride : (x*stride)+kernel_size]
#                     output[kernel, y, x] += sum( patch * kernel[layer, :, :] )
#     sample_output[sample, :, :, :] = output


Question 2. 
For a given input tensor, kernel size, stride and padding (no dilutions) work out
a general function that computes the size of the output.
https://pytorch.org/docs/stable/generated/torch.nn.functional.conv2d.html?highlight=functional%20conv2d#torch.nn.functional.conv2d 

In [2]:
import torch
import numpy as np
import ipykernel
import math
ipykernel.__version__

'6.4.1'

In [3]:
tensor = np.random.rand(1, 2, 27, 27)
weight = np.random.rand(1, 2, 3, 3)

def output_size(input_tensor, kernel_size, stride, padding):
    batch_size, channels, height, width = input_tensor.shape # <- similar to tensor.size() in torch, but '.shape' here since it is numpy
    out_size = (((height - kernel_size + 2*padding) / stride ) + 1)
    return out_size

print(output_size(tensor, 3, 2, 0)) #<- example from the slides: Lecture 3 AlexNet, top right

tensor = torch.rand([1, 2, 27, 27])
weight = torch.rand([1, 2, 3, 3])

print(torch.nn.functional.conv2d(tensor, weight, stride=2,padding=0).size())


13.0
torch.Size([1, 1, 13, 13])


Question 3: 
Write a naive (non-vectorized) implementation of the unfold function in
pseudocode. Include the pseudocode in your report.
https://pytorch.org/docs/stable/generated/torch.nn.functional.unfold.html 

In [4]:
# #input_tensor = [b, c, h, w] #batch_size, channels, height, width

# # Pseudo code naive unfold:
# def naive_unfold(input_tensor, kernel_size, stride, padding):
#     output_size = output_size(input_tensor, kernel_size, stride, padding)
#     #1 extract all patches from the input
#     for sample in b:
#         for channel in c:
#             layer_pad = [sample, layer, :, :]
#             # Add padding
#             for pad in padding:
#                 add column of zeros to right to layer_pad
#                 add column of zeros to left to layer_pad
#                 add column of zeros to top to layer_pad
#                 add column of zeros to bottom to layer_pad
#             n_patches_per_layer = output_size * output_size
#             for y in range(output_size-1):
#                 for x in range(output_size-1):
#                     # x+ y = number of patch, total_patch is 0 first time
#                     total_patch = (x+y) * channel
#                     patch[sample, x+y + total_patch ,:, :] = layer_pad[y*stride : (y*stride)+kernel_size, 
#                                                                         x*stride : (x*stride)+kernel_size]
#     # 2. Flatten these patches (with all channels) into vectors, arranged as the columns of a matrix X.
#     #### THIS CORRECT???
#     X = patch[:, :].flatten()
#     p = len(X)
#     # 3. Multiply this matrix by a weight matrix Y = XW
#     Y = X * W
#     # 4. Reshape the matrix Y, so that its columns become the pixels of the output tensor.
#     k = c * output_size * output_size
#     Y = Y.reshape([b, k, p])
#     return patch
#     #output = [b, k, p] #batch_size, number of values per patch, number of patches

## Torch Module

In [5]:
import torch
import torch.nn.functional as F
from torch import nn

class Conv2D(nn.Module):
    def __init__(self,in_channels, out_channels, kernel_size=(3,3), stride=1, padding=1):
        super().__init__() # <- belangrijk!
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        
    def forward(self, input_batch):
        batch_size, channels, height, width = input_batch.size()
        print("input_batch = ", input_batch.size())
        
        # output dimensions
        h_out = int(output_size(input_batch, self.kernel_size[0], self.stride, self.padding))
        w_out = int(output_size(input_batch, self.kernel_size[1], self.stride, self.padding))
        
        # unfolded matrix (b, k, p)
        unfolded = F.unfold(input_batch, self.kernel_size, padding=self.padding, stride=self.stride)
        print("unfolded = ", unfolded.size())
        batch_size, k_values_per_patch, patches = unfolded.size()
        
        # reshape to (b, p, k) tensor, than merge b and p to get (b*p,k) tensor
        reshaped = torch.transpose(unfolded, 1, 2).reshape(-1, k_values_per_patch)
        print("X_reshaped = ", reshaped.size())
        
        # Initiate random weights with correct dimensions
        W = torch.rand((k_values_per_patch, self.out_channels)) # - rows: number of nodes in one patch of input. -columns: # of nodes in one pixel in output
        print("W = ", W.size())
        
        # Matrix multiplication to get Y
        Y = torch.mm(reshaped, W) # bmm?
        print("Y = ", Y.size())
        
        # Reshape to get seperate batches back
        Y_reshaped = Y.reshape((batch_size, patches, self.out_channels)) # contains one row-vector for each pixel in output
        print("Y_reshaped = ", Y_reshaped.size())
        
        # Permute to swap axis for p and k
        Y_permuted = torch.permute(Y_reshaped, (0, 2, 1))
        print("Y_permuted = ", Y_permuted.size())
        
        # Fold back to obtain the output of this layer
        output = Y_permuted.reshape(batch_size, self.out_channels, h_out, w_out)
        print("output = ", output.size())
        
        assert output.size() == torch.nn.functional.conv2d(input_batch, W.reshape(self.out_channels, self.in_channels, self.kernel_size[0], self.kernel_size[1]),padding=self.padding).size()
        return output

# We use the Conv2D module by instantiating it, and applying it to an input.
torch.manual_seed(0)
conv = Conv2D(in_channels= 3, out_channels= 8)
input_batch = torch.randn(16, 3, 32, 32)
output_batch = conv(input_batch)

input_batch =  torch.Size([16, 3, 32, 32])
unfolded =  torch.Size([16, 27, 1024])
X_reshaped =  torch.Size([16384, 27])
W =  torch.Size([27, 8])
Y =  torch.Size([16384, 8])
Y_reshaped =  torch.Size([16, 1024, 8])
Y_permuted =  torch.Size([16, 8, 1024])
output =  torch.Size([16, 8, 32, 32])


## FUNCTION (Q4, Q5 and Q6)

In [6]:
class MyConv2DFunc(torch.autograd.Function):
    """
    We can implement our own custom autograd Functions by subclassing
    torch.autograd.Function and implementing the forward and backward
    passes which operate on Tensors.
    """
    @staticmethod
    def forward(ctx, input_batch, kernel, stride=1, padding=1):
        """
        In the forward pass we receive a Tensor containing the input
        and return a Tensor containing the output. ctx is a context
        object that can be used to stash information for backward
        computation. You can cache arbitrary objects for use in the
        backward pass using the ctx.save_for_backward method.
        """
        output_channels = kernel.size()[1]
        
        # your code here
        batch_size, input_channels, height, width = input_batch.size()
        print("X = ", input_batch.size())
        
        kernel_size = int(math.sqrt(kernel.size()[0]/input_channels))
        
        # output dimensions
        h_out = int(output_size(input_batch, kernel_size, stride, padding))
        w_out = int(output_size(input_batch, kernel_size, stride, padding))
        
        # unfolded matrix (b, k, p)
        U = F.unfold(input_batch, (kernel_size, kernel_size), padding=padding, stride=stride)
        print("U = ", U.size())
        batch_size, k_values_per_patch, patches = U.size()
        
        # reshape to (b, p, k) tensor, than merge b and p to get (b*p,k) tensor
        U_reshaped = torch.transpose(U, 1, 2).reshape(-1, k_values_per_patch)
        print("U_reshaped = ", U_reshaped.size())
        
        # Initiate random weights with correct dimensions
        W = torch.rand((k_values_per_patch, output_channels)) # - rows: number of nodes in one patch of input. -columns: # of nodes in one pixel in output
        print("W = ", W.size())
    
        # store objects for the backward
        ctx.save_for_backward(input_batch, U_reshaped, W)
        
        # Matrix multiplication to get Y
        Y = torch.mm(U_reshaped, W) # bmm?
        print("Y = ", Y.size())
        
        # Reshape to get seperate batches back
        Y_reshaped = Y.reshape((batch_size, patches, output_channels)) # contains one row-vector for each pixel in output
        print("Y_reshaped = ", Y_reshaped.size())
        
        # Permute to swap axis for p and k
        Y_permuted = torch.permute(Y_reshaped, (0, 2, 1))
        print("Y_permuted = ", Y_permuted.size())
        
        output_batch = Y_permuted.reshape(batch_size, output_channels, h_out, w_out)
        print("output_batch = ", output_batch.size())
        print("\n")
        
        assert output_batch.size() == torch.nn.functional.conv2d(input_batch, W.reshape(output_channels, input_channels, kernel_size, kernel_size),padding=padding).size()
        return output_batch

    @staticmethod
    def backward(ctx, grad_output):
        """
        In the backward pass we receive a Tensor containing the
        gradient of the loss with respect to the output, and we need
        to compute the gradient of the loss with respect to the
        input
        """
        # retrieve stored objects
        input_batch, X_reshaped, W = ctx.saved_tensors
        # your code here
        
        grad_Y_permuted = grad_output.reshape(grad_output.size()[0], grad_output.size()[1], grad_output.size()[2]*grad_output.size()[2])
        print("grad_Y_permuted = ", grad_Y_permuted.size())
        grad_Y_reshaped = torch.permute(grad_Y_permuted, (0, 2, 1))
        print("grad_Y_reshaped = ", grad_Y_reshaped.size())
        grad_Y = grad_Y_reshaped.reshape(grad_Y_reshaped.size()[0]*grad_Y_reshaped.size()[1],grad_Y_reshaped.size()[2])
        print("grad_Y = ", grad_Y.size())
        kernel_grad = torch.transpose(torch.mm(torch.transpose(grad_Y,0,1), X_reshaped),0,1)
        print("grad_W = ", kernel_grad.size())
        grad_U_reshaped = torch.mm(grad_Y, torch.transpose(W,0,1))
        print("grad_U_reshaped = ", grad_U_reshaped.size())
        grad_U = torch.permute(grad_U_reshaped.reshape(input_batch.size()[0], int(grad_U_reshaped.size()[0]/input_batch.size()[0]), kernel_grad.size()[0]), (0, 2, 1))
        print("grad_U = ", grad_U.size())
        input_batch_grad = F.fold(grad_U, output_size=[input_batch.size()[2], input_batch.size()[3]], kernel_size=(3,3), padding=1)
        print("grad_X = ", input_batch_grad.size())
        return input_batch_grad, kernel_grad, None, None
        
input_channels = 3
output_channels = 8
kernel_size = 3

input_batch = torch.randn(16, 3, 32, 32, requires_grad=True)
kernel = torch.randn(kernel_size*kernel_size*input_channels, output_channels, requires_grad=True)

conv = MyConv2DFunc.apply
output = conv(input_batch, kernel)
loss = output.sum()
loss.backward()

X =  torch.Size([16, 3, 32, 32])
U =  torch.Size([16, 27, 1024])
U_reshaped =  torch.Size([16384, 27])
W =  torch.Size([27, 8])
Y =  torch.Size([16384, 8])
Y_reshaped =  torch.Size([16, 1024, 8])
Y_permuted =  torch.Size([16, 8, 1024])
output_batch =  torch.Size([16, 8, 32, 32])


grad_Y_permuted =  torch.Size([16, 8, 1024])
grad_Y_reshaped =  torch.Size([16, 1024, 8])
grad_Y =  torch.Size([16384, 8])
grad_W =  torch.Size([27, 8])
grad_U_reshaped =  torch.Size([16384, 27])
grad_U =  torch.Size([16, 27, 1024])
grad_X =  torch.Size([16, 3, 32, 32])


# Part 2

Question 7

In [9]:
import torchvision
from torchvision.transforms import ToTensor
from tqdm import tqdm
import torch

arg = {"data":'./data', "batch": 60000} # with batch = 16 we get a dataloader of length 3750 (*16=60.000)
train_set = torchvision.datasets.MNIST(root=arg['data'], train=True, download=True, transform=ToTensor())
trainloader = torch.utils.data.DataLoader(train_set, batch_size=arg['batch'], shuffle=True, num_workers=2)
test_set = torchvision.datasets.MNIST(root=arg['data'], train=False, download=True, transform=ToTensor())
testloader = torch.utils.data.DataLoader(test_set, batch_size=arg['batch'], shuffle=True, num_workers=2)

for i, data in enumerate(trainloader):
    input, labels = data


#ongeveer 15sec

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./data\MNIST\raw\train-images-idx3-ubyte.gz


  0%|          | 0/9912422 [00:00<?, ?it/s]

Extracting ./data\MNIST\raw\train-images-idx3-ubyte.gz to ./data\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./data\MNIST\raw\train-labels-idx1-ubyte.gz


  0%|          | 0/28881 [00:00<?, ?it/s]

Extracting ./data\MNIST\raw\train-labels-idx1-ubyte.gz to ./data\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./data\MNIST\raw\t10k-images-idx3-ubyte.gz


  0%|          | 0/1648877 [00:00<?, ?it/s]

Extracting ./data\MNIST\raw\t10k-images-idx3-ubyte.gz to ./data\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./data\MNIST\raw\t10k-labels-idx1-ubyte.gz


  0%|          | 0/4542 [00:00<?, ?it/s]

Extracting ./data\MNIST\raw\t10k-labels-idx1-ubyte.gz to ./data\MNIST\raw



In [10]:
training = (input[:50000])
training_label = labels[:50000]
validation = input[50000:]
validation_label = labels[50000:]

def loop_over(data, label, step):
    print(len(data))
    for i in range(0,len(data),step):
        data[i:i+step]
        label[i:i+step]

loop_over(training, training_label, 16)


50000


### Question 8:
Build this network and tune the hyperparameters until you get a good baseline
performance you are happy with. You should be able to get at least 95% accuracy. If training
takes too long, you can reduce the number of channels in each layer.

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
import torchvision
from torchvision.transforms import ToTensor
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np

In [12]:
arg = {"data":'./data', "batch": 60000} # with batch = 16 we get a dataloader of length 3750 (*16=60.000)
train_set = torchvision.datasets.MNIST(root=arg['data'], train=True, download=True, transform=ToTensor())
trainloader = torch.utils.data.DataLoader(train_set, batch_size=arg['batch'], shuffle=True, num_workers=2)
test_set = torchvision.datasets.MNIST(root=arg['data'], train=False, download=True, transform=ToTensor())
testloader = torch.utils.data.DataLoader(test_set, batch_size=arg['batch'], shuffle=True, num_workers=2)

In [13]:
class Net(nn.Module):
    def __init__(self, input_chan, kernel, stride, padding, output):
        super().__init__()
        self.conv1 = nn.Sequential(nn.Conv2d(input_chan, 16, kernel, stride, padding), nn.ReLU(), nn.MaxPool2d(2,2))
        self.conv2 = nn.Sequential(nn.Conv2d(16, 32, kernel, stride, padding), nn.ReLU(), nn.MaxPool2d(2,2))
        self.conv3 = nn.Sequential(nn.Conv2d(32, 64, kernel, stride, padding), nn.ReLU(), nn.MaxPool2d(2,2))
        self.out = nn.Linear(64*3*3, output)

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = torch.flatten(x, 1)
        output = self.out(x)
        return output

In [14]:
def get_laoders(batch_size):
    loaders = {'train_set' : DataLoader(train_set, 
                                          batch_size=batch_size, 
                                          shuffle=True, 
                                          num_workers=1),
                'test_set'  : DataLoader(test_set, 
                                          batch_size=10000, 
                                          shuffle=True, 
                                          num_workers=1)}
    return loaders

In [15]:
def validate(net, loaders):
    net.eval()
    correct = 0
    with torch.no_grad():
        for x, y in loaders['test_set']:
            output = net(x)
            _,pred_y = torch.max(output, dim = 1)
            correct += (pred_y == y).float().sum()

    print('accuracy on test set', (correct / 10000)*100, '%')
    return (correct / 10000)*100


def train(net, loaders, epochs, loss_f, opt):
    train_loss = []
    epoch_list = []
    acc_list = []

    for i in range(epochs):
        print('epoch = ', i)
        for j, (x, y) in enumerate(loaders['train_set']):
            x_batch = x
            y_batch = y
            output = net.forward(x_batch)
            loss = loss_f(output, y_batch)
            train_loss.append(loss)
            if j % 2000 == 0:
                print('loss:', loss.item())
            opt.zero_grad() 
            loss.backward()
            opt.step() 
        epoch_list.append(i)
        acc_list.append(validate(net, loaders))
    
    return train_loss, epoch_list, acc_list
            
input_chan = 1
output = 10
kernel = 3
stride = 1
padding = 1
lr = 0.0001
net = Net(input_chan, kernel, stride, padding, output)
loss_f = nn.CrossEntropyLoss()
opt = optim.Adam(net.parameters(), lr)

In [None]:
epochs = 10
batch_size = 16
loaders = get_laoders(batch_size) 
train_loss, epoch_list, acc_list = train(net, loaders, epochs, loss_f, opt)

epoch =  0
loss: 2.308333396911621
loss: 0.2767612934112549


In [None]:
y1 = acc_list
x1 = epoch_list

plt.plot(x1, y1, label = "batch_size = 16" )
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.title('Accuracy during training epochs')
plt.xticks(np.arange(0, 20, 1))
plt.savefig('figures/Q8_acc')
plt.show()