In [14]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

## Define the network

In [15]:
# let's code a simple feed-forward neural network

class Net(nn.Module):
    """Small LeNet-style convolutional network for single-channel 32x32 images.

    Data flow for an input of shape (N, 1, 32, 32):
      conv1 (5x5 kernels) -> ReLU -> 2x2 max-pool  => (N, 6, 14, 14)
      conv2 (5x5 kernels) -> ReLU -> 2x2 max-pool  => (N, 16, 5, 5)
      flatten                                      => (N, 400)
      fc1 -> ReLU -> fc2 -> ReLU -> fc3            => (N, 10)
    """

    def __init__(self):
        super().__init__()

        # Conv2d(in_channels, out_channels, kernel_size):
        # 1 input image channel -> 6 feature maps, then 6 -> 16, both with 5x5 kernels.
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)

        # Linear(in_features, out_features).
        # 16 * 5 * 5 = 16 channels coming out of conv2 times the 5x5 feature map that is
        # left after the second pooling step (it is NOT the kernel size; both just happen to be 5).
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)
        # 120 and 84 are the hidden-layer widths used in the tutorial's picture of the network;
        # 10 is the size of the output vector (presumably one score per class -- confirm).

    def forward(self, x):
        """Run one forward pass and return the (N, 10) output of the last linear layer."""
        # relu = Rectified Linear Unit, relu(v) = max(0, v).
        # Max pooling keeps the largest value of every 2x2 window.
        pooled_once = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # For a square window a single number is enough.
        pooled_twice = F.max_pool2d(F.relu(self.conv2(pooled_once)), 2)
        # Flatten everything except the batch dimension so the linear layers can use it.
        flat = pooled_twice.view(-1, self.num_flat_features(pooled_twice))
        hidden = F.relu(self.fc2(F.relu(self.fc1(flat))))
        return self.fc3(hidden)

    def num_flat_features(self, x):
        """Return the number of values per sample, i.e. the product of all non-batch dimensions."""
        count = 1
        for extent in x.shape[1:]:  # skip dimension 0, the batch dimension
            count = count * extent
        return count

# Build one instance of the network; `net` is reused by the cells further down.
# Printing a module shows its registered sub-layers (see the summary below).
net = Net()
print(net)

Net(
  (conv1): Conv2d (1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d (6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120)
  (fc2): Linear(in_features=120, out_features=84)
  (fc3): Linear(in_features=84, out_features=10)
)


Since we defined a ```forward``` function, the matching ```backward``` function is created automagically. 
The backward function is then used by ``` autograd ``` to compute the gradients. 

The learnable parameters of a model are returned by ```net.parameters()```.

In [5]:
# net.parameters() yields the learnable tensors; list() materialises them so they can be indexed.
params = list(net.parameters())
print(params[0])
# params[0] is the first registered parameter, i.e. the weight of conv1: one 5x5 kernel per
# (output channel, input channel) pair -> 6 x 1 x 5 x 5 values. That is why the printout below
# is a series of 5x5 blocks labelled (0,0), (1,0), ...
# The remaining entries should follow in registration order (conv1 bias, conv2 weight/bias,
# fc1 ... fc3) -- TODO confirm with net.named_parameters().

Parameter containing:
(0 ,0 ,.,.) = 
  0.0243  0.0660 -0.1330 -0.1541 -0.1973
  0.0709  0.0343  0.1709  0.1624 -0.0348
 -0.1038  0.1492  0.0972 -0.0673  0.0110
  0.0843 -0.1877  0.0932 -0.1139 -0.0879
  0.0791 -0.0629 -0.1274  0.1480 -0.1263

(1 ,0 ,.,.) = 
  0.1253 -0.1703 -0.1055 -0.0463 -0.0287
 -0.1286  0.1820 -0.1123  0.1013  0.1505
  0.0251  0.1256  0.0632  0.1710  0.0744
 -0.1606  0.1198  0.0847  0.1633  0.0825
  0.1223  0.0258 -0.1176  0.0064 -0.1164

(2 ,0 ,.,.) = 
 -0.0666  0.0922 -0.0761  0.1710  0.1152
  0.1615  0.0906  0.1579 -0.1165 -0.0946
  0.1658  0.0144 -0.1873 -0.1908 -0.1246
  0.0061 -0.1969 -0.1835 -0.0562 -0.0316
  0.1807 -0.1750 -0.1308  0.1415 -0.0436

(3 ,0 ,.,.) = 
 -0.0488  0.0050  0.0904  0.1844 -0.0693
  0.1241  0.1377  0.0198  0.1808  0.0406
 -0.1071  0.0817 -0.1523 -0.1795  0.0721
 -0.0534  0.0055 -0.0136  0.1909 -0.0336
 -0.1597 -0.1837  0.1358  0.1471 -0.1792

(4 ,0 ,.,.) = 
  0.0265 -0.1899 -0.1786  0.0210 -0.1534
  0.1675  0.1504 -0.0888 -0.0445  0.12

The input to the ```forward``` method is an ```autograd.Variable```. The produced output is of the same type. 
The CNN expects input pictures of size $32 \times 32$. How does that work? Let's walk through it:

First, let us just assume for the moment that we have a $32\times32\times1$ sized picture (we only have one colour channel, hence a BW picture). Our first layer ```net.conv1``` is a convolutional layer with a kernel size (KS) of $5$, i.e. $5\times5$ kernels, and $6$ output channels $\Leftarrow 6$ kernels.

The default values in ```torch.nn.Conv2d``` for ```stride``` and ```padding``` are $1$ and $0$ respectively. Since we didn't set any padding ourselves, we'll lose some information of the image. If the function
```python
out = lambda wid,KS,pad,st: ((wid-KS+2*pad)/st+1)
```
with wid=width, KS=kernel size, pad=padding, st=stride, returns an integer, the kernel fits the input evenly and we don't lose information to skipped border pixels. In our case we have ```out(32, 5, 0, 1) = 28.0```, which is an integer.

Because there is no padding, each spatial dimension shrinks by $\mathsf{KS} - 1 = 4$, so after ```net.conv1``` we end up with $28\times28\times6$, because we have 6 output channels. This is fed into the $2\times2$ max-pooling layer, which halves the width and the height, so we get $14\times14\times6$, because channels aren't pooled.
The same procedure applies again for layer ```net.conv2``` $\Rightarrow\ 10\times10\times16$, and after pooling $5\times5\times16$.

This output tensor is then stretched (flattened) into a 1d object, such that ```net.fc1``` is able to use it. Hence the $16\cdot 5 \cdot 5$ input size.

In [4]:
# Build one random sample and push it through the network.
# What is a channel? Think of an RGB picture - it has 3 channels, one per colour, and each
# channel holds information about the whole picture. Here we use a single channel.
# NOTE(review): the name `input` shadows Python's built-in input() for the rest of the kernel session.
input = Variable(torch.randn(1, 1, 32, 32))  # Variable wraps a 4-dim tensor: nSamples x nChannels x Height x Width
# Calling the module runs its forward method; `out` is reused by the backward cell further down.
out = net(input)
print(out)

Variable containing:
-0.0035  0.1019 -0.0021 -0.0666  0.0568  0.0193 -0.0635 -0.1036 -0.0144 -0.0583
[torch.FloatTensor of size 1x10]



In [15]:
# Time one forward + backward pass of the network.
# NOTE(review): `test` is defined in a later cell of this notebook, so that cell must be
# executed first; on a fresh Restart & Run All this line raises a NameError.
%timeit test(net)

955 µs ± 1.06 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [18]:
# Reset the gradient buffers of all parameters so old gradients do not accumulate.
net.zero_grad()
# Backpropagate a random upstream gradient with the same 1x10 shape as `out`
# (`out` comes from the forward-pass cell above).
out.backward(torch.randn(1, 10))

In [6]:
# IPython help lookup: shows the docstring of nn.Module.
# NOTE(review): leftover from interactive exploration; can be removed from the final notebook.
nn.Module?

In [14]:
# test function to get an idea, how long it takes for a network to do a full data propagation with weight adaption
# still to add: weight adaption with learning rate, loss function, ..
def test(net):
    input = Variable(torch.randn(1,1,32,32))
    out = net(input)
    net.zero_grad()
    out.backward(torch.randn(1,10))

In [12]:
# Run one forward/backward pass on the network.
# NOTE(review): the recorded output below shows a Variable, but the current definition of
# `test` neither prints nor returns anything -- presumably stale output from an earlier version.
test(net)

Variable containing:
-0.0040  0.1117 -0.0444 -0.0671  0.0633 -0.0058 -0.0577 -0.1043 -0.0062 -0.0718
[torch.FloatTensor of size 1x10]



In [13]:
def o(w, k, p, s):
    """Output width of a convolution: (w - k + 2*p) / s + 1.

    w = input width, k = kernel size, p = padding, s = stride.
    A non-integer result means the kernel does not fit the input evenly.
    """
    covered = w - k + 2 * p
    return covered / s + 1


2.25

In [14]:
# IPython help lookup: shows the docstring of plt.scatter.
# NOTE(review): leftover from interactive exploration; can be removed from the final notebook.
plt.scatter?

In [16]:
# IPython help lookup: shows the docstring of nn.Conv2d (stride/padding defaults, output-size formula).
# NOTE(review): leftover from interactive exploration; can be removed from the final notebook.
nn.Conv2d?