In [1]:
# Add Lincoln to system path
import sys
sys.path.append("/Users/seth/development/lincoln/")

In [2]:
from torch import Tensor
import torch

import typing
from typing import List, Tuple

In [3]:
from lincoln.operations import Operation, ParamOperation
from lincoln.layers import Layer
from lincoln.activations import Activation, LinearAct

In [4]:
class Dense(Layer):
    '''
    Fully connected layer: a weight multiplication, then a bias addition,
    then the activation, applied in that order.

    The parameters and the operation chain are built in `_setup_layer`.
    '''
    def __init__(self,
                 neurons: int,
                 activation: Activation = LinearAct) -> None:
        # NOTE(review): the default is the LinearAct class itself, not an
        # instance - confirm this is what Layer expects.
        super().__init__(neurons)
        self.activation = activation

    def _setup_layer(self, num_in: int) -> None:
        '''
        Create the weight and bias tensors, each filled in place by
        `uniform_(-1, 1)`, and chain the operations that use them.

        num_in: number of rows of the weight matrix.
        '''
        param_shapes = (
            (num_in, self.neurons),  # weights
            (1, self.neurons),       # bias
        )
        for shape in param_shapes:
            self.params.append(torch.empty(*shape).uniform_(-1, 1))

        weight_op = WeightMultiply(self.params[0])
        bias_op = BiasAdd(self.params[1])
        self.operations = [weight_op, bias_op, self.activation]

In [5]:
class WeightMultiply(ParamOperation):
    '''
    Matrix-multiplies the input by a weight matrix stored as the operation's
    parameter.
    '''

    def __init__(self,
                 W: Tensor):
        super().__init__(W)

    def _compute_output(self):
        # Forward pass: input_ x param
        inputs, weights = self.input_, self.param
        return inputs.mm(weights)

    def _compute_grads(self, output_grad):
        # Gradient with respect to the input: output_grad x param^T
        weights_t = self.param.transpose(0, 1)
        return output_grad.mm(weights_t)

    def _param_grad(self,
                    output_grad: Tensor):
        # Gradient with respect to the weights: input_^T x output_grad
        inputs_t = self.input_.transpose(0, 1)
        return inputs_t.mm(output_grad)

In [6]:
def assert_same_shape(output: Tensor, 
                      output_grad: Tensor):
    '''
    Assert that two tensors have the same shape.

    output: the first tensor.
    output_grad: the second tensor.

    Raises AssertionError when the shapes differ; the message reports the
    shape of the first tensor (`output`) and then the shape of the second
    tensor (`output_grad`), in that order.
    Returns None.
    '''
    assert output.shape == output_grad.shape, \
    '''
    Two tensors should have the same shape; instead, first Tensor's shape is {0}
    and second Tensor's shape is {1}.
    '''.format(tuple(output.shape), tuple(output_grad.shape))
    return None

## 1D Convolution

1 input, 1 output

## With padding

In [7]:
def _pad_1d(inp: Tensor,
            num: int) -> Tensor:
    z = torch.Tensor([0])
    z = z.repeat(num)
    return torch.cat([z, inp, z])

In [8]:
def conv_1d(inp: Tensor,
            fil: Tensor) -> Tensor:
    '''
    Zero-padded 1D convolution of a single observation with a filter.

    The input is padded with `len(fil) // 2` zeros on each side and the
    filter is slid over it without flipping:
        out[o] = sum_f fil[f] * inp_pad[o + f]

    Returns a tensor with the same shape as `inp`.
    '''
    n_taps = fil.shape[0]
    padded = _pad_1d(inp, n_taps // 2)

    out = torch.zeros(inp.shape)

    # Taps in the outer loop: each output element still accumulates its
    # products in increasing tap order.
    for tap in range(n_taps):
        for pos in range(out.shape[0]):
            out[pos] += fil[tap] * padded[pos + tap]

    assert_same_shape(inp, out)
    return out

In [9]:
inp1 = Tensor([0,1,2,3,4,5,6])
fil1 = Tensor([1,1,1])
fil2 = Tensor([2,1,1])

In [10]:
conv_1d(inp1, fil1)

tensor([ 1.,  3.,  6.,  9., 12., 15., 11.])

In [11]:
conv_1d(inp1, fil2)

tensor([ 1.,  3.,  7., 11., 15., 19., 16.])

In [12]:
def conv_1d_sum(inp: Tensor,
                fil: Tensor) -> Tensor:
    '''
    Sum of every element of `conv_1d(inp, fil)`, returned as a 0-d tensor.
    '''
    return conv_1d(inp, fil).sum()

In [13]:
conv_1d_sum(inp1, fil1)

tensor(57.)

In [14]:
conv_1d_sum(inp1, fil2)

tensor(72.)

### Gradients

In [15]:
def _param_grad(inp: Tensor, 
                fil: Tensor, 
                output_grad: typing.Optional[Tensor] = None) -> Tuple[Tensor, Tensor]:
    '''
    Gradients of the zero-padded 1D convolution `conv_1d(inp, fil)`.

    inp: 1D input observation.
    fil: 1D filter.
    output_grad: gradient flowing back from the convolution output; must
        have the same shape as `inp`. Defaults to all ones, i.e. the
        gradient of the plain sum of the output.

    Returns a tuple `(param_grad, input_grad)`: the gradient with respect to
    the filter (same shape as `fil`) and the gradient with respect to the
    input (same shape as `inp`).
    '''
    fil_len = fil.shape[0]
    fil_mid = fil_len // 2
    inp_pad = _pad_1d(inp, fil_mid)

    if output_grad is None:
        output_grad = torch.ones_like(inp)
    else:
        assert_same_shape(inp, output_grad)

    output_pad = _pad_1d(output_grad, fil_mid)

    param_grad = torch.zeros_like(fil)
    input_grad = torch.zeros_like(inp)

    for f in range(fil_len):
        for o in range(inp.shape[0]):
            # Filter tap f multiplies inp_pad[o + f] when producing output o.
            param_grad[f] += inp_pad[o+f] * output_grad[o]
            # The input gradient reads the padded output gradient with the
            # filter index reversed.
            input_grad[o] += output_pad[o+fil_len-f-1] * fil[f]

    assert_same_shape(param_grad, fil)
    assert_same_shape(input_grad, inp)
    return param_grad, input_grad

### Gradient check

In [16]:
inp_t = Tensor([0,1,2,3,4,5,6])
fil_t = Tensor([1,1,1])

In [17]:
def _check_grad(inp: Tensor,
                fil: Tensor) -> Tuple[Tensor, Tensor]:
    '''
    Numerically estimate the gradients of `conv_1d_sum(inp, fil)`.

    Each element of `inp`, then each element of `fil`, is increased by 1 in
    turn and the resulting change in the summed output is recorded.

    Returns `(fil_grad_check, inp_grad_check)`, shaped like `fil` and `inp`.
    '''
    # The unperturbed sum is the same for every perturbation, so compute it once.
    base_sum = conv_1d_sum(inp, fil).item()

    inp_grad_check = torch.zeros_like(inp)
    fil_grad_check = torch.zeros_like(fil)

    for i in range(inp.shape[0]):
        inp_bumped = inp.clone()
        inp_bumped[i] = inp_bumped[i] + 1
        inp_grad_check[i] = conv_1d_sum(inp_bumped, fil).item() - base_sum

    for f in range(fil.shape[0]):
        fil_bumped = fil.clone()
        fil_bumped[f] = fil_bumped[f] + 1
        fil_grad_check[f] = conv_1d_sum(inp, fil_bumped).item() - base_sum

    return fil_grad_check, inp_grad_check

In [18]:
_param_grad(inp_t, fil_t)

(tensor([15., 21., 21.]), tensor([2., 3., 3., 3., 3., 3., 2.]))

In [19]:
_check_grad(inp_t, fil_t)

(tensor([15., 21., 21.]), tensor([2., 3., 3., 3., 3., 3., 2.]))

### Batch size of 2

#### Pad

In [20]:
inp_t = Tensor([[0,1,2,3,4,5,6], 
                [1,2,3,4,5,6,7]])

In [21]:
def _pad_1d(inp: Tensor,
            num: int) -> Tensor:
    z = torch.Tensor([0])
    z = z.repeat(num)
    return torch.cat([z, inp, z])

In [22]:
def _pad_1d_batch(inp: Tensor,
                  num: int) -> Tensor:
    '''
    Apply `_pad_1d` to every element along the first dimension of `inp`
    and stack the results.
    '''
    padded_rows = []
    for row in inp:
        padded_rows.append(_pad_1d(row, num))
    return torch.stack(padded_rows)

In [23]:
_pad_1d_batch(inp_t, 1)

tensor([[0., 0., 1., 2., 3., 4., 5., 6., 0.],
        [0., 1., 2., 3., 4., 5., 6., 7., 0.]])

#### Forward

In [24]:
def conv_1d_batch(inp: Tensor,
                  fil: Tensor) -> Tensor:
    '''
    Run `conv_1d` with the same filter on every observation in the batch
    and stack the per-observation outputs.
    '''
    per_obs = []
    for obs in inp:
        per_obs.append(conv_1d(obs, fil))
    return torch.stack(per_obs)

In [25]:
conv_1d_batch(inp_t, fil1)

tensor([[ 1.,  3.,  6.,  9., 12., 15., 11.],
        [ 3.,  6.,  9., 12., 15., 18., 13.]])

#### Grad

In [26]:
def grad_1d_batch(inp: Tensor, 
                  fil: Tensor) -> Tensor:
    '''
    Input gradient of the batched 1D convolution when the upstream gradient
    is all ones (i.e. the gradient of the summed output).

    inp: batch of 1D observations, indexed along the first dimension.
    fil: 1D filter.

    Returns the per-observation input gradients stacked along the first
    dimension.
    '''
    out = conv_1d_batch(inp, fil)

    out_grad = torch.ones_like(out)

    # Iterate over the local `out_grad`; the original read a notebook-global
    # `output_grad`, so it only worked when that global happened to exist.
    grads = [_param_grad(inp[i], fil, out_grad[i])[1] for i in range(out_grad.shape[0])]

    return torch.stack(grads)

In [27]:
def param_grad_1d_batch(inp: Tensor, 
                        fil: Tensor) -> Tensor:
    '''
    Filter gradient of the batched 1D convolution when the upstream gradient
    is all ones.

    inp: batch of 1D observations, indexed as inp[i][o].
    fil: 1D filter.

    Returns a tensor shaped like `fil`.
    '''
    output_grad = torch.ones_like(inp)

    # Pad by half the filter length, matching conv_1d, rather than a fixed 1.
    inp_pad = _pad_1d_batch(inp, fil.shape[0] // 2)

    # The accumulator must start at zero; starting from ones added 1 to
    # every gradient entry.
    param_grad = torch.zeros_like(fil)

    for i in range(inp.shape[0]):
        for p in range(fil.shape[0]):
            for o in range(inp.shape[1]):
                param_grad[p] += inp_pad[i][o+p] * output_grad[i][o]
    return param_grad

## Testing conv + weight multiplication

#### Just `param_grad_1d_batch`

In [28]:
def param_grad_1d_batch(inp: Tensor, 
                        fil: Tensor, 
                        output_grad: Tensor) -> Tensor:
    '''
    Filter gradient of the batched 1D convolution for a given upstream
    gradient.

    inp: batch of 1D observations, indexed as inp[i][o].
    fil: 1D filter.
    output_grad: upstream gradient, indexed the same way as `inp`.

    Returns a tensor shaped like `fil`.
    '''
    # Pad by half the filter length, matching conv_1d, rather than a fixed 1.
    inp_pad = _pad_1d_batch(inp, fil.shape[0] // 2)
    param_grad = torch.zeros_like(fil)

    for i in range(inp.shape[0]):
        for p in range(fil.shape[0]):
            for o in range(inp.shape[1]):
                # Tap p multiplies inp_pad[i][o + p] when producing output o.
                param_grad[p] += inp_pad[i][o+p] * output_grad[i][o]
    return param_grad

In [29]:
def forward_conv(inp: Tensor,
                 fil: Tensor):
    '''
    Sum of the batched 1D convolution output, as a Python float rounded to
    4 decimal places.
    '''
    total = conv_1d_batch(inp, fil).sum()
    return round(total.item(), 4)

In [30]:
torch.manual_seed(83118)
inp_t = Tensor(2, 7).uniform_(-1, 1)
fil = Tensor(torch.empty(3).uniform_(-1, 1))
print(inp_t)
print(fil)

tensor([[-0.4595, -0.0404,  0.7231,  0.9657,  0.9637, -0.6279,  0.7466],
        [ 0.8894, -0.9637,  0.1816,  0.4596,  0.6427, -0.0797, -0.8224]])
tensor([ 0.7842, -0.3495, -0.1091])


In [31]:
forward_conv(inp_t, fil)

0.9457

In [32]:
output_grad = torch.ones_like(inp_t)

In [33]:
param_grad_1d_batch(inp_t, 
                    fil, 
                    output_grad)

tensor([2.6545, 2.5786, 2.1487])

`fil_grad[0] = 2.6545`

Adding 0.1 to `fil[0]` should change the sum to

In [34]:
round(0.9457 + 0.1 * 2.6545, 4)

1.2111

In [35]:
fil2 = torch.clone(fil)
fil2[0] += 0.1
fil2

tensor([ 0.8842, -0.3495, -0.1091])

In [36]:
forward_conv(inp_t, fil2)

1.2112

#### Checking input gradient

In [37]:
grad_1d_batch(inp_t, fil)

tensor([[ 0.4346,  0.3255,  0.3255,  0.3255,  0.3255,  0.3255, -0.4587],
        [ 0.4346,  0.3255,  0.3255,  0.3255,  0.3255,  0.3255, -0.4587]])

`inp_t[0][0]`'s gradient is `0.4346`. So increasing it by 0.1 should increase the sum to:

In [38]:
round(0.9457 + 0.1 * 0.4346, 4)

0.9892

In [39]:
inp_t2 = inp_t.clone()
inp_t2[0][0] += 0.1
inp_t2

tensor([[-0.3595, -0.0404,  0.7231,  0.9657,  0.9637, -0.6279,  0.7466],
        [ 0.8894, -0.9637,  0.1816,  0.4596,  0.6427, -0.0797, -0.8224]])

In [40]:
forward_conv(inp_t2, fil)

0.9892

Works!

#### With weight multiplication

In [41]:
# Fresh random fixtures: a batch of 2 observations of length 7, a length-3
# filter, and a 7x5 weight matrix, all filled by uniform_(-1, 1).
inp_t = Tensor(2, 7).uniform_(-1, 1)
fil = Tensor(torch.empty(3).uniform_(-1, 1))
weights = Tensor(torch.empty(7, 5).uniform_(-1, 1))

# Convolution followed by a weight multiplication.
conv_out = conv_1d_batch(inp_t, fil)

out = torch.mm(conv_out, weights)

# Scalar the gradient checks below are compared against.
round(torch.sum(out).item(), 4)

-1.8207

In [42]:
fil

tensor([-0.4404,  0.2923, -0.1474])

In [43]:
# Gradient of sum(out) with respect to conv_out: ones (shaped like out)
# multiplied by the transposed weight matrix.
out_grad = torch.mm(torch.ones_like(out), weights.transpose(0, 1))

# Push that gradient back through the convolution to get the filter gradient.
fil_grad = param_grad_1d_batch(inp_t, fil, out_grad)

In [44]:
fil_grad

tensor([2.7563, 1.8538, 7.7927])

`fil_grad[0] = 2.7563`

Adding 0.1 to `fil[0]` should change the sum to:

In [45]:
round(-1.8207 + 0.1 * 2.7563, 4)

-1.5451

In [46]:
def forward_conv_mul(inp: Tensor, 
                     fil: Tensor, 
                     weights: Tensor):
    '''
    Batched 1D convolution followed by a weight multiplication, reduced to
    a single number.

    inp: batch of 1D observations.
    fil: 1D filter.
    weights: 2D weight matrix multiplied onto the convolution output.

    Returns the sum of the result as a Python float rounded to 4 decimals.
    '''
    # Use the `inp` argument; the original silently read the global `inp_t`.
    conv_out = conv_1d_batch(inp, fil)

    out = torch.mm(conv_out, weights)

    return round(torch.sum(out).item(), 4)

In [47]:
fil2 = torch.clone(fil)
fil2[0] += 0.1
fil2

tensor([-0.3404,  0.2923, -0.1474])

In [48]:
forward_conv_mul(inp_t, fil2, weights)

-1.545

### Operation

In [49]:
class Operation(object):
    '''
    Base class for one step of a forward/backward computation.

    Subclasses implement `_compute_output` (which reads `self.input_`) and
    `_compute_grads`. Note that this definition rebinds the `Operation`
    name imported from `lincoln.operations` at the top of the notebook.
    '''

    def __init__(self):
        pass


    def forward(self, 
                input_: Tensor):
        '''
        Store `input_` on the instance, compute the output from it, store
        the output on the instance and return it.
        '''
        self.input_ = input_

        self.output = self._compute_output()

        return self.output


    def _input_grad(self, output_grad: Tensor) -> Tensor:
        '''
        Gradient with respect to the input, with shape checks: `output_grad`
        must match the stored output and the result must match the stored
        input.
        '''
        assert_same_shape(self.output, output_grad)       

        input_grad = self._compute_grads(output_grad)

        assert_same_shape(self.input_, input_grad)
        return input_grad


    def backward(self, output_grad: Tensor) -> Tensor:
        '''
        Return the gradient with respect to the input given the gradient
        with respect to the output.
        '''
        return self._input_grad(output_grad)

    def _compute_output(self) -> Tensor:
        # Takes no argument, matching how `forward` calls it; the input is
        # available as `self.input_`.
        raise NotImplementedError()

    def _compute_grads(self, output_grad: Tensor) -> Tensor:
        raise NotImplementedError()
        

class ParamOperation(Operation):
    '''
    An Operation that owns a parameter tensor.

    Subclasses additionally implement `_param_grad`. Note that this
    definition rebinds the `ParamOperation` name imported from
    `lincoln.operations` at the top of the notebook.
    '''

    def __init__(self, param: Tensor) -> None:
        super().__init__()
        self.param = param

    def backward(self, output_grad: Tensor) -> Tensor:
        '''
        Compute and store the parameter gradient in `self.param_grad`
        (checked to have the same shape as `self.param`), then return the
        gradient with respect to the input.
        '''
        self.param_grad = self._param_grad(output_grad)

        assert_same_shape(self.param, self.param_grad)

        return self._input_grad(output_grad)

    def _param_grad(self, output_grad: Tensor) -> Tensor:
        raise NotImplementedError()

In [50]:
class Conv1D(ParamOperation):
    '''
    Zero-padded 1D convolution of a batch of 1D observations with a single
    1D filter, written with explicit loops.

    The filter is the operation's parameter. Each observation is padded
    with `len(param) // 2` zeros on each side, so every output observation
    has the same length as its input.
    '''

    def __init__(self, 
                 param: Tensor):
        super().__init__(param)
        self.param_size = param.shape[0]
        self.param_pad = self.param_size // 2

    def _pad_1d_obs(self, obs: Tensor) -> Tensor:
        '''Pad one 1D observation with `param_pad` zeros on each end.'''
        z = torch.Tensor([0])
        z = z.repeat(self.param_pad)
        return torch.cat([z, obs, z])

    def _pad_1d(self, inp: Tensor) -> Tensor:
        '''Pad every observation in a batch and stack the results.'''
        outs = [self._pad_1d_obs(obs) for obs in inp]
        return torch.stack(outs)    

    def _compute_output_obs(self, obs: Tensor) -> Tensor:
        '''
        Convolve one observation:
            out[o] = sum_p param[p] * obs_pad[o + p]
        '''
        obs_pad = self._pad_1d_obs(obs)

        out = torch.zeros(obs.shape)

        for o in range(out.shape[0]):
            for p in range(self.param_size):
                out[o] += self.param[p] * obs_pad[o+p]
        return out

    def _compute_output(self) -> Tensor:
        '''Convolve every observation in `self.input_` and stack the outputs.'''
        outs = [self._compute_output_obs(obs) for obs in self.input_]
        return torch.stack(outs)

    def _compute_grads_obs(self, 
                           input_obs: Tensor,
                           output_grad_obs: Tensor) -> Tensor:
        '''
        Input gradient for one observation.

        input_obs: the observation; only its shape is used here.
        output_grad_obs: upstream gradient for that observation.
        '''
        output_obs_pad = self._pad_1d_obs(output_grad_obs)
        input_grad = torch.zeros_like(input_obs)

        for p in range(self.param.shape[0]):
            for o in range(input_obs.shape[0]):
                # The padded output gradient is read with the filter index
                # reversed.
                input_grad[o] += output_obs_pad[o+self.param_size-p-1] * self.param[p]

        return input_grad

    def _compute_grads(self, output_grad: Tensor) -> Tensor:
        '''Input gradient for the whole batch, one observation at a time.'''
        grads = [self._compute_grads_obs(self.input_[i], output_grad[i]) for i in range(output_grad.shape[0])]    

        return torch.stack(grads)

    def _param_grad(self, output_grad: Tensor) -> Tensor:
        '''
        Filter gradient, accumulated over every observation and every
        output position:
            param_grad[p] = sum_{i, o} inp_pad[i][o + p] * output_grad[i][o]
        '''
        inp_pad = self._pad_1d(self.input_)

        param_grad = torch.zeros_like(self.param)

        for i in range(self.input_.shape[0]):
            for p in range(self.param.shape[0]):
                for o in range(self.input_.shape[1]):
                    param_grad[p] += inp_pad[i][o+p] * output_grad[i][o]

        return param_grad
                

In [51]:
fil

tensor([-0.4404,  0.2923, -0.1474])

In [52]:
weights

tensor([[ 0.1779,  0.6374, -0.8940, -0.3311, -0.0193],
        [-0.1690,  0.0073,  0.8437,  0.9426,  0.6066],
        [ 0.4668,  0.5077, -0.6763,  0.0992, -0.4717],
        [ 0.0476, -0.3656,  0.7781, -0.5844,  0.8515],
        [ 0.8489,  0.6374,  0.6090,  0.6846,  0.5798],
        [-0.6807,  0.3116, -0.1502, -0.3789, -0.1404],
        [ 0.4816,  0.8886,  0.8650, -0.4064, -0.2798]])

In [53]:
class TestConv(Layer):
    '''
    Test fixture layer: a Conv1D followed by a WeightMultiply, both with
    hard-coded parameters.

    The filter and weight values are the same numbers displayed for `fil`
    and `weights` in the cells just above, so the layer's output and
    gradients can be compared with the hand-computed results.

    `activation` and `filter_size` are stored on the instance but are not
    used by `_setup_layer`.
    '''
    def __init__(self, 
                 neurons: int, 
                 activation: Activation = LinearAct,
                 filter_size: int = 3) -> None:
        super().__init__(neurons)
        self.activation = activation
        self.filter_size = filter_size
        

    def _setup_layer(self, num_in: int) -> None:
        # `num_in` is not used: both parameters are fixed literals.
        # filter (length 3)
        self.params.append(Tensor([-0.4404,  0.2923, -0.1474]))
        
        # Weights (7 x 5)
        self.params.append(Tensor([[ 0.1779,  0.6374, -0.8940, -0.3311, -0.0193],
        [-0.1690,  0.0073,  0.8437,  0.9426,  0.6066],
        [ 0.4668,  0.5077, -0.6763,  0.0992, -0.4717],
        [ 0.0476, -0.3656,  0.7781, -0.5844,  0.8515],
        [ 0.8489,  0.6374,  0.6090,  0.6846,  0.5798],
        [-0.6807,  0.3116, -0.1502, -0.3789, -0.1404],
        [ 0.4816,  0.8886,  0.8650, -0.4064, -0.2798]]))
        
        # Convolution first, then the weight multiplication.
        self.operations = [Conv1D(self.params[0]),
                           WeightMultiply(self.params[1])]


In [54]:
a = TestConv(5)

In [55]:
inp_t

tensor([[-0.3163, -0.1501,  0.7113,  0.3191,  0.6163,  0.7247, -0.0049],
        [-0.1855,  0.7793,  0.8862, -0.1635, -0.4434,  0.8358,  0.8314]])

In [56]:
out = a.forward(Tensor(inp_t))
out

tensor([[-0.0912, -0.1621, -0.6499,  0.3251, -0.3171],
        [-0.5356, -0.1283, -0.2132,  0.2433, -0.2916]])

In [57]:
round(a.forward(Tensor(inp_t)).sum().item(), 4)

-1.8206

In [58]:
a.backward(torch.ones_like(out))

tensor([[-1.1080,  0.7482, -0.6709, -1.2561,  1.3323, -1.4810,  0.6059],
        [-1.1080,  0.7482, -0.6709, -1.2561,  1.3323, -1.4810,  0.6059]])

In [59]:
a._param_grads()

a.operations[0].param_grad

tensor([2.7561, 1.8538, 7.7929])

```python
tensor([2.7563, 1.8538, 7.7927])
```

Checks out! 

### 2D Convolutions

In [60]:
from torchvision.datasets import MNIST

In [61]:
mnist_trainset = MNIST(root="./data", train=True, download=False, transform=None)

In [62]:
mnist_data = mnist_trainset.train_data.type(torch.float32) / 255.0

In [63]:
mnist_imgs = mnist_data[:10]

In [64]:
torch.save(mnist_imgs, "../speedup/data/img_batch")

In [65]:
# Load from the same path the previous cell saved to, so the notebook
# round-trips its own file on a fresh run.
mnist_imgs_2 = torch.load("../speedup/data/img_batch")


In [69]:
mnist_imgs_2_np = mnist_imgs_2.numpy()

In [71]:
mnist_imgs_2_np

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0.

In [76]:
import numpy as np

pad = 2
obs = np.zeros(28)
a = np.zeros(pad)
np.concatenate([a, obs, a]).shape

(32,)

In [73]:
a = Tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

torch.Size([28])

In [None]:
torch.manual_seed(90118)
fil = Tensor(torch.empty(3, 3).uniform_(-1, 1))
fil

#### Strategy:

1. `_pad_2d_obs`
2. `_pad_2d`
3. `_compute_output_obs`
4. `_compute_output`

The hard part:

5. `_compute_grads`
6. `_compute_grads_obs`

For 5 and 6: can use the "CNN_Explanation" notebook as a guide.

In [None]:
mnist_imgs = mnist_data[:2]

#### Padding

In [None]:
def _pad_2d_obs(inp: Tensor, 
                num: int) -> Tensor:
    '''
    Zero-pad a single 2D observation with `num` rows/columns on every side.

    inp: 2D tensor (it does not need to be square).
    num: number of zeros to add on each of the four sides.

    Returns a tensor of shape (rows + 2 * num, cols + 2 * num).
    '''
    # Pad each row left and right first.
    inp_pad = _pad_1d_batch(inp, num)
    # Rows of zeros for the top and bottom; their width is the padded
    # column count, taken from shape[1] so non-square inputs work too.
    other = torch.zeros(num, inp.shape[1] + num * 2)
    return torch.cat([other, inp_pad, other])

In [None]:
def _pad_1d_batch(inp: Tensor,
                  num: int) -> Tensor:
    '''
    Apply `_pad_1d` to every element along the first dimension of `inp`
    and stack the results.
    '''
    padded_rows = []
    for row in inp:
        padded_rows.append(_pad_1d(row, num))

    return torch.stack(padded_rows)

In [None]:
def _pad_1d(inp: Tensor,
            num: int) -> Tensor:
    z = torch.Tensor([0])
    z = z.repeat(num)
#     import pdb; pdb.set_trace()
    return torch.cat([z, inp, z])

In [None]:
mnist_imgs[0].shape

In [None]:
_pad_2d_obs(mnist_imgs[0], 1).shape

In [None]:
def _pad_2d(inp: Tensor,
            num: int):
    '''
    Zero-pad every observation of a batch.

    `inp` is a 3 dimensional tensor whose first dimension is the batch;
    each element along it is padded with `_pad_2d_obs`.
    '''
    padded_batch = []
    for obs in inp:
        padded_batch.append(_pad_2d_obs(obs, num))
    return torch.stack(padded_batch)

In [None]:
_pad_2d(mnist_imgs, 1).shape

#### Compute output

**1D**:

```python
    def _compute_output_obs(self, obs: Tensor):
        
        obs_pad = self._pad_1d_obs(obs)

        out = torch.zeros(obs.shape)

        for o in range(out.shape[0]):
            for p in range(self.param_size):
                out[o] += self.param[p] * obs_pad[o+p]
        return out
    
    def _compute_output(self):
        
        outs = [self._compute_output_obs(obs) for obs in self.input_]
        return torch.stack(outs)
```

In [None]:
def _compute_output_obs_2d(obs: Tensor,
                           param: Tensor):
    '''
    Zero-padded 2D convolution of one observation with a filter.

    Both `obs` and `param` are 2D, square tensors. The observation is
    padded with `param.shape[0] // 2` zeros on every side, and
        out[w][h] = sum_{pw, ph} param[pw][ph] * obs_pad[w + pw][h + ph]

    Returns a tensor with the same shape as `obs`.
    '''
    padded = _pad_2d_obs(obs, param.shape[0] // 2)

    out = torch.zeros(obs.shape)
    n_rows, n_cols = out.shape[0], out.shape[1]

    # Filter taps in the outer loops: every output element still accumulates
    # its products in the same (p_w, then p_h) order.
    for p_w in range(param.shape[0]):
        for p_h in range(param.shape[1]):
            for o_w in range(n_rows):
                for o_h in range(n_cols):
                    out[o_w][o_h] += param[p_w][p_h] * padded[o_w + p_w][o_h + p_h]
    return out

In [None]:
_compute_output_obs_2d(mnist_imgs[0], fil).shape

In [None]:
_compute_output_obs_2d(torch.ones_like(mnist_imgs[0]), 
                   torch.ones_like(fil)).shape

In [None]:
def _compute_output_2d(img_batch: Tensor,
                       param: Tensor):
    '''
    Convolve every image of the batch with the same filter and stack the
    per-image outputs.
    '''
    per_img = []
    for obs in img_batch:
        per_img.append(_compute_output_obs_2d(obs, param))
    return torch.stack(per_img)

In [None]:
_compute_output_2d(mnist_imgs, fil).shape

#### Param grads


```python
def _compute_grads_obs(self, 
                       input_obs: Tensor,
                       output_grad_obs: Tensor) -> None:

    output_obs_pad = self._pad_1d_obs(output_grad_obs)
    input_obs_pad = self._pad_1d_obs(input_obs)
    input_grad = torch.zeros_like(input_obs)

    for p in range(self.param.shape[0]):
        for o in range(input_obs.shape[0]):
            input_grad[o] += output_obs_pad[o+self.param_size-p-1] * self.param[p]

    return input_grad

def _compute_grads(self, output_grad: Tensor) -> None:

    grads = [self._compute_grads_obs(self.input_[i], output_grad[i]) for i in range(output_grad.shape[0])]    

    return torch.stack(grads)

def _param_grad(self, output_grad: Tensor) -> Tensor:

    inp_pad = self._pad_1d(self.input_)
    out_pad = self._pad_1d(output_grad)

    param_grad = torch.zeros_like(self.param)

    for i in range(self.input_.shape[0]):
        for p in range(self.param.shape[0]):
            for o in range(self.input_.shape[1]):
                param_grad[p] += inp_pad[i][o+p] * output_grad[i][o]

    return param_grad
```

In [None]:
def _compute_grads_obs_2d(input_obs: Tensor,
                          output_grad_obs: Tensor,
                          param: Tensor) -> Tensor:
    '''
    Input gradient of the 2D convolution for one observation.

    input_obs: 2D Tensor representing the input observation
    output_grad_obs: 2D Tensor representing the output gradient
    param: 2D filter
    '''
    size = param.shape[0]
    grad_pad = _pad_2d_obs(output_grad_obs, size // 2)
    input_grad = torch.zeros_like(input_obs)
    n_rows, n_cols = input_obs.shape[0], input_obs.shape[1]

    # Filter taps in the outer loops: every input element still accumulates
    # its products in the same (p_w, then p_h) order.
    for p_w in range(size):
        for p_h in range(size):
            # The padded output gradient is read with both filter indices reversed.
            row_shift = size - p_w - 1
            col_shift = size - p_h - 1
            for i_w in range(n_rows):
                for i_h in range(n_cols):
                    input_grad[i_w][i_h] += grad_pad[i_w + row_shift][i_h + col_shift] \
                    * param[p_w][p_h]

    return input_grad

def _compute_grads_2d(inp: Tensor,
                      output_grad: Tensor,
                      param: Tensor) -> Tensor:
    '''
    Input gradient of the batched 2D convolution: one call to
    `_compute_grads_obs_2d` per element along the first dimension of
    `output_grad`, stacked.
    '''
    per_obs = []
    for i in range(output_grad.shape[0]):
        per_obs.append(_compute_grads_obs_2d(inp[i], output_grad[i], param))

    return torch.stack(per_obs)



def _param_grad_2d(inp: Tensor,
                output_grad: Tensor,
                param: Tensor) -> Tensor:
    '''
    Filter gradient of the batched 2D convolution:
        param_grad[pw][ph] = sum_{i, w, h} inp_pad[i][w + pw][h + ph] * output_grad[i][w][h]

    Returns a tensor shaped like `param`.
    '''
    size = param.shape[0]
    padded = _pad_2d(inp, size // 2)

    param_grad = torch.zeros_like(param)
    n_rows, n_cols = output_grad.shape[1:]

    # Filter taps in the outer loops: each filter entry still accumulates
    # over (observation, row, column) in the same order.
    for p_w in range(size):
        for p_h in range(size):
            for i in range(inp.shape[0]):
                for o_w in range(n_rows):
                    for o_h in range(n_cols):
                        param_grad[p_w][p_h] += padded[i][o_w + p_w][o_h + p_h] \
                        * output_grad[i][o_w][o_h]
    return param_grad

In [None]:
img_grads = _compute_grads_2d(mnist_imgs, 
                  torch.ones_like(mnist_imgs),
                  fil)

In [None]:
img_grads.shape

In [None]:
# 2D image batch, so use the 2D filter gradient; `_param_grad` is the 1D version.
_param_grad_2d(mnist_imgs, 
               torch.ones_like(mnist_imgs),
               fil).shape

In [None]:
# 2D image batch, so use the 2D filter gradient; `_param_grad` is the 1D version.
_param_grad_2d(torch.ones_like(mnist_imgs), 
               torch.ones_like(mnist_imgs),
               torch.ones_like(fil))

## Testing functions

In [None]:
torch.manual_seed(83118)
inp_2d = Tensor(2, 28, 28).uniform_(-1, 1)
fil_2d = Tensor(torch.empty(3, 3).uniform_(-1, 1))
print(inp_2d)
print(fil_2d)

In [None]:
out = _compute_output_2d(inp_2d, fil_2d)

In [None]:
torch.sum(out)

In [None]:
out_grad_2d = torch.ones_like(inp_2d)

### Testing input gradient

In [None]:
inp_grads_2d = _compute_grads_2d(inp_2d, out_grad_2d, fil_2d)

In [None]:
inp_grads_2d[1][1][1]

`inp_grads_2d[1][1][1] = 0.1522`. That means if we increase `inp_2d[1][1][1]` by 0.1, the sum will increase to:

In [None]:
round(4.3893 + 0.1 * 0.1522, 4)

In [None]:
def conv_2d_sum(inp: Tensor,
                fil: Tensor):
    '''
    Sum of every element of the batched 2D convolution output, returned as
    a 0-d tensor.
    '''
    return torch.sum(_compute_output_2d(inp, fil))

In [None]:
inp_2d_2 = inp_2d.clone()
inp_2d_2[1][1][1] += 0.1
inp_2d_2

In [None]:
print(conv_2d_sum(inp_2d, fil_2d))
print(conv_2d_sum(inp_2d_2, fil_2d))

Works!

### Testing param gradient

In [None]:
# Use the 2D filter gradient and the filter that belongs to this test
# (`fil_2d`); `_param_grad` is the 1D version.
param_grads_2d = _param_grad_2d(inp_2d, out_grad_2d, fil_2d)
param_grads_2d

`param_grads_2d[1][1] = 9.9202`. That means if we increase `fil_2d[1][1]` by 0.1, the sum will increase to:

In [None]:
round(4.3893 + 9.9202 * 0.1, 4)

In [None]:
fil_2d_2 = fil_2d.clone()
fil_2d_2[1][1] += 0.1
fil_2d_2

In [None]:
print(conv_2d_sum(inp_2d, fil_2d))
print(conv_2d_sum(inp_2d, fil_2d_2))

Works!

## Conv2D class

In [None]:
class Conv2D(ParamOperation):
    '''
    Zero-padded 2D convolution of a batch of 2D observations with a single
    square 2D filter, written with explicit loops.

    The filter is the operation's parameter. Each observation is padded
    with `param.shape[0] // 2` zeros on every side, so every output
    observation has the same shape as its input.
    '''

    def __init__(self, 
                 param: Tensor):
        super().__init__(param)
        self.param_size = param.shape[0]
        self.param_pad = self.param_size // 2

    def _pad_1d_obs(self, obs: Tensor) -> Tensor:
        '''Pad one 1D row with `param_pad` zeros on each end.'''
        z = torch.Tensor([0])
        z = z.repeat(self.param_pad)
        return torch.cat([z, obs, z])

    def _pad_1d(self, inp: Tensor) -> Tensor:
        '''Pad every row of a 2D tensor left and right and stack the results.'''
        outs = [self._pad_1d_obs(obs) for obs in inp]
        return torch.stack(outs)

    def _pad_2d_obs(self,
                    inp: Tensor) -> Tensor:
        '''Pad one 2D observation with `param_pad` zeros on every side.'''
        # Left/right padding of every row, then rows of zeros above and below.
        inp_pad = self._pad_1d(inp)
        other = torch.zeros(self.param_pad, inp.shape[1] + self.param_pad * 2)
        return torch.cat([other, inp_pad, other])

    def _pad_2d(self, inp: Tensor) -> Tensor:
        '''Pad every observation of a batch and stack the results.'''
        outs = [self._pad_2d_obs(obs) for obs in inp]
        return torch.stack(outs)

    def _compute_output_obs(self, 
                            obs: Tensor) -> Tensor:
        '''
        Convolve one observation. Obs is a 2d square Tensor, so is param:
            out[w][h] = sum_{pw, ph} param[pw][ph] * obs_pad[w + pw][h + ph]
        '''
        obs_pad = self._pad_2d_obs(obs)

        out = torch.zeros(obs.shape)

        for o_w in range(out.shape[0]):
            for o_h in range(out.shape[1]):
                for p_w in range(self.param_size):
                    for p_h in range(self.param_size):
                        out[o_w][o_h] += self.param[p_w][p_h] * obs_pad[o_w+p_w][o_h+p_h]
        return out    

    def _compute_output(self) -> Tensor:
        '''Convolve every observation in `self.input_` and stack the outputs.'''
        outs = [self._compute_output_obs(obs) for obs in self.input_]
        return torch.stack(outs)

    def _compute_grads_obs(self, 
                           input_obs: Tensor,
                           output_grad_obs: Tensor) -> Tensor:
        '''
        Input gradient for one observation.

        input_obs: the observation; only its shape is used here.
        output_grad_obs: upstream gradient for that observation.
        '''
        output_obs_pad = self._pad_2d_obs(output_grad_obs)
        input_grad = torch.zeros_like(input_obs)

        for i_w in range(input_obs.shape[0]):
            for i_h in range(input_obs.shape[1]):
                for p_w in range(self.param_size):
                    for p_h in range(self.param_size):
                        # The padded output gradient is read with both
                        # filter indices reversed.
                        input_grad[i_w][i_h] += output_obs_pad[i_w+self.param_size-p_w-1][i_h+self.param_size-p_h-1] \
                        * self.param[p_w][p_h]

        return input_grad

    def _compute_grads(self, output_grad: Tensor) -> Tensor:
        '''Input gradient for the whole batch, one observation at a time.'''
        grads = [self._compute_grads_obs(self.input_[i], output_grad[i]) for i in range(output_grad.shape[0])]    

        return torch.stack(grads)


    def _param_grad(self, output_grad: Tensor) -> Tensor:
        '''
        Filter gradient, accumulated over every observation and every
        output position:
            param_grad[pw][ph] = sum_{i, w, h} inp_pad[i][w + pw][h + ph] * output_grad[i][w][h]

        Takes only `output_grad`, which is how ParamOperation.backward calls
        it; the input and the filter are read from the instance.
        '''
        inp_pad = self._pad_2d(self.input_)

        param_grad = torch.zeros_like(self.param)
        img_shape = output_grad.shape[1:]

        for i in range(self.input_.shape[0]):
            for o_w in range(img_shape[0]):
                for o_h in range(img_shape[1]):
                    for p_w in range(self.param_size):
                        for p_h in range(self.param_size):
                            param_grad[p_w][p_h] += inp_pad[i][o_w+p_w][o_h+p_h] \
                            * output_grad[i][o_w][o_h]
        return param_grad

TODO: 

* Channels (another for loop)
* `Flatten`
* How to wrap this in a layer - with activation, same as Dense
* Figure out how to write `for` loops in Cython

MNIST demo