In [1]:
# Add Lincoln to system path
import sys
sys.path.append("/Users/seth/development/lincoln/")

In [2]:
from torch import Tensor
import torch

import typing
from typing import List, Tuple

In [3]:
from lincoln.operations import Operation, ParamOperation
from lincoln.layers import Layer
from lincoln.activations import Activation, LinearAct

In [4]:
class Dense(Layer):
    '''
    Once we define all the Operations and the outline of a layer, all that remains to implement here 
    is the _setup_layer function!
    '''
    def __init__(self, 
                 neurons: int, 
                 activation: Activation = LinearAct) -> None:
        super().__init__(neurons)
        self.activation = activation

    def _setup_layer(self, num_in: int) -> None:
        # weights
        self.params.append(torch.empty(num_in, self.neurons).uniform_(-1, 1))
        
        # bias
        self.params.append(torch.empty(1, self.neurons).uniform_(-1, 1))
        
        self.operations = [WeightMultiply(self.params[0]), 
                           BiasAdd(self.params[1])] + [self.activation]

In [5]:
class WeightMultiply(ParamOperation):

    def __init__(self, W: Tensor):
        super().__init__(W)


    def _output(self) -> Tensor:
        return torch.mm(self.input, self.param)


    def _input_grad(self, output_grad: Tensor) -> Tensor:
        return torch.mm(output_grad, self.param.transpose(0, 1))


    def _param_grad(self, output_grad: Tensor) -> Tensor:
        return torch.mm(self.input.transpose(0, 1), output_grad)

In [6]:
def assert_same_shape(output: Tensor, 
                      output_grad: Tensor):
    assert output.shape == output_grad.shape, \
    '''
    Two tensors should have the same shape; instead, first Tensor's shape is {0}
    and second Tensor's shape is {1}.
    '''.format(tuple(output_grad.shape), tuple(output.shape))
    return None

In [7]:
def assert_dim(t: Tensor,
               dim: Tensor):
    assert len(t.shape) == dim, \
    '''
    Tensor expected to have dimension {0}, instead has dimension {1}
    '''.format(dim, len(t.shape))
    return None

## 1D Convolution

1 input, 1 output

## With padding

In [8]:
def _pad_1d(inp: Tensor,
            num: int) -> Tensor:
    z = torch.Tensor([0])
    z = z.repeat(num)
    return torch.cat([z, inp, z])

In [9]:
def conv_1d(inp: Tensor, 
            fil: Tensor) -> Tensor:
    
    # assert correct dimensions
    assert_dim(inp, 1)
    assert_dim(fil, 1)
    
    # pad the input
    fil_len = fil.shape[0]
    fil_mid = fil_len // 2
    inp_pad = _pad_1d(inp, fil_mid)
    
    # initialize the output
    out = torch.zeros(inp.shape)
    
    # perform the 1d convolution
    for o in range(out.shape[0]):
        for f in range(fil_len):
            out[o] += fil[f] * inp_pad[o+f]

    # ensure shapes didn't change            
    assert_same_shape(inp, out)
    return out

In [10]:
inp1 = Tensor([1,2,3,4,5])
fil1 = Tensor([1,1,1])
fil1_0 = Tensor([2,1,1])
fil1_1 = Tensor([1,2,1])
fil1_2 = Tensor([1,1,2])

In [11]:
def conv_1d_sum(inp: Tensor, 
                fil: Tensor) -> Tensor:
    out = conv_1d(inp, fil)
    return torch.sum(out).item()

In [12]:
conv_1d_sum(inp1, fil1)

39.0

In [13]:
print(conv_1d_sum(inp1, fil1_0) - conv_1d_sum(inp1, fil1))
print(conv_1d_sum(inp1, fil1_1) - conv_1d_sum(inp1, fil1))
print(conv_1d_sum(inp1, fil1_2) - conv_1d_sum(inp1, fil1))

10.0
15.0
14.0


In [14]:
def conv_1d_sum(inp: Tensor, 
                fil: Tensor) -> Tensor:
    out = conv_1d(inp, fil)
    return torch.sum(out).item()

In [15]:
import random
random.seed(92318)
random.randint(0, inp1.shape[0])
random.randint(0, fil1.shape[0])

0

In [16]:
inp1_4 = Tensor([1,2,3,4,6])
fil1 = Tensor([1,1,1])

In [17]:
conv_1d_sum(inp1, fil1)

39.0

In [18]:
conv_1d_sum(inp1_4, fil1)

41.0

In [19]:
inp1 = Tensor([1,2,3,4,5])
fil1_0 = Tensor([2,1,1])

In [20]:
conv_1d_sum(inp1, fil1)

39.0

In [21]:
conv_1d_sum(inp1, fil1_0)

49.0

### Gradients

In [22]:
def _param_grad(inp: Tensor, 
                fil: Tensor, 
                output_grad: Tensor = None) -> Tensor:
    
    fil_len = fil.shape[0]
    fil_mid = fil_len // 2
    inp_pad = _pad_1d(inp, fil_mid)
    
    if output_grad is None:
        output_grad = torch.ones_like(inp)
    else:
        assert_same_shape(inp, output_grad)
    
    output_pad = _pad_1d(output_grad, fil_mid)
    
    # Zero padded 1 dimensional convolution
    param_grad = torch.zeros_like(fil)
    input_grad = torch.zeros_like(inp)

    for f in range(fil.shape[0]):
        for o in range(inp.shape[0]):
            param_grad[f] += inp_pad[o+f] * output_grad[o]
            input_grad[o] += output_pad[o+fil_len-f-1] * fil[f]
        
    assert_same_shape(param_grad, fil)
    assert_same_shape(input_grad, inp)
    return param_grad, input_grad

In [23]:
def _input_grad(inp: Tensor, 
                fil: Tensor, 
                output_grad: Tensor = None) -> Tensor:
    
    fil_len = fil.shape[0]
    fil_mid = fil_len // 2
    inp_pad = _pad_1d(inp, fil_mid)
    
    if output_grad is None:
        output_grad = torch.ones_like(inp)
    else:
        assert_same_shape(inp, output_grad)
    
    output_pad = _pad_1d(output_grad, fil_mid)
    
    input_grad = torch.zeros_like(inp)

    for f in range(fil.shape[0]):
        for o in range(inp.shape[0]):
            input_grad[o] += output_pad[o+fil_len-f-1] * fil[f]
        
    assert_same_shape(input_grad, inp)
    return input_grad

### Gradient check

In [24]:
inp_t = Tensor([0,1,2,3,4,5,6])
fil_t = Tensor([1,1,1])

In [25]:
def _check_grad(inp: Tensor,
                fil: Tensor):
    
    inp_grad_check = torch.zeros_like(inp)
    fil_grad_check = torch.zeros_like(fil)

    for i in range(inp.shape[0]):
        inp_temp = inp.clone()
        inp_temp[i] = inp_temp[i] + 1
        sum1 = conv_1d_sum(inp, fil)
        sum2 = conv_1d_sum(inp_temp, fil)
        inp_grad_check[i] = sum2 - sum1

    for f in range(fil.shape[0]):
        fil_temp = fil.clone()
        fil_temp[f] = fil_temp[f] + 1
        sum1 = conv_1d_sum(inp, fil)
        sum2 = conv_1d_sum(inp, fil_temp)
        fil_grad_check[f] = sum2 - sum1
        
    return fil_grad_check, inp_grad_check

In [26]:
_param_grad(inp_t, fil_t)

(tensor([15., 21., 21.]), tensor([2., 3., 3., 3., 3., 3., 2.]))

In [27]:
_check_grad(inp_t, fil_t)

(tensor([15., 21., 21.]), tensor([2., 3., 3., 3., 3., 3., 2.]))

### Batch size of 2

#### Pad

In [28]:
inp_2 = Tensor([[0,1,2,3,4,5,6], 
                [1,2,3,4,5,6,7]])

In [29]:
def _pad_1d(inp: Tensor,
            num: int) -> Tensor:
    z = torch.Tensor([0])
    z = z.repeat(num)
    return torch.cat([z, inp, z])

In [30]:
def _pad_1d_batch(inp: Tensor, 
                  num: int) -> Tensor:
    outs = [_pad_1d(obs, num) for obs in inp]
    return torch.stack(outs)

In [31]:
_pad_1d_batch(inp_2, 1)

tensor([[0., 0., 1., 2., 3., 4., 5., 6., 0.],
        [0., 1., 2., 3., 4., 5., 6., 7., 0.]])

#### Forward

In [32]:
def conv_1d_batch(inp: Tensor, 
                  fil: Tensor) -> Tensor:

    outs = [conv_1d(obs, fil) for obs in inp]
    return torch.stack(outs)    

In [33]:
conv_1d_batch(inp_2, fil1)

tensor([[ 1.,  3.,  6.,  9., 12., 15., 11.],
        [ 3.,  6.,  9., 12., 15., 18., 13.]])

#### Grad

In [34]:
def grad_1d_batch(inp: Tensor, 
                  fil: Tensor) -> Tensor:

    out = conv_1d_batch(inp, fil)
    
    out_grad = torch.ones_like(out)
    
    batch_size = out_grad.shape[0]
        
    grads = [_input_grad(inp[i], fil, out_grad[i])[1] for i in range(batch_size)]    

    return torch.stack(grads)

In [35]:
def param_grad_1d_batch(inp: Tensor, 
                        fil: Tensor) -> Tensor:

    output_grad = torch.ones_like(inp)
    
    inp_pad = _pad_1d_batch(inp, 1)
    out_pad = _pad_1d_batch(inp, 1)

    param_grad = torch.ones_like(fil)    
    
    for i in range(inp.shape[0]):
        for p in range(fil.shape[0]):
            for o in range(inp.shape[1]):
                param_grad[p] += inp_pad[i][o+p] * output_grad[i][o]    
    return param_grad

## Testing conv + weight multiplication

#### Just `param_grad_1d_batch`

In [36]:
def param_grad_1d_batch(inp: Tensor, 
                        fil: Tensor, 
                        output_grad: Tensor) -> Tensor:

    inp_pad = _pad_1d_batch(inp, 1)
    out_pad = _pad_1d_batch(inp, 1)
    param_grad = torch.zeros_like(fil)    
    
    for i in range(inp.shape[0]):
        for p in range(fil.shape[0]):
            for o in range(inp.shape[1]):
                param_grad[p] += inp_pad[i][o+p] * output_grad[i][o]    
    return param_grad

In [37]:
def forward_conv(inp: Tensor, 
                     fil: Tensor):
    conv_out = conv_1d_batch(inp, fil)

    return round(torch.sum(conv_out).item(), 4)

In [38]:
torch.manual_seed(83118)
inp_t = Tensor(2, 7).uniform_(-1, 1)
fil = Tensor(torch.empty(3).uniform_(-1, 1))
print(inp_t)
print(fil)

tensor([[-0.4595, -0.0404,  0.7231,  0.9657,  0.9637, -0.6279,  0.7466],
        [ 0.8894, -0.9637,  0.1816,  0.4596,  0.6427, -0.0797, -0.8224]])
tensor([ 0.7842, -0.3495, -0.1091])


In [39]:
forward_conv(inp_t, fil)

0.9457

In [40]:
output_grad = torch.ones_like(inp_t)

In [41]:
param_grad_1d_batch(inp_t, 
                    fil, 
                    output_grad)

tensor([2.6545, 2.5786, 2.1487])

`fil_grad[0] = 2.6545`

Adding 0.1 to `fil[0]` should change the sum to

In [42]:
round(0.9457 + 0.1 * 2.6545, 4)

1.2111

In [43]:
fil2 = torch.clone(fil)
fil2[0] += 0.1
fil2

tensor([ 0.8842, -0.3495, -0.1091])

In [44]:
forward_conv(inp_t, fil2)

1.2112

#### Checking input gradient

In [45]:
grad_1d_batch(inp_2, fil)

tensor([0.3255, 0.3255])

`inp_t[0][0]`'s gradient is `0.4346`. So increasing it by 0.1 should increase the sum to:

In [46]:
round(0.9457 + 0.1 * 0.4346, 4)

0.9892

In [47]:
inp_t2 = inp_t.clone()
inp_t2[0][0] += 0.1
inp_t2

tensor([[-0.3595, -0.0404,  0.7231,  0.9657,  0.9637, -0.6279,  0.7466],
        [ 0.8894, -0.9637,  0.1816,  0.4596,  0.6427, -0.0797, -0.8224]])

In [48]:
forward_conv(inp_t2, fil)

0.9892

Works!

#### With weight multiplication

In [49]:
inp_t = Tensor(2, 7).uniform_(-1, 1)
fil = Tensor(torch.empty(3).uniform_(-1, 1))
weights = Tensor(torch.empty(7, 5).uniform_(-1, 1))

conv_out = conv_1d_batch(inp_t, fil)

out = torch.mm(conv_out, weights)

round(torch.sum(out).item(), 4)

-1.8207

In [50]:
fil

tensor([-0.4404,  0.2923, -0.1474])

In [51]:
out_grad = torch.mm(torch.ones_like(out), weights.transpose(0, 1))

fil_grad = param_grad_1d_batch(inp_t, fil, out_grad)

In [52]:
fil_grad

tensor([2.7563, 1.8538, 7.7927])

`fil_grad[0] = 2.7563`

Adding 0.1 to `FIL[0]` should change the sum to

In [53]:
round(-1.8207 + 0.1 * 2.7563, 4)

-1.5451

In [54]:
def forward_conv_mul(inp: Tensor, 
                     fil: Tensor, 
                     weights: Tensor):
    conv_out = conv_1d_batch(inp_t, fil)

    out = torch.mm(conv_out, weights)

    return round(torch.sum(out).item(), 4)

In [55]:
fil2 = torch.clone(fil)
fil2[0] += 0.1
fil2

tensor([-0.3404,  0.2923, -0.1474])

In [56]:
forward_conv_mul(inp_t, fil2, weights)

-1.545

### Operation

In [57]:
class Operation(object):

    def __init__(self):
        pass


    def forward(self, input_: Tensor):
        self.input_ = input_

        self.output = self._output()

        return self.output


    def backward(self, output_grad: Tensor) -> Tensor:

        assert_same_shape(self.output, output_grad)

        self._compute_grads(output_grad)

        assert_same_shape(self.input_, self.input_grad)
        return self.input_grad


    def _compute_grads(self, output_grad: Tensor) -> Tensor:
        self.input_grad = self._input_grad(output_grad)


    def _output(self) -> Tensor:
        raise NotImplementedError()


    def _input_grad(self, output_grad: Tensor) -> Tensor:
        raise NotImplementedError()
        

class ParamOperation(Operation):

    def __init__(self, param: Tensor) -> Tensor:
        super().__init__()
        self.param = param


    def _compute_grads(self, output_grad: Tensor) -> Tensor:
        self.input_grad = self._input_grad(output_grad)
        self.param_grad = self._param_grad(output_grad)


    def _param_grad(self, output_grad: Tensor) -> Tensor:
        raise NotImplementedError()


In [58]:
class Conv1D(ParamOperation):

    def __init__(self, 
                 param: Tensor):
        super().__init__(param)
        self.param_size = param.shape[0]
        self.param_pad = self.param_size // 2
        
    def _pad_1d_obs(self, obs: Tensor) -> Tensor:
        z = torch.Tensor([0])
        z = z.repeat(self.param_pad)
        return torch.cat([z, obs, z])

    def _pad_1d(self, inp: Tensor) -> Tensor:
        outs = [self._pad_1d_obs(obs) for obs in inp]
        return torch.stack(outs)    

    def _compute_output_obs(self, obs: Tensor):
        
        obs_pad = self._pad_1d_obs(obs)

        out = torch.zeros(obs.shape)

        for o in range(out.shape[0]):
            for p in range(self.param_size):
                out[o] += self.param[p] * obs_pad[o+p]
        return out
    
    def _output(self):
        
        outs = [self._compute_output_obs(obs) for obs in self.input_]
        return torch.stack(outs)
    
    def _compute_grads_obs(self, 
                           input_obs: Tensor,
                           output_grad_obs: Tensor) -> None:

        output_obs_pad = self._pad_1d_obs(output_grad_obs)
        input_obs_pad = self._pad_1d_obs(input_obs)
        input_grad = torch.zeros_like(input_obs)

        for p in range(self.param.shape[0]):
            for o in range(input_obs.shape[0]):
                input_grad[o] += output_obs_pad[o+self.param_size-p-1] * self.param[p]
                
        return input_grad
        
    def _input_grad(self, output_grad: Tensor) -> None:
        
        grads = [self._compute_grads_obs(self.input_[i], output_grad[i]) for i in range(output_grad.shape[0])]    

        return torch.stack(grads)

    def _param_grad(self, output_grad: Tensor) -> Tensor:

        inp_pad = self._pad_1d(self.input_)
        out_pad = self._pad_1d(output_grad)

        param_grad = torch.zeros_like(self.param)

        for i in range(self.input_.shape[0]):
            for p in range(self.param.shape[0]):
                for o in range(self.input_.shape[1]):
                    param_grad[p] += inp_pad[i][o+p] * output_grad[i][o]
        
        return param_grad
                

In [59]:
fil

tensor([-0.4404,  0.2923, -0.1474])

In [60]:
weights

tensor([[ 0.1779,  0.6374, -0.8940, -0.3311, -0.0193],
        [-0.1690,  0.0073,  0.8437,  0.9426,  0.6066],
        [ 0.4668,  0.5077, -0.6763,  0.0992, -0.4717],
        [ 0.0476, -0.3656,  0.7781, -0.5844,  0.8515],
        [ 0.8489,  0.6374,  0.6090,  0.6846,  0.5798],
        [-0.6807,  0.3116, -0.1502, -0.3789, -0.1404],
        [ 0.4816,  0.8886,  0.8650, -0.4064, -0.2798]])

In [61]:
class TestConv(Layer):
    '''
    Once we define all the Operations and the outline of a layer, all that remains to implement here 
    is the _setup_layer function!
    '''
    def __init__(self, 
                 neurons: int, 
                 activation: Activation = LinearAct,
                 filter_size: int = 3) -> None:
        super().__init__(neurons)
        self.activation = activation
        self.filter_size = filter_size
        

    def _setup_layer(self, num_in: int) -> None:
        # filter
        self.params.append(Tensor([-0.4404,  0.2923, -0.1474]))
        
        # Weights
        self.params.append(Tensor([[ 0.1779,  0.6374, -0.8940, -0.3311, -0.0193],
        [-0.1690,  0.0073,  0.8437,  0.9426,  0.6066],
        [ 0.4668,  0.5077, -0.6763,  0.0992, -0.4717],
        [ 0.0476, -0.3656,  0.7781, -0.5844,  0.8515],
        [ 0.8489,  0.6374,  0.6090,  0.6846,  0.5798],
        [-0.6807,  0.3116, -0.1502, -0.3789, -0.1404],
        [ 0.4816,  0.8886,  0.8650, -0.4064, -0.2798]]))
        
        self.operations = [Conv1D(self.params[0]),
                           WeightMultiply(self.params[1])]


In [62]:
a = TestConv(5)

In [63]:
inp_t

tensor([[-0.3163, -0.1501,  0.7113,  0.3191,  0.6163,  0.7247, -0.0049],
        [-0.1855,  0.7793,  0.8862, -0.1635, -0.4434,  0.8358,  0.8314]])

In [64]:
out = a.forward(Tensor(inp_t))
out

tensor([[-0.0912, -0.1621, -0.6499,  0.3251, -0.3171],
        [-0.5356, -0.1283, -0.2132,  0.2433, -0.2916]])

In [65]:
round(a.forward(Tensor(inp_t)).sum().item(), 4)

-1.8206

In [66]:
a.backward(torch.ones_like(out))

tensor([[-1.1080,  0.7482, -0.6709, -1.2561,  1.3323, -1.4810,  0.6059],
        [-1.1080,  0.7482, -0.6709, -1.2561,  1.3323, -1.4810,  0.6059]])

In [67]:
a._param_grads()

a.operations[0].param_grad

tensor([2.7561, 1.8538, 7.7929])

```python
tensor([2.7563, 1.8538, 7.7927])
```

Checks out! 

### 2D Convolutions

In [68]:
from torchvision.datasets import MNIST

In [69]:
mnist_trainset = MNIST(root="./data", train=True, download=False, transform=None)

In [70]:
mnist_data = mnist_trainset.train_data.type(torch.float32) / 255.0

#### Saving images

In [71]:
mnist_imgs = mnist_data[:10]

In [72]:
import numpy as np

pad = 2
obs = np.zeros(28)
a = np.zeros(pad)
np.concatenate([a, obs, a]).shape

(32,)

In [73]:
a = Tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [74]:
torch.manual_seed(90118)
fil = Tensor(torch.empty(3, 3).uniform_(-1, 1))
fil

tensor([[ 0.4100, -0.6319,  0.7821],
        [ 0.9452,  0.9196, -0.1515],
        [-0.3346,  0.3960,  0.7145]])

In [75]:
mnist_imgs = mnist_data[:2]
mnist_img = mnist_imgs[0]

#### Padding

In [76]:
def _pad_2d_obs(inp: Tensor, 
                num: int):
    '''
    Input is a 2 dimensional, square, 2D Tensor
    '''
    
    inp_pad = _pad_1d_batch(inp, num)
    other = torch.zeros(num, inp.shape[0] + num * 2)
    return torch.cat([other, inp_pad, other])

In [77]:
def _pad_1d_batch(inp: Tensor, 
                  num: int) -> Tensor:
#     import pdb; pdb.set_trace()
    outs = [_pad_1d(obs, num) for obs in inp]

    return torch.stack(outs)

In [78]:
def _pad_1d(inp: Tensor,
            num: int) -> Tensor:
    z = torch.Tensor([0])
    z = z.repeat(num)
#     import pdb; pdb.set_trace()
    return torch.cat([z, inp, z])

In [79]:
mnist_img[0].shape

torch.Size([28])

In [80]:
_pad_2d_obs(mnist_img, 1).shape

torch.Size([30, 30])

In [81]:
def _pad_2d(inp: Tensor, 
            num: int):
    '''
    Input is a 3 dimensional tensor, first dimension batch size
    '''

    outs = [_pad_2d_obs(obs, num) for obs in inp]
    return torch.stack(outs)

In [82]:
_pad_2d(mnist_imgs, 1).shape

torch.Size([2, 30, 30])

#### Compute output

**1D**:

```python
    def _compute_output_obs(self, obs: Tensor):
        
        obs_pad = self._pad_1d_obs(obs)

        out = torch.zeros(obs.shape)

        for o in range(out.shape[0]):
            for p in range(self.param_size):
                out[o] += self.param[p] * obs_pad[o+p]
        return out
    
    def _compute_output(self):
        
        outs = [self._compute_output_obs(obs) for obs in self.input_]
        return torch.stack(outs)
```

In [83]:
def _compute_output_obs_2d(obs: Tensor, 
                           param: Tensor):
    '''
    Obs is a 2d square Tensor, so is param
    '''
    param_mid = param.shape[0] // 2
    
    obs_pad = _pad_2d_obs(obs, param_mid)
    
    out = torch.zeros(obs.shape)
    
    for o_w in range(out.shape[0]):
        for o_h in range(out.shape[1]):
            for p_w in range(param.shape[0]):
                for p_h in range(param.shape[1]):
                    out[o_w][o_h] += param[p_w][p_h] * obs_pad[o_w+p_w][o_h+p_h]
    return out    

In [84]:
_compute_output_obs_2d(mnist_imgs[0], fil).shape

torch.Size([28, 28])

In [85]:
_compute_output_obs_2d(torch.ones_like(mnist_imgs[0]), 
                   torch.ones_like(fil)).shape

torch.Size([28, 28])

In [86]:
def _compute_output_2d(img_batch: Tensor,
                       param: Tensor):
    
    outs = [_compute_output_obs_2d(obs, param) for obs in img_batch]
    return torch.stack(outs)

In [87]:
_compute_output_2d(mnist_imgs, fil).shape

torch.Size([2, 28, 28])

#### Param grads


```python
def _compute_grads_obs(self, 
                       input_obs: Tensor,
                       output_grad_obs: Tensor) -> None:

    output_obs_pad = self._pad_1d_obs(output_grad_obs)
    input_obs_pad = self._pad_1d_obs(input_obs)
    input_grad = torch.zeros_like(input_obs)

    for p in range(self.param.shape[0]):
        for o in range(input_obs.shape[0]):
            input_grad[o] += output_obs_pad[o+self.param_size-p-1] * self.param[p]

    return input_grad

def _compute_grads(self, output_grad: Tensor) -> None:

    grads = [self._compute_grads_obs(self.input_[i], output_grad[i]) for i in range(output_grad.shape[0])]    

    return torch.stack(grads)

def _param_grad(self, output_grad: Tensor) -> Tensor:

    inp_pad = self._pad_1d(self.input_)
    out_pad = self._pad_1d(output_grad)

    param_grad = torch.zeros_like(self.param)

    for i in range(self.input_.shape[0]):
        for p in range(self.param.shape[0]):
            for o in range(self.input_.shape[1]):
                param_grad[p] += inp_pad[i][o+p] * output_grad[i][o]

    return param_grad
```

In [88]:
def _compute_grads_obs_2d(input_obs: Tensor,
                          output_grad_obs: Tensor, 
                          param: Tensor) -> Tensor:
    '''
    input_obs: 2D Tensor representing the input observation
    output_grad_obs: 2D Tensor representing the output gradient  
    param: 2D filter
    '''
    
    param_size = param.shape[0]
    output_obs_pad = _pad_2d_obs(output_grad_obs, param_size // 2)
    input_grad = torch.zeros_like(input_obs)
    
    for i_w in range(input_obs.shape[0]):
        for i_h in range(input_obs.shape[1]):
            for p_w in range(param_size):
                for p_h in range(param_size):
                    input_grad[i_w][i_h] += output_obs_pad[i_w+param_size-p_w-1][i_h+param_size-p_h-1] \
                    * param[p_w][p_h]

    return input_grad

def _compute_grads_2d(inp: Tensor,
                      output_grad: Tensor, 
                      param: Tensor) -> Tensor:

    grads = [_compute_grads_obs_2d(inp[i], output_grad[i], param) for i in range(output_grad.shape[0])]    

    return torch.stack(grads)


def _param_grad_2d(inp: Tensor,
                output_grad: Tensor, 
                param: Tensor) -> Tensor:

    param_size = param.shape[0]
    inp_pad = _pad_2d(inp, param_size // 2)

    param_grad = torch.zeros_like(param)
    img_shape = output_grad.shape[1:]
    
    for i in range(inp.shape[0]):
        for o_w in range(img_shape[0]):
            for o_h in range(img_shape[1]):
                for p_w in range(param_size):
                    for p_h in range(param_size):
                        param_grad[p_w][p_h] += inp_pad[i][o_w+p_w][o_h+p_h] \
                        * output_grad[i][o_w][o_h]
    return param_grad

In [89]:
img_grads = _compute_grads_2d(mnist_imgs, 
                  torch.ones_like(mnist_imgs),
                  fil)

In [90]:
img_grads.shape

torch.Size([2, 28, 28])

In [91]:
_param_grad_2d(mnist_imgs, 
               torch.ones_like(mnist_imgs),
               fil).shape

torch.Size([3, 3])

In [92]:
_param_grad_2d(torch.ones_like(mnist_imgs), 
            torch.ones_like(mnist_imgs),
            torch.ones_like(fil))

tensor([[1458., 1512., 1458.],
        [1512., 1568., 1512.],
        [1458., 1512., 1458.]])

## Testing functions

In [93]:
torch.manual_seed(83118)
inp_2d = Tensor(2, 28, 28).uniform_(-1, 1)
fil_2d = Tensor(torch.empty(3, 3).uniform_(-1, 1))
print(inp_2d)
print(fil_2d)

tensor([[[-0.4595, -0.0404,  0.7231,  ...,  0.7793,  0.8862, -0.1635],
         [-0.4434,  0.8358,  0.8314,  ...,  0.8515,  0.8489,  0.6374],
         [ 0.6090,  0.6846,  0.5798,  ..., -0.9869, -0.5122,  0.1004],
         ...,
         [-0.6893,  0.6844,  0.3930,  ...,  0.8784, -0.9622,  0.6246],
         [ 0.2483,  0.2489,  0.6214,  ..., -0.4182,  0.3494, -0.3749],
         [-0.0431, -0.5983,  0.0159,  ...,  0.9402, -0.7548, -0.0260]],

        [[-0.9996, -0.0940, -0.2361,  ..., -0.8060,  0.2113,  0.2238],
         [ 0.8753, -0.0206,  0.7403,  ...,  0.0474, -0.1131, -0.9616],
         [ 0.9618, -0.7869, -0.1776,  ..., -0.2627,  0.0214,  0.1699],
         ...,
         [-0.7987,  0.2323,  0.5628,  ..., -0.3837,  0.3447,  0.7020],
         [ 0.1002,  0.5694, -0.3786,  ...,  0.0969, -0.9385, -0.9083],
         [-0.4816, -0.8912, -0.8171,  ...,  0.9857,  0.5978,  0.6349]]])
tensor([[-0.4692,  0.8415, -0.3125],
        [-0.4369, -0.4208,  0.9089],
        [-0.5128,  0.0941,  0.4598]])


In [94]:
out = _compute_output_2d(inp_2d, fil_2d)

In [95]:
torch.sum(out)

tensor(4.3893)

In [96]:
out_grad_2d = torch.ones_like(inp_2d)

### Testing input gradient

In [97]:
inp_grads_2d = _compute_grads_2d(inp_2d, out_grad_2d, fil_2d)

In [98]:
inp_grads_2d[1][1][1]

tensor(0.1522)

`inp_grads_2d[1][1][1] = 0.1522` That means if we increase `inp_2d[1][1][1]` by 0.1, the sum will increase to:

In [99]:
round(4.3893 + 0.1 * 0.1522, 4)

4.4045

In [100]:
def conv_2d_sum(inp: Tensor, 
                fil: Tensor):
    out = _compute_output_2d(inp, fil)
    return torch.sum(out)

In [101]:
inp_2d_2 = inp_2d.clone()
inp_2d_2[1][1][1] += 0.1
inp_2d_2

tensor([[[-0.4595, -0.0404,  0.7231,  ...,  0.7793,  0.8862, -0.1635],
         [-0.4434,  0.8358,  0.8314,  ...,  0.8515,  0.8489,  0.6374],
         [ 0.6090,  0.6846,  0.5798,  ..., -0.9869, -0.5122,  0.1004],
         ...,
         [-0.6893,  0.6844,  0.3930,  ...,  0.8784, -0.9622,  0.6246],
         [ 0.2483,  0.2489,  0.6214,  ..., -0.4182,  0.3494, -0.3749],
         [-0.0431, -0.5983,  0.0159,  ...,  0.9402, -0.7548, -0.0260]],

        [[-0.9996, -0.0940, -0.2361,  ..., -0.8060,  0.2113,  0.2238],
         [ 0.8753,  0.0794,  0.7403,  ...,  0.0474, -0.1131, -0.9616],
         [ 0.9618, -0.7869, -0.1776,  ..., -0.2627,  0.0214,  0.1699],
         ...,
         [-0.7987,  0.2323,  0.5628,  ..., -0.3837,  0.3447,  0.7020],
         [ 0.1002,  0.5694, -0.3786,  ...,  0.0969, -0.9385, -0.9083],
         [-0.4816, -0.8912, -0.8171,  ...,  0.9857,  0.5978,  0.6349]]])

In [102]:
print(conv_2d_sum(inp_2d, fil_2d))
print(conv_2d_sum(inp_2d_2, fil_2d))

tensor(4.3893)
tensor(4.4045)


Works!

### Testing param gradient

In [103]:
param_grads_2d = _param_grad_2d(inp_2d, out_grad_2d, fil)
param_grads_2d

tensor([[12.9452, 11.3265, 15.7122],
        [10.9300,  9.9202, 14.8306],
        [ 7.7102,  6.6400, 10.0913]])

`param_grads_2d[1][1] = 9.9202` That means if we increase `param_grads_2d[1][1]` by 0.1, the sum will increase to:

In [104]:
round(4.3893 + 9.9202 * 0.1, 4)

5.3813

In [105]:
fil_2d_2 = fil_2d.clone()
fil_2d_2[1][1] += 0.1
fil_2d_2

tensor([[-0.4692,  0.8415, -0.3125],
        [-0.4369, -0.3208,  0.9089],
        [-0.5128,  0.0941,  0.4598]])

In [106]:
print(conv_2d_sum(inp_2d, fil_2d))
print(conv_2d_sum(inp_2d, fil_2d_2))

tensor(4.3893)
tensor(5.3813)


Works!

## Conv2D class

In [107]:
class Conv2D(ParamOperation):

    def __init__(self, 
                 param: Tensor):
        super().__init__(param)
        self.param_size = param.shape[0]
        self.param_pad = self.param_size // 2
        
    def _pad_1d_obs(self, obs: Tensor) -> Tensor:
        z = torch.Tensor([0])
        z = z.repeat(self.param_pad)
        return torch.cat([z, obs, z])

    def _pad_1d(self, inp: Tensor) -> Tensor:
        outs = [self._pad_1d_obs(obs) for obs in inp]
        return torch.stack(outs)
        
    def _pad_2d_obs(self,
                    inp: Tensor):

        inp_pad = self._pad_1d_batch(inp, self.param_pad)
        other = torch.zeros(num, inp.shape[0] + num * 2)
        return torch.cat([other, inp_pad, other])

    def _pad_2d(self, inp: Tensor):
        
        outs = [_pad_2d_obs(obs, num) for obs in inp]
        return torch.stack(outs)

    def _compute_output_obs(self, 
                            obs: Tensor):
        '''
        Obs is a 2d square Tensor, so is param
        '''
        obs_pad = self._pad_2d_obs(obs)

        out = torch.zeros(obs.shape)

        for o_w in range(out.shape[0]):
            for o_h in range(out.shape[1]):
                for p_w in range(self.param_size):
                    for p_h in range(self.param_size):
                        out[o_w][o_h] += self.param_size[p_w][p_h] * obs_pad[o_w+p_w][o_h+p_h]
        return out    

    def _compute_output(self):

        outs = [self._compute_output_obs(obs) for obs in self.input_]
        return torch.stack(outs)

    def _compute_grads_obs(self, 
                           input_obs: Tensor,
                           output_grad_obs: Tensor) -> Tensor:

        output_obs_pad = self._pad_2d_obs(output_grad_obs)
        input_grad = torch.zeros_like(input_obs)

        for i_w in range(input_obs.shape[0]):
            for i_h in range(input_obs.shape[1]):
                for p_w in range(param_size):
                    for p_h in range(param_size):
                        input_grad[i_w][i_h] += output_obs_pad[i_w+self.param_size-p_w-1][i_h+self.param_size-p_h-1] \
                        * self.param[p_w][p_h]

        return input_grad

    def _compute_grads(self, output_grad: Tensor) -> Tensor:

        grads = [_compute_grads_obs(self.input_[i], output_grad[i], self.param) for i in range(output_grad.shape[0])]    

        return torch.stack(grads)


    def _param_grad(self, output_grad: Tensor) -> Tensor:

        inp_pad = _pad_2d(self.input_)

        param_grad = torch.zeros_like(self.param)
        img_shape = output_grad.shape[1:]

        for i in range(self.input_.shape[0]):
            for o_w in range(img_shape[0]):
                for o_h in range(img_shape[1]):
                    for p_w in range(self.param_size):
                        for p_h in range(self.param_size):
                            param_grad[p_w][p_h] += inp_pad[i][o_w+p_w][o_h+p_h] \
                            * output_grad[i][o_w][o_h]
        return param_grad

### With channels

In [108]:
class Conv2D(ParamOperation):

    def __init__(self, 
                 param: Tensor):
        super().__init__(param)
        self.param_size = param.shape[0]
        self.param_pad = self.param_size // 2
        
    def _pad_1d_obs(self, obs: Tensor) -> Tensor:
        z = torch.Tensor([0])
        z = z.repeat(self.param_pad)
        return torch.cat([z, obs, z])

    def _pad_1d(self, inp: Tensor) -> Tensor:
        outs = [self._pad_1d_obs(obs) for obs in inp]
        return torch.stack(outs)
        
    def _pad_2d_obs(self,
                    inp: Tensor):
        '''
        "inp" is a 2 dimensional tensor, representing (image width by image height) 
        '''
        inp_pad = self._pad_1d_batch(inp, self.param_pad)
        other = torch.zeros(num, inp.shape[0] + num * 2)
        return torch.cat([other, inp_pad, other])

    def _pad_2d_channel(self, inp: Tensor):
        '''
        "inp" is a 3 dimensional tensor, representing (image width by image height) 
        '''
        num_channels = inp.shape[2]
        return torch.stack([_pad_2d_obs(select_channel(inp, i), num) 
                            for i in range(num_channels)], dim=2)

    def _pad_input(self, inp: Tensor):   
        return torch.stack([_pad_2d_channel(obs) for obs in inp], dim=0)

    
    def _compute_output_obs(self, 
                            obs: Tensor):
        '''
        Obs is a 2d square Tensor, so is param
        '''
        obs_pad = self._pad_2d_obs(obs)

        out = torch.zeros(obs.shape)

        for o_w in range(out.shape[0]):
            for o_h in range(out.shape[1]):
                for p_w in range(self.param_size):
                    for p_h in range(self.param_size):
                        out[o_w][o_h] += self.param_size[p_w][p_h] * obs_pad[o_w+p_w][o_h+p_h]
        return out    

    def _compute_output(self):

        outs = [self._compute_output_obs(obs) for obs in self.input_]
        return torch.stack(outs)

    def _compute_grads_obs(self, 
                           input_obs: Tensor,
                           output_grad_obs: Tensor) -> Tensor:

        output_obs_pad = self._pad_2d_obs(output_grad_obs)
        input_grad = torch.zeros_like(input_obs)

        for i_w in range(input_obs.shape[0]):
            for i_h in range(input_obs.shape[1]):
                for p_w in range(param_size):
                    for p_h in range(param_size):
                        input_grad[i_w][i_h] += output_obs_pad[i_w+self.param_size-p_w-1][i_h+self.param_size-p_h-1] \
                        * self.param[p_w][p_h]

        return input_grad

    def _compute_grads(self, output_grad: Tensor) -> Tensor:

        grads = [_compute_grads_obs(self.input_[i], output_grad[i], self.param) for i in range(output_grad.shape[0])]    

        return torch.stack(grads)


    def _param_grad(self, output_grad: Tensor) -> Tensor:

        inp_pad = _pad_2d(self.input_)

        param_grad = torch.zeros_like(self.param)
        img_shape = output_grad.shape[1:]

        for i in range(self.input_.shape[0]):
            for o_w in range(img_shape[0]):
                for o_h in range(img_shape[1]):
                    for p_w in range(self.param_size):
                        for p_h in range(self.param_size):
                            param_grad[p_w][p_h] += inp_pad[i][o_w+p_w][o_h+p_h] \
                            * output_grad[i][o_w][o_h]
        return param_grad

In [109]:
torch.manual_seed(92718)
batch_size = 5
img_height = 4
in_channels = 6
out_channels = 9
fil_size = 3
inp = Tensor(torch.empty(batch_size, in_channels, img_height, img_height).uniform_(-1, 1))
param = Tensor(torch.empty(out_channels, in_channels, fil_size, fil_size).uniform_(-1, 1))

In [110]:
inp.shape

torch.Size([5, 6, 4, 4])

In [111]:
img_1 = inp[0]
img_1.shape

torch.Size([6, 4, 4])

In [112]:
img_1[0].shape

torch.Size([4, 4])

In [113]:
def select_channel(inp: Tensor, i: int):
    return torch.index_select(inp, dim=0, index=torch.LongTensor([i])).squeeze(2)

In [114]:
select_channel(img_1, 2).shape

torch.Size([1, 4, 4])

In [115]:
def _pad_2d_channel(inp: Tensor, num: int):
    '''
    "inp" is a 3 dimensional tensor, representing (image width by image height) 
    '''


def _pad_conv_input(inp: Tensor, num: int):   
    return torch.stack([_pad_2d_channel(obs, num) for obs in inp])

In [116]:
_pad_conv_input(inp, 1).shape

torch.Size([5, 6, 6, 6])

In [117]:
[1,2,3,4][2:]

[3, 4]

In [118]:
def _compute_output_obs(obs: Tensor, 
                        fil: Tensor):

    param_size = fil.shape[2]
    param_mid = param_size // 2
    in_channels = obs.shape[0]
    out_channels = fil.shape[0]
    img_size = obs.shape[1]
    obs_pad = _pad_2d_channel(obs, param_mid)
    
    out = torch.zeros((out_channels,) + obs.shape[1:])
    for c_out in range(out_channels):
        for c_in in range(in_channels):
            for o_w in range(img_size):
                for o_h in range(img_size):
                    for p_w in range(param_size):
                        for p_h in range(param_size):
                            out[c_out][o_w][o_h] += \
                            fil[c_out][c_in][p_w][p_h] * obs_pad[c_in][o_w+p_w][o_h+p_h]
    return out    

def _compute_output(inp: Tensor,
                    fil: Tensor) -> Tensor:

    outs = [_compute_output_obs(obs, fil) for obs in inp]    

    return torch.stack(outs)

In [125]:
out = _compute_output(inp, param)

In [126]:
round(torch.sum(out).item(), 4)

33.6228

In [127]:
out.shape

torch.Size([5, 9, 4, 4])

In [128]:
param.shape

torch.Size([9, 6, 3, 3])

In [130]:
def _compute_grads_obs(input_obs: Tensor,
                       output_grad_obs: Tensor,
                       fil: Tensor) -> Tensor:
    '''
    Input is dimension 3:
    
    '''
    input_grad = torch.zeros_like(input_obs)    
    param_size = fil.shape[2]
    param_mid = param_size // 2
    img_size = input_obs.shape[1]
    in_channels = input_obs.shape[0]
    out_channels = fil.shape[0]
    output_obs_pad = _pad_2d_channel(output_grad_obs, param_mid)
    
    for c_in in range(in_channels):
        for c_out in range(out_channels):
            for i_w in range(input_obs.shape[1]):
                for i_h in range(input_obs.shape[2]):
                    for p_w in range(param_size):
                        for p_h in range(param_size):
                            input_grad[c_in][i_w][i_h] += \
                            output_obs_pad[c_out][i_w+param_size-p_w-1][i_h+param_size-p_h-1] \
                            * fil[c_out][c_in][p_w][p_h]
    return input_grad

def _compute_grads(inp: Tensor,
                   output_grad: Tensor, 
                   fil: Tensor) -> Tensor:

    grads = [_compute_grads_obs(inp[i], output_grad[i], fil) for i in range(output_grad.shape[0])]    

    return torch.stack(grads)



In [131]:
param.shape[3]

3

In [132]:
inp[0].shape[2]

4

In [134]:
inp_grad = _compute_grads(inp, torch.ones_like(out), param)

In [135]:
inp_grad[1][1][1][1]

tensor(4.2669)

`inp_grad[1][1][1][1] == 4.2669`. This means that if we increase this value by 0.1, the sum of "out" will change to 

In [136]:
round(33.6228 + 4.2669 * 0.1, 4)

34.0495

In [137]:
inp_2 = inp.clone()
inp_2[1][1][1][1] += 0.1

In [140]:
out = _compute_output(inp_2, param)
round(torch.sum(out).item(), 4)

34.0495

Works!

### param_grad

In [141]:
def _param_grad(inp: Tensor,
                output_grad: Tensor, 
                fil: Tensor) -> Tensor:

    param_grad = torch.zeros_like(fil)    
    param_size = fil.shape[2]
    param_mid = param_size // 2
    img_size = inp.shape[2]
    in_channels = inp.shape[1]
    out_channels = output_grad.shape[1]    

    inp_pad = _pad_conv_input(inp, param_mid)
    img_shape = output_grad.shape[2:]

    for i in range(inp.shape[0]):
        for c_in in range(in_channels):
            for c_out in range(out_channels):
                for o_w in range(img_shape[0]):
                    for o_h in range(img_shape[1]):
                        for p_w in range(param_size):
                            for p_h in range(param_size):
                                param_grad[c_out][c_in][p_w][p_h] += \
                                inp_pad[i][c_in][o_w+p_w][o_h+p_h] \
                                * output_grad[i][c_out][o_w][o_h]
    return param_grad

In [142]:
p_grad = _param_grad(inp, torch.ones_like(out), param)

In [146]:
param[2][2][2][2]

tensor(0.7363)

In [147]:
p_grad[2][2][2][2]

tensor(-3.4440)

`p_grad[2][2][2][2] == -3.4440`. This means that if we increase this value by 0.1, the sum of "out" will change to 

In [155]:
round(33.6228 + -3.4440 * 0.1, 4)

33.2784

In [152]:
param_2 = param.clone()
param_2[2][2][2][2] += 0.1

In [153]:
torch.sum(_compute_output(inp, param))

tensor(33.6228)

In [154]:
torch.sum(_compute_output(inp, param_2))

tensor(33.2784)

Works!

## Numpy conversion

In [156]:
def _pad_1d_obs_np(obs, pad):
    a = np.zeros(pad)
    z = np.concatenate([a, obs, a])
    return z

In [157]:
def _pad_1d_np(inp, pad):
    return np.stack([_pad_1d_obs_np(obs, pad) for obs in inp])

In [158]:
def _pad_2d_obs_np(inp, pad):
    inp_pad = _pad_1d_np(inp, pad)
    other = np.zeros((pad, inp.shape[0] + pad * 2))
    return np.concatenate([other, inp_pad, other])


def _pad_2d_batch_np(inp, pad):
    return np.stack([_pad_2d_obs_np(obs, pad) for obs in inp])

In [159]:
img_1.shape

torch.Size([6, 4, 4])

In [160]:
img_1_np = img_1.numpy()

In [161]:
def _pad_2d_channel(inp: Tensor, num: int):
    '''
    "inp" is a 3 dimensional tensor, representing (image width by image height) 
    '''
    num_channels = inp.shape[0]
    return torch.stack([_pad_2d_obs(inp[i], num) for i in range(num_channels)])

def _pad_conv_input(inp: Tensor, num: int):   
    return torch.stack([_pad_2d_channel(obs, num) for obs in inp])

In [162]:
_pad_2d_channel(img_1, 1).shape

torch.Size([6, 6, 6])

In [163]:
def _pad_2d_channel_np(inp, num):
    return np.stack([_pad_2d_obs_np(channel, num) for channel in inp])

In [164]:
_pad_2d_channel_np(img_1, 1).shape

(6, 6, 6)

In [165]:
def _pad_conv_input(inp: Tensor, num: int):   
    return torch.stack([_pad_2d_channel(obs, num) for obs in inp])
_pad_conv_input(inp, 1).shape

torch.Size([5, 6, 6, 6])

In [166]:
def _pad_conv_input_np(inp, num):   
    return np.stack([_pad_2d_channel_np(obs, num) for obs in inp])

In [167]:
_pad_conv_input_np(inp.numpy(), 1).shape

(5, 6, 6, 6)

In [212]:
def _pad_1d_obs_cy(obs, pad):
    a = np.zeros(pad)
    z = np.concatenate([a, obs, a])
    return z


def _pad_1d_batch_cy(inp, pad):
    return np.stack([_pad_1d_obs_cy(obs, pad) for obs in inp])

In [213]:
def _pad_2d_obs_cy(inp, pad):
    inp_pad = _pad_1d_batch_cy(inp, pad)
    other = np.zeros((pad, inp.shape[0] + pad * 2))
    return np.concatenate([other, inp_pad, other])

In [214]:
def _pad_2d_channel_cy(inp, pad):
    return np.stack([_pad_2d_obs_cy(channel, pad) for channel in inp])

In [221]:
def _compute_output_obs_cy(inp, param):

    param_size = param.shape[2]
    in_channels = inp.shape[0]
    out_channels = param.shape[0]
    param_mid = param_size // 2
    img_size = inp.shape[1]

    obs_pad = _pad_2d_channel_cy(inp, param_mid)

    out = np.zeros((out_channels, img_size, img_size))

    for c_out in range(out_channels):
        for c_in in range(in_channels):
            for o_w in range(img_size):
                for o_h in range(img_size):
                    for p_w in range(param_size):
                        for p_h in range(param_size):
                              out[c_out][o_w][o_h] += \
                              param[c_out][c_in][p_w][p_h] * \
                              obs_pad[c_in][o_w+p_w][o_h+p_h]
    return out


def _output_cy(inp, param):

    return np.stack([_compute_output_obs_cy(obs, param) for obs in inp])

In [222]:
inp = np.random.uniform(size=(2,1,28,28))
param = np.random.uniform(size=(24,1,3,3))

In [224]:
_output_cy(inp, param).shape

(2, 24, 28, 28)

### Pytorch conv

In [169]:
from torch import nn

batch_size = 5
img_size = 28
n_channels = 1
inp = Tensor(torch.empty(batch_size, 
                       n_channels,
                       img_size, 
                       img_size).uniform_(-1, 1))

In [170]:
inp.shape

torch.Size([5, 1, 28, 28])

In [171]:
inp2 = inp.detach() 

In [172]:
inp2.shape

torch.Size([5, 1, 28, 28])

In [173]:
inp2.requires_grad = True

In [174]:
inp2.grad_fn

In [175]:
out_channels = 4
op = nn.Conv2d(1, 4, 3, padding=1, bias=False)

In [176]:
type(op)

torch.nn.modules.conv.Conv2d

In [177]:
output = op(inp2)

In [178]:
output.shape

torch.Size([5, 4, 28, 28])

In [179]:
output.requires_grad

True

In [180]:
inp2.grad.shape

AttributeError: 'NoneType' object has no attribute 'shape'

In [181]:
output.backward(gradient=torch.ones_like(output))

In [182]:
inp2.grad.shape

torch.Size([5, 1, 28, 28])

In [184]:
op.weight.grad

tensor([[[[53.6569, 55.5921, 53.9341],
          [61.2716, 62.0950, 61.3836],
          [67.9083, 67.9084, 67.8764]]],


        [[[53.6569, 55.5921, 53.9341],
          [61.2716, 62.0950, 61.3836],
          [67.9083, 67.9084, 67.8764]]],


        [[[53.6569, 55.5921, 53.9341],
          [61.2716, 62.0950, 61.3836],
          [67.9083, 67.9084, 67.8764]]],


        [[[53.6569, 55.5921, 53.9341],
          [61.2716, 62.0950, 61.3836],
          [67.9083, 67.9084, 67.8764]]]])

In [185]:
op.weight.grad.shape

torch.Size([4, 1, 3, 3])

In [None]:
for p in b.:
    print(p.grad.shape)