In [1]:
import torch
from torch import nn
from d2l import torch as d2l
from torch.nn import functional as F

In [2]:
class Residual(nn.Module):
    """Basic residual block: two 3x3 conv + batch-norm layers plus a skip path.

    The main path is conv1 -> bn1 -> ReLU -> conv2 -> bn2. The block input is
    added to that result (optionally after a 1x1 convolution) and a final
    ReLU is applied.

    Args:
        input_channels: number of channels of the incoming tensor.
        num_channels: number of channels produced by the block.
        use_1x1conv: if True, the skip path goes through a 1x1 convolution
            with the same ``stride`` so that it matches the main path when
            the channel count and/or spatial size changes.
        stride: stride of the first 3x3 convolution (and of the 1x1
            convolution on the skip path, when present).
    """

    def __init__(self, input_channels, num_channels, use_1x1conv=False, stride=1):
        super().__init__()
        # Only conv1 carries the stride; conv2 keeps stride=1 so it does not
        # change the shape again.
        self.conv1 = nn.Conv2d(input_channels, num_channels, kernel_size=3, padding=1, stride=stride)
        self.conv2 = nn.Conv2d(num_channels, num_channels, kernel_size=3, padding=1)
        if use_1x1conv:
            self.conv3 = nn.Conv2d(input_channels, num_channels, kernel_size=1, stride=stride)
        else:
            self.conv3 = None
        self.bn1 = nn.BatchNorm2d(num_channels)
        self.bn2 = nn.BatchNorm2d(num_channels)
        self.relu = nn.ReLU(inplace=True)  # in-place just to save memory

    def forward(self, X):
        """Return relu(main_path(X) + skip(X)); X itself is not modified."""
        Y = self.relu(self.bn1(self.conv1(X)))
        Y = self.bn2(self.conv2(Y))
        # Explicit None check instead of relying on the truthiness of a module.
        if self.conv3 is not None:
            X = self.conv3(X)
        Y += X
        return self.relu(Y)

In [3]:
# Sanity check: with equal input/output channels and the default stride,
# the block should return a tensor of the same shape as its input.
blk = Residual(input_channels=3, num_channels=3)
X = torch.rand((4, 3, 6, 6))
Y = blk(X)
Y.shape

torch.Size([4, 3, 6, 6])

In [4]:
# Sanity check: with a 1x1 convolution on the skip path and stride=2, the
# block changes both the number of channels and the spatial size.
blk = Residual(input_channels=3, num_channels=6, use_1x1conv=True, stride=2)
X = torch.rand((4, 3, 6, 6))
Y = blk(X)
Y.shape

torch.Size([4, 6, 3, 3])

In [5]:
# First block (stem) of the net: a 7x7 stride-2 convolution on a
# single-channel input, batch norm, ReLU, then a 3x3 stride-2 max-pool.
b1 = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=64, kernel_size=7, stride=2, padding=3),
    nn.BatchNorm2d(num_features=64),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
)
# construct resnet block
def resnet_block(input_channels, num_channels, num_residuals, first_block = False):
    """Build the list of Residual units that make up one ResNet stage.

    Args:
        input_channels: channels of the tensor entering the stage.
        num_channels: channels produced by every unit in the stage.
        num_residuals: number of Residual units in the stage.
        first_block: if True, the first unit keeps stride 1 (no spatial
            downsampling); otherwise the first unit uses stride 2 with a
            1x1 convolution on the skip path.

    Returns:
        A plain Python list of Residual modules (unpack it into
        nn.Sequential to use it as a layer).
    """
    blk = []
    for i in range(num_residuals):
        if i == 0 and not first_block:
            blk.append(Residual(input_channels, num_channels, use_1x1conv=True, stride=2))
        elif i == 0:
            # First stage: no downsampling, but still honour input_channels.
            # A 1x1 projection is only added when the channel count changes,
            # so the input_channels == num_channels case is built exactly as before.
            blk.append(Residual(input_channels, num_channels,
                                use_1x1conv=(input_channels != num_channels)))
        else:
            blk.append(Residual(num_channels, num_channels)) # for the rest blk, do not change channels & shape
    return blk

# Four stages of two residual units each. Only the first stage passes
# first_block=True, so it does not shrink the data shape.
b2 = nn.Sequential(*resnet_block(input_channels=64, num_channels=64, num_residuals=2, first_block=True))
b3 = nn.Sequential(*resnet_block(input_channels=64, num_channels=128, num_residuals=2))
b4 = nn.Sequential(*resnet_block(input_channels=128, num_channels=256, num_residuals=2))
b5 = nn.Sequential(*resnet_block(input_channels=256, num_channels=512, num_residuals=2))

# Head: AdaptiveAvgPool2d reduces each feature map to (batch_size, channels, 1, 1),
# Flatten drops the two trailing singleton axes, and the linear layer maps
# the 512 features to 10 outputs.
net = nn.Sequential(
    b1, b2, b3, b4, b5,
    nn.AdaptiveAvgPool2d((1, 1)),
    nn.Flatten(),
    nn.Linear(in_features=512, out_features=10),
)

In [6]:
# Feed one random single-channel 224x224 input through the network, one
# top-level module at a time, printing the output shape after each.
X = torch.rand((1, 1, 224, 224))
for layer in net.children():
    X = layer(X)
    print(type(layer).__name__, 'Output shape: ', X.shape)

Sequential Output shape:  torch.Size([1, 64, 56, 56])
Sequential Output shape:  torch.Size([1, 64, 56, 56])
Sequential Output shape:  torch.Size([1, 128, 28, 28])
Sequential Output shape:  torch.Size([1, 256, 14, 14])
Sequential Output shape:  torch.Size([1, 512, 7, 7])
AdaptiveAvgPool2d Output shape:  torch.Size([1, 512, 1, 1])
Flatten Output shape:  torch.Size([1, 512])
Linear Output shape:  torch.Size([1, 10])


In [7]:
# Training hyper-parameters.
lr = 0.05
num_epochs = 8
batch_size = 256

# Fashion-MNIST iterators; resize=224 is passed so the images match the
# 224x224 input used in the shape check (presumably upscaling them - confirm
# against the d2l helper).
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=224)

# Train on whatever device d2l.try_gpu() returns.
d2l.train_ch6(net, train_iter, test_iter, num_epochs, lr, d2l.try_gpu())

training on cpu


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/liwuchen/miniforge3/envs/pytorch/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3444, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/w6/cz35k79j1h5g7lqb3fy6r5xm0000gn/T/ipykernel_19248/1824282394.py", line 4, in <module>
    d2l.train_ch6(net, train_iter, test_iter, num_epochs, lr, d2l.try_gpu())
  File "/Users/liwuchen/miniforge3/envs/pytorch/lib/python3.9/site-packages/d2l/torch.py", line 498, in train_ch6
    l.backward()
  File "/Users/liwuchen/miniforge3/envs/pytorch/lib/python3.9/site-packages/torch/_tensor.py", line 255, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "/Users/liwuchen/miniforge3/envs/pytorch/lib/python3.9/site-packages/torch/autograd/__init__.py", line 147, in backward
    Variable._execution_engine.run_backward(
KeyboardInterrupt

During handling of the above exception, another exception occurre

TypeError: object of type 'NoneType' has no len()

Error in callback <function flush_figures at 0x1516c3550> (for post_execute):


KeyboardInterrupt: 

# How does ResNet handle vanishing gradients?

Let's start from a one-layer net: $y=f(x)$, where $w$ denotes the weights of $f$.

For a net with two layers, $y' = g(f(x)) = g(y)$, the gradient with respect to $w$ is:

$$
\frac{\partial y'}{\partial w} = \frac{\partial g(y)}{\partial y}\frac{\partial y}{\partial w}
$$

If the second layer, $g(y)$, is strong enough, it tends to fit (and even overfit) the data very quickly, because it is closer to the output. Its gradient $\frac{\partial g(y)}{\partial y}$ then becomes small, and since the total gradient is a product that contains this factor, the total gradient becomes small as well -> vanishing gradient.

For ResNet, $y''=f(x)+g(f(x))$, the gradient is:

$$
\frac{\partial y''}{\partial w} = \frac{\partial y}{\partial w}+\frac{\partial g(y)}{\partial y}\frac{\partial y}{\partial w}
$$

So, even if the gradient of the second layer is small, the total gradient is still approximately the gradient of the first layer, $\frac{\partial y}{\partial w}$, which helps the first layer converge.