In [4]:
# AlexNet
import torch
from torch import nn
from d2l import load_data_fashion_mnist, train_ch6_gpu

# AlexNet sized for a single-channel 224x224 input.
# Spatial sizes below follow floor((in - kernel + 2 * padding) / stride) + 1.
net = nn.Sequential(
    # 1x224x224 -> 96x54x54: floor((224 - 11 + 2) / 4) + 1 = 54
    nn.Conv2d(in_channels=1, out_channels=96, kernel_size=11, stride=4, padding=1),
    nn.ReLU(),
    # 96x54x54 -> 96x26x26: floor((54 - 3) / 2) + 1 = 26
    nn.MaxPool2d(kernel_size=3, stride=2),
    # 5x5 kernel with padding 2 keeps the spatial size: 256x26x26
    nn.Conv2d(in_channels=96, out_channels=256, kernel_size=5, padding=2),
    nn.ReLU(),
    # 256x26x26 -> 256x12x12: floor((26 - 3) / 2) + 1 = 12
    nn.MaxPool2d(kernel_size=3, stride=2),
    # Three 3x3 convolutions with padding 1; the spatial size stays 12x12.
    nn.Conv2d(in_channels=256, out_channels=384, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.Conv2d(in_channels=384, out_channels=384, kernel_size=3, padding=1),
    nn.ReLU(),
    # NOTE: the author left an open "??? why" question on this 384 -> 256 layer.
    nn.Conv2d(in_channels=384, out_channels=256, kernel_size=3, padding=1),
    nn.ReLU(),
    # 256x12x12 -> 256x5x5: floor((12 - 3) / 2) + 1 = 5
    nn.MaxPool2d(kernel_size=3, stride=2),
    nn.Flatten(),
    # Classifier head: 256*5*5 = 6400 features -> 4096 -> 4096 -> 10 outputs.
    nn.Linear(256 * 5 * 5, 4096),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    nn.Linear(4096, 4096),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    nn.Linear(4096, 10),
)

# Sanity check: one random 1x224x224 sample must produce 10 outputs.
X = torch.randn(1, 1, 224, 224)
print(net(X).shape)

# Hyper-parameters for the AlexNet run.
batch_size = 128
lr = 0.1
num_epochs = 10

# resize=224 matches the 1x224x224 input the layer-size comments of the network assume.
# load_data_fashion_mnist / train_ch6_gpu come from the project's d2l module.
train_iter, test_iter = load_data_fashion_mnist(batch_size, resize=224)
train_ch6_gpu(net, train_iter, test_iter, num_epochs, lr, 'cuda:0')

torch.Size([1, 10])
net will be trained on cuda:0
batch_0: train_l:2.304079532623291, train_accuracy:0.1015625, time_count:0.1982729434967041
batch_10: train_l:2.2864859104156494, train_accuracy:0.1484375, time_count:0.1981797218322754
batch_20: train_l:2.2666110084170388, train_accuracy:0.16703869047619047, time_count:0.19864797592163086
batch_30: train_l:2.249250127423194, train_accuracy:0.19480846774193547, time_count:0.19713163375854492
batch_40: train_l:2.2502381220096495, train_accuracy:0.1794969512195122, time_count:0.19765233993530273
batch_50: train_l:2.2339439836202883, train_accuracy:0.18244485294117646, time_count:0.19726800918579102
batch_60: train_l:2.2086783471654674, train_accuracy:0.19697745901639344, time_count:0.19753289222717285
batch_70: train_l:2.1412801406752897, train_accuracy:0.21764964788732394, time_count:0.1972038745880127
batch_80: train_l:2.041437046763338, train_accuracy:0.25559413580246915, time_count:0.19713258743286133
batch_90: train_l:1.9694457244087

KeyboardInterrupt: 

In [1]:
# VGG Block
import torch
from torch import nn


def vgg_block(num_convs, in_channels, out_channels):
    """
    Build one VGG block: ``num_convs`` 3x3 convolutions (padding 1), each followed
    by a ReLU, and a single 2x2 max-pool with stride 2 at the end.

    :param num_convs: number of Conv2d + ReLU pairs in the block
    :param in_channels: channel count fed to the first convolution
    :param out_channels: channel count produced by every convolution in the block
    :return: an ``nn.Sequential`` holding the layers in order
    """
    # Only the first convolution sees in_channels; the rest map out_channels -> out_channels.
    widths = [in_channels] + [out_channels] * num_convs
    stack = []
    for c_in, c_out in zip(widths[:-1], widths[1:]):
        stack += [nn.Conv2d(c_in, c_out, kernel_size=3, padding=1), nn.ReLU()]
    stack.append(nn.MaxPool2d(kernel_size=2, stride=2))
    return nn.Sequential(*stack)


# (num_convs, out_channels) for each VGG block: 1 + 1 + 2 + 2 + 2 = 8 conv layers.
conv_arch = ((1, 64), (1, 128), (2, 256), (2, 512), (2, 512))


# VGG net
def vgg(conv_arch, in_channels=1, input_size=224, num_classes=10):
    """
    Assemble a VGG-style network: a stack of VGG blocks followed by three
    fully connected layers.

    :param conv_arch: iterable of ``(num_convs, out_channels)`` pairs, one per block
    :param in_channels: channels of the input image (default 1)
    :param input_size: height/width of the (square) input image (default 224);
        used to size the first fully connected layer
    :param num_classes: number of outputs of the final linear layer (default 10)
    :return: the network as an ``nn.Sequential``
    :raises ValueError: if the blocks would shrink the feature map below 1x1
    """
    conv_blks = []
    for (num_convs, out_channels) in conv_arch:
        conv_blks.append(vgg_block(num_convs, in_channels=in_channels, out_channels=out_channels))
        in_channels = out_channels

    # Each block ends in a 2x2/stride-2 max-pool, i.e. a floor-halving of H and W;
    # repeated floor-halving equals one floor division by 2**n.
    # With the defaults this is 224 // 2**5 = 7, the value previously hard-coded.
    feature_size = input_size // (2 ** len(conv_blks))
    if feature_size < 1:
        raise ValueError(
            f"input_size={input_size} is too small for {len(conv_blks)} VGG blocks"
        )

    # in_channels now holds the width of the last block (it no longer relies on
    # the loop variable leaking out of the for statement).
    return nn.Sequential(
        *conv_blks,
        nn.Flatten(),
        # full connection layers
        nn.Linear(in_channels * feature_size * feature_size, 4096), nn.ReLU(), nn.Dropout(p=0.5),
        nn.Linear(4096, 4096), nn.ReLU(), nn.Dropout(0.5),
        nn.Linear(4096, num_classes)
    )


# conv_arch describes VGG-11 (8 conv layers + 3 fully connected layers).
# The full-width network is deliberately NOT instantiated here: it used to be
# built and immediately overwritten by the reduced-width one below, which only
# cost time and memory.
ratio = 4
small_conv_arch = [(pair[0], pair[1] // ratio) for pair in conv_arch]  # same depth, 1/ratio the channels
net = vgg(small_conv_arch)

from d2l import load_data_fashion_mnist, train_ch6_gpu

# NOTE(review): the recorded run of this cell ended in "CUDA out of memory" on a
# 3.82 GiB GPU; a smaller batch_size is the obvious knob to try - confirm on the
# target hardware.
lr, num_epochs, batch_size = 0.05, 10, 128
train_iter, test_iter = load_data_fashion_mnist(batch_size, resize=224)
train_ch6_gpu(net, train_iter, test_iter, num_epochs, lr, 'cuda:0')

net will be trained on cuda:0


RuntimeError: CUDA out of memory. Tried to allocate 98.00 MiB (GPU 0; 3.82 GiB total capacity; 2.57 GiB already allocated; 29.44 MiB free; 2.58 GiB reserved in total by PyTorch)

In [None]:
import torch
from torch import nn
from d2l import load_data_fashion_mnist, train_ch6_gpu


# NiN (Network in Network): a c_in x 1 x 1 convolution replaces a fully connected
# layer, which greatly reduces the number of parameters.
def nin_block(in_channels, out_channels, kernel_size, strides, padding):
    """
    Build one NiN block: a regular convolution followed by two 1x1 convolutions,
    each of the three followed by a ReLU.

    :param in_channels: channels of the block input
    :param out_channels: channels produced by all three convolutions
    :param kernel_size: kernel size of the first convolution
    :param strides: stride of the first convolution
    :param padding: padding of the first convolution
    :return: the six layers wrapped in an ``nn.Sequential``
    """
    spatial_conv = nn.Conv2d(in_channels, out_channels,
                             kernel_size=kernel_size, stride=strides, padding=padding)
    pointwise_first = nn.Conv2d(out_channels, out_channels, kernel_size=1)
    pointwise_second = nn.Conv2d(out_channels, out_channels, kernel_size=1)
    return nn.Sequential(
        spatial_conv, nn.ReLU(),
        pointwise_first, nn.ReLU(),
        pointwise_second, nn.ReLU(),
    )


# So why use two 1x1 conv layers?

# NiN: three (nin_block, max-pool) stages, dropout, then a classification nin_block.
net = nn.Sequential(
    nin_block(1, 96, kernel_size=11, strides=4, padding=0),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nin_block(96, 256, kernel_size=5, strides=1, padding=2),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nin_block(256, 384, kernel_size=3, strides=1, padding=1),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nn.Dropout(p=0.5),
    # The last NiN block emits 10 channels (one per class) instead of using a linear layer ...
    nin_block(384, 10, kernel_size=3, strides=1, padding=1),
    # ... global average pooling reduces each channel to 1x1, and Flatten yields (batch, 10).
    nn.AdaptiveAvgPool2d((1, 1)),
    nn.Flatten(),
)

# Hyper-parameters for the NiN run.
lr = 0.1
num_epochs = 50
batch_size = 128

# NOTE(review): in the recorded output the loss stays around 2.30 and accuracy
# around 0.10 for the first 90 batches, i.e. no visible learning yet.
train_iter, test_iter = load_data_fashion_mnist(batch_size, resize=224)
train_ch6_gpu(net, train_iter, test_iter, num_epochs, lr, 'cuda:0')

net will be trained on cuda:0
batch_0: train_l:2.3009300231933594, train_accuracy:0.078125, time_count:0.2532370090484619
batch_10: train_l:2.301686503670432, train_accuracy:0.10866477272727272, time_count:0.21374726295471191
batch_20: train_l:2.306446154912313, train_accuracy:0.09933035714285714, time_count:0.21476244926452637
batch_30: train_l:2.3069870318135908, train_accuracy:0.09828629032258064, time_count:0.21554088592529297
batch_40: train_l:2.305805421457058, train_accuracy:0.10041920731707317, time_count:0.21606230735778809
batch_50: train_l:2.3062391982359043, train_accuracy:0.10033700980392157, time_count:0.2150895595550537
batch_60: train_l:2.306294671824721, train_accuracy:0.10053790983606557, time_count:0.21730446815490723
batch_70: train_l:2.306099102530681, train_accuracy:0.10453345070422536, time_count:0.21565961837768555
batch_80: train_l:2.3061735924379327, train_accuracy:0.10387731481481481, time_count:0.21639060974121094
batch_90: train_l:2.305754006563962, train_a

In [5]:
# GoogLeNet
import torch
from torch import nn
from torch.nn import functional as F


class Inception(nn.Module):
    """
    Inception block: four parallel paths over the same input whose outputs are
    concatenated along the channel dimension.

    :param in_channel: channels of the block input
    :param c1: output channels of path 1 (1x1 conv)
    :param c2: pair ``(reduce, out)`` for path 2 (1x1 conv, then 3x3 conv)
    :param c3: pair ``(reduce, out)`` for path 3 (1x1 conv, then 5x5 conv)
    :param c4: output channels of path 4 (3x3 max-pool, then 1x1 conv)
    :param kwargs: forwarded unchanged to ``nn.Module.__init__``

    The output has ``c1 + c2[1] + c3[1] + c4`` channels; every path keeps the
    spatial size (stride 1 with matching padding).
    """

    def __init__(self, in_channel, c1, c2, c3, c4, **kwargs):
        super().__init__(**kwargs)
        c2_reduce, c2_out = c2[0], c2[1]
        c3_reduce, c3_out = c3[0], c3[1]

        # path 1: 1x1 conv
        self.p1_1 = nn.Conv2d(in_channel, c1, kernel_size=1)

        # path 2: 1x1 conv to reduce channels, then 3x3 conv
        self.p2_1 = nn.Conv2d(in_channel, c2_reduce, kernel_size=1)
        self.p2_2 = nn.Conv2d(c2_reduce, c2_out, kernel_size=3, padding=1)

        # path 3: 1x1 conv to reduce channels, then 5x5 conv
        self.p3_1 = nn.Conv2d(in_channel, c3_reduce, kernel_size=1)
        self.p3_2 = nn.Conv2d(c3_reduce, c3_out, kernel_size=5, padding=2)

        # path 4: 3x3 max-pool (stride 1), then 1x1 conv
        self.p4_1 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
        self.p4_2 = nn.Conv2d(in_channel, c4, kernel_size=1)

    def forward(self, X):
        """Run the four paths on ``X`` and concatenate their ReLU outputs on dim 1."""
        out_1x1 = F.relu(self.p1_1(X))

        reduced_3x3 = F.relu(self.p2_1(X))
        out_3x3 = F.relu(self.p2_2(reduced_3x3))

        reduced_5x5 = F.relu(self.p3_1(X))
        out_5x5 = F.relu(self.p3_2(reduced_5x5))

        # No ReLU between the pooling and the 1x1 conv on this path.
        pooled = self.p4_1(X)
        out_pool = F.relu(self.p4_2(pooled))

        branches = (out_1x1, out_3x3, out_5x5, out_pool)
        return torch.cat(branches, dim=1)  # concatenate along channels


# Stage 1: 7x7/stride-2 convolution and a stride-2 max-pool.
block_1 = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=64, kernel_size=7, stride=2, padding=3),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
)

# Stage 2: 1x1 conv, 3x3 conv widening to 192 channels, stride-2 max-pool.
block_2 = nn.Sequential(
    nn.Conv2d(in_channels=64, out_channels=64, kernel_size=1),
    nn.ReLU(),
    nn.Conv2d(in_channels=64, out_channels=192, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
)

# Stage 3: two Inception blocks. Output channels are the sum over the four paths.
block_3 = nn.Sequential(
    Inception(192, c1=64, c2=(96, 128), c3=(16, 32), c4=32),    # 64 + 128 + 32 + 32 = 256
    Inception(256, c1=128, c2=(128, 192), c3=(32, 96), c4=64),  # 128 + 192 + 96 + 64 = 480
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
)

# Stage 4: five Inception blocks.
block_4 = nn.Sequential(
    Inception(480, c1=192, c2=(96, 208), c3=(16, 48), c4=64),     # -> 512
    Inception(512, c1=160, c2=(112, 224), c3=(24, 64), c4=64),    # -> 512
    Inception(512, c1=128, c2=(128, 256), c3=(24, 64), c4=64),    # -> 512
    Inception(512, c1=112, c2=(144, 288), c3=(32, 64), c4=64),    # -> 528
    Inception(528, c1=256, c2=(160, 320), c3=(32, 128), c4=128),  # -> 832
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
)

# Stage 5: two Inception blocks, then global average pooling down to a 1024-vector.
block_5 = nn.Sequential(
    Inception(832, c1=256, c2=(160, 320), c3=(32, 128), c4=128),  # -> 832
    Inception(832, c1=384, c2=(192, 384), c3=(48, 128), c4=128),  # -> 1024
    nn.AdaptiveAvgPool2d((1, 1)),
    nn.Flatten(),
)

# Full network: the five stages plus a 10-way linear classifier.
net = nn.Sequential(
    block_1,
    block_2,
    block_3,
    block_4,
    block_5,
    nn.Linear(1024, 10),
)

from d2l import load_data_fashion_mnist,train_ch6_gpu
# Hyper-parameters for the GoogLeNet run.
lr = 0.1
num_epochs = 10
batch_size = 128

# NOTE(review): in the recorded output the loss stays around 2.30 and accuracy
# around 0.10 for the first 90 batches, i.e. no visible learning yet.
train_iter, test_iter = load_data_fashion_mnist(batch_size, resize=96)
train_ch6_gpu(net, train_iter, test_iter, num_epochs, lr, 'cuda:0')

net will be trained on cuda:0
batch_0: train_l:2.310108184814453, train_accuracy:0.09375, time_count:0.23650741577148438
batch_10: train_l:2.3037409348921343, train_accuracy:0.08096590909090909, time_count:0.20711088180541992
batch_20: train_l:2.303336801983061, train_accuracy:0.0896577380952381, time_count:0.2083423137664795
batch_30: train_l:2.30305254074835, train_accuracy:0.09173387096774194, time_count:0.20851373672485352
batch_40: train_l:2.302480121938194, train_accuracy:0.0975609756097561, time_count:0.2091217041015625
batch_50: train_l:2.3028254462223425, train_accuracy:0.09666053921568628, time_count:0.211592435836792
batch_60: train_l:2.302575017585129, train_accuracy:0.09784836065573771, time_count:0.21390533447265625
batch_70: train_l:2.3026920976773115, train_accuracy:0.09881161971830986, time_count:0.22005558013916016
batch_80: train_l:2.302569354021991, train_accuracy:0.10146604938271606, time_count:0.22761821746826172
batch_90: train_l:2.3024109641274255, train_accurac

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fb85299f940>
Traceback (most recent call last):
  File "/home/siky/Applications/YOLOX/demo/MegEngine/PVMegengine/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1324, in __del__
    self._shutdown_workers()
  File "/home/siky/Applications/YOLOX/demo/MegEngine/PVMegengine/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1297, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/lib/python3.8/multiprocessing/process.py", line 149, in join
    res = self._popen.wait(timeout)
  File "/usr/lib/python3.8/multiprocessing/popen_fork.py", line 44, in wait
    if not wait([self.sentinel], timeout):
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 931, in wait
    ready = selector.select(timeout)
  File "/usr/lib/python3.8/selectors.py", line 415, in select
    fd_event_list = self._selector.poll(timeout)
KeyboardInterrupt: 


KeyboardInterrupt: 

In [18]:
# ResNet
# Residual_Block


import torch
from torch import nn
from torch.nn import functional as F

class Residual(nn.Module):
    """
    Residual block: conv3x3 -> BN -> ReLU -> conv3x3 -> BN, added to a shortcut
    of the input and passed through a final ReLU.

    :param in_channel: channels of the block input
    :param num_channel: channels of the block output
    :param use_1x1conv: if True, the shortcut goes through a 1x1 convolution so
        its channels (and stride) match the main path; otherwise the input is
        added unchanged
    :param stride: stride of the first 3x3 convolution and of the 1x1 shortcut
    """

    def __init__(self, in_channel, num_channel, use_1x1conv = False, stride = 1):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channel, num_channel, kernel_size=3, padding=1, stride=stride)
        self.conv2 = nn.Conv2d(num_channel, num_channel, kernel_size=3, padding=1)
        # Optional projection so the shortcut has the same shape as the main path.
        self.conv3 = (
            nn.Conv2d(in_channel, num_channel, kernel_size=1, stride=stride)
            if use_1x1conv
            else None
        )
        self.bn1 = nn.BatchNorm2d(num_channel)
        self.bn2 = nn.BatchNorm2d(num_channel)

    def forward(self, X):
        """Return ``relu(main_path(X) + shortcut(X))``."""
        main = self.conv1(X)
        main = F.relu(self.bn1(main))
        main = self.bn2(self.conv2(main))

        shortcut = X if self.conv3 is None else self.conv3(X)
        return F.relu(main + shortcut)

# Smoke test: with the 1x1 shortcut the block may change the channel count
# (4 -> 3 here) while the 6x6 spatial size is kept at stride 1.
blk = Residual(in_channel=4, num_channel=3, use_1x1conv=True)
X = torch.rand(4, 4, 6, 6)
Y = blk(X)
Y.shape

torch.Size([4, 3, 6, 6])

In [22]:
# ResNet
# Stem: 7x7/stride-2 conv and stride-2 max-pool as in GoogLeNet's first stage,
# with a BatchNorm layer inserted after the convolution.
b1 = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=64, kernel_size=7, stride=2, padding=3),
    nn.BatchNorm2d(64),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
)

def resnet_block(input_channels, num_channels, num_residuals,
                 first_block=False):
    """
    Build the list of Residual units that make up one ResNet stage.

    :param input_channels: channels entering the stage
    :param num_channels: channels produced by every unit of the stage
    :param num_residuals: number of Residual units in the stage
    :param first_block: True for the stage right after the stem, which must not
        halve the resolution again; other stages start with a stride-2 unit
    :return: a plain Python list of ``Residual`` modules (unpack it into
        ``nn.Sequential``)
    """
    blk = []
    for i in range(num_residuals):
        if i == 0 and not first_block:
            # Stage entry: halve H/W and change channels; the shortcut needs a
            # 1x1 projection to match.
            blk.append(Residual(input_channels, num_channels,
                                use_1x1conv=True, stride=2))
        elif i == 0 and input_channels != num_channels:
            # First stage with a channel change: previously input_channels was
            # ignored here and the unit failed at forward time. Project the
            # shortcut but keep the resolution (stride 1).
            blk.append(Residual(input_channels, num_channels,
                                use_1x1conv=True))
        else:
            blk.append(Residual(num_channels, num_channels))
    return blk
# Four stages of two residual units each. Only the first stage keeps the
# resolution; each later stage starts with a stride-2 unit and doubles the channels.
b2 = nn.Sequential(*resnet_block(input_channels=64, num_channels=64, num_residuals=2, first_block=True))
b3 = nn.Sequential(*resnet_block(input_channels=64, num_channels=128, num_residuals=2))
b4 = nn.Sequential(*resnet_block(input_channels=128, num_channels=256, num_residuals=2))
b5 = nn.Sequential(*resnet_block(input_channels=256, num_channels=512, num_residuals=2))


In [24]:
# ResNet-18-style network: stem, four residual stages, global average pooling
# and a 10-way linear classifier.
net = nn.Sequential(b1,b2,b3,b4,b5,nn.AdaptiveAvgPool2d((1,1)),nn.Flatten(), nn.Linear(512,10))

# Shape probe: push a random batch through the top-level modules one by one.
# It runs in eval mode and without autograd so that it
#   * does not update the BatchNorm running statistics with random noise, and
#   * does not keep the activations of a 128x1x224x224 batch alive for backward.
X = torch.rand(size=(128, 1, 224, 224))
was_training = net.training
net.eval()
try:
    with torch.no_grad():
        for layer in net:
            X = layer(X)
            print(layer.__class__.__name__,'output shape:\t', X.shape)
finally:
    # Restore whatever mode the network was in before the probe.
    net.train(was_training)

Sequential output shape:	 torch.Size([128, 64, 56, 56])
Sequential output shape:	 torch.Size([128, 64, 56, 56])
Sequential output shape:	 torch.Size([128, 128, 28, 28])
Sequential output shape:	 torch.Size([128, 256, 14, 14])
Sequential output shape:	 torch.Size([128, 512, 7, 7])
AdaptiveAvgPool2d output shape:	 torch.Size([128, 512, 1, 1])
Flatten output shape:	 torch.Size([128, 512])
Linear output shape:	 torch.Size([128, 10])


In [26]:
from d2l import load_data_fashion_mnist
from d2l import train_ch6_gpu
# Hyper-parameters for the ResNet run.
lr = 0.05
num_epochs = 10
batch_size = 256

train_iter, test_iter = load_data_fashion_mnist(batch_size, resize=96)
train_ch6_gpu(net, train_iter, test_iter, num_epochs, lr, 'cuda:0')

net will be trained on cuda:0
batch_0: train_l:2.8372650146484375, train_accuracy:0.125, time_count:0.8660681247711182
batch_10: train_l:2.040444005619396, train_accuracy:0.40980113636363635, time_count:0.2935636043548584
batch_20: train_l:1.4339956499281383, train_accuracy:0.5647321428571429, time_count:0.2933018207550049
batch_30: train_l:1.1670774730943865, train_accuracy:0.6349546370967742, time_count:0.29345059394836426
batch_40: train_l:1.0215179360494382, train_accuracy:0.6745426829268293, time_count:0.29442453384399414
batch_50: train_l:0.9123971713524238, train_accuracy:0.7058823529411765, time_count:0.2938680648803711
batch_60: train_l:0.8418931213558697, train_accuracy:0.7260502049180327, time_count:0.29495859146118164
batch_70: train_l:0.7851747555631987, train_accuracy:0.7420224471830986, time_count:0.2954397201538086
batch_80: train_l:0.7449468911429982, train_accuracy:0.7535686728395061, time_count:0.29618406295776367
batch_90: train_l:0.7118597754410335, train_accuracy:

In [3]:
# DenseNet: replaces ResNet's addition with concatenation - each layer's output is concatenated with its input along the channel dimension
import torch
from torch import nn

def conv_block(in_channels, num_channels):
    """
    Build the basic DenseNet unit: BatchNorm -> ReLU -> 3x3 convolution.

    :param in_channels: channels of the input
    :param num_channels: channels produced by the convolution
    :return: the three layers as an ``nn.Sequential``; the 3x3 kernel with
        padding 1 keeps the spatial size
    """
    norm = nn.BatchNorm2d(in_channels)  # normalisation leaves the channel count unchanged
    conv = nn.Conv2d(in_channels, num_channels, kernel_size=3, padding=1)
    return nn.Sequential(norm, nn.ReLU(), conv)

class DenseBlock(nn.Module):
    """
    Dense block: a chain of conv_blocks that all emit ``num_channels`` channels.
    Each conv_block receives the concatenation of the block input and the
    outputs of all previous conv_blocks.

    :param num_convs: number of conv_blocks in the dense block
    :param in_channels: channels of the block input
    :param num_channels: channels added by each conv_block

    The output has ``in_channels + num_convs * num_channels`` channels.
    """

    def __init__(self, num_convs, in_channels, num_channels):
        super().__init__()
        # The idx-th conv_block sees the input plus idx earlier outputs.
        self.net = nn.Sequential(*[
            conv_block(in_channels + idx * num_channels, num_channels)
            for idx in range(num_convs)
        ])

    def forward(self, X):
        """Feed ``X`` through the chain, growing it along the channel dimension."""
        features = X
        for stage in self.net:
            # Append the new feature maps to everything computed so far.
            features = torch.cat((features, stage(features)), dim=1)
        return features

# Smoke test: 2 conv_blocks adding 10 channels each to a 3-channel input
# give 3 + 2 * 10 = 23 output channels at unchanged spatial size.
blk = DenseBlock(num_convs=2, in_channels=3, num_channels=10)
X = torch.rand(128, 3, 64, 64)
Y = blk(X)
print(Y.shape)

def transition_block(in_channels, num_channels):
    """
    Build a DenseNet transition layer: BatchNorm -> ReLU -> 1x1 convolution ->
    2x2 average pooling with stride 2.

    :param in_channels: channels of the input
    :param num_channels: channels after the 1x1 convolution
    :return: the four layers as an ``nn.Sequential``; the pooling halves
        height and width
    """
    norm = nn.BatchNorm2d(in_channels)
    # The 1x1 convolution cuts the channel count to keep the network small.
    squeeze = nn.Conv2d(in_channels, num_channels, kernel_size=1)
    downsample = nn.AvgPool2d(kernel_size=2, stride=2)
    return nn.Sequential(norm, nn.ReLU(), squeeze, downsample)

# Smoke test on the 23-channel dense-block output Y: channels drop to 10 and
# height/width are halved.
tblk = transition_block(in_channels=23, num_channels=10)
print(tblk(Y).shape)

torch.Size([128, 23, 64, 64])
torch.Size([128, 10, 32, 32])


In [5]:
# DenseNet implementation
# Stem: 7x7/stride-2 convolution, BatchNorm, ReLU and a stride-2 max-pool.
b1 = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=64, kernel_size=7, stride=2, padding=3),
    nn.BatchNorm2d(64),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
)

num_channels = 64  # running channel count; starts at the stem's output width
growth_rate = 32  # channels added by every conv layer inside a dense block
num_convs_in_dense_blocks = [4, 4, 4, 4]
blks = []  # dense blocks interleaved with transition blocks
for i, num_convs in enumerate(num_convs_in_dense_blocks):
    # A dense block grows the channel count by num_convs * growth_rate.
    blks.append(DenseBlock(num_convs, num_channels, growth_rate))
    num_channels = num_channels + num_convs * growth_rate
    # Between two dense blocks (i.e. after every block but the last), insert a
    # transition layer that halves both the channel count and the height/width.
    if i + 1 < len(num_convs_in_dense_blocks):
        blks.append(transition_block(num_channels, num_channels // 2))
        num_channels //= 2

# Head: BatchNorm + ReLU, global average pooling, then a 10-way linear classifier.
net = nn.Sequential(
    b1,
    *blks,
    nn.BatchNorm2d(num_channels),
    nn.ReLU(),
    nn.AdaptiveAvgPool2d((1, 1)),
    nn.Flatten(),
    nn.Linear(num_channels, 10),
)


In [6]:
from d2l import load_data_fashion_mnist,train_ch6_gpu

# Hyper-parameters for the DenseNet run.
lr = 0.1
num_epochs = 10
batch_size = 256

train_iter, test_iter = load_data_fashion_mnist(batch_size, resize=96)
train_ch6_gpu(net, train_iter, test_iter, num_epochs, lr, 'cuda:0')

net will be trained on cuda:0
batch_0: train_l:2.4899649620056152, train_accuracy:0.06640625, time_count:0.5744199752807617
batch_10: train_l:1.5340485030954534, train_accuracy:0.5095880681818182, time_count:0.2806875705718994
batch_20: train_l:1.227696838833037, train_accuracy:0.6069568452380952, time_count:0.2815587520599365
batch_30: train_l:1.0672901368910266, train_accuracy:0.653351814516129, time_count:0.2812154293060303
batch_40: train_l:0.9729181528091431, train_accuracy:0.6804496951219512, time_count:0.2823641300201416
batch_50: train_l:0.9042483509755602, train_accuracy:0.7001378676470589, time_count:0.2823350429534912
batch_60: train_l:0.8512513647313977, train_accuracy:0.7149718237704918, time_count:0.28495335578918457
batch_70: train_l:0.809513379990215, train_accuracy:0.727387764084507, time_count:0.28977036476135254
batch_80: train_l:0.7742350414211367, train_accuracy:0.7377989969135802, time_count:0.29097485542297363
batch_90: train_l:0.7403636843293577, train_accuracy: