# ResNet_Plus

- F(x)=G(x)+Residual
- 在已有模型的基础上拓宽函数空间（新的函数空间包含原来的函数空间），让加深后的模型至少不会比原模型更差

In [1]:
! pip install d2l

In [2]:
import torch
from torch import nn
import torchvision
from torch.nn import functional as F
from d2l import torch as d2l
import os

class Residual(nn.Module):  #@save
    """Residual unit: two 3x3 convolutions plus a shortcut connection.

    The main path is conv3x3 -> batch norm -> ReLU -> conv3x3 -> batch norm.
    Its result is added to the shortcut and passed through a final ReLU.

    Args:
        input_channels: Number of channels of the incoming feature map.
        num_channels: Number of channels produced by both 3x3 convolutions.
        use_1x1conv: If true, the shortcut is a 1x1 convolution (using the
            same stride as the first 3x3 convolution); otherwise the input
            itself is used as the shortcut.
        strides: Stride of the first 3x3 convolution and of the 1x1 shortcut.
    """

    def __init__(self, input_channels, num_channels, use_1x1conv=False,
                 strides=1):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=input_channels,
                               out_channels=num_channels,
                               kernel_size=3, stride=strides, padding=1)
        self.conv2 = nn.Conv2d(in_channels=num_channels,
                               out_channels=num_channels,
                               kernel_size=3, padding=1)
        # Optional projection so the shortcut matches the main path's shape.
        self.conv3 = (
            nn.Conv2d(in_channels=input_channels, out_channels=num_channels,
                      kernel_size=1, stride=strides)
            if use_1x1conv else None
        )
        self.bn1 = nn.BatchNorm2d(num_channels)
        self.bn2 = nn.BatchNorm2d(num_channels)

    def forward(self, X):
        """Return ReLU(main_path(X) + shortcut(X))."""
        main = F.relu(self.bn1(self.conv1(X)))
        main = self.bn2(self.conv2(main))
        shortcut = X if self.conv3 is None else self.conv3(X)
        main += shortcut
        return F.relu(main)

- CBAM注意力机制

In [3]:
class CALayer(nn.Module):  # Channel Attention (CA) Layer
    """Channel-attention gate that rescales each channel of its input.

    For every requested pooling type the input is globally pooled to a
    1x1 map per channel (``AdaptiveAvgPool2d(1)`` / ``AdaptiveMaxPool2d(1)``)
    and sent through a shared bottleneck of two 1x1 convolutions.  The
    results are summed, squashed with a sigmoid and multiplied onto the
    input.

    Args:
        in_channels: Number of channels of the input feature map.
        reduction: Divisor for the bottleneck width
            (``in_channels // reduction`` hidden channels).
        pool_types: Iterable of pooling names to combine; each entry must
            be ``'avg'`` or ``'max'``.

    Raises:
        ValueError: If ``pool_types`` is empty or contains an unknown name.
    """

    def __init__(self, in_channels, reduction=16, pool_types=('avg', 'max')):
        super().__init__()
        self.pool_list = ['avg', 'max']
        # Copy into a tuple so a caller-owned list is never shared.
        self.pool_types = tuple(pool_types)
        unknown = [p for p in self.pool_types if p not in self.pool_list]
        if not self.pool_types or unknown:
            # Fail at construction time instead of in the middle of forward().
            raise ValueError(
                f"pool_types must be a non-empty subset of {self.pool_list}, "
                f"got {list(self.pool_types)}")
        self.in_channels = in_channels
        # ModuleList (rather than a plain list) registers the pooling layers
        # as submodules; index i corresponds to self.pool_list[i].
        self.Pool = nn.ModuleList([
            nn.AdaptiveAvgPool2d(1),
            nn.AdaptiveMaxPool2d(1, return_indices=False),
        ])
        self.conv_ca = nn.Sequential(
            nn.Conv2d(in_channels, in_channels //
                      reduction, 1, padding=0, bias=True),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels // reduction,
                      in_channels, 1, padding=0, bias=True)
        )

    def forward(self, x):
        """Return ``x`` multiplied by the per-channel attention weights."""
        channel_att_sum = None
        for pool_type in self.pool_types:
            pooled = self.Pool[self.pool_list.index(pool_type)](x)
            channel_att_raw = self.conv_ca(pooled)
            # Out-of-place accumulation: never mutate a conv output in place.
            if channel_att_sum is None:
                channel_att_sum = channel_att_raw
            else:
                channel_att_sum = channel_att_sum + channel_att_raw
        scale = torch.sigmoid(channel_att_sum)
        return x * scale


class SALayer(nn.Module):  # Spatial Attention Layer
    """Spatial-attention gate that rescales each spatial position.

    The input is reduced along dim 1 with a max and a mean, the two
    single-channel maps are concatenated, and a 3x3 convolution followed by
    batch norm and a sigmoid turns them into a one-channel mask that is
    multiplied onto the input.
    """

    def __init__(self):
        super().__init__()
        mask_conv = nn.Conv2d(in_channels=2, out_channels=1, kernel_size=3,
                              stride=1, padding=1, bias=False)
        mask_norm = nn.BatchNorm2d(num_features=1, momentum=0.01)
        self.conv_sa = nn.Sequential(mask_conv, mask_norm, nn.Sigmoid())

    def forward(self, x):
        """Return ``x`` multiplied by the spatial attention mask."""
        max_over_dim1 = x.max(dim=1, keepdim=True).values
        mean_over_dim1 = x.mean(dim=1, keepdim=True)
        descriptor = torch.cat((max_over_dim1, mean_over_dim1), dim=1)
        return x * self.conv_sa(descriptor)


class CBAM(nn.Module):
    """Applies channel attention followed by spatial attention.

    Args:
        in_channels: Number of channels of the input feature map; forwarded
            to ``CALayer``.
        reduction: Bottleneck divisor forwarded to ``CALayer``.
        pool_types: Pooling names forwarded to ``CALayer``.  The default is
            an immutable tuple so it cannot be shared and mutated between
            instances.
    """

    def __init__(self, in_channels, reduction=2, pool_types=('avg', 'max')):
        super().__init__()
        self.CALayer = CALayer(
            in_channels, reduction, pool_types)
        self.SALayer = SALayer()

    def forward(self, x):
        """Return ``x`` after the channel gate and then the spatial gate."""
        x_out = self.CALayer(x)
        x_out = self.SALayer(x_out)
        return x_out

In [4]:
# Stem: 7x7 stride-2 convolution -> batch norm -> ReLU -> 3x3 stride-2 max pool.
b1 = nn.Sequential(
    nn.Conv2d(in_channels=3, out_channels=64, kernel_size=7, stride=2,
              padding=3),
    nn.BatchNorm2d(num_features=64),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
)

In [5]:
def resnet_block(input_channels, num_channels, num_residuals,
                 first_block=False):
    """Build the list of ``Residual`` units that make up one ResNet stage.

    Args:
        input_channels: Channels entering the stage.
        num_channels: Channels produced by every unit of the stage.
        num_residuals: Number of ``Residual`` units to create.
        first_block: If false, the first unit uses stride 2 together with a
            1x1 shortcut convolution.  If true, the first unit keeps
            stride 1.

    Returns:
        A Python list of ``Residual`` modules (not yet wrapped in
        ``nn.Sequential``).
    """
    blk = []
    for i in range(num_residuals):
        if i > 0:
            # Later units always map num_channels -> num_channels.
            blk.append(Residual(num_channels, num_channels))
        elif not first_block:
            blk.append(Residual(input_channels, num_channels,
                                use_1x1conv=True, strides=2))
        else:
            # Honour input_channels for the first stage as well: add a
            # stride-1 1x1 projection only when the widths differ, so the
            # shortcut can still be added.  With equal widths this is the
            # plain identity-shortcut unit.
            blk.append(Residual(input_channels, num_channels,
                                use_1x1conv=input_channels != num_channels))
    return blk

In [6]:
# Four residual stages of two units each; only the first keeps stride 1.
b2 = nn.Sequential(*resnet_block(
    input_channels=64, num_channels=64, num_residuals=2, first_block=True))
b3 = nn.Sequential(*resnet_block(
    input_channels=64, num_channels=128, num_residuals=2))
b4 = nn.Sequential(*resnet_block(
    input_channels=128, num_channels=256, num_residuals=2))
# The last stage is followed by a CBAM attention module on its 512 channels.
b5 = nn.Sequential(
    *resnet_block(input_channels=256, num_channels=512, num_residuals=2),
    CBAM(in_channels=512),
)

In [7]:
# Full network: stem, four residual stages, global average pooling to 1x1,
# flatten, and a linear layer mapping 512 features to 10 outputs.
net = nn.Sequential(
    b1, b2, b3, b4, b5,
    nn.AdaptiveAvgPool2d(output_size=(1, 1)),
    nn.Flatten(),
    nn.Linear(in_features=512, out_features=10),
)

- Data augmentation

In [8]:
# CIFAR-10 images are 32x32.
train_augs = torchvision.transforms.Compose([
    # Scale the image up to a 40-pixel square in height and width.
    torchvision.transforms.Resize(size=40),
    # Randomly crop a square covering 0.64 to 1 times the area of that
    # image, then rescale the crop to a 32x32 square.
    torchvision.transforms.RandomResizedCrop(
        size=32, scale=(0.64, 1.0), ratio=(1.0, 1.0)),
    torchvision.transforms.RandomHorizontalFlip(),
    torchvision.transforms.ToTensor(),
    # Normalize each channel of the image.
    torchvision.transforms.Normalize(
        mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]),
])

# Evaluation uses no augmentation: tensor conversion and the same
# per-channel normalization only.
test_augs = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize(
        mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]),
])

In [9]:
def load_cifar10(is_train, augs, batch_size):
    """Return a DataLoader over one split of CIFAR-10.

    The dataset is stored under ``./data`` and downloaded if necessary.

    Args:
        is_train: Selects the training split when true, the test split
            otherwise; also controls whether the loader shuffles.
        augs: Transform applied to every image.
        batch_size: Number of examples per mini-batch.
    """
    cifar_split = torchvision.datasets.CIFAR10(
        root="./data", train=is_train, transform=augs, download=True)
    return torch.utils.data.DataLoader(
        cifar_split,
        batch_size=batch_size,
        shuffle=is_train,
        num_workers=d2l.get_dataloader_workers(),
    )

In [10]:
#@save
def train_batch_ch13(net, X, y, loss, trainer, devices):
    """Train on one mini-batch (multi-GPU capable).

    Inputs and labels are moved to ``devices[0]``, one optimizer step is
    taken on the summed loss, and the batch statistics are returned.

    Returns:
        A pair ``(loss_sum, acc_sum)``: the summed per-example loss and the
        value of ``d2l.accuracy(pred, y)`` for this batch.
    """
    primary = devices[0]
    # A list input is needed for BERT fine-tuning (discussed later).
    X = [x.to(primary) for x in X] if isinstance(X, list) else X.to(primary)
    y = y.to(primary)
    net.train()
    trainer.zero_grad()
    pred = net(X)
    l = loss(pred, y)
    l.sum().backward()
    trainer.step()
    return l.sum(), d2l.accuracy(pred, y)

#@save
def train_ch13(net, train_iter, test_iter, loss, trainer, num_epochs,devices=d2l.try_all_gpus()):
    """Train a model (multi-GPU capable) with a step learning-rate schedule.

    Args:
        net: Model to train; it is wrapped in ``nn.DataParallel`` and moved
            to ``devices[0]``.
        train_iter: Iterable of ``(features, labels)`` training batches; must
            support ``len()``.
        test_iter: Iterable passed to ``d2l.evaluate_accuracy_gpu`` after
            every epoch.
        loss: Loss callable returning per-example losses.
        trainer: Optimizer; its learning rate is driven by a ``StepLR``
            scheduler created here.
        num_epochs: Number of passes over ``train_iter``.
        devices: Devices to train on.
    """
    timer, num_batches = d2l.Timer(), len(train_iter)
    # Plot about five times per epoch.  Clamp to 1 so loaders with fewer
    # than five batches do not trigger a modulo-by-zero below.
    plot_every = max(1, num_batches // 5)
    animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0, 1],
                            legend=['train loss', 'train acc', 'test acc'])
    net = nn.DataParallel(net, device_ids=devices).to(devices[0])

    # Learning-rate scheduler: step_size=4, gamma=0.9.  It is stepped once
    # per epoch at the bottom of the loop.
    scheduler = torch.optim.lr_scheduler.StepLR(trainer, 4, 0.9)

    for epoch in range(num_epochs):
        # Four accumulators: training loss sum, training accuracy sum,
        # number of examples, number of labels.
        metric = d2l.Accumulator(4)
        for i, (features, labels) in enumerate(train_iter):
            timer.start()
            l, acc = train_batch_ch13(
                net, features, labels, loss, trainer, devices)
            metric.add(l, acc, labels.shape[0], labels.numel())
            timer.stop()
            if (i + 1) % plot_every == 0 or i == num_batches - 1:
                animator.add(epoch + (i + 1) / num_batches,
                             (metric[0] / metric[2], metric[1] / metric[3],
                              None))
        test_acc = d2l.evaluate_accuracy_gpu(net, test_iter)
        animator.add(epoch + 1, (None, None, test_acc))
        scheduler.step()
    print(f'loss {metric[0] / metric[2]:.3f}, train acc '
          f'{metric[1] / metric[3]:.3f}, test acc {test_acc:.3f}')
    print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec on '
          f'{str(devices)}')

In [11]:
# Module-level settings read by train_with_data_aug.
batch_size = 512
devices = d2l.try_all_gpus()


def train_with_data_aug(train_augs, test_augs, net, lr=0.06, num_epochs=30):
    """Train ``net`` on CIFAR-10 with the given transform pipelines.

    Uses the module-level ``batch_size`` and ``devices``, an Adam optimizer
    and a per-example cross-entropy loss, then delegates to ``train_ch13``.

    Args:
        train_augs: Transform applied to the training images.
        test_augs: Transform applied to the test images.
        net: Model to train.
        lr: Learning rate for Adam.
        num_epochs: Number of training epochs (defaults to 30).
    """
    train_iter = load_cifar10(True, train_augs, batch_size)
    test_iter = load_cifar10(False, test_augs, batch_size)
    # reduction="none" keeps per-example losses; train_batch_ch13 sums them.
    loss = nn.CrossEntropyLoss(reduction="none")
    trainer = torch.optim.Adam(net.parameters(), lr=lr)
    train_ch13(net, train_iter, test_iter, loss, trainer, num_epochs, devices)

- pretraining

In [12]:
# Pre-train the network on CIFAR-10 using the default learning rate.
train_with_data_aug(train_augs=train_augs, test_augs=test_augs, net=net)

![pretrain](ResNet_Plus_Result5.png)

In [13]:
# Print the module hierarchy of net.
print(net)

- add a 1x1 convolutional layer

In [14]:
# 1x1 convolution mapping a single input channel to three channels.
b0 = nn.Conv2d(in_channels=1, out_channels=3, kernel_size=1)

In [15]:
# Fine-tuning model: the 1x1 convolution b0 followed by the network net.
fnet=nn.Sequential(b0,net)
print(fnet)

- add an MLP

In [16]:
# Append a small MLP head (10 -> 256 -> ReLU -> 10) to the end of fnet.
# NOTE: this rebinds the names b1, b2 and b3 used earlier in the file.
b1, b2, b3 = (
    nn.Linear(in_features=10, out_features=256),
    nn.ReLU(),
    nn.Linear(in_features=256, out_features=10),
)
fnet.add_module(name="fc1", module=b1)
fnet.add_module(name="fc2", module=b2)
fnet.add_module(name="fc3", module=b3)
# Re-initialize the two new linear layers' weights with Xavier uniform.
nn.init.xavier_uniform_(fnet.fc1.weight)
nn.init.xavier_uniform_(fnet.fc3.weight)
print(fnet)

In [17]:
def evaluate_accuracy_gpu(net, data_iter, device=None): #@save
    """Compute the accuracy of a model on a dataset using a GPU.

    Args:
        net: Model to evaluate.  If it is an ``nn.Module`` it is switched to
            evaluation mode.
        data_iter: Iterable of ``(X, y)`` batches; ``X`` may be a tensor or a
            list of tensors.
        device: Device the batches are moved to.  If falsy and ``net`` is an
            ``nn.Module``, the device of the model's first parameter is used.

    Returns:
        The accumulated ``d2l.accuracy`` values divided by the total number
        of label elements.
    """
    if isinstance(net, nn.Module):
        net.eval()  # evaluation mode
        device = device or next(iter(net.parameters())).device
    # Slot 0: number of correct predictions; slot 1: number of predictions.
    tally = d2l.Accumulator(2)
    with torch.no_grad():
        for X, y in data_iter:
            # A list input is needed for BERT fine-tuning (introduced later).
            X = ([x.to(device) for x in X] if isinstance(X, list)
                 else X.to(device))
            y = y.to(device)
            tally.add(d2l.accuracy(net(X), y), y.numel())
    return tally[0] / tally[1]

In [18]:
# List every parameter name of fnet, each followed by a separator line.
for name, param in fnet.named_parameters():
    print(name, "-----------------------------------", sep="\n")

In [19]:
def load_FashionMNIST(is_train, augs, batch_size):
    """Return a DataLoader over one split of FashionMNIST.

    The dataset is stored under ``./data`` and downloaded if necessary.

    Args:
        is_train: Selects the training split when true, the test split
            otherwise; also controls whether the loader shuffles.
        augs: Transform applied to every image.
        batch_size: Number of examples per mini-batch.
    """
    fashion_split = torchvision.datasets.FashionMNIST(
        root="./data", train=is_train, transform=augs, download=True)
    return torch.utils.data.DataLoader(
        fashion_split,
        batch_size=batch_size,
        shuffle=is_train,
        num_workers=d2l.get_dataloader_workers(),
    )

In [20]:
# Single-channel pipelines used by the FashionMNIST loaders below; the
# normalization reuses the first-channel mean/std of the three-channel
# pipelines defined earlier in this file.
train_augs = torchvision.transforms.Compose([
    # Scale the image up to a 40-pixel square in height and width.
    torchvision.transforms.Resize(size=40),
    # Randomly crop a square covering 0.64 to 1 times the area of that
    # image, then rescale the crop to a 32x32 square.
    torchvision.transforms.RandomResizedCrop(
        size=32, scale=(0.64, 1.0), ratio=(1.0, 1.0)),
    torchvision.transforms.ToTensor(),
    # Normalize the image's single channel.
    torchvision.transforms.Normalize(mean=[0.4914], std=[0.2023]),
])

test_augs = torchvision.transforms.Compose([
    torchvision.transforms.Resize(size=32),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize(mean=[0.4914], std=[0.2023]),
])

In [21]:
def train_fine_tuning(net, learning_rate, batch_size=256, num_epochs=20,
                      param_group=True):
    """Fine-tune ``net`` on FashionMNIST with SGD via ``d2l.train_ch13``.

    Args:
        net: Model to fine-tune.  With ``param_group`` enabled it must
            expose ``fc1``, ``fc3`` and be indexable at ``[0]``.
        learning_rate: Base learning rate.
        batch_size: Number of examples per mini-batch.
        num_epochs: Number of training epochs.
        param_group: If true, the layers named ``fc1``, ``fc3`` and ``0``
            get 20x, 10x and 50x the base learning rate and all remaining
            parameters get the base rate.  If false, every parameter is
            trained with 100x the base rate.
    """
    train_iter = load_FashionMNIST(True, train_augs, batch_size)
    test_iter = load_FashionMNIST(False, test_augs, batch_size)
    devices = d2l.try_all_gpus()
    loss = nn.CrossEntropyLoss(reduction="none")
    if not param_group:
        trainer = torch.optim.SGD(net.parameters(), lr=learning_rate * 100,
                                  weight_decay=0.001)
    else:
        # Parameters that get their own (larger) learning rates below.
        boosted_names = (
            "fc1.weight", "fc1.bias",
            "fc3.weight", "fc3.bias",
            "0.weight", "0.bias",
        )
        params_1x = [
            tensor for label, tensor in net.named_parameters()
            if label not in boosted_names
        ]
        groups = [
            {'params': params_1x},
            {'params': net.fc1.parameters(), 'lr': learning_rate * 20},
            {'params': net.fc3.parameters(), 'lr': learning_rate * 10},
            {'params': net[0].parameters(), 'lr': learning_rate * 50},
        ]
        trainer = torch.optim.SGD(groups, lr=learning_rate,
                                  weight_decay=0.001)
    d2l.train_ch13(net, train_iter, test_iter, loss, trainer, num_epochs,
                   devices)

In [23]:
# Fine-tune fnet with a base learning rate of 1e-4 (other settings default).
train_fine_tuning(fnet, learning_rate=1e-4)

# Running on Kaggle

- The initial ResNet model

![the first](ResNet_Result1.png)
![the second](ResNet_Result2.png)

- 在Cifar10进行预训练，并做微调

![result1](ResNet_Plus_Result1.png)
![result2](ResNet_Plus_Result2.png)
![result3](ResNet_Plus_Result3.png)
![result4](ResNet_Plus_Result4.png)

- 在ResNet中加入了CBAM注意力机制，对Cifar10做了数据增广后进行预训练，训练过程使用了根据epoch数衰减学习率的optim，然后进行微调

![fashion_result](ResNet_Plus_Result6.png)
![fashion_result1](ResNet_Plus_Result7.png)

# summary
在进行多次调参和改变网络结构后，最后的结果反而更差了。我认为主要原因是改变了预训练模型前面的结构：模型对图片特征的提取需要多层协调作用，而上面的模型在预训练之后又在开头加了一层1x1卷积层，这应该在一定程度上破坏了预训练的效果。

总的来说，这个ResNet_Plus主要是为了实操前面所学缝合而成的，即尽量手动实现。这些所学包括：构建基本网络、添加SENet、CBAM注意力module、数据增广（Data Augmentation）、预训练的做法、微调（finetune）、可进行learning rate decay的optim。虽然这些缝合的方法在这里效果一般，但我后面将其用在了Cifar10分类任务上，达到了95%的test_acc，还是蛮好的。

至此，图片分类网络的学习告一段落。