# DeepLabV1

## 创新点：

### 1. dilated VGG 16

- 原始的VGG16：有五个maxpool层，即进行了5次下采样，即输出特征图的空间尺寸为原图的 $\frac{1}{32}$，注：此处的输出特征图是指全连接层之前的输出特征图
- dilated VGG16：将最后两个 max-pooling 层 ***改为*** `stride=1`， 因此下采样次数减少2次，为3次，输出特征图为输入尺寸的 $\frac{1}{8}$；同时，在移除的 maxpool 层后的卷积层中`均施加膨胀卷积`，扩大特征图的感受野，弥补去除的两次下采样本可以带来的扩大感受野的机会。
- 将最后的全连接层 FC6、7、8 全部改造成卷积层

### 2. CRF

分类器获得以对象为中心的决策需要空间不变性，需要 pooling 层来辅助实现，这限制了 DCNN 的空间精度。因此 DeepLabV1 中通过 `条件随机场 CRF` 来提高模型捕获细节的能力。（CRF在传统图像处理上主要做平滑处理。就是在决定一个位置的像素值时，会考虑周围邻居的像素值，这样能抹除一些噪音。）

## 主要贡献

- 速度：带孔算法的 DCNN 速度可达到 8 fps，全连接 CRF 平均预测只需 0.5 s
- 准确：在 PASCAL VOC 语义分割挑战上取得了当时最好的结果，比第二名的方法高出 7.2%
- 主要区别：DeepLabV1 可看作DCNN和CRF的级联。


In [1]:
import numpy as np
import torch
import torch.nn as nn

In [3]:
class VGG16_LargeFOV(nn.Module):
    """DeepLab v1 "LargeFOV" network: a dilated VGG-16 whose fc6/fc7/fc8 are convolutions.

    The first three max-pool layers use stride 2 and the last two use stride 1,
    so the score map is about 1/8 of the input resolution (224 -> 28).
    conv5_x uses dilation 2 and fc6 uses dilation 12.

    The order of the layers inside ``self.features`` matters: the classifier
    (fc8) must stay at index 38, because it is looked up by the module name
    ``features.38`` when parameters are split into learning-rate groups.

    Args:
        input_size: Side length of the square map produced when
            ``split == 'test'`` (the score map is upsampled to
            ``(input_size, input_size)``).
        input_channel: Number of channels of the input image.
        num_classes: Number of output channels of fc8.
        split: ``'test'`` makes ``forward`` upsample its output; any other
            value returns the raw score map.
        init_weight: When true, fc8 is re-initialised by
            ``initialize_weights``.
    """

    def __init__(self, input_size=227, input_channel=3, num_classes=21, split='train', init_weight=True):
        super(VGG16_LargeFOV, self).__init__()
        self.input_size = input_size
        self.input_channel = input_channel
        self.classes = num_classes
        self.split = split

        self.features = nn.Sequential(
            # conv1_1, conv1_2, pool1 (stride 2)
            # The first conv honours ``input_channel`` instead of assuming RGB.
            nn.Conv2d(self.input_channel, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(True),
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(True),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),

            # conv2_1, conv2_2, pool2 (stride 2)
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(True),
            nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(True),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),

            # conv3_1, conv3_2, conv3_3, pool3 (stride 2)
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(True),
            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(True),
            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(True),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),

            # conv4_1, conv4_2, conv4_3, pool4 (stride 1: no further downsampling)
            nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1),
            nn.ReLU(True),
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1),
            nn.ReLU(True),
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1),
            nn.ReLU(True),
            nn.MaxPool2d(kernel_size=3, stride=1, padding=1),

            # Dilated convolutions start here.
            # conv5_1, conv5_2, conv5_3 (dilation 2, padding 2), pool5 (stride 1)
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=2, dilation=2),
            nn.ReLU(True),
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=2, dilation=2),
            nn.ReLU(True),
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=2, dilation=2),
            nn.ReLU(True),
            nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
            # Extra average pooling after pool5 (stride 1, size preserved).
            nn.AvgPool2d(kernel_size=3, stride=1, padding=1),

            # Fully connected layers re-expressed as convolutions.
            # fc6 + relu6 + drop6 (3x3 kernel, dilation 12)
            nn.Conv2d(512, 1024, kernel_size=3, stride=1, padding=12, dilation=12),
            nn.ReLU(True),
            nn.Dropout2d(0.5),

            # fc7 + relu7 + drop7 (1x1 kernel)
            nn.Conv2d(1024, 1024, kernel_size=1, stride=1, padding=0),
            nn.ReLU(True),
            nn.Dropout2d(0.5),

            # fc8: per-pixel classifier. This is module index 38.
            nn.Conv2d(1024, self.classes, kernel_size=1, stride=1, padding=0),
        )

        if init_weight:
            self.initialize_weights()

    def forward(self, x):
        """Return the class score map for ``x``.

        In ``'test'`` mode the map is bilinearly upsampled to
        ``(input_size, input_size)``; otherwise it is returned at the
        network's native (roughly 1/8) resolution.
        """
        output = self.features(x)
        if self.split == 'test':
            output = nn.functional.interpolate(
                output,
                size=(self.input_size, self.input_size),
                mode='bilinear',
                align_corners=True,
            )
        return output

    def initialize_weights(self):
        """Re-initialise fc8 only: weights ~ N(0, std=0.01), bias = 0.

        All other layers keep PyTorch's default initialisation.
        """
        classifier = self.features[38]
        nn.init.normal_(classifier.weight.data, mean=0, std=0.01)
        nn.init.constant_(classifier.bias.data, 0.0)


if __name__ == "__main__":
    # Keyword arguments on purpose: the positional order is
    # (input_size, input_channel, num_classes), which is easy to mix up.
    net = VGG16_LargeFOV(input_size=224, input_channel=3, num_classes=21)
    in_ten = torch.randn(1, 3, 224, 224)
    out = net(in_ten)
    print(out.size())

#     in_ten = torch.randn(1, 3, 64, 64)
#     mod = nn.Conv2d(3,
#             512,
#             kernel_size = 3,
#             stride = 1,
#             padding = 2,
#             dilation = 2)
#     out = mod(in_ten)
#     print(out.shape)

torch.Size([1, 21, 28, 28])


> 之所以输出没有恢复成原图大小，是为了减小 CRF 的计算量；也可以将 `split` 设为 `'test'`，直接把输出上采样到 `input_size` 大小

# 定义 CRF

In [4]:
import pydensecrf.densecrf as dcrf
import pydensecrf.utils as utils


class DenseCRF(object):
    """Callable wrapper that refines a class-probability map with pydensecrf.

    The constructor only stores hyper-parameters; calling the instance builds
    a fresh ``DenseCRF2D`` with one Gaussian and one bilateral pairwise term
    and runs inference on it.

    Args:
        iter_max: Iteration count handed to ``DenseCRF2D.inference``.
        pos_w: ``compat`` value of the Gaussian pairwise term.
        pos_xy_std: ``sxy`` value of the Gaussian pairwise term.
        bi_w: ``compat`` value of the bilateral pairwise term.
        bi_xy_std: ``sxy`` value of the bilateral pairwise term.
        bi_rgb_std: ``srgb`` value of the bilateral pairwise term.
    """

    def __init__(self, iter_max, pos_w, pos_xy_std, bi_w, bi_xy_std, bi_rgb_std):
        self.iter_max = iter_max
        # Gaussian pairwise term.
        self.pos_w, self.pos_xy_std = pos_w, pos_xy_std
        # Bilateral pairwise term.
        self.bi_w, self.bi_xy_std, self.bi_rgb_std = bi_w, bi_xy_std, bi_rgb_std

    def __call__(self, image, probmap):
        """Return the refined map, reshaped to ``probmap``'s (C, H, W) shape.

        Args:
            image: Reference image for the bilateral term.
                NOTE(review): presumably an (H, W, 3) uint8 array, as
                pydensecrf expects - confirm against the caller.
            probmap: Array of shape (C, H, W) passed to
                ``unary_from_softmax``.
        """
        num_labels, height, width = probmap.shape

        # pydensecrf needs C-contiguous buffers for both inputs.
        unary = np.ascontiguousarray(utils.unary_from_softmax(probmap))
        rgb = np.ascontiguousarray(image)

        # DenseCRF2D takes width before height.
        crf = dcrf.DenseCRF2D(width, height, num_labels)
        crf.setUnaryEnergy(unary)
        crf.addPairwiseGaussian(sxy=self.pos_xy_std, compat=self.pos_w)
        crf.addPairwiseBilateral(
            sxy=self.bi_xy_std,
            srgb=self.bi_rgb_std,
            rgbim=rgb,
            compat=self.bi_w,
        )

        marginals = crf.inference(self.iter_max)
        return np.array(marginals).reshape((num_labels, height, width))

# 定义 loss 及 metrics

In [5]:
def resize_labels(labels, size):
    """Resize a batch of label maps with PIL nearest-neighbour resampling.

    PIL is used on purpose: per the original note, other nearest-neighbour
    implementations (``F.interpolate(..., mode='nearest')``,
    ``cv2.resize(..., interpolation=cv2.INTER_NEAREST)``) produce misaligned
    labels.

    NOTE(review): ``Image`` (PIL) is not imported in the visible part of this
    file; make sure ``from PIL import Image`` is in scope.

    Args:
        labels: Iterable of 2-D label tensors (one per image).
        size: Target size handed to ``Image.resize``, i.e. ``(width, height)``.

    Returns:
        A ``LongTensor`` holding the resized labels, stacked along dim 0.
    """
    resized = []
    for label in labels:
        # .cpu() is a no-op for CPU tensors and keeps .numpy() from failing
        # if a caller hands over labels that already live on an accelerator.
        label = label.float().cpu().numpy()
        label = Image.fromarray(label).resize(size, resample=Image.NEAREST)
        resized.append(np.asarray(label))

    if not resized:
        # np.stack rejects an empty list; keep returning an empty LongTensor.
        return torch.LongTensor(resized)

    # Stack once in NumPy: building a tensor from a Python list of ndarrays
    # is converted element by element and is very slow.
    return torch.from_numpy(np.stack(resized)).long()

def build_metrics(model, batch, device):
    """Run one forward pass and return ``(loss, pixel_accuracy)``.

    Labels are resized to the spatial size of the logits, so the function
    works for any input resolution (a 321x321 input yields 41x41 logits).

    Args:
        model: Network mapping an image batch to ``(N, C, H, W)`` logits.
        batch: Tuple ``(image_ids, images, labels)``.
        device: Device the images, labels and loss are moved to.

    Returns:
        loss_seg: Cross-entropy loss tensor; label value 255 is ignored.
        accuracy: Fraction of pixels whose arg-max class equals the label.
            The denominator counts every pixel, including ignored ones.
    """
    criterion = nn.CrossEntropyLoss(ignore_index=255).to(device)

    image_ids, images, labels = batch
    logits = model(images.to(device))

    # Match the labels to the logits instead of assuming a fixed 41x41 map.
    # resize_labels forwards ``size`` to PIL, which expects (width, height).
    height, width = logits.shape[2], logits.shape[3]
    labels = resize_labels(labels, size=(width, height)).to(device)

    loss_seg = criterion(logits, labels)

    preds = torch.argmax(logits, dim=1)
    num_pixels = len(image_ids) * height * width
    accuracy = float(torch.eq(preds, labels).sum().cpu()) / num_pixels

    return loss_seg, accuracy

# 定义 动态调整 LR 的部分

In [6]:
def get_params(model: nn.Module, key):
    """Yield the conv parameters that belong to one learning-rate group.

    The final classifier is the ``nn.Conv2d`` named ``features.38``; every
    other ``nn.Conv2d`` counts as backbone.

    ========  ==========  =========
    key       layers      parameter
    ========  ==========  =========
    ``'1x'``  backbone    weight
    ``'2x'``  backbone    bias
    ``'10x'`` classifier  weight
    ``'20x'`` classifier  bias
    ========  ==========  =========

    Args:
        model: The network, either bare or wrapped in a container that
            registers it as ``module`` (e.g. ``nn.DataParallel``).
        key: One of the group names above. Any other value yields nothing.

    Yields:
        The selected ``nn.Parameter`` objects. Layers built with
        ``bias=False`` are skipped in the bias groups.
    """
    if key not in ('1x', '2x', '10x', '20x'):
        return

    want_classifier = key in ('10x', '20x')
    want_bias = key in ('2x', '20x')

    for name, module in model.named_modules():
        if not isinstance(module, nn.Conv2d):
            continue
        # A DataParallel-style wrapper prefixes every name with "module.",
        # which would otherwise make the classifier look like a backbone layer.
        if name.startswith('module.'):
            name = name[len('module.'):]
        if (name == 'features.38') != want_classifier:
            continue
        param = module.bias if want_bias else module.weight
        if param is not None:
            yield param
    

# 定义训练部分

In [7]:
def trin(weight_path, device_ids=None, *, train_loader=None, lr=1e-3,
         weight_decay=5e-4, num_max_iters=20000, device=None):
    """Train ``VGG16_LargeFOV`` with SGD and a polynomial learning-rate decay.

    Parameters are split into four groups: backbone weights (``lr``),
    backbone biases (``2 * lr``), classifier weights (``10 * lr``) and
    classifier biases (``20 * lr``). Bias groups use no weight decay.

    Args:
        weight_path: Checkpoint with a ``state_dict`` to start from, or
            ``None`` to start from the freshly initialised network.
        device_ids: Devices handed to ``nn.DataParallel``; defaults to ``[0]``.
        train_loader: Iterable of ``(image_ids, images, labels)`` batches.
            Required - the dataset definition is left to the caller.
        lr: Base learning rate. NOTE(review): the default is a conventional
            choice, not taken from this file - confirm.
        weight_decay: Weight decay for the weight groups. NOTE(review): same
            caveat as ``lr``.
        num_max_iters: Iteration at which the decay reaches zero and training
            stops. NOTE(review): same caveat as ``lr``.
        device: Device for the model and the batches. Defaults to
            ``cuda:<device_ids[0]>`` when CUDA is available (assumes integer
            ids), otherwise the CPU.

    Returns:
        The trained model, wrapped in ``nn.DataParallel``.

    Raises:
        ValueError: If ``train_loader`` is not supplied.
    """
    if train_loader is None:
        raise ValueError(
            'train_loader is required: pass a DataLoader yielding '
            '(image_ids, images, labels) batches'
        )
    if device_ids is None:
        device_ids = [0]
    if device is None:
        if torch.cuda.is_available():
            device = torch.device('cuda:{}'.format(device_ids[0]))
        else:
            device = torch.device('cpu')

    net = VGG16_LargeFOV()
    if weight_path is not None:
        # NOTE: torch.load unpickles the file - only load trusted checkpoints.
        net.load_state_dict(torch.load(weight_path, map_location='cpu'))
    net.to(device)

    # Build the groups from the bare network: once wrapped in DataParallel the
    # module names gain a "module." prefix, and a lookup by "features.38"
    # would no longer find the classifier.
    optimizer = torch.optim.SGD(
        params=[
            {
                # backbone weights: base learning rate
                'params': get_params(net, '1x'),
                'lr': lr,
                'weight_decay': weight_decay,
            },
            {
                # backbone biases: 2x learning rate, no weight decay
                'params': get_params(net, '2x'),
                'lr': lr * 2,
                'weight_decay': 0,
            },
            {
                # classifier weights: 10x learning rate
                'params': get_params(net, '10x'),
                'lr': lr * 10,
                'weight_decay': weight_decay,
            },
            {
                # classifier biases: 20x learning rate, no weight decay
                'params': get_params(net, '20x'),
                'lr': lr * 20,
                'weight_decay': 0,
            },
        ],
        momentum=0.9,  # shared by every group
    )

    # Data-parallel training; the optimizer keeps pointing at the same parameters.
    model = nn.DataParallel(net, device_ids=device_ids)

    # Remember each group's starting rate so the poly schedule can scale it.
    for group in optimizer.param_groups:
        group.setdefault('initial_lr', group['lr'])

    iters = 0
    for epoch in range(1, 100):
        for batch in train_loader:
            loss_seg, accuracy = build_metrics(model, batch, device)
            optimizer.zero_grad()
            loss_seg.backward()
            optimizer.step()
            iters += 1

            # Add any extra per-iteration work (logging, checkpoints, ...) here.

            # Poly schedule: lr = initial_lr * (1 - iter / max_iter) ** 0.9
            decay = (1 - float(iters) / num_max_iters) ** 0.9
            for group in optimizer.param_groups:
                group['lr'] = group['initial_lr'] * decay

            if iters >= num_max_iters:
                # Return instead of exit(): exit() would kill the interpreter.
                return model

    return model