# YOLOv3 网络构建与整体流程解析

In [4]:
from __future__ import division

import glob
import os

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
from PIL import Image
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torchvision import datasets

## YOLOv3网络中使用的相对特殊层定义

### EmptyLayer

本层**不会**对输入的参数执行任何**改动操作**，其忠实的将传入参数传出。<br>
其主要作用在于为`route`以及`shortcut`两类操作占位，前者取出指定层的输出（指定多个层时将它们沿通道维度拼接，即`concat`），后者将上一层输出与指定层输出相加（`addition`），它们都不改变上一层的输出内容，只是要将其和之前历史层输出进行组合操作，具体操作逻辑在`Darknet`中实现。<br>

### YoLoLayer

本层定义YOLO网络层，其主要对`Darknet`的最后输出进行操作，即按照YOLO定义逻辑，将输出转化为其标准输出格式：`Bounding box`加`Pred classes`（YOLOv3假定一个目标可以分属多个分类）加`Confidence`的形式，针对每个`grid`，做出与设定`anchor`数量相等的预测结果。

- init：初始化

在初始化过程中，定义预设锚点列表（`anchors`），输入图像尺寸（`image_size`），样本总类别数（`num_class`），最低`IoU`限制阈值（`ignore_thres`），是否进行CUDA调用（`cuda`），同时，有目标时贡献乘比参数`obj_scale`和无目标贡献乘比参数`noobj_scale`也在此处被定义。<br>
此处使用最简单基本的`YOLO loss`计算方法：利用`MSEloss`（应用于`x`，`y`，`w`，`h`的长度直接损失）与`BCEloss`（应对利用`Sigmoid`激活后的二分类任务损失，如类别判断，`confidence`预估等）进行损失计算。

- compute grid offset：计算当前各个`grid`偏移量

在处理过程中，**感受野区域**会不断发生变化，在给定`grid_size`，即感受野区域大小后，每个`grid`提取所需的步长（`stride`），与在当前视野下`anchor`列表对应的缩放后大小都可得到计算（YOLOv3在每个给定感受野下根据给定的`anchor`列表给出预测）。

- forward：前向传播

此时输入内容`x`即为需要最后经由YOLO层转化到输出格式：根据**每个`grid`**求取如下内容：`x`，`y`的本单元格内**偏移量**（利用Sigmoid激活进入0~1后的`grid`内偏移率）。`w`，`h`位输出经指数化后与对应`anchor`锚点的宽、高相乘（即 $b_w = p_w e^{t_w}$，$b_h = p_h e^{t_h}$），对锚框尺寸进行缩放，使得`Bounding box`能够有所伸展变化。`pred_class`由Sigmoid导出（YOLOv3放弃Softmax改对每个类应用Sigmoid，使得单目标同属多类成为可能，同时统一分类与`Confidence`的`loss`计算方式）。`conf`同样利用Sigmoid输出。<br>
在输出格式构建完毕后，将其与`ground truth`传入`build targets`之中，利用当前输出格式编辑`ground truth`，使其与输出格式对齐，即可利用`ground truth`以及预设参数对当前预测结果`loss`进行计算，并将其输出。

- bbox wh iou：通过给定Bounding Box的长宽计算其相交占比（IoU）

此时不将`偏移量offset`纳入考虑，单纯利用当前`grid`中目标的`Bounding box`进行两者的重合度计算，本函数的目标单纯为判定哪一个给出`anchor`最适配当前`grid`中的目标，故我们不考虑`offset`，直接进行筛选。

- build targets：根据`ground truth`建立对应的目标表示，由此来计算输出带来的损失`loss`

首先，根据`ground truth`，我们能够构建`obj_mask`（有目标的`grid`及`anchor`）和与之相对的`noobj_mask`（无目标`grid`及`anchor`），他们对于损失的贡献不同。同时，针对每个目标，针对它所在的`grid`，只输出**最符合其形状**的`anchor`对应的`Bounding box`和`class`预测，由于我们的输出是针对每个`grid`的`x`，`y`偏移量，`w`，`h`抖动量，我们也需要对`ground truth`中的输出进行调整，通过使用log对数，移除floor（减去向下取整后，剩余小数位即为当前`grid`内的偏移量）等使其格式与YOLO网络输出对齐。<br>
通过`pred_bbox`，`pred_class`，`target`，处理返回`class_mask`，`obj_mask`，`noobj_mask`，`tx`，`ty`，`tw`，`th`，`tcls`，`tconf`结果。

In [None]:
class EmptyLayer(nn.Module):
    '''
    Placeholder module for 'route' and 'shortcut' blocks.

    The concatenation / addition those blocks stand for is carried out by the
    code that walks the layer list, not by this module. The layer owns no
    parameters and, if it is ever called, hands its input back untouched.
    '''

    def __init__(self):
        super(EmptyLayer, self).__init__()

    def forward(self, x):
        # Identity: without this method calling the layer would raise
        # NotImplementedError from nn.Module.
        return x

class YoLoLayer(nn.Module):
    '''
    Detection ('head') layer of the YOLO network.

    Turns a raw feature map into per-anchor box predictions and, when
    ground-truth targets are supplied, also computes the training loss.
    Non-maximum suppression is not performed here.

    Args:
        anchors: sequence of (width, height) anchor sizes, in input-image pixels.
        num_class: number of object classes.
        img_size: size of the network input; only the first entry is used,
            i.e. square inputs are assumed.
        ignore_thres: anchors whose shape IoU with a ground-truth box exceeds
            this value are left out of the no-object confidence loss.
        cuda: kept for backward compatibility. Working tensors are created on
            the device of the incoming feature map; the flag is only used as a
            fallback when compute_grid_offsets is called without a device.
    '''

    def __init__(self, anchors, num_class, img_size=(416, 416), ignore_thres=0.5, cuda=False):
        super(YoLoLayer, self).__init__()
        # attributes on anchors
        self.anchors = anchors
        self.num_anchors = len(anchors)
        # how many classes need to be classified
        self.num_class = num_class
        # IoU threshold used when building the no-object mask
        self.ignore_thres = ignore_thres
        # the input image dim
        self.img_dim = img_size[0]
        # loss functions used
        self.mse_loss = nn.MSELoss()
        self.bce_loss = nn.BCELoss()
        # weights of the confidence loss for cells with / without an object
        self.obj_scale = 1
        self.noobj_scale = 100
        # number of grid cells per side; 0 means "grid offsets not computed yet"
        self.grid_size = 0
        # stored under a name that does not shadow nn.Module.cuda()
        self.use_cuda = cuda

    def compute_grid_offsets(self, grid_size, device=None):
        '''
        Cache the stride, per-cell offsets and rescaled anchors for one grid size.

        Args:
            grid_size: number of cells along one side of the feature map.
            device: device on which the cached tensors are created; defaults to
                the device implied by the constructor's `cuda` flag.
        '''
        if device is None:
            device = torch.device('cuda' if self.use_cuda else 'cpu')
        self.grid_size = grid_size
        # size of one grid cell measured in input-image pixels
        self.stride = self.img_dim // self.grid_size
        # column / row index of every cell, shaped to broadcast over (batch, anchor, row, col)
        cell_index = torch.arange(grid_size, dtype=torch.float32, device=device)
        self.grid_x = cell_index.repeat(grid_size, 1).view(1, 1, grid_size, grid_size)
        self.grid_y = cell_index.repeat(grid_size, 1).t().view(1, 1, grid_size, grid_size)
        # anchors expressed in grid-cell units instead of pixels
        self.scaled_anchors = torch.tensor(
            [(width / self.stride, height / self.stride) for width, height in self.anchors],
            dtype=torch.float32,
            device=device,
        )
        self.anchor_w = self.scaled_anchors[:, 0:1].view(1, self.num_anchors, 1, 1)
        self.anchor_h = self.scaled_anchors[:, 1:2].view(1, self.num_anchors, 1, 1)

    def forward(self, x, targets=None, img_dim=None):
        '''
        Decode the feature map and optionally compute the loss.

        Args:
            x: tensor of shape (batch, num_anchors * (num_class + 5), grid, grid).
            targets: optional tensor with one row per ground-truth box, laid out
                as (sample index, class, x, y, w, h); columns 2..5 are multiplied
                by the grid size, so they are expected to be normalised.
            img_dim: optional override of the stored input image size.

        Returns:
            (output, loss): output has shape (batch, num_anchors * grid * grid,
            5 + num_class) with boxes scaled by the stride; loss is 0 when
            targets is None.
        '''
        if img_dim is not None:
            self.img_dim = img_dim
        num_samples = x.size(0)
        grid_size = x.size(2)
        # (batch, anchor, 5 + classes, row, col) -> (batch, anchor, row, col, 5 + classes)
        # last axis layout: x, y, w, h, confidence, class scores
        prediction = (
            x.view(num_samples, self.num_anchors, self.num_class + 5, grid_size, grid_size)
            .permute(0, 1, 3, 4, 2)
            .contiguous()
        )
        # centre offsets inside the cell are squashed into (0, 1)
        x = torch.sigmoid(prediction[..., 0])
        y = torch.sigmoid(prediction[..., 1])
        w = prediction[..., 2]
        h = prediction[..., 3]
        pred_conf = torch.sigmoid(prediction[..., 4])
        pred_class = torch.sigmoid(prediction[..., 5:])
        # refresh the cached grid when the grid size, the device or the stride changed
        if (
            grid_size != self.grid_size
            or self.grid_x.device != prediction.device
            or self.stride != self.img_dim // grid_size
        ):
            self.compute_grid_offsets(grid_size, device=prediction.device)
        # decode boxes in grid units; detached so the decoded boxes carry no gradient
        pred_bbox = torch.stack(
            (
                x.detach() + self.grid_x,
                y.detach() + self.grid_y,
                # width/height scale the anchor multiplicatively, which is the
                # inverse of the log(gw / anchor_w) targets built in build_targets
                torch.exp(w.detach()) * self.anchor_w,
                torch.exp(h.detach()) * self.anchor_h,
            ),
            dim=-1,
        )
        output = torch.cat(
            (
                # map the bounding box to the position on the actual image
                pred_bbox.view(num_samples, -1, 4) * self.stride,
                pred_conf.view(num_samples, -1, 1),
                pred_class.view(num_samples, -1, self.num_class)
            ),
            dim=-1
        )
        # inference: nothing to compare against
        if targets is None:
            return output, 0

        class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf = self.build_targets(
            pred_bbox=pred_bbox,
            pred_class=pred_class,
            target=targets
        )
        # the loss of bounding box center (x, y)
        loss_x = self.mse_loss(x[obj_mask], tx[obj_mask])
        loss_y = self.mse_loss(y[obj_mask], ty[obj_mask])
        # the loss of the bounding box size
        loss_w = self.mse_loss(w[obj_mask], tw[obj_mask])
        loss_h = self.mse_loss(h[obj_mask], th[obj_mask])
        # the loss of confidence (have objects or not)
        loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask])
        loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask])
        loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj
        # the loss of class prediction
        loss_cls = self.bce_loss(pred_class[obj_mask], tcls[obj_mask])
        total_loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls
        return output, total_loss

    @staticmethod
    def bbox_wh_iou(wh1, wh2):
        '''
        IoU of boxes that share the same centre, computed from sizes only.

        Args:
            wh1: (width, height) of a single box (one anchor).
            wh2: tensor of shape (N, 2) holding N (width, height) pairs.

        Returns:
            Tensor of N IoU values.
        '''
        wh2 = wh2.t()
        w1, h1 = wh1[0], wh1[1]
        w2, h2 = wh2[0], wh2[1]
        # offsets are ignored on purpose: only the shape match with the anchor matters
        inter_area = torch.min(w1, w2) * torch.min(h1, h2)
        # 1e-16 keeps the denominator away from zero
        union_area = (w1 * h1 + 1e-16) + w2 * h2 - inter_area
        return inter_area / union_area

    def build_targets(self, pred_bbox, pred_class, target):
        '''
        Express the ground truth in the same layout as the network output.

        Args:
            pred_bbox: decoded boxes, shape (batch, anchors, grid, grid, 4).
            pred_class: class scores, shape (batch, anchors, grid, grid, classes).
            target: ground-truth rows (sample index, class, x, y, w, h).

        Returns:
            class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf
        '''
        anchors = self.scaled_anchors
        ignore_thres = self.ignore_thres
        device = pred_bbox.device
        # batch size, anchors per cell, classes, cells per side
        nB = pred_bbox.size(0)
        nA = pred_bbox.size(1)
        nC = pred_class.size(-1)
        nG = pred_bbox.size(2)
        # boolean masks: uint8 masks are deprecated for indexing
        obj_mask = torch.zeros(nB, nA, nG, nG, dtype=torch.bool, device=device)
        noobj_mask = torch.ones(nB, nA, nG, nG, dtype=torch.bool, device=device)
        # 1 where the best-scoring predicted class equals the label
        class_mask = torch.zeros(nB, nA, nG, nG, dtype=torch.float32, device=device)
        # regression / classification targets
        tx = torch.zeros(nB, nA, nG, nG, dtype=torch.float32, device=device)
        ty = torch.zeros(nB, nA, nG, nG, dtype=torch.float32, device=device)
        tw = torch.zeros(nB, nA, nG, nG, dtype=torch.float32, device=device)
        th = torch.zeros(nB, nA, nG, nG, dtype=torch.float32, device=device)
        tcls = torch.zeros(nB, nA, nG, nG, nC, dtype=torch.float32, device=device)
        # scale the boxes to grid units
        target_boxes = target[:, 2:6] * nG
        gxy = target_boxes[:, :2]
        gwh = target_boxes[:, 2:]
        # pick, for every ground-truth box, the anchor whose shape fits best
        ious = torch.stack([self.bbox_wh_iou(anchor, gwh) for anchor in anchors])
        best_ious, best_n = ious.max(0)
        # separate target values
        b, target_labels = target[:, :2].long().t()
        gx, gy = gxy.t()
        gw, gh = gwh.t()
        # cell that contains each box centre; clamped so a centre lying exactly
        # on the right/bottom image border does not index past the last cell
        gi, gj = gxy.long().t()
        gi = gi.clamp(0, nG - 1)
        gj = gj.clamp(0, nG - 1)
        # mark the responsible (cell, anchor) pairs
        obj_mask[b, best_n, gj, gi] = True
        noobj_mask[b, best_n, gj, gi] = False
        # anchors that overlap a ground-truth shape well are not penalised as background
        for i, anchor_ious in enumerate(ious.t()):
            noobj_mask[b[i], anchor_ious > ignore_thres, gj[i], gi[i]] = False
        # centre offset inside the cell (fractional part)
        tx[b, best_n, gj, gi] = gx - gx.floor()
        ty[b, best_n, gj, gi] = gy - gy.floor()
        # log-ratio to the chosen anchor; 1e-16 avoids log(0)
        tw[b, best_n, gj, gi] = torch.log(gw / anchors[best_n][:, 0] + 1e-16)
        th[b, best_n, gj, gi] = torch.log(gh / anchors[best_n][:, 1] + 1e-16)
        # one-hot encoding of the label
        tcls[b, best_n, gj, gi, target_labels] = 1
        # label correctness at the best anchor
        class_mask[b, best_n, gj, gi] = (pred_class[b, best_n, gj, gi].argmax(-1) == target_labels).float()
        # cells with an object have a target confidence of 1
        tconf = obj_mask.float()
        return class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf


## YOLOv3：config文件读取配置

### parse cfg：config文件解析

按行读取所有内容，跳过注释等，根据格式将网络设计放入`dict`中，以供模型生成。

### generate model：模型结构，基础层构建

负责已有预设，简单层的定义，如卷积层，池化层等，同时统计每一层经过时`filter`的数量，以确保在梯度流的处理中，能够保证维度对齐。

In [None]:
def parse_cfg(cfg_filepath):
    """
    Parse a Darknet-style cfg file into a list of block dictionaries.

    Blank lines and lines starting with '#' (after stripping whitespace) are
    skipped. A '[name]' line opens a new block whose 'Type' entry is the name;
    every following 'key = value' line is stored in that block as strings.

    Args:
        cfg_filepath: path of the cfg file.

    Returns:
        List of dicts, one per block, in file order. A file without any
        content yields a list holding one empty dict.

    Raises:
        ValueError: if a non-header line contains no '='.
    """
    # the context manager closes the file even if reading fails
    with open(cfg_filepath, mode='r') as cfg_file:
        lines = [line.strip() for line in cfg_file]
    # strip first, then filter, so indented comments and whitespace-only lines are dropped too
    lines = [line for line in lines if line and not line.startswith('#')]
    block = {}
    block_list = []
    for line in lines:
        if line.startswith('['):
            # flush the previous block, unless this is the very first header
            if block:
                block_list.append(block)
                block = {}
            # delete the brackets
            block['Type'] = line[1:-1].strip()
        else:
            # split on the first '=' only so values may contain '='
            key, value = line.split('=', 1)
            block[key.rstrip()] = value.lstrip()
    block_list.append(block)
    return block_list

def generate_module(cfg_filepath):
    """
    Build the list of PyTorch modules described by a cfg file.

    Args:
        cfg_filepath: path of the cfg file, parsed with parse_cfg.

    Returns:
        (hyperparameters, module_list): the first cfg block as a dict, and an
        nn.ModuleList holding one nn.Sequential per remaining block.
    """
    block_list = parse_cfg(cfg_filepath)
    # get hyperparameters from the first block ([net])
    hyperparameters = block_list[0]
    module_list = nn.ModuleList()
    # filter_list[0] is the channel count of the network input;
    # filter_list[i + 1] is the output channel count of layer i
    filter_list = [int(hyperparameters["channels"])]
    for layer_index, block in enumerate(block_list[1:]):
        module = nn.Sequential()
        layer_type = block['Type']
        # layers that do not alter the channel count inherit it from their input
        filters = filter_list[-1]
        if layer_type == 'convolutional':
            batch_norm = int(block.get('batch_normalize', 0))
            filters = int(block['filters'])
            kernel_size = int(block['size'])
            stride = int(block['stride'])
            activation = block['activation']
            # 'pad' is a flag: when set, pad so that a stride-1 convolution keeps the spatial size
            padding = (kernel_size - 1) // 2 if int(block.get('pad', 0)) else 0
            module.add_module(
                f"conv_{layer_index}",
                nn.Conv2d(
                    in_channels=filter_list[-1],
                    out_channels=filters,
                    kernel_size=kernel_size,
                    stride=stride,
                    padding=padding,
                    # the batch-norm layer supplies the bias when present
                    bias=not batch_norm
                )
            )
            if batch_norm:
                module.add_module(
                    # misspelt name kept on purpose: it is part of the state_dict keys
                    f"batch_nrom_{layer_index}",
                    nn.BatchNorm2d(filters)
                )
            if activation == 'leaky':
                module.add_module(
                    f"leaky_{layer_index}",
                    nn.LeakyReLU(0.1, inplace=True)
                )
        elif layer_type == 'maxpool':
            kernel_size = int(block['size'])
            stride = int(block['stride'])
            padding = (kernel_size - 1) // 2
            # a 2x2 / stride-1 pool would shrink the map by one; pad right and bottom to keep the size
            if kernel_size == 2 and stride == 1:
                module.add_module(
                    f"debug_padding_{layer_index}",
                    nn.ZeroPad2d((0, 1, 0, 1))
                )
            module.add_module(
                f"maxpool_{layer_index}",
                nn.MaxPool2d(
                    kernel_size=kernel_size,
                    stride=stride,
                    padding=padding
                )
            )
        elif layer_type == 'shortcut':
            # element-wise addition keeps the channel count, so filters is left as inherited
            module.add_module(
                f"shortcut_{layer_index}",
                EmptyLayer()
            )
        elif layer_type == 'upsample':
            scale_factor = int(block['stride'])
            module.add_module(
                f"upsample_{layer_index}",
                # Darknet's upsample layer replicates pixels, so nearest-neighbour
                # is what pre-trained Darknet weights expect
                nn.Upsample(scale_factor=scale_factor, mode='nearest')
            )
        elif layer_type == 'route':
            source_layers = [int(pos) for pos in block['layers'].split(',')]
            module.add_module(
                f"route_{layer_index}",
                EmptyLayer()
            )
            # Concatenation adds up the channels of all routed layers.
            # filter_list[1:] drops the network-input entry so that absolute
            # indices address layer outputs the same way layer_outputs does in
            # Darknet.forward; negative (relative) indices are unaffected.
            layer_filters = filter_list[1:]
            filters = sum(layer_filters[pos] for pos in source_layers)
        elif layer_type == 'yolo':
            anchor_idx = [int(idx) for idx in block['mask'].split(',')]
            # flat list of numbers -> (width, height) pairs -> pairs selected by the mask
            anchor_values = [int(tag) for tag in block['anchors'].split(',')]
            anchors = [(anchor_values[i], anchor_values[i + 1]) for i in range(0, len(anchor_values), 2)]
            anchors = [anchors[i] for i in anchor_idx]
            num_class = int(block['classes'])
            img_size = (int(hyperparameters['height']), int(hyperparameters['width']))
            ignore_thres = float(block['ignore_thresh'])
            module.add_module(
                f"yolo_v3_{layer_index}",
                YoLoLayer(anchors, num_class, img_size, ignore_thres)
            )
        # record every layer so filter_list stays aligned with the layer indices
        module_list.append(module)
        filter_list.append(filters)
    return hyperparameters, module_list

## Darknet：YOLOv3模型构建

### init：初始化

存储指定使用的YOLO网络配置文件路径，并据此解析配置、构建各网络层。

### forward：前向传播

将模型`Backbone`与`Head`的整体输入输出连接起来，计算输出与`loss`结果。<br>
同时此处定义了`route`层与`shortcut`层的具体逻辑：`route`层执行`concat`操作，将指定的两个`Layer`输出拼接；<br>
在`shortcut`中，利用上一层输出，将指定层输出直接加入其中（要保证维度对齐），构建梯度流。

### load pretrained weights：载入训练参数

根据指定文件载入权重数据，按层读取载入即可。

### save weights：存储权重文件

按层将当前训练结果参数进行存储。

In [None]:
class Darknet(nn.Module):
    '''
    Full network assembled from a cfg file.

    Runs the layers in order, implements the 'route' (concatenate) and
    'shortcut' (add) connections, and reads / writes Darknet-format weight
    files.

    Args:
        cfg_filepath: path of the network cfg file.
        cuda: stored as `use_cuda`; not used by the methods of this class.
    '''

    def __init__(self, cfg_filepath, cuda=False):
        super(Darknet, self).__init__()
        self.cfg = cfg_filepath
        # stored under a name that does not shadow nn.Module.cuda(),
        # otherwise model.cuda() would try to call a bool
        self.use_cuda = cuda
        self.blocks = parse_cfg(cfg_filepath)
        self.params, self.module_list = generate_module(cfg_filepath)
        # Defaults so save_weights() also works on a model that never loaded a
        # .weights file. Layout mirrors load_pretrained_weights: 5 x int32 with
        # the image counter at index 3.
        # NOTE(review): minor version 2 is believed to be what Darknet uses for
        # the 64-bit counter layout this 5-value header implies - confirm.
        self.seen = 0
        self.header_info = np.array([0, 2, 0, self.seen, 0], dtype=np.int32)

    def forward(self, x, targets=None, img_dim=None):
        '''
        Run the network.

        Args:
            x: input batch.
            targets: optional ground-truth rows forwarded to every yolo layer.
            img_dim: optional input size forwarded to every yolo layer.

        Returns:
            The detections of all yolo layers concatenated along dim 1 when
            targets is None, otherwise the tuple (detections, summed loss).
        '''
        module_cfg = self.blocks[1:]
        # output of every layer so far, indexed like the cfg blocks
        layer_outputs = []
        outputs = []
        loss = 0
        for idx, module in enumerate(module_cfg):
            layer_type = module['Type']
            if layer_type in ['convolutional', 'upsample', 'maxpool']:
                x = self.module_list[idx](x)
            elif layer_type == 'route':
                tar_layers = [int(pos) for pos in module['layers'].split(',')]
                if len(tar_layers) == 1:
                    x = layer_outputs[tar_layers[0]]
                else:
                    # concatenate all listed layers along the channel axis
                    x = torch.cat([layer_outputs[pos] for pos in tar_layers], 1)
            elif layer_type == 'shortcut':
                pos = int(module['from'])
                x = layer_outputs[-1] + layer_outputs[pos]
            elif layer_type == 'yolo':
                # index 0 unwraps the YoLoLayer from its nn.Sequential container
                x, layer_loss = self.module_list[idx][0](x, targets, img_dim)
                outputs.append(x)
                loss += layer_loss
            layer_outputs.append(x)
        # detections of all scales in one tensor
        outputs = torch.cat(outputs, 1)
        return outputs if targets is None else (outputs, loss)

    @staticmethod
    def _copy_from(weights, ptr, tensor):
        '''Copy tensor.numel() values starting at weights[ptr] into tensor; return the advanced ptr.'''
        count = tensor.numel()
        tensor.data.copy_(torch.from_numpy(weights[ptr: ptr + count]).view_as(tensor))
        return ptr + count

    def load_pretrained_weights(self, weight_filepath):
        '''
        Load convolution / batch-norm parameters from a Darknet weight file.

        The file is read as five int32 header values followed by float32
        values, consumed layer by layer in cfg order.

        Args:
            weight_filepath: path of the weight file.
        '''
        with open(weight_filepath, 'rb') as f:
            header = np.fromfile(f, dtype=np.int32, count=5)
            # index 3 of the header is kept as the image counter
            self.seen = header[3]
            # needed when saving new weight files
            self.header_info = header
            weights = np.fromfile(f, dtype=np.float32)
        ptr = 0
        for idx, module in enumerate(self.blocks[1:]):
            # only convolutional blocks carry weights
            if module['Type'] != 'convolutional':
                continue
            layer = self.module_list[idx]
            conv_layer = layer[0]
            if int(module.get('batch_normalize', 0)):
                bn_layer = layer[1]
                # file order: bias, weight, running mean, running variance
                for tensor in (bn_layer.bias, bn_layer.weight, bn_layer.running_mean, bn_layer.running_var):
                    ptr = self._copy_from(weights, ptr, tensor)
            else:
                # without batch norm the convolution owns the bias
                ptr = self._copy_from(weights, ptr, conv_layer.bias)
            ptr = self._copy_from(weights, ptr, conv_layer.weight)

    def save_weights(self, target_filepath):
        '''
        Write the header and all convolution / batch-norm parameters to a file,
        in the same order load_pretrained_weights reads them.

        Args:
            target_filepath: path of the file to write.
        '''
        self.header_info[3] = self.seen
        # the context manager closes the file even if a write fails
        with open(target_filepath, 'wb') as save_file:
            self.header_info.tofile(save_file)
            for idx, module in enumerate(self.blocks[1:]):
                if module['Type'] != 'convolutional':
                    continue
                conv_layer = self.module_list[idx][0]
                if int(module.get('batch_normalize', 0)):
                    bn_layer = self.module_list[idx][1]
                    bn_layer.bias.data.cpu().numpy().tofile(save_file)
                    bn_layer.weight.data.cpu().numpy().tofile(save_file)
                    bn_layer.running_mean.data.cpu().numpy().tofile(save_file)
                    bn_layer.running_var.data.cpu().numpy().tofile(save_file)
                else:
                    conv_layer.bias.data.cpu().numpy().tofile(save_file)
                conv_layer.weight.data.cpu().numpy().tofile(save_file)

### YOLO模型构建使用

- weight init：初始化网络层权重，针对`Conv`层与`BatchNorm`层的参数进行服从正态分布的参数初始化。

- Hyperparameters：一般来说超参数定义在`obj.data`，`yolo-obj.cfg`与`obj.names`数个文件中，记录当前模型与学习任务的各项相关参数。
    
    - 网络模型参数如`学习率`，`迭代次数`，`batch尺寸`，`输入数据维度`等，在`yolo-obj.cfg`文件中标明。

    - 训练数据属性如目标类别数，训练样本列表定义文件目录，验证/测试样本列表定义文件目录，各类别名定义文件目录以及模型参数文件自动存储目录，在`obj.data`文件中明确。

    - `obj.names`文件明确各类别的对应名字；`train.txt`明确所有训练样本路径；`test.txt`明确所有测试样本路径。

- Dataloader：读入YOLO格式的批量图片文件，并送入网络进行前向传播预测。

    - 需要为YOLO格式数据定义新Dataset，以满足对文件记录路径上图片的`读取`，`形状转化`和`padding`等操作。
    
    - 针对训练的Dataset还需要进一步修改，做到能够同时读取与图片文件同名的标记txt文件内的具体`bounding box`内容。

- 批量结果测试与训练简单示例。

    - 可以在训练使用的Dataset定义中添加`collate_fn`函数，由用户自定义每个batch的组装与处理方式。

In [None]:
# Run configuration, written inline here instead of being read from the
# obj.data / cfg files.

# -- dataset description --
class_num = 2
train = "[train_file_path]"
valid = "[test_file_path]"
backup = "[model_backup_path]"

# -- data loading --
batch_size = 64
num_workers = 4
img_width = img_height = 416

# -- training schedule --
epoch = 2000
gradient_accumulations = 2
checkpoint_interval = 100

# -- network definition and pre-trained weights --
cfg_filepath = "[cfg_path]/yolov3.cfg"
weight_filepath = "[weight_path]/yolov3.weights"

def weight_init(model):
    """
    Initialise one module in place; intended for use with nn.Module.apply.

    Modules whose class name contains "Conv" get weights drawn from
    N(0, 0.02). Modules whose class name contains "BatchNorm2d" get weights
    drawn from N(1, 0.02) and a zero bias. Every other module is left as is.

    Args:
        model: the module to initialise.
    """
    classname = model.__class__.__name__
    if "Conv" in classname:
        torch.nn.init.normal_(model.weight.data, 0.0, 0.02)
    elif "BatchNorm2d" in classname:
        torch.nn.init.normal_(model.weight.data, 1.0, 0.02)
        torch.nn.init.constant_(model.bias.data, 0.0)

In [None]:
# Build the network described by the cfg file; no pre-trained weights are loaded at this point.
model = Darknet(cfg_filepath=cfg_filepath)
# Printing an nn.Module lists its registered sub-modules, a quick check of the parsed architecture.
print(model)

In [None]:
# A pre-trained weight file is available, so weight_init is not applied to the model.
model.load_pretrained_weights(weight_filepath)
# Switch to evaluation mode: batch-norm layers then use their stored running statistics.
model.eval()

In [None]:
# dataset that serves every file found in a folder, for inference
class ImgDataset(Dataset):
    """
    Reads the files of one folder as images and resizes them to a square.

    Args:
        folder_path: folder to scan; every entry matching "*.*" is used.
        img_size: target size; only the first entry is used, the output is
            img_size[0] x img_size[0].
    """

    def __init__(self, folder_path, img_size=(416,416)):
        self.files = sorted(glob.glob("%s/*.*" % folder_path))
        self.img_size = img_size[0]

    def __getitem__(self, index):
        """Return (image path, image tensor resized to img_size x img_size)."""
        # the modulo lets indices past the end wrap around
        img_path = self.files[index % len(self.files)]
        # force three channels so grayscale / RGBA files match the network input
        img = transforms.ToTensor()(Image.open(img_path).convert('RGB'))
        # Plain resize, no aspect-ratio-preserving padding.
        # interpolate expects a batch axis, hence unsqueeze / squeeze.
        img = F.interpolate(img.unsqueeze(0), size=self.img_size, mode="nearest").squeeze(0)
        return img_path, img

    def __len__(self):
        return len(self.files)

# read the pictures with the dataloader
dataloader = DataLoader(
    ImgDataset(folder_path=valid, img_size=(img_width, img_height)),
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers
)

# read in the class names (should be defined in obj.names file)
class_names = ["cls_1", "cls_2"]

# run the network batch by batch, keeping image paths and raw detections side by side
img_detected = []
img_predictions = []
for batch_i, (img_paths, imgs) in enumerate(dataloader):
    # no gradients are needed for inference
    with torch.no_grad():
        detections = model(imgs)
        # an NMS step that drops low-confidence / overlapping boxes would go here; it is skipped in this example
    # store the paths (not the image tensors) so each detection can be reported with its file
    img_detected.extend(img_paths)
    img_predictions.extend(detections)

# show the results of the bounding boxes
for img_i, (img_path, detection) in enumerate(zip(img_detected, img_predictions)):
    if detection is not None:
        # boxes are in network-input coordinates; the pictures are assumed to be 416x416 without padding
        for row in detection:
            # raw row layout: x, y, w, h, objectness, then one score per class
            x, y, w, h, conf = row[:5].tolist()
            # best class and its score, taken from the per-class scores
            cls_conf, cls_pred = row[5:].max(0)
            print("\tScanned image: %s  Found object: %s at (%.5f, %.5f, %.5f, %.5f) with conf %.5f" % (img_path, class_names[int(cls_pred)], x, y, w, h, cls_conf.item()))

In [None]:
# the dataset for training, it needs to output the images and their labels
class TrainDataset(Dataset):
    """
    Serves (image path, image tensor, targets) triples for training.

    Args:
        file_path: text file listing one image path per line; blank lines
            are ignored.
        img_size: target size; only the first entry is used. Images whose
            height/width differ from it are resized to a square.

    The label file of an image is found by replacing "images" with "labels"
    in its path and swapping the file extension for ".txt". Each label row is
    reshaped to 5 values.
    """

    def __init__(self, file_path, img_size=(416, 416)):
        # read in all the image paths, skipping blank lines so they do not become samples
        with open(file_path, "r") as file:
            self.img_files = [line.strip() for line in file if line.strip()]
        # derive the label path; splitext copes with any image extension
        self.label_files = [
            os.path.splitext(path.replace("images", "labels"))[0] + ".txt"
            for path in self.img_files
        ]
        self.img_size = img_size[0]

    def __getitem__(self, index):
        """Return (image path, image tensor, targets of shape (num_boxes, 6))."""
        position = index % len(self.img_files)
        img_path = self.img_files[position].rstrip()
        img = transforms.ToTensor()(Image.open(img_path).convert('RGB'))
        # Labels are assumed to be normalised (TODO confirm against the label
        # files), so a plain resize would leave them valid.
        if img.shape[-2:] != (self.img_size, self.img_size):
            img = F.interpolate(img.unsqueeze(0), size=self.img_size, mode="nearest").squeeze(0)
        label_path = self.label_files[position].rstrip()
        boxes = torch.from_numpy(np.loadtxt(label_path).reshape(-1, 5))
        # column 0 is left at zero here: it is the slot for the sample index
        # within a batch, which only the batching code can know
        targets = torch.zeros((len(boxes), 6))
        targets[:, 1:] = boxes
        return img_path, img, targets

    def __len__(self):
        return len(self.img_files)

def collate_batch(batch):
    """
    Assemble a training batch from (path, image, targets) samples.

    Every sample may hold a different number of boxes, so the targets are
    concatenated instead of stacked, and column 0 of each row is set to the
    position of its sample inside the batch.
    """
    paths, imgs, targets = zip(*batch)
    for sample_index, boxes in enumerate(targets):
        boxes[:, 0] = sample_index
    return list(paths), torch.stack(imgs), torch.cat(targets, 0)


dataloader_train = DataLoader(
    TrainDataset(file_path=train, img_size=(img_width, img_height)),
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    pin_memory=True,
    collate_fn=collate_batch
)

# here we use Adam optimizer with default settings
optimizer = torch.optim.Adam(model.parameters())
# start training
for epoch_i in range(epoch):
    model.train()
    # unlike the prediction loader, this one also yields the labels read from the .txt files
    for batch_i, (_, imgs, targets) in enumerate(dataloader_train):
        batches_done = len(dataloader_train) * epoch_i + batch_i
        # with targets given, the model returns (detections, loss) in that order
        outputs, loss = model(imgs, targets)
        # gradients accumulate across backward calls until the optimizer steps
        loss.backward()
        # step once every `gradient_accumulations` batches (every batch when it is 1)
        if (batches_done + 1) % gradient_accumulations == 0:
            optimizer.step()
            optimizer.zero_grad()
    # save the current model weights, tagged with the epoch just finished
    if epoch_i >= checkpoint_interval and epoch_i % checkpoint_interval == 0:
        torch.save(model.state_dict(), "%s/yolov3_ckpt_%d.pth" % (backup, epoch_i))

### 参考资料

- [Reference 1](https://blog.paperspace.com/how-to-implement-a-yolo-v3-object-detector-from-scratch-in-pytorch-part-2/)

- [Reference 2](https://github.com/eriklindernoren/PyTorch-YOLOv3)