In [1]:
from __future__ import print_function
import sys
# Make the pytorch-yolo2 checkout importable.  A raw string keeps the Windows
# backslashes literal instead of relying on unrecognised escapes such as "\M",
# which newer Python versions warn about.
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
sys.path.append(r'D:\Machine Learning\Paper\Object Detection\YOLO\pytorch-yolo2-master')

In [2]:
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.backends.cudnn as cudnn
from torchvision import datasets, transforms
from torch.autograd import Variable

import dataset
import random
import math
import os
# Switch into the repository so the relative cfg/weight paths and the local
# imports below resolve.  Raw string: the backslashes are literal path
# separators, not escape sequences.
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
os.chdir(r'D:\Machine Learning\Paper\Object Detection\YOLO\pytorch-yolo2-master')
from utils import *
from cfg import parse_cfg
from region_loss import RegionLoss
from darknet import Darknet
from models.tiny_yolo import TinyYoloNet

In [3]:
# Training settings (relative paths, resolved against the working directory).
datacfg = 'cfg/voc.data'              # handed to read_data_cfg
cfgfile = 'cfg/yolo-voc.cfg'          # handed to parse_cfg and Darknet
weightfile = 'darknet19_448.conv.23'  # handed to model.load_weights

In [4]:
# Parse the dataset description into an option dictionary.
data_options  = read_data_cfg(datacfg)
# parse_cfg returns a sequence of blocks; the first one is the 'net' block
# holding the global training options (see the output of the net_options cell).
net_options   = parse_cfg(cfgfile)[0]

In [5]:
# Display the parsed data configuration; every value is still a string.
data_options

{'backup': 'backup',
 'gpus': '0,1,2,3',
 'names': 'data/voc.names',
 'num_workers': '10',
 'train': 'train.txt',
 'valid': '2007_test.txt'}

In [6]:
# Display the parsed 'net' options; every value is still a string.
net_options

{'angle': '0',
 'batch': '32',
 'burn_in': '1000',
 'channels': '3',
 'decay': '0.0005',
 'exposure': '1.5',
 'height': '416',
 'hue': '.1',
 'learning_rate': '0.001',
 'max_batches': '80200',
 'momentum': '0.9',
 'policy': 'steps',
 'saturation': '1.5',
 'scales': '0.1,10,.1,.1',
 'steps': '-1,500,40000,60000',
 'subdivisions': '8',
 'type': 'net',
 'width': '416'}

In [7]:
# Pull the list files and the checkpoint directory out of the data cfg.
trainlist, testlist, backupdir = (
    data_options['train'],
    data_options['valid'],
    data_options['backup'],
)

In [8]:
# Display the training list path read from the data cfg.
trainlist

'train.txt'

In [9]:
# ---- Values derived from the data cfg ---------------------------------------
nsamples      = file_lines(trainlist)
gpus          = data_options['gpus']  # e.g. 0,1,2,3
ngpus         = 0  # hard-coded; the multi-GPU branches elsewhere test `ngpus > 1`
num_workers   = int(data_options['num_workers'])

# ---- Hyper-parameters from the 'net' options (stored as strings) ------------
batch_size    = int(net_options['batch'])
max_batches   = int(net_options['max_batches'])
learning_rate = float(net_options['learning_rate'])
momentum      = float(net_options['momentum'])
decay         = float(net_options['decay'])
steps         = [float(step) for step in net_options['steps'].split(',')]
scales        = [float(scale) for scale in net_options['scales'].split(',')]

# ---- Train parameters --------------------------------------------------------
# The settings below are defined exactly once; torch is therefore seeded once.
# Floor division keeps the epoch count an integer on Python 3 as well as on
# Python 2 (true division would produce a float there).
# NOTE(review): the factor 32 is the value that was in effect for max_epochs;
# confirm it is intended.
max_epochs    = 32 * max_batches * batch_size // nsamples + 1
use_cuda      = False
seed          = int(time.time())  # time-based, so runs are not reproducible
eps           = 1e-5
save_interval = 10  # epochs
dot_interval  = 70  # batches

# ---- Test parameters ---------------------------------------------------------
conf_thresh   = 0.25
nms_thresh    = 0.4
iou_thresh    = 0.5

# Make sure the checkpoint directory exists.
if not os.path.exists(backupdir):
    os.mkdir(backupdir)

# ---- Seeding -----------------------------------------------------------------
torch.manual_seed(seed)
if use_cuda:
    os.environ['CUDA_VISIBLE_DEVICES'] = gpus
    torch.cuda.manual_seed(seed)

In [10]:
# Build the network from the cfg file.
model = Darknet(cfgfile)
# Keep a handle on the loss module the model exposes as `model.loss`.
region_loss = model.loss

In [11]:
# Load the weight file named in the settings cell into the model.
model.load_weights(weightfile)
# Print the layer table (shown, truncated, in the cell output below).
model.print_network()

  conv_model.weight.data.copy_(torch.from_numpy(buf[start:start+num_w])); start = start + num_w


layer     filters    size              input                output
    0 conv     32  3 x 3 / 1   416 x 416 x   3   ->   416 x 416 x  32
    1 max          2 x 2 / 2   416 x 416 x  32   ->   208 x 208 x  32
    2 conv     64  3 x 3 / 1   208 x 208 x  32   ->   208 x 208 x  64
    3 max          2 x 2 / 2   208 x 208 x  64   ->   104 x 104 x  64
    4 conv    128  3 x 3 / 1   104 x 104 x  64   ->   104 x 104 x 128
    5 conv     64  1 x 1 / 1   104 x 104 x 128   ->   104 x 104 x  64
    6 conv    128  3 x 3 / 1   104 x 104 x  64   ->   104 x 104 x 128
    7 max          2 x 2 / 2   104 x 104 x 128   ->    52 x  52 x 128
    8 conv    256  3 x 3 / 1    52 x  52 x 128   ->    52 x  52 x 256
    9 conv    128  1 x 1 / 1    52 x  52 x 256   ->    52 x  52 x 128
   10 conv    256  3 x 3 / 1    52 x  52 x 128   ->    52 x  52 x 256
   11 max          2 x 2 / 2    52 x  52 x 256   ->    26 x  26 x 256
   12 conv    512  3 x 3 / 1    26 x  26 x 256   ->    26 x  26 x 512
   13 conv    256  1 x 

In [12]:
# Keep the loss module's sample counter in sync with the model's.
region_loss.seen  = model.seen
# Batches/epochs already processed are counts, so use floor division: true
# division would turn them into floats on Python 3.
processed_batches = model.seen // batch_size

init_width        = model.width
init_height       = model.height
init_epoch        = model.seen // nsamples

# Worker processes and pinned memory are only requested when running on CUDA.
kwargs = {'num_workers': num_workers, 'pin_memory': True} if use_cuda else {}

# Evaluation loader: fixed input shape, no shuffling, tensors only.
test_loader = torch.utils.data.DataLoader(
    dataset.listDataset(testlist, shape=(init_width, init_height),
                        shuffle=False,
                        transform=transforms.Compose([
                            transforms.ToTensor(),
                        ]), train=False),
    batch_size=batch_size, shuffle=False, **kwargs)

In [13]:
# Move the model to the GPU only when CUDA is enabled; with more than one GPU
# configured it is wrapped in DataParallel first.
if use_cuda:
    model = torch.nn.DataParallel(model).cuda() if ngpus > 1 else model.cuda()

In [14]:
# Build one optimizer group per parameter tensor: parameters whose name
# contains '.bn' or '.bias' get no weight decay, all others get
# decay*batch_size.
params_dict = dict(model.named_parameters())
params = []
for key, value in params_dict.items():
    no_decay = ('.bn' in key) or ('.bias' in key)
    group_decay = 0.0 if no_decay else decay*batch_size
    params.append({'params': [value], 'weight_decay': group_decay})

In [15]:
# Pass the per-parameter groups built in the previous cell (`params`) rather
# than model.parameters(): otherwise those groups are never used and the
# '.bn' / '.bias' parameters receive weight decay despite being assigned 0.0.
# Every group carries its own 'weight_decay'; the keyword below is only the
# default for groups that do not set one.
# Learning rate and decay are scaled by batch_size, as in the original call.
optimizer = optim.SGD(params, lr=learning_rate/batch_size, momentum=momentum, dampening=0, weight_decay=decay*batch_size)

In [17]:
def adjust_learning_rate(optimizer, batch):
    """Apply the step/scale learning-rate schedule for the given batch index.

    Starting from the global ``learning_rate``, the rate is multiplied by
    ``scales[i]`` (or 1 when ``scales`` is shorter than ``steps``) for every
    ``steps[i]`` that ``batch`` has reached, walking the steps in order and
    stopping at the first step not yet reached or hit exactly.

    Every param group of ``optimizer`` has its 'lr' set to the resulting rate
    divided by the global ``batch_size``; the undivided rate is returned.
    """
    current_lr = learning_rate
    for idx, boundary in enumerate(steps):
        # Boundaries are visited in order; stop at the first one not reached.
        if not batch >= boundary:
            break
        factor = scales[idx] if idx < len(scales) else 1
        current_lr = current_lr * factor
        # Landing exactly on a boundary ends the walk as well.
        if batch == boundary:
            break
    for group in optimizer.param_groups:
        group['lr'] = current_lr/batch_size
    return current_lr

In [19]:
# NOTE(review): a `global` statement at cell/module top level has no effect;
# names assigned here are already module-level.
global processed_batches

In [21]:
# Epoch number used by the log message further down.
epoch = 1

In [22]:
# Start a timer and pick the object that carries the model attributes: with
# more than one GPU the model is reached through `.module`.
t0 = time.time()
cur_model = model.module if ngpus > 1 else model

In [23]:
# Training loader.  Shuffling is requested from listDataset (shuffle=True),
# not from the DataLoader (shuffle=False).
train_loader = torch.utils.data.DataLoader(
    dataset.listDataset(
        trainlist,
        shape=(init_width, init_height),
        shuffle=True,
        transform=transforms.Compose([transforms.ToTensor()]),
        train=True,
        seen=cur_model.seen,
        batch_size=batch_size,
        num_workers=num_workers,
    ),
    batch_size=batch_size,
    shuffle=False,
    **kwargs
)

In [24]:
# Set the optimizer's learning rate for the current batch count; `lr` is the
# rate before the division by batch_size that adjust_learning_rate applies.
lr = adjust_learning_rate(optimizer, processed_batches)

In [25]:
# Log the epoch, epoch * dataset size as the "processed" count, and the rate.
logging('epoch %d, processed %d samples, lr %f' % (epoch, epoch * len(train_loader.dataset), lr))

2018-04-01 11:06:12 epoch 1, processed 16551 samples, lr 0.000100


In [26]:
# Put the model in training mode and fetch only the first batch.
model.train()

batch_idx = 0
data, target = next(iter(train_loader))

In [27]:
# Wrap the batch tensors in autograd Variables before the forward pass.
data, target = Variable(data), Variable(target)

In [30]:
# Forward pass; the decoding cell further down reports output.shape as [32, 125, 13, 13].
output = model(data)

In [33]:
# Advance the loss module's sample counter by the batch size.
# NOTE: not idempotent; re-running this cell counts the same batch again.
region_loss.seen = region_loss.seen + data.data.size(0)

In [36]:
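# The one-line loss call is left disabled; the next cell works through the
# same tensors by hand instead.
# loss = region_loss(output, target)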

In [38]:
        # Manual walk-through of how the network output is decoded into predicted
        # boxes (presumably mirroring the first part of the region loss; confirm).
        # output: B x (A*(4+1+num_classes)) x H x W
        t0 = time.time()
        nB = output.data.size(0)
        nA = region_loss.num_anchors
        nC = region_loss.num_classes
        # H and W are read from the trailing dimensions so that the cell can be
        # re-run after `output` has been reshaped to 5-D below (size(2)/size(3)
        # would then pick up the wrong axes).
        nH = output.data.size(-2)
        nW = output.data.size(-1)
        # Values observed in this run: nB, nA, nC, nH, nW = (32, 5, 20, 13, 13)

        # output.shape: [32, 125, 13, 13] -> [32, 5, 25, 13, 13]
        output   = output.view(nB, nA, (5+nC), nH, nW)

        def _select_channel(channel):
            """Return entry `channel` of every anchor's prediction vector, shaped [nB, nA, nH, nW]."""
            # index_select plays the role of tf.gather, see
            # http://pytorch.org/docs/master/torch.html?highlight=index_select#torch.index_select
            # The index stays wrapped in Variable, exactly as in the original cell.
            index = Variable(torch.LongTensor([channel]))
            return output.index_select(2, index).view(nB, nA, nH, nW)

        # torch.sigmoid replaces F.sigmoid, which later PyTorch releases deprecate;
        # the values are identical.
        x = torch.sigmoid(_select_channel(0))     # x.shape    = [32, 5, 13, 13]
        y = torch.sigmoid(_select_channel(1))     # y.shape    = [32, 5, 13, 13]
        w = _select_channel(2)                    # w.shape    = [32, 5, 13, 13]
        h = _select_channel(3)                    # h.shape    = [32, 5, 13, 13]
        conf = torch.sigmoid(_select_channel(4))  # conf.shape = [32, 5, 13, 13]

        # Entries 5 .. 5+nC-1 (= [5, 6, ..., 24] here) are the remaining 20 class
        # outputs; cls.shape = [32, 5, 20, 13, 13].  torch.arange builds the integer
        # indices directly instead of truncating a float linspace.
        cls = output.index_select(2, Variable(torch.arange(5, 5+nC).long()))
        # Note: this is NOT the same as calling .view(nB*nA*nH*nW, nC) directly.
        # The transpose moves the class axis last first, so each row of the result
        # holds the nC scores of one (image, anchor, cell).  cls.shape = [27040, 20]
        cls = cls.view(nB*nA, nC, nH*nW).transpose(1,2).contiguous().view(nB*nA*nH*nW, nC)
        t1 = time.time()

        pred_boxes = torch.FloatTensor(4, nB*nA*nH*nW)
        # pred_boxes.shape = [4, 27040]
        grid_x = torch.linspace(0, nW-1, nW).repeat(nH,1).repeat(nB*nA, 1, 1).view(nB*nA*nH*nW)
        grid_y = torch.linspace(0, nH-1, nH).repeat(nW,1).t().repeat(nB*nA, 1, 1).view(nB*nA*nH*nW)
        # grid_x.shape = grid_y.shape = [27040]
        # grid_x is 0,1,2,...,12,0,1,2,... (the column index, cycling every 13 entries);
        # grid_y is 0 x13, 1 x13, ..., 12 x13 (the row index); both patterns repeat
        # for each of the nB*nA (image, anchor) pairs.

        # Split the flat anchor list into its two columns: the widths and the
        # heights (the two side lengths of each anchor box).
        # region_loss.anchors is a list of 10 items in this run:
        #   [1.3221, 1.73145, 3.19275, 4.00944, 5.05587,
        #    8.09892, 9.47112, 4.84053, 11.2364, 10.0071]
        #   anchor_w, shape [5, 1]: 1.3221, 3.1927, 5.0559, 9.4711, 11.2364
        #   anchor_h, shape [5, 1]: 1.7314, 4.0094, 8.0989, 4.8405, 10.0071
        anchor_w = torch.Tensor(region_loss.anchors).view(nA, region_loss.anchor_step).index_select(1, torch.LongTensor([0]))
        anchor_h = torch.Tensor(region_loss.anchors).view(nA, region_loss.anchor_step).index_select(1, torch.LongTensor([1]))
        # Tile the [5, 1] columns until they are flat vectors of size [27040].
        anchor_w = anchor_w.repeat(nB, 1).repeat(1, 1, nH*nW).view(nB*nA*nH*nW)
        anchor_h = anchor_h.repeat(nB, 1).repeat(1, 1, nH*nW).view(nB*nA*nH*nW)

        # Form this cell started from, kept for reference:
        #   pred_boxes[0] = x.data + grid_x
        #   pred_boxes[1] = y.data + grid_y
        #   pred_boxes[2] = torch.exp(w.data) * anchor_w
        #   pred_boxes[3] = torch.exp(h.data) * anchor_h
        # That form was considered too sloppy (4-D operands combined with flat
        # ones), so the operands are flattened explicitly; the result is the same.
        pred_boxes[0] = x.data.view([-1]) + grid_x
        pred_boxes[1] = y.data.view([-1]) + grid_y
        pred_boxes[2] = torch.exp(w.data.view([-1])) * anchor_w
        pred_boxes[3] = torch.exp(h.data.view([-1])) * anchor_h
        # How to read this:
        #   grid_x[13*13:13*13*2] = 0,1,2,...,12,0,1,2,...
        #   grid_y[13*13:13*13*2] = 0,0,...,0,1,1,...
        # Taken together, each 13*13 segment of (grid_x, grid_y) visits every cell of
        # the 13x13 board exactly once.
        # anchor_w[13*13:13*13*2] and anchor_h[13*13:13*13*2] each hold a single
        # repeated value (w and h differ from each other); one w plus one h is the
        # size of one anchor, shared by all 13*13 cell centres of that segment.
        # There are batch * num_anchors = 32 * 5 = 160 such segments, so overall
        # this visits, for every image, each of the 13*13 cells with each of the
        # 5 anchor sizes.
        # x and y are the predicted centre offsets in [0, 1] (after the sigmoid):
        # think of each image as split into 13*13 cells, with x and y the position
        # of the predicted object's centre inside its cell.
        # Open question from the author: where do these odd anchor sizes come from?
        # Possibly the k-means "Dimension Clusters" described in the paper, but
        # that is not confirmed here.

        pred_boxes = convert2cpu(pred_boxes.transpose(0,1).contiguous().view(-1,4))
        # Open question from the author: is the move to CPU also meant to cut the
        # gradient?  (pred_boxes was already assembled from `.data` tensors above.)
        t2 = time.time()