In [1]:
%matplotlib inline
import os
import shutil
import sys
import numpy as np
import pandas as pd
import mxnet as mx
from mxnet import gluon
from mxnet import image
from mxnet import nd
from mxnet import init
from mxnet import autograd
from mxnet.gluon.data import vision
from mxnet.gluon import nn
from mxnet.gluon.model_zoo import vision as models
from PIL import Image
from time import time
from sklearn.model_selection import train_test_split

In [2]:
def mkdir_if_not_exist(path):
    """Create the directory described by ``path`` unless it already exists.

    ``path`` is a sequence of path components; they are combined with
    ``os.path.join`` and any missing intermediate directories are created too.
    """
    target = os.path.join(*path)
    if os.path.exists(target):
        return
    os.makedirs(target)


def reorg_data(label_file, train_dir, test_dir, input_dir, valid_ratio):
    """Copy the raw images into per-class ``train``/``valid``/``test`` folders.

    The labelled images in ``train_dir`` are shuffled (with ``np.random``) and
    copied to ``<input_dir>/train/<breed>/`` or ``<input_dir>/valid/<breed>/``.
    Every file in ``test_dir`` is copied to ``<input_dir>/test/unknown/``.
    Existing ``train`` and ``valid`` folders are removed first.

    Args:
        label_file: CSV file with an ``id`` and a ``breed`` column.
        train_dir: directory with the labelled images; each file name without
            its extension must be an ``id`` listed in ``label_file``.
        test_dir: directory with the unlabelled test images.
        input_dir: root directory of the reorganised data set.
        valid_ratio: NOTE: despite its name this is the fraction of the
            shuffled images copied to ``train``; the remainder goes to
            ``valid`` (``valid_ratio=0.8`` gives an 80/20 train/valid split).
            The semantics are kept unchanged for existing callers.

    Raises:
        KeyError: if a file in ``train_dir`` has no row in ``label_file``.
    """
    labels = pd.read_csv(label_file)
    # Look the breed up by image id instead of by position: positional lookup
    # only works when the CSV rows happen to be in the same order as the
    # sorted file names.
    breed_of = dict(zip(labels['id'].astype(str), labels['breed']))

    files = sorted(os.listdir(train_dir))
    idx = np.arange(len(files))
    np.random.shuffle(idx)

    # Start from empty splits; the existence check lets the first run succeed
    # when the folders have not been created yet.
    for split in ('train', 'valid'):
        split_dir = os.path.join(input_dir, split)
        if os.path.isdir(split_dir):
            shutil.rmtree(split_dir)

    n_train = valid_ratio * len(files)
    for count, i in enumerate(idx):
        name = files[i]
        image_id = os.path.splitext(name)[0]
        if image_id not in breed_of:
            raise KeyError('no label for training image %r in %s' % (name, label_file))
        split = 'train' if count < n_train else 'valid'
        target = os.path.join(input_dir, split, breed_of[image_id])
        os.makedirs(target, exist_ok=True)
        shutil.copy(os.path.join(train_dir, name), target)

    # All test images share one placeholder class folder.
    test_target = os.path.join(input_dir, 'test', 'unknown')
    os.makedirs(test_target, exist_ok=True)
    for name in os.listdir(test_dir):
        shutil.copy(os.path.join(test_dir, name), test_target)
    

In [3]:
# Locations of the raw data (label CSV, labelled images, test images) and of
# the reorganised copy that the datasets below read from.
label_file = '/home/samael/kaggle/dogs/labels.csv'
train_dir = '/home/samael/kaggle/dogs/train/'
test_dir = '/home/samael/kaggle/dogs/test/'
# The trailing separator matters: later cells build paths as input_dir + 'train'.
input_dir = '/home/samael/kaggle/dogs/data/'
# reorg_data sends the first valid_ratio share of the shuffled images to
# 'train', so 0.8 means 80% train / 20% valid.
reorg_data(
    label_file=label_file,
    train_dir=train_dir,
    test_dir=test_dir,
    input_dir=input_dir,
    valid_ratio=0.8,
)

In [4]:
import re

# Extra images, one sub-directory per class. The sub-directory names
# presumably look like ``n<digits>-<Breed_name>`` (inferred from the regex
# below) -- confirm against the actual download.
extra_dir = '/home/samael/kaggle/dogs/Images'
for class_dir in os.listdir(extra_dir):
    # Strip only the leading ``n<digits>-`` prefix and lower-case the rest to
    # obtain the class folder name used under <input_dir>/train.
    directory = re.sub('^n[0-9]+-', '', class_dir).lower()
    target = os.path.join(input_dir, 'train', directory)
    # Without an existing directory shutil.copy would write every image to a
    # single *file* called <directory>, each copy overwriting the previous one.
    os.makedirs(target, exist_ok=True)
    source = os.path.join(extra_dir, class_dir)
    for name in os.listdir(source):
        shutil.copy(os.path.join(source, name), target)

In [5]:
def transform_train(data, label):
    """Pre-process one training sample with random augmentation.

    The image is converted to float32, scaled by 1/255 and resized to
    224x224. It then passes through the augmenters built by
    ``image.CreateAugmenter`` (random crop/resize/mirror enabled, mean/std
    normalisation, ``rand_gray=0.1``) and its axes are reordered with
    ``(2, 0, 1)`` (presumably HWC -> CHW).

    Returns:
        ``(image, label)`` where ``label`` is a float32 scalar.
    """
    resized = image.imresize(data.astype('float32') / 255, 224, 224)
    augmenters = image.CreateAugmenter(
        data_shape=(3, 224, 224), resize=0,
        rand_crop=True, rand_resize=True, rand_mirror=True,
        mean=np.array([0.485, 0.456, 0.406]),
        std=np.array([0.229, 0.224, 0.225]),
        brightness=0, contrast=0, saturation=0, hue=0,
        pca_noise=0, rand_gray=0.1, inter_method=2)
    augmented = resized
    for augment in augmenters:
        augmented = augment(augmented)
    channels_first = nd.transpose(augmented, (2, 0, 1))
    target = nd.array([label]).asscalar().astype('float32')
    return (channels_first, target)

def transform_test(data, label):
    """Pre-process one evaluation sample deterministically.

    The image is converted to float32, scaled by 1/255, resized to 224x224,
    normalised with the same per-channel mean/std that ``transform_train``
    passes to ``image.CreateAugmenter``, and its axes are reordered with
    ``(2, 0, 1)`` (presumably HWC -> CHW). No random augmentation is applied.

    Returns:
        ``(image, label)`` where ``label`` is a float32 scalar.
    """
    im = image.imresize(data.astype('float32') / 255, 224, 224)
    # The network is trained on mean/std-normalised inputs, so inference must
    # normalise as well; otherwise the inputs are on a different scale.
    im = image.color_normalize(im,
                               mean=nd.array([0.485, 0.456, 0.406]),
                               std=nd.array([0.229, 0.224, 0.225]))
    im = nd.transpose(im, (2, 0, 1))
    return (im, nd.array([label]).asscalar().astype('float32'))

In [6]:
class Residual(nn.HybridBlock):
    """Residual block: two 3x3 convolutions, each followed by batch norm, plus a shortcut.

    When ``same_shape`` is False the first convolution uses stride 2 and the
    shortcut goes through a 1x1 convolution with the same stride and
    ``channels`` output channels, so both branches can be added.
    """

    def __init__(self, channels, same_shape=True, **kwargs):
        """Create the layers.

        Args:
            channels: number of output channels of every convolution in the block.
            same_shape: if True use stride 1 and an identity shortcut; if False
                use stride 2 and a 1x1 convolution on the shortcut.
            **kwargs: forwarded to ``nn.HybridBlock``.
        """
        super(Residual, self).__init__(**kwargs)
        self.same_shape = same_shape
        with self.name_scope():
            strides = 1 if same_shape else 2
            self.conv1 = nn.Conv2D(channels, kernel_size=3, padding=1,
                                  strides=strides)
            self.bn1 = nn.BatchNorm()
            self.conv2 = nn.Conv2D(channels, kernel_size=3, padding=1)
            self.bn2 = nn.BatchNorm()
            if not same_shape:
                # Shortcut projection, only created when the main branch
                # changes stride.
                self.conv3 = nn.Conv2D(channels, kernel_size=1,
                                      strides=strides)

    def hybrid_forward(self, F, x):
        """Return ``relu(bn2(conv2(relu(bn1(conv1(x))))) + shortcut(x))``."""
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        if not self.same_shape:
            x = self.conv3(x)
        return F.relu(out + x)


class ResNet(nn.HybridBlock):
    """Residual network assembled from ``Residual`` blocks.

    Layout: a 3x3 convolution stem with 32 channels, six 32-channel residual
    blocks, a strided 64-channel block followed by four more, a strided
    128-channel block followed by four more, then global average pooling,
    flatten and a dense layer with ``num_classes`` outputs.
    """

    def __init__(self, num_classes, verbose=False, **kwargs):
        """Build the layer stack.

        Args:
            num_classes: number of units of the final dense layer.
            verbose: if True, ``hybrid_forward`` prints the output shape of
                every layer in the stack.
            **kwargs: forwarded to ``nn.HybridBlock``.
        """
        super(ResNet, self).__init__(**kwargs)
        self.verbose = verbose
        with self.name_scope():
            net = self.net = nn.HybridSequential()
            # Block 1
            net.add(nn.Conv2D(channels=32, kernel_size=3, strides=1,
                              padding=1))
            net.add(nn.BatchNorm())
            net.add(nn.Activation(activation='relu'))
            # Block 2
            for _ in range(6):
                net.add(Residual(channels=32))
            # Block 3
            net.add(Residual(channels=64, same_shape=False))
            for _ in range(4):
                net.add(Residual(channels=64))
            # Block 4
            net.add(Residual(channels=128, same_shape=False))
            for _ in range(4):
                net.add(Residual(channels=128))
            # Block 5
            net.add(nn.GlobalAvgPool2D())
            net.add(nn.Flatten())
            net.add(nn.Dense(num_classes))

    def hybrid_forward(self, F, x):
        """Apply the layers of ``self.net`` in order and return the last output."""
        out = x
        for i, b in enumerate(self.net):
            out = b(out)
            if self.verbose:
                # NOTE(review): relies on ``out`` exposing ``.shape``; confirm
                # this still holds once the block is hybridized.
                print('Block %d output: %s'%(i+1, out.shape))
        return out


def get_net(ctx):
    """Return a freshly Xavier-initialised ``ResNet`` with 120 outputs on ``ctx``."""
    model = ResNet(num_classes=120)
    model.initialize(init=init.Xavier(), ctx=ctx)
    return model

In [7]:
import datetime
import sys
# Presumably makes the parent directory importable so that the local `utils`
# module below can be found -- confirm where `utils` actually lives.
sys.path.append('..')
import utils

# Module-level softmax cross-entropy loss object.
softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()
def _get_batch(batch, ctx):
    """Split one batch across the devices in ``ctx``.

    ``batch`` is either an ``mx.io.DataBatch`` (first data/label entry is used)
    or a ``(data, label)`` pair.

    Returns:
        ``(data_parts, label_parts, batch_size)`` where the two lists hold the
        per-device slices and ``batch_size`` is the first dimension of the data.
    """
    if not isinstance(batch, mx.io.DataBatch):
        features, targets = batch
    else:
        features, targets = batch.data[0], batch.label[0]
    feature_parts = gluon.utils.split_and_load(features, ctx, even_split=False)
    target_parts = gluon.utils.split_and_load(targets, ctx, even_split=False)
    return (feature_parts, target_parts, features.shape[0])


def accuracy(output, label):
    """Return the fraction of rows of ``output`` whose arg-max equals ``label``."""
    predicted = output.argmax(axis=1)
    hits = predicted == label
    return nd.mean(hits).asscalar()


def evaluate_accuracy(data_iterator, net, ctx=[mx.cpu()]):
    """Return the classification accuracy of ``net`` over ``data_iterator``.

    Args:
        data_iterator: iterable of batches accepted by ``_get_batch``; an
            ``mx.io.MXDataIter`` is reset before use.
        net: callable mapping a data slice to class scores.
        ctx: a single ``mx.Context`` or a list of them to evaluate on.
    """
    devices = [ctx] if isinstance(ctx, mx.Context) else ctx
    correct = nd.array([0])
    seen = 0.
    if isinstance(data_iterator, mx.io.MXDataIter):
        data_iterator.reset()
    for batch in data_iterator:
        parts, targets, batch_size = _get_batch(batch, devices)
        for X, y in zip(parts, targets):
            hits = nd.sum(net(X).argmax(axis=1) == y)
            correct += hits.copyto(mx.cpu())
        # Wait for the result so that not too many operators queue up in the backend.
        correct.wait_to_read()
        seen += batch_size
    return correct.asscalar() / seen

# def train(net, train_data, valid_data, num_epochs, lr, wd, ctx, lr_period, lr_decay):
#     trainer = gluon.Trainer(
#         net.collect_params(), 'sgd', {'learning_rate': lr, 'momentum': 0.9, 'wd': wd})

#     prev_time = datetime.datetime.now()
#     for epoch in range(num_epochs):
#         train_loss = 0.0
#         train_acc = 0.0
#         if epoch > 0 and epoch % lr_period == 0:
#             trainer.set_learning_rate(trainer.learning_rate * lr_decay)
#         for data, label in train_data:
#             data_list = gluon.utils.split_and_load(data, ctx, even_split=False)
#             label_list = gluon.utils.split_and_load(label, ctx, even_split=False)
# #             label = label.as_in_context(ctx)
#             with autograd.record():
#                 losses = [softmax_cross_entropy(net(X), y) for X, y in zip(data_list, label_list)]
# #                 output = net(data.as_in_context(ctx))
# #                 loss = softmax_cross_entropy(output, label)
# #             loss.backward()
#             for l in losses:
#                 l.backward()
#             train_loss += sum([l.sum().asscalar() for l in losses])
#             trainer.step(batch_size)
# #             train_loss += nd.mean(loss).asscalar()
# #             train_acc += accuracy(output, label)
#             train_acc += np.mean([accuracy(net(X), y) for X, y in zip(data_list, label_list)])
#         cur_time = datetime.datetime.now()
#         h, remainder = divmod((cur_time - prev_time).seconds, 3600)
#         m, s = divmod(remainder, 60)
#         time_str = "Time %02d:%02d:%02d" % (h, m, s)
#         if valid_data is not None:
#             valid_acc = evaluate_accuracy(valid_data, net, mx.gpu(0))
#             epoch_str = ("Epoch %d. Loss: %f, Train acc %f, Valid acc %f, "
#                          % (epoch, train_loss / len(train_data),
#                             train_acc / len(train_data), valid_acc))
#         else:
#             epoch_str = ("Epoch %d. Loss: %f, Train acc %f, "
#                          % (epoch, train_loss / len(train_data),
#                             train_acc / len(train_data)))
#         prev_time = cur_time
#         print(epoch_str + time_str + ', lr ' + str(trainer.learning_rate))

In [8]:
batch_size = 64
# os.path.join keeps these paths valid whether or not input_dir ends with a
# path separator.
train_ds = vision.ImageFolderDataset(os.path.join(input_dir, 'train'), flag=1,
                                     transform=transform_train)
# NOTE(review): the validation set goes through transform_train, i.e. the
# random training augmentations, so validation accuracy is measured on
# randomly augmented images. Consider a deterministic transform here.
valid_ds = vision.ImageFolderDataset(os.path.join(input_dir, 'valid'), flag=1,
                                     transform=transform_train)
test_ds = vision.ImageFolderDataset(os.path.join(input_dir, 'test'), flag=1,
                                    transform=transform_test)

loader = gluon.data.DataLoader
train_data = loader(train_ds, batch_size, shuffle=True, last_batch='keep')
valid_data = loader(valid_ds, batch_size, shuffle=True, last_batch='keep')
# The test loader must not shuffle: the submission cell pairs predictions with
# the sorted test file names by position.
test_data = loader(test_ds, batch_size, shuffle=False, last_batch='keep')

In [9]:
# ctx = [mx.gpu(0), mx.gpu(1)]
# num_epochs = 300
# learning_rate = 0.1
# weight_decay = 5e-4
# lr_period = 100
# lr_decay = 0.1

# net = get_net(ctx)
# net.hybridize()
# train(net, train_data, valid_data, num_epochs, learning_rate, 
#       weight_decay, ctx, lr_period, lr_decay)

In [10]:
# Load ResNet-152 v1 from the Gluon model zoo together with its pretrained weights.
pretrained_net = models.resnet152_v1(pretrained=True)

In [11]:
# Devices to train on.
ctx = [mx.gpu(0), mx.gpu(1)]
# Training hyper-parameters. train() captures num_epochs, learning_rate and
# weight_decay as default argument values when it is defined, so this cell has
# to run before the cell that defines train().
num_epochs = 600
learning_rate = 0.01
weight_decay = 5e-4
# util_train multiplies the learning rate by lr_decay every lr_period epochs.
lr_period = 150
lr_decay = 0.1
# Same architecture with a 120-way output layer.
finetune_net = models.resnet152_v1(classes=120)
# Reuse the pretrained feature extractor (the same object, not a copy).
finetune_net.features = pretrained_net.features
# Only the new output layer needs initialising; train() later moves all
# parameters to every device in ctx.
finetune_net.output.initialize(ctx=ctx[0], init=init.Xavier())

In [15]:
# Show the layer structure of the pretrained network.
print(pretrained_net)

ResNetV1(
  (features): HybridSequential(
    (0): Conv2D(3 -> 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, in_channels=64)
    (2): Activation(relu)
    (3): MaxPool2D(size=(3, 3), stride=(2, 2), padding=(1, 1), ceil_mode=False)
    (4): HybridSequential(
      (0): BottleneckV1(
        (body): HybridSequential(
          (0): Conv2D(64 -> 64, kernel_size=(1, 1), stride=(1, 1))
          (1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, in_channels=64)
          (2): Activation(relu)
          (3): Conv2D(64 -> 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (4): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, in_channels=64)
          (5): Activation(relu)
          (6): Conv2D(64 -> 256, kernel_size=(1, 1), stride=(1, 1))
          (7): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, in_channels=256)
        )
        (downsa

In [12]:
def util_train(train_data, test_data, net, loss, trainer, ctx, num_epochs, lr_period, lr_decay, print_batches=None):
    """Train ``net`` and print loss/accuracy after every epoch.

    Args:
        train_data: iterable of training batches accepted by ``_get_batch``.
        test_data: iterable of batches used for the per-epoch accuracy that is
            reported as "Valid acc".
        net: network to train.
        loss: callable ``loss(prediction, label)`` returning per-sample losses.
        trainer: ``gluon.Trainer`` that updates the parameters of ``net``.
        ctx: a single ``mx.Context`` or a list of them.
        num_epochs: number of passes over ``train_data``.
        lr_period: the learning rate is decayed every ``lr_period`` epochs.
        lr_decay: factor the learning rate is multiplied by at each decay.
        print_batches: if set, also print running statistics every
            ``print_batches`` batches.
    """
    total_times = 0
    print("Start training on ", ctx)
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    for epoch in range(num_epochs):
        # Step decay of the learning rate (never at epoch 0).
        if epoch > 0 and epoch % lr_period == 0:
            trainer.set_learning_rate(trainer.learning_rate * lr_decay)
        train_loss, train_acc, n, m_all = 0.0, 0.0, 0.0, 0.0
        if isinstance(train_data, mx.io.MXDataIter):
            train_data.reset()
        start = time()
        for i, batch in enumerate(train_data):
            data, label, batch_size = _get_batch(batch, ctx)
            with autograd.record():
                outputs = [net(X) for X in data]
                losses = [loss(yhat, y) for yhat, y in zip(outputs, label)]
            for l in losses:
                l.backward()
            train_acc += sum([(yhat.argmax(axis=1)==y).sum().asscalar()
                              for yhat, y in zip(outputs, label)])
            train_loss += sum([l.sum().asscalar() for l in losses])
            # Gradients are summed over the batch; step() rescales by batch_size.
            trainer.step(batch_size)
            n += batch_size
            m_all += sum([y.size for y in label])
            if print_batches and (i+1) % print_batches == 0:
                # Report the 1-based batch number, not the running sample count.
                print("Batch %d. Loss: %f, Train acc %f" % (
                    i + 1, train_loss/n, train_acc/m_all
                ))

        test_acc = evaluate_accuracy(test_data, net, ctx)
        # The epoch duration includes the evaluation pass above.
        duration = time() - start
        total_times += duration
        h, remainder = divmod(total_times, 3600)
        m, s = divmod(remainder, 60)
        time_str = "%02d:%02d:%02d" % (h, m, s)
        print("Epoch %d. Loss: %f, Train acc %f, Valid acc %f, Time %.1f sec, Total time %s, lr %s" % (
            epoch, train_loss/n, train_acc/m_all, test_acc, duration, time_str, str(trainer.learning_rate)
        ))

In [13]:
def train(net, ctx, epochs=num_epochs, learning_rate=learning_rate, wd=weight_decay):
    """Fine-tune ``net`` on the module-level ``train_data``/``valid_data`` loaders.

    Args:
        net: network with a ``features`` sub-block whose parameters get a
            reduced learning-rate multiplier.
        ctx: device or list of devices to train on.
        epochs: number of training epochs.
        learning_rate: base learning rate of the SGD trainer; also used as the
            ``lr_mult`` of the feature parameters (see below).
        wd: weight decay.

    The learning-rate schedule uses the module-level ``lr_period`` and
    ``lr_decay``.
    """
    # Make sure the parameters of net live on ctx.
    net.collect_params().reset_ctx(ctx)
    net.hybridize()
    loss = gluon.loss.SoftmaxCrossEntropyLoss()
    # Training
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {
        'learning_rate': learning_rate, 'wd': wd})
    # Slow down the feature layers: lr_mult is Gluon's per-parameter
    # learning-rate multiplier. Parameters from the 820th onward get a 10x
    # larger multiplier than the earlier ones. Use the features of the network
    # passed in, not of a global network.
    for count, (_, param) in enumerate(net.features.collect_params().items(), start=1):
        param.lr_mult = learning_rate * 10 if count >= 820 else learning_rate
    util_train(train_data, valid_data, net, loss, trainer, ctx, epochs, lr_period, lr_decay)

In [14]:
# Fine-tune on the devices in ctx, using train()'s default epochs, learning rate and weight decay.
train(finetune_net, ctx)

Start training on  [gpu(0), gpu(1)]
Epoch 0. Loss: 4.129219, Train acc 0.156061, Valid acc 0.344912, Time 553.4 sec, Total time 00:09:13, lr 0.01
Epoch 1. Loss: 2.665453, Train acc 0.450553, Valid acc 0.533757, Time 543.7 sec, Total time 00:18:17, lr 0.01
Epoch 2. Loss: 1.959670, Train acc 0.562974, Valid acc 0.619374, Time 540.3 sec, Total time 00:27:17, lr 0.01
Epoch 3. Loss: 1.635342, Train acc 0.607970, Valid acc 0.602740, Time 543.8 sec, Total time 00:36:21, lr 0.01
Epoch 4. Loss: 1.457243, Train acc 0.637144, Valid acc 0.654110, Time 545.5 sec, Total time 00:45:26, lr 0.01
Epoch 5. Loss: 1.345149, Train acc 0.656617, Valid acc 0.681018, Time 545.5 sec, Total time 00:54:32, lr 0.01
Epoch 6. Loss: 1.283257, Train acc 0.663537, Valid acc 0.698630, Time 543.1 sec, Total time 01:03:35, lr 0.01
Epoch 7. Loss: 1.211972, Train acc 0.679220, Valid acc 0.681996, Time 546.5 sec, Total time 01:12:41, lr 0.01
Epoch 8. Loss: 1.181558, Train acc 0.683149, Valid acc 0.684932, Time 541.9 sec, Tot

Epoch 74. Loss: 0.611331, Train acc 0.828604, Valid acc 0.815558, Time 540.9 sec, Total time 11:20:46, lr 0.01
Epoch 75. Loss: 0.611579, Train acc 0.827770, Valid acc 0.813601, Time 544.0 sec, Total time 11:29:50, lr 0.01
Epoch 76. Loss: 0.611752, Train acc 0.828674, Valid acc 0.817515, Time 542.3 sec, Total time 11:38:52, lr 0.01
Epoch 77. Loss: 0.613729, Train acc 0.827700, Valid acc 0.808219, Time 542.3 sec, Total time 11:47:55, lr 0.01
Epoch 78. Loss: 0.605005, Train acc 0.830586, Valid acc 0.814090, Time 540.7 sec, Total time 11:56:55, lr 0.01
Epoch 79. Loss: 0.608234, Train acc 0.830447, Valid acc 0.823875, Time 542.6 sec, Total time 12:05:58, lr 0.01
Epoch 80. Loss: 0.608563, Train acc 0.832221, Valid acc 0.837573, Time 547.6 sec, Total time 12:15:06, lr 0.01
Epoch 81. Loss: 0.608108, Train acc 0.830204, Valid acc 0.820450, Time 539.0 sec, Total time 12:24:05, lr 0.01
Epoch 82. Loss: 0.595252, Train acc 0.833090, Valid acc 0.838063, Time 541.9 sec, Total time 12:33:06, lr 0.01
E

Epoch 148. Loss: 0.482653, Train acc 0.871410, Valid acc 0.868885, Time 545.2 sec, Total time 22:31:26, lr 0.01
Epoch 149. Loss: 0.462867, Train acc 0.874609, Valid acc 0.878180, Time 545.5 sec, Total time 22:40:31, lr 0.01
Epoch 150. Loss: 0.470132, Train acc 0.873496, Valid acc 0.867417, Time 548.1 sec, Total time 22:49:40, lr 0.001
Epoch 151. Loss: 0.471454, Train acc 0.872349, Valid acc 0.867906, Time 547.6 sec, Total time 22:58:47, lr 0.001


KeyboardInterrupt: 

In [None]:
# Collect one row of class probabilities per test image, in loader order.
pred = []
for data, label in test_data:
    logits = finetune_net(data.as_in_context(ctx[0]))
    pred.extend(nd.softmax(logits).asnumpy())

In [None]:
# Derive the test image folder from input_dir instead of repeating the literal path.
test_image_dir = os.path.join(input_dir, 'test', 'unknown')
# Assumes test_ds enumerates the files in this same sorted order, so that
# ids[k] belongs to pred[k] -- TODO confirm against ImageFolderDataset.
ids = sorted(os.listdir(test_image_dir))
# zip() would silently truncate on a mismatch and produce an incomplete file.
if len(ids) != len(pred):
    raise ValueError('got %d predictions for %d test images' % (len(pred), len(ids)))
with open('submission.csv', 'w') as f:
    # Header: the id column followed by the class names in label-index order.
    f.write('id,' + ','.join(train_ds.synsets) + '\n')
    for i, output in zip(ids, pred):
        f.write(i.split('.')[0] + ',' + ','.join([str(num) for num in output]) + '\n')