# MXNet Gluon Multi-GPU

In [1]:
MULTI_GPU = True

In [2]:
import os
import sys
import multiprocessing
import logging
import numpy as np
import pandas as pd

import mxnet as mx
from mxnet.io import DataDesc
from mxnet import nd, gluon, autograd
from mxnet.gluon.data import RecordFileDataset, ArrayDataset, Dataset
from mxnet.gluon.data.vision import transforms
from mxnet.gluon.data.vision.datasets import ImageFolderDataset
from mxnet.gluon.data.dataloader import DataLoader
from mxnet.gluon.model_zoo import vision as models
from mxnet import recordio

from sklearn.metrics.ranking import roc_auc_score
from sklearn.model_selection import train_test_split
from PIL import Image
from common.utils import *
from common.params_dense import *
import math
from time import time

%load_ext autoreload
%autoreload 2

  from ._conv import register_converters as _register_converters


In [3]:
print("OS: ", sys.platform)
print("Python: ", sys.version)
print("Numpy: ", np.__version__)
print("MXNet: ", mx.__version__)
print("GPU: ", get_gpu_name())
print(get_cuda_version())
print("CuDNN Version ", get_cudnn_version())

OS:  linux
Python:  3.5.4 |Anaconda custom (64-bit)| (default, Nov 20 2017, 18:44:38) 
[GCC 7.2.0]
Numpy:  1.14.1
MXNet:  1.3.0
GPU:  ['Tesla V100-PCIE-16GB', 'Tesla V100-PCIE-16GB', 'Tesla V100-PCIE-16GB', 'Tesla V100-PCIE-16GB']
CUDA Version 9.0.176
CuDNN Version  7.0.5


In [4]:
# User-set
# Note if NUM_GPUS > 1 then MULTI_GPU = True and ALL GPUs will be used
# Set below to affect batch-size
# E.g. 1 GPU = 64, 2 GPUs =64*2, 4 GPUs = 64*4
# Note that the effective learning-rate will be decreased this way
CPU_COUNT = multiprocessing.cpu_count() 
GPU_COUNT = len(get_gpu_name())
if not MULTI_GPU:
    GPU_COUNT = 1
print("CPUs: ", CPU_COUNT)
print("GPUs: ", GPU_COUNT)

CPUs:  24
GPUs:  4


In [5]:
# Manually scale to multi-gpu
if MULTI_GPU:
    LR *= GPU_COUNT
    BATCHSIZE *= (GPU_COUNT)
    BATCHSIZE = BATCHSIZE//GPU_COUNT*GPU_COUNT

## Data Download

In [6]:
# Model-params
# Paths
CSV_DEST = "chestxray"
IMAGE_FOLDER = os.path.join(CSV_DEST, "images")
LABEL_FILE = os.path.join(CSV_DEST, "Data_Entry_2017.csv")

In [7]:
%%time
# Download data
print("Please make sure to download")
print("https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-linux#download-and-install-azcopy")
download_data_chextxray(CSV_DEST)

Please make sure to download
https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-linux#download-and-install-azcopy
Data already exists
CPU times: user 587 ms, sys: 216 ms, total: 803 ms
Wall time: 802 ms


  params = attr.ib(convert=attr.converters.optional(tuple))
  ids = attr.ib(default=None, convert=_ensure_immutable_ids)


## Data prep
https://github.com/apache/incubator-mxnet/issues/1480


In [8]:
train_set, valid_set, test_set = get_train_valid_test_split(TOT_PATIENT_NUMBER)

train:21563 valid:3080 test:6162


## Data Loading

### Creating the datasets

In [9]:
class XrayData(Dataset):
    def __init__(self, img_dir, lbl_file, patient_ids, transform=None):
        
        self.img_locs, self.labels = get_imgloc_labels(img_dir, lbl_file, patient_ids)
        self.transform = transform
        print("Loaded {} labels and {} images".format(len(self.labels), len(self.img_locs)))
    
    def __getitem__(self, idx):
        im_file = self.img_locs[idx]
        im_rgb = Image.open(im_file)
        label = self.labels[idx]
        im_rgb = mx.nd.array(im_rgb)
        if self.transform is not None:
            im_rgb = self.transform(im_rgb)

        return im_rgb, mx.nd.array(label)
        
    def __len__(self):
        return len(self.img_locs)

In [10]:
def no_augmentation_dataset(img_dir, lbl_file, patient_ids, normalize):
    dataset = XrayData(img_dir, lbl_file, patient_ids,
                       transform=transforms.Compose([
                           transforms.Resize(WIDTH),
                           transforms.ToTensor(),  
                           transforms.Normalize(IMAGENET_RGB_MEAN, IMAGENET_RGB_SD)]))
    return dataset

In [11]:
# Dataset for training
train_dataset = XrayData(img_dir=IMAGE_FOLDER,
                         lbl_file=LABEL_FILE,
                         patient_ids=train_set,
                         transform=transforms.Compose([
                             transforms.RandomResizedCrop(size=WIDTH),
                             transforms.RandomFlipLeftRight(),
                             transforms.ToTensor(),
                             transforms.Normalize(IMAGENET_RGB_MEAN, IMAGENET_RGB_SD)]))

Loaded 87306 labels and 87306 images


In [12]:
valid_dataset = no_augmentation_dataset(IMAGE_FOLDER, LABEL_FILE, valid_set, transforms.Normalize(IMAGENET_RGB_MEAN, IMAGENET_RGB_SD))
test_dataset = no_augmentation_dataset(IMAGE_FOLDER, LABEL_FILE, test_set, transforms.Normalize(IMAGENET_RGB_MEAN, IMAGENET_RGB_SD))

Loaded 7616 labels and 7616 images
Loaded 17198 labels and 17198 images


In [13]:
# DataLoaders
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCHSIZE,
                          shuffle=True, num_workers=6, last_batch='discard')
valid_loader = DataLoader(dataset=valid_dataset, batch_size=BATCHSIZE,
                          shuffle=False, num_workers=6, last_batch='discard')
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCHSIZE,
                         shuffle=False, num_workers=6, last_batch='discard')

## Creating the network

### Loading the pretrained model

In [14]:
ctx = [mx.gpu(i) for i in range(GPU_COUNT)]   

In [15]:
net = mx.gluon.model_zoo.vision.densenet121(pretrained=True, ctx=ctx)
with net.name_scope():
    net.output = mx.gluon.nn.Dense(CLASSES)
net.output.initialize(ctx=ctx)
net.hybridize()

## Trainer

In [16]:
trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': LR})

## Loss 

In [17]:
binary_cross_entropy = gluon.loss.SigmoidBinaryCrossEntropyLoss()

## Output

In [18]:
sig = gluon.nn.Activation('sigmoid')

## Evaluation loop

In [19]:
def evaluate_accuracy(data_iterator, net):
    acc = 0
    for i, (data, label) in enumerate(data_iterator):
        data_split = gluon.utils.split_and_load(data, ctx)
        label_split = gluon.utils.split_and_load(label, ctx)
        outputs = [(sig(net(X)),Y) for X, Y in zip(data_split, label_split)]
        for output, label in outputs:
            acc += float((label.asnumpy() == np.round(output.asnumpy())).sum()) / CLASSES / output.shape[0]
    data_split = gluon.utils.split_and_load(data, [mx.cpu()])
    label_split = gluon.utils.split_and_load(label, [mx.cpu()])
    return acc/i/len(ctx)

## Training loop

In [20]:
n_batch = 5 # Blocking call every 5 batches
n_print = 100 # Print every 100 batches

In [21]:
def train_epoch(net, dataloader, trainer, loss_fn, ctx, n_batch=7, n_print=100):
    losses_acc = [mx.nd.zeros((1), ctx=c) for c in ctx]
    print_loss = 0
    for i, (data, label) in enumerate(dataloader):        
        data_split = gluon.utils.split_and_load(data, ctx)
        label_split = gluon.utils.split_and_load(label, ctx)  
        
        if i > 0:
            for j, l in enumerate(losses):
                # Accumulate losses asynchronously on each GPU
                losses_acc[j] += l.mean()
            if i%n_batch == 0:
                # Blocking call
                print_loss = 0
                for l in losses_acc:
                    print_loss += l.asscalar()
                l = l / (i+1)/len(ctx)
            if i%n_print == 0:
                print('Batch {0}: Loss: {1:.4f}'.format(i, print_loss))            
            
        with autograd.record():
            losses = [loss_fn(net(X), Y) for X, Y in zip(data_split, label_split)]
        for l in losses:
            l.backward()
        trainer.step(data.shape[0]) 

In [22]:
%%time
# 1 GPU - Main training loop: 28min 40s
# 4 GPU - Main training loop: 9min 43s
for e in range(EPOCHS):
    tick = time()
    train_epoch(net, train_loader, trainer, binary_cross_entropy, ctx)
    test_accuracy = evaluate_accuracy(valid_loader, net)
    print('Epoch {0}, {1:.6f} test_accuracy after {2:.2f} seconds'.format(e, test_accuracy, time()-tick))

Batch 100: Loss: 82.8983
Batch 200: Loss: 148.3790
Batch 300: Loss: 212.7993
Epoch 0, 0.985651 test_accuracy after 122.56 seconds
Batch 100: Loss: 63.2765
Batch 200: Loss: 125.7693
Batch 300: Loss: 187.9143
Epoch 1, 0.986099 test_accuracy after 114.79 seconds
Batch 100: Loss: 61.0549
Batch 200: Loss: 122.8533
Batch 300: Loss: 184.8669
Epoch 2, 0.985880 test_accuracy after 116.16 seconds
Batch 100: Loss: 60.4399
Batch 200: Loss: 121.4686
Batch 300: Loss: 182.2017
Epoch 3, 0.985979 test_accuracy after 114.90 seconds
Batch 100: Loss: 60.2344
Batch 200: Loss: 120.4585
Batch 300: Loss: 180.7013
Epoch 4, 0.985700 test_accuracy after 115.12 seconds
CPU times: user 31min 24s, sys: 11min 1s, total: 42min 26s
Wall time: 9min 43s


## Evaluate

In [23]:
%%time
predictions = np.zeros((0, CLASSES))
labels = np.zeros((0, CLASSES))
for (data, label) in (test_loader):        
    data_split = gluon.utils.split_and_load(data, ctx)
    label_split = gluon.utils.split_and_load(label, ctx)  
    outputs = [sig(net(X)) for X in data_split]
    predictions = np.concatenate([predictions, np.concatenate([output.asnumpy() for output in outputs])])
    labels = np.concatenate([labels, np.concatenate([label.asnumpy() for label in label_split])])

CPU times: user 20.1 s, sys: 7.22 s, total: 27.3 s
Wall time: 13.7 s


In [24]:
# 1 GPU AUC: 0.8235
# 4 GPU AUC: 0.8145
print("Validation AUC: {0:.4f}".format(compute_roc_auc(labels, predictions, CLASSES)))

Full AUC [0.8125718127685243, 0.8551507280005255, 0.8071080700770485, 0.8908148250913325, 0.8860359865394066, 0.9254589157853671, 0.7143975677357627, 0.8279628814488093, 0.6307513183430535, 0.8473062753975474, 0.7456833776349139, 0.8078839618223166, 0.7708621556491738, 0.8808694022714751]
Validation AUC: 0.8145


## Synthetic Data (Pure Training)

In [25]:
# Test on fake-data -> no IO lag
batch_in_epoch = len(train_dataset.labels)//BATCHSIZE
tot_num = batch_in_epoch * BATCHSIZE
print(tot_num)

87296


In [26]:
fake_X = mx.nd.ones((tot_num, 3, 224, 224), dtype=np.float32)
fake_y = mx.nd.ones((tot_num, CLASSES), dtype=np.float32)

In [27]:
train_dataset_synth = ArrayDataset(fake_X, fake_y)
train_dataloader_synth = DataLoader(train_dataset_synth, BATCHSIZE, shuffle=False, num_workers=0, last_batch='discard')

In [28]:
%%time
# 1 GPU - Main training loop: 27min 45s
# 4 GPU - Main training loop: 8min 26s
n_batch = 100
for e in range(EPOCHS):
    tick = time()
    train_epoch(net, train_dataloader_synth, trainer, binary_cross_entropy, ctx)
    nd.waitall()
    print('Epoch {0}, {1:.2f} seconds'.format(e, time()-tick))

Batch 100: Loss: 61.9009
Batch 200: Loss: 61.9074
Batch 300: Loss: 61.9134
Epoch 0, 102.29 seconds
Batch 100: Loss: 0.0054
Batch 200: Loss: 0.0103
Batch 300: Loss: 0.0148
Epoch 1, 101.39 seconds
Batch 100: Loss: 0.0042
Batch 200: Loss: 0.0080
Batch 300: Loss: 0.0115
Epoch 2, 100.83 seconds
Batch 100: Loss: 0.0030
Batch 200: Loss: 0.0059
Batch 300: Loss: 0.0086
Epoch 3, 100.72 seconds
Batch 100: Loss: 0.0026
Batch 200: Loss: 0.0050
Batch 300: Loss: 0.0073
Epoch 4, 101.25 seconds
CPU times: user 30min 46s, sys: 10min 35s, total: 41min 22s
Wall time: 8min 26s
