Experimental notebook that modifies basic API a little to prefetch data-loaders and calculate loss multi-gpu (not cuda:0). However, no real improvement in speed

In [1]:
MULTI_GPU = True

In [2]:
import os
import sys
import time
import multiprocessing
import numpy as np
import pandas as pd
import torch
import torchvision.models as models
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.nn.init as init
import torchvision.transforms as transforms
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.autograd import Variable
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics.ranking import roc_auc_score
from sklearn.model_selection import train_test_split
from PIL import Image
from common.utils import download_data_chextxray, get_imgloc_labels, get_train_valid_test_split
from common.utils import compute_roc_auc, get_cuda_version, get_cudnn_version, get_gpu_name
from common.utils import yield_mb
from common.params_dense import *

In [3]:
print("OS: ", sys.platform)
print("Python: ", sys.version)
print("PyTorch: ", torch.__version__)
print("Numpy: ", np.__version__)
print("GPU: ", get_gpu_name())
print(get_cuda_version())
print("CuDNN Version ", get_cudnn_version())

OS:  linux
Python:  3.5.4 |Anaconda custom (64-bit)| (default, Nov 20 2017, 18:44:38) 
[GCC 7.2.0]
PyTorch:  0.4.0
Numpy:  1.14.1
GPU:  ['Tesla V100-PCIE-16GB', 'Tesla V100-PCIE-16GB', 'Tesla V100-PCIE-16GB', 'Tesla V100-PCIE-16GB']
CUDA Version 9.0.176
CuDNN Version  7.0.5


In [4]:
CPU_COUNT = multiprocessing.cpu_count()
GPU_COUNT = len(get_gpu_name())
print("CPUs: ", CPU_COUNT)
print("GPUs: ", GPU_COUNT)

CPUs:  24
GPUs:  4


In [5]:
# Model-params
IMAGENET_RGB_MEAN_TORCH = [0.485, 0.456, 0.406]
IMAGENET_RGB_SD_TORCH = [0.229, 0.224, 0.225]
# Paths
CSV_DEST = "chestxray"
IMAGE_FOLDER = os.path.join(CSV_DEST, "images")
LABEL_FILE = os.path.join(CSV_DEST, "Data_Entry_2017.csv")
print(IMAGE_FOLDER, LABEL_FILE)

chestxray/images chestxray/Data_Entry_2017.csv


In [6]:
# Manually scale to multi-gpu
assert torch.cuda.is_available()
_DEVICE = torch.device("cuda:0")
# enables cudnn's auto-tuner
torch.backends.cudnn.benchmark=True
if MULTI_GPU:
    LR *= GPU_COUNT 
    BATCHSIZE *= GPU_COUNT

In [7]:
%%time
# Download data
# Wall time: 17min 58s
print("Please make sure to download")
print("https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-linux#download-and-install-azcopy")
download_data_chextxray(CSV_DEST)

Please make sure to download
https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-linux#download-and-install-azcopy
Data already exists
CPU times: user 580 ms, sys: 240 ms, total: 819 ms
Wall time: 819 ms


In [8]:
#####################################################################################################
## Data Loading

In [9]:
# Normalise by imagenet mean/sd
normalize = transforms.Normalize(IMAGENET_RGB_MEAN_TORCH,
                                 IMAGENET_RGB_SD_TORCH)

In [10]:
class XrayData(Dataset):
    def __init__(self, img_dir, lbl_file, patient_ids, transform=None):
        
        self.img_locs, self.labels = get_imgloc_labels(img_dir, lbl_file, patient_ids)
        self.transform = transform
        print("Loaded {} labels and {} images".format(len(self.labels), len(self.img_locs)))
    
    def __getitem__(self, idx):
        im_file = self.img_locs[idx]
        im_rgb = Image.open(im_file)
        label = self.labels[idx]
        if self.transform is not None:
            im_rgb = self.transform(im_rgb)
        return im_rgb, torch.FloatTensor(label)
        
    def __len__(self):
        return len(self.img_locs)

In [11]:
def no_augmentation_dataset(img_dir, lbl_file, patient_ids, normalize):
    dataset = XrayData(img_dir, lbl_file, patient_ids,
                       transform=transforms.Compose([
                           transforms.Resize(WIDTH),
                           transforms.ToTensor(),  
                           normalize]))
    return dataset

In [12]:
train_set, valid_set, test_set = get_train_valid_test_split(TOT_PATIENT_NUMBER)

train:21563 valid:3080 test:6162


In [13]:
# Dataset for training
train_dataset = XrayData(img_dir=IMAGE_FOLDER,
                         lbl_file=LABEL_FILE,
                         patient_ids=train_set,
                         transform=transforms.Compose([
                             transforms.RandomResizedCrop(size=WIDTH),
                             transforms.RandomHorizontalFlip(),
                             transforms.ToTensor(),  # need to convert image to tensor!
                             normalize]))

Loaded 87306 labels and 87306 images


In [14]:
valid_dataset = no_augmentation_dataset(IMAGE_FOLDER, LABEL_FILE, valid_set, normalize)
test_dataset = no_augmentation_dataset(IMAGE_FOLDER, LABEL_FILE, test_set, normalize)

Loaded 7616 labels and 7616 images
Loaded 17198 labels and 17198 images


In [15]:
#####################################################################################################
## Helper Functions

In [16]:
class DataParallelNoGather(torch.nn.DataParallel):
    def gather(self, outputs, output_device):
        return outputs  # no concat to output-device here

In [17]:
class DataParallelCriterion(torch.nn.DataParallel):
    def forward(self, inputs, *targets, **kwargs):
            if not self.device_ids:
                return self.module(inputs, *targets, **kwargs)
            # Since output not gathered, scatter targets for multi-gpu loss cal
            targets, kwargs = self.scatter(targets, kwargs, self.device_ids)
            if len(self.device_ids) == 1:
                return self.module(inputs, *targets[0], **kwargs[0])
            # Return a list of losses on each gpu
            return [self.module(inputs[i], *targets[i], **kwargs[i]) for i in range(len(inputs))]

In [18]:
def get_symbol(out_features=CLASSES, multi_gpu=MULTI_GPU):
    model = models.densenet.densenet121(pretrained=True)
    # Replace classifier (FC-1000) with (FC-14)
    model.classifier = nn.Sequential(
        nn.Linear(model.classifier.in_features, out_features), 
        nn.Sigmoid())
    if multi_gpu:
        model = DataParallelNoGather(model)
        #model = nn.DataParallel(model)
    # CUDA
    model.to(_DEVICE)  
    return model

In [19]:
def init_symbol(sym, lr=LR):
    # BCE Loss since classes not mutually exclusive + Sigmoid FC-layer
    cri = nn.BCELoss()
    opt = optim.Adam(sym.parameters(), lr=lr, betas=(0.9, 0.999))
    sch = ReduceLROnPlateau(opt, factor=0.1, patience=5, mode='min')
    return opt, cri, sch 

In [20]:
def train_epoch(model, dataloader, optimizer, criterion):
    model.train()
    print("Training epoch")
    
    # Accumulate loss on gpu to avoid cpu-gpu comms
    # Multi-gpu tensors to avoid gpu to cuda:0 comms
    loss_val = [torch.FloatTensor(1).fill_(0).cuda(i) for i in range(GPU_COUNT)]
    
    for i, (data, target) in enumerate(dataloader): 
        # Get samples (both async)
        data, target = data.cuda(non_blocking=True), target.cuda(non_blocking=True)
        # Forwards (modified to return ungathered prediction), so output is a list
        outputs = model(data)
        # Losses (list) for each gpu
        all_losses = criterion(outputs, target)
        # Back-prop
        optimizer.zero_grad()
        
        # Log the loss (before .backward())
        for j, l in enumerate(all_losses):
            # Not calling .item() which is blocking
            loss_val[j] += l      
        
        # NOTE: this is much more efficient than calling .backward() on losses in a for loop
        torch.autograd.backward(all_losses)
        optimizer.step()  
        
    avg_loss = (sum([loss_val[i].detach().cpu() for i in range(GPU_COUNT)]).numpy()/GPU_COUNT/(i+1))[0]
    print("Training loss: {0:.4f}".format(avg_loss))
    print("~~~~~~~")

In [21]:
def valid_epoch(model, dataloader, criterion, phase='valid', cl=CLASSES):
    model.eval()
    if phase == 'testing':
        print("Testing epoch")
    else:
        print("Validating epoch")
    # Don't save gradients
    with torch.no_grad():
        if phase == 'testing':
            # pre-allocate predictions (on gpu)
            len_pred = len(dataloader)*(dataloader.batch_size)
            num_lab = dataloader.dataset.labels.shape[-1]
            out_pred = torch.cuda.FloatTensor(len_pred, num_lab).fill_(0)
        # Accumulate loss on gpu to avoid cpu-gpu comms
        loss_val = [torch.FloatTensor(1).fill_(0).cuda(i) for i in range(GPU_COUNT)]
        for i, (data, target) in enumerate(dataloader): 
            # Get samples
            data, target = data.cuda(non_blocking=True), target.cuda(non_blocking=True)
             # Forwards
            outputs = model(data)
            # Loss
            all_losses = criterion(outputs, target)
            # Log the loss
            for j, l in enumerate(all_losses):
                # Not calling .item() which is blocking
                loss_val[j] += l 
            # Log for AUC
            if phase == 'testing':
                output = torch.cat(outputs)
                out_pred[output.size(0)*i:output.size(0)*(1+i)] = output.data
        # Final loss
        avg_loss = (sum([loss_val[i].detach().cpu() for i in range(GPU_COUNT)]).numpy()/GPU_COUNT/(i+1))[0]
    if phase == 'testing':
        out_gt = dataloader.dataset.labels
        out_pred = out_pred.cpu().numpy()[:len(out_gt)]  # Trim padding
        print("Test-Dataset loss: {0:.4f}".format(avg_loss))
        print("Test-Dataset AUC: {0:.4f}".format(compute_roc_auc(out_gt, out_pred, cl)))
    else:
        print("Validation loss: {0:.4f}".format(avg_loss))
    return avg_loss

In [22]:
# Optimal to use fewer workers than CPU_COUNT
# DataLoaders
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCHSIZE,
                          shuffle=True, num_workers=6, pin_memory=True)
# Using a bigger batch-size (than BATCHSIZE) for below worsens performance
valid_loader = DataLoader(dataset=valid_dataset, batch_size=BATCHSIZE,
                          shuffle=False, num_workers=6, pin_memory=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCHSIZE,
                         shuffle=False, num_workers=6, pin_memory=True)

In [23]:
#####################################################################################################
## Train CheXNet

In [24]:
%%time
# Load symbol
chexnet_sym = get_symbol()



CPU times: user 4.23 s, sys: 1.47 s, total: 5.7 s
Wall time: 6.09 s


In [25]:
%%time
# Load optimiser, loss
# Scheduler for LRPlateau is not used
optimizer, criterion, scheduler = init_symbol(chexnet_sym)
# Calculate loss on all GPUs
criterion = DataParallelCriterion(criterion, chexnet_sym.device_ids)

CPU times: user 1.81 ms, sys: 0 ns, total: 1.81 ms
Wall time: 1.81 ms


In [26]:
%%time
# 4 GPU - Main training loop: 9min 28s
# Main train/val loop
train_iter = iter(train_loader) # Prefetch some training data in the background

for j in range(EPOCHS):
    stime = time.time()
    valid_iter = iter(valid_loader) # Will start fetching validation data 
    train_epoch(chexnet_sym, train_iter, optimizer, criterion)
    train_iter = iter(train_loader) # Will prefetch some training data
    loss_val = valid_epoch(chexnet_sym, valid_iter, criterion)   
    print("Epoch time: {0:.0f} seconds".format(time.time()-stime))

Training epoch
Training loss: 0.1741
~~~~~~~
Validating epoch
Validation loss: 0.1472
Epoch time: 135 seconds
Training epoch
Training loss: 0.1590
~~~~~~~
Validating epoch
Validation loss: 0.1444
Epoch time: 108 seconds
Training epoch
Training loss: 0.1558
~~~~~~~
Validating epoch
Validation loss: 0.1442
Epoch time: 111 seconds
Training epoch
Training loss: 0.1541
~~~~~~~
Validating epoch
Validation loss: 0.1429
Epoch time: 107 seconds
Training epoch
Training loss: 0.1527
~~~~~~~
Validating epoch
Validation loss: 0.1410
Epoch time: 106 seconds
CPU times: user 13min 37s, sys: 2min 51s, total: 16min 29s
Wall time: 9min 28s


In [27]:
%%time
# 4 GPU AUC: 0.8126
test_loss = valid_epoch(chexnet_sym, test_loader, criterion, 'testing')

Testing epoch
Test-Dataset loss: 0.1538
Full AUC [0.8180020240235686, 0.8601479720224297, 0.7969336459283853, 0.89202362445056, 0.8854655649319612, 0.9015711126353438, 0.721529667743366, 0.8872964038058377, 0.624164759241935, 0.8489753774960405, 0.7411880319380078, 0.7823526644862778, 0.742070371621199, 0.8745168006955891]
Test-Dataset AUC: 0.8126
CPU times: user 14.2 s, sys: 5.61 s, total: 19.8 s
Wall time: 1min 21s


In [28]:
#####################################################################################################
## Synthetic Data (Pure Training)

In [29]:
# Test on fake-data -> no IO lag
batch_in_epoch = len(train_dataset.labels)//BATCHSIZE
tot_num = batch_in_epoch * BATCHSIZE
print(tot_num)

87296


In [32]:
fake_X = torch.tensor(np.random.rand(tot_num, 3, 224, 224).astype(np.float32))
fake_y = torch.tensor(np.random.rand(tot_num, CLASSES).astype(np.float32))

In [33]:
%%time
# 4 GPU - Synthetic data: 8min 45s
for j in range(EPOCHS):
    train_epoch(chexnet_sym, 
                yield_mb(fake_X, fake_y, BATCHSIZE, shuffle=False),
                optimizer, 
                criterion)

Training epoch
Training loss: 0.6935
~~~~~~~
Training epoch
Training loss: 0.6935
~~~~~~~
Training epoch
Training loss: 0.6934
~~~~~~~
Training epoch
Training loss: 0.6934
~~~~~~~
Training epoch
Training loss: 0.6935
~~~~~~~
CPU times: user 12min 27s, sys: 1min 32s, total: 13min 59s
Wall time: 8min 45s
