Copyright (c) MONAI Consortium  
Licensed under the Apache License, Version 2.0 (the "License");  
you may not use this file except in compliance with the License.  
You may obtain a copy of the License at  
&nbsp;&nbsp;&nbsp;&nbsp;http://www.apache.org/licenses/LICENSE-2.0  
Unless required by applicable law or agreed to in writing, software  
distributed under the License is distributed on an "AS IS" BASIS,  
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
See the License for the specific language governing permissions and  
limitations under the License.

# Data loading pipeline examples

The purpose of this notebook is to illustrate reading NIfTI files and to compare the speed of different loading and preprocessing methods.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Project-MONAI/tutorials/blob/main/acceleration/transform_speed.ipynb)

## Setup environment

In [None]:
# Install the MONAI weekly build (with the nibabel extra) only when `import monai` fails.
!python -c "import monai" || pip install -q "monai-weekly[nibabel]"

## Setup imports

In [None]:
import glob
import os
import shutil
import tempfile

import nibabel as nib
import numpy as np
import torch

# Select the "spawn" start method for worker processes, but only when no
# start method has been fixed yet: setting it a second time (e.g. when this
# cell is re-run) raises RuntimeError, so that case is simply skipped.
if torch.multiprocessing.get_start_method(allow_none=True) is None:
    torch.multiprocessing.set_start_method("spawn")


from monai.config import print_config
from monai.data import ArrayDataset, create_test_image_3d
from monai.transforms import (
    EnsureChannelFirst,
    Compose,
    LoadImage,
    RandAffine,
    RandSpatialCrop,
    Rotate,
    ScaleIntensity,
)
from monai.utils import first

# Report the MONAI configuration of this environment
# (presumably library/dependency versions — see monai.config.print_config).
print_config()

## Setup data directory

You can specify a directory with the `MONAI_DATA_DIRECTORY` environment variable.  
This allows you to save results and reuse downloads.  
If not specified, a temporary directory will be used.

In [3]:
# Resolve the working directory for this notebook.
# An unset *or empty* MONAI_DATA_DIRECTORY both count as "not specified":
# normalising the empty string to None keeps `root_dir` from becoming ""
# (which would write files relative to the current working directory) and
# keeps the `directory is None` check in the cleanup cell accurate.
directory = os.environ.get("MONAI_DATA_DIRECTORY") or None
if directory is not None:
    # Keep this notebook's files in their own sub-folder of the shared data directory.
    directory = os.path.join(directory, "transform_speed")
    os.makedirs(directory, exist_ok=True)
# Fall back to a fresh temporary directory when no data directory was given.
root_dir = tempfile.mkdtemp() if directory is None else directory
print(root_dir)

/tmp/tmpvaqesd_z


### 0. Preparing input data (nifti images)

Create five pairs of test NIfTI files (image and segmentation): 3D single-channel volumes with a spatial size of (256, 256, 256) voxels.

In [4]:
# Generate five synthetic image/segmentation pairs and store each volume as a
# compressed NIfTI file in `root_dir`, using an identity affine.
for idx in range(5):
    image_vol, label_vol = create_test_image_3d(256, 256, 256)
    outputs = (
        (f"im{idx}.nii.gz", image_vol),
        (f"seg{idx}.nii.gz", label_vol),
    )
    for filename, volume in outputs:
        nib.save(nib.Nifti1Image(volume, np.eye(4)), os.path.join(root_dir, filename))

In [5]:
# Collect the image and segmentation file paths; sorting keeps the two lists
# in the same order so that entries at the same position belong together.
image_pattern = os.path.join(root_dir, "im*.nii.gz")
seg_pattern = os.path.join(root_dir, "seg*.nii.gz")
images = sorted(glob.glob(image_pattern))
segs = sorted(glob.glob(seg_pattern))

### 1. Test image loading with minimal preprocessing

In [6]:
# Minimal preprocessing for both inputs: load the file (image_only=True) and
# make the array channel-first. Image and segmentation each get their own Compose.
imtrans = Compose([LoadImage(image_only=True), EnsureChannelFirst()])
segtrans = Compose([LoadImage(image_only=True), EnsureChannelFirst()])

# Pair every image with its segmentation and batch them with 8 worker processes.
ds = ArrayDataset(img=images, img_transform=imtrans, seg=segs, seg_transform=segtrans)
loader = torch.utils.data.DataLoader(dataset=ds, batch_size=3, num_workers=8)

# Fetch the first batch and show the batch shapes.
first_batch = first(loader)
im, seg = first_batch
print(im.shape, seg.shape)

(3, 1, 256, 256, 256) (3, 1, 256, 256, 256)


In [7]:
# Time fetching one batch from a fresh iterator over the minimal-preprocessing `loader`.
%time data = next(iter(loader))

CPU times: user 26.1 ms, sys: 172 ms, total: 198 ms
Wall time: 11 s


### 2. Test image-patch loading with CPU multi-processing:

- rotate the (256, 256, 256)-voxel volume in the plane axes=(1, 2)
- extract random (64, 64, 64) patches
- implemented in MONAI using `scipy.ndimage.rotate`

In [8]:
# Re-list the input files so this cell can be run on its own after data creation.
images = sorted(glob.glob(os.path.join(root_dir, "im*.nii.gz")))
segs = sorted(glob.glob(os.path.join(root_dir, "seg*.nii.gz")))

# Image pipeline: load, rescale intensities, make channel-first, rotate by
# pi/4 and crop a fixed-size (64, 64, 64) patch at a random location.
imtrans = Compose(
    [
        LoadImage(image_only=True),
        ScaleIntensity(),
        EnsureChannelFirst(),
        Rotate(angle=np.pi / 4),
        RandSpatialCrop((64, 64, 64), random_size=False),
    ]
)

# Segmentation pipeline: same geometry as the image, but resampled with
# nearest-neighbour interpolation so that label values are never blended into
# non-existent intermediate values (consistent with the GPU sections, which
# also use a nearest-neighbour mode for segmentations).
segtrans = Compose(
    [
        LoadImage(image_only=True),
        EnsureChannelFirst(),
        Rotate(angle=np.pi / 4, mode="nearest"),
        RandSpatialCrop((64, 64, 64), random_size=False),
    ]
)

ds = ArrayDataset(images, imtrans, segs, segtrans)
# Pinned host memory is only requested when a CUDA device is available.
loader = torch.utils.data.DataLoader(ds, batch_size=3, num_workers=8, pin_memory=torch.cuda.is_available())

# Fetch the first batch and show the batch shapes.
im, seg = first(loader)
print(im.shape, seg.shape)

(3, 1, 64, 64, 64) (3, 1, 64, 64, 64)


In [9]:
# Time fetching one batch from a fresh iterator over the CPU rotate-and-crop `loader`.
%time data = next(iter(loader))

CPU times: user 37.6 ms, sys: 498 ms, total: 536 ms
Wall time: 24.6 s


(the above results were based on Intel(R) Xeon(R) CPU E5-2650 v4 @ 2.20GHz)

### 3. Test image-patch loading with preprocessing on GPU:

- randomly rotate the (256, 256, 256)-voxel volume in the plane axes=(1, 2)
- extract random (64, 64, 64) patches
- implemented in MONAI using native pytorch resampling

In [10]:
# Re-list the input files so this cell can be run on its own after data creation.
image_paths = sorted(glob.glob(os.path.join(root_dir, "im*.nii.gz")))
seg_paths = sorted(glob.glob(os.path.join(root_dir, "seg*.nii.gz")))
images, segs = image_paths, seg_paths

# Image and segmentation use identical affine parameters; only the
# interpolation mode differs, so the shared arguments are written once.
affine_kwargs = dict(
    prob=1.0,
    rotate_range=np.pi / 4,
    translate_range=(96, 96, 96),
    spatial_size=(64, 64, 64),
    device=torch.device("cuda:0"),
)
rand_affine_img = RandAffine(mode="bilinear", **affine_kwargs)
rand_affine_seg = RandAffine(mode="nearest", **affine_kwargs)

# Load -> (scale intensities, image only) -> channel-first -> random affine on the GPU.
imtrans = Compose(
    [
        LoadImage(image_only=True),
        ScaleIntensity(),
        EnsureChannelFirst(),
        rand_affine_img,
    ]
)
segtrans = Compose(
    [
        LoadImage(image_only=True),
        EnsureChannelFirst(),
        rand_affine_seg,
    ]
)

ds = ArrayDataset(img=images, img_transform=imtrans, seg=segs, seg_transform=segtrans)
# num_workers=0: the transforms run in the main process.
loader = torch.utils.data.DataLoader(dataset=ds, batch_size=3, num_workers=0)

# Fetch the first batch and show the batch shapes.
im, seg = first(loader)
print(im.shape, seg.shape)

(3, 1, 64, 64, 64) (3, 1, 64, 64, 64)


In [11]:
# Time fetching one batch from a fresh iterator over the GPU random-affine `loader`.
%time data = next(iter(loader))

CPU times: user 19.4 s, sys: 2.67 s, total: 22 s
Wall time: 4.83 s


In [12]:
# Show which GPU was used for the timing above and a short summary of its
# PyTorch memory usage (device index 0).
gpu_index = 0
gpu_name = torch.cuda.get_device_name(gpu_index)
print(gpu_name)
gpu_memory_report = torch.cuda.memory_summary(gpu_index, abbreviated=True)
print(gpu_memory_report)

Tesla V100-SXM2-16GB-N
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   16387 KB |   24580 KB |  118883 KB |  102496 KB |
|---------------------------------------------------------------------------|
| Active memory         |   16387 KB |   24580 KB |  118883 KB |  102496 KB |
|---------------------------------------------------------------------------|
| GPU reserved memory   |   43008 KB |   43008 KB |   43008 KB |       0 B  |
|---------------------------------------------------------------------------|
| Non-releasable memory |    6141 KB |   22527 KB |  274525 KB |  268384 KB |
|----------------------------------------

### 4. Test image-patch loading with preprocessing on GPU using the Cupy backend:

If the cupy package is installed correctly along with MONAI,
setting the `mode` to an integer in `[0-5]` and `device` to a cuda device will enable the cupy backend resampling.

- randomly rotate the (256, 256, 256)-voxel volume in the plane axes=(1, 2)
- extract random (64, 64, 64) patches
- implemented in MONAI using the cupy backend for high-order spline interpolation

In [13]:
# Re-list the input files so this cell can be run on its own after data creation.
image_paths = sorted(glob.glob(os.path.join(root_dir, "im*.nii.gz")))
seg_paths = sorted(glob.glob(os.path.join(root_dir, "seg*.nii.gz")))
images, segs = image_paths, seg_paths

# Image and segmentation use identical affine parameters; only the integer
# interpolation order differs (3 for the image, 0 for the segmentation), so
# the shared arguments are written once.
spline_affine_kwargs = dict(
    prob=1.0,
    rotate_range=np.pi / 4,
    translate_range=(96, 96, 96),
    spatial_size=(64, 64, 64),
    padding_mode="reflect",
    device=torch.device("cuda:0"),
)
rand_affine_img = RandAffine(mode=3, **spline_affine_kwargs)
rand_affine_seg = RandAffine(mode=0, **spline_affine_kwargs)

# Load -> (scale intensities, image only) -> channel-first -> random affine on the GPU.
imtrans = Compose(
    [
        LoadImage(image_only=True),
        ScaleIntensity(),
        EnsureChannelFirst(),
        rand_affine_img,
    ]
)
segtrans = Compose(
    [
        LoadImage(image_only=True),
        EnsureChannelFirst(),
        rand_affine_seg,
    ]
)

ds = ArrayDataset(img=images, img_transform=imtrans, seg=segs, seg_transform=segtrans)
# num_workers=0: the transforms run in the main process.
loader = torch.utils.data.DataLoader(dataset=ds, batch_size=3, num_workers=0)

# Fetch the first batch and show the batch shapes.
im, seg = first(loader)
print(im.shape, seg.shape)

(3, 1, 64, 64, 64) (3, 1, 64, 64, 64)


In [14]:
# Time fetching one batch from a fresh iterator over the integer-mode (cupy backend) `loader`.
%time data = next(iter(loader))

CPU times: user 15.4 s, sys: 3.15 s, total: 18.5 s
Wall time: 7.7 s


## Cleanup data directory

Remove the directory if a temporary one was used.

In [15]:
# `directory` is None only when no data directory was configured, i.e. when
# `root_dir` is a temporary directory created by this notebook; a
# user-supplied data directory is left untouched.
used_temporary_dir = directory is None
if used_temporary_dir:
    shutil.rmtree(root_dir)