Copyright (c) MONAI Consortium  
Licensed under the Apache License, Version 2.0 (the "License");  
you may not use this file except in compliance with the License.  
You may obtain a copy of the License at  
&nbsp;&nbsp;&nbsp;&nbsp;http://www.apache.org/licenses/LICENSE-2.0  
Unless required by applicable law or agreed to in writing, software  
distributed under the License is distributed on an "AS IS" BASIS,  
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
See the License for the specific language governing permissions and  
limitations under the License. 

# Datalist Generator

## Setup environment

In [1]:
!python3 -c "import monai" || pip install -q "monai"

## Setup imports

In [2]:
import os
import json
import random
import shutil
import tempfile
from monai.config import print_config
from monai.apps import download_and_extract

print_config()

MONAI version: 1.2.0rc4+5.gfe550a12
Numpy version: 1.22.2
Pytorch version: 1.14.0a0+410ce96
MONAI flags: HAS_EXT = False, USE_COMPILED = False, USE_META_DICT = False
MONAI rev id: fe550a12a4b2205d7c374f0436bfa0081505f668
MONAI __file__: /workspace/monai/monai-in-dev/monai/__init__.py

Optional dependencies:
Pytorch Ignite version: 0.4.11
ITK version: 5.3.0
Nibabel version: 5.0.1
scikit-image version: 0.20.0
Pillow version: 9.2.0
Tensorboard version: 2.9.0
gdown version: 4.6.4
TorchVision version: 0.15.0a0
tqdm version: 4.64.1
lmdb version: 1.4.0
psutil version: 5.9.4
pandas version: 1.5.2
einops version: 0.6.0
transformers version: 4.21.3
mlflow version: 2.2.2
pynrrd version: 1.0.0

For details about installing the optional dependencies, please visit:
    https://docs.monai.io/en/latest/installation.html#installing-the-recommended-dependencies



# Setup paths to your data

In [3]:
directory = os.environ.get("MONAI_DATA_DIRECTORY")
root_dir = tempfile.mkdtemp() if directory is None else directory
print(root_dir)

/workspace/data


# Download sample MSD Dataset

In [4]:
msd_task = "Task09_Spleen"
resource = "https://msd-for-monai.s3-us-west-2.amazonaws.com/" + msd_task + ".tar"

compressed_file = os.path.join(root_dir, msd_task + ".tar")
dataroot = os.path.join(root_dir, msd_task)

if not os.path.exists(dataroot):
    download_and_extract(resource, compressed_file, root_dir)

# MSD dataset structure follows the following convention:

In [5]:
test_dir = os.path.join(dataroot, "imagesTs/")
train_dir = os.path.join(dataroot, "imagesTr/")
label_dir = os.path.join(dataroot, "labelsTr/")

# Construct skeleton JSON to populate with your own data

In [6]:
datalist_json = {"testing": [], "training": []}

# Populate JSON with test data

In [7]:
datalist_json["testing"] = [{"image": "./imagesTs/" + file} for file in os.listdir(test_dir) if "._" not in file]

# Visualise testing data

In [8]:
datalist_json["testing"][:10]

[{'image': './imagesTs/spleen_54.nii.gz'},
 {'image': './imagesTs/spleen_42.nii.gz'},
 {'image': './imagesTs/spleen_7.nii.gz'},
 {'image': './imagesTs/spleen_39.nii.gz'},
 {'image': './imagesTs/spleen_30.nii.gz'},
 {'image': './imagesTs/spleen_43.nii.gz'},
 {'image': './imagesTs/spleen_1.nii.gz'},
 {'image': './imagesTs/spleen_51.nii.gz'},
 {'image': './imagesTs/spleen_34.nii.gz'},
 {'image': './imagesTs/spleen_11.nii.gz'}]

# Populate with training images and labels in your directory

In [9]:
datalist_json["training"] = [
    {"image": "./imagesTr/" + file, "label": "./labelsTr/" + file, "fold": 0}
    for file in os.listdir(train_dir)
    if "._" not in file
]  # Initialize as single fold

# Visualise training data

In [10]:
datalist_json["training"][:10]

[{'image': './imagesTr/.lock', 'label': './labelsTr/.lock', 'fold': 0},
 {'image': './imagesTr/spleen_17.nii.gz',
  'label': './labelsTr/spleen_17.nii.gz',
  'fold': 0},
 {'image': './imagesTr/spleen_62.nii.gz',
  'label': './labelsTr/spleen_62.nii.gz',
  'fold': 0},
 {'image': './imagesTr/spleen_52.nii.gz',
  'label': './labelsTr/spleen_52.nii.gz',
  'fold': 0},
 {'image': './imagesTr/spleen_10.nii.gz',
  'label': './labelsTr/spleen_10.nii.gz',
  'fold': 0},
 {'image': './imagesTr/datastore_v2.json',
  'label': './labelsTr/datastore_v2.json',
  'fold': 0},
 {'image': './imagesTr/labels', 'label': './labelsTr/labels', 'fold': 0},
 {'image': './imagesTr/spleen_29.nii.gz',
  'label': './labelsTr/spleen_29.nii.gz',
  'fold': 0},
 {'image': './imagesTr/spleen_28.nii.gz',
  'label': './labelsTr/spleen_28.nii.gz',
  'fold': 0},
 {'image': './imagesTr/spleen_14.nii.gz',
  'label': './labelsTr/spleen_14.nii.gz',
  'fold': 0}]

# Randomise training data


In [11]:
random.seed(42)
random.shuffle(datalist_json["training"])
datalist_json["training"][:10]

[{'image': './imagesTr/spleen_14.nii.gz',
  'label': './labelsTr/spleen_14.nii.gz',
  'fold': 0},
 {'image': './imagesTr/spleen_52.nii.gz',
  'label': './labelsTr/spleen_52.nii.gz',
  'fold': 0},
 {'image': './imagesTr/spleen_21.nii.gz',
  'label': './labelsTr/spleen_21.nii.gz',
  'fold': 0},
 {'image': './imagesTr/spleen_44.nii.gz',
  'label': './labelsTr/spleen_44.nii.gz',
  'fold': 0},
 {'image': './imagesTr/spleen_16.nii.gz',
  'label': './labelsTr/spleen_16.nii.gz',
  'fold': 0},
 {'image': './imagesTr/spleen_40.nii.gz',
  'label': './labelsTr/spleen_40.nii.gz',
  'fold': 0},
 {'image': './imagesTr/spleen_9.nii.gz',
  'label': './labelsTr/spleen_9.nii.gz',
  'fold': 0},
 {'image': './imagesTr/spleen_41.nii.gz',
  'label': './labelsTr/spleen_41.nii.gz',
  'fold': 0},
 {'image': './imagesTr/spleen_10.nii.gz',
  'label': './labelsTr/spleen_10.nii.gz',
  'fold': 0},
 {'image': './imagesTr/spleen_32.nii.gz',
  'label': './labelsTr/spleen_32.nii.gz',
  'fold': 0}]

# Split training data into N random folds

In [12]:
num_folds = 5
fold_size = len(datalist_json["training"]) // num_folds
for i in range(num_folds):
    for j in range(fold_size):
        datalist_json["training"][i * fold_size + j]["fold"] = i

# Visualise final training data with all randomised folds

In [13]:
datalist_json["training"][:5]

[{'image': './imagesTr/spleen_14.nii.gz',
  'label': './labelsTr/spleen_14.nii.gz',
  'fold': 0},
 {'image': './imagesTr/spleen_52.nii.gz',
  'label': './labelsTr/spleen_52.nii.gz',
  'fold': 0},
 {'image': './imagesTr/spleen_21.nii.gz',
  'label': './labelsTr/spleen_21.nii.gz',
  'fold': 0},
 {'image': './imagesTr/spleen_44.nii.gz',
  'label': './labelsTr/spleen_44.nii.gz',
  'fold': 0},
 {'image': './imagesTr/spleen_16.nii.gz',
  'label': './labelsTr/spleen_16.nii.gz',
  'fold': 0}]

# Save JSON to file

In [14]:
datalist_file = "msd_" + msd_task.lower() + "_folds.json"
with open(datalist_file, "w", encoding="utf-8") as f:
    json.dump(datalist_json, f, ensure_ascii=False, indent=4)
print(f"Datalist is saved to {datalist_file}")

Datalist is saved to msd_task09_spleen_folds.json


# Cleanup temporary files

In [15]:
if directory is None:
    shutil.rmtree(root_dir)