In [1]:
import re
import os
import numpy as np
import torch
from yucca.functional.preprocessing import preprocess_case_for_training_with_label
from batchgenerators.utilities.file_and_folder_operations import subfiles, join, save_pickle, maybe_mkdir_p, save_json
from yucca.paths import yucca_raw_data, yucca_preprocessed_data
from yucca.functional.utils.loading import read_file_to_nifti_or_np
from yucca.functional.preprocessing import (
    preprocess_case_for_training_with_label,
    preprocess_case_for_training_without_label,
    preprocess_case_for_inference,
)
from yucca.functional.planning import make_plans_file, add_stats_to_plans_post_preprocessing

Set some variables that we'll need and create necessary paths

In [2]:
# --- Dataset / plan identity -------------------------------------------------
task = "Task001_OASIS"
plans_name = "demo"
task_type = "segmentation"
modalities = ("MRI",)
extension = ".nii.gz"

# --- Preprocessing options ---------------------------------------------------
crop_to_nonzero = True
allow_missing_modalities = False
norm_op = "volume_wise_znorm"

# --- Raw input folders (train images, train labels, test images) -------------
raw_task_dir = join(yucca_raw_data, task)
raw_images_dir = join(raw_task_dir, "imagesTr")
raw_labels_dir = join(raw_task_dir, "labelsTr")
test_raw_images_dir = join(raw_task_dir, "imagesTs")

# --- Output folders ----------------------------------------------------------
# Test cases are written to a sibling "<task>_test" folder under the same plans name.
target_dir = join(yucca_preprocessed_data, task, plans_name)
test_target_dir = join(yucca_preprocessed_data, f"{task}_test", plans_name)

# Create both output folders before anything is written to them.
for output_dir in (target_dir, test_target_dir):
    maybe_mkdir_p(output_dir)

Now make a barebones plan

In [3]:
# Build the plans from the configuration values set in the previous cell.
# The class list is hard-coded for this demo.
plans = make_plans_file(
    plans_name=plans_name,
    task_type=task_type,
    modalities=modalities,
    classes=[0, 1, 2],
    norm_op=norm_op,
    crop_to_nonzero=crop_to_nonzero,
    allow_missing_modalities=allow_missing_modalities,
)

Now preprocess the samples in the folder

In [4]:
# One subject ID per label file: strip the extension from the file name.
# Hidden files (leading ".") and files with a different extension are skipped,
# since slicing off len(extension) characters would otherwise mangle their names.
subjects = [
    file[: -len(extension)]
    for file in subfiles(raw_labels_dir, join=False)
    if file.endswith(extension) and not file.startswith(".")
]

# List the raw image folder once rather than once per subject.
raw_image_paths = subfiles(raw_images_dir)

for sub in subjects:
    # Raw images are expected in the yucca format `sub_XXX.ext`, where XXX is the
    # modality encoding (e.g. 000 and 001 if two modalities are present per subject).
    # The pattern is anchored at the start of the file name and the dot is escaped,
    # so subject "a" does not also pick up files such as "ba_000.nii.gz".
    modality_pattern = re.compile(re.escape(sub) + r"_\d{3}\.")
    image_paths = [path for path in raw_image_paths if modality_pattern.match(os.path.basename(path))]
    if not image_paths:
        raise FileNotFoundError(f"No images named '{sub}_XXX.*' found in {raw_images_dir}")

    images = [read_file_to_nifti_or_np(path) for path in image_paths]
    label = read_file_to_nifti_or_np(join(raw_labels_dir, sub + extension))

    # Reuse the configuration values that were used to build the plans so the
    # preprocessing and the saved plans cannot drift apart.
    images, label, image_props = preprocess_case_for_training_with_label(
        images=images,
        label=label,
        normalization_operation=[norm_op],
        allow_missing_modalities=allow_missing_modalities,
        enable_cc_analysis=False,
        crop_to_nonzero=crop_to_nonzero,
    )

    # Stack the image array(s) and the label along the first axis; the label is
    # given a new leading axis so it becomes the last entry of the stack.
    case = np.vstack((np.array(images), np.array(label)[np.newaxis]), dtype=np.float32)

    # Each case is stored as <subject>.npy (data) plus <subject>.pkl (properties).
    save_path = join(target_dir, sub)
    np.save(save_path + ".npy", case)
    save_pickle(image_props, save_path + ".pkl")

Add some extra metadata to the plans file

In [5]:
# Update the plans via yucca, pointing it at the folder the training cases were
# written to (presumably it derives dataset statistics from those files — confirm
# against the yucca documentation), then save the result in that same folder.
plans = add_stats_to_plans_post_preprocessing(directory=target_dir, plans=plans)

train_plans_path = join(target_dir, f"{plans_name}_plans.json")
save_json(plans, train_plans_path, sort_keys=False)

And finally, let's preprocess the test data to make sure it's preprocessed identically

In [6]:
# List the raw test image folder once; both the subject IDs and the per-subject
# file lookup are derived from this single listing.
test_image_paths = subfiles(test_raw_images_dir)

# One subject ID per case: strip the "_XXX" modality suffix and the extension.
# dict.fromkeys removes duplicates while keeping order, so a subject with several
# modality files (sub_000, sub_001, ...) is preprocessed once instead of once per file.
subjects = list(
    dict.fromkeys(
        file[: -len("_000" + extension)]
        for file in (os.path.basename(path) for path in test_image_paths)
        if file.endswith(extension) and not file.startswith(".")
    )
)

for sub in subjects:
    # Raw images are expected in the yucca format `sub_XXX.ext`, where XXX is the
    # modality encoding (e.g. 000 and 001 if two modalities are present per subject).
    # The pattern is anchored at the start of the file name and the dot is escaped,
    # so subject "a" does not also pick up files such as "ba_000.nii.gz".
    modality_pattern = re.compile(re.escape(sub) + r"_\d{3}\.")
    image_paths = [path for path in test_image_paths if modality_pattern.match(os.path.basename(path))]
    if not image_paths:
        raise FileNotFoundError(f"No images named '{sub}_XXX.*' found in {test_raw_images_dir}")

    # File paths (not loaded arrays) are handed to the inference preprocessor.
    # Settings come from the plans and the config cell so the test data is
    # preprocessed with the same options as the training data.
    # NOTE(review): patch_size is 2D while the training cases look 3D — confirm intended.
    images, image_props = preprocess_case_for_inference(
        crop_to_nonzero=plans["crop_to_nonzero"],
        keep_aspect_ratio=plans["keep_aspect_ratio_when_using_target_size"],
        images=image_paths,
        intensities=None,
        normalization_scheme=[norm_op],
        patch_size=(32, 32),
        target_size=plans["target_size"],
        target_spacing=plans["target_spacing"],
        target_orientation=plans["target_coordinate_system"],
        transpose_forward=plans["transpose_forward"],
    )

    # Each case is stored as <subject>.pt (data) plus <subject>.pkl (properties).
    save_path = join(test_target_dir, sub)
    torch.save(images, save_path + ".pt")
    save_pickle(image_props, save_path + ".pkl")

torch.Size([1, 1, 132, 175, 131])
torch.Size([1, 1, 143, 181, 136])
torch.Size([1, 1, 142, 183, 134])
torch.Size([1, 1, 129, 161, 127])
torch.Size([1, 1, 138, 160, 130])
torch.Size([1, 1, 132, 175, 132])
torch.Size([1, 1, 143, 180, 137])
torch.Size([1, 1, 142, 183, 136])
torch.Size([1, 1, 130, 159, 127])
torch.Size([1, 1, 137, 163, 131])
torch.Size([1, 1, 132, 182, 140])
torch.Size([1, 1, 136, 175, 137])
torch.Size([1, 1, 130, 164, 123])
torch.Size([1, 1, 131, 173, 128])
torch.Size([1, 1, 135, 174, 136])
torch.Size([1, 1, 140, 173, 138])
torch.Size([1, 1, 140, 160, 133])
torch.Size([1, 1, 134, 159, 121])
torch.Size([1, 1, 136, 177, 132])
torch.Size([1, 1, 125, 157, 120])


In [7]:
# Keep a copy of the plans next to the preprocessed test cases as well.
test_plans_path = join(test_target_dir, f"{plans_name}_plans.json")
save_json(plans, test_plans_path, sort_keys=False)