In [1]:
import os
import numpy as np
import nibabel as nib
import cv2
import gc

In [2]:
def load_and_preprocess(patient_path, patient_id, image_size=(128, 128)):
    """Load one patient's four MRI modalities and segmentation as resized 2D slices.

    Each modality volume (``flair``, ``t1``, ``t1ce``, ``t2``) is read from
    ``<patient_path>/<patient_id>_<modality>.nii.gz``, min-max scaled to
    [0, 1] over the whole volume, rotated with ``np.rot90`` (k=1) in its first
    two axes and stacked as the last (channel) axis. The segmentation volume
    ``<patient_id>_seg.nii.gz`` receives the same rotation. Every slice along
    the third volume axis is then resized to ``image_size``.

    Args:
        patient_path: Directory holding the patient's ``.nii.gz`` files.
        patient_id: File-name prefix shared by the patient's volumes.
        image_size: Output slice size as ``(rows, cols)``; these become the two
            middle dimensions of ``X`` and the last two dimensions of ``Y``.

    Returns:
        A tuple ``(X, Y)`` where ``X`` is a float32 array of shape
        ``(slices, image_size[0], image_size[1], 4)`` and ``Y`` is a uint8
        array of shape ``(slices, image_size[0], image_size[1])``.
    """
    modalities = ['flair', 't1', 't1ce', 't2']

    # cv2.resize expects dsize as (width, height), i.e. the reverse of the
    # NumPy (rows, cols) order used to allocate X and Y below. Passing
    # image_size straight through only works for square sizes.
    dsize = (image_size[1], image_size[0])

    images = []
    for modality in modalities:
        file_path = os.path.join(patient_path, f"{patient_id}_{modality}.nii.gz")
        img = nib.load(file_path).get_fdata()

        lowest = img.min()
        value_range = img.max() - lowest
        if value_range == 0:
            # Constant volume: min-max scaling would divide by zero and fill
            # the channel with NaN, so emit an all-zero channel instead.
            img = np.zeros_like(img)
        else:
            img = (img - lowest) / value_range

        img = np.rot90(img, k=1, axes=(0, 1))
        images.append(img)

    images = np.stack(images, axis=-1)           # (H, W, Slices, 4)
    images = np.transpose(images, (2, 0, 1, 3))  # (Slices, H, W, 4)

    seg_path = os.path.join(patient_path, f"{patient_id}_seg.nii.gz")
    mask = nib.load(seg_path).get_fdata()
    mask = np.rot90(mask, k=1, axes=(0, 1))
    mask = np.transpose(mask, (2, 0, 1))         # (Slices, H, W)

    num_slices = images.shape[0]
    X = np.zeros((num_slices, image_size[0], image_size[1], 4), dtype=np.float32)
    Y = np.zeros((num_slices, image_size[0], image_size[1]), dtype=np.uint8)

    for i in range(num_slices):
        X[i] = cv2.resize(images[i], dsize, interpolation=cv2.INTER_LINEAR)
        # Nearest-neighbour keeps the mask's label values intact (no blending
        # of neighbouring labels into new intermediate values).
        Y[i] = cv2.resize(mask[i], dsize, interpolation=cv2.INTER_NEAREST)

    return X, Y

# Destination directory for the per-patient preprocessed arrays
preprocessed_dir = "preprocessed"
os.makedirs(preprocessed_dir, exist_ok=True)

data_path = "archive/BraTS2021_Training_Data"

for folder_name in os.listdir(data_path):
    patient_path = os.path.join(data_path, folder_name)
    # Only sub-directories are treated as patient folders; skip anything else.
    if not os.path.isdir(patient_path):
        continue

    # The folder name doubles as the patient ID / file-name prefix.
    patient_id = folder_name
    print(f"Processing {patient_id}...")
    X, Y = load_and_preprocess(patient_path, patient_id)

    # One .npy pair per patient (a format np.load can memory-map)
    out_prefix = os.path.join(preprocessed_dir, patient_id)
    np.save(f"{out_prefix}_X.npy", X)
    np.save(f"{out_prefix}_Y.npy", Y)

    # Drop the arrays before the next patient is loaded.
    del X, Y
    gc.collect()


Processing BraTS2021_00000...
Processing BraTS2021_00002...
Processing BraTS2021_00003...
Processing BraTS2021_00005...
Processing BraTS2021_00006...
Processing BraTS2021_00008...
Processing BraTS2021_00009...
Processing BraTS2021_00011...
Processing BraTS2021_00012...
Processing BraTS2021_00014...
Processing BraTS2021_00016...
Processing BraTS2021_00017...
Processing BraTS2021_00018...
Processing BraTS2021_00019...
Processing BraTS2021_00020...
Processing BraTS2021_00021...
Processing BraTS2021_00022...
Processing BraTS2021_00024...
Processing BraTS2021_00025...
Processing BraTS2021_00026...
Processing BraTS2021_00028...
Processing BraTS2021_00030...
Processing BraTS2021_00031...
Processing BraTS2021_00032...
Processing BraTS2021_00033...
Processing BraTS2021_00035...
Processing BraTS2021_00036...
Processing BraTS2021_00043...
Processing BraTS2021_00044...
Processing BraTS2021_00045...
Processing BraTS2021_00046...
Processing BraTS2021_00048...
Processing BraTS2021_00049...
Processing

In [None]:
import numpy as np
import os
from sklearn.utils import shuffle
import h5py

# Directory containing the per-patient "<id>_X.npy" / "<id>_Y.npy" files
preprocessed_dir = "preprocessed"

# Recover patient IDs from the image files by cutting off the trailing "_X.npy";
# sorting first makes the seeded shuffle below independent of listing order.
all_patient_ids = sorted(
    fname.rsplit("_", 1)[0]
    for fname in os.listdir(preprocessed_dir)
    if fname.endswith("_X.npy")
)

# Patient-level shuffle with a fixed seed
all_patient_ids = shuffle(all_patient_ids, random_state=42)

# Number of patients assigned to each subset
num_train = 70
num_val = 20
num_test = 10

# Consecutive, non-overlapping windows of the shuffled ID list
val_end = num_train + num_val
test_end = val_end + num_test

train_ids = all_patient_ids[:num_train]
val_ids   = all_patient_ids[num_train:val_end]
test_ids  = all_patient_ids[val_end:test_end]

# Helper function to load and concatenate data for a list of patient IDs
def load_split(patient_ids):
    """Load and stack the preprocessed arrays of the given patients.

    For every ID, ``<pid>_X.npy`` and ``<pid>_Y.npy`` are read from
    ``preprocessed_dir``. The per-patient arrays are joined along axis 0 in
    the order the IDs are given.

    Args:
        patient_ids: Iterable of patient ID strings.

    Returns:
        A tuple ``(X, Y)`` of the concatenated image and mask arrays.
    """
    image_stacks, mask_stacks = [], []

    for pid in patient_ids:
        # Arrays are read fully into memory; np.load(..., mmap_mode='r') is
        # an option for very large datasets.
        image_stacks.append(np.load(os.path.join(preprocessed_dir, f"{pid}_X.npy")))
        mask_stacks.append(np.load(os.path.join(preprocessed_dir, f"{pid}_Y.npy")))

    return np.concatenate(image_stacks, axis=0), np.concatenate(mask_stacks, axis=0)

# Output directory for the train/val/test HDF5 files
split_path = "split_data"
os.makedirs(split_path, exist_ok=True)

def save_to_h5(file_path, X, Y):
    """Write X and Y as gzip-compressed datasets in an HDF5 file.

    Args:
        file_path: Path of the output file; it is joined onto ``split_path``.
        X: Array stored as dataset ``'X'`` using its own dtype.
        Y: Array stored as dataset ``'Y'`` using its own dtype.

    A file already present at the target location is deleted before writing.
    """
    target = os.path.join(split_path, file_path)
    if os.path.exists(target):
        os.remove(target)

    with h5py.File(target, 'w') as h5_file:
        for dataset_name, array in (('X', X), ('Y', Y)):
            h5_file.create_dataset(
                dataset_name, data=array, compression='gzip', dtype=array.dtype
            )


In [None]:
# Build and save each split in turn. The arrays of one split are released
# before the next split is loaded, so only one split is held in memory at a
# time. The labels keep their original padding so the printed columns line up.
for label, out_name, ids in (
    ("Train X:", "train_data.h5", train_ids),
    ("Val X:  ", "val_data.h5", val_ids),
    ("Test X: ", "test_data.h5", test_ids),
):
    X_split, Y_split = load_split(ids)
    print(label, X_split.shape, "Y:", Y_split.shape)
    save_to_h5(out_name, X_split, Y_split)

    # Free this split's arrays; running gc.collect() inside the loop also
    # keeps its return value from being echoed as the cell's output.
    del X_split, Y_split
    gc.collect()

Train X: (10850, 128, 128, 4) Y: (10850, 128, 128)
Val X:   (3100, 128, 128, 4) Y: (3100, 128, 128)
Test X:  (1550, 128, 128, 4) Y: (1550, 128, 128)


220