In [1]:
import torch
import torchvision as tv
import torchvision.transforms as transforms
from torchvision.datasets.vision import StandardTransform
from torch.utils.data import DataLoader
from torchvision.utils import make_grid

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import numpy as np
import pandas as pd 
from PIL import Image
import matplotlib.pyplot as plt 
%matplotlib inline 
import json
import os

import SimpleITK as sitk

# File structure:
```
BT_Segmentation_Project
    |_ Code
       |_ Data_Extraction.ipynb     (this file)
       |_ ...
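    |_ Task01_BrainTumour           (dataset root)
       |_ dataset.json              (dataset description and file lists)
       |_ imagesTr                  (training images, compressed .nii.gz)
       |_ labelsTr                  (training labels, compressed .nii.gz)
       |_ imagesTs                  (test images, compressed .nii.gz)
       |_ extracted                 (created by this notebook; numpy files)
           |_ imagesTr
           |_ labelsTr
           |_ imagesTs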
```

# Extracting files

In [4]:
# prepare extracting files:
# resolve the dataset paths and make sure the output dirs exist

# The dataset folder is looked up next to the current working directory,
# i.e. the notebook is expected to be started from the `Code` folder.
root_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(root_dir, os.pardir))
dataset_dir = os.path.join(parent_dir, 'Task01_BrainTumour')
print(f'root_dir:    {root_dir}')
print(f'parent_dir:  {parent_dir}')
print(f'dataset_dir: {dataset_dir}')

# Fail early with a clear message; otherwise the directory creation below
# would silently build an empty dataset folder.
if not os.path.isdir(dataset_dir):
    raise FileNotFoundError(f'dataset directory not found: {dataset_dir}')

# imagesTr: training images
# labelsTr: training labels
# imagesTs: test images

extracted_dir = os.path.join(dataset_dir, 'extracted')
imgTr_dir = os.path.join(extracted_dir, 'imagesTr')
labelsTr_dir = os.path.join(extracted_dir, 'labelsTr')
imgTs_dir = os.path.join(extracted_dir, 'imagesTs')
output_dirs = [imgTr_dir, labelsTr_dir, imgTs_dir]

# `extracted` tells the extraction cell whether it may be skipped.
# The mere existence of the `extracted` folder is not enough: the folders
# may have been created by a run that stopped before writing any file.
# The data only counts as extracted when every output dir exists and
# contains at least one entry.
# NOTE: a partially filled directory still counts as extracted; delete the
# `extracted` folder to force a complete re-extraction.
extracted = all(
    os.path.isdir(dir_path) and len(os.listdir(dir_path)) > 0
    for dir_path in output_dirs
)

if not extracted:
    print('data not extracted yet.\n -> creating new extracted dir.')
    for dir_path in output_dirs:
        # makedirs also creates `extracted_dir` itself and tolerates
        # directories left behind by an earlier, interrupted run.
        os.makedirs(dir_path, exist_ok=True)

root_dir:    /media/z/Ubuntu-Storage/BT_Segmentation_Project/Code
parent_dir:  /media/z/Ubuntu-Storage/BT_Segmentation_Project
dataset_dir: /media/z/Ubuntu-Storage/BT_Segmentation_Project/Task01_BrainTumour
data not extracted yet.
 -> creating new extracted dir.


In [5]:
# Read the dataset description (dataset.json) into a dictionary.
# The file is closed as soon as it has been parsed.
with open(dataset_dir + '/dataset.json') as json_file:
    data = json.load(json_file)

# Short overview: available keys, scan modalities, label names, sample counts.
print(data.keys())
for caption, key in (
    ('\nScans:', 'modality'),
    ('\nLabels:', 'labels'),
    ('\n#Training:', 'numTraining'),
    ('#Test:', 'numTest'),
):
    print(caption, data[key])

# File lists used by the extraction step.
train_filenames = data['training']
test_filenames = data['test']

# Show one entry of each list to illustrate its layout.
print('\nTraining-files paths:', train_filenames[0])
print('Test-files path:', test_filenames[0])

dict_keys(['name', 'description', 'reference', 'licence', 'release', 'tensorImageSize', 'modality', 'labels', 'numTraining', 'numTest', 'training', 'test'])

Scans: {'0': 'FLAIR', '1': 'T1w', '2': 't1gd', '3': 'T2w'}

Labels: {'0': 'background', '1': 'edema', '2': 'non-enhancing tumor', '3': 'enhancing tumour'}

#Training: 484
#Test: 266

Training-files paths: {'image': './imagesTr/BRATS_457.nii.gz', 'label': './labelsTr/BRATS_457.nii.gz'}
Test-files path: ./imagesTs/BRATS_557.nii.gz


In [6]:
# extract files
# needs about 110-120 GB disk-storage
# or modify the code (break after e.g. 10 iterations)
# to only unpack a couple of samples


def _nifti_to_npy(relative_nifti_path, npy_path):
    """Read one NIfTI file of the dataset and save it as a numpy file.

    Parameters
    ----------
    relative_nifti_path : str
        Path of the file as listed in dataset.json, relative to
        `dataset_dir` (e.g. './imagesTr/BRATS_457.nii.gz').
    npy_path : str
        Target path without extension; `np.save` appends '.npy'.
    """
    # normpath drops a leading './' when present and leaves other relative
    # paths intact (blindly slicing off two characters would corrupt them).
    nifti_path = os.path.join(dataset_dir, os.path.normpath(relative_nifti_path))
    sitk_img = sitk.ReadImage(nifti_path)  # read the nii-image
    np.save(npy_path, sitk.GetArrayFromImage(sitk_img))  # img to numpy array


if extracted:
    print('files are already extracted.')

else:
    num_of_train_files = len(train_filenames)
    num_of_test_files = len(test_filenames)

    # Output files are named after their position in the dataset.json lists
    # (0.npy, 1.npy, ...), so image i and label i share the same file name.
    for i, img_path_dict in enumerate(train_filenames):
        # i + 1 so that the counter runs from 1 to N instead of 0 to N-1
        print(f'extracting training_file {i + 1}/{num_of_train_files}', end="\r")
        _nifti_to_npy(img_path_dict['image'], os.path.join(imgTr_dir, str(i)))
        _nifti_to_npy(img_path_dict['label'], os.path.join(labelsTr_dir, str(i)))
    print('finished extracting training files.')

    # Test entries are plain path strings (no label available).
    for i, test_img_path in enumerate(test_filenames):
        print(f'extracting testing_file {i + 1}/{num_of_test_files}', end="\r")
        _nifti_to_npy(test_img_path, os.path.join(imgTs_dir, str(i)))
    print('finished extracting test files.')

finished extracting training files.
finished extracting test files.
