In [None]:
import os, sys
# Register the directory two levels above the current working directory on the
# import path (presumably so the project-level `config` module resolves — confirm).
working_dir = os.getcwd()
project_dir = os.path.join(working_dir, '../..')
if sys.path.count(project_dir) == 0:
    # Only append once so re-running the cell does not grow sys.path.
    sys.path.append(project_dir)

import pandas as pd
import numpy as np
import config
import h5py

In [None]:
directory = os.path.join(config.BRATS_DATASET_PATH, 'BraTS2020_training_data/content/data/')

# Create a list of all .h5 files in the directory.
# NOTE: os.listdir does not guarantee any ordering, so positional indices into
# h5_files are not guaranteed to point at the same file on every machine.
h5_files = [f for f in os.listdir(directory) if f.endswith('.h5')]
print(f"Found {len(h5_files)} .h5 files:\nExample file names:{h5_files[:3]}")

# Position (within h5_files) of the file whose contents are inspected below.
INSPECT_INDEX = 25070

# Open one .h5 file from the list to inspect its contents
if h5_files:
    # Clamp the index so a smaller listing falls back to the last file
    # instead of raising IndexError.
    inspect_idx = min(INSPECT_INDEX, len(h5_files) - 1)
    file_path = os.path.join(directory, h5_files[inspect_idx])
    with h5py.File(file_path, 'r') as file:
        print("\nKeys for each file:", list(file.keys()))
        for key in file.keys():
            # Read the dataset from disk once and reuse the in-memory array
            # for the type / max / min reports.
            values = file[key][()]
            print(f"\nData type of {key}:", type(values))
            print(f"Shape of {key}:", file[key].shape)
            print(f"Array dtype: {file[key].dtype}")
            print(f"Array max val: {np.max(values)}")
            print(f"Array min val: {np.min(values)}")
else:
    print("No .h5 files found in the directory.")

In [None]:
import matplotlib.pyplot as plt
# plt.style.use('ggplot')
# plt.rcParams['figure.facecolor'] = '#171717'
# plt.rcParams['text.color']       = '#DDDDDD'

def display_image_channels(image, title='Image Channels',
                           save_path='../figures/brats_image_channels.pdf'):
    """Plot the first four channels of `image` side by side and save the figure.

    Args:
        image: Array indexed as ``image[channel, :, :]``. Channels 0-3 are drawn
            in grayscale and labelled T1, T1c, T2 and FLAIR in that order
            (assumes the data really is stored in that order — TODO confirm).
        title: Currently unused; no figure-level title is drawn. Kept so that
            existing callers passing it keep working.
        save_path: Destination of the saved figure. Its parent directory is
            created when missing. Pass ``None`` to skip saving.
    """
    channel_names = ['T1-Weighted (T1)', 'T1-Weighted Post Contrast (T1c)', 'T2-Weighted (T2)', 'Fluid Attenuated \n Inversion Recovery (FLAIR)']
    fig, axes = plt.subplots(1, 4, figsize=(10, 10))
    for idx, ax in enumerate(axes.flatten()):
        channel_image = image[idx, :, :]  # Select channel `idx` (no transposition happens here)
        ax.imshow(channel_image, cmap='gray')
        ax.axis('off')
        ax.set_title(channel_names[idx])
    plt.tight_layout()
    if save_path is not None:
        # savefig fails when the target directory does not exist, so create it first.
        out_dir = os.path.dirname(save_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        fig.savefig(save_path, bbox_inches='tight')
    plt.show()

def display_mask_channels_as_rgb(mask, title='Mask Channels as RGB'):
    """Draw the first three mask channels, each in its own colour plane.

    Channel 0 is painted into the red plane, channel 1 into green and
    channel 2 into blue of an otherwise black RGB canvas, one subplot each.

    Args:
        mask: Array indexed as ``mask[channel, :, :]``; values are scaled by
            255 before being written into a uint8 canvas.
        title: Figure-level title.
    """
    labels = ('Necrotic (NEC)', 'Edema (ED)', 'Tumour (ET)')
    fig, axes = plt.subplots(1, 3, figsize=(9.75, 5))
    for plane, (ax, label) in enumerate(zip(axes, labels)):
        height, width = mask.shape[1], mask.shape[2]
        canvas = np.zeros((height, width, 3), dtype=np.uint8)
        # Only the colour plane matching this channel is filled; the rest stay black.
        canvas[..., plane] = mask[plane, :, :] * 255
        ax.imshow(canvas)
        ax.axis('off')
        ax.set_title(label)
    plt.suptitle(title, fontsize=20, y=0.93)
    plt.tight_layout()
    plt.show()

def overlay_masks_on_image(image, mask, title='Brain MRI \n with Tumour Masks Overlay',
                           save_path='../figures/brats_image_mask_overlay.pdf'):
    """Overlay the three mask channels in colour on the first image channel.

    The first image channel is min-max scaled to [0, 1] and replicated into a
    gray RGB picture. Wherever a mask channel is non-zero, the mask value
    replaces the gray value in the corresponding colour plane.

    Args:
        image: Array indexed as ``image[channel, :, :]``; only channel 0 is used.
        mask: Array indexed as ``mask[channel, :, :]``; channels 0, 1, 2 map to
            the red, green and blue planes respectively.
        title: Title drawn above the plot.
        save_path: Destination of the saved figure. Its parent directory is
            created when missing. Pass ``None`` to skip saving.
    """
    t1_image = image[0, :, :]  # Use the first channel of the image
    lowest, highest = t1_image.min(), t1_image.max()
    value_range = highest - lowest
    if value_range > 0:
        t1_image_normalized = (t1_image - lowest) / value_range
    else:
        # A constant slice (e.g. all zeros) would otherwise divide 0 by 0 and
        # yield a NaN image; show it as uniform black instead.
        t1_image_normalized = np.zeros(t1_image.shape, dtype=float)

    rgb_image = np.stack([t1_image_normalized] * 3, axis=-1)
    color_mask = np.stack([mask[0, :, :], mask[1, :, :], mask[2, :, :]], axis=-1)
    # Non-zero mask entries win over the gray background, plane by plane.
    rgb_image = np.where(color_mask, color_mask, rgb_image)

    plt.figure(figsize=(4, 4))
    plt.imshow(rgb_image, cmap='gray')
    plt.title(title, fontsize=18, y=1.02)
    plt.axis('off')
    if save_path is not None:
        # savefig fails when the target directory does not exist, so create it first.
        out_dir = os.path.dirname(save_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        plt.savefig(save_path, bbox_inches='tight')
    plt.show()
    
    
# Choose one slice file and pull every dataset it holds into memory
sample_file_path = os.path.join(directory, h5_files[600])
with h5py.File(sample_file_path, 'r') as file:
    data = {name: file[name][()] for name in file.keys()}

# Reorder the axes so the last axis (the channels) comes first
image = np.transpose(data['image'], (2, 0, 1))
mask = np.transpose(data['mask'], (2, 0, 1))

# Render the per-channel, per-mask and overlay figures for this slice
display_image_channels(image)
display_mask_channels_as_rgb(mask)
overlay_masks_on_image(image, mask)

In [None]:
# Extract the (volume id, slice index) pair from every filename, then keep only
# the slices inside the range of interest.
# filename: volume_{id}_slice_{slice}.h5

import re

# Half-open range of slice indices to keep: SLICE_START <= slice < SLICE_STOP.
SLICE_START = 80
SLICE_STOP = 128

# Anchored to the whole filename so that every accepted (id, slice) pair can be
# turned back into exactly the filename it came from.
FILENAME_PATTERN = re.compile(r'volume_(\d+)_slice_(\d+)\.h5')

# Files that do not follow the naming scheme are skipped rather than crashing
# on `None.groups()`.
patient_slices = [
    match.groups()
    for match in map(FILENAME_PATTERN.fullmatch, h5_files)
    if match is not None
]

filtered_patient_slices = [
    p for p in patient_slices if SLICE_START <= int(p[1]) < SLICE_STOP
]
len(filtered_patient_slices)

In [None]:
# Rebuild the filename for every retained (volume id, slice index) pair
filenames = []
for p in filtered_patient_slices:
    filenames.append(f'volume_{p[0]}_slice_{p[1]}.h5')
filenames[:5]

In [None]:
from tqdm import tqdm

# One label per file: 1 when the slice's mask marks anything, 0 otherwise.
labels = np.zeros(len(filenames))

# Iterate through the files and check if the mask contains tumour
for i, filename in enumerate(tqdm(filenames)):
    sample_file_path = os.path.join(directory, filename)
    with h5py.File(sample_file_path, 'r') as file:
        # Only the mask decides the label, so the other datasets are not loaded.
        mask_values = file['mask'][()]

    # Any non-zero entry counts as tumour. Unlike counting distinct values,
    # this needs no sort and also handles a mask that is non-zero everywhere.
    if np.any(mask_values):
        labels[i] = 1


In [None]:
# Write the filename/label table as CSV into the dataset root directory
labels_csv_path = os.path.join(config.BRATS_DATASET_PATH, 'tumour_labels.csv')
df = pd.DataFrame({'Filename': filenames, 'Label': labels.astype(int)})
df.to_csv(labels_csv_path, index=False)

In [None]:
def _load_h5_datasets(path):
    """Read every top-level dataset of the HDF5 file at `path` into a dict.

    Returns:
        dict mapping each key of the file to the fully loaded value of that dataset.
    """
    with h5py.File(path, 'r') as h5_file:
        return {key: h5_file[key][()] for key in h5_file.keys()}


# Two slices (positions within `filenames`) to compare side by side
idx_0 = 1209
print(filenames[idx_0])
sample_file_path = os.path.join(directory, filenames[idx_0])
data_0 = _load_h5_datasets(sample_file_path)

idx_1 = 2512
print(filenames[idx_1])
sample_file_path = os.path.join(directory, filenames[idx_1])
data_1 = _load_h5_datasets(sample_file_path)

In [None]:
# Show the two selected slices next to each other, one panel per sample.
fig, axes = plt.subplots(1, 2)
panels = [('Patient A', data_0), ('Patient B', data_1)]
for ax, (panel_title, sample) in zip(axes, panels):
    # Index 0 along the last axis of the stored image, transposed for display.
    ax.imshow(sample['image'][:, :, 0].T, cmap='gray')
    ax.set_title(panel_title)
    ax.axis('off')

plt.show()