In [7]:
import h5py
import os
from PIL import Image
import numpy as np
import pandas as pd

In [10]:
# Tumor class names keyed by the integer label read from each .mat file.
labels_map = dict(
    zip(
        (1, 2, 3),
        ("meningioma", "glioma", "pituitary"),
    )
)

def _read_matlab_scalar(dataset):
    """
    Return the value of a scalar-like MATLAB dataset as ``int`` or ``str``.

    MATLAB v7.3 files tag each dataset with a ``MATLAB_class`` attribute.
    Char arrays are stored as one character code per element, so taking only
    the first element would yield the code of the first character instead of
    the text. Such datasets are decoded to a string; anything else is treated
    as a numeric scalar and converted to ``int``.
    """
    values = np.array(dataset).flatten()

    matlab_class = dataset.attrs.get('MATLAB_class', b'')
    if isinstance(matlab_class, bytes):  # h5py may hand back bytes or str
        matlab_class = matlab_class.decode('ascii', 'ignore')

    if matlab_class == 'char':
        return ''.join(chr(int(code)) for code in values)
    # Numeric scalar: first element, regardless of array dimension.
    return int(values[0])


def load_mat_v73(mat_file_path):
    """
    Load a MATLAB v7.3 (HDF5) .mat file and extract the fields used downstream.

    Parameters
    ----------
    mat_file_path : str or path-like
        Path to a .mat file containing a ``cjdata`` group with ``image``,
        ``tumorMask``, ``label`` and ``PID`` datasets.

    Returns
    -------
    dict
        ``image`` and ``tumorMask`` as NumPy arrays, ``label`` as ``int``, and
        ``PID`` as ``int`` when stored numerically or ``str`` when stored as a
        MATLAB char array.

    Raises
    ------
    KeyError
        If ``cjdata`` or one of the expected datasets is missing.
    """
    with h5py.File(mat_file_path, 'r') as file:
        cjdata = file['cjdata']
        data = {}

        # Image and tumor mask are materialised while the file is still open.
        # NOTE(review): h5py exposes MATLAB arrays with reversed axis order, so
        # these are presumably transposed relative to MATLAB - confirm whether
        # downstream consumers care about orientation.
        data['image'] = np.array(cjdata['image'])
        data['tumorMask'] = np.array(cjdata['tumorMask'])

        # Label and PID: numeric scalars become int, char arrays become str.
        data['label'] = _read_matlab_scalar(cjdata['label'])
        data['PID'] = _read_matlab_scalar(cjdata['PID'])

    return data

def save_image_and_mask(data, output_image_dir, output_mask_dir, filename):
    """
    Write the image and its tumor mask into per-label subfolders.

    The label in ``data['label']`` is mapped through ``labels_map`` to a class
    name; the image goes to ``<output_image_dir>/<class>/<filename>`` and the
    mask to ``<output_mask_dir>/<class>/<filename>``. Both folders are created
    if missing. The mask is scaled by 255 and cast to uint8 before saving.
    """
    class_name = labels_map[data['label']]
    image_folder = os.path.join(output_image_dir, class_name)
    mask_folder = os.path.join(output_mask_dir, class_name)
    for folder in (image_folder, mask_folder):
        os.makedirs(folder, exist_ok=True)

    # The image is written as-is.
    image_out = Image.fromarray(data['image'])
    image_out.save(os.path.join(image_folder, filename))

    # The mask is multiplied by 255 and stored as uint8.
    mask_pixels = (data['tumorMask'] * 255).astype(np.uint8)
    mask_out = Image.fromarray(mask_pixels)
    mask_out.save(os.path.join(mask_folder, filename))

def process_dataset(mat_dir, output_image_dir, output_mask_dir, metadata_path):
    """
    Convert every .mat file in a folder to image/mask files plus a metadata CSV.

    Parameters
    ----------
    mat_dir : str
        Folder scanned (non-recursively) for files ending in ``.mat``.
    output_image_dir, output_mask_dir : str
        Root folders passed through to ``save_image_and_mask``.
    metadata_path : str
        Destination CSV with columns ``filename``, ``label``, ``PID``; its
        parent folder is created if needed.
    """
    metadata = []
    # os.listdir returns entries in arbitrary order; sort so the metadata rows
    # come out in the same order on every run.
    for mat_file in sorted(os.listdir(mat_dir)):
        if not mat_file.endswith(".mat"):
            continue

        mat_path = os.path.join(mat_dir, mat_file)
        data = load_mat_v73(mat_path)
        # splitext removes only the final extension, so "scan.v2.mat" maps to
        # "scan.v2.png" rather than being truncated at the first dot.
        filename = f"{os.path.splitext(mat_file)[0]}.png"

        save_image_and_mask(data, output_image_dir, output_mask_dir, filename)
        metadata.append([filename, labels_map[data['label']], data['PID']])

    # The CSV folder is not necessarily one of the output trees (and nothing is
    # created at all when no .mat file is found), so make sure it exists.
    metadata_dir = os.path.dirname(metadata_path)
    if metadata_dir:
        os.makedirs(metadata_dir, exist_ok=True)

    # Save metadata CSV
    df = pd.DataFrame(metadata, columns=["filename", "label", "PID"])
    df.to_csv(metadata_path, index=False)
    print(f"Processed {len(metadata)} files. Metadata saved to {metadata_path}")


In [11]:
# Input folder of .mat files and output locations for the converted dataset.
mat_dir = "../data/archive/dataset/data"
image_dir = "../data/mri_t1_dataset/images"
mask_dir = "../data/mri_t1_dataset/masks"
metadata_path = "../data/mri_t1_dataset/metadata.csv"

# Convert every .mat file and write the metadata CSV.
process_dataset(mat_dir, image_dir, mask_dir, metadata_path)


Processed 3064 files. Metadata saved to ../data/mri_t1_dataset/metadata.csv
