<a href="https://colab.research.google.com/github/TharinsaMudalige/Neuron-Brain_Tumor_Detection_Classification_with_XAI/blob/Detection-Classficiation-CNN/Generating_Annotations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import Required Libraries

In [1]:
# Install libraries
!pip install tensorflow tensorflow-hub tensorflow-addons opencv-python

# Import libraries
import os
import random
import shutil
import cv2
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.image import load_img, img_to_array, ImageDataGenerator
import tensorflow_hub as hub
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from google.colab import drive
from PIL import Image
import xml.etree.ElementTree as ET
from xml.dom.minidom import parseString

Collecting tensorflow-addons
  Downloading tensorflow_addons-0.23.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.8 kB)
Collecting typeguard<3.0.0,>=2.7 (from tensorflow-addons)
  Downloading typeguard-2.13.3-py3-none-any.whl.metadata (3.6 kB)
Downloading tensorflow_addons-0.23.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (611 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.8/611.8 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, tensorflow-addons
  Attempting uninstall: typeguard
    Found existing installation: typeguard 4.4.2
    Uninstalling typeguard-4.4.2:
      Successfully uninstalled typeguard-4.4.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
inflect 7.5.0 requires typeguard>=4.0.1, b

Mount Google Drive and Define paths

In [2]:
# Mount Google Drive
drive.mount('/content/drive')

# Define paths
RAW_DATASET_PATH = "/content/drive/MyDrive/DSGP/DSGP_dataset"
OUTPUT_PATH = "/content/drive/MyDrive/DSGP/CNN_Dataset"

# Detect tumor classes (from folders in dataset)
tumor_classes = [folder for folder in os.listdir(RAW_DATASET_PATH) if os.path.isdir(os.path.join(RAW_DATASET_PATH, folder))]
print("Detected tumor classes:", tumor_classes)

# Create folders for Train, Val, Test
for split in ["Train", "Val", "Test"]:
    for subdir in ["Images", "Annotations"]:
        for tumor_class in tumor_classes:
            os.makedirs(os.path.join(OUTPUT_PATH, split, subdir, tumor_class), exist_ok=True)

Mounted at /content/drive
Detected tumor classes: ['tuberculoma', 'granuloma', 'no_tumour', 'papiloma', 'schwannoma', 'meduloblastoma', 'pituitary', 'neurocitoma', 'oligodendroglioma', 'meningioma', 'germinoma', 'astrocitoma', 'glioblastoma', 'ependimoma', 'ganglioglioma', 'carcinoma']


Load U-Net Model

In [3]:
def unet_model(input_size=(256, 256, 3)):
    inputs = keras.Input(input_size)

    # Encoder
    conv1 = layers.Conv2D(64, (3, 3), activation='relu', padding='same')(inputs)
    pool1 = layers.MaxPooling2D(pool_size=(2, 2))(conv1)

    conv2 = layers.Conv2D(128, (3, 3), activation='relu', padding='same')(pool1)
    pool2 = layers.MaxPooling2D(pool_size=(2, 2))(conv2)

    # Bottleneck
    conv3 = layers.Conv2D(256, (3, 3), activation='relu', padding='same')(pool2)

    # Decoder
    up4 = layers.Conv2DTranspose(128, (2, 2), strides=(2, 2), padding='same')(conv3)
    conv4 = layers.Conv2D(128, (3, 3), activation='relu', padding='same')(up4)

    up5 = layers.Conv2DTranspose(64, (2, 2), strides=(2, 2), padding='same')(conv4)
    conv5 = layers.Conv2D(64, (3, 3), activation='relu', padding='same')(up5)

    outputs = layers.Conv2D(1, (1, 1), activation='sigmoid')(conv5)

    model = keras.Model(inputs=inputs, outputs=outputs)
    return model

# Compile the model
unet_model = unet_model()
unet_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print("U-Net model built and compiled.")

U-Net model built and compiled.


Resize and Preprocess Images

In [4]:
TARGET_SIZE = (256, 256)

def preprocess_image(image_path):
    """Load, resize, and normalize the MRI image."""
    image = load_img(image_path)
    image = image.resize(TARGET_SIZE)
    image = img_to_array(image) / 255.0
    return image

Tumor Segmentation and Bounding Box Generation

In [5]:
def get_segmentation_mask(image):
    """Generate segmentation mask using U-Net."""
    img_resized = tf.image.resize(image, TARGET_SIZE)
    img_resized = tf.expand_dims(img_resized, 0)
    mask = unet_model(img_resized)
    mask = tf.squeeze(mask, axis=0)
    mask = np.array(mask > 0.5, dtype=np.uint8)
    return mask

def mask_to_bbox(mask):
    """Convert a binary mask (2D) to bounding box coordinates (xmin, ymin, xmax, ymax)."""
    # Ensure the mask is 2D by collapsing any extra dimensions
    if len(mask.shape) == 3:  # If mask has channels (H, W, C), reduce to (H, W)
        mask = np.max(mask, axis=-1)

    # Get non-zero pixels (tumor regions)
    y_indices, x_indices = np.where(mask > 0)

    # Handle case where no tumor is detected (empty mask)
    if y_indices.size == 0 or x_indices.size == 0:
        return None

    # Calculate bounding box coordinates
    xmin, xmax = np.min(x_indices), np.max(x_indices)
    ymin, ymax = np.min(y_indices), np.max(y_indices)

    return [int(xmin), int(ymin), int(xmax), int(ymax)]

Data Augmentation for Class Imbalance

In [6]:
augmentor = ImageDataGenerator(
    rotation_range=30,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

def augment_image(image, count):
    """Generate augmented images."""
    augmented_images = []
    image = np.expand_dims(image, axis=0)
    for _ in range(count):
        aug_img = next(augmentor.flow(image, batch_size=1))[0]
        augmented_images.append(aug_img)
    return augmented_images

Dataset Splitting and Balancing

In [7]:
def split_and_balance_dataset(images, max_count):
    train, temp = train_test_split(images, test_size=0.3, random_state=42)
    val, test = train_test_split(temp, test_size=0.5, random_state=42)
    train = balance_class(train, max_count)
    return train, val, test

def balance_class(images, max_count):
    current_count = len(images)
    augmented_images = []
    if current_count < max_count:
        extra_images = max_count - current_count
        for image_path in random.choices(images, k=extra_images):
            img = preprocess_image(image_path)
            aug_imgs = augment_image(img, 1)
            augmented_images.append((aug_imgs[0], image_path))
    return images + [img_path for _, img_path in augmented_images]

Generating Annotations

In [8]:
def create_pascal_voc_xml(image_path, bbox, label, save_dir):
    image_name = os.path.basename(image_path)
    xml_filename = os.path.splitext(image_name)[0] + ".xml"

    root = ET.Element("annotation")
    ET.SubElement(root, "filename").text = image_name
    ET.SubElement(root, "path").text = image_path

    size = ET.SubElement(root, "size")
    ET.SubElement(size, "width").text = str(TARGET_SIZE[0])
    ET.SubElement(size, "height").text = str(TARGET_SIZE[1])

    obj = ET.SubElement(root, "object")
    ET.SubElement(obj, "name").text = label
    bbox_elem = ET.SubElement(obj, "bndbox")
    ET.SubElement(bbox_elem, "xmin").text = str(bbox[0])
    ET.SubElement(bbox_elem, "ymin").text = str(bbox[1])
    ET.SubElement(bbox_elem, "xmax").text = str(bbox[2])
    ET.SubElement(bbox_elem, "ymax").text = str(bbox[3])

    xml_pretty = parseString(ET.tostring(root)).toprettyxml()
    with open(os.path.join(save_dir, xml_filename), "w") as xml_file:
        xml_file.write(xml_pretty)

Check Folder Structure

In [9]:
def check_folder_structure(base_path):
    """Print the folder structure after preprocessing."""
    for root, dirs, files in os.walk(base_path):
        level = root.replace(base_path, "").count(os.sep)
        indent = " " * 2 * level
        print(f"{indent}{os.path.basename(root)}/")
        sub_indent = " " * 2 * (level + 1)
        for file in files:
            print(f"{sub_indent}{file}")

Preprocess dataset

In [14]:
def process_dataset():
    """Preprocess images, generate masks, and save with bounding boxes."""
    for tumor_class in tumor_classes:
        class_path = os.path.join(RAW_DATASET_PATH, tumor_class)
        images = [os.path.join(class_path, img) for img in os.listdir(class_path) if img.lower().endswith(('.png', '.jpg', '.jpeg'))]

        # Get max count for class balancing
        max_count = max(len(os.listdir(os.path.join(RAW_DATASET_PATH, cls))) for cls in tumor_classes)

# Corrected function call
        train, val, test = split_and_balance_dataset(images, max_count)


        for split, split_data in zip(["Train", "Val", "Test"], [train, val, test]):
            for image_path in split_data:
                image = preprocess_image(image_path)
                mask = get_segmentation_mask(image)
                bbox = mask_to_bbox(mask)

                if bbox:
                    img_dest = os.path.join(OUTPUT_PATH, split, "Images", tumor_class)
                    ann_dest = os.path.join(OUTPUT_PATH, split, "Annotations", tumor_class)

                    shutil.copy(image_path, img_dest)
                    create_pascal_voc_xml(image_path, bbox, tumor_class, ann_dest)

                    print(f"{split}: {os.path.basename(image_path)} -> {tumor_class}")

print("Dataset processing completed successfully.")

Dataset processing completed successfully.


Main Preprocessing Function

In [15]:
# Main function to run the full preprocessing pipeline
def main():
    print("Starting dataset preprocessing...")

    # Ensure the output path exists
    os.makedirs(OUTPUT_PATH, exist_ok=True)

    # Check if raw dataset path is valid
    if not os.path.exists(RAW_DATASET_PATH):
        print(f"Error: RAW_DATASET_PATH does not exist: {RAW_DATASET_PATH}")
        return

    # Run the dataset processing
    process_dataset()

    # Check final folder structure
    print("\n✅ Preprocessing complete! Final folder structure:")
    check_folder_structure(OUTPUT_PATH)

# Execute main function
main()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
        1703.jpg
        1899.jpg
        0081.jpg
        0836.jpg
        1855.jpg
        0666.jpg
        1432.jpg
        1134.jpg
        0249.jpg
        1032.jpg
        0079.jpg
        0498.jpg
        1318.jpg
        1017.jpg
        0518.jpg
        1626.jpg
        0621.jpg
        1803.jpg
        1403.jpg
        1196.jpg
        1267.jpg
        0405.jpg
        1119.jpg
        0716.jpg
        1468.jpg
        0456.jpg
        1532.jpg
        0069.jpg
        0014.jpg
        1578.jpg
        0037.jpg
        0685.jpg
        1080.jpg
        0751.jpg
        1376.jpg
        0773.jpg
        1770.jpg
        1905.jpg
        1845.jpg
        0947.jpg
        1004.jpg
        1264.jpg
        1362.jpg
        1904.jpg
        0009.jpg
        1954.jpg
        1801.jpg
        1333.jpg
        1554.jpg
        0235.jpg
        1779.jpg
        0916.jpg
        0483.jpg
        0045.jpg
        1507.jpg
