<a href="https://colab.research.google.com/github/TharinsaMudalige/Neuron-Brain_Tumor_Detection_Classification_with_XAI/blob/Detection-Classficiation-CNN/Generating_Annotations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import Required Libraries

In [30]:
# Install necessary libraries
!pip install tensorflow tensorflow-hub tensorflow-addons opencv-python

# Import libraries
import os
import random
import shutil
import cv2
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.image import load_img, img_to_array, ImageDataGenerator
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from google.colab import drive
from PIL import Image
import xml.etree.ElementTree as ET
from xml.dom.minidom import parseString



Mount Google Drive and Define paths

In [31]:
# Mount Google Drive
drive.mount('/content/drive')

# Define paths
RAW_DATASET_PATH = "/content/drive/MyDrive/DSGP/DSGP_dataset"
OUTPUT_PATH = "/content/drive/MyDrive/DSGP/CNN_Dataset"

# Detect tumor classes (from folders in dataset)
tumor_classes = [folder for folder in os.listdir(RAW_DATASET_PATH) if os.path.isdir(os.path.join(RAW_DATASET_PATH, folder))]
print("Detected tumor classes:", tumor_classes)

# Create folders for Train, Val, Test
for split in ["Train", "Val", "Test"]:
    for subdir in ["Images", "Annotations"]:
        for tumor_class in tumor_classes:
            os.makedirs(os.path.join(OUTPUT_PATH, split, subdir, tumor_class), exist_ok=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Detected tumor classes: ['tuberculoma', 'granuloma', 'no_tumour', 'papiloma', 'schwannoma', 'meduloblastoma', 'pituitary', 'neurocitoma', 'oligodendroglioma', 'meningioma', 'germinoma', 'astrocitoma', 'glioblastoma', 'ependimoma', 'ganglioglioma', 'carcinoma']


Load U-Net Model

In [32]:
# Build a simple U-Net model for segmentation
def unet_model(input_size=(256, 256, 3)):
    inputs = keras.Input(input_size)

    # Encoder
    conv1 = layers.Conv2D(64, (3, 3), activation='relu', padding='same')(inputs)
    pool1 = layers.MaxPooling2D(pool_size=(2, 2))(conv1)

    conv2 = layers.Conv2D(128, (3, 3), activation='relu', padding='same')(pool1)
    pool2 = layers.MaxPooling2D(pool_size=(2, 2))(conv2)

    # Bottleneck
    conv3 = layers.Conv2D(256, (3, 3), activation='relu', padding='same')(pool2)

    # Decoder
    up4 = layers.Conv2DTranspose(128, (2, 2), strides=(2, 2), padding='same')(conv3)
    conv4 = layers.Conv2D(128, (3, 3), activation='relu', padding='same')(up4)

    up5 = layers.Conv2DTranspose(64, (2, 2), strides=(2, 2), padding='same')(conv4)
    conv5 = layers.Conv2D(64, (3, 3), activation='relu', padding='same')(up5)

    outputs = layers.Conv2D(1, (1, 1), activation='sigmoid')(conv5)

    model = keras.Model(inputs=inputs, outputs=outputs)
    return model

# Compile U-Net model
unet_model = unet_model()
unet_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print("U-Net model built and compiled.")

U-Net model built and compiled.


Resize and Preprocess Images

In [33]:
# Resize images to 256x256
TARGET_SIZE = (256, 256)

def preprocess_image(image_path):
    """Load, resize, and normalize MRI image."""
    image = load_img(image_path)
    image = image.resize(TARGET_SIZE)
    image = img_to_array(image) / 255.0
    return image

Tumor Segmentation and Bounding Box Generation

In [34]:
def get_segmentation_mask(image):
    """Generate tumor segmentation mask using U-Net."""
    img_resized = tf.image.resize(image, TARGET_SIZE)
    img_resized = tf.expand_dims(img_resized, 0)
    mask = unet_model(img_resized)
    mask = tf.squeeze(mask, axis=0)
    mask = np.array(mask > 0.5, dtype=np.uint8)
    return mask

def mask_to_bbox(mask, is_no_tumor=False):
    """Convert a binary mask to bounding box coordinates (xmin, ymin, xmax, ymax)."""
    # If no tumor, assign full image as bounding box
    if is_no_tumor:
        return [0, 0, TARGET_SIZE[0], TARGET_SIZE[1]]

    # Ensure mask is 2D (squeeze any extra dimensions)
    if len(mask.shape) > 2:
        mask = np.squeeze(mask)

    # Identify tumor regions (non-zero areas)
    y_indices, x_indices = np.where(mask > 0)

    # Handle empty mask (no tumor detected)
    if y_indices.size == 0 or x_indices.size == 0:
        return [0, 0, TARGET_SIZE[0], TARGET_SIZE[1]]  # Assign full image for 'no_tumor'

    # Calculate bounding box coordinates
    xmin, xmax = np.min(x_indices), np.max(x_indices)
    ymin, ymax = np.min(y_indices), np.max(y_indices)

    return [int(xmin), int(ymin), int(xmax), int(ymax)]

Data Augmentation for Class Imbalance

In [35]:
# Define data augmentation strategies
augmentor = ImageDataGenerator(
    rotation_range=30,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

def augment_image(image, count):
    """Generate augmented images."""
    augmented_images = []
    image = np.expand_dims(image, axis=0)
    for _ in range(count):
        aug_img = next(augmentor.flow(image, batch_size=1))[0]
        augmented_images.append(aug_img)
    return augmented_images

Dataset Splitting and Balancing

In [36]:
def split_and_balance_dataset(images, max_count):
    """Split and balance the dataset."""
    train, temp = train_test_split(images, test_size=0.3, random_state=42)
    val, test = train_test_split(temp, test_size=0.5, random_state=42)
    train = balance_class(train, max_count)
    return train, val, test

def balance_class(images, max_count):
    """Balance classes using augmentation."""
    current_count = len(images)
    augmented_images = []
    if current_count < max_count:
        extra_images = max_count - current_count
        for image_path in random.choices(images, k=extra_images):
            img = preprocess_image(image_path)
            aug_imgs = augment_image(img, 1)
            augmented_images.append((aug_imgs[0], image_path))
    return images + [img_path for _, img_path in augmented_images]

Generating Annotations

In [37]:
def create_pascal_voc_xml(image_path, bbox, label, save_dir):
    """Generate PASCAL VOC annotations."""
    image_name = os.path.basename(image_path)
    xml_filename = os.path.splitext(image_name)[0] + ".xml"

    root = ET.Element("annotation")
    ET.SubElement(root, "filename").text = image_name
    ET.SubElement(root, "path").text = image_path

    size = ET.SubElement(root, "size")
    ET.SubElement(size, "width").text = str(TARGET_SIZE[0])
    ET.SubElement(size, "height").text = str(TARGET_SIZE[1])

    obj = ET.SubElement(root, "object")
    ET.SubElement(obj, "name").text = label
    bbox_elem = ET.SubElement(obj, "bndbox")
    ET.SubElement(bbox_elem, "xmin").text = str(bbox[0])
    ET.SubElement(bbox_elem, "ymin").text = str(bbox[1])
    ET.SubElement(bbox_elem, "xmax").text = str(bbox[2])
    ET.SubElement(bbox_elem, "ymax").text = str(bbox[3])

    with open(os.path.join(save_dir, xml_filename), "w") as xml_file:
        xml_file.write(parseString(ET.tostring(root)).toprettyxml())

Checking Folder Structure

In [38]:
def check_folder_structure(base_path):
    """Print the folder structure after preprocessing."""
    for root, dirs, files in os.walk(base_path):
        level = root.replace(base_path, "").count(os.sep)
        indent = " " * 2 * level
        print(f"{indent}{os.path.basename(root)}/")
        sub_indent = " " * 2 * (level + 1)
        for file in files:
            print(f"{sub_indent}{file}")

Preprocess the Dataset

In [39]:
def process_dataset():
    """Preprocess images, generate masks, and save with bounding boxes."""
    for tumor_class in tumor_classes:
        class_path = os.path.join(RAW_DATASET_PATH, tumor_class)
        images = [os.path.join(class_path, img) for img in os.listdir(class_path) if img.lower().endswith(('.png', '.jpg', '.jpeg'))]

        # Get max count for class balancing
        max_count = max(len(os.listdir(os.path.join(RAW_DATASET_PATH, cls))) for cls in tumor_classes)

        # Split into Train, Val, Test sets
        train, temp = train_test_split(images, test_size=0.3, random_state=42)
        val, test = train_test_split(temp, test_size=0.5, random_state=42)

        # Balance training data
        train = balance_class(train, max_count)

        # Process each split
        for split, split_data in zip(["Train", "Val", "Test"], [train, val, test]):
            for image_path in split_data:
                # Preprocess image and generate mask
                image = preprocess_image(image_path)
                mask = get_segmentation_mask(image)
                bbox = mask_to_bbox(mask)

                # Define save directories
                img_dest = os.path.join(OUTPUT_PATH, split, "Images", tumor_class)
                ann_dest = os.path.join(OUTPUT_PATH, split, "Annotations", tumor_class)

                # Save processed image and annotation
                shutil.copy(image_path, img_dest)
                create_pascal_voc_xml(image_path, bbox, tumor_class, ann_dest)

                print(f"{split}: {os.path.basename(image_path)} -> {tumor_class}")

    print("Dataset processing completed successfully.")

Main Preprocessing Function

In [40]:
# Main function to run full preprocessing pipeline
def main():
    print("Starting dataset preprocessing...")
    os.makedirs(OUTPUT_PATH, exist_ok=True)

    if not os.path.exists(RAW_DATASET_PATH):
        print(f"Error: RAW_DATASET_PATH does not exist: {RAW_DATASET_PATH}")
        return

    process_dataset()

    # Count and plot class distributions
    before_balancing_counts = count_images_per_class(RAW_DATASET_PATH)
    after_balancing_counts = count_images_per_class(os.path.join(OUTPUT_PATH, "Train/Images"))
    plot_class_distributions(before_balancing_counts, after_balancing_counts)

    print("\n Preprocessing complete! Final folder structure:")
    check_folder_structure(OUTPUT_PATH)

# Execute main function
main()


Starting dataset preprocessing...


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/DSGP/CNN_Dataset/Train/Images/tuberculoma'

Visualize Class Imbalance handling

In [None]:
# Count images per class before and after balancing
def count_images_per_class(base_path):
    """Count the number of images in each class directory."""
    class_counts = {}
    for tumor_class in tumor_classes:
        class_path = os.path.join(base_path, tumor_class)
        num_images = len([img for img in os.listdir(class_path) if img.lower().endswith(('.png', '.jpg', '.jpeg'))])
        class_counts[tumor_class] = num_images
    return class_counts

# Count images before and after balancing
before_balancing_counts = count_images_per_class(RAW_DATASET_PATH)
after_balancing_counts = count_images_per_class(os.path.join(OUTPUT_PATH, "Train/Images"))

print("Class counts before balancing:", before_balancing_counts)
print("Class counts after balancing:", after_balancing_counts)

In [None]:
def plot_class_distributions(before_counts, after_counts):
    """Plot side-by-side comparison of class distributions."""
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # Before balancing
    axes[0].bar(before_counts.keys(), before_counts.values(), color='skyblue')
    axes[0].set_title("Class Distribution Before Balancing")
    axes[0].set_xlabel("Tumor Classes")
    axes[0].set_ylabel("Number of Images")
    axes[0].tick_params(axis='x', rotation=45)

    # After balancing
    axes[1].bar(after_counts.keys(), after_counts.values(), color='salmon')
    axes[1].set_title("Class Distribution After Balancing")
    axes[1].set_xlabel("Tumor Classes")
    axes[1].set_ylabel("Number of Images")
    axes[1].tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

# Plot class distributions
plot_class_distributions(before_balancing_counts, after_balancing_counts)