<a href="https://colab.research.google.com/github/TharinsaMudalige/Neuron-Brain_Tumor_Detection_Classification_with_XAI/blob/Detection-Classficiation-CNN/Preprocessing_for_Faster_R_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import Libraries

In [40]:
!pip install opencv-python-headless matplotlib pandas SimpleITK
import os
import cv2
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
from xml.dom import minidom
import matplotlib.pyplot as plt
import SimpleITK as sitk
from google.colab import drive
from sklearn.model_selection import train_test_split
import zipfile
import seaborn as sns
import gc



Mount Google Drive

In [41]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Folder Structure Setup

In [42]:
dataset_zip_path = "/content/drive/My Drive/DSGP/DSGP_Dataset.zip"
extracted_dataset_path = "/content/drive/My Drive/DSGP/original_dataset"

# Unzip dataset if not already extracted
if not os.path.exists(extracted_dataset_path):
    with zipfile.ZipFile(dataset_zip_path, 'r') as zip_ref:
        zip_ref.extractall(extracted_dataset_path)

base_dir = '/content/drive/My Drive/DSGP/Preprocessed Dataset'

# Create main folders
folders = [
    'Images/Train/Tumor',
    'Images/Train/No_Tumor',
    'Images/Val/Tumor',
    'Images/Val/No_Tumor',
    'Images/Test/Tumor',
    'Images/Test/No_Tumor',
    'Annotations/Train',
    'Annotations/Val',
    'Annotations/Test'
]

for folder in folders:
    os.makedirs(os.path.join(base_dir, folder), exist_ok=True)

Annotation Creation

In [43]:
def find_tumor_bbox(img):
    """Automatically detect tumor region using intensity thresholding"""
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU)
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    if contours:
        largest = max(contours, key=cv2.contourArea)
        x,y,w,h = cv2.boundingRect(largest)
        return [x,y,x+w,y+h]
    return [0,0,0,0]  # For non-tumor

def create_xml(image_path, class_name, box, size):
    root = ET.Element("annotation")
    ET.SubElement(root, "filename").text = os.path.basename(image_path)
    size_elem = ET.SubElement(root, "size")
    ET.SubElement(size_elem, "width").text = str(size[1])
    ET.SubElement(size_elem, "height").text = str(size[0])
    ET.SubElement(size_elem, "depth").text = "3"
    obj = ET.SubElement(root, "object")
    ET.SubElement(obj, "name").text = class_name
    ET.SubElement(obj, "pose").text = "Unspecified"
    bndbox = ET.SubElement(obj, "bndbox")
    ET.SubElement(bndbox, "xmin").text = str(box[0])
    ET.SubElement(bndbox, "ymin").text = str(box[1])
    ET.SubElement(bndbox, "xmax").text = str(box[2])
    ET.SubElement(bndbox, "ymax").text = str(box[3])
    xml_str = ET.tostring(root)
    xml_pretty = minidom.parseString(xml_str).toprettyxml()
    return xml_pretty

Preprocessing Functions

In [44]:
#Skull Stripping
def skull_stripping(img):
    sitk_img = sitk.GetImageFromArray(img)
    sitk_img = sitk.Cast(sitk_img, sitk.sitkFloat32)
    mask = sitk.OtsuThreshold(sitk_img, 0, 1, 200)
    mask = sitk.BinaryMorphologicalClosing(mask, [3]*3)
    return sitk.GetArrayFromImage(sitk.Mask(sitk_img, mask))

In [45]:
#Normalization
def normalize_image(img):
    p2, p98 = np.percentile(img, (2, 98))
    img = np.clip(img, p2, p98)
    return cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX)

In [46]:
#Preprocessing Pipeline
def preprocess_pipeline(img_path, target_size=(256, 256)):
    img = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB)
    skull_free = skull_stripping(img)
    normalized = normalize_image(skull_free)
    return cv2.resize(normalized, target_size)

Data Augmentation

In [47]:
def augment_image(img):
    h,w = img.shape[:2]
    center = (np.random.randint(w//4, 3*w//4),
             np.random.randint(h//4, 3*h//4))
    scale = np.random.uniform(0.9, 1.1)

    M = cv2.getRotationMatrix2D(center, np.random.randint(-15,15), scale)
    return cv2.warpAffine(img, M, (w,h))

Preprocessing Function

In [48]:
def process_dataset():
    data = []
    # Collect data with proper class naming
    for folder_name in ['no tumour', 'tumour']:  # Original folder names with space
        class_path = os.path.join(extracted_dataset_path, folder_name)
        class_label = 'No_Tumor' if 'no ' in folder_name else 'Tumor'  # Standardized labels

        for root, _, files in os.walk(class_path):
            for file in files:
                if file.lower().endswith(('.png', '.jpg', '.jpeg')):
                    data.append({
                        'path': os.path.join(root, file),
                        'class': class_label  # Use standardized label
                    })

    df = pd.DataFrame(data)

    # Debugging: Check DataFrame content
    print("First 5 rows of df:", df.head())
    print("Columns in df:", df.columns)
    print("Total Images Found:", len(df))

    # If DataFrame is empty, stop execution
    if df.empty:
        print("Error: No images found! Check dataset extraction path.")
        return

    # Class balancing with standardized labels
    tumor_df = df[df['class'] == 'Tumor']
    no_tumor_df = df[df['class'] == 'No_Tumor']

    if len(tumor_df) < len(no_tumor_df):
        tumor_df = tumor_df.sample(len(no_tumor_df), replace=True)
    else:
        no_tumor_df = no_tumor_df.sample(len(tumor_df), replace=True)

    balanced_df = pd.concat([tumor_df, no_tumor_df])

    # Data splitting
    train_df, temp_df = train_test_split(balanced_df, test_size=0.3, stratify=balanced_df['class'])
    val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['class'])

    # Processing loop
    for split_df, split_name in zip([train_df, val_df, test_df], ['Train', 'Val', 'Test']):
        for idx, row in split_df.iterrows():
            try:
                # Preprocess image
                processed_img = preprocess_pipeline(row['path'])

                # Augment only tumor training images
                if row['class'] == 'Tumor' and split_name == 'Train':
                    processed_img = augment_image(processed_img)

                # Save image
                img_filename = f"{split_name.lower()}_{idx}.png"
                img_save_path = f"{base_dir}/Images/{split_name}/{row['class']}/{img_filename}"
                cv2.imwrite(img_save_path, cv2.cvtColor(processed_img, cv2.COLOR_RGB2BGR))

                # Generate bounding boxes
                bbox = find_tumor_bbox(processed_img) if row['class'] == 'Tumor' else [0,0,0,0]

                # Create XML
                xml_content = create_xml(img_filename, row['class'], bbox, processed_img.shape)
                xml_save_path = f"{base_dir}/Annotations/{split_name}/{img_filename.split('.')[0]}.xml"
                with open(xml_save_path, 'w') as f:
                    f.write(xml_content)

            except Exception as e:
                print(f"Error processing {row['path']}: {str(e)}")

Run Preprocessing

In [49]:
process_dataset()
print("Preprocessing complete! All files saved in:", base_dir)

First 5 rows of df:                                                 path     class
0  /content/drive/My Drive/DSGP/original_dataset/...  No_Tumor
1  /content/drive/My Drive/DSGP/original_dataset/...  No_Tumor
2  /content/drive/My Drive/DSGP/original_dataset/...  No_Tumor
3  /content/drive/My Drive/DSGP/original_dataset/...  No_Tumor
4  /content/drive/My Drive/DSGP/original_dataset/...  No_Tumor
Columns in df: Index(['path', 'class'], dtype='object')
Total Images Found: 300
Error processing /content/drive/My Drive/DSGP/original_dataset/tumour/glioma/Ganglioglioma/7._big_gallery.jpeg: OpenCV(4.10.0) /io/opencv/modules/imgproc/src/thresh.cpp:1559: error: (-2:Unspecified error) in function 'double cv::threshold(cv::InputArray, cv::OutputArray, double, double, int)'
> THRESH_OTSU mode:
>     'src_type == CV_8UC1 || src_type == CV_16UC1'
> where
>     'src_type' is 5 (CV_32FC1)

Error processing /content/drive/My Drive/DSGP/original_dataset/tumour/glioma/Ganglioglioma/1e1755f1923f3f302665d72