<div style="background-color:#f0f8ff; padding:15px; border-radius:8px; font-size:16px; line-height:1.6;">

  <h3 style="margin-top:0;">Author: Nikolin Prenga</h3>

  <p><strong>Summary:</strong></p>

  <p>
    This notebook organizes and prepares the full dataset for training <strong>YOLO</strong>.
    It includes three main steps:
  </p>

  <ol>
    <li>
      Generating YOLO-format annotations from binary tumor masks and no-tumor images using bounding boxes 
      with normalized coordinates: <code>x_center, y_center, width, height</code>.
    </li>
    <li>
      Splitting the full dataset into training and validation sets (default 80/20), 
      while keeping all four categories: <em>glioma</em>, <em>meningioma</em>, <em>pituitary</em>, and <em>no tumor</em>.
    </li>
    <li>
      Moving the test dataset from <code>Data_organize_Nikolin</code> into the final structure 
      <code>Yolo_v11_Nikolin_Dataset</code>, including images, masks, and YOLO label files.
    </li>
  </ol>

  <p>
    The resulting dataset structure is suitable for training and evaluating both <strong>YOLOv11</strong> and <strong>YOLOv12</strong>.
  </p>
</div>


In [2]:
import numpy as np
import cv2
import os   
import matplotlib.pyplot as plt
from PIL import Image
import shutil
import yaml
from ultralytics import YOLO



In [3]:
# Function to delete files in a directory

def delete_one_file(directory):
    """
    Remove every regular file located directly inside `directory`.

    Despite the name, the loop does not stop after the first file (the early
    exit is disabled), so all files in the folder are removed. Sub-directories
    are left untouched. Each removed path is reported on stdout.

    Args:
        directory (str): Folder whose files are deleted.
    """
    # os.listdir is evaluated once, up front, so deleting while iterating is safe.
    candidates = (os.path.join(directory, entry) for entry in os.listdir(directory))
    for target in candidates:
        if not os.path.isfile(target):
            continue  # skip sub-directories and anything that is not a plain file
        os.remove(target)
        print(f"Deleted: {target}")

#delete_one_file('Yolo_v11_Nikolin_Dataset/val/images')

In [None]:
# The purpose of this function copy_files_only is to create a new dataset by copying
# images and masks from the brisc2025 dataset to my folder Data_organize_Nikolin under train and test, 
# each with a subdirectory: images and masks.
# This function copies images and masks from the original directory to a target one we're organizing.
# It removes unnecessary parts of the filename, keeping only the ID and a suffix (e.g., _gl_ for glioma, _me_ for meningioma).
# It retains all anatomical planes: Axial, Coronal, and Sagittal, along with their respective suffixes.

def copy_files_only(src_folder_image,
                    src_folder_mask,
                    dest_folder_image,
                    dest_folder_mask):
    """
    Copy image/mask pairs into the destination folders under shortened names.

    For every regular file in `src_folder_image`, the mask with the same
    filename is looked up in `src_folder_mask`. Both are decoded with OpenCV
    and written to the destination folders. The new name is "_" followed by
    the last four "_"-separated parts of the original name
    (e.g. '_00001_gl_ax_t1.jpg'); names with fewer than four parts are kept
    unchanged.

    Args:
        src_folder_image (str): Folder containing the source images.
        src_folder_mask (str): Folder containing masks named like the images.
        dest_folder_image (str): Output folder for images (created if missing).
        dest_folder_mask (str): Output folder for masks (created if missing).

    Pairs whose image or mask cannot be decoded are reported and skipped.
    """
    # cv2.imwrite does not create missing folders; it just fails, so make sure
    # both output locations exist before the loop starts.
    os.makedirs(dest_folder_image, exist_ok=True)
    os.makedirs(dest_folder_mask, exist_ok=True)

    for filename in sorted(os.listdir(src_folder_image)):
        src_image_path = os.path.join(src_folder_image, filename)
        src_mask_path = os.path.join(src_folder_mask, filename)

        if not os.path.isfile(src_image_path):
            continue

        # Shorten the filename by keeping only the trailing four parts.
        parts = filename.split("_")
        if len(parts) >= 4:
            new_filename = "_" + "_".join(parts[-4:])
        else:
            new_filename = filename  # fallback if format doesn't match

        # NOTE(review): files are decoded and re-encoded rather than copied
        # byte-for-byte; for lossy formats such as JPEG this is presumably not
        # pixel-exact -- confirm this is acceptable for the masks.
        image = cv2.imread(src_image_path, cv2.IMREAD_COLOR)
        mask = cv2.imread(src_mask_path, cv2.IMREAD_UNCHANGED)

        if image is None or mask is None:
            print(f'Image or mask is None for {filename}')
            continue

        destination_image_path = os.path.join(dest_folder_image, new_filename)
        destination_mask_path = os.path.join(dest_folder_mask, new_filename)

        # imwrite reports failure through its return value; do not claim
        # success unless both files were actually written.
        image_written = cv2.imwrite(destination_image_path, image)
        mask_written = cv2.imwrite(destination_mask_path, mask)
        if not (image_written and mask_written):
            print(f"Failed to write {new_filename} (source: {filename})")
            continue

        print(f"Copied {filename} as {new_filename}")

            


In [5]:
# Source and destination folders for the images and masks of the training set.
src_folder_image_train= 'brisc2025/segmentation_task/train/images'
src_folder_mask_train = 'brisc2025/segmentation_task/train/masks'
dest_folder_image_train = 'Data_organize_Nikolin/train/images'
dest_folder_mask_train = 'Data_organize_Nikolin/train/masks'

# One-off step (kept disabled): copy and rename the training files from the
# source folders to the destination folders.
# copy_files_only(src_folder_image_train,
#                     src_folder_mask_train,
#                     dest_folder_image_train, 
#                     dest_folder_mask_train)

# Source and destination folders for the images and masks of the testing/unseen set.

src_folder_image_test = 'brisc2025/segmentation_task/test/images'
src_folder_mask_test = 'brisc2025/segmentation_task/test/masks'
dest_folder_image_test = 'Data_organize_Nikolin/test/images'
dest_folder_mask_test = 'Data_organize_Nikolin/test/masks'

# One-off step (kept disabled): copy and rename the test files from the
# source folders to the destination folders.
# copy_files_only(src_folder_image_test,
#                     src_folder_mask_test,
#                     dest_folder_image_test, 
#                     dest_folder_mask_test)

# Sanity check: number of entries in the test image folder and the test mask folder.
# The two counts can differ because no-tumor images are added to 'images' only
# and have no mask.
len(os.listdir(dest_folder_image_test)), len(os.listdir(dest_folder_mask_test))

(1000, 860)

In [None]:
# Above, all images and masks were copied to Data_organize_Nikolin from brisc2025/segmentation
# Since no-tumor images are not included there, we copy them from the classification folder
# These are moved to their respective 'train' and 'test' folders under 'images' in Data_organize_Nikolin
def copy_no_tumor_files(src_folder_image, dest_folder_image):
    """
    Copy no-tumor images into `dest_folder_image` under shortened names.

    Every regular file in `src_folder_image` is decoded with OpenCV and
    written to `dest_folder_image`. The new name is "_" followed by the last
    four "_"-separated parts of the original name; names with fewer than four
    parts are kept unchanged.

    Args:
        src_folder_image (str): Folder containing the no-tumor images.
        dest_folder_image (str): Output folder (created if missing).

    Files that cannot be decoded or written are reported and skipped.
    """
    # cv2.imwrite fails instead of creating a missing folder, so create it here.
    os.makedirs(dest_folder_image, exist_ok=True)

    for filename in sorted(os.listdir(src_folder_image)):
        src_image_path = os.path.join(src_folder_image, filename)

        if not os.path.isfile(src_image_path):
            continue

        parts = filename.split("_")
        if len(parts) >= 4:
            new_filename = "_" + "_".join(parts[-4:])
        else:
            new_filename = filename  # fallback

        image = cv2.imread(src_image_path, cv2.IMREAD_COLOR)
        if image is None:
            print(f"Image is None for {filename}")
            continue

        dest_image_path = os.path.join(dest_folder_image, new_filename)
        # Only report success when imwrite confirms the file was written.
        if not cv2.imwrite(dest_image_path, image):
            print(f"Failed to write: {dest_image_path}")
            continue
        print(f"Saved: {dest_image_path}")


# Source folders holding the no_tumor images of the training and testing sets.
src_folder_no_tumor_train = 'brisc2025/classification_task/train/no_tumor'
src_folder_no_tumor_test='brisc2025/classification_task/test/no_tumor'

# One-off step (kept disabled): copy the no-tumor images into the training image folder.
# copy_no_tumor_files(src_folder_no_tumor_train, dest_folder_image_train)

# One-off step (kept disabled): copy the no-tumor images into the testing image folder.
#copy_no_tumor_files(src_folder_no_tumor_test, dest_folder_image_test)


In [None]:
# This function generates bounding boxes from a binary mask created by radiologists or physicians.
# It takes a mask as input and returns a list of bounding boxes in (x_min, y_min, width, height) format.
# These boxes are suitable for use in annotation tools and object detection models.

def mask_to_bboxes(mask, min_area=180, retrieval_mode=None):
    """
    Convert a segmentation mask into pixel-space bounding boxes.

    Args:
        mask (np.ndarray): 2-D mask, or 3-D (H, W, C) mask; any pixel with a
            non-zero value counts as foreground.
        min_area (int): Minimum box area (width * height, in pixels) to keep;
            smaller boxes are treated as noise and dropped.
        retrieval_mode (int | None): OpenCV contour retrieval mode. None keeps
            the original behaviour, cv2.RETR_TREE, which also returns the
            contours of holes inside a region. Pass cv2.RETR_EXTERNAL to get
            one box per outer region only.

    Returns:
        list[tuple]: Boxes as (x_min, y_min, width, height). Empty when the
        mask is None, empty, or has no foreground.
    """
    if mask is None or mask.size == 0:
        return []

    # Step 1: collapse to a single-channel binary mask (0 and 255, uint8).
    if mask.ndim == 3:
        # Per-pixel max over the first three channels: a pixel is foreground if
        # any colour channel is non-zero. Only the first three channels are
        # used so a trailing alpha channel cannot mark the whole image.
        mask = mask[:, :, :3].max(axis=2)
    mask_bin = (mask > 0).astype(np.uint8) * 255

    if not mask_bin.any():
        return []  # nothing to trace; skip the contour search entirely

    # Step 2: find contours. [-2] picks the contour list whether findContours
    # returns two values or three (older OpenCV releases).
    if retrieval_mode is None:
        retrieval_mode = cv2.RETR_TREE
    contours = cv2.findContours(mask_bin, retrieval_mode, cv2.CHAIN_APPROX_SIMPLE)[-2]

    # Step 3: extract and filter bounding boxes.
    bboxes = []
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        if w * h >= min_area:  # ignore tiny noise
            bboxes.append((x, y, w, h))
    return bboxes

In [None]:
# To properly train on "no tumor" cases, we need bounding boxes — but what should they cover?
# We propose generating large bounding boxes that enclose the skull, effectively separating it from the background.
def get_skull_bbox(image_path, threshold=80, kernel_size=5):
    """
    Return the bounding box of the largest bright region in an MRI image.

    The image is converted to grayscale, thresholded, cleaned with a
    morphological closing, and the external contour with the largest area is
    boxed. The notebook uses this box as the "skull" box for no-tumor images.

    Args:
        image_path (str): Path to the input image.
        threshold (int): Gray level above which a pixel counts as foreground.
        kernel_size (int): Side length of the square closing kernel.

    Returns:
        tuple | None: (x_min, y_min, width, height) in pixels, or None when
        the file is missing or unreadable, or no contour is found.
    """
    if not os.path.isfile(image_path):
        return None

    image = cv2.imread(image_path)
    if image is None:
        # imread signals an undecodable file with None instead of raising.
        return None
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    _, thresh = cv2.threshold(gray, threshold, 255, cv2.THRESH_BINARY)
    kernel = np.ones((kernel_size, kernel_size), np.uint8)
    cleaned = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)

    # [-2] picks the contour list whether findContours returns two values or
    # three (older OpenCV releases).
    contours = cv2.findContours(cleaned, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]

    if not contours:
        return None  # No contour found

    largest_contour = max(contours, key=cv2.contourArea)
    x, y, w, h = cv2.boundingRect(largest_contour)

    return x, y, w, h

In [None]:
# This function generates YOLO-format labels from image-mask pairs.
# For "no tumor" cases, it uses a skull-based bounding box.
# For tumor cases, it uses the mask to generate bounding boxes via mask_to_bboxes().
# Class labels are assigned based on filename suffix: 0 = glioma, 1 = meningioma, 2 = pituitary, 3 = no tumor.
# Each object is represented in YOLO format as: 
# class_id, x_center_normalized, y_center_normalized, width_normalized, height_normalized


def _bbox_to_yolo_line(class_id, bbox, image_width, image_height):
    """Format one pixel-space (x_min, y_min, width, height) box as a YOLO label line."""
    x_min, y_min, width, height = bbox
    x_center_normalized = (x_min + width / 2) / image_width
    y_center_normalized = (y_min + height / 2) / image_height
    width_normalized = width / image_width
    height_normalized = height / image_height
    return f'{class_id} {x_center_normalized} {y_center_normalized} {width_normalized} {height_normalized}\n'


def make_anotations_from_masks(image_source, mask_source, annotations_path, mask_to_bboxes):
    """
    Write one YOLO label file per image into `annotations_path`.

    Only files ending in '.jpg' or '.png' are processed. The class comes from
    the filename tag: '_gl_' -> 0, '_me_' -> 1, '_pi_' -> 2, '_no_' -> 3.
    No-tumor images get a single box from get_skull_bbox(); tumor images get
    one box per entry returned by `mask_to_bboxes` for the mask that has the
    same filename in `mask_source`. Each line is
    "class x_center y_center width height", normalized by the image size.

    Args:
        image_source (str): Path to the source directory containing images.
        mask_source (str): Path to the source directory containing masks.
        annotations_path (str): Directory where label files are saved
            (created if missing).
        mask_to_bboxes (function): Maps a mask array to a list of
            (x_min, y_min, width, height) boxes in pixels.

    Images that cannot be read, have no readable mask, whose mask size differs
    from the image size, or that yield no boxes are reported and get no label
    file.
    """
    os.makedirs(annotations_path, exist_ok=True)

    # Checked in this order, after the '_no_' tag, as in the class mapping above.
    tumor_classes = {'_gl_': 0, '_me_': 1, '_pi_': 2}
    no_tumor_class = 3
    written = 0

    for image_name in sorted(os.listdir(image_source)):
        if not image_name.endswith(('.jpg', '.png')):
            continue

        image_path = os.path.join(image_source, image_name)
        annotation_file = os.path.join(annotations_path, os.path.splitext(image_name)[0] + '.txt')

        if '_no_' in image_name:
            # The image is read here only for its size; the box itself comes
            # from get_skull_bbox, which takes the path.
            im_no_tumor = cv2.imread(image_path)
            if im_no_tumor is None:
                print(f"Image not found for {image_name}. Skipping.")
                continue

            bbox_no_tumor = get_skull_bbox(image_path)
            if bbox_no_tumor is None:
                print(f"No bounding box found for {image_name}. Skipping.")
                continue

            height_image, width_image = im_no_tumor.shape[:2]
            with open(annotation_file, 'w') as f:
                f.write(_bbox_to_yolo_line(no_tumor_class, bbox_no_tumor, width_image, height_image))
            written += 1
            continue

        # Assign class based on image name (first matching tag wins).
        class_name = next((class_id for tag, class_id in tumor_classes.items() if tag in image_name), None)
        if class_name is None:
            print(f"Unknown class in {image_name}. Skipping.")
            continue

        # Read the corresponding mask (same filename as the image).
        mask = cv2.imread(os.path.join(mask_source, image_name))
        if mask is None:
            print(f"Mask not found for {image_name}. Skipping.")
            continue
        image = cv2.imread(image_path)
        if image is None:
            print(f"Image not found for {image_name}. Skipping.")
            continue

        if image.shape[:2] != mask.shape[:2]:
            print(f"Image and mask dimensions do not match for {image_name}. Skipping.")
            continue
        height_image, width_image = image.shape[:2]

        # Convert mask to bounding boxes
        bboxes = mask_to_bboxes(mask)
        if not bboxes:
            print(f"No bounding boxes found for {image_name}. Skipping.")
            continue

        with open(annotation_file, 'w') as f:
            f.writelines(_bbox_to_yolo_line(class_name, bbox, width_image, height_image) for bbox in bboxes)
        written += 1

    print(f"Wrote {written} label files to {annotations_path}")
            
            
# Image, mask and label folders of the training set used for label generation.


image_source_train ='Data_organize_Nikolin/train/images'
mask_source_train ='Data_organize_Nikolin/train/masks'
annotations_path_train = 'Data_organize_Nikolin/train/labels'

# One-off step (kept disabled): generate the YOLO label files for the training set.
#make_anotations_from_masks(image_source_train, mask_source_train, annotations_path_train, mask_to_bboxes)

# Image, mask and label folders of the test set used for label generation.
image_source_test = 'Data_organize_Nikolin/test/images'
mask_source_test = 'Data_organize_Nikolin/test/masks'
annotations_path_test = 'Data_organize_Nikolin/test/labels'

# One-off step (kept disabled): generate the YOLO label files for the test set.
#make_anotations_from_masks(image_source_test, mask_source_test, annotations_path_test, mask_to_bboxes)

# Optional check (kept disabled): number of label files written for train and test.
#print(len(os.listdir(annotations_path_train)), len(os.listdir(annotations_path_test))   )





In [None]:
# This function splits a large training dataset into two parts: training and validation.
# The dataset contains four categories: glioma, meningioma, pituitary tumor, and no tumor.
# Each image is categorized based on its filename suffix: '_gl_', '_me_', '_pi_', '_no_'.
# A fixed percentage (default 20%) of images is randomly selected from each category to form the validation set.
# The remaining 80% per category is used as the updated training set.
#
# All selected images and their corresponding label files are copied into two newly created directories:
# - Yolo_v11_Nikolin_Dataset/train/images and /train/labels
# - Yolo_v11_Nikolin_Dataset/val/images and /val/labels
#
# This ensures the resulting dataset is structured for YOLO training with clean separation between training and validation.

#
def _copy_images_with_labels(image_names, images_dir, labels_dir, dest_images_dir, dest_labels_dir):
    """Copy each named image and, when present, its same-stem '.txt' label file."""
    for image_name in image_names:
        source_image = os.path.join(images_dir, image_name)
        if not os.path.isfile(source_image):
            continue
        shutil.copy(source_image, os.path.join(dest_images_dir, image_name))

        # Label files share the image's stem; splitext only touches the real
        # extension, unlike replacing '.jpg'/'.png' anywhere in the name.
        label_file = os.path.splitext(image_name)[0] + '.txt'
        source_label = os.path.join(labels_dir, label_file)
        if os.path.isfile(source_label):
            shutil.copy(source_label, os.path.join(dest_labels_dir, label_file))
        else:
            print(f"Label file not found for {image_name}. Skipping label copy.")


def _make_val_dataset(main_train_folder_images,
                      main_train_folder_labels,
                      train_new_folder_images,
                      train_new_folder_labels,
                      val_new_folder_images,
                      val_new_folder_labels,
                      val_ratio=0.20,
                      seed=None):
    """
    Split the main training set into new train/val folders, per category.

    Images are grouped by filename tag ('_gl_', '_me_', '_pi_', '_no_'; the
    first matching tag wins). From each group, int(len(group) * val_ratio)
    images are drawn at random for validation and the rest go to training.
    Images and their label files (same stem, '.txt') are copied, not moved.

    Args:
        main_train_folder_images (str): Folder with all training images.
        main_train_folder_labels (str): Folder with the matching label files.
        train_new_folder_images (str): Output folder for training images.
        train_new_folder_labels (str): Output folder for training labels.
        val_new_folder_images (str): Output folder for validation images.
        val_new_folder_labels (str): Output folder for validation labels.
        val_ratio (float): Fraction of each category used for validation.
        seed (int | None): Seed for the random split. Pass an integer to get
            the same split on every run; None draws a fresh split.

    Raises:
        ValueError: If `val_ratio` is outside [0, 1].

    Output folders are created if missing. Files not matching any tag are
    ignored.
    """
    if not 0 <= val_ratio <= 1:
        raise ValueError(f"val_ratio must be between 0 and 1, got {val_ratio}")

    output_folders = (train_new_folder_images, train_new_folder_labels,
                      val_new_folder_images, val_new_folder_labels)
    for folder in output_folders:
        os.makedirs(folder, exist_ok=True)

    # Files left over from an earlier, different random split would stay in
    # place, so the same image could end up in both train and val.
    for folder in (train_new_folder_images, val_new_folder_images):
        if os.listdir(folder):
            print(f"Warning: {folder} is not empty; files from an earlier split may leak between train and val.")

    # Sorted listing + seeded generator => the split depends only on the seed.
    rng = np.random.default_rng(seed)
    main_folder_images = sorted(
        name for name in os.listdir(main_train_folder_images)
        if os.path.isfile(os.path.join(main_train_folder_images, name))
    )

    category_tags = {'glioma': '_gl_', 'meningioma': '_me_', 'pituitary': '_pi_', 'no_tumor': '_no_'}
    grouped = {category: [] for category in category_tags}
    for image_name in main_folder_images:
        for category, tag in category_tags.items():
            if tag in image_name:
                grouped[category].append(image_name)
                break

    print(f'{sum(len(names) for names in grouped.values())} total images in the training set')
    print(', '.join(f'{len(names)} {category} images' for category, names in grouped.items()))

    images_val, images_train = [], []
    for category, names in grouped.items():
        val_count = int(len(names) * val_ratio)
        shuffled = [names[index] for index in rng.permutation(len(names))]
        images_val.extend(shuffled[:val_count])
        images_train.extend(shuffled[val_count:])
        print(f'{val_count} {category} validation images')

    print(f'{len(images_val)} total images in the validation set')
    print(f'{len(images_train)} total images in the training set after validation split')

    # Copy validation images and labels to the new validation folders.
    _copy_images_with_labels(images_val, main_train_folder_images, main_train_folder_labels,
                             val_new_folder_images, val_new_folder_labels)

    # Copy the remaining training images and labels to the new training folders.
    _copy_images_with_labels(images_train, main_train_folder_images, main_train_folder_labels,
                             train_new_folder_images, train_new_folder_labels)


main_train_folder_images= 'Data_organize_Nikolin/train/images'
main_train_folder_labels = 'Data_organize_Nikolin/train/labels'
train_new_folder_images ='Yolo_v11_Nikolin_Dataset/train/images'
train_new_folder_labels ='Yolo_v11_Nikolin_Dataset/train/labels'
val_new_folder_images ='Yolo_v11_Nikolin_Dataset/val/images'
val_new_folder_labels ='Yolo_v11_Nikolin_Dataset/val/labels'

# _make_val_dataset(main_train_folder_images,
#                       main_train_folder_labels,
#                       train_new_folder_images,
#                         train_new_folder_labels,
#                         val_new_folder_images,
#                         val_new_folder_labels,
#                         val_ratio=0.20)

In [None]:
# In this function just_copy_from_one_directory_to_another, we aim to copy files from one directory to another.
# In the Data_organize_Nikolin folder, we want to move all the testing dataset into Yolo_v11_Nikolin_Dataset.
# This Yolo_v11_Nikolin_Dataset will be chosen as the final dataset for training YOLOv11 and YOLOv12 and for performing inference.

def just_copy_from_one_directory_to_another(source_directory, destination_directory):
    """
    Copy every regular file found directly in `source_directory` into
    `destination_directory`, keeping the filenames.

    The destination folder is created when it does not exist yet.
    Sub-directories of the source are not copied.

    Args:
        source_directory (str): Folder to copy files from.
        destination_directory (str): Folder to copy files into.
    """
    os.makedirs(destination_directory, exist_ok=True)

    for entry_name in os.listdir(source_directory):
        origin = os.path.join(source_directory, entry_name)
        if not os.path.isfile(origin):
            continue  # only plain files are copied
        shutil.copy(origin, os.path.join(destination_directory, entry_name))


In [None]:
# Test images: source folder in Data_organize_Nikolin and target folder in the final dataset.
source_directory_images = 'Data_organize_Nikolin/test/images'
destination_directory_images = 'Yolo_v11_Nikolin_Dataset/test/images'
# One-off step (kept disabled): copy the test images.
#just_copy_from_one_directory_to_another(source_directory_images,destination_directory_images )

# Test masks: source folder and target folder.
source_directory_masks_test = 'Data_organize_Nikolin/test/masks'
destination_directory_mask_test = 'Yolo_v11_Nikolin_Dataset/test/masks'
# One-off step (kept disabled): copy the test masks.
#just_copy_from_one_directory_to_another(source_directory_masks_test,destination_directory_mask_test )

# Test YOLO label files: source folder and target folder.
source_directory_labels_test ='Data_organize_Nikolin/test/labels'
destination_directory_labels_test = 'Yolo_v11_Nikolin_Dataset/test/labels'
# One-off step (kept disabled): copy the test labels.
#just_copy_from_one_directory_to_another(source_directory_labels_test, destination_directory_labels_test)


