## Process BUSSET

Imports

In [13]:
import SimpleITK as sitk
import pandas as pd
from typing import Tuple, Dict, Any
from tqdm import tqdm
import numpy as np
import cv2
import os

Basic parameters

In [3]:
dataset_name = "busset" # This dataset has already the slices and masks created.

output_folder = f"datasets/{dataset_name}_png"
slice_min_lesion_px = 25

#use_classes = False  #If false, only one class is used (0)
val_frac = 0.2 # Validation fraction of the data

In [4]:
images_folder = os.path.join(output_folder, "images")
masks_folder = os.path.join(output_folder, "masks")

In [5]:
print("No classes available!")
use_classes = False
classes = []

No classes available!


### Create YOLO data

Format YOLOv8 segement:
- https://docs.ultralytics.com/datasets/segment/
- `<class-index> <x1> <y1> <x2> <y2> ... <xn> <yn>`
- Others: https://docs.ultralytics.com/yolov5/tutorials/train_custom_data/#13-prepare-dataset-for-yolov5

Format YOLOv8 detect:
- https://docs.ultralytics.com/datasets/detect/
- `<object-class> <x> <y> <width> <height>`

In [6]:
segmentation_data_path = os.path.join(output_folder, f"yolo_seg_data")
detection_data_path = os.path.join(output_folder, f"yolo_det_data")

In [21]:
import shutil

# Create folders
os.makedirs(segmentation_data_path, exist_ok=True)
os.makedirs(detection_data_path, exist_ok=True)
        
# List of image cases
list_cases = os.listdir(images_folder)

# Get classes
if not use_classes or any(len(item.split('_')) != 3 for item in list_cases):
    use_classes = False
    print("Classes are not used")
else:
    classes = sorted(set([ item[:-4].split('_')[-1] for item in list_cases]))
    print(classes)
    
# For each image
for image in tqdm(list_cases):
    
    # Check image name format
    # assert len(image[:-4].split("_")) == 3 (For busset there is a different format!)
    
    # Get class label
    if use_classes:
        label = image[:-4].split("_")[-1]
        label_index = classes.index(label)
    else:
        label_index = 0
    
    # Read mask
    mask_grey = cv2.imread(os.path.join(masks_folder, image), cv2.IMREAD_GRAYSCALE)
    mask = (mask_grey > 0).astype("uint8")
    mask_size = mask.shape[1], mask.shape[0]
    
    # Get contours
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    contours_list = [contour.squeeze().tolist() for contour in contours]
    
    if len(contours_list) > 1:
        print(f"More than 1 coutour found for image {image}")

    # Copy image files
    shutil.copyfile(os.path.join(images_folder, image), os.path.join(detection_data_path, image))
    shutil.copyfile(os.path.join(images_folder, image), os.path.join(segmentation_data_path, image))

    # Oepn label files
    det_fp = open(os.path.join(detection_data_path, f"{image[:-4]}.txt"), "w")
    seg_fp = open(os.path.join(segmentation_data_path, f"{image[:-4]}.txt"), "w")
        
    # For each countour
    for contour in contours_list:
        
        if len(contour) < 3:
            print(f'Incomplete countour in image {image}')
            continue
        
        # Get lesion countour data
        x, y = [point[0] for point in contour], [point[1] for point in contour] #TODO: use numpy
        x1, x2, y1, y2 = min(x), max(x), min(y), max(y)
        center_x, center_y = (x1+x2)/(2*mask_size[0]), (y1+y2)/(2*mask_size[1])
        width, height = (x2-x1)/mask_size[0], (y2-y1)/mask_size[1]
        
        # Check lesion size
        if width < 0.001 or height < 0.001:
            print(f"Small lesion found in image ({image})")
            continue
        
        # Write labels
        det_str = f"{center_x:0.6f} {center_y:0.6f} {width:0.6f} {height:0.6f}"
        det_fp.write(f"{label_index} {det_str}\n")   
        seg_str = " ".join([f"{point[0]/mask_size[0]:0.6f} {point[1]/mask_size[1]:0.6f}" for point in contour])
        seg_fp.write(f"{label_index} {seg_str}\n")
        
    # Close files
    det_fp.close()
    seg_fp.close()


Classes are not used


  2%|▏         | 24/1154 [00:00<00:08, 127.05it/s]

More than 1 coutour found for image malignant (19).png
Incomplete countour in image malignant (19).png


 46%|████▌     | 533/1154 [00:01<00:01, 608.27it/s]

More than 1 coutour found for image benign (62).png
Incomplete countour in image benign (62).png
More than 1 coutour found for image malignant (28).png
Incomplete countour in image malignant (28).png


 63%|██████▎   | 729/1154 [00:01<00:00, 639.32it/s]

More than 1 coutour found for image benign (88).png
Incomplete countour in image benign (88).png


100%|██████████| 1154/1154 [00:02<00:00, 526.35it/s]

More than 1 coutour found for image benign (414).png
Incomplete countour in image benign (414).png





### Create YOLO Train/Val split data

In [22]:
# Get patients list
list_items = [item[:-4] for item in os.listdir(detection_data_path) if item.endswith(".png")]
patients = list(set([ item.split("_")[0] for item in list_items]))

# Get lesion type per patient
if use_classes:
    patients_type = {}     
    for item in list_items:
        patient = item.split("_")[0] 
        label = item.split("_")[-1]
        if label not in patients_type:
            patients_type[label] = [patient]
        elif patient not in patients_type[label]:
            patients_type[label].append(patient)
else:
    patients_type = {0: patients}
    
print({k: len(v) for k, v in patients_type.items()})
    
# Distribute patients in Train/Val using val_frac with balanced lesion types
train_p = []
val_p = []
for k in patients_type.keys():
    num_train = int(len(patients_type[k])*(1-val_frac))
    train_type_p, val_type_p = patients_type[k][:num_train], patients_type[k][num_train:]
    train_p += train_type_p
    val_p += val_type_p
    
print(train_p, len(train_p))
print(val_p, len(val_p))
    
# Get distributed patient images for train and val
train = [ f"{item}.png" for item in list_items if item.split("_")[0] in train_p]
val = [ f"{item}.png" for item in list_items if item.split("_")[0] in val_p]

import random
random.seed(0)
random.shuffle(train)
random.shuffle(val)

# Generate train.txt file
train_det = os.path.join(output_folder, f"train_det.txt" ) 
with open(train_det, "w") as fp:
    fp.writelines([os.path.join(detection_data_path, t) + '\n' for t in train])
    
train_seg = os.path.join(output_folder, f"train_seg.txt" ) 
with open(train_seg, "w") as fp:
    fp.writelines([os.path.join(segmentation_data_path, t) + '\n' for t in train])
    
# Generate val.txt file
val_det = os.path.join(output_folder, f"val_det.txt")
with open(val_det, "w") as fp:
    fp.writelines([os.path.join(detection_data_path, v) + '\n' for v in val])
    
val_seg = os.path.join(output_folder, f"val_seg.txt")
with open(val_seg, "w") as fp:
    fp.writelines([os.path.join(segmentation_data_path, v) + '\n' for v in val])
    

{0: 1154}
['benign (389)', '100165', 'benign (424)', 'malignant (13)', 'benign (376)', 'malignant (185)', 'benign (199)', '100117', '100104', '500068', '100204', '100021', '100201', '19', 'benign (226)', 'benign (64)', 'benign (418)', 'malignant (20)', 'malignant (183)', 'benign (388)', '100096', '400061', 'malignant (11)', '123', 'malignant (72)', 'malignant (129)', '400008', 'malignant (140)', '100160', '400022', '137', '100168', '59', 'benign (210)', 'benign (77)', 'benign (172)', 'malignant (171)', '500004', '500034', 'benign (59)', 'malignant (112)', 'benign (390)', 'benign (61)', 'malignant (3)', 'malignant (43)', 'benign (105)', 'benign (398)', 'benign (8)', '500018', 'benign (206)', 'benign (189)', 'malignant (46)', 'benign (262)', '400004', 'malignant (37)', 'benign (337)', 'malignant (83)', '80', '100171', 'benign (397)', '100108', 'benign (400)', '21', '100065', '100170', '500006', '400074', 'benign (178)', '100043', 'malignant (115)', 'malignant (139)', '100109', 'benign (1