In [1]:
import cv2
import itertools
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import random


from itertools import groupby
from skimage import io
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
dataset_name = "busset"
use_classes = False
val_frac = 0.2
dataset_folder = f"datasets/{dataset_name}_png"

https://www.kaggle.com/code/alejopaullier/how-to-create-a-coco-dataset

In [3]:
import pycocotools.mask as mask_util

def rle_decode(mask_rle, shape):
    """
    Decodes run-length encoded segmentation mask string into 2d array

    Parameters
    ----------
    :param rle_mask (str): Run-length encoded segmentation mask string.
    :param shape (tuple): (height, width) of array to return
    :return mask [numpy.ndarray of shape (height, width)]: Decoded 2d segmentation mask
    """
    # Splits the RLE string into a list of string by whitespaces.
    s = mask_rle.split()
    
    # This creates two numpy arrays, one with the RLE starts and one with their respective lengths
    starts, lengths = [np.asarray(x, dtype=int) for x in (s[0:][::2], s[1:][::2])]
    
    # To obtain the end point we need to substract 1 to the length or start because the initial point counts.
    starts -= 1
    ends = starts + lengths
    
    # Create a 1D array of size H*W of zeros
    mask = np.zeros(shape[0]*shape[1], dtype=np.uint8)
    
    # Fill this array with ones in the positions where there is a mask using the RLE information
    for start, end in zip(starts, ends):
        mask[start:end] = 1
    
    # Reshape the 1D array into a 2D array so we can finally get the binary 2D mask.
    mask = mask.reshape(shape)
    return mask.T


def binary_mask_to_rle(binary_mask):
    """
    Checkout: https://cocodataset.org/#format-results
    :param mask [numpy.ndarray of shape (height, width)]: Decoded 2d segmentation mask
    
    This function returns the following dictionary:
    {
        "counts": encoded mask suggested by the official COCO dataset webpage.
        "size": the size of the input mask/image
    }
    """
    # Create dictionary for the segmentation key in the COCO dataset
    rle = {'counts': [], 'size': list(binary_mask.shape)}
    # We need to convert it to a Fortran array
    binary_mask_fortran = np.asfortranarray(binary_mask)
    # Encode the mask as specified by the official COCO format
    encoded_mask = mask_util.encode(binary_mask_fortran)
    # We must decode the byte encoded string or otherwise we cannot save it as a JSON file
    rle["counts"] = encoded_mask["counts"].decode()
    return rle


def create_coco_format_json(data_frame, classes, dataset_path):
    """
    This function creates a COCO dataset.
    :param data_frame: pandas dataframe with an "id" column.
    :param classes: list of strings where each string is a class.
    :return dataset_coco_format: COCO dataset (JSON).
    """
    images = []
    annotations = []
    categories = []
    count = 0
    
    # Creates a categories list, i.e: [{'id': 0, 'name': 'a'}, {'id': 1, 'name': 'b'}, {'id': 2, 'name': 'c'}] 
    for idx, class_ in enumerate(classes):
        categories.append(
            { 
                "id": idx+1, # detectron2 category range  [1, #categories]
                "name": class_
            }
        )
        
    masks_path = os.path.join(dataset_path, "masks")
    images_path = os.path.join(dataset_path, "images")
    
    # Iterate over image filepaths
    for index, row in tqdm(data_frame.iterrows(), total=len(data_frame)):
        # Get the image id, e.g: "10044"
        file_id = index #row["id"]
        # Get filename
        image_file = row["image_file"]
        mask_file = row["mask_file"]
        cat = row["category"]
        
        assert cat in classes
        
        mask_full_path = os.path.join(masks_path, mask_file)
        image_full_path = os.path.join(images_path, image_file)
        
        mk = cv2.imread(mask_full_path, cv2.IMREAD_GRAYSCALE)
        assert mk is not None
        
        height, width = mk.shape
        
        
        # Adding images which has annotations
        images.append(
            {
                "id": file_id,
                "width": width,
                "height": height,
                "file_name": image_full_path
            }
        )
        
        # Convert the RLE string into a numpy array binary mask

        ys, xs = np.where(mk)
        x1, x2 = min(xs), max(xs)
        y1, y2 = min(ys), max(ys)
        """
        Contours can be explained simply as a curve joining all the continuous points (along the boundary),
        having same color or intensity. The function retrieves contours from the binary image using the
        algorithm specified in the function. One RLE segmentation for a single class may have disconnected
        shapes, like "spots". We will iterate over these "spots" thus creating a new image for each spot.
        This image will be temporary, it will help us create annotations for each of these "spots".
        """
        contours, hierarchy = cv2.findContours(mk,cv2.RETR_CCOMP,cv2.CHAIN_APPROX_NONE)
        
        for id_, contour in enumerate(contours):
            # Image with 3 channels where H and W remain the same.
            mask_image = np.zeros((mk.shape[0], mk.shape[1], 3),  np.uint8)
            # This function takes the image and fills the contour inside it.
            cv2.drawContours(mask_image, [contour], -1, (255,255,255), thickness=cv2.FILLED)
            mask_image = cv2.cvtColor(mask_image, cv2.COLOR_BGR2GRAY)
            mask_image_bool = np.array(mask_image, dtype=bool).astype(np.uint8)
            ys, xs = np.where(mask_image_bool)
            x1, x2 = min(xs), max(xs)
            y1, y2 = min(ys), max(ys)
            enc = binary_mask_to_rle(mask_image_bool)
            seg = {
                'segmentation': enc, 
                'bbox': [int(x1), int(y1), int(x2-x1+1), int(y2-y1+1)],
                'area': int(np.sum(mask_image_bool)),
                'image_id':file_id, 
                'category_id':classes.index(cat) + 1, # detectron2 category range  [1, #categories]
                'iscrowd':0, 
                'id': count
            }
            annotations.append(seg)
            count +=1
            
    # Create the dataset
    dataset_coco_format = {
        "categories": categories,
        "images": images,
        "annotations": annotations,
    }
    
    return dataset_coco_format


def sep():
    print("-"*100)

In [4]:
import random
random.seed(0)

# Get patients list
images_folder = os.path.join(dataset_folder, "images")
list_items = [item[:-4] for item in os.listdir(images_folder) if item.endswith(".png")]
patients = list(set([ item.split("_")[0] for item in list_items]))

# Get lesion type per patient
if use_classes:
    patients_type = {}     
    for item in list_items:
        patient = item.split("_")[0] 
        label = item.split("_")[-1]
        if label not in patients_type:
            patients_type[label] = [patient]
        elif patient not in patients_type[label]:
            patients_type[label].append(patient)
else:
    patients_type = {0: patients}
    
print({k: len(v) for k, v in patients_type.items()})
    
# Distribute patients in Train/Val using val_frac with balanced lesion types
train_p = []
val_p = []
for k in patients_type.keys():
    patients_list = patients_type[k]
    random.shuffle(patients_list)
    num_train = int(len(patients_list)*(1-val_frac))
    train_type_p, val_type_p = patients_list[:num_train], patients_list[num_train:]
    train_p += train_type_p
    val_p += val_type_p
    
print(train_p, len(train_p))
print(val_p, len(val_p))
    
# Get distributed patient images for train and val
train = [ f"{item}.png" for item in list_items if item.split("_")[0] in train_p]
val = [ f"{item}.png" for item in list_items if item.split("_")[0] in val_p]
random.shuffle(train)
random.shuffle(val)


{0: 1154}
['benign (166)', '400048', '99', '500002', 'benign (239)', '100166', '129', 'benign (140)', '100197', '16', 'benign (417)', 'benign (1)', '100146', 'benign (275)', 'malignant (26)', 'malignant (6)', '100032', 'malignant (97)', '76', 'benign (355)', '500086', 'benign (191)', 'benign (272)', '100120', '400050', '400097', 'malignant (166)', 'benign (324)', 'benign (360)', '400016', 'malignant (34)', 'benign (406)', 'malignant (124)', '308', 'benign (187)', 'benign (364)', '100144', 'benign (73)', '127', 'malignant (118)', '400023', 'malignant (75)', '117', 'benign (234)', '4', 'benign (3)', 'benign (313)', '400040', '400043', '106', '500036', 'benign (415)', 'benign (291)', '500010', 'benign (311)', 'benign (34)', 'malignant (23)', '100109', '400079', 'benign (422)', 'malignant (159)', 'benign (184)', 'benign (105)', '100140', '100134', '100188', '68', '29', '400037', '100130', 'benign (106)', 'benign (380)', '500081', 'benign (85)', 'benign (147)', '100131', '100123', 'malignan

In [5]:
# Get classes
if not use_classes or any(len(item.split('_')) != 3 for item in list_items):
    use_classes = False
    classes = ["default"]
    print("Classes are not used")
else:
    classes = sorted(set([ item[:-4].split('_')[-1] for item in list_items]))
    print(classes)

Classes are not used


In [6]:
def getDataframe(list_cases):

    dataset = []
    for image in list_cases:
        
        
        if use_classes:
            label = image[:-4].split("_")[-1]
        else:
            label = "default"
        
        dataset.append({
            "image_file": image,
            "mask_file": image,
            "category": label
        })
        
    return pd.DataFrame(dataset)
  

In [7]:
train_df = getDataframe(train)
train_json = create_coco_format_json(train_df, classes, dataset_folder)
with open(os.path.join(dataset_folder, 'coco_seg_train.json'), 'w', encoding='utf-8') as f:
    json.dump(train_json, f, indent=4)
    
    
val_df = getDataframe(val)
val_json = create_coco_format_json(val_df, classes, dataset_folder)
with open(os.path.join(dataset_folder, 'coco_seg_val.json'), 'w', encoding='utf-8') as f:
    json.dump(val_json, f, indent=4)



100%|██████████| 923/923 [00:04<00:00, 214.20it/s]
100%|██████████| 231/231 [00:00<00:00, 248.81it/s]
