# Data Preparation

This notebook converts the BDD100K dataset into YOLO format.

In [7]:
import os
import json
import shutil
import numpy as np
from tqdm import tqdm
from rich import print
from pathlib import Path

***

## Data folder definitions

In [8]:
# Every path below is anchored at the working directory of the notebook kernel.
ROOT_DIR = Path.cwd()
DATA_DIR = ROOT_DIR.joinpath("data")
BASE_DATA_DIR = ROOT_DIR.joinpath("base_data")
# Prefix prepended to a dataset name to form its folder under BASE_DATA_DIR
# (identifier spelling kept as-is because other cells reference it).
BASE_DATA_PRFIX = "bdd100k_"

# Roots for the COCO-style inputs and the YOLO-style outputs; created if missing.
COCO_DATA_DIR = DATA_DIR.joinpath("coco_data")
YOLO_DATA_DIR = DATA_DIR.joinpath("yolo_data")
COCO_DATA_DIR.mkdir(parents=True, exist_ok=True)
YOLO_DATA_DIR.mkdir(parents=True, exist_ok=True)

***

## Helper functions

In [2]:
def setup_yolo_output_dir(dataset_name: str):
    """Create the YOLO folder skeleton for one dataset and return its main paths.

    Under ``YOLO_DATA_DIR / dataset_name`` the folders ``labels/train``, ``labels/val``,
    ``images/train`` and ``images/val`` are created; folders that already exist are kept.

    Parameters
    ----------
    dataset_name : str
        Name of the dataset sub-folder inside ``YOLO_DATA_DIR``.

    Returns
    -------
    tuple
        ``(dataset_root, labels_dir, images_dir)``.
    """
    dataset_root = YOLO_DATA_DIR / dataset_name
    labels_root = dataset_root / "labels"
    images_root = dataset_root / "images"
    # Creating the leaf folders also creates every missing parent folder.
    for kind_root in (labels_root, images_root):
        for split_name in ("train", "val"):
            os.makedirs(kind_root / split_name, exist_ok=True)
    print(f"Done setup path for '{dataset_name}'")

    return dataset_root, labels_root, images_root

In [3]:
from json_to_yolo.general_json2yolo import convert_coco_json

def convert_cocojson_to_stdcoco_util(dataset_name: str):
    """Run ``convert_coco_json`` for the train split and then the val split of a dataset.

    For each split the json folder is ``COCO_DATA_DIR / dataset_name / <split>`` and the
    save folder is ``YOLO_DATA_DIR / dataset_name / "labels" / <split>``. Segment output is
    requested whenever ``"seg"`` appears in ``dataset_name``.

    Parameters
    ----------
    dataset_name : str
        Dataset folder name shared by the COCO input and the YOLO output trees.
    """
    print(f"Start conversion tasks for dataset: '{dataset_name}'")
    wants_segments = "seg" in dataset_name
    # Same call for both splits; only the split folder differs.
    for split_name in ("train", "val"):
        convert_coco_json(
            json_dir=COCO_DATA_DIR / dataset_name / split_name,
            save_dir=YOLO_DATA_DIR / dataset_name / "labels" / split_name,
            use_segments=wants_segments,
        )

    print(f"Finish conversion tasks for dataset: '{dataset_name}'")
    return

In [4]:
import cv2
from pycocotools import mask as cocomask

def rle_to_coco(annotation: dict) -> dict:
    """Convert a single RLE-encoded COCO annotation into polygon form.

    The RLE mask is decoded and its contours are extracted; each contour with at least
    three points becomes one flat coordinate list in ``segmentation``. One mask can
    therefore yield several polygons, and a mask with no valid polygon (e.g. a single
    pixel) yields an empty ``segmentation`` list.

    An annotation whose ``segmentation`` is already a list (polygon form) is returned as
    an unchanged copy, so running the conversion again on an already converted json file
    does not fail.

    Refer to: https://stackoverflow.com/questions/75326066/coco-annotations-convert-rle-to-polygon-segmentation

    Parameters
    ----------
    annotation : dict
        COCO style annotation whose ``segmentation`` is an RLE object accepted by
        ``cocomask.decode`` (or already a list of polygons).

    Returns
    -------
    dict
        Shallow copy of ``annotation`` with ``segmentation`` as a list of flat
        coordinate lists.
    """
    converted = annotation.copy()
    segmentation = annotation["segmentation"]

    # Polygon form already: nothing to decode.
    if isinstance(segmentation, list):
        return converted

    mask = cocomask.decode(segmentation)
    contours, _ = cv2.findContours(mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    # Replace the RLE object with the polygons; a polygon needs at least 3 points,
    # i.e. 6 coordinate values.
    converted["segmentation"] = [
        contour.astype(float).flatten().tolist()
        for contour in contours
        if contour.size >= 6
    ]

    return converted

***

## Image Datadir Reorganization

In [3]:
# Reformat images
import os

ORIGINAL_IMAGE_DIR = BASE_DATA_DIR / "100k"
NEW_IMAGE_DIR = BASE_DATA_DIR / "100k_images"

# The flags file records, per split, whether its images were already moved.
if not os.path.exists(NEW_IMAGE_DIR / "done_reorganization_flags.json"):
    # First run: create the per-split target folders and start with every flag unset.
    done_reorganization_flags = dict.fromkeys(("train", "val", "test"), False)
    for _split_folder in done_reorganization_flags:
        os.makedirs(NEW_IMAGE_DIR / _split_folder, exist_ok=True)
    with open(NEW_IMAGE_DIR / "done_reorganization_flags.json", "w+") as f:
        json.dump(done_reorganization_flags, f)
else:
    with open(NEW_IMAGE_DIR / "done_reorganization_flags.json", "r") as f:
        done_reorganization_flags = json.load(f)

In [4]:
def move_images(image_dir, dest_dir):
    """Move every ``.jpg`` file located directly inside ``image_dir`` into ``dest_dir``.

    Parameters
    ----------
    image_dir : str or Path
        Folder whose ``.jpg`` entries are moved (sub-folders are not descended into).
    dest_dir : str or Path
        Destination folder; it is created when missing.

    Raises
    ------
    FileNotFoundError
        If ``image_dir`` does not exist (raised by ``os.listdir``).
    """
    image_dir, dest_dir = Path(image_dir), Path(dest_dir)
    print(f"Moving images from {image_dir} to {dest_dir}")
    # When the destination is not an existing directory, shutil.move renames the source
    # *to* that path, so every image would overwrite the previous one. Make sure the
    # folder exists before moving anything.
    os.makedirs(dest_dir, exist_ok=True)
    image_filenames = [elem for elem in os.listdir(image_dir) if elem.endswith(".jpg")]
    for image_filename in tqdm(image_filenames):
        shutil.move(str(image_dir / image_filename), str(dest_dir))
    print("Done.")
    return

In [9]:
# Train split: locate the original folder and list every entry that is not a .jpg file.
train_image_dir = ORIGINAL_IMAGE_DIR.joinpath("train")
print(f"Train image dir: {train_image_dir}")
list(filter(lambda entry: not entry.endswith(".jpg"), os.listdir(train_image_dir)))

[]

In [11]:
train_target_image_dir = NEW_IMAGE_DIR / "train"
if not done_reorganization_flags['train']:
    _ = move_images(image_dir=train_image_dir, dest_dir=train_target_image_dir)
    # Nested folders are only processed when present: listing a missing folder raises.
    for _nested_name in ("trainA", "trainB", "testA", "testB"):
        _nested_dir = train_image_dir / _nested_name
        if _nested_dir.is_dir():
            _ = move_images(image_dir=_nested_dir, dest_dir=train_target_image_dir)
    done_reorganization_flags['train'] = True
    # Write the flag back to disk; otherwise a fresh kernel reloads it as False
    # and attempts the move again.
    with open(NEW_IMAGE_DIR / "done_reorganization_flags.json", "w") as f:
        json.dump(done_reorganization_flags, f)

In [12]:
# Val split: locate the original folder and list every entry that is not a .jpg file.
val_image_dir = ORIGINAL_IMAGE_DIR.joinpath("val")
print(f"Val target image dir: {val_image_dir}")
list(filter(lambda entry: not entry.endswith(".jpg"), os.listdir(val_image_dir)))

[]

In [13]:
val_target_image_dir = NEW_IMAGE_DIR / "val"
if not done_reorganization_flags['val']:
    _ = move_images(image_dir=val_image_dir, dest_dir=val_target_image_dir)
    done_reorganization_flags['val'] = True
    # Write the flag back to disk; otherwise a fresh kernel reloads it as False
    # and attempts the move again.
    with open(NEW_IMAGE_DIR / "done_reorganization_flags.json", "w") as f:
        json.dump(done_reorganization_flags, f)

In [14]:
# Test split: locate the original folder and list every entry that is not a .jpg file.
test_image_dir = ORIGINAL_IMAGE_DIR.joinpath("test")
print(f"Test target image dir: {test_image_dir}")
list(filter(lambda entry: not entry.endswith(".jpg"), os.listdir(test_image_dir)))

[]

In [15]:
test_target_image_dir = NEW_IMAGE_DIR / "test"
if not done_reorganization_flags['test']:
    _ = move_images(image_dir=test_image_dir, dest_dir=test_target_image_dir)
    # Nested folders are only processed when present: listing a missing folder raises.
    for _nested_name in ("trainA", "trainB", "testA", "testB"):
        _nested_dir = test_image_dir / _nested_name
        if _nested_dir.is_dir():
            _ = move_images(image_dir=_nested_dir, dest_dir=test_target_image_dir)
    done_reorganization_flags['test'] = True
    # Write the flag back to disk; otherwise a fresh kernel reloads it as False
    # and attempts the move again.
    with open(NEW_IMAGE_DIR / "done_reorganization_flags.json", "w") as f:
        json.dump(done_reorganization_flags, f)

In [16]:
# Image id sets per split: an id is the part of a file name before its first ".".
train_image_ids = {fname.split(".")[0] for fname in os.listdir(train_target_image_dir)}
val_image_ids = {fname.split(".")[0] for fname in os.listdir(val_target_image_dir)}
test_image_ids = {fname.split(".")[0] for fname in os.listdir(test_target_image_dir)}
full_image_ids = train_image_ids | val_image_ids | test_image_ids

In [17]:
# Number of ids overall and per split.
tuple(map(len, (full_image_ids, train_image_ids, val_image_ids, test_image_ids)))

(100000, 70000, 10000, 20000)

In [29]:
# Dump image ids
import json

# Store the ids sorted so that the file content does not depend on set ordering.
dump_dict = dict(
    train=sorted(train_image_ids),
    val=sorted(val_image_ids),
    test=sorted(test_image_ids),
)
with (BASE_DATA_DIR / "100k_images" / "image_ids_info.json").open("w+") as f:
    json.dump(dump_dict, f)

***

## Detection

In [8]:
# Create the YOLO folder layout for the detection dataset and keep its paths.
yolo_output_datadir, yolo_label_dir, yolo_images_dir = setup_yolo_output_dir(
    "det_20_labels_trainval"
)

In [19]:
# Convert the detection COCO json files (train, then val) into YOLO labels.
_ = convert_cocojson_to_stdcoco_util("det_20_labels_trainval")

Annotations /Users/ken/Workspaces/PycharmProjects/bdd100k/data/coco_data/det_20_labels_trainval/train/det_train.json: 100%|██████████| 69853/69853 [00:12<00:00, 5429.19it/s]
Annotations /Users/ken/Workspaces/PycharmProjects/bdd100k/data/coco_data/det_20_labels_trainval/val/det_val.json: 100%|██████████| 10000/10000 [00:01<00:00, 5878.76it/s]


In [16]:
# Reload the per-split image ids that were dumped to disk earlier.
with (NEW_IMAGE_DIR / "image_ids_info.json").open("r") as f:
    image_ids_info = json.load(f)

train_image_ids, val_image_ids, test_image_ids = (
    set(image_ids_info[split_key]) for split_key in ("train", "val", "test")
)
full_image_ids = train_image_ids | val_image_ids | test_image_ids

In [18]:
# Label id sets per split: an id is the part of a label file name before its first ".".
train_label_ids = {fname.split(".")[0] for fname in os.listdir(yolo_label_dir / "train")}
val_label_ids = {fname.split(".")[0] for fname in os.listdir(yolo_label_dir / "val")}
full_label_ids = train_label_ids | val_label_ids

In [17]:
# Number of image ids overall and per split.
tuple(map(len, (full_image_ids, train_image_ids, val_image_ids, test_image_ids)))

(100000, 70000, 10000, 20000)

In [19]:
# Number of label ids overall and per split.
tuple(map(len, (full_label_ids, train_label_ids, val_label_ids)))

(79853, 69853, 10000)

In [14]:
# Show label ids that have no image id of the same name in the same split.
print(f"Train label ids not in train image ids: {train_label_ids.difference(train_image_ids)}")
print(f"Val label ids not in val image ids: {val_label_ids.difference(val_image_ids)}")

***

## Instance Segmentation

In [75]:
# Create the YOLO folder layout for the instance-segmentation dataset.
name = "ins_seg_labels_trainval"
_ = setup_yolo_output_dir(name)

In [77]:
# Load the instance-segmentation train annotations.
with (COCO_DATA_DIR / name / "train" / "ins_seg_train.json").open("r") as f:
    ins_seg_coco = json.load(f)

In [78]:
# Number of images and of annotations in the loaded json.
tuple(len(ins_seg_coco[section]) for section in ("images", "annotations"))

(7000, 89450)

In [None]:
# Reference example of a polygon-style annotation: 'segmentation' is a list of flat
# coordinate lists, unlike the RLE dict ('counts' + 'size') found in the annotations
# loaded into `ins_seg_coco`.
# NOTE(review): no other visible cell reads this variable — presumably kept only for comparison.
sample_coco_annotation = {
    'annotation_path': 'val2017/instances_val2017.json',
    'segmentation': [
        [510.66, 423.01, 511.72, 420.03, 510.45, 416.0, 510.34,
        413.02, 510.77, 410.26, 510.77, 407.5, 510.34, 405.16,
        511.51, 402.83, 511.41, 400.49, 510.24, 398.16, 509.39,
        397.31, 504.61, 399.22, 502.17, 399.64, 500.89, 401.66,
        500.47, 402.08, 499.09, 401.87, 495.79, 401.98, 490.59,
        401.77, 488.79, 401.77, 485.39, 398.58, 483.9, 397.31,
        481.56, 396.35, 478.48, 395.93, 476.68, 396.03, 475.4,
        396.77, 473.92, 398.79, 473.28, 399.96, 473.49, 401.87,
        474.56, 403.47, 473.07, 405.59, 473.39, 407.71, 476.68,
        409.41, 479.23, 409.73, 481.56, 410.69, 480.4, 411.85,
        481.35, 414.93, 479.86, 418.65, 477.32, 420.03, 476.04,
        422.58, 479.02, 422.58, 480.29, 423.01, 483.79, 419.93,
        486.66, 416.21, 490.06, 415.57, 492.18, 416.85, 491.65,
        420.24, 492.82, 422.9, 493.56, 424.39, 496.43, 424.6,
        498.02, 423.01, 498.13, 421.31, 497.07, 420.03, 497.07,
        415.15, 496.33, 414.51, 501.1, 411.96, 502.06, 411.32,
        503.02, 415.04, 503.33, 418.12, 501.1, 420.24, 498.98,
        421.63, 500.47, 424.39, 505.03, 423.32, 506.2, 421.31,
        507.69, 419.5, 506.31, 423.32, 510.03, 423.01, 510.45,
        423.01]
    ], 
    'area': '702.1057499999998',
    'iscrowd': 0,
    'image_id': 289343,
    'bbox': ['473.07', '395.93', '38.65', '28.67'],
    'category_id': 18, 
    'id': 1768, 
    'height': 640,
    'width': 529
}

In [59]:
# Peek at the first annotation of the loaded json to see how its segmentation is stored.
ins_seg_coco['annotations'][0]

{'id': 1,
 'image_id': 1,
 'category_id': 3,
 'scalabel_id': '8',
 'iscrowd': 0,
 'ignore': 0,
 'bbox': [447.0, 392.0, 78.0, 29.0],
 'area': 1671,
 'segmentation': {'counts': '[fj97Wf02O1O10000O1O1O1000000000000O10000000000000000L4M3O1O1O10000O1000000000000000000000000000000000002N1O00000000000001O000000000O1O1O110O001O1O1O1O1O2N1O2M3Ll^b`0',
  'size': [720, 1280]}}

In [62]:
# Convert every annotation in order, with a progress bar over the source list.
reformated_annotations = list(map(rle_to_coco, tqdm(ins_seg_coco['annotations'])))

100%|██████████| 89450/89450 [01:04<00:00, 1380.86it/s]


In [63]:
# Show the top-level keys of the loaded json.
print(ins_seg_coco.keys())

In [64]:
# Same top-level sections as the loaded json, with the converted annotations swapped in.
new_ins_seg_coco = {
    "type": ins_seg_coco["type"],
    "images": ins_seg_coco["images"],
    "annotations": reformated_annotations,
    "categories": ins_seg_coco["categories"],
}

In [65]:
# Caution: this writes to the same path the annotations were loaded from, replacing that file.
with (COCO_DATA_DIR / name / "train" / "ins_seg_train.json").open("w+") as f:
    json.dump(new_ins_seg_coco, f)

In [70]:
# Collect the length of every annotation's 'bbox' entry that has more than one element.
original_annotations = ins_seg_coco["annotations"]
multiple_boxes = [
    bbox_len
    for bbox_len in (len(ann["bbox"]) for ann in original_annotations)
    if bbox_len > 1
]

In [72]:
import pandas as pd

# Frequency of each distinct bbox length collected in `multiple_boxes`.
pd.Series(multiple_boxes).value_counts()

4    89450
Name: count, dtype: int64

In [76]:
# Convert the instance-segmentation COCO json files (train, then val) into YOLO labels.
# NOTE(review): the recorded run of this cell stopped with
# "IndexError: list index out of range" partway through the train file; the cause is not
# visible in this notebook.
_ = convert_cocojson_to_stdcoco_util("ins_seg_labels_trainval")

Annotations /Users/ken/Workspaces/PycharmProjects/bdd100k/data/coco_data/ins_seg_labels_trainval/train/ins_seg_train.json:  40%|███▉      | 2755/6891 [00:01<00:01, 2090.13it/s]


IndexError: list index out of range

***

## Our Own BDD100K RLE to YOLO Conversion

In [9]:
# Raw RLE label files of the drivable-area dataset, train and val.
dataset_name = "drivable_labels_trainval"
raw_rle_train_path, raw_rle_val_path = (
    BASE_DATA_DIR / f"{BASE_DATA_PRFIX}{dataset_name}" / "rles" / rle_filename
    for rle_filename in ("drivable_train.json", "drivable_val.json")
)

In [10]:
# Load the raw RLE json of each split.
with raw_rle_train_path.open("r") as f:
    drivable_train_rle = json.load(f)

with raw_rle_val_path.open("r") as f:
    drivable_val_rle = json.load(f)

In [11]:
# Inspect the top-level keys of the raw train json and the first entry of its 'frames' list.
print(f"Attributes: {drivable_train_rle.keys()}")
sample_frame = drivable_train_rle['frames'][0]
print(f"Attributes of 'frame' object: {sample_frame.keys()}, sample frame: {sample_frame}")

In [14]:
print(drivable_train_rle['groups'])
print(drivable_train_rle['config'])
# Frame totals per split. This value is evaluated but not displayed, because only the
# last expression of a cell is echoed.
len(drivable_train_rle['frames']), len(drivable_val_rle['frames'])
# Number of frames per split whose 'labels' entry is non-empty.
tuple(
    sum(1 for frame in rle_data['frames'] if frame['labels'])
    for rle_data in (drivable_train_rle, drivable_val_rle)
)

(66921, 9546)

In [18]:
from pycocotools import mask as cocomask

def process_frame_object_segments(frame: dict, label_dir: str, category_map: dict, precision: int = 6):
    """Write the YOLO segmentation label file for one frame.

    Each label's RLE mask is decoded and its contours are extracted. Every contour with
    at least three points becomes one text line ``<category_id> x1 y1 x2 y2 ...`` whose
    coordinates are divided by the mask width and height. The lines are written, one per
    row, to ``label_dir / "<frame name without extension>.txt"``. A frame without labels
    produces no file.

    Parameters
    ----------
    frame : dict
        Frame object with ``name`` (image file name) and ``labels``. Each label needs a
        ``category`` and an ``rle`` dict accepted by ``cocomask.decode`` whose ``size``
        is ``[height, width]``.
    label_dir : str or Path
        Existing folder that receives the label file.
    category_map : dict
        Maps category names to the integer class id written to the file.
    precision : int, optional
        Decimal places written per coordinate. Defaults to 6; two decimals would round
        every coordinate to 1% of the image size.

    Raises
    ------
    RuntimeError
        If a label's category is missing from ``category_map``.
    """
    # Strip only the final extension, so names containing extra dots still work.
    frame_id = os.path.splitext(frame['name'])[0]

    frame_labels = frame.get('labels')
    if not frame_labels:
        return

    label_lines = []
    for label in frame_labels:
        category = label['category']
        if category not in category_map:
            raise RuntimeError(f"Category '{category}' not found in the category map.")
        category_id = category_map[category]

        rle_obj = label['rle']
        h, w = rle_obj['size']
        masked_arr = cocomask.decode(rle_obj)  # Decode the rle object
        contours, _ = cv2.findContours(masked_arr, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
        for contour in contours:
            # A polygon needs at least 3 points, i.e. 6 coordinate values.
            if contour.size < 6:
                continue
            # Points are (x, y): scale x by the width and y by the height.
            scaled = (contour.reshape(-1, 2).astype(np.float64) / np.array([w, h])).reshape(-1)
            coords = " ".join(f"{value:.{precision}f}" for value in scaled.tolist())
            label_lines.append(f"{category_id} {coords}")

    # One polygon per line; writelines() alone would not add the line breaks.
    frame_label_path = Path(label_dir) / f"{frame_id}.txt"
    with open(frame_label_path, "w") as f:
        f.write("".join(line + "\n" for line in label_lines))
    return

In [19]:
def convert_rle_seg_dataset(raw_data: dict, dataset_name: str, split: str, category_map: dict):
    """Write label files for every frame of one split and count the files in its label folder.

    Parameters
    ----------
    raw_data : dict
        Raw dataset object; its ``frames`` list is iterated.
    dataset_name : str
        Dataset folder name under ``YOLO_DATA_DIR``.
    split : str
        Split folder name under ``labels``.
    category_map : dict
        Passed through to ``process_frame_object_segments``.

    Returns
    -------
    int
        Number of entries in ``YOLO_DATA_DIR / dataset_name / "labels" / split`` once all
        frames were processed.
    """
    target_label_dir = YOLO_DATA_DIR / dataset_name / "labels" / split
    progress = tqdm(raw_data['frames'], desc=f"Annotations for '{dataset_name}' {split} set")
    for frame in progress:
        process_frame_object_segments(
            frame,
            label_dir=target_label_dir,
            category_map=category_map,
        )
    return len(os.listdir(target_label_dir))

In [20]:
# Create the YOLO folder layout for the drivable-area dataset.
_ = setup_yolo_output_dir(dataset_name)

# Category name -> class id; "background" and "unknown" share id 2.
drivable_cat_map = dict(direct=0, alternative=1, background=2, unknown=2)
split = "train"

num_labels = convert_rle_seg_dataset(drivable_train_rle, dataset_name, split, drivable_cat_map)

Annotations for 'drivable_labels_trainval' train set: 100%|██████████| 70000/70000 [02:21<00:00, 494.68it/s]


In [21]:
# Same conversion for the validation split.
split = "val"

num_labels = convert_rle_seg_dataset(drivable_val_rle, dataset_name, split, drivable_cat_map)

Annotations for 'drivable_labels_trainval' val set: 100%|██████████| 10000/10000 [00:19<00:00, 504.53it/s]
