# Data Preparations

This notebook converts the BDD100K dataset into the YOLO format.

In [1]:
import os
import json
import shutil
import numpy as np
from tqdm import tqdm
from rich import print
from pathlib import Path

***

## Data folder definitions

In [2]:
# Input/output locations, all resolved relative to the current working directory.
ROOT_DIR = Path(os.getcwd())
DATA_DIR = ROOT_DIR / "data"
BASE_DATA_DIR = ROOT_DIR / "base_data"
BASE_DATA_PRFIX = "bdd100k_"

# Output roots for COCO-style and YOLO-style data; both are created up front.
COCO_DATA_DIR = DATA_DIR / "coco_data"
YOLO_DATA_DIR = DATA_DIR / "yolo_data"
for _output_root in (COCO_DATA_DIR, YOLO_DATA_DIR):
    os.makedirs(_output_root, exist_ok=True)

***

## Helper functions

In [3]:
def setup_yolo_output_dir(dataset_name: str):
    """Create the YOLO folder skeleton for one dataset under ``YOLO_DATA_DIR``.

    The layout is ``<dataset>/labels/{train,val}`` and ``<dataset>/images/{train,val}``.
    Existing folders are left untouched.

    Parameters
    ----------
    dataset_name : str
        Name of the dataset sub-folder.

    Returns
    -------
    tuple
        ``(dataset root, labels directory, images directory)``.
    """
    dataset_root = YOLO_DATA_DIR / dataset_name
    labels_root = dataset_root / "labels"
    images_root = dataset_root / "images"

    # os.makedirs also creates the missing parents (dataset root, labels/, images/).
    for kind_root in (labels_root, images_root):
        for split_name in ("train", "val"):
            os.makedirs(kind_root / split_name, exist_ok=True)

    print(f"Done setup path for '{dataset_name}'")
    return dataset_root, labels_root, images_root

In [4]:
from json_to_yolo.general_json2yolo import convert_coco_json

def convert_cocojson_to_stdcoco_util(dataset_name: str):
    """Run ``convert_coco_json`` for the train and val splits of one dataset.

    For each split the JSON folder is ``COCO_DATA_DIR/<dataset>/<split>`` and the
    output folder is ``YOLO_DATA_DIR/<dataset>/labels/<split>``. Segment output is
    requested only when the dataset name contains ``"seg"``.

    Parameters
    ----------
    dataset_name : str
        Name of the dataset sub-folder shared by the COCO input and YOLO output trees.
    """
    print(f"Start conversion tasks for dataset: '{dataset_name}'")

    wants_segments = "seg" in dataset_name
    # Train split first, then val split.
    for split_name in ("train", "val"):
        convert_coco_json(
            json_dir=COCO_DATA_DIR / dataset_name / split_name,
            save_dir=YOLO_DATA_DIR / dataset_name / "labels" / split_name,
            use_segments=wants_segments,
        )

    print(f"Finish conversion tasks for dataset: '{dataset_name}'")

In [5]:
import cv2
from pycocotools import mask as cocomask

def rle_to_coco(annotation: dict) -> dict:
    """Return a copy of one RLE COCO annotation with polygon segmentation.

    The RLE mask is decoded and traced with OpenCV. Each contour holding at least
    six coordinate values (three points) becomes one flat list of floats; smaller
    contours are dropped, so the resulting list may be empty.

    Refer to: https://stackoverflow.com/questions/75326066/coco-annotations-convert-rle-to-polygon-segmentation

    Parameters
    ----------
    annotation : dict
        RLE COCO style annotation; its ``"segmentation"`` entry is decoded.

    Returns
    -------
    dict
        Shallow copy of ``annotation`` whose ``'segmentation'`` entry is the list
        of polygon coordinate lists.
    """
    decoded_mask = cocomask.decode(annotation["segmentation"])
    contours, _hierarchy = cv2.findContours(decoded_mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

    polygons = []
    for contour in contours:
        if contour.size < 6:
            # Fewer than three points cannot form a polygon.
            continue
        polygons.append(contour.astype(float).flatten().tolist())

    # Shallow copy: only the segmentation entry is replaced.
    converted = annotation.copy()
    converted['segmentation'] = polygons
    return converted

***

## Image Datadir Reorganization

In [3]:
# Reformat images
import os

ORIGINAL_IMAGE_DIR = BASE_DATA_DIR / "100k"
NEW_IMAGE_DIR = BASE_DATA_DIR / "100k_images"

# Per-split "already moved" flags are kept in a small JSON file next to the images.
flags_path = NEW_IMAGE_DIR / "done_reorganization_flags.json"
if not os.path.exists(flags_path):
    # First run: create the split folders and start with every flag unset.
    for split_name in ("train", "val", "test"):
        os.makedirs(NEW_IMAGE_DIR / split_name, exist_ok=True)
    done_reorganization_flags = dict.fromkeys(("train", "val", "test"), False)
    with open(flags_path, "w+") as f:
        json.dump(done_reorganization_flags, f)
else:
    with open(flags_path, "r") as f:
        done_reorganization_flags = json.load(f)

In [4]:
def move_images(image_dir, dest_dir):
    """Move every ``.jpg`` file directly inside ``image_dir`` into ``dest_dir``.

    Only the top level of ``image_dir`` is listed; sub-folders are not descended
    into. ``image_dir`` must support the ``/`` operator (e.g. a ``Path``).
    """
    print(f"Moving images from {image_dir} to {dest_dir}")
    # The listing is taken once, before any file is moved.
    jpg_names = list(filter(lambda name: name.endswith(".jpg"), os.listdir(image_dir)))
    for jpg_name in tqdm(jpg_names):
        shutil.move(image_dir / jpg_name, dest_dir)
    print("Done.")

In [9]:
# Train
train_image_dir = ORIGINAL_IMAGE_DIR / "train"
print(f"Train image dir: {train_image_dir}")
# Show every entry of the train folder that is not a JPEG file.
list(filter(lambda entry: not entry.endswith(".jpg"), os.listdir(train_image_dir)))

[]

In [11]:
train_target_image_dir = NEW_IMAGE_DIR / "train"
if not done_reorganization_flags['train']:
    # Images sitting directly in the train folder.
    move_images(image_dir=train_image_dir, dest_dir=train_target_image_dir)
    # Images in the nested sub-folders; a missing sub-folder is skipped instead of
    # crashing os.listdir inside move_images.
    for nested_name in ("trainA", "trainB", "testA", "testB"):
        nested_dir = train_image_dir / nested_name
        if nested_dir.is_dir():
            move_images(image_dir=nested_dir, dest_dir=train_target_image_dir)
        else:
            print(f"Skipping missing folder: {nested_dir}")
    done_reorganization_flags['train'] = True
    # Persist the flag: it is reloaded from this file on the next kernel start,
    # so without writing it back the move would be attempted again.
    with open(NEW_IMAGE_DIR / "done_reorganization_flags.json", "w") as f:
        json.dump(done_reorganization_flags, f)

In [12]:
# Val
val_image_dir = ORIGINAL_IMAGE_DIR / "val"
# This is the source folder, not the target, so label it like the train cell does.
print(f"Val image dir: {val_image_dir}")
# Show every entry of the val folder that is not a JPEG file.
[elem for elem in os.listdir(val_image_dir) if not elem.endswith(".jpg")]

[]

In [13]:
val_target_image_dir = NEW_IMAGE_DIR / "val"
if not done_reorganization_flags['val']:
    move_images(image_dir=val_image_dir, dest_dir=val_target_image_dir)
    done_reorganization_flags['val'] = True
    # Persist the flag: it is reloaded from this file on the next kernel start,
    # so without writing it back the move would be attempted again.
    with open(NEW_IMAGE_DIR / "done_reorganization_flags.json", "w") as f:
        json.dump(done_reorganization_flags, f)

In [14]:
# Test
test_image_dir = ORIGINAL_IMAGE_DIR / "test"
# This is the source folder, not the target, so label it like the train cell does.
print(f"Test image dir: {test_image_dir}")
# Show every entry of the test folder that is not a JPEG file.
[elem for elem in os.listdir(test_image_dir) if not elem.endswith(".jpg")]

[]

In [15]:
test_target_image_dir = NEW_IMAGE_DIR / "test"
if not done_reorganization_flags['test']:
    # Images sitting directly in the test folder.
    move_images(image_dir=test_image_dir, dest_dir=test_target_image_dir)
    # Images in the nested sub-folders; a missing sub-folder is skipped instead of
    # crashing os.listdir inside move_images.
    for nested_name in ("trainA", "trainB", "testA", "testB"):
        nested_dir = test_image_dir / nested_name
        if nested_dir.is_dir():
            move_images(image_dir=nested_dir, dest_dir=test_target_image_dir)
        else:
            print(f"Skipping missing folder: {nested_dir}")
    done_reorganization_flags['test'] = True
    # Persist the flag: it is reloaded from this file on the next kernel start,
    # so without writing it back the move would be attempted again.
    with open(NEW_IMAGE_DIR / "done_reorganization_flags.json", "w") as f:
        json.dump(done_reorganization_flags, f)

In [16]:
# Image ID Set
# Only .jpg entries count as images (the same filter move_images uses), so stray
# entries such as hidden files cannot turn into bogus IDs.
train_image_ids = {
    name.split(".")[0] for name in os.listdir(train_target_image_dir) if name.endswith(".jpg")
}
val_image_ids = {
    name.split(".")[0] for name in os.listdir(val_target_image_dir) if name.endswith(".jpg")
}
test_image_ids = {
    name.split(".")[0] for name in os.listdir(test_target_image_dir) if name.endswith(".jpg")
}
full_image_ids = train_image_ids | val_image_ids | test_image_ids

In [17]:
# Sanity check: size of the combined ID set, then of the train / val / test ID sets.
len(full_image_ids), len(train_image_ids), len(val_image_ids), len(test_image_ids)

(100000, 70000, 10000, 20000)

In [29]:
# Dump image ids
import json

# One sorted ID list per split, written as a single JSON object.
image_ids_by_split = (
    ("train", train_image_ids),
    ("val", val_image_ids),
    ("test", test_image_ids),
)
dump_dict = {split_name: sorted(ids) for split_name, ids in image_ids_by_split}
with open(BASE_DATA_DIR / "100k_images" / "image_ids_info.json", "w+") as f:
    json.dump(dump_dict, f)

***

## Detection

In [8]:
# Setup yolo data dir
detection_dirs = setup_yolo_output_dir(dataset_name="det_20_labels_trainval")
# Unpack as (dataset root, labels directory, images directory).
yolo_output_datadir, yolo_label_dir, yolo_images_dir = detection_dirs

In [19]:
# Convert the detection COCO JSON files (train and val) into YOLO label files.
_ = convert_cocojson_to_stdcoco_util(
    dataset_name="det_20_labels_trainval",
)

Annotations /Users/ken/Workspaces/PycharmProjects/bdd100k/data/coco_data/det_20_labels_trainval/train/det_train.json: 100%|██████████| 69853/69853 [00:12<00:00, 5429.19it/s]
Annotations /Users/ken/Workspaces/PycharmProjects/bdd100k/data/coco_data/det_20_labels_trainval/val/det_val.json: 100%|██████████| 10000/10000 [00:01<00:00, 5878.76it/s]


In [16]:
# Reload the image IDs dumped earlier, so this section does not need to re-list the folders.
with open(NEW_IMAGE_DIR / "image_ids_info.json", "r") as f:
    image_ids_info = json.load(f)

train_image_ids, val_image_ids, test_image_ids = (
    set(image_ids_info[split_name]) for split_name in ("train", "val", "test")
)
full_image_ids = train_image_ids | val_image_ids | test_image_ids

In [18]:
# Label IDs are the file names in each labels folder, cut at the first dot.
train_label_ids = {name.split(".")[0] for name in os.listdir(yolo_label_dir / "train")}
val_label_ids = {name.split(".")[0] for name in os.listdir(yolo_label_dir / "val")}
full_label_ids = train_label_ids | val_label_ids

In [17]:
# Sanity check: size of the combined ID set, then of the train / val / test ID sets.
len(full_image_ids), len(train_image_ids), len(val_image_ids), len(test_image_ids)

(100000, 70000, 10000, 20000)

In [19]:
# Sanity check: size of the combined label ID set, then of the train / val label ID sets.
len(full_label_ids), len(train_label_ids), len(val_label_ids)

(79853, 69853, 10000)

In [14]:
# Any ID printed here has a label file but no image in the same split.
orphan_train_label_ids = train_label_ids.difference(train_image_ids)
orphan_val_label_ids = val_label_ids.difference(val_image_ids)
print(f"Train label ids not in train image ids: {orphan_train_label_ids}")
print(f"Val label ids not in val image ids: {orphan_val_label_ids}")

***

## Our Own BDD100K to YOLO (Drivable Area)

In [6]:
dataset_name = "drivable_labels_trainval"
# Both RLE files live in the same "rles" folder of the prefixed dataset directory.
drivable_rle_dir = BASE_DATA_DIR / (BASE_DATA_PRFIX + dataset_name) / "rles"
raw_rle_train_path = drivable_rle_dir / "drivable_train.json"
raw_rle_val_path = drivable_rle_dir / "drivable_val.json"

In [7]:
# Load the raw drivable-area RLE annotations for both splits.
with open(raw_rle_train_path, "r") as train_file:
    drivable_train_rle = json.load(train_file)

with open(raw_rle_val_path, "r") as val_file:
    drivable_val_rle = json.load(val_file)

In [8]:
# Peek at the top-level keys and at the first frame record.
sample_frame = drivable_train_rle['frames'][0]
print(f"Attributes: {drivable_train_rle.keys()}")
print(f"Attributes of 'frame' object: {sample_frame.keys()}, sample frame: {sample_frame}")

In [9]:
print(drivable_train_rle['groups'])
print(drivable_train_rle['config'])

# Only the last expression of a cell is displayed, so the total frame counts are
# printed explicitly instead of being silently discarded.
total_frame_counts = (len(drivable_train_rle['frames']), len(drivable_val_rle['frames']))
print(f"Frames (train, val): {total_frame_counts}")

# Frames whose 'labels' entry is non-empty, per split (train, val).
labeled_frame_counts = (
    sum(1 for elem in drivable_train_rle['frames'] if elem['labels']),
    sum(1 for elem in drivable_val_rle['frames'] if elem['labels']),
)
labeled_frame_counts

(66921, 9546)

In [10]:
from pycocotools import mask as cocomask

def process_frame_object_segments(frame: dict, label_dir: Path, category_map: dict):
    """Write the YOLO segmentation label file for one RLE-annotated frame.

    Each label's RLE mask is decoded and traced into contours with OpenCV. Every
    contour with at least three points is written as one line
    ``<class_id> v1 v2 v3 ...`` where the contour coordinates are divided by the
    mask width/height taken from the RLE ``size`` entry and formatted with four
    decimals. Frames without labels produce no file.

    Parameters
    ----------
    frame : dict
        Frame record with a ``name`` (image file name) and optionally ``labels``;
        each label carries ``category`` and ``rle``.
    label_dir : Path
        Directory that receives ``<frame id>.txt``.
    category_map : dict
        Maps a category name to its class index.

    Raises
    ------
    RuntimeError
        If a label's category is missing from ``category_map``.
    """
    # Strip only the final extension so names with extra dots do not break.
    frame_id = Path(frame['name']).stem

    # A missing, None, or empty 'labels' entry all mean "nothing to write".
    frame_labels = frame.get('labels')
    if not frame_labels:
        return

    label_data = []
    for label in frame_labels:
        category = label['category']
        if category not in category_map:
            raise RuntimeError(f"Category '{category}' not found in the category map.")
        category_id = category_map[category]

        rle_obj = label['rle']
        h, w = rle_obj['size']
        masked_arr = cocomask.decode(rle_obj)  # Decode the rle object
        # NOTE(review): RETR_TREE also returns nested (hole) contours; each one is
        # written as its own polygon of the same class - confirm this is intended.
        contours, _ = cv2.findContours(masked_arr, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
        for contour in contours:
            if contour.size < 6:
                # Fewer than three points is not a polygon (same rule as rle_to_coco).
                continue
            # Normalize x by the width and y by the height, then flatten.
            scaled_seg = (contour.reshape(-1, 2) / np.array([w, h])).reshape(-1).tolist()
            label_data.append(
                " ".join([str(category_id)] + ["{:.4f}".format(value) for value in scaled_seg]) + "\n"
            )

    # Save the labels
    frame_label_path = label_dir / f"{frame_id}.txt"
    with open(frame_label_path, "w+") as f:
        f.writelines(label_data)
    return

In [11]:
def convert_rle_seg_dataset(raw_data: dict, dataset_name: str, split: str, category_map: dict):
    """Convert every frame of an RLE dataset split into YOLO label files.

    Frames come from ``raw_data['frames']`` and are written to
    ``YOLO_DATA_DIR/<dataset_name>/labels/<split>``.

    Returns
    -------
    int
        Number of entries in the split's label directory after the run (this
        counts everything in the folder, not only files written by this call).
    """
    frames = raw_data['frames']
    target_label_dir = YOLO_DATA_DIR / dataset_name / "labels" / split

    progress = tqdm(frames, desc=f"Annotations for '{dataset_name}' {split} set")
    for frame in progress:
        process_frame_object_segments(frame, label_dir=target_label_dir, category_map=category_map)

    return len(os.listdir(target_label_dir))

In [12]:
_ = setup_yolo_output_dir(dataset_name=dataset_name)

# "background" and "unknown" deliberately share class index 2.
drivable_cat_map = dict(direct=0, alternative=1, background=2, unknown=2)
split = "train"

num_labels = convert_rle_seg_dataset(
    raw_data=drivable_train_rle, dataset_name=dataset_name, split=split, category_map=drivable_cat_map
)

Annotations for 'drivable_labels_trainval' train set: 100%|██████████| 70000/70000 [02:25<00:00, 479.49it/s]


In [13]:
# Same conversion as above, now for the validation split.
split = "val"

num_labels = convert_rle_seg_dataset(
    raw_data=drivable_val_rle, dataset_name=dataset_name, split=split, category_map=drivable_cat_map
)

Annotations for 'drivable_labels_trainval' val set: 100%|██████████| 10000/10000 [00:20<00:00, 491.29it/s]


***

## Lane Labeling

In [6]:
dataset_name = "lane_labels_trainval"
# Root of the prefixed lane dataset; polygons and masks sit side by side in it.
lane_labels_root = BASE_DATA_DIR / (BASE_DATA_PRFIX + dataset_name)
poly_labels_dir = lane_labels_root / "polygons"
mask_labels_dir = lane_labels_root / "masks"

raw_poly_train_path = poly_labels_dir / "lane_train.json"
raw_poly_val_path = poly_labels_dir / "lane_val.json"

In [7]:
# Load the raw lane polygon annotations for both splits.
with open(raw_poly_train_path, "r") as train_file:
    lane_poly_train = json.load(train_file)

with open(raw_poly_val_path, "r") as val_file:
    lane_poly_val = json.load(val_file)

In [8]:
# Peek at the keys of the first record in the lane polygon train list.
print(f"Attributes: {lane_poly_train[0].keys()}")

In [9]:
# Show the keys and full content of the first lane record.
sample_poly = lane_poly_train[0]
sample_poly_keys = sample_poly.keys()
print(f"Attributes of 'frame' object: {sample_poly_keys}, sample frame: {sample_poly}")

In [10]:
# Show the keys and full content of the first label of that record.
sample_label = sample_poly['labels'][0]
sample_label_keys = sample_label.keys()
print(f"Attributes of 'label' object: {sample_label_keys}, sample label: {sample_label}")

In [19]:
def process_poly_object_segments(poly: dict, label_dir: Path, category_map: dict, image_hw: tuple = (720, 1280)):
    """Write the YOLO label file for one polygon-annotated frame.

    Every entry of a label's ``poly2d`` list becomes one line
    ``<class_id> v1 v2 ...`` where the vertex coordinates are divided by the image
    width/height and formatted with four decimals. Frames whose ``labels`` entry
    is missing, ``None`` or empty produce no file.

    Parameters
    ----------
    poly : dict
        Frame record with a ``name`` (image file name) and optionally ``labels``;
        each label carries ``category`` and ``poly2d`` (a list of objects with a
        ``vertices`` list of ``[x, y]`` pairs).
    label_dir : Path
        Directory that receives ``<frame id>.txt``.
    category_map : dict
        Maps a category name to its class index.
    image_hw : tuple, optional
        ``(height, width)`` used for normalization; defaults to ``(720, 1280)``.

    Raises
    ------
    RuntimeError
        If a label's category is missing from ``category_map``.
    """
    # Strip only the final extension so names with extra dots do not break.
    label_id = Path(poly['name']).stem

    poly_labels = poly.get('labels')
    if not poly_labels:
        return

    h, w = image_hw
    scale = np.array([w, h])  # x is divided by the width, y by the height

    label_data = []
    for label in poly_labels:
        category = label['category']
        if category not in category_map:
            raise RuntimeError(f"Category '{category}' not found in the category map.")
        category_id = category_map[category]

        # NOTE(review): an entry with only two vertices yields a 5-column line,
        # which some YOLO loaders may read as a box rather than a polygon - confirm.
        for vertice_obj in label['poly2d']:
            vertices = np.asarray(vertice_obj['vertices'], dtype=np.float32)
            scaled_seg = (vertices / scale).reshape(-1).tolist()
            label_data.append(
                " ".join([str(category_id)] + ["{:.4f}".format(value) for value in scaled_seg]) + "\n"
            )

    # Save the labels
    poly_label_path = label_dir / f"{label_id}.txt"
    with open(poly_label_path, "w+") as f:
        f.writelines(label_data)
    return

In [20]:
def convert_poly_seg_dataset(raw_data: list, dataset_name: str, split: str, category_map: dict):
    """Convert every record of a polygon dataset split into YOLO label files.

    Records are written to ``YOLO_DATA_DIR/<dataset_name>/labels/<split>``.

    Returns
    -------
    int
        Number of entries in the split's label directory after the run (this
        counts everything in the folder, not only files written by this call).
    """
    target_label_dir = YOLO_DATA_DIR / dataset_name / "labels" / split

    progress = tqdm(raw_data, desc=f"Annotations for '{dataset_name}' {split} set")
    for record in progress:
        process_poly_object_segments(record, label_dir=target_label_dir, category_map=category_map)

    return len(os.listdir(target_label_dir))

In [21]:
_ = setup_yolo_output_dir(dataset_name=dataset_name)

# Class index = position in this list (0 .. 8).
lane_category_names = [
    "crosswalk",
    "double other",
    "double white",
    "double yellow",
    "road curb",
    "single other",
    "single white",
    "single yellow",
    "background",
]
lane_cat_map = {name: index for index, name in enumerate(lane_category_names)}
split = "train"

num_labels = convert_poly_seg_dataset(
    raw_data=lane_poly_train, dataset_name=dataset_name, split=split, category_map=lane_cat_map
)

Annotations for 'lane_labels_trainval' train set: 100%|██████████| 70000/70000 [00:08<00:00, 8676.64it/s]


In [22]:
# Same conversion as above, now for the validation split.
split = "val"

num_labels = convert_poly_seg_dataset(
    raw_data=lane_poly_val, dataset_name=dataset_name, split=split, category_map=lane_cat_map
)

Annotations for 'lane_labels_trainval' val set: 100%|██████████| 10000/10000 [00:01<00:00, 9582.79it/s]


In [None]:
from PIL import Image

def process_poly_mask_segment(poly: dict, label_dir: Path, category_map: dict, mask_dir: Path):
    """Write the YOLO label file for one lane frame, loading its mask image as well.

    Work in progress: the mask image is loaded into an array, but the label lines
    are still built from each label's ``poly2d`` vertices, normalized by a fixed
    720x1280 image size. Frames whose ``labels`` entry is missing, ``None`` or
    empty produce no file.

    Parameters
    ----------
    poly : dict
        Frame record with a ``name`` (image file name) and optionally ``labels``;
        each label carries ``category`` and ``poly2d``.
    label_dir : Path
        Directory that receives ``<frame id>.txt``.
    category_map : dict
        Maps a category name to its class index.
    mask_dir : Path
        Directory holding the mask images, looked up by the frame's ``name``.

    Raises
    ------
    RuntimeError
        If a label's category is missing from ``category_map``.
    """
    # Strip only the final extension so names with extra dots do not break.
    label_id = Path(poly['name']).stem

    poly_labels = poly.get('labels')
    if not poly_labels:
        return

    # Load the mask once per frame (not once per label) and close the file.
    # np.asarray replaces the non-existent Image.toarray().
    # NOTE(review): the lookup assumes the mask shares the image's file name,
    # including its extension - confirm against the files in mask_dir.
    with Image.open(mask_dir / poly['name']) as poly_mask:
        masked_arr = np.asarray(poly_mask)
    # TODO: derive the segments from masked_arr; until then it is unused and the
    # polygons below still come from the 'poly2d' vertices.

    h, w = 720, 1280
    scale = np.array([w, h])  # x is divided by the width, y by the height

    label_data = []
    for label in poly_labels:
        # Category mapping into class index
        category = label['category']
        if category not in category_map:
            raise RuntimeError(f"Category '{category}' not found in the category map.")
        category_id = category_map[category]

        # The original body referenced an undefined name here; the vertex
        # objects come from the label itself.
        for vertice_obj in label['poly2d']:
            vertices = np.asarray(vertice_obj['vertices'], dtype=np.float32)
            scaled_seg = (vertices / scale).reshape(-1).tolist()
            label_data.append(
                " ".join([str(category_id)] + ["{:.4f}".format(value) for value in scaled_seg]) + "\n"
            )

    # Save the labels
    poly_label_path = label_dir / f"{label_id}.txt"
    with open(poly_label_path, "w+") as f:
        f.writelines(label_data)
    return

***

## Image Labels

In [6]:
dataset_name = "image_labels"
# Image-level label files for the train and val splits.
image_labels_dir = BASE_DATA_DIR / dataset_name
image_train_labels_path, image_val_labels_path = (
    image_labels_dir / "bdd100k_labels_images_train.json",
    image_labels_dir / "bdd100k_labels_images_val.json",
)

In [7]:
# Load the image-level labels for both splits.
with open(image_train_labels_path, "r") as train_file:
    image_train_labels = json.load(train_file)

with open(image_val_labels_path, "r") as val_file:
    image_val_labels = json.load(val_file)

In [8]:
# Check the top-level container type of the loaded validation labels.
type(image_val_labels)

list

In [10]:
# Show the keys and full content of the first validation record.
# (The variable name is kept as-is in case later cells refer to it.)
sample_image_lael = image_val_labels[0]
print(f"Attributes of 'frame' object: {sample_image_lael.keys()}, sample image label: {sample_image_lael}")

In [None]:
# Pasted example record kept for reference; this cell was never executed.
# Presumably it mirrors one entry of the image-label JSON - TODO confirm.
{
    'name': 'b1c66a42-6f7d68ca.jpg', 
    'attributes': {'weather': 'overcast', 'scene': 'city street', 'timeofday': 'daytime'}, 
    'timestamp': 10000,
}