In [11]:
# transforms current dataset structure to YOLO format
import os
import shutil
import json
ROOT_DIR = "D:/sumer/"
OUTPUT_DIR = "D:/sumer/3_YOLO_ultra/dataset/"

In [12]:
# current structure :
# ROOT_DIR/HeiCuBeDa/Images_MSII_Filter/  --> images
# MaiCuBeDa/train_photo_anno.json  --> annotations (train)
# MaiCuBeDa/test_photo_anno.json --> annotations (val)

# content in json:
# {
#     "HS_1220_03": {
#         "tablet_ID": "HS_1220",
#         "side": "front",
#         "image_path": "../HeiCuBeDa/Images_MSII_Filter/HS_1220_HeiCuBeDa_GMOCF_r1.50_n4_v512_03_front.png",
#         "bboxes": [
#             {
#                 "bbox": [
#                     13,
#                     17,
#                     121,
#                     122
#                 ],
#                 "charname": "U",
#                 "transliteration": "1(u)",
#                 "charname_id": 17,
#                 "transliteration_id": 21
#             },
#             {
#                 "bbox": [
#                     11,
#                     159,
#                     111,
#                     293
#                 ],
#                 "charname": "KI",
#                 "transliteration": "ki",
#                 "charname_id": 3,
#                 "transliteration_id": 3
#             },...

In [13]:
# required YOLO format:
# OUTPUT_DIR/images/train/  --> images for training
# OUTPUT_DIR/images/val/    --> images for validation
# OUTPUT_DIR/labels/train/  --> labels for training
# OUTPUT_DIR/labels/val/    --> labels for validation
# where label files are .txt files with same name as images, each line in txt file:
# <class_id> <x_center> <y_center> <width> <height>
# all values are normalized (divided by image width and height)

In [17]:
# existing files
TRAIN_JSON = os.path.join(ROOT_DIR, "MaiCuBeDa/train_photo_anno.json")
VAL_JSON   = os.path.join(ROOT_DIR, "MaiCuBeDa/test_photo_anno.json")
IMAGES_DIR = os.path.join(ROOT_DIR, "HeiCuBeDa/Images_MSII_Filter/")

# choose task
label_key = "charname_topN" # "is_sign" / "charname" / "transliteration" / "charname_topN"
N=150
if label_key != "is_sign":
    OUTPUT_DIR = OUTPUT_DIR[:-1] + f"_{label_key}"

# target YOLO folders
IMG_TRAIN_DIR = os.path.join(OUTPUT_DIR, "images/train")
IMG_VAL_DIR   = os.path.join(OUTPUT_DIR, "images/val")
LBL_TRAIN_DIR = os.path.join(OUTPUT_DIR, "labels/train")
LBL_VAL_DIR   = os.path.join(OUTPUT_DIR, "labels/val")
os.makedirs(IMG_TRAIN_DIR, exist_ok=True)
os.makedirs(IMG_VAL_DIR, exist_ok=True)
os.makedirs(LBL_TRAIN_DIR, exist_ok=True)
os.makedirs(LBL_VAL_DIR, exist_ok=True)


In [19]:
if label_key == "charname" or label_key == "charname_topN":
    with open(os.path.join(ROOT_DIR, "MaiCuBeDa/charname_to_id.json"),"r",encoding="utf-8") as f:
        dict_id = json.load(f)
    if label_key == "charname_topN":
        for k,v in dict_id.items():
            if v >= N:
                dict_id[k] = N  # group together
        # save modified dict
        with open(os.path.join(ROOT_DIR, f"MaiCuBeDa/charname_to_id_top{N}.json"),"w",encoding="utf-8") as f:
            json.dump(dict_id, f, ensure_ascii=False, indent=4)
elif label_key == "transliteration":
    with open(os.path.join(ROOT_DIR, "MaiCuBeDa/transliteration_to_id.json"),"r",encoding="utf-8") as f:
        dict_id = json.load(f)
        
reverse_dict_id = {v:k for k,v in dict_id.items()}
if label_key == "charname_topN":
    reverse_dict_id[N] = "OTHER"

In [20]:
def convert_bbox(bbox, img_width, img_height):
    x_min, y_min, x_max, y_max = bbox
    x_center = (x_min + x_max) / 2.0 / img_width
    y_center = (y_min + y_max) / 2.0 / img_height
    width    = (x_max - x_min) / img_width
    height   = (y_max - y_min) / img_height
    return x_center, y_center, width, height

In [21]:
from PIL import Image
from tqdm import tqdm
def process_json(json_file, img_output_dir, lbl_output_dir, label_key):
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    for key, item in tqdm(data.items(), desc="Processing items"):
        # get image path
        img_path = os.path.join(IMAGES_DIR, os.path.basename(item['image_path']))
        if not os.path.exists(img_path):
            print(f"Warning: {img_path} does not exist.")
            continue
        
        # copy image
        dst_img_path = os.path.join(img_output_dir, os.path.basename(img_path))
        shutil.copy(img_path, dst_img_path)

        # open image to get size
        with Image.open(img_path) as img:
            w, h = img.size
        
        # write label file
        label_file = os.path.splitext(os.path.basename(img_path))[0] + ".txt"
        label_file_path = os.path.join(lbl_output_dir, label_file)
        
        with open(label_file_path, "w", encoding="utf-8") as f:
            for bbox_entry in item['bboxes']:
                if label_key == "is_sign":
                    class_id = 0
                elif label_key == "charname":
                    class_id = bbox_entry['charname_id']
                elif label_key == "transliteration":
                    class_id = bbox_entry['transliteration_id']
                elif label_key == "charname_topN":
                    class_id = bbox_entry['charname_id']
                    if class_id >= N:
                        class_id = N  # group together
                else:
                    raise ValueError("label_key must be one of ['is_sign','charname','transliteration','charname_topN']")
                
                x_center, y_center, width, height = convert_bbox(bbox_entry['bbox'], w, h)
                f.write(f"{class_id} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}\n")

# ----------------- PROCESS DATA -----------------
print("Processing training data...")
process_json(TRAIN_JSON, IMG_TRAIN_DIR, LBL_TRAIN_DIR, label_key)

print("Processing validation data...")
process_json(VAL_JSON, IMG_VAL_DIR, LBL_VAL_DIR, label_key)

print("Done! YOLO dataset ready.")


Processing training data...


Processing items: 100%|██████████| 856/856 [00:03<00:00, 259.19it/s]


Processing validation data...


Processing items: 100%|██████████| 214/214 [00:00<00:00, 258.73it/s]


Done! YOLO dataset ready.


In [22]:
import yaml
yaml_data = {
    'path': OUTPUT_DIR,  # root dataset folder
    'train': 'images/train',
    'val': 'images/val',
    'nc':  None,  # number of classes, will fill automatically
    'names': None  # list of class names
}
if label_key == "is_sign":
    yaml_data['nc'] = 1
    yaml_data['names'] = ["sign"]
else:
    # collect all ids from all json
    import json
    train_json = os.path.join("D:/sumer/MaiCuBeDa/all_photo_anno.json")
    with open(train_json, "r", encoding="utf-8") as f:
        data = json.load(f)
    
    class_ids = set()
    for item in data.values():
        for bbox_entry in item['bboxes']:
            if label_key == "charname":
                class_ids.add(bbox_entry['charname_id'])
            elif label_key == "transliteration":
                class_ids.add(bbox_entry['transliteration_id'])
            elif label_key == "charname_topN":
                cid = bbox_entry['charname_id']
                if cid >= N:
                    cid = N
                class_ids.add(cid)
    
    class_ids = sorted(list(class_ids))
    yaml_data['nc'] = len(class_ids)  # assuming class ids are 1-indexed
    yaml_data['names'] = [reverse_dict_id[i] for i in class_ids]

# ----------------- SAVE YAML -----------------
yaml_path = os.path.join(OUTPUT_DIR, "dataset.yaml")
with open(yaml_path, "w", encoding="utf-8") as f:
    yaml.dump(yaml_data, f)

print(f"YOLO dataset.yaml saved to {yaml_path}")

YOLO dataset.yaml saved to D:/sumer/3_YOLO_ultra/dataset_charname_top_charname_topN\dataset.yaml


In [23]:
# remove .cache files in labels/
import glob
for cache_file in glob.glob(os.path.join(OUTPUT_DIR, "labels/*.cache"), recursive=True):
    os.remove(cache_file)