In [1]:
from pathlib import Path
import sys

# Project root is taken to be the parent of the current working directory.
ROOT_DIR = Path.cwd().parent
# Prepend the root to the module search path (same effect as insert(0, ...)).
sys.path[:0] = [str(ROOT_DIR)]

In [None]:
import re
import tensorflow as tf
from pathlib import Path
import os
import cv2
import numpy as np

# Input TFRecord splits and output YOLO dataset both live under <root>/data_src.
DATA_DIR = ROOT_DIR.joinpath("data_src", "TFRecord")
YOLO_DIR = ROOT_DIR.joinpath("data_src", "YOLO")

# --- Robust label map loader ---


def load_label_map(path):
    """Parse a ``.pbtxt`` label map into an ``{id: name}`` dictionary.

    Every ``item { ... }`` entry is parsed on its own, so the ``id`` and
    ``name`` fields may appear in either order and the name may be wrapped in
    single or double quotes. (Matching one id-then-name pattern across the
    whole file pairs an id with the *following* item's name whenever ``name``
    is written before ``id``.)

    Args:
        path: Location of the label map file.

    Returns:
        dict mapping each integer class id to its name. The ``name`` field is
        preferred; ``display_name`` is used only when an item has no ``name``.
        Items lacking an id or any name are skipped.
    """
    with open(path, "r", encoding="utf-8") as f:
        text = f.read()

    label_map = {}
    # Split on the item header instead of matching up to "}" so an item that
    # contains a nested block is still handled as a single chunk.
    for item_text in re.split(r"\bitem\s*\{", text)[1:]:
        id_match = re.search(r"\bid:\s*(\d+)", item_text)
        # \b keeps "name:" from matching the tail of "display_name:".
        name_match = (
            re.search(r"""\bname:\s*(["'])(.+?)\1""", item_text)
            or re.search(r"""\bdisplay_name:\s*(["'])(.+?)\1""", item_text)
        )
        if id_match and name_match:
            label_map[int(id_match.group(1))] = name_match.group(2)
    return label_map

# --- TFRecord parser ---


def parse_tfrecord_fn(example_proto):
    """Decode one serialized example into a dict of feature tensors.

    Scalar features (encoded image, filename, height, width) are parsed as
    fixed-length features; the filename defaults to "" when absent. The
    per-object box corners and class labels are parsed as variable-length
    (sparse) features.

    Args:
        example_proto: A serialized example, as yielded by the TFRecord dataset.

    Returns:
        The result of ``tf.io.parse_single_example`` for the schema below.
    """
    required_string = tf.io.FixedLenFeature([], tf.string)
    optional_string = tf.io.FixedLenFeature([], tf.string, default_value="")
    required_int = tf.io.FixedLenFeature([], tf.int64)
    float_list = tf.io.VarLenFeature(tf.float32)
    int_list = tf.io.VarLenFeature(tf.int64)

    schema = {}
    schema["image/encoded"] = required_string
    schema["image/filename"] = optional_string
    schema["image/height"] = required_int
    schema["image/width"] = required_int
    schema["image/object/bbox/xmin"] = float_list
    schema["image/object/bbox/xmax"] = float_list
    schema["image/object/bbox/ymin"] = float_list
    schema["image/object/bbox/ymax"] = float_list
    schema["image/object/class/label"] = int_list

    return tf.io.parse_single_example(example_proto, schema)


# --- Create YOLO directories ---
# Make sure YOLO_DIR/<images|labels>/<train|valid|test> all exist up front;
# already-existing directories are left untouched.
for folder in ("images", "labels"):
    for split in ("train", "valid", "test"):
        out_dir = YOLO_DIR.joinpath(folder, split)
        out_dir.mkdir(parents=True, exist_ok=True)

# --- Loop through dataset splits ---
def _yolo_label_lines(xmins, xmaxs, ymins, ymaxs, class_ids):
    """Format corner-style boxes as YOLO label lines.

    Args:
        xmins, xmaxs, ymins, ymaxs: Per-box corner coordinates. They are
            treated as already normalized to [0, 1] (no division by image
            size is performed), matching how the records are consumed here.
        class_ids: Per-box class ids, 1-based; written out 0-based.

    Returns:
        list of ``"<class> <x_center> <y_center> <width> <height>\\n"`` strings,
        one per box.
    """
    # Clamp corners into [0, 1] so annotations that overshoot the image edge
    # slightly do not produce out-of-range normalized values.
    xmins, xmaxs, ymins, ymaxs = (
        np.clip(values, 0.0, 1.0) for values in (xmins, xmaxs, ymins, ymaxs)
    )
    x_centers = (xmins + xmaxs) / 2.0
    y_centers = (ymins + ymaxs) / 2.0
    widths = xmaxs - xmins
    heights = ymaxs - ymins
    return [
        f"{int(cls) - 1} {xc:.6f} {yc:.6f} {bw:.6f} {bh:.6f}\n"
        for cls, xc, yc, bw, bh in zip(
            class_ids, x_centers, y_centers, widths, heights
        )
    ]


splits = ["train", "valid", "test"]

for split in splits:
    split_dir = DATA_DIR / split
    tfrecord_path = split_dir / "Diseases.tfrecord"
    label_map_path = split_dir / "Diseases_label_map.pbtxt"

    # load label map
    label_map = load_label_map(label_map_path)
    name_to_id = {v: k for k, v in label_map.items()}  # reverse if needed

    # read tfrecord
    raw_dataset = tf.data.TFRecordDataset(str(tfrecord_path))
    parsed_dataset = raw_dataset.map(parse_tfrecord_fn)

    for index, record in enumerate(parsed_dataset):
        # Keep only the basename so a stored relative path cannot point
        # outside (or into a non-existent subfolder of) the output directory.
        filename = Path(record["image/filename"].numpy().decode()).name
        if not filename:
            # The parser defaults a missing filename to ""; synthesize a
            # unique one instead of writing to the directory path itself.
            filename = f"{split}_{index:06d}.jpg"

        # channels=3 forces RGB output even for grayscale sources, which the
        # RGB->BGR conversion below requires.
        image = tf.io.decode_jpeg(record["image/encoded"], channels=3).numpy()

        # save image
        img_out_path = YOLO_DIR / "images" / split / filename
        written = cv2.imwrite(
            str(img_out_path), cv2.cvtColor(image, cv2.COLOR_RGB2BGR))
        if not written:
            # imwrite reports failure through its return value; surface it so
            # a label file is never left without its image.
            raise OSError(f"Failed to write image: {img_out_path}")

        # get bboxes
        xmins = tf.sparse.to_dense(record["image/object/bbox/xmin"]).numpy()
        xmaxs = tf.sparse.to_dense(record["image/object/bbox/xmax"]).numpy()
        ymins = tf.sparse.to_dense(record["image/object/bbox/ymin"]).numpy()
        ymaxs = tf.sparse.to_dense(record["image/object/bbox/ymax"]).numpy()
        labels = tf.sparse.to_dense(record["image/object/class/label"]).numpy()

        # write YOLO labels (an image without objects gets an empty file)
        label_out_path = YOLO_DIR / "labels" / \
            split / (Path(filename).stem + ".txt")
        with open(label_out_path, "w", encoding="utf-8") as f:
            f.writelines(_yolo_label_lines(xmins, xmaxs, ymins, ymaxs, labels))

    print(f"✅ Finished {split}")

2025-09-02 20:28:09.493274: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


Labels: {1: 'Acral Lentiginous Melanoma', 2: 'Beaus Line', 3: 'Blue Finger', 4: 'Clubbing', 5: 'Healthy Nail', 6: 'Koilonychia', 7: 'Lindsay-s Nail', 8: 'Muehrckes Lines', 9: 'Onychogryphosis', 10: 'Pitting', 11: 'Terry-s Nail'}


0it [00:00, ?it/s]2025-09-02 20:28:09.579806: I tensorflow/core/kernels/data/tf_record_dataset_op.cc:390] TFRecordDataset `buffer_size` is unspecified, default to 262144
285it [00:00, 414.47it/s]2025-09-02 20:28:10.335840: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
307it [00:00, 399.17it/s]
