In [None]:
import os
from PIL import Image
import pandas as pd

csv_path = "data/ground_truth_lines.csv"

df = pd.read_csv(csv_path)

missing_files = []
empty_images = []
empty_text = []
corrupted_images = []

for idx, row in df.iterrows():
    image_path = str(row['image_path']).strip()
    transcription = str(row['transcription']).strip()

    # 1. Check if the file exists
    if not os.path.isfile(image_path):
        missing_files.append(image_path)
        continue

    # 2. Check transcription is not empty/null
    if transcription == "" or transcription.lower() == "nan":
        empty_text.append(image_path)

    # 3. Check if image can be opened (not null/corrupted)
    try:
        with Image.open(image_path) as img:
            img.verify()   # verify doesn't load full image, just checks integrity
    except Exception:
        corrupted_images.append(image_path)

print("\n=== DATASET VALIDATION RESULTS ===")

if not missing_files:
    print("✅ No missing image files.")
else:
    print("❌ Missing files:")
    for f in missing_files:
        print("   -", f)

if not empty_text:
    print("✅ All transcriptions are valid (not empty).")
else:
    print("❌ Empty transcription entries:")
    for f in empty_text:
        print("   -", f)

if not corrupted_images:
    print("✅ No corrupted or unreadable images.")
else:
    print("❌ Corrupted/unreadable images:")
    for f in corrupted_images:
        print("   -", f)

print("\n✅ Validation complete.")


=== DATASET VALIDATION RESULTS ===
✅ No missing image files.
✅ All transcriptions are valid (not empty).
✅ No corrupted or unreadable images.

✅ Validation complete.


In [1]:
import os
import sys
import lmdb
import cv2
import csv
import numpy as np
from tqdm import tqdm

def check_image_is_valid(image_bin):
    if image_bin is None:
        return False
    img = cv2.imdecode(np.frombuffer(image_bin, np.uint8), cv2.IMREAD_GRAYSCALE)
    return img is not None and img.size > 0

def write_cache(env, cache):
    with env.begin(write=True) as txn:
        for k, v in cache.items():
            txn.put(k.encode(), v)

def list_all_images(root_dir, exts={'.png', '.jpg', '.jpeg'}):
    img_paths = []
    for root, dirs, files in os.walk(root_dir):
        for f in files:
            if os.path.splitext(f)[1].lower() in exts:
                img_paths.append(os.path.join(root, f))
    return img_paths

def create_lmdb(output_path, img_root, csv_path, csv_key="filepath", csv_label="label"):
    print(f"Reading truth file: {csv_path}")

    # Load CSV into dictionary
    truth = {}
    with open(csv_path, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            key = row[csv_key].replace("\\", "/")  # normalize slashes
            truth[key] = row[csv_label]

    print(f"Loaded {len(truth)} label entries from CSV")

    print(f"Scanning for images under: {img_root}")
    all_imgs = list_all_images(img_root)

    print(f"Found {len(all_imgs)} total image files")

    env = lmdb.open(output_path, map_size=1099511627776)
    cache = {}
    cnt = 1

    for img_path in tqdm(all_imgs, desc="Creating LMDB"):
        rel = os.path.relpath(img_path, img_root).replace("\\", "/")

        if rel not in truth:
            print(f"[WARN] No label found for: {rel}")
            continue

        label = truth[rel]
        with open(img_path, "rb") as f:
            img_bin = f.read()

        if not check_image_is_valid(img_bin):
            print(f"[SKIP] Invalid image: {img_path}")
            continue

        img_key = f"image-{cnt:09d}"
        label_key = f"label-{cnt:09d}"

        cache[img_key] = img_bin
        cache[label_key] = label.encode("utf-8")

        if cnt % 1000 == 0:
            write_cache(env, cache)
            cache.clear()

        cnt += 1

    cache["num-samples"] = str(cnt - 1).encode()
    write_cache(env, cache)
    env.close()

    print(f"\n✅ LMDB successfully created with {cnt - 1} samples at: {output_path}")

if __name__ == "__main__":
    if len(sys.argv) < 4:
        print("Usage: python create_lmdb_nested_csv.py <output_lmdb_path> <image_root_folder> <truth.csv>")
        sys.exit(1)

    output = sys.argv[1]
    image_root = sys.argv[2]
    csv_file = sys.argv[3]

    create_lmdb(output, image_root, csv_file)


Usage: python create_lmdb_nested_csv.py <output_lmdb_path> <image_root_folder> <truth.csv>


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
