In [1]:
import os, re
from pathlib import Path
import numpy as np
from PIL import ImageFont
from PIL import Image, ImageDraw, ImageFilter, ImageOps



NOTEBOOK_DIR = Path.cwd()              # โดยปกติคือโฟลเดอร์ที่มี generate.ipynb
FONT_DIR = NOTEBOOK_DIR / "fonts"      # => Generate_Dataset/fonts


In [2]:
def find_fonts_in_folder(font_dir: Path):
    if not font_dir.exists():
        raise FileNotFoundError(f"ไม่เจอโฟลเดอร์ฟอนต์: {font_dir.resolve()}")
    font_paths = []
    for ext in ("*.ttf", "*.otf"):
        font_paths += list(font_dir.rglob(ext))
    font_paths = sorted(set(font_paths))
    if not font_paths:
        raise FileNotFoundError(f"ไม่พบไฟล์ .ttf/.otf ใน: {font_dir.resolve()}")
    return [str(p) for p in font_paths]

FONT_PATHS = find_fonts_in_folder(FONT_DIR)
print("Found fonts:", len(FONT_PATHS))
print("Top candidates:", [Path(p).name for p in FONT_PATHS[:5]])



Found fonts: 16
Top candidates: ['FC Lamoon Bold Italic ver 1.00.otf', 'FC Lamoon Bold Italic ver 1.00.ttf', 'FC Lamoon Bold ver 1.00.otf', 'FC Lamoon Bold ver 1.00.ttf', 'FC Lamoon Light Italic ver 1.00.otf']


In [3]:
def pick_font(font_paths, size, rng):
    sample = "ทดสอบABC123฿%/#"
    candidates = font_paths[:50] if font_paths else []

    if candidates:
        picks = rng.choice(candidates, size=min(10, len(candidates)), replace=False)
        for p in picks:
            try:
                f = ImageFont.truetype(p, size=size)
                bbox = f.getbbox(sample)
                if bbox and (bbox[2] - bbox[0]) > 20:
                    return f, p
            except Exception:
                continue

    # last resort
    return ImageFont.load_default(), None
pick_font(FONT_PATHS, size=10, rng=np.random.RandomState(42))

(<PIL.ImageFont.FreeTypeFont at 0x7f0c155b2ff0>,
 np.str_('/home/sagemaker-user/PaddleOCR/generate/fonts/FC Lamoon Bold Italic ver 1.00.otf'))

In [4]:
thai_words = [
    "สวัสดี", "ขอบคุณ", "กรุงเทพฯ", "บริษัท", "จำกัด", "ใบเสร็จ", "รายการ", "จำนวน", "รวม", "ส่วนลด",
    "ภาษี", "มูลค่าเพิ่ม", "ที่อยู่", "โทร", "วันที่", "เลขที่", "สาขา", "ลูกค้า", "เงินสด", "โอนเงิน",
    "พร้อมเพย์", "ยอดชำระ", "ชำระแล้ว", "คงเหลือ", "สินค้า", "บริการ", "ใบกำกับภาษี"
]
thai_suffix = ["ครับ", "ค่ะ", "นะ", "ด่วน", "เท่านั้น", "วันนี้"]

english_tokens = [
    "TOTAL", "SUBTOTAL", "VAT", "TAX", "INV", "INVOICE", "CASH", "CREDIT", "REF", "NO", "DATE", "TEL",
    "EMAIL", "ITEM", "QTY", "PRICE", "AMOUNT", "DISCOUNT", "PROMO", "BRANCH", "CUSTOMER"
]
domains = ["example.com", "mail.com", "company.co.th", "shop.co.th", "test.org"]

specials = ["฿", "%", "/", "-", ":", ".", ",", "#", "@", "&", "(", ")", "[", "]", "+", "=", "*", "!", "?"]

In [5]:
def render_text_image(text, out_path, font_paths, rng, height=48, max_width=360):
    # choose font size and fit to max_width
    base_size = int(rng.integers(22, 35))
    font, _ = pick_font(font_paths, base_size, rng)

    pad_x = int(rng.integers(6, 15))
    pad_y = int(rng.integers(2, 10))

    bg = int(rng.integers(235, 256))
    fg = int(rng.integers(0, 21))

    def safe_bbox(fnt):
        try:
            return fnt.getbbox(text)
        except Exception:
            return (0, 0, len(text) * 10, base_size)

    # shrink font if too wide
    size = getattr(font, "size", base_size)
    for _ in range(20):
        bbox = safe_bbox(font)
        w = (bbox[2] - bbox[0]) + 2 * pad_x
        if w <= max_width:
            break
        size = max(12, int(size * 0.9))
        font, _ = pick_font(font_paths, size, rng)
        if size == 12:
            break

    bbox = safe_bbox(font)
    text_w = bbox[2] - bbox[0]
    text_h = bbox[3] - bbox[1]
    width = min(max_width, max(120, text_w + 2 * pad_x))

    img = Image.new("L", (width, height), color=bg)
    draw = ImageDraw.Draw(img)

    x = int(rng.integers(0, max(1, width - text_w - pad_x + 1)))
    y = int(rng.integers(-2, max(1, height - text_h - pad_y + 1))) - 2

    draw.text((x, y), text, font=font, fill=fg)

    # augmentations (เบา ๆ)
    if rng.random() < 0.25:
        angle = float(rng.uniform(-2.5, 2.5))
        img = img.rotate(angle, resample=Image.Resampling.BICUBIC, expand=False, fillcolor=bg)

    if rng.random() < 0.30:
        img = img.filter(ImageFilter.GaussianBlur(radius=float(rng.uniform(0.2, 0.8))))

    if rng.random() < 0.35:
        arr = np.array(img).astype(np.int16)
        noise = rng.normal(0, float(rng.uniform(2, 8)), size=arr.shape)
        arr = np.clip(arr + noise, 0, 255).astype(np.uint8)
        img = Image.fromarray(arr, mode="L")

    if rng.random() < 0.20:
        img = ImageOps.autocontrast(img, cutoff=int(rng.integers(0, 4)))

    if rng.random() < 0.15:
        scale = float(rng.uniform(0.9, 1.0))
        new_w = max(40, int(img.size[0] * scale))
        img = img.resize((new_w, height), resample=Image.Resampling.BILINEAR).resize((width, height),
                                                                                     resample=Image.Resampling.BILINEAR)

    img = img.convert("RGB")
    out_path.parent.mkdir(parents=True, exist_ok=True)
    img.save(out_path, quality=95)


In [6]:
def rand_number(rng):
    style = rng.choice(["money", "int", "date", "phone", "id", "percent", "time"])
    if style == "money":
        n = rng.uniform(1, 99999)
        return f"{n:,.2f}"
    if style == "int":
        return str(rng.integers(0, 999999))
    if style == "date":
        y = int(rng.integers(2020, 2027))
        m = int(rng.integers(1, 13))
        d = int(rng.integers(1, 29))
        return f"{d:02d}/{m:02d}/{y}"
    if style == "phone":
        return f"0{int(rng.integers(2,10))}-{int(rng.integers(100,1000))}-{int(rng.integers(1000,10000))}"
    if style == "id":
        return f"{rng.choice(['INV','REF','ORD'])}-{int(rng.integers(2023,2027))}-{int(rng.integers(1,999999)):06d}"
    if style == "percent":
        return f"{int(rng.integers(1,51))}%"
    if style == "time":
        return f"{int(rng.integers(0,24)):02d}:{int(rng.integers(0,60)):02d}"
    return str(int(rng.integers(0,9999)))

def rand_email(rng):
    user = rng.choice(["test", "info", "sales", "support", "admin"]) + str(int(rng.integers(1,999)))
    return f"{user}@{rng.choice(domains)}"

def make_text(rng):
    templates = [
        lambda: f"{rng.choice(english_tokens)} {rng.choice(['฿',''])}{rand_number(rng)}",
        lambda: f"{rng.choice(['VAT','TAX'])} {rng.choice(['7%','10%','0%'])} {rng.choice(['฿',''])}{rand_number(rng)}",
        lambda: f"{rng.choice(['วันที่','DATE'])} {rand_number(rng)}",
        lambda: f"{rng.choice(['เลขที่','NO','REF'])}: {rand_number(rng)}",
        lambda: f"{rng.choice(['TEL','โทร'])}: {rand_number(rng)}",
        lambda: f"{rng.choice(['EMAIL','อีเมล'])}: {rand_email(rng)}",
        lambda: f"{rng.choice(['PROMO','โค้ด'])}#{rng.choice(['A','B','C'])}{int(rng.integers(100,999))} -{int(rng.integers(1,71))}%",
        lambda: f"{rng.choice(['ที่อยู่','ADDRESS'])}: {rng.choice(['กรุงเทพฯ','เชียงใหม่','ขอนแก่น','ชลบุรี'])} {int(rng.integers(10000,10999))}",
        lambda: f"{rng.choice(thai_words)} {rng.choice(thai_suffix)}",
        lambda: f"{rng.choice(thai_words)} {rng.choice(english_tokens)} {rand_number(rng)}",
        lambda: f"{rng.choice(['ชำระแล้ว','ยอดชำระ','รวม'])} ฿{rand_number(rng)}",
        lambda: f"{rng.choice(['QTY','จำนวน'])}: {int(rng.integers(1,100))} {rng.choice(['ชิ้น','pcs'])}",
        lambda: f"{rng.choice(['PRICE','ราคา'])}: ฿{rand_number(rng)}",
        lambda: f"{rng.choice(['ส่วนลด','DISCOUNT'])}: -{int(rng.integers(1,51))}% (฿{rand_number(rng)})",
        lambda: f"{rng.choice(['รวมสุทธิ','NET TOTAL'])}: ฿{rand_number(rng)}",
        lambda: f"{rng.choice(['พร้อมเพย์','PromptPay'])} {int(rng.integers(10**12,10**13))}",
        lambda: f"{rng.choice(['KBank','SCB','KTB','BBL'])} x{int(rng.integers(1000,9999))}",
        lambda: f"{rng.choice(thai_words)}{rng.choice(specials)}{rng.choice(english_tokens)}",
    ]

    parts = [rng.choice(templates)() for _ in range(int(rng.integers(1, 4)))]
    text = " ".join(parts)
    text = re.sub(r"\s{2,}", " ", text).strip()
    return text
    
def render_text_image(text, out_path, font_paths, rng, height=48, max_width=360):
    # choose font size and fit to max_width
    base_size = int(rng.integers(22, 35))
    font, _ = pick_font(font_paths, base_size, rng)

    pad_x = int(rng.integers(6, 15))
    pad_y = int(rng.integers(2, 10))

    bg = int(rng.integers(235, 256))
    fg = int(rng.integers(0, 21))

    def safe_bbox(fnt):
        try:
            return fnt.getbbox(text)
        except Exception:
            return (0, 0, len(text) * 10, base_size)

    # shrink font if too wide
    size = getattr(font, "size", base_size)
    for _ in range(20):
        bbox = safe_bbox(font)
        w = (bbox[2] - bbox[0]) + 2 * pad_x
        if w <= max_width:
            break
        size = max(12, int(size * 0.9))
        font, _ = pick_font(font_paths, size, rng)
        if size == 12:
            break

    bbox = safe_bbox(font)
    text_w = bbox[2] - bbox[0]
    text_h = bbox[3] - bbox[1]
    width = min(max_width, max(120, text_w + 2 * pad_x))

    img = Image.new("L", (width, height), color=bg)
    draw = ImageDraw.Draw(img)

    x = int(rng.integers(0, max(1, width - text_w - pad_x + 1)))
    y = int(rng.integers(-2, max(1, height - text_h - pad_y + 1))) - 2

    draw.text((x, y), text, font=font, fill=fg)

    # augmentations (เบา ๆ)
    if rng.random() < 0.25:
        angle = float(rng.uniform(-2.5, 2.5))
        img = img.rotate(angle, resample=Image.Resampling.BICUBIC, expand=False, fillcolor=bg)

    if rng.random() < 0.30:
        img = img.filter(ImageFilter.GaussianBlur(radius=float(rng.uniform(0.2, 0.8))))

    if rng.random() < 0.35:
        arr = np.array(img).astype(np.int16)
        noise = rng.normal(0, float(rng.uniform(2, 8)), size=arr.shape)
        arr = np.clip(arr + noise, 0, 255).astype(np.uint8)
        img = Image.fromarray(arr, mode="L")

    if rng.random() < 0.20:
        img = ImageOps.autocontrast(img, cutoff=int(rng.integers(0, 4)))

    if rng.random() < 0.15:
        scale = float(rng.uniform(0.9, 1.0))
        new_w = max(40, int(img.size[0] * scale))
        img = img.resize((new_w, height), resample=Image.Resampling.BILINEAR).resize((width, height),
                                                                                     resample=Image.Resampling.BILINEAR)

    img = img.convert("RGB")
    out_path.parent.mkdir(parents=True, exist_ok=True)
    img.save(out_path, quality=95)


In [7]:
def generate_synth_rec_dataset(
    out_root="train_data/rec",
    n_train=1200,
    n_val=200,
    seed=42,
    zip_output=False
):
    rng = np.random.default_rng(seed)
    out_root = Path(out_root)
    train_dir = out_root / "train"
    val_dir = out_root / "val"
    train_dir.mkdir(parents=True, exist_ok=True)
    val_dir.mkdir(parents=True, exist_ok=True)

    forced_texts = [
        "TOTAL ฿ 1,234.50",
        "วันที่ 16/12/2025 เวลา 09:30",
        "เลขที่ INV-2025-000123",
        "ส่วนลด: -50% (฿99.00)",
        "EMAIL: sales123@company.co.th",
        "TEL: 02-123-4567",
        "PromptPay 0123456789012",
        "ที่อยู่: กรุงเทพฯ 10110",
        "QTY: 12 pcs PRICE: ฿19.99",
        "REF#A123 @TEST (OK)"
    ]

    def gen_split(n, split_name, outdir):
        labels = []
        for i in range(n):
            text = forced_texts[i] if (split_name == "train" and i < len(forced_texts)) else make_text(rng)
            fname = f"{split_name}_{i:06d}.jpg"
            rel_path = f"{split_name}/{fname}"   # IMPORTANT: relative to out_root
            abs_path = outdir / fname
            render_text_image(text, abs_path, FONT_PATHS, rng)
            labels.append((rel_path, text))
        return labels

    train_labels = gen_split(n_train, "train", train_dir)
    val_labels = gen_split(n_val, "val", val_dir)

    # Write label files (TAB-separated)
    (out_root / "rec_gt_train.txt").write_text(
        "".join([f"{p}\t{t}\n" for p, t in train_labels]),
        encoding="utf-8"
    )
    (out_root / "rec_gt_val.txt").write_text(
        "".join([f"{p}\t{t}\n" for p, t in val_labels]),
        encoding="utf-8"
    )

    # Build dict (exclude space; use use_space_char=True ตอน train)
    chars = set()
    for _, t in train_labels + val_labels:
        for ch in t:
            if ch != " ":
                chars.add(ch)
    dict_path = out_root / "dict_th_en_num_special.txt"
    dict_path.write_text("".join([ch + "\n" for ch in sorted(chars, key=lambda c: ord(c))]), encoding="utf-8")

    zip_path = None
    if zip_output:
        zip_path = out_root.parent / "synthetic_rec_dataset.zip"
        with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as z:
            for fp in out_root.parent.rglob("*"):
                if fp.is_file():
                    z.write(fp, arcname=str(fp.relative_to(out_root.parent)))

    return {
        "out_root": str(out_root),
        "train_count": len(train_labels),
        "val_count": len(val_labels),
        "dict_path": str(dict_path),
        "zip_path": str(zip_path) if zip_path else None,
        "sample_labels": train_labels[:10],
        "charset_size": len(chars),
    }

info = generate_synth_rec_dataset(out_root="train_data/rec", n_train=50, n_val=128, seed=42, zip_output=False)
info


{'out_root': 'train_data/rec',
 'train_count': 50,
 'val_count': 128,
 'dict_path': 'train_data/rec/dict_th_en_num_special.txt',
 'zip_path': None,
 'sample_labels': [('train/train_000000.jpg', 'TOTAL ฿ 1,234.50'),
  ('train/train_000001.jpg', 'วันที่ 16/12/2025 เวลา 09:30'),
  ('train/train_000002.jpg', 'เลขที่ INV-2025-000123'),
  ('train/train_000003.jpg', 'ส่วนลด: -50% (฿99.00)'),
  ('train/train_000004.jpg', 'EMAIL: sales123@company.co.th'),
  ('train/train_000005.jpg', 'TEL: 02-123-4567'),
  ('train/train_000006.jpg', 'PromptPay 0123456789012'),
  ('train/train_000007.jpg', 'ที่อยู่: กรุงเทพฯ 10110'),
  ('train/train_000008.jpg', 'QTY: 12 pcs PRICE: ฿19.99'),
  ('train/train_000009.jpg', 'REF#A123 @TEST (OK)')],
 'charset_size': 109}

In [8]:
from typing import Any


from pathlib import Path

data_dir = Path("train_data/rec")
gt_files = [data_dir/"rec_gt_train.txt", data_dir/"rec_gt_val.txt"]
dict_file = data_dir/"dict_th_en_num_special.txt"

# read dict
dict_chars = set(dict_file.read_text(encoding="utf-8").splitlines())
# NOTE: space ไม่อยู่ใน dict ถ้าคุณจะใช้ use_space_char=True
dict_chars_no_space = dict_chars | {" "}  # ถือว่ารองรับ space ด้วย

missing = set[Any]()
for gt in gt_files:
    for line in gt.read_text(encoding="utf-8").splitlines():
        if "\t" not in line:
            raise ValueError(f"Label line ไม่มี TAB: {line[:80]}")
        _, text = line.split("\t", 1)
        for ch in text:
            if ch not in dict_chars_no_space:
                missing.add(ch)

print("Missing chars:", missing)
print("OK" if not missing else "❌ dict ไม่ครอบคลุม")


Missing chars: set()
OK
