In [None]:
import os
import re
import time

from datasets import load_dataset
from docx import Document

from OCRDataGenerator import OCRDataGenerator

def extract_docx_text(docx_path, max_chars=30):
    """Read a .docx file and pack its words into short text chunks.

    Words are taken from every paragraph in document order and packed
    greedily into chunks of at most ``max_chars`` characters, with words
    joined by a single space. A chunk may span a paragraph boundary. A word
    longer than ``max_chars`` is never split; it becomes a chunk of its own.

    Args:
        docx_path: Location of the .docx file, passed straight to ``Document``.
        max_chars: Maximum chunk length in characters. Defaults to 30, the
            limit that used to be hard-coded here.

    Returns:
        list[str]: The chunks, each non-empty and free of leading/trailing
        whitespace. An empty document yields an empty list.
    """
    doc = Document(docx_path)
    chunks = []
    current = ""

    for para in doc.paragraphs:
        # split() without arguments drops empty tokens and also breaks on
        # tabs/newlines, so a chunk can never contain a line break or a run
        # of blanks.
        for word in para.text.split():
            if not current:
                # Start a chunk without a leading space so that the length
                # check below measures exactly what will be emitted.
                current = word
            elif len(current) + 1 + len(word) > max_chars:
                chunks.append(current)
                current = word
            else:
                current += " " + word

    if current:
        chunks.append(current)

    return chunks

def chunk_text(paragraphs, max_chars=30):
    """Pack the words of a text into short chunks.

    The text is split on whitespace and its words are packed greedily into
    chunks of at most ``max_chars`` characters, with words joined by a single
    space. A word longer than ``max_chars`` is never split; it becomes a
    chunk of its own.

    Args:
        paragraphs: The text to chunk, as one string. ``None`` or an empty
            string yields an empty list.
        max_chars: Maximum chunk length in characters. Defaults to 30, the
            limit that used to be hard-coded here.

    Returns:
        list[str]: The chunks, each non-empty and free of leading/trailing
        whitespace.
    """
    if not paragraphs:
        return []

    chunks = []
    current = ""

    # split() without arguments drops empty tokens and also breaks on
    # tabs/newlines, so a chunk can never contain a line break or a run of
    # blanks.
    for word in paragraphs.split():
        if not current:
            # Start a chunk without a leading space so that the length check
            # below measures exactly what will be emitted.
            current = word
        elif len(current) + 1 + len(word) > max_chars:
            chunks.append(current)
            current = word
        else:
            current += " " + word

    if current:
        chunks.append(current)

    return chunks

def clean_shan_text(text, keep_numbers=False):
    """Strip everything except Myanmar-block characters and whitespace.

    Characters outside the Unicode Myanmar block (U+1000-U+109F, which holds
    the Shan letters) are removed, e.g. Latin letters and emoji. Whitespace
    is kept, but every run of whitespace (including tabs and newlines) is
    collapsed to one space so the result is always a single line without
    double blanks.

    Args:
        text: The string to clean.
        keep_numbers: When True, ASCII digits 0-9 are kept as well. Myanmar
            and Shan digits live inside U+1000-U+109F and are therefore kept
            regardless of this flag.

    Returns:
        str: The cleaned text without leading/trailing whitespace.
    """
    # Escapes instead of literal characters: same range, but immune to
    # editor/encoding mix-ups.
    allowed_chars = r"\u1000-\u109F"
    if keep_numbers:
        # Not \d: in a str pattern \d matches every Unicode decimal digit
        # (Arabic-Indic, Devanagari, fullwidth, ...), not just 0-9.
        allowed_chars += "0-9"

    # Drop emoji and any other character that is not explicitly allowed.
    cleaned_text = re.sub(rf"[^{allowed_chars}\s]", "", text)

    # Removing a word leaves its surrounding blanks behind; fold them.
    return re.sub(r"\s+", " ", cleaned_text).strip()


In [None]:
# Manual check of clean_shan_text on a sample that mixes Latin words, Shan
# text, emoji and an ASCII digit. With keep_numbers=True the printed result
# should contain only the Shan text and the digit 5.
text = "Hello World ထၢမ်ႁႃပေႃႈမႄႈမၼ်းၵူႈၶိုၼ်းႁွင်ႉႁွင်ႉႁႆႈတိၵ်းတိၵ်းယဝ်ႉ😭😭 5 ဝၼ်း "

print(clean_shan_text(text, keep_numbers=True))

In [None]:
def generate_images_from_docx(docx_path, fonts, output_dir):
    """Render every text chunk of a .docx file into an image / ground-truth pair.

    For each non-empty chunk returned by ``extract_docx_text`` one image is
    generated and two files sharing the same stem are written to
    ``output_dir``: ``<stem>.tif`` (the image) and ``<stem>.gt.txt`` (the
    chunk text, UTF-8).

    Args:
        docx_path: Location of the .docx file to read.
        fonts: Font file paths handed to ``OCRDataGenerator(font_paths=...)``.
        output_dir: Directory that receives the files; created if missing.

    Returns:
        None. Progress is reported with ``print``.
    """
    generator = OCRDataGenerator(font_paths=fonts)

    # Neither image.save() nor open() creates missing directories.
    os.makedirs(output_dir, exist_ok=True)

    for text in extract_docx_text(docx_path):
        text = text.strip()
        if not text:
            continue

        image, metadata = generator.generate_image(
            text=text,
            min_font_size=24,
            max_font_size=48,
            horizontal_padding=40,
            vertical_padding=20,
            min_height=64,
            add_noise=False,
            random_transform=False
        )
        print(f"Text: {text}")
        print(f"Image size: {metadata['image_size']}\n")

        # The timestamp alone is not unique: two samples produced within one
        # clock tick would overwrite each other. Keep the plain timestamp
        # when it is free and add a numeric suffix only on a collision.
        ts = time.time()
        stem = f"{ts}"
        attempt = 0
        while os.path.exists(f"{output_dir}/{stem}.tif"):
            attempt += 1
            stem = f"{ts}_{attempt}"

        # save TIF
        image.save(f"{output_dir}/{stem}.tif")

        # save TXT
        with open(f"{output_dir}/{stem}.gt.txt", "w", encoding='utf-8') as text_file:
            text_file.write(text)

        print(f"Saved image for word: {text}")

In [None]:
def generate_images_from_huggingface(dataset_repo, chunk_size, fonts, output_dir):
    """Render text chunks of a Hugging Face dataset into image / ground-truth pairs.

    The "content" column of the dataset's "train" split is read record by
    record. Each record is chunked with ``chunk_text`` and cleaned with
    ``clean_shan_text(keep_numbers=True)``; for every chunk that is still
    non-empty one image is generated and two files sharing the same stem are
    written to ``output_dir``: ``<stem>.tif`` and ``<stem>.gt.txt`` (UTF-8).

    Args:
        dataset_repo: Dataset identifier passed to ``load_dataset``.
        chunk_size: Maximum number of samples to write. Generation stops as
            soon as exactly this many have been saved; a value <= 0 writes
            nothing.
        fonts: Font file paths handed to ``OCRDataGenerator(font_paths=...)``.
        output_dir: Directory that receives the files; created if missing.

    Returns:
        None. Progress is reported with ``print``.
    """
    if chunk_size <= 0:
        return

    generator = OCRDataGenerator(font_paths=fonts)
    chunk_count = 0

    # Neither image.save() nor open() creates missing directories.
    os.makedirs(output_dir, exist_ok=True)

    # Load dataset from Hugging Face
    dataset = load_dataset(dataset_repo, split="train")
    contents = dataset["content"]

    for content in contents:
        # Records without text (None / "") have nothing to render.
        if not content:
            continue

        for text in chunk_text(content):
            text = text.strip()
            text = clean_shan_text(text, keep_numbers=True)

            if not text:
                continue

            image, metadata = generator.generate_image(
                text=text,
                min_font_size=24,
                max_font_size=48,
                horizontal_padding=40,
                vertical_padding=20,
                min_height=64,
                add_noise=False,
                random_transform=False
            )
            print(f"Text: {text}")
            print(f"Image size: {metadata['image_size']}\n")

            # The timestamp alone is not unique: two samples produced within
            # one clock tick would overwrite each other. Keep the plain
            # timestamp when it is free and add a suffix only on a collision.
            ts = time.time()
            stem = f"{ts}"
            attempt = 0
            while os.path.exists(f"{output_dir}/{stem}.tif"):
                attempt += 1
                stem = f"{ts}_{attempt}"

            # Save TIF
            image.save(f"{output_dir}/{stem}.tif")

            # Save TXT
            with open(f"{output_dir}/{stem}.gt.txt", "w", encoding='utf-8') as text_file:
                text_file.write(text)

            print(f"Saved image for word: {text}")

            chunk_count += 1
            print(f"chunk size: {chunk_count}")

            # ">=" so that exactly chunk_size samples are written, not one more.
            if chunk_count >= chunk_size:
                return


In [None]:
from pathlib import Path

# Batch-convert every .docx file found below the dataset folder into OCR
# training samples (image + ground-truth text) in output_dir.
datasets = Path("./kawtai-dataset")
output_dir = "./output"
fonts = [
    "./Shan.ttf",
    "./PangLong.ttf"
]

# sorted() so that reruns walk the files in a stable order.
for file_path in sorted(datasets.rglob("*.docx")):
    # Word keeps "~$<name>.docx" lock files next to documents that are open.
    # They match the glob but are not real .docx packages, so loading one
    # would abort the whole batch.
    if file_path.name.startswith("~$") or not file_path.is_file():
        continue

    docx_path = file_path
    generate_images_from_docx(docx_path, fonts, output_dir)

In [None]:
# Generate OCR samples from Hugging Face datasets.
# NOTE: this cell rebinds the notebook-level names output_dir and fonts, so
# any cell run afterwards sees these values.
output_dir = "../data/shn-ground-truth"
fonts = [
    "./Shan.ttf",
    "./PangLong.ttf",
    "./GreatHorKham_Taunggyi.ttf"
]

# Dataset repositories to process; uncomment an entry to include it.
huggingface_datasets_repo = [
    "NorHsangPha/shan-novel-tainovel_com",
    # "NorHsangPha/shan-news-shannews_org",
    # "NorHsangPha/shan-news-taifreedom_com",
    # "NorHsangPha/shan-news-shanhumanrights_org",
    # "NorHsangPha/shan-news-ssppssa_org",
]

# chunk_size is passed per call, i.e. it limits each repository separately.
for repo in huggingface_datasets_repo:
    generate_images_from_huggingface(dataset_repo=repo, chunk_size=1000000, fonts=fonts, output_dir=output_dir)

In [None]:
from datasets import load_dataset

# Inspection: load the "train" split and display the "content" field of its
# first record (as the cell's last expression, it is shown as the output).
dataset = load_dataset("NorHsangPha/shan-novel-tainovel_com", split="train")

dataset["content"][0]