In [1]:
from docx import Document

def extract_docx_text(docx_path):
    """Return the text of a .docx file as one newline-separated string.

    Args:
        docx_path: Location of the Word document; handed unchanged to
            ``Document``.

    Returns:
        str: The ``text`` of every entry in the document's ``paragraphs``,
        in document order, joined with a newline character.
    """
    document = Document(docx_path)
    paragraph_texts = []
    for paragraph in document.paragraphs:
        paragraph_texts.append(paragraph.text)
    return "\n".join(paragraph_texts)

In [None]:
# Quick check: print whatever extract_docx_text returns for the sample document.
print(extract_docx_text("./ocr-text/the-state.docx"))

In [2]:
from PIL import Image, ImageDraw, ImageFont

def text_to_image(text, font_path, image_size=(400, 300), font_size=40):
    """Draw ``text`` in black on a white RGB canvas and return the image.

    Args:
        text: String to render.
        font_path: Path to the TrueType font file used for rendering.
        image_size: ``(width, height)`` of the canvas in pixels.
        font_size: Size passed to ``ImageFont.truetype``.

    Returns:
        The new image. The text is anchored at pixel ``(50, 50)``; no
        wrapping or canvas resizing is done here.
    """
    white, black = (255, 255, 255), (0, 0, 0)
    anchor = (50, 50)

    canvas = Image.new('RGB', image_size, color=white)
    typeface = ImageFont.truetype(font_path, font_size)
    ImageDraw.Draw(canvas).text(anchor, text, font=typeface, fill=black)
    return canvas

# Smoke test: render one short sample string with the Shan.ttf font and
# save the result as a PNG so the rendering can be checked by eye.
text = "မႂ်ႇသုင်ၶႃႈ။"
img = text_to_image(text, "./ocr-text/Shan.ttf")
img.save("./ocr-text/sample_image.png")


## Generate text images and ground-truth files from DOCX

In [1]:
from docx import Document
from PIL import Image, ImageDraw, ImageFont
import time
import os

def extract_docx_text(docx_path, max_chars=30):
    """Split the text of a .docx file into short chunks of whole words.

    Paragraph texts are split on single spaces and the resulting words are
    packed greedily, in document order, into chunks whose length (words
    joined by one space) does not exceed ``max_chars``. Chunks may span
    paragraph boundaries.

    Args:
        docx_path: Location of the Word document; handed unchanged to
            ``Document``.
        max_chars: Maximum number of characters per chunk. Defaults to 30.

    Returns:
        list: The chunks as strings, each stripped of surrounding
        whitespace. Empty chunks are never produced. A single word longer
        than ``max_chars`` is not split; it becomes a chunk of its own.
    """
    doc = Document(docx_path)
    chunks = []
    current = ""

    for para in doc.paragraphs:
        for word in para.text.split(" "):
            # Consecutive spaces and empty paragraphs yield empty tokens;
            # skip them so they neither pad the chunk nor eat into the limit.
            if not word.strip():
                continue

            if not current:
                # Start a new chunk without a leading space so every chunk
                # is measured against the same limit.
                current = word
            elif len(current) + 1 + len(word) > max_chars:
                chunks.append(current.strip())
                current = word
            else:
                current += " " + word

    if current:
        chunks.append(current.strip())

    return chunks

def text_to_image(text, font_path, image_size=(800, 200), font_size=24):
    """Render ``text`` as black glyphs on a white RGB image.

    Args:
        text: String to draw.
        font_path: Path to the TrueType font file to load.
        image_size: ``(width, height)`` of the canvas in pixels.
        font_size: Size passed to ``ImageFont.truetype``.

    Returns:
        The freshly created image with the text drawn starting at pixel
        ``(50, 50)``. The canvas size is fixed; nothing here wraps or
        shrinks text that does not fit.
    """
    background = (255, 255, 255)
    foreground = (0, 0, 0)
    top_left = (50, 50)

    loaded_font = ImageFont.truetype(font_path, font_size)
    canvas = Image.new('RGB', image_size, color=background)
    pen = ImageDraw.Draw(canvas)
    pen.text(top_left, text, font=loaded_font, fill=foreground)
    return canvas

def generate_images_from_docx(docx_path, font_path, output_dir):
    """Render every text chunk of a .docx file to an image/text file pair.

    For each non-empty chunk returned by ``extract_docx_text`` two files
    sharing one timestamp-based stem are written into ``output_dir``:
    ``<stem>.tif`` (the chunk rendered by ``text_to_image``) and
    ``<stem>.gt.txt`` (the chunk text, UTF-8 encoded).

    Args:
        docx_path: Word document to read.
        font_path: TrueType font file used for rendering.
        output_dir: Directory that receives the files; created if missing.

    Returns:
        None. Progress is reported with one printed line per saved chunk.
    """
    # Create the target folder up front; the writes below do not create it.
    os.makedirs(output_dir, exist_ok=True)

    for chunk in extract_docx_text(docx_path):
        chunk = chunk.strip()
        if not chunk:
            continue

        img = text_to_image(chunk, font_path)

        # The stem is the current timestamp. If the clock returns the same
        # value twice, add a numeric suffix instead of silently overwriting
        # the pair that was written a moment ago.
        base = os.path.join(output_dir, str(time.time()))
        stem, attempt = base, 0
        while os.path.exists(f"{stem}.tif") or os.path.exists(f"{stem}.gt.txt"):
            attempt += 1
            stem = f"{base}_{attempt}"

        # save TIF
        img.save(f"{stem}.tif")

        # save TXT
        with open(f"{stem}.gt.txt", "w", encoding='utf-8') as text_file:
            text_file.write(chunk)

        print(f"Saved image for word: {chunk}")


In [None]:
from pathlib import Path

# Render every Word document found anywhere under the dataset folder.
datasets = Path("./kawtai-dataset")

# These do not change per document, so set them once instead of on every
# loop iteration.
font_path = "./PangLong.ttf"
output_dir = "./output"

# Make sure the output directory exists before any files are written into it.
Path(output_dir).mkdir(parents=True, exist_ok=True)

# Sort the matches so documents are processed in the same order on every run.
for file_path in sorted(datasets.rglob("*.docx")):
    if file_path.is_file():
        docx_path = file_path
        generate_images_from_docx(docx_path, font_path, output_dir)

In [None]:
import time
from OCRDataGenerator import OCRDataGenerator

import os

output_dir = "./output"
fonts = [
    "./Shan.ttf",
    "./PangLong.ttf"
]

# Make sure the output directory exists before any files are written into it.
os.makedirs(output_dir, exist_ok=True)

generator = OCRDataGenerator(font_paths=fonts)

# Sample strings to render, one image/ground-truth pair per entry.
texts = [
    "ၸွမ်းတီႈၼႂ်းပိူင်ၵၢၼ်ပၢႆးမၢၵ်ႈမုၼ်းယူႇႁိုဝ်?",
    "လူဝ်ႇမီးလွင်ႈလူင်ပွင်ႊၸိုင်ႈၶဝ်ႈပႃး",
    "တီႈလႂ် မႃး။ ၵမ်ႈၼမ်ၼမ်တႄႉ"
]

for text in texts:
    # generate_image returns an (image, metadata) pair. Noise and random
    # transforms are switched off via the flags below.
    # NOTE(review): the exact meaning of the size/padding options is defined
    # by OCRDataGenerator, which is not visible here - confirm there.
    image, metadata = generator.generate_image(
        text=text,
        min_font_size=24,
        max_font_size=48,
        horizontal_padding=40,
        vertical_padding=20,
        min_height=64,
        add_noise=False,
        random_transform=False
    )
    print(f"Text: {text}")
    print(f"Image size: {metadata['image_size']}\n")

    # The current timestamp is the shared file stem for the image and its text.
    ts = time.time()

    # save TIF
    image.save(f"{output_dir}/{ts}.tif")

    # save TXT
    with open(f"{output_dir}/{ts}.gt.txt", "w", encoding='utf-8') as text_file:
        text_file.write(text)

    print(f"Saved image for word: {text}")