# Generate image dataset for Khmer text recognition

In [20]:
from PIL import features
print(features.check("raqm"))

False


In [19]:
import pandas as pd
from PIL import Image, ImageDraw, ImageFont
import os
import random
import numpy as np
from sklearn.model_selection import train_test_split
import shutil
from IPython.display import FileLink

## 1. Load datasets

### 1.1 Loading text word 

In [10]:
# 1.1. Loading words data
dataset_path = 'combined_cleaned.txt'

# Read all words from the text file
with open(dataset_path, 'r', encoding='utf-8') as f:
    words = [line.strip() for line in f if line.strip()]

print(f"\n✓ Loaded {len(words)} words from {dataset_path}")
print(f"Sample words: {words[:5]}")

# Create DataFrame
df = pd.DataFrame({'word': words})
print(f"\nDataFrame shape: {df.shape}")


✓ Loaded 2190188 words from combined_cleaned.txt
Sample words: ['អាណិត', 'គាត', 'ណាស់', 'លោក', '៨០']

DataFrame shape: (2190188, 1)


## 2: Generate text to images

### 2.1. Import function for generate text to image

In [11]:
def gen_khmer_text_image(index, content, data_type, bg, 
                        font_path, font_size, data_folder, padding=10):
    """
    Generate an image from Khmer text with specified styling parameters
    Image size adapts to text content
    
    Args:
        index: Index number for filename
        content: The text to render
        data_type: 'train', 'valid', or 'test'
        bg: Background color (R, G, B, A)
        font_path: Path to the font file
        font_size: Size of the font
        data_folder: Base folder for output
        padding: Padding around text (pixels)
    
    Returns:
        Filename of the generated image
    """
    # Load font first to measure text
    try:
        font = ImageFont.truetype(font_path, font_size)
    except:
        print(f"Warning: Could not load font {font_path}, using default")
        font = ImageFont.load_default()
    
    # Create temporary image to measure text
    temp_img = Image.new('RGBA', (1, 1))
    temp_draw = ImageDraw.Draw(temp_img)
    
    # Get text bounding box
    bbox = temp_draw.textbbox((0, 0), content, font=font)
    text_width = bbox[2] - bbox[0]
    text_height = bbox[3] - bbox[1]
    
    # Calculate image size based on text with padding
    img_width = text_width + (padding * 2)
    img_height = text_height + (padding * 2)
    
    # Create actual image with calculated size
    image = Image.new('RGBA', (img_width, img_height), bg)
    draw = ImageDraw.Draw(image)
    
    # Draw text with padding offset
    draw.text((padding, padding), content, font=font, fill=(0, 0, 0, 255))
    
    # Generate filename with 6-digit index
    filename = f"{index:06d}.png"
    
    # Create output directory if it doesn't exist
    output_dir = os.path.join(data_folder, data_type)
    os.makedirs(output_dir, exist_ok=True)
    
    # Save image
    output_path = os.path.join(output_dir, filename)
    image.save(output_path)
    
    return filename


### 2.2. Define Variant values for Function Parameters

In [12]:
fonts_dir = "fonts"
fonts = []

if os.path.exists(fonts_dir):
    for filename in os.listdir(fonts_dir):
        if filename.endswith(('.ttf', '.otf', '.TTF', '.OTF')):
            font_path = os.path.join(fonts_dir, filename)
            fonts.append(font_path)
    fonts.sort()  # Sort alphabetically for consistency
else:
    print(f"Warning: '{fonts_dir}' folder not found!")
    fonts = []

if not fonts:
    print("ERROR: No font files found in 'fonts/' folder!")
    print("Please ensure .ttf or .otf font files are in the 'fonts/' directory")
    exit()

print(f"\nDiscovered fonts:")
for font in fonts:
    print(f"  • {font}")

# Font sizes
font_sizes = [9,10,11,12,13,14,15,16]

# Background colors
bg_colors = [
    (255, 255, 255, 255),
]

print(f"\n✓ {len(fonts)} fonts")
print(f"✓ {len(font_sizes)} font sizes")
print(f"✓ {len(bg_colors)} background colors")



Discovered fonts:
  • fonts\KhmerDigital-Black.ttf
  • fonts\KhmerDigital-Bold.ttf
  • fonts\KhmerDigital-ExtraBold.ttf
  • fonts\KhmerDigital-ExtraLight.ttf
  • fonts\KhmerDigital-Light.ttf
  • fonts\KhmerDigital-Medium.ttf
  • fonts\KhmerDigital-Regular.ttf
  • fonts\KhmerDigital-SemiBold.ttf
  • fonts\KhmerDigital-Thin.ttf
  • fonts\KhmerDigitalMax.ttf
  • fonts\KhmerDigitalNumber.ttf
  • fonts\KhmerDigitalNumberMax.ttf
  • fonts\KhmerMPTC.ttf
  • fonts\KhmerMPTCMoul.otf
  • fonts\KhmerOS_muollight.ttf
  • fonts\KhmerOS_siemreap.ttf

✓ 16 fonts
✓ 8 font sizes
✓ 1 background colors


### 2.3 Splitting The Dataset: Train, Validation, Test


In [13]:
# 2.3 Splitting The Dataset: Train, Validation, Test
print("\n2.3. Splitting the dataset...")

# Split: 70% train, 15% validation, 15% test
train, temp = train_test_split(df, test_size=0.3, random_state=42)
valid, test = train_test_split(temp, test_size=0.5, random_state=42)

# Reset indices for proper numbering
train = train.reset_index(drop=True)
valid = valid.reset_index(drop=True)
test = test.reset_index(drop=True)

print(f"✓ Train: {len(train)} words")
print(f"✓ Validation: {len(valid)} words")
print(f"✓ Test: {len(test)} words")



2.3. Splitting the dataset...
✓ Train: 1533131 words
✓ Validation: 328528 words
✓ Test: 328529 words


### 2.4 Generating Text to Images


In [14]:
# 2.4 Generating Text to Images
# Create base output directory
data_folder = "data_v1"
os.makedirs(data_folder, exist_ok=True)

# Lists to store labels
train_labels = []
valid_labels = []
test_labels = []

In [15]:
# ============================================================================
# Generating training data to image
# ============================================================================
print("\n" + "-"*60)
print("Generating TRAIN images...")
print("-"*60)
i = 1
n = len(train)
for index, row in train.iterrows():
    font_size = random.choice(font_sizes)
    font = random.choice(fonts)
    bg = random.choice(bg_colors)
    
    try:
        filename = gen_khmer_text_image(
            index=index+1, 
            content=row["word"],
            data_type="train", 
            bg=bg,
            font_path=font, 
            font_size=font_size,
            data_folder=data_folder
        )
        
        train_labels.append(f"{filename}\t{row['word']}")
    except Exception as e:
        print(f"Error processing word '{row['word']}': {e}")
        continue
    
    if i % 100 == 0 or i == n:
        print(f"{i} of {n}: complete")
    i = i + 1


------------------------------------------------------------
Generating TRAIN images...
------------------------------------------------------------
100 of 1533131: complete
200 of 1533131: complete
300 of 1533131: complete
400 of 1533131: complete
500 of 1533131: complete
600 of 1533131: complete
700 of 1533131: complete
800 of 1533131: complete
900 of 1533131: complete
1000 of 1533131: complete
1100 of 1533131: complete
1200 of 1533131: complete
1300 of 1533131: complete


KeyboardInterrupt: 

In [None]:
# ============================================================================
# Generating validation data to image
# ============================================================================
print("\n" + "-"*60)
print("Generating VALID images...")
print("-"*60)
i = 1
n = len(valid)
for index, row in valid.iterrows():
    font_size = random.choice(font_sizes)
    font = random.choice(fonts)
    bg = random.choice(bg_colors)
    
    try:
        filename = gen_khmer_text_image(
            index=index+1, 
            content=row["word"],
            data_type="valid", 
            bg=bg,
            font_path=font, 
            font_size=font_size,
            data_folder=data_folder
        )
        
        valid_labels.append(f"{filename}\t{row['word']}")
    except Exception as e:
        print(f"Error processing word '{row['word']}': {e}")
        continue
    
    if i % 100 == 0 or i == n:
        print(f"{i} of {n}: complete")
    i = i + 1


In [None]:

# ============================================================================
# Generating testing data to image
# ============================================================================
print("\n" + "-"*60)
print("Generating TEST images...")
print("-"*60)
i = 1
n = len(test)
for index, row in test.iterrows():
    font_size = random.choice(font_sizes)
    font = random.choice(fonts)
    bg = random.choice(bg_colors)
    
    try:
        filename = gen_khmer_text_image(
            index=index+1, 
            content=row["word"],
            data_type="test", 
            bg=bg,
            font_path=font, 
            font_size=font_size,
            data_folder=data_folder
        )
        
        test_labels.append(f"{filename}\t{row['word']}")
    except Exception as e:
        print(f"Error processing word '{row['word']}': {e}")
        continue
    
    if i % 100 == 0 or i == n:
        print(f"{i} of {n}: complete")
    i = i + 1

In [None]:
print("\n" + "-"*60)
print("Saving label files...")
print("-"*60)

# Save train.txt
with open(os.path.join(data_folder, 'train.txt'), 'w', encoding='utf-8') as f:
    f.write('\n'.join(train_labels))
print(f"✓ Saved train.txt ({len(train_labels)} entries)")

# Save valid.txt
with open(os.path.join(data_folder, 'valid.txt'), 'w', encoding='utf-8') as f:
    f.write('\n'.join(valid_labels))
print(f"✓ Saved valid.txt ({len(valid_labels)} entries)")

# Save test.txt
with open(os.path.join(data_folder, 'test.txt'), 'w', encoding='utf-8') as f:
    f.write('\n'.join(test_labels))
print(f"✓ Saved test.txt ({len(test_labels)} entries)")



------------------------------------------------------------
Saving label files...
------------------------------------------------------------
✓ Saved train.txt (7475 entries)
✓ Saved valid.txt (0 entries)
✓ Saved test.txt (0 entries)


In [None]:
print("\n" + "="*60)
print("GENERATION COMPLETE!")
print("="*60)
print(f"Train images: {len(train_labels)} → data_v1/train/")
print(f"Valid images: {len(valid_labels)} → data_v1/valid/")
print(f"Test images: {len(test_labels)} → data_v1/test/")
print(f"\nLabel files:")
print(f"  • data_v1/train.txt")
print(f"  • data_v1/valid.txt")
print(f"  • data_v1/test.txt")
print("="*60)

In [None]:
zip_filename = "data_v1"
shutil.make_archive(zip_filename, 'zip', data_folder)

print(f"✓ Created {zip_filename}.zip")
print(f"✓ File size: {os.path.getsize(zip_filename + '.zip') / (1024*1024):.2f} MB")
print("\n" + "="*60)
print("DOWNLOAD READY!")
print("="*60)
print(f"Download file: {zip_filename}.zip")

In [None]:
try:
    display(FileLink(f"{zip_filename}.zip"))
    print("\nClick the link above to download")
except:
    print(f"\nTo download, locate the file: {zip_filename}.zip")
    print("In Jupyter: Right-click the file in the file browser and select 'Download'")
    print("In Colab: Find the file in the Files panel on the left and click the download icon")