In [None]:
import os

# Root directory of the IAM handwriting word database under /kaggle/input.
dataset_path = "/kaggle/input/iam-handwriting-word-database"

# Stop right away if that directory is not present.
dataset_is_present = os.path.exists(dataset_path)
assert dataset_is_present, "Dataset not found!"

print("Dataset loaded successfully.")

In [None]:
# Location of the transcription index (words.txt) inside the iam_words folder.
transcriptions_path = os.path.join(dataset_path, "iam_words", "words.txt")

# Maps each image id (first field of a record) to its transcription text.
image_text_map = {}

# Stream the file line by line rather than materialising it with readlines().
with open(transcriptions_path, "r") as f:
    for line in f:
        if line.startswith("#"):  # comment/header lines carry no record
            continue

        parts = line.strip().split()

        # A usable record has at least 9 whitespace-separated fields: the id
        # at index 0 and the transcription from index 8 onward. Without this
        # guard a blank line raises IndexError on parts[0] and a truncated
        # line silently stores an empty transcription.
        if len(parts) < 9:
            continue

        image_id = parts[0]
        text = " ".join(parts[8:])  # re-join in case the text was split on spaces
        image_text_map[image_id] = text

# Display first few mappings
list(image_text_map.items())[:5]

In [None]:
import os
import shutil
import glob
from PIL import Image, ImageDraw, ImageFont
from tqdm import tqdm

# --- Input: root of the IAM word database (adjust when running elsewhere) ---
dataset_path = "/kaggle/input/iam-handwriting-word-database"

# --- Output: folder that will receive the printed/handwritten pairs ---
paired_dataset_folder = "/kaggle/working/scrabblegan_dataset"
os.makedirs(paired_dataset_folder, exist_ok=True)  # no error if it already exists

# --- Font used to render the printed version of each word (size 32) ---
font_path = "/usr/share/fonts/truetype/liberation/LiberationSerif-Regular.ttf"
font = ImageFont.truetype(font_path, size=32)

# --- Folder holding the handwritten word images; abort if it is missing ---
handwritten_images_folder = os.path.join(dataset_path, "iam_words", "words")
words_folder_present = os.path.exists(handwritten_images_folder)
assert words_folder_present, f"❌ Directory not found: {handwritten_images_folder}"

print("✅ IAM dataset path verified.")

In [None]:
from tqdm import tqdm
import glob
import os

# Recursively collect every PNG below the handwritten words folder.
handwritten_images_paths = glob.glob(
    os.path.join(handwritten_images_folder, "**", "*.png"), recursive=True
)

# Map image id (file name without its extension) -> full path on disk.
handwritten_images_map = {}
for png in tqdm(handwritten_images_paths, desc="🔍 Mapping handwritten images", unit="file"):
    # splitext removes only the trailing extension; str.replace(".png", "")
    # would also delete a ".png" that appears in the middle of a file name.
    image_id = os.path.splitext(os.path.basename(png))[0]
    handwritten_images_map[image_id] = png

print(f"✅ Found {len(handwritten_images_map)} handwritten images.")
print("📝 Sample mappings:", list(handwritten_images_map.items())[:5])

In [None]:
# Transcription index (words.txt) inside the iam_words folder
words_txt_path = os.path.join(dataset_path, "iam_words", "words.txt")

# Build image id -> transcription, skipping comment lines and records
# that have fewer than 9 whitespace-separated fields
image_text_map = {}
with open(words_txt_path, "r") as f:
    for line in f:
        if line.startswith("#"):
            continue

        parts = line.strip().split()
        if len(parts) < 9:
            continue

        # Field 0 is the image id; fields 8.. are joined back into the text
        image_id, text = parts[0], " ".join(parts[8:])
        image_text_map[image_id] = text

print(f"✅ Loaded {len(image_text_map)} transcriptions.")
print("🔍 Sample transcriptions:", list(image_text_map.items())[:5])

In [None]:
# Render a printed-text image for every transcription that has a handwritten
# counterpart, and store both images next to each other in the paired folder
# as "<image_id>_printed.png" and "<image_id>_handwritten.png".

# Canvas geometry. The canvas stays 300x100 whenever the text fits; it is only
# widened for transcriptions that would otherwise be cut off at the right edge.
min_canvas_width = 300
canvas_height = 100
text_origin = (10, 30)  # top-left corner where the text is drawn
right_margin = 10

paired_count = 0
for image_id, text in tqdm(image_text_map.items()):
    # Only transcriptions with a matching handwritten image form a pair.
    if image_id not in handwritten_images_map:
        continue
    handwritten_img_path = handwritten_images_map[image_id]

    # Measure the rendered text: getbbox returns (left, top, right, bottom)
    # relative to the drawing origin, so index 2 is how far right the text reaches.
    text_right = int(font.getbbox(text)[2])
    canvas_width = max(min_canvas_width, text_origin[0] + text_right + right_margin)

    # Create printed text image
    img = Image.new("RGB", (canvas_width, canvas_height), "white")
    draw = ImageDraw.Draw(img)
    draw.text(text_origin, text, font=font, fill="black")

    # Save printed text image
    printed_img_path = os.path.join(paired_dataset_folder, f"{image_id}_printed.png")
    img.save(printed_img_path)

    # Copy handwritten image to paired dataset folder
    handwritten_copy_path = os.path.join(paired_dataset_folder, f"{image_id}_handwritten.png")
    shutil.copy(handwritten_img_path, handwritten_copy_path)

    paired_count += 1

print(f"✅ Paired dataset saved in: {paired_dataset_folder}")
print(f"Pairs written: {paired_count}")

In [None]:
import os
import random
import matplotlib.pyplot as plt
from PIL import Image

# Folder that holds the generated printed/handwritten pairs
dataset_folder = "/kaggle/working/scrabblegan_dataset"

# Keep only files whose name ends with "_printed.png". A substring test would
# also accept names that merely contain that text (e.g. "x_printed.png.bak").
printed_images = [f for f in os.listdir(dataset_folder) if f.endswith("_printed.png")]

# Show one randomly chosen printed image as a quick visual sanity check
if printed_images:
    random_image = random.choice(printed_images)
    img_path = os.path.join(dataset_folder, random_image)

    # The context manager releases the file handle once the pixels are handed
    # to matplotlib. cmap only affects single-channel images; matplotlib
    # ignores it for RGB(A) data.
    with Image.open(img_path) as img:
        plt.imshow(img, cmap="gray")
    plt.axis("off")
    plt.title(f"Printed Text Image: {random_image}")
    plt.show()
else:
    print("❌ No printed images found in scrabblegan_dataset.")

In [None]:
import shutil

# Folder to archive and the intended location of the resulting zip file
dataset_folder = "/kaggle/working/scrabblegan_dataset"
zip_path = "/kaggle/working/scrabblegan_dataset.zip"

# make_archive appends the ".zip" extension itself, so it needs the path
# without it. Strip only a trailing ".zip": str.replace would remove every
# occurrence of ".zip" anywhere in the path.
archive_base = zip_path[:-len(".zip")] if zip_path.endswith(".zip") else zip_path

# Zip the folder; make_archive returns the path of the archive it created
created_archive = shutil.make_archive(archive_base, "zip", dataset_folder)

# Report the path that was actually written rather than the assumed one
print(f"✅ Zipped folder saved at: {created_archive}")