## Dataset Preparation

### Set Up

In [None]:
# Make Google Drive available to this Colab runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
# force_remount=True requests a fresh mount even if one is already present.
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


In [None]:
import os
import glob
import xml.etree.ElementTree as ET
import shutil
from sklearn.model_selection import train_test_split
from PIL import Image
import re

### Helpers

In [None]:
def copy_files(file_paths, destination_directory):
    """Copy every file in ``file_paths`` into ``destination_directory``.

    The destination directory is created first if it does not exist. Without
    that, ``shutil.copy`` would treat the destination as a *file* path, so each
    source would be written over the previous one (or the call would fail when
    the parent directory is missing as well).

    Args:
        file_paths: Iterable of paths of the files to copy.
        destination_directory: Directory that receives the copies; each file
            keeps its own base name.
    """
    os.makedirs(destination_directory, exist_ok=True)
    for file_path in file_paths:
        shutil.copy(file_path, destination_directory)

def _line_bounding_box(line):
    """Return ``(x_min, y_min, x_max, y_max)`` covering every ``word/cmp`` of ``line``.

    Returns ``None`` when the line contains no ``cmp`` element at all.
    """
    x_min, y_min, x_max, y_max = float('inf'), float('inf'), 0, 0
    for word in line.findall('word'):
        for cmp in word.findall('cmp'):
            x, y = int(cmp.get('x')), int(cmp.get('y'))
            width, height = int(cmp.get('width')), int(cmp.get('height'))
            x_min = min(x_min, x)
            y_min = min(y_min, y)
            x_max = max(x_max, x + width)
            y_max = max(y_max, y + height)

    # x_min is only still infinite if no cmp element was visited.
    if x_min == float('inf'):
        return None
    return x_min, y_min, x_max, y_max


def convert_xml_to_txt(xml_file, output_directory):
    """Write one text label file for the handwritten lines of an XML annotation.

    For every ``handwritten-part/line`` element with a non-empty ``text``
    attribute, the bounding box of all its ``word/cmp`` elements is computed and
    one row is written:

        x_min,y_min,x_max,y_min,x_max,y_max,x_min,y_max,<text>

    Lines without text or without any ``cmp`` element are skipped.

    Args:
        xml_file: Path of the XML annotation file to read.
        output_directory: Existing directory in which the ``.txt`` file is
            created. The file is named after ``xml_file`` with its extension
            replaced by ``.txt``.
    """
    tree = ET.parse(xml_file)
    root = tree.getroot()

    # Swap only the final extension; str.replace would also rewrite '.xml'
    # occurring in the middle of the name and would leave names that do not
    # end in '.xml' unchanged.
    stem, _ = os.path.splitext(os.path.basename(xml_file))
    txt_file_path = os.path.join(output_directory, stem + '.txt')

    with open(txt_file_path, 'w') as file:
        for line in root.findall('.//handwritten-part/line'):
            line_text = line.get('text')
            if not line_text:
                continue

            box = _line_bounding_box(line)
            if box is None:
                continue

            x_min, y_min, x_max, y_max = box
            file.write(f"{x_min},{y_min},{x_max},{y_min},{x_max},{y_max},{x_min},{y_max},{line_text}\n")

#### Convert label structure to OCR readable

In [None]:
# Source folder with the XML annotations and target folder for the TXT labels
label_directory = "/content/drive/My Drive/ocr_project/datasets/IAM_data/xml"
output_directory = "/content/drive/My Drive/ocr_project/datasets/IAM_data/txt_labels"

# The target folder may not exist yet on a fresh Drive
os.makedirs(output_directory, exist_ok=True)

# Convert every XML annotation into its TXT counterpart
xml_pattern = os.path.join(label_directory, "*.xml")
for xml_path in glob.glob(xml_pattern):
    convert_xml_to_txt(xml_path, output_directory)

#### Image Preprocessing

In [None]:
# Print the original pixel size (width, height) of one sample form.
# The context manager closes the underlying file once the size has been read.
with Image.open('/content/drive/My Drive/ocr_project/datasets/IAM_data/forms/h07-087.png') as image:
    print(image.size)


(2479, 3542)


In [None]:
# Resize one sample form to the target height and pad it to the target canvas.
source_path = '/content/drive/My Drive/ocr_project/datasets/IAM_data/forms/h07-087.png'
padded_path = '/content/drive/My Drive/ocr_project/test_reshape/resized_and_padded_image.png'

# Size of the padded output canvas, in pixels
target_width = 620
target_height = 886

with Image.open(source_path) as image:
    # Scale to the target height, maintaining the aspect ratio
    aspect_ratio = image.width / image.height
    new_width = int(target_height * aspect_ratio)
    new_height = target_height
    if new_width > target_width:
        # The image is proportionally wider than the canvas: scale to the
        # target width instead, otherwise paste() would crop both sides.
        new_width = target_width
        new_height = max(1, int(target_width / aspect_ratio))

    # Image.ANTIALIAS was deprecated and then removed in Pillow 10;
    # Image.LANCZOS is the filter it used to alias.
    resized_image = image.resize((new_width, new_height), Image.LANCZOS)

# Pad the resized image to the required input size
new_image = Image.new('RGB', (target_width, target_height), (255, 255, 255))  # Assuming white padding
offset = ((target_width - new_width) // 2, 0)  # centred horizontally, aligned to the top
new_image.paste(resized_image, offset)

# Save the padded image, then reopen it to confirm the size on disk
os.makedirs(os.path.dirname(padded_path), exist_ok=True)
new_image.save(padded_path)
with Image.open(padded_path) as image:
    print(image.size)


  image = image.resize((new_width, target_height), Image.ANTIALIAS)


(620, 886)


#### Train/Test Split

In [None]:

# Split the forms and their labels into train (70%), validation (20%) and test (10%).
# NOTE: files are only ever copied into the split directories. If those
# directories still hold files from an earlier run with a different split,
# clear them first so samples do not end up in more than one subset.

# Directory containing the image files
image_directory = "/content/drive/My Drive/ocr_project/datasets/IAM_data/forms"
# Directory to save the training, validation and test images
train_image_directory = "/content/drive/My Drive/ocr_project/datasets/IAM_data/train/train_image"
val_image_directory = "/content/drive/My Drive/ocr_project/datasets/IAM_data/validation/val_image"
test_image_directory = "/content/drive/My Drive/ocr_project/datasets/IAM_data/test/test_image"

# Directory containing the annotation files
label_directory = "/content/drive/My Drive/ocr_project/datasets/IAM_data/txt_labels"
# Directory to save the training, validation and test labels
train_label_directory = "/content/drive/My Drive/ocr_project/datasets/IAM_data/train/train_label"
val_label_directory = "/content/drive/My Drive/ocr_project/datasets/IAM_data/validation/val_label"
test_label_directory = "/content/drive/My Drive/ocr_project/datasets/IAM_data/test/test_label"

# Make the directories if they don't exist
for split_directory in (
    train_image_directory, val_image_directory, test_image_directory,
    train_label_directory, val_label_directory, test_label_directory,
):
    os.makedirs(split_directory, exist_ok=True)

# Get list of all image files. glob returns them in arbitrary order, so sort:
# random_state only gives a reproducible split when the input order is stable.
image_files = sorted(glob.glob(os.path.join(image_directory, "*.png")))
# Matching annotation file for every image (same base name, .txt extension)
label_files = [
    os.path.join(label_directory, os.path.splitext(os.path.basename(f))[0] + '.txt')
    for f in image_files
]

# Fail before copying anything rather than halfway through the label copy
missing_labels = [f for f in label_files if not os.path.isfile(f)]
if missing_labels:
    raise FileNotFoundError(
        f"{len(missing_labels)} image(s) have no label file, e.g. {missing_labels[0]}")

# Split into training and temp (validation + test)
train_images, temp_images, train_labels, temp_labels = train_test_split(
    image_files, label_files, test_size=0.3, random_state=42)

# Split the temp into validation and test
val_images, test_images, val_labels, test_labels = train_test_split(
    temp_images, temp_labels, test_size=1/3, random_state=42)  # 1/3 of 30% will give 10% test size overall

# Copy the files to their new directories
copy_files(train_images, train_image_directory)
copy_files(val_images, val_image_directory)
copy_files(test_images, test_image_directory)
copy_files(train_labels, train_label_directory)
copy_files(val_labels, val_label_directory)
copy_files(test_labels, test_label_directory)

print(f"Training images: {len(train_images)}")
print(f"Validation images: {len(val_images)}")
print(f"Test images: {len(test_images)}")


Training images: 735
Validation images: 210
Test images: 106


#### Test Handwriting / Machine-Typed Isolation

In [None]:

# Split each OCR result file into its machine-printed and handwritten parts,
# and extract the machine-printed ground truth from the matching XML file.

# Set up the directories
output_dir = "/content/drive/My Drive/ocr_project/datasets/IAM_data/test/test_output"
xml_dir = "/content/drive/My Drive/ocr_project/datasets/IAM_data/xml"
printed_output_dir = "/content/drive/My Drive/ocr_project/datasets/IAM_data/test/printed_output"
writing_output_dir = "/content/drive/My Drive/ocr_project/datasets/IAM_data/test/writing_output"
printed_label_dir = "/content/drive/My Drive/ocr_project/datasets/IAM_data/test/printed_label"

# Make sure the new directories exist
os.makedirs(printed_output_dir, exist_ok=True)
os.makedirs(writing_output_dir, exist_ok=True)
os.makedirs(printed_label_dir, exist_ok=True)

result_suffix = '_result.txt'

# List all the files in the output directory
for output_file in os.listdir(output_dir):
    if not output_file.endswith(result_suffix):
        continue

    # Strip only the trailing suffix (str.replace would also hit mid-name matches)
    base_name = output_file[:-len(result_suffix)]
    xml_file_path = os.path.join(xml_dir, base_name + '.xml')
    output_file_path = os.path.join(output_dir, output_file)
    printed_output_file_path = os.path.join(printed_output_dir, base_name + '_printed_output.txt')
    writing_output_file_path = os.path.join(writing_output_dir, base_name + '_writing_output.txt')
    printed_label_file_path = os.path.join(printed_label_dir, base_name + '_printed_label.txt')

    # Parse the XML file to determine how many printed lines to extract
    tree = ET.parse(xml_file_path)
    root = tree.getroot()
    machine_printed_part = root.find('machine-printed-part')
    if machine_printed_part is None:
        # Nothing is written for forms without a machine-printed part
        continue

    machine_print_lines = machine_printed_part.findall('machine-print-line')
    num_printed_lines = len(machine_print_lines)

    # Read the OCR output once; both sections are slices of the same lines
    with open(output_file_path, 'r') as file:
        ocr_lines = [line.strip() for line in file.readlines()]

    # Printed section: skip the first 2 lines, then take one OCR line per
    # machine-print-line in the XML.
    first_printed = 2
    first_writing = first_printed + num_printed_lines
    single_line_output_p = ' '.join(ocr_lines[first_printed:first_writing])
    # Handwritten section: everything after the printed lines except the very
    # last line of the file.
    # NOTE(review): presumably the last OCR line is not handwriting - confirm.
    single_line_output_w = ' '.join(ocr_lines[first_writing:-1])

    # Write each section as a single line to its own output file
    with open(printed_output_file_path, 'w') as file:
        file.write(single_line_output_p + '\n')
    with open(writing_output_file_path, 'w') as file:
        file.write(single_line_output_w + '\n')

    # Ground truth: the text of every machine-printed line, joined by spaces
    printed_text = ' '.join([line.attrib['text'] for line in machine_print_lines])

    # Write the processed text to the new label file (no trailing newline)
    with open(printed_label_file_path, 'w') as file:
        file.write(printed_text)

print("Preprocessing completed.")


Preprocessing completed.
