## Process IAM Dataset
Download the word images of the IAM dataset from [Kaggle Dataset](https://www.kaggle.com/datasets/nibinv23/iam-handwriting-word-database), clean them, and save training and testing sets for consistent evaluation.

## Step 1. Data Processing

### 1.1 Loading the Data

In [None]:
import os

# Change to your desired directory
# os.chdir('/common/users/$USER/df_words') # change $USER to netid

# Confirm it's changed
# print("Current directory:", os.getcwd())

# Root folder that holds the IAM word data (words.txt and the image tree).
# A raw string keeps the backslash literal without relying on the invalid
# escape sequence '\i'; the resulting value is unchanged.
images_dir = r'.\iam_words'

In [None]:
import pandas as pd
from PIL import Image
# Build both locations with os.path.join instead of string concatenation:
# the previous literals relied on invalid escape sequences ('\w', '\i').
# On Windows the resulting strings are identical to the old ones.
label_file_path = os.path.join(images_dir, 'words.txt')
image_file_path = os.path.join(images_dir, 'iam_words', 'words')

# One entry per usable word:
# [image path, segmentation status, graylevel, text, RGB image]
data = []
with open(label_file_path, 'r') as f:
    lines = f.readlines()

# The first 18 lines are skipped -- presumably the comment header of
# words.txt (TODO confirm against the label file).
for idx, line in enumerate(lines[18:]):
    if idx % 1000 == 0:
        print(f"Processing line {idx}")
    tokens = line.strip().split()
    # tokens[0], tokens[1] and tokens[2] are all read below, so a usable
    # line needs at least three fields.
    if len(tokens) < 3:
        continue

    # A word id such as "p02-109-01-00" lives in p02/p02-109/p02-109-01-00.png
    subfolder = tokens[0].split('-')[0]
    subfolder2 = subfolder + "-" + tokens[0].split('-')[1]
    image_path = os.path.join(image_file_path, subfolder, subfolder2, tokens[0] + ".png")

    # Reset for every line so a rejected image can never inherit the picture
    # loaded for the previous word.
    img_copy = None
    try:
        with Image.open(image_path) as img:
            if img.size[0] >= 10 and img.size[1] >= 10:
                # convert() returns a new, fully loaded image, so it stays
                # valid after the file is closed.
                img_copy = img.convert("RGB")
    except FileNotFoundError as e:
        print(f"Image file not found: {image_path}. Error: {e}")
        continue
    except Image.UnidentifiedImageError as e:
        print(f"Unidentified image error for file {image_path}: {e}")
        continue
    except Exception as e:
        # Best effort: any other failure skips this word but keeps loading.
        print(f"Error opening image file {image_path}: {e}")
        continue

    if img_copy is None:
        # Image is narrower or shorter than 10 pixels: drop the word.
        continue

    data.append([image_path, tokens[1], tokens[2], tokens[-1], img_copy])


if data:
    print(f"Length of a row in data: {len(data[0])}")  # Should print 5 (one per column below)
    print(data[0])
else:
    print("No usable word images were loaded")
loaded = pd.DataFrame(data, columns=['image_id', 'segmentation_status', 'graylevel','text', 'image'])

In [None]:
# Work on a copy so the freshly loaded frame is not modified by the
# cleaning steps that follow.
loaded_dfwords = loaded.copy()
# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() only added a stray "None" line.
loaded_dfwords.info()
print(loaded_dfwords.head(30))

### 1.2 Show image

In [None]:
import matplotlib.pyplot as plt

def show_image(df, row):
    """Display the image file referenced by one row of *df*.

    Args:
        df: DataFrame with an 'image_id' column holding image file paths.
        row: Positional (``iloc``) index of the row to display.
    """
    # The picture is re-read from disk via 'image_id' rather than taken from
    # the in-memory 'image' column. The context manager closes the file
    # handle once the figure has been shown.
    with Image.open(df.iloc[row]['image_id']) as img:
        plt.imshow(img, cmap='gray')
        plt.show()

In [None]:
show_image(loaded_dfwords, 10)  # Show the image at positional row 10 (the 11th row)

## Step 2. Cleaning the Data

### 2.1 Special Characters

In [None]:
import re

# Pattern for one "special" character: anything that is not an ASCII letter,
# a digit, or whitespace.
special_char_pattern = r'[^a-zA-Z0-9\s]'

# Keep the rows whose text contains at least one such character
# (na=False treats missing text as "no match").
special_char_rows = loaded_dfwords.loc[
    loaded_dfwords['text'].str.contains(special_char_pattern, regex=True, na=False)
]

In [None]:
# Preview the first 10 rows whose text contains a special character.
special_char_rows.head(10)

In [None]:
# Whole-string pattern: the text may consist only of word characters (\w),
# whitespace, and the punctuation/symbols listed in the character class.
allowed_pattern = r'^[\w\s\.,!?;:\-+*/=()\[\]{}<>@#\$%^&_\'"\t\n]+$'
# mask is True for rows whose text contains at least one character outside
# the allowed set.
mask = ~loaded_dfwords['text'].str.contains(allowed_pattern, regex=True)
non_standard_rows = loaded_dfwords[mask]

In [None]:
# Preview the first rows whose text falls outside allowed_pattern.
non_standard_rows.head()

In [None]:
# Select rows whose text contains a literal backslash (the regex r'\\'
# matches a single backslash). NOTE: this overwrites the global `mask`.
mask = loaded_dfwords['text'].str.contains(r'\\', regex=True)
check_rows= loaded_dfwords[mask]

In [None]:
# Replace the two-character sequence backslash + slash with a plain slash.
# regex=False makes this a literal substring replacement.
loaded_dfwords['text'] = loaded_dfwords['text'].str.replace('\\/', '/', regex=False)

In [None]:
# Recompute the "outside allowed_pattern" mask now that '\/' has been
# rewritten to '/'; this is the mask later used to drop rows.
mask = ~loaded_dfwords['text'].str.contains(allowed_pattern, regex=True)
non_standard_rows2 = loaded_dfwords[mask]

In [None]:
# Report how many rows still fall outside allowed_pattern, as a count and as
# a fraction (0-1, not a percent) of all rows.
print("Words with special character:", len(non_standard_rows2), ", Percentage: ", len(non_standard_rows2)/len(loaded_dfwords))
# Same statistic over distinct image_id values (image_id is the image file path).
print("Images with special character:", len(non_standard_rows2['image_id'].unique()), ", Percentage: ", len(non_standard_rows2['image_id'].unique())/len(loaded_dfwords['image_id'].unique()))

In [None]:
# Keep only the rows whose text fully matches allowed_pattern. This relies on
# `mask` still being the one computed together with non_standard_rows2.
loaded_dfwords=loaded_dfwords[~mask]

In [None]:
# Row count after dropping the non-standard rows.
print("total number of words", len(loaded_dfwords))


In [None]:
# Confirm that no special characters remain: count the rows whose text fully
# matches allowed_pattern. After the filtering above this should equal
# len(loaded_dfwords).
count_matching = loaded_dfwords['text'].str.contains(allowed_pattern, regex=True, na=False).sum()
print(f"Number of rows with allowed characters: {count_matching}")

In [None]:
# Select rows whose text is made up entirely of non-alphanumeric characters
# (punctuation/symbol-only labels). This only selects; nothing is removed.
pattern = r'^[^a-zA-Z0-9]+$'  # Matches strings with no alphanumeric chars at all
non_alnum_rows = loaded_dfwords[loaded_dfwords['text'].str.contains(pattern, regex=True, na=False)]

In [None]:
# Print the current row count of loaded_dfwords.
print("total number of words", len(loaded_dfwords))


In [None]:
# Preview the first 20 symbol-only rows.
non_alnum_rows.head(20)

### 2.2 All-symbol Text

In [None]:
# Check which rows have text consisting only of non-alphanumeric characters
# (symbols/punctuation, no letters or digits).
pattern = r'^[^a-zA-Z0-9]+$'  # Matches strings with no alphanumeric chars at all
non_alnum_rows2 = loaded_dfwords[loaded_dfwords['text'].str.contains(pattern, regex=True, na=False)]

In [None]:
# Preview the first symbol-only rows.
non_alnum_rows2.head()

In [None]:
# Frequency of each distinct symbol-only label.
non_alnum_rows2['text'].value_counts()

In [None]:
# Drop every row labelled exactly '.' so the result matches the preprocessing
# of the other (imgur) dataset, which also removes all '.' labels because
# they are incorrectly labelled. The removed rows are kept for inspection.
only_period_rows = loaded_dfwords.loc[loaded_dfwords['text'].eq('.')]
loaded_dfwords = loaded_dfwords.loc[loaded_dfwords['text'].ne('.')]

In [None]:

# Inspect the word whose label is one long run of hyphens before it is dropped.
hyphen_row = loaded_dfwords[loaded_dfwords['text'] == '-----------------------------------------------------']
if hyphen_row.empty:
    print("No row with the all-hyphen label was found")
else:
    # Take the path from the matched row itself instead of a hard-coded
    # literal, and avoid shadowing the builtin `id`.
    hyphen_image_id = hyphen_row['image_id'].iloc[0]
    print(hyphen_image_id)
    # File name = last path component, whichever separator the path uses.
    print("image:", hyphen_image_id.replace('\\', '/').rsplit('/', 1)[-1])  # Print the image file name
    r = loaded_dfwords[loaded_dfwords['image_id'] == hyphen_image_id]  # Get the row with the hyphen image
    print(r)
    show_image(r, 0)  # Show the hyphen image

In [None]:
# Drop the row(s) whose label is the long run of hyphens.
loaded_dfwords = loaded_dfwords[loaded_dfwords['text'] != '-----------------------------------------------------']


In [None]:
# Renumber the rows 0..n-1 after the removals; drop=True discards the old index.
loaded_dfwords = loaded_dfwords.reset_index(drop=True)

In [None]:
# Summary of the cleaned frame (columns, non-null counts, dtypes).
loaded_dfwords.info()

## Step 3. Splitting the Data into Training and Testing Subsets

### 3.1 Splitting into Training and Testing Sets

In [None]:
import numpy as np

# Get unique groups (distinct image_id values, i.e. image file paths)
unique_images = loaded_dfwords['image_id'].unique()


# Randomly select 20% of the unique image ids for the test set
# (size is int(len(unique_images)*0.2), sampled without replacement).
# The fixed seed keeps the selection reproducible across runs.
np.random.seed(42)
test_images = np.random.choice(unique_images, 
                              size=int(len(unique_images)*0.2), 
                              replace=False)

In [None]:
# Rows whose image_id was sampled go to the test set ...
test_df = loaded_dfwords.loc[loaded_dfwords['image_id'].isin(test_images)]
# ... and every remaining row goes to the training set.
training_df = loaded_dfwords.loc[~loaded_dfwords['image_id'].isin(test_images)]

In [None]:
# Training-set size, as a row count and as a fraction (0-1) of all rows.
print("Words in Train Dataset:", len(training_df), ", Percentage: ", len(training_df)/len(loaded_dfwords))

In [None]:
# Test-set size, as a row count and as a fraction (0-1) of all rows.
print("Words in Test Dataset:", len(test_df), ", Percentage: ", len(test_df)/len(loaded_dfwords))

In [None]:
# Total rows in the cleaned frame (training and test rows together).
print("total number of words", len(loaded_dfwords))

### 3.2 Saving the Dataset to CSV

In [None]:
# test_df_copy = test_df
# train_df_copy = training_df

In [None]:
# test_df_copy = test_df_copy.drop('image', axis=1)

In [None]:
# train_df_copy = train_df_copy.drop('image', axis=1)

In [None]:
# test_df_copy.info()

In [None]:
# train_df_copy.info()

In [None]:
# test_df_copy = test_df_copy.reset_index()
# test_df_copy['word_id'] = test_df_copy.index
# test_df_copy = test_df_copy.drop('index', axis=1)
# test_df_copy = test_df_copy.drop('level_0', axis=1)

# print(test_df_copy)

In [None]:
# train_df_copy = train_df_copy.reset_index()
# train_df_copy['word_id'] = train_df_copy.index
# train_df_copy = train_df_copy.drop('index', axis=1)
# train_df_copy = train_df_copy.drop('level_0', axis=1)

# print(train_df_copy)

In [None]:
# train_df_copy = train_df_copy[[train_df_copy.columns[2]] + train_df_copy.columns[:2].tolist()]

In [None]:
# test_df_copy = test_df_copy[[test_df_copy.columns[2]] + test_df_copy.columns[:2].tolist()]

In [None]:
# train_df_copy.info()

In [None]:
# test_df_copy.info()

In [None]:
# train_df_copy.head(10)

In [None]:
# test_df_copy.head(10)

In [None]:
# test_df_copy.to_csv('df_test.csv', index=False)
# train_df_copy.to_csv('df_train.csv', index=False)