## Data Collection

In [24]:
!tar xf IAM_Words\words.tgz -C data\words
!move IAM_Words\words.txt data

        1 file(s) moved.


## Libraries

In [7]:
from tensorflow.keras.layers.experimental.preprocessing import StringLookup
from tensorflow import keras
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import os

In [3]:
np.random.seed(42)
tf.random.set_seed(42)

## Data PreProcessing / Augmentation

### data split

In [4]:
word_list = []

words = open(f"data/words.txt","r").readlines()
for line in words:
    if line[0] == '#':
        continue
    if line.split(" ")[1] != "err":
        word_list.append(line)
    
print(len(word_list))
np.random.shuffle(word_list)

96456


In [5]:
split = int(0.9 * len(word_list))
train_data = word_list[:split]
test_data = word_list[split:]
val_split = int(0.5 * len(word_list))
val_data = word_list[:val_split]
test_data = word_list[val_split:]

# assert len(word_list) == len(train_data) + len(val_data) + len(test_data)

print("Total training samples:",len(train_data))
print("Total validation samples:", len(val_data))
print("Total test samples:", len(test_data))

Total training samples: 86810
Total validation samples: 48228
Total test samples: 48228


### data input pipeline

In [8]:
base_image_path = os.path.join(r'data/words')

def get_image_paths_and_labels(samples):
    paths = []
    corrected_samples = []
    for i, file_line in enumerate(samples):
        line_split = file_line.strip()
        line_split = line_split.split(" ")
        image_name = line_split[0]
        part1 = image_name.split("-")[0]
        part2 = image_name.split("-")[1]
        img_path = os.path.join(base_image_path, part1, part1 + "-" + part2, image_name + ".png")
        
        if os.path.getsize(img_path):
            paths.append(img_path)
            corrected_samples.append(file_line.split("\n")[0])
            
    return paths, corrected_samples

In [10]:
train_img_paths, train_labels = get_image_paths_and_labels(train_data)
val_img_paths, val_labels = get_image_paths_and_labels(val_data)
test_img_paths, test_labels = get_image_paths_and_labels(test_data)

#### data cleaning

In [11]:
train_labels_cleaned = []
characters = set()
max_len = 0
for label in train_labels:
    label = label.split(" ")[-1].strip()
    for char in label:
        characters.add(char)
        
    max_len = max(max_len, len(label))
    train_labels_cleaned.append(label)
    
print("Maximum length: ", max_len)
print("Vocab size: ", len(characters))

train_labels_cleaned[:10]

Maximum length:  21
Vocab size:  78


['sure',
 'he',
 'during',
 'of',
 'booty',
 'gastronomy',
 'boy',
 'The',
 'and',
 'in']

In [12]:
def clean_labels(labels):
    cleaned_labels = []
    for label in labels:
        label = label.split(" ")[-1].strip()
        cleaned_labels.append(label)
    
    return cleaned_labels

val_labels_cleaned = clean_labels(val_labels)
test_labels_cleaned = clean_labels(test_labels)

#### building vocabulary

In [13]:
AUTOTUNE = tf.data.AUTOTUNE

char_to_num = StringLookup(vocabulary = list(characters), mask_token = None)

num_to_char = StringLookup(vocabulary = char_to_num.get_vocabulary(), mask_token = None, invert = True)

#### resizing images

In [None]:
def distortion_free_resize(image, img_size):
    w,h = img_size
    image = tf.image.resize(image, size=(h, w), preserve_aspect_ratio = True)
    
    pad_height = h - tf.shape(image)[0]
    pad_width = w - tf.shape(image)[1]
    
    if pad_height % 2 != 0:
        height = pad_height // 2
        pad_height_top = height + 1
        pad_height_bottom = height