In [1]:
import os
import json
import numpy as np
import math
from skimage.measure import block_reduce
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt

from pathlib import Path
from collections import Counter


In [2]:
from google.colab import drive
drive.mount('/gdrive')
%cd "/gdrive/Shared drives/deep learning"

Mounted at /gdrive
/gdrive/Shared drives/deep learning


In [3]:
data_filename = "data/training_data_labels.json"
with open(data_filename) as f:
  data_dict = json.load(f)
  data = np.array(data_dict["training_data"])
  data = data.reshape(np.append(data.shape, 1))
  data = data.reshape(np.append(data.shape, 1))
  text_labels = np.array(data_dict["training_labels"])

i = 1
print("Data array shape: ", data.shape, "\nFirst {} rows:".format(i))
print(data[:i], end='\n\n')

print("Text labels shape: ", text_labels.shape, "\nFirst {} rows:".format(i))
print(text_labels[:i], end='\n\n')

Data array shape:  (12237, 6613, 1, 1) 
First 1 rows:
[[[[0.04407315]]

  [[0.04305174]]

  [[0.05130609]]

  ...

  [[0.        ]]

  [[0.        ]]

  [[0.        ]]]]

Text labels shape:  (12237,) 
First 1 rows:
['the film won the naacp image award for outstanding']



In [4]:
max_length = max([len(text_label) for text_label in text_labels])
img_width = 6613
img_height = 1
batch_size = 16

In [5]:
alphabet = list('abcdefghijklmnopqrstuvwxyz0123456789,.?;:-_()=+@$!&/\'\" ')
alphabet_size = len(alphabet)

# Define a mapping between characters and indices
char_to_int = dict((c, i) for i, c in enumerate(alphabet))
int_to_char = dict((i, c) for i, c in enumerate(alphabet))

max_text_length = 0
for d in text_labels:
  l = len(d)
  if l > max_text_length:
    max_text_length = l

num_rows = len(text_labels)
labels = np.zeros((num_rows, max_text_length), dtype=np.int8)

for i, line in enumerate(text_labels):
  # Target text data -> integer encodings
  labels[i] = np.array([char_to_int[char] for char in line])

print(text_labels[:2])
print(labels[:2])
#char_to_num = layers.experimental.preprocessing.StringLookup(
#    vocabulary=alphabet, num_oov_indices=0, mask_token=None)
#
## Mapping integers back to original characters
#num_to_char = layers.experimental.preprocessing.StringLookup(
#    vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True)

# label = char_to_num(tf.strings.unicode_split(labels[0], input_encoding="UTF-8"))
# print(label)
# print(labels[0])

['the film won the naacp image award for outstanding'
 'the keystone bridge company, founded in 1865 by an']
[[19  7  4 54  5  8 11 12 54 22 14 13 54 19  7  4 54 13  0  0  2 15 54  8
  12  0  6  4 54  0 22  0 17  3 54  5 14 17 54 14 20 19 18 19  0 13  3  8
  13  6]
 [19  7  4 54 10  4 24 18 19 14 13  4 54  1 17  8  3  6  4 54  2 14 12 15
   0 13 24 36 54  5 14 20 13  3  4  3 54  8 13 54 27 34 32 31 54  1 24 54
   0 13]]


In [6]:
def split_data(images, labels, train_size=0.9, shuffle=True):
    # 1. Get the total size of the dataset
    size = len(images)

    # 2. Make an indices array and shuffle it, if required
    indices = np.arange(size)
    if shuffle:
        np.random.shuffle(indices)

    # 3. Get the size of training samples
    train_samples = int(size * train_size)

    # 4. Split data into training and validation sets
    x_train, y_train = images[indices[:train_samples]], labels[indices[:train_samples]]
    x_valid, y_valid = images[indices[train_samples:]], labels[indices[train_samples:]]
    return x_train, x_valid, y_train, y_valid

# Splitting data into training and validation sets
x_train, x_valid, y_train, y_valid = split_data(data, labels)

def encode_single_sample(img, label):
    img = tf.image.convert_image_dtype(img, tf.float32)

    # # 4. Resize to the desired size
    img = tf.image.resize(img, [img_height, img_width])

    # 5. Transpose the image because we want the time
    # dimension to correspond to the width of the image.
    img = tf.transpose(img, perm=[1, 0, 2])

    # 7. Return a dict as our model is expecting two inputs
    return {"image": img, "label": label}

In [7]:
print(text_labels.shape)

(12237,)


In [8]:
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = (
    train_dataset.map(
        encode_single_sample, num_parallel_calls=tf.data.experimental.AUTOTUNE
    )
    .batch(batch_size)
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)

validation_dataset = tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
validation_dataset = (
    validation_dataset.map(
        encode_single_sample, num_parallel_calls=tf.data.experimental.AUTOTUNE
    )
    .batch(batch_size)
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)

In [9]:
for batch in train_dataset.take(1):
  images = batch["image"]
  labels = batch["label"]
  print(labels)
  break
print(text_labels[1])

tf.Tensor(
[[19  7  4 18  4 54 19  4 17 12 18 54 13 14 54 11 14 13  6  4 17 54  0 15
  15 11 24 54 22  8 19  7 54  0 13 24 54  0  2  2 20 17  0  2 24 54  0 18
  54 15]
 [ 7  4 54  4  0 17 13  4  3 54  0 54  1 17 14 13 25  4 54 12  4  3  0 11
  54  8 13 54 19  7  4 54 31 26 26 26 41 12  4 19  4 17 54 17  4 11  0 24
  54 22]
 [ 8 19 18 54  7  0  1  8 19  0 19 54  8 18 54 18  0 13  3 54 15  0 19  2
   7  4 18 54  0 12 14 13  6 54 17  4  4  5 18 54  0 13  3 54 17 20  1  1
  11  4]
 [ 8 13 54 27 35 35 30 36 54  7  4 54 22  0 18 54  3  8  0  6 13 14 18  4
   3 54  0 18 54  7  8 21 41 15 14 18  8 19  8 21  4 36 54  1 20 19 54  7
   0 18]
 [19  7  4 54 12 20 13 18 19  4 17 54 20 13  3  4 17 41 28 27 54  5 14 14
  19  1  0 11 11 54  2  7  0 12 15  8 14 13 18  7  8 15 54  8 18 54 19  7
   4 54]
 [19  7  4 54 21  8 11 11  0  6  4 54 14  5 54 20 19  4 54 15  0 17 10 36
  54 14 15 15 14 18  8 19  4 54 19  7  4 54 12 14 20 19  7 54 14  5 54 20
  19  4]
 [11  0 19  4 17 36 54 10  8 13  3  4 17  6  0 

In [12]:
class CTCLayer(layers.Layer):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.loss_fn = keras.backend.ctc_batch_cost

    def call(self, y_true, y_pred):
        # Compute the training-time loss value and add it
        # to the layer using `self.add_loss()`.
        batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
        input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
        label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")

        input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
        label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")
        loss = self.loss_fn(y_true, y_pred, input_length, label_length)
        self.add_loss(loss)

        # At test time, just return the computed predictions
        return y_pred


def build_model():
    # Inputs to the model
    input_img = layers.Input(
        shape=(img_width, img_height, 1), name="image", dtype="float32"
    )
    labels = layers.Input(name="label", shape=(None,), dtype="float32")

    # First conv block
    x = layers.Conv2D(
        128,
        (25, 1),
        activation="relu",
        kernel_initializer="he_normal",
        padding="same",
        name="Conv1",
    )(input_img)


    x = layers.MaxPooling2D((2, 1), name="pool1")(x)
    
    x = layers.Conv2D(
        128,
        (25, 1),
        activation="relu",
        kernel_initializer="he_normal",
        padding="same",
        name="Conv2",
    )(x)
    x = layers.MaxPooling2D((2, 1), name="pool2")(x)
   
    # Second conv block
    x = layers.Conv2D(
        128,
        (25, 1),
        activation="relu",
        kernel_initializer="he_normal",
        padding="same",
        name="Conv3",
    )(x)
    x = layers.MaxPooling2D((2, 1), name="pool3")(x)

    # We have used two max pool with pool size and strides 2.
    # Hence, downsampled feature maps are 4x smaller. The number of
    # filters in the last layer is 64. Reshape accordingly before
    # passing the output to the RNN part of the model
    new_shape = ((img_width // 8), 1 * 128)
    x = layers.Reshape(target_shape=new_shape, name="reshape")(x)
    x = layers.Dense(64, activation="relu", name="dense1")(x)
    x = layers.Dropout(0.2)(x)

    # RNNs
    x = layers.Bidirectional(layers.LSTM(128, return_sequences=True, dropout=0.25))(x)
    x = layers.Bidirectional(layers.LSTM(64, return_sequences=True, dropout=0.25))(x)

    # Output layer
    x = layers.Dense(len(alphabet) + 1, activation="softmax", name="dense2")(x)

    # Add CTC layer for calculating CTC loss at each step
    output = CTCLayer(name="ctc_loss")(labels, x)

    # Define the model
    model = keras.models.Model(
        inputs=[input_img, labels], outputs=output, name="ocr_model_v1"
    )
    # Optimizer
    opt = keras.optimizers.Adam()
    # Compile the model and return
    model.compile(optimizer=opt)
    return model

In [13]:
model = build_model()
model.summary()

Model: "ocr_model_v1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
image (InputLayer)              [(None, 6613, 1, 1)] 0                                            
__________________________________________________________________________________________________
Conv1 (Conv2D)                  (None, 6613, 1, 128) 3328        image[0][0]                      
__________________________________________________________________________________________________
pool1 (MaxPooling2D)            (None, 3306, 1, 128) 0           Conv1[0][0]                      
__________________________________________________________________________________________________
Conv2 (Conv2D)                  (None, 3306, 1, 128) 409728      pool1[0][0]                      
_______________________________________________________________________________________

In [21]:
epochs = 5
early_stopping_patience = 5
# Add early stopping
early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=early_stopping_patience, restore_best_weights=True
)

# Train the model
history = model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=epochs,
    callbacks=[early_stopping])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
prediction_model = keras.models.Model(
    model.get_layer(name="image").input, model.get_layer(name="dense2").output
)
prediction_model.summary()

# A utility function to decode the output of the network
def decode_batch_predictions(pred):
  input_len = np.ones(pred.shape[0]) * pred.shape[1]
  # Use greedy search. For complex tasks, you can use beam search
  print(pred)
  results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0][
      :, :max_length
  ]
  # Iterate over the results and get back the text
  output_text = []
  print(results)
  for res in results:
    output_text.append(''.join([int_to_char[i] for i in res]))
  return output_text


#  Let's check results on some validation samples
for batch in validation_dataset.take(1):
  batch_images = batch["image"]
  batch_labels = batch["label"]

  print(batch_images.shape)
  preds = prediction_model.predict(batch_images)
  print(preds.shape)

  pred_texts = decode_batch_predictions(preds)

  orig_texts = []
  for s, sentence in enumerate(batch_labels):
    orig_texts.append([])
    
    for c, char in enumerate(sentence):
      orig_texts[-1].append(int_to_char[char])

  for i in range(len(pred_texts)):
    print("\nPrediction:\t{}".format(pred_texts[i]))
    print("Ground truth:\t{}".format(orig_texts[i]))

(16, 6613, 1, 1)
(16, 1653, 56)
[[[5.99068440e-02 2.21103951e-02 2.35201493e-02 ... 9.04412067e-04
   9.67717351e-05 1.05868444e-01]
  [6.55134767e-02 4.29294771e-03 6.03590440e-03 ... 3.53049632e-04
   8.14407424e-04 1.36972070e-01]
  [2.51089893e-02 2.30153976e-03 4.55265772e-03 ... 1.80286108e-04
   4.46430966e-02 3.05396020e-01]
  ...
  [1.85406255e-03 4.04833379e-04 7.10422057e-04 ... 2.02795200e-05
   5.95310004e-03 9.72938180e-01]
  [2.47144792e-03 5.25899639e-04 9.59842466e-04 ... 3.18355851e-05
   8.01945571e-03 9.65027511e-01]
  [6.73409645e-03 1.36613171e-03 2.62637297e-03 ... 1.61882548e-04
   2.14270912e-02 9.10743237e-01]]

 [[6.55186176e-02 2.30398104e-02 2.50969958e-02 ... 9.93411755e-04
   8.23496230e-05 9.41856876e-02]
  [7.18142316e-02 4.63645300e-03 6.68268697e-03 ... 3.68426903e-04
   6.94821880e-04 1.21637307e-01]
  [2.80850101e-02 2.52690190e-03 5.89228701e-03 ... 1.90200459e-04
   3.93353403e-02 3.04469347e-01]
  ...
  [1.95962191e-03 4.71568550e-04 7.86294346e-

TypeError: ignored