In [6]:
import tensorflow as tf
from tensorflow import keras as K
from tensorflow.keras import layers
import numpy as np

In [7]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  0


In [21]:
# Model Structure: 5 CNN
# 2 5x5 kernel, 3 3x3 kernels
# Non-linear RELU
# MaxPool

def backend_reshape(x):
    return tf.keras.backend.reshape(x, (32,-1,80))


def ctc_loss_fn(y_true, y_pred):
    
    print(y_pred)
    # y_pred.shape = (batch_size, string_length, alphabet_size_1_hot_encoded)
    # output of every model is softmax
    # so sum across alphabet_size_1_hot_encoded give 1
    #               string_length give string length
    input_length = tf.math.reduce_sum(y_pred, axis=-1, keepdims=False)
    input_length = tf.math.reduce_sum(input_length, axis=-1, keepdims=True)


    # y_true strings are padded with 0
    # so sum of non-zero gives number of characters in this string
    label_length = tf.math.count_nonzero(y_true, axis=-1, keepdims=True, dtype="int64")
    
    
    loss = tf.keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)

    # average loss across all entries in the batch
    loss = tf.reduce_mean(loss)

    return loss


model = K.Sequential(
    [
        
        layers.Conv2D(filters=32, kernel_size = 5, strides=1, padding="same", input_shape = (128, 32,1), activation="relu"),
        layers.MaxPooling2D(pool_size=2, strides = (2,2)),
        
        layers.Conv2D(filters=64, kernel_size = 5, strides=1, padding="same", activation="relu"),
        layers.MaxPooling2D(pool_size=2, strides = (2,2)),
        
        layers.Conv2D(filters=128, kernel_size = 3, strides=1, padding="same", activation="relu"),
        layers.MaxPooling2D(pool_size=(1,2), strides = (1,2)),
        
        layers.Conv2D(filters=128, kernel_size = 3, strides=1, padding="same", activation="relu"),
        layers.MaxPooling2D(pool_size=(1,2), strides = (1,2)),
        
        layers.Conv2D(filters=256, kernel_size = 3, strides=1, padding="same", activation="relu"),
        layers.MaxPooling2D(pool_size=(1,2), strides = (1,2)),
        
        layers.Reshape((32,256)),
        
        layers.Bidirectional(layers.LSTM(256, return_sequences=True)),
        
        layers.Reshape((32,1, 512)),
        
        layers.Conv2D(filters=80, kernel_size = 3, padding="same", dilation_rate=1, activation="softmax"),
        layers.Reshape((32,80)),
               
    ]
)

model.summary()
model.compile(optimizer="rmsprop", loss=ctc_loss_fn, metrics=['accuracy'])


Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_18 (Conv2D)           (None, 128, 32, 32)       832       
_________________________________________________________________
max_pooling2d_15 (MaxPooling (None, 64, 16, 32)        0         
_________________________________________________________________
conv2d_19 (Conv2D)           (None, 64, 16, 64)        51264     
_________________________________________________________________
max_pooling2d_16 (MaxPooling (None, 32, 8, 64)         0         
_________________________________________________________________
conv2d_20 (Conv2D)           (None, 32, 8, 128)        73856     
_________________________________________________________________
max_pooling2d_17 (MaxPooling (None, 32, 4, 128)        0         
_________________________________________________________________
conv2d_21 (Conv2D)           (None, 32, 4, 128)       

In [9]:
# some_file.py
import sys
# insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '/SimpleHTR')

from SimpleHTR.src.DataLoader import DataLoader
loader =  DataLoader(filePath='./SimpleHTR/data/', batchSize=50, imgSize=(128,32), maxTextLen=32)

In [10]:
	def toSparse(texts, charList):
		"put ground truth texts into sparse tensor for ctc_loss"
		indices = []
		values = []
		shape = [len(texts), 0] # last entry must be max(labelList[i])

		# go over all texts
		for (batchElement, text) in enumerate(texts):
			# convert to string of label (i.e. class-ids)
			labelStr = [charList.index(c) for c in text]
			# sparse tensor must have size of max. label-string
			if len(labelStr) > shape[1]:
				shape[1] = len(labelStr)
			# put each label into sparse tensor
			for (i, label) in enumerate(labelStr):
				indices.append([batchElement, i])
				values.append(label)

		return (indices, values, shape)


In [11]:
from SimpleHTR.src.SamplePreprocessor import preprocess
import cv2

gtTexts = [loader.samples[i].gtText for i in range(len(loader.samples))]
indices, values, dense_shape = toSparse(gtTexts[:int(len(gtTexts)*.95)], loader.charList)
dense_shape[1] = 32
y_train = tf.sparse.SparseTensor(indices, values, dense_shape)
y_train = tf.sparse.to_dense(y_train, default_value=0)

imgs_train = [preprocess(cv2.imread(loader.samples[i].filePath, cv2.IMREAD_GRAYSCALE), (128,32), False) for i in range(int(len(gtTexts)*.95))]

In [12]:
y_train.shape

TensorShape([23750, 32])

In [13]:
imgs_train = np.asarray(imgs_train)
imgs_train = imgs_train.reshape((imgs_train.shape[0], imgs_train.shape[1], imgs_train.shape[2], 1))
imgs_train = tf.convert_to_tensor(imgs_train)

imgs_train.shape

TensorShape([23750, 128, 32, 1])

In [22]:
model.fit(
    x=imgs_train,
    y=y_train,
    epochs=500,
    batch_size=50,
    verbose=1
)

Epoch 1/20
Tensor("sequential_3/reshape_11/Reshape:0", shape=(50, 32, 80), dtype=float32)
Tensor("sequential_3/reshape_11/Reshape:0", shape=(50, 32, 80), dtype=float32)

KeyboardInterrupt: 

In [15]:
model.save('./model2.h5')

In [28]:
model1 =tf.keras.models.load_model('./model1.h5', compile=False)
model1.compile(optimizer="rmsprop", loss=ctc_loss_fn, metrics=['accuracy'])

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_18 (Conv2D)           (None, 128, 32, 32)       832       
_________________________________________________________________
max_pooling2d_15 (MaxPooling (None, 64, 16, 32)        0         
_________________________________________________________________
conv2d_19 (Conv2D)           (None, 64, 16, 64)        51264     
_________________________________________________________________
max_pooling2d_16 (MaxPooling (None, 32, 8, 64)         0         
_________________________________________________________________
conv2d_20 (Conv2D)           (None, 32, 8, 128)        73856     
_________________________________________________________________
max_pooling2d_17 (MaxPooling (None, 32, 4, 128)        0         
_________________________________________________________________
conv2d_21 (Conv2D)           (None, 32, 4, 128)       

In [29]:
model1.fit(
    x=imgs_train,
    y=y_train,
    epochs=500,
    batch_size=50,
    verbose=1
)

Epoch 1/500
Tensor("sequential/reshape_2/Reshape:0", shape=(50, 32, 80), dtype=float32)
Tensor("sequential/reshape_2/Reshape:0", shape=(50, 32, 80), dtype=float32)
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500

KeyboardInterrupt: 