In [1]:
import tensorflow as tf
try: [tf.config.experimental.set_memory_growth(gpu, True) for gpu in tf.config.experimental.list_physical_devices("GPU")]
except: pass

from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard

from mltu.preprocessors import ImageReader
from mltu.transformers import ImageResizer, LabelIndexer, LabelPadding, ImageShowCV2
from mltu.augmentors import RandomBrightness, RandomRotate, RandomErodeDilate, RandomSharpen
from mltu.annotations.images import CVImage

from mltu.tensorflow.dataProvider import DataProvider
from mltu.tensorflow.losses import CTCloss
from mltu.tensorflow.callbacks import Model2onnx, TrainLogger
from mltu.tensorflow.metrics import CERMetric, WERMetric

from model import train_model
from configs import ModelConfigs

import os
from tqdm import tqdm

# Must download and extract datasets manually from https://fki.tic.heia-fr.ch/databases/download-the-iam-handwriting-database to Datasets\IAM_Sentences
sentences_txt_path = os.path.join("Datasets", "IAM_Sentences", "ascii", "sentences.txt")
sentences_folder_path = os.path.join("Datasets", "IAM_Sentences", "sentences")

dataset, vocab, max_len = [], set(), 0
words = open(sentences_txt_path, "r").readlines()
for line in tqdm(words):
    if line.startswith("#"):
        continue

    line_split = line.split(" ")
    if line_split[2] == "err":
        continue

    folder1 = line_split[0][:3]
    folder2 = "-".join(line_split[0].split("-")[:2])
    file_name = line_split[0] + ".png"
    label = line_split[-1].rstrip("\n")
    print(label)

    # replace "|" with " " in label
    label = label.replace("|", " ")

    rel_path = os.path.join(sentences_folder_path, folder1, folder2, file_name)
    if not os.path.exists(rel_path):
        print(f"File not found: {rel_path}")
        continue

    dataset.append([rel_path, label])
    vocab.update(list(label))
    max_len = max(max_len, len(label))

# Create a ModelConfigs object to store model configurations
configs = ModelConfigs()

# Save vocab and maximum text length to configs
configs.vocab = "".join(vocab)
configs.max_text_length = max_len
configs.save()

KeyboardInterrupt: 

In [2]:
configs.model_path

'Models/04_sentence_recognition\\202311231428'

In [3]:
data_provider = DataProvider(
    dataset=dataset,
    skip_validation=True,
    batch_size=configs.batch_size,
    data_preprocessors=[ImageReader(CVImage)],
    transformers=[
        ImageResizer(configs.width, configs.height, keep_aspect_ratio=True),
        LabelIndexer(configs.vocab),
        LabelPadding(max_word_length=configs.max_text_length, padding_value=len(configs.vocab)),
        ],
)

# Split the dataset into training and validation sets
train_data_provider, val_data_provider = data_provider.split(split = 0.9)

# Augment training data with random brightness, rotation and erode/dilate
train_data_provider.augmentors = [
    RandomBrightness(), 
    RandomErodeDilate(),
    RandomSharpen(),
    ]

# Creating TensorFlow model architecture
model = train_model(
    input_dim = (configs.height, configs.width, 3),
    output_dim = len(configs.vocab),
)

# Compile the model and print summary
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=configs.learning_rate), 
    loss=CTCloss(), 
    metrics=[
        CERMetric(vocabulary=configs.vocab),
        WERMetric(vocabulary=configs.vocab)
        ],
    run_eagerly=False
)
model.summary(line_length=110)

# Define callbacks
earlystopper = EarlyStopping(monitor="val_CER", patience=20, verbose=1, mode="min")
checkpoint = ModelCheckpoint(f"{configs.model_path}/model.h5", monitor="val_CER", verbose=1, save_best_only=True, mode="min")
trainLogger = TrainLogger(configs.model_path)
tb_callback = TensorBoard(f"{configs.model_path}/logs", update_freq=1)
reduceLROnPlat = ReduceLROnPlateau(monitor="val_CER", factor=0.9, min_delta=1e-10, patience=5, verbose=1, mode="auto")
model2onnx = Model2onnx(f"{configs.model_path}/model.h5")

# Train the model
model.fit(
    train_data_provider,
    validation_data=val_data_provider,
    epochs=1,
    callbacks=[earlystopper, checkpoint, trainLogger, reduceLROnPlat, tb_callback, model2onnx],
    workers=configs.train_workers
)

# Save training and validation datasets as csv files
train_data_provider.to_csv(os.path.join(configs.model_path, "train.csv"))
val_data_provider.to_csv(os.path.join(configs.model_path, "val.csv"))



Model: "model"
______________________________________________________________________________________________________________
 Layer (type)                    Output Shape                     Param #    Connected to                     
 input (InputLayer)              [(None, 96, 1408, 3)]            0          []                               
                                                                                                              
 lambda (Lambda)                 (None, 96, 1408, 3)              0          ['input[0][0]']                  
                                                                                                              
 conv2d (Conv2D)                 (None, 96, 1408, 32)             896        ['lambda[0][0]']                 
                                                                                                              
 batch_normalization (BatchNorm  (None, 96, 1408, 32)             128        ['conv2d[0][0]']  

In [16]:
# model.save("model.h5")

  saving_api.save_model(


In [27]:
# model

<keras.src.engine.functional.Functional at 0x1fbabf77f90>

  function = cls._parse_function_from_config(


In [4]:
model_new.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input (InputLayer)          [(None, 96, 1408, 3)]        0         []                            
                                                                                                  
 lambda_1 (Lambda)           (None, 96, 1408, 3)          0         ['input[0][0]']               
                                                                                                  
 conv2d_24 (Conv2D)          (None, 96, 1408, 32)         896       ['lambda_1[0][0]']            
                                                                                                  
 batch_normalization_18 (Ba  (None, 96, 1408, 32)         128       ['conv2d_24[0][0]']           
 tchNormalization)                                                                          

In [6]:
vocab = {' ',
 '!',
 '"',
 '#',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z'}

In [2]:
from tensorflow.keras.models import load_model
from mltu.tensorflow.losses import CTCloss

# from keras.utils.custom_object_scope import get_custom_objects
# get_custom_objects().update({"CTCLoss": CTCloss})
# load model
model_new = load_model('model.h5',custom_objects={'CTCloss':CTCloss},compile = False)





  function = cls._parse_function_from_config(





In [10]:


from mltu.inferenceModel import OnnxInferenceModel
from mltu.utils.text_utils import  get_cer, get_wer
from mltu.transformers import ImageResizer
import numpy as np
import typing
from itertools import groupby

def ctc_decoder(predictions: np.ndarray, chars: typing.Union[str, list]) -> typing.List[str]:
    """ CTC greedy decoder for predictions
    
    Args:
        predictions (np.ndarray): predictions from model
        chars (typing.Union[str, list]): list of characters

    Returns:
        typing.List[str]: list of words
    """
    text = ""
    for ele in predictions:
        print("Now ele")
        print(ele)
        for ele2 in ele:
            # print(ele)
            idx = np.argmax(ele2)
            print(ele2)
            count = 0

            if(idx == 79):
                text = text + "$"
                continue
            text = text + chars[idx]
            
        
    return text
    # use argmax to find the index of the highest probability
    argmax_preds = np.argmax(predictions, axis=-1)
    
    # use groupby to find continuous same indexes
    print(predictions)
    grouped_preds = [[k for k,_ in groupby(preds)] for preds in argmax_preds]

    # convert indexes to chars
    texts = ["".join([chars[k] for k in group if k < len(chars)]) for group in grouped_preds]

    return texts

In [12]:
import cv2
import numpy as np
from mltu.utils.text_utils import ctc_decoder, get_cer, get_wer

if __name__ == "__main__":
    import pandas as pd
    from tqdm import tqdm
    from mltu.configs import BaseModelConfigs

    configs = BaseModelConfigs.load("configs.yaml")

    # model = ImageToWordModel(model_path=configs.model_path, char_list=configs.vocab)

    df = pd.read_csv("val.csv").values.tolist()
    # print(train_data_provider)
    
    accum_cer, accum_wer = [], []
    for image_path, label in tqdm(df):
        image = cv2.imread(image_path)
        height, width = image.shape[:2]
        width_target = 1408
        height_target = 96
        # ratio = min(width_target / width, height_target / height)
        # new_w, new_h = int(width * ratio), int(height * ratio)

        resized_image = cv2.resize(image, (width_target, height_target))
        image_pred = np.expand_dims(resized_image, axis=0).astype(np.float32)
        # delta_w = width_target - new_w
        # delta_h = height_target - new_h
        # top, bottom = delta_h//2, delta_h-(delta_h//2)
        # left, right = delta_w//2, delta_w-(delta_w//2)


        prediction_text = model_new.predict(image_pred)
       
      
        # print(prediction_text)
        prediction_text = ctc_decoder(prediction_text, vocab)
        print(prediction_text)
        print("Text at end")
        print(prediction_text)
        cer = get_cer(prediction_text, label)
        wer = get_wer(prediction_text, label)
        print("Image: ", image_path)
        print("Label:", label)
        print("Prediction: ", prediction_text)
        print(f"CER: {cer}; WER: {wer}")

        accum_cer.append(cer)
        accum_wer.append(wer)

        cv2.imshow(prediction_text, image)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    print(f"Average CER: {np.average(accum_cer)}, Average WER: {np.average(accum_wer)}")

  0%|          | 0/1409 [00:00<?, ?it/s]



  0%|          | 0/1409 [00:00<?, ?it/s]

['']
Text at end
['']
Error: preds and target must be either both strings or both lists of strings.
Image:  Datasets\IAM_Sentences\sentences\h07\h07-054a\h07-054a-s01-03.png
Label: much as the total area so far completed .
Prediction:  ['']
CER: 1.0; WER: inf





error: OpenCV(4.8.1) :-1: error: (-5:Bad argument) in function 'imshow'
> Overload resolution failed:
>  - Can't convert object to 'str' for 'winname'
>  - Can't convert object to 'str' for 'winname'
>  - Can't convert object to 'str' for 'winname'
