In [1]:
import os
from datetime import datetime

class ModelConfigs():
    def __init__(self):
        super().__init__()
        self.model_path = os.path.join("Models/04_sentence_recognition", datetime.strftime(datetime.now(), "%Y%m%d%H%M"))
        self.vocab = ""
        self.height = 96
        self.width = 1408
        self.max_text_length = 0
        self.batch_size = 32
        self.learning_rate = 0.0005
        self.train_epochs = 1000
        self.train_workers = 20

In [19]:
from tensorflow.keras import layers, Model
import tensorflow as tf

class ReshapeLayer(layers.Layer):
    def call(self, x):
        # Get the shape of the input tensor
        batch_size = tf.shape(x)[0]
        # Reshape to (batch_size, time_steps, features)
        return tf.reshape(x, [batch_size, -1, x.shape[-1]])

def train_model(input_dim, output_dim):
    """
    Creates a CNN + RNN model for text recognition
    
    Args:
        input_dim: Tuple of (height, width, channels)
        output_dim: Number of unique characters in vocabulary
    
    Returns:
        tf.keras.Model: Compiled model ready for training
    """
    # Input layer
    inputs = layers.Input(shape=input_dim, name='input')
    
    # CNN Feature Extraction
    x = layers.Conv2D(32, (3, 3), activation='relu', padding='same', name='conv1')(inputs)
    x = layers.MaxPooling2D((2, 2), name='pool1')(x)
    
    x = layers.Conv2D(64, (3, 3), activation='relu', padding='same', name='conv2')(x)
    x = layers.MaxPooling2D((2, 2), name='pool2')(x)
    
    x = layers.Conv2D(128, (3, 3), activation='relu', padding='same', name='conv3')(x)
    x = layers.MaxPooling2D((2, 2), name='pool3')(x)
    
    x = layers.Conv2D(256, (3, 3), activation='relu', padding='same', name='conv4')(x)
    x = layers.MaxPooling2D((2, 2), name='pool4')(x)
    
    # Dense layer for feature processing
    x = layers.Conv2D(256, (3, 3), activation='relu', padding='same', name='conv5')(x)
    
    # Reshape for RNN using custom layer
    x = ReshapeLayer()(x)
    
    # RNN layers
    x = layers.Bidirectional(layers.LSTM(256, return_sequences=True))(x)
    x = layers.Dropout(0.2)(x)
    
    x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(x)
    x = layers.Dropout(0.2)(x)
    
    # Output layer
    outputs = layers.Dense(output_dim + 1, activation='softmax', name='dense2')(x)
    
    model = Model(inputs=inputs, outputs=outputs, name='ocr_model')
    
    return model

class ModelConfigs:
    """Configuration class to store model parameters"""
    def __init__(self):
        self.vocab = ""
        self.max_text_length = 0
        self.height = 300
        self.width = 300
        self.batch_size = 32
        self.learning_rate = 0.001
        self.train_epochs = 5
        self.train_workers = 4
        self.model_path = "Models/cheque_recognition"
        
    def save(self):
        """Save configurations to JSON file"""
        import json
        import os
        
        os.makedirs(self.model_path, exist_ok=True)
        
        config_path = os.path.join(self.model_path, "configs.json")
        config_dict = {
            "vocab": self.vocab,
            "max_text_length": self.max_text_length,
            "height": self.height,
            "width": self.width,
            "batch_size": self.batch_size,
            "learning_rate": self.learning_rate,
            "train_epochs": self.train_epochs,
            "train_workers": self.train_workers,
            "model_path": self.model_path
        }
        
        with open(config_path, 'w') as f:
            json.dump(config_dict, f, indent=4)

In [20]:
import tensorflow as tf
try: [tf.config.experimental.set_memory_growth(gpu, True) for gpu in tf.config.experimental.list_physical_devices("GPU")]
except: pass

from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard
from mltu.preprocessors import ImageReader
from mltu.transformers import ImageResizer, LabelIndexer, LabelPadding
from mltu.augmentors import RandomBrightness, RandomRotate, RandomErodeDilate, RandomSharpen
from mltu.annotations.images import CVImage
from mltu.tensorflow.dataProvider import DataProvider
from mltu.tensorflow.losses import CTCloss
from mltu.tensorflow.callbacks import Model2onnx, TrainLogger
from mltu.tensorflow.metrics import CERMetric, WERMetric


import os
import cv2
import json
from tqdm import tqdm

def load_dataset(image_folder, label_folder, target_size=(300, 300)):
    """
    Load dataset and create a list of [processed_image_path, label] pairs
    Returns:
        dataset: List of [image_path, label] pairs
        vocab: Set of unique characters in labels
        max_len: Maximum length of any label
    """
    dataset, vocab, max_len = [], set(), 0
    valid_formats = ('.jpg', '.jpeg', '.png', '.tif')
    
    # Create a directory for processed images if it doesn't exist
    processed_dir = os.path.join(os.path.dirname(image_folder), 'processed_images')
    os.makedirs(processed_dir, exist_ok=True)
    
    image_files = sorted([f for f in os.listdir(image_folder) 
                         if f.lower().endswith(valid_formats)],
                        reverse=True)
    
    for image_file in tqdm(image_files, desc="Processing images"):
        # Read the image
        image_path = os.path.join(image_folder, image_file)
        img = cv2.imread(image_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        
        # Load the corresponding label file
        label_file = os.path.splitext(image_file)[0] + ".json"
        label_path = os.path.join(label_folder, label_file)
        
        if not os.path.exists(label_path):
            print(f"Label file for {image_file} not found.")
            continue
        
        with open(label_path, 'r') as f:
            labels = json.load(f)
        
        # Process each labeled region
        for idx, shape in enumerate(labels['shapes']):
            points = shape['points']
            label = shape['label']
            
            # Get bounding box coordinates
            x_min, y_min = map(int, points[0])
            x_max, y_max = map(int, points[1])
            
            # Crop the labeled region
            crop = img[y_min:y_max, x_min:x_max]
            
            # Calculate padding
            crop_h, crop_w = crop.shape[:2]
            pad_top = max((target_size[0] - crop_h) // 2, 0)
            pad_bottom = max(target_size[0] - crop_h - pad_top, 0)
            pad_left = max((target_size[1] - crop_w) // 2, 0)
            pad_right = max(target_size[1] - crop_w - pad_left, 0)
            
            # Apply padding with white pixels
            padded_crop = cv2.copyMakeBorder(
                crop, pad_top, pad_bottom, pad_left, pad_right,
                cv2.BORDER_CONSTANT, value=(255, 255, 255)
            )
            
            # Save processed image
            processed_name = f"{os.path.splitext(image_file)[0]}_{idx}.png"
            processed_path = os.path.join(processed_dir, processed_name)
            cv2.imwrite(processed_path, cv2.cvtColor(padded_crop, cv2.COLOR_RGB2BGR))
            
            # Add to dataset
            dataset.append([processed_path, label])
            vocab.update(list(label))
            max_len = max(max_len, len(label))
    
    return dataset, vocab, max_len

# Define paths
image_folder = r"C:\Users\Kingstone\Desktop\All folder\project work\IDRBT_Cheque_Image_Dataset\code\train\data\image"
label_folder = r"C:\Users\Kingstone\Desktop\All folder\project work\IDRBT_Cheque_Image_Dataset\code\train\data\labels"

# Load dataset
print("Loading and processing dataset...")
dataset, vocab, max_len = load_dataset(image_folder, label_folder, target_size=(300, 300))

# Create a ModelConfigs object to store model configurations
configs = ModelConfigs()

# Save vocab and maximum text length to configs
configs.vocab = "".join(sorted(vocab))
configs.max_text_length = max_len
configs.height = 300  # Match target_size
configs.width = 300   # Match target_size
configs.save()

print(f"Vocabulary size: {len(configs.vocab)}")
print(f"Maximum text length: {configs.max_text_length}")
print(f"Dataset size: {len(dataset)}")

# Create a data provider for the dataset
data_provider = DataProvider(
    dataset=dataset,
    skip_validation=True,
    batch_size=configs.batch_size,
    data_preprocessors=[ImageReader(CVImage)],
    transformers=[
        ImageResizer(configs.width, configs.height, keep_aspect_ratio=True),
        LabelIndexer(configs.vocab),
        LabelPadding(max_word_length=configs.max_text_length, padding_value=len(configs.vocab)),
    ],
)

# Split the dataset into training and validation sets
train_data_provider, val_data_provider = data_provider.split(split=0.9)

# Augment training data with random transformations
train_data_provider.augmentors = [
    RandomBrightness(),
    RandomErodeDilate(),
    RandomSharpen(),
]

# Create and compile model
model = train_model(
    input_dim=(configs.height, configs.width, 3),
    output_dim=len(configs.vocab),
)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=configs.learning_rate),
    loss=CTCloss(),
    metrics=[
        CERMetric(vocabulary=configs.vocab),
        WERMetric(vocabulary=configs.vocab)
    ],
    run_eagerly=False
)

model.summary(line_length=110)

# Define callbacks
callbacks = [
    EarlyStopping(monitor="val_CER", patience=20, verbose=1, mode="min"),
    ModelCheckpoint(f"{configs.model_path}/model.keras", monitor="val_CER", verbose=1, save_best_only=True, mode="min"),
    TrainLogger(configs.model_path),
    TensorBoard(f"{configs.model_path}/logs", update_freq=1),
    ReduceLROnPlateau(monitor="val_CER", factor=0.9, min_delta=1e-10, patience=5, verbose=1, mode="auto"),
    Model2onnx(f"{configs.model_path}/model.keras")
]

# Train the model
print("Starting training...")
model.fit(
    train_data_provider,
    validation_data=val_data_provider,
    epochs=configs.train_epochs,
    callbacks=callbacks,
    #workers=configs.train_workers
)

# Save training and validation datasets as csv files
train_data_provider.to_csv(os.path.join(configs.model_path, "train.csv"))
val_data_provider.to_csv(os.path.join(configs.model_path, "val.csv"))

Loading and processing dataset...


Processing images: 100%|██████████| 112/112 [00:08<00:00, 12.78it/s]


Vocabulary size: 12
Maximum text length: 6
Dataset size: 336



Starting training...
Epoch 1/5
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - CER: 19.7123 - WER: 1.0000 - loss: 454.5406
Epoch 1: val_CER improved from inf to 9.12351, saving model to Models/cheque_recognition/model.keras
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 3s/step - CER: 18.8336 - WER: 1.0000 - loss: 438.7499 - val_CER: 9.1235 - val_WER: 1.0000 - val_loss: 46.7994 - learning_rate: 0.0010
Epoch 2/5
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - CER: 6.5956 - WER: 1.0000 - loss: 50.9357
Epoch 2: val_CER improved from 9.12351 to 5.03249, saving model to Models/cheque_recognition/model.keras
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 3s/step - CER: 6.4740 - WER: 1.0000 - loss: 50.2338 - val_CER: 5.0325 - val_WER: 1.0000 - val_loss: 47.9402 - learning_rate: 0.0010
Epoch 3/5
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - CER: 4.2419 - WER: 1.0000 - loss: 47.86

resnet 

In [21]:
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.applications import ResNet50

def train_model(input_dim, output_dim):
    """
    Creates a ResNet-based OCR model
    
    Args:
        input_dim: Tuple of (height, width, channels)
        output_dim: Number of classes (vocabulary size)
        
    Returns:
        tf.keras.Model: Compiled model ready for training
    """
    
    # Input layer
    inputs = layers.Input(shape=input_dim, name="input")
    
    # Load ResNet50 without top layers, freeze early layers
    resnet = ResNet50(
        include_top=False,
        weights='imagenet',
        input_tensor=inputs,
        pooling=None
    )
    
    # Freeze early layers (up to block3)
    for layer in resnet.layers[:100]:
        layer.trainable = False
    
    # Add custom layers on top of ResNet
    x = resnet.output
    
    # Reduce height dimension for OCR
    x = layers.Conv2D(512, (3, 3), padding='same', activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.MaxPooling2D(pool_size=(2, 1), padding='same')(x)
    
    # Additional convolutional blocks for feature extraction
    x = layers.Conv2D(512, (3, 3), padding='same', activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Conv2D(512, (3, 3), padding='same', activation='relu')(x)
    x = layers.BatchNormalization()(x)
    
    # Prepare for sequence prediction
    x = layers.Conv2D(output_dim, (1, 1), activation='relu')(x)
    
    # Convert to sequence
    x = layers.Lambda(lambda x: tf.reduce_mean(x, axis=1))(x)
    
    # Output layer with softmax activation
    outputs = layers.Activation('softmax', name='output')(x)
    
    # Create and return model
    model = Model(inputs=inputs, outputs=outputs)
    
    return model

class ModelConfigs:
    """Configuration class to store model parameters"""
    def __init__(self):
        self.vocab = ""
        self.max_text_length = 0
        self.height = 300
        self.width = 300
        self.batch_size = 32
        self.learning_rate = 0.001
        self.train_epochs = 100
        self.model_path = "./saved_models"
    
    def save(self):
        """Save configurations to a file"""
        if not os.path.exists(self.model_path):
            os.makedirs(self.model_path)
        
        config_path = os.path.join(self.model_path, "config.json")
        with open(config_path, 'w') as f:
            json.dump({
                'vocab': self.vocab,
                'max_text_length': self.max_text_length,
                'height': self.height,
                'width': self.width,
                'batch_size': self.batch_size,
                'learning_rate': self.learning_rate,
                'train_epochs': self.train_epochs
            }, f, indent=4)

In [50]:
import tensorflow as tf
try: [tf.config.experimental.set_memory_growth(gpu, True) for gpu in tf.config.experimental.list_physical_devices("GPU")]
except: pass

from tensorflow.keras import layers, Model
from tensorflow.keras.applications import ResNet50
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard
from mltu.preprocessors import ImageReader
from mltu.transformers import ImageResizer
from mltu.augmentors import RandomBrightness, RandomRotate, RandomErodeDilate, RandomSharpen
from mltu.annotations.images import CVImage
from mltu.tensorflow.dataProvider import DataProvider

import os
import cv2
import json
import numpy as np
from tqdm import tqdm

def train_model(input_dim, output_dim):
    """
    Creates a ResNet-based OCR model
    """
    # Input layer
    input_image = layers.Input(shape=input_dim, name="image_input")
    input_labels = layers.Input(shape=(None,), dtype=tf.int32, name="label_input")
    
    # Load ResNet50 without top layers
    resnet = ResNet50(
        include_top=False,
        weights='imagenet',
        input_tensor=input_image,
        pooling=None
    )
    
    # Freeze early layers
    for layer in resnet.layers[:100]:
        layer.trainable = False
    
    x = resnet.output
    
    # Add custom layers for OCR
    x = layers.Conv2D(512, (3, 3), padding='same', activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.MaxPooling2D(pool_size=(2, 1), padding='same')(x)
    
    # Prepare for sequence processing
    shape = x.shape
    x = layers.Reshape((shape[1], shape[2] * shape[3]))(x)
    
    # RNN layers
    x = layers.Bidirectional(layers.LSTM(256, return_sequences=True))(x)
    x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(x)
    
    # Output layer
    x = layers.Dense(output_dim + 1, activation='softmax')(x)  # +1 for CTC blank
    
    model = Model(inputs=input_image, outputs=x)
    
    return model

class LabelConverter:
    def __init__(self, vocab):
        self.vocab = vocab
        self.char_to_num = tf.keras.layers.StringLookup(
            vocabulary=list(vocab), num_oov_indices=0, mask_token=None
        )
    
    def __call__(self, data, label):
        """Convert text to sequence of numbers"""
        # Convert string to list of characters
        chars = tf.strings.unicode_split(label, input_encoding='UTF-8')
        # Convert chars to indices
        nums = self.char_to_num(chars)
        # Cast to int32
        return data, tf.cast(nums, tf.int32)

def train_step(model, optimizer, batch_images, batch_labels):
    """Custom training step with CTC loss"""
    with tf.GradientTape() as tape:
        # Forward pass
        y_pred = model(batch_images, training=True)
        
        # Prepare lengths
        input_length = tf.ones(tf.shape(batch_labels)[0]) * tf.cast(tf.shape(y_pred)[1], dtype=tf.int32)
        label_length = tf.cast(tf.reduce_sum(tf.cast(batch_labels != 0, tf.int32), axis=1), tf.int32)
        
        # Calculate CTC loss
        ctc_loss = tf.keras.backend.ctc_batch_cost(
            batch_labels,
            y_pred,
            input_length,
            label_length
        )
        
        # Calculate total loss
        total_loss = tf.reduce_mean(ctc_loss)
    
    # Calculate gradients and update weights
    grads = tape.gradient(total_loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    
    return total_loss

def main():
    # Define paths
    image_folder = r"C:\Users\Kingstone\Desktop\All folder\project work\IDRBT_Cheque_Image_Dataset\code\train\data\image"
    label_folder = r"C:\Users\Kingstone\Desktop\All folder\project work\IDRBT_Cheque_Image_Dataset\code\train\data\labels"

    print("Loading and processing dataset...")
    dataset, vocab, max_len = load_dataset(image_folder, label_folder)

    # Initialize configurations
    configs = ModelConfigs()
    configs.vocab = "␢" + "".join(sorted(set(vocab)))  # Add blank token at start
    configs.max_text_length = max_len
    configs.batch_size = 8  # Reduced batch size
    configs.learning_rate = 0.0001  # Reduced learning rate
    configs.save()

    # Create data providers
    label_converter = LabelConverter(configs.vocab)
    
    data_provider = DataProvider(
        dataset=dataset,
        skip_validation=True,
        batch_size=configs.batch_size,
        data_preprocessors=[
            ImageReader(CVImage)  # Ensure RGB output
        ],
        transformers=[
            ImageResizer(configs.width, configs.height, keep_aspect_ratio=True),
            label_converter
        ],
        prepare_batch=True  # Add this parameter
    )

    # Split into train and validation sets
    train_size = int(0.9 * len(dataset))
    train_dataset = dataset[:train_size]
    val_dataset = dataset[train_size:]

    train_data_provider = DataProvider(
        dataset=train_dataset,
        skip_validation=True,
        batch_size=configs.batch_size,
        data_preprocessors=[
            ImageReader(CVImage)
        ],
        transformers=[
            ImageResizer(configs.width, configs.height, keep_aspect_ratio=True),
            label_converter
        ],
        prepare_batch=True
    )

    val_data_provider = DataProvider(
        dataset=val_dataset,
        skip_validation=True,
        batch_size=configs.batch_size,
        data_preprocessors=[
            ImageReader(CVImage)
        ],
        transformers=[
            ImageResizer(configs.width, configs.height, keep_aspect_ratio=True),
            label_converter
        ],
        prepare_batch=True
    )

    train_data_provider.augmentors = [
        RandomBrightness(),
        RandomErodeDilate(),
        RandomSharpen(),
    ]

    # Create model and optimizer
    model = train_model(
        input_dim=(configs.height, configs.width, 3),
        output_dim=len(configs.vocab)
    )
    
    optimizer = tf.keras.optimizers.Adam(learning_rate=configs.learning_rate, clipnorm=1.0)

    # Training loop
    num_epochs = configs.train_epochs
    best_val_loss = float('inf')
    patience = 10
    patience_counter = 0

    print("Starting training...")
    for epoch in range(num_epochs):
        # Training
        train_losses = []
        for batch_data in tqdm(train_data_provider, desc=f"Epoch {epoch + 1}/{num_epochs}"):
            # Process batch data
            if isinstance(batch_data, (list, tuple)):
                batch_images = np.array([item[0] for item in batch_data])
                batch_labels = np.array([item[1] for item in batch_data])
            else:
                print(f"Unexpected batch data format: {type(batch_data)}")
                continue
            
            loss = train_step(model, optimizer, batch_images, batch_labels)
            train_losses.append(loss)
        
        avg_train_loss = np.mean(train_losses)
        
        # Validation
        val_losses = []
        for batch_data in val_data_provider:
            # Process batch data
            if isinstance(batch_data, (list, tuple)):
                batch_images = np.array([item[0] for item in batch_data])
                batch_labels = np.array([item[1] for item in batch_data])
            else:
                continue
            
            y_pred = model(batch_images, training=False)
            input_length = tf.ones(len(batch_labels)) * tf.cast(tf.shape(y_pred)[1], dtype=tf.int32)
            label_length = tf.cast(tf.reduce_sum(tf.cast(batch_labels != 0, tf.int32), axis=1), tf.int32)
            
            val_loss = tf.keras.backend.ctc_batch_cost(
                batch_labels,
                y_pred,
                input_length,
                label_length
            )
            val_losses.append(tf.reduce_mean(val_loss))
        
        avg_val_loss = np.mean(val_losses)
        
        print(f"Epoch {epoch + 1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")
        
        # Early stopping
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            model.save(os.path.join(configs.model_path, "best_model.h5"))
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered")
                break

    print("Training completed!")

if __name__ == "__main__":
    main()

Loading and processing dataset...


Processing images: 100%|██████████| 112/112 [00:06<00:00, 18.49it/s]


TypeError: DataProvider.__init__() got an unexpected keyword argument 'prepare_batch'

VGG 19 


In [33]:
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.applications import VGG19

def train_model(input_dim, output_dim):
    """
    Creates a VGG19-based OCR model
    
    Args:
        input_dim: Tuple of (height, width, channels)
        output_dim: Number of classes (vocabulary size)
        
    Returns:
        tf.keras.Model: Compiled model ready for training
    """
    
    # Input layer
    inputs = layers.Input(shape=input_dim, name="input")
    
    # Load VGG19 without top layers, freeze early layers
    vgg = VGG19(
        include_top=False,
        weights='imagenet',
        input_tensor=inputs,
        pooling=None
    )
    
    # Freeze early layers (first few convolutional blocks)
    for layer in vgg.layers[:15]:  # Freeze up to block4
        layer.trainable = False
    
    # Add custom layers on top of VGG19
    x = vgg.output
    
    # Reduce height dimension for OCR while preserving width information
    x = layers.Conv2D(512, (3, 3), padding='same', activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.MaxPooling2D(pool_size=(2, 1), padding='same')(x)
    
    # Additional convolutional blocks for OCR-specific feature extraction
    x = layers.Conv2D(512, (3, 3), padding='same', activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.25)(x)
    
    x = layers.Conv2D(512, (3, 3), padding='same', activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.25)(x)
    
    # Prepare for sequence prediction
    x = layers.Conv2D(output_dim, (1, 1), activation='relu')(x)
    
    # Convert to sequence by averaging across height
    x = layers.Lambda(lambda x: tf.reduce_mean(x, axis=1))(x)
    
    # Output layer with softmax activation for character prediction
    outputs = layers.Activation('softmax', name='output')(x)
    
    # Create and return model
    model = Model(inputs=inputs, outputs=outputs)
    
    return model

class ModelConfigs:
    """Configuration class to store model parameters"""
    def __init__(self):
        self.vocab = ""
        self.max_text_length = 0
        self.height = 300
        self.width = 300
        self.batch_size = 32
        self.learning_rate = 0.001
        self.train_epochs = 100
        self.model_path = "./saved_models"
    
    def save(self):
        """Save configurations to a file"""
        if not os.path.exists(self.model_path):
            os.makedirs(self.model_path)
        
        config_path = os.path.join(self.model_path, "config.json")
        with open(config_path, 'w') as f:
            json.dump({
                'vocab': self.vocab,
                'max_text_length': self.max_text_length,
                'height': self.height,
                'width': self.width,
                'batch_size': self.batch_size,
                'learning_rate': self.learning_rate,
                'train_epochs': self.train_epochs
            }, f, indent=4)

In [34]:
import tensorflow as tf
try: [tf.config.experimental.set_memory_growth(gpu, True) for gpu in tf.config.experimental.list_physical_devices("GPU")]
except: pass

from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard
from mltu.preprocessors import ImageReader
from mltu.transformers import ImageResizer, LabelIndexer, LabelPadding
from mltu.augmentors import RandomBrightness, RandomRotate, RandomErodeDilate, RandomSharpen
from mltu.annotations.images import CVImage
from mltu.tensorflow.dataProvider import DataProvider
from mltu.tensorflow.losses import CTCloss
from mltu.tensorflow.callbacks import Model2onnx, TrainLogger
from mltu.tensorflow.metrics import CERMetric, WERMetric


import os
import cv2
import json
from tqdm import tqdm

def load_dataset(image_folder, label_folder, target_size=(300, 300)):
    """
    Load dataset and create a list of [processed_image_path, label] pairs
    Returns:
        dataset: List of [image_path, label] pairs
        vocab: Set of unique characters in labels
        max_len: Maximum length of any label
    """
    dataset, vocab, max_len = [], set(), 0
    valid_formats = ('.jpg', '.jpeg', '.png', '.tif')
    
    # Create a directory for processed images if it doesn't exist
    processed_dir = os.path.join(os.path.dirname(image_folder), 'processed_images')
    os.makedirs(processed_dir, exist_ok=True)
    
    image_files = sorted([f for f in os.listdir(image_folder) 
                         if f.lower().endswith(valid_formats)],
                        reverse=True)
    
    for image_file in tqdm(image_files, desc="Processing images"):
        # Read the image
        image_path = os.path.join(image_folder, image_file)
        img = cv2.imread(image_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        
        # Load the corresponding label file
        label_file = os.path.splitext(image_file)[0] + ".json"
        label_path = os.path.join(label_folder, label_file)
        
        if not os.path.exists(label_path):
            print(f"Label file for {image_file} not found.")
            continue
        
        with open(label_path, 'r') as f:
            labels = json.load(f)
        
        # Process each labeled region
        for idx, shape in enumerate(labels['shapes']):
            points = shape['points']
            label = shape['label']
            
            # Get bounding box coordinates
            x_min, y_min = map(int, points[0])
            x_max, y_max = map(int, points[1])
            
            # Crop the labeled region
            crop = img[y_min:y_max, x_min:x_max]
            
            # Calculate padding
            crop_h, crop_w = crop.shape[:2]
            pad_top = max((target_size[0] - crop_h) // 2, 0)
            pad_bottom = max(target_size[0] - crop_h - pad_top, 0)
            pad_left = max((target_size[1] - crop_w) // 2, 0)
            pad_right = max(target_size[1] - crop_w - pad_left, 0)
            
            # Apply padding with white pixels
            padded_crop = cv2.copyMakeBorder(
                crop, pad_top, pad_bottom, pad_left, pad_right,
                cv2.BORDER_CONSTANT, value=(255, 255, 255)
            )
            
            # Save processed image
            processed_name = f"{os.path.splitext(image_file)[0]}_{idx}.png"
            processed_path = os.path.join(processed_dir, processed_name)
            cv2.imwrite(processed_path, cv2.cvtColor(padded_crop, cv2.COLOR_RGB2BGR))
            
            # Add to dataset
            dataset.append([processed_path, label])
            vocab.update(list(label))
            max_len = max(max_len, len(label))
    
    return dataset, vocab, max_len

# Define paths
image_folder = r"C:\Users\Kingstone\Desktop\All folder\project work\IDRBT_Cheque_Image_Dataset\code\train\data\image"
label_folder = r"C:\Users\Kingstone\Desktop\All folder\project work\IDRBT_Cheque_Image_Dataset\code\train\data\labels"

# Load dataset
print("Loading and processing dataset... vgg 19 code ")
dataset, vocab, max_len = load_dataset(image_folder, label_folder, target_size=(300, 300))

# Create a ModelConfigs object to store model configurations
configs = ModelConfigs()

# Save vocab and maximum text length to configs
configs.vocab = "".join(sorted(vocab))
configs.max_text_length = max_len
configs.height = 300  # Match target_size
configs.width = 300   # Match target_size
configs.save()

print(f"Vocabulary size: {len(configs.vocab)}")
print(f"Maximum text length: {configs.max_text_length}")
print(f"Dataset size: {len(dataset)}")

# Create a data provider for the dataset
data_provider = DataProvider(
    dataset=dataset,
    skip_validation=True,
    batch_size=configs.batch_size,
    data_preprocessors=[ImageReader(CVImage)],
    transformers=[
        ImageResizer(configs.width, configs.height, keep_aspect_ratio=True),
        LabelIndexer(configs.vocab),
        LabelPadding(max_word_length=configs.max_text_length, padding_value=len(configs.vocab)),
    ],
)

# Split the dataset into training and validation sets
train_data_provider, val_data_provider = data_provider.split(split=0.9)

# Augment training data with random transformations
train_data_provider.augmentors = [
    RandomBrightness(),
    RandomErodeDilate(),
    RandomSharpen(),
]

# Create and compile model
model = train_model(
    input_dim=(configs.height, configs.width, 3),
    output_dim=len(configs.vocab),
)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=configs.learning_rate),
    loss=CTCloss(),
    metrics=[
        CERMetric(vocabulary=configs.vocab),
        WERMetric(vocabulary=configs.vocab)
    ],
    run_eagerly=False
)

model.summary(line_length=110)

# Define callbacks
callbacks = [
    EarlyStopping(monitor="val_CER", patience=20, verbose=1, mode="min"),
    ModelCheckpoint(f"{configs.model_path}/model.keras", monitor="val_CER", verbose=1, save_best_only=True, mode="min"),
    TrainLogger(configs.model_path),
    TensorBoard(f"{configs.model_path}/logs", update_freq=1),
    ReduceLROnPlateau(monitor="val_CER", factor=0.9, min_delta=1e-10, patience=5, verbose=1, mode="auto"),
    Model2onnx(f"{configs.model_path}/model.keras")
]

# Train the model
print("Starting training...")
model.fit(
    train_data_provider,
    validation_data=val_data_provider,
    epochs=configs.train_epochs,
    callbacks=callbacks,
    #workers=configs.train_workers
)

# Save training and validation datasets as csv files
train_data_provider.to_csv(os.path.join(configs.model_path, "train.csv"))
val_data_provider.to_csv(os.path.join(configs.model_path, "val.csv"))

Loading and processing dataset... vgg 19 code 


Processing images: 100%|██████████| 112/112 [00:08<00:00, 13.22it/s]


Vocabulary size: 12
Maximum text length: 6
Dataset size: 336
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg19/vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m80134624/80134624[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m166s[0m 2us/step


Starting training...
Epoch 1/100


InvalidArgumentError: Graph execution error:

Detected at node compile_loss/CTCLoss defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "C:\Users\Kingstone\AppData\Roaming\Python\Python312\site-packages\ipykernel_launcher.py", line 18, in <module>

  File "C:\Users\Kingstone\AppData\Roaming\Python\Python312\site-packages\traitlets\config\application.py", line 1075, in launch_instance

  File "C:\Users\Kingstone\AppData\Roaming\Python\Python312\site-packages\ipykernel\kernelapp.py", line 739, in start

  File "C:\Users\Kingstone\AppData\Roaming\Python\Python312\site-packages\tornado\platform\asyncio.py", line 205, in start

  File "c:\Users\Kingstone\AppData\Local\Programs\Python\Python312\Lib\asyncio\base_events.py", line 641, in run_forever

  File "c:\Users\Kingstone\AppData\Local\Programs\Python\Python312\Lib\asyncio\base_events.py", line 1986, in _run_once

  File "c:\Users\Kingstone\AppData\Local\Programs\Python\Python312\Lib\asyncio\events.py", line 88, in _run

  File "C:\Users\Kingstone\AppData\Roaming\Python\Python312\site-packages\ipykernel\kernelbase.py", line 545, in dispatch_queue

  File "C:\Users\Kingstone\AppData\Roaming\Python\Python312\site-packages\ipykernel\kernelbase.py", line 534, in process_one

  File "C:\Users\Kingstone\AppData\Roaming\Python\Python312\site-packages\ipykernel\kernelbase.py", line 437, in dispatch_shell

  File "C:\Users\Kingstone\AppData\Roaming\Python\Python312\site-packages\ipykernel\ipkernel.py", line 362, in execute_request

  File "C:\Users\Kingstone\AppData\Roaming\Python\Python312\site-packages\ipykernel\kernelbase.py", line 778, in execute_request

  File "C:\Users\Kingstone\AppData\Roaming\Python\Python312\site-packages\ipykernel\ipkernel.py", line 449, in do_execute

  File "C:\Users\Kingstone\AppData\Roaming\Python\Python312\site-packages\ipykernel\zmqshell.py", line 549, in run_cell

  File "C:\Users\Kingstone\AppData\Roaming\Python\Python312\site-packages\IPython\core\interactiveshell.py", line 3075, in run_cell

  File "C:\Users\Kingstone\AppData\Roaming\Python\Python312\site-packages\IPython\core\interactiveshell.py", line 3130, in _run_cell

  File "C:\Users\Kingstone\AppData\Roaming\Python\Python312\site-packages\IPython\core\async_helpers.py", line 128, in _pseudo_sync_runner

  File "C:\Users\Kingstone\AppData\Roaming\Python\Python312\site-packages\IPython\core\interactiveshell.py", line 3334, in run_cell_async

  File "C:\Users\Kingstone\AppData\Roaming\Python\Python312\site-packages\IPython\core\interactiveshell.py", line 3517, in run_ast_nodes

  File "C:\Users\Kingstone\AppData\Roaming\Python\Python312\site-packages\IPython\core\interactiveshell.py", line 3577, in run_code

  File "C:\Users\Kingstone\AppData\Local\Temp\ipykernel_11912\2533748322.py", line 169, in <module>

  File "c:\Users\Kingstone\AppData\Local\Programs\Python\Python312\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "c:\Users\Kingstone\AppData\Local\Programs\Python\Python312\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 320, in fit

  File "c:\Users\Kingstone\AppData\Local\Programs\Python\Python312\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 121, in one_step_on_iterator

  File "c:\Users\Kingstone\AppData\Local\Programs\Python\Python312\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 108, in one_step_on_data

  File "c:\Users\Kingstone\AppData\Local\Programs\Python\Python312\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 54, in train_step

  File "c:\Users\Kingstone\AppData\Local\Programs\Python\Python312\Lib\site-packages\keras\src\trainers\trainer.py", line 398, in _compute_loss

  File "c:\Users\Kingstone\AppData\Local\Programs\Python\Python312\Lib\site-packages\keras\src\trainers\trainer.py", line 366, in compute_loss

  File "c:\Users\Kingstone\AppData\Local\Programs\Python\Python312\Lib\site-packages\keras\src\trainers\compile_utils.py", line 618, in __call__

  File "c:\Users\Kingstone\AppData\Local\Programs\Python\Python312\Lib\site-packages\keras\src\trainers\compile_utils.py", line 659, in call

  File "c:\Users\Kingstone\AppData\Local\Programs\Python\Python312\Lib\site-packages\mltu\tensorflow\losses.py", line 20, in __call__

  File "c:\Users\Kingstone\AppData\Local\Programs\Python\Python312\Lib\site-packages\keras\src\legacy\backend.py", line 666, in ctc_batch_cost

Saw a non-null label (index >= num_classes - 1) following a null label, batch: 1 num_classes: 12 labels: 0,7,9,11,8,10 labels seen so far: 0,7,9
	 [[{{node compile_loss/CTCLoss}}]] [Op:__inference_one_step_on_iterator_175655]

inception v3 

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.applications import InceptionV3

def train_model(input_dim, output_dim):
    """
    Creates an InceptionV3-based OCR model
    
    Args:
        input_dim: Tuple of (height, width, channels)
        output_dim: Number of classes (vocabulary size)
        
    Returns:
        tf.keras.Model: Compiled model ready for training
    """
    
    # Input layer
    inputs = layers.Input(shape=input_dim, name="input")
    
    # Load InceptionV3 without top layers, freeze early layers
    inception = InceptionV3(
        include_top=False,
        weights='imagenet',
        input_tensor=inputs,
        pooling=None
    )
    
    # Freeze early layers (up to mixed5)
    for layer in inception.layers[:249]:  # Freeze up to mixed5
        layer.trainable = False
    
    # Add custom layers on top of InceptionV3
    x = inception.output
    
    # Add inception-style module for OCR
    tower_1 = layers.Conv2D(192, (1, 1), padding='same', activation='relu')(x)
    tower_1 = layers.Conv2D(256, (3, 3), padding='same', activation='relu')(tower_1)
    
    tower_2 = layers.Conv2D(192, (1, 1), padding='same', activation='relu')(x)
    tower_2 = layers.Conv2D(256, (1, 5), padding='same', activation='relu')(tower_2)
    tower_2 = layers.Conv2D(256, (5, 1), padding='same', activation='relu')(tower_2)
    
    tower_3 = layers.MaxPooling2D((3, 3), strides=(1, 1), padding='same')(x)
    tower_3 = layers.Conv2D(128, (1, 1), padding='same', activation='relu')(tower_3)
    
    # Concatenate all towers
    x = layers.concatenate([tower_1, tower_2, tower_3], axis=3)
    
    # Reduce height dimension for OCR while preserving width
    x = layers.Conv2D(512, (3, 3), padding='same', activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.MaxPooling2D(pool_size=(2, 1), padding='same')(x)
    
    # Add dropout for regularization
    x = layers.Dropout(0.3)(x)
    
    # Additional convolution for feature refinement
    x = layers.Conv2D(512, (3, 3), padding='same', activation='relu')(x)
    x = layers.BatchNormalization()(x)
    
    # Prepare for sequence prediction
    x = layers.Conv2D(output_dim, (1, 1), activation='relu')(x)
    
    # Convert to sequence by averaging across height
    x = layers.Lambda(lambda x: tf.reduce_mean(x, axis=1))(x)
    
    # Output layer with softmax activation
    outputs = layers.Activation('softmax', name='output')(x)
    
    # Create and return model
    model = Model(inputs=inputs, outputs=outputs)
    
    return model

class ModelConfigs:
    """Configuration class to store model parameters"""
    def __init__(self):
        self.vocab = ""
        self.max_text_length = 0
        self.height = 300
        self.width = 300
        self.batch_size = 32
        self.learning_rate = 0.001
        self.train_epochs = 100
        self.model_path = "./saved_models"
    
    def save(self):
        """Save configurations to a file"""
        if not os.path.exists(self.model_path):
            os.makedirs(self.model_path)
        
        config_path = os.path.join(self.model_path, "config.json")
        with open(config_path, 'w') as f:
            json.dump({
                'vocab': self.vocab,
                'max_text_length': self.max_text_length,
                'height': self.height,
                'width': self.width,
                'batch_size': self.batch_size,
                'learning_rate': self.learning_rate,
                'train_epochs': self.train_epochs
            }, f, indent=4)

mobileVnet

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.applications import MobileNetV2

def train_model(input_dim, output_dim):
    """
    Creates a MobileNetV2-based OCR model
    
    Args:
        input_dim: Tuple of (height, width, channels)
        output_dim: Number of classes (vocabulary size)
        
    Returns:
        tf.keras.Model: Compiled model ready for training
    """
    
    # Input layer
    inputs = layers.Input(shape=input_dim, name="input")
    
    # Load MobileNetV2 without top layers
    mobilenet = MobileNetV2(
        include_top=False,
        weights='imagenet',
        input_tensor=inputs,
        pooling=None
    )
    
    # Freeze early layers (up to block 10)
    for layer in mobilenet.layers[:100]:
        layer.trainable = False
    
    # Add custom layers on top of MobileNet
    x = mobilenet.output
    
    # Add depthwise separable convolutions for efficiency
    x = layers.SeparableConv2D(512, (3, 3), padding='same', use_bias=False)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)
    
    # Reduce height dimension for OCR while preserving width
    x = layers.MaxPooling2D(pool_size=(2, 1), padding='same')(x)
    
    # Additional depthwise separable convolutions
    x = layers.SeparableConv2D(512, (3, 3), padding='same', use_bias=False)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)
    x = layers.Dropout(0.25)(x)
    
    # Final feature extraction
    x = layers.SeparableConv2D(512, (3, 3), padding='same', use_bias=False)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)
    
    # Prepare for sequence prediction
    x = layers.Conv2D(output_dim, (1, 1), activation='relu')(x)
    
    # Convert to sequence by averaging across height
    x = layers.Lambda(lambda x: tf.reduce_mean(x, axis=1))(x)
    
    # Output layer with softmax activation
    outputs = layers.Activation('softmax', name='output')(x)
    
    # Create and return model
    model = Model(inputs=inputs, outputs=outputs)
    
    return model

class ModelConfigs:
    """Configuration class to store model parameters"""
    def __init__(self):
        self.vocab = ""
        self.max_text_length = 0
        self.height = 300
        self.width = 300
        self.batch_size = 32
        self.learning_rate = 0.001
        self.train_epochs = 100
        self.model_path = "./saved_models"
    
    def save(self):
        """Save configurations to a file"""
        if not os.path.exists(self.model_path):
            os.makedirs(self.model_path)
        
        config_path = os.path.join(self.model_path, "config.json")
        with open(config_path, 'w') as f:
            json.dump({
                'vocab': self.vocab,
                'max_text_length': self.max_text_length,
                'height': self.height,
                'width': self.width,
                'batch_size': self.batch_size,
                'learning_rate': self.learning_rate,
                'train_epochs': self.train_epochs
            }, f, indent=4)

densenet

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.applications import DenseNet121

def train_model(input_dim, output_dim):
    """
    Creates a DenseNet-based OCR model
    
    Args:
        input_dim: Tuple of (height, width, channels)
        output_dim: Number of classes (vocabulary size)
        
    Returns:
        tf.keras.Model: Compiled model ready for training
    """
    
    # Input layer
    inputs = layers.Input(shape=input_dim, name="input")
    
    # Load DenseNet121 without top layers
    densenet = DenseNet121(
        include_top=False,
        weights='imagenet',
        input_tensor=inputs,
        pooling=None
    )
    
    # Freeze early layers (up to block3)
    for layer in densenet.layers[:200]:
        layer.trainable = False
    
    # Get the base model output
    x = densenet.output
    
    # Create a custom dense block for OCR
    def dense_block(x, growth_rate, num_layers):
        concatenated_inputs = [x]
        
        for i in range(num_layers):
            # Composite function: BN -> ReLU -> Conv
            x = layers.BatchNormalization()(x)
            x = layers.Activation('relu')(x)
            x = layers.Conv2D(growth_rate, (3, 3), padding='same')(x)
            
            # Concatenate with previous layers
            concatenated_inputs.append(x)
            x = layers.Concatenate(axis=-1)(concatenated_inputs)
        
        return x
    
    # Add custom dense block
    x = dense_block(x, growth_rate=32, num_layers=4)
    
    # Transition layer with height reduction for OCR
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)
    x = layers.Conv2D(512, (1, 1), padding='same')(x)
    x = layers.MaxPooling2D(pool_size=(2, 1), padding='same')(x)
    
    # Additional dense block
    x = dense_block(x, growth_rate=32, num_layers=2)
    
    # Add dropout for regularization
    x = layers.Dropout(0.25)(x)
    
    # Final convolution layer
    x = layers.Conv2D(512, (3, 3), padding='same', activation='relu')(x)
    x = layers.BatchNormalization()(x)
    
    # Prepare for sequence prediction
    x = layers.Conv2D(output_dim, (1, 1), activation='relu')(x)
    
    # Convert to sequence by averaging across height
    x = layers.Lambda(lambda x: tf.reduce_mean(x, axis=1))(x)
    
    # Output layer with softmax activation
    outputs = layers.Activation('softmax', name='output')(x)
    
    # Create and return model
    model = Model(inputs=inputs, outputs=outputs)
    
    return model

class ModelConfigs:
    """Configuration class to store model parameters"""
    def __init__(self):
        self.vocab = ""
        self.max_text_length = 0
        self.height = 300
        self.width = 300
        self.batch_size = 32
        self.learning_rate = 0.001
        self.train_epochs = 100
        self.model_path = "./saved_models"
    
    def save(self):
        """Save configurations to a file"""
        if not os.path.exists(self.model_path):
            os.makedirs(self.model_path)
        
        config_path = os.path.join(self.model_path, "config.json")
        with open(config_path, 'w') as f:
            json.dump({
                'vocab': self.vocab,
                'max_text_length': self.max_text_length,
                'height': self.height,
                'width': self.width,
                'batch_size': self.batch_size,
                'learning_rate': self.learning_rate,
                'train_epochs': self.train_epochs
            }, f, indent=4)

Xception


In [None]:
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.applications import Xception

def train_model(input_dim, output_dim):
    """
    Creates an Xception-based OCR model
    
    Args:
        input_dim: Tuple of (height, width, channels)
        output_dim: Number of classes (vocabulary size)
        
    Returns:
        tf.keras.Model: Compiled model ready for training
    """
    
    # Input layer
    inputs = layers.Input(shape=input_dim, name="input")
    
    # Load Xception without top layers
    xception = Xception(
        include_top=False,
        weights='imagenet',
        input_tensor=inputs,
        pooling=None
    )
    
    # Freeze early layers (entry flow and middle flow)
    for layer in xception.layers[:100]:
        layer.trainable = False
    
    # Get the base model output
    x = xception.output
    
    # Custom exit flow for OCR
    def separable_conv_block(x, filters):
        x = layers.Activation('relu')(x)
        x = layers.SeparableConv2D(filters, (3, 3), padding='same')(x)
        x = layers.BatchNormalization()(x)
        return x
    
    # First exit block
    residual = layers.Conv2D(512, (1, 1), strides=(2, 1), padding='same')(x)
    residual = layers.BatchNormalization()(residual)
    
    x = separable_conv_block(x, 512)
    x = separable_conv_block(x, 512)
    x = layers.MaxPooling2D((2, 1), padding='same')(x)
    
    # Add residual connection
    x = layers.add([x, residual])
    
    # Second exit block with height preservation
    x = separable_conv_block(x, 512)
    x = separable_conv_block(x, 512)
    x = layers.BatchNormalization()(x)
    
    # Add dropout for regularization
    x = layers.Dropout(0.3)(x)
    
    # Additional separable convolutions for feature refinement
    x = separable_conv_block(x, 512)
    x = layers.BatchNormalization()(x)
    
    # Prepare for sequence prediction
    x = layers.Conv2D(output_dim, (1, 1), activation='relu')(x)
    
    # Convert to sequence by averaging across height
    x = layers.Lambda(lambda x: tf.reduce_mean(x, axis=1))(x)
    
    # Output layer with softmax activation
    outputs = layers.Activation('softmax', name='output')(x)
    
    # Create and return model
    model = Model(inputs=inputs, outputs=outputs)
    
    return model

class ModelConfigs:
    """Configuration class to store model parameters"""
    def __init__(self):
        self.vocab = ""
        self.max_text_length = 0
        self.height = 300
        self.width = 300
        self.batch_size = 32
        self.learning_rate = 0.001
        self.train_epochs = 100
        self.model_path = "./saved_models"
    
    def save(self):
        """Save configurations to a file"""
        if not os.path.exists(self.model_path):
            os.makedirs(self.model_path)
        
        config_path = os.path.join(self.model_path, "config.json")
        with open(config_path, 'w') as f:
            json.dump({
                'vocab': self.vocab,
                'max_text_length': self.max_text_length,
                'height': self.height,
                'width': self.width,
                'batch_size': self.batch_size,
                'learning_rate': self.learning_rate,
                'train_epochs': self.train_epochs
            }, f, indent=4)

efficient 

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.applications import EfficientNetB0

def train_model(input_dim, output_dim):
    """
    Creates an EfficientNet-based OCR model
    
    Args:
        input_dim: Tuple of (height, width, channels)
        output_dim: Number of classes (vocabulary size)
        
    Returns:
        tf.keras.Model: Compiled model ready for training
    """
    
    # Input layer
    inputs = layers.Input(shape=input_dim, name="input")
    
    # Load EfficientNetB0 without top layers
    efficient_net = EfficientNetB0(
        include_top=False,
        weights='imagenet',
        input_tensor=inputs,
        pooling=None
    )
    
    # Freeze early layers
    for layer in efficient_net.layers[:150]:
        layer.trainable = False
    
    # Get the base model output
    x = efficient_net.output
    
    # Define MBConv block (Mobile Inverted Bottleneck Conv)
    def mbconv_block(x, expand_ratio, output_channels, kernel_size=3):
        # Expansion phase
        channels = x.shape[-1]
        x = layers.Conv2D(channels * expand_ratio, 1, padding='same', use_bias=False)(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('swish')(x)
        
        # Depthwise Convolution
        x = layers.DepthwiseConv2D(kernel_size, padding='same', use_bias=False)(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('swish')(x)
        
        # Squeeze and Excitation
        se = layers.GlobalAveragePooling2D()(x)
        se = layers.Dense(channels * expand_ratio // 4, activation='swish')(se)
        se = layers.Dense(channels * expand_ratio, activation='sigmoid')(se)
        se = layers.Reshape((1, 1, channels * expand_ratio))(se)
        x = layers.multiply([x, se])
        
        # Output phase
        x = layers.Conv2D(output_channels, 1, padding='same', use_bias=False)(x)
        x = layers.BatchNormalization()(x)
        
        return x
    
    # Add custom OCR-specific layers
    # First block with height reduction
    x = mbconv_block(x, expand_ratio=4, output_channels=512)
    x = layers.MaxPooling2D(pool_size=(2, 1), padding='same')(x)
    
    # Second block
    x = mbconv_block(x, expand_ratio=4, output_channels=512)
    x = layers.Dropout(0.25)(x)
    
    # Additional feature refinement
    x = layers.Conv2D(512, (1, 1), use_bias=False)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('swish')(x)
    
    # Prepare for sequence prediction
    x = layers.Conv2D(output_dim, (1, 1), activation='relu')(x)
    
    # Convert to sequence by averaging across height
    x = layers.Lambda(lambda x: tf.reduce_mean(x, axis=1))(x)
    
    # Output layer with softmax activation
    outputs = layers.Activation('softmax', name='output')(x)
    
    # Create and return model
    model = Model(inputs=inputs, outputs=outputs)
    
    return model

class ModelConfigs:
    """Configuration class to store model parameters"""
    def __init__(self):
        self.vocab = ""
        self.max_text_length = 0
        self.height = 300
        self.width = 300
        self.batch_size = 32
        self.learning_rate = 0.001
        self.train_epochs = 100
        self.model_path = "./saved_models"
    
    def save(self):
        """Save configurations to a file"""
        if not os.path.exists(self.model_path):
            os.makedirs(self.model_path)
        
        config_path = os.path.join(self.model_path, "config.json")
        with open(config_path, 'w') as f:
            json.dump({
                'vocab': self.vocab,
                'max_text_length': self.max_text_length,
                'height': self.height,
                'width': self.width,
                'batch_size': self.batch_size,
                'learning_rate': self.learning_rate,
                'train_epochs': self.train_epochs
            }, f, indent=4)

# god owns code

In [None]:
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Reshape, Conv2D, BatchNormalization, LSTM, Bidirectional, Dropout
from tensorflow.keras.models import Model

def create_resnet_ocr_model(input_dim, output_dim):
    """
    Create an OCR model using ResNet50 as the backbone
    Args:
        input_dim: Tuple of (height, width, channels)
        output_dim: Number of classes (vocabulary size)
    Returns:
        tf.keras.Model: Compiled model
    """
    # Load ResNet50 without top layers
    base_model = ResNet50(
        include_top=False,
        weights='imagenet',
        input_shape=input_dim
    )
    
    # Freeze the base model layers
    for layer in base_model.layers:
        layer.trainable = False
    
    # Get the output from ResNet50
    x = base_model.output
    
    # Add custom layers for OCR
    x = Conv2D(128, (3, 3), padding='same', activation='relu')(x)
    x = BatchNormalization()(x)
    
    # Reshape for sequential processing
    _, h, w, c = x.shape
    x = Reshape((-1, h * c))(x)
    
    # Add bidirectional LSTM layers
    x = Bidirectional(LSTM(256, return_sequences=True))(x)
    x = Dropout(0.2)(x)
    x = Bidirectional(LSTM(128, return_sequences=True))(x)
    x = Dropout(0.2)(x)
    
    # Output layer with softmax activation
    outputs = Dense(output_dim + 1, activation='softmax')(x)  # +1 for CTC blank label
    
    # Create model
    model = Model(inputs=base_model.input, outputs=outputs)
    
    return model

# Replace the original train_model function call with this:
model = create_resnet_ocr_model(
    input_dim=(configs.height, configs.width, 3),
    output_dim=len(configs.vocab)
)

# Fine-tuning setup (add this after initial training)
def unfreeze_model(model):
    """
    Unfreeze ResNet50 layers for fine-tuning
    """
    for layer in model.layers:
        layer.trainable = True
    
    return model

# After initial training, you can fine-tune the model:

# Fine-tuning (uncomment and use after initial training)
model = unfreeze_model(model)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=configs.learning_rate * 0.1),
    loss=CTCloss(),
    metrics=[
        CERMetric(vocabulary=configs.vocab),
        WERMetric(vocabulary=configs.vocab)
    ],
    run_eagerly=False
)

# Train for additional epochs with unfrozen layers
model.fit(
    train_data_provider,
    validation_data=val_data_provider,
    epochs=10,
    callbacks=callbacks,
)

Epoch 1/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12s/step - CER: 1.1274 - WER: 1.0000 - loss: 14.9145 

  self._warn_if_super_not_called()



Epoch 1: val_CER improved from inf to 1.03671, saving model to ./saved_models/model.keras
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 13s/step - CER: 1.1198 - WER: 1.0000 - loss: 14.7219 - val_CER: 1.0367 - val_WER: 1.0000 - val_loss: 14.2884 - learning_rate: 1.0000e-04
Epoch 2/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12s/step - CER: 1.0150 - WER: 1.0000 - loss: 9.4917 
Epoch 2: val_CER improved from 1.03671 to 0.98896, saving model to ./saved_models/model.keras
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 13s/step - CER: 1.0131 - WER: 1.0000 - loss: 9.4285 - val_CER: 0.9890 - val_WER: 1.0000 - val_loss: 10.7846 - learning_rate: 1.0000e-04
Epoch 3/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11s/step - CER: 0.9550 - WER: 0.9981 - loss: 7.1105 
Epoch 3: val_CER improved from 0.98896 to 0.90766, saving model to ./saved_models/model.keras
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m119

<keras.src.callbacks.history.History at 0x2052bfefb90>

In [54]:
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Reshape, Conv2D, BatchNormalization, LSTM, Bidirectional, Dropout
from tensorflow.keras.models import Model

def create_resnet_ocr_model(input_dim, output_dim):
    """
    Create an OCR model using ResNet50 as the backbone
    Args:
        input_dim: Tuple of (height, width, channels)
        output_dim: Number of classes (vocabulary size)
    Returns:
        tf.keras.Model: Compiled model
    """
    # Load ResNet50 without top layers
    base_model = VGG19(
        include_top=False,
        weights='imagenet',
        input_shape=input_dim
    )
    
    # Freeze the base model layers
    for layer in base_model.layers:
        layer.trainable = False
    
    # Get the output from ResNet50
    x = base_model.output
    
    # Add custom layers for OCR
    x = Conv2D(128, (3, 3), padding='same', activation='relu')(x)
    x = BatchNormalization()(x)
    
    # Reshape for sequential processing
    _, h, w, c = x.shape
    x = Reshape((-1, h * c))(x)
    
    # Add bidirectional LSTM layers
    x = Bidirectional(LSTM(256, return_sequences=True))(x)
    x = Dropout(0.2)(x)
    x = Bidirectional(LSTM(128, return_sequences=True))(x)
    x = Dropout(0.2)(x)
    
    # Output layer with softmax activation
    outputs = Dense(output_dim + 1, activation='softmax')(x)  # +1 for CTC blank label
    
    # Create model
    model = Model(inputs=base_model.input, outputs=outputs)
    
    return model

# Replace the original train_model function call with this:
model = create_resnet_ocr_model(
    input_dim=(configs.height, configs.width, 3),
    output_dim=len(configs.vocab)
)

# Fine-tuning setup (add this after initial training)
def unfreeze_model(model):
    """
    Unfreeze ResNet50 layers for fine-tuning
    """
    for layer in model.layers:
        layer.trainable = True
    
    return model

# After initial training, you can fine-tune the model:

# Fine-tuning (uncomment and use after initial training)
model = unfreeze_model(model)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=configs.learning_rate * 0.1),
    loss=CTCloss(),
    metrics=[
        CERMetric(vocabulary=configs.vocab),
        WERMetric(vocabulary=configs.vocab)
    ],
    run_eagerly=False
)

# Train for additional epochs with unfrozen layers
model.fit(
    train_data_provider,
    validation_data=val_data_provider,
    epochs=10,
    callbacks=callbacks,
)

Epoch 1/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27s/step - CER: 1.0814 - WER: 1.0000 - loss: 14.7527 
Epoch 1: val_CER did not improve from 0.36267
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m294s[0m 28s/step - CER: 1.0756 - WER: 1.0000 - loss: 14.5900 - val_CER: 1.0141 - val_WER: 1.0000 - val_loss: 11.3462 - learning_rate: 1.0000e-04
Epoch 2/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27s/step - CER: 1.0052 - WER: 1.0000 - loss: 9.1471 
Epoch 2: val_CER did not improve from 0.36267
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m275s[0m 28s/step - CER: 1.0044 - WER: 1.0000 - loss: 9.0957 - val_CER: 0.9893 - val_WER: 1.0000 - val_loss: 9.4465 - learning_rate: 1.0000e-04
Epoch 3/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26s/step - CER: 0.9555 - WER: 0.9993 - loss: 7.2424 
Epoch 3: val_CER did not improve from 0.36267
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m275s[0m 27s/ste

  saveable.load_own_variables(weights_store.get(inner_path))


ValueError: A total of 20 objects could not be loaded. Example error message for object <Conv2D name=block1_conv1, built=True>:

The shape of the target variable and the shape of the target value in `variable.assign(value)` must match. variable.shape=(3, 3, 3, 64), Received: value.shape=(7, 7, 3, 64). Target variable: <KerasVariable shape=(3, 3, 3, 64), dtype=float32, path=block1_conv1/kernel>

List of objects that could not be loaded:
[<Conv2D name=block1_conv1, built=True>, <Conv2D name=block1_conv2, built=True>, <Conv2D name=block2_conv1, built=True>, <Conv2D name=block2_conv2, built=True>, <Conv2D name=block3_conv1, built=True>, <Conv2D name=block3_conv2, built=True>, <Conv2D name=block3_conv3, built=True>, <Conv2D name=block3_conv4, built=True>, <Conv2D name=block4_conv1, built=True>, <Conv2D name=block4_conv2, built=True>, <Conv2D name=block4_conv3, built=True>, <Conv2D name=block4_conv4, built=True>, <Conv2D name=block5_conv1, built=True>, <Conv2D name=block5_conv2, built=True>, <Conv2D name=block5_conv3, built=True>, <Conv2D name=block5_conv4, built=True>, <Conv2D name=conv2d_47, built=True>, <BatchNormalization name=batch_normalization_38, built=True>, <LSTMCell name=lstm_cell, built=True>, <LSTMCell name=lstm_cell, built=True>]

In [55]:
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Reshape, Conv2D, BatchNormalization, LSTM, Bidirectional, Dropout
from tensorflow.keras.models import Model

def create_resnet_ocr_model(input_dim, output_dim):
    """
    Create an OCR model using ResNet50 as the backbone
    Args:
        input_dim: Tuple of (height, width, channels)
        output_dim: Number of classes (vocabulary size)
    Returns:
        tf.keras.Model: Compiled model
    """
    # Load ResNet50 without top layers
    base_model = MobileNetV2(
        include_top=False,
        weights='imagenet',
        input_shape=input_dim
    )
    
    # Freeze the base model layers
    for layer in base_model.layers:
        layer.trainable = False
    
    # Get the output from ResNet50
    x = base_model.output
    
    # Add custom layers for OCR
    x = Conv2D(128, (3, 3), padding='same', activation='relu')(x)
    x = BatchNormalization()(x)
    
    # Reshape for sequential processing
    _, h, w, c = x.shape
    x = Reshape((-1, h * c))(x)
    
    # Add bidirectional LSTM layers
    x = Bidirectional(LSTM(256, return_sequences=True))(x)
    x = Dropout(0.2)(x)
    x = Bidirectional(LSTM(128, return_sequences=True))(x)
    x = Dropout(0.2)(x)
    
    # Output layer with softmax activation
    outputs = Dense(output_dim + 1, activation='softmax')(x)  # +1 for CTC blank label
    
    # Create model
    model = Model(inputs=base_model.input, outputs=outputs)
    
    return model

# Replace the original train_model function call with this:
model = create_resnet_ocr_model(
    input_dim=(configs.height, configs.width, 3),
    output_dim=len(configs.vocab)
)

# Fine-tuning setup (add this after initial training)
def unfreeze_model(model):
    """
    Unfreeze ResNet50 layers for fine-tuning
    """
    for layer in model.layers:
        layer.trainable = True
    
    return model

# After initial training, you can fine-tune the model:

# Fine-tuning (uncomment and use after initial training)
model = unfreeze_model(model)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=configs.learning_rate * 0.1),
    loss=CTCloss(),
    metrics=[
        CERMetric(vocabulary=configs.vocab),
        WERMetric(vocabulary=configs.vocab)
    ],
    run_eagerly=False
)

# Train for additional epochs with unfrozen layers
model.fit(
    train_data_provider,
    validation_data=val_data_provider,
    epochs=100,
    callbacks=callbacks,
)

  base_model = MobileNetV2(


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5
[1m9406464/9406464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1us/step
Epoch 1/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5s/step - CER: 1.0869 - WER: 0.9973 - loss: 16.2915
Epoch 1: val_CER did not improve from 0.36267
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 5s/step - CER: 1.0800 - WER: 0.9973 - loss: 16.1609 - val_CER: 0.9943 - val_WER: 0.9970 - val_loss: 16.1418 - learning_rate: 1.0000e-04
Epoch 2/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5s/step - CER: 0.9961 - WER: 0.9980 - loss: 10.6635
Epoch 2: val_CER did not improve from 0.36267
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 5s/step - CER: 0.9961 - WER: 0.9980 - loss: 10.6233 - val_CER: 0.9872 - val_WER: 0.9985 - val_loss: 14.5555 - learning_rate: 1.0000e-04
Epoch

<keras.src.callbacks.history.History at 0x204b1ec5dc0>

In [None]:
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Reshape, Conv2D, BatchNormalization, LSTM, Bidirectional, Dropout
from tensorflow.keras.models import Model

def create_resnet_ocr_model(input_dim, output_dim):
    """
    Create an OCR model using ResNet50 as the backbone
    Args:
        input_dim: Tuple of (height, width, channels)
        output_dim: Number of classes (vocabulary size)
    Returns:
        tf.keras.Model: Compiled model
    """
    # Load ResNet50 without top layers
    base_model = Xception(
        include_top=False,
        weights='imagenet',
        input_shape=input_dim
    )
    
    # Freeze the base model layers
    for layer in base_model.layers:
        layer.trainable = False
    
    # Get the output from ResNet50
    x = base_model.output
    
    # Add custom layers for OCR
    x = Conv2D(128, (3, 3), padding='same', activation='relu')(x)
    x = BatchNormalization()(x)
    
    # Reshape for sequential processing
    _, h, w, c = x.shape
    x = Reshape((-1, h * c))(x)
    
    # Add bidirectional LSTM layers
    x = Bidirectional(LSTM(256, return_sequences=True))(x)
    x = Dropout(0.2)(x)
    x = Bidirectional(LSTM(128, return_sequences=True))(x)
    x = Dropout(0.2)(x)
    
    # Output layer with softmax activation
    outputs = Dense(output_dim + 1, activation='softmax')(x)  # +1 for CTC blank label
    
    # Create model
    model = Model(inputs=base_model.input, outputs=outputs)
    
    return model

# Replace the original train_model function call with this:
model = create_resnet_ocr_model(
    input_dim=(configs.height, configs.width, 3),
    output_dim=len(configs.vocab)
)

# Fine-tuning setup (add this after initial training)
def unfreeze_model(model):
    """
    Unfreeze ResNet50 layers for fine-tuning
    """
    for layer in model.layers:
        layer.trainable = True
    
    return model

# After initial training, you can fine-tune the model:

# Fine-tuning (uncomment and use after initial training)
model = unfreeze_model(model)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=configs.learning_rate * 0.1),
    loss=CTCloss(),
    metrics=[
        CERMetric(vocabulary=configs.vocab),
        WERMetric(vocabulary=configs.vocab)
    ],
    run_eagerly=False
)

# Train for additional epochs with unfrozen layers
model.fit(
    train_data_provider,
    validation_data=val_data_provider,
    epochs=10,
    callbacks=callbacks,
)

In [None]:
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Reshape, Conv2D, BatchNormalization, LSTM, Bidirectional, Dropout
from tensorflow.keras.models import Model

def create_resnet_ocr_model(input_dim, output_dim):
    """
    Create an OCR model using ResNet50 as the backbone
    Args:
        input_dim: Tuple of (height, width, channels)
        output_dim: Number of classes (vocabulary size)
    Returns:
        tf.keras.Model: Compiled model
    """
    # Load ResNet50 without top layers
    base_model = DenseNet121(
        include_top=False,
        weights='imagenet',
        input_shape=input_dim
    )
    
    # Freeze the base model layers
    for layer in base_model.layers:
        layer.trainable = False
    
    # Get the output from ResNet50
    x = base_model.output
    
    # Add custom layers for OCR
    x = Conv2D(128, (3, 3), padding='same', activation='relu')(x)
    x = BatchNormalization()(x)
    
    # Reshape for sequential processing
    _, h, w, c = x.shape
    x = Reshape((-1, h * c))(x)
    
    # Add bidirectional LSTM layers
    x = Bidirectional(LSTM(256, return_sequences=True))(x)
    x = Dropout(0.2)(x)
    x = Bidirectional(LSTM(128, return_sequences=True))(x)
    x = Dropout(0.2)(x)
    
    # Output layer with softmax activation
    outputs = Dense(output_dim + 1, activation='softmax')(x)  # +1 for CTC blank label
    
    # Create model
    model = Model(inputs=base_model.input, outputs=outputs)
    
    return model

# Replace the original train_model function call with this:
model = create_resnet_ocr_model(
    input_dim=(configs.height, configs.width, 3),
    output_dim=len(configs.vocab)
)

# Fine-tuning setup (add this after initial training)
def unfreeze_model(model):
    """
    Unfreeze ResNet50 layers for fine-tuning
    """
    for layer in model.layers:
        layer.trainable = True
    
    return model

# After initial training, you can fine-tune the model:

# Fine-tuning (uncomment and use after initial training)
model = unfreeze_model(model)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=configs.learning_rate * 0.1),
    loss=CTCloss(),
    metrics=[
        CERMetric(vocabulary=configs.vocab),
        WERMetric(vocabulary=configs.vocab)
    ],
    run_eagerly=False
)

# Train for additional epochs with unfrozen layers
model.fit(
    train_data_provider,
    validation_data=val_data_provider,
    epochs=10,
    callbacks=callbacks,
)

In [None]:
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Reshape, Conv2D, BatchNormalization, LSTM, Bidirectional, Dropout
from tensorflow.keras.models import Model

def create_resnet_ocr_model(input_dim, output_dim):
    """
    Create an OCR model using ResNet50 as the backbone
    Args:
        input_dim: Tuple of (height, width, channels)
        output_dim: Number of classes (vocabulary size)
    Returns:
        tf.keras.Model: Compiled model
    """
    # Load ResNet50 without top layers
    base_model = EfficientNetB0(
        include_top=False,
        weights='imagenet',
        input_shape=input_dim
    )
    
    # Freeze the base model layers
    for layer in base_model.layers:
        layer.trainable = False
    
    # Get the output from ResNet50
    x = base_model.output
    
    # Add custom layers for OCR
    x = Conv2D(128, (3, 3), padding='same', activation='relu')(x)
    x = BatchNormalization()(x)
    
    # Reshape for sequential processing
    _, h, w, c = x.shape
    x = Reshape((-1, h * c))(x)
    
    # Add bidirectional LSTM layers
    x = Bidirectional(LSTM(256, return_sequences=True))(x)
    x = Dropout(0.2)(x)
    x = Bidirectional(LSTM(128, return_sequences=True))(x)
    x = Dropout(0.2)(x)
    
    # Output layer with softmax activation
    outputs = Dense(output_dim + 1, activation='softmax')(x)  # +1 for CTC blank label
    
    # Create model
    model = Model(inputs=base_model.input, outputs=outputs)
    
    return model

# Replace the original train_model function call with this:
model = create_resnet_ocr_model(
    input_dim=(configs.height, configs.width, 3),
    output_dim=len(configs.vocab)
)

# Fine-tuning setup (add this after initial training)
def unfreeze_model(model):
    """
    Unfreeze ResNet50 layers for fine-tuning
    """
    for layer in model.layers:
        layer.trainable = True
    
    return model

# After initial training, you can fine-tune the model:

# Fine-tuning (uncomment and use after initial training)
model = unfreeze_model(model)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=configs.learning_rate * 0.1),
    loss=CTCloss(),
    metrics=[
        CERMetric(vocabulary=configs.vocab),
        WERMetric(vocabulary=configs.vocab)
    ],
    run_eagerly=False
)

# Train for additional epochs with unfrozen layers
model.fit(
    train_data_provider,
    validation_data=val_data_provider,
    epochs=10,
    callbacks=callbacks,
)

imporved ocr code 