In [None]:
import numpy as np
import pandas as pd
import cv2
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, Model, regularizers, Input
from tensorflow.keras.layers import Conv2D, BatchNormalization, Activation, MaxPooling2D
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import re
from PIL import Image, ImageOps

In [None]:
import json

def _load_synthetic_metadata(metadata_path, frac, path_prefix=''):
    """Load one synthetic-thermometer metadata file as a sampled label frame.

    Parameters:
        metadata_path (str): Path of the ``metadata.json`` file to read.
        frac (float): Fraction of the rows to keep (sampled with seed 42).
        path_prefix (str): Text prepended to every ``image_path`` value; use it
            when the paths stored in the file are relative to a sub-directory.

    Returns:
        pd.DataFrame: Rows without missing values, with the ``temperature``
        column renamed to ``reading`` and a fresh 0..n-1 index.
    """
    with open(metadata_path, 'r') as file:
        records = json.load(file)

    frame = (
        pd.DataFrame(records)
        .rename(columns={'temperature': 'reading'})
        .dropna()
        .sample(frac=frac, random_state=42)
        .reset_index(drop=True)
    )
    if path_prefix:
        frame['image_path'] = frame['image_path'].apply(lambda p: path_prefix + p)
    return frame


# Each synthetic set contributes a different share of its rows.
dfs1 = _load_synthetic_metadata('synthetic_thermometer_images/metadata.json', frac=0.15)
dfs2 = _load_synthetic_metadata('synthetic_thermometer_images (temp-dual)/metadata.json', frac=0.2)
# The third set lives under 'synthetic/', so its stored paths get that prefix.
dfs3 = _load_synthetic_metadata('synthetic/synthetic_thermometer_images/metadata.json',
                                frac=0.3, path_prefix='synthetic/')

dfs = pd.concat([dfs1, dfs2, dfs3]).reset_index(drop=True)
dfs

In [None]:
# The label files come in two header layouts; both are normalised to the
# common ('image_path', 'reading') column names. Everything is read as text.
_POSITIONAL_HEADERS = {'0': 'image_path', '1': 'reading'}
_NAMED_HEADERS = {'File Name': 'image_path', 'Label 1': 'reading'}


def _read_label_csv(csv_path, header_map):
    """Read a label CSV with string dtype and rename its columns via ``header_map``."""
    return pd.read_csv(csv_path, dtype=str).rename(columns=header_map)


df1 = _read_label_csv('cropped_imgs.csv', _POSITIONAL_HEADERS)
df2 = _read_label_csv('OCR2cropped_imgs.csv', _POSITIONAL_HEADERS)
df3 = _read_label_csv('ocr3500.csv', _NAMED_HEADERS)
df4 = _read_label_csv('ocr5000.csv', _NAMED_HEADERS)
df5 = _read_label_csv('ocr2-2000.csv', _NAMED_HEADERS)

def fun(x):
    """Return only the digits and decimal points of ``x``, in their original order.

    ``x`` is converted with ``str()`` first, so non-string values are accepted.
    The result is an empty string when ``x`` contains no digit or dot.
    """
    numeric_runs = re.findall(r'[0-9.]+', str(x))
    return ''.join(numeric_runs)

def _clean_readings(frame):
    """Return a copy of ``frame`` with normalised readings of at most 5 characters.

    Each reading is passed through ``fun`` and rows whose cleaned reading is
    longer than 5 characters are dropped. Rows whose reading becomes an empty
    string are kept (no filter on ''). The index is reset.
    """
    cleaned = frame.copy()
    cleaned['reading'] = cleaned['reading'].apply(fun)
    short_enough = cleaned['reading'].str.len() <= 5
    return cleaned[short_enough].reset_index(drop=True)


# Training set: the real label files (except df2) plus the synthetic samples,
# shuffled with a fixed seed. df2 is held out as the validation set.
train_df = _clean_readings(
    pd.concat([df1, df3, df4, df5, dfs])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
val_df = _clean_readings(df2)

In [None]:
def _label_texts(frame):
    """Return the ``reading`` column as a list of strings, with '' replaced by '^'."""
    # Empty readings become the placeholder '^' so every label has at least one character.
    return frame['reading'].replace('', '^').astype(str).tolist()


train_image_texts = _label_texts(train_df)
val_image_texts = _label_texts(val_df)

train_image_texts[:5],val_image_texts[:5]

In [None]:
# Vocabulary: every distinct character that occurs in the training labels, sorted.
chars = sorted(set(''.join(train_image_texts)))
print("Characters found :", chars)

# Characters are numbered 0..len(chars)-1 in sorted order; one extra class is
# reserved on top of the characters for the CTC blank token.
idx_to_char = dict(enumerate(chars))
char_to_idx = {char: index for index, char in idx_to_char.items()}
num_classes = len(chars) + 1

print("num_classes, char_to_idx, idx_to_char:")
print(num_classes, char_to_idx, idx_to_char)


In [None]:
# Pad width for the dense label tensors. It must cover the longest label of
# *both* splits: validation labels are padded to this width as well, and a
# label longer than the width would not fit while its recorded length stays
# unchanged.
max_label_len = max(len(t) for t in train_image_texts + val_image_texts)

def encode(txt, char_map=None, max_len=None):
    """Encode a label string as a fixed-width index vector plus its true length.

    Parameters:
        txt (str): Label text; every character must be a key of ``char_map``.
        char_map (dict | None): Character -> index mapping. Defaults to the
            notebook-level ``char_to_idx``.
        max_len (int | None): Width of the returned vector. Defaults to the
            notebook-level ``max_label_len``.

    Returns:
        tuple: ``(labels, length)`` where ``labels`` is an int32 array of shape
        ``(max_len,)`` holding the character indices followed by -1 padding,
        and ``length`` is ``np.int32(len(txt))``.

    Raises:
        KeyError: if ``txt`` contains a character that is not in ``char_map``.
        ValueError: if ``txt`` is longer than ``max_len``. Truncating the
            label would leave ``length`` larger than the stored sequence.
    """
    char_map = char_to_idx if char_map is None else char_map
    max_len = max_label_len if max_len is None else max_len

    unknown = sorted(set(txt) - set(char_map))
    if unknown:
        raise KeyError(f"label {txt!r} contains characters outside the vocabulary: {unknown}")

    length = len(txt)
    if length > max_len:
        raise ValueError(f"label {txt!r} has {length} characters but the pad width is {max_len}")

    padded = np.full(max_len, -1, dtype="int32")  # -1 marks padding positions
    padded[:length] = np.asarray([char_map[c] for c in txt], dtype="int32")
    return padded, np.int32(length)

# Encode every label, then regroup the (labels, length) pairs column-wise into
# one array of padded labels and one array of true lengths per split.
train_encoded = list(map(encode, train_image_texts))
train_padded_labels, train_label_lengths = (
    np.array(column) for column in zip(*train_encoded)
)

val_encoded = list(map(encode, val_image_texts))
val_padded_labels, val_label_lengths = (
    np.array(column) for column in zip(*val_encoded)
)

In [None]:
# Sanity check: the pad width and the first five (padded labels, length) pairs of each split.
max_label_len,train_encoded[:5],val_encoded[:5]

In [None]:
# Sanity check: the first five rows of the stacked training labels and their true lengths.
train_padded_labels[:5], train_label_lengths[:5]

In [None]:
# Image paths as plain Python lists, taken from the same frames (and therefore
# in the same row order) as the label texts.
train_paths, val_paths = (
    frame['image_path'].tolist() for frame in (train_df, val_df)
)

In [None]:
def process_single_sample(img_path, label, label_len):
    """Load one image file and bundle it with its label for the training model.

    The file is decoded as a 3-channel PNG, converted to float32 and resized
    to 128 x 256 (height x width).

    Returns:
        dict: ``image``, ``label`` and ``label_length`` entries; the keys match
        the names of the training model's inputs.
    """
    raw_bytes = tf.io.read_file(img_path)
    pixels = tf.io.decode_png(raw_bytes, channels=3)
    # convert_image_dtype rescales the integer pixel values to floats in [0, 1].
    pixels = tf.image.convert_image_dtype(pixels, tf.float32)
    pixels = tf.image.resize(pixels, [128, 256])
    return dict(
        image=pixels,
        label=label,
        label_length=label_len,  # consumed by the CTC loss layer
    )

batch_size = 32


def make_dataset(paths, labels, label_lens, shuffle=True):
    """Build a batched, prefetched ``tf.data`` pipeline over (path, label, length) triples.

    Parameters:
        paths: Image file paths.
        labels: Padded label rows, aligned with ``paths``.
        label_lens: True label lengths, aligned with ``paths``.
        shuffle (bool): When true, shuffle with a buffer as large as the whole
            dataset and reshuffle on every iteration.

    Returns:
        tf.data.Dataset: Batches of ``batch_size`` samples as produced by
        ``process_single_sample``.
    """
    dataset = tf.data.Dataset.from_tensor_slices((paths, labels, label_lens))
    if shuffle:
        # Shuffling happens on the cheap (path, label) triples, before images are loaded.
        dataset = dataset.shuffle(buffer_size=len(paths), reshuffle_each_iteration=True)
    dataset = dataset.map(process_single_sample, num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.batch(batch_size)
    return dataset.prefetch(tf.data.AUTOTUNE)

# Only the training stream is shuffled; validation keeps its file order.
train_dataset = make_dataset(
    paths=train_paths,
    labels=train_padded_labels,
    label_lens=train_label_lengths,
    shuffle=True,
)
validation_dataset = make_dataset(
    paths=val_paths,
    labels=val_padded_labels,
    label_lens=val_label_lengths,
    shuffle=False,
)

print("num_classes (including CTC blank):", num_classes)
print("max_label_len :", max_label_len)

When you use a dense tensor for the targets, TensorFlow’s CTC loss (i.e. tf.keras.backend.ctc_batch_cost) uses the supplied label lengths to know which values to consider and which to ignore. In other words, if you’ve padded your labels with –1 *after* the true sequence (and you’ve correctly computed the label length), the loss function will only consider the first N valid entries and ignore the –1’s.

### Key Points

1. **Dense Target Tensor with Label Length:**
   - Your encoded labels are a dense tensor with padding (–1) for positions beyond the true sequence.
   - You also compute and pass `label_length`, which tells the loss function exactly how many elements in each label sequence are valid.
   - **Result:** `tf.keras.backend.ctc_batch_cost` will ignore any token beyond the provided length. The –1 padding isn’t interpreted as a valid class as long as the first `label_length` entries are valid character indices, i.e. within 0…`num_classes − 2` (0…10 for an 11-character vocabulary).

2. **CTC Blank Token vs. Pad Value:**
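   - **CTC Blank:** In your network, the blank class is the last output index, `num_classes − 1` (index 11 for an 11-character vocabulary; one higher when the `^` placeholder is also part of the vocabulary). CTC uses it internally to mark frames that emit no character and to separate repeated characters.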
   - **Pad Value (–1):** This value is used only to make all target sequences have the same length.
   - **Thus:** These two serve different purposes and need not be handled together—only the valid (first `label_length`) tokens matter for the loss.

3. **At Inference Time:**
   - When decoding predictions with `tf.keras.backend.ctc_decode` (greedy or beam search), repeated characters are already merged and the blank token is already dropped, but the returned sequences contain –1 in their unused positions. Those –1’s must be filtered out before mapping indices back to characters. The `decode_predictions` function in the next cell does this, and can additionally strip the `^` placeholder.

### In Summary

The code you have for feeding the targets to tf.keras.backend.ctc_batch_cost does handle the –1 padding automatically—as long as the true lengths (i.e. label_length) are correctly calculated and passed. The loss will ignore any numbers beyond each sequence’s length. Just remember to filter out the –1’s during any post‑processing (e.g. when decoding the model’s predictions) so that your final output only has valid character IDs.

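If you decide to switch frameworks (e.g. to PyTorch’s `torch.nn.CTCLoss`), note that its conventions differ. It also accepts padded 2‑D targets together with `target_lengths`, but its blank index defaults to 0 rather than the last class, and it expects log‑probabilities shaped `(T, N, C)`. The label encoding and the model output would therefore have to be adapted.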

In [None]:
import tensorflow as tf
import numpy as np
from PIL import Image, ImageOps

# Assume idx_to_char is defined as:
# idx_to_char = {0: '.', 1: '0', 2: '1', 3: '2', 4: '3', 5: '4', 6: '5', 7: '6', 8: '7', 9: '8', 10: '9', 11: '^'}

def process_image(path):
    """Load an image file as a single-sample float32 batch for the model.

    The image is opened, converted to RGB, rotated according to its EXIF
    orientation, resized to 256 x 128 (width x height) and scaled to [0, 1].

    Returns:
        np.ndarray: Array of shape (1, 128, 256, 3).
    """
    picture = ImageOps.exif_transpose(Image.open(path).convert("RGB"))
    picture = picture.resize((256, 128))  # PIL takes (width, height)
    pixels = np.array(picture).astype(np.float32) / 255.0
    return pixels[np.newaxis]  # add the leading batch axis

def decode_predictions(y_pred, idx_to_char, remove_placeholder=True, placeholder_token='^'):
    """
    Greedy-decode a batch of CTC outputs into text.

    Parameters:
        y_pred (tf.Tensor): Predictions of shape (batch, time_steps, num_classes).
        idx_to_char (dict): Mapping from class index to character.
        remove_placeholder (bool): If True, strip ``placeholder_token`` from the text.
        placeholder_token (str): Character that stands for an originally empty label.

    Returns:
        List of decoded strings, one per batch element.
    """
    n_samples, time_steps = y_pred.shape[0], y_pred.shape[1]
    # Every sample uses the full sequence of time steps.
    input_len = np.ones(n_samples) * time_steps

    decoded, _log_probs = tf.keras.backend.ctc_decode(
        y_pred, input_length=input_len, greedy=True
    )

    def to_text(index_row):
        # -1 marks unused positions in the decoded row; indices without a
        # character mapping contribute nothing.
        text = "".join(idx_to_char.get(int(idx), "") for idx in index_row if idx != -1)
        if remove_placeholder:
            return text.replace(placeholder_token, "")
        return text

    # decoded[0] is the single path returned by greedy decoding.
    return [to_text(row) for row in decoded[0].numpy()]

def model_inference(inference_model, img_path, idx_to_char, remove_placeholder=True):
    """
    Run the full inference path for one image file.

    Parameters:
        inference_model: Callable model that maps an image batch to CTC predictions.
        img_path: Path of the image to read.
        idx_to_char (dict): Mapping from class index to character.
        remove_placeholder (bool): Forwarded to ``decode_predictions``.

    Returns:
        tuple: ``(text, batch)`` - the decoded text of the image and the
        preprocessed single-sample batch that was fed to the model.
    """
    batch = process_image(img_path)
    predictions = inference_model(batch)
    texts = decode_predictions(predictions, idx_to_char, remove_placeholder=remove_placeholder)
    # The batch holds exactly one image, so only the first text is relevant.
    return texts[0], batch


In [None]:
# ================================================================
# 1.  Custom CTC loss layer
# ================================================================
class CTCLayer(layers.Layer):
    """Pass-through layer that attaches the CTC loss to the model.

    ``call`` receives ``[y_pred, labels, label_len]``, registers the CTC loss
    of the batch through ``add_loss`` and returns ``y_pred`` unchanged.
    """

    def __init__(self, name="ctc_loss", **kwargs):
        # Forward the standard Layer options (trainable, dtype, ...) so the
        # layer can also be rebuilt from a saved config.
        super().__init__(name=name, **kwargs)
        self.loss_fn = tf.keras.backend.ctc_batch_cost

    def call(self, inputs):
        y_pred, labels, label_len = inputs        # unpack
        batch = tf.shape(labels)[0]
        T     = tf.shape(y_pred)[1]

        # ctc_batch_cost wants both length tensors with shape (B, 1).
        # Every sample uses all T time steps of the prediction.
        input_len  = tf.fill([batch, 1], T)
        label_len  = tf.expand_dims(label_len, 1)

        loss = self.loss_fn(labels, y_pred, input_len, label_len)
        self.add_loss(loss)
        return y_pred

# ================================================================
# 2.  CNN → Bi‑LSTM encoder‑decoder
# ================================================================
# Model inputs; the names match the keys of the dataset dictionaries.
image_in = Input(shape=(128, 256, 3), name="image")
label_in = Input(shape=(max_label_len,), dtype="int32", name="label")
len_in = Input(shape=(), dtype="int32", name="label_length")


def _conv_bn_silu(tensor, filters):
    """Apply a bias-free 3x3 convolution (He-normal init, L2 1e-3), batch norm and SiLU."""
    tensor = Conv2D(filters, 3, use_bias=False, kernel_initializer='he_normal',
                    kernel_regularizer=regularizers.l2(1e-3))(tensor)
    tensor = BatchNormalization()(tensor)
    return Activation('silu')(tensor)


# Four stages of (conv, conv, 2x2 max-pool) with doubling filter counts.
# The convolutions use the default 'valid' padding, so each one trims two
# pixels per axis: (128, 256, 3) -> (4, 12, 128) after the last pool.
x = image_in
for stage_filters in (16, 32, 64, 128):
    x = _conv_bn_silu(x, stage_filters)
    x = _conv_bn_silu(x, stage_filters)
    x = MaxPooling2D(pool_size=(2, 2))(x)

# --- reshape: the image width becomes the time axis --------------
# (B, H, W, C) -> (B, W, H, C) -> (B, W, H*C)
x = layers.Permute((2, 1, 3))(x)
x = layers.TimeDistributed(layers.Flatten())(x)


def _bilstm(sequence):
    """Apply one bidirectional LSTM (128 units per direction, dropout 0.1) that keeps all time steps."""
    return layers.Bidirectional(
        layers.LSTM(128, return_sequences=True, dropout=0.1)
    )(sequence)


# --- sequence modelling: two stacked Bi-LSTMs ---------------------
x = _bilstm(_bilstm(x))

# Per-time-step class probabilities (characters + CTC blank).
y_pred = layers.Dense(num_classes, activation="softmax",name="softmax_dense")(x)

# The CTC layer registers the loss and passes the predictions through.
ctc_out = CTCLayer()([y_pred, label_in, len_in])

# Training model: takes image + label + label length; its loss comes from the
# CTC layer, so compile() gets no loss argument.
training_model = Model(inputs=[image_in, label_in, len_in], outputs=ctc_out)
# Inference model: shares the same layers but needs the image only.
inference_model = Model(inputs=image_in, outputs=y_pred)

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-4)
training_model.compile(optimizer=optimizer)

training_model.summary()

In [None]:
# Stop after 6 epochs without val_loss improvement and roll back to the best weights.
early_stopping = EarlyStopping(monitor="val_loss", patience=6, restore_best_weights=True)
# Halve the learning rate after 2 stagnant epochs, down to 1e-6 at most.
lr_on_plateau = ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=2, min_lr=1e-6)
callbacks = [early_stopping, lr_on_plateau]

history = training_model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=50,
    callbacks=callbacks,
    verbose=1,
)

In [None]:
# Run the inference model on the cropped test images and show each prediction.
for i in range(16):
    test_image_path = f'ssd_cropped_images/image_{i}.png'
    try:
        predicted_text, img = model_inference(inference_model, test_image_path, idx_to_char)
    except OSError as err:
        # A missing or unreadable image is skipped with a message; any other
        # failure (e.g. inside the model) is a real error and is not hidden.
        print(f"Skipping {test_image_path}: {err}")
        continue

    print("Predicted text:", predicted_text)
    plt.imshow(img[0])
    plt.show()