In [None]:
Cài đặt

In [None]:
!pip install -q datasets pillow matplotlib tensorflow

Import thư viện cần thiết

In [None]:
from datasets import load_dataset
import numpy as np
import tensorflow as tf
from tensorflow import keras
import PIL
import matplotlib.pyplot as plt

Load dataset

In [None]:
# Load only the first 20,000 training samples so that a trial run finishes quickly.
ds = load_dataset(
    "priyank-m/MJSynth_text_recognition",
    split="train[:20000]",
)

Mã hoá

In [None]:
# Recognised alphabet: lowercase letters followed by digits.
charset = "abcdefghijklmnopqrstuvwxyz0123456789"

# Character -> integer id, numbered 0..len(charset)-1 in charset order.
# The CTC blank is not listed here; it is the last class of the softmax.
char_to_id = dict(zip(charset, range(len(charset))))

# Number of real characters; the model's Dense layer uses num_classes + 1 (adds the blank).
num_classes = len(charset)

def encode_text(t: str):
    """Convert a text label into a list of character ids.

    The text is lowercased first; characters that have no entry in
    ``char_to_id`` are dropped instead of raising an error.
    """
    ids = []
    for ch in t.lower():
        if ch in char_to_id:
            ids.append(char_to_id[ch])
    return ids

Tham số ảnh và pipeline


In [None]:
# Fixed image size in pixels (height, width).
IMG_HEIGHT = 32
IMG_WIDTH = 128

# Number of samples per batch.
BATCH_SIZE = 64

def gen(examples):
    """Yield ``(image, label_ids, label_length)`` triples, one per example.

    Args:
        examples: iterable of records with an ``"image"`` entry (an object
            offering PIL's ``convert``/``resize``) and a ``"label"`` entry
            (the text to encode).

    Yields:
        image: float32 array of shape (IMG_HEIGHT, IMG_WIDTH, 1), scaled to [0, 1].
        label_ids: int32 array of character ids produced by ``encode_text``.
        label_length: ``np.int32`` count of ids in ``label_ids``.
    """
    # `import PIL` alone does not load the PIL.Image submodule; import it
    # explicitly instead of relying on another library having done so.
    from PIL import Image

    for ex in examples:
        pil_img = ex["image"].convert("L")  # single-channel greyscale
        # PIL takes the target size as (width, height).
        pil_img = pil_img.resize((IMG_WIDTH, IMG_HEIGHT), Image.BILINEAR)
        pixels = np.asarray(pil_img, dtype=np.float32) / 255.0
        pixels = pixels[..., None]  # (H, W, 1)
        label_ids = np.array(encode_text(ex["label"]), dtype=np.int32)
        yield pixels, label_ids, np.int32(len(label_ids))

Train/val split


In [None]:
# Hold out 10% of the samples for validation; the fixed seed keeps the split reproducible.
split = ds.train_test_split(test_size=0.1, seed=42)
train_raw = split["train"]
val_raw = split["test"]

def make_tfds(hf_dataset, shuffle=False, drop_remainder=True, shuffle_buffer=2048):
    """Wrap a dataset in a padded, batched ``tf.data`` pipeline.

    Args:
        hf_dataset: dataset whose records are consumed by ``gen``.
        shuffle: when True, samples are shuffled before batching and
            reshuffled on every pass over the dataset.
        drop_remainder: when True, a final batch smaller than BATCH_SIZE
            is dropped.
        shuffle_buffer: number of samples held in the shuffle buffer
            (only used when ``shuffle`` is True).

    Returns:
        A ``tf.data.Dataset`` yielding ``(images, labels, label_lengths)``
        batches, with labels padded to the longest label in the batch.
    """
    output_signature = (
        tf.TensorSpec(shape=(IMG_HEIGHT, IMG_WIDTH, 1), dtype=tf.float32),    # image
        tf.TensorSpec(shape=(None,), dtype=tf.int32),                          # label (variable length)
        tf.TensorSpec(shape=(), dtype=tf.int32),                               # label_length
    )
    # The lambda makes tf.data restart the generator on every pass over the data.
    ds_tf = tf.data.Dataset.from_generator(lambda: gen(hf_dataset), output_signature=output_signature)
    if shuffle:
        # Shuffle individual samples (not whole batches) so batch composition changes per epoch.
        ds_tf = ds_tf.shuffle(shuffle_buffer, reshuffle_each_iteration=True)
    ds_tf = ds_tf.padded_batch(
        BATCH_SIZE,
        padded_shapes=(
            (IMG_HEIGHT, IMG_WIDTH, 1),   # image
            (None,),                      # label padded
            (),                           # label_length
        ),
        padding_values=(
            0.0,     # image pad (images already have a fixed shape)
            -1,      # label pad: outside the valid id range, so it can never be read as a character
            0,       # label_length
        ),
        drop_remainder=drop_remainder,
    )
    return ds_tf.prefetch(tf.data.AUTOTUNE)

# Training data is reshuffled every epoch.
train_ds = make_tfds(train_raw, shuffle=True)
# Validation keeps a fixed order and evaluates every sample, including the last partial batch.
val_ds = make_tfds(val_raw, drop_remainder=False)

Model CRNN

In [None]:
def build_crnn(num_classes):
    """Build the CRNN recogniser: conv feature extractor + two BiLSTMs + softmax.

    Args:
        num_classes: number of real character classes. The output layer has
            ``num_classes + 1`` units; the extra unit is for the CTC blank.

    Returns:
        A ``keras.Model`` named "crnn" mapping images of shape
        (B, IMG_HEIGHT, IMG_WIDTH, 1) to per-time-step softmax outputs of
        shape (B, T, num_classes + 1).

    Raises:
        ValueError: if the convolutional stack does not reduce the feature
            map height to 1 for the configured IMG_HEIGHT.
    """
    inp = keras.Input(shape=(IMG_HEIGHT, IMG_WIDTH, 1), name="image")

    # Shape comments below assume the 32x128 input configured above.
    x = keras.layers.Conv2D(64, 3, padding="same", activation="relu")(inp)
    x = keras.layers.MaxPooling2D((2, 2))(x)               # 16x64
    x = keras.layers.Conv2D(128, 3, padding="same", activation="relu")(x)
    x = keras.layers.MaxPooling2D((2, 2))(x)               # 8x32
    x = keras.layers.Conv2D(256, 3, padding="same", activation="relu")(x)
    x = keras.layers.BatchNormalization()(x)
    x = keras.layers.Conv2D(256, 3, padding="same", activation="relu")(x)
    x = keras.layers.MaxPooling2D((2, 1))(x)               # 4x32 (pool height only, keep width)
    x = keras.layers.Conv2D(512, 3, padding="same", activation="relu")(x)
    x = keras.layers.BatchNormalization()(x)
    x = keras.layers.MaxPooling2D((2, 1))(x)               # 2x32
    x = keras.layers.Conv2D(512, 2, padding="valid", activation="relu")(x)  # 1x31

    # Collapse the feature map (B, 1, W, C) into a sequence (B, W, C).
    # The sizes are read from the tensor rather than hard-coded so the model
    # keeps working when IMG_WIDTH changes.
    feat_h, feat_w, feat_c = x.shape[1], x.shape[2], x.shape[3]
    if feat_h != 1:
        raise ValueError(
            f"Expected feature map height 1 after the conv stack, got {feat_h}; "
            f"check IMG_HEIGHT={IMG_HEIGHT}."
        )
    x = keras.layers.Reshape(target_shape=(feat_w, feat_c))(x)

    x = keras.layers.Bidirectional(keras.layers.LSTM(256, return_sequences=True))(x)
    x = keras.layers.Bidirectional(keras.layers.LSTM(256, return_sequences=True))(x)

    # +1 for the CTC blank
    out = keras.layers.Dense(num_classes + 1, activation="softmax")(x)  # (B, T, C+1)
    return keras.Model(inp, out, name="crnn")

# Instantiate the recognition network with one class per charset character (the blank is added inside).
base_model = build_crnn(num_classes=num_classes)

Train model với CTC loss

In [None]:
def ctc_loss_layer(args):
    """Compute the CTC cost with ``keras.backend.ctc_batch_cost``.

    ``args`` is the list ``[y_true, y_pred, input_length, label_length]``
    passed to the Lambda layer, forwarded positionally in that order.
    """
    return keras.backend.ctc_batch_cost(*args)


# Extra inputs that are only needed to compute the loss during training.
labels = keras.Input(name="labels", shape=(None,), dtype=tf.int32)
input_length = keras.Input(name="input_length", shape=(1,), dtype=tf.int32)
label_length = keras.Input(name="label_length", shape=(1,), dtype=tf.int32)

# Output of the recogniser, shape (B, T, C+1). It comes from a softmax layer,
# so despite the name these are probabilities.
logits = base_model.output

loss_out = keras.layers.Lambda(ctc_loss_layer, name="ctc_loss")(
    [labels, logits, input_length, label_length]
)

train_model = keras.Model(
    inputs=[base_model.input, labels, input_length, label_length],
    outputs=loss_out,
)

# The model's output already is the loss, so the Keras-level loss just passes it through.
train_model.compile(
    optimizer=keras.optimizers.Adam(1e-3),
    loss=lambda y_true, y_pred: y_pred,
)

Pack batch: thêm input_length (timesteps = 31) và dummy y

In [None]:
# Number of time steps the recogniser emits per image. Read from the model so it
# cannot drift from the architecture (31 for the 32x128 input used here).
TIMESTEPS = base_model.output_shape[1]

def pack_batch(images, labels_batch, label_lens):
    """Turn a raw batch into the ``(inputs, y)`` pair passed to ``fit``.

    Adds an ``input_length`` column filled with TIMESTEPS for every sample,
    expands ``label_lens`` to shape (B, 1), and attaches an all-zero dummy
    target of shape (B, 1).
    """
    batch_size = tf.shape(images)[0]
    features = dict(
        image=images,
        labels=labels_batch,
        input_length=tf.fill([batch_size, 1], TIMESTEPS),
        label_length=label_lens[:, tf.newaxis],
    )
    # Keras needs a target, but the loss is already produced by the model's output.
    dummy_target = tf.zeros((batch_size, 1), dtype=tf.float32)
    return features, dummy_target

# Attach the length inputs and dummy targets to every batch of both splits.
train_data = train_ds.map(map_func=pack_batch)
val_data = val_ds.map(map_func=pack_batch)

 Train

In [None]:
# Start with a few epochs; raise `epochs` once the run is stable.
history = train_model.fit(x=train_data, validation_data=val_data, epochs=5)


Mount Google Drive để tải model về

In [None]:
from google.colab import drive
drive.mount('/content/drive')
!ls /content/drive

Xem vài mẫu

In [None]:
# Take the first validation batch and run the recogniser (not the training wrapper) on its images.
imgs, lab_ids, lab_lens = batch = next(iter(val_ds))
preds = base_model.predict(x=imgs)

In [None]:
# Greedy CTC decoding; every sample is decoded over all TIMESTEPS output steps.
input_len_np = np.full((preds.shape[0],), TIMESTEPS)
results = keras.backend.ctc_decode(preds, input_length=input_len_np, greedy=True)[0][0].numpy()

id_to_char = {v: k for k, v in char_to_id.items()}
def decode_ids(ids):
    """Map a sequence of class ids to text, skipping negative (padding) ids."""
    return "".join(id_to_char[int(i)] for i in ids if i >= 0)

# Ground-truth labels of the same batch, so each prediction can be checked by eye.
lab_ids_np = np.asarray(lab_ids)
lab_lens_np = np.asarray(lab_lens)

# Show at most 10 samples without indexing past the end of a smaller batch.
for i in range(min(10, len(results))):
    truth = decode_ids(lab_ids_np[i, :lab_lens_np[i]])
    plt.imshow(imgs[i,...,0], cmap="gray")
    plt.axis("off")
    plt.title(f"pred: {decode_ids(results[i])} | true: {truth}")
    plt.show()