In [1]:
# ================================================================
# Offline Handwritten Text OCR — ViT Encoder + Transformer Decoder (CTC-aux, FiLM, BOS-cond)
# - Preprocess: drop NaNs / 'unreadable', crop top-left 64x256, pad white
# - Encoder: ViT over 8x8 patches (T=256 tokens)
# - Decoder: Transformer decoder with cross-attention
# - Stabilizers against collapse:
#     * Auxiliary CTC head on encoder (joint loss: CE + 0.3*CTC)
#     * Token dropout on decoder inputs (p=0.15) during training
#     * Image-conditioned BOS + FiLM on decoder embeddings (with LayerNorm)
#     * Weight tying (decoder out <- embedding weight)
# - Vocab: <pad>=0, <sos>=1, <eos>=2, then dataset chars
# - Metrics: CER, 1-CER, ACC, WER
# - Saves: loss_curve_vit_seq2seq.png, checkpoint seq2seq_vit_ctc.ckpt
# ================================================================

import os
import math
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms

import pytorch_lightning as pl

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

# ---------------------------
# Metrics
# ---------------------------

def levenshtein_distance(s1: str, s2: str) -> int:
    if len(s1) < len(s2):
        s1, s2 = s2, s1
    previous = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1):
        current = [i + 1]
        for j, c2 in enumerate(s2):
            ins = previous[j + 1] + 1
            dele = current[j] + 1
            sub = previous[j] + (c1 != c2)
            current.append(min(ins, dele, sub))
        previous = current
    return previous[-1]

def compute_metrics(preds, truths):
    total_chars, total_char_errs = 0, 0
    total_words, total_word_errs = 0, 0
    exact = 0
    for gt, pr in zip(truths, preds):
        dist = levenshtein_distance(gt, pr)
        total_char_errs += dist
        total_chars += len(gt)
        total_words += 1
        total_word_errs += int(gt != pr)
        if gt == pr:
            exact += 1
    cer = (total_char_errs / total_chars) if total_chars > 0 else 0.0
    one_minus_cer = 1.0 - cer
    acc = (exact / total_words) if total_words > 0 else 0.0
    wer = (total_word_errs / total_words) if total_words > 0 else 0.0
    return cer, one_minus_cer, acc, wer

# ---------------------------
# Dataset (keeps your preprocessing)
# ---------------------------

class HandwritingDataset(Dataset):
    """
    - CSV must have 'FILENAME', 'IDENTITY'
    - Remove NaNs and label == 'unreadable'
    - Convert to grayscale, crop top-left to 64x256, pad white
    - Returns: image tensor [1, 64, 256], raw text string, filename
    """
    def __init__(self, csv_path, images_dir, transform=None,
                 crop_h=64, crop_w=256, char2idx=None):
        df = pd.read_csv(csv_path)
        if "IDENTITY" not in df.columns or "FILENAME" not in df.columns:
            raise ValueError(f"CSV {csv_path} must have columns 'FILENAME' and 'IDENTITY'")
        df = df.dropna(subset=["FILENAME", "IDENTITY"]).copy()
        df["IDENTITY"] = df["IDENTITY"].astype(str)
        df = df[df["IDENTITY"].str.strip().str.lower() != "unreadable"].reset_index(drop=True)

        self.df = df
        self.images_dir = images_dir
        self.transform = transform or transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5], std=[0.5]),
        ])
        self.crop_h = crop_h
        self.crop_w = crop_w

        # Vocab: <pad>=0, <sos>=1, <eos>=2, then your chars
        if char2idx is None:
            chars = sorted(list({c for text in self.df["IDENTITY"] for c in text}))
            self.char2idx = {"<pad>":0, "<sos>":1, "<eos>":2}
            for i, c in enumerate(chars, start=3):
                self.char2idx[c] = i
        else:
            self.char2idx = char2idx
        self.idx2char = {i:c for c,i in self.char2idx.items()}

        # For CTC auxiliary: map only real characters (exclude specials) to [1..K], blank=0
        self.ctc_blank = 0
        self.ctc_chars = [c for c in self.char2idx.keys() if c not in ["<pad>", "<sos>", "<eos>"]]
        self.ctc_char2idx = {c: i+1 for i, c in enumerate(sorted(self.ctc_chars))}
        self.ctc_idx2char = {i+1:c for i, c in enumerate(sorted(self.ctc_chars))}

    def __len__(self):
        return len(self.df)

    def _crop_pad_top_left(self, img: Image.Image) -> Image.Image:
        arr = np.array(img.convert("L"), dtype=np.uint8)
        h, w = arr.shape[:2]
        crop = arr[:min(h, self.crop_h), :min(w, self.crop_w)]
        out = np.ones((self.crop_h, self.crop_w), dtype=np.uint8) * 255
        out[:crop.shape[0], :crop.shape[1]] = crop
        return Image.fromarray(out, mode="L")

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img_path = os.path.join(self.images_dir, row["FILENAME"])
        img = Image.open(img_path).convert("L")
        img = self._crop_pad_top_left(img)
        img_t = self.transform(img)  # [1,64,256]
        text = row["IDENTITY"]
        return img_t, text, row["FILENAME"]

    def encode_text(self, text):
        return [self.char2idx["<sos>"]] + [self.char2idx[c] for c in text if c in self.char2idx] + [self.char2idx["<eos>"]]

    def encode_text_ctc(self, text):
        # CTC targets are just raw characters (no sos/eos), mapped to 1..K
        return [self.ctc_char2idx[c] for c in text if c in self.ctc_char2idx]

    def collate_fn(self, batch):
        imgs, texts, filenames = zip(*batch)
        imgs = torch.stack(imgs, dim=0)  # [B,1,64,256]

        # For CE (seq2seq)
        seqs = [self.encode_text(t) for t in texts]
        max_len = max(len(s) for s in seqs)
        pad_idx = self.char2idx["<pad>"]
        dec_in, dec_tg, lengths = [], [], []
        for s in seqs:
            inp = s[:-1]  # includes <sos> ... last char
            tgt = s[1:]   # ... up to <eos>
            lengths.append(len(tgt))
            dec_in.append(inp + [pad_idx]*(max_len-1-len(inp)))
            dec_tg.append(tgt + [pad_idx]*(max_len-1-len(tgt)))
        dec_in = torch.tensor(dec_in, dtype=torch.long)   # [B,L]
        dec_tg = torch.tensor(dec_tg, dtype=torch.long)   # [B,L]
        lengths = torch.tensor(lengths, dtype=torch.long) # [B]

        # For CTC auxiliary
        ctc_targets_list = [torch.tensor(self.encode_text_ctc(t), dtype=torch.long) for t in texts]
        ctc_targets = torch.cat(ctc_targets_list) if len(ctc_targets_list) else torch.empty(0, dtype=torch.long)
        ctc_target_lengths = torch.tensor([len(t) for t in ctc_targets_list], dtype=torch.long)

        return imgs, dec_in, dec_tg, lengths, filenames, texts, ctc_targets, ctc_target_lengths

# ---------------------------
# Positional emb
# ---------------------------

class PositionalEncodingLearned(nn.Module):
    def __init__(self, max_len, d_model):
        super().__init__()
        self.pe = nn.Parameter(torch.zeros(1, max_len, d_model))
        nn.init.trunc_normal_(self.pe, std=0.02)
    def forward(self, x):
        return x + self.pe[:, :x.size(1), :]

# ---------------------------
# ViT Encoder
# ---------------------------

class ViTEncoder(nn.Module):
    def __init__(self, img_h=64, img_w=256, patch=8, d_model=256, nhead=8, num_layers=4, dim_ff=512, dropout=0.1):
        super().__init__()
        assert img_h % patch == 0 and img_w % patch == 0
        self.patch = patch
        self.num_patches = (img_h // patch) * (img_w // patch)  # 8 * 32 = 256
        patch_dim = (patch * patch) * 1  # 1 channel

        self.proj = nn.Sequential(
            nn.Linear(patch_dim, d_model),
            nn.LayerNorm(d_model)
        )
        self.pos = PositionalEncodingLearned(self.num_patches, d_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_ff,
                                                   dropout=dropout, batch_first=True, activation="gelu", norm_first=True)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

    def forward(self, x):
        unfold = nn.Unfold(kernel_size=(self.patch, self.patch), stride=(self.patch, self.patch))
        patches = unfold(x).transpose(1, 2)      # [B, T, patch_dim]
        tokens = self.proj(patches)              # [B, T, d_model]
        tokens = self.pos(tokens)
        enc = self.encoder(tokens)               # [B, T, d_model]
        return enc

# ---------------------------
# Transformer Decoder (FiLM + BOS conditioning) with weight tying
# ---------------------------

class TXTDecoder(nn.Module):
    def __init__(self, vocab_size, max_len=64, d_model=256, nhead=8, num_layers=4, dim_ff=512, dropout=0.1, pad_idx=0):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, d_model, padding_idx=pad_idx)
        self.pos = PositionalEncodingLearned(max_len, d_model)
        decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_ff,
                                                   dropout=dropout, batch_first=True, activation="gelu", norm_first=True)
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)
        self.out = nn.Linear(d_model, vocab_size)
        # Weight tying
        self.out.weight = self.emb.weight
        self.pad_idx = pad_idx

        # FiLM conditioning from image global embedding
        self.g_norm = nn.LayerNorm(d_model)
        self.film = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.GELU(),
            nn.Linear(d_model, 2*d_model)
        )
        self.bos_adapter = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.Tanh()
        )

    def _generate_square_subsequent_mask(self, L, device):
        return torch.triu(torch.full((L, L), float("-inf"), device=device), diagonal=1)

    def forward(self, memory, y_inp, img_global):
        B, L = y_inp.shape
        device = y_inp.device

        # Token embeddings
        tgt = self.emb(y_inp)                 # [B,L,D]

        # Image-conditioned BOS + FiLM (after LayerNorm on global)
        g = self.g_norm(img_global)           # [B,D]
        tgt[:, 0, :] = tgt[:, 0, :] + self.bos_adapter(g)
        gamma, beta = torch.chunk(self.film(g), 2, dim=-1)  # [B,D], [B,D]
        tgt = tgt * gamma.unsqueeze(1) + beta.unsqueeze(1)  # FiLM
        tgt = self.pos(tgt)

        tgt_mask = self._generate_square_subsequent_mask(L, device)  # [L,L]
        tgt_key_padding_mask = (y_inp == self.pad_idx)                # [B,L]

        out = self.decoder(tgt, memory,
                           tgt_mask=tgt_mask,
                           tgt_key_padding_mask=tgt_key_padding_mask)
        logits = self.out(out)  # [B,L,V]
        return logits

# ---------------------------
# Lightning Module with CTC auxiliary + token dropout
# ---------------------------

class ViTSeq2SeqOCR(pl.LightningModule):
    def __init__(self, dataset: HandwritingDataset, d_model=256, nhead=8, enc_layers=4, dec_layers=4,
                 dim_ff=512, dropout=0.1, lr=1e-3, max_decode_len=40,
                 token_dropout_p=0.15, ctc_weight=0.3):
        super().__init__()
        self.save_hyperparameters(ignore=['dataset'])
        self.pad_idx = dataset.char2idx["<pad>"]
        self.sos_idx = dataset.char2idx["<sos>"]
        self.eos_idx = dataset.char2idx["<eos>"]
        self.vocab_size = len(dataset.char2idx)
        self.idx2char = dataset.idx2char

        # For CTC
        self.ctc_blank = dataset.ctc_blank
        self.ctc_char2idx = dataset.ctc_char2idx
        self.ctc_vocab = 1 + len(self.ctc_char2idx)  # blank + chars

        self.encoder = ViTEncoder(img_h=64, img_w=256, patch=8, d_model=d_model,
                                  nhead=nhead, num_layers=enc_layers, dim_ff=dim_ff, dropout=dropout)
        self.decoder = TXTDecoder(vocab_size=self.vocab_size, max_len=256, d_model=d_model,
                                  nhead=nhead, num_layers=dec_layers, dim_ff=dim_ff,
                                  dropout=dropout, pad_idx=self.pad_idx)

        self.criterion = nn.CrossEntropyLoss(ignore_index=self.pad_idx, label_smoothing=0.1)
        self.ctc_head = nn.Linear(d_model, self.ctc_vocab)  # logits over CTC chars (blank+chars)
        self.ctc_loss = nn.CTCLoss(blank=self.ctc_blank, zero_infinity=True)

        self.max_decode_len = max_decode_len
        self.token_dropout_p = token_dropout_p
        self.ctc_weight = ctc_weight

        # logging
        self.train_epoch_losses, self.val_epoch_losses = [], []
        self._train_buf, self._val_buf = [], []
        self.val_preds, self.val_truths = [], []

    def apply_token_dropout(self, y_inp):
        if not self.training or self.token_dropout_p <= 0.0:
            return y_inp
        # drop only non-special tokens
        y = y_inp.clone()
        mask = (y != self.pad_idx) & (y != self.sos_idx) & (y != self.eos_idx)
        drop = (torch.rand_like(y.float()) < self.token_dropout_p) & mask
        y[drop] = self.pad_idx
        return y

    def forward(self, imgs, y_inp):
        memory = self.encoder(imgs)         # [B,T,D]
        img_global = memory.mean(dim=1)     # [B,D]
        y_inp = self.apply_token_dropout(y_inp)
        logits = self.decoder(memory, y_inp, img_global)  # [B,L,V]
        # CTC head over encoder tokens
        ctc_logits = self.ctc_head(memory)  # [B,T,C_ctc]
        return logits, ctc_logits

    def training_step(self, batch, batch_idx):
        imgs, dec_in, dec_tg, lengths, _, _, ctc_targets, ctc_target_lengths = batch
        logits, ctc_logits = self(imgs, dec_in)

        # CE loss
        ce = self.criterion(logits.reshape(-1, logits.size(-1)), dec_tg.reshape(-1))

        # CTC loss
        # ctc_logits: [B,T,C] -> [T,B,C] log_probs
        B, T, C = ctc_logits.shape
        logp = ctc_logits.log_softmax(dim=-1).permute(1,0,2)  # [T,B,C]
        input_lengths = torch.full((B,), T, dtype=torch.long, device=logp.device)
        if ctc_targets.numel() == 0:
            ctc = torch.tensor(0.0, device=logp.device)
        else:
            ctc = self.ctc_loss(logp, ctc_targets, input_lengths, ctc_target_lengths)

        loss = ce + self.ctc_weight * ctc

        self.log("train_loss", loss, prog_bar=True, on_epoch=True, batch_size=imgs.size(0))
        self.log("train_ce", ce, prog_bar=False, on_epoch=True)
        self.log("train_ctc", ctc, prog_bar=False, on_epoch=True)
        self._train_buf.append(loss.detach().cpu().item())
        return loss

    def validation_step(self, batch, batch_idx):
        imgs, dec_in, dec_tg, lengths, filenames, raw_texts, ctc_targets, ctc_target_lengths = batch
        logits, ctc_logits = self(imgs, dec_in)

        ce = self.criterion(logits.reshape(-1, logits.size(-1)), dec_tg.reshape(-1))
        B, T, C = ctc_logits.shape
        logp = ctc_logits.log_softmax(dim=-1).permute(1,0,2)
        input_lengths = torch.full((B,), T, dtype=torch.long, device=logp.device)
        if ctc_targets.numel() == 0:
            ctc = torch.tensor(0.0, device=logp.device)
        else:
            ctc = self.ctc_loss(logp, ctc_targets, input_lengths, ctc_target_lengths)

        loss = ce + self.ctc_weight * ctc
        self.log("val_loss", loss, prog_bar=True, on_epoch=True, batch_size=imgs.size(0))
        self.log("val_ce", ce, prog_bar=False, on_epoch=True)
        self.log("val_ctc", ctc, prog_bar=False, on_epoch=True)
        self._val_buf.append(loss.detach().cpu().item())

        # Greedy decode for metrics
        preds = self.greedy_decode(imgs, max_len=min(self.max_decode_len, dec_in.size(1)+5))
        self.val_preds.extend(preds)
        self.val_truths.extend(list(raw_texts))

        if batch_idx == 0:
            for i in range(min(3, len(preds))):
                self.print(f"VAL SAMPLE {filenames[i]} → pred: {preds[i]} | gt: {raw_texts[i]}")

    def on_train_epoch_end(self):
        if self._train_buf:
            self.train_epoch_losses.append(float(np.mean(self._train_buf)))
            self._train_buf = []

    def on_validation_epoch_end(self):
        if self._val_buf:
            self.val_epoch_losses.append(float(np.mean(self._val_buf)))
            self._val_buf = []
        if self.val_preds:
            cer, one_minus_cer, acc, wer = compute_metrics(self.val_preds, self.val_truths)
            self.log('val_CER', cer, prog_bar=True)
            self.log('val_1_minus_CER', one_minus_cer, prog_bar=True)
            self.log('val_ACC', acc, prog_bar=True)
            self.log('val_WER', wer, prog_bar=True)
            self.val_preds, self.val_truths = [], []

    def on_fit_end(self):
        if self.train_epoch_losses:
            plt.figure(figsize=(6,4))
            plt.plot(range(1, len(self.train_epoch_losses)+1), self.train_epoch_losses, label="Train Loss")
            if self.val_epoch_losses:
                plt.plot(range(1, len(self.val_epoch_losses)+1), self.val_epoch_losses, label="Val Loss")
            plt.xlabel("Epoch"); plt.ylabel("Loss (CE + λ·CTC)"); plt.title("ViT+Decoder (CTC-aux) Loss")
            plt.legend(); plt.tight_layout()
            plt.savefig("loss_curve_vit_seq2seq.png")
            self.print("[Saved] loss_curve_vit_seq2seq.png")

    @torch.no_grad()
    def greedy_decode(self, imgs, max_len=40):
        self.eval()
        device = imgs.device
        memory = self.encoder(imgs)           # [B,T,D]
        img_global = memory.mean(dim=1)       # [B,D]

        B = imgs.size(0)
        ys = torch.full((B, 1), self.sos_idx, dtype=torch.long, device=device)
        finished = torch.zeros(B, dtype=torch.bool, device=device)
        out_texts = [""] * B

        for _ in range(max_len):
            logits = self.decoder(memory, ys, img_global)  # [B,L,V]
            next_logit = logits[:, -1, :]
            next_tok = next_logit.argmax(dim=-1)

            for i in range(B):
                if finished[i]: 
                    continue
                tok = next_tok[i].item()
                if tok == self.eos_idx:
                    finished[i] = True
                elif tok not in (self.pad_idx, self.sos_idx):
                    out_texts[i] += self.idx2char.get(tok, "")
            ys = torch.cat([ys, next_tok.unsqueeze(1)], dim=1)
            if finished.all():
                break
        return out_texts

    def configure_optimizers(self):
        return optim.Adam(self.parameters(), lr=self.hparams.lr)

# ---------------------------
# Training / Testing harness
# ---------------------------

def make_loaders():
    train_ds = HandwritingDataset(
        "/kaggle/input/handwriting-recognition/written_name_train_v2.csv",
        "/kaggle/input/handwriting-recognition/train_v2/train",
        crop_h=64, crop_w=256
    )
    val_ds = HandwritingDataset(
        "/kaggle/input/handwriting-recognition/written_name_validation_v2.csv",
        "/kaggle/input/handwriting-recognition/validation_v2/validation",
        crop_h=64, crop_w=256,
        char2idx=train_ds.char2idx
    )
    test_ds = HandwritingDataset(
        "/kaggle/input/handwriting-recognition/written_name_test_v2.csv",
        "/kaggle/input/handwriting-recognition/test_v2/test",
        crop_h=64, crop_w=256,
        char2idx=train_ds.char2idx
    )

    train_loader = DataLoader(train_ds, batch_size=128, shuffle=True, num_workers=3,
                              pin_memory=True, collate_fn=train_ds.collate_fn)
    val_loader = DataLoader(val_ds, batch_size=128, shuffle=False, num_workers=3,
                            pin_memory=True, collate_fn=val_ds.collate_fn)
    test_loader = DataLoader(test_ds, batch_size=128, shuffle=False, num_workers=3,
                             pin_memory=True, collate_fn=test_ds.collate_fn)
    return train_ds, val_ds, test_ds, train_loader, val_loader, test_loader

def train_vit_seq2seq():
    train_ds, val_ds, test_ds, train_loader, val_loader, test_loader = make_loaders()
    model = ViTSeq2SeqOCR(train_ds, d_model=256, nhead=8, enc_layers=4, dec_layers=4,
                          dim_ff=512, dropout=0.1, lr=1e-3, max_decode_len=40,
                          token_dropout_p=0.15, ctc_weight=0.3)
    accelerator = "gpu" if torch.cuda.is_available() else "cpu"
    trainer = pl.Trainer(max_epochs=40, accelerator=accelerator, devices=1, log_every_n_steps=20)
    trainer.fit(model, train_loader, val_loader)
    trainer.save_checkpoint("seq2seq_vit_ctc.ckpt")
    return model, test_loader, test_ds

@torch.no_grad()
def test_vit_seq2seq(model, test_loader, idx2char):
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    all_preds, all_truths = [], []
    for batch in tqdm(test_loader, desc="Testing (ViT+Decoder CTC-aux)"):
        imgs, dec_in, dec_tg, lengths, filenames, raw_texts, ctc_targets, ctc_target_lengths = batch
        imgs = imgs.to(device)
        preds = model.greedy_decode(imgs, max_len=dec_in.size(1)+5)
        all_preds.extend(preds)
        all_truths.extend(list(raw_texts))

    for i in range(min(5, len(all_preds))):
        print(f"GT: {all_truths[i]} | PRED: {all_preds[i]}")

    cer, one_minus_cer, acc, wer = compute_metrics(all_preds, all_truths)
    print(f"Test CER: {cer:.6f}")
    print(f"Test 1-CER (Char Acc): {one_minus_cer:.6f}")
    print(f"Test ACC (Exact): {acc:.6f}")
    print(f"Test WER: {wer:.6f}")

if __name__ == "__main__":
    model, test_loader, test_ds = train_vit_seq2seq()
    # Optionally reload:
    # model = ViTSeq2SeqOCR.load_from_checkpoint(
    #     "seq2seq_vit_ctc.ckpt", dataset=test_ds, d_model=256, nhead=8, enc_layers=4, dec_layers=4,
    #     dim_ff=512, dropout=0.1, lr=1e-3, max_decode_len=40, token_dropout_p=0.15, ctc_weight=0.3
    # )
    test_vit_seq2seq(model, test_loader, test_ds.idx2char)


2025-10-03 15:06:33.964286: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759503994.130065      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759503994.183666      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.11/dist-packages/pytorch_lightning/utilities/data.py:79: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 128. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.


VAL SAMPLE VALIDATION_0001.jpg → pred: DDDDDDDDDDDDDDDDDDDDDD | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: DDDDDDDDDDDDDDDDDDDDDD | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: DDDDDDDDDDDDDDDDDDDDDD | gt: LEA


Training: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.11/dist-packages/pytorch_lightning/utilities/data.py:79: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 54. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: LEA | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LEANA | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEANE | gt: LEA


/usr/local/lib/python3.11/dist-packages/pytorch_lightning/utilities/data.py:79: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 64. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: LELIE | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: MANDINE | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LOUIS | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Validation: |          | 0/? [00:00<?, ?it/s]

VAL SAMPLE VALIDATION_0001.jpg → pred: BILEL | gt: BILEL
VAL SAMPLE VALIDATION_0002.jpg → pred: LAUMONIER | gt: LAUMIONIER
VAL SAMPLE VALIDATION_0003.jpg → pred: LEA | gt: LEA


Testing (ViT+Decoder CTC-aux): 100%|██████████| 323/323 [01:58<00:00,  2.72it/s]


GT: KEVIN | PRED: KEVIN
GT: CLOTAIRE | PRED: CLOTLIEL
GT: LENA | PRED: LENA
GT: JULES | PRED: JULES
GT: CHERPIN | PRED: CHERPIN
Test CER: 0.044544
Test 1-CER (Char Acc): 0.955456
Test ACC (Exact): 0.843324
Test WER: 0.156676
