In [1]:
import os
from PIL import Image
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

# Directory that holds the register-number images
data_dir = 'Register Numbers/'

# Collect one (path, label) pair per PNG file; the label is the part of the
# file name that precedes its first '.'
image_pairs = [
    (os.path.join(data_dir, name), name.partition('.')[0])
    for name in os.listdir(data_dir)
    if name.endswith('.png')
]

# Validate labels
def validate_label(label):
    """Return ``label`` unchanged when it is a 12-character, all-digit string.

    Raises:
        ValueError: if ``label`` is not exactly 12 characters long or contains
            a non-digit character.
    """
    is_valid = len(label) == 12 and label.isdigit()
    if is_valid:
        return label
    raise ValueError(f"Invalid register number: {label}")

# Validate every label; only the first 12 characters of the file stem are used.
# The pairs are sorted because os.listdir() gives no ordering guarantee, and a
# seeded shuffle is only reproducible when its input order is stable.
cleaned_pairs = sorted(
    (path, validate_label(label[:12])) for path, label in image_pairs
)

# Split the data 80% / 10% / 10% into train / val / test.
# A dedicated, seeded RNG makes the split identical after every kernel restart
# without touching the global `random` state.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(cleaned_pairs)
total = len(cleaned_pairs)
train_end = int(0.8 * total)
val_end = int(0.9 * total)
train = cleaned_pairs[:train_end]
val = cleaned_pairs[train_end:val_end]
test = cleaned_pairs[val_end:]
print(f"Train: {len(train)}, Val: {len(val)}, Test: {len(test)}")

Train: 4276, Val: 535, Test: 535


In [3]:
class RegisterNumberDataset(Dataset):
    """Dataset of grayscale register-number images with per-digit targets.

    Each item is ``(image, target)``: ``image`` is the file at
    ``image_paths[idx]`` converted to single-channel mode and passed through
    ``transform`` when one is given; ``target`` is a ``torch.long`` tensor with
    one class index per label character. Digits are shifted by one
    ('0' -> 1, ..., '9' -> 10) so that index 0 is never used for a digit.

    Args:
        image_paths: sequence of image file paths.
        labels: sequence of digit strings, parallel to ``image_paths``.
        transform: optional callable applied to the loaded PIL image.

    Raises:
        ValueError: if ``image_paths`` and ``labels`` differ in length.
    """

    def __init__(self, image_paths, labels, transform=None):
        # Fail fast: a mismatch would otherwise surface as an IndexError deep
        # inside a DataLoader worker, or silently drop the surplus labels.
        if len(image_paths) != len(labels):
            raise ValueError(
                f"image_paths and labels must have the same length "
                f"(got {len(image_paths)} and {len(labels)})"
            )
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        """Return the number of samples."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, target_tensor)``."""
        # The context manager closes the underlying file deterministically;
        # convert() returns an independent, fully loaded image.
        with Image.open(self.image_paths[idx]) as raw:
            image = raw.convert('L')
        if self.transform is not None:
            image = self.transform(image)
        label = self.labels[idx]
        label_seq = [int(digit) + 1 for digit in label]  # 0->1, 1->2, ..., 9->10
        return image, torch.tensor(label_seq, dtype=torch.long)

# Preprocessing shared by all splits: resize to 32x256, convert to a tensor,
# then normalise with mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.Resize((32, 256)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])


def _make_dataset(pairs):
    """Build a RegisterNumberDataset from a list of (path, label) pairs."""
    paths = [path for path, _ in pairs]
    labels = [label for _, label in pairs]
    return RegisterNumberDataset(paths, labels, transform=transform)


train_dataset = _make_dataset(train)
val_dataset = _make_dataset(val)
test_dataset = _make_dataset(test)

# Only the training loader shuffles; val/test keep a fixed order.
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [4]:
class CRNN(nn.Module):
    """Convolutional-recurrent network producing per-time-step class scores.

    A stack of Conv-BatchNorm-ReLU blocks with max-pooling reduces a
    (N, 1, 32, W) input to a feature map of height 1; its width axis is then
    fed as a sequence to a 2-layer bidirectional LSTM, and a linear layer maps
    every time step to ``num_classes`` raw scores.

    Args:
        num_classes: size of the output layer.
    """

    def __init__(self, num_classes):
        super().__init__()

        def conv_bn_relu(in_ch, out_ch, kernel_size=3, padding=1):
            # One Conv2d -> BatchNorm2d -> ReLU group, as a flat list of layers.
            return [
                nn.Conv2d(in_ch, out_ch, kernel_size=kernel_size, padding=padding),
                nn.BatchNorm2d(out_ch),
                nn.ReLU(),
            ]

        layers = [
            *conv_bn_relu(1, 64),
            nn.MaxPool2d(2, 2),            # (N, 64, 16, W/2)
            *conv_bn_relu(64, 128),
            nn.MaxPool2d(2, 2),            # (N, 128, 8, W/4)
            *conv_bn_relu(128, 256),
            *conv_bn_relu(256, 256),
            nn.MaxPool2d((2, 1), (2, 1)),  # (N, 256, 4, W/4)
            *conv_bn_relu(256, 512),
            *conv_bn_relu(512, 512),
            nn.MaxPool2d((2, 1), (2, 1)),  # (N, 512, 2, W/4)
            # Unpadded (2, 1) kernel collapses the remaining height of 2 to 1.
            *conv_bn_relu(512, 512, kernel_size=(2, 1), padding=0),  # (N, 512, 1, W/4)
        ]
        self.cnn = nn.Sequential(*layers)
        self.rnn = nn.LSTM(512, 256, num_layers=2, bidirectional=True)
        # 512 input features = 2 directions x 256 hidden units.
        self.fc = nn.Linear(512, num_classes)

    def forward(self, x):
        """Map images (N, 1, 32, W) to scores of shape (W/4, N, num_classes)."""
        features = self.cnn(x)                # (N, 512, 1, W/4)
        sequence = features.squeeze(2)        # (N, 512, W/4)
        sequence = sequence.permute(2, 0, 1)  # (W/4, N, 512)
        hidden, _ = self.rnn(sequence)        # (W/4, N, 512)
        return self.fc(hidden)                # (W/4, N, num_classes)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 11 output classes: blank + the digits 0-9.
model = CRNN(num_classes=11)
model.to(device)

CRNN(
  (cnn): Sequential(
    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (8): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (12): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (13): ReLU()
    (14): MaxPool2d(kernel_size=(2, 1), stride=(2, 1), padding=0, dilation

In [5]:
# CTC loss with class index 0 reserved for the blank symbol, optimised with Adam.
criterion = nn.CTCLoss(blank=0)
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 75
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for images, targets in train_loader:
        images, targets = images.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(images)  # (W/4, N, num_classes), e.g., (64, N, 11)
        outputs = outputs.log_softmax(2)  # CTCLoss expects log-probabilities
        # Deliberately NOT named `batch_size`: that would overwrite the
        # notebook-level `batch_size` used to build the DataLoaders with the
        # size of the last (possibly smaller) batch.
        # The target length is read from the tensor instead of hard-coding 12.
        n_samples, target_len = targets.shape
        # Every sample uses the full output sequence length as its input length.
        input_lengths = torch.full((n_samples,), outputs.size(0), dtype=torch.long, device=device)
        target_lengths = torch.full((n_samples,), target_len, dtype=torch.long, device=device)
        flat_targets = targets.reshape(-1)  # Flatten: N * target_len
        loss = criterion(outputs, flat_targets, input_lengths, target_lengths)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean per-batch loss for the epoch.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader):.4f}")

Epoch 1/75, Loss: 1.1786
Epoch 2/75, Loss: 0.1993
Epoch 3/75, Loss: 0.0922
Epoch 4/75, Loss: 0.0662
Epoch 5/75, Loss: 0.0573
Epoch 6/75, Loss: 0.0481
Epoch 7/75, Loss: 0.0420
Epoch 8/75, Loss: 0.0358
Epoch 9/75, Loss: 0.0365
Epoch 10/75, Loss: 0.0348
Epoch 11/75, Loss: 0.0321
Epoch 12/75, Loss: 0.0322
Epoch 13/75, Loss: 0.0282
Epoch 14/75, Loss: 0.0282
Epoch 15/75, Loss: 0.0285
Epoch 16/75, Loss: 0.0252
Epoch 17/75, Loss: 0.0226
Epoch 18/75, Loss: 0.0254
Epoch 19/75, Loss: 0.0216
Epoch 20/75, Loss: 0.0230
Epoch 21/75, Loss: 0.0193
Epoch 22/75, Loss: 0.0202
Epoch 23/75, Loss: 0.0239
Epoch 24/75, Loss: 0.0189
Epoch 25/75, Loss: 0.0166
Epoch 26/75, Loss: 0.0149
Epoch 27/75, Loss: 0.0135
Epoch 28/75, Loss: 0.0116
Epoch 29/75, Loss: 0.0186
Epoch 30/75, Loss: 0.0199
Epoch 31/75, Loss: 0.0131
Epoch 32/75, Loss: 0.0116
Epoch 33/75, Loss: 0.0140
Epoch 34/75, Loss: 0.0116
Epoch 35/75, Loss: 0.0094
Epoch 36/75, Loss: 0.0073
Epoch 37/75, Loss: 0.0093
Epoch 38/75, Loss: 0.0135
Epoch 39/75, Loss: 0.

In [7]:
def decode_predictions(outputs):
    """Greedy CTC decoding of a batch of per-time-step class scores.

    For every sample the highest-scoring class is taken at each time step,
    consecutive repeats are collapsed, blanks (class 0) are dropped, and the
    remaining classes are shifted back by one (class 1 -> '0', ..., 10 -> '9').

    Args:
        outputs: tensor of shape (T, N, num_classes) holding raw scores,
            probabilities or log-probabilities.

    Returns:
        A list of N decoded digit strings (an empty string when only blanks
        were predicted).
    """
    # argmax alone is enough: softmax is monotonic, so it cannot change which
    # class scores highest.
    best = outputs.argmax(2)  # (T, N)
    # Move the whole batch to the host in one transfer, one row per sample.
    sequences = best.t().cpu().tolist()  # N lists of length T

    decoded = []
    for seq in sequences:
        digits = []
        prev = None
        for cls in seq:
            # Emit a symbol only when it is not blank and differs from the
            # previous time step (CTC collapse rule).
            if cls != 0 and cls != prev:
                digits.append(str(cls - 1))
            prev = cls
        decoded.append(''.join(digits))
    return decoded

# Exact-match accuracy on the test split: a prediction counts as correct only
# when the whole decoded string equals the ground-truth label.
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, targets in test_loader:
        outputs = model(images.to(device))  # (T, N, num_classes)
        predictions = decode_predictions(outputs)
        # Undo the +1 class offset to get the digit strings back.
        target_labels = [
            ''.join(str(cls - 1) for cls in row) for row in targets.tolist()
        ]
        hits = [pred == target for pred, target in zip(predictions, target_labels)]
        correct += sum(hits)
        total += len(hits)
print(f"Test Accuracy: {correct/total*100:.2f}%")

Test Accuracy: 95.14%


In [15]:
def predict_register_number(model, image_path, device, transform):
    """Predict the digit string shown in a single image.

    The image is loaded as grayscale, passed through ``transform``, run through
    ``model`` as a batch of one, and decoded with ``decode_predictions`` so that
    single-image inference and batch evaluation share one decoding routine.

    Args:
        model: network returning scores of shape (T, N, num_classes).
        image_path: path of the image file to read.
        device: torch device the model lives on.
        transform: callable turning a PIL image into a (1, H, W) tensor.

    Returns:
        The decoded digit string (may be empty if only blanks are predicted).

    Note:
        Switches ``model`` to evaluation mode as a side effect.
    """
    # The context manager closes the file; convert() returns an independent,
    # fully loaded image that stays valid afterwards.
    with Image.open(image_path) as raw:
        image = raw.convert('L')
    batch = transform(image).unsqueeze(0)  # (1, 1, 32, 256)
    model.eval()
    with torch.no_grad():
        output = model(batch.to(device))  # (T, 1, num_classes), e.g., (64, 1, 11)
    # decode_predictions returns one string per batch element; take the only one.
    return decode_predictions(output)[0]

# Run the trained model on one image of your own
image_path = 'my_reg.png'  # Replace with your image path
predicted_number = predict_register_number(
    model=model,
    image_path=image_path,
    device=device,
    transform=transform,
)
print(f"Predicted Register Number: {predicted_number}")

Predicted Register Number: 22040066
