In [1]:
import os
from PIL import Image
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

# Directory that is scanned for PNG images
data_dir = 'Register Numbers/'

# Pair every PNG path with the part of its file name before the first dot;
# that leading part is treated as the label text.
image_pairs = []
for filename in os.listdir(data_dir):
    if not filename.endswith('.png'):
        continue
    label_text = filename.split('.')[0]
    image_pairs.append((os.path.join(data_dir, filename), label_text))

# Validate labels
def validate_label(label):
    """Return ``label`` unchanged if it is a well-formed register number.

    A register number must consist of exactly 12 ASCII digits (``0``-``9``).

    Args:
        label: candidate register number as a string.

    Returns:
        The same string that was passed in.

    Raises:
        ValueError: if the label does not have length 12 or contains any
            character other than an ASCII digit.
    """
    # str.isdigit() also accepts non-ASCII digit characters (for example the
    # superscript two), which int() cannot parse when labels are later
    # converted digit by digit, so test against the ASCII digits explicitly.
    if len(label) != 12 or any(ch not in '0123456789' for ch in label):
        raise ValueError(f"Invalid register number: {label}")
    return label

# The label is the first 12 characters of each file-name stem; validate_label
# raises ValueError on the first malformed one.
cleaned_pairs = [(path, validate_label(label[:12])) for path, label in image_pairs]

# Split the data 80% / 10% / 10% into train / val / test.
# os.listdir returns entries in arbitrary order and an unseeded shuffle yields
# a different partition on every run, so sort first and shuffle with a fixed
# seed to keep the split (and the metrics computed on it) reproducible.
SPLIT_SEED = 42
cleaned_pairs.sort()
random.Random(SPLIT_SEED).shuffle(cleaned_pairs)

total = len(cleaned_pairs)
train_end = int(0.8 * total)
val_end = int(0.9 * total)
train = cleaned_pairs[:train_end]
val = cleaned_pairs[train_end:val_end]
test = cleaned_pairs[val_end:]
print(f"Train: {len(train)}, Val: {len(val)}, Test: {len(test)}")

Train: 4276, Val: 535, Test: 535


In [2]:
class RegisterNumberDataset(Dataset):
    """Dataset of grayscale register-number images with digit-sequence labels.

    Args:
        image_paths: sequence of image file paths.
        labels: sequence of digit strings, one per entry of ``image_paths``.
        transform: optional callable applied to the loaded PIL image.

    Raises:
        ValueError: if ``image_paths`` and ``labels`` differ in length.
    """

    def __init__(self, image_paths, labels, transform=None):
        # Fail early: a mismatch would otherwise only show up as an IndexError
        # in the middle of an epoch, or as silently ignored labels.
        if len(image_paths) != len(labels):
            raise ValueError(
                f"image_paths and labels must have the same length "
                f"(got {len(image_paths)} and {len(labels)})"
            )
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        """Return the number of samples."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(image, target)`` for sample ``idx``.

        ``image`` is the grayscale image after ``transform`` (or the PIL image
        itself when no transform is set); ``target`` is a 1-D long tensor with
        one entry per label character.
        """
        # The context manager releases the file handle deterministically;
        # convert() returns an independent image that outlives the handle.
        with Image.open(self.image_paths[idx]) as img:
            image = img.convert('L')
        if self.transform is not None:
            image = self.transform(image)
        # Shift every digit up by one (0->1, 1->2, ..., 9->10) so that class
        # index 0 is never used by a label.
        label_seq = [int(digit) + 1 for digit in self.labels[idx]]
        return image, torch.tensor(label_seq, dtype=torch.long)

# Preprocessing applied to every image: resize to 32 x 256, convert to a
# tensor, then normalise the single channel with mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.Resize((32, 256)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])


def _paths_and_labels(pairs):
    """Split a list of (path, label) pairs into a path list and a label list."""
    return [path for path, _ in pairs], [label for _, label in pairs]


train_dataset = RegisterNumberDataset(*_paths_and_labels(train), transform=transform)
val_dataset = RegisterNumberDataset(*_paths_and_labels(val), transform=transform)
test_dataset = RegisterNumberDataset(*_paths_and_labels(test), transform=transform)

# Only the training loader shuffles its samples.
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [3]:
# NOTE(review): this class is also declared in the preceding cell; the later
# declaration is the one in effect. Consider keeping a single copy.
class RegisterNumberDataset(Dataset):
    """Dataset of grayscale register-number images with digit-sequence labels.

    Args:
        image_paths: sequence of image file paths.
        labels: sequence of digit strings, one per entry of ``image_paths``.
        transform: optional callable applied to the loaded PIL image.

    Raises:
        ValueError: if ``image_paths`` and ``labels`` differ in length.
    """

    def __init__(self, image_paths, labels, transform=None):
        # Fail early: a mismatch would otherwise only show up as an IndexError
        # in the middle of an epoch, or as silently ignored labels.
        if len(image_paths) != len(labels):
            raise ValueError(
                f"image_paths and labels must have the same length "
                f"(got {len(image_paths)} and {len(labels)})"
            )
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        """Return the number of samples."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(image, target)`` for sample ``idx``.

        ``image`` is the grayscale image after ``transform`` (or the PIL image
        itself when no transform is set); ``target`` is a 1-D long tensor with
        one entry per label character.
        """
        # The context manager releases the file handle deterministically;
        # convert() returns an independent image that outlives the handle.
        with Image.open(self.image_paths[idx]) as img:
            image = img.convert('L')
        if self.transform is not None:
            image = self.transform(image)
        # Shift every digit up by one (0->1, 1->2, ..., 9->10) so that class
        # index 0 is never used by a label.
        label_seq = [int(digit) + 1 for digit in self.labels[idx]]
        return image, torch.tensor(label_seq, dtype=torch.long)

# NOTE(review): this cell rebuilds the same transform, datasets and loaders as
# the preceding cell; consider keeping only one of the two.

# Preprocessing applied to every image: resize to 32 x 256, convert to a
# tensor, then normalise the single channel with mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.Resize((32, 256)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])


def _paths_and_labels(pairs):
    """Split a list of (path, label) pairs into a path list and a label list."""
    return [path for path, _ in pairs], [label for _, label in pairs]


train_dataset = RegisterNumberDataset(*_paths_and_labels(train), transform=transform)
val_dataset = RegisterNumberDataset(*_paths_and_labels(val), transform=transform)
test_dataset = RegisterNumberDataset(*_paths_and_labels(test), transform=transform)

# Only the training loader shuffles its samples.
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [4]:
import torch
import torch.nn as nn

class CRNN(nn.Module):
    """Convolutional-recurrent network emitting per-time-step class scores.

    A stack of convolutions reduces a 32-pixel-high, single-channel image to a
    feature map of height 1 and a quarter of the input width. Each remaining
    column is fed as one time step to a two-layer bidirectional LSTM, and a
    linear layer maps every time step to ``num_classes`` raw scores.

    Args:
        num_classes: size of the output score vector at each time step.
    """

    def __init__(self, num_classes):
        super().__init__()

        def conv_bn_relu(in_ch, out_ch, kernel_size=3, padding=1):
            # One convolution followed by batch norm and ReLU.
            return [
                nn.Conv2d(in_ch, out_ch, kernel_size=kernel_size, padding=padding),
                nn.BatchNorm2d(out_ch),
                nn.ReLU(),
            ]

        # Layers are appended in the same order as a flat nn.Sequential would
        # list them, so module indices inside self.cnn are stable.
        stack = []
        stack += conv_bn_relu(1, 64)
        stack.append(nn.MaxPool2d(2, 2))            # (N, 64, 16, W/2)
        stack += conv_bn_relu(64, 128)
        stack.append(nn.MaxPool2d(2, 2))            # (N, 128, 8, W/4)
        stack.append(nn.Dropout2d(0.3))             # dropout after second maxpool
        stack += conv_bn_relu(128, 256)
        stack += conv_bn_relu(256, 256)
        stack.append(nn.MaxPool2d((2, 1), (2, 1)))  # (N, 256, 4, W/4)
        stack += conv_bn_relu(256, 512)
        stack += conv_bn_relu(512, 512)
        stack.append(nn.MaxPool2d((2, 1), (2, 1)))  # (N, 512, 2, W/4)
        stack.append(nn.Dropout2d(0.3))             # dropout after fourth maxpool
        # Unpadded (2, 1) kernel collapses the remaining height of 2 to 1.
        stack += conv_bn_relu(512, 512, kernel_size=(2, 1), padding=0)  # (N, 512, 1, W/4)
        self.cnn = nn.Sequential(*stack)

        # Two stacked bidirectional LSTM layers with dropout between them.
        self.rnn = nn.LSTM(512, 256, num_layers=2, bidirectional=True, dropout=0.3)
        # Dropout applied to the LSTM output before classification.
        self.dropout = nn.Dropout(0.5)
        # 512 input features: 256 hidden units for each of the two directions.
        self.fc = nn.Linear(512, num_classes)

    def forward(self, x):
        """Map an image batch to scores of shape (W/4, N, num_classes)."""
        feature_map = self.cnn(x)                           # (N, 512, 1, W/4)
        # Drop the height axis and put the width axis first for the LSTM.
        sequence = feature_map.squeeze(2).permute(2, 0, 1)  # (W/4, N, 512)
        recurrent_out, _ = self.rnn(sequence)               # (W/4, N, 512)
        return self.fc(self.dropout(recurrent_out))         # (W/4, N, num_classes)

# Eleven output classes are assumed: one blank plus the ten digits 0-9.
# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CRNN(num_classes=11)
model.to(device)

CRNN(
  (cnn): Sequential(
    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (8): Dropout2d(p=0.3, inplace=False)
    (9): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (10): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): ReLU()
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (14): ReLU()
    (15): MaxPool2d(kernel_size=

In [5]:
import torch
from torch.utils.data import DataLoader

# CTC loss with class index 0 as the blank symbol, optimised with Adam.
criterion = nn.CTCLoss(blank=0)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Per-epoch history of losses and accuracies.
train_losses, val_losses = [], []
train_accuracies, val_accuracies = [], []

# Decoding function (used in both training and evaluation)
def decode_predictions(outputs):
    """Greedily decode a batch of per-time-step class scores into digit strings.

    At every time step the highest-scoring class is taken. Class 0 is treated
    as blank and dropped, immediate repeats of the same class are collapsed,
    and every remaining class ``k`` is written as the digit ``k - 1``.

    Args:
        outputs: tensor of shape (T, N, num_classes) holding raw scores (or
            any monotonic transform of them, such as log-probabilities).

    Returns:
        A list of N strings, one decoded digit sequence per batch element.
    """
    # argmax over the raw scores equals argmax over their softmax, so the
    # softmax is skipped; the whole batch is moved to the host in one call
    # instead of one transfer per sample.
    best_paths = outputs.argmax(2).t().tolist()  # N lists of T class indices
    decoded = []
    for path in best_paths:
        prev = -1
        digits = []
        for cls in path:
            if cls != 0 and cls != prev:
                digits.append(str(cls - 1))
            prev = cls
        decoded.append(''.join(digits))
    return decoded

# Training loop with validation
def _run_epoch(loader, training):
    """Run one pass over ``loader`` and return ``(mean_loss, accuracy_percent)``.

    When ``training`` is true the model is put in train mode and the optimizer
    is stepped after every batch; otherwise the model is put in eval mode and
    no gradients are tracked. ``mean_loss`` is the average of the per-batch CTC
    losses and ``accuracy_percent`` is the share of samples whose decoded
    string equals the target string exactly.
    """
    model.train(training)  # train(False) is the same as eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    with torch.set_grad_enabled(training):
        for images, targets in loader:
            images, targets = images.to(device), targets.to(device)
            if training:
                optimizer.zero_grad()
            outputs = model(images)  # (W/4, N, num_classes)
            # A local name is used here so the module-level `batch_size`
            # setting of the data loaders is not overwritten.
            n_samples = images.size(0)
            # Every sample uses all output time steps; the target length is
            # read from the target tensor rather than hard-coded.
            input_lengths = torch.full((n_samples,), outputs.size(0), dtype=torch.long, device=device)
            target_lengths = torch.full((n_samples,), targets.size(1), dtype=torch.long, device=device)
            loss = criterion(outputs.log_softmax(2), targets.view(-1), input_lengths, target_lengths)
            if training:
                loss.backward()
                optimizer.step()
            loss_sum += loss.item()

            # Exact-match accuracy: undo the +1 label shift before comparing.
            predictions = decode_predictions(outputs)
            target_labels = [''.join(str(d - 1) for d in t.tolist()) for t in targets]
            n_correct += sum(pred == target for pred, target in zip(predictions, target_labels))
            n_seen += n_samples
    return loss_sum / len(loader), n_correct / n_seen * 100


num_epochs = 70
for epoch in range(num_epochs):
    # Training phase
    train_loss, train_accuracy = _run_epoch(train_loader, training=True)
    train_losses.append(train_loss)
    train_accuracies.append(train_accuracy)

    # Validation phase
    val_loss, val_accuracy = _run_epoch(val_loader, training=False)
    val_losses.append(val_loss)
    val_accuracies.append(val_accuracy)

    # Print metrics
    print(f"Epoch {epoch+1}/{num_epochs}, "
          f"Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, "
          f"Train Acc: {train_accuracy:.2f}%, Val Acc: {val_accuracy:.2f}%")

Epoch 1/70, Train Loss: 1.5128, Val Loss: 1.0863, Train Acc: 0.00%, Val Acc: 0.00%
Epoch 2/70, Train Loss: 0.8534, Val Loss: 0.6677, Train Acc: 0.00%, Val Acc: 0.00%
Epoch 3/70, Train Loss: 0.5524, Val Loss: 0.4512, Train Acc: 1.26%, Val Acc: 2.43%
Epoch 4/70, Train Loss: 0.3689, Val Loss: 0.2421, Train Acc: 9.71%, Val Acc: 24.49%
Epoch 5/70, Train Loss: 0.2285, Val Loss: 0.1672, Train Acc: 37.35%, Val Acc: 57.01%
Epoch 6/70, Train Loss: 0.1439, Val Loss: 0.1004, Train Acc: 63.99%, Val Acc: 75.70%
Epoch 7/70, Train Loss: 0.1116, Val Loss: 0.0926, Train Acc: 75.00%, Val Acc: 82.80%
Epoch 8/70, Train Loss: 0.0778, Val Loss: 0.0658, Train Acc: 84.07%, Val Acc: 87.48%
Epoch 9/70, Train Loss: 0.0677, Val Loss: 0.0854, Train Acc: 85.97%, Val Acc: 81.50%
Epoch 10/70, Train Loss: 0.0842, Val Loss: 0.1217, Train Acc: 82.97%, Val Acc: 76.26%
Epoch 11/70, Train Loss: 0.0728, Val Loss: 0.0637, Train Acc: 85.17%, Val Acc: 88.97%
Epoch 12/70, Train Loss: 0.0539, Val Loss: 0.0650, Train Acc: 89.36%, 

In [6]:
# NOTE(review): decode_predictions is also defined in the training cell above;
# consider keeping a single definition.
def decode_predictions(outputs):
    """Greedily decode a batch of per-time-step class scores into digit strings.

    At every time step the highest-scoring class is taken. Class 0 is treated
    as blank and dropped, immediate repeats of the same class are collapsed,
    and every remaining class ``k`` is written as the digit ``k - 1``.

    Args:
        outputs: tensor of shape (T, N, num_classes) holding raw scores (or
            any monotonic transform of them, such as log-probabilities).

    Returns:
        A list of N strings, one decoded digit sequence per batch element.
    """
    # argmax over the raw scores equals argmax over their softmax, so the
    # softmax is skipped; the whole batch is moved to the host in one call
    # instead of one transfer per sample.
    best_paths = outputs.argmax(2).t().tolist()  # N lists of T class indices
    decoded = []
    for path in best_paths:
        prev = -1
        digits = []
        for cls in path:
            if cls != 0 and cls != prev:
                digits.append(str(cls - 1))
            prev = cls
        decoded.append(''.join(digits))
    return decoded

# Exact-match accuracy on the test split, computed without gradient tracking.
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for images, targets in test_loader:
        predictions = decode_predictions(model(images.to(device)))  # scores: (T, N, num_classes)
        # Undo the +1 label shift so targets read as plain digit strings.
        target_labels = [''.join(map(str, (t - 1).tolist())) for t in targets]
        matches = [pred == target for pred, target in zip(predictions, target_labels)]
        correct += sum(matches)
        total += len(matches)
print(f"Test Accuracy: {correct/total*100:.2f}%")

Test Accuracy: 91.03%


In [8]:
def predict_register_number(model, image_path, device, transform):
    """Predict the digit string shown in a single image file.

    The image is loaded as grayscale, passed through ``transform``, and run
    through ``model`` as a batch of one. The scores are decoded greedily:
    the best class per time step is taken, class 0 (blank) is dropped,
    immediate repeats are collapsed, and class ``k`` is written as the digit
    ``k - 1``.

    Args:
        model: network returning scores of shape (T, 1, num_classes) for a
            batch of one image; it is switched to eval mode by this call.
        image_path: path of the image file to read.
        device: device on which the forward pass is run.
        transform: callable turning the grayscale PIL image into a tensor
            without a batch dimension.

    Returns:
        The decoded digit sequence as a string.
    """
    # The context manager releases the file handle deterministically;
    # convert() returns an independent image that outlives the handle.
    with Image.open(image_path) as img:
        image = transform(img.convert('L'))
    batch = image.unsqueeze(0)  # add the batch dimension, e.g. (1, 1, 32, 256)
    model.eval()
    with torch.no_grad():
        output = model(batch.to(device))  # (T, 1, num_classes), e.g. (64, 1, 11)
    # argmax over raw scores equals argmax over their softmax, so the softmax
    # is skipped.
    best_path = output.squeeze(1).argmax(1).tolist()  # T class indices
    prev = -1
    digits = []
    for cls in best_path:
        if cls != 0 and cls != prev:
            digits.append(str(cls - 1))
        prev = cls
    return ''.join(digits)

# Run the model on a single image; point image_path at your own file.
image_path = 'my_reg.png'
predicted_number = predict_register_number(
    model=model,
    image_path=image_path,
    device=device,
    transform=transform,
)
print(f"Predicted Register Number: {predicted_number}")

Predicted Register Number: 2122240066
