# Baseline CAPTCHA Recognition Model

This notebook implements a simple CNN baseline for CAPTCHA recognition. It uses a classic convolutional architecture to establish a reference level of performance against which our custom approach can be compared.

In [1]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import os
import numpy as np
from tqdm import tqdm
import string

## Dataset Configuration

Define the character set and dataset parameters for CAPTCHA recognition.

In [2]:
# Alphabet the model can emit: the ten digits followed by the 26 lowercase letters
CHARACTERS = ''.join((string.digits, string.ascii_lowercase))  # '0123456789abcdefghijklmnopqrstuvwxyz'
NUM_CLASSES = len(CHARACTERS)  # one class per symbol -> 36
MAX_LENGTH = 8  # number of character slots in the fixed-size label vector

# Lookup tables translating between a character and its class index
idx_to_char = dict(enumerate(CHARACTERS))
char_to_idx = {symbol: position for position, symbol in idx_to_char.items()}

# Data paths (relative to the notebook's working directory)
TRAIN_DIR = 'processed/train'
TEST_DIR = 'processed/test'

# Echo the configuration so it is visible in the notebook output
for _caption, _value in (
    ("Number of character classes", NUM_CLASSES),
    ("Character set", CHARACTERS),
    ("Maximum CAPTCHA length", MAX_LENGTH),
):
    print(f"{_caption}: {_value}")

Number of character classes: 36
Character set: 0123456789abcdefghijklmnopqrstuvwxyz
Maximum CAPTCHA length: 8


## Custom Dataset Class

Create a PyTorch Dataset for loading CAPTCHA images and labels.

In [3]:
class CaptchaDataset(Dataset):
    """CAPTCHA images stored as ``.png`` files whose file names encode the label.

    The label is the part of the file name before the first ``-`` (for example
    ``ab12-0.png`` -> ``ab12``), lower-cased.

    Args:
        data_dir: Directory that is scanned (non-recursively) for ``.png`` files.
        transform: Optional callable applied to the RGB ``PIL.Image`` before it
            is returned.
        max_length: Number of slots in the label vector. Longer labels are
            truncated in the label vector; shorter ones are padded with ``-1``.

    Each item is a tuple ``(image, label, label_str)``:
        * ``image``: the (optionally transformed) RGB image,
        * ``label``: ``torch.long`` tensor of length ``max_length`` holding
          class indices, ``-1`` in unused positions,
        * ``label_str``: the full, untruncated label text.
    """

    def __init__(self, data_dir, transform=None, max_length=MAX_LENGTH):
        self.data_dir = data_dir
        self.transform = transform
        self.max_length = max_length
        # os.listdir() returns entries in arbitrary order; sort so that a given
        # index always refers to the same file across runs and machines.
        self.image_files = sorted(f for f in os.listdir(data_dir) if f.endswith('.png'))

    def __len__(self):
        """Number of PNG files found in ``data_dir``."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Load sample ``idx``; raises ``KeyError`` if the label contains a
        character that is not in ``char_to_idx``."""
        img_name = self.image_files[idx]
        img_path = os.path.join(self.data_dir, img_name)

        # Image.open() is lazy and keeps the file open; convert() forces the
        # pixel data to be read, after which the handle can be released.
        with Image.open(img_path) as img:
            image = img.convert('RGB')

        # Extract label from filename (format: label-0.png). The extension is
        # stripped first so that a name without '-' (e.g. "ab12.png") does not
        # leak ".png" into the label.
        stem = os.path.splitext(img_name)[0]
        label_str = stem.split('-')[0].lower()

        # Convert to fixed-length label (pad with -1 for unused positions)
        label = [-1] * self.max_length
        for i, char in enumerate(label_str[:self.max_length]):
            try:
                label[i] = char_to_idx[char]
            except KeyError:
                raise KeyError(
                    f"Unsupported character {char!r} in label {label_str!r} (file {img_name!r})"
                ) from None

        if self.transform:
            image = self.transform(image)

        return image, torch.tensor(label, dtype=torch.long), label_str

# Preprocessing applied to every image before it reaches the network.
# The per-channel statistics below are the widely used ImageNet values.
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]

_preprocessing_steps = [
    transforms.Resize((64, 200)),  # fixed (height, width) expected by the model
    transforms.ToTensor(),         # PIL image -> float tensor
    transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
]
transform = transforms.Compose(_preprocessing_steps)

print("Dataset class created successfully")

Dataset class created successfully


## Simplified Baseline CNN Model

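Define a simplified CNN baseline using basic convolutional blocks. The backbone output is globally average-pooled into a single feature vector, and each character position has its own linear classifier on top of that shared vector. This baseline uses standard convolutions without advanced techniques such as depthwise separable convolutions or CTC loss, making it a reasonable comparison point.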

In [4]:
class SimpleResidualBlock(nn.Module):
    """Two 3x3 conv + batch-norm layers with a skip connection (no depthwise separable convolutions).

    The skip connection is the identity when the input and output shapes
    already match (``stride == 1`` and ``in_ch == out_ch``); otherwise it is a
    1x1 strided convolution + batch-norm that projects the input to the
    output shape.

    Previously the skip connection was only added in the identity case, so the
    projection shortcut was constructed but never used. It is now always
    applied. Parameter names are unchanged, but checkpoints trained before
    this fix contain untrained ``shortcut`` weights and should be retrained.

    Args:
        in_ch: Number of input channels.
        out_ch: Number of output channels.
        stride: Stride of the first 3x3 convolution (and of the projection).
    """
    def __init__(self, in_ch, out_ch, stride=1):
        super().__init__()
        # Retained so code that inspects this attribute keeps working; the
        # skip connection is now used for every configuration.
        self.use_residual = True

        self.conv = nn.Sequential(
            nn.Conv2d(in_ch, out_ch, 3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_ch, out_ch, 3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(out_ch),
        )

        if stride != 1 or in_ch != out_ch:
            # Projection shortcut: match both channel count and spatial size
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_ch, out_ch, 1, stride=stride, bias=False),
                nn.BatchNorm2d(out_ch)
            )
        else:
            # An empty Sequential returns its input unchanged (identity)
            self.shortcut = nn.Sequential()

    def forward(self, x):
        """Return ``relu(conv(x) + shortcut(x))``."""
        out = self.conv(x) + self.shortcut(x)
        return nn.functional.relu(out)


class BaselineCNN(nn.Module):
    """Baseline CNN that classifies every CAPTCHA position from one shared feature vector.

    A small convolutional backbone is followed by global average pooling; the
    resulting 256-dimensional vector is fed to ``max_length`` independent
    linear heads, one per character position.

    Args:
        num_classes: Number of output classes per position.
        max_length: Number of character positions (= number of heads).
        dropout: Dropout probability applied in front of every head.
    """
    def __init__(self, num_classes=NUM_CLASSES, max_length=MAX_LENGTH, dropout=0.3):
        super().__init__()
        self.max_length = max_length
        self.num_classes = num_classes

        # Stem: lift the 3 colour channels to 32 feature maps at full resolution
        stem_layers = [
            nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
        ]
        self.stem = nn.Sequential(*stem_layers)

        # Three stride-2 stages; for a 64x200 input the feature maps shrink to
        # 32x100, 16x50 and 8x25 while the channels grow 32 -> 64 -> 128 -> 256.
        channel_plan = (32, 64, 128, 256)
        stages = [
            SimpleResidualBlock(c_in, c_out, stride=2)
            for c_in, c_out in zip(channel_plan[:-1], channel_plan[1:])
        ]
        self.features = nn.Sequential(*stages)

        # Collapse the spatial dimensions to a single value per channel
        self.pool = nn.AdaptiveAvgPool2d((1, 1))

        # One classifier head per character position
        heads = []
        for _ in range(max_length):
            heads.append(
                nn.Sequential(
                    nn.Flatten(),
                    nn.Dropout(dropout),
                    nn.Linear(256, num_classes),
                )
            )
        self.classifiers = nn.ModuleList(heads)

    def forward(self, x):
        """Return a list with one ``[batch, num_classes]`` logit tensor per position."""
        pooled = self.pool(self.features(self.stem(x)))
        # Every head receives the same pooled features
        return [head(pooled) for head in self.classifiers]

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Build the baseline network and move its parameters to the selected device
model = BaselineCNN(num_classes=NUM_CLASSES, max_length=MAX_LENGTH)
model = model.to(device)

# Parameter statistics for the summary below
_total_params = sum(p.numel() for p in model.parameters())
_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Model initialized on device: {device}")
print(f"Total parameters: {_total_params:,}")
print(f"Trainable parameters: {_trainable_params:,}")
print(f"Architecture: Simple CNN with per-position classification")

Model initialized on device: cuda
Total parameters: 1,281,856
Trainable parameters: 1,281,856
Architecture: Simple CNN with per-position classification


## Training Setup

Initialize data loaders, loss function, and optimizer.

In [5]:
# Datasets: both splits share the same preprocessing pipeline
train_dataset = CaptchaDataset(TRAIN_DIR, transform=transform)
test_dataset = CaptchaDataset(TEST_DIR, transform=transform)

# Data loaders: only the training split is shuffled
BATCH_SIZE = 64
_loader_options = dict(batch_size=BATCH_SIZE, num_workers=0)
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **_loader_options)

# Loss: label positions holding -1 (padding) are excluded from the loss
criterion = nn.CrossEntropyLoss(ignore_index=-1)

# Optimizer and LR schedule: halve the learning rate when the monitored
# metric (mode='max', i.e. higher is better) has not improved for 3 epochs.
# NOTE(review): `verbose` appears to be deprecated in recent PyTorch releases -
# confirm against the installed version before upgrading.
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='max',
    factor=0.5,
    patience=3,
    verbose=True,
)

print(f"Training samples: {len(train_dataset)}")
print(f"Test samples: {len(test_dataset)}")
print(f"Batch size: {BATCH_SIZE}")
print(f"Loss function: CrossEntropyLoss (per-position)")

Training samples: 7836
Test samples: 2000
Batch size: 64
Loss function: CrossEntropyLoss (per-position)




## Training Function

Define the training loop for one epoch.

In [None]:
def train_epoch(model, train_loader, criterion, optimizer, device):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        model: Network returning a list of ``[batch, num_classes]`` logit
            tensors, one per character position.
        train_loader: Iterable yielding ``(images, labels, label_strs)``
            batches; ``labels`` is ``[batch, max_length]`` with ``-1`` marking
            padded positions.
        criterion: Per-position loss, called as ``criterion(logits, targets)``.
        optimizer: Optimizer updating ``model``'s parameters.
        device: Device the batches are moved to.

    Returns:
        ``(avg_loss, train_acc)``: mean batch loss and character-level accuracy
        in percent over all non-padded positions. Both are 0 when the loader
        yields no batches.
    """
    pad_value = -1  # padding marker written by the dataset

    model.train()
    total_loss = 0.0
    correct_chars = 0
    total_chars = 0
    batches_seen = 0

    progress_bar = tqdm(train_loader, desc='Training')

    for images, labels, _ in progress_bar:
        images = images.to(device)
        labels = labels.to(device)  # [batch, max_length]
        batches_seen += 1

        valid = labels != pad_value                # [batch, max_length]
        has_targets = valid.any(dim=0).tolist()    # per position: any real label?

        optimizer.zero_grad()

        # Forward pass
        outputs = model(images)  # List of [batch, num_classes] for each position

        # A mean-reduced loss over a position whose targets are ALL ignored is
        # 0/0 = NaN and would poison the whole update, so such positions are
        # skipped. The divisor stays len(outputs), which keeps the value
        # identical to the plain per-position average whenever every position
        # has at least one real label.
        position_losses = [
            criterion(outputs[pos], labels[:, pos])
            for pos in range(len(outputs))
            if has_targets[pos]
        ]
        if position_losses:
            loss = sum(position_losses) / len(outputs)

            # Backward pass
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Character-level accuracy over valid (non-padded) positions only
        with torch.no_grad():
            preds = torch.stack([out.argmax(dim=1) for out in outputs], dim=1)
            width = preds.shape[1]
            valid = valid[:, :width]
            correct_chars += ((preds == labels[:, :width]) & valid).sum().item()
            total_chars += valid.sum().item()

        char_acc = 100 * correct_chars / total_chars if total_chars > 0 else 0

        # Use our own batch counter: tqdm refreshes its `.n` attribute lazily,
        # so it cannot be relied on as "batches processed so far".
        progress_bar.set_postfix({
            'loss': total_loss / batches_seen,
            'char_acc': f'{char_acc:.2f}%'
        })

    avg_loss = total_loss / batches_seen if batches_seen > 0 else 0.0
    train_acc = 100 * correct_chars / total_chars if total_chars > 0 else 0

    return avg_loss, train_acc

print("Training function defined")

Training function defined


## Evaluation Function

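Define `evaluate`, which reports the loss together with character-level and sequence-level accuracy on a data loader, and `predict_captcha`, which returns the predicted text for a single image as a string.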

In [None]:
def evaluate(model, test_loader, criterion, device):
    """Evaluate ``model`` on ``test_loader`` without updating it.

    A prediction is compared against the ground truth over the first
    ``len(true_str)`` positions only; the model has no way to signal the end of
    a sequence, so the true length is used to cut the prediction.

    Args:
        model: Network returning a list of ``[batch, num_classes]`` logit
            tensors, one per character position.
        test_loader: Iterable yielding ``(images, labels, label_strs)`` batches;
            ``labels`` is ``[batch, max_length]`` with ``-1`` marking padding.
        criterion: Per-position loss, called as ``criterion(logits, targets)``.
        device: Device the batches are moved to.

    Returns:
        ``(avg_loss, char_accuracy, seq_accuracy)``; accuracies are in percent.
        All three are 0 when the loader yields no batches.
    """
    pad_value = -1  # padding marker written by the dataset

    model.eval()
    total_loss = 0.0
    correct_sequences = 0
    total_sequences = 0
    correct_chars = 0
    total_chars = 0
    batches_seen = 0

    with torch.no_grad():
        progress_bar = tqdm(test_loader, desc='Evaluating')

        for images, labels, label_strs in progress_bar:
            images = images.to(device)
            labels = labels.to(device)
            batches_seen += 1

            # Forward pass
            outputs = model(images)

            # Skip positions whose targets are all padding: a mean-reduced loss
            # over zero valid targets is NaN. Dividing by len(outputs) keeps the
            # value equal to the plain per-position average in the normal case.
            has_targets = (labels != pad_value).any(dim=0).tolist()
            position_losses = [
                criterion(outputs[pos], labels[:, pos])
                for pos in range(len(outputs))
                if has_targets[pos]
            ]
            if position_losses:
                total_loss += (sum(position_losses) / len(outputs)).item()

            # Predicted class per position, moved to plain Python lists in one
            # transfer instead of one device sync per character.
            pred_rows = torch.stack(
                [out.argmax(dim=1) for out in outputs], dim=1
            ).tolist()  # [batch][max_length]

            for pred_row, true_str in zip(pred_rows, label_strs):
                # Slicing clamps to the number of heads, so a label longer than
                # max_length cannot index past the predictions (it simply can
                # never count as a fully correct sequence).
                pred_str = ''.join(idx_to_char[idx] for idx in pred_row[:len(true_str)])

                # Sequence accuracy
                if pred_str == true_str:
                    correct_sequences += 1

                # Character-level accuracy
                correct_chars += sum(p == t for p, t in zip(pred_str, true_str))
                total_chars += len(true_str)

            total_sequences += len(label_strs)

            seq_acc = 100 * correct_sequences / total_sequences if total_sequences > 0 else 0
            char_acc = 100 * correct_chars / total_chars if total_chars > 0 else 0

            # Own batch counter: tqdm refreshes its `.n` attribute lazily
            progress_bar.set_postfix({
                'loss': total_loss / batches_seen,
                'char_acc': f'{char_acc:.2f}%',
                'seq_acc': f'{seq_acc:.2f}%'
            })

    avg_loss = total_loss / batches_seen if batches_seen > 0 else 0.0
    char_accuracy = 100 * correct_chars / total_chars if total_chars > 0 else 0
    seq_accuracy = 100 * correct_sequences / total_sequences if total_sequences > 0 else 0

    return avg_loss, char_accuracy, seq_accuracy

def predict_captcha(model, image, device, length=None):
    """Predict the CAPTCHA text for a single image and return it as a string.

    The model emits one character per classifier head and has no
    end-of-sequence symbol, so by default the result always contains one
    character per head. Pass ``length`` to keep only the leading characters
    when the expected text length is known.

    Args:
        model: Network returning a list of ``[batch, num_classes]`` logit
            tensors, one per character position. It is switched to eval mode.
        image: A single, already preprocessed image tensor without a batch
            dimension; a batch dimension of size 1 is added here.
        device: Device the image is moved to.
        length: Optional number of leading characters to return. ``None``
            (default) returns all positions; negative values are treated as 0.

    Returns:
        The predicted text.
    """
    model.eval()
    with torch.no_grad():
        batch = image.unsqueeze(0).to(device)
        outputs = model(batch)
        # Most likely class index for each position
        indices = [out.argmax(dim=1).item() for out in outputs]

    if length is not None:
        indices = indices[:max(0, length)]

    return ''.join(idx_to_char[idx] for idx in indices)

print("Evaluation functions defined")

Evaluation functions defined


## Training Loop

Train the baseline model for multiple epochs.

In [None]:
# Training configuration
NUM_EPOCHS = 20

# Per-epoch metrics, appended to after every epoch
history = {
    'train_loss': [],
    'train_char_acc': [],
    'test_loss': [],
    'test_char_acc': [],
    'test_seq_acc': []
}

print("Starting training...")
best_seq_acc = 0
# Whether a checkpoint has been written during THIS run. Without it, a model
# whose sequence accuracy never rises above 0 would never be saved, and the
# next cell would either fail to find the file or load a stale one.
checkpoint_saved = False

# NOTE: the test split is used both for LR scheduling and for choosing the
# best checkpoint, so the reported "best" test accuracy is optimistic.
for epoch in range(NUM_EPOCHS):
    print(f"\nEpoch {epoch+1}/{NUM_EPOCHS}")
    print("-" * 50)

    # Train
    train_loss, train_char_acc = train_epoch(model, train_loader, criterion, optimizer, device)
    history['train_loss'].append(train_loss)
    history['train_char_acc'].append(train_char_acc)

    # Evaluate
    test_loss, test_char_acc, test_seq_acc = evaluate(model, test_loader, criterion, device)
    history['test_loss'].append(test_loss)
    history['test_char_acc'].append(test_char_acc)
    history['test_seq_acc'].append(test_seq_acc)

    print(f"\nTrain Loss: {train_loss:.4f} | Train Char Acc: {train_char_acc:.2f}%")
    print(f"Test Loss: {test_loss:.4f} | Test Char Acc: {test_char_acc:.2f}% | Test Seq Acc: {test_seq_acc:.2f}%")

    # Learning rate scheduling based on sequence accuracy
    scheduler.step(test_seq_acc)

    # Save best model; the first epoch always writes a checkpoint
    if not checkpoint_saved or test_seq_acc > best_seq_acc:
        best_seq_acc = test_seq_acc
        torch.save(model.state_dict(), 'baseline_best_model.pth')
        checkpoint_saved = True
        print(f"✓ Best model saved with sequence accuracy: {best_seq_acc:.2f}%")

print("\n" + "=" * 50)
print("Training completed!")
print(f"Best Test Sequence Accuracy: {best_seq_acc:.2f}%")

Starting training...

Epoch 1/15
--------------------------------------------------


Training:   0%|          | 0/123 [00:00<?, ?it/s]



ValueError: not enough values to unpack (expected 4, got 3)

## Test Predictions

Test the model on sample images and display predictions as strings.

In [None]:
# Load best model. map_location lets a checkpoint saved on a GPU be restored
# on whatever device this session is using.
model.load_state_dict(torch.load('baseline_best_model.pth', map_location=device))
model.eval()

# Test on a few samples
import matplotlib.pyplot as plt

# Get some test samples (fewer if the test set is smaller than 10)
num_samples = min(10, len(test_dataset))
test_images = []
test_labels = []
for i in range(num_samples):
    img, _, label_str = test_dataset[i]
    test_images.append(img)
    test_labels.append(label_str)

# Make predictions
print("Sample Predictions:")
print("=" * 60)
correct_count = 0
for i, (img, true_label) in enumerate(zip(test_images, test_labels)):
    # The model always emits one character per position; keep only the first
    # len(true_label) characters, which is the same rule evaluate() applies.
    predicted_text = predict_captcha(model, img, device)[:len(true_label)]
    is_correct = predicted_text == true_label
    match = "✓" if is_correct else "✗"
    if is_correct:
        correct_count += 1
    print(f"{match} Sample {i+1}: True: '{true_label}' | Predicted: '{predicted_text}'")

sample_accuracy = 100 * correct_count / len(test_images) if test_images else 0.0

print("\n" + "=" * 60)
print(f"Accuracy on samples: {sample_accuracy:.1f}%")

## Results Summary

Display the training history and final performance metrics.

In [None]:
# Plot training history and print the final metrics.
# If no epoch has completed, `history` is empty and max() / [-1] below would
# raise, so report that instead of crashing.
if not history['test_seq_acc']:
    print("No training history recorded - run the training loop before this cell.")
else:
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # (axis, y-label, title, [(history key, plot style), ...]) for each panel
    panels = [
        (axes[0, 0], 'Loss', 'Training and Test Loss', [
            ('train_loss', dict(label='Train Loss', marker='o')),
            ('test_loss', dict(label='Test Loss', marker='s')),
        ]),
        (axes[0, 1], 'Accuracy (%)', 'Character-Level Accuracy', [
            ('train_char_acc', dict(label='Train Char Acc', marker='o', color='blue')),
            ('test_char_acc', dict(label='Test Char Acc', marker='s', color='orange')),
        ]),
        (axes[1, 0], 'Accuracy (%)', 'Sequence-Level Accuracy', [
            ('test_seq_acc', dict(label='Test Seq Acc', marker='s', color='green')),
        ]),
    ]

    for ax, y_label, title, series in panels:
        for key, style in series:
            ax.plot(history[key], **style)
        ax.set_xlabel('Epoch')
        ax.set_ylabel(y_label)
        ax.set_title(title)
        ax.legend()
        ax.grid(True)

    # Hide empty subplot
    axes[1, 1].axis('off')

    plt.tight_layout()
    plt.savefig('baseline_training_history.png', dpi=150, bbox_inches='tight')
    plt.show()

    # Print final results
    print("\n" + "=" * 60)
    print("SIMPLIFIED BASELINE CNN MODEL FINAL RESULTS")
    print("=" * 60)
    print(f"Best Test Character Accuracy: {max(history['test_char_acc']):.2f}%")
    print(f"Best Test Sequence Accuracy: {max(history['test_seq_acc']):.2f}%")
    print(f"Final Test Character Accuracy: {history['test_char_acc'][-1]:.2f}%")
    print(f"Final Test Sequence Accuracy: {history['test_seq_acc'][-1]:.2f}%")
    print("=" * 60)
    print(f"\nModel parameters: {sum(p.numel() for p in model.parameters()):,}")
    print("This simplified baseline uses standard convolutions with per-position")
    print("classification, serving as a reasonable comparison point.")