In [1]:
import os
import json
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from torchvision import transforms
import matplotlib.pyplot as plt

class CaptchaDatasetRGB(Dataset):
    """Map-style dataset of CAPTCHA PNG images loaded as RGB, with optional labels.

    ``data_dir`` must contain an ``images`` sub-folder holding ``.png`` files.
    If ``data_dir/labels.json`` exists it is read as a list of records, each
    carrying an ``image_id`` key that matches an image file name without its
    extension. Images without a label record still load; they simply get an
    empty captcha string and an empty annotation list.
    """

    def __init__(self, data_dir, transform=None):
        """
        Args:
            data_dir (string): Direct path to the specific data folder (train, val, or test)
                               e.g., '/path/to/UTN-CV25-Captcha-Dataset/part2/train'
            transform (callable, optional): Optional transform to be applied on images

        Raises:
            FileNotFoundError: If ``data_dir/images`` does not exist (raised by
                ``os.listdir``).
            KeyError: If a record in ``labels.json`` has no ``image_id`` key.
        """
        self.data_dir = data_dir
        self.images_dir = os.path.join(self.data_dir, 'images')
        self.transform = transform
        # Sorted so the index -> file mapping is stable between runs.
        self.image_list = sorted(f for f in os.listdir(self.images_dir) if f.endswith('.png'))

        # Load labels if available
        self.labels_dict = {}
        labels_file = os.path.join(self.data_dir, 'labels.json')
        if os.path.exists(labels_file):
            # JSON is UTF-8 by specification; do not depend on the locale default.
            with open(labels_file, 'r', encoding='utf-8') as f:
                labels = json.load(f)
            # Index the records by image_id for O(1) lookup in __getitem__.
            self.labels_dict = {item['image_id']: item for item in labels}

    def __len__(self):
        """Return the number of ``.png`` files found in the images folder."""
        return len(self.image_list)

    def __getitem__(self, idx):
        """Load one sample.

        Args:
            idx (int or Tensor): Position in the sorted image list. A tensor
                index is converted with ``tolist()`` first.

        Returns:
            dict: ``image`` (the RGB image, after ``transform`` if one was given),
            ``image_id`` (file name without extension), ``captcha_string``
            (``''`` when unlabeled) and ``annotations`` (``[]`` when unlabeled).
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_name = self.image_list[idx]
        img_path = os.path.join(self.images_dir, img_name)

        # Open inside a context manager so the file handle is released
        # deterministically; convert() returns an independent, fully loaded image.
        with Image.open(img_path) as img_file:
            image = img_file.convert('RGB')

        # Get image_id without extension
        image_id = os.path.splitext(img_name)[0]

        # Missing label records fall back to empty defaults (e.g. unlabeled test split).
        label_info = self.labels_dict.get(image_id, {})
        captcha_string = label_info.get('captcha_string', '')
        annotations = label_info.get('annotations', [])

        # Apply transforms if specified
        if self.transform:
            image = self.transform(image)

        return {
            'image': image,
            'image_id': image_id,
            'captcha_string': captcha_string,
            'annotations': annotations,
        }

# Custom collate function for RGB images
def custom_collate_fn_rgb(batch):
    """Collate dataset samples into a batch without padding the annotations.

    Args:
        batch (list[dict]): Samples with the keys ``image``, ``image_id``,
            ``captcha_string`` and ``annotations``.

    Returns:
        dict: ``image`` is the per-sample image tensors stacked along a new
        leading dimension; the other three keys map to plain Python lists with
        one entry per sample (``annotations`` stays a list of lists).
    """
    # torch.stack needs every image tensor in the batch to have the same shape.
    stacked_images = torch.stack([sample['image'] for sample in batch])

    # These fields are variable-length or non-tensor, so they are only gathered.
    passthrough_keys = ('image_id', 'captcha_string', 'annotations')
    gathered = {key: [sample[key] for sample in batch] for key in passthrough_keys}

    return {'image': stacked_images, **gathered}

# Helper function to create RGB dataloaders
def get_dataloader_rgb(data_folder, batch_size=32, shuffle=True, num_workers=0):
    """Build a DataLoader over one split of the RGB CAPTCHA dataset.

    Images are converted to tensors and normalized per channel with mean 0.5
    and std 0.5, which maps the [0, 1] range produced by ``ToTensor`` to
    [-1, 1]. No resizing is applied, and the collate function stacks the image
    tensors, so every image within a batch must have the same size.

    Args:
        data_folder (string): Direct path to specific data folder
                            e.g. '/path/to/UTN-CV25-Captcha-Dataset/part2/train'
        batch_size (int): Batch size for the dataloader
        shuffle (bool): Whether to shuffle the data
        num_workers (int): Number of loader worker processes. Defaults to 0
            (load in the main process), which is the previous fixed behavior.
            NOTE(review): values > 0 require the dataset and collate function
            to be picklable on spawn-based platforms - confirm before raising
            it inside a notebook.

    Returns:
        DataLoader: Yields dict batches produced by ``custom_collate_fn_rgb``.
    """
    # Define transformations for RGB images
    transform = transforms.Compose([
        # transforms.Resize((160, 640)),  # Keep original size
        transforms.ToTensor(),         # Convert to tensor [0, 1]
        transforms.Normalize(
            mean=[0.5, 0.5, 0.5],     # RGB channels normalization to [-1, 1]
            std=[0.5, 0.5, 0.5]
        )
    ])

    dataset = CaptchaDatasetRGB(
        data_dir=data_folder,
        transform=transform
    )

    # The custom collate function keeps variable-length annotations as lists.
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        collate_fn=custom_collate_fn_rgb
    )

# Example usage and visualization
def visualize_rgb_sample(batch):
    """Show the first image of a batch with its captcha string as the title.

    Args:
        batch (dict): Batch with an ``image`` tensor laid out as [B, C, H, W]
            and normalized with mean 0.5 / std 0.5, plus a ``captcha_string``
            list with one entry per image.

    Side effects:
        Opens a matplotlib figure and prints the batch shape and value range.
    """
    # Get first image in batch; detach and move to CPU so tensors that track
    # gradients or live on an accelerator can still be plotted.
    img = batch['image'][0].detach().cpu()

    # Denormalize from [-1, 1] to [0, 1]; clamp away float round-off so
    # matplotlib does not have to clip out-of-range values.
    img = (img * 0.5 + 0.5).clamp(0.0, 1.0)

    # Convert from [C, H, W] to [H, W, C] for matplotlib
    img = img.permute(1, 2, 0).numpy()

    plt.figure(figsize=(12, 4))
    plt.imshow(img)
    plt.title(f"RGB CAPTCHA: {batch['captcha_string'][0]}")
    plt.axis('off')
    plt.show()

    # For the unresized dataset images this is expected to be [batch_size, 3, 160, 640].
    print(f"Image shape: {batch['image'].shape}")
    print(f"Image value range: [{batch['image'].min():.3f}, {batch['image'].max():.3f}]")

In [2]:
import os

# Base path to the dataset. The default is the original author's checkout; set
# the CAPTCHA_DATA_DIR environment variable to point at the 'part2' folder on
# any other machine instead of editing this cell.
base_path = os.environ.get(
    'CAPTCHA_DATA_DIR',
    '/home/utn/omul36yx/git/UTN-CAPTCHASOLVER/UTN-CV25-Captcha-Dataset/part2',
)

# Fail early with an actionable message rather than deep inside os.listdir.
if not os.path.isdir(base_path):
    raise FileNotFoundError(
        f"Dataset root not found: {base_path!r}. "
        "Set the CAPTCHA_DATA_DIR environment variable to the 'part2' folder of the dataset."
    )

# Create RGB dataloaders for each split (only the training split is shuffled)
train_loader_rgb = get_dataloader_rgb(os.path.join(base_path, 'train'), batch_size=32, shuffle=True)
val_loader_rgb = get_dataloader_rgb(os.path.join(base_path, 'val'), batch_size=32, shuffle=False)
test_loader_rgb = get_dataloader_rgb(os.path.join(base_path, 'test'), batch_size=32, shuffle=False)

# Print dataset sizes
print(f"RGB Training samples: {len(train_loader_rgb.dataset)}")
print(f"RGB Validation samples: {len(val_loader_rgb.dataset)}")
print(f"RGB Test samples: {len(test_loader_rgb.dataset)}")

# Display a sample RGB image (first training batch only)
for batch in train_loader_rgb:
    visualize_rgb_sample(batch)
    print(f"Number of annotations for first image: {len(batch['annotations'][0])}")
    break

FileNotFoundError: [Errno 2] No such file or directory: '/home/utn/omul36yx/git/UTN-CAPTCHASOLVER/UTN-CV25-Captcha-Dataset/part2/train/images'

ResNet backbone from Medium

In [None]:
### ResNet 18 Backbone

import torch
import torch.nn as nn
import collections

# Module-level stem: 7x7 stride-2 convolution followed by a 3x3 stride-2
# max-pool. This version has no BatchNorm/ReLU between the two.
stem_sequence = nn.Sequential(collections.OrderedDict([
    ('conv1', nn.Conv2d(3, 64, kernel_size=7, padding=3, stride=2)),
    ('pool1', nn.MaxPool2d(kernel_size=3, stride=2, padding=1)),
]))


class Stem(nn.Module):
    """Thin wrapper that runs the module-level ``stem_sequence``.

    The constructor arguments are accepted for signature compatibility but are
    not used: the layers are fixed by ``stem_sequence`` (3 -> 64 channels).
    Every ``Stem`` instance references the same ``stem_sequence`` object, so
    all instances share one set of convolution weights.
    """

    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super(Stem, self).__init__()
        self.model = stem_sequence

    def forward(self, x):
        """Apply the stem to ``x`` ([B, 3, H, W]); each spatial side shrinks ~4x."""
        # Without this method nn.Module.forward raises NotImplementedError.
        return self.model(x)

class ResidualBlock(nn.Module):
    """Basic two-convolution residual block (3x3 conv -> BN -> ReLU -> 3x3 conv -> BN).

    Args:
        in_channels (int): Channels of the input feature map.
        out_channels (int): Channels produced by both convolutions.
        stride (int): Stride of the first convolution; values > 1 shrink the
            spatial size.
        downsample (nn.Module, optional): Projection applied to the input on
            the skip path. It is needed whenever ``stride != 1`` or
            ``in_channels != out_channels``, because the skip tensor must match
            the main branch's shape before the two are added.
    """

    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        # Store the projection so forward() can use it; previously the argument
        # was dropped and shape-changing blocks failed at the addition.
        self.downsample = downsample

    def forward(self, x):
        """Return ``relu(main_branch(x) + skip(x))``."""
        skip = x
        # first basic block
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        # second basic block
        out = self.conv2(out)
        out = self.bn2(out)

        # Project the input when the main branch changed its shape.
        if self.downsample is not None:
            skip = self.downsample(x)

        out += skip

        out = self.relu(out)

        return out

class ResNet18BackBone(nn.Module):
    """ResNet-18 style feature extractor assembled from ``ResidualBlock`` stages.

    ``forward`` stops after ``layer4`` and returns that feature map. The
    ``avgpool`` and ``fc`` modules are still constructed, so the ``fc``
    parameters exist on the model, but ``forward`` never calls them.

    Args:
        num_classes (int): Output size of the (unused) ``fc`` layer.
    """

    def __init__(self, num_classes=1000):
        super(ResNet18BackBone, self).__init__()
        # Running channel count; _make_layer reads it and then updates it.
        self.in_channels = 64

        # Stem: 7x7 stride-2 conv -> BN -> ReLU -> 3x3 stride-2 max-pool.
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Four stages of two blocks each; stages 2-4 halve the resolution.
        self.layer1 = self._make_layer(64, 2)
        self.layer2 = self._make_layer(128, 2, stride=2)
        self.layer3 = self._make_layer(256, 2, stride=2)
        self.layer4 = self._make_layer(512, 2, stride=2)

        # Classification head, kept but not used by forward().
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512, num_classes)

        # No explicit weight initialization is done here (PyTorch defaults apply).

    def _make_layer(self, out_channels, blocks, stride=1):
        """Create one stage of ``blocks`` residual blocks ending at ``out_channels``.

        Only the first block may change stride/width; it receives a 1x1 conv +
        BN projection for the skip path when it does.
        """
        shape_changes = stride != 1 or self.in_channels != out_channels
        projection = None
        if shape_changes:
            projection = nn.Sequential(
                nn.Conv2d(self.in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels)
            )

        first_block = ResidualBlock(self.in_channels, out_channels, stride, projection)
        self.in_channels = out_channels
        remaining = [ResidualBlock(out_channels, out_channels) for _ in range(1, blocks)]
        return nn.Sequential(first_block, *remaining)

    def forward(self, x):
        """Run the stem and the four stages; return the ``layer4`` feature map."""
        pipeline = (
            self.conv1, self.bn1, self.relu, self.maxpool,
            self.layer1, self.layer2, self.layer3, self.layer4,
        )
        for stage in pipeline:
            x = stage(x)

        # avgpool / flatten / fc are intentionally skipped: the spatial map is
        # what gets passed on (the original note mentions feeding it into YOLO).
        return x  # Final Output is a 20x5 feature map (per the original note)

Improved ResNet-18 with a weight initialization function

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# ---------- Stem ----------
class Stem(nn.Module):
    """ResNet input stem: 7x7 stride-2 conv, BatchNorm, ReLU, 3x3 stride-2 max-pool.

    Args:
        in_ch (int): Channels of the input image.
        out_ch (int): Channels produced by the convolution.
    """

    def __init__(self, in_ch=3, out_ch=64):
        super().__init__()
        self.conv = nn.Conv2d(in_ch, out_ch, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn = nn.BatchNorm2d(out_ch)
        self.relu = nn.ReLU(inplace=True)
        self.pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # He (Kaiming) init for the conv; BatchNorm starts as an identity affine.
        nn.init.kaiming_normal_(self.conv.weight, mode='fan_out', nonlinearity='relu')
        nn.init.ones_(self.bn.weight)
        nn.init.zeros_(self.bn.bias)

    def forward(self, x):
        """Apply conv -> bn -> relu -> pool; e.g. a 640x160 input becomes 160x40."""
        return self.pool(self.relu(self.bn(self.conv(x))))


# ---------- Basic Residual Block ----------
class ResidualBlock(nn.Module):
    """Basic ResNet block: two 3x3 conv+BN layers with an additive skip path.

    Args:
        in_ch (int): Input channels.
        out_ch (int): Output channels of both convolutions.
        stride (int): Stride of the first convolution.
        downsample (nn.Module, optional): Projection for the skip path
            (1x1 conv + BN) used when the block changes the tensor shape.
    """

    # Channel multiplier of the block output relative to ``out_ch``.
    expansion = 1

    def __init__(self, in_ch, out_ch, stride=1, downsample=None):
        super().__init__()
        self.conv1 = nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_ch)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_ch, out_ch, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_ch)
        self.downsample = downsample

        # He init for both convolutions, identity affine for both BatchNorms.
        nn.init.kaiming_normal_(self.conv1.weight, mode='fan_out', nonlinearity='relu')
        nn.init.kaiming_normal_(self.conv2.weight, mode='fan_out', nonlinearity='relu')
        for norm in (self.bn1, self.bn2):
            nn.init.ones_(norm.weight)
            nn.init.zeros_(norm.bias)

    def forward(self, x):
        """Return ``relu(bn2(conv2(relu(bn1(conv1(x))))) + skip(x))``."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        # The skip path is the raw input unless a projection was supplied.
        identity = x if self.downsample is None else self.downsample(x)

        out += identity
        return self.relu(out)


# ---------- ResNet-18 Backbone ----------
class ResNet18Backbone(nn.Module):
    """
    ResNet-18 feature extractor without global pooling or a classifier.

    Spatial sizes for a 640x160 (width x height) input:
      stem   -> 160x40
      layer1 -> 160x40
      layer2 ->  80x20
      layer3 ->  40x10  (256 channels)
      layer4 ->  20x5   (512 channels)  <-- feed this to your YOLO head

    Args:
        in_ch (int): Channels of the input image.
        return_p3 (bool): When True, ``forward`` returns ``(p4, p3)`` so the
            40x10 ``layer3`` map is available for a higher-resolution head;
            otherwise only the ``layer4`` map is returned.
    """

    def __init__(self, in_ch=3, return_p3=False):
        super().__init__()
        self.stem = Stem(in_ch=in_ch, out_ch=64)
        # Running channel count; _make_layer reads it and then updates it.
        self.in_ch = 64

        # (output channels, stride of the first block) for layer1..layer4.
        stage_specs = ((64, 1), (128, 2), (256, 2), (512, 2))
        for number, (width, first_stride) in enumerate(stage_specs, start=1):
            stage = self._make_layer(ResidualBlock, width, blocks=2, stride=first_stride)
            setattr(self, f'layer{number}', stage)

        self.return_p3 = return_p3

    def _make_layer(self, block, out_ch, blocks, stride):
        """Create one stage: ``blocks`` instances of ``block`` ending at ``out_ch``.

        Only the first block may change stride/width; it then gets a 1x1 conv +
        BN projection for its skip path.
        """
        projection = None
        if stride != 1 or self.in_ch != out_ch:
            projection = nn.Sequential(
                nn.Conv2d(self.in_ch, out_ch, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_ch)
            )

        head = block(self.in_ch, out_ch, stride=stride, downsample=projection)
        self.in_ch = out_ch
        tail = [block(out_ch, out_ch, stride=1, downsample=None) for _ in range(1, blocks)]
        return nn.Sequential(head, *tail)

    def forward(self, x):
        """Return the ``layer4`` map, or ``(layer4, layer3)`` if ``return_p3`` is set."""
        features = self.layer2(self.layer1(self.stem(x)))  # -> 80x20
        p3 = self.layer3(features)                         # -> 40x10
        p4 = self.layer4(p3)                               # -> 20x5
        if self.return_p3:
            return (p4, p3)
        return p4


# --- Weight Init with He Init method

def init_weights_kaiming(m):
    """He-initialize a single module in place; intended for ``model.apply``.

    Conv2d / Linear: weights drawn with ``kaiming_normal_`` (fan_out, ReLU
    gain) and the bias, if present, set to zero.
    BatchNorm2d: weight set to one and bias set to zero.
    Any other module type is left untouched.
    """
    if isinstance(m, nn.BatchNorm2d):
        nn.init.ones_(m.weight)
        nn.init.zeros_(m.bias)
        return

    if not isinstance(m, (nn.Conv2d, nn.Linear)):
        return

    nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
    if m.bias is not None:
        nn.init.zeros_(m.bias)


# ---- quick sanity check ----
if __name__ == "__main__":
    model = ResNet18Backbone(in_ch=3, return_p3=True)
    # Initialize before the forward pass so the check runs on the weights that
    # will actually be used, including the 1x1 projection convs that the block
    # constructors do not initialize themselves.
    model.apply(init_weights_kaiming)

    x = torch.randn(1, 3, 160, 640)  # (batch, channels, height=160, width=640)
    # Shape check only, so no autograd graph is needed.
    with torch.no_grad():
        p4, p3 = model(x)
    print(p4.shape, p3.shape)  # expect: torch.Size([1, 512, 5, 20]) and [1, 256, 10, 40]


torch.Size([1, 512, 5, 20]) torch.Size([1, 256, 10, 40])
