In [1]:
import os
import csv

from PIL import Image, ImageOps
import random

import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

from torchvision import transforms
import torchvision.transforms.functional as TF
import torchvision.models as models

import numpy as np

from scipy.ndimage import gaussian_filter

from blf_torch import BilateralFilter, DEVICE


In [2]:
def _load_rows(csv_path):
    """Read a metadata CSV into a list of dicts, casting ``label_num`` to int.

    Args:
        csv_path: Path to a CSV file whose header row includes a
            ``label_num`` column.

    Returns:
        One dict per data row, keyed by the header names. Every value is a
        string except ``label_num``, which is converted to ``int``.
    """
    # newline='' lets the csv module do its own line-ending handling, as the
    # csv documentation asks for any file object handed to a reader.
    with open(csv_path, mode='r', encoding='utf-8', newline='') as file:
        return [{**row, 'label_num': int(row['label_num'])} for row in csv.DictReader(file)]


# Announce the work before it starts so the message brackets both loads.
print('loading image metadata...', end='')
train = _load_rows('train_dataset.csv')
test = _load_rows('test_dataset.csv')

# These datasets were pre-shuffled, so we should be able to take a small sample for testing out algorithms.
mini_length = 100
train_mini = train[:mini_length]

print(' done')

loading image metadata... done


In [3]:
# https://clamm.irht.cnrs.fr/icdar-2017/data-set/

# Label number (as a string, '1'..'12') -> script name.  The names are listed
# in label order, so their 1-based position is the label number.
script_conversion = {
    str(number): name
    for number, name in enumerate(
        (
            "Caroline",
            "Cursiva",
            "Half Uncial",
            "Humanistic",
            "Humanistic Cursive",
            "Hybrida",
            "Praegothica",
            "Semihybrida",
            "Semitextualis",
            "Southern Textualis",
            "Textualis",
            "Uncial",
        ),
        start=1,
    )
}

# Script name -> label number (as an int); the inverse of script_conversion.
reverse_script_conversion = {name: int(number) for number, name in script_conversion.items()}

def convert_to_script(label_number):
    """Return the script name for a label number.

    Args:
        label_number: The label as an int or a string (e.g. ``11`` or
            ``'11'``); it is stringified before the lookup.

    Returns:
        The matching name from ``script_conversion``, or ``"Unknown"`` when
        the number has no entry.
    """
    key = str(label_number)
    if key in script_conversion:
        return script_conversion[key]
    return "Unknown"

def clean_label(label):
    """Normalize a raw label string.

    Underscores become spaces, the text is title-cased, and leading/trailing
    whitespace is removed (in that order).
    """
    spaced = " ".join(label.split("_"))
    titled = spaced.title()
    return titled.strip()

def convert_to_number(script_name):
    """Return the label number for a script name.

    The name is first normalized with ``clean_label`` (so e.g.
    ``'southern_textualis'`` matches ``'Southern Textualis'``).

    Returns:
        The int label from ``reverse_script_conversion``, or ``-1`` when the
        normalized name is not a known script.
    """
    normalized = clean_label(script_name)
    if normalized in reverse_script_conversion:
        return reverse_script_conversion[normalized]
    return -1

def build_dataset(csv_path, image_folder, icdar=0):
    """Build a dataset index by joining a label CSV with the files in a folder.

    Args:
        csv_path: Path to a ';'-delimited CSV whose first row is a header.
            Column ``0 + icdar`` holds the image file name and column
            ``1 + icdar`` holds the label, given either as a label number
            or as a script name.
        image_folder: Directory whose files (non-recursive) are matched
            against the file names from the CSV.
        icdar: Column offset applied to both the file-name and label columns.

    Returns:
        Dict mapping image file name to a dict with keys ``'filepath'``,
        ``'label'`` (script name) and ``'label_num'`` (int label).  Only files
        that exist in ``image_folder`` AND have a CSV row are included.

    Raises:
        KeyError: If a matched image carries a label that is not a known
            script name.
    """
    name_col = icdar
    label_col = 1 + icdar

    # Step 1: read the CSV into {file name: script name}.
    label_dict = {}
    # newline='' is what the csv module documentation requires for readers.
    with open(csv_path, mode='r', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=';')
        next(reader, None)  # skip the header; an empty file is not an error
        for row in reader:
            if len(row) <= label_col:
                # Blank or truncated line: nothing usable to index.
                continue
            # Normalize first, then translate a numeric label to its name.
            # Names (and unknown values) pass through the lookup unchanged.
            label = clean_label(str(row[label_col]))
            label_dict[row[name_col]] = script_conversion.get(label, label)

    # Step 2: keep only the files that have a label, with their full path.
    dataset = {}
    with os.scandir(image_folder) as entries:
        for entry in entries:
            if not entry.is_file() or entry.name not in label_dict:
                continue
            label = label_dict[entry.name]
            if label not in reverse_script_conversion:
                # Same exception type as a plain dict lookup, but say which
                # image and label caused it.
                raise KeyError(
                    f"Unrecognized script label {label!r} for image "
                    f"{entry.name!r} (from {csv_path})"
                )
            dataset[entry.name] = {
                'filepath': os.path.join(image_folder, entry.name),
                'label': label,
                'label_num': reverse_script_conversion[label],
            }

    print(f"Dataset created with {len(dataset)} items.")
    return dataset

In [4]:

class AddGaussianNoise:
    """Transform that adds Gaussian noise to a PIL image or a tensor.

    The noise is ``randn * std + mean``, added to the image and clamped to
    [0, 1].  PIL input is converted to a tensor for the computation and
    converted back, so the return type matches the input type.
    """

    def __init__(self, mean=0.0, std=0.1):
        self.mean, self.std = mean, std

    def __call__(self, image):
        if isinstance(image, Image.Image):
            # PIL in -> PIL out; the arithmetic happens in tensor space.
            return TF.to_pil_image(self._corrupt(TF.to_tensor(image)))
        return self._corrupt(image)

    def _corrupt(self, tensor):
        """Return ``tensor`` plus Gaussian noise, clamped to [0, 1]."""
        noise = torch.randn_like(tensor) * self.std + self.mean
        return torch.clamp(tensor + noise, 0, 1)


class AddSpeckleNoise:
    """Transform that adds multiplicative (speckle) noise to an image.

    Computes ``image + image * noise`` with ``noise = randn * std + mean`` and
    clamps the result to [0, 1].  Accepts a PIL image or a tensor and returns
    the same kind of object it was given.
    """

    def __init__(self, mean=0.0, std=0.1):
        self.mean = mean
        self.std = std

    def __call__(self, image):
        from_pil = isinstance(image, Image.Image)
        tensor = TF.to_tensor(image) if from_pil else image

        noise = torch.randn_like(tensor) * self.std + self.mean
        # The perturbation scales with the pixel value itself.
        speckled = torch.clamp(tensor + tensor * noise, 0, 1)

        return TF.to_pil_image(speckled) if from_pil else speckled


class AddSaltAndPepperNoise:
    """Transform that sets randomly chosen elements to 1.0 (salt) or 0.0 (pepper).

    Args:
        amount: Fraction of the tensor's elements to target in total.
        salt_vs_pepper: Share of the targeted elements that become salt; the
            remainder become pepper.

    Accepts a PIL image or a tensor and returns the same kind of object.
    Indices are drawn with replacement and pepper is written after salt, so
    the number of distinct corrupted elements can be slightly below
    ``amount``.
    """

    def __init__(self, amount=0.02, salt_vs_pepper=0.5):
        self.amount = amount
        self.salt_vs_pepper = salt_vs_pepper

    def __call__(self, image):
        is_pillow = isinstance(image, Image.Image)
        if is_pillow:
            image = TF.to_tensor(image)

        total = image.numel()
        num_pixels = int(self.amount * total)
        num_salt = int(num_pixels * self.salt_vs_pepper)
        num_pepper = int(num_pixels * (1 - self.salt_vs_pepper))

        # Force a contiguous copy: a default clone() keeps the input's strides,
        # and view(-1) below raises on a non-contiguous tensor (e.g. a
        # transposed or sliced input).
        noisy_image = image.clone(memory_format=torch.contiguous_format)
        flat = noisy_image.view(-1)  # shares storage with noisy_image

        salt_indices = torch.randint(0, total, (num_salt,), device=image.device)
        pepper_indices = torch.randint(0, total, (num_pepper,), device=image.device)
        flat[salt_indices] = 1.0
        flat[pepper_indices] = 0.0

        if is_pillow:
            return TF.to_pil_image(noisy_image)
        return noisy_image

# Bilateral filter wrapped as a transform so it can sit inside a Compose
# pipeline.  The filter runs on DEVICE (imported from blf_torch).
class ApplyBilateralFilter:
    """Apply ``BilateralFilter`` to a single image and return a PIL image.

    Args:
        kernel_size: Passed to ``BilateralFilter`` as ``k``.
        sigma_space: Passed to ``BilateralFilter`` as ``sigma_space``.
        sigma_color: Passed to ``BilateralFilter`` as ``sigma_color``.
    """

    def __init__(self, kernel_size=5, sigma_space=5, sigma_color=0.1):
        self.kernel_size = kernel_size
        self.sigma_space = sigma_space
        self.sigma_color = sigma_color

    def __call__(self, img):
        # Image -> tensor with a leading batch axis, on the filter's device.
        img_tensor = transforms.ToTensor()(img).unsqueeze(0).to(DEVICE)
        _, channels, height, width = img_tensor.shape

        # A filter is built per call because it is sized to this image.
        bilateral_filter = BilateralFilter(
            channels=channels,
            k=self.kernel_size,
            height=height,
            width=width,
            sigma_space=self.sigma_space,
            sigma_color=self.sigma_color,
            device=DEVICE
        )

        # Pure preprocessing: no gradients needed.
        with torch.no_grad():
            filtered_tensor = bilateral_filter(img_tensor)

        # Drop the batch axis and bring the result to host memory for the PIL
        # conversion (the previous `.to(DEVICE)` here was a no-op).
        # NOTE(review): the clamp guards the 8-bit conversion against wrap-around
        # should the filter ever return values slightly outside [0, 1]; confirm
        # against BilateralFilter whether that can happen.
        filtered_tensor = filtered_tensor.squeeze(0).clamp(0, 1).cpu()
        return transforms.ToPILImage()(filtered_tensor)
        # return filtered_tensor




# Training pipeline: random geometric augmentation -> bilateral filter ->
# random blur / noise -> crop, tensor conversion and normalization.
# Interpolation is given as torchvision's InterpolationMode enum; passing PIL's
# integer constants to torchvision transforms is deprecated.
transform_pipeline = transforms.Compose([
    # --- geometric augmentation ---
    transforms.RandomApply(
        [transforms.RandomAffine(degrees=0, shear=10, interpolation=transforms.InterpolationMode.BICUBIC)],
        p=0.5,
    ),
    transforms.RandomApply(
        [transforms.RandomRotation(degrees=15, interpolation=transforms.InterpolationMode.BICUBIC)],
        p=0.5,
    ),
    # NOTE(review): RandomPerspective has its own p=0.3 and is additionally
    # wrapped in RandomApply(p=0.2), so it fires with probability 0.06 —
    # confirm the double gating is intended.
    transforms.RandomApply(
        [transforms.RandomPerspective(distortion_scale=0.2, p=0.3, interpolation=transforms.InterpolationMode.BICUBIC)],
        p=0.2,
    ),
    # NOTE(review): `scale` is a fraction of the input area, so the upper bound
    # of 1.2 asks for crops larger than the image — confirm this is intended.
    transforms.RandomResizedCrop(
        size=(300, 300),
        scale=(0.8, 1.2),
        ratio=(0.8, 1.2),
        interpolation=transforms.InterpolationMode.BICUBIC,
    ),

    # --- filtering ---
    ApplyBilateralFilter(kernel_size=5, sigma_space=5, sigma_color=0.1),

    # --- color / photo effects ---
    transforms.RandomApply([transforms.GaussianBlur(kernel_size=(3, 3))], p=0.2),
    # With probability 0.25, apply exactly one of the three noise models.
    transforms.RandomApply(
        [
            transforms.RandomChoice([
                AddGaussianNoise(mean=0.0, std=0.1),
                AddSpeckleNoise(mean=0.0, std=0.1),
                AddSaltAndPepperNoise(amount=0.02, salt_vs_pepper=0.5),
            ])
        ],
        p=0.25,
    ),

    # --- model input formatting ---
    transforms.CenterCrop((224, 224)),  # ResNet50 expects 224x224
    transforms.ToTensor(),
    transforms.Lambda(lambda x: x.repeat(3, 1, 1)),  # Convert 1-channel grayscale to 3-channel
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # Standard normalization for ResNet50
])

# Evaluation pipeline: the same bilateral filter and model-input formatting as
# training, with none of the random augmentation steps.
test_transform_pipeline = transforms.Compose(
    [
        ApplyBilateralFilter(kernel_size=5, sigma_space=5, sigma_color=0.1),
        # ResNet50 expects 224x224
        transforms.CenterCrop((224, 224)),
        transforms.ToTensor(),
        # Convert 1-channel grayscale to 3-channel
        transforms.Lambda(lambda gray: gray.repeat(3, 1, 1)),
        # Standard normalization for ResNet50
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)



# Bare-minimum pipeline: only the steps needed to turn a grayscale image into
# a normalized 3-channel 224x224 tensor (no filtering, no augmentation).
minimal_preprocessing_pipeline = transforms.Compose(
    [
        # ResNet50 expects 224x224
        transforms.CenterCrop((224, 224)),
        transforms.ToTensor(),
        # Convert 1-channel grayscale to 3-channel
        transforms.Lambda(lambda gray: gray.repeat(3, 1, 1)),
        # Standard normalization for ResNet50
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

In [5]:


class ScriptDataset(Dataset):
    """Dataset of random square crops taken from labelled script images.

    Each raw image is "virtually" repeated ``multiplier`` times: index ``i``
    maps to raw row ``i % len(dataset)``, and every access takes a fresh
    random crop, so repeated visits to the same row yield different samples.
    """

    def __init__(self, dataset, transform=None, multiplier=10, max_size=None):
        """
        dataset: List of dicts, each containing 'filepath', 'label', and 'label_num'
        transform: torchvision transforms (augmentations + preprocessing)
        multiplier: Number of times each raw image is virtually repeated
        max_size: Upper limit on the dataset size (optional)
        """
        self.dataset = dataset  # a list of dicts, not a dict itself
        self.transform = transform
        self.multiplier = multiplier

        # Compute virtual dataset size
        self.virtual_size = len(self.dataset) * self.multiplier

        # Apply max_size limit if provided
        if max_size is not None:
            self.virtual_size = min(self.virtual_size, max_size)

    def crop_sample(self, image, crop_dim=300):
        """Crop a random ``crop_dim`` x ``crop_dim`` region from ``image``.

        The crop is kept at least 5% of the image size away from every edge
        when the image is large enough for that.  When it is not, the crop is
        centered instead of raising; if the image is smaller than ``crop_dim``
        the box starts at 0 and extends past the image edge.
        """
        img_width, img_height = image.size

        def pick_origin(extent):
            # Top-left coordinate along one axis.
            margin = int(extent * 0.05)
            highest = extent - crop_dim - margin
            if highest >= margin:
                return random.randint(margin, highest)
            # Not enough room to honor the margins (randint would raise
            # ValueError on an empty range): fall back to a centered crop.
            return max((extent - crop_dim) // 2, 0)

        left = pick_origin(img_width)
        upper = pick_origin(img_height)
        crop_box = (left, upper, left + crop_dim, upper + crop_dim)
        return image.crop(crop_box)

    def __len__(self):
        """Return the virtual dataset size (capped at max_size if set)."""
        return self.virtual_size

    def __getitem__(self, idx):
        """Return ``(image, label)`` for virtual index ``idx``.

        ``label`` is ``label_num - 1`` (zero-based).

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
                Without this, plain iteration over the dataset would never
                terminate, because the modulo mapping accepts any index.
        """
        if idx < 0:
            idx += self.virtual_size
        if not 0 <= idx < self.virtual_size:
            raise IndexError(f"index out of range for ScriptDataset of size {self.virtual_size}")

        # Map virtual index back to the original dataset
        row = self.dataset[idx % len(self.dataset)]

        # Load as grayscale; the context manager releases the file handle as
        # soon as the pixel data has been converted.
        with Image.open(row["filepath"]) as source:
            image = source.convert("L")

        # Apply random crop
        image = self.crop_sample(image)

        # Apply transforms (if any)
        if self.transform:
            image = self.transform(image)

        # Python counts from 0, but the dataset counts from 1.
        corrected_label = row['label_num'] - 1
        return image, corrected_label

In [6]:
# Virtual training set over the mini sample: each row is repeated 20 times,
# with the total capped at 2,000 items.
script_dataset = ScriptDataset(
    dataset=train_mini,
    transform=transform_pipeline,
    multiplier=20,
    max_size=2_000,
)
print(len(script_dataset))

# Small shuffled batches for quick experimentation.
data_loader = DataLoader(dataset=script_dataset, batch_size=4, shuffle=True)


2000


In [7]:
# Smoke test: pull one batch through the whole pipeline.  As the cell's last
# expression, the batch is also displayed as the cell output.
next(iter(data_loader))

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


[tensor([[[[ 1.6495,  1.6324,  1.6153,  ..., -0.1314,  0.9303,  1.0844],
           [ 1.6495,  1.6324,  1.5982,  ..., -0.1828,  0.8789,  1.0502],
           [ 1.6324,  1.6153,  1.5810,  ..., -0.8335,  0.5878,  1.0331],
           ...,
           [ 1.4098,  1.4098,  1.3927,  ...,  1.4098,  1.4269,  1.4440],
           [ 1.3927,  1.3755,  1.3755,  ...,  1.4269,  1.4269,  1.4440],
           [ 1.3584,  1.3584,  1.3584,  ...,  1.4440,  1.4269,  1.4440]],
 
          [[ 1.8158,  1.7983,  1.7808,  ..., -0.0049,  1.0805,  1.2381],
           [ 1.8158,  1.7983,  1.7633,  ..., -0.0574,  1.0280,  1.2031],
           [ 1.7983,  1.7808,  1.7458,  ..., -0.7227,  0.7304,  1.1856],
           ...,
           [ 1.5707,  1.5707,  1.5532,  ...,  1.5707,  1.5882,  1.6057],
           [ 1.5532,  1.5357,  1.5357,  ...,  1.5882,  1.5882,  1.6057],
           [ 1.5182,  1.5182,  1.5182,  ...,  1.6057,  1.5882,  1.6057]],
 
          [[ 2.0300,  2.0125,  1.9951,  ...,  0.2173,  1.2980,  1.4548],
           [ 

In [8]:
# Keep one batch around for the shape / forward-pass checks that follow.
sample_images, sample_labels = next(iter(data_loader))
print(f"Image batch shape: {sample_images.shape}")  # Expected: [batch_size, 3, 224, 224]
print(f"Label batch shape: {sample_labels.shape}")  # Expected: [batch_size]

Image batch shape: torch.Size([4, 3, 224, 224])
Label batch shape: torch.Size([4])


In [9]:
# Randomly initialised ResNet50, used only to check that a batch flows through.
# No arguments are needed: the default is "no pretrained weights", and omitting
# the keyword avoids the deprecated `pretrained=` argument.
model = models.resnet50()

# Run a forward pass with a single batch; gradients are not needed for a shape check.
with torch.no_grad():
    output = model(sample_images)  # Check if this runs without error

print("Output shape:", output.shape)  # Expected: [batch_size, 1000] (default ResNet output)



Output shape: torch.Size([4, 1000])


In [10]:
# Sanity-check the labels: values, dtype, and observed range in this batch.
print(f"Sample labels: {sample_labels}")
print(f"Label dtype: {sample_labels.dtype}")
print(f"Label min/max: {sample_labels.min().item()} {sample_labels.max().item()}")


Sample labels: tensor([5, 2, 4, 8])
Label dtype: torch.int64
Label min/max: 2 8


In [11]:
num_classes = 12

# Pretrained ResNet50 backbone with a fresh classification head.
model = models.resnet50(pretrained=True)
# Size the new head from the layer it replaces rather than hard-coding 2048.
model.fc = torch.nn.Linear(model.fc.in_features, num_classes)

criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Be explicit about training mode so this cell does not depend on the state
# left behind by whichever cell ran before it.
model.train()

# Training loop (simplified): a single pass over data_loader.
for images, labels in data_loader:
    images, labels = images.to(device), labels.to(device)

    optimizer.zero_grad()
    outputs = model(images)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    print("Loss:", loss.item())




Loss: 2.4872446060180664
Loss: 2.5358476638793945
Loss: 2.614347219467163
Loss: 2.352200746536255
Loss: 2.8540596961975098
Loss: 2.2352347373962402
Loss: 2.807258129119873
Loss: 2.5036325454711914
Loss: 2.4849510192871094
Loss: 2.6718215942382812
Loss: 2.6976802349090576
Loss: 2.7805144786834717
Loss: 2.541954278945923
Loss: 2.1956281661987305
Loss: 2.479978084564209
Loss: 2.7356362342834473
Loss: 2.354104995727539
Loss: 2.0953011512756348
Loss: 1.9145890474319458
Loss: 2.730525255203247
Loss: 2.3479676246643066
Loss: 2.3844809532165527
Loss: 2.253364324569702
Loss: 2.542165517807007
Loss: 2.2536568641662598
Loss: 2.3144609928131104
Loss: 1.752107858657837
Loss: 2.331458330154419
Loss: 2.1103110313415527
Loss: 2.6560120582580566
Loss: 2.0476701259613037
Loss: 2.25929856300354
Loss: 2.4025590419769287
Loss: 1.6341776847839355
Loss: 2.3846993446350098
Loss: 2.529937267303467
Loss: 2.833601951599121
Loss: 1.904239296913147
Loss: 2.957261085510254
Loss: 2.4448046684265137
Loss: 2.089660882

# Test

In [12]:
# Evaluation data: first 100 test rows, one virtual pass, no augmentation
# transforms.
# NOTE(review): ScriptDataset still takes a *random* crop per item, so repeated
# evaluations see different crops — confirm that is acceptable.
test_dataset = ScriptDataset(dataset=test[:100], transform=test_transform_pipeline, multiplier=1)
# No shuffle for consistent evaluation
test_loader = DataLoader(dataset=test_dataset, batch_size=4, shuffle=False)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.eval()
model.to(device)



ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [13]:
correct = 0
total = 0

# Set eval mode here as well (it is idempotent) so the measurement does not
# depend on an earlier cell having been run after training.
model.eval()

with torch.no_grad():  # No gradients needed during evaluation
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)

        outputs = model(images)  # Logits, one row per image
        # argmax gives the class indices directly and avoids assigning to `_`,
        # which IPython uses for the last cell output.
        predicted = outputs.argmax(dim=1)

        correct += (predicted == labels).sum().item()
        total += labels.size(0)

# An empty loader yields NaN instead of a ZeroDivisionError.
accuracy = correct / total * 100 if total else float("nan")
print(f"Test Accuracy: {accuracy:.2f}%")




Test Accuracy: 27.00%
