Data Setup

Dataset builder- Supervised

In [1]:
import torch
from torch import nn

In [2]:
import torch

def word_to_tensor(word):
    # Convert the word to lowercase
    word = word.lower()

    # Use a fixed size of 26 for the alphabet
    letter_count = [0] * 26

    for char in word:
        if 'a' <= char <= 'z':
            letter_count[ord(char) - ord('a')] = 1

    return torch.tensor(letter_count, dtype=torch.float32)

In [3]:
import os
import csv
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from torchvision import transforms


class SupervisedDataset(Dataset):
    def __init__(self, root_dir, labels_path, transform = None):
        self.root_dir = root_dir
        self.labels_path = labels_path
        self.data = []
        self.transform = transform
        with open(labels_path, newline="") as labels_file:
            labels_reader = csv.reader(labels_file)
            for row in labels_reader:
                self.data.append(row)  # a list of [filename, [chars in image]]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_path = os.path.join(self.root_dir, self.data[idx][0])

        image = Image.open(img_path).convert("RGB")
        if self.transform:
            image=self.transform(image)
        label_text = self.data[idx][1]

        # Convert label text to array of letter counts
        label_tensor = word_to_tensor(label_text)

        return image, label_tensor

# Define the transform
transform = transforms.Compose([

    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1), scale=(0.9, 1.1)),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2),
    transforms.RandomRotation(20),
    transforms.ToTensor(),
])

# Set your root directory
root_dir = "supervised"
# subfolder = 'supervised_data'

# Create datasets for each model
models = [
    "arial",
    "bradhitc",
    "century_schoolbook",
    "comic",
    "cour",
    "papyrus",
    "times",
]
train_datasets, val_datasets = [], []

for model in models:
    model_dir = os.path.join(root_dir, f"{model}_images")
    labels_path = os.path.join(root_dir, f"{model}.csv")
    all_data = SupervisedDataset(model_dir, labels_path, transform)

    # Split data into training and validation sets
    train_size = 1000
    val_size = 100
    train_data, val_data = torch.utils.data.random_split(
        all_data, [train_size, val_size]
    )

    train_datasets.append(train_data)
    val_datasets.append(val_data)

# Create DataLoaders
train_loaders = [
    DataLoader(dataset, batch_size=32, shuffle=True) for dataset in train_datasets
]
val_loaders = [
    DataLoader(dataset, batch_size=32, shuffle=False) for dataset in val_datasets
]


letter_counts = {chr(ord('a') + i): 0 for i in range(26)}

# Iterate through the dataset and update letter counts
for _, label in train_datasets[0]:
    for i, value in enumerate(label):
        if value == 1:
            letter = chr(ord('a') + i)
            letter_counts[letter] += 1

# Print the counts for each letter
for letter, count in letter_counts.items():
    print(f"{letter}: {count}")

class_counts = torch.tensor([letter_counts[letter] for letter in sorted(letter_counts.keys())], dtype=torch.float32)

# Calculate class weights
total_samples = len(train_datasets[0])
class_weights = total_samples / (26 * class_counts)

# Print the calculated class weights
print("Class Weights:", class_weights)





a: 552
b: 157
c: 284
d: 233
e: 613
f: 85
g: 183
h: 211
i: 540
j: 11
k: 86
l: 393
m: 240
n: 451
o: 434
p: 234
q: 10
r: 494
s: 355
t: 423
u: 293
v: 73
w: 70
x: 32
y: 170
z: 33
Class Weights: tensor([0.0697, 0.2450, 0.1354, 0.1651, 0.0627, 0.4525, 0.2102, 0.1823, 0.0712,
        3.4965, 0.4472, 0.0979, 0.1603, 0.0853, 0.0886, 0.1644, 3.8462, 0.0779,
        0.1083, 0.0909, 0.1313, 0.5269, 0.5495, 1.2019, 0.2262, 1.1655])


Dataset builder - Unsupervised

In [None]:
# Stacked Convolutional Auto-Encoder (the unsupervised sub-network)
class SCAE(nn.Module):
  def __init__(self, num_channels):
    super().__init__()

    self.conv1 = nn.Conv2d(
        in_channels=num_channels,
        out_channels=64,
        kernel_size=11,
        padding=1,
        stride=2
    )
    self.conv2 = nn.Conv2d(
        in_channels=64,
        out_channels=128,
        kernel_size=5,
        padding=2
    )
    self.deconv1 = nn.ConvTranspose2d(
        in_channels = 128,
        out_channels = 64,
        kernel_size = 5,
        padding = 2
    )
    self.deconv2 = nn.ConvTranspose2d(
        in_channels=64,
        out_channels=3,
        kernel_size=11,
        padding=1,
        # using stride in the conv1 layer means that multiple input sizes are mapped to the same size
        # output_padding of 1 ensures that the output is the same size as the input
        # in the specific case that the model is producing an output 1 smaller than the input in both dimensions
        # change the output padding value if you change the input image size
        output_padding=1,
        stride=2
    )
    self.maxpool = nn.MaxPool2d(2, return_indices=True)
    self.unpool = nn.MaxUnpool2d(2)
    self.relu = nn.ReLU()

  def forward(self, x):

    x1 = self.conv1(x)
    x2 = self.relu(x1)
    x3, indices = self.maxpool(x2)

    x4 = self.conv2(x3)
    x5 = self.relu(x4)

    x6 = self.deconv1(x5)
    x7 = self.unpool(x6, indices, output_size=x2.size())
    x8 = self.relu(x7)

    x9 = self.deconv2(x8)
    x10 = self.relu(x9)

    return x10

In [4]:
class DeepFont(nn.Module):
  def __init__(self, num_channels, num_classes):
    super().__init__()

    self.baby = nn.Linear(in_features=256*256*num_channels, out_features=num_classes)

    self.conv1 = nn.Conv2d(
        in_channels=num_channels,
        out_channels=64,
        kernel_size=11,
        padding=1,
        stride=2
    )
    self.conv2 = nn.Conv2d(
        in_channels=64,
        out_channels=128,
        kernel_size=5,
        padding=2
    )
    self.conv3 = nn.Conv2d(
        in_channels=128,
        out_channels=256,
        kernel_size=3,
        padding=1
    )
    self.conv4 = nn.Conv2d(
        in_channels=256,
        out_channels=256,
        kernel_size=3,
        padding=1
    )
    self.conv5 = nn.Conv2d(
        in_channels=256,
        out_channels=256,
        kernel_size=3,
        padding=1
    )
    self.fc6 = nn.Linear(in_features=31*31*256, out_features=4096) # assuming input image size of 256x256. change in_feats for different sample size
    self.fc7 = nn.Linear(in_features=4096, out_features=4096)
    self.fc8 = nn.Linear(in_features=4096, out_features=num_classes)
    self.norm1 = nn.BatchNorm2d(num_features=64)
    self.norm2 = nn.BatchNorm2d(num_features=128)
    self.dropout = nn.Dropout(0.5)
    self.maxpool = nn.MaxPool2d(2)
    self.relu = nn.ReLU()
    self.flatten = nn.Flatten()


  def forward(self, x):
    # x = self.flatten(x)
    # x = self.baby(x)

    x = self.conv1(x)
    x = self.norm1(x)
    x = self.maxpool(x)
    x = self.relu(x)

    x = self.conv2(x)
    x = self.norm2(x)
    x = self.maxpool(x)
    x = self.relu(x)

    x = self.conv3(x)
    x = self.relu(x)

    x = self.conv4(x)
    x = self.relu(x)

    x = self.conv5(x)
    x = self.relu(x)

    x = self.flatten(x)

    x = self.dropout(self.fc6(x))
    x = self.relu(x)

    x = self.dropout(self.fc7(x))
    x = self.relu(x)

    x = self.fc8(x)

    return x

In [None]:
def training_unsupervised(model, dataloader, criterion, optimizer, device, epochs, model_path):
    model = model.to(device)
    model.train()
    best_loss = torch.inf
    for _ in range(epochs):
        total_loss = 0
        for batch_index, (images, _) in enumerate(dataloader):
            optimizer.zero_grad()
            images = images.to(device)
            outputs = model(images)
            loss = criterion(outputs, images)
            total_loss += loss.item() * images.size(0)
            loss.backward()
            optimizer.step()
        avg_loss = total_loss / (batch_index+1)
        if avg_loss < best_loss:
            best_loss = avg_loss
            torch.save(model.state_dict(), model_path)

In [5]:
import torch
from torch.utils.data import DataLoader

def evaluation(model, dataloader, criterion, device, phase='Validation'):
    model.eval()
    predictions = []
    ground_truth = []

    true_positives = 0
    true_negatives = 0
    false_positives = 0
    false_negatives = 0

    misclassified_examples = []

    with torch.no_grad():
        total_loss = 0
        total_samples = 0

        for _, (images, labels) in enumerate(dataloader):
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            #print(outputs)
            #print(labels)
            loss = criterion(outputs, labels)
            #print(loss)
            total_loss += loss.item() * images.size(0)
            total_samples += images.size(0)

            # Convert output probabilities to binary predictions
            preds = (torch.sigmoid(outputs) > 0.5).float()

            # Update multi-label metrics
            true_positives += (preds * labels).sum().item()
            true_negatives += ((1 - labels) * (1 - preds)).sum().item()
            false_positives += ((1 - labels) * preds).sum().item()
            false_negatives += (labels * (1 - preds)).sum().item()

            # Collect misclassified examples


            predictions.extend(preds.cpu().numpy())
            ground_truth.extend(labels.cpu().numpy())

        # Calculate multi-label metrics
        precision = true_positives / (true_positives + false_positives + 1e-10)
        recall = true_positives / (true_positives + false_negatives + 1e-10)
        f1_score = 2 * (precision * recall) / (precision + recall + 1e-10)

        accuracy = (true_positives + true_negatives) / (total_samples + 1e-10)
        loss = total_loss / total_samples

        print(f'{phase}\tF1-Score={f1_score:<10.4f}' +
              f'\t\tLoss= {loss:<10.4f}' +
              f'\t\tPrecision: {precision:<10.4f}' +
              f'\t\tRecall: {recall:<10.4f}' +
              f'\t\tAccuracy: {accuracy:<10.4f}')

        return {'loss': loss,
                'f1_score': f1_score,
                'precision': precision,
                'recall': recall,
                'accuracy': accuracy,
                'ground_truth': ground_truth,
                'predictions': predictions}

# Example usage:
# Replace 'your_model' and 'your_dataloader' with your actual model and dataloader
# Replace 'your_device' with 'cuda' or 'cpu' depending on your setup
# evaluation_results = evaluation(your_model, your_dataloader, criterion, your_device)
# misclassified_examples = evaluation_results['misclassified_examples']


In [6]:
from torch.optim import lr_scheduler

def training_supervised(model, train_loader, val_loader, criterion, optimizer, scheduler, device, epochs, best_model_path):
    model = model.to(device)
    model.train()
    best_loss = torch.inf
    best_results = None
    youre_on_thin_ice_buster = False
    #misclassified_examples = []

    for epoch in range(epochs):
        total_loss = 0
        total_samples = 0

        # New variables for multi-label metrics
        true_positives = 0
        true_negatives = 0
        false_positives = 0
        false_negatives = 0

        for batch_idx, (images, labels) in enumerate(train_loader):
            optimizer.zero_grad()
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()


            optimizer.step()

            total_loss += loss.item() * images.size(0)
            total_samples += images.size(0)

            # Convert output probabilities to binary predictions
            preds = (torch.sigmoid(outputs) > 0.5).float()

            # Update multi-label metrics
            true_positives += (preds * labels).sum().item()
            true_negatives += ((1 - labels) * (1 - preds)).sum().item()
            false_positives += ((1 - labels) * preds).sum().item()
            false_negatives += (labels * (1 - preds)).sum().item()

        #if scheduler:
            #scheduler.step()


        # Calculate multi-label metrics
        precision = true_positives / (true_positives + false_positives + 1e-10)
        recall = true_positives / (true_positives + false_negatives + 1e-10)
        f1_score = 2 * (precision * recall) / (precision + recall + 1e-10)

        accuracy = (true_positives + true_negatives) / (total_samples + 1e-10)
        loss = total_loss / total_samples

        print(f'{epoch:<4}\tTrain\tF1-Score={f1_score:<10.4f}' +
              f'\t\tLoss= {loss:<10.4f}' +
              f'\t\tPrecision: {precision:<10.4f}' +
              f'\t\tRecall: {recall:<10.4f}' +
              f'\t\tAccuracy: {accuracy:<10.4f}')

        results = evaluation(model, val_loader, criterion, device)
        model.train()

        # early stopping:
        if results['loss'] < best_loss:             # we are still improving
            torch.save(model.state_dict(), best_model_path)
            best_loss = results['loss']
            best_results = results
            youre_on_thin_ice_buster = False
        elif youre_on_thin_ice_buster:              # we didn't improve last time and we didn't improve this time
            break
        else:                                       # we didn't improve this time, but it was the first time in a while
            youre_on_thin_ice_buster = True

        #if epoch == epochs - 1:  # Check if it's the last epoch
         # misclassified_examples = find_misclassified_examples(model, val_loader, device)
          #print("Misclassified Examples:")
          #for example in misclassified_examples:
           #   print(example)

        for name, param in model.named_parameters():
              if param.requires_grad and param.grad is not None:
               # print(f"Layer: {name}, Gradient Norm: {param.grad.norm().item()}")
               pass
        print()


    # Print misclassified examples after the last epoch

    # Print misclassified examples after the last epoch
    #print("Misclassified Examples:")
    #for example in misclassified_examples:
     #   print(example)

    return best_results

def find_misclassified_examples(model, data_loader, device):
    model.eval()
    misclassified_examples = []

    with torch.no_grad():
        for images, labels in data_loader:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            preds = (torch.sigmoid(outputs) > 0.5).float()

            misclassified_mask = (preds != labels)
            misclassified_indices = torch.nonzero(misclassified_mask).squeeze()

            for idx in misclassified_indices:
                # Append to misclassified examples without moving to CPU
                misclassified_examples.append({
                    'image': images[idx].clone(),  # Use clone to avoid modifying the original tensor
                    'predicted_label': preds[idx].clone(),
                    'true_label': labels[idx].clone()
                })

    return misclassified_examples



In [7]:
print(len(train_datasets[0]))


1000


In [14]:
import torch.nn as nn
import torch.optim as optim

models_dir = 'models'
device = torch.device("cpu")

# Train the unsupervised sub-network IS NOT IN THIS NOTEBOOK ANYMORE GO SEE OTHER NOTEBOOK


# Train the supervised sub-network

# Hyperparameters
learning_rate = 0.0001
momentum = 0.95
weight_decay = 1e-4
epochs = 16 # CHANGED BY XANNA 
criterion = torch.nn.BCEWithLogitsLoss(weight=class_weights.to(device))


In [None]:
# Stops colab from breaking sometimes
# Only works sometimes
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [11]:
# define supervised model
supervised_model = DeepFont(
    num_channels=3, num_classes=26
)  # one class per letter (not case-sensitive)

# Import the convolutional layers of the SCAE as conv1 and conv2
scae_path = os.path.join(models_dir, f"SCAE.pt")
supervised_model.load_state_dict(torch.load(scae_path), strict=False)

# Freeze the convolutional layers from SCAE
for param in supervised_model.conv1.parameters():
    param.requires_grad = False
for param in supervised_model.conv2.parameters():
    param.requires_grad = False

# unfreeze layers                                   # something something when i removed coconunt.jpg the whole project broke and we don;t even wanna know if this is a coconut
for param in supervised_model.conv3.parameters():
    param.requires_grad = True
for param in supervised_model.conv4.parameters():
    param.requires_grad = True
for param in supervised_model.conv5.parameters():
    param.requires_grad = True
for param in supervised_model.fc6.parameters():
    param.requires_grad = True
for param in supervised_model.fc7.parameters():
    param.requires_grad = True
for param in supervised_model.fc8.parameters():
    param.requires_grad = True




In [15]:
# define optimizer and scheduler :) thank u xanna. ur welcome
optimizer = optim.SGD(supervised_model.parameters(), lr=learning_rate, momentum=momentum, weight_decay=weight_decay)
scheduler = lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

In [29]:
#torch.cuda.empty_cache()
for i in range(2):
    font_name = models[i]
    print(font_name)

    train_loader = train_loaders[i]
    val_loader = val_loaders[i]
    best_model_path = os.path.join(models_dir, f"{font_name}_model.pt")
    best_results = training_supervised(
        supervised_model,
        train_loader,
        val_loader,
        criterion,
        optimizer,
        scheduler,
        device,
        epochs,
        best_model_path,
    )


arial
0   	Train	F1-Score=0.3593    		Loss= 0.2399    		Precision: 0.3497    		Recall: 0.3696    		Accuracy: 17.2500   
Validation	F1-Score=0.3452    		Loss= 0.0863    		Precision: 0.5500    		Recall: 0.2515    		Accuracy: 19.7400   

1   	Train	F1-Score=0.4142    		Loss= 0.1160    		Precision: 0.5138    		Recall: 0.3470    		Accuracy: 19.4840   
Validation	F1-Score=0.3452    		Loss= 0.0927    		Precision: 0.5500    		Recall: 0.2515    		Accuracy: 19.7400   

2   	Train	F1-Score=0.3875    		Loss= 0.1052    		Precision: 0.5455    		Recall: 0.3005    		Accuracy: 19.6930   
Validation	F1-Score=0.4129    		Loss= 0.0861    		Precision: 0.5450    		Recall: 0.3323    		Accuracy: 19.8000   

3   	Train	F1-Score=0.3815    		Loss= 0.1039    		Precision: 0.5511    		Recall: 0.2917    		Accuracy: 19.7190   
Validation	F1-Score=0.4129    		Loss= 0.0811    		Precision: 0.5450    		Recall: 0.3323    		Accuracy: 19.8000   



In [None]:
!unzip VFR_labelled.zip

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Create the test_dataloader

In [None]:
import os
import csv
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from torchvision import transforms
from torch.utils.data._utils.collate import default_collate


class TrainingDataset(Dataset): # Modified from the SupervisedDataset class
    def __init__(self, root_dir, labels_path, transform = None):
        self.root_dir = root_dir
        self.labels_path = labels_path
        self.data = []
        self.transform = transform
        with open(labels_path, newline="") as labels_file:
            labels_reader = csv.reader(labels_file)
            next(labels_reader)  # Skip the header
            for row in labels_reader:
                self.data.append(row)  # a list of [filename, [chars in image]]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_path = os.path.join(self.root_dir, self.data[idx][0])
        image = Image.open(img_path).convert("RGB")

        if self.transform:
            image=self.transform(image)

        label_text = self.data[idx][1]
        label_text = label_text.replace(" ", "")  # Removes spaces from the label (so that all labels are one word)

        # Convert label text to array of letter counts
        label_tensor = word_to_tensor(label_text)

        return image, label_tensor


#Define a custom collate function for the dataloader (this is a workaround
# for the test images being different sizes)
def custom_collate(batch):
    # Filter out None items (if your dataset returns None for some images)
    batch = list(filter(lambda x: x is not None, batch))

    # Handle the case for an empty batch
    if len(batch) == 0:
        return torch.Tensor()

    # Separate images and labels
    images = [item[0] for item in batch]
    labels = [item[1] for item in batch]

    # We can't stack images of different sizes, so we just keep them in a list
    # Alternatively, you can pad images here to the same size
    batch = (default_collate(images), default_collate(labels))
    return batch



#Define transforms for testing data
test_transform = transforms.Compose([
    transforms.ToTensor()
])

test_images_dir = "/content/VFR_labelled" # Replace with the directory of the testing images
test_labels_path = "/content/VFR_labelled/real_test.csv" # Replace with the path to the test images csv label file
test_dataset = TrainingDataset(test_images_dir, test_labels_path, test_transform)

#Create dataloader for test dataset
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=custom_collate)

#dataloader with shuffling enabled for visualization/testing, for debugging dataloader logic
vis_loader = DataLoader(test_dataset, batch_size=64, shuffle=True, collate_fn=custom_collate)

Test the test_loader

In [None]:
def tensor_to_word(tensor):
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    characters = [alphabet[i] for i in range(26) if tensor[i] == 1]

    return ''.join(characters)


import matplotlib.pyplot as plt

def show_images(images, labels, num_images=4):
    plt.figure(figsize=(15, 10))
    for i in range(num_images):
        plt.subplot(1, num_images, i + 1)
        plt.imshow(images[i].numpy().transpose(1, 2, 0))  # Convert tensor to image format
        plt.title('Label: ' + labels[i])
        plt.axis('off')
    plt.show()

# Get a random batch of images and labels
for images, label_tensors in vis_loader:
    text_labels = [tensor_to_word(label_tensor) for label_tensor in label_tensors]
    show_images(images, text_labels)
    break  # Display only the first batch




In [None]:
# Testing our models
import torch.nn as nn
models_dir = 'models'
criterion = nn.BCEWithLogitsLoss()
device = torch.device("cpu")

# ...

for i in range(len(models)):
    font_name = models[i]
    print(font_name)
    model_path = os.path.join(models_dir, f"{font_name}_model.pt")

    # Create an instance of your model
    model = DeepFont(num_channels=3, num_classes=26)

    # Load the state dictionary into the model
    model.load_state_dict(torch.load(model_path, map_location=device))

    results = evaluation(model, test_loader, criterion, device, 'Test')
