Data Setup

In [2]:
import shutil
import os
from pathlib import Path
import zipfile as zipfile


# Destination directories for storing the downloaded data
supervised_dest_dir = 'supervised'
unsupervised_synthetic_dest_dir = 'unsupervised_synthetic'
unsupervised_dest_dir = 'unsupervised'

# Create destination directories if they don't exist
os.makedirs(supervised_dest_dir, exist_ok=True)
os.makedirs(unsupervised_synthetic_dest_dir, exist_ok=True)
os.makedirs(unsupervised_dest_dir, exist_ok=True)

# Function to download a folder from Google Drive
def download_folder(zip_name,destination):

    with zipfile.ZipFile(zip_name, 'r') as zip_ref:
        zip_ref.extractall(destination)

    # Move the contents of the extracted folder to the destination
    extracted_folder = os.path.join(destination, Path(zip_name).stem)
    for item in os.listdir(extracted_folder):
        s = os.path.join(extracted_folder, item)
        d = os.path.join(destination, item)
        if os.path.isdir(s):
            shutil.move(s, d)
        else:
            shutil.copy2(s, d)

    # Clean up temporary files
    #os.remove(zip_name)
    shutil.rmtree(extracted_folder)

# Download and organize the supervised dataset
#download_folder('supervised_data.zip', supervised_dest_dir)

# Download and organize the unsupervised dataset
download_folder("unsupervised_data.zip", unsupervised_dest_dir)

#download_folder("unsupervised_synthetic_data.zip", unsupervised_synthetic_dest_dir)


FileNotFoundError: [Errno 2] No such file or directory: 'unsupervised_data.zip'

In [None]:
import os
import hashlib
import shutil

def get_file_checksum(file_path):
    """Calculate the checksum of a file."""
    sha256_hash = hashlib.sha256()
    with open(file_path, "rb") as f:
        # Read and update hash string value in blocks of 4K
        for byte_block in iter(lambda: f.read(4096), b""):
            sha256_hash.update(byte_block)
    return sha256_hash.hexdigest()

def remove_duplicate_images(folder_path):
    """Remove duplicate images in a folder."""
    # Dictionary to store checksums and corresponding file paths
    checksums = {}

    # List all files in the folder
    files = os.listdir(folder_path)

    for file_name in files:
        file_path = os.path.join(folder_path, file_name)

        # Check if the file is a regular file and not a directory
        if os.path.isfile(file_path):
            # Calculate the checksum of the file
            checksum = get_file_checksum(file_path)

            # Check if the checksum is already in the dictionary
            if checksum in checksums:
                # If a duplicate is found, remove the file
                print(f"Removing duplicate: {file_path}")
                os.remove(file_path)
            else:
                # Add the checksum to the dictionary
                checksums[checksum] = file_path

def rename_files(folder_path):
    """Rename files with '(1)' in their names."""
    # List all files in the folder
    files = os.listdir(folder_path)

    for file_name in files:
        file_path = os.path.join(folder_path, file_name)

        # Check if the file is a regular file and not a directory
        if os.path.isfile(file_path):
            # Check if the file name contains '(1)'
            if '(1)' in file_name:
                # Rename the file by removing '(1)'
                new_file_name = file_name.replace('(1)', '')
                new_file_path = os.path.join(folder_path, new_file_name)
                os.rename(file_path, new_file_path)
                print(f"Renamed: {file_path} to {new_file_path}")

if __name__ == "__main__":
    folder_path = r"unsupervised_synthetic\times_alphabet_images_rotated"
    remove_duplicate_images(folder_path)
    #rename_files(folder_path)


In [2]:
import torch
from torch import nn

Dataset builder- Supervised

In [2]:
import torch

def word_to_tensor(word):
    letter_count = [0] * 52
    for char in word:
        if 'a' <= char <= 'z':
            letter_count[ord(char) - ord('a')] += 1
        elif 'A' <= char <= 'Z':
            letter_count[ord(char) - ord('A') + 26] += 1
    return torch.tensor(letter_count, dtype=torch.float32)


In [3]:
import os
import csv
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from torchvision import transforms


class SupervisedDataset(Dataset):
    def __init__(self, root_dir, labels_path,transform=None):
        self.root_dir = root_dir
        self.labels_path = labels_path
        self.data = []
        self.transform=transform
        with open(labels_path, newline="") as labels_file:
            labels_reader = csv.reader(labels_file)
            for row in labels_reader:
                self.data.append(row)  # a list of [filename, [chars in image]]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_path = os.path.join(self.root_dir, self.data[idx][0])

        image = Image.open(img_path).convert("RGB")
        if self.transform:
            image= self.transform(image)
        label_text = self.data[idx][1]

        # Convert label text to array of letter counts
        label_tensor = word_to_tensor(label_text)

        return image, label_tensor

# Define the transform
transform = transforms.Compose([transforms.ToTensor()])

# Set your root directory
root_dir = "supervised"
# subfolder = 'supervised_data'

# Create datasets for each model
models = [
    "arial",
    "bradhitc",
    "century_schoolbook",
    "comic",
    "cour",
    "papyrus",
    "times",
]
train_datasets, val_datasets = [], []
transform = transforms.Compose([
            
            transforms.ToTensor(),
        ])
for model in models:
    model_dir = os.path.join(root_dir, f"{model}_images")
    labels_path = os.path.join(root_dir, f"{model}.csv")
    all_data = SupervisedDataset(model_dir, labels_path,transform)

    # Split data into training and validation sets
    train_size = 1000
    val_size = 100
    train_data, val_data = torch.utils.data.random_split(
        all_data, [train_size, val_size]
    )

    train_datasets.append(train_data)
    val_datasets.append(val_data)

# Create DataLoaders
train_loaders = [
    DataLoader(dataset, batch_size=16, shuffle=True) for dataset in train_datasets
]
val_loaders = [
    DataLoader(dataset, batch_size=16, shuffle=False) for dataset in val_datasets
]

Dataset builder - Unsupervised

In [12]:
from torchvision import transforms
from torch.utils.data import ConcatDataset, Dataset, DataLoader, Subset
from PIL import Image
import os
import numpy as np
from sklearn.model_selection import train_test_split

class CustomFontDataset(Dataset):
    def __init__(self, root, transform=None):
        self.root = root

        self.rwlabel = 0
        self.synlabel = 1
        
        self.transform = transform
        self.unsupervised_data = self.load_unsupervised_data()
        self.synthetic_data = self.load_synthetic_data()

        # use this for stratifying the data
        self.img_labels = [self.rwlabel] * len(self.unsupervised_data) + [self.synlabel] * len(self.synthetic_data)

    def load_unsupervised_data(self):
        unsupervised_path = os.path.join(self.root, 'unsupervised')
        unsupervised_images = [os.path.join(unsupervised_path, img) for img in os.listdir(unsupervised_path)]
        return unsupervised_images

    def load_synthetic_data(self):
        synthetic_path = os.path.join(self.root, f'unsupervised_synthetic')
        synthetic_images = [os.path.join(synthetic_path, img) for img in os.listdir(synthetic_path)]
        # actually only take every third one
        return synthetic_images[0::3]

    def __len__(self):
        return len(self.unsupervised_data) + len(self.synthetic_data)

    def __getitem__(self, index):
        if index < len(self.unsupervised_data):
            img_path = self.unsupervised_data[index]
            label = self.rwlabel  # set the label for real world unsupervised data
        else:
            adjusted_index = index - len(self.unsupervised_data)
            img_path = self.synthetic_data[adjusted_index]
            label = self.synlabel  # set the label for synthetic data

        image = Image.open(img_path).convert('RGB')
        
        if self.transform:
            image = self.transform(image)

        return image, label

# Define the transform 
transform = transforms.Compose([
            transforms.Resize((64, 64)),
            transforms.ToTensor(),
        ])

# Set root to the current working directory
current_working_directory = os.getcwd()

# Create datasets for each font
dataset = CustomFontDataset(root=current_working_directory, transform=transform)

# Split unsupervised data into train and val
train_idx, validation_idx = train_test_split(np.arange(len(dataset)),
                                             test_size=0.1,
                                             random_state=999,
                                             shuffle=True,
                                             stratify=dataset.img_labels)

# Subset dataset for train and val
train_dataset = Subset(dataset, train_idx)
validation_dataset = Subset(dataset, validation_idx)

# Create a dataloader for the dataset
batch_size = 32

# Dataloader for train and val
unsupervised_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
unsupervised_val_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)

# Access the combined dataloader
print(f"Number of samples in the unsupervised dataloader: {len(unsupervised_loader.dataset)}")
print(f"Number of samples in the validation dataloader: {len(unsupervised_val_loader.dataset)}")

Number of samples in the unsupervised dataloader: 7956
Number of samples in the validation dataloader: 884


In [13]:
# Stacked Convolutional Auto-Encoder (the unsupervised sub-network)
class SCAE(nn.Module):
  def __init__(self, num_channels):
    super().__init__()

    self.conv1 = nn.Conv2d(
        in_channels=num_channels,
        out_channels=64,
        kernel_size=11,
        padding=1,
        stride=2
    )
    self.conv2 = nn.Conv2d(
        in_channels=64,
        out_channels=128,
        kernel_size=5,
        padding=2
    )
    self.deconv1 = nn.ConvTranspose2d(
        in_channels = 128,
        out_channels = 64,
        kernel_size = 5,
        padding = 2
    )
    self.deconv2 = nn.ConvTranspose2d(
        in_channels=64,
        # out channels should be the same as in_channels
        out_channels=num_channels,
        kernel_size=11,
        padding=1,
        # using stride in the conv1 layer means that multiple input sizes are mapped to the same size
        # output_padding of 1 ensures that the output is the same size as the input
        # in the specific case that the model is producing an output 1 smaller than the input in both dimensions
        # change the output padding value if you change the input image size
        output_padding=1,
        stride=2
    )
    self.maxpool = nn.MaxPool2d(2, return_indices=True)
    self.unpool = nn.MaxUnpool2d(2)
    self.relu = nn.ReLU()

  def forward(self, x):

    x1 = self.conv1(x)
    x2 = self.relu(x1)
    x3, indices = self.maxpool(x2)

    x4 = self.conv2(x3)
    x5 = self.relu(x4)

    x6 = self.deconv1(x5)
    x7 = self.unpool(x6, indices, output_size=x2.size())
    x8 = self.relu(x7)

    x9 = self.deconv2(x8)
    x10 = self.relu(x9)

    return x10

In [4]:
class DeepFont(nn.Module):
  def __init__(self, num_channels, num_classes):
    super().__init__()
    
    self.conv1 = nn.Conv2d(
        in_channels=num_channels,
        out_channels=64,
        kernel_size=11,
        padding=1,
        stride=2
    )
    self.conv2 = nn.Conv2d(
        in_channels=64,
        out_channels=128,
        kernel_size=5,
        padding=2
    )
    self.conv3 = nn.Conv2d(
        in_channels=128,
        out_channels=256,
        kernel_size=3,
        padding=1
    )
    self.conv4 = nn.Conv2d(
        in_channels=256,
        out_channels=256,
        kernel_size=3,
        padding=1
    )
    self.conv5 = nn.Conv2d(
        in_channels=256,
        out_channels=256,
        kernel_size=3,
        padding=1
    )
    self.fc6 = nn.Linear(in_features=31*31*256, out_features=4096) # assuming input image size of 256x256. change in_feats for different sample size
    self.fc7 = nn.Linear(in_features=4096, out_features=4096)
    self.fc8 = nn.Linear(in_features=4096, out_features=num_classes)
    self.norm1 = nn.BatchNorm2d(num_features=64)
    self.norm2 = nn.BatchNorm2d(num_features=128)
    self.dropout = nn.Dropout(0.5)
    self.maxpool = nn.MaxPool2d(2)
    self.relu = nn.ReLU()
    self.flatten = nn.Flatten()

  def forward(self, x):
    x =self.conv1(x)
    x = self.norm1(x)
    x = self.maxpool(x)
    x = self.relu(x)

    x = self.conv2(x)
    x = self.norm2(x)
    x = self.maxpool(x)
    x = self.relu(x)

    x = self.conv3(x)
    x = self.relu(x)

    x = self.conv4(x)
    x = self.relu(x)

    x = self.conv5(x)
    x = self.relu(x)

    x = self.flatten(x)

    x = self.dropout(self.fc6(x))
    x = self.relu(x)

    x = self.dropout(self.fc7(x))
    x = self.relu(x)

    x = self.fc8(x)

    return x

In [14]:
def training_unsupervised(model, dataloader, criterion, optimizer, device, epochs, model_path):
    model = model.to(device)
    model.train()
    best_loss = torch.inf
    for i in range(epochs):
        total_loss = 0
        for batch_index, (images, _) in enumerate(dataloader):
            optimizer.zero_grad()
            images = images.to(device)
            outputs = model(images)
            loss = criterion(outputs, images)
            total_loss += loss.item() * images.size(0)
            loss.backward()
            optimizer.step()
        avg_loss = total_loss / (batch_index+1)
        print(f'Epoch {i} | Avg Loss: {avg_loss:.4f} | Total Loss: {total_loss:.4f}')
        if avg_loss < best_loss:
            best_loss = avg_loss
            torch.save(model.state_dict(), model_path)

In [15]:
import torch.nn.functional as F
def evaluate_unsupervised(model, dataloader, device):
    model = model.to(device)
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for batch_index, (images, _) in enumerate(dataloader):
            images = images.to(device)
            
            # Forward pass
            outputs = model(images)
            
            # Calculate reconstruction loss (MSE)
            loss = F.mse_loss(outputs, images)
            
            total_loss += loss.item() * images.size(0)

    avg_loss = total_loss / len(dataloader.dataset)
    print(f'Validation Loss: {avg_loss:.4f}')
    return avg_loss


In [5]:
def evaluation(model, dataloader, criterion, device, phase='Validation'):
    model.eval()
    predictions = []
    ground_truth = []

    true_positives = 0
    true_negatives = 0
    false_positives = 0
    false_negatives = 0

    with torch.no_grad():
        total_loss = 0
        total_samples = 0

        for _, (images, labels) in enumerate(dataloader):
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)

            total_loss += loss.item() * images.size(0)
            total_samples += images.size(0)

            # Convert output probabilities to binary predictions
            preds = (torch.sigmoid(outputs) > 0.5).float()

            # Update multi-label metrics
            true_positives += (preds * labels).sum().item()
            true_negatives += ((1 - labels) * (1 - preds)).sum().item()
            false_positives += ((1 - labels) * preds).sum().item()
            false_negatives += (labels * (1 - preds)).sum().item()

            predictions.extend(preds.cpu().numpy())
            ground_truth.extend(labels.cpu().numpy())

        # Calculate multi-label metrics
        precision = true_positives / (true_positives + false_positives + 1e-10)
        recall = true_positives / (true_positives + false_negatives + 1e-10)
        f1_score = 2 * (precision * recall) / (precision + recall + 1e-10)

        accuracy = (true_positives + true_negatives) / (total_samples + 1e-10)
        loss = total_loss / total_samples

        print(f'{phase}\tF1-Score={f1_score:<10.4f}' +
              f'\t\tLoss= {loss:<10.4f}' +
              f'\t\tPrecision: {precision:<10.4f}' +
              f'\t\tRecall: {recall:<10.4f}' +
              f'\t\tAccuracy: {accuracy:<10.4f}')

        return {'loss': loss,
                'f1_score': f1_score,
                'precision': precision,
                'recall': recall,
                'accuracy': accuracy,
                'ground_truth': ground_truth,
                'predictions': predictions}

In [6]:
def training_supervised(model, train_loader, val_loader, criterion, optimizer, device, epochs, best_model_path):
    model = model.to(device)
    model.train()
    best_loss = torch.inf
    best_results = None

    for epoch in range(epochs):
        total_loss = 0
        total_samples = 0

        # New variables for multi-label metrics
        true_positives = 0
        true_negatives = 0
        false_positives = 0
        false_negatives = 0

        for batch_idx, (images, labels) in enumerate(train_loader):
            optimizer.zero_grad()
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()

            optimizer.step()

            total_loss += loss.item() * images.size(0)
            total_samples += images.size(0)

            # Convert output probabilities to binary predictions
            preds = (torch.sigmoid(outputs) > 0.5).float()

            # Update multi-label metrics
            true_positives += (preds * labels).sum().item()
            true_negatives += ((1 - labels) * (1 - preds)).sum().item()
            false_positives += ((1 - labels) * preds).sum().item()
            false_negatives += (labels * (1 - preds)).sum().item()

        # Calculate multi-label metrics
        precision = true_positives / (true_positives + false_positives + 1e-10)
        recall = true_positives / (true_positives + false_negatives + 1e-10)
        f1_score = 2 * (precision * recall) / (precision + recall + 1e-10)

        accuracy = (true_positives + true_negatives) / (total_samples + 1e-10)
        loss = total_loss / total_samples

        print(f'{epoch:<4}\tTrain\tF1-Score={f1_score:<10.4f}' +
              f'\t\tLoss= {loss:<10.4f}' +
              f'\t\tPrecision: {precision:<10.4f}' +
              f'\t\tRecall: {recall:<10.4f}' +
              f'\t\tAccuracy: {accuracy:<10.4f}')

        results = evaluation(model, val_loader, criterion, device)

        if results['loss'] < best_loss:
            torch.save(model.state_dict(), best_model_path)
            best_loss = results['loss']
            best_results = results

        print()

    return best_results


In [16]:
import torch.nn as nn
import torch.optim as optim
import os
#os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "512"
#os.environ['CUDA_VISIBLE_DEVICES'] = '1'
#torch.cuda.empty_cache()
models_dir = 'models'
device = torch.device("cpu")

# Train the unsupervised sub-network
# Hyperparameters
learning_rate = 0.01
momentum = 0.9
weight_decay = 5e-4
epochs = 20

scae_model = SCAE(num_channels=3)
criterion = nn.MSELoss()
optimizer = optim.SGD(scae_model.parameters(), lr=learning_rate, momentum=momentum, weight_decay=weight_decay)

model_path = os.path.join(models_dir, f"SCAE.pt")

In [17]:
# train the SCAE
training_unsupervised(scae_model, unsupervised_loader, criterion, optimizer, device, epochs, model_path)

Epoch 0 | Avg Loss: 2.9112 | Total Loss: 724.8937
Epoch 1 | Avg Loss: 1.4500 | Total Loss: 361.0467
Epoch 2 | Avg Loss: 1.1622 | Total Loss: 289.3810
Epoch 3 | Avg Loss: 0.9416 | Total Loss: 234.4570
Epoch 4 | Avg Loss: 0.8296 | Total Loss: 206.5784
Epoch 5 | Avg Loss: 0.7505 | Total Loss: 186.8727
Epoch 6 | Avg Loss: 0.7054 | Total Loss: 175.6546
Epoch 7 | Avg Loss: 0.6603 | Total Loss: 164.4263
Epoch 8 | Avg Loss: 0.6307 | Total Loss: 157.0462
Epoch 9 | Avg Loss: 0.6080 | Total Loss: 151.3987


In [18]:
# evaluate the SCAE
evaluate_unsupervised(scae_model, unsupervised_val_loader, device)

Validation Loss: 0.0183


0.018342625231764435

In [13]:
# Train the supervised sub-network
models_dir = 'models'
device = torch.device("cpu")

# Train the unsupervised sub-network
# Hyperparameters
learning_rate = 0.01
momentum = 0.9
weight_decay = 5e-4
epochs = 10

supervised_model = DeepFont(num_channels=3, num_classes=52) # one class per letter (case-sensitive)

# Import the convolutional layers of the SCAE as conv1 and conv2
scae_path = os.path.join(models_dir, f"SCAE.pt")
supervised_model.load_state_dict(torch.load(scae_path), strict=False)

# Freeze the convolutional layers from SCAE
for param in supervised_model.conv1.parameters():
    param.requires_grad = False
for param in supervised_model.conv2.parameters():
    param.requires_grad = False

criterion = nn.BCEWithLogitsLoss()
i=5
font_name = models[i]
print(font_name)
    
train_loader = train_loaders[i]
val_loader = val_loaders[i]
best_model_path = os.path.join(models_dir, f"{font_name}_model.pt")
best_results = training_supervised(supervised_model, train_loader, val_loader, criterion, optimizer, device, epochs, best_model_path)

papyrus
0   	Train	F1-Score=0.1823    		Loss= 0.6933    		Precision: 0.1214    		Recall: 0.3658    		Accuracy: 26.2230   
Validation	F1-Score=0.1865    		Loss= 0.6933    		Precision: 0.1272    		Recall: 0.3494    		Accuracy: 27.9300   

1   	Train	F1-Score=0.1728    		Loss= 0.6934    		Precision: 0.1182    		Recall: 0.3211    		Accuracy: 27.8450   
Validation	F1-Score=0.1865    		Loss= 0.6933    		Precision: 0.1272    		Recall: 0.3494    		Accuracy: 27.9300   

2   	Train	F1-Score=0.1728    		Loss= 0.6934    		Precision: 0.1182    		Recall: 0.3211    		Accuracy: 27.8450   
Validation	F1-Score=0.1865    		Loss= 0.6933    		Precision: 0.1272    		Recall: 0.3494    		Accuracy: 27.9300   



In [None]:
# Testing our models

models_dir = 'models'
criterion = nn.BCEWithLogitsLoss()
device = torch.device("cpu")

for i in range(len(models)):
    font_name = models[i]
    print(font_name)
    model_path = os.path.join(models_dir, f"{font_name}_model.pt")
    model = torch.load(model_path, map_location=device)
    results = evaluation(model, test_loader, criterion, device, 'Test')