
First we define some global variables which are used for the whole training process

In [12]:
import os
import torch
# Change the paths accordingly
path_train_csv = "train.csv"
path_test_csv = "test.csv"
# First we have to select the classes on which we would like to train on
classes = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
dict_classes_to_numbers = dict(zip(classes, range(len(classes))))
dict_numbers_to_classes = dict(zip(range(len(classes)), classes))

# All available classes
#classes = ['!','$','&','(',')','+','0','1','2','3','4','5','6','7','8','9','<','>','?','A','B','C','D','E','F','G','H',
#           'I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z','a','b','c','d','e','f','g','h','i',
#           'j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','~','ß','α','β','π','φ','€','∑','√','∞',
#           '∫']
num_classes = len(classes)
num_val_samples_per_class = 250
# Standard DL-parameters
batch_size_train = 64
batch_size_val = 256
num_workers = 12
hparams = {"num_epochs": 100, "early_stopping_patience": 3, "early_stopping_threshold": 0.001}
# For getting reproducible results
seed = 0
torch.manual_seed(seed)


<torch._C.Generator at 0x7ff71c0d5210>

We define some helper functions for the training

In [13]:
import torchmetrics

class EarlyStopper:
    def __init__(self, patience=5, min_delta=0.001, model_weights=None):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_validation_acc = 0
        self.best_model_weights = model_weights

    def early_stop(self, validation_acc, model_weights):
       # If the model improved we store the best weights
        if validation_acc > self.best_validation_acc:
            self.best_model_weights = model_weights
        
        # If the model improved more than the threshold  
        if validation_acc > self.best_validation_acc + self.min_delta:
            self.counter = 0
            self.best_validation_acc = validation_acc
        else:          
            if self.counter >= self.patience:
                return True
            self.counter += 1    
        return False


class EpochInformation:
    def __init__(self, model, device, num_classes, dataset_sizes):
        self.mcc_metric = torchmetrics.MatthewsCorrCoef(task='multiclass', num_classes=num_classes).to(device)
        self.auc_metric = torchmetrics.AUROC(task='multiclass', num_classes=num_classes).to(device)
        self.dataset_sizes = dataset_sizes
        self.running_loss = 0.0
        self.running_outputs = None
        self.running_labels = None
        self.model = model
        
    def reset_metrics(self):
        self.running_loss = 0.0
        self.running_outputs = None
        self.running_labels = None            
        
    def update_metrics_for_batch(self, outputs, loss, inputs, labels):
        if self.running_outputs is None:
            self.running_outputs = outputs
        else:
            self.running_outputs = torch.cat((self.running_outputs, outputs), dim=0)
        
        if self.running_labels is None:
            self.running_labels = labels
        else:
            self.running_labels = torch.cat((self.running_labels, labels), dim=0)

        #update the loss
        self.running_loss += loss.item() * inputs.size(0)
        
    def calculate_metrics(self, phase):
        loss = self.running_loss / self.dataset_sizes[phase]
                
        _, predictions = torch.max(self.running_outputs, 1)
        comparison = predictions == self.running_labels
        corrects = torch.sum(comparison)
        
        acc = corrects.double() / self.dataset_sizes[phase]
        mcc = self.mcc_metric(predictions, self.running_labels)
        auc = self.auc_metric(self.running_outputs, self.running_labels)
        
        # The gradient norm can only be calculated during training
        # Also we calculate the weight-norm only once in each training epoch
        if self.model.training:
            grads = [param.grad.detach().flatten() for param in self.model.parameters() if param.grad is not None]
            l2_norm_grads = torch.linalg.vector_norm(torch.cat(grads))
            weights = [param.detach().flatten() for param in self.model.parameters()]
            l2_norm_weights = torch.linalg.vector_norm(torch.cat(weights)) 
           
        result_dict = {
            "loss" : loss,
            "acc"  : acc.item(),
            "mcc"  : mcc.item(),
            "auc"  : auc.item()
        }
        if self.model.training:
            result_dict["l2_grad"] = l2_norm_grads.item()
            result_dict["l2_weights"] = l2_norm_weights.item()
            
        return result_dict 

Now we define a simple CNN-model

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class PyTorchClassifier(nn.Module):
    def __init__(self, num_classes):
        super(PyTorchClassifier, self).__init__()
        self.size_fc1 = 256
        self.conv1 = nn.Conv2d(1, 32, 6)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(32, 64, 4)
        self.pool2 = nn.MaxPool2d(2, 2)
        self.conv3 = nn.Conv2d(64, 128, 3)
        self.pool3 = nn.MaxPool2d(2, 2)
        self.conv4 = nn.Conv2d(128, 256, 2)
        self.fc1 = nn.Linear(self.size_fc1, 256)
        self.fc2 = nn.Linear(256, num_classes)

    def forward(self, x):
        x = self.pool1(F.relu(self.conv1(x)))
        x = self.pool2(F.relu(self.conv2(x)))
        x = self.pool3(F.relu(self.conv3(x)))
        x = F.relu(self.conv4(x))
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

The code for generating the dataset

In [15]:
from torch.utils.data import Dataset
import numpy as np

class GermanCharacterRecognitionDS(Dataset):
    def __init__(self, path_csv, dict_classes_to_numbers, transform=None, target_transform=None, classes=[],
                 num_channels=1):
        self.path_csv = path_csv
        self.transform = transform
        self.target_transform = target_transform
        self.data_lines = self.read_lines_csv(classes)
        self.n = len(self.data_lines)
        self.classes = classes
        self.num_channels = num_channels

    def __len__(self):
        return self.n

    def __getitem__(self, idx):
        label, image = self.parse_one_line(idx)
        if self.transform:
            image = self.transform(image)
        if self.target_transform:
            label = self.target_transform(label)            
        # We have to convert the label to an integer value
        label = dict_classes_to_numbers[label]
        return image, label

    def read_lines_csv(self, classes):
        training_data_file = open(self.path_csv, 'r', encoding="latin-1")
        data_lines = training_data_file.readlines()
        training_data_file.close()
        data_lines = [line for line in data_lines if line[0] in classes]
        return data_lines

    def parse_one_line(self, index):
        line = self.data_lines[index].split(',')
        image_np = np.asarray(line[1:1601], dtype=np.float32)
        image_np = image_np.reshape(40, 40, 1)
        if self.num_channels != 1:
            image_np = np.repeat(image_np, self.num_channels, axis=2)
        return line[0], image_np

Then we can define the train loop

In [16]:
import copy
from tqdm import tqdm
from torchvision import transforms

def train_model(data_loaders, model, loss_func, optimizer, device):
    print("training started")
    num_epochs = hparams["num_epochs"]
    information = EpochInformation(model, device, num_classes, dataset_sizes)
    early_stopper = EarlyStopper(patience=hparams["early_stopping_patience"],
                             min_delta=hparams["early_stopping_threshold"],
                             model_weights=copy.deepcopy(model.state_dict()))
    strop_training = False
    for epoch in range(num_epochs):
        print(f'Epoch {epoch}/{num_epochs - 1}')
        print('-' * 10)
        if strop_training == True:
            break
        # Each epoch has a training and validation phase
        for phase in ['val', 'train']:
            if phase == 'train':
                model.train()  
            else:
                model.eval()  
            information.reset_metrics()
            
            if phase == 'train':
                print("training...")
            else:
                print("validating...")                
            data_loader = tqdm(data_loaders[phase])
            for inputs, labels in data_loader:
                inputs = inputs.to(device, non_blocking=True)
                labels = labels.to(device, non_blocking=True)
                optimizer.zero_grad()
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    loss = loss_func(outputs, labels)
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                information.update_metrics_for_batch(outputs, loss, inputs, labels)

            result_dict = information.calculate_metrics(phase)
            # prints the all metrics of the training and validation phase
            print(" ".join(name + ": " + str(round(value, 4)) for name, value in result_dict.items()))

            if phase == 'val':
                if early_stopper.early_stop(result_dict["mcc"], copy.deepcopy(model.state_dict())):
                    print('early stopping')
                    strop_training = True
    # load best model
    model.load_state_dict(early_stopper.best_model_weights) 
    return model

For loading the data we need some helper methods. As stated in the description of the dataset, the representation of each class of the train data set is the same for each class. We also want to make sure that the validation data has the same distribution as the test data, so we need a function which takes a certain amount of samples from each class of the train data set and puts them into the validation data set. To optimize the run time, we save the indices of the train and validation data set in a numpy array. This approach eliminates the need to regenerate the data split each time, thereby significantly reducing processing time.

In [17]:
from torch.utils.data import Subset

def get_train_and_val_loader(train_data_set, num_samples_validation_data=250):
    # Define the ratio for train and validation data
    print("Splitting train- and val-data ...")
    val_count = dict(zip(classes, len(classes) * [0]))
    val_indices = []
    train_indices = []
    for i in range(len(train_data_set)):
        _, label = train_data_set[i]
        label_string = dict_numbers_to_classes[label[0]]
        if val_count[label_string] < num_samples_validation_data:
            val_count[label_string] += 1
            val_indices.append(i)
        else:
            train_indices.append(i)

    np.save("val_indices.npy", np.asarray(val_indices))
    np.save("train_indices.npy", np.asarray(train_indices))
    train_set, val_set = split_train_set(train_data_set, train_indices, val_indices)
    print("Splitting done")
    return train_set, val_set

def split_train_set(train_data_set, train_indices, val_indices):
    train_set = Subset(train_data_set, train_indices)
    val_set = Subset(train_data_set, val_indices)
    return train_set, val_set

def get_class_counts_of_data_loader(data_loader):
    labels_count_dict = dict(zip(classes, len(classes) * [0]))
    for _, labels in data_loader:
        string_labels = [dict_numbers_to_classes[number] for number in labels.tolist()]
        for label in string_labels:
            labels_count_dict[label] += 1
    return labels_count_dict

Now, we can proceed to construct all the necessary data loaders.

In [18]:
from random import random

# We normalize with the men and std of the train set
standard_transforms = [transforms.ToTensor(),transforms.Normalize(35.37502147246886, 75.87412766890324)]
test_set = GermanCharacterRecognitionDS(path_test_csv, transform=transforms.Compose(standard_transforms), classes=classes,
                                        dict_classes_to_numbers=dict_classes_to_numbers, num_channels=1)
train_set = GermanCharacterRecognitionDS(path_train_csv, transform=None, classes=classes,
                                         dict_classes_to_numbers=dict_classes_to_numbers, num_channels=1)
# TODO comment the following line after the first run
#train_loader, val_loader = get_train_and_val_loader(train_set, num_val_samples_per_class)
# TODO uncomment this line if you want to use the precalculated indices which speeds up the run time
train_set, val_set = split_train_set(train_set, np.load("train_indices.npy"), np.load("val_indices.npy"))

train_transforms = standard_transforms + [transforms.RandomRotation(30), transforms.RandomGrayscale(p=0.1), 
                                          transforms.GaussianBlur(kernel_size=3, sigma=(0.1, 2.0))]
train_set.dataset.transform = transforms.Compose(train_transforms)
val_set.dataset.transform = transforms.Compose(standard_transforms)

def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

g = torch.Generator()
g.manual_seed(seed)

train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size_train, shuffle=True, num_workers=num_workers,                                                   generator=g)
val_loader = torch.utils.data.DataLoader(val_set, batch_size=batch_size_val, shuffle=False, num_workers=num_workers,
                                         generator=g)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size_val, shuffle=False, num_workers=num_workers,
                                          generator=g)

class_counts_train = get_class_counts_of_data_loader(train_loader)
class_counts_val = get_class_counts_of_data_loader(val_loader)
class_counts_test = get_class_counts_of_data_loader(test_loader)

print("train_loader: " + str(class_counts_train))
print("val_loader: " + str(class_counts_val))
print("test_loader: " + str(class_counts_test))

data_loaders = {"train": train_loader, "val": val_loader, "test": test_loader}
dataset_sizes = {"train": len(train_loader.dataset), "val": len(val_loader.dataset), "test": len(test_loader.dataset)}

train_loader: {'0': 4541, '1': 4203, '2': 4168, '3': 4120, '4': 4019, '5': 3966, '6': 4235, '7': 4161, '8': 4213, '9': 4125}
val_loader: {'0': 250, '1': 250, '2': 250, '3': 250, '4': 250, '5': 250, '6': 250, '7': 250, '8': 250, '9': 250}
test_loader: {'0': 500, '1': 500, '2': 500, '3': 500, '4': 500, '5': 500, '6': 500, '7': 500, '8': 500, '9': 500}


We also calculate the class weights in order to use a weighted loss function

In [19]:
class_weights = []
number_train_values = len(train_loader.dataset)
for class_label in classes:
    weight = float(number_train_values) / class_counts_train[class_label]
    class_weights.append(weight)
class_weights = torch.tensor(class_weights)
sum_class_weights = torch.sum(class_weights)
class_weights = class_weights / sum_class_weights
print("class weights: ", str(dict(zip(classes, class_weights.tolist()))))

class weights:  {'0': 0.09183375537395477, '1': 0.09921891242265701, '2': 0.10005208849906921, '3': 0.10121773928403854, '4': 0.10376140475273132, '5': 0.10514803230762482, '6': 0.09846921265125275, '7': 0.10022040456533432, '8': 0.09898340702056885, '9': 0.101095050573349}


Now we can start the training

In [20]:
model = PyTorchClassifier(len(classes))
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
# Put the tensors on the GPU
model.to(device)
class_weights = class_weights.to(device)
optimizer = torch.optim.NAdam(model.parameters(), lr=0.001, weight_decay=0.000)
loss_func = torch.nn.CrossEntropyLoss(weight=class_weights)
model = train_model(data_loaders, model, loss_func, optimizer, device)

cuda:0
training started
Epoch 0/99
----------
validating...


100%|██████████| 10/10 [00:01<00:00,  9.29it/s]


loss: 2.3019 acc: 0.0992 mcc: -0.0059 auc: 0.5584
training...


100%|██████████| 653/653 [00:07<00:00, 86.22it/s] 


loss: 0.2868 acc: 0.9073 mcc: 0.897 auc: 0.9948 l2_grad: 2.8797 l2_weights: 20.2377
Epoch 1/99
----------
validating...


100%|██████████| 10/10 [00:01<00:00,  9.20it/s]


loss: 0.1916 acc: 0.9484 mcc: 0.9432 auc: 0.9988
training...


100%|██████████| 653/653 [00:07<00:00, 84.70it/s] 


loss: 0.087 acc: 0.9735 mcc: 0.9706 auc: 0.9994 l2_grad: 2.4594 l2_weights: 22.4498
Epoch 2/99
----------
validating...


100%|██████████| 10/10 [00:00<00:00, 10.18it/s]


loss: 0.074 acc: 0.978 mcc: 0.9756 auc: 0.9995
training...


100%|██████████| 653/653 [00:08<00:00, 80.60it/s]


loss: 0.0636 acc: 0.9809 mcc: 0.9788 auc: 0.9996 l2_grad: 0.0334 l2_weights: 24.457
Epoch 3/99
----------
validating...


100%|██████████| 10/10 [00:01<00:00,  9.83it/s]


loss: 0.0763 acc: 0.9784 mcc: 0.976 auc: 0.9996
training...


100%|██████████| 653/653 [00:07<00:00, 82.73it/s] 


loss: 0.0524 acc: 0.9841 mcc: 0.9824 auc: 0.9997 l2_grad: 0.3741 l2_weights: 26.4261
Epoch 4/99
----------
validating...


100%|██████████| 10/10 [00:01<00:00,  9.10it/s]


loss: 0.0743 acc: 0.9764 mcc: 0.9738 auc: 0.9995
training...


100%|██████████| 653/653 [00:08<00:00, 78.51it/s]


loss: 0.0425 acc: 0.9869 mcc: 0.9855 auc: 0.9998 l2_grad: 0.3011 l2_weights: 28.3459
Epoch 5/99
----------
validating...


100%|██████████| 10/10 [00:01<00:00,  7.24it/s]


loss: 0.0964 acc: 0.9732 mcc: 0.9703 auc: 0.9994
training...


100%|██████████| 653/653 [00:08<00:00, 80.55it/s]


loss: 0.0385 acc: 0.9877 mcc: 0.9863 auc: 0.9999 l2_grad: 0.0028 l2_weights: 30.3312
Epoch 6/99
----------
validating...


100%|██████████| 10/10 [00:01<00:00,  9.14it/s]


loss: 0.067 acc: 0.9836 mcc: 0.9818 auc: 0.9996
training...


100%|██████████| 653/653 [00:09<00:00, 66.73it/s]


loss: 0.033 acc: 0.9903 mcc: 0.9892 auc: 0.9999 l2_grad: 0.003 l2_weights: 32.172
Epoch 7/99
----------
validating...


100%|██████████| 10/10 [00:01<00:00,  6.45it/s]


loss: 0.0984 acc: 0.9804 mcc: 0.9783 auc: 0.9993
training...


100%|██████████| 653/653 [00:10<00:00, 64.99it/s]


loss: 0.0307 acc: 0.9905 mcc: 0.9895 auc: 0.9999 l2_grad: 0.0177 l2_weights: 34.1778
Epoch 8/99
----------
validating...


100%|██████████| 10/10 [00:01<00:00,  6.65it/s]


loss: 0.0718 acc: 0.9828 mcc: 0.9809 auc: 0.9996
training...


100%|██████████| 653/653 [00:09<00:00, 65.55it/s]


loss: 0.0264 acc: 0.9916 mcc: 0.9906 auc: 0.9999 l2_grad: 0.0141 l2_weights: 36.0953
Epoch 9/99
----------
validating...


100%|██████████| 10/10 [00:01<00:00,  6.63it/s]


loss: 0.0743 acc: 0.9824 mcc: 0.9805 auc: 0.9994
training...


100%|██████████| 653/653 [00:09<00:00, 68.52it/s]


loss: 0.0239 acc: 0.9928 mcc: 0.992 auc: 0.9999 l2_grad: 0.1738 l2_weights: 37.7499
Epoch 10/99
----------
validating...


100%|██████████| 10/10 [00:01<00:00,  6.38it/s]


loss: 0.0836 acc: 0.984 mcc: 0.9822 auc: 0.9995
early stopping
training...


100%|██████████| 653/653 [00:09<00:00, 68.89it/s]

loss: 0.0208 acc: 0.9933 mcc: 0.9925 auc: 1.0 l2_grad: 0.0023 l2_weights: 39.5625
Epoch 11/99
----------





After the training we evaluate the model

In [21]:
information_test = EpochInformation(model, device, num_classes, dataset_sizes)
model.eval()
for inputs, labels in data_loaders["test"]:
    inputs = inputs.to(device, non_blocking=True)
    labels = labels.to(device, non_blocking=True)
    optimizer.zero_grad()
    with torch.set_grad_enabled(False):
        outputs = model(inputs)
        loss = loss_func(outputs, labels)
    information_test.update_metrics_for_batch(outputs, loss, inputs, labels)

result_dict = information_test.calculate_metrics("test")
print("Test metrics:")
print(" ".join(name + ": " + str(round(value, 4)) for name, value in result_dict.items()))

Test metrics:
loss: 0.092 acc: 0.9786 mcc: 0.9762 auc: 0.9994
