In [1]:
import copy

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

In [2]:
from torch.utils.tensorboard import SummaryWriter

In [3]:
# Preprocessing applied to every image: convert to a tensor, then normalize
# each of the three channels with mean 0.5 and std 0.5.
to_tensor = transforms.ToTensor()
normalize = transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
transform = transforms.Compose([to_tensor, normalize])

batch_size = 4

# Options shared by all three loaders; only `shuffle` differs between them.
loader_options = dict(batch_size=batch_size, num_workers=2)

# Official CIFAR-10 training split, downloaded to ./data when missing.
trainset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform
)

# 80% of the training split is used for training, the remaining 20% for validation.
train_size = int(0.8 * len(trainset))
val_size = len(trainset) - train_size

# Seed the global RNG right before splitting so the train/validation split is reproducible.
torch.manual_seed(123)
trainset, valset = torch.utils.data.random_split(trainset, [train_size, val_size])

trainloader = torch.utils.data.DataLoader(trainset, shuffle=True, **loader_options)
valloader = torch.utils.data.DataLoader(valset, shuffle=False, **loader_options)

# Official CIFAR-10 test split; never shuffled.
testset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform
)
testloader = torch.utils.data.DataLoader(testset, shuffle=False, **loader_options)

# Human-readable class names, in label-index order.
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

In [None]:
class Net(nn.Module):
    """Small convolutional classifier producing 10 logits per image.

    Two conv -> leaky ReLU -> max-pool stages are followed by three fully
    connected layers. The first linear layer expects 16 * 5 * 5 features,
    which is what the two unpadded 5x5 convolutions and 2x2 poolings produce
    for 3-channel 32x32 inputs.
    """

    def __init__(self):
        super().__init__()
        # Layers are registered in this exact order so that parameter
        # initialisation and state_dict keys stay stable.
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def activation(self, x):
        """Non-linearity used after every layer except the last one."""
        return F.leaky_relu(x)

    def forward(self, x):
        """Map a batch of images to a batch of unnormalized class scores."""
        feature_maps = x
        for conv in (self.conv1, self.conv2):
            feature_maps = self.pool(self.activation(conv(feature_maps)))

        # Flatten everything except the batch dimension.
        hidden = feature_maps.flatten(start_dim=1)
        for dense in (self.fc1, self.fc2):
            hidden = self.activation(dense(hidden))

        # No activation on the output layer: raw logits are returned.
        return self.fc3(hidden)


net = Net()

### Alternative model that was also tried; it showed increased training time.

# class Net(nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
#         self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
#         self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
#         self.pool = nn.MaxPool2d(2, 2)
#         self.fc1 = nn.Linear(128 * 4 * 4, 256)
#         self.fc2 = nn.Linear(256, 10)
#         self.leaky_relu = nn.LeakyReLU(0.01)

#     def forward(self, x):
#         x = self.pool(self.leaky_relu(self.conv1(x)))
#         x = self.pool(self.leaky_relu(self.conv2(x)))
#         x = self.pool(self.leaky_relu(self.conv3(x)))
#         x = x.view(-1, 128 * 4 * 4)
#         x = self.leaky_relu(self.fc1(x))
#         x = self.fc2(x)
#         return x
    
# net = Net()

In [None]:
### Early stopping, from: https://www.geeksforgeeks.org/how-to-handle-overfitting-in-pytorch-models-using-early-stopping/
class EarlyStopping:
    """Track the validation loss and signal when training should stop.

    Call the instance once per epoch with the current validation loss and the
    model. Whenever the loss improves, an independent copy of the model's
    weights is stored; after `patience` consecutive calls without
    improvement, `early_stop` becomes True.

    Args:
        patience: Number of consecutive non-improving calls tolerated before
            `early_stop` is set.
        delta: Minimum decrease of the loss (relative to the best seen so far)
            that counts as an improvement.

    Attributes:
        best_score: Negated best validation loss seen so far, or None.
        best_model_state: Deep copy of the state_dict at the best loss, or None.
        counter: Consecutive calls without improvement.
        early_stop: True once `counter` has reached `patience`.
    """

    def __init__(self, patience=5, delta=0):
        self.patience = patience
        self.delta = delta
        self.best_score = None
        self.early_stop = False
        self.counter = 0
        self.best_model_state = None

    def __call__(self, val_loss, model):
        """Record one validation result.

        Args:
            val_loss: Validation loss as a Python number or a one-element tensor.
            model: Module whose weights are snapshotted when the loss improves.
        """
        # float() accepts plain numbers and one-element tensors alike, so the
        # stored score never keeps a tensor (or its autograd graph) alive.
        score = -float(val_loss)

        # NaN compares False against everything; treat it explicitly as
        # "no improvement" so it can never become the best score.
        is_nan = score != score
        improved = not is_nan and (
            self.best_score is None or score >= self.best_score + self.delta
        )

        if improved:
            self.best_score = score
            # state_dict() returns references to the live parameter tensors,
            # which the optimizer keeps updating in place. A deep copy is
            # required to actually preserve the best weights.
            self.best_model_state = copy.deepcopy(model.state_dict())
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

    def load_best_model(self, model):
        """Restore the best recorded weights into `model`.

        Raises:
            RuntimeError: If no snapshot has been recorded yet.
        """
        if self.best_model_state is None:
            raise RuntimeError(
                "No model state recorded yet; call the EarlyStopping instance "
                "with a validation loss before loading the best model."
            )
        model.load_state_dict(self.best_model_state)




In [None]:
# Loss function shared by every training run.
criterion = nn.CrossEntropyLoss()


def _sgd_factory(lr):
    """Return a callable that builds an SGD optimizer with learning rate `lr`."""
    def build(params):
        return optim.SGD(params, lr=lr)
    return build


def _adam_factory(params):
    """Build an Adam optimizer with its default hyper-parameters."""
    return optim.Adam(params)


# Run label -> optimizer factory. Factories take the parameters as an argument
# so that the optimizer can be created later, once the model for a run exists.
# The requested learning rate is very slow, therefore a higher LR was tested as well.
optimizer_dict = dict([
    ('SGD, lr=0.001, LeakyReLu', _sgd_factory(0.001)),
    ('SGD, lr=0.0001, LeakyReLu', _sgd_factory(0.0001)),
    ('Adam, LeakyReLu', _adam_factory),
])

In [None]:
# Upper bound on epochs per run. 200 epochs are a lot and only matter for the
# low learning rate; even then they are not enough, but the runtime of the
# program already exceeded 3 hours.
MAX_EPOCHS = 200
LOG_INTERVAL = 2000   # number of mini-batches averaged into one training-loss point
PATIENCE = 15         # epochs without validation improvement before stopping


def evaluate(model, loader, loss_fn):
    """Evaluate `model` on every batch of `loader` without tracking gradients.

    Args:
        model: Module mapping a batch of inputs to class scores.
        loader: Iterable of (inputs, labels) batches that supports len().
        loss_fn: Callable taking (outputs, labels) and returning a scalar tensor.

    Returns:
        Tuple (mean loss per batch, fraction of correctly classified samples).
    """
    was_training = model.training
    model.eval()
    loss_sum = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in loader:
            outputs = model(images)
            # The class with the highest score is the prediction.
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            # .item() keeps the accumulator a plain float instead of a tensor.
            loss_sum += loss_fn(outputs, labels).item()
    model.train(was_training)
    # max(..., 1) avoids a ZeroDivisionError for an empty loader.
    return loss_sum / max(len(loader), 1), correct / max(total, 1)


for optimizer_name, optimizer_function in optimizer_dict.items():
    writer = SummaryWriter(f'./diff_optimizers/{optimizer_name}')
    try:
        net = Net()
        # The optimizer has to be created here, after the fresh model exists,
        # which is why optimizer_dict stores factories rather than optimizers.
        optimizer = optimizer_function(net.parameters())
        early_stopping = EarlyStopping(patience=PATIENCE, delta=0)
        steps_per_epoch = len(trainloader)
        global_step = 0

        for epoch in range(MAX_EPOCHS):
            net.train()
            running_loss = 0.0
            for i, data in enumerate(trainloader, 0):
                # data is a pair of [inputs, labels]
                inputs, labels = data

                optimizer.zero_grad()

                # forward + backward + optimize
                outputs = net(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                # Statistics on TRAINING data, averaged over LOG_INTERVAL batches.
                running_loss += loss.item()
                if i % LOG_INTERVAL == LOG_INTERVAL - 1:
                    avg_running_loss = running_loss / LOG_INTERVAL
                    print(f'[{epoch + 1}, {i + 1:5d}] loss: {avg_running_loss:.3f}')
                    writer.add_scalar("Loss/train", avg_running_loss, epoch * steps_per_epoch + i)
                    writer.flush()
                    running_loss = 0.0

            # Statistics on VALIDATION data, logged at the step count reached at
            # the end of this epoch so they line up with the training curve.
            global_step = (epoch + 1) * steps_per_epoch
            val_loss, val_accuracy = evaluate(net, valloader, criterion)
            writer.add_scalar("Loss/validation", val_loss, global_step)
            writer.add_scalar("Accuracy/validation", val_accuracy, global_step)
            writer.flush()

            early_stopping(val_loss, net)
            if early_stopping.early_stop:
                print("Early stopping")
                # epoch is 0-based; log the number of completed epochs.
                writer.add_scalar("Early stopping after epochs", epoch + 1)
                break

        # Evaluate the model restored by early stopping on the TEST data. The
        # loss is computed from scratch and averaged over the test batches.
        early_stopping.load_best_model(net)
        test_loss, test_accuracy = evaluate(net, testloader, criterion)
        writer.add_scalar("Loss/test", test_loss, global_step)
        writer.add_scalar("Accuracy/test", test_accuracy, global_step)
        writer.flush()
    finally:
        # Close the event file even if the run is interrupted.
        writer.close()

    PATH = f'./diff_optimizers/{optimizer_name}.pth'
    torch.save(net.state_dict(), PATH)
    print('Finished Training')


[1,  2000] loss: 2.303
[1,  4000] loss: 2.298
[1,  6000] loss: 2.290
[1,  8000] loss: 2.266
[1, 10000] loss: 2.216
[2,  2000] loss: 2.129
[2,  4000] loss: 2.044
[2,  6000] loss: 1.986
[2,  8000] loss: 1.909
[2, 10000] loss: 1.856
[3,  2000] loss: 1.791
[3,  4000] loss: 1.742
[3,  6000] loss: 1.697
[3,  8000] loss: 1.667
[3, 10000] loss: 1.647
[4,  2000] loss: 1.618
[4,  4000] loss: 1.598
[4,  6000] loss: 1.567
[4,  8000] loss: 1.543
[4, 10000] loss: 1.531
[5,  2000] loss: 1.512
[5,  4000] loss: 1.496
[5,  6000] loss: 1.486
[5,  8000] loss: 1.457
[5, 10000] loss: 1.471
[6,  2000] loss: 1.441
[6,  4000] loss: 1.438
[6,  6000] loss: 1.407
[6,  8000] loss: 1.400
[6, 10000] loss: 1.396
[7,  2000] loss: 1.392
[7,  4000] loss: 1.365
[7,  6000] loss: 1.346
[7,  8000] loss: 1.341
[7, 10000] loss: 1.343
[8,  2000] loss: 1.318
[8,  4000] loss: 1.312
[8,  6000] loss: 1.302
[8,  8000] loss: 1.306
[8, 10000] loss: 1.289
[9,  2000] loss: 1.247
[9,  4000] loss: 1.269
[9,  6000] loss: 1.256
[9,  8000] 