Since the dataset remains to be the same, I will skip the minimal EDA, required in the previous coursework.

## Preprocess the data

### Download the datasets

In [1]:
from torchvision import datasets
from torchvision import transforms

transform = transforms.Compose([
 transforms.ToTensor(),
 transforms.Normalize((0.5,), (0.5,))
])

trainset = datasets.FashionMNIST(
 root="./data",
 train=True,
 download=True,
 transform=transform
)
testset = datasets.FashionMNIST(
 root="./data",
 train=False,
 download=True,
 transform=transform
)


### Create Validation Dataset

In [2]:
import torch
import torch.utils.data as torch_data

val, train = torch_data.random_split(trainset, [int(len(trainset)*0.15), int(len(trainset)*0.85)], generator=torch.Generator().manual_seed(42))

### Create data loaders

In [3]:
BATCH_SIZE = 64

train_loader = torch_data.DataLoader(train, BATCH_SIZE, shuffle=True)
val_loader = torch_data.DataLoader(val, BATCH_SIZE)
test_loader = torch_data.DataLoader(testset, BATCH_SIZE)

### Create the Convolution Model

In [4]:
import torch

from torch import nn



class ParameterizedConvTwoD(nn.Module):

  def __init__(self, img_height: int, img_width: int, num_classes: int, hidden_layers: list[int], use_batch_norm: bool, dropout_rate: float = 0.0,  in_channels: int = 1, pool: type[nn.Module] = None, use_gap: bool = True, stride: int = 2, padding: int = 1) -> None:

    super().__init__()

    self.use_gap = use_gap
    self.layers = nn.ModuleList()
    self.current_in = in_channels
    h, w = img_height, img_width


    for h_layer in hidden_layers:

      # init hidden layer

      self.layers.append(nn.Conv2d(self.current_in, h_layer, kernel_size=3, padding=padding))

      if (use_batch_norm):

        self.layers.append(nn.BatchNorm2d(h_layer))

      self.layers.append(nn.ReLU())

      if pool:
        self.layers.append(pool(kernel_size=2, stride=stride))
        h //= 2
        w //= 2

      # if dropout

      if dropout_rate > 0:
        self.layers.append(nn.Dropout2d(dropout_rate))
      self.current_in = h_layer

    if self.use_gap:
      self.gap_layer = nn.AdaptiveAvgPool2d((1, 1))
      in_features = self.current_in
    else:
      self.gap_layer = None
      in_features = self.current_in * h * w

    self.classifier = nn.Linear(in_features, num_classes)


  def forward(self, x):
    for layer in self.layers:

      x = layer(x)

    if self.use_gap:
      x = self.gap_layer(x)
      
    x = torch.flatten(x, 1)
    x = self.classifier(x)

    return x

### Set up a seed for output reproducibility

In [5]:
import numpy as np

def set_seed(seed=42):
  torch.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)
  np.random.seed(seed)
  torch.backends.cudnn.deterministic = True

set_seed()

### Set up Logging

In [6]:
from torch.utils.tensorboard import SummaryWriter

## Model Training

### Training Params

In [7]:
import os
import torch.optim as optim
from sklearn.metrics import f1_score

# Set training params up
criterion = nn.CrossEntropyLoss()
patience = 8 # early stopping param - how many epochs to wait for an improvement

# Calculate input size
image_shape = trainset[0][0].shape
input_size = image_shape[1] * image_shape[2]

# Set up model config
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
best_model_path = 'models/best_model.pth'
os.makedirs('models', exist_ok=True)

# Set up metric tracking
train_loss_history = []
val_loss_history = []
train_f1_history = []
val_f1_history = []


In [8]:
def training_loop(epochs, model, optimizer, criterion, patience, writer, early_stopping, lr_scheduler=None):
  if early_stopping:
    patience_counter = 0
    best_val_loss = float('inf')
    # Training loop
  for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    all_train_predictions = []
    all_train_labels = []

    for train_images, train_labels in train_loader:
      # Move to cuda's vram, if cuda avaliable (torch handles that)
      train_images, train_labels = train_images.to(device), train_labels.to(device)

      optimizer.zero_grad()

      train_outputs = model(train_images)
      _, predicted = torch.max(train_outputs, dim=1)
      correct_predictions += (predicted == train_labels).sum().item()
      total_predictions += train_labels.size(0)

      train_loss = criterion(train_outputs, train_labels)
      running_loss += train_loss.item()

      all_train_predictions.extend(predicted.cpu().numpy())
      all_train_labels.extend(train_labels.cpu().numpy())

      train_loss.backward()

      optimizer.step()


    # calculate avg loss
    avg_train_loss = running_loss / len(train_loader)
    train_loss_history.append(avg_train_loss)
    train_f1 = f1_score(y_pred=all_train_predictions, y_true=all_train_labels, average='macro')
    train_f1_history.append(train_f1)

    all_val_predictions = []
    all_val_labels = []

    # eval
    model.eval()
    with torch.no_grad():
      val_running_loss = 0.0
      correct_predictions = 0
      total_predictions = 0

      for val_images, val_labels in val_loader:
        val_images, val_labels = val_images.to(device), val_labels.to(device)
        val_outputs = model(val_images)
        _, predicted = torch.max(val_outputs, dim=1)
        correct_predictions += (predicted == val_labels).sum().item()
        total_predictions += val_labels.size(0)

        val_loss = criterion(val_outputs, val_labels)
        val_running_loss += val_loss.item()

        all_val_predictions.extend(predicted.cpu().numpy())
        all_val_labels.extend(val_labels.cpu().numpy())

      avg_val_loss = val_running_loss / len(val_loader)
      val_loss_history.append(avg_val_loss)
      val_f1 = f1_score(y_pred=all_val_predictions, y_true=all_val_labels, average='macro')
      val_f1_history.append(val_f1)

    # Step the scheduler if provided
    if lr_scheduler is not None:
        if isinstance(lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
            lr_scheduler.step(avg_val_loss)  # Needs metric
        else:
            lr_scheduler.step()  # StepLR, etc. don't need metric

        # Log current learning rate to TensorBoard
        current_lr = optimizer.param_groups[0]['lr']
        writer.add_scalar('Learning_Rate', current_lr, epoch)

    # log the stats to tensorboard
    writer.add_scalar('Loss/train', avg_train_loss, epoch)
    writer.add_scalar('Loss/val', avg_val_loss, epoch)
    writer.add_scalar("F1/val", val_f1, epoch)
    writer.add_scalar("F1/train", train_f1, epoch)

    # early stopping logic
    if (early_stopping):
      if (avg_val_loss < best_val_loss):
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), best_model_path)
        patience_counter = 0
        print(f"Epoch {epoch+1}/{epochs} - Model saved! Val loss improved to {best_val_loss:.4f}")
      else:
        patience_counter += 1
        print(f"Epoch {epoch+1}/{epochs} - No improvement. Patience: {patience_counter}/{patience}")

      if (patience_counter >= patience):
        print(f"Early stopping triggered after {epoch+1} epochs")
        break

    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss:.4f}, Train F1: {train_f1:.4f}, Val Loss: {avg_val_loss:.4f}, Val F1: {val_f1:.4f}")




  writer.close()

### Baseline Model

In [18]:
from torchsummary import summary

# Set model up
LR = 0.001
EPOCHS = 20
hidden_layers = [64,32]
use_batch_norm = False
early_stopping = True

model = ParameterizedConvTwoD(
    img_height=28,
    img_width=28,
    num_classes=10,
    hidden_layers=hidden_layers,
    use_batch_norm=use_batch_norm,
)

optimizer = optim.Adam(model.parameters(), lr=LR)

model.to(device)
writer = SummaryWriter(log_dir='runs/baseline')

training_loop(epochs=EPOCHS, criterion=criterion, model=model, optimizer=optimizer, patience=patience, writer=writer, early_stopping=early_stopping)
summary(model, input_size=(1, 28, 28))

Epoch 1/20 - Model saved! Val loss improved to 1.1256
Epoch 1/20, Train Loss: 1.4635, Train F1: 0.4706, Val Loss: 1.1256, Val F1: 0.5775
Epoch 2/20 - Model saved! Val loss improved to 1.0156
Epoch 2/20, Train Loss: 1.0647, Train F1: 0.6119, Val Loss: 1.0156, Val F1: 0.6083
Epoch 3/20 - Model saved! Val loss improved to 0.9576
Epoch 3/20, Train Loss: 0.9868, Train F1: 0.6415, Val Loss: 0.9576, Val F1: 0.6397
Epoch 4/20 - Model saved! Val loss improved to 0.8912
Epoch 4/20, Train Loss: 0.9332, Train F1: 0.6630, Val Loss: 0.8912, Val F1: 0.6785
Epoch 5/20 - Model saved! Val loss improved to 0.8575
Epoch 5/20, Train Loss: 0.8834, Train F1: 0.6859, Val Loss: 0.8575, Val F1: 0.6855
Epoch 6/20 - Model saved! Val loss improved to 0.7941
Epoch 6/20, Train Loss: 0.8291, Train F1: 0.7080, Val Loss: 0.7941, Val F1: 0.7136
Epoch 7/20 - Model saved! Val loss improved to 0.7383
Epoch 7/20, Train Loss: 0.7810, Train F1: 0.7297, Val Loss: 0.7383, Val F1: 0.7451
Epoch 8/20 - Model saved! Val loss improv

### Max Pooling

In [19]:
pool = nn.MaxPool2d

model = ParameterizedConvTwoD(
    img_height=28,
    img_width=28,
    num_classes=10,
    hidden_layers=hidden_layers,
    use_batch_norm=use_batch_norm,
    pool=pool
)

optimizer = optim.Adam(model.parameters(), lr=LR)


model.to(device)
writer = SummaryWriter(log_dir='runs/max_pooling')

training_loop(epochs=EPOCHS, criterion=criterion, model=model, optimizer=optimizer, patience=patience, writer=writer, early_stopping=early_stopping)
summary(model, input_size=(1, 28, 28))

Epoch 1/20 - Model saved! Val loss improved to 0.8840
Epoch 1/20, Train Loss: 1.2317, Train F1: 0.5698, Val Loss: 0.8840, Val F1: 0.6781
Epoch 2/20 - Model saved! Val loss improved to 0.7563
Epoch 2/20, Train Loss: 0.8147, Train F1: 0.7088, Val Loss: 0.7563, Val F1: 0.7187
Epoch 3/20 - Model saved! Val loss improved to 0.6920
Epoch 3/20, Train Loss: 0.7155, Train F1: 0.7401, Val Loss: 0.6920, Val F1: 0.7373
Epoch 4/20 - Model saved! Val loss improved to 0.6365
Epoch 4/20, Train Loss: 0.6558, Train F1: 0.7634, Val Loss: 0.6365, Val F1: 0.7711
Epoch 5/20 - Model saved! Val loss improved to 0.6144
Epoch 5/20, Train Loss: 0.6198, Train F1: 0.7749, Val Loss: 0.6144, Val F1: 0.7638
Epoch 6/20 - Model saved! Val loss improved to 0.5703
Epoch 6/20, Train Loss: 0.5847, Train F1: 0.7886, Val Loss: 0.5703, Val F1: 0.7986
Epoch 7/20 - No improvement. Patience: 1/8
Epoch 7/20, Train Loss: 0.5635, Train F1: 0.7987, Val Loss: 0.5721, Val F1: 0.7994
Epoch 8/20 - Model saved! Val loss improved to 0.532

### Average Pooling

In [20]:
pool = nn.AvgPool2d

model = ParameterizedConvTwoD(
    img_height=28,
    img_width=28,
    num_classes=10,
    hidden_layers=hidden_layers,
    use_batch_norm=use_batch_norm,
    pool=pool
)

optimizer = optim.Adam(model.parameters(), lr=LR)


model.to(device)
writer = SummaryWriter(log_dir='runs/avg_pooling')

training_loop(epochs=EPOCHS, criterion=criterion, model=model, optimizer=optimizer, patience=patience, writer=writer, early_stopping=early_stopping)
summary(model, input_size=(1, 28, 28))

Epoch 1/20 - Model saved! Val loss improved to 1.0602
Epoch 1/20, Train Loss: 1.3857, Train F1: 0.5006, Val Loss: 1.0602, Val F1: 0.6168
Epoch 2/20 - Model saved! Val loss improved to 0.8914
Epoch 2/20, Train Loss: 0.9552, Train F1: 0.6544, Val Loss: 0.8914, Val F1: 0.6509
Epoch 3/20 - Model saved! Val loss improved to 0.7989
Epoch 3/20, Train Loss: 0.8348, Train F1: 0.6970, Val Loss: 0.7989, Val F1: 0.7016
Epoch 4/20 - Model saved! Val loss improved to 0.7494
Epoch 4/20, Train Loss: 0.7692, Train F1: 0.7241, Val Loss: 0.7494, Val F1: 0.7306
Epoch 5/20 - Model saved! Val loss improved to 0.7001
Epoch 5/20, Train Loss: 0.7264, Train F1: 0.7384, Val Loss: 0.7001, Val F1: 0.7504
Epoch 6/20 - Model saved! Val loss improved to 0.6925
Epoch 6/20, Train Loss: 0.6973, Train F1: 0.7495, Val Loss: 0.6925, Val F1: 0.7535
Epoch 7/20 - Model saved! Val loss improved to 0.6562
Epoch 7/20, Train Loss: 0.6697, Train F1: 0.7619, Val Loss: 0.6562, Val F1: 0.7635
Epoch 8/20 - Model saved! Val loss improv

### Global Average Pooling

In [21]:
use_gap = True

model = ParameterizedConvTwoD(
    img_height=28,
    img_width=28,
    num_classes=10,
    hidden_layers=hidden_layers,
    use_batch_norm=use_batch_norm,
    pool=pool,
    use_gap=use_gap
)

optimizer = optim.Adam(model.parameters(), lr=LR)


model.to(device)
writer = SummaryWriter(log_dir='runs/gap')

training_loop(epochs=EPOCHS, criterion=criterion, model=model, optimizer=optimizer, patience=patience, writer=writer, early_stopping=early_stopping)
summary(model, input_size=(1, 28, 28))

Epoch 1/20 - Model saved! Val loss improved to 1.0344
Epoch 1/20, Train Loss: 1.3647, Train F1: 0.5051, Val Loss: 1.0344, Val F1: 0.6131
Epoch 2/20 - Model saved! Val loss improved to 0.9080
Epoch 2/20, Train Loss: 0.9531, Train F1: 0.6541, Val Loss: 0.9080, Val F1: 0.6738
Epoch 3/20 - Model saved! Val loss improved to 0.8122
Epoch 3/20, Train Loss: 0.8552, Train F1: 0.6919, Val Loss: 0.8122, Val F1: 0.6886
Epoch 4/20 - Model saved! Val loss improved to 0.7592
Epoch 4/20, Train Loss: 0.7965, Train F1: 0.7147, Val Loss: 0.7592, Val F1: 0.7229
Epoch 5/20 - Model saved! Val loss improved to 0.7403
Epoch 5/20, Train Loss: 0.7532, Train F1: 0.7314, Val Loss: 0.7403, Val F1: 0.7400
Epoch 6/20 - Model saved! Val loss improved to 0.6839
Epoch 6/20, Train Loss: 0.7148, Train F1: 0.7483, Val Loss: 0.6839, Val F1: 0.7547
Epoch 7/20 - Model saved! Val loss improved to 0.6595
Epoch 7/20, Train Loss: 0.6877, Train F1: 0.7577, Val Loss: 0.6595, Val F1: 0.7651
Epoch 8/20 - Model saved! Val loss improv

### Dropout & Batch Normalization

In [22]:
use_batch_norm = True

model = ParameterizedConvTwoD(
    img_height=28,
    img_width=28,
    num_classes=10,
    hidden_layers=hidden_layers,
    use_batch_norm=use_batch_norm,
    dropout_rate=0.2,
    pool=pool,
    use_gap=use_gap
)

optimizer = optim.Adam(model.parameters(), lr=LR)


model.to(device)
writer = SummaryWriter(log_dir='runs/dropout_batchnorm')

training_loop(epochs=EPOCHS, criterion=criterion, model=model, optimizer=optimizer, patience=patience, writer=writer, early_stopping=early_stopping)
summary(model, input_size=(1, 28, 28))

Epoch 1/20 - Model saved! Val loss improved to 1.0839
Epoch 1/20, Train Loss: 1.4736, Train F1: 0.4782, Val Loss: 1.0839, Val F1: 0.6518
Epoch 2/20 - Model saved! Val loss improved to 0.9257
Epoch 2/20, Train Loss: 1.1346, Train F1: 0.5860, Val Loss: 0.9257, Val F1: 0.7044
Epoch 3/20 - Model saved! Val loss improved to 0.8308
Epoch 3/20, Train Loss: 1.0283, Train F1: 0.6223, Val Loss: 0.8308, Val F1: 0.7150
Epoch 4/20 - Model saved! Val loss improved to 0.7673
Epoch 4/20, Train Loss: 0.9585, Train F1: 0.6486, Val Loss: 0.7673, Val F1: 0.7396
Epoch 5/20 - Model saved! Val loss improved to 0.7223
Epoch 5/20, Train Loss: 0.9060, Train F1: 0.6687, Val Loss: 0.7223, Val F1: 0.7488
Epoch 6/20 - Model saved! Val loss improved to 0.6860
Epoch 6/20, Train Loss: 0.8608, Train F1: 0.6857, Val Loss: 0.6860, Val F1: 0.7746
Epoch 7/20 - Model saved! Val loss improved to 0.6687
Epoch 7/20, Train Loss: 0.8273, Train F1: 0.7001, Val Loss: 0.6687, Val F1: 0.7799
Epoch 8/20 - Model saved! Val loss improv

### Increased Stride & Padding

In [23]:
use_batch_norm = True

model = ParameterizedConvTwoD(
    img_height=28,
    img_width=28,
    num_classes=10,
    stride=2,
    padding=2,
    hidden_layers=hidden_layers,
    use_batch_norm=use_batch_norm,
    dropout_rate=0.2,
    pool=pool,
    use_gap=use_gap
)

optimizer = optim.Adam(model.parameters(), lr=LR)


model.to(device)
writer = SummaryWriter(log_dir='runs/sride_padding')

training_loop(epochs=EPOCHS, criterion=criterion, model=model, optimizer=optimizer, patience=patience, writer=writer, early_stopping=early_stopping)
summary(model, input_size=(1, 28, 28))

Epoch 1/20 - Model saved! Val loss improved to 1.1867
Epoch 1/20, Train Loss: 1.5755, Train F1: 0.4231, Val Loss: 1.1867, Val F1: 0.6397
Epoch 2/20 - Model saved! Val loss improved to 0.9934
Epoch 2/20, Train Loss: 1.2094, Train F1: 0.5512, Val Loss: 0.9934, Val F1: 0.6559
Epoch 3/20 - Model saved! Val loss improved to 0.8949
Epoch 3/20, Train Loss: 1.0910, Train F1: 0.5938, Val Loss: 0.8949, Val F1: 0.7041
Epoch 4/20 - Model saved! Val loss improved to 0.8280
Epoch 4/20, Train Loss: 1.0102, Train F1: 0.6241, Val Loss: 0.8280, Val F1: 0.7176
Epoch 5/20 - Model saved! Val loss improved to 0.7929
Epoch 5/20, Train Loss: 0.9505, Train F1: 0.6480, Val Loss: 0.7929, Val F1: 0.7364
Epoch 6/20 - Model saved! Val loss improved to 0.7226
Epoch 6/20, Train Loss: 0.9016, Train F1: 0.6666, Val Loss: 0.7226, Val F1: 0.7510
Epoch 7/20 - Model saved! Val loss improved to 0.6905
Epoch 7/20, Train Loss: 0.8582, Train F1: 0.6824, Val Loss: 0.6905, Val F1: 0.7525
Epoch 8/20 - No improvement. Patience: 1/

## Test Set Evaluation

In [24]:
def evaluate_on_test(model, test_loader, criterion, device, writer=None, experiment_name="test"):
    model.eval()
    test_running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0
    
    all_test_predictions = []
    all_test_labels = []
    
    with torch.no_grad():
        for test_images, test_labels in test_loader:
            test_images, test_labels = test_images.to(device), test_labels.to(device)
            
            test_outputs = model(test_images)
            _, predicted = torch.max(test_outputs, dim=1)
            
            correct_predictions += (predicted == test_labels).sum().item()
            total_predictions += test_labels.size(0)
            
            test_loss = criterion(test_outputs, test_labels)
            test_running_loss += test_loss.item()
            
            all_test_predictions.extend(predicted.cpu().numpy())
            all_test_labels.extend(test_labels.cpu().numpy())
    
    avg_test_loss = test_running_loss / len(test_loader)
    test_accuracy = correct_predictions / total_predictions
    test_f1 = f1_score(y_pred=all_test_predictions, y_true=all_test_labels, average='macro')
    
    # Log to TensorBoard if writer is provided
    if writer is not None:
        writer.add_scalar('Test/Loss', avg_test_loss, 0)
        writer.add_scalar('Test/Accuracy', test_accuracy, 0)
        writer.add_scalar('Test/F1', test_f1, 0)
        writer.close()
    
    print(f"\n{'='*50}")
    print(f"Test Results for {experiment_name}:")
    print(f"{'='*50}")
    print(f"Test Loss: {avg_test_loss:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")
    print(f"Test F1 Score: {test_f1:.4f}")
    print(f"{'='*50}\n")
    
    return avg_test_loss, test_accuracy, test_f1

### Evaluate Best Model on Test Set

Load the best saved model and evaluate it on the test set.

In [25]:
# Load the best model from training
if os.path.exists(best_model_path):
    model.load_state_dict(torch.load(best_model_path))
    print(f"Loaded best model from {best_model_path}")
else:
    print("No saved model found, using current model state")

# Create a TensorBoard writer for test results
test_writer = SummaryWriter(log_dir='runs/test_evaluation')

# Evaluate on test set
test_loss, test_accuracy, test_f1 = evaluate_on_test(
    model=model,
    test_loader=test_loader,
    criterion=criterion,
    device=device,
    writer=test_writer,
    experiment_name="Final Model"
)

Loaded best model from models/best_model.pth


  model.load_state_dict(torch.load(best_model_path))



Test Results for Final Model:
Test Loss: 0.5214
Test Accuracy: 0.8294
Test F1 Score: 0.8248



## Analysis

What was common for all of the models, were the following parameters:

```py
LR = 0.001
EPOCHS = 20
hidden_layers = [64,32]
use_batch_norm = False
early_stopping = True
```

Then, I kept creating new models with one incremental change at a time.

### Baseline vs Max Pooling:

For whatever reason, baseline model wasn't tracked fully and it wasn't early stopping that was the cause. Honestly, I don't know what the cause was, I rna baseline code 2 times and in both cases the tracking stopped at step 3. 

Despite that, charts tell us much because the loss is smaller for max pooling model and f1 score is higher. This is no evidence that the best model would always be max pooling one, but it tells us it's highly likely, I think.

![Baseline - Max Pooling](./analysis/baseline_maxpooling.png)


### Max Pooling vs Average Pooling

Max pooling outperformed average pooling model on both metrics. I left average pooling for the rest of the models, which means the final best model may not be the very best that could have been but maybe thanks to this choice, the influence of other changes will be more visible.

![max vs avg](./analysis/maxpool_avgpool.png)


### Average Pooling vs Global Average Pooling

Aaand the model got worse again. On both metrics. At this point I can rationally stop hoping to create the best model there could be (with at least somewhat close approximation - I wouldn't be exploring all the values of all the parameters anyway). My goal will remain purely experimental, to see how certain parameters affect the model's performance. Clearly GAP was not the right choice for this combination of parameters.

![avg vs gap](./analysis/avgpool_gap.png)


### Global Average Pooling vs Dropout & Batch Normalization

Finally something more interesting, because metrics don't reveal the better model at a first glance. In terms of f1 score - training dropout and batchnorm model performed worse than gap model, but on validation set the tides reversed. That means the dropout and batchnorm model actually 'learned' instead of memorizing. Something similar is shown on the loss metric, after 7th epoch on loss val, dropout and batchnorm model got better and stayed better than the other - and that was the most important part of the loss tracking. So it ultimately won.

![gap vs dropbatch](./analysis/gap_dropbatch.png)


### Dropout & Batch Normalization vs Stride & Padding

Increasing stride and padding did not turn out for the better. The model falls short to the other in terms of both f1 as well as loss.

![dropbatch vs stridepad](./analysis/dropbatch_stridepad.png)


### Test Set

After all the changes, during the very last training phase, the best model was saved. We might call it best local model, not best global model, as the analysis revealed that. 

The final results are:

F1: 0,8248
Loss: 0,5214

For comparison, our best MLP model from the previous coursework (modified-up-to-dropout-0.5-model) achieved the following results:

F1: >0.895
Loss: <0.3

That means it was worse performence-wise. But I felt the potential and perhaps it indeed could've been better.