### Libraries

In [29]:
from sklearn.model_selection import ParameterGrid
from tqdm import tqdm
import numpy as np

from torch.utils.tensorboard import SummaryWriter
from torchvision.datasets import FashionMNIST
import torchvision
import torch

In [4]:
# Uncomment the next line to unpack an archive of recorded runs into the current directory.
# !unzip experiment.zip -d .
# Load the TensorBoard notebook extension and point it at the ./experiment/ log directory.
%load_ext tensorboard
%tensorboard --logdir ./experiment/

Archive:  experiment.zip
   creating: ./experiment/
   creating: ./experiment/{'bn': True, 'dropout': 0.25, 'kernel_size': 5, 'layers': 2, 'pooling': True}/
  inflating: ./experiment/{'bn': True, 'dropout': 0.25, 'kernel_size': 5, 'layers': 2, 'pooling': True}/events.out.tfevents.1711283788.VOKERLEE.22547.19  
  inflating: ./experiment/{'bn': True, 'dropout': 0.25, 'kernel_size': 5, 'layers': 2, 'pooling': True}/events.out.tfevents.1709936094.VOKERLEE.29627.16  
   creating: ./experiment/{'bn': False, 'dropout': 0.25, 'kernel_size': 3, 'layers': 3, 'pooling': False}/
  inflating: ./experiment/{'bn': False, 'dropout': 0.25, 'kernel_size': 3, 'layers': 3, 'pooling': False}/events.out.tfevents.1711285710.VOKERLEE.22547.54  
  inflating: ./experiment/{'bn': False, 'dropout': 0.25, 'kernel_size': 3, 'layers': 3, 'pooling': False}/events.out.tfevents.1709938256.VOKERLEE.29627.51  
   creating: ./experiment/{'bn': True, 'dropout': 0.5, 'kernel_size': 5, 'layers': 2, 'pooling': False}/
  infla

Reusing TensorBoard on port 6006 (pid 25549), started 0:03:15 ago. (Use '!kill 25549' to kill it.)

In [31]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
device

device(type='cuda')

### Dataset & general parameters

Используем датасет `FashionMNIST`.

In [32]:
# Seed torch's global random number generator.
random_seed = 61
torch.manual_seed(random_seed)

# Mini-batch sizes used for training and for evaluation.
batch_size_train = batch_size_test = 64

In [33]:
# Both splits go through the same preprocessing: convert the image to a tensor,
# then standardise it with a fixed mean (0.1307) and std (0.3081).
# NOTE(review): these constants look like the statistics usually quoted for the
# original MNIST digits rather than values measured on FashionMNIST -- confirm.
preprocessing = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize((0.1307,), (0.3081,)),
])

train_dataset = FashionMNIST('.', train=True, download=True, transform=preprocessing)
test_dataset = FashionMNIST('.', train=False, download=True, transform=preprocessing)

print(train_dataset)

Dataset FashionMNIST
    Number of datapoints: 60000
    Root location: .
    Split: Train
    StandardTransform
Transform: Compose(
               ToTensor()
               Normalize(mean=(0.1307,), std=(0.3081,))
           )


In [34]:
# Split the training set into a 45000-sample training part and a 15000-sample
# validation part.
# NOTE: this rebinds `train_dataset`, so the cell is not idempotent -- re-create
# the datasets before executing it a second time.
split_sizes = [45000, 15000]
train_dataset, val_dataset = torch.utils.data.random_split(train_dataset, split_sizes)
print(len(train_dataset), len(val_dataset), len(test_dataset))

45000 15000 10000


### General training code

Самый обычный код для обучения, взятый с семинаров.

In [35]:
def train_on_batch(model, x_batch, y_batch, optimizer, loss_function):
    """Perform a single optimisation step on one mini-batch.

    Args:
        model: network to update; it is switched to training mode.
        x_batch: batch of inputs, moved to the notebook-level ``device``.
        y_batch: batch of targets, moved to the notebook-level ``device``.
        optimizer: optimiser whose ``step()`` applies the update.
        loss_function: callable taking ``(output, target)`` and returning a
            scalar loss tensor.

    Returns:
        The loss on this batch as a Python number.
    """
    inputs = x_batch.to(device)
    targets = y_batch.to(device)

    model.train()
    # Clear gradients accumulated by the previous step.
    model.zero_grad()

    batch_loss = loss_function(model(inputs), targets)
    batch_loss.backward()
    optimizer.step()

    return batch_loss.item()

In [36]:
def train_epoch(train_generator, model, loss_function, optimizer, callback=None):
    """Train ``model`` for one pass over ``train_generator``.

    Args:
        train_generator: iterable yielding ``(inputs, targets)`` batches.
        model: network being trained.
        loss_function: loss forwarded to ``train_on_batch``.
        optimizer: optimiser instance forwarded to ``train_on_batch``.
        callback: optional callable invoked as ``callback(model, batch_loss)``
            after every batch.

    Returns:
        The mean loss per sample over the epoch (batch losses weighted by the
        number of samples in each batch).
    """
    weighted_loss_sum = 0
    samples_seen = 0

    for inputs, targets in train_generator:
        current_loss = train_on_batch(model,
                                      inputs.to(device),
                                      targets.to(device),
                                      optimizer,
                                      loss_function)

        if callback is not None:
            callback(model, current_loss)

        # Weight by batch size so a smaller final batch does not skew the mean.
        n_samples = len(inputs)
        weighted_loss_sum += current_loss * n_samples
        samples_seen += n_samples

    return weighted_loss_sum / samples_seen

In [37]:
def trainer(count_of_epoch,
            batch_size,
            dataset,
            model,
            loss_function,
            optimizer,
            lr=0.001,
            callback=None):
    """Train ``model`` on ``dataset`` for ``count_of_epoch`` epochs.

    Args:
        count_of_epoch: number of passes over ``dataset``.
        batch_size: mini-batch size of the training loader.
        dataset: dataset to train on; batches are drawn in shuffled order.
        model: network to train (updated in place).
        loss_function: loss passed through to ``train_epoch``.
        optimizer: optimiser *class* (or factory); it is called as
            ``optimizer(model.parameters(), lr=lr)``.
        lr: learning rate handed to the optimiser.
        callback: optional per-batch callback passed through to ``train_epoch``.

    Returns:
        A list with the value returned by ``train_epoch`` for every epoch, in
        order (earlier versions returned ``None``).
    """
    optima = optimizer(model.parameters(), lr=lr)

    # The loader does not depend on the epoch, so build it once; with
    # shuffle=True a new permutation is drawn each time it is iterated.
    batch_generator = torch.utils.data.DataLoader(dataset=dataset,
                                                  batch_size=batch_size,
                                                  shuffle=True)

    epoch_losses = []
    for _ in range(count_of_epoch):
        epoch_loss = train_epoch(train_generator=batch_generator,
                                 model=model,
                                 loss_function=loss_function,
                                 optimizer=optima,
                                 callback=callback)
        epoch_losses.append(epoch_loss)

    return epoch_losses

In [38]:
def quality_of_train(batch_size,
                     dataset,
                     model,
                     loss_function):
    """Evaluate ``model`` on every sample of ``dataset``.

    The model is put into evaluation mode and gradients are disabled for the
    duration of the call, so dropout is inactive, batch-norm layers use (and do
    not update) their running statistics, and no autograd graph is built. The
    training/evaluation mode the model had on entry is restored before
    returning.

    Args:
        batch_size: mini-batch size used while iterating over ``dataset``.
        dataset: dataset yielding ``(input, target)`` pairs; iterated in order.
        model: network to evaluate.
        loss_function: callable taking ``(output, target)`` and returning a
            scalar loss tensor.

    Returns:
        A tuple ``(loss, pred, real)``: the mean loss per sample, the list of
        predicted class indices (argmax over the last output dimension) and the
        list of target labels.
    """
    batch_generator = torch.utils.data.DataLoader(dataset=dataset,
                                                  batch_size=batch_size)

    # Feed data to the device the model actually lives on; a model without
    # parameters falls back to the notebook-level `device`.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device

    pred = []
    real = []
    test_loss = 0

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for x_batch, y_batch in batch_generator:
                x_batch = x_batch.to(target_device)
                y_batch = y_batch.to(target_device)

                output = model(x_batch)

                # Weight by batch size: the last batch may be smaller.
                test_loss += loss_function(output, y_batch).item() * len(x_batch)

                pred.extend(torch.argmax(output, dim=-1).cpu().tolist())
                real.extend(y_batch.cpu().tolist())
    finally:
        # Hand the model back in the mode the caller left it in.
        model.train(was_training)

    test_loss /= len(dataset)

    return test_loss, pred, real

### CNN model

In [39]:
class CNN(torch.nn.Module):
    """Configurable convolutional classifier for single-channel square images.

    The network is ``n_layers`` repetitions of
    ``Conv2d -> [BatchNorm2d] -> ReLU -> [MaxPool2d(2)]`` (each convolution
    multiplies the channel count by 4, starting from 1), followed by
    ``Flatten -> Dropout -> Linear``. Sub-modules are registered in
    ``self.layers`` under the names ``conv{i}``, ``bn{i}``, ``relu{i}``,
    ``pool{i}``, ``flatten``, ``dropout1`` and ``linear1``.

    Args:
        n_layers: number of convolutional stages.
        kernel_size: convolution kernel size; padding is
            ``(kernel_size - 1) // 2``, which keeps the spatial size for odd
            kernels and shrinks it by one pixel per stage for even kernels.
        pooling: add a 2x2 max-pooling after every stage.
        batch_norm: add batch normalisation after every convolution.
        dropout: dropout probability applied before the final linear layer.
        input_size: height/width of the (square) input images.
        n_classes: number of output logits.

    Attributes:
        n_channels: channel count after the last convolutional stage.
        layers: the ``torch.nn.Sequential`` holding the whole network.
    """

    @property
    def device(self):
        """Device of the first parameter, or ``None`` if there are no parameters."""
        first_param = next(self.parameters(), None)
        return None if first_param is None else first_param.device

    def __init__(self, n_layers=1, kernel_size=5, pooling=False, batch_norm=False, dropout=0.0,
                 input_size=28, n_classes=10):
        super().__init__()

        self.n_channels = 1
        self.layers = torch.nn.Sequential()

        padding = (kernel_size - 1) // 2
        # Track the feature-map side length stage by stage instead of assuming
        # the convolutions preserve it (they do not for even kernel sizes).
        spatial_size = input_size

        for layer in range(n_layers):
            out_channels = self.n_channels * 4
            self.layers.add_module('conv' + str(layer),
                torch.nn.Conv2d(self.n_channels, out_channels,
                                kernel_size=kernel_size, padding=padding))
            self.n_channels = out_channels
            # Stride-1 convolution: out = in + 2 * padding - kernel_size + 1.
            spatial_size = spatial_size + 2 * padding - kernel_size + 1

            if batch_norm:
                self.layers.add_module('bn' + str(layer), torch.nn.BatchNorm2d(self.n_channels))

            self.layers.add_module('relu' + str(layer), torch.nn.ReLU())

            if pooling:
                self.layers.add_module('pool' + str(layer), torch.nn.MaxPool2d(kernel_size=2))
                # 2x2 pooling with stride 2 halves the side, rounding down.
                spatial_size //= 2

        self.layers.add_module('flatten', torch.nn.Flatten(start_dim=1))
        self.layers.add_module('dropout1', torch.nn.Dropout(dropout))
        self.layers.add_module('linear1',
            torch.nn.Linear(self.n_channels * spatial_size ** 2, n_classes))

    def forward(self, input):
        """Return the class logits for a batch of images."""
        return self.layers(input)

### Tensorboard training tracking

In [40]:
class callback():
    """Per-batch training hook that reports progress to TensorBoard.

    On every call the training loss is written under ``LOSS/train``. On every
    ``delimeter``-th call the model is additionally evaluated on the whole
    ``dataset`` with ``quality_of_train`` and the resulting loss is written
    under ``LOSS/test``. The model graph is written once, at the first such
    evaluation step.

    Args:
        writer: object providing ``add_scalar`` and ``add_graph``.
        dataset: dataset used for the periodic evaluation; its first sample is
            reshaped to ``(1, 1, 28, 28)`` and used as the example input for
            the graph.
        loss_function: loss used for the periodic evaluation.
        delimeter: evaluate every this many calls.
        batch_size: mini-batch size for the periodic evaluation.
    """

    def __init__(self, writer, dataset, loss_function, delimeter=100, batch_size=64):
        self.step = 0
        self.writer = writer
        self.delimeter = delimeter
        self.loss_function = loss_function
        self.batch_size = batch_size

        self.dataset = dataset

        # The architecture does not change during training, so the graph only
        # needs to be traced and written once per callback instance.
        self._graph_logged = False

    def forward(self, model, loss):
        """Log ``loss`` for the current step and, periodically, the evaluation loss."""
        self.step += 1
        self.writer.add_scalar('LOSS/train', loss, self.step)

        if self.step % self.delimeter != 0:
            return

        if not self._graph_logged:
            self.writer.add_graph(model, self.dataset[0][0].view(1, 1, 28, 28).to(model.device))
            self._graph_logged = True

        test_loss, _, _ = quality_of_train(batch_size=self.batch_size, dataset=self.dataset,
                                           model=model, loss_function=self.loss_function)
        self.writer.add_scalar('LOSS/test', test_loss, self.step)

    def __call__(self, model, loss):
        return self.forward(model, loss)

### Code for CNN training

In [41]:
# Loss used both for the training steps and for the periodic evaluation.
loss_function = torch.nn.CrossEntropyLoss()
# Optimiser class, not an instance: `trainer` calls it with the model parameters and `lr`.
optimizer = torch.optim.Adam

In [42]:
# Hyper-parameter grid: 2 * 3 * 2 * 2 * 3 = 72 configurations.
grid = ParameterGrid({
    'layers': [2, 3],
    'kernel_size': [3, 5, 7],
    'bn': [True, False],
    'pooling': [True, False],
    'dropout': [0.0, 0.25, 0.5],
})

# Final validation metrics of every configuration, keyed by str(config).
scores = dict()

for item in tqdm(grid):
    print(item)

    model = CNN(
        n_layers=item['layers'],
        kernel_size=item['kernel_size'],
        pooling=item['pooling'],
        batch_norm=item['bn'],
        dropout=item['dropout']
    )
    model.float().to(device)

    # The context manager closes the writer (flushing its event file) when the
    # run ends, including when training raises.
    with SummaryWriter('experiment/' + str(item)) as writer:
        # NOTE(review): the periodic 'LOSS/test' curve is computed on
        # test_dataset, so comparing configurations by it tunes on the test
        # split; val_dataset is held out for that purpose -- consider switching.
        call = callback(writer, test_dataset, loss_function, batch_size=batch_size_test, delimeter=10)

        trainer(count_of_epoch=1,
                batch_size=batch_size_train,
                dataset=train_dataset,
                model=model,
                loss_function=loss_function,
                optimizer=optimizer,
                lr=0.001,
                callback=call)

    # Score the trained configuration on the held-out validation split, with
    # dropout / batch-norm in inference mode and gradients disabled.
    model.eval()
    with torch.no_grad():
        val_loss, pred, real = quality_of_train(batch_size=batch_size_test,
                                                dataset=val_dataset,
                                                model=model,
                                                loss_function=loss_function)
    scores[str(item)] = {
        'val_loss': val_loss,
        'val_accuracy': float(np.mean(np.array(pred) == np.array(real))),
    }

  0%|          | 0/72 [00:00<?, ?it/s]

{'bn': True, 'dropout': 0.0, 'kernel_size': 3, 'layers': 2, 'pooling': True}


  1%|▏         | 1/72 [00:51<1:00:51, 51.43s/it]

{'bn': True, 'dropout': 0.0, 'kernel_size': 3, 'layers': 2, 'pooling': False}


  3%|▎         | 2/72 [02:02<1:13:15, 62.80s/it]

{'bn': True, 'dropout': 0.0, 'kernel_size': 3, 'layers': 3, 'pooling': True}


  4%|▍         | 3/72 [03:17<1:18:42, 68.45s/it]

{'bn': True, 'dropout': 0.0, 'kernel_size': 3, 'layers': 3, 'pooling': False}


  6%|▌         | 4/72 [04:07<1:09:30, 61.33s/it]

{'bn': True, 'dropout': 0.0, 'kernel_size': 5, 'layers': 2, 'pooling': True}


  7%|▋         | 5/72 [05:05<1:07:13, 60.20s/it]

{'bn': True, 'dropout': 0.0, 'kernel_size': 5, 'layers': 2, 'pooling': False}


  8%|▊         | 6/72 [05:58<1:03:27, 57.69s/it]

{'bn': True, 'dropout': 0.0, 'kernel_size': 5, 'layers': 3, 'pooling': True}


 10%|▉         | 7/72 [06:56<1:02:26, 57.64s/it]

{'bn': True, 'dropout': 0.0, 'kernel_size': 5, 'layers': 3, 'pooling': False}


 11%|█         | 8/72 [08:47<1:19:34, 74.60s/it]

{'bn': True, 'dropout': 0.0, 'kernel_size': 7, 'layers': 2, 'pooling': True}


 12%|█▎        | 9/72 [09:35<1:09:34, 66.26s/it]

{'bn': True, 'dropout': 0.0, 'kernel_size': 7, 'layers': 2, 'pooling': False}


 14%|█▍        | 10/72 [10:27<1:03:58, 61.90s/it]

{'bn': True, 'dropout': 0.0, 'kernel_size': 7, 'layers': 3, 'pooling': True}


 15%|█▌        | 11/72 [11:29<1:02:56, 61.90s/it]

{'bn': True, 'dropout': 0.0, 'kernel_size': 7, 'layers': 3, 'pooling': False}


 17%|█▋        | 12/72 [12:35<1:03:07, 63.12s/it]

{'bn': True, 'dropout': 0.25, 'kernel_size': 3, 'layers': 2, 'pooling': True}


 18%|█▊        | 13/72 [14:10<1:11:46, 72.99s/it]

{'bn': True, 'dropout': 0.25, 'kernel_size': 3, 'layers': 2, 'pooling': False}


 19%|█▉        | 14/72 [15:13<1:07:34, 69.91s/it]

{'bn': True, 'dropout': 0.25, 'kernel_size': 3, 'layers': 3, 'pooling': True}


 21%|██        | 15/72 [16:08<1:02:02, 65.30s/it]

{'bn': True, 'dropout': 0.25, 'kernel_size': 3, 'layers': 3, 'pooling': False}


 22%|██▏       | 16/72 [17:03<58:07, 62.28s/it]  

{'bn': True, 'dropout': 0.25, 'kernel_size': 5, 'layers': 2, 'pooling': True}


 24%|██▎       | 17/72 [17:55<54:14, 59.17s/it]

{'bn': True, 'dropout': 0.25, 'kernel_size': 5, 'layers': 2, 'pooling': False}


 25%|██▌       | 18/72 [18:44<50:26, 56.05s/it]

{'bn': True, 'dropout': 0.25, 'kernel_size': 5, 'layers': 3, 'pooling': True}


 26%|██▋       | 19/72 [19:36<48:25, 54.83s/it]

{'bn': True, 'dropout': 0.25, 'kernel_size': 5, 'layers': 3, 'pooling': False}


 28%|██▊       | 20/72 [20:32<47:58, 55.36s/it]

{'bn': True, 'dropout': 0.25, 'kernel_size': 7, 'layers': 2, 'pooling': True}


 29%|██▉       | 21/72 [21:19<44:49, 52.74s/it]

{'bn': True, 'dropout': 0.25, 'kernel_size': 7, 'layers': 2, 'pooling': False}


 31%|███       | 22/72 [22:07<42:54, 51.49s/it]

{'bn': True, 'dropout': 0.25, 'kernel_size': 7, 'layers': 3, 'pooling': True}


 32%|███▏      | 23/72 [23:04<43:16, 52.99s/it]

{'bn': True, 'dropout': 0.25, 'kernel_size': 7, 'layers': 3, 'pooling': False}


 33%|███▎      | 24/72 [24:05<44:13, 55.29s/it]

{'bn': True, 'dropout': 0.5, 'kernel_size': 3, 'layers': 2, 'pooling': True}


 35%|███▍      | 25/72 [24:53<41:38, 53.16s/it]

{'bn': True, 'dropout': 0.5, 'kernel_size': 3, 'layers': 2, 'pooling': False}


 36%|███▌      | 26/72 [25:41<39:43, 51.81s/it]

{'bn': True, 'dropout': 0.5, 'kernel_size': 3, 'layers': 3, 'pooling': True}


 38%|███▊      | 27/72 [26:35<39:08, 52.18s/it]

{'bn': True, 'dropout': 0.5, 'kernel_size': 3, 'layers': 3, 'pooling': False}


 39%|███▉      | 28/72 [27:25<37:54, 51.70s/it]

{'bn': True, 'dropout': 0.5, 'kernel_size': 5, 'layers': 2, 'pooling': True}


 40%|████      | 29/72 [28:13<36:13, 50.55s/it]

{'bn': True, 'dropout': 0.5, 'kernel_size': 5, 'layers': 2, 'pooling': False}


 42%|████▏     | 30/72 [29:00<34:39, 49.51s/it]

{'bn': True, 'dropout': 0.5, 'kernel_size': 5, 'layers': 3, 'pooling': True}


 43%|████▎     | 31/72 [29:52<34:14, 50.10s/it]

{'bn': True, 'dropout': 0.5, 'kernel_size': 5, 'layers': 3, 'pooling': False}


 44%|████▍     | 32/72 [30:47<34:32, 51.81s/it]

{'bn': True, 'dropout': 0.5, 'kernel_size': 7, 'layers': 2, 'pooling': True}


 46%|████▌     | 33/72 [31:42<34:11, 52.59s/it]

{'bn': True, 'dropout': 0.5, 'kernel_size': 7, 'layers': 2, 'pooling': False}


 47%|████▋     | 34/72 [32:48<35:52, 56.65s/it]

{'bn': True, 'dropout': 0.5, 'kernel_size': 7, 'layers': 3, 'pooling': True}


 49%|████▊     | 35/72 [33:54<36:46, 59.62s/it]

{'bn': True, 'dropout': 0.5, 'kernel_size': 7, 'layers': 3, 'pooling': False}


 50%|█████     | 36/72 [34:47<34:29, 57.50s/it]

{'bn': False, 'dropout': 0.0, 'kernel_size': 3, 'layers': 2, 'pooling': True}


 51%|█████▏    | 37/72 [35:41<32:54, 56.40s/it]

{'bn': False, 'dropout': 0.0, 'kernel_size': 3, 'layers': 2, 'pooling': False}


 53%|█████▎    | 38/72 [36:37<31:53, 56.29s/it]

{'bn': False, 'dropout': 0.0, 'kernel_size': 3, 'layers': 3, 'pooling': True}


 54%|█████▍    | 39/72 [37:34<31:07, 56.61s/it]

{'bn': False, 'dropout': 0.0, 'kernel_size': 3, 'layers': 3, 'pooling': False}


 56%|█████▌    | 40/72 [38:40<31:43, 59.48s/it]

{'bn': False, 'dropout': 0.0, 'kernel_size': 5, 'layers': 2, 'pooling': True}


 57%|█████▋    | 41/72 [39:35<29:54, 57.88s/it]

{'bn': False, 'dropout': 0.0, 'kernel_size': 5, 'layers': 2, 'pooling': False}


 58%|█████▊    | 42/72 [40:20<27:03, 54.11s/it]

{'bn': False, 'dropout': 0.0, 'kernel_size': 5, 'layers': 3, 'pooling': True}


 60%|█████▉    | 43/72 [41:13<25:59, 53.77s/it]

{'bn': False, 'dropout': 0.0, 'kernel_size': 5, 'layers': 3, 'pooling': False}


 61%|██████    | 44/72 [42:08<25:16, 54.16s/it]

{'bn': False, 'dropout': 0.0, 'kernel_size': 7, 'layers': 2, 'pooling': True}


 62%|██████▎   | 45/72 [42:56<23:34, 52.38s/it]

{'bn': False, 'dropout': 0.0, 'kernel_size': 7, 'layers': 2, 'pooling': False}


 64%|██████▍   | 46/72 [44:15<26:10, 60.39s/it]

{'bn': False, 'dropout': 0.0, 'kernel_size': 7, 'layers': 3, 'pooling': True}


 65%|██████▌   | 47/72 [45:27<26:34, 63.77s/it]

{'bn': False, 'dropout': 0.0, 'kernel_size': 7, 'layers': 3, 'pooling': False}


 67%|██████▋   | 48/72 [46:29<25:18, 63.29s/it]

{'bn': False, 'dropout': 0.25, 'kernel_size': 3, 'layers': 2, 'pooling': True}


 68%|██████▊   | 49/72 [47:21<22:58, 59.93s/it]

{'bn': False, 'dropout': 0.25, 'kernel_size': 3, 'layers': 2, 'pooling': False}


 69%|██████▉   | 50/72 [48:08<20:32, 56.03s/it]

{'bn': False, 'dropout': 0.25, 'kernel_size': 3, 'layers': 3, 'pooling': True}


 71%|███████   | 51/72 [49:05<19:45, 56.44s/it]

{'bn': False, 'dropout': 0.25, 'kernel_size': 3, 'layers': 3, 'pooling': False}


 72%|███████▏  | 52/72 [50:40<22:40, 68.02s/it]

{'bn': False, 'dropout': 0.25, 'kernel_size': 5, 'layers': 2, 'pooling': True}


 74%|███████▎  | 53/72 [51:27<19:31, 61.67s/it]

{'bn': False, 'dropout': 0.25, 'kernel_size': 5, 'layers': 2, 'pooling': False}


 75%|███████▌  | 54/72 [52:17<17:25, 58.07s/it]

{'bn': False, 'dropout': 0.25, 'kernel_size': 5, 'layers': 3, 'pooling': True}


 76%|███████▋  | 55/72 [53:12<16:11, 57.13s/it]

{'bn': False, 'dropout': 0.25, 'kernel_size': 5, 'layers': 3, 'pooling': False}


 78%|███████▊  | 56/72 [54:05<14:52, 55.78s/it]

{'bn': False, 'dropout': 0.25, 'kernel_size': 7, 'layers': 2, 'pooling': True}


 79%|███████▉  | 57/72 [55:15<15:03, 60.24s/it]

{'bn': False, 'dropout': 0.25, 'kernel_size': 7, 'layers': 2, 'pooling': False}


 81%|████████  | 58/72 [56:36<15:30, 66.44s/it]

{'bn': False, 'dropout': 0.25, 'kernel_size': 7, 'layers': 3, 'pooling': True}


 82%|████████▏ | 59/72 [57:29<13:32, 62.46s/it]

{'bn': False, 'dropout': 0.25, 'kernel_size': 7, 'layers': 3, 'pooling': False}


 83%|████████▎ | 60/72 [58:36<12:43, 63.65s/it]

{'bn': False, 'dropout': 0.5, 'kernel_size': 3, 'layers': 2, 'pooling': True}


 85%|████████▍ | 61/72 [59:25<10:52, 59.31s/it]

{'bn': False, 'dropout': 0.5, 'kernel_size': 3, 'layers': 2, 'pooling': False}


 86%|████████▌ | 62/72 [1:00:12<09:16, 55.67s/it]

{'bn': False, 'dropout': 0.5, 'kernel_size': 3, 'layers': 3, 'pooling': True}


 88%|████████▊ | 63/72 [1:01:52<10:20, 68.92s/it]

{'bn': False, 'dropout': 0.5, 'kernel_size': 3, 'layers': 3, 'pooling': False}


 89%|████████▉ | 64/72 [1:02:48<08:40, 65.00s/it]

{'bn': False, 'dropout': 0.5, 'kernel_size': 5, 'layers': 2, 'pooling': True}


 90%|█████████ | 65/72 [1:03:37<07:02, 60.31s/it]

{'bn': False, 'dropout': 0.5, 'kernel_size': 5, 'layers': 2, 'pooling': False}


 92%|█████████▏| 66/72 [1:04:25<05:38, 56.46s/it]

{'bn': False, 'dropout': 0.5, 'kernel_size': 5, 'layers': 3, 'pooling': True}


 93%|█████████▎| 67/72 [1:05:17<04:35, 55.18s/it]

{'bn': False, 'dropout': 0.5, 'kernel_size': 5, 'layers': 3, 'pooling': False}


 94%|█████████▍| 68/72 [1:06:12<03:40, 55.05s/it]

{'bn': False, 'dropout': 0.5, 'kernel_size': 7, 'layers': 2, 'pooling': True}


 96%|█████████▌| 69/72 [1:06:57<02:36, 52.26s/it]

{'bn': False, 'dropout': 0.5, 'kernel_size': 7, 'layers': 2, 'pooling': False}


 97%|█████████▋| 70/72 [1:07:48<01:43, 51.82s/it]

{'bn': False, 'dropout': 0.5, 'kernel_size': 7, 'layers': 3, 'pooling': True}


: 

### Conclusions

Была реализована модель CNN и проведён grid-search по параметрам:
* В среднем качество лучше у моделей с двумя слоями, а не с тремя, что неожиданно (видимо, сеть получилась слишком богатой), но при большем размере ядра (а это можно было ожидать: больше размер ядра $\to$ больше рецептивное поле).
* Dropout практически не повлиял на результаты, что менее ожидаемо (учитывая его диапазон вариации при grid-search'е).
* Неожиданно, что лучший score вышел у модели без pooling слоёв.
* Batch normalization ожидаемо ускорила сходимость.