In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.datasets import FashionMNIST
from torch import nn
from torch.nn import functional as F
from torchvision.transforms import ToTensor, Normalize, Compose
from torch.utils.tensorboard import SummaryWriter
from tqdm.auto import tqdm
import numpy as np
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [2]:
# EMNIST failed to download (it raised odd errors), so FashionMNIST is used instead.
train_data = FashionMNIST(download=True, train=True, root="./data")
test_data = FashionMNIST(download=True, train=False, root="./data")

# ToTensor converts each image to a float tensor; Normalize(0, 1) subtracts 0
# and divides by 1, so numerically it leaves the values unchanged.
transforms = Compose([
    ToTensor(),
    Normalize(0,1),
])

class MyDataset(Dataset):
    """In-memory dataset of (image, label) pairs with an optional transform.

    Args:
        data: iterable whose items expose the image at index 0 and the label
            at index 1. It is consumed exactly once, so one-shot iterables
            (generators, iterators) are supported.
        transforms: optional callable applied to an image on access. Its
            result is reshaped to ``(1, 28, 28)``. When ``None`` the stored
            image is returned unchanged.
    """

    def __init__(self, data, transforms=None):
        self.images = []
        self.labels = []
        # Single pass: avoids decoding every sample twice and keeps images
        # and labels aligned even when `data` can only be iterated once.
        for elem in data:
            self.images.append(elem[0])
            self.labels.append(elem[1])
        self.transforms = transforms

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for position ``idx``."""
        image = self.images[idx]
        # The default transforms=None used to crash here by calling None.
        if self.transforms is not None:
            image = self.transforms(image).view(1, 28, 28)
        return image, self.labels[idx]


# Wrap both splits so that the shared transform pipeline is applied on access.
train_dataset = MyDataset(data=train_data, transforms=transforms)
test_dataset = MyDataset(data=test_data, transforms=transforms)

In [25]:
class Model(nn.Module):
    """Small CNN classifier for 1x28x28 inputs with 10 output logits.

    The network is a stack of 5-channel convolutions followed by one linear
    layer. ``n_layers`` counts the linear layer too, so the stack contains
    ``max(n_layers - 1, 1)`` convolutions.

    Args:
        kernel_size: side of the square convolution kernel; every conv uses
            ``padding = kernel_size // 2``.
        n_layers: total number of layers (convolutions + final linear).
        pooling: ``False`` to disable, otherwise the integer window of a
            ``nn.MaxPool2d`` applied once, after the last convolution.
        batch_norm: when truthy, a ``BatchNorm2d`` follows every convolution.
        dropout: ``False`` to disable, otherwise the drop probability of a
            ``nn.Dropout`` applied after each hidden convolution.
    """

    def __init__(self,
                 kernel_size=3,
                 n_layers=3,
                 pooling=False,
                 batch_norm=False,
                 dropout=False):
        super().__init__()

        padding = kernel_size // 2

        # `in_conv` is the same module object as `conv[0]`; the attribute is
        # kept so existing references and state_dict keys stay valid.
        self.in_conv = nn.Conv2d(1, 5, kernel_size, padding=padding)
        self.conv = nn.ModuleList(
            [self.in_conv]
            + [nn.Conv2d(5, 5, kernel_size, padding=padding) for _ in range(n_layers - 2)]
        )

        # One BatchNorm per convolution, so indexing in forward() is always valid.
        self.batch_norm = (
            nn.ModuleList([nn.BatchNorm2d(5) for _ in self.conv]) if batch_norm else None
        )
        self.act = F.relu
        self.pooling = nn.MaxPool2d(pooling) if pooling else None
        self.dropout = nn.Dropout(dropout) if dropout else None

        # Derive the flattened feature size from the layers instead of
        # assuming 28x28 is preserved: an even kernel with padding k // 2
        # grows the map by one pixel per conv, and MaxPool2d floors the
        # division, so 5*28*28 // pooling**2 is wrong when pooling does not
        # divide the side (e.g. pooling=3).
        side = 28
        for _ in self.conv:
            side = side + 2 * padding - kernel_size + 1
        if pooling:
            side = side // pooling

        self.out_linear = nn.Linear(5 * side * side, 10)
        self.flat = torch.nn.Flatten()

    def forward(self, x):
        """Map a batch of images ``x`` to class logits."""
        # Hidden convolutions: conv -> ReLU -> (dropout) -> (batch norm).
        for i, conv in enumerate(self.conv[:-1]):
            x = self.act(conv(x))

            if self.dropout is not None:
                x = self.dropout(x)

            if self.batch_norm is not None:
                x = self.batch_norm[i](x)

        # Last convolution: conv -> (pooling) -> (batch norm) -> ReLU.
        x = self.conv[-1](x)

        if self.pooling is not None:
            x = self.pooling(x)
        if self.batch_norm is not None:
            x = self.batch_norm[-1](x)

        x = self.act(x)

        return self.out_linear(self.flat(x))

In [4]:
def train_epoch(train_loader, model, loss_function, optimizer, callback=None):
    """Run one pass over ``train_loader`` and return the mean per-sample loss.

    Each batch is trained with ``train_on_batch``. When ``callback`` is given
    it is invoked after every batch as ``callback(model, batch_loss)`` with
    gradient tracking disabled. Batch losses are weighted by batch size so a
    smaller final batch does not skew the average.
    """
    weighted_loss_sum = 0
    samples_seen = 0

    for inputs, targets in tqdm(train_loader, leave=False):
        loss_value = train_on_batch(model, inputs, targets, optimizer, loss_function)

        if callback is not None:
            with torch.no_grad():
                callback(model, loss_value)

        n_items = len(inputs)
        weighted_loss_sum += loss_value * n_items
        samples_seen += n_items

    return weighted_loss_sum / samples_seen


def train_on_batch(model, x_batch, y_batch, optimizer, loss_function):
    """Perform one optimisation step on a single batch.

    Args:
        model: module being trained; it is switched to training mode.
        x_batch: input tensor for the batch.
        y_batch: target tensor for the batch.
        optimizer: optimizer already bound to ``model``'s parameters.
        loss_function: callable ``(predictions, targets) -> scalar tensor``.

    Returns:
        The batch loss as a Python float.
    """
    model.train()

    # Send the batch to wherever the model lives instead of reading a notebook
    # global, so the function also works for a model placed on another device.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        x_batch = x_batch.to(first_param.device)
        y_batch = y_batch.to(first_param.device)

    optimizer.zero_grad()
    preds = model(x_batch)
    loss = loss_function(preds, y_batch)
    loss.backward()

    optimizer.step()
    return loss.item()


def trainer(count_of_epoch,
            batch_size,
            loader,
            model,
            loss_function,
            optimizer,
            lr = 0.001,
            callback = None):
    """Train ``model`` for ``count_of_epoch`` epochs with a progress bar.

    ``optimizer`` is an optimizer class (or factory); it is instantiated here
    as ``optimizer(model.parameters(), lr=lr)``. ``batch_size`` is accepted
    but not used by this function: batching is defined by ``loader``.
    The mean loss of the most recent epoch is shown in the bar's postfix.
    """
    optima = optimizer(model.parameters(), lr=lr)

    progress = tqdm(range(count_of_epoch), desc='epoch')
    # Placeholder until the first epoch has produced a real value.
    progress.set_postfix({'train epoch loss': np.nan})

    for _ in progress:
        mean_loss = train_epoch(
            train_loader=loader,
            model=model,
            loss_function=loss_function,
            optimizer=optima,
            callback=callback,
        )
        progress.set_postfix({'train epoch loss': mean_loss})


class Callback():
    """Per-batch training hook that logs scalars to a TensorBoard-style writer.

    Every call logs the training loss under ``LOSS/train``. Every
    ``delimeter``-th call additionally evaluates the model on ``test_loader``
    and logs the accuracy under ``ACC/test``.

    Args:
        writer: object exposing ``add_scalar(tag, value, step)``.
        test_loader: iterable of ``(x_batch, y_batch)`` tensor pairs.
        loss_function: stored on the instance; not used by the callback.
        delimeter: evaluation period, in callback invocations.
        batch_size: stored on the instance; not used by the callback.
    """

    def __init__(self, writer, test_loader, loss_function, delimeter=100, batch_size=64):
        self.step = 0
        self.writer = writer
        self.delimeter = delimeter
        self.loss_function = loss_function
        self.batch_size = batch_size

        self.loader = test_loader

    def _evaluate(self, model):
        """Return accuracy of ``model`` on the test loader (``None`` if empty)."""
        # Evaluate on the model's own device rather than a notebook global.
        first_param = next(model.parameters(), None)
        was_training = model.training
        model.eval()

        correct = 0
        total = 0
        try:
            # Disable autograd here so the callback is safe even when the
            # caller did not wrap the call in torch.no_grad().
            with torch.no_grad():
                for x_batch, y_batch in self.loader:
                    if first_param is not None:
                        x_batch = x_batch.to(first_param.device)

                    predicted = torch.argmax(model(x_batch), dim=-1).cpu()
                    correct += (predicted == y_batch.cpu()).sum().item()
                    total += len(y_batch)
        finally:
            # Restore the previous mode so evaluation has no lasting effect.
            model.train(was_training)

        return correct / total if total else None

    def forward(self, model, loss):
        """Log ``loss`` and, on every ``delimeter``-th call, the test accuracy."""
        self.step += 1
        self.writer.add_scalar('LOSS/train', loss, self.step)

        if self.step % self.delimeter != 0:
            return

        test_acc = self._evaluate(model)
        if test_acc is not None:
            self.writer.add_scalar('ACC/test', test_acc, self.step)

    def __call__(self, model, loss):
        return self.forward(model, loss)

In [26]:
# Load the TensorBoard notebook extension and open the dashboard on the
# current directory, under which the experiment cells create their run folders.
%load_ext tensorboard
%tensorboard --logdir ./

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Начнём с изменения размера ядра свёртки при фиксированных прочих параметрах.

In [6]:
# Experiment: vary the convolution kernel size, everything else held fixed.
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam
lr = 3e-3
kernel_sizes = [1,3,5,7,9]
batch_size = 1024

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

for kernel_size in kernel_sizes:

    model = Model(kernel_size=kernel_size, 
                  n_layers=3,
                  pooling=False, 
                  batch_norm=False,
                  dropout=False).to(device)

    # One run directory per setting. Using the writer as a context manager
    # closes it after training, so buffered events are flushed to disk.
    with SummaryWriter(log_dir=f'different_kernel_sizes/{kernel_size}') as writer:

        # Test accuracy is logged every 29 training steps.
        callback = Callback(writer, test_loader, loss_function, delimeter=29)

        trainer(count_of_epoch=10, 
                batch_size=batch_size, 
                loader=train_loader,
                model=model, 
                loss_function=loss_function,
                optimizer=optimizer,
                lr=lr,
                callback=callback)

epoch:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

epoch:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

epoch:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

epoch:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

epoch:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

Возьмём `kernel_size=3`, остальные параметры трогать не будем и переберём различные значения количества слоёв.

In [7]:
# Experiment: vary the number of layers with kernel_size fixed at 3.
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam
lr = 3e-3
numbers_of_layers = [3,4,5,6,7]
batch_size = 1024

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

for n_layers in numbers_of_layers:

    model = Model(kernel_size=3, 
                  n_layers=n_layers,
                  pooling=False, 
                  batch_norm=False,
                  dropout=False).to(device)

    # One run directory per setting. Using the writer as a context manager
    # closes it after training, so buffered events are flushed to disk.
    with SummaryWriter(log_dir=f'different_numbers_of_layers/{n_layers}') as writer:

        # Test accuracy is logged every 29 training steps.
        callback = Callback(writer, test_loader, loss_function, delimeter=29)

        trainer(count_of_epoch=10, 
                batch_size=batch_size, 
                loader=train_loader,
                model=model, 
                loss_function=loss_function,
                optimizer=optimizer,
                lr=lr,
                callback=callback)

epoch:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

epoch:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

epoch:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

epoch:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

epoch:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

Как видно по графикам, при всего 3 слоях ядро размера 5 показывает лучший результат: с одной стороны, у него достаточный receptive field, а с другой — не слишком много параметров, чтобы успеть обучиться, но не переобучиться.

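Возьмём 5 слоёв (не меньше, чтобы был заметен эффект) с ядром размера 5 и попробуем применить batchnorm. (Замечание: в ячейке ниже фактически задано `n_layers=3`, так что графики относятся к сети из 3 слоёв.)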

In [22]:
# Experiment: compare training with and without batch normalisation.
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam
lr = 3e-3
batch_norms = [True, False]
batch_size = 1024

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

for batch_norm in batch_norms:

    # NOTE(review): the accompanying notebook text announces 5 layers, but
    # n_layers=3 is what is trained here — confirm which one is intended.
    model = Model(kernel_size=5, 
                  n_layers=3,
                  pooling=False, 
                  batch_norm=batch_norm,
                  dropout=False).to(device)

    # One run directory per setting. Using the writer as a context manager
    # closes it after training, so buffered events are flushed to disk.
    with SummaryWriter(log_dir=f'batch_norms/{batch_norm}') as writer:

        # Test accuracy is logged every 29 training steps.
        callback = Callback(writer, test_loader, loss_function, delimeter=29)

        trainer(count_of_epoch=10, 
                batch_size=batch_size, 
                loader=train_loader,
                model=model, 
                loss_function=loss_function,
                optimizer=optimizer,
                lr=lr,
                callback=callback)

epoch:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

epoch:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

Обучение с batchnorm предсказуемо идёт быстрее. Кроме того (что было не так очевидно), разница в качестве сохраняется до конца обучения; возможно, кривые сойдутся позже, но учить дольше 10 эпох не хочется.

Возьмём 7 слоёв, т. е. сеть, которая, скорее всего, может переобучиться под наш простой датасет, оставим batchnorm и будем проверять, спасают ли pooling и dropout. (Замечание: в ячейках ниже фактически заданы `n_layers=5` и `batch_norm=False`.)

In [23]:
# Experiment: vary the dropout probability (False disables dropout).
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam
lr = 3e-3
dropouts = [False, 0.1, 0.2, 0.3]
batch_size = 1024

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

for dropout in dropouts:

    # NOTE(review): the accompanying notebook text announces 7 layers with
    # batchnorm kept on, but this cell trains n_layers=5 with
    # batch_norm=False — confirm which configuration is intended.
    model = Model(kernel_size=5, 
                  n_layers=5,
                  pooling=False, 
                  batch_norm=False,
                  dropout=dropout).to(device)

    # One run directory per setting. Using the writer as a context manager
    # closes it after training, so buffered events are flushed to disk.
    with SummaryWriter(log_dir=f'dropouts/{dropout}') as writer:

        # Test accuracy is logged every 29 training steps.
        callback = Callback(writer, test_loader, loss_function, delimeter=29)

        trainer(count_of_epoch=10, 
                batch_size=batch_size, 
                loader=train_loader,
                model=model, 
                loss_function=loss_function,
                optimizer=optimizer,
                lr=lr,
                callback=callback)

epoch:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

epoch:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

epoch:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

epoch:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

С выключенным dropout сеть явно переобучается — и лосс, и качество ниже, чем у остальных. С dropout 0.3 регуляризация выглядит слишком сильной, и качество тоже просаживается. Варианты 0.1 и 0.2 неразличимы.

In [27]:
# Experiment: vary the max-pooling window (False disables pooling).
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam
lr = 3e-3
poolings = [False, 2, 4]
batch_size = 1024

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

for pooling in poolings:

    # NOTE(review): the accompanying notebook text announces 7 layers with
    # batchnorm kept on, but this cell trains n_layers=5 with
    # batch_norm=False — confirm which configuration is intended.
    model = Model(kernel_size=5, 
                  n_layers=5,
                  pooling=pooling, 
                  batch_norm=False,
                  dropout=False).to(device)

    # One run directory per setting. Using the writer as a context manager
    # closes it after training, so buffered events are flushed to disk.
    with SummaryWriter(log_dir=f'poolings/{pooling}') as writer:

        # Test accuracy is logged every 29 training steps.
        callback = Callback(writer, test_loader, loss_function, delimeter=29)

        trainer(count_of_epoch=10, 
                batch_size=batch_size, 
                loader=train_loader,
                model=model, 
                loss_function=loss_function,
                optimizer=optimizer,
                lr=lr,
                callback=callback)

epoch:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

epoch:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

epoch:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

Чем больше ядро пулинга, тем медленнее идёт обучение (в терминах лосса), что логично, и тем ниже итоговое качество — а это уже не так логично. Не знаю, что ещё сказать: с картинками побольше пулинг обычно всё-таки помогает.