In [1]:
import os

import torch
import torch.nn as nn
import torchvision
from torchvision import transforms
from tqdm import tqdm

from convnext.model.convnext import ConvNeXT, ConvNeXTConfig

In [2]:
# Build the network for 3-channel inputs and a 10-way classification head.
# `ConvNeXTConfig.T` is presumably the "Tiny" preset (stage widths 96/192/384/768
# in the printed module tree) -- confirm against convnext.model.convnext.
model = ConvNeXT(
    in_channels=3,
    num_classes=10,
    conf=ConvNeXTConfig.T,
)
# Dump the module tree so the layer layout can be inspected.
print(model)

ConvNeXT(
  (downsamples): ModuleList(
    (0): Sequential(
      (0): Conv2d(3, 96, kernel_size=(4, 4), stride=(4, 4))
      (1): LayerNorm()
    )
    (1): DownsampleBlock(
      (ln): LayerNorm()
      (conv): Conv2d(96, 192, kernel_size=(2, 2), stride=(2, 2))
    )
    (2): DownsampleBlock(
      (ln): LayerNorm()
      (conv): Conv2d(192, 384, kernel_size=(2, 2), stride=(2, 2))
    )
    (3): DownsampleBlock(
      (ln): LayerNorm()
      (conv): Conv2d(384, 768, kernel_size=(2, 2), stride=(2, 2))
    )
  )
  (stages): ModuleList(
    (0): Sequential(
      (0): ConvNeXTBlock(
        (dw): Conv2d(96, 96, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=96)
        (ln): LayerNorm()
        (pw1): Linear(in_features=96, out_features=384, bias=True)
        (gelu): GELU(approximate=none)
        (pw2): Linear(in_features=384, out_features=96, bias=True)
        (drop_path): DropPath()
      )
      (1): ConvNeXTBlock(
        (dw): Conv2d(96, 96, kernel_size=(7, 7), stride

In [3]:
# Smoke test: push one random 224x224 RGB-shaped batch through the model and
# print the output shape.
test_in = torch.rand(1, 3, 224, 224)
# Only the output shape is needed, so skip building the autograd graph.
with torch.no_grad():
    print(model(test_in).shape)

torch.Size([1, 10])


In [4]:
# Commonly cited per-channel (R, G, B) mean / std of the CIFAR-10 training set
# after scaling pixels to [0, 1]. The previous values (0.1307 / 0.3081) are the
# single-channel MNIST statistics and do not centre CIFAR-10 images.
CIFAR10_MEAN = (0.4914, 0.4822, 0.4465)
CIFAR10_STD = (0.2470, 0.2435, 0.2616)

# Upscale the 32x32 images to the 224x224 resolution used in the smoke test,
# convert to a float tensor, then standardise each channel.
transform = transforms.Compose([
    transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize(CIFAR10_MEAN, CIFAR10_STD)
])
# dataset1: CIFAR-10 training split (later divided into train / eval subsets).
dataset1 = torchvision.datasets.CIFAR10('./data', train=True, download=True,
                                        transform=transform)
# dataset2: CIFAR-10 held-out test split.
dataset2 = torchvision.datasets.CIFAR10('./data', train=False, download=True,
                                        transform=transform)

Files already downloaded and verified
Files already downloaded and verified


In [5]:
# Hyperparameters for the training run.
#   epochs       -- number of full passes over the training subset
#   lr           -- initial learning rate handed to the optimizer
#   eval_portion -- fraction of the training split held out for evaluation
#   batch_size   -- samples per batch for every DataLoader
train_params = {
    'epochs': 16,
    'lr': 0.1,
    'eval_portion': 0.2,
    'batch_size': 16
}

# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing at the first `.to(device)`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
from torch.utils.data import DataLoader, random_split

# Number of training-split samples held out for per-epoch evaluation.
EVAL_LENGTH = int(len(dataset1) * train_params['eval_portion'])

# Seed the split so the same samples land in train / eval on every
# Restart & Run All; otherwise the eval accuracy used for checkpoint
# selection is not comparable between runs.
split_generator = torch.Generator().manual_seed(42)
train_set, eval_set = random_split(
    dataset1,
    [len(dataset1) - EVAL_LENGTH, EVAL_LENGTH],
    generator=split_generator,
)

# Only the training data is shuffled; evaluation order does not affect metrics.
train_loader = DataLoader(train_set, batch_size=train_params['batch_size'],
                          shuffle=True)

eval_loader = DataLoader(eval_set, batch_size=train_params['batch_size'])

# Shuffling the test set has no effect on aggregate metrics, so keep it
# deterministic.
test_loader = DataLoader(dataset2, batch_size=train_params['batch_size'],
                         shuffle=False)


In [7]:
# Train the model for `train_params['epochs']` epochs. After each epoch the
# held-out eval subset is scored and the weights are checkpointed whenever the
# eval accuracy improves on the best seen so far.
model = model.to(device)

optimizer = torch.optim.SGD(model.parameters(), lr=train_params['lr'], momentum=0.9, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=0.9)

cross_entropy = nn.CrossEntropyLoss()

best_accuracy = 0.0

# torch.save does not create missing parent directories; make sure the target
# exists up front rather than failing after the first full epoch.
os.makedirs('checkpoints', exist_ok=True)

for e in range(train_params['epochs']):
    train_loss = 0.0
    model.train()
    for images, labels in tqdm(train_loader, desc='Training...'):
        optimizer.zero_grad()

        images = images.to(device)
        labels = labels.to(device)

        outputs = model(images)
        loss = cross_entropy(outputs, labels)
        # .item() yields a plain Python float, so the running sum is not a
        # 0-d NumPy array.
        train_loss += loss.item()

        loss.backward()
        optimizer.step()

    # Decay the learning rate once per epoch. Without this call the scheduler
    # is never applied and the learning rate stays at its initial value.
    scheduler.step()

    print(f"Training average loss: {train_loss / len(train_loader)}")

    # NOTE: despite the `test_` prefix, these counters are computed on the
    # eval split (eval_loader / eval_set), not on the test set.
    test_acc_count = 0.0
    eval_loss = 0.0

    model.eval()
    with torch.no_grad():
        for images, labels in tqdm(eval_loader, desc='Eval...'):
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)

            loss = cross_entropy(outputs, labels)
            eval_loss += loss.item()

            # Predicted class is the index of the largest logit.
            pred = torch.argmax(outputs, 1)
            test_acc_count += (pred == labels).sum().item()

    test_accuracy = float(test_acc_count) / float(len(eval_set))
    eval_loss /= len(eval_loader)

    print(f'Epoch: {e + 1}, eval accuracy {test_accuracy}, eval loss {eval_loss}')
    # Keep only the weights of the best-scoring epoch.
    if test_accuracy > best_accuracy:
        torch.save(model.state_dict(), 'checkpoints/model.pth')
        best_accuracy = test_accuracy

Training...:   2%|▏         | 48/2500 [00:12<10:59,  3.72it/s] 


KeyboardInterrupt: 