### Data Loading (MNIST)

In [10]:
import torch, torchvision
import torch.nn as nn
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'matplotlib'

In [2]:
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [3]:
batch_size = 64

transform = torchvision.transforms.Compose([
    torchvision.transforms.Resize((28, 28)),
    torchvision.transforms.Grayscale(),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize((0,), (1,))])

train_dataset = torchvision.datasets.MNIST('datasets', train=True, transform=transform, target_transform=None, download=True)
test_dataset = torchvision.datasets.MNIST('datasets', train=False, transform=transform, target_transform=None, download=True)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

### Residual Block

In the original paper [ref] of the RestNet the residual blocks are comprised of two back-to-back convolutional (conv) layers, with a residual connection from the input of the first conv layer to the output of the second one.

In [4]:
class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super(ResidualBlock, self).__init__()

        self.shortcut_cupling = None
        if stride != 1 or in_channels != out_channels:
            self.shortcut_cupling = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False)

        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1),
            nn.ReLU()
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1),
        )
        self.relu = nn.ReLU()

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.conv2(out)

        if self.shortcut_cupling:
            identity = self.shortcut_cupling(identity)

        out += identity
        out = self.relu(out)

        return out

In [5]:
class ResNet(nn.Module):
    def __init__(self, num_classes=10):
        super(ResNet, self).__init__()

        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
            nn.ReLU()
        )
        
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.res_blk1 = nn.Sequential(
            ResidualBlock(64, 64, stride=1),
            ResidualBlock(64, 64, stride=1),
            ResidualBlock(64, 64, stride=1)
        )

        self.res_blk2 = nn.Sequential(
            ResidualBlock(64, 128, stride=1),
            ResidualBlock(128, 128, stride=1),
            ResidualBlock(128, 128, stride=1),
            ResidualBlock(128, 128, stride=1)
        )

        self.avgpool = nn.AvgPool2d(7, stride=2)
        self.flatten = nn.Flatten()
        self.fc = nn.Linear(128, num_classes)

    def forward(self, x):
        out = self.conv1(x)
        out = self.maxpool(out)
        out = self.res_blk1(out)
        out = self.res_blk2(out)
        out = self.avgpool(out)
        out = self.flatten(out)
        out = self.fc(out)

        return out

In [6]:
model = ResNet().to(device)

# Loss and optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [7]:
def run_on_testloader():
    with torch.no_grad():
        total = 0
        correct = 0
        model.eval()

        for data, labels in iter(test_loader):
            data = data.to(device)
            labels = labels.to(device)
            out = model(data)

            _, predictions = torch.max(out.data, 1)
            total += labels.size(0)
            correct += (predictions == labels).sum().item()
            del data, labels, out

    return correct/total

In [9]:
bc = 0
batches = []
test_accuracies = []

for data, labels in iter(train_loader):
    data = data.to(device)
    labels = labels.to(device)

    preds = model(data)

    loss = loss_fn(preds, labels)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    bc += 1

    del data, labels, preds

    torch.cuda.empty_cache()                        #  releases the cache held by the CUDA memory allocator

    test_acc = run_on_testloader()

    print(f'batch {bc}, training loss: {loss.item():.4f}, test accuracy: {test_acc*100:.4f}\%')

    batches.append(bc)
    test_accuracies.append(test_acc*100)

SyntaxError: invalid decimal literal (522423788.py, line 28)

In [None]:
plt.plot(batches, test_accuracies, '-o')
plt.ylim(0, 100)
plt.ylabel('test accuracy (%)')
plt.xlabel('training batch')
plt.show()