## Load Data

In [1]:
import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision
from torchvision.datasets import MNIST
import torchvision.transforms as transforms
import numpy as np
import random

np.random.seed(0)
random.seed(0)
torch.manual_seed(0)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [2]:
batch_size = 512
num_epochs = 10

train_dataset = MNIST('./data', train=True, download=True, transform=transforms.ToTensor())
train_loader = DataLoader(train_dataset, batch_size, shuffle=True)

In [3]:
class MLP(nn.Module):
    def __init__(self, input_dims, hidden_dims, output_dims):
        super(MLP, self).__init__()
        self.layer1 = nn.Linear(input_dims, hidden_dims)
        self.bn1    = nn.BatchNorm1d(hidden_dims)
        self.layer2 = nn.Linear(hidden_dims, hidden_dims)
        self.bn2    = nn.BatchNorm1d(hidden_dims)
        self.layer3 = nn.Linear(hidden_dims, hidden_dims)
        self.bn3    = nn.BatchNorm1d(hidden_dims)
        self.output = nn.Linear(hidden_dims, output_dims)
        self.act    = nn.LeakyReLU()

        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='leaky_relu')
                nn.init.constant_(m.bias, 0.0)

    def forward(self, x):
        x = nn.Flatten()(x)

        # first hidden layer
        x1 = self.layer1(x)
        x1 = self.bn1(x1)
        x1 = self.act(x1)

        # second hidden layer
        x2 = self.layer2(x1)
        x2 = self.bn2(x2)
        x2 = self.act(x2)

        # skip connection: add x1 into input of third layer
        x3_in = x2 + x1

        # third hidden layer
        x3 = self.layer3(x3_in)
        x3 = self.bn3(x3)
        x3 = self.act(x3)

        out = self.output(x3)
        return out

In [4]:
model = MLP(input_dims=784, hidden_dims=256, output_dims=10).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.RAdam(model.parameters())

In [5]:
for epoch in range(num_epochs):
    t_loss = 0
    t_acc = 0
    cnt = 0
    for X, y in train_loader:
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()
        outputs = model(X)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()

        t_loss += loss.item()
        t_acc += (torch.argmax(outputs, 1) == y).sum().item()
        cnt += len(y)

    t_loss /= len(train_loader)
    t_acc /= cnt
    print(f"Epoch {epoch+1}/{num_epochs}, Train_Loss: {t_loss:.4f}, Train_Acc: {t_acc:.4f}")

Epoch 1/10, Train_Loss: 1.1063, Train_Acc: 0.6645
Epoch 2/10, Train_Loss: 0.2630, Train_Acc: 0.9304
Epoch 3/10, Train_Loss: 0.1585, Train_Acc: 0.9573
Epoch 4/10, Train_Loss: 0.1098, Train_Acc: 0.9702
Epoch 5/10, Train_Loss: 0.0789, Train_Acc: 0.9789
Epoch 6/10, Train_Loss: 0.0561, Train_Acc: 0.9859
Epoch 7/10, Train_Loss: 0.0404, Train_Acc: 0.9908
Epoch 8/10, Train_Loss: 0.0283, Train_Acc: 0.9942
Epoch 9/10, Train_Loss: 0.0200, Train_Acc: 0.9965
Epoch 10/10, Train_Loss: 0.0137, Train_Acc: 0.9981
