In [7]:
import torch
from torchvision import datasets, transforms

In [2]:
class MLP(torch.nn.Module):
    """Fully connected classifier: two hidden ReLU layers, each followed by dropout.

    The defaults reproduce the original configuration
    (784 -> 512 -> 512 -> 10, dropout 0.4), so ``MLP()`` builds the same network.

    Args:
        in_features: Number of values in one flattened input sample.
        hidden: Width of both hidden layers.
        n_classes: Number of output logits.
        p_drop: Dropout probability applied after each hidden layer.
    """

    def __init__(self, in_features=28*28, hidden=512, n_classes=10, p_drop=0.4):
        super().__init__()
        self.fc1 = torch.nn.Linear(in_features, hidden)
        self.d1 = torch.nn.Dropout(p_drop)
        self.fc2 = torch.nn.Linear(hidden, hidden)
        self.d2 = torch.nn.Dropout(p_drop)
        self.fc3 = torch.nn.Linear(hidden, n_classes)

    def forward(self, x):
        """Flatten ``x`` to rows of ``in_features`` values and return raw logits.

        Args:
            x: Tensor whose total element count is a multiple of ``in_features``.

        Returns:
            Tensor of shape ``(rows, n_classes)`` holding unnormalised scores.
        """
        # reshape (rather than view) also accepts non-contiguous inputs,
        # e.g. tensors that were transposed or permuted upstream
        x = x.reshape(-1, self.fc1.in_features)
        x = torch.nn.functional.relu(self.fc1(x))
        x = self.d1(x)
        x = torch.nn.functional.relu(self.fc2(x))
        x = self.d2(x)
        # note that a softmax is not being used explicitly here:
        # the layer below outputs logits
        x = self.fc3(x)
        return x

In [3]:
def train(model, device, loader, opt, epoch, log_every=1):
    """Run one optimisation pass over ``loader`` and print progress.

    Args:
        model: Module to train; it is switched to training mode.
        device: Device the batches are moved to before the forward pass.
        loader: Iterable of ``(data, target)`` batches. ``len(loader)`` and
            ``loader.dataset`` are read when a progress line is printed.
        opt: Optimizer that owns the parameters of ``model``.
        epoch: Epoch number, used only in the progress message.
        log_every: Print a progress line every ``log_every`` batches.
            ``0`` or ``None`` turns logging off.
    """
    model.train()
    seen = 0  # samples consumed before the current batch
    for batch_idx, (data, target) in enumerate(loader):
        data, target = data.to(device), target.to(device)
        opt.zero_grad()
        output = model(data)
        # cross_entropy takes the raw logits and the class indices, so the
        # predictions are not normalized into a probability distribution first
        loss = torch.nn.functional.cross_entropy(output, target)
        loss.backward()
        opt.step()
        if log_every and batch_idx % log_every == 0:
            # `seen` is a running count: multiplying batch_idx by the size of
            # the current batch would under-report on a short final batch
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, seen, len(loader.dataset),
                100. * batch_idx / len(loader), loss.item()))
        seen += len(data)
            
def test(model, device, loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += torch.nn.functional.cross_entropy(output, target, reduction='sum').item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    test_loss /= len(loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(loader.dataset),
        100. * correct / len(loader.dataset)))

In [4]:
# define constants (hyperparameters)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
b_size = 64
l_rate = 0.0005
n_epochs = 5
# seed PyTorch's global random number generator so that weight initialisation,
# dropout and batch shuffling start from the same state on every
# "Restart & Run All" (GPU kernels may still add small run-to-run differences)
seed = 0
torch.manual_seed(seed)

In [5]:
# Preprocessing applied to every image: convert it to a tensor, then
# standardise it with the fixed mean / std pair given below.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.1307,), std=(0.3081,)),
])

# MNIST train / test splits (downloaded into the cache directory when missing).
train_data = datasets.MNIST(root='~/.cache/', train=True, download=True, transform=transform)
test_data = datasets.MNIST(root='~/.cache/', train=False, download=True, transform=transform)

# Training batches are shuffled; the test split is served as one single batch.
train_loader = torch.utils.data.DataLoader(
    dataset=train_data,
    batch_size=b_size,
    shuffle=True,
)
test_loader = torch.utils.data.DataLoader(
    dataset=test_data,
    batch_size=len(test_data),
    shuffle=False,
)


In [6]:
# Build the network on the selected device and attach an AdamW optimizer to it.
model = MLP().to(device=device)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=l_rate)

# One training pass followed by one evaluation pass per epoch;
# a training progress line is printed every 100 batches.
for epoch in range(0, n_epochs):
    train(model=model, device=device, loader=train_loader,
          opt=optimizer, epoch=epoch, log_every=100)
    test(model=model, device=device, loader=test_loader)


Test set: Average loss: 0.1224, Accuracy: 9611/10000 (96%)


Test set: Average loss: 0.0961, Accuracy: 9690/10000 (97%)


Test set: Average loss: 0.0746, Accuracy: 9752/10000 (98%)


Test set: Average loss: 0.0754, Accuracy: 9780/10000 (98%)


Test set: Average loss: 0.0620, Accuracy: 9792/10000 (98%)

