In [1]:
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import numpy as np
import time
from tqdm import tqdm

In [2]:
MNIST_std = 0.3081
MNIST_mean = 0.1307

In [3]:
# Load MNIST dataset
train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]))
test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]))

In [None]:
print(f'Before scaling: {torch.std_mean(train_dataset.data.float())}')
print(f'After scaling: {torch.std_mean(train_dataset.data.float() / 255)}')

Before scaling: (tensor(78.5675), tensor(33.3184))
After scaling: (tensor(0.3081), tensor(0.1307))


In [4]:
# Load dataset in batches
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [5]:
def sigmoid(x):
    return (1 / (1 + torch.exp(-x)))

In [6]:
def softmax(x, dim=1):
  exp_x = torch.exp(x)
  sum_exp = torch.sum(exp_x, dim=dim, keepdim=True)
  return exp_x / sum_exp

In [7]:
def relu(x):
  return torch.max(torch.zeros_like(x), x)

In [8]:
def one_hot_encode(labels, num_classes=10):
    one_hot = torch.zeros((labels.size(0), num_classes))
    one_hot.scatter_(1, labels.unsqueeze(1).long(), 1)
    return one_hot

98% accuracy Bonus

#### Smart init weights -> random values from a normal distribution with:
##### mean = 0
##### standard deviation = (1 / sqrt(n)), where n is the total number of connections that go into the neuron

In [9]:
def cross_entropy(output, target):
    loss = -torch.sum(target * torch.log(output))
    return loss.item()

In [19]:
class MLP:
    def __init__(self, input_dim, hidden_dim, output_dim, learning_rate, batch_size):
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.learning_rate = learning_rate
        self.batch_size = batch_size

        # Smart init weights and biases (He initialization)
        self.weights_hidden = np.random.normal(0, 2/np.sqrt(input_dim), size=(input_dim, hidden_dim))
        self.bias_hidden = np.random.normal(0, 2/np.sqrt(input_dim), size=(hidden_dim))
        self.weights_output = np.random.normal(0, 2/np.sqrt(hidden_dim), size=(hidden_dim, output_dim))
        self.bias_output = np.random.normal(0, 2/np.sqrt(hidden_dim), size=(output_dim))

        self.weights_hidden = torch.from_numpy(self.weights_hidden).to(torch.float32)
        self.bias_hidden = torch.from_numpy(self.bias_hidden).to(torch.float32)
        self.weights_output = torch.from_numpy(self.weights_output).to(torch.float32)
        self.bias_output = torch.from_numpy(self.bias_output).to(torch.float32)

    def forward(self, inputs):
        hidden_linear = torch.add(torch.mm(inputs, self.weights_hidden), self.bias_hidden)
        hidden_activation = torch.sigmoid(hidden_linear)

        output_linear = torch.add(torch.mm(hidden_activation, self.weights_output), self.bias_output)
        output_activation = torch.softmax(output_linear, dim=1)

        return hidden_activation, output_activation

    def backward(self, inputs, hidden_activation, output_activation, targets):
        # Gradient of the cross-entropy loss with respect to the output layer
        grad_output = output_activation - targets

        # Gradient of the hidden layer
        grad_hidden = torch.mm(grad_output, self.weights_output.t()) * hidden_activation * (1 - hidden_activation)

        # Gradients for weights and biases
        grad_weights_output = torch.mm(hidden_activation.t(), grad_output)
        grad_bias_output = torch.sum(grad_output, dim=0)
        grad_weights_hidden = torch.mm(inputs.t(), grad_hidden)
        grad_bias_hidden = torch.sum(grad_hidden, dim=0)

        # Update the weights and biases
        self.weights_output -= self.learning_rate * grad_weights_output / self.batch_size
        self.bias_output -= self.learning_rate * grad_bias_output / self.batch_size
        self.weights_hidden -= self.learning_rate * grad_weights_hidden / self.batch_size
        self.bias_hidden -= self.learning_rate * grad_bias_hidden / self.batch_size

    def train(self, train_loader, epochs):
        for epoch in range(epochs):
            total_loss = 0
            total_correct = 0
            total_samples = 0

            for inputs, targets in tqdm(train_loader):
                # Preprocess inputs and targets
                inputs = torch.flatten(inputs, start_dim=1)
                targets_one_hot = one_hot_encode(targets, num_classes=self.output_dim).float()

                hidden_activation, output_activation = self.forward(inputs)
                total_loss += torch.nn.functional.cross_entropy(output_activation, targets_one_hot)
                self.backward(inputs, hidden_activation, output_activation, targets_one_hot)

                # Compute accuracy for batch
                predictions = torch.argmax(output_activation, dim=1)
                correct = torch.sum(predictions == targets).item()
                total_correct += correct
                total_samples += inputs.size(0)

            average_loss = total_loss / len(train_loader)
            accuracy = total_correct / total_samples

            self.evaluate(test_loader)

            print(f'Epoch {epoch+1} --- Loss: {average_loss} --- Train acc: {round(accuracy*100)}%')

    def evaluate(self, test_loader):
        correct = 0
        total = len(test_loader.dataset)
        for inputs, targets in test_loader:
            inputs = torch.flatten(inputs, start_dim=1)

            _, output_activation = self.forward(inputs)
            predictions = torch.argmax(output_activation, dim=1)
            correct += torch.sum(predictions == targets).item()
        accuracy = correct / total
        print(f'Test accuracy: {round(accuracy*100)}%')

MLP with other methods:
- Batch normalization ~ 80%
- Adam optimizer ~ 15%
- Dropout ~ 90%
- relu and log_softmax ~ 97%

In [22]:
# Parameters initialization
epochs = 10
learning_rate = 0.2
input_dim = 784
hidden_dim = 100
num_classes = 10
batch_size = 32

In [23]:
model = MLP(input_dim, hidden_dim, num_classes, learning_rate, batch_size)

In [24]:
start = time.time()

model.train(train_loader, epochs)

end = time.time()
print(f'Time for training: {(end - start)//60}m : {(end - start)}s.')

model.evaluate(test_loader)

100%|██████████| 1875/1875 [00:11<00:00, 157.61it/s]


Test accuracy: 95%
Epoch 1 --- Loss: 1.5970135927200317 --- Train acc: 92%


100%|██████████| 1875/1875 [00:13<00:00, 136.30it/s]


Test accuracy: 96%
Epoch 2 --- Loss: 1.5313067436218262 --- Train acc: 96%


100%|██████████| 1875/1875 [00:16<00:00, 116.56it/s]


Test accuracy: 97%
Epoch 3 --- Loss: 1.514066457748413 --- Train acc: 97%


100%|██████████| 1875/1875 [00:14<00:00, 126.39it/s]


Test accuracy: 97%
Epoch 4 --- Loss: 1.5044746398925781 --- Train acc: 98%


100%|██████████| 1875/1875 [00:11<00:00, 159.58it/s]


Test accuracy: 97%
Epoch 5 --- Loss: 1.4974911212921143 --- Train acc: 98%


100%|██████████| 1875/1875 [00:11<00:00, 161.48it/s]


Test accuracy: 97%
Epoch 6 --- Loss: 1.4923949241638184 --- Train acc: 99%


100%|██████████| 1875/1875 [00:11<00:00, 159.50it/s]


Test accuracy: 97%
Epoch 7 --- Loss: 1.4884897470474243 --- Train acc: 99%


100%|██████████| 1875/1875 [00:11<00:00, 161.63it/s]


Test accuracy: 98%
Epoch 8 --- Loss: 1.4849803447723389 --- Train acc: 99%


100%|██████████| 1875/1875 [00:11<00:00, 161.88it/s]


Test accuracy: 98%
Epoch 9 --- Loss: 1.4823334217071533 --- Train acc: 99%


100%|██████████| 1875/1875 [00:11<00:00, 162.25it/s]


Test accuracy: 98%
Epoch 10 --- Loss: 1.479832410812378 --- Train acc: 99%
Time for training: 2.0m : 144.68344831466675s.
Test accuracy: 98%


## Compare with torch high-level API:

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(784, 100)
        self.fc2 = nn.Linear(100, 10)

    def forward(self, x):
        x = x.view(-1, 784)
        x = F.sigmoid(self.fc1(x))
        x = self.fc2(x)
        return F.softmax(x, dim=1)

model = MLP()

In [None]:
import torch.optim as optim
from torch.utils.data import DataLoader

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train
for epoch in tqdm(range(15)):
    for data in train_loader:
        inputs, labels = data
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

print('Finished Training')

# Test
correct = 0
total = 0
with torch.no_grad():
    for data in test_loader:
        images, labels = data
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print('Accuracy of the network on the test images: %f %%' % (correct / total))

100%|██████████| 15/15 [05:06<00:00, 20.45s/it]


Finished Training
Accuracy of the network on the test images: 0.971700 %
