In [4]:
import torch
from torchvision import datasets
from torchvision.transforms import ToTensor
import math

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_data = datasets.MNIST(root="data", train=True, transform=ToTensor(), download=True)
test_data = datasets.MNIST(root="data", train=False, transform=ToTensor(), download=True)

batch_size = 64
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=False)

class MLP:
    def __init__(self, input_size, hidden_size, output_size, dropout_prob=0.2):
        self.W1 = torch.randn(input_size, hidden_size, device=device) * math.sqrt(2.0 / input_size)
        self.b1 = torch.zeros(hidden_size, device=device)
        self.W2 = torch.randn(hidden_size, output_size, device=device) * math.sqrt(2.0 / hidden_size)
        self.b2 = torch.zeros(output_size, device=device)
        self.dropout_prob = dropout_prob

    def relu(self, x):
        return torch.maximum(x, torch.tensor(0.0, device=device))

    def softmax(self, x):
        exp_x = torch.exp(x - torch.max(x, dim=1, keepdim=True)[0])
        return exp_x / exp_x.sum(dim=1, keepdim=True)

    def apply_dropout(self, x):
        dropout_mask = (torch.rand(x.shape, device=device) > self.dropout_prob).float()
        return x * dropout_mask / (1 - self.dropout_prob)

    def forward(self, X, train=True):
        self.Z1 = X @ self.W1 + self.b1
        self.A1 = self.relu(self.Z1)
        if train:
            self.A1 = self.apply_dropout(self.A1)
        self.Z2 = self.A1 @ self.W2 + self.b2
        self.A2 = self.softmax(self.Z2)
        return self.A2

    def cross_entropy_loss(self, predictions, targets):
        epsilon = 1e-10
        batch_size = predictions.shape[0]
        log_likelihood = -torch.log(predictions[range(batch_size), targets] + epsilon)
        loss = torch.sum(log_likelihood) / batch_size
        return loss

    def backward(self, X, Y, learning_rate, weight_decay=4e-4):
        m = X.shape[0]

        # Gradient for output layer
        dZ2 = self.A2.clone()
        dZ2[range(m), Y] -= 1
        dW2 = (self.A1.T @ dZ2) / m + weight_decay * self.W2
        db2 = dZ2.sum(axis=0) / m

        # Gradient for hidden layer
        dA1 = dZ2 @ self.W2.T
        dZ1 = dA1 * (self.Z1 > 0).float()  # Derivative of ReLU
        dW1 = (X.T @ dZ1) / m + weight_decay * self.W1
        db1 = dZ1.sum(axis=0) / m

        # Update weights and biases manually
        self.W1 -= learning_rate * dW1
        self.b1 -= learning_rate * db1
        self.W2 -= learning_rate * dW2
        self.b2 -= learning_rate * db2

def learning_rate_decay(epoch, initial_learning_rate, decay_factor=0.95):
    return initial_learning_rate * (decay_factor ** (epoch // 5))

def train(model, train_loader, initial_learning_rate=0.03, epochs=30):
    for epoch in range(epochs):
        learning_rate = learning_rate_decay(epoch, initial_learning_rate)
        total_loss = 0
        correct = 0

        for X, Y in train_loader:
            X, Y = X.view(-1, 28*28).to(device), Y.to(device)
            X = (X - 0.5) / 0.5  # Normalize input

            predictions = model.forward(X, train=True)

            loss = model.cross_entropy_loss(predictions, Y)
            total_loss += loss.item()

            model.backward(X, Y, learning_rate)

            predicted_labels = predictions.argmax(dim=1)
            correct += (predicted_labels == Y).sum().item()

        accuracy = correct / len(train_loader.dataset)
        print(f"Epoch {epoch+1}, Train Loss: {total_loss:.4f}, Train Accuracy: {accuracy:.4f}")

def validate(model, test_loader):
    correct = 0
    total_loss = 0
    for X, Y in test_loader:
        X, Y = X.view(-1, 28*28).to(device), Y.to(device)

        predictions = model.forward(X, train=False)

        loss = model.cross_entropy_loss(predictions, Y)
        total_loss += loss.item()
        predicted_labels = predictions.argmax(dim=1)
        correct += (predicted_labels == Y).sum().item()
    accuracy = correct / len(test_loader.dataset)
    return total_loss / len(test_loader), accuracy

model = MLP(784, 100, 10, dropout_prob=0.15)
train(model, train_loader, initial_learning_rate=0.03, epochs=30)
validation_loss, validation_accuracy = validate(model, test_loader)
print(f"Validation Loss: {validation_loss:.4f}, Validation Accuracy: {validation_accuracy:.4f}")


Epoch 1, Train Loss: 517.6520, Train Accuracy: 0.8255
Epoch 2, Train Loss: 311.6589, Train Accuracy: 0.8999
Epoch 3, Train Loss: 259.6775, Train Accuracy: 0.9176
Epoch 4, Train Loss: 228.4081, Train Accuracy: 0.9257
Epoch 5, Train Loss: 210.0848, Train Accuracy: 0.9324
Epoch 6, Train Loss: 195.7329, Train Accuracy: 0.9369
Epoch 7, Train Loss: 184.5106, Train Accuracy: 0.9413
Epoch 8, Train Loss: 179.4707, Train Accuracy: 0.9420
Epoch 9, Train Loss: 169.1816, Train Accuracy: 0.9444
Epoch 10, Train Loss: 161.8299, Train Accuracy: 0.9470
Epoch 11, Train Loss: 157.5534, Train Accuracy: 0.9487
Epoch 12, Train Loss: 153.7473, Train Accuracy: 0.9496
Epoch 13, Train Loss: 148.8627, Train Accuracy: 0.9530
Epoch 14, Train Loss: 145.2421, Train Accuracy: 0.9526
Epoch 15, Train Loss: 144.4509, Train Accuracy: 0.9525
Epoch 16, Train Loss: 138.2497, Train Accuracy: 0.9548
Epoch 17, Train Loss: 138.9159, Train Accuracy: 0.9554
Epoch 18, Train Loss: 134.0489, Train Accuracy: 0.9563
Epoch 19, Train Los