In [2]:
import torch 
import torch.nn
import torch.nn.functional as F
from torchvision.datasets import MNIST
from torchvision import transforms
from torch.utils.data import DataLoader

In [3]:
# Preprocessing: turn each image into a tensor, then standardize it with a
# fixed mean/std (presumably the MNIST training-set statistics — confirm).
normalize = transforms.Normalize((0.1307,), (0.3081,))
transform = transforms.Compose([transforms.ToTensor(), normalize])

# Both splits share the same storage root, download flag and preprocessing.
mnist_kwargs = dict(root='./data', download=True, transform=transform)
train_data = MNIST(train=True, **mnist_kwargs)
test_data = MNIST(train=False, **mnist_kwargs)

# Same batch size for both loaders; only the training loader shuffles.
loader_kwargs = dict(batch_size=64)
train_loader = DataLoader(train_data, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_data, shuffle=False, **loader_kwargs)

In [4]:
def softmax(x):
    """Row-wise softmax of a 2-D tensor.

    Each row has its maximum subtracted before exponentiation so the
    exponentials cannot overflow; the result is unchanged mathematically.

    Args:
        x: tensor whose dimension 1 holds the scores to normalize.

    Returns:
        Tensor of the same shape as ``x`` whose rows sum to 1.
    """
    row_max, _ = torch.max(x, dim=1, keepdim=True)
    shifted = x - row_max
    numerator = shifted.exp()
    denominator = numerator.sum(dim=1, keepdim=True)
    return numerator / denominator

def relu(x):
    """Element-wise rectifier: entries below 0 become 0, the rest pass through."""
    return x.clamp(min=0)

def relu_derivative(x):
    """Gradient mask of ReLU: 1.0 where ``x`` is strictly positive, else 0.0.

    The result is always a float32 tensor with the same shape as ``x``.
    """
    is_positive = torch.gt(x, 0)
    return is_positive.to(torch.float32)

In [6]:
class Model:
    """Two-layer fully connected classifier with hand-written backpropagation.

    Architecture: ``x -> linear(w1, b1) -> relu -> linear(w2, b2) -> softmax``.
    Parameters are plain tensors created with ``torch.randn`` / ``torch.zeros``
    and are updated in place by :meth:`backward`; autograd is not used.

    Args:
        input_dim: number of features per input row (default 784).
        hidden_dim: width of the hidden layer (default 1024).
        output_dim: number of classes (default 10).
        weight_scale: factor applied to the standard-normal weight samples
            (default 0.01).

    The defaults reproduce the sizes and scale that were previously
    hard-coded, so ``Model()`` behaves exactly as before.
    """

    def __init__(self, input_dim=784, hidden_dim=1024, output_dim=10, weight_scale=0.01):
        # Draw order (w1 first, then w2) is kept so a given seed yields the
        # same initial weights as the original implementation.
        self.w1 = torch.randn(input_dim, hidden_dim) * weight_scale
        self.b1 = torch.zeros(1, hidden_dim)
        self.w2 = torch.randn(hidden_dim, output_dim) * weight_scale
        self.b2 = torch.zeros(1, output_dim)

    def forward(self, x):
        """Compute class probabilities for a batch.

        Args:
            x: tensor of shape ``(batch, input_dim)``.

        Returns:
            Tensor of shape ``(batch, output_dim)`` with softmax probabilities.

        Side effects:
            Stores ``z1``, ``a1``, ``z2`` and ``a2`` on the instance; they are
            read by :meth:`backward`.
        """
        self.z1 = x @ self.w1 + self.b1
        self.a1 = relu(self.z1)
        self.z2 = self.a1 @ self.w2 + self.b2
        self.a2 = softmax(self.z2)
        return self.a2

    def backward(self, x, y_true, learning_rate=0.01):
        """Run one gradient-descent step on the mean cross-entropy loss.

        Must be called after :meth:`forward` on the same ``x``, because it
        reads the activations cached there.

        Args:
            x: the batch that was passed to :meth:`forward`.
            y_true: integer class indices, one per row of ``x``.
            learning_rate: step size applied to every parameter.
        """
        batch_size = x.shape[0]

        # One-hot encode the labels with the same shape as the predictions.
        y_onehot = torch.zeros_like(self.a2)
        y_onehot[range(batch_size), y_true] = 1

        # For softmax followed by cross-entropy the gradient w.r.t. the
        # logits collapses to (probabilities - one_hot).
        dz2 = self.a2 - y_onehot
        dw2 = self.a1.T @ dz2 / batch_size
        db2 = dz2.mean(dim=0, keepdim=True)

        # Propagate through the second linear layer and the ReLU.
        da1 = dz2 @ self.w2.T
        dz1 = da1 * relu_derivative(self.z1)
        dw1 = x.T @ dz1 / batch_size
        db1 = dz1.mean(dim=0, keepdim=True)

        # All gradients are computed before any parameter is modified.
        self.w1 -= learning_rate * dw1
        self.b1 -= learning_rate * db1
        self.w2 -= learning_rate * dw2
        self.b2 -= learning_rate * db2

    def loss(self, predictions, targets):
        """Mean negative log-likelihood of the target classes.

        Args:
            predictions: ``(batch, classes)`` tensor of probabilities.
            targets: integer class indices, one per row.

        Returns:
            Scalar tensor with the batch-mean loss. ``1e-8`` is added inside
            the logarithm so a probability of exactly 0 does not give ``inf``.
        """
        batch_size = predictions.shape[0]
        log_prob = -torch.log(predictions[range(batch_size), targets] + 1e-8)
        return log_prob.mean()

# Seed the global RNG before the model is built so the torch.randn weight
# initialisation (and the DataLoader shuffling, which draws from the same
# global generator by default) is repeatable on "Restart & Run All".
SEED = 0
torch.manual_seed(SEED)

model = Model()
epochs = 10
learning_rate = 0.01

for epoch in range(epochs):
    total_loss = 0.0  # sum of per-sample losses seen this epoch
    correct = 0
    total = 0

    for data, target in train_loader:
        # Flatten every image to one row so it can be multiplied by w1.
        data = data.view(data.shape[0], -1)
        batch_size = target.size(0)

        output = model.forward(data)
        loss = model.loss(output, target)
        # model.loss returns a batch mean; weight it by the batch size so a
        # smaller final batch does not count as much as a full one.
        total_loss += loss.item() * batch_size

        model.backward(data, target, learning_rate)

        # Accuracy uses the predictions made before this step's update.
        pred = output.argmax(dim=1)
        correct += (pred == target).sum().item()
        total += batch_size

    train_acc = 100 * correct / total
    avg_loss = total_loss / total  # true per-sample mean over the epoch

    print(f'Epoch {epoch+1}/{epochs}: Loss: {avg_loss:.4f}, Train Acc: {train_acc:.2f}%')

Epoch 1/10: Loss: 0.6459, Train Acc: 84.78%
Epoch 2/10: Loss: 0.3086, Train Acc: 91.10%
Epoch 3/10: Loss: 0.2585, Train Acc: 92.60%
Epoch 4/10: Loss: 0.2242, Train Acc: 93.64%
Epoch 5/10: Loss: 0.1975, Train Acc: 94.44%
Epoch 6/10: Loss: 0.1763, Train Acc: 95.06%
Epoch 7/10: Loss: 0.1588, Train Acc: 95.52%
Epoch 8/10: Loss: 0.1445, Train Acc: 95.93%
Epoch 9/10: Loss: 0.1322, Train Acc: 96.29%
Epoch 10/10: Loss: 0.1219, Train Acc: 96.63%
