# Import

In [1]:
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
import numpy as np
from torch import nn
import torch
import matplotlib.pyplot as plt
from tqdm import tqdm

# Hyperparameters

In [2]:
# Number of samples per mini-batch; used for both data loaders.
batch_size: int = 64
# Learning rate handed to the optimizer step in the training loop.
lr: float = 0.0001

# Prepare dataset

In [3]:
# Both splits share the storage directory, the download flag and the
# image-to-tensor conversion; only the `train` flag differs.
_fashion_mnist_kwargs = dict(root="data", download=True, transform=ToTensor())

training_data = datasets.FashionMNIST(train=True, **_fashion_mnist_kwargs)
test_data = datasets.FashionMNIST(train=False, **_fashion_mnist_kwargs)

# One loader per split, both batched with the global `batch_size`.
train_dataloader, test_dataloader = (
    DataLoader(split, batch_size=batch_size)
    for split in (training_data, test_data)
)

# Network Utils

In [None]:
class Linear():
    """Fully connected layer ``y = x @ W.T + b`` implemented with NumPy.

    Parameters are drawn uniformly from ``[-k, k)`` with ``k = sqrt(1 / input_dim)``
    and stored as float32.

    Attributes:
        weights: array of shape ``(output_dim, input_dim)``.
        bias: array of shape ``(output_dim,)``.
        d_weights, d_bias: gradients written by :meth:`backward`.
        input: the array passed to the most recent forward call.
        m_weights, v_weights, m_bias, v_bias, t: per-layer optimizer state
            (first/second moment buffers and step counter), all starting at zero.

    Args:
        input_dim: number of input features.
        output_dim: number of output features.
        seed: optional seed for the initialization generator. ``None`` (the
            default) keeps the original behavior of drawing from fresh OS entropy;
            pass an integer to make the initial parameters reproducible.
    """

    def __init__(self, input_dim: int, output_dim: int, seed: "int | None" = None):
        gen = np.random.default_rng(seed)
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.k = np.sqrt(1.0 / input_dim)
        # Weights are drawn before the bias so a given seed always yields the same pair.
        self.weights = gen.uniform(-self.k, self.k, size=(output_dim, input_dim)).astype(np.float32)
        self.bias = gen.uniform(-self.k, self.k, size=(output_dim,)).astype(np.float32)
        # Gradient storage
        self.d_weights = np.zeros_like(self.weights)
        self.d_bias = np.zeros_like(self.bias)
        # Input of the latest forward pass, needed to compute d_weights.
        self.input = None
        # Optimizer state (moment estimates and step counter).
        self.m_weights = np.zeros_like(self.weights)
        self.v_weights = np.zeros_like(self.weights)
        self.m_bias = np.zeros_like(self.bias)
        self.v_bias = np.zeros_like(self.bias)
        self.t = 0

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Apply the affine map to ``x`` and remember ``x`` for the backward pass."""
        self.input = x  # Store input for backward pass
        return x @ self.weights.T + self.bias

    def backward(self, grad_output: np.ndarray) -> np.ndarray:
        """Store parameter gradients and return the gradient w.r.t. the input.

        Args:
            grad_output: gradient of the loss w.r.t. this layer's output; the
                code treats axis 0 as the batch axis.

        Returns:
            ``grad_output @ weights``, the gradient w.r.t. the stored input.

        Raises:
            RuntimeError: if no forward pass has been run yet.
        """
        if self.input is None:
            raise RuntimeError("Linear.backward() called before a forward pass")
        # Gradients overwrite (do not accumulate into) the previous values.
        self.d_weights = grad_output.T @ self.input
        self.d_bias = grad_output.sum(axis=0)
        # Propagate gradient backward
        return grad_output @ self.weights

    def zero_grad(self):
        """Reset the stored parameter gradients to zero in place."""
        self.d_weights.fill(0)
        self.d_bias.fill(0)
    
def ReLU(x: np.ndarray) -> np.ndarray:
    """Element-wise rectified linear unit: negative entries become 0.

    Uses ``np.maximum`` rather than ``np.clip(x, min=0)`` because the ``min=``
    keyword of ``np.clip`` is only accepted by NumPy 2.1 and later; on older
    versions that call raises ``TypeError``.

    Args:
        x: input array.

    Returns:
        A new array with the same shape as ``x`` holding ``max(x, 0)``.
    """
    return np.maximum(x, 0)

def ReLU_derivative(x: np.ndarray) -> np.ndarray:
    """Return the ReLU slope at each entry of ``x``: 1.0 where ``x > 0``, else 0.0 (float32)."""
    positive = x > 0
    return positive.astype(np.float32)

class CrossEntropyLoss():
    """Mean cross-entropy between rows of raw scores and integer class targets.

    Calling the object caches the scores and targets so that :meth:`backward`
    can later return the gradient of the mean loss w.r.t. the scores.
    """

    def __init__(self):
        # Filled in by __call__; read by backward().
        self.input = None
        self.target = None

    def __call__(self, x: np.ndarray, y: np.ndarray) -> float:
        """Cache ``x``/``y`` and return the batch-mean negative log-probability
        of each row's target class."""
        self.input, self.target = x, y
        log_probs = self.Softmax(x)
        rows = np.arange(log_probs.shape[0])
        # Pair row i with column y[i] to pick the target-class log-probability.
        picked = log_probs[rows, y]
        return (-picked).mean()

    def Softmax(self, x: np.ndarray) -> np.ndarray:
        """Return the row-wise *log*-softmax of ``x`` (despite the method name).

        The row maximum is subtracted before exponentiating so the
        log-sum-exp stays finite for large scores.
        """
        row_max = x.max(axis=1, keepdims=True)
        log_sum_exp = row_max + np.log(np.exp(x - row_max).sum(axis=1, keepdims=True))
        return x - log_sum_exp

    def backward(self) -> np.ndarray:
        """Return ``(softmax(input) - one_hot(target)) / batch_size``."""
        scores = self.input
        n_rows = scores.shape[0]
        # Softmax of the cached scores, shifted by the row maximum.
        grad = np.exp(scores - scores.max(axis=1, keepdims=True))
        grad = grad / grad.sum(axis=1, keepdims=True)
        # Subtract 1 at each row's target class, then average over the batch.
        grad[np.arange(n_rows), self.target] -= 1
        grad /= n_rows
        return grad

In [None]:
# Demo of paired integer indexing: row i is combined with column t[i],
# the same lookup the loss uses to pick each sample's target-class entry.
a = np.array([[1, 2, 3], [4, 5, 6]])
t = np.array([0, 2])
for shown in (a, t):
    print(shown, "\n")
print(a[np.arange(len(a)), t])

# Model

In [26]:
class Network():
    """Three-layer perceptron (Linear -> ReLU -> Linear -> ReLU -> Linear).

    The forward pass caches every pre-activation (``z*``) and activation
    (``a*``) so :meth:`backward` can apply the ReLU derivative at each hidden
    layer. :meth:`update` performs an Adam step with decoupled weight decay
    on the weights (not the biases) of every layer.
    """

    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
        self.layer1 = Linear(input_dim, hidden_dim)
        self.layer2 = Linear(hidden_dim, hidden_dim)
        self.layer3 = Linear(hidden_dim, output_dim)
        self.loss_fn = CrossEntropyLoss()
        # Intermediate results of the most recent forward pass.
        self.z1 = self.a1 = None
        self.z2 = self.a2 = None
        self.z3 = None

    def __call__(self, input: np.ndarray) -> np.ndarray:
        """Flatten each sample to a vector and return the output-layer scores."""
        flat = input.reshape(input.shape[0], -1)
        self.z1 = self.layer1(flat)
        self.a1 = ReLU(self.z1)
        self.z2 = self.layer2(self.a1)
        self.a2 = ReLU(self.z2)
        self.z3 = self.layer3(self.a2)
        return self.z3

    def backward(self):
        """Back-propagate from the loss through all three layers.

        Relies on the values cached by the last forward pass and the last
        call of ``self.loss_fn``; each layer stores its own gradients.
        """
        upstream = self.layer3.backward(self.loss_fn.backward())
        upstream = self.layer2.backward(upstream * ReLU_derivative(self.z2))
        self.layer1.backward(upstream * ReLU_derivative(self.z1))

    def zero_grad(self):
        """Zero the stored gradients of every layer."""
        for layer in (self.layer1, self.layer2, self.layer3):
            layer.zero_grad()

    def update(self, lr: float = 0.001, beta1: float = 0.9, beta2: float = 0.999, weight_decay: float = 0.01, eps: float = 1e-8):
        """Apply one optimizer step to every layer using its stored gradients.

        Args:
            lr: step size.
            beta1: decay rate of the first-moment (mean) estimate.
            beta2: decay rate of the second-moment (squared-gradient) estimate.
            weight_decay: decay coefficient applied to the weights after the Adam step.
            eps: constant added to the denominator for numerical stability.
        """
        for layer in (self.layer1, self.layer2, self.layer3):
            layer.t += 1  # per-layer step counter
            # Bias-correction denominators, shared by weights and biases.
            correction1 = 1 - beta1 ** layer.t
            correction2 = 1 - beta2 ** layer.t

            # --- weights ---
            grad_w = layer.d_weights
            layer.m_weights = beta1 * layer.m_weights + (1 - beta1) * grad_w
            layer.v_weights = beta2 * layer.v_weights + (1 - beta2) * (grad_w ** 2)
            m_hat = layer.m_weights / correction1
            v_hat = layer.v_weights / correction2
            layer.weights -= lr * m_hat / (np.sqrt(v_hat) + eps)
            # Decoupled decay, applied to the already-updated weights.
            layer.weights -= lr * weight_decay * layer.weights

            # --- biases (no weight decay) ---
            grad_b = layer.d_bias
            layer.m_bias = beta1 * layer.m_bias + (1 - beta1) * grad_b
            layer.v_bias = beta2 * layer.v_bias + (1 - beta2) * (grad_b ** 2)
            m_hat = layer.m_bias / correction1
            v_hat = layer.v_bias / correction2
            layer.bias -= lr * m_hat / (np.sqrt(v_hat) + eps)
    
class NeuralNetwork(nn.Module):
    """PyTorch MLP: flatten the input, then 784 -> 512 -> 512 -> 10 with ReLU
    after the first two linear layers."""

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        stack = [
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        ]
        self.linear_relu_stack = nn.Sequential(*stack)
        # Overwrite the parameters of every nn.Linear in the stack.
        self.linear_relu_stack.apply(init_weights)

    def forward(self, x):
        """Return the raw class scores (logits) for a batch ``x``."""
        return self.linear_relu_stack(self.flatten(x))

# Generator shared by all init_weights calls. It is created once, so each layer
# continues the stream instead of replaying it from the start; re-running this
# cell resets it to the seed.
_INIT_WEIGHTS_RNG = np.random.default_rng(257)


def init_weights(m, gen=None):
    """Re-initialize an ``nn.Linear`` with values drawn uniformly from [-1, 1).

    Intended for ``module.apply(init_weights)``; modules that are not
    ``nn.Linear`` are left untouched.

    Previously a new ``default_rng(257)`` was built on every call, so every
    layer received the same random stream (same-shaped layers got identical
    parameters). The draws now come from one shared generator.

    NOTE(review): [-1, 1) is much wider than the ``sqrt(1 / input_dim)`` bound
    used by the NumPy ``Linear`` class in this notebook; the range is kept
    as-is here, so confirm it is intentional.

    Args:
        m: the module to initialize.
        gen: optional ``numpy.random.Generator`` to draw from; defaults to the
            shared module-level generator seeded with 257.
    """
    if not isinstance(m, nn.Linear):
        return
    rng = _INIT_WEIGHTS_RNG if gen is None else gen
    # Copy in place (under no_grad) so the parameters keep their dtype and device.
    with torch.no_grad():
        weight_values = rng.uniform(-1, 1, size=(m.out_features, m.in_features))
        m.weight.copy_(torch.from_numpy(weight_values))
        # Layers built with bias=False have no bias tensor to fill.
        if m.bias is not None:
            bias_values = rng.uniform(-1, 1, size=(m.out_features,))
            m.bias.copy_(torch.from_numpy(bias_values))


In [27]:
mynet = Network(28*28, 512, 10)

# Single source of truth for the number of passes over the training set;
# it drives both the loop and the progress-bar total.
num_epochs = 25

# Per-step training loss, stored as plain Python floats for plotting.
losses = []
with tqdm(total=num_epochs) as progress:
    for epoch in range(num_epochs):
        for feature, label in train_dataloader:
            mynet.zero_grad()
            # The NumPy network works on arrays, so convert the torch batches.
            output = mynet(feature.numpy())
            loss = mynet.loss_fn(output, label.numpy())
            losses.append(float(loss))
            mynet.backward()
            mynet.update(lr=lr)

        # One tick per completed epoch.
        progress.update()

  0%|          | 0/25 [00:00<?, ?it/s]


AttributeError: 'Linear' object has no attribute 'm_weights'

In [20]:
# Loss curve: one point per entry recorded in `losses`.
plt.plot(losses)

# Label both axes on the current axes object.
plt.gca().set(xlabel='Step', ylabel='Loss')

# Render the figure.
plt.show()

In [19]:
# NOTE(review): this backend magic sits below the plotting cell, but its
# execution count (19) is lower than that cell's (20), so it was run first.
# Consider moving it above the plot so a top-to-bottom run matches that order.
%matplotlib qt

In [17]:
# Count correct predictions over the test split, one mini-batch at a time.
# `total` replaces the former name `all`, which shadowed the builtin all().
total = len(test_data)
right = 0
for feat, labl in test_dataloader:
    out = mynet(feat.numpy())
    # Softmax is monotonic, so the arg-max of the raw scores is already the
    # predicted class; no need to exponentiate the log-softmax first.
    predicted = np.argmax(out, axis=1)
    right += int((predicted == labl.numpy()).sum())

print(f"Accuracy: {(right/total) * 100}%")

Accuracy: 64.91%
