In [8]:
#|default_exp Neural_network_from_scratch

In [9]:
#|export
import torch 

In [10]:
#|export
def _crossentropy_loss( y_hat , y: int): 
    return -y_hat[y] + y_hat.exp().sum().log() 
#calculated skipping the softmax calculation, and instead applying simplified formula 

#print(_crossentropy_loss(torch.tensor([-10,-10,10,-10]), 2))
#print(_crossentropy_loss(torch.tensor([3,15,15,5]), 2))

In [11]:
#|export

class Layer:
    """User-facing description of one layer in the network.

    It only records how many perceptrons the layer has and the name of its
    activation; it holds no weights.
    """

    def __init__(self, n_perceptrons, activation="relu"):
        # Plain record: store both settings exactly as given.
        self.n_perceptrons, self.activation = n_perceptrons, activation

class _Layer: #version that gets defined according to the previous layer output size, should not be used by the user
    def __init__(self, size, activation="relu"):
        self.size=size
        self.weights=2*torch.rand(self.size)-1 #initialize as a matrix random matrix between -1 and 1
        self.activation= activation


class MLP_Model:
    """
    A class for managing inference and training on individual vectors.

    This class is designed to work on a single vector at a time to simplify
    understanding and debugging. Although in real-world applications inference
    is usually performed on batches of vectors for efficiency, extending this
    implementation to handle batches is straightforward. Since derivatives are
    linear operators, backpropagation for a batch of vectors is conceptually
    similar to that for a single vector.

    Each layer computes ``activation(W.T @ v)``; there are no bias terms.
    Recognised activation names are ``"relu"`` and ``"sigmoid"``; any other
    name (e.g. ``"none"``) leaves the vector unchanged.
    """

    def __init__(self, layers, input_size, seed=42):
        """Build the weight matrices for the requested stack of layers.

        Args:
            layers: non-empty sequence of ``Layer`` descriptions, first to last.
            input_size: length of the vectors the model accepts.
            seed: value passed to ``torch.manual_seed`` before the weights are
                drawn, so initialisation is reproducible.

        Raises:
            IndexError: if ``layers`` is empty.
        """
        if len(layers) == 0:
            raise IndexError("MLP_Model needs at least one layer")

        self.input_size = input_size
        torch.manual_seed(seed)  # note: this seeds torch's global generator

        # Each matrix has one row per input of the layer and one column per perceptron.
        self.sequence = []
        nrows = input_size
        for layer in layers:
            ncolumns = layer.n_perceptrons
            self.sequence.append(_Layer(size=(nrows, ncolumns), activation=layer.activation))
            nrows = ncolumns

    @staticmethod
    def _activate(vector, activation):
        """Apply the named activation; unrecognised names act as the identity."""
        if activation == "relu":
            return vector.clamp(min=0)
        if activation == "sigmoid":
            return vector.sigmoid()
        return vector

    @staticmethod
    def _activation_grad(output, activation):
        """Derivative of the named activation, expressed through its own output."""
        if activation == "relu":
            return (output > 0).to(dtype=output.dtype)
        if activation == "sigmoid":
            return output * (1 - output)
        return torch.ones_like(output)

    @staticmethod
    def _safe_mean(total, count):
        """Return ``total / count``, or NaN when there is nothing to average."""
        return total / count if count else float("nan")

    def _forward(self, vector):
        """Run the forward pass and return the input followed by every layer output.

        Raises:
            ValueError: if ``vector`` does not have ``input_size`` elements.
        """
        if len(vector) != self.input_size:  # input size is not flexible, must correspond to the model one
            raise ValueError("Length of vector does not match the expected input shape.")
        outputs = [vector]
        for layer in self.sequence:
            # weights are stored as (inputs, perceptrons), hence the transpose
            vector = self._activate(layer.weights.t().mv(vector), layer.activation)
            outputs.append(vector)
        return outputs

    def predict_with_softmax(self, vector):
        """Return the class probabilities (softmax of the last layer output) for ``vector``.

        Raises:
            ValueError: if ``vector`` does not have ``input_size`` elements.
        """
        # The model output is a single vector, so the softmax runs over its only dimension.
        return torch.softmax(self._predict_val(vector), dim=0)

    def _predict(self, vector):
        """Return ``[input, output of layer 0, ..., output of last layer]``.

        The intermediate vectors are what backpropagation needs.

        Raises:
            ValueError: if ``vector`` does not have ``input_size`` elements.
        """
        return self._forward(vector)

    def _predict_val(self, vector):
        """Return only the last layer's output for ``vector``.

        Raises:
            ValueError: if ``vector`` does not have ``input_size`` elements.
        """
        return self._forward(vector)[-1]

    def train(self, train_x, train_y, val_x, val_y, max_epochs=3, step_size=0.001):
        """Train with per-sample gradient descent and print metrics once per epoch.

        Args:
            train_x, train_y: training vectors and their integer class labels.
            val_x, val_y: validation vectors and labels, evaluated after each epoch.
            max_epochs: number of passes over the training data.
            step_size: learning rate used for every weight update.

        The training loss and accuracy are measured on each sample just before
        the update for that sample is applied. An empty training or validation
        set yields NaN for its metrics instead of a division by zero.
        """
        n_train = len(train_x)
        n_val = len(val_x)

        for epoch in range(max_epochs):
            train_loss_sum = 0.0
            train_correct = 0  # counter for correct predictions in training
            for x, y in zip(train_x, train_y):  # batch size 1, since it's a simplified model
                vectors = self._predict(x)
                scores = vectors[-1]
                train_loss_sum += float(_crossentropy_loss(scores, y))
                if scores.argmax().item() == y:
                    train_correct += 1
                self._backpropagate_error(vectors, y, step_size=step_size)

            val_loss_sum = 0.0
            val_correct = 0
            for x, y in zip(val_x, val_y):  # validation loss and accuracy
                scores = self._predict_val(x)
                val_loss_sum += float(_crossentropy_loss(scores, y))
                if scores.argmax().item() == y:
                    val_correct += 1

            train_loss = self._safe_mean(train_loss_sum, n_train)
            val_loss = self._safe_mean(val_loss_sum, n_val)
            train_accuracy = self._safe_mean(train_correct, n_train)
            val_accuracy = self._safe_mean(val_correct, n_val)

            print(f"Epoch {epoch:>3} | Train Loss: {train_loss:.4f} | Train Acc: {train_accuracy:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_accuracy:.4f}")

    def _backpropagate_error(self, vectors, y, step_size=0.001):
        """Apply one gradient-descent step for a single sample.

        Args:
            vectors: the list returned by ``_predict`` for the sample.
            y: index of the correct class.
            step_size: learning rate.

        Gradient descent is made easier by the chain rule: for a layer
        ``z = W.T @ v`` the gradient of the loss with respect to ``W`` is the
        outer product of the layer input ``v`` with the gradient with respect
        to ``z``. The weights are modified in place.
        """
        # Gradient of softmax + cross-entropy with respect to the model output:
        # the predicted probabilities, minus 1 at the correct class.
        scores = vectors[-1]
        exp_scores = (scores - scores.max()).exp()  # shifted for numerical stability
        dLdz = exp_scores / exp_scores.sum()
        dLdz[y] -= 1
        # Carry the gradient through the output layer's own activation
        # (a vector of ones when that activation is the identity).
        dLdz = dLdz * self._activation_grad(scores, self.sequence[-1].activation)

        # Walk the layers from last to first; vectors[k] is the input of layer k.
        for k in range(len(vectors) - 2, -1, -1):
            layer = self.sequence[k]
            layer_input = vectors[k]
            dLdw = torch.outer(layer_input, dLdz)

            if k > 0:
                # layer_input is the activated output of layer k-1, so the chain
                # rule needs that layer's activation derivative. This must use
                # the weights from before the update below.
                dLdz = (layer.weights @ dLdz) * self._activation_grad(
                    layer_input, self.sequence[k - 1].activation
                )

            layer.weights -= dLdw * step_size  # descend the gradient for the weights



In [12]:
#|export

import torch
from torchvision import datasets, transforms

# Preprocessing applied to every sample: first turn the PIL image into a
# tensor, then flatten that tensor into a single vector.
transform = transforms.Compose(
    [
        transforms.ToTensor(),  # PIL image -> tensor of shape (1, 28, 28)
        transforms.Lambda(lambda img: img.view(-1)),  # (1, 28, 28) -> (784,)
    ]
)

# MNIST train and test splits; downloaded into ./data when not already there.
train_dataset = datasets.MNIST(transform=transform, download=True, train=True, root='./data')
test_dataset = datasets.MNIST(transform=transform, download=True, train=False, root='./data')

# Sanity check on the first training sample: each image is a torch.Size([784]).
sample, label = train_dataset[0]
print(f"Flattened image shape:{sample.shape} label:{label}")



Flattened image shape:torch.Size([784]) label:5


In [17]:
#|export

# Architecture: 784 inputs -> 32 -> 32 -> 16 (all relu) -> 10 outputs with no
# activation on the last layer.
sequence = [
    Layer(32, "relu"),
    Layer(32, "relu"),
    Layer(16, "relu"),
    Layer(10, "none"),
]

m1 = MLP_Model(layers=sequence, input_size=784, seed=1234)

# Unzip the (image, label) pairs of each dataset into parallel tuples.
train_x, train_y = zip(*train_dataset)
test_x, test_y = zip(*test_dataset)

# Train on the full training set, validating on the full test set.
m1.train(
    train_x=train_x,
    train_y=train_y,
    val_x=test_x,
    val_y=test_y,
    max_epochs=20,
    step_size=0.01,
)

Epoch   0 | Train Loss: 2.2013 | Train Acc: 0.1525 | Val Loss: 2.0638 | Val Acc: 0.2089
Epoch   1 | Train Loss: 2.0453 | Train Acc: 0.2182 | Val Loss: 1.9488 | Val Acc: 0.2593
Epoch   2 | Train Loss: 1.2943 | Train Acc: 0.5268 | Val Loss: 0.5506 | Val Acc: 0.8527
Epoch   3 | Train Loss: 0.3877 | Train Acc: 0.8940 | Val Loss: 0.3395 | Val Acc: 0.9125
Epoch   4 | Train Loss: 0.2886 | Train Acc: 0.9234 | Val Loss: 0.3226 | Val Acc: 0.9200
Epoch   5 | Train Loss: 0.2506 | Train Acc: 0.9338 | Val Loss: 0.2924 | Val Acc: 0.9294
Epoch   6 | Train Loss: 0.2264 | Train Acc: 0.9401 | Val Loss: 0.2639 | Val Acc: 0.9368
Epoch   7 | Train Loss: 0.2068 | Train Acc: 0.9448 | Val Loss: 0.2480 | Val Acc: 0.9414
Epoch   8 | Train Loss: 0.1928 | Train Acc: 0.9483 | Val Loss: 0.2215 | Val Acc: 0.9465
Epoch   9 | Train Loss: 0.1818 | Train Acc: 0.9521 | Val Loss: 0.2264 | Val Acc: 0.9444
Epoch  10 | Train Loss: 0.1734 | Train Acc: 0.9540 | Val Loss: 0.2056 | Val Acc: 0.9499
Epoch  11 | Train Loss: 0.1647 |

In [41]:
from nbdev.export import nb_export
# Export the cells marked with `#|export` to a Python module via nbdev.
# NOTE(review): assumes this notebook is saved as "functions.ipynb" — confirm the filename.
nb_export(nbname="functions.ipynb")

In [42]:
# Broadcasting scratchpad: a scalar, or a length-3 vector, is added to every
# row of the 2x3 matrix.
a = torch.tensor([[0, 1, 2],
                  [0, 1, 1]])
b = torch.ones(3, dtype=torch.int64)

for result in (a + 5, a + 4 * b):
    print(result)

tensor([[5, 6, 7],
        [5, 6, 6]])
tensor([[4, 5, 6],
        [4, 5, 5]])


# Quick smoke test: a small network trained on the first five samples only.
sequence = [Layer(10,"relu") for _ in range(2)] + [Layer(10,"none")]

# The seed controls weight initialisation, so it is given to the constructor;
# MLP_Model.train() takes no `seed` argument and would raise a TypeError.
m1= MLP_Model( layers = sequence , input_size= 784, seed=42)

train_x, train_y =zip(*train_dataset)
test_x, test_y =zip(*test_dataset)

m1.train(train_x=train_x[:5], train_y=train_y[:5], val_x=test_x[:5], val_y=test_y[:5], step_size=0.01, max_epochs=20)