In [223]:
## Engine.py

import numpy as np

class Tensor:
    def __init__(self, data, _children=(), _op=""):
        self.data = np.array(data, dtype=np.float32)
        self.grad = np.zeros_like(self.data)


        self._backward = lambda: None
        self._prev = set(_children)
        self._op = _op  # for graph debugging

    def __repr__(self):
        return f"Tensor(data={self.data}, grad={self.grad})"

    def __add__(self, other):
        other = other if isinstance(other, Tensor) else Tensor(other)
        out = Tensor(self.data + other.data, _children = (self, other), _op="+")
        
        def _backward():
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad
            
        out._backward = _backward
        return out
    
    def __neg__(self):
        out = Tensor(-self.data, (self,), _op='neg')

        def _backward():
            self.grad += -1 * out.grad

        out._backward = _backward
        return out
    
    def __sub__(self, other):
        return self + (-other)
        
    def __mul__(self, other):
        other = other if isinstance(other, Tensor) else Tensor(other)
        out = Tensor(self.data * other.data, _children = (self, other), _op = "*")
        
        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        
        out._backward = _backward
        return out

    def __matmul__(self, other):
        out = Tensor(self.data @ other.data, (self, other), '@')
        
        def _backward():
            self.grad += out.grad @ other.data.T
            other.grad += self.data.T @ out.grad
        
        out._backward = _backward
        return out
    
    
    def sum(self):
        out = Tensor(self.data.sum(), (self,), 'sum')

        def _backward():
            grad = np.ones_like(self.data) * out.grad
            self.grad += grad 

        out._backward = _backward
        return out
    
    def __pow__(self, power):
        out = Tensor(self.data ** power, (self,), f'**{power}')
        
        def _backward():
            self.grad += (power * (self.data ** (power - 1))) * out.grad
        out._backward = _backward
        return out
    
    def backward(self):
        topo = []
        visited = set()

        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)

        build_topo(self)

        self.grad = np.ones_like(self.data)

        for node in reversed(topo):
            node._backward()

    def zero_grad(self):
        self.grad = 0


In [224]:
class Module: # Every lego piece fits like this, must have a set of parameters, able to zer_grad and a calling mechanism
    def parameters(self):
        return []

    def zero_grad(self):
        for p in self.parameters():
            p.zero_grad()

    def __call__(self, *args, **kwargs):
        return self.forward(*args, **kwargs)

In [225]:
class Sequential(Module):
    def __init__(self, layers):
        self.layers = layers

    def forward(self, x: Tensor):
        for layer in self.layers: # Iterate the input through each layer
            x = layer(x)
        return x

    def parameters(self):
        params = []
        for layer in self.layers:
            params.extend(layer.parameters())
        return params

In [226]:
class ReLU(Module):
    def forward(self, x: Tensor):
        out = Tensor(np.maximum(0, x.data), (x,), 'ReLU')

        def _backward():
            x.grad += (x.data > 0) * out.grad

        out._backward = _backward
        return out

class LeakyReLU(Module):
    def __init__(self, negative_slope=0.01):
        self.negative_slope = negative_slope

    def forward(self, x: Tensor):
        out_data = np.where(x.data > 0, x.data, self.negative_slope * x.data)
        out = Tensor(out_data, (x,), 'LeakyReLU')

        def _backward():
            if x.requires_grad:
                grad = np.where(x.data > 0, 1.0, self.negative_slope)
                x.grad += grad * out.grad

        out._backward = _backward
        return out
            
class Softmax(Module):
    def forward(self, x: Tensor):
        exps = np.exp(x.data - np.max(x.data, axis=-1, keepdims=True))
        softmax = exps / np.sum(exps, axis=-1, keepdims=True)
        out = Tensor(softmax, (x,), 'Softmax')
        
        def _backward():
            x.grad += out.grad
            
        out._backward = _backward
        return out

In [227]:
class Linear(Module): # Linear layer
    def __init__(self, in_features, out_features):
        self.W = Tensor(np.random.randn(in_features, out_features) * 0.01)
        self.b = Tensor(np.zeros(out_features))
        
    def forward(self, x: Tensor):
        out = x @ self.W
        out = out + self.b
        return out
    
    def parameters(self):
        return [self.W, self.b]

In [228]:
# Loss functions
class MSELoss(Module):
    def forward(self, pred: Tensor, target: Tensor):
        loss = ((pred - target) ** 2).sum()
        return loss

In [235]:
class SGD:
    def __init__(self, parameters, lr = 0.01):
        self.parameters = parameters
        self.lr = lr
        
    def step(self):
        for p in self.parameters:
            p.data -= self.lr * p.grad.reshape(p.data.shape)
    
    def zero_grad(self):
        for p in self.parameters:
            p.zero_grad()

In [236]:
linear = Linear(1, 1)

#  y = 2x + 1
x_data = [[0.0], [1.0], [2.0], [3.0]]
y_data = [[1.0], [3.0], [5.0], [7.0]]

x_train = [Tensor(x) for x in x_data]
y_train = [Tensor(y) for y in y_data]

loss_fn = MSELoss()
optimizer = SGD(linear.parameters(), lr=0.1)

y_pred = linear(x_train[0])
loss = loss_fn(y_pred, y_train[0])
optimizer.zero_grad()
loss.backward()
for i, p in enumerate(linear.parameters()):
    print(f"Param {i} grad:\n", p.grad)
optimizer.step()

total_loss += 0.0

print(total_loss)


Param 0 grad:
 0.0
Param 1 grad:
 [-2.]
85.30482292175293


In [237]:
# Simple training loop

MLP = Sequential([Linear(2, 4), ReLU(), Linear(4, 1)])
loss_fn = MSELoss()
optimizer = SGD(MLP.parameters(), lr=0.01)
epoch_n = []
loss_n = []

for epoch in range(10):
    x = Tensor([[1.0, 2.0]])
    y_true = Tensor([[0.5]])
    
    y_pred = MLP(x)
    loss = loss_fn(y_pred, y_true)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    epoch_n.append(epoch)
    loss_n.append(loss.data)
    print(f"Epoch {epoch} Loss: {loss.data}")

Epoch 0 Loss: 0.25
Epoch 1 Loss: 0.24010001122951508
Epoch 2 Loss: 0.2305920273065567
Epoch 3 Loss: 0.22146061062812805
Epoch 4 Loss: 0.21269075572490692
Epoch 5 Loss: 0.2042681872844696
Epoch 6 Loss: 0.19617918133735657
Epoch 7 Loss: 0.18841049075126648
Epoch 8 Loss: 0.18094943463802338
Epoch 9 Loss: 0.1737838238477707
