Based on https://github.com/karpathy/micrograd/blob/master/micrograd/engine.py

In [1]:
import numpy as np

In [2]:
class Tensor:
    def __init__(self, data, prev=(), op=None, *args, **kwargs):
        self.data = data
        self.prev = prev
        self.grad = 0
        self.op = op
        self.grad_fn = lambda x: None
        self.broadcast_dim = None
    
    def backward(self, gradient=None):
        if gradient is None:
            gradient = np.ones_like(self.data)
        self.grad = gradient
        self.grad_fn(self.grad)
        for p in self.prev:
            p.backward(p.grad)

    def __repr__(self):
        return repr(self.data)
    
    def __add__(self, other):
        other = other if isinstance(other, Tensor) else Tensor(other)
        self.checkbroadcast(other)
        out = Tensor(self.data + other.data, (self, other), op='+')
        def grad_fn(gradient):
            self.grad += gradient if self.broadcast_dim is None else gradient.sum(self.broadcast_dim)
            other.grad += gradient if other.broadcast_dim is None else gradient.sum(other.broadcast_dim)
        out.grad_fn = grad_fn
        return out
    
    def __mul__(self, other):
        other = other if isinstance(other, Tensor) else Tensor(other)
        out = Tensor(self.data * other.data, (self, other), op='*')
        def grad_fn(gradient):
            self.grad += gradient * other.data
            other.grad += gradient * self.data
        out.grad_fn = grad_fn
        return out
    
    def __pow__(self, other):
        assert isinstance(other, (int, float))
        out = Tensor(self.data ** other, (self,), op='*')
        def grad_fn(gradient):
            self.grad += gradient * (other * (self.data ** (other-1)))
        out.grad_fn = grad_fn
        return out

    def __matmul__(self, other):
        out = Tensor(self.data @ other.data, (self, other), op='@')
        def grad_fn(gradient):
            self.grad += gradient @ other.data.T
            other.grad += self.data.T @ gradient
        out.grad_fn = grad_fn
        return out
    
    def relu(self):
        out = Tensor(self.data*(self.data>0), (self,), op='relu')
        def grad_fn(gradient):
            self.grad += gradient * (out.data > 0)
        out.grad_fn = grad_fn
        return out
    
    def __neg__(self):
        return self * -1
    
    def __radd__(self, other):
        return self + other
    
    def __sub__(self, other):
        return self + (-other)

    def __rsub__(self, other):
        return other + (-self)

    def __rmul__(self, other):
        return self * other
    
    def __truediv__(self, other):
        return self * (other**-1)

    def __rtruediv__(self, other):
        return other * self**-1
    
    @property
    def shape(self):
        return self.data.shape
    
    def checkbroadcast(self, other):
        for n,(i,j) in enumerate(zip(self.shape, other.shape)):
            if i==j:
                continue
            if i<j:
                self.broadcast_dim = n
                break
            else:
                other.broadcast_dim = n
                break

In [139]:
inp = Tensor(np.ones((4,5),dtype=np.float32)*2)
w = Tensor(np.ones((5,4),dtype=np.float32)*3)
bi = Tensor(np.ones((4,1),dtype=np.float32)*4)
y = Tensor(np.ones((4,4),dtype=np.float32)*20)
lr=1e-2

In [140]:
for i in range(15):
    o = inp @ w + bi
    loss = (o - y)**2
    print(loss.data.sum())
#     w.grad = bi.grad = 0
    loss.backward()
    w, bi = [x - lr*x.grad for x in [w,bi]]

3136.0
1450.0865
557.4134
214.26971
82.36513
31.661057
12.170502
4.6783466
1.798372
0.6912988
0.26573542
0.10215093
0.039266698
0.015093631
0.0058021545


In [69]:
o

array([[20.019043, 20.019043, 20.019043, 20.019043],
       [20.019043, 20.019043, 20.019043, 20.019043],
       [20.019043, 20.019043, 20.019043, 20.019043],
       [20.019043, 20.019043, 20.019043, 20.019043]], dtype=float32)

In [39]:
y

array([[20., 20., 20., 20.],
       [20., 20., 20., 20.],
       [20., 20., 20., 20.],
       [20., 20., 20., 20.]], dtype=float32)

In [131]:
import torch

In [155]:
a = torch.tensor(np.ones((4,5))*2.)
b = torch.tensor(np.ones((5,4))*3., requires_grad=True)
d = torch.tensor(np.ones((1,4))*4., requires_grad=True)
f = torch.tensor(np.ones((4,4))*20)

In [156]:
for i in range(15):
    e = a @ b + d
    loss = (e - f)**2
    print(loss.sum())
    loss.sum().backward()
    with torch.no_grad():
        b -= lr*b.grad
        d -= lr*d.grad
        b.grad = None
        d.grad = None

tensor(3136., dtype=torch.float64, grad_fn=<SumBackward0>)
tensor(1450.0864, dtype=torch.float64, grad_fn=<SumBackward0>)
tensor(670.5200, dtype=torch.float64, grad_fn=<SumBackward0>)
tensor(310.0484, dtype=torch.float64, grad_fn=<SumBackward0>)
tensor(143.3664, dtype=torch.float64, grad_fn=<SumBackward0>)
tensor(66.2926, dtype=torch.float64, grad_fn=<SumBackward0>)
tensor(30.6537, dtype=torch.float64, grad_fn=<SumBackward0>)
tensor(14.1743, dtype=torch.float64, grad_fn=<SumBackward0>)
tensor(6.5542, dtype=torch.float64, grad_fn=<SumBackward0>)
tensor(3.0307, dtype=torch.float64, grad_fn=<SumBackward0>)
tensor(1.4014, dtype=torch.float64, grad_fn=<SumBackward0>)
tensor(0.6480, dtype=torch.float64, grad_fn=<SumBackward0>)
tensor(0.2996, dtype=torch.float64, grad_fn=<SumBackward0>)
tensor(0.1386, dtype=torch.float64, grad_fn=<SumBackward0>)
tensor(0.0641, dtype=torch.float64, grad_fn=<SumBackward0>)
