Based on https://github.com/karpathy/micrograd/blob/master/micrograd/engine.py

In [1]:
import numpy as np

In [2]:
class Tensor:
    """A minimal reverse-mode autodiff node wrapping a NumPy array or scalar.

    Every operation returns a new ``Tensor`` that records its operands in
    ``prev`` and installs a ``grad_fn`` closure which adds the operation's
    local gradient contribution into each operand's ``grad``.

    Attributes:
        data: The wrapped value, stored exactly as it was passed in.
        prev: Operand tensors this node was computed from (empty for leaves).
        grad: Accumulated gradient; the integer ``0`` until something is added.
        op: Short label of the operation that produced this node, or ``None``.
        grad_fn: Callable that receives this node's gradient and pushes the
            corresponding contributions into the tensors in ``prev``.
        broadcast_dim: Legacy slot, written only by ``checkbroadcast``.
    """

    # Tell NumPy to return NotImplemented from ``ndarray <op> Tensor`` so that
    # Python falls back to the reflected Tensor method (``__radd__`` etc.)
    # instead of NumPy applying the operator element by element.
    __array_ufunc__ = None

    def __init__(self, data, prev=(), op=None, *args, **kwargs):
        """Wrap ``data``; extra positional/keyword arguments are ignored."""
        self.data = data
        self.prev = prev
        self.grad = 0
        self.op = op
        self.grad_fn = lambda x: None
        self.broadcast_dim = None

    @staticmethod
    def _unbroadcast(gradient, shape):
        """Sum ``gradient`` down to ``shape``, undoing NumPy broadcasting."""
        gradient = np.asarray(gradient)
        # Axes that broadcasting prepended on the left are summed away.
        lead = gradient.ndim - len(shape)
        if lead > 0:
            gradient = gradient.sum(axis=tuple(range(lead)))
        # Axes where the operand had length 1 but the result did not are
        # summed with keepdims so the operand's shape is restored.
        stretched = tuple(
            axis
            for axis, (g_len, s_len) in enumerate(zip(gradient.shape, shape))
            if s_len == 1 and g_len != 1
        )
        if stretched:
            gradient = gradient.sum(axis=stretched, keepdims=True)
        return gradient

    def backward(self, gradient=None):
        """Back-propagate from this node through everything it depends on.

        Args:
            gradient: Gradient of the final objective with respect to this
                node. Defaults to ones shaped like ``self.data``.

        Nodes are processed in reverse topological order, so each ``grad_fn``
        runs exactly once, after every consumer of that node has contributed.
        Gradients of intermediate (non-leaf) nodes are reset at the start of
        the pass; leaf gradients keep accumulating across calls.
        """
        if gradient is None:
            gradient = np.ones_like(self.data)

        # Iterative post-order DFS: operands end up before their consumers.
        order = []
        seen = set()
        stack = [(self, False)]
        while stack:
            node, expanded = stack.pop()
            if expanded:
                order.append(node)
                continue
            if id(node) in seen:
                continue
            seen.add(id(node))
            stack.append((node, True))
            for operand in node.prev:
                if id(operand) not in seen:
                    stack.append((operand, False))

        # Stale gradients on intermediates would otherwise be re-propagated.
        for node in order:
            if node is not self and node.prev:
                node.grad = 0

        self.grad = gradient
        for node in reversed(order):
            node.grad_fn(node.grad)

    def __repr__(self):
        return repr(self.data)

    def __add__(self, other):
        """Element-wise addition with NumPy broadcasting."""
        other = other if isinstance(other, Tensor) else Tensor(other)
        out = Tensor(self.data + other.data, (self, other), op='+')

        def grad_fn(gradient):
            self.grad += Tensor._unbroadcast(gradient, np.shape(self.data))
            other.grad += Tensor._unbroadcast(gradient, np.shape(other.data))
        out.grad_fn = grad_fn
        return out

    def __mul__(self, other):
        """Element-wise multiplication with NumPy broadcasting."""
        other = other if isinstance(other, Tensor) else Tensor(other)
        out = Tensor(self.data * other.data, (self, other), op='*')

        def grad_fn(gradient):
            self.grad += Tensor._unbroadcast(
                gradient * other.data, np.shape(self.data))
            other.grad += Tensor._unbroadcast(
                gradient * self.data, np.shape(other.data))
        out.grad_fn = grad_fn
        return out

    def __pow__(self, other):
        """Raise to a constant ``int``/``float`` power."""
        assert isinstance(other, (int, float)), \
            "only int/float exponents are supported"
        out = Tensor(self.data ** other, (self,), op='**')

        def grad_fn(gradient):
            self.grad += gradient * (other * (self.data ** (other - 1)))
        out.grad_fn = grad_fn
        return out

    def __matmul__(self, other):
        """Matrix product.

        NOTE(review): the gradient formulas use ``.T`` and therefore assume
        both operands are 2-D matrices.
        """
        other = other if isinstance(other, Tensor) else Tensor(other)
        out = Tensor(self.data @ other.data, (self, other), op='@')

        def grad_fn(gradient):
            self.grad += gradient @ other.data.T
            other.grad += self.data.T @ gradient
        out.grad_fn = grad_fn
        return out

    def __rmatmul__(self, other):
        return Tensor(other) @ self

    def relu(self):
        """Zero out non-positive entries; gradient passes where output > 0."""
        out = Tensor(self.data * (self.data > 0), (self,), op='relu')

        def grad_fn(gradient):
            self.grad += gradient * (out.data > 0)
        out.grad_fn = grad_fn
        return out

    def __neg__(self):
        return self * -1

    def __radd__(self, other):
        return self + other

    def __sub__(self, other):
        return self + (-other)

    def __rsub__(self, other):
        # Start from the Tensor so the result is always built by Tensor.__add__.
        return (-self) + other

    def __rmul__(self, other):
        return self * other

    def __truediv__(self, other):
        return self * (other ** -1)

    def __rtruediv__(self, other):
        return (self ** -1) * other

    @property
    def shape(self):
        """Shape of ``data``; ``()`` for Python scalars."""
        return np.shape(self.data)

    def checkbroadcast(self, other):
        """Record the first axis where the two shapes differ (legacy helper).

        The operand with the smaller extent on that axis gets its
        ``broadcast_dim`` set to the axis index. The arithmetic operators no
        longer call this; it is kept for callers that still use it.
        """
        for n, (i, j) in enumerate(zip(self.shape, other.shape)):
            if i == j:
                continue
            if i < j:
                self.broadcast_dim = n
            else:
                other.broadcast_dim = n
            break

In [3]:
# Toy regression problem with constant-valued float32 arrays:
# fit ``inp @ w + bi`` to the target ``y`` by gradient descent.
lr = 1e-2
inp = Tensor(np.full((4, 5), 2., dtype=np.float32))
w = Tensor(np.full((5, 4), -3., dtype=np.float32))
bi = Tensor(np.full((4, 1), 4., dtype=np.float32))
y = Tensor(np.full((4, 4), 20., dtype=np.float32))
# Trainable parameters, in the order the training loop indexes them.
params = (w, bi)

In [4]:
# Plain gradient descent on the element-wise squared error, printing the
# summed loss before each update.
for step in range(15):
    o = inp @ params[0] + params[1]
    loss = (o - y)**2
    print(loss.data.sum())
    loss.backward()
    # Rebuild every parameter as a fresh leaf from raw arrays. Updating with
    # Tensor arithmetic (``x - lr*x.grad``) would return a node whose ``prev``
    # links back to the old parameter, so the graph would grow each step and
    # backward() would walk the entire training history again. A fresh Tensor
    # also starts with grad == 0, so no explicit gradient reset is needed.
    params = [Tensor(p.data - lr * p.grad) for p in params]

33856.0
15655.013
7238.8794
3347.257
1547.7716
715.6899
330.9349
153.02432
70.75845
32.718678
15.1291275
6.9957247
3.234822
1.4957839
0.691654


In [5]:
# Display the prediction computed in the last training iteration
# (rendered through Tensor.__repr__, i.e. the repr of the wrapped array).
o

array([[19.792086, 19.792086, 19.792086, 19.792086],
       [19.792086, 19.792086, 19.792086, 19.792086],
       [19.792086, 19.792086, 19.792086, 19.792086],
       [19.792086, 19.792086, 19.792086, 19.792086]], dtype=float32)

In [6]:
# Display the target that the squared-error loss compares the prediction to.
y

array([[20., 20., 20., 20.],
       [20., 20., 20., 20.],
       [20., 20., 20., 20.],
       [20., 20., 20., 20.]], dtype=float32)

In [7]:
import torch

In [8]:
# PyTorch reference for the NumPy experiment: same values, shapes and dtype.
# a ~ inp, b ~ w, d ~ bi, f ~ y.
a = torch.full((4, 5), 2.0, dtype=torch.float32)
b = torch.full((5, 4), -3.0, dtype=torch.float32, requires_grad=True)
# The bias is (4, 1) to mirror ``bi``. It used to be (1, 4), which is a
# different model that only produced the same numbers because every value
# in this toy problem is uniform.
d = torch.full((4, 1), 4.0, dtype=torch.float32, requires_grad=True)
f = torch.full((4, 4), 20.0, dtype=torch.float32)

In [9]:
# Reference run: the same 15 gradient-descent steps using PyTorch autograd.
for i in range(15):
    e = a @ b + d
    loss = (e - f) ** 2
    total = loss.sum()
    print(total)
    total.backward()
    # Update in place without recording the step, then drop the gradients
    # so the next backward() starts from scratch.
    with torch.no_grad():
        for param in (b, d):
            param -= lr * param.grad
            param.grad = None

tensor(33856., grad_fn=<SumBackward0>)
tensor(15655.0137, grad_fn=<SumBackward0>)
tensor(7238.8794, grad_fn=<SumBackward0>)
tensor(3347.2573, grad_fn=<SumBackward0>)
tensor(1547.7716, grad_fn=<SumBackward0>)
tensor(715.6899, grad_fn=<SumBackward0>)
tensor(330.9349, grad_fn=<SumBackward0>)
tensor(153.0243, grad_fn=<SumBackward0>)
tensor(70.7585, grad_fn=<SumBackward0>)
tensor(32.7187, grad_fn=<SumBackward0>)
tensor(15.1291, grad_fn=<SumBackward0>)
tensor(6.9957, grad_fn=<SumBackward0>)
tensor(3.2348, grad_fn=<SumBackward0>)
tensor(1.4958, grad_fn=<SumBackward0>)
tensor(0.6917, grad_fn=<SumBackward0>)


In [10]:
# Display the PyTorch prediction computed in the last training iteration.
e

tensor([[19.7921, 19.7921, 19.7921, 19.7921],
        [19.7921, 19.7921, 19.7921, 19.7921],
        [19.7921, 19.7921, 19.7921, 19.7921],
        [19.7921, 19.7921, 19.7921, 19.7921]], grad_fn=<AddBackward0>)

In [11]:
# Display the PyTorch target that the squared-error loss compares against.
f

tensor([[20., 20., 20., 20.],
        [20., 20., 20., 20.],
        [20., 20., 20., 20.],
        [20., 20., 20., 20.]])