In [491]:
from graphviz import Digraph

def trace(root):
  """Collect every node reachable from `root` together with the edges between them.

  The graph is walked depth-first by following each node's `_prev` collection.

  Returns:
    A `(nodes, edges)` pair of sets. `nodes` holds every visited object and
    `edges` holds `(child, parent)` tuples, one per `_prev` link encountered.
  """
  seen, links = set(), set()
  seen.add(root)
  # Each frame pairs a node with an iterator over the children still to be handled.
  frames = [(root, iter(root._prev))]
  while frames:
    parent, remaining = frames[-1]
    for child in remaining:
      links.add((child, parent))
      if child not in seen:
        seen.add(child)
        # Descend into the new child first; the parent's iterator resumes later.
        frames.append((child, iter(child._prev)))
        break
    else:
      # Every child of this node has been handled.
      frames.pop()
  return seen, links

def draw_dot(root):
  """Render the computation graph ending at `root` as a left-to-right Graphviz digraph.

  Every value becomes a record-shaped node showing its label, data and grad.
  A value that carries a non-empty `_op` also gets a separate operation node
  that feeds into it; inputs of that value are wired to the operation node.

  Returns:
    The populated `Digraph` (SVG output format).
  """
  graph = Digraph(format='svg', graph_attr={'rankdir': 'LR'})  # LR = left to right
  values, links = trace(root)

  for value in values:
    node_id = str(id(value))
    record = "{ %s | data %.4f | grad %.4f }" % (value.label, value.data, value.grad)
    graph.node(name=node_id, label=record, shape='record')
    if not value._op:
      # Plain value with no producing operation: nothing more to draw.
      continue
    # The op node id is the value id suffixed with the op string.
    op_id = node_id + value._op
    graph.node(name=op_id, label=value._op)
    graph.edge(op_id, node_id)

  for src, dst in links:
    # Inputs connect to the op node of the value they feed.
    graph.edge(str(id(src)), str(id(dst)) + dst._op)

  return graph

In [492]:
import math
import random

In [528]:
class Value:
    """A scalar that remembers how it was computed so gradients can be back-propagated.

    Attributes:
        data: the numeric value.
        label: optional display name.
        grad: accumulated derivative of the final output with respect to this value.
        _prev: set of the Values this one was computed from.
        _op: string naming the operation that produced this value ('' for leaves).
        _backward: callable that pushes this value's grad into its inputs.
    """

    def __init__(self, data, label='', _children=(), _op='', grad = 0, _backward = lambda: None):
        self.data = data
        self.label = label
        self._prev = set(_children)
        self._op = _op
        self.grad = grad
        self._backward = _backward

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        """Return self + other; plain numbers are wrapped in a Value."""
        if not isinstance(other, Value):
            other = Value(other)
        out = Value(self.data + other.data, _children = (self, other), _op = '+')

        def _backward():
            # Addition passes the upstream gradient through unchanged to both inputs.
            self.grad += out.grad
            other.grad += out.grad

        out._backward = _backward
        return out

    def __mul__(self, other):
        """Return self * other; plain numbers are wrapped in a Value."""
        if not isinstance(other, Value):
            other = Value(other)
        out = Value(self.data * other.data, _children = (self, other), _op= '*')

        def _backward():
            # Product rule: each input's gradient is scaled by the other input's data.
            self.grad += out.grad * other.data
            other.grad += out.grad * self.data

        out._backward = _backward
        return out

    def __rmul__(self, other):
        return self * other

    def __radd__(self, other):
        return self + other

    def __neg__(self):
        """Return -self, built as self * -1 so the gradient flows through __mul__."""
        return self * -1

    def __sub__(self, other):
        """Return self - other (other may be a Value or a plain number)."""
        return self + (-other)

    def __rsub__(self, other):
        """Return other - self for a non-Value left operand."""
        # Subtraction is not commutative: negate self, not other.
        return (-self) + other

    def __pow__(self, other):
        """Return self ** other for a constant int/float exponent.

        Raises:
            TypeError: if the exponent is not an int or float.
        """
        if not isinstance(other, (int, float)):
            raise TypeError(f"Exponent must be an int or float, got {type(other).__name__}")

        out = Value(self.data ** other, _children=(self,), _op=f"**{other}")

        def _backward():
            # d(x**n)/dx = n * x**(n-1). For n == 0 the derivative is 0, and
            # evaluating x**(-1) at x == 0 would raise ZeroDivisionError, so skip it.
            if other != 0:
                self.grad += out.grad * (other * self.data ** (other - 1))

        out._backward = _backward
        return out

    def tanh(self):
        """Return tanh(self)."""
        # math.tanh saturates to +/-1 for large inputs, whereas computing
        # (exp(2x) - 1) / (exp(2x) + 1) by hand overflows in math.exp.
        out = Value(math.tanh(self.data), _children = (self, ), _op = 'tanh')

        def _backward():
            # d tanh(x)/dx = 1 - tanh(x)**2
            self.grad += (1 - (out.data ** 2)) * out.grad

        out._backward = _backward
        return out

    def backward(self):
        """Back-propagate from this value, accumulating into `grad` of every upstream Value.

        Sets this value's grad to 1.0 and then calls each node's `_backward` in
        reverse topological order. Gradients accumulate (+=), so grads left over
        from an earlier pass are not reset here.
        """
        topo = []
        visited = {self}
        # Iterative post-order DFS: avoids RecursionError on long operation chains.
        stack = [(self, iter(self._prev))]
        while stack:
            node, children = stack[-1]
            for child in children:
                if child not in visited:
                    visited.add(child)
                    stack.append((child, iter(child._prev)))
                    break
            else:
                # All inputs of `node` are already in `topo`, so it can follow them.
                topo.append(node)
                stack.pop()

        self.grad = 1.0
        for node in reversed(topo):
            node._backward()
    

In [529]:
class Neuron:
    """One unit computing tanh(w . x + b) over `nin` inputs."""

    def __init__(self, nin):
        # One weight per input, each drawn uniformly from [-1, 1]; the bias is drawn last.
        weights = []
        for _ in range(nin):
            weights.append(Value(random.uniform(-1, 1)))
        self.w = weights
        self.b = Value(random.uniform(-1, 1))

    def __call__(self, x):
        """Apply the neuron to the input sequence `x` = [x1, x2, ...]."""
        # Pair each input with its weight, sum the products, add the bias, squash with tanh.
        weighted_inputs = [xi * wi for xi, wi in zip(x, self.w)]
        pre_activation = sum(weighted_inputs) + self.b
        return pre_activation.tanh()

    def parameters(self):
        """Return the trainable values: the bias first, then the weights."""
        return [self.b, *self.w]

class Layer:
    """A group of `nout` neurons that all read the same `nin` inputs."""

    def __init__(self, nin, nout):
        self.neurons = []
        for _ in range(nout):
            self.neurons.append(Neuron(nin))

    def __call__(self, x):
        """Feed `x` to every neuron.

        Returns a list of activations, or the bare activation when the layer
        has exactly one neuron.
        """
        activations = [neuron(x) for neuron in self.neurons]
        if len(activations) > 1:
            return activations
        return activations[0]

    def parameters(self):
        """Return the parameters of all neurons, flattened in neuron order."""
        return [p for neuron in self.neurons for p in neuron.parameters()]

class MLP:
    """A stack of layers applied one after another.

    nin:   number of inputs to the first layer
    nouts: list with the neuron count of each layer, in order
    """

    def __init__(self, nin, nouts):
        # Consecutive entries of `sz` give the (inputs, outputs) size of each layer.
        self.sz = [nin] + nouts
        self.layers = [
            Layer(n_in, n_out) for n_in, n_out in zip(self.sz, self.sz[1:])
        ]

    def __call__(self, x):
        """Pass `x` through every layer in order and return the last layer's output."""
        activation = x
        for layer in self.layers:
            activation = layer(activation)
        return activation

    def parameters(self):
        """Return the parameters of all layers, flattened in layer order."""
        return [p for layer in self.layers for p in layer.parameters()]

    def clear_grad(self):
        """Reset the grad of every parameter to 0.0."""
        for p in self.parameters():
            p.grad = 0.0

In [530]:
# Previously we looked at a single input vector producing a single output.
# Here we use several input vectors, each with its own desired target, and
# reduce the prediction error by minimising a loss over all of them.
# Each pair below is (input vector, desired target).
xs, ys = map(list, zip(
  ([2.0, 3.0, -1.0], 1.0),
  ([3.0, -1.0, 0.5], -1.0),
  ([0.5, 1.0, 1.0], -1.0),
  ([1.0, 1.0, -1.0], 1.0),
))

In [538]:
# Seed the RNG so the randomly initialised weights are the same on every
# fresh run of the notebook (Restart Kernel -> Run All gives identical results).
random.seed(42)
# 3 inputs, then layers of 4, 4 and 1 neurons.
m = MLP(3, [4, 4, 1])

In [554]:
h = 0.01  # learning rate: step size applied to each gradient
for i in range(1, 100):  # 99 gradient-descent steps

    # Forward pass: predictions for every sample under the current weights
    ypred = [m(x) for x in xs]

    # loss is one number quantifying how far the predictions are from the targets;
    # this is the sum of squared errors (not divided by the sample count)
    loss = sum((ysi-ypredi)**2 for ysi, ypredi in zip(ys, ypred))

    # Gradients accumulate (+=) inside backward(), so reset them BEFORE back-propagating.
    # Clearing here rather than after the update means grads left behind by an
    # earlier backward() call or an interrupted run cannot leak into this step.
    m.clear_grad()
    loss.backward() # fills in p.grad for every parameter

    # Step each parameter against its gradient so the loss goes down
    for p in m.parameters():
        p.data -= h * p.grad

    # Reports the loss measured before this step's update
    print(f'Loss {loss.data}')

Loss 0.01631754777502196
Loss 0.01621564829754353
Loss 0.01611491850713276
Loss 0.016015339229101765
Loss 0.01591689169355344
Loss 0.01581955752497735
Loss 0.0157233187321597
Loss 0.015628157698396712
Loss 0.015534057172001087
Loss 0.015441000257091723
Loss 0.015348970404655791
Loss 0.01525795140387566
Loss 0.015167927373710439
Loss 0.015078882754723643
Loss 0.014990802301149448
Loss 0.01490367107318883
Loss 0.014817474429528322
Loss 0.014732198020073538
Loss 0.014647827778891368
Loss 0.01456434991735271
Loss 0.014481750917470013
Loss 0.014400017525423414
Loss 0.01431913674526845
Loss 0.014239095832820585
Loss 0.014159882289709786
Loss 0.014081483857600793
Loss 0.014003888512572987
Loss 0.013927084459655004
Loss 0.013851060127509245
Loss 0.013775804163261927
Loss 0.01370130542747337
Loss 0.013627552989244495
Loss 0.013554536121455596
Loss 0.013482244296133098
Loss 0.01341066717993973
Loss 0.013339794629785531
Loss 0.013269616688554909
Loss 0.013200123580946715
Loss 0.013131305709423991

In [555]:
# Predictions from the last loop iteration (computed before that iteration's parameter update)
ypred

[Value(data=0.9551456599442291),
 Value(data=-0.963213084970007),
 Value(data=-0.933812461631954),
 Value(data=0.9529792514540648)]