**Lecture 1: The spelled-out intro to neural networks and backpropagation: building micrograd**

Backpropagation and training of neural networks. Assumes basic knowledge of Python and a vague recollection of calculus from high school.

- [YouTube video lecture](https://www.youtube.com/watch?v=VMj-3S1tku0)
- [Jupyter notebook files](lectures/micrograd)
- [micrograd GitHub repo](https://github.com/karpathy/micrograd)

In [None]:
import math
import torch
import random
import numpy as np
import matplotlib.pyplot as plt

from utils import draw_dot, build_topo


%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# Smoke test: instantiate the Module class imported from the local `nn`
# module and list its parameters (the cell displays the returned list).
from nn import Module
n = Module()
n.parameters()

[]

In [None]:
def f(x):
    """Evaluate the quadratic 3x^2 - 4x + 5 at ``x`` (scalar or NumPy array)."""
    quadratic_term = 3 * x**2
    linear_term = 4 * x
    return quadratic_term - linear_term + 5

# Evaluate f at a single point, then sample it on [-5, 5) in steps of 0.25
# and plot the resulting curve.
print(f(3))
xs = np.arange(-5, 5, 0.25)
# f is applied to the whole array at once (element-wise arithmetic).
ys = f(xs)
plt.plot(xs, ys)

In [None]:
class Value:
    """Scalar node of a computation graph with reverse-mode automatic differentiation.

    Every arithmetic operation on a ``Value`` returns a new ``Value`` that
    remembers its operands and a closure able to push its own gradient back
    to them. Calling :meth:`backward` on a result walks the graph in reverse
    topological order and accumulates gradients into ``grad`` of every node
    that contributed to it.
    """

    def __init__(self, data, _children=(), _op='', label=""):
        """Wrap ``data`` in a graph node.

        Args:
            data: the number held by this node.
            _children: operand nodes this node was computed from.
            _op: short tag of the operation that produced this node.
            label: optional human-readable name.
        """
        self.data = data
        # Accumulated derivative of the backward() root w.r.t. this node.
        self.grad = 0.0
        # Leaves have nothing to propagate; operators replace this closure.
        self._backward = lambda: None
        self._prev = set(_children)
        self._op = _op
        self.label = label

    def __repr__(self):
        return f"Value(data={self.data}, label={self.label})"

    def __add__(self, other):
        """Return ``self + other``; plain numbers are wrapped in a ``Value``."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(data=self.data + other.data, _children=(self, other), _op='+')

        def _backward():
            # Addition routes the upstream gradient unchanged to both operands.
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad

        out._backward = _backward
        return out

    def __radd__(self, other):
        return self + other

    def __neg__(self):
        return self * -1

    def __sub__(self, other):
        return self + (-other)

    def __rsub__(self, other):
        return other + (-self)

    def __mul__(self, other):
        """Return ``self * other``; plain numbers are wrapped in a ``Value``."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(data=self.data * other.data, _children=(self, other), _op="*")

        def _backward():
            # Product rule: each factor receives the other factor's value.
            self.grad += out.grad * other.data
            other.grad += out.grad * self.data

        out._backward = _backward
        return out

    def __rmul__(self, other):
        return self * other

    def __truediv__(self, other):
        """Return ``self / other``, expressed as ``self * other ** -1``."""
        other = other if isinstance(other, Value) else Value(other)
        return self * (other ** -1)

    def __rtruediv__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        return other / self

    def __abs__(self):
        """Return ``|self|``.

        The slope used in the backward pass is +1 for positive data and -1
        otherwise (including exactly 0, where the derivative is undefined).
        """
        out = Value(data=abs(self.data), _children=(self,), _op="abs")

        def _backward():
            self.grad += out.grad * (1.0 if self.data > 0 else -1.0)

        out._backward = _backward
        return out

    def __pow__(self, other):
        """Return ``self ** other`` where ``other`` is a number or a ``Value``.

        The derivative with respect to the exponent is ``out * log(self)``,
        which only exists for a strictly positive base. When the exponent was
        given as a plain number it needs no gradient, so a non-positive base
        is accepted in that case (this is what makes ``x ** 2`` and division
        by a negative ``Value`` differentiable).

        Raises:
            ValueError: from the backward pass, when the exponent is a
                ``Value`` and the base is not strictly positive.
        """
        exponent_is_constant = not isinstance(other, Value)
        other = Value(other) if exponent_is_constant else other
        out = Value(data=self.data ** other.data, _children=(self, other,), _op=f"**{other.data}")

        def _backward():
            base_is_positive = self.data > 0
            # Validate before touching any gradient so a failure leaves the
            # graph untouched.
            if not base_is_positive and not exponent_is_constant:
                raise ValueError(
                    "gradient with respect to the exponent is undefined for a "
                    f"non-positive base (base={self.data})"
                )
            self.grad += out.grad * other.data * self.data ** (other.data - 1)
            if base_is_positive:
                other.grad += out.grad * out.data * math.log(self.data)

        out._backward = _backward
        return out

    def __rpow__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        return other ** self

    def tanh(self):
        """Return ``tanh(self)``.

        Uses ``math.tanh`` so large magnitudes saturate to +/-1 instead of
        overflowing inside an explicit ``exp(2x)``.
        """
        out = Value(math.tanh(self.data), _children=(self,), _op="tanh")

        def _backward():
            # d/dx tanh(x) = 1 - tanh(x)^2
            self.grad += out.grad * (1 - out.data ** 2)

        out._backward = _backward
        return out

    def exp(self):
        """Return ``e ** self``."""
        out = Value(math.exp(self.data), _children=(self,), _op="exp")

        def _backward():
            # exp is its own derivative.
            self.grad += out.grad * out.data

        out._backward = _backward
        return out

    def backward(self):
        """Back-propagate from this node, accumulating into every ancestor's ``grad``.

        Gradients are added to whatever ``grad`` already holds, so reset them
        before calling this again on a graph that shares nodes.
        """
        # Post-order depth-first traversal with an explicit stack: it yields
        # the same ordering as the recursive formulation but does not depend
        # on the interpreter recursion limit for deep graphs.
        topo = []
        visited = {self}
        stack = [(self, iter(self._prev))]
        while stack:
            node, pending_children = stack[-1]
            for child in pending_children:
                if child not in visited:
                    visited.add(child)
                    stack.append((child, iter(child._prev)))
                    break
            else:
                # All operands handled: the node can be emitted.
                topo.append(node)
                stack.pop()

        # d(self)/d(self) = 1 seeds the chain rule.
        self.grad = 1.0

        for node in reversed(topo):
            node._backward()

In [None]:
# Leaves of the toy expression L = f * (a * b + c).
a = Value(2.0, label="a")
b = Value(-3.0, label="b")
c = Value(10.0, label="c")
f = Value(-2, label="f")

# Intermediate nodes, each labelled for the rendered graph.
e = a * b
e.label = "e"
d = e + c
d.label = "d"

L = f * d
L.label = "L"

draw_dot(L)

In [None]:
# Neuron inputs.
x1 = Value(2.0, label='x1')
x2 = Value(0.0, label='x2')
# Neuron weights.
w1 = Value(-3.0, label='w1')
w2 = Value(1.0, label='w2')
# Neuron bias.
b = Value(6.8813735870195432, label='b')

# Forward pass: o = tanh(x1*w1 + x2*w2 + b).
x1w1 = x1 * w1
x1w1.label = 'x1*w1'
x2w2 = x2 * w2
x2w2.label = 'x2*w2'
x1w1x2w2 = x1w1 + x2w2
x1w1x2w2.label = 'x1*w1 + x2*w2'
n = x1w1x2w2 + b
n.label = 'n'
o = n.tanh()
o.label = "o"

# Gradients filled in by hand, starting from do/do = 1.
o.grad = 1
# tanh: do/dn = 1 - o^2
n.grad = 1 - o.data ** 2
# Additions hand the gradient of n to each of their operands unchanged.
b.grad = x1w1x2w2.grad = x1w1.grad = x2w2.grad = n.grad

# Multiplications scale it by the value of the other factor.
x1.grad = w1.data * n.grad
x2.grad = w2.data * n.grad
w1.grad = x1.data * n.grad
w2.grad = x2.data * n.grad

draw_dot(o)

In [None]:
# Neuron inputs.
x1 = Value(2.0, label='x1')
x2 = Value(0.0, label='x2')
# Neuron weights.
w1 = Value(-3.0, label='w1')
w2 = Value(1.0, label='w2')
# Neuron bias.
b = Value(6.8813735870195432, label='b')

# Forward pass: o = tanh(x1*w1 + x2*w2 + b).
x1w1 = x1 * w1
x1w1.label = 'x1*w1'
x2w2 = x2 * w2
x2w2.label = 'x2*w2'
x1w1x2w2 = x1w1 + x2w2
x1w1x2w2.label = 'x1*w1 + x2*w2'
n = x1w1x2w2 + b
n.label = 'n'
o = n.tanh()
o.label = "o"

# Seed the output gradient, then invoke each node's local backward step by
# hand, from the output towards the inputs.
o.grad = 1
o._backward()
n._backward()
b._backward()  # b is a leaf, so this is the default no-op closure
x1w1x2w2._backward()
x1w1._backward()
x2w2._backward()

draw_dot(o)

In [None]:
# Neuron inputs.
x1 = Value(2.0, label='x1')
x2 = Value(0.0, label='x2')
# Neuron weights.
w1 = Value(-3.0, label='w1')
w2 = Value(1.0, label='w2')
# Neuron bias.
b = Value(6.8813735870195432, label='b')

# Forward pass: o = tanh(x1*w1 + x2*w2 + b).
x1w1 = x1 * w1
x1w1.label = 'x1*w1'
x2w2 = x2 * w2
x2w2.label = 'x2*w2'
x1w1x2w2 = x1w1 + x2w2
x1w1x2w2.label = 'x1*w1 + x2*w2'
n = x1w1x2w2 + b
n.label = 'n'
o = n.tanh()
o.label = "o"

# Let Value.backward() order the nodes and run every local backward step.
o.backward()

draw_dot(o)

### Multivariable case

In [None]:
# The same node feeds both operands of the addition, so its gradient must
# accumulate contributions from both uses.
a = Value(3.0, label='a')
b = a + a
b.label = 'b'

b.backward()
draw_dot(b)

In [None]:
# a and b each feed two different intermediate nodes (d and e).
a = Value(-2.0, label='a')
b = Value(3.0, label='b')

d = a * b
d.label = 'd'
e = a + b
e.label = 'e'
f = d * e
f.label = 'f'

f.backward()

draw_dot(f)

### Breaking down tanh

In [None]:
# Neuron inputs.
x1 = Value(2.0, label='x1')
x2 = Value(0.0, label='x2')
# Neuron weights.
w1 = Value(-3.0, label='w1')
w2 = Value(1.0, label='w2')
# Neuron bias.
b = Value(6.8813735870195432, label='b')

# Pre-activation: n = x1*w1 + x2*w2 + b.
x1w1 = x1 * w1
x1w1.label = 'x1*w1'
x2w2 = x2 * w2
x2w2.label = 'x2*w2'
x1w1x2w2 = x1w1 + x2w2
x1w1x2w2.label = 'x1*w1 + x2*w2'
n = x1w1x2w2 + b
n.label = 'n'

# The single-op version being replaced below:
#o = n.tanh(); o.label="o"

# tanh spelled out from primitive ops: (exp(2n) - 1) / (exp(2n) + 1).
e = (2 * n).exp()
o = (e - 1) / (e + 1)

o.backward()

draw_dot(o)

### PyTorch

In [None]:
# Same neuron as above, expressed with PyTorch tensors.
# The tensors are created directly as float64: building them with
# torch.Tensor([...]) first stores the literal as float32, which rounds the
# bias before .double() widens it and makes the result drift (~1e-7) from the
# double-precision Value computation it is meant to be compared against.
x1 = torch.tensor([2.0], dtype=torch.float64, requires_grad=True)
x2 = torch.tensor([0.0], dtype=torch.float64, requires_grad=True)
w1 = torch.tensor([-3.0], dtype=torch.float64, requires_grad=True)
w2 = torch.tensor([1.0], dtype=torch.float64, requires_grad=True)
b = torch.tensor([6.8813735870195432], dtype=torch.float64, requires_grad=True)
n = x1*w1 + x2*w2 + b
o = torch.tanh(n)

# Forward value, then gradients of o with respect to each input/weight.
print(o.data.item())
o.backward()

print('---')
print('x2', x2.grad.item())
print('w2', w2.grad.item())
print('x1', x1.grad.item())
print('w1', w1.grad.item())

In [None]:
class Module:
    """Base class for objects that expose a list of trainable parameters."""

    def parameters(self):
        """Return the trainable parameters; the base class owns none."""
        return []

    def zero_grad(self):
        """Set ``grad`` to 0 on every parameter returned by ``parameters()``."""
        for param in self.parameters():
            param.grad = 0
    
class Neuron(Module):
    """A single tanh unit: ``tanh(b + sum_i w_i * x_i)``."""

    def __init__(self, nin:int):
        """Create ``nin`` weights and one bias, each drawn uniformly from [-1, 1]."""
        # Weights are drawn before the bias so a seeded RNG yields the same
        # initialisation order.
        self.w = []
        for _ in range(nin):
            self.w.append(Value(random.uniform(-1,1)))
        self.b = Value(random.uniform(-1,1))

    def __call__(self, x):
        """Return the activation for the inputs ``x`` (paired with the weights via zip)."""
        # Start from the bias and add the weighted inputs one by one.
        act = self.b
        for wi, xi in zip(self.w, x):
            act = act + wi * xi
        return act.tanh()

    def parameters(self) -> list[Value]:
        """Return the weights followed by the bias."""
        return [*self.w, self.b]
    
class Layer(Module):
    """A group of ``nout`` neurons that all read the same ``nin`` inputs."""

    def __init__(self, nin:int, nout:int):
        self.neurons = [Neuron(nin) for _ in range(nout)]

    def __call__(self, x):
        """Apply every neuron to ``x``; a one-neuron layer returns the bare output."""
        outs = [neuron(x) for neuron in self.neurons]
        if len(outs) == 1:
            return outs[0]
        return outs

    def parameters(self) -> list[Value]:
        """Return the parameters of all neurons, in neuron order."""
        params = []
        for neuron in self.neurons:
            params.extend(neuron.parameters())
        return params

class MLP(Module):
    """Multi-layer perceptron built from a chain of ``Layer`` objects."""

    def __init__(self, nin:int, nouts:list[int]):
        """Build one layer per entry of ``nouts``; each layer's input width is the previous width."""
        sz = [nin] + nouts
        # Consecutive sizes give the (inputs, outputs) of each layer.
        self.layers = [Layer(n_in, n_out) for n_in, n_out in zip(sz, sz[1:])]

    def __call__(self, x):
        """Feed ``x`` through every layer in order and return the last output."""
        out = x
        for layer in self.layers:
            out = layer(out)
        return out

    def parameters(self) -> list[Value]:
        """Return the parameters of all layers, in layer order."""
        params = []
        for layer in self.layers:
            params.extend(layer.parameters())
        return params

In [None]:
# Smoke test: run one 3-element input through a freshly initialised MLP with
# layer widths 4, 4 and 1, and display the result.
x = [2.0, 3.0, -1]
n = MLP(3, [4, 4, 1])
n(x)

In [None]:
# Seed the RNG so the MLP initialisation is reproducible.
random.seed(0)
xs = [
  [2.0, 3.0, -1.0],
  [3.0, -1.0, 0.5],
  [0.5, 1.0, 1.0],
  [1.0, 1.0, -1.0],
]
ys = [1.0, -1.0, -1.0, 1.0] # desired targets
n = MLP(3, [4, 4, 1])

losses = []

for k in range(1000):

    # forward pass: sum of squared errors over the four examples.
    # abs() keeps the base of ** non-negative before squaring.
    ypred = [n(x) for x in xs]
    loss = sum(abs(yout - ygt) ** 2 for yout, ygt in zip(ypred, ys))
    losses.append(loss.data)

    if k == 0:
        print(f"Initial loss : {loss.data}")

    # zero grad: backward() accumulates, so stale gradients must be cleared
    n.zero_grad()

    # backward pass
    loss.backward()

    # update (gradient descent)
    for p in n.parameters():
        p.data -= 0.001 * p.grad


# Loss of the last forward pass (computed before the final update).
print(f"Final loss : {loss.data}")

plt.plot(losses)


In [None]:
# Raise |a| to a Value exponent: abs() makes the base positive before **,
# then back-propagate and render the graph.
a = Value(-2.0)
b = Value(3.0)
c = abs(a) ** b
c.backward()
draw_dot(c)

In [None]:
# PyTorch version of a ** b with a = -2.0 and b = 3.0, both tracking gradients.
# NOTE(review): unlike the Value cell that computes abs(a) ** b, the base here
# is negative; the gradient w.r.t. the exponent involves log(a), so
# b_torch.grad is presumably NaN — confirm whether that is the intent.
a_torch = torch.tensor(-2.0, requires_grad=True)
b_torch = torch.tensor(3.0, requires_grad=True)
c_torch = a_torch ** b_torch
c_torch.backward()



print(f"PyTorch - Value of a ** b: {c_torch.item()}")
print(f"PyTorch - Gradient of a: {a_torch.grad.item()}")
print(f"PyTorch - Gradient of b: {b_torch.grad.item()}")

In [None]:
# Shift b_torch's stored value in place by 0.01 times its gradient (note the
# += : this steps along the gradient, not against it), then display the tensor.
b_torch.data  += 0.01 * b_torch.grad.item()
b_torch