# Day 2: Backpropagation

Understanding gradients, manual backpropagation, and implementing a simple neuron.

In [None]:
import math
from graphviz import Digraph

In [None]:
class Value:
    """A scalar node in a computation graph.

    Each instance stores a number (``data``), a slot for its gradient
    (``grad``, initialised to 0.0), the operand nodes that produced it
    (``_prev``), the name of the producing operation (``_op``) and an
    optional human-readable ``label``.

    Gradients are not computed by this class; ``grad`` is only storage
    that callers fill in.
    """

    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self.grad = 0.0
        # Stored as a set, so an operand used twice (e.g. ``a + a``)
        # appears only once among the predecessors.
        self._prev = set(_children)
        self._op = _op
        self.label = label

    def __repr__(self):
        return f"Value(data={self.data}, label={self.label})"

    def __add__(self, other):
        """Return a new '+' node; plain numbers are wrapped in a Value."""
        other = other if isinstance(other, Value) else Value(other)
        return Value(self.data + other.data, (self, other), '+')

    def __radd__(self, other):
        """Support ``number + Value``."""
        return self + other

    def __mul__(self, other):
        """Return a new '*' node; plain numbers are wrapped in a Value."""
        other = other if isinstance(other, Value) else Value(other)
        return Value(self.data * other.data, (self, other), '*')

    def __rmul__(self, other):
        """Support ``number * Value``."""
        return self * other

    def tanh(self):
        """Return a new 'tanh' node holding tanh(self.data).

        Uses ``math.tanh`` rather than (e^(2x) - 1) / (e^(2x) + 1): the
        explicit formula raises OverflowError once ``math.exp(2*x)``
        exceeds the float range, whereas tanh itself simply saturates
        at 1.0 for large inputs.
        """
        return Value(math.tanh(self.data), (self,), 'tanh')

In [None]:
def trace(root):
    """Collect every node and edge reachable from ``root``.

    Walks the graph by following each node's ``_prev`` collection.

    Returns:
        A ``(nodes, edges)`` pair of sets. ``nodes`` holds every visited
        node (including ``root``); ``edges`` holds ``(child, parent)``
        tuples, one for each operand ``child`` found in ``parent._prev``.
    """
    nodes, edges = set(), set()
    # Explicit stack instead of recursion, so a long chain of operations
    # cannot exhaust the interpreter's recursion limit.
    pending = [root]
    while pending:
        node = pending.pop()
        if node in nodes:
            continue
        nodes.add(node)
        for child in node._prev:
            edges.add((child, node))
            pending.append(child)
    return nodes, edges

def draw_dot(root):
    """Build a Graphviz ``Digraph`` for the graph that ends at ``root``.

    Every node returned by ``trace(root)`` becomes a record-shaped box
    showing its label, data and grad (four decimals each). A node that
    has a non-empty ``_op`` also gets a separate operation node feeding
    into it, and each operand edge is routed to that operation node.

    Returns:
        The populated ``Digraph`` (SVG format, ``rankdir=LR``).
    """
    graph = Digraph(format='svg', graph_attr={'rankdir': 'LR'})
    values, links = trace(root)

    for value in values:
        value_id = str(id(value))
        record = "{ %s | data %.4f | grad %.4f }" % (value.label, value.data, value.grad)
        graph.node(name=value_id, label=record, shape='record')
        if not value._op:
            continue
        # Node names are the value's id with the op name appended, which
        # keeps the operation node distinct from the value node itself.
        op_id = value_id + value._op
        graph.node(name=op_id, label=value._op)
        graph.edge(op_id, value_id)

    for source, target in links:
        # Operands point at the target's operation node, not at the value.
        graph.edge(str(id(source)), str(id(target)) + target._op)

    return graph

## Manual Gradient Calculation

Let's manually compute the gradients using the chain rule.

In [None]:
# Leaves of the expression L = (a * b + c) * f
a = Value(2.0, label='a')
b = Value(-3.0, label='b')
c = Value(10.0, label='c')
f = Value(-2.0, label='f')

# Intermediate results and the final output
e = a * b
d = e + c
L = d * f
e.label, d.label, L.label = 'e', 'd', 'L'

# Backpropagation by hand, walking from the output back to the leaves.
L.grad = 1.0                      # dL/dL = 1
d.grad, f.grad = f.data, d.data   # L = d * f: dL/dd = f = -2, dL/df = d = 4
c.grad = e.grad = d.grad          # d = e + c: the gradient passes through unchanged (-2)
a.grad = b.data * e.grad          # e = a * b: dL/da = b * dL/de = -3 * -2 = 6
b.grad = a.data * e.grad          # e = a * b: dL/db = a * dL/de = 2 * -2 = -4

# Render the graph, now annotated with the gradients set above
draw_dot(L)

## Gradient Checking with Numerical Differentiation

We can verify our gradients using the definition of the derivative: `df/dx ≈ (f(x+h) - f(x)) / h`

In [None]:
def grad_check():
    """Print a finite-difference estimate of dL/da beside the analytical value.

    Evaluates L = (a * b + c) * f at a = 2.0 and at a = 2.0 + h (with
    b = -3.0, c = 10.0, f = -2.0 and h = 0.0001), then prints the slope
    (L2 - L1) / h followed by the hand-derived value 6.0000.
    """
    h = 0.0001

    def loss_at(a_value):
        # Rebuild the whole expression from scratch for this value of a.
        a = Value(a_value, label='a')
        b = Value(-3.0, label='b')
        c = Value(10.0, label='c')
        f = Value(-2.0, label='f')
        e = a * b
        e.label = 'e'
        d = e + c
        d.label = 'd'
        L = d * f
        L.label = 'L'
        return L.data

    baseline = loss_at(2.0)
    nudged = loss_at(2.0 + h)
    slope = (nudged - baseline) / h

    print(f"Numerical dL/da: {slope:.4f}")
    print(f"Analytical dL/da: 6.0000 (from chain rule)")

# Run the check: prints the numerical estimate and the analytical value.
grad_check()

## A Simple Neuron

Let's implement a single neuron with tanh activation: `o = tanh(x1*w1 + x2*w2 + b)`

In [None]:
# Two inputs to the neuron
x1 = Value(2.0, label='x1')
x2 = Value(0.0, label='x2')

# One weight per input
w1 = Value(-3.0, label='w1')
w2 = Value(1.0, label='w2')

# Bias term
b = Value(6.8813735870195432, label='b')

# Weighted inputs, built one product at a time so each gets its own label
x1w1 = x1 * w1
x1w1.label = 'x1*w1'
x2w2 = x2 * w2
x2w2.label = 'x2*w2'

# Pre-activation: n = x1*w1 + x2*w2 + b
x1w1x2w2 = x1w1 + x2w2
x1w1x2w2.label = 'x1w1+x2w2'
n = x1w1x2w2 + b
n.label = 'n'

# Output: o = tanh(n)
o = n.tanh()
o.label = 'o'

# Render the neuron's computation graph
draw_dot(o)