In [None]:
import math
import random

In [None]:
class Value:
  '''
  A scalar node in a computation graph that supports reverse-mode
  automatic differentiation.

  Every arithmetic operation returns a new Value that remembers its
  operands and a closure implementing the local chain rule, so calling
  backward() on a result fills in .grad on every node it depends on.

  Attributes:
    data: the numerical value held by this node.
    grad: derivative of the node backward() was called on with respect
      to this node; starts at 0.0 and is accumulated into by backward().
    _prev: set of operand Values this node was computed from.
    _op: label of the operation that produced this node ('' for leaves).
    _backward: closure that adds this node's gradient contribution to
      its operands; a no-op for leaf nodes.
  '''

  def __init__(self, data, _children = (), _op = ''):
    self.data = data #numerical value
    self._prev = set(_children) #operand nodes that created this value
    self._op = _op #the operation that produced this value
    self.grad = 0.0
    #The _backward function applies the chain rule at this node: it
    #distributes out.grad to the operands according to the operation.
    #Leaf nodes (e.g. weights and biases) keep this empty default.
    self._backward = lambda : None

  def __repr__(self):
    return f"Value(data = {self.data})"

  def __add__(self, other):
    '''
    Addition: the derivative with respect to both inputs is 1,
    so the gradient flows back unchanged to both operands.
    Plain numbers are wrapped in a Value automatically.
    '''
    other = other if isinstance(other, Value) else Value(other)
    out = Value(self.data + other.data, (self, other), '+')
    def _backward():
      #Accumulate with += rather than overwrite: the same node may be
      #used several times (e.g. b = a + a) and every use contributes.
      self.grad += 1.0 * out.grad
      other.grad += 1.0 * out.grad
    out._backward = _backward
    return out

  def __radd__(self, other):
    '''Reflected addition (number + Value); addition is commutative.'''
    return self + other

  def __mul__(self, other):
    '''
    Multiplication: each gradient depends on the other operand's value.
    If z = x * y, then dz/dx = y and dz/dy = x.
    Plain numbers are wrapped in a Value automatically.
    '''
    other = other if isinstance(other, Value) else Value(other)
    out = Value(self.data * other.data, (self, other), '*')
    def _backward():
      self.grad += out.grad * other.data
      other.grad += out.grad * self.data
    out._backward = _backward
    return out

  def __rmul__(self, other):
    '''Reflected multiplication (number * Value); multiplication is commutative.'''
    return self * other

  def tanh(self):
    '''
    Hyperbolic tangent activation with derivative 1 - tanh(x)^2.

    math.tanh is used instead of the (e^(2x) - 1) / (e^(2x) + 1) formula
    because math.exp(2x) raises OverflowError for large positive x,
    whereas math.tanh saturates cleanly at +/-1.
    '''
    t = math.tanh(self.data)
    out = Value(t, (self, ), 'tanh')
    def _backward():
      self.grad += (1 - t**2) * out.grad
    out._backward = _backward
    return out

  #Helper operations expressed in terms of the primitives above, so
  #their gradients come for free from __add__, __mul__ and __pow__.

  def __neg__(self):
    '''Negation as multiplication by -1.'''
    return self * -1

  def __sub__(self, other):
    '''Subtraction as addition of the negated operand.'''
    return self + (-other)

  def __rsub__(self, other):
    '''Reflected subtraction (number - Value): other + (-self).'''
    return other + (-self)

  def exp(self):
    '''Exponential function: d(e^x)/dx = e^x'''
    x = self.data
    out = Value(math.exp(x), (self, ), 'exp')
    def _backward():
      #out.data already holds e^x, which is also the local derivative
      self.grad += out.data * out.grad
    out._backward = _backward
    return out

  def __truediv__(self, other):
    '''Division as multiplication: a / b = a * (b^-1)'''
    return self * other**-1

  def __rtruediv__(self, other):
    '''Reflected division (number / Value): other * self^-1.'''
    return other * self**-1

  def __pow__(self, other):
    '''
    Power function: d(x^n)/dx = n * x^(n-1)
    Supports only constant int or float exponents (not Value exponents).
    '''
    assert isinstance(other, (int, float)), "only int/float exponents are supported"
    out = Value(self.data**other, (self,), f'**{other}')
    def _backward():
      self.grad += other * (self.data ** (other - 1)) * out.grad
    out._backward = _backward
    return out

  def backward(self):
    '''
    Backpropagation: computes gradients for every node this value depends on.

    The graph is ordered topologically (operands before results) and then
    walked in reverse, so each node's grad is complete before it is pushed
    on to its operands. The ordering uses an explicit stack instead of
    recursion so deep graphs do not hit Python's recursion limit.
    '''
    topo = []
    visited = set()
    #Each stack entry is (node, expanded). A node is pushed once to be
    #expanded and once more, flagged, to be emitted after its operands.
    stack = [(self, False)]
    while stack:
      node, expanded = stack.pop()
      if expanded:
        topo.append(node)
        continue
      if node in visited:
        continue
      visited.add(node)
      stack.append((node, True))
      for child in node._prev:
        if child not in visited:
          stack.append((child, False))
    self.grad = 1.0 #start with gradient 1 at the output node
    for node in reversed(topo):
      node._backward()

In [None]:
class Neuron:
  '''
  One unit of the network: output = tanh(w . x + b),
  with w the weight vector, x the input vector and b the bias.
  '''
  def __init__(self, nin):
    #draw the nin weights first, then the bias, each from random.uniform(-1, 1)
    weights = []
    for _ in range(nin):
      weights.append(Value(random.uniform(-1, 1)))
    self.w = weights
    self.b = Value(random.uniform(-1, 1))

  def __call__(self, x):
    '''Return tanh of the bias plus the weighted sum of the inputs x.'''
    #start from the bias and add one weight*input product at a time
    total = self.b
    for weight, feature in zip(self.w, x):
      total = total + weight * feature
    return total.tanh()

  def parameters(self):
    '''Return the trainable parameters as a list: all weights, then the bias.'''
    return [*self.w, self.b]


class Layer:
  '''
  A group of neurons evaluated side by side on the same input,
  each contributing one output.
  '''
  def __init__(self, nin, nout):
    #nout neurons, each taking nin inputs
    units = []
    for _ in range(nout):
      units.append(Neuron(nin))
    self.neurons = units

  def __call__(self, x):
    '''Feed x to every neuron; a single-neuron layer returns the bare output, otherwise a list.'''
    results = []
    for unit in self.neurons:
      results.append(unit(x))
    if len(results) == 1:
      return results[0]
    return results

  def parameters(self):
    '''Return the parameters of every neuron in this layer as one flat list.'''
    collected = []
    for unit in self.neurons:
      collected.extend(unit.parameters())
    return collected


class MLP:
  '''
  Multi-Layer Perceptron: a feedforward stack of layers.
  nin is the input size; nouts lists the neuron count of each
  successive layer (hidden layers first, output layer last).
  '''
  def __init__(self, nin, nouts):
    #consecutive entries of sizes give each layer's (inputs, outputs)
    sizes = [nin] + nouts
    self.layers = [Layer(n_in, n_out) for n_in, n_out in zip(sizes, sizes[1:])]

  def __call__(self, x):
    '''Run x through the layers in order and return the last layer's output.'''
    current = x
    for layer in self.layers:
      current = layer(current)
    return current

  def parameters(self):
    '''Return the parameters of every layer as one flat list.'''
    collected = []
    for layer in self.layers:
      collected.extend(layer.parameters())
    return collected

In [None]:
#Example 1: build a network and run a single forward pass.
#The MLP takes 3 inputs and has layers of 4, 4 and 1 neurons.
n = MLP(3, [4, 4, 1])
x = [2.0, 3.0, -1.0] #sample input
output = n(x)
print(f"Network output: {output.data}")

In [None]:
#Example 2: training data.
#Four samples of three features each, with one +1/-1 target per sample.
xs = [[2.0, 3.0, -1.0], [3.0, -1.0, 0.5],
      [0.5, 1.0, 1.0], [1.0, 1.0, -1.0]] #input samples
ys = [
    1.0,
    -1.0,
    -1.0,
    1.0,
] #target output for each sample, in the same order as xs

In [None]:
#Example 3: training loop using backpropagation.
#Runs 10 gradient-descent steps on the training data, printing the loss each step.

for epoch in range(10):
  #----- forward pass -----
  #one prediction per training sample
  ypred = [n(sample) for sample in xs]
  #loss is the sum of squared differences between predictions and targets
  loss = sum((pred - target)**2 for target, pred in zip(ys, ypred))

  #----- backward pass -----
  #clear the gradients left on the parameters by the previous step,
  #since backward() accumulates into .grad
  for p in n.parameters():
    p.grad = 0.0
  #fill in .grad on every node the loss depends on
  loss.backward()

  #----- parameter update -----
  #Gradient descent: step each parameter against its gradient.
  #0.05 is the learning rate (step size).
  for p in n.parameters():
    p.data = p.data - 0.05 * p.grad

  #the loss printed is the one computed before this step's update
  print(f"Epoch {epoch}: loss = {loss.data:.4f}")