In [262]:
class Value:
    """stores a single scalar value and its gradient

    Every arithmetic result remembers the operands that produced it and a small
    closure that pushes its own gradient back to those operands. Calling
    ``backward()`` on a result then applies the chain rule through the whole
    expression that led to it.
    """

    def __init__(self, data, _previous=(), _operation=""):
        """Wrap ``data``; ``_previous`` and ``_operation`` are set by the operators."""
        self.data = data
        self.gradient = 0
        # internal variables used for autograd graph construction
        self._backward = lambda: None
        self._previous = set(_previous)
        self._operation = (
            _operation  # the op that produced this node, for graphviz / debugging / etc
        )

    def __add__(self, other):
        """Return ``self + other``; a non-Value ``other`` is wrapped in a Value."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), "+")

        def _backward():
            # addition passes the output gradient through to both operands
            self.gradient += out.gradient
            other.gradient += out.gradient

        out._backward = _backward

        return out

    def __mul__(self, other):
        """Return ``self * other``; a non-Value ``other`` is wrapped in a Value."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), "*")

        def _backward():
            # each operand's gradient is scaled by the other operand's value
            self.gradient += other.data * out.gradient
            other.gradient += self.data * out.gradient

        out._backward = _backward

        return out

    def __pow__(self, other):
        """Return ``self ** other`` for a plain int/float exponent."""
        assert isinstance(
            other, (int, float)
        ), "only supporting int/float powers for now"
        out = Value(self.data**other, (self,), f"**{other}")

        def _backward():
            # power rule: d(x**n)/dx = n * x**(n - 1)
            self.gradient += (other * self.data ** (other - 1)) * out.gradient

        out._backward = _backward

        return out

    def relu(self):
        """Return a Value holding 0 when ``data`` is negative, otherwise ``data``."""
        out = Value(0 if self.data < 0 else self.data, (self,), "ReLU")

        def _backward():
            # gradient only flows where the output is strictly positive
            self.gradient += (out.data > 0) * out.gradient

        out._backward = _backward

        return out

    def backward(self):
        """Accumulate d(self)/d(node) into ``gradient`` of every node feeding ``self``.

        Sets ``self.gradient`` to 1 and then runs each node's ``_backward`` closure
        in reverse topological order. Gradients are accumulated with ``+=``, so
        they are not reset here.
        """
        # Topological order of all nodes reachable from self (operands before
        # results). An explicit stack replaces recursion so that a long chain of
        # operations cannot exceed the interpreter's recursion limit.
        topo = []
        visited = {self}
        stack = [(self, iter(self._previous))]

        while stack:
            current, pending = stack[-1]
            for child in pending:
                if child not in visited:
                    visited.add(child)
                    stack.append((child, iter(child._previous)))
                    break
            else:
                # every operand of `current` is already placed, so it can follow them
                topo.append(current)
                stack.pop()

        # go one variable at a time and apply the chain rule to get its gradient
        self.gradient = 1
        for current in reversed(topo):
            current._backward()

    def __neg__(self):  # -self
        return self * -1

    def __radd__(self, other):  # other + self
        return self + other

    def __sub__(self, other):  # self - other
        return self + (-other)

    def __rsub__(self, other):  # other - self
        return other + (-self)

    def __rmul__(self, other):  # other * self
        return self * other

    def __truediv__(self, other):  # self / other
        return self * other**-1

    def __rtruediv__(self, other):  # other / self
        return other * self**-1

    def __repr__(self):
        return f"Value(data={self.data}, grad={self.gradient})"

In [263]:
import random


class Module:
    """Common parent of every trainable component in the network."""

    def parameters(self):
        """Return the component's trainable values; the base class has none."""
        return []

    def zero_grad(self):
        """Set the gradient of every value returned by parameters() back to 0."""
        for param in self.parameters():
            param.gradient = 0


class Neuron(Module):
    """One unit of the network: weighted sum of its inputs plus a bias."""

    def __init__(self, nin, nonlinear=True, neuron_name=None):
        """Create ``nin`` weights drawn uniformly from [-1, 1] and a zero bias."""
        self.neuron_name = neuron_name
        self.nonlinear = nonlinear
        initial_weights = []
        for _ in range(nin):
            initial_weights.append(Value(random.uniform(-1, 1)))
        self.weights = initial_weights
        self.bias = Value(0)
        # result of the most recent call; None until the neuron has been evaluated
        self.activation = None

    def __call__(self, x):
        """Evaluate the neuron on ``x`` and remember the result in ``activation``."""
        total = self.bias
        for weight, value in zip(self.weights, x):
            total = total + weight * value
        if self.nonlinear:
            total = total.relu()
        self.activation = total
        return total

    def parameters(self):
        """Return the weights followed by the bias."""
        return [*self.weights, self.bias]

    def __repr__(self):
        return str(self.neuron_name)


class Layer(Module):
    """Represents a layer of neurons in the neural network."""

    def __init__(self, nin, nout, nonlinear, layer_name):
        """Create ``nout`` neurons with ``nin`` inputs each.

        Neurons are named ``"<layer_name>_neuron_<index>"``.
        """
        self.layer_name = layer_name
        self.neurons = [
            Neuron(nin, nonlinear, f"{layer_name}_neuron_{i}") for i in range(nout)
        ]

    def __call__(self, x):
        """Evaluate every neuron on ``x``.

        Returns the single result directly when the layer has exactly one
        neuron, otherwise the list of results.
        """
        out = [n(x) for n in self.neurons]
        return out[0] if len(out) == 1 else out

    def parameters(self):
        """Return the parameters of all neurons, in neuron order."""
        return [
            parameter for neuron in self.neurons for parameter in neuron.parameters()
        ]

    def __repr__(self):
        # Build the joined text first: a backslash inside an f-string
        # replacement field is a SyntaxError on Python versions before 3.12.
        neuron_lines = "\n".join(str(neuron) for neuron in self.neurons)
        return f"[{neuron_lines}]"


class MLP(Module):
    """Represents a multi-layer perceptron (MLP) neural network."""

    def __init__(self, nin, nouts, layer_names):
        """Build one Layer per entry of ``nouts``.

        nin: number of inputs to the first layer.
        nouts: output size of each layer, in order (any iterable of ints).
        layer_names: name for each layer, indexed in step with ``nouts``.

        Every layer is non-linear except the last one.
        """
        nouts = list(nouts)  # accept tuples and other iterables, not only lists
        size = [nin] + nouts
        self.layers = [
            Layer(size[i], size[i + 1], i != len(nouts) - 1, layer_names[i])
            for i in range(len(nouts))
        ]

    def __call__(self, x):
        """Feed ``x`` through the layers in order and return the last output."""
        for layer in self.layers:
            x = layer(x)
        return x

    def parameters(self):
        """Return the parameters of all layers, in layer order."""
        return [parameter for layer in self.layers for parameter in layer.parameters()]

    def __repr__(self):
        # Build the joined text first: a backslash inside an f-string
        # replacement field is a SyntaxError on Python versions before 3.12.
        layer_blocks = "\n\n".join(
            f"{layer.layer_name} => {layer}" for layer in self.layers
        )
        return f"[{layer_blocks}]"

In [264]:
import json, os, time
from dataclasses import dataclass, asdict
from typing import List, Dict, Any


@dataclass
class NeuronSnapshot:
    """Plain-data record of one neuron's state, as filled in by snapshot_mlp."""

    layer: str  # name of the layer the neuron belongs to
    neuron: str  # the neuron's own name
    activation: str  # activation value rendered as text ("None" when there is none)
    weights: List[float]  # the neuron's weights as plain floats
    bias: float  # the neuron's bias as a plain float


def snapshot_mlp(mlp) -> Dict[str, Any]:
    """Hierarchical snapshot: layers → neurons → weights/bias.

    Returns ``{"layers": [{"layer": <name>, "neurons": [<neuron dict>, ...]}, ...]}``
    where each neuron dict carries the NeuronSnapshot fields. A neuron whose
    ``activation`` attribute is None is recorded with the text "None".
    """

    def describe(layer, neuron) -> Dict[str, Any]:
        # Compare against None explicitly rather than relying on the truthiness
        # of the activation object, so a recorded activation is never mistaken
        # for "not evaluated yet".
        recorded = neuron.activation
        activation_text = str(recorded.data) if recorded is not None else "None"
        return asdict(
            NeuronSnapshot(
                layer=layer.layer_name,
                neuron=neuron.neuron_name,
                activation=activation_text,
                weights=[float(weight.data) for weight in neuron.weights],
                bias=float(neuron.bias.data),
            )
        )

    layers = [
        {
            "layer": layer.layer_name,
            "neurons": [describe(layer, neuron) for neuron in layer.neurons],
        }
        for layer in mlp.layers
    ]
    return {"layers": layers}


def write_snapshot(
    payload: Dict[str, Any], step: int, dirpath="snapshots", keep_history=False
):
    """Write ``payload`` as JSON to ``<dirpath>/latest.json``.

    The stored document wraps the payload together with ``step`` and the
    current time. The directory is created when missing. With
    ``keep_history`` set, the same document is also written to
    ``step_<step, zero-padded to 6 digits>.json``.
    """
    os.makedirs(dirpath, exist_ok=True)
    record = {"step": step, "timestamp": time.time(), "snapshot": payload}

    def dump_to(filename):
        # overwrite the named file inside dirpath with the record
        with open(os.path.join(dirpath, filename), "w") as handle:
            json.dump(record, handle)

    dump_to("latest.json")
    if keep_history:
        dump_to(f"step_{step:06d}.json")

In [268]:
def f(x1, x2):
    """Target function the network is trained on: x1 squared plus half of x2."""
    squared_term = x1 * x1
    linear_term = 0.5 * x2
    return squared_term + linear_term


# Training inputs: every [x1, x2] pair with x1 in -2..2 and x2 in 0..2,
# ordered by x1 first and then x2.
train_X = [[x1, x2] for x1 in range(-2, 3) for x2 in range(3)]
train_Y = [f(*point) for point in train_X]

# Validation inputs: four points that are not in the training set.
val_X = [[-1.5, 0.5], [0.5, 1.5], [1.5, 0.5], [-0.5, 2.0]]
val_Y = [f(*point) for point in val_X]


def mean_square_error(predictions, targets):
    """Return the average of the squared prediction/target differences.

    Items are paired with zip, so surplus items of the longer sequence are
    ignored. Empty input raises ZeroDivisionError.
    """
    squared_errors = []
    for predicted, expected in zip(predictions, targets):
        error = predicted - expected
        squared_errors.append(error * error)

    total = sum(squared_errors)
    return total * (1.0 / len(squared_errors))


def tune_parameters(parameters, learning_rate):
    """Move every parameter one gradient-descent step against its gradient."""
    for param in parameters:
        step = -learning_rate * param.gradient
        param.data += step


def train(mlp, train_X, train_Y, epochs=10, learning_rate=0.1, snapshot_delay=1):
    """Train ``mlp`` with full-batch gradient descent on mean squared error.

    mlp: the model; called once per input row and updated in place.
    train_X: input rows; each row may have any number of features.
    train_Y: target value for each row of ``train_X``.
    epochs: number of passes over the data.
    learning_rate: step size handed to tune_parameters.
    snapshot_delay: seconds to pause after each snapshot is written; pass 0
        to train without pausing.

    Prints the loss of every epoch and writes a snapshot after every update.
    """
    for epoch in range(epochs):
        # Forward pass
        predictions = [mlp([Value(feature) for feature in row]) for row in train_X]
        loss = mean_square_error(predictions, [Value(y) for y in train_Y])
        print(f"Epoch {epoch+1}/{epochs}  mean_squared_error={loss.data:.4f}")

        # Backward pass: clear old gradients first, because backward() accumulates
        mlp.zero_grad()
        loss.backward()
        tune_parameters(mlp.parameters(), learning_rate)

        # Snapshot
        snapshot = snapshot_mlp(mlp)
        write_snapshot(snapshot, epoch + 1)
        if snapshot_delay:
            time.sleep(snapshot_delay)


# Seed the generator that draws the initial weights, so a fresh
# Restart & Run All reproduces the same starting model and loss curve.
random.seed(42)

mlp = MLP(2, [4, 3, 4, 1], ["hidden1", "hidden2", "hidden3", "logits"])
train(mlp, train_X, train_Y, epochs=50, learning_rate=0.01)

Epoch 1/50  mean_squared_error=10.0813
Epoch 2/50  mean_squared_error=9.5022
Epoch 3/50  mean_squared_error=9.0247
Epoch 4/50  mean_squared_error=8.6429
Epoch 5/50  mean_squared_error=8.3252
Epoch 6/50  mean_squared_error=8.0521
Epoch 7/50  mean_squared_error=7.8489
Epoch 8/50  mean_squared_error=7.6556
Epoch 9/50  mean_squared_error=7.4699
Epoch 10/50  mean_squared_error=7.2916
Epoch 11/50  mean_squared_error=7.1203
Epoch 12/50  mean_squared_error=6.9558
Epoch 13/50  mean_squared_error=6.7978
Epoch 14/50  mean_squared_error=6.6461
Epoch 15/50  mean_squared_error=6.5004
Epoch 16/50  mean_squared_error=6.3605
Epoch 17/50  mean_squared_error=6.2261
Epoch 18/50  mean_squared_error=6.0970
Epoch 19/50  mean_squared_error=5.9731
Epoch 20/50  mean_squared_error=5.8540
Epoch 21/50  mean_squared_error=5.7397
Epoch 22/50  mean_squared_error=5.6299
Epoch 23/50  mean_squared_error=5.5244
Epoch 24/50  mean_squared_error=5.4231
Epoch 25/50  mean_squared_error=5.3258
Epoch 26/50  mean_squared_error=5