In [5]:
from __future__ import annotations
from typing import Iterable, List, Sequence, Tuple, Union, Optional, Literal
import math
import random

Number = Union[int, float]


class Tensor:
    """Dense N-dimensional tensor of Python floats with reverse-mode autodiff.

    Values live in ``self.data``, a flat row-major list; ``self.shape`` and
    ``self.strides`` describe how multi-dimensional indices map onto it.
    Each differentiable operation returns a new tensor that remembers its
    inputs (``_prev``) and a closure (``_backward``) which pushes the
    result's gradient into those inputs.  ``backward()`` runs the closures
    in reverse topological order.
    """

    def __init__(
        self,
        data: Sequence[Number],
        shape: Sequence[int] = (),
        *,
        requires_grad: bool = False,
        _children: Sequence["Tensor"] = (),
        _op: str = "",
    ):
        """Create a tensor from flat data.

        Args:
            data: Flat values in row-major order; each is converted to float.
            shape: Dimension sizes.  The empty shape ``()`` is a scalar that
                holds exactly one value.
            requires_grad: When true, a zero-filled ``grad`` buffer is
                allocated and gradients are accumulated into it.
            _children: Tensors this one was computed from (graph edges).
            _op: Short label of the producing operation, for debugging.

        Raises:
            ValueError: If ``len(data)`` differs from the size implied by
                ``shape``.
        """
        self.shape = tuple(int(s) for s in shape)
        self.size = math.prod(self.shape) if self.shape else 1
        flat = list(data)
        if len(flat) != self.size:
            raise ValueError(f"Data size {len(flat)} != shape size {self.size}")
        self.data: List[float] = [float(x) for x in flat]  # row-major flat storage
        self.strides = self._compute_strides(self.shape)
        self.requires_grad = bool(requires_grad)
        self.grad: Optional[List[float]] = [0.0] * self.size if self.requires_grad else None
        self._prev = set(_children)
        self._op = _op
        self._backward = lambda: None  # leaves have nothing to propagate
        self._name: Optional[str] = None

    def named(self, name: str) -> "Tensor":
        """Attach a display name (shown by ``repr``) and return ``self``."""
        self._name = name
        return self

    @staticmethod
    def _compute_strides(shape: Sequence[int]) -> Tuple[int, ...]:
        """Return row-major strides (in elements) for ``shape``; ``()`` for a scalar."""
        if not shape:
            return ()
        strides = [1] * len(shape)
        # The last axis is contiguous; each earlier axis skips a whole block
        # of the axes to its right.
        for i in range(len(shape) - 2, -1, -1):
            strides[i] = strides[i + 1] * shape[i + 1]
        return tuple(strides)

    @staticmethod
    def zeros(shape: Sequence[int], *, requires_grad: bool = False) -> "Tensor":
        """Return a tensor of the given shape filled with 0.0."""
        size = math.prod(shape) if shape else 1
        return Tensor([0.0] * size, shape, requires_grad=requires_grad)

    @staticmethod
    def ones(shape: Sequence[int], *, requires_grad: bool = False) -> "Tensor":
        """Return a tensor of the given shape filled with 1.0."""
        size = math.prod(shape) if shape else 1
        return Tensor([1.0] * size, shape, requires_grad=requires_grad)

    @staticmethod
    def randn(
        shape: Sequence[int],
        *,
        mean: float = 0.0,
        std: float = 1.0,
        requires_grad: bool = False,
        seed: Optional[int] = None,
    ) -> "Tensor":
        """Return a tensor of Gaussian samples.

        With ``seed`` given, a private ``random.Random(seed)`` generator is
        used, so the result is reproducible and the module-level generator is
        left untouched.  Without it, the module-level ``random`` is used.
        """
        count = math.prod(shape) if shape else 1
        if seed is not None:
            rnd = random.Random(seed)
            data = [rnd.gauss(mean, std) for _ in range(count)]
        else:
            data = [random.gauss(mean, std) for _ in range(count)]
        return Tensor(data, shape, requires_grad=requires_grad)

    @staticmethod
    def from_nested(nested: Sequence) -> "Tensor":
        """Build a tensor from nested lists/tuples (a bare number gives a scalar).

        The shape is read from the first element at each nesting depth.
        Ragged input is not validated beyond the total element count, which
        the constructor checks (``ValueError`` on mismatch).  An empty
        sequence at any depth ends the shape with a 0-length axis.
        """
        shape: List[int] = []
        temp = nested
        while isinstance(temp, (list, tuple)):
            shape.append(len(temp))
            if not temp:
                # Nothing to descend into; stopping here avoids looping
                # forever on an empty sequence.
                break
            temp = temp[0]
        flat: List[Number] = []

        def _flatten(x):
            if isinstance(x, (list, tuple)):
                for v in x:
                    _flatten(v)
            else:
                flat.append(x)

        _flatten(nested)
        return Tensor(flat, shape)

    def to_nested(self) -> List:
        """Return the values as nested lists mirroring ``shape`` (a float for a scalar)."""
        def _build(shape, offset):
            if not shape:
                return self.data[offset]
            step = self._compute_strides(shape)[0]
            return [_build(shape[1:], offset + i * step) for i in range(shape[0])]

        return _build(list(self.shape), 0)

    def item(self) -> float:
        """Return the single value of a size-1 tensor.

        Raises:
            ValueError: If the tensor holds more or fewer than one element.
        """
        if self.size != 1:
            raise ValueError("item() only valid for size==1")
        return float(self.data[0])

    def _index(self, idx: Sequence[int]) -> int:
        """Translate a full multi-index into an offset into ``self.data``.

        Raises:
            IndexError: On a wrong number of indices or an index outside
                ``[0, dim)``; negative indices are not supported.
        """
        if len(idx) != len(self.shape):
            raise IndexError("Incorrect number of indices")
        flat = 0
        for i, s, st in zip(idx, self.shape, self.strides):
            if i < 0 or i >= s:
                raise IndexError("Index out of bounds")
            flat += i * st
        return flat

    def __getitem__(self, idx: Sequence[int]) -> float:
        """Read one element, e.g. ``t[i, j]``; not tracked by autodiff."""
        return self.data[self._index(idx)]

    def __setitem__(self, idx: Sequence[int], value: Number) -> None:
        """Write one element in place, e.g. ``t[i, j] = v``; not tracked by autodiff."""
        self.data[self._index(idx)] = float(value)

    def zero_grad(self) -> None:
        """Reset the gradient buffer to zeros (no-op without ``requires_grad``)."""
        if self.requires_grad:
            self.grad = [0.0] * self.size

    def _accum_grad(self, grad_data: List[float]) -> None:
        """Add ``grad_data`` element-wise into ``self.grad``.

        Does nothing when the tensor does not require gradients.

        Raises:
            ValueError: If ``grad_data`` has the wrong length.
        """
        if not self.requires_grad:
            return
        if self.grad is None:
            self.grad = [0.0] * self.size
        if len(grad_data) != self.size:
            raise ValueError("Gradient size mismatch")
        for i in range(self.size):
            self.grad[i] += grad_data[i]

    def backward(self, grad: Optional[Union[Number, "Tensor"]] = None) -> None:
        """Back-propagate from this tensor through the recorded graph.

        Args:
            grad: Upstream gradient.  ``None`` means 1.0 and is only allowed
                for size-1 tensors; a number is likewise scalar-only; a
                ``Tensor`` must have the same number of elements.

        The seed is *added* to ``self.grad`` rather than overwriting it, and
        gradients of intermediate tensors are not cleared afterwards.

        Raises:
            RuntimeError: If this tensor does not require gradients, or a
                scalar seed is used for a non-scalar output.
            ValueError: If a tensor seed has the wrong size.
        """
        if not self.requires_grad:
            raise RuntimeError("backward() called on tensor that does not require gradients")
        if grad is None:
            if self.size != 1:
                raise RuntimeError("grad must be specified for non-scalar outputs")
            grad_data = [1.0]
        elif isinstance(grad, Tensor):
            if grad.size != self.size:
                raise ValueError("grad tensor must match output size")
            grad_data = grad.data[:]
        else:
            if self.size != 1:
                raise RuntimeError("numeric grad only supported for scalar outputs")
            grad_data = [float(grad)]

        # Depth-first post-order gives a topological order: every tensor is
        # listed after all tensors it was computed from.
        topo: List[Tensor] = []
        visited = set()

        def build(v: Tensor):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build(child)
                topo.append(v)

        build(self)
        self._accum_grad(grad_data)
        # Reverse order: a tensor's grad is complete before it is propagated.
        for v in reversed(topo):
            v._backward()

    def detach(self) -> "Tensor":
        """Return a copy of the data that is cut off from the autodiff graph."""
        return Tensor(self.data[:], self.shape, requires_grad=False)

    def reshape(self, *new_shape: int) -> "Tensor":
        """Return a copy with a new shape and the same row-major data.

        The shape may be given as separate ints (``t.reshape(2, 3)``) or as a
        single tuple/list (``t.reshape((2, 3))``).

        Raises:
            ValueError: If ``-1`` is used or the element count would change.
        """
        if len(new_shape) == 1 and isinstance(new_shape[0], (tuple, list)):
            new_shape = tuple(new_shape[0])
        if -1 in new_shape:
            raise ValueError("-1 reshape not supported")
        if math.prod(new_shape) != self.size:
            raise ValueError("New shape size mismatch")
        out = Tensor(self.data[:], new_shape, requires_grad=self.requires_grad, _children=(self,), _op="reshape")
        def _backward():
            # Same flat layout, so the gradient passes through unchanged.
            if self.requires_grad and out.grad is not None:
                self._accum_grad(out.grad[:])
        out._backward = _backward
        return out

    def transpose(self) -> "Tensor":
        """Return the transpose of a 2-D tensor as a new tensor.

        Raises:
            ValueError: If the tensor is not 2-D.
        """
        if len(self.shape) != 2:
            raise ValueError("transpose expects 2D tensor")
        m, n = self.shape
        out = Tensor.zeros((n, m), requires_grad=self.requires_grad)
        for i in range(m):
            for j in range(n):
                out[j, i] = self[i, j]
        out._prev = {self}
        out._op = "transpose"
        def _backward():
            if self.requires_grad and out.grad is not None:
                grad_self = [0.0] * self.size
                # out shape (n,m), self shape (m,n)
                for i in range(m):
                    for j in range(n):
                        grad_self[i * self.strides[0] + j] += out.grad[j * out.strides[0] + i]
                self._accum_grad(grad_self)
        out._backward = _backward
        return out

    @property
    def T(self) -> "Tensor":
        """Shorthand for ``transpose()``."""
        return self.transpose()

    def sum(self) -> "Tensor":
        """Return the sum of all elements as a scalar tensor."""
        out = Tensor([sum(self.data)], (), requires_grad=self.requires_grad, _children=(self,), _op="sum")
        def _backward():
            # d(sum)/dx_i = 1, so every element receives the scalar gradient.
            if self.requires_grad and out.grad is not None:
                self._accum_grad([out.grad[0]] * self.size)
        out._backward = _backward
        return out

    def __neg__(self) -> "Tensor":
        """Element-wise negation."""
        out = Tensor([-v for v in self.data], self.shape, requires_grad=self.requires_grad, _children=(self,), _op="neg")
        def _backward():
            if self.requires_grad and out.grad is not None:
                self._accum_grad([-g for g in out.grad])
        out._backward = _backward
        return out

    def __add__(self, other: Union["Tensor", Number]) -> "Tensor":
        """Element-wise ``self + other``; ``other`` is a same-shape tensor or a number.

        Raises:
            ValueError: If two tensors differ in shape (no broadcasting).
        """
        if isinstance(other, Tensor):
            if self.shape != other.shape:
                raise ValueError("Shape mismatch")
            out = Tensor([a + b for a, b in zip(self.data, other.data)], self.shape, requires_grad=(self.requires_grad or other.requires_grad), _children=(self, other), _op="+")
            def _backward():
                if out.grad is None:
                    return
                if self.requires_grad:
                    self._accum_grad(out.grad)
                if other.requires_grad:
                    other._accum_grad(out.grad)
            out._backward = _backward
            return out
        else:
            c = float(other)
            out = Tensor([a + c for a in self.data], self.shape, requires_grad=self.requires_grad, _children=(self,), _op="+c")
            def _backward():
                if self.requires_grad and out.grad is not None:
                    self._accum_grad(out.grad)
            out._backward = _backward
            return out

    def __radd__(self, other: Number) -> "Tensor":
        """``number + tensor``; addition commutes, so defer to ``__add__``."""
        return self.__add__(other)

    def __sub__(self, other: Union["Tensor", Number]) -> "Tensor":
        """Element-wise ``self - other``; ``other`` is a same-shape tensor or a number.

        Raises:
            ValueError: If two tensors differ in shape (no broadcasting).
        """
        if isinstance(other, Tensor):
            if self.shape != other.shape:
                raise ValueError("Shape mismatch")
            out = Tensor([a - b for a, b in zip(self.data, other.data)], self.shape, requires_grad=(self.requires_grad or other.requires_grad), _children=(self, other), _op="-")
            def _backward():
                if out.grad is None:
                    return
                if self.requires_grad:
                    self._accum_grad(out.grad)
                if other.requires_grad:
                    other._accum_grad([-g for g in out.grad])
            out._backward = _backward
            return out
        else:
            c = float(other)
            out = Tensor([a - c for a in self.data], self.shape, requires_grad=self.requires_grad, _children=(self,), _op="-c")
            def _backward():
                if self.requires_grad and out.grad is not None:
                    self._accum_grad(out.grad)
            out._backward = _backward
            return out

    def __rsub__(self, other: Number) -> "Tensor":
        """``number - tensor``, element-wise."""
        c = float(other)
        out = Tensor([c - a for a in self.data], self.shape, requires_grad=self.requires_grad, _children=(self,), _op="c-")
        def _backward():
            # d(c - a)/da = -1; without this closure the gradient would stop here.
            if self.requires_grad and out.grad is not None:
                self._accum_grad([-g for g in out.grad])
        out._backward = _backward
        return out

    def __mul__(self, other: Union["Tensor", Number]) -> "Tensor":
        """Element-wise ``self * other``; ``other`` is a same-shape tensor or a number.

        Raises:
            ValueError: If two tensors differ in shape (no broadcasting).
        """
        if isinstance(other, Tensor):
            if self.shape != other.shape:
                raise ValueError("Shape mismatch")
            out = Tensor([a * b for a, b in zip(self.data, other.data)], self.shape, requires_grad=(self.requires_grad or other.requires_grad), _children=(self, other), _op="*")
            def _backward():
                if out.grad is None:
                    return
                if self.requires_grad:
                    self._accum_grad([b * g for b, g in zip(other.data, out.grad)])
                if other.requires_grad:
                    other._accum_grad([a * g for a, g in zip(self.data, out.grad)])
            out._backward = _backward
            return out
        else:
            c = float(other)
            out = Tensor([a * c for a in self.data], self.shape, requires_grad=self.requires_grad, _children=(self,), _op="*c")
            def _backward():
                if self.requires_grad and out.grad is not None:
                    self._accum_grad([c * g for g in out.grad])
            out._backward = _backward
            return out

    def __rmul__(self, other: Number) -> "Tensor":
        """``number * tensor``; multiplication commutes, so defer to ``__mul__``."""
        return self.__mul__(other)

    def __truediv__(self, other: Union["Tensor", Number]) -> "Tensor":
        """Element-wise ``self / other``; ``other`` is a same-shape tensor or a number.

        Raises:
            ValueError: If two tensors differ in shape (no broadcasting).
            ZeroDivisionError: If any divisor is zero.
        """
        if isinstance(other, Tensor):
            if self.shape != other.shape:
                raise ValueError("Shape mismatch")
            out = Tensor([a / b for a, b in zip(self.data, other.data)], self.shape, requires_grad=(self.requires_grad or other.requires_grad), _children=(self, other), _op="/")
            def _backward():
                if out.grad is None:
                    return
                if self.requires_grad:
                    self._accum_grad([(1.0 / b) * g for b, g in zip(other.data, out.grad)])
                if other.requires_grad:
                    # d(a/b)/db = -a / b^2
                    other._accum_grad([(-a / (b * b)) * g for a, b, g in zip(self.data, other.data, out.grad)])
            out._backward = _backward
            return out
        else:
            c = float(other)
            out = Tensor([a / c for a in self.data], self.shape, requires_grad=self.requires_grad, _children=(self,), _op="/c")
            def _backward():
                if self.requires_grad and out.grad is not None:
                    self._accum_grad([(1.0 / c) * g for g in out.grad])
            out._backward = _backward
            return out

    def __pow__(self, power: Number) -> "Tensor":
        """Element-wise ``self ** power`` for a numeric (non-tensor) exponent."""
        p = float(power)
        out = Tensor([a ** p for a in self.data], self.shape, requires_grad=self.requires_grad, _children=(self,), _op="pow")
        def _backward():
            # d(a^p)/da = p * a^(p-1)
            if self.requires_grad and out.grad is not None:
                self._accum_grad([(p * (a ** (p - 1.0))) * g for a, g in zip(self.data, out.grad)])
        out._backward = _backward
        return out

    def matmul(self, other: "Tensor") -> "Tensor":
        """Matrix product of two 2-D tensors: ``(m, n) @ (n, p) -> (m, p)``.

        Raises:
            ValueError: If either operand is not 2-D or the inner dimensions
                differ.
        """
        if len(self.shape) != 2 or len(other.shape) != 2:
            raise ValueError("matmul expects 2D tensors")
        m, n = self.shape
        n2, p = other.shape
        if n != n2:
            raise ValueError("Inner dimensions mismatch")
        out = Tensor.zeros((m, p), requires_grad=(self.requires_grad or other.requires_grad))
        # i-k-j loop order: the innermost loop walks one row of `other` and
        # one row of `out` contiguously.
        for i in range(m):
            for k in range(n):
                a = self[i, k]
                base = i * out.strides[0]
                bbase = k * other.strides[0]
                for j in range(p):
                    out.data[base + j] += a * other.data[bbase + j]
        out._prev = {self, other}
        out._op = "matmul"
        def _backward():
            if out.grad is None:
                return
            if self.requires_grad:
                grad_a = [0.0] * self.size
                # dA = dOut @ B^T
                for i in range(m):
                    for k in range(n):
                        s = 0.0
                        for j in range(p):
                            s += out.grad[i * out.strides[0] + j] * other.data[k * other.strides[0] + j]
                        grad_a[i * self.strides[0] + k] += s
                self._accum_grad(grad_a)
            if other.requires_grad:
                grad_b = [0.0] * other.size
                # dB = A^T @ dOut
                for k in range(n):
                    for j in range(p):
                        s = 0.0
                        for i in range(m):
                            s += self.data[i * self.strides[0] + k] * out.grad[i * out.strides[0] + j]
                        grad_b[k * other.strides[0] + j] += s
                other._accum_grad(grad_b)
        out._backward = _backward
        return out

    def __repr__(self) -> str:
        """Short summary: shape, plus name and ``requires_grad`` when set."""
        name = f", name={self._name}" if self._name else ""
        rg = ", requires_grad=True" if self.requires_grad else ""
        return f"Tensor(shape={self.shape}{name}{rg})"


def relu(x: Tensor) -> Tensor:
    """Apply the rectifier element-wise: values that are not > 0 become 0.0.

    The result has the shape of ``x`` and is linked to it in the autodiff
    graph; the backward pass forwards the upstream gradient only where the
    input was strictly positive.
    """
    rectified = []
    for value in x.data:
        rectified.append(value if value > 0 else 0.0)
    out = Tensor(
        rectified,
        x.shape,
        requires_grad=x.requires_grad,
        _children=(x,),
        _op="relu",
    )

    def _backward():
        if not x.requires_grad or out.grad is None:
            return
        gated = []
        for value, upstream in zip(x.data, out.grad):
            gated.append(upstream if value > 0 else 0.0)
        x._accum_grad(gated)

    out._backward = _backward
    return out


def conv2d(
    x: Tensor,
    w: Tensor,
    b: Optional[Tensor] = None,
    *,
    stride: int = 1,
    padding: int = 0,
) -> Tensor:
    """2-D cross-correlation with NCHW input and OIHW weights.

    x: (N, C_in, H, W)
    w: (C_out, C_in, KH, KW)
    b: (C_out,) optional
    out: (N, C_out, H_out, W_out)

    The same ``stride`` and zero ``padding`` are used along both spatial
    axes.  Padding is implicit: taps that fall outside the input are skipped
    instead of materialising a padded copy.  The kernel is applied without
    flipping.

    Raises:
        ValueError: If ``x`` or ``w`` is not 4-D, the channel counts differ,
            ``b`` has the wrong shape, ``stride <= 0``, ``padding < 0``, or
            the resulting output would have a non-positive spatial size.
    """
    if len(x.shape) != 4 or len(w.shape) != 4:
        raise ValueError("conv2d expects x as (N,C,H,W) and w as (O,C,KH,KW)")
    N, C_in, H, W = x.shape
    C_out, C_in2, KH, KW = w.shape
    if C_in != C_in2:
        raise ValueError("conv2d channel mismatch")
    if b is not None and b.shape != (C_out,):
        raise ValueError("bias must have shape (C_out,)")
    if stride <= 0 or padding < 0:
        raise ValueError("stride must be >0 and padding >=0")

    # Number of kernel placements that fit along each (padded) spatial axis.
    H_out = (H + 2 * padding - KH) // stride + 1
    W_out = (W + 2 * padding - KW) // stride + 1
    if H_out <= 0 or W_out <= 0:
        raise ValueError("Output spatial size is non-positive; check padding/stride/kernel")

    # The output participates in autodiff if any input does.
    requires_grad = x.requires_grad or w.requires_grad or (b.requires_grad if b is not None else False)
    out = Tensor.zeros((N, C_out, H_out, W_out), requires_grad=requires_grad)
    out._prev = {x, w} | ({b} if b is not None else set())
    out._op = "conv2d"

    # Per-axis strides, unpacked once so the loops index the flat buffers directly.
    xs0, xs1, xs2, xs3 = x.strides
    ws0, ws1, ws2, ws3 = w.strides
    os0, os1, os2, os3 = out.strides

    # forward
    for n in range(N):
        for co in range(C_out):
            bias_val = b.data[co] if b is not None else 0.0
            for ho in range(H_out):
                # Top row of the receptive field; negative inside the padding.
                hi0 = ho * stride - padding
                for wo in range(W_out):
                    # Left column of the receptive field; negative inside the padding.
                    wi0 = wo * stride - padding
                    acc = bias_val
                    for ci in range(C_in):
                        x_base = n * xs0 + ci * xs1
                        w_base = co * ws0 + ci * ws1
                        for kh in range(KH):
                            hi = hi0 + kh
                            # Rows outside the image are zero padding: contribute nothing.
                            if hi < 0 or hi >= H:
                                continue
                            x_row = x_base + hi * xs2
                            w_row = w_base + kh * ws2
                            for kw in range(KW):
                                wi = wi0 + kw
                                # Columns outside the image are zero padding as well.
                                if wi < 0 or wi >= W:
                                    continue
                                acc += x.data[x_row + wi * xs3] * w.data[w_row + kw * ws3]
                    out.data[n * os0 + co * os1 + ho * os2 + wo * os3] = acc

    def _backward():
        # Propagate out.grad to x, w and b (each only if it requires grad).
        if out.grad is None:
            return
        # dx
        if x.requires_grad:
            dx = [0.0] * x.size
            # Same traversal as the forward pass: every (input, weight) pair
            # that fed an output receives weight * upstream gradient.
            for n in range(N):
                for co in range(C_out):
                    for ho in range(H_out):
                        hi0 = ho * stride - padding
                        for wo in range(W_out):
                            wi0 = wo * stride - padding
                            go = out.grad[n * os0 + co * os1 + ho * os2 + wo * os3]
                            # A zero upstream gradient contributes nothing; skip the window.
                            if go == 0.0:
                                continue
                            for ci in range(C_in):
                                x_base = n * xs0 + ci * xs1
                                w_base = co * ws0 + ci * ws1
                                for kh in range(KH):
                                    hi = hi0 + kh
                                    if hi < 0 or hi >= H:
                                        continue
                                    x_row = x_base + hi * xs2
                                    w_row = w_base + kh * ws2
                                    for kw in range(KW):
                                        wi = wi0 + kw
                                        if wi < 0 or wi >= W:
                                            continue
                                        dx[x_row + wi * xs3] += w.data[w_row + kw * ws3] * go
            x._accum_grad(dx)
        # dw
        if w.requires_grad:
            dw = [0.0] * w.size
            # For each weight element, sum input * upstream gradient over
            # every batch item and output position where that weight was used.
            for co in range(C_out):
                for ci in range(C_in):
                    for kh in range(KH):
                        for kw in range(KW):
                            acc = 0.0
                            for n in range(N):
                                for ho in range(H_out):
                                    hi = ho * stride - padding + kh
                                    if hi < 0 or hi >= H:
                                        continue
                                    for wo in range(W_out):
                                        wi = wo * stride - padding + kw
                                        if wi < 0 or wi >= W:
                                            continue
                                        go = out.grad[n * os0 + co * os1 + ho * os2 + wo * os3]
                                        acc += x.data[n * xs0 + ci * xs1 + hi * xs2 + wi * xs3] * go
                            dw[co * ws0 + ci * ws1 + kh * ws2 + kw * ws3] = acc
            w._accum_grad(dw)
        # db
        if b is not None and b.requires_grad:
            db = [0.0] * b.size
            # The bias is added to every output of its channel, so its
            # gradient is the sum of that channel's upstream gradients.
            for co in range(C_out):
                s = 0.0
                for n in range(N):
                    base = n * os0 + co * os1
                    for ho in range(H_out):
                        row = base + ho * os2
                        for wo in range(W_out):
                            s += out.grad[row + wo * os3]
                db[co] = s
            b._accum_grad(db)

    out._backward = _backward
    return out


def maxpool2d(x: Tensor, *, kernel: int = 2, stride: Optional[int] = None) -> Tensor:
    """Max-pool an NCHW tensor with a square, unpadded window.

    ``stride`` defaults to ``kernel`` (non-overlapping windows).  During the
    forward pass the flat input position of each window's maximum is
    recorded (first occurrence wins on ties); the backward pass sends each
    output gradient to exactly that position.

    Raises:
        ValueError: If ``x`` is not 4-D, ``kernel``/``stride`` is not
            positive, or no window fits inside the input.
    """
    if len(x.shape) != 4:
        raise ValueError("maxpool2d expects x as (N,C,H,W)")
    step = kernel if stride is None else stride
    if kernel <= 0 or step <= 0:
        raise ValueError("kernel/stride must be > 0")

    batch, channels, in_h, in_w = x.shape
    out_h = (in_h - kernel) // step + 1
    out_w = (in_w - kernel) // step + 1
    if out_h <= 0 or out_w <= 0:
        raise ValueError("Output spatial size is non-positive")

    pooled = Tensor.zeros((batch, channels, out_h, out_w), requires_grad=x.requires_grad)
    pooled._prev = {x}
    pooled._op = "maxpool2d"

    in_n, in_c, in_row, in_col = x.strides
    out_n, out_c, out_row, out_col = pooled.strides
    taps = range(kernel)
    lowest = -float("inf")
    # winners[k] is the flat input offset that produced the k-th output,
    # counted in the (n, c, ho, wo) visiting order of the loops below.
    winners: List[int] = []
    for n in range(batch):
        for c in range(channels):
            plane = n * in_n + c * in_c
            for ho in range(out_h):
                top = ho * step
                for wo in range(out_w):
                    left = wo * step
                    peak, peak_at = lowest, 0
                    for dh in taps:
                        line = plane + (top + dh) * in_row
                        for dw in taps:
                            at = line + (left + dw) * in_col
                            candidate = x.data[at]
                            if candidate > peak:
                                peak, peak_at = candidate, at
                    pooled.data[n * out_n + c * out_c + ho * out_row + wo * out_col] = peak
                    winners.append(peak_at)

    def _backward():
        if not x.requires_grad or pooled.grad is None:
            return
        routed = [0.0] * x.size
        for source, upstream in zip(winners, pooled.grad):
            routed[source] += upstream
        x._accum_grad(routed)

    pooled._backward = _backward
    return pooled


def mse_loss(pred: Tensor, target: Tensor) -> Tensor:
    """Return the mean of the squared element-wise differences.

    Built from the operands' own ``-``, ``*`` and ``sum()``, so the result
    stays connected to ``pred`` and ``target`` for back-propagation.

    Raises:
        ValueError: If the two shapes differ.
    """
    if pred.shape != target.shape:
        raise ValueError("mse_loss shape mismatch")
    residual = pred - target
    squared = residual * residual
    total = squared.sum()
    return total * (1.0 / pred.size)

In [4]:
# OpenCV image -> custom Tensor
from __future__ import annotations

import cv2

from typing import Literal, Optional, Tuple


def image_to_tensor(
    path: str,
    *,
    mode: Literal["color", "grayscale", "unchanged"] = "color",
    channel_order: Literal["bgr", "rgb"] = "rgb",
    normalize: bool = True,
    resize_to: Optional[Tuple[int, int]] = None,  # (width, height)
    add_batch_dim: bool = False,
    dtype: Literal["float", "int"] = "float",
) -> Tensor:
    """Load an image from disk with OpenCV and convert it to a Tensor.

    Args:
        path: File to read.
        mode: OpenCV read mode.
            - color: read with ``cv2.IMREAD_COLOR``
            - grayscale: read with ``cv2.IMREAD_GRAYSCALE``
            - unchanged: whatever OpenCV reads (can be (H,W), (H,W,3), (H,W,4))
        channel_order: "rgb" swaps OpenCV's BGR/BGRA to RGB/RGBA for 3- and
            4-channel images; "bgr" keeps the data as read.
        normalize: With ``dtype="float"``, scale values to [0, 1] by dividing
            by the full-scale value of the pixel type (255 for 8-bit, 65535
            for 16-bit unsigned images).  Other pixel types are divided by
            255.
        resize_to: Optional ``(width, height)`` to resize to before conversion.
        add_batch_dim: If true, the shape becomes ``(1, ...)``.
        dtype: "float" keeps (optionally normalized) float values; "int"
            truncates each value with ``int()`` and ignores ``normalize``.
            The returned Tensor stores every value as a float either way.

    Returns:
        A Tensor whose shape is the image array's shape, optionally prefixed
        with a batch axis of 1.  It does not require gradients.

    Raises:
        ValueError: On an empty/non-string path or an unknown ``mode``,
            ``channel_order`` or ``dtype``.
        FileNotFoundError: If OpenCV could not read the file.
    """
    if not path or not isinstance(path, str):
        raise ValueError("path must be a non-empty string")

    if mode == "color":
        flag = cv2.IMREAD_COLOR
    elif mode == "grayscale":
        flag = cv2.IMREAD_GRAYSCALE
    elif mode == "unchanged":
        flag = cv2.IMREAD_UNCHANGED
    else:
        raise ValueError(f"Unknown mode: {mode}")

    # Reject bad options before touching the disk or resizing anything.
    if channel_order != "bgr" and channel_order != "rgb":
        raise ValueError(f"Unknown channel_order: {channel_order}")
    if dtype != "float" and dtype != "int":
        raise ValueError(f"Unknown dtype: {dtype}")

    img = cv2.imread(path, flag)
    if img is None:
        # cv2.imread signals any read failure by returning None.
        raise FileNotFoundError(f"Could not read image at: {path}")

    if resize_to is not None:
        w, h = resize_to
        img = cv2.resize(img, (int(w), int(h)), interpolation=cv2.INTER_AREA)

    # Convert BGR->RGB if requested and image has channels
    if channel_order == "rgb" and mode != "grayscale" and getattr(img, "ndim", 0) == 3:
        # Only swap first 3 channels; preserves alpha if present (BGRA->RGBA)
        if img.shape[2] == 3:
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        elif img.shape[2] == 4:
            img = cv2.cvtColor(img, cv2.COLOR_BGRA2RGBA)

    flat = img.reshape(-1).tolist()
    if dtype == "float":
        if normalize:
            # mode="unchanged" can yield 16-bit images; dividing those by
            # 255 would leave values far above 1.0, so use the pixel type's
            # own maximum for unsigned integers.
            pixel_type = img.dtype
            if pixel_type.kind == "u":
                full_scale = float((1 << (8 * pixel_type.itemsize)) - 1)
            else:
                full_scale = 255.0
            flat = [v / full_scale for v in flat]
        else:
            flat = [float(v) for v in flat]
    else:
        # keep ints (and ignore normalize)
        flat = [int(v) for v in flat]

    shape = tuple(int(s) for s in img.shape)
    if add_batch_dim:
        shape = (1,) + shape
    return Tensor(flat, shape)


# image_path = "/path/to/your/image.jpg"
# img_tensor = image_to_tensor(image_path, mode="color", channel_order="rgb", normalize=True)
# print("tensor shape:", img_tensor.shape)
# print("first 10 values:", img_tensor.data[:10])

In [6]:
# Smoke test of the autodiff chain: conv2d -> relu -> sum -> backward.
# Fixed seeds keep the printed numbers reproducible between runs.
x = Tensor.randn((1, 1, 5, 5), seed=0, requires_grad=True).named("x")
w = Tensor.randn((2, 1, 3, 3), seed=1, requires_grad=True).named("w")
b = Tensor.zeros((2,), requires_grad=True).named("b")

# Forward pass reduced to a scalar so backward() needs no explicit seed.
y = relu(conv2d(x, w, b, stride=1, padding=0))
loss = y.sum()
loss.backward()

print("loss:", loss.item())
# A gradient counts as non-zero when any entry exceeds the 1e-12 tolerance.
for _label, _param in (("x.grad nonzero?", x), ("w.grad nonzero?", w)):
    print(_label, any(abs(g) > 1e-12 for g in (_param.grad or [])))
print("b.grad:", b.grad)

loss: 17.401077974429356
x.grad nonzero? True
w.grad nonzero? True
b.grad: [1.0, 5.0]


In [8]:
# Layers / model / loss / optimizers for 32x32 image classification (NCHW)
from __future__ import annotations

import math
import random
from typing import Iterable, List, Optional, Sequence, Tuple


# =========================
# Utilities
# =========================
def as_nchw(img: Tensor, *, add_batch: bool = True) -> Tensor:
    """Rearrange an (H,W) or (H,W,C) image tensor into channel-first layout.

    The result is (C,H,W), or (1,C,H,W) when ``add_batch`` is true.  A 2-D
    input is treated as a single channel.  Every step is recorded in the
    autodiff graph, so gradients flow back to ``img`` in its original layout.

    Raises:
        ValueError: If ``img`` is neither 2-D nor 3-D, or a 3-D input has a
            channel count other than 1, 3 or 4.
    """
    rank = len(img.shape)
    if rank not in (2, 3):
        raise ValueError("as_nchw expects (H,W) or (H,W,C)")

    if rank == 2:
        H, W = img.shape
        C = 1
        # One channel: HW and CHW share the same flat order, so copy as-is.
        out = Tensor(img.data[:], (C, H, W), requires_grad=img.requires_grad, _children=(img,), _op="as_nchw")

        def _backward():
            if img.requires_grad and out.grad is not None:
                img._accum_grad(out.grad[:])
    else:
        H, W, C = img.shape
        if C not in (1, 3, 4):
            raise ValueError("Expected C in {1,3,4}")
        pixels = H * W
        # source_of[dst] is the flat HWC offset that feeds flat CHW slot dst.
        source_of = [pixel * C + channel for channel in range(C) for pixel in range(pixels)]
        out = Tensor(
            [img.data[src] for src in source_of],
            (C, H, W),
            requires_grad=img.requires_grad,
            _children=(img,),
            _op="as_nchw",
        )

        def _backward():
            if img.requires_grad and out.grad is not None:
                grad_img = [0.0] * img.size
                # Undo the permutation: each CHW gradient returns to its HWC slot.
                for dst, src in enumerate(source_of):
                    grad_img[src] += out.grad[dst]
                img._accum_grad(grad_img)

    out._backward = _backward
    if not add_batch:
        return out

    # Prepending a batch axis of 1 leaves the flat data untouched.
    batched = Tensor(out.data[:], (1,) + out.shape, requires_grad=out.requires_grad, _children=(out,), _op="add_batch")

    def _backward_batched():
        if out.requires_grad and batched.grad is not None:
            out._accum_grad(batched.grad[:])

    batched._backward = _backward_batched
    return batched


# =========================
# Core module system
# =========================
class Module:
    """Base class for layers and models.

    Subclasses implement ``forward``; calling an instance forwards to it.
    Trainable tensors are discovered by inspecting instance attributes.
    """

    def parameters(self) -> List[Tensor]:
        """Collect the trainable tensors reachable from this module.

        Instance attributes are visited in definition order.  A tensor with
        ``requires_grad`` is collected, a sub-module contributes its own
        ``parameters()``, and a list or tuple attribute is scanned one level
        deep for the same two kinds of entries.
        """
        collected: List[Tensor] = []

        def _take(candidate) -> bool:
            # Returns True when the candidate was a parameter or a sub-module.
            if isinstance(candidate, Tensor) and candidate.requires_grad:
                collected.append(candidate)
                return True
            if isinstance(candidate, Module):
                collected.extend(candidate.parameters())
                return True
            return False

        for attribute in vars(self).values():
            if _take(attribute):
                continue
            if isinstance(attribute, (list, tuple)):
                for element in attribute:
                    _take(element)
        return collected

    def zero_grad(self) -> None:
        """Reset the gradient of every tensor returned by ``parameters()``."""
        for param in self.parameters():
            param.zero_grad()

    def __call__(self, *args, **kwargs):
        """Make the module callable by delegating to ``forward``."""
        return self.forward(*args, **kwargs)

    def forward(self, *args, **kwargs):
        """Compute the module's output; subclasses must override this."""
        raise NotImplementedError


# =========================
# Activation functions
# =========================
def softmax(x: Tensor) -> Tensor:
    """Row-wise softmax for logits shaped (N, C).

    Each row is shifted by its maximum before exponentiation so the
    exponentials cannot overflow.  The backward pass applies the softmax
    Jacobian per row: ``dx = p * (g - sum(g * p))``.

    Raises:
        ValueError: If ``x`` is not 2-D.
    """
    if len(x.shape) != 2:
        raise ValueError("softmax expects (N,C)")
    rows, classes = x.shape
    in_row, in_col = x.strides

    probs: List[float] = [0.0] * x.size
    for r in range(rows):
        offsets = [r * in_row + k * in_col for k in range(classes)]
        logits = [x.data[o] for o in offsets]
        peak = max(logits)
        weights = [math.exp(v - peak) for v in logits]
        total = sum(weights)
        for o, weight in zip(offsets, weights):
            probs[o] = weight / total

    out = Tensor(probs, x.shape, requires_grad=x.requires_grad, _children=(x,), _op="softmax")
    out_row, out_col = out.strides

    def _backward():
        if out.grad is None or not x.requires_grad:
            return
        dx = [0.0] * x.size
        for r in range(rows):
            slots = [r * out_row + k * out_col for k in range(classes)]
            # Inner product of upstream gradient and probabilities for this row.
            weighted = 0.0
            for slot in slots:
                weighted += out.grad[slot] * out.data[slot]
            for k, slot in enumerate(slots):
                prob = out.data[slot]
                dx[r * in_row + k * in_col] += prob * (out.grad[slot] - weighted)
        x._accum_grad(dx)

    out._backward = _backward
    return out


class ReLU(Module):
    """Parameter-free layer that applies ``relu`` to its input."""

    def forward(self, x: Tensor) -> Tensor:
        """Return ``relu(x)``."""
        activated = relu(x)
        return activated


class Softmax(Module):
    """Parameter-free layer that applies ``softmax`` to its input."""

    def forward(self, x: Tensor) -> Tensor:
        """Return ``softmax(x)``."""
        probabilities = softmax(x)
        return probabilities


# =========================
# Pooling layers
# =========================
class MaxPool2D(Module):
    """Layer wrapper around ``maxpool2d`` with a fixed kernel and stride."""

    def __init__(self, kernel: int = 2, stride: Optional[int] = None):
        """Store the window size and stride.

        ``stride`` is kept as ``None`` when not given and passed on as such
        to ``maxpool2d``.
        """
        self.kernel = int(kernel)
        if stride is None:
            self.stride = None
        else:
            self.stride = int(stride)

    def forward(self, x: Tensor) -> Tensor:
        """Pool ``x`` with the stored kernel and stride."""
        pooled = maxpool2d(x, kernel=self.kernel, stride=self.stride)
        return pooled


# =========================
# Learnable layers
# =========================
class Flatten(Module):
    """Collapse an (N, C, H, W) tensor to (N, C*H*W) using ``reshape``."""

    def forward(self, x: Tensor) -> Tensor:
        """Return ``x`` reshaped to two dimensions.

        Raises:
            ValueError: If ``x`` is not 4-D.
        """
        dims = x.shape
        if len(dims) != 4:
            raise ValueError("Flatten expects NCHW")
        batch = dims[0]
        features = dims[1] * dims[2] * dims[3]
        return x.reshape(batch, features)


class Conv2D(Module):
    """2-D convolution layer with a square kernel, backed by the functional `conv2d`.

    The weight has shape (out_channels, in_channels, kernel_size, kernel_size)
    and is drawn with `Tensor.randn` using std = sqrt(2 / fan_in); the bias has
    shape (out_channels,) and starts at zero. Both require gradients.
    """

    def __init__(self, in_channels: int, out_channels: int, kernel_size: int, *, stride: int = 1, padding: int = 0):
        self.in_channels = int(in_channels)
        self.out_channels = int(out_channels)
        self.kernel_size = int(kernel_size)
        self.stride = int(stride)
        self.padding = int(padding)

        # Kaiming-like init (simple): fan_in counts every input feeding one output value.
        k = self.kernel_size
        fan_in = self.in_channels * k * k
        init_std = math.sqrt(2.0 / fan_in)
        weight_shape = (self.out_channels, self.in_channels, k, k)
        self.weight = Tensor.randn(weight_shape, std=init_std, requires_grad=True)
        self.bias = Tensor.zeros((self.out_channels,), requires_grad=True)

    def forward(self, x: Tensor) -> Tensor:
        """Convolve `x` with this layer's weight and bias."""
        return conv2d(
            x,
            self.weight,
            self.bias,
            stride=self.stride,
            padding=self.padding,
        )


class Linear(Module):
    """Fully connected layer computing y = x @ W^T + b.

    `weight` has shape (out_features, in_features) and is drawn with
    `Tensor.randn` using std = sqrt(2 / in_features); `bias` has shape
    (out_features,) and starts at zero. Both require gradients.
    """

    def __init__(self, in_features: int, out_features: int):
        self.in_features = int(in_features)
        self.out_features = int(out_features)
        init_std = math.sqrt(2.0 / self.in_features)
        self.weight = Tensor.randn(
            (self.out_features, self.in_features), std=init_std, requires_grad=True
        )  # (out,in)
        self.bias = Tensor.zeros((self.out_features,), requires_grad=True)

    def forward(self, x: Tensor) -> Tensor:
        """Apply the affine map to a 2-D input and register its backward pass.

        Raises:
            ValueError: if `x` is not 2-D or its second dimension differs from
                `in_features`.
        """
        if len(x.shape) != 2:
            raise ValueError("Linear expects (N, in_features)")
        N, D = x.shape
        if D != self.in_features:
            raise ValueError("Linear input feature mismatch")

        W, b = self.weight, self.bias
        needs_grad = x.requires_grad or W.requires_grad or b.requires_grad
        out = Tensor.zeros((N, self.out_features), requires_grad=needs_grad)
        # Wire the result into the autograd graph by hand, since it was built
        # with `Tensor.zeros` rather than with explicit children.
        out._prev = {x, W, b}
        out._op = "linear"

        xs0, xs1 = x.strides
        ws0, ws1 = W.strides
        os0, os1 = out.strides

        # forward: y[n, o] = b[o] + sum_d x[n, d] * W[o, d]
        for n in range(N):
            x_base = n * xs0
            y_base = n * os0
            for o in range(self.out_features):
                w_base = o * ws0
                total = b.data[o]
                for d in range(D):
                    total += x.data[x_base + d * xs1] * W.data[w_base + d * ws1]
                out.data[y_base + o * os1] = total

        def _backward():
            upstream = out.grad
            if upstream is None:
                return
            n_out = self.out_features

            if x.requires_grad:
                # dx[n, d] = sum_o dy[n, o] * W[o, d]
                dx = [0.0] * x.size
                for n in range(N):
                    x_base = n * xs0
                    for d in range(D):
                        partial = 0.0
                        for o in range(n_out):
                            partial += upstream[n * os0 + o * os1] * W.data[o * ws0 + d * ws1]
                        dx[x_base + d * xs1] += partial
                x._accum_grad(dx)

            if W.requires_grad:
                # dW[o, d] = sum_n x[n, d] * dy[n, o]
                dW = [0.0] * W.size
                for o in range(n_out):
                    w_base = o * ws0
                    for d in range(D):
                        partial = 0.0
                        for n in range(N):
                            partial += x.data[n * xs0 + d * xs1] * upstream[n * os0 + o * os1]
                        dW[w_base + d * ws1] += partial
                W._accum_grad(dW)

            if b.requires_grad:
                # db[o] = sum_n dy[n, o]
                db = [0.0] * b.size
                for o in range(n_out):
                    partial = 0.0
                    for n in range(N):
                        partial += upstream[n * os0 + o * os1]
                    db[o] += partial
                b._accum_grad(db)

        out._backward = _backward
        return out


# =========================
# Loss functions
# =========================
def cross_entropy_loss(logits: Tensor, targets: Sequence[int]) -> Tensor:
    """Softmax cross-entropy, averaged over the batch.

    Args:
        logits: 2-D tensor of shape (N, C) holding unnormalised class scores.
        targets: N class indices, each in [0, C). They are converted to ints
            and copied once, so mutating the caller's sequence after this call
            cannot change what the backward pass computes.

    Returns:
        A scalar tensor (shape ``()``) with the mean negative log-likelihood.
        Its backward pass accumulates ``(softmax - one_hot) / N`` scaled by the
        upstream gradient into ``logits``.

    Raises:
        ValueError: if ``logits`` is not 2-D, ``targets`` has the wrong length,
            the batch is empty, or a target is outside [0, C).
    """
    if len(logits.shape) != 2:
        raise ValueError("cross_entropy_loss expects logits (N,C)")
    N, C = logits.shape
    if len(targets) != N:
        raise ValueError("targets length must equal N")
    if N == 0:
        # The mean over zero samples is undefined; fail with a clear message
        # instead of a ZeroDivisionError further down.
        raise ValueError("cross_entropy_loss requires a non-empty batch")

    # Snapshot the labels: the backward closure must see exactly the values
    # that were used to compute the forward loss.
    labels = [int(t) for t in targets]
    for t in labels:
        if not (0 <= t < C):
            raise ValueError("target out of range")

    s0, s1 = logits.strides
    probs: List[List[float]] = []
    loss_val = 0.0
    for n, t in enumerate(labels):
        base = n * s0
        row = [logits.data[base + c * s1] for c in range(C)]
        m = max(row)  # subtract the row max so exp() cannot overflow
        exps = [math.exp(v - m) for v in row]
        s = sum(exps)
        p = [e / s for e in exps]
        probs.append(p)
        # Clamp to keep log() finite when the target probability underflows.
        loss_val += -math.log(max(p[t], 1e-12))
    loss_val /= N

    out = Tensor([loss_val], (), requires_grad=logits.requires_grad, _children=(logits,), _op="cross_entropy")

    def _backward():
        if not logits.requires_grad or out.grad is None:
            return
        g = out.grad[0]
        dlogits = [0.0] * logits.size
        for n, t in enumerate(labels):
            base = n * s0
            row_probs = probs[n]
            for c in range(C):
                grad = row_probs[c]
                if c == t:
                    grad -= 1.0
                dlogits[base + c * s1] += (grad / N) * g
        logits._accum_grad(dlogits)

    out._backward = _backward
    return out


# =========================
# Optimizers
# =========================
class Optimizer:
    """Base class for optimizers: owns the parameter list and gradient reset.

    Subclasses implement `step`.
    """

    def __init__(self, params: Iterable[Tensor]):
        # Materialise the iterable so it can be traversed on every step.
        self.params = list(params)

    def zero_grad(self) -> None:
        """Call `zero_grad()` on every tracked parameter."""
        for param in self.params:
            param.zero_grad()

    def step(self) -> None:
        """Apply one update; must be provided by subclasses."""
        raise NotImplementedError


class SGD(Optimizer):
    """Stochastic gradient descent with optional momentum and L2 weight decay.

    Update per element, with g the gradient plus `weight_decay * w`:
        momentum == 0:  w -= lr * g
        momentum != 0:  v = momentum * v + g;  w -= lr * v
    """

    def __init__(self, params: Iterable[Tensor], *, lr: float = 1e-2, momentum: float = 0.0, weight_decay: float = 0.0):
        super().__init__(params)
        self.lr = float(lr)
        self.momentum = float(momentum)
        self.weight_decay = float(weight_decay)
        # Velocity buffers, keyed by id() of the parameter tensor.
        self._v: dict[int, List[float]] = {}

    def step(self) -> None:
        """Update every parameter that requires grad and has a gradient."""
        for param in self.params:
            if not (param.requires_grad and param.grad is not None):
                continue

            count = param.size
            # Work on a copy so the stored gradient is left untouched.
            step_grad = list(param.grad)
            if self.weight_decay != 0.0:
                for idx in range(count):
                    step_grad[idx] += self.weight_decay * param.data[idx]

            if self.momentum == 0.0:
                for idx in range(count):
                    param.data[idx] -= self.lr * step_grad[idx]
                continue

            key = id(param)
            velocity = self._v.get(key)
            if velocity is None:
                velocity = [0.0] * count
                self._v[key] = velocity
            for idx in range(count):
                velocity[idx] = self.momentum * velocity[idx] + step_grad[idx]
                param.data[idx] -= self.lr * velocity[idx]


class Adam(Optimizer):
    """Adam optimizer with bias-corrected moment estimates.

    `weight_decay` is added to the gradient (L2 style) before the moment
    updates. Moment buffers are keyed by id() of the parameter tensor, and `t`
    counts calls to `step`.
    """

    def __init__(self, params: Iterable[Tensor], *, lr: float = 1e-3, betas: Tuple[float, float] = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.0):
        super().__init__(params)
        self.lr = float(lr)
        self.beta1, self.beta2 = float(betas[0]), float(betas[1])
        self.eps = float(eps)
        self.weight_decay = float(weight_decay)
        self.t = 0
        self.m: dict[int, List[float]] = {}
        self.v: dict[int, List[float]] = {}

    def step(self) -> None:
        """Advance the step counter and update every parameter that has a gradient."""
        self.t += 1
        beta1, beta2 = self.beta1, self.beta2
        for param in self.params:
            if not (param.requires_grad and param.grad is not None):
                continue

            count = param.size
            # Work on a copy so the stored gradient is left untouched.
            grads = list(param.grad)
            if self.weight_decay != 0.0:
                for idx in range(count):
                    grads[idx] += self.weight_decay * param.data[idx]

            key = id(param)
            first = self.m.get(key)
            second = self.v.get(key)
            if first is None:
                first = [0.0] * count
                second = [0.0] * count
                self.m[key] = first
                self.v[key] = second

            # The bias-correction denominators depend only on the step count.
            correction1 = 1.0 - (beta1 ** self.t)
            correction2 = 1.0 - (beta2 ** self.t)
            for idx in range(count):
                g = grads[idx]
                first[idx] = beta1 * first[idx] + (1.0 - beta1) * g
                second[idx] = beta2 * second[idx] + (1.0 - beta2) * (g * g)
                m_hat = first[idx] / correction1
                v_hat = second[idx] / correction2
                param.data[idx] -= self.lr * m_hat / (math.sqrt(v_hat) + self.eps)


# =========================
# Models
# =========================
class SimpleCNN(Module):
    """Example model for 32x32 images. Output logits (N,num_classes).

    Two conv -> ReLU -> 2x2 max-pool stages followed by a linear classifier.
    The classifier's input width (16 * 8 * 8) is fixed for 32x32 inputs.
    """

    def __init__(self, in_channels: int = 3, num_classes: int = 10):
        # Stage 1: spatial size 32x32 -> 16x16 after pooling.
        self.conv1 = Conv2D(in_channels, 8, 3, padding=1)
        self.act1 = ReLU()
        self.pool1 = MaxPool2D(2)
        # Stage 2: spatial size 16x16 -> 8x8 after pooling.
        self.conv2 = Conv2D(8, 16, 3, padding=1)
        self.act2 = ReLU()
        self.pool2 = MaxPool2D(2)
        # Classifier head on the flattened 16-channel 8x8 maps.
        self.flat = Flatten()
        self.fc = Linear(16 * 8 * 8, num_classes)

    def forward(self, x: Tensor) -> Tensor:
        """Run the stages in order and return the classifier logits."""
        pipeline = (
            self.conv1, self.act1, self.pool1,
            self.conv2, self.act2, self.pool2,
            self.flat, self.fc,
        )
        for stage in pipeline:
            x = stage(x)
        return x


# Quick smoke test: one training step on random 32x32 batch
# Build the model first, then hand its parameters to the optimizer.
model = SimpleCNN(in_channels=3, num_classes=10)
optim = Adam(model.parameters(), lr=1e-3)

N = 4  # batch size
# Input is created directly in (N, C, H, W) layout; the seed keeps the batch reproducible.
x_batch = Tensor.randn((N, 3, 32, 32), requires_grad=False, seed=42)
y_batch = [0, 1, 2, 3]  # fake labels
# Forward pass and loss.
logits = model(x_batch)
loss = cross_entropy_loss(logits, y_batch)
# Clear gradients before backpropagating, then apply one Adam update.
optim.zero_grad()
loss.backward()
optim.step()
print("loss:", loss.item())

# (Optional) probabilities for inspection
# NOTE(review): detach() presumably returns the logits cut off from the autograd graph -- confirm.
probs = Softmax()(logits.detach())
# Storage is row-major and there are 10 classes, so data[0:10] is the first sample's row.
print("probs[0] sum:", sum(probs.data[0:10]))

loss: 2.3032716773973307
probs[0] sum: 0.9999999999999998


In [None]:
from __future__ import annotations

import math
from typing import List, Optional, Sequence, Tuple, Union


def _ensure_nchw(x: Tensor) -> Tensor:
    """Accepts (H,W,C), (N,H,W,C), (N,C,H,W) and returns (N,C,H,W)."""
    if len(x.shape) == 3:
        # HWC -> NCHW
        return as_nchw(x, add_batch=True)
    if len(x.shape) == 4:
        # Could be NCHW or NHWC. Heuristic: if second dim is 1/3/4 treat as NCHW, else treat as NHWC.
        N, A, B, C = x.shape
        if A in (1, 3, 4):
            return x
        # NHWC -> NCHW
        H, W, Ch = A, B, C
        nchw = [0.0] * (N * Ch * H * W)
        for n in range(N):
            for h in range(H):
                for w_ in range(W):
                    base = n * (H * W * Ch) + (h * W + w_) * Ch
                    for c in range(Ch):
                        nchw[n * (Ch * H * W) + c * (H * W) + h * W + w_] = x.data[base + c]
        out = Tensor(nchw, (N, Ch, H, W), requires_grad=x.requires_grad, _children=(x,), _op="nhwc_to_nchw")
        def _backward():
            if x.requires_grad and out.grad is not None:
                grad_x = [0.0] * x.size
                for n in range(N):
                    for h in range(H):
                        for w_ in range(W):
                            base = n * (H * W * Ch) + (h * W + w_) * Ch
                            for c in range(Ch):
                                grad_x[base + c] += out.grad[n * (Ch * H * W) + c * (H * W) + h * W + w_]
                x._accum_grad(grad_x)
        out._backward = _backward
        return out
    raise ValueError("Expected input as (H,W,C), (N,H,W,C), or (N,C,H,W)")


def _apply_activation(x: Tensor, activation: Optional[str]) -> Tensor:
    if activation is None:
        return x
    act = str(activation).lower()
    if act == "relu":
        return relu(x)
    if act == "softmax":
        return softmax(x)
    raise ValueError(f"Unsupported activation: {activation}")


class Sequential(Module):
    """Container that applies its layers one after another, in insertion order."""

    def __init__(self):
        self.layers: List[Module] = []

    def add(self, layer: Module) -> None:
        """Append `layer` to the end of the pipeline."""
        self.layers.append(layer)

    def forward(self, x: Tensor) -> Tensor:
        """Feed `x` through every layer and return the final output."""
        result = x
        for stage in self.layers:
            result = stage(result)
        return result


class _Conv2D_(Module):
    """Convolution layer configured by filter count, with optional lazy weight creation.

    Weights are created immediately when `input_shape` (H,W,C) is given;
    otherwise they are created on the first forward call from the channel
    count of the incoming tensor. `padding` may be "valid", "same" (square
    kernels only) or an int, and is validated when `forward` runs.
    """

    def __init__(
        self,
        filters: int,
        kernel_size: Union[int, Tuple[int, int]] = (3, 3),
        *,
        activation: Optional[str] = None,
        input_shape: Optional[Tuple[int, int, int]] = None,  # (H,W,C) channels_last
        stride: int = 1,
        padding: Union[int, str] = "valid",
    ):
        self.filters = int(filters)
        # A bare int means a square kernel; otherwise read (height, width).
        if isinstance(kernel_size, int):
            kernel_hw = (kernel_size, kernel_size)
        else:
            kernel_hw = (kernel_size[0], kernel_size[1])
        self.kh, self.kw = int(kernel_hw[0]), int(kernel_hw[1])
        self.activation = activation
        self.input_shape = input_shape
        self.stride = int(stride)
        self.padding = padding
        self.weight: Optional[Tensor] = None
        self.bias: Optional[Tensor] = None
        # With a known input shape the channel count is available right away.
        if input_shape is not None:
            self._build(int(input_shape[2]))

    def _build(self, in_channels: int) -> None:
        """Create weight (filters, in_channels, kh, kw) and a zero bias (filters,)."""
        fan_in = in_channels * self.kh * self.kw
        init_std = math.sqrt(2.0 / fan_in)
        weight_shape = (self.filters, in_channels, self.kh, self.kw)
        self.weight = Tensor.randn(weight_shape, std=init_std, requires_grad=True)
        self.bias = Tensor.zeros((self.filters,), requires_grad=True)

    def _padding_amount(self) -> int:
        """Translate `self.padding` into the integer passed to `conv2d`."""
        spec = self.padding
        if not isinstance(spec, str):
            return int(spec)
        mode = spec.lower()
        if mode == "valid":
            return 0
        if mode == "same":
            if self.kh != self.kw:
                raise ValueError("padding='same' only supported for square kernels in this implementation")
            return self.kh // 2
        raise ValueError("padding must be 'valid', 'same', or an int")

    def forward(self, x: Tensor) -> Tensor:
        """Convert `x` to NCHW, convolve it, then apply the optional activation."""
        x = _ensure_nchw(x)
        channels = x.shape[1]
        if self.weight is None or self.bias is None:
            self._build(channels)
        pad = self._padding_amount()
        convolved = conv2d(x, self.weight, self.bias, stride=self.stride, padding=pad)
        return _apply_activation(convolved, self.activation)


class _MaxPooling2D_(Module):
    """Max-pooling layer taking (height, width) tuples; only square windows and strides run.

    When `strides` is None the stride equals the pool size.
    """

    def __init__(self, pool_size: Tuple[int, int] = (2, 2), *, strides: Optional[Tuple[int, int]] = None):
        self.pool_size = (int(pool_size[0]), int(pool_size[1]))
        if strides is None:
            self.strides = None
        else:
            self.strides = (int(strides[0]), int(strides[1]))

    def forward(self, x: Tensor) -> Tensor:
        """Convert `x` to NCHW and pool it.

        Raises:
            ValueError: if the pool size or the strides are not square.
        """
        x = _ensure_nchw(x)
        pool_h, pool_w = self.pool_size
        if pool_h != pool_w:
            raise ValueError("Only square pool_size supported (e.g., (2,2))")
        step = pool_h  # default: non-overlapping windows
        if self.strides is not None:
            stride_h, stride_w = self.strides
            if stride_h != stride_w:
                raise ValueError("Only square strides supported (e.g., (2,2))")
            step = stride_h
        return maxpool2d(x, kernel=pool_h, stride=step)


class _Flatten_(Module):
    """Flatten (N,C,H,W) to (N, C*H*W); 2-D inputs pass through unchanged."""

    def forward(self, x: Tensor) -> Tensor:
        """Return a 2-D view of `x`.

        Raises:
            ValueError: if `x` is neither 4-D nor 2-D.
        """
        rank = len(x.shape)
        if rank == 4:
            batch, channels, height, width = x.shape
            return x.reshape(batch, channels * height * width)
        if rank == 2:
            return x
        raise ValueError("Flatten expects (N,C,H,W) or already-flat (N,D)")


class _Dense_(Module):
    """Dense layer wrapping `Linear`, with an optional activation.

    The inner `Linear` is created immediately when `input_shape` is given
    (its first entry is the input width); otherwise it is created on the first
    forward call from the incoming tensor's second dimension.
    """

    def __init__(self, units: int, *, activation: Optional[str] = None, input_shape: Optional[Tuple[int]] = None):
        self.units = int(units)
        self.activation = activation
        self.input_shape = input_shape
        self.linear: Optional[Linear] = None
        if input_shape is not None:
            in_features = int(input_shape[0])
            self.linear = Linear(in_features, self.units)

    def forward(self, x: Tensor) -> Tensor:
        """Apply the linear map and then the activation.

        Raises:
            ValueError: if `x` is not 2-D.
        """
        if len(x.shape) != 2:
            raise ValueError("Dense expects (N,D). Add Flatten() before Dense for conv nets.")
        if self.linear is None:
            # Lazy build: input width is only known once data arrives.
            self.linear = Linear(x.shape[1], self.units)
        projected = self.linear(x)
        return _apply_activation(projected, self.activation)

    def parameters(self) -> List[Tensor]:
        """Return the inner layer's parameters, or an empty list before it is built."""
        if self.linear is None:
            return []
        return self.linear.parameters()


class layers:
    """Namespace that exposes the layer classes under short names (`layers.Conv2D`, ...)."""

    # Activation layers are re-exported as they are.
    ReLU = ReLU
    Softmax = Softmax

    # Wrapper layers defined alongside this namespace.
    Conv2D = _Conv2D_
    MaxPooling2D = _MaxPooling2D_
    Flatten = _Flatten_
    Dense = _Dense_


# Second smoke test: build a conv net layer by layer with the `layers` namespace.
model2 = Sequential()
# Only the first layer declares input_shape (H,W,C), so only its weights exist up front.
model2.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(32, 32, 3)))
model2.add(layers.MaxPooling2D((2, 2)))
model2.add(layers.Conv2D(64, (3, 3), activation='relu'))
model2.add(layers.MaxPooling2D((2, 2)))
model2.add(layers.Conv2D(64, (3, 3), activation='relu'))
model2.add(layers.Flatten())
model2.add(layers.Dense(10))

# Random batch laid out as (N, H, W, C); the seed keeps it reproducible.
x_nhwc = Tensor.randn((4, 32, 32, 3), seed=7) 
y_true = [0, 1, 2, 3]
logits2 = model2(x_nhwc)
loss2 = cross_entropy_loss(logits2, y_true)
# The optimizer is created after the forward pass: layers without input_shape
# create their weights lazily inside forward, so they do not exist before it.
opt2 = Adam(model2.parameters(), lr=1e-3)
opt2.zero_grad()
loss2.backward()
opt2.step()
print("Sequential(loss):", loss2.item())

Sequential(loss): 6.664145446092037
