# Comfy Kettenregel (Autograd DIY) - univariate, skalare Funktionen

$$F(x) = (f_1 \circ f_2)(x) = f_1(f_2(x)) \Rightarrow F'(x) = f_1'(f_2(x)) \cdot f_2'(x)$$

$$F(x) = (f_1 \circ f_2 \circ f_3)(x) = f_1(f_2(f_3(x))) \Rightarrow F'(x) = f_1'(f_2(f_3(x))) \cdot f_2'(f_3(x)) \cdot f_3'(x)$$

## Aufgabe

Ziel: Gradientenbasierte Optimierung von $f(x) = \sqrt{\frac{1}{e^{\sin(x)}}}$


### 0. Imports

In [None]:
import math
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

### 1.0 Operationen definieren

In [None]:
def one_div_x(x: float, inner_derivative: float = 1) -> tuple[float, float]:
    """Evaluate the reciprocal ``1 / x`` together with its chain-rule derivative.

    Args:
        x: Point at which the reciprocal is evaluated.
        inner_derivative: Derivative of the inner function that produced ``x``.

    Returns:
        ``(1 / x, -inner_derivative / x**2)``.
    """
    return 1 / x, -inner_derivative / x**2


def sin(x: float, inner_derivative: float = 1) -> tuple[float, float]:
    """Evaluate ``sin(x)`` together with its chain-rule derivative.

    Args:
        x: Point at which the sine is evaluated.
        inner_derivative: Derivative of the inner function that produced ``x``.

    Returns:
        ``(sin(x), cos(x) * inner_derivative)``.
    """
    return math.sin(x), math.cos(x) * inner_derivative


def sqrt(x: float, inner_derivative: float = 1) -> tuple[float, float]:
    """Evaluate ``sqrt(x)`` together with its chain-rule derivative.

    Args:
        x: Point at which the square root is evaluated.
        inner_derivative: Derivative of the inner function that produced ``x``.

    Returns:
        ``(sqrt(x), inner_derivative / (2 * sqrt(x)))``.
    """
    root = math.sqrt(x)
    return root, 1 / (2 * root) * inner_derivative


def exp(x: float, inner_derivative: float = 1) -> tuple[float, float]:
    """Evaluate ``e**x`` together with its chain-rule derivative.

    Args:
        x: Exponent.
        inner_derivative: Derivative of the inner function that produced ``x``.

    Returns:
        ``(e**x, e**x * inner_derivative)``.
    """
    e_to_x = math.exp(x)
    return e_to_x, e_to_x * inner_derivative

### 1.1 Funktionsdefinition

In [None]:
def f_x(x: float) -> tuple[float, float]:
    """Evaluate ``f(x) = sqrt(1 / exp(sin(x)))`` and its derivative at ``x``.

    Each stage receives the value and the accumulated derivative of the
    previous stage, so the chain rule is applied from the inside out.

    Returns:
        ``(f(x), f'(x))``.
    """
    sin_value, sin_deriv = sin(x)
    exp_value, exp_deriv = exp(sin_value, sin_deriv)
    inv_value, inv_deriv = one_div_x(exp_value, exp_deriv)
    return sqrt(inv_value, inv_deriv)

### 2. Gradient Descent

In [None]:
# Plain gradient descent on f_x, recording every visited point for the animation.
x_start = 4.0  # starting value
x_min = x_start - 8.0  # x-axis limits
x_max = x_start + 8.0
xs = []  # values for the animation
ys = []

lr = 1e-2  # step size
significant_gradient = 1e-3  # termination criterion
iteration = 1  # step counter (not called `iter`, which would shadow the builtin)

while True:
    y_measured, deriv = f_x(x_start)
    # The current point is recorded in every case, including the final one.
    xs.append(x_start)
    ys.append(y_measured)
    # `not (... >= ...)` instead of `<` so that a NaN derivative also stops the loop.
    if not abs(deriv) >= significant_gradient:
        break
    x_start -= lr * deriv
    if iteration % 100 == 0 or iteration == 1:
        print(iteration, x_start, y_measured)
    iteration += 1

### 3.0 Funktionsplot

In [None]:
# Sample f and its derivative on a regular grid over the plotting range.
x = np.arange(x_min, x_max, 0.01)

res = [f_x(x_i) for x_i in x]
y_measured, derivative = zip(*res)

df = pd.DataFrame({"x": x, "y": y_measured, "derivative": derivative})

# Last expression of the cell: the line plot of f over x.
px.line(df, x="x", y="y")

### 3.1 Animation

In [None]:
# Animate the gradient-descent trajectory (xs, ys) on top of the function graph.
# get the values
x = np.arange(x_min, x_max, 0.01)

res = [f_x(_) for _ in x]
y_measured, _ = zip(*res)

# define both graphs: trace 0 is the static function curve, trace 1 the moving marker
fig = go.Figure(
    data=[
        go.Scatter(
            x=x,
            y=y_measured,
            mode="lines",
            line=dict(color="green", width=1),
            name="Function Graph",
        ),
        go.Scatter(
            x=[xs[0]],
            y=[ys[0]],
            mode="markers",
            marker=dict(color="red", size=10),
            name="Current Position",
        ),
    ]
)

# update layout parameters and add start button for animation
fig.update_layout(
    width=1400,
    height=900,
    # fixed axis ranges so the view does not rescale while the marker moves
    xaxis=dict(range=(x_min, x_max), autorange=False),
    yaxis=dict(
        range=(np.min(y_measured) - 0.5, np.max(y_measured) + 0.5), autorange=False
    ),
    title_text="Gradient Descent Animation",
    # start button config
    updatemenus=[
        dict(
            type="buttons",
            buttons=[
                dict(
                    args=[
                        None,
                        {
                            "frame": {"duration": 5, "redraw": False},
                            "fromcurrent": True,
                            "transition": {"duration": 0, "easing": "linear"},
                        },
                    ],
                    label="start",
                    method="animate",
                )
            ],
        )
    ],
)

# specify the animation frames: one frame per recorded step, updating only trace 1
fig.update(
    frames=[
        go.Frame(data=[go.Scatter(x=[xs[k]], y=[ys[k]])], traces=[1])
        for k in range(len(ys))
    ]
)

# show result
fig.show()

# 2024-11-18 

Der bisherige Ansatz hat folgende Limitierungen:
- funktioniert nur für Ausdrücke in geschlossener Form, keine Kontrollflusslogik
- inkompatibel mit binären Operatoren (+, *, ...)
- funktioniert nur in 1D

# Value Klasse

In [None]:
from __future__ import annotations
import graphviz
from IPython.display import display
import math


class Value:
    """Scalar node of a dynamically built computation graph (reverse-mode autodiff).

    Every arithmetic operation returns a new ``Value`` that stores its operands
    in ``ancestors`` and a closure in ``_backward`` which adds the gradient
    contribution of the result to those operands. Calling :meth:`backward` on
    the final node fills in ``grad`` for every node reachable from it.

    Attributes are plain and public: ``value`` (forward result), ``grad``
    (accumulated gradient, starts at 0.0), ``ancestors`` (operand nodes),
    ``name`` (label used for printing/plotting) and ``operand`` (operator
    symbol shown in the graph plot).
    """

    def __init__(
        self, value: float, ancestors: tuple[Value, ...] = (), name="", operand=""
    ):
        self.value = value
        self.ancestors = ancestors
        self.name = name
        self.grad = 0.0
        # Leaf nodes have nothing to propagate; operations overwrite this closure.
        self._backward = lambda: None
        self.operand = operand

    # make values printable
    def __repr__(self) -> str:
        return f"{self.name}, value={self.value}, grad={self.grad}"

    # Addition
    def __add__(self, other: Value) -> Value:
        """Return ``self + other``; plain numbers are wrapped into a ``Value``."""
        if not isinstance(other, Value):
            other = Value(other)
        result = Value(self.value + other.value, (self, other), name="add", operand="+")

        def _backward():
            self.grad += result.grad
            other.grad += result.grad

        result._backward = _backward
        return result

    def __iadd__(self, other: Value) -> Value:
        """Return a new node for ``self += other`` (the left operand is not mutated)."""
        if not isinstance(other, Value):
            other = Value(other)
        result = Value(
            self.value + other.value, (self, other), name="iadd", operand="+="
        )

        def _backward():
            self.grad += result.grad
            other.grad += result.grad

        result._backward = _backward
        return result

    def __radd__(self, other: Value) -> Value:
        # Addition is commutative, so the reflected form can reuse __add__.
        return self + other

    # Subtraction
    def __sub__(self, other: Value) -> Value:
        """Return ``self - other``; plain numbers are wrapped into a ``Value``."""
        if not isinstance(other, Value):
            other = Value(other)
        result = Value(self.value - other.value, (self, other), name="sub", operand="-")

        def _backward():
            self.grad += 1.0 * result.grad
            other.grad += -1.0 * result.grad

        result._backward = _backward
        return result

    def __rsub__(self, other: Value) -> Value:
        """Return ``other - self`` (reflected operand order).

        Subtraction is not commutative, so the left operand is wrapped and
        used as the minuend instead of delegating to ``self - other``.
        """
        if not isinstance(other, Value):
            other = Value(other)
        return other - self

    # Multiplication
    def __mul__(self, other: Value) -> Value:
        """Return ``self * other``; plain numbers are wrapped into a ``Value``."""
        if not isinstance(other, Value):
            other = Value(other)
        result = Value(self.value * other.value, (self, other), name="mul", operand="*")

        def _backward():
            self.grad += other.value * result.grad
            other.grad += self.value * result.grad

        result._backward = _backward
        return result

    def __rmul__(self, other: Value) -> Value:
        # Multiplication is commutative, so the reflected form can reuse __mul__.
        return self * other

    # Floating-point division
    def __truediv__(self, other: Value) -> Value:
        """Return ``self / other``; plain numbers are wrapped into a ``Value``."""
        if not isinstance(other, Value):
            other = Value(other)
        result = Value(self.value / other.value, (self, other), name="div", operand="/")

        def _backward():
            self.grad += 1 / other.value * result.grad
            other.grad += -self.value / other.value**2 * result.grad

        result._backward = _backward
        return result

    def __rtruediv__(self, other: Value) -> Value:
        """Return ``other / self`` (reflected operand order).

        Division is not commutative, so the left operand is wrapped and used
        as the numerator instead of delegating to ``self / other``.
        """
        if not isinstance(other, Value):
            other = Value(other)
        return other / self

    # Power (x**n)
    def __pow__(self, other: Value) -> Value:
        """Return ``self ** other``; plain numbers are wrapped into a ``Value``."""
        if not isinstance(other, Value):
            other = Value(other)
        result = Value(self.value**other.value, (self, other), name="pow", operand="^")

        def _backward():
            # d/d(base): n * base**(n - 1)
            self.grad += other.value * self.value ** (other.value - 1.0) * result.grad
            # d/d(exponent): base**n * ln(base). The logarithm is only finite
            # for a positive base, so the exponent gradient is not meaningful
            # when self.value <= 0.
            # assert self.value >= 0, "cannot compute log with negative base
            other.grad += self.value**other.value * np.log(self.value) * result.grad
            # print(self.grad, other.grad)

        result._backward = _backward
        return result

    # Exponential (e**x)
    def exp(self) -> Value:
        """Return ``e ** self``."""
        result = Value(np.exp(self.value), (self,), name="exp", operand="e^")

        def _backward():
            # The derivative of e**x is e**x, i.e. the forward result itself.
            self.grad += result.value * result.grad

        result._backward = _backward
        return result

    def log(self) -> Value:
        """Return the natural logarithm of ``self``."""
        result = Value(np.log(self.value), (self,), name="log")

        def _backward():
            self.grad += 1 / self.value * result.grad

        result._backward = _backward
        return result

    # backwards up until this point
    # Negation
    def __neg__(self) -> Value:
        """Return ``-self``."""
        result = Value(-self.value, (self,), name="neg", operand="-")

        def _backward():
            self.grad += -result.grad

        result._backward = _backward
        return result

    def sigmoid(self) -> Value:
        """Return the logistic sigmoid ``1 / (1 + e**(-self))``."""
        sigmoid_value = 1 / (1 + np.exp(-self.value))
        result = Value(sigmoid_value, (self,), name="sigmoid")

        def _backward():
            # sigma'(x) = sigma(x) * (1 - sigma(x))
            self.grad += sigmoid_value * (1 - sigmoid_value) * result.grad

        result._backward = _backward
        return result

    # make this make sense
    # def softmax(self) -> Value:
    #     exp_element = Value(np.exp(self.value - np.max(self.value)))
    #     result = Value(exp_element / np.sum(exp_element, keepdims=True))  # -> returns 1

    #     # finish this
    #     def _backward():
    #         self.grad += (
    #             (exp_element * np.sum(exp_element) - np.sum(exp_element) * exp_element)
    #             / (np.sum(exp_element)) ** 2
    #             * result.grad
    #         )  # -> returns 0

    #     result._backward = _backward
    #     return result

    # Comparison operators <, >, >=, <= compare the wrapped values and return
    # plain booleans; they do not create graph nodes.
    def __lt__(self, other: Value) -> bool:
        if not isinstance(other, Value):
            other = Value(other)
        return self.value < other.value

    def __gt__(self, other: Value) -> bool:
        if not isinstance(other, Value):
            other = Value(other)
        return self.value > other.value

    def __le__(self, other: Value) -> bool:
        if not isinstance(other, Value):
            other = Value(other)
        return self.value <= other.value

    def __ge__(self, other: Value) -> bool:
        if not isinstance(other, Value):
            other = Value(other)
        return self.value >= other.value

    def backward(self) -> None:
        """Backpropagate from this node through the whole graph.

        Sets ``self.grad`` to 1.0 and then runs every node's ``_backward``
        closure in reverse topological order, so each node's gradient is
        complete before it is passed on to its ancestors. Gradients are
        accumulated with ``+=``; callers reset ``grad`` of reused nodes
        themselves before calling this again.
        """
        topo_sorted_nodes = []
        visited = {self}

        # Post-order depth-first traversal with an explicit stack. It yields
        # the same order as the straightforward recursive version but does not
        # hit Python's recursion limit on long chains (e.g. a loss summed over
        # thousands of samples). Each stack entry is (node, index of the next
        # ancestor to look at).
        stack = [(self, 0)]
        while stack:
            node, next_idx = stack.pop()
            if next_idx < len(node.ancestors):
                stack.append((node, next_idx + 1))
                ancestor = node.ancestors[next_idx]
                if ancestor not in visited:
                    visited.add(ancestor)
                    stack.append((ancestor, 0))
            else:
                # all ancestors handled -> the node itself can be emitted
                topo_sorted_nodes.append(node)

        self.grad = 1.0
        for node in reversed(topo_sorted_nodes):
            node._backward()

    def plot_graph(self):
        """Render the computation graph that ends in this node with graphviz.

        Value nodes are drawn as records (name | value | grad); if a node has
        an ``operand`` an extra operator node is inserted between it and its
        ancestors. The graph is shown via ``IPython.display.display``.
        """
        # "graph visualization python", graphviz
        dot = graphviz.Digraph(format="svg", graph_attr={"rankdir": "LR"})

        def add_nodes(dot: graphviz.Digraph, node: Value):
            label = f"{node.name}|value={node.value}|grad={node.grad}"
            # id() is unique per live object, so it serves as graphviz node name
            unique_node_name = str(id(node))

            # add value nodes to graph
            dot.node(
                name=unique_node_name,
                label=label,
                shape="record",
                color=(
                    "lightgreen" if node.ancestors == () and node.name != "" else None
                ),  # check if input
                style="filled",
            )

            if node.operand:  # check if there is an operand to display
                op_name = unique_node_name + node.operand
                # add operation node
                dot.node(
                    name=op_name,
                    label=node.operand,
                )
                # draw edge from operand to result
                dot.edge(op_name, unique_node_name)

            # iterate through the ancestors to build the whole graph
            for ancestor in node.ancestors:
                ancestor_name = add_nodes(dot, ancestor)
                if node.operand:
                    # ensure ancestor edge goes to operand node if it exists
                    dot.edge(ancestor_name, op_name)
                else:
                    dot.edge(ancestor_name, unique_node_name)

            return unique_node_name

        add_nodes(dot, self)
        display(dot)

## Examples

In [None]:
# initialize values
x = Value(5.0, name="x")
y_measured = Value(2.5, name="y")

a = Value(2.5, name="a")
b = Value(3.0, name="b")
c = Value(1.5, name="c")

# The following should be executable:
print(x + y_measured)
print(x * y_measured)
print(x - y_measured)
print(x / y_measured)
print(x**y_measured)
print(x**5)
print(-x)
# Value defines no __eq__, so this falls back to an identity comparison.
print(x == y_measured)


def foo(a: Value, b: Value, c: Value):
    """Example with data-dependent control flow.

    Returns ``a * b + c`` when ``a`` is greater than 2, otherwise
    ``a - b * c``; only the branch actually taken ends up in the graph.
    """
    threshold = Value(2)
    if a > threshold:
        return a * b + c
    return a - b * c


def f(a: Value, b: Value, c: Value):
    """Return ``a + (b**2) * c`` as a single expression graph."""
    # Operator precedence gives (((b**2) * c) added to a).
    return a + b**2 * c


# a = 2.5 > 2 -> foo takes the `a * b + c` branch
z1 = foo(a, b, c)
z1.plot_graph()

# a2 = -1 is not greater than 2 -> foo takes the `a - b * c` branch
z2 = foo(Value(-1, name="a2"), b, c)
z2.plot_graph()

z3 = f(a, b, c)
z3.plot_graph()

# Lineare Regression

In [None]:
import numpy as np
import plotly.express as px
from plotly import graph_objects as go

# Fixed seed so the noisy samples are reproducible.
np.random.seed(0xDEADBEEF)

# Ground-truth line y = 2x - 2 plus Gaussian noise scaled by 1.5.
x = np.linspace(-10, 10, 200)
y_ideal = 2 * x - 2
y_measured = y_ideal + np.random.randn(len(x)) * 1.5

# Scatter of the noisy samples with the ideal line drawn on top.
fig = px.scatter(x=x, y=y_measured)
fig.add_trace(go.Scatter(x=x, y=y_ideal, mode="lines"))

In [None]:
# Linear regression f(x) = m*x + c
np.random.seed(0xDEADBEEF)
x = np.linspace(-10, 10, 200)
y_ideal = 2 * x - 2
y_measured = y_ideal + np.random.randn(len(x)) * 1.5

# Random initialisation of m and c (uniform in [0, 5))
m = Value(np.random.random(size=None) * 5, name="slope")
c = Value(np.random.random(size=None) * 5, name="intercept")


# Define the loss function
def loss(m: Value, c: Value) -> Value:
    """Mean squared error of the line ``m * x + c``.

    The samples are read from the module-level arrays ``x`` and
    ``y_measured``; the squared residuals are summed and divided by
    ``len(x)``.
    """
    total = Value(0.0)
    for idx, x_i in enumerate(x):
        residual = m * x_i + c - y_measured[idx]
        total = total + residual**2
    return total / len(x)


# Analytic reference gradients for comparison with the autograd result
def partial_derivs(m, x, c, y=None):
    """Analytic gradient of the mean squared error of ``m * x + c``.

    The matching ``loss`` averages over the samples, so the partial
    derivatives are divided by ``len(x)`` as well; otherwise they would differ
    from the backward-pass gradients by a factor of ``len(x)``.

    Args:
        m: Slope (plain number).
        x: Sample positions.
        c: Intercept (plain number).
        y: Sample targets; defaults to the module-level ``y_measured``.

    Returns:
        ``(dloss/dm, dloss/dc)``.

    Raises:
        ZeroDivisionError: If ``x`` is empty (the mean is undefined).
    """
    if y is None:
        y = y_measured
    sum_dloss_dm = 0.0
    sum_dloss_dc = 0.0
    for ii, x_i in enumerate(x):
        residual = m * x_i + c - y[ii]
        sum_dloss_dm += 2 * residual * x_i
        sum_dloss_dc += 2 * residual

    n_samples = len(x)
    return sum_dloss_dm / n_samples, sum_dloss_dc / n_samples


# Hyperparameters
epochs = 1000
lr = 1e-4
# Recorded parameter values and gradients (used for the plots further down)
ms = []
cs = []
m_grad = []
c_grad = []

# Training loop
for i in range(epochs):

    # forward pass: builds a fresh computation graph every epoch
    precision_loss = loss(m, c)

    # m and c are reused across epochs, so their accumulated gradients must be reset
    m.grad = 0
    c.grad = 0
    precision_loss.backward()

    # store intermediate values of m and c (every step for the first 50, then every 50th)
    if i < 50 or i % 50 == 0:
        ms.append(m.value)
        cs.append(c.value)
        m_grad.append(m.grad)
        c_grad.append(c.grad)

    # update the values along the negative gradient
    m.value -= lr * m.grad
    c.value -= lr * c.grad


# NOTE: precision_loss was computed before the last parameter update.
print(f"final m: {m.value}, final c: {c.value}, final loss: {precision_loss.value}")

In [None]:
# Compare the analytic derivatives with the backward pass.
# ms[0] / cs[0] are the parameter values recorded in the first epoch, i.e. the
# point at which m_grad[0] / c_grad[0] were computed. Using them instead of
# hard-coded numbers keeps the comparison valid when the seed or the
# initialisation changes.
d = partial_derivs(ms[0], x, cs[0])

print(d[0], m_grad[0], d[1], c_grad[0])

## Animation

In [None]:
import pandas as pd

ms = np.array(ms)
cs = np.array(cs)

data = []

# One animation frame per recorded (slope, intercept) pair. The loop variables
# are deliberately not called `m` / `c`, so the Value parameters of the
# regression are not overwritten with plain numbers.
for frame_idx, (m_k, c_k) in enumerate(zip(ms, cs)):
    y_line = m_k * x + c_k
    for xi, yi in zip(x, y_line):
        data.append(
            {
                "x": xi,
                "y": yi,
                "frame": frame_idx,
                "m": m_k,
                "c": c_k,
            }
        )

df = pd.DataFrame(data)


# Animated fitted line with the measured samples as a static scatter on top.
fig = px.line(df, x="x", y="y", animation_frame="frame")
fig.add_trace(go.Scatter(x=x, y=y_measured, mode="markers"))
fig.show()

In [None]:
# Magnitude of the recorded slope gradients per stored step (log-scaled y axis)
px.line(x=range(len(m_grad)), y=np.abs(m_grad), log_y=True)

In [None]:
# Magnitude of the recorded intercept gradients per stored step (log-scaled y axis)
px.line(x=range(len(c_grad)), y=np.abs(c_grad), log_y=True)

## Test

In [None]:
# Define the loss function
def loss(m: Value, c: Value, x, y) -> Value:
    """Sum of squared residuals of the line ``m * x + c`` (not averaged).

    Args:
        m: Slope.
        c: Intercept.
        x: Sample positions.
        y: Sample targets, indexed in parallel to ``x``.
    """
    total = Value(0.0)
    for idx, x_i in enumerate(x):
        residual = m * x_i + c - y[idx]
        total = total + residual**2
    return total


# Analytic reference gradients for comparison with the autograd result
def partial_derivs(m, c, x, y):
    """Analytic gradient of the summed squared error of ``m * x + c``.

    Args:
        m: Slope (plain number).
        c: Intercept (plain number).
        x: Sample positions.
        y: Sample targets, indexed in parallel to ``x``.

    Returns:
        ``(dloss/dm, dloss/dc)``, both summed over all samples.
    """
    grad_m = 0.0
    grad_c = 0.0
    for idx, x_i in enumerate(x):
        doubled_residual = 2 * (m * x_i + c - y[idx])
        grad_m += doubled_residual * x_i
        grad_c += doubled_residual
    return grad_m, grad_c


# Small data set (10 samples) to check the backward pass against the analytic gradient.
np.random.seed(0xDEADBEEF)
x = np.linspace(-10, 10, 10)
y_ideal = 2 * x - 2
y_measured = y_ideal + np.random.randn(len(x)) * 1.5

# Random initialisation of m and c
m = Value(np.random.random(size=None) * 5, name="slope")
c = Value(np.random.random(size=None) * 5, name="intercept")


L = loss(m, c, x, y_measured)

L.backward()

ds = partial_derivs(m.value, c.value, x, y_measured)

# autograd gradients next to the analytic ones
print(m.grad, c.grad, ds)

In [None]:
# Draw the computation graph of the loss built in the previous cell.
L.plot_graph()

# Quadratische Regression

In [None]:
# Fixed seed so the noisy samples are reproducible.
np.random.seed(0xDEADBEED)

# Ground-truth parabola y = 2x^2 - 1.5x - 4 plus Gaussian noise scaled by 20.
x = np.linspace(-10, 10, 200)
y_quad_ideal = 2.0 * x**2 - 1.5 * x - 4.0
y_quad_measured = y_quad_ideal + np.random.randn(len(x)) * 20

# Scatter of the noisy samples with the ideal curve drawn on top.
fig = px.scatter(x=x, y=y_quad_measured)
fig.add_trace(go.Scatter(x=x, y=y_quad_ideal, mode="lines"))

In [None]:
np.random.seed(0xDEADBEED)

# Random initialisation of the three coefficients (uniform in [0, 5))
a = Value(np.random.random(size=None) * 5, name="a")
b = Value(np.random.random(size=None) * 5, name="b")
c = Value(np.random.random(size=None) * 5, name="c")

# Noisy samples of the parabola y = 2x^2 - 1.5x - 4
x_quad = np.linspace(-10, 10, 200)
y_quad_ideal = 2.0 * x_quad**2 - 1.5 * x_quad - 4.0
y_quad_measured = y_quad_ideal + np.random.randn(len(x_quad)) * 20


# Loss function
def loss_quad(x: np.ndarray, y: np.ndarray, a: Value, b: Value, c: Value) -> Value:
    """Mean squared error of the parabola ``a * x**2 + b * x + c``.

    NOTE: a second ``loss_quad`` that takes the parameters as a list is
    defined further down in the same cell and replaces this definition.
    """
    total = Value(0.0)
    for x_i, y_i in zip(x, y):
        prediction = a * x_i**2 + b * x_i + c
        total = total + (prediction - y_i) ** 2
    mean_loss = total / len(x)
    mean_loss.name = "loss"
    return mean_loss


# list / named tuple of the parameters to optimise
def loss_quad(x: np.ndarray, y: np.ndarray, params: list) -> Value:
    """Mean squared error of the parabola described by ``params``.

    Args:
        x: Sample positions.
        y: Sample targets.
        params: ``[a, b, c]`` for the model ``a * x**2 + b * x + c``.

    Returns:
        The loss node, named ``"loss"``.
    """
    total = Value(0.0)
    for x_i, y_i in zip(x, y):
        quad, lin, const = params[0], params[1], params[2]
        prediction = quad * x_i**2 + lin * x_i + const
        total = total + (prediction - y_i) ** 2
    mean_loss = total / len(x)
    mean_loss.name = "loss"
    return mean_loss


def partials(
    x: np.ndarray, y: np.ndarray, a: Value, b: Value, c: Value
) -> tuple[float, float, float]:
    """Analytic gradient of the mean squared error of ``a * x**2 + b * x + c``.

    Only plain arithmetic is used, so ``a``, ``b`` and ``c`` may be plain
    numbers.

    Returns:
        ``(dloss/da, dloss/db, dloss/dc)``, each averaged over ``len(x)``.
    """
    grad_a = 0.0
    grad_b = 0.0
    grad_c = 0.0
    for idx, x_i in enumerate(x):
        x_squared = x_i**2
        doubled_residual = 2 * (a * x_squared + b * x_i + c - y[idx])
        grad_a += doubled_residual * x_squared
        grad_b += doubled_residual * x_i
        grad_c += doubled_residual

    n_samples = len(x)
    return grad_a / n_samples, grad_b / n_samples, grad_c / n_samples


# Narrowing down the learning rate -> at which point do the gradients blow up (e+400)?
# From a learning rate of 5e-4 upwards gradient descent no longer works
# Hyperparameters
epochs = 5000
lr = 4e-4

# Recorded values for the plots
a_vals = []
a_grad = []
b_vals = []
b_grad = []
c_vals = []
c_grad = []
losses = []

params = [a, b, c]
# Training loop
for i in range(epochs):
    # NOTE: this rebinds the module-level name `loss`, which an earlier cell
    # uses for a loss function.
    loss = loss_quad(x=x_quad, y=y_quad_measured, params=params)

    # parameters are reused across epochs, so reset their accumulated gradients
    for p in params:
        p.grad = 0.0

    loss.backward()
    # record every step for the first 50 epochs, afterwards every 50th
    if i < 50 or i % 50 == 0:
        a_vals.append(a.value)
        a_grad.append(a.grad)
        b_vals.append(b.value)
        b_grad.append(b.grad)
        c_vals.append(c.value)
        c_grad.append(c.grad)
        losses.append(loss.value)
    # live debugging statement
    if i % 100 == 0:
        print(f"{i}: loss: {loss}, a: {a}, b: {b}, c: {c}")

    # gradient-descent step
    for p in params:
        p.value -= lr * p.grad

print(f"Final loss: {loss}, final a: {a}, final b: {b}, final c: {c}")

In [None]:
# Loss at the coefficients (2.0, -1.5, -4.0) that were used to generate the data,
# as a reference for the loss reached by training.
approximate_loss = loss_quad(
    x_quad, y_quad_measured, [Value(2.0), Value(-1.5), Value(-4.0)]
)
approximate_loss

In [None]:
# Analytic gradients at the first recorded parameters, evaluated on the same
# sample positions (x_quad) that the training loss used.
part = partials(x_quad, y_quad_measured, a_vals[0], b_vals[0], c_vals[0])

# analytic value next to the autograd value for a, b and c
print(part[0], a_grad[0], part[1], b_grad[0], part[2], c_grad[0])

## Animation

In [None]:
a_vals = np.array(a_vals)
b_vals = np.array(b_vals)
c_vals = np.array(c_vals)

data = []

# One animation frame per recorded coefficient triple. The loop variables are
# deliberately not called `a` / `b` / `c`, so the Value parameters are not
# overwritten, and x_quad is used throughout (the positions the model was
# trained on).
for frame_idx, (a_k, b_k, c_k) in enumerate(zip(a_vals, b_vals, c_vals)):
    y_curve = a_k * x_quad**2 + b_k * x_quad + c_k
    for xi, yi in zip(x_quad, y_curve):
        data.append(
            {
                "x": xi,
                "y": yi,
                "frame": frame_idx,
            }
        )

df = pd.DataFrame(data=data)

# Animated fitted curve with the measured samples as a static scatter on top.
fig = px.line(df, x="x", y="y", animation_frame="frame")
fig.add_trace(go.Scatter(x=x_quad, y=y_quad_measured, mode="markers"))
fig.show()

In [None]:
# Plot the magnitudes of the gradients (here: coefficient a, log-scaled y axis)
px.line(x=range(len(a_grad)), y=np.abs(a_grad), log_y=True)

In [None]:
# Magnitude of the recorded gradients of coefficient b (log-scaled y axis)
px.line(x=range(len(b_grad)), y=np.abs(b_grad), log_y=True)

In [None]:
# Magnitude of the recorded gradients of coefficient c (log-scaled y axis)
px.line(x=range(len(c_grad)), y=np.abs(c_grad), log_y=True)

In [None]:
# Recorded loss values per stored training step
px.line(x=range(len(losses)), y=losses)

# MNIST binary classification

### TODO: 
- Filtern 2 Zahlen -> MNIST Datensatz done
- runterskalieren auf 8x8-10x10 pixel
- binäre Klassifikation -> 64/100 -> 1

In [None]:
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt


def parse_mnist_data(
    idx_file_training_samples: str,
    idx_file_training_labels: str,
    number_1: int,
    number_2: int,
) -> tuple[np.ndarray, np.ndarray]:
    """Load two digit classes from IDX files and downscale the images.

    Args:
        idx_file_training_samples: Path of the IDX image file.
        idx_file_training_labels: Path of the IDX label file.
        number_1: First label to keep.
        number_2: Second label to keep.

    Returns:
        ``(samples, labels)``: the kept images resized to 10x10 and divided
        by 255, and the matching rows of the label array.
    """
    training_labels = parse_mnist_labels(idx_file_training_labels)
    training_samples = parse_mnist_images(idx_file_training_samples)

    # boolean mask selecting the samples of the two requested digits
    flat_labels = training_labels.flatten()
    keep = (flat_labels == number_1) | (flat_labels == number_2)
    filtered_labels = training_labels[keep]
    filtered_samples = training_samples[keep]

    # downscale with Pillow; LANCZOS resampling for anti-aliasing
    resized = []
    for img in filtered_samples:
        resized.append(Image.fromarray(img).resize((10, 10), Image.Resampling.LANCZOS))

    downscaled_samples = np.array(resized) / 255
    return downscaled_samples, filtered_labels


def parse_mnist_images(idx_file_path: str) -> np.ndarray:
    """Read an IDX image file into a ``(num_img, num_rows, num_cols)`` uint8 array.

    The 4-byte magic number is skipped without being checked; the three
    following big-endian 4-byte integers give the array dimensions.
    """
    with open(idx_file_path, "rb") as f:
        f.read(4)  # skip the magic number
        num_img, num_rows, num_cols = (
            int.from_bytes(f.read(4), "big") for _ in range(3)
        )
        pixel_bytes = f.read()

    # The array is a view onto the bytes that were read.
    return np.ndarray((num_img, num_rows, num_cols), np.uint8, pixel_bytes)


def parse_mnist_labels(idx_file_path: str) -> np.ndarray:
    """Read an IDX label file into a ``(num_item, 1)`` uint8 array.

    The 4-byte magic number is skipped without being checked; the following
    big-endian 4-byte integer gives the number of labels.
    """
    with open(idx_file_path, "rb") as f:
        f.read(4)  # skip the magic number
        num_item = int.from_bytes(f.read(4), "big")
        label_bytes = f.read()

    # The array is a view onto the bytes that were read.
    return np.ndarray((num_item, 1), np.uint8, label_bytes)


def plot_image(img: np.ndarray) -> plt.Figure:
    """Draw a single 2-D image in grayscale and return the figure.

    The image is multiplied by 255 before plotting. The figure is closed
    before it is returned.

    Raises:
        AssertionError: If ``img`` is not 2-dimensional.
    """
    assert len(img.shape) == 2, "input must be 2-dimensional (single image)"

    fig, ax = plt.subplots()
    ax.axis("off")
    ax.imshow(img * 255, cmap="gray")

    # presumably closed so that only the returned figure is rendered as the
    # cell output -- TODO confirm
    plt.close()
    return fig

In [None]:
class Neuron:
    """Single sigmoid neuron: ``sigmoid(bias + sum(weights * inputs))``."""

    def __init__(self, num_inputs: int) -> None:
        """Create ``num_inputs`` weights drawn from ``np.random.random`` and a bias of 1.0."""
        weights = []
        for _ in range(num_inputs):
            weights.append(Value(np.random.random(size=None)))
        self.weights = weights
        self.bias = Value(1.0, name="bias")

    def __call__(self, x: np.ndarray) -> Value:
        """Return the activation for input ``x`` (numpy arrays are flattened first)."""
        if isinstance(x, np.ndarray):
            x = x.flatten()
        weighted_inputs = (w_i * x_i for w_i, x_i in zip(self.weights, x))
        pre_activation = sum(weighted_inputs) + self.bias
        return pre_activation.sigmoid()

    def parameters(self) -> list[Value]:
        """Return all trainable values: the weights followed by the bias."""
        return [*self.weights, self.bias]

    def param_count(self) -> int:
        """Return the number of trainable values (weights plus one bias)."""
        return len(self.weights) + 1


class Layer:
    """A group of neurons that all receive the same input."""

    def __init__(self, num_inputs: int, num_outputs: int) -> None:
        """Create ``num_outputs`` neurons with ``num_inputs`` inputs each."""
        self.neurons = [Neuron(num_inputs) for _ in range(num_outputs)]

    def __call__(self, x: np.ndarray) -> list[Value]:
        """Evaluate every neuron on ``x``.

        Returns the list of activations, or the bare ``Value`` when the layer
        has exactly one neuron.
        """
        activations = [neuron(x) for neuron in self.neurons]
        if len(activations) == 1:
            return activations[0]
        return activations

    def parameters(self) -> list:
        """Return the parameters of all neurons as one flat list."""
        collected = []
        for neuron in self.neurons:
            collected.extend(neuron.parameters())
        return collected


class MLP:
    """Multi-layer perceptron: a chain of ``Layer`` objects applied in order."""

    def __init__(self, num_inputs: int, num_outputs: list[int]) -> None:
        """Build one layer per entry of ``num_outputs``.

        ``self.size`` holds the input width followed by the layer widths;
        each layer maps one entry of ``self.size`` to the next.
        """
        self.size = [num_inputs] + num_outputs
        self.layers = [
            Layer(n_in, n_out) for n_in, n_out in zip(self.size, self.size[1:])
        ]

    def __call__(self, x: np.ndarray) -> Value:
        """Feed ``x`` through all layers and return the output of the last one."""
        activation = x
        for layer in self.layers:
            activation = layer(activation)
        return activation

    def parameters(self) -> list:
        """Return the parameters of all layers as one flat list."""
        collected = []
        for layer in self.layers:
            collected.extend(layer.parameters())
        return collected

In [None]:
# Load the samples labelled 0 or 1 from the IDX files (fs: images, fl: labels).
test_img_path = "../data/test_img.idx"
test_label_path = "../data/test_label.idx"
fs, fl = parse_mnist_data(test_img_path, test_label_path, 0, 1)

In [None]:
# Show one randomly chosen downscaled sample.
plot_image(fs[np.random.choice(len(fs))])

In [None]:
# Single neuron with 100 inputs (one per pixel of a 10x10 image): forward and backward pass.
n = Neuron(100)
res = n(fs[5])
res.backward()
res
# n.param_count()
# lol = cross_entropy_loss(n(fs[5]), fl[5])
# print(lol)

In [None]:
# res.plot_graph()

In [None]:
# Layer of 10 neurons with 100 inputs each, evaluated on one sample.
l = Layer(100, 10)
res2 = l(fs[5])
res2

In [None]:
# MLP with layer widths 100 -> 50 -> 10 -> 1, evaluated on one sample.
m = MLP(100, [50, 10, 1])
res3 = m(fs[5])
res3

$-\frac{1}{N}\sum_{i=1}^N\left(y_i\cdot \log(p_i) + (1-y_i)\cdot\log(1-p_i)\right)$

In [None]:
def res_loss(y_pred: Value, y_gt) -> Value:
    """Squared error ``(y_gt - y_pred) ** 2`` for a single sample.

    ``y_gt`` must provide ``.item()``; the extracted scalar is wrapped in a
    ``Value`` named "ground truth".
    """
    target = Value(y_gt.item(), (), name="ground truth")
    return (target - y_pred) ** 2


def cross_entropy_loss(y_pred: Value, y_gt) -> Value:
    """Binary cross-entropy for a single sample.

    Computes ``-(y * log(p) + (1 - y) * log(1 - p))`` with ``p = y_pred`` and
    ``y = y_gt.item()``.

    The complements are written as ``Value(1.0) - ...`` so the subtraction
    always runs with the operands in this order and does not depend on the
    reflected-operator handling for ``number - Value``.

    Args:
        y_pred: Predicted probability.
        y_gt: Ground-truth label; must provide ``.item()``.

    Returns:
        The loss node.
    """
    target = Value(y_gt.item(), ())
    log_loss_positive = target * y_pred.log()
    log_loss_negative = (Value(1.0) - target) * (Value(1.0) - y_pred).log()

    return -(log_loss_positive + log_loss_negative)

### Trainingsloop

In [None]:
# load images
# NOTE(review): the training data is read from the files named test_img/test_label -- confirm this is intended
test_img_path = "../data/test_img.idx"
test_label_path = "../data/test_label.idx"
train_img, train_label = parse_mnist_data(test_img_path, test_label_path, 0, 1)

# initialize MLP: 100 inputs (10x10 pixels), layer widths 50, 10 and 1
nin = 100
nouts = [50, 10, 1]
mlp = MLP(nin, nouts)
parameters = mlp.parameters()

# number of trainable values
len(parameters)

In [None]:
# Hyperparameters
lr = 1e-3
epochs = 50

for e in range(epochs):
    # forward pass
    y_pred = [mlp(img) for img in train_img]
    # backward pass: parameters are reused across epochs, so reset their gradients
    for p in parameters:
        p.grad = 0.0
    # Pair each prediction with its own label row. Calling `.item()` on the
    # whole label array would raise for more than one sample; res_loss
    # extracts the scalar from each row itself.
    loss = sum(res_loss(yout, ygt) for yout, ygt in zip(y_pred, train_label))
    loss.backward()
    print(e, loss.value)
    # optimization: gradient-descent step
    for p in parameters:
        p.value -= lr * p.grad

In [None]:
# Show the sample that is classified in the next cell.
plot_image(train_img[1])

In [None]:
# Network output for that sample.
mlp(train_img[1])