In [172]:
%matplotlib widget
from graphviz import Digraph
import math
import torch


In [220]:
def trace(root):
    """Collect every node and edge of the computation graph that ends at ``root``.

    The graph is walked backwards through each node's ``parents`` collection.

    Args:
        root: Final node of the graph. It, and every node reachable from it,
            must be hashable and expose an iterable ``parents`` attribute.

    Returns:
        A ``(nodes, edges)`` tuple of sets. ``nodes`` contains every reachable
        node (including ``root``); ``edges`` contains ``(parent, child)`` pairs.
    """
    nodes, edges = set(), set()
    # An explicit stack is used instead of recursion so that long chains of
    # operations cannot exceed Python's recursion limit.
    pending = [root]
    while pending:
        node = pending.pop()
        if node in nodes:
            continue
        nodes.add(node)
        for parent in node.parents:
            edges.add((parent, node))
            pending.append(parent)
    return nodes, edges

def draw_dot(root, format='svg', rankdir='LR'):
    """
    Render the computation graph ending at ``root`` as a graphviz ``Digraph``.

    Every node becomes a record-shaped box showing its name, value and gradient.
    A node that has an ``operation`` additionally gets a small operator node
    that feeds into it; edges from the node's parents point at that operator node.

    format: png | svg | ...
    rankdir: TB (top to bottom graph) | LR (left to right)

    Returns the populated ``Digraph``.
    """
    assert rankdir in ['LR', 'TB']
    nodes, edges = trace(root)
    dot = Digraph(format=format, graph_attr={'rankdir': rankdir})

    for n in nodes:
        node_id = str(id(n))
        dot.node(name=node_id, label = "{ %s | value: %.4f | grad: %.4f }" % (n.name, n.value, n.gradient), shape='record')
        if n.operation:
            op_id = node_id + n.operation
            dot.node(name=op_id, label=n.operation)
            dot.edge(op_id, node_id)

    for n1, n2 in edges:
        # Point at the operator node when there is one. A node that has parents
        # but no operation has no operator node, so the edge goes straight to
        # the value node instead of failing on ``str + None``.
        dot.edge(str(id(n1)), str(id(n2)) + (n2.operation or ''))

    return dot


def backward_sorted(value):
    """Return the graph ending at ``value`` in reverse topological order.

    Each node is yielded before any of the nodes in its ``parents`` collection,
    so ``value`` itself comes first.

    Args:
        value: Final node of the graph. It, and every node reachable from it,
            must be hashable and expose an iterable ``parents`` attribute.

    Returns:
        A reverse iterator over the list of nodes in post-order.
    """
    sorted_nodes = []
    visited = {value}
    # Iterative post-order DFS: each stack entry pairs a node with an iterator
    # over the parents that still have to be explored. Avoiding recursion keeps
    # long operation chains from exceeding Python's recursion limit.
    stack = [(value, iter(value.parents))]
    while stack:
        node, remaining = stack[-1]
        for parent in remaining:
            if parent not in visited:
                visited.add(parent)
                stack.append((parent, iter(parent.parents)))
                break
        else:
            # Every parent has been handled, so the node itself can be emitted.
            stack.pop()
            sorted_nodes.append(node)
    return reversed(sorted_nodes)



In [317]:
class Value:
    """Scalar that remembers how it was computed so gradients can be back-propagated.

    Every arithmetic operation returns a new ``Value`` whose ``parents`` are the
    operands and whose ``backward`` callable adds the local derivative, scaled by
    the result's own gradient (chain rule), to each operand's ``gradient``.
    """

    # Parents are the Value objects that were combined by the operation that created this value
    def __init__(self, value, parents=(), operation=None, name=None):
        """Create a value.

        Args:
            value: Number (anything accepted by ``float``) to wrap.
            parents: Iterable of the ``Value`` operands that produced this value.
            operation: Label of the producing operation (e.g. ``"+"``), or None.
            name: Optional display name.
        """
        # Use value as double
        self.value = float(value)
        # Not used by any code visible in this notebook; kept so the attribute still exists.
        self.origin = []
        self.gradient = 0.0
        self.parents = set(parents)
        self.operation = operation
        self.name = name
        # Values that were not produced by an operation have nothing to propagate.
        self.backward = lambda: None

    def __repr__(self):
        return f"Value({self.value})"

    def __add__(self, other):
        """Return ``self + other``; plain numbers are wrapped in a ``Value``."""
        other = other if isinstance(other, Value) else Value(other)
        result = Value(self.value + other.value, parents=[self, other], operation="+")

        def _backward():
            # Derivative of addition is 1, and the chain rule means we multiply by the result's gradient.
            # We use += because a node can receive gradient from several nodes in front of it.
            self.gradient += (1.0 * result.gradient)
            other.gradient += (1.0 * result.gradient)

        result.backward = _backward
        return result

    def __neg__(self): # -self
        return self * -1

    def __sub__(self, other): # self - other
        return self + (-other)

    def __rsub__(self, other): # other - self
        return other + (-self)

    def __pow__(self, other):
        """Return ``self ** other`` for a constant ``int`` or ``float`` exponent.

        Raises:
            TypeError: If ``other`` is not an ``int`` or ``float``.
        """
        if not isinstance(other, (int, float)):
            raise TypeError(f"exponent must be an int or float, got {type(other).__name__}")
        result = Value(self.value ** other, parents=[self], operation="**")

        def _backward():
            # d/dx x**n = n * x**(n - 1). A zero exponent makes the result constant;
            # skipping it avoids evaluating 0.0 ** -1.0 when the base is zero.
            if other != 0:
                self.gradient += (other * (self.value ** (other - 1.0))) * result.gradient

        result.backward = _backward
        return result

    def __mul__(self, other):
        """Return ``self * other``; plain numbers are wrapped in a ``Value``."""
        other = other if isinstance(other, Value) else Value(other)
        result = Value(self.value * other.value, parents=[self, other], operation="*")

        def _backward():
            # Each operand's derivative is the other operand's value.
            self.gradient += other.value * result.gradient
            other.gradient += self.value * result.gradient

        result.backward = _backward
        return result

    def __rmul__(self, other):
        return self.__mul__(other)

    def __radd__(self, other):
        return self.__add__(other)

    def tanh(self):
        """Return the hyperbolic tangent of this value as a new ``Value``."""
        tan_value = math.tanh(self.value)
        result = Value(tan_value, parents=[self], operation="tanh")

        def _backward():
            # d/dx tanh(x) = 1 - tanh(x)**2
            self.gradient += (1.0 - (tan_value ** 2)) * result.gradient

        result.backward = _backward
        return result

    def backprop(self):
        """Back-propagate from this value through the whole graph behind it.

        Sets this value's gradient to 1.0 and then calls ``backward`` on every
        node in the order produced by ``backward_sorted``. Gradients of the other
        nodes are not reset here, so they add to whatever is already stored.
        """
        self.gradient = 1.0

        for n in backward_sorted(self):
            n.backward()


In [330]:
import random 

class Neuron:
    """Single neuron: weighted sum of its inputs plus a bias, passed through tanh."""

    def __init__(self, dimensions):
        """Create a neuron expecting ``dimensions`` inputs.

        Weights and bias are drawn from ``random.uniform`` between -1 and 1.
        """
        # Randomly initialize weights
        self.weights = [Value(random.uniform(-1, 1)) for _ in range(dimensions)]
        self.bias = Value(random.uniform(-1, 1))

    def forward(self, inputs):
        """Return the activation of the neuron for ``inputs``.

        Args:
            inputs: Iterable with exactly one entry per weight.

        Raises:
            ValueError: If the number of inputs differs from the number of weights.
        """
        inputs = list(inputs)
        # zip() would silently drop the surplus entries and give a wrong result.
        if len(inputs) != len(self.weights):
            raise ValueError(f"expected {len(self.weights)} inputs, got {len(inputs)}")
        # Sum of all inputs * weights, starting the sum from the bias
        dotproduct = sum([i * w for i, w in zip(inputs, self.weights)], self.bias)
        # Apply activation function
        return self.activation(dotproduct)

    def activation(self, x):
        """Apply the activation function by calling ``x.tanh()``."""
        return x.tanh()

    def parameters(self):
        """Return the trainable parameters: all weights followed by the bias."""
        return self.weights + [self.bias]


class Layer:
    """A group of neurons that all receive the same inputs."""

    def __init__(self, dimensions, num_neurons):
        """Create ``num_neurons`` neurons, each built with ``dimensions``."""
        neurons = []
        for _ in range(num_neurons):
            neurons.append(Neuron(dimensions))
        self.neurons = neurons

    def forward(self, inputs):
        """Return one output per neuron, in neuron order."""
        outputs = []
        for neuron in self.neurons:
            outputs.append(neuron.forward(inputs))
        return outputs

    def parameters(self):
        """Return the parameters of every neuron as one flat list."""
        collected = []
        for neuron in self.neurons:
            collected.extend(neuron.parameters())
        return collected
    
class MLP:
    """Multi-layer perceptron: a chain of ``Layer`` objects applied one after another."""

    # Layers is a list of sizes of each layer
    def __init__(self, dimensions, layers):
        """Build the chain; the first layer takes ``dimensions`` inputs."""
        chain = [Layer(dimensions, layers[0])]
        # Every further layer takes as many inputs as the previous layer has neurons.
        for fan_in, fan_out in zip(layers, layers[1:]):
            chain.append(Layer(fan_in, fan_out))
        self.layers = chain

    def forward(self, inputs):
        """Feed ``inputs`` through every layer and return the final output."""
        activations = inputs
        for layer in self.layers:
            activations = layer.forward(activations)
        # If we have more than one output, return a list, otherwise return the value
        if len(activations) > 1:
            return activations
        return activations[0]

    def parameters(self):
        """Return the parameters of every layer as one flat list."""
        collected = []
        for layer in self.layers:
            for p in layer.parameters():
                collected.append(p)
        return collected


In [344]:
# Toy training set: four samples with three features each.
inputs = [
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
    [-1, -2, -3],
]

# Target output for each sample, in the same order as `inputs`.
desired_outputs = [1, -1, -1, 1]


In [388]:
# Build the network to train: 3 inputs feeding layers of 10, 10, 5 and 1 neuron(s).
# Repeating a training step lowers the loss, meaning the model has learned something, and therefore
# improves the model. This is the basic idea behind training a neural network.
mlp = MLP(dimensions=3, layers=[10, 10, 5, 1])


In [406]:
learning_rate = 0.001
steps = 100
for step in range(steps):
    # Forward pass: run every sample through the network and add up the squared errors.
    predicted_outputs = [mlp.forward(sample) for sample in inputs]
    loss = sum(
        (predicted - desired)**2
        for predicted, desired in zip(predicted_outputs, desired_outputs)
    )

    # Backward pass.
    # Gradients are cleared first; otherwise they would accumulate from previous steps.
    for p in mlp.parameters():
        p.gradient = 0.0

    loss.backprop()

    # Gradient-descent update: move each parameter against its gradient.
    for p in mlp.parameters():
        p.value -= learning_rate * p.gradient

    # Report progress on every 20th step only.
    if step % 20 != 0:
        continue
    print(f"Step {step}, loss = {loss}")
    print("Predicted: ", predicted_outputs)
    print("Desired: ", desired_outputs, "\n")


Step 0, loss = Value(0.001336687402554782)
Predicted:  [Value(0.9785304990442912), Value(-0.9815704229058357), Value(-0.9827775211023586), Value(0.98452470226399)]
Desired:  [1, -1, -1, 1] 

Step 20, loss = Value(0.0013359655024166965)
Predicted:  [Value(0.9785360727966932), Value(-0.9815756481813508), Value(-0.9827821924922268), Value(0.9845288754088222)]
Desired:  [1, -1, -1, 1] 

Step 40, loss = Value(0.0013352443166745498)
Predicted:  [Value(0.9785416424105203), Value(-0.9815808696955681), Value(-0.9827868605365783), Value(0.9845330457142422)]
Desired:  [1, -1, -1, 1] 

Step 60, loss = Value(0.0013345238443043302)
Predicted:  [Value(0.9785472078907167), Value(-0.9815860874527471), Value(-0.9827915252393211), Value(0.9845372131834716)]
Desired:  [1, -1, -1, 1] 

Step 80, loss = Value(0.0013338040842839653)
Predicted:  [Value(0.9785527692422179), Value(-0.9815913014571406), Value(-0.9827961866043571), Value(0.9845413778197268)]
Desired:  [1, -1, -1, 1] 

