# Implementando Michigrad

In [1]:
import torch
import numpy as np
from graphviz import Digraph


In [2]:
def trace(root):
    nodes, edges = set(), set()
    def build(v):
        if v not in nodes:
            nodes.add(v)
            for child in v._prev:
                edges.add((child, v))
                build(child)
    build(root)
    return nodes, edges

def show_graph(root, format='svg', rankdir='LR'):
    """
    format: png | svg | ...
    rankdir: TB (top to bottom graph) | LR (left to right)
    """
    assert rankdir in ['LR', 'TB']
    nodes, edges = trace(root)
    dot = Digraph(format=format, graph_attr={'rankdir': rankdir}) #, node_attr={'rankdir': 'TB'})
    
    for n in nodes:
        dot.node(name=str(id(n)), label = "{%s | data %.4f | grad %.4f}" % (n.name, n.data, n.grad), shape='record')
        if n._op:
            dot.node(name=str(id(n)) + n._op, label=n._op)
            dot.edge(str(id(n)) + n._op, str(id(n)))
    
    for n1, n2 in edges:
        dot.edge(str(id(n1)), str(id(n2)) + n2._op)
    
    return dot
    
class Value():
    def __init__(self, data, _children=(), _op="", name=""):
        self.data = data
        self.name = name
        self.grad = 0.0
        self._backward = lambda: None
        self._op = _op
        self._prev = set(_children)

    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self,other), "+")
        
        def _backward():
            self.grad += out.grad
            other.grad += out.grad
        out._backward = _backward
        return out

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self,other), "*")

        def _backward():
            self.grad += out.grad * other.data
            other.grad += out.grad * self.data
        out._backward = _backward
        return out

    def __pow__(self, other):
        assert isinstance(other, (int, float)), "solamente soportamos potencias de int/float"
        out = Value(self.data ** other, _children=(self, ), _op=f"**{other}")

        def _backward():
            self.grad += other * (self.data ** (other - 1)) * out.grad
        out._backward = _backward
        return out

    def __truediv__(self, other):
        return self * other ** -1

    def exp(self):
        out = np.exp(self.data)
        out = Value(out, _children=(self,), _op="exp")
        
        def _backward():
            self.grad += out.data * out.grad
        out._backward = _backward       
        return out

    def tanh(self):
        x = self.data
        t = (np.e ** (2*x) - 1)/(np.e ** (2*x) + 1)
        out = Value(t, _children=(self, ), _op="tanh")

        def _backward():
            self.grad += (1 - t**2) * out.grad
        out._backward = _backward
        return out

    def backward(self):
        topo = []
        visited = set()
        def build_topo(v):
            if v is not visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)

        self.grad = 1.
        for node in reversed(topo):
            node._backward()

    def __radd__(self, other):
        return self + other

    def __rmul__(self, other):
        return self * other

    def __neg__(self):
        return self * -1

    def __sub__(self, other):
        return self + (-other)

    def __rsub__(self, other):
        return other + (-self)

    def __repr__(self):
        return f'Value(data={self.data}, grad={self.grad}, _op="{self._op}", name={self.name})'

In [23]:
W0 = Value(-.5, name='W₀')
x0 = Value(.5, name='x₀')
W1 = Value(-.3, name='W₁')
x1 = Value(2, name='x₁')
W0x0 = W0 * x0; W0x0.name = 'W0x0'
W1x1 = W1 * x1; W1x1.name = 'W1x1'
yhat = W0x0 + W1x1; yhat.name = 'ŷ'
yhat = yhat.tanh(); yhat.name = 'tanh(ŷ)'
y = Value(.8, name='y')
L = y - yhat; L.name = 'L'
L.backward()
W0.grad

-0.26121149393241616

In [29]:
W0 = torch.tensor(-.5); W0.requires_grad = True
x0 = torch.tensor(.5);  x0.requires_grad = True
W1 = torch.tensor(-.3); W1.requires_grad = True
x1 = torch.tensor(2.);   x1.requires_grad = True
yhat = torch.tanh(W0 * x0 + W1 * x1)
y = torch.tensor(.8); y.requires_grad = True
L = y - yhat
L.backward()
W0.grad

tensor(-0.2612)

In [19]:
torch.tensor(.8).type()

'torch.FloatTensor'

In [157]:
class Neuron:
    def __init__(self, nin, bias=True):
        self.W = [Value(np.random.uniform(-1, 1)) for _ in range(nin)]
        self.b = Value(np.random.uniform(-1, 1))
    
    def __repr__(self):
        return f'Neuron(nin={len(self.W)})'

    def __call__(self, x):
        return sum([wi * xi for wi, xi in zip(self.W, x)], self.b)

    def parameters(self):
        return self.W + [self.b]
        
class Layer:
    def __init__(self, nin, nout):
        self.neurons = [Neuron(nin) for _ in range(nout)]

    def __call__(self, x):
        act = [n(x).tanh() for n in self.neurons]
        return act
    
    def __repr__(self):
        return f"Layer of [{', '.join(str(n) for n in self.neurons)}]"

    def parameters(self):
        return [p for neuron in self.neurons for p in neuron.parameters()]
    
class MLP: #Multi layer perceptron
    def __init__(self, nin, nouts):
        self.nin = nin
        self.nouts = nouts
        sz = [nin] + nouts
        #self.layers = [Layer(sz[i], sz[i+1]) for i in range(len(nouts))]
        self.layers = [Layer(a, b) for a, b in zip(sz, sz[1:])]

    def __call__(self, x):
        for l in self.layers:
            x = l(x)
        return x

    def __repr__(self):
        return f"MLP of [{', '.join(str(layer) for layer in self.layers)}]"

    def parameters(self):
        return [p for layer in self.layers for p in layer.parameters()]
    

In [70]:
x = [2, 3]
n = Neuron(2)
n(x)

Value(data=-0.9991496671795999, grad=0.0, _op="tanh", name=)

In [118]:
model = MLP(2, [3, 3, 1])
model

MLP of [Layer of [Neuron(nin=2), Neuron(nin=2), Neuron(nin=2)], Layer of [Neuron(nin=3), Neuron(nin=3), Neuron(nin=3)], Layer of [Neuron(nin=3)]]

In [119]:
model([1,2])[0].tanh()

Value(data=-0.7785767522718746, grad=0.0, _op="tanh", name=)

In [195]:
np.random.seed(40)
xor = MLP(2, [3, 3, 1])
xs = [[0, 0], [0, 1], [1, 0], [1, 1]]
ys = [0, 1, 1, 0]


In [196]:
yhats = [xor(x)[0] for x in xs]
yhats

[Value(data=0.011082209707468916, grad=0.0, _op="tanh", name=),
 Value(data=0.1193526531019616, grad=0.0, _op="tanh", name=),
 Value(data=0.06230056664007959, grad=0.0, _op="tanh", name=),
 Value(data=0.1644186594141305, grad=0.0, _op="tanh", name=)]

In [215]:
loss = sum([(y - yhat)**2 for y, yhat in zip(ys,yhats )])/4
loss.backward()


In [216]:
# for p in xor.parameters():
#     p.grad =0.
xor.parameters()

[Value(data=-0.18462594383942776, grad=1540718.913950618, _op="", name=),
 Value(data=-0.8892679197762798, grad=868040.2332652615, _op="", name=),
 Value(data=0.5770697549735053, grad=1311926.923450415, _op="", name=),
 Value(data=-0.4253896304213083, grad=1613758.4093098948, _op="", name=),
 Value(data=-0.0992988260654577, grad=559144.0673464814, _op="", name=),
 Value(data=-0.39217538931846896, grad=1055362.3799538866, _op="", name=),
 Value(data=0.052799048594290854, grad=-131717.11289210053, _op="", name=),
 Value(data=0.24762442576937338, grad=-16983.257668143662, _op="", name=),
 Value(data=0.5535509154499147, grad=-57339.83687797864, _op="", name=),
 Value(data=0.37248329073291475, grad=2666.348378597243, _op="", name=),
 Value(data=0.9618777263756102, grad=-7328.8754457735795, _op="", name=),
 Value(data=0.2016321844318263, grad=3530.3471500855435, _op="", name=),
 Value(data=0.6279370398859911, grad=13247.309005815441, _op="", name=),
 Value(data=0.41729030432639735, grad=249.

In [188]:
for p in xor.parameters():
    p.data -= p.grad * 0.1

In [189]:
yhats = [xor(x)[0] for x in xs]
loss = sum([(y - yhat)**2 for y, yhat in zip(ys,yhats )])/4
loss

Value(data=0.24934804788136358, grad=0.0, _op="*", name=)

In [190]:
np.random.seed(40)
xor = MLP(2, [3, 3, 1])
xs = [[0, 0], [0, 1], [1, 0], [1, 1]]
ys = [0, 1, 1, 0]
steps = 1000
lr = 0.01

In [248]:
for _ in range(steps):
    # 1. forward pass
    yhats = [xor(x)[0] for x in xs]
    # 2. calcular la loss funcition
    loss = sum([(y - yhat)**2 for y, yhat in zip(ys,yhats )])/4
    # 3. zero grad
    for p in xor.parameters():
        p.grad = 0.
    # 4. backward pass
    loss.backward()
    # 5. update
    for p in xor.parameters():
        p.data -= p.grad * lr
    #print(loss)
loss

Value(data=0.007677743828866295, grad=1.0, _op="*", name=)

In [249]:
xor([0,0]),xor([0,1]), xor([0,1]), xor([1,1])

([Value(data=0.01958828661994304, grad=0.0, _op="tanh", name=)],
 [Value(data=0.8808263916002204, grad=0.0, _op="tanh", name=)],
 [Value(data=0.8808263916002204, grad=0.0, _op="tanh", name=)],
 [Value(data=0.010631312726344844, grad=0.0, _op="tanh", name=)])

## Conclusiones
* Hemos creado una pequeña librería para representar MLPs.
* La librería es suficientemente pontente para representar un clasificador sencillo.
* Es fácil de extender y la interfáz es en su mayor parte compatible con PyTorch.
* Lo visto debería alcanzar para comprender totalmente el código de Michigrad y Micrograd.
* 

## Ejercicios
* Implementar el parámetro `bias` en `Neuron` y `Layer` para poder crear neuronas sin sesgo. (Fácil)
  * Pista: mirar la implementación de Michigrad.
* Implementar el modelo `xor` usando PyTorch. (Fácil si usaste Pytorch antes, Intermedio si no lo hiciste)
  * Pista: PyTorch no implementa un MLP. Se puede redefirnir la clase `MLP` usando `torch.Linear` en lugar de capas de neuronas (`Layer` y `Neuron`).
* Implementar las funciones de activación como capas. (Intermedio)
  * Pista: Reemplazar `Layer` por `Linear`, y definir una clase por cada función de activación. La capa `Linear` se comportará como una capa de neuronas y la capa de activación aplicará la función de activación a cada salida de todas las neuronas de la capa anterior.
* Implementar la clase Module que permita crear modelos como listas de modulos. Todos los módulos deben soportar la `__call__(self, x)` que hace la forward pass.  (Difícil)
  * Pista: Se tendría que poder crear un modelo como una lista de modulos, así:
   ```python
       model = [Linear(2, 4), Linear(4, 4, bias=False), Linear(4, 3), Tanh(3)]
   ```