In [None]:
import math
import random

In [None]:
class Value:
  """Scalar node of a computation graph supporting reverse-mode autodiff.

  Every arithmetic operation on a ``Value`` returns a new ``Value`` that
  remembers its operands and a closure applying the local chain rule.
  Calling ``backward()`` on a result fills in ``grad`` for every node that
  contributed to it.

  Attributes:
    data: the wrapped scalar.
    grad: derivative of the node ``backward()`` was called on with respect
      to this node; starts at 0.0.
  """

  def __init__(self, data, _children = (), _op = ''):
    self.data = data  # wrapped scalar value
    self._prev = set(_children)  # operand nodes this value was computed from
    self._op = _op  # label of the operation that produced this value
    self.grad = 0.0
    # Local chain-rule step: distributes this node's gradient to its
    # operands. It does nothing by default because leaf nodes (for example
    # a bias) have no operands to propagate to.
    self._backward = lambda: None

  def __repr__(self):
    return f"Value(data = {self.data})"

  def __add__(self, other):
    """Return ``self + other``; plain numbers are wrapped in a ``Value``.

    The local derivative of addition is 1 for both operands, so the output
    gradient flows back unchanged to each of them.
    """
    other = other if isinstance(other, Value) else Value(other)
    out = Value(self.data + other.data, (self, other), '+')

    def _backward():
      # Accumulate with += instead of assigning: when the same node is used
      # more than once (e.g. b = a + a) every use must add its share,
      # otherwise one contribution would overwrite the other.
      self.grad += 1.0 * out.grad
      other.grad += 1.0 * out.grad
    out._backward = _backward
    return out

  def __radd__(self, other):
    """Fallback for ``other + self`` when ``other`` is not a ``Value``."""
    return self + other

  def __mul__(self, other):
    """Return ``self * other``; plain numbers are wrapped in a ``Value``.

    Each operand receives the output gradient scaled by the data of the
    other operand.
    """
    other = other if isinstance(other, Value) else Value(other)
    out = Value(self.data * other.data, (self, other), '*')

    def _backward():
      self.grad += out.grad * other.data
      other.grad += out.grad * self.data
    out._backward = _backward
    return out

  def __rmul__(self, other):
    """Fallback for ``other * self`` when ``other`` is not a ``Value``."""
    return self * other

  def tanh(self):
    """Return ``tanh(self)`` as a new ``Value``.

    An operation can be arbitrarily complex as long as its local derivative
    is known; for tanh it is ``1 - tanh(x)**2``.
    """
    # math.tanh saturates to +/-1 for large |x|; the explicit formula
    # (e^(2x) - 1) / (e^(2x) + 1) overflows in math.exp for large x.
    t = math.tanh(self.data)
    out = Value(t, (self, ), 'tanh')

    def _backward():
      self.grad += (1 - t**2) * out.grad
    out._backward = _backward
    return out

  # The operations below are the smaller pieces tanh can be decomposed into.

  def __neg__(self):
    """Return ``-self``."""
    return self * -1

  def __sub__(self, other):
    """Return ``self - other``."""
    return self + (-other)

  def __rsub__(self, other):
    """Fallback for ``other - self`` when ``other`` is not a ``Value``."""
    return (-self) + other

  def exp(self):
    """Return ``e**self`` as a new ``Value``."""
    out = Value(math.exp(self.data), (self, ), 'exp')

    def _backward():
      # d/dx exp(x) = exp(x), which is exactly the output's data.
      self.grad += out.data * out.grad
    out._backward = _backward
    return out

  def __truediv__(self, other):
    """Return ``self / other``, computed as ``self * other**-1``."""
    return self * other**-1

  def __rtruediv__(self, other):
    """Fallback for ``other / self`` when ``other`` is not a ``Value``."""
    return self**-1 * other

  def __pow__(self, other):
    """Return ``self ** other`` for a constant int/float exponent."""
    assert isinstance(other, (int, float))  # supports only int/float power
    out = Value(self.data**other, (self,), f'**{other}')

    def _backward():
      # Power rule: d/dx x**n = n * x**(n - 1).
      self.grad += other * (self.data ** (other - 1)) * out.grad
    out._backward = _backward
    return out

  def backward(self):
    """Backpropagate from this node through every node it depends on.

    Sets ``self.grad`` to 1.0 (base case) and then runs each node's local
    chain-rule step in reverse topological order, so a node's gradient is
    complete before it is pushed on to its operands.
    """
    # Iterative post-order DFS: a node is appended only after all of its
    # operands, and deep graphs cannot exhaust the recursion limit.
    topo = []
    visited = {self}
    stack = [(self, iter(self._prev))]
    while stack:
      node, pending = stack[-1]
      for child in pending:
        if child not in visited:
          visited.add(child)
          stack.append((child, iter(child._prev)))
          break
      else:
        # Every operand of this node has already been emitted.
        topo.append(node)
        stack.pop()

    self.grad = 1.0
    for node in reversed(topo):
      node._backward()

In [None]:
class Neuron:
  """A single unit computing ``tanh(w . x + b)`` over ``Value`` nodes."""

  def __init__(self, nin):
    """Create ``nin`` weights and one bias, each drawn uniformly from [-1, 1]."""
    weights = []
    for _ in range(nin):
      weights.append(Value(random.uniform(-1, 1)))
    self.w = weights  # one weight per input
    self.b = Value(random.uniform(-1, 1))  # bias of the neuron

  def __call__(self, x):
    """Return the activation of the neuron for the inputs ``x``."""
    # Start from the bias and add one weight*input product at a time;
    # zip pairs each weight with the input at the same position.
    pre_activation = self.b
    for weight, value in zip(self.w, x):
      pre_activation = pre_activation + weight * value
    return pre_activation.tanh()  # activation function

  def parameters(self):
    """Return the weights followed by the bias in a new list."""
    return [*self.w, self.b]

class Layer:
  """A group of ``nout`` neurons that all read the same ``nin`` inputs."""

  def __init__(self, nin, nout):
    """Build ``nout`` neurons, each taking ``nin`` inputs."""
    neurons = []
    for _ in range(nout):
      neurons.append(Neuron(nin))
    self.neurons = neurons

  def __call__(self, x):
    """Feed ``x`` to every neuron.

    Returns the list of outputs, or the bare output itself when the layer
    holds exactly one neuron.
    """
    outputs = []
    for neuron in self.neurons:
      outputs.append(neuron(x))
    if len(outputs) == 1:
      return outputs[0]
    return outputs

  def parameters(self):
    """Return the parameters of all neurons as one flat list, in neuron order."""
    params = []
    for neuron in self.neurons:
      params.extend(neuron.parameters())
    return params



class MLP:
  """Multi-layer perceptron: a chain of layers applied one after another."""

  def __init__(self, nin, nouts):
    """Build the network.

    Args:
      nin: number of inputs of the first layer.
      nouts: iterable with the number of neurons of each layer, in order
        (a list, tuple or any other iterable is accepted).
    """
    # Consecutive pairs of sizes give each layer's (inputs, outputs).
    sizes = [nin, *nouts]
    self.layers = [Layer(n_in, n_out) for n_in, n_out in zip(sizes, sizes[1:])]

  def __call__(self, x):
    """Pass ``x`` through every layer in order and return the final output."""
    for layer in self.layers:
      x = layer(x)
    return x

  def parameters(self):
    """Return the parameters of all layers as one flat list, in layer order."""
    return [p for layer in self.layers for p in layer.parameters()]

In [None]:
# MLP definition
# Example input with three features.
x = [2.0, 3.0, -1.0]
# The list [4, 4, 1] defines the layers:
#  - first hidden layer: 4 neurons (receives the 3 inputs)
#  - second hidden layer: 4 neurons (receives the output of the first layer)
#  - output layer: 1 neuron (receives the output of the second layer)
n = MLP(3, [4, 4, 1])
# Forward pass: the network's prediction for the example input.
n(x)

In [None]:
# Data definition: each entry pairs an input vector with its desired target.
dataset = [
    ([2.0, 3.0, -1.0], 1.0),
    ([3.0, -1.0, 0.5], -1.0),
    ([0.5, 1.0, 1.0], -1.0),
    ([1.0, 1.0, -1.0], 1.0),
]
xs = [features for features, _ in dataset]  # inputs
ys = [target for _, target in dataset]  # desired targets

In [None]:
# The loss tells us how the network is performing: it is the sum of the
# squared differences between the network's predictions and the desired
# targets, and it is the value we want to minimize.
for k in range(10):

  # Reset every parameter gradient to zero, as in the constructor, so that
  # this iteration's backward pass does not add onto the previous one.
  for p in n.parameters():
    p.grad = 0.0

  # Forward pass.
  ypred = [n(x) for x in xs]
  loss = sum((yout - ygt)**2 for yout, ygt in zip(ypred, ys))

  # Backward pass: propagate the gradient from the loss to all parameters.
  loss.backward()

  # Update (gradient descent). The gradient points in the direction in which
  # the loss increases, so we step the opposite way to decrease it. The step
  # must be small (learning rate): too large a step may skip the optimum,
  # training becomes unstable and the loss increases.
  for p in n.parameters():
    p.data -= 0.05 * p.grad  # 0.05 is the learning rate

  print(k, loss.data)
