In [24]:
class Value:
    def __init__(self, data,  op='', children = ()):
        self.op = op
        self.children = set(children)
        self.data = data
        self.grad = 0.0 #-> guys!! this is what calculates derivatives
        self.backward = lambda: None #and this is for gradient calculation function
    def __repr__(self):
        return f"Value(data={self.data})"
    
    def __add__(self, other):
        if isinstance(other, Value):
            other = other
        else:
            other = Value(other)
        add_data = self.data + other.data
        children = (self, other)
        output = Value(add_data, op="+", children=children)
        def backward():
            self.grad += 1.0*output.grad # remember it is this way because, if your output is a+b, then if we are to find out PD of out wrt a then it would be constant (if b is constant, say = 5, then out = a + 5)
            other.grad += 1.0*output.grad
        output.backward = backward
        return output

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        output = Value(self.data * other.data, op="*", children=(self, other))
        def backward():
            self.grad += other.data*output.grad # now here if we keep b as constant, then output = 5a and change in "a" increase the output by "b" times
            other.grad += self.data*output.grad
        output.backward = backward
        return output
    
    def __neg__(self):
        return self * -1
    
    def __sub__(self, other):
        return self + (-other)
    
    def __pow__(self, other):
        output = Value(self.data ** other, children=(self,), op=f'**{other}')
        def backward():
            self.grad += (other * self.data**(other - 1)) * output.grad
        output.backward = backward
        return output

    def __radd__(self, other):
        return self + other
    
    def __rmul__(self, other):
        return self*other
    
    def backward_pass(self):
        topo = []
        visited = set()

        def build_topological(v):
            if v not in visited:
                visited.add(v)
                for child in v.children:
                    build_topological(child)
                topo.append(v)
                
        
        build_topological(self)

        self.grad = 1.0

        for node in reversed(topo):
            node.backward()
    
    def relu(self):
        out = Value(0 if self.data <= 0 else self.data, op="RELU", children=(self,))

        def backward():
            self.grad += 0.0 if self.data <= 0 else 1.0*out.grad
        
        out.backward = backward
        return out
    
    def sigmoid(self):
        import math
        y = 1 / (1 + math.exp(-self.data))
        out = Value(y, op="SIGMOID", children=(self,))
        def backward():
            deriv = y*(1-y) # since derivative of sigmoid = sigmoid(1 - sigmoid)
            self.grad += deriv*out.grad
        out.backward = backward
        return out
    

In [29]:
import random

#just a sample with RELU

class Neuron:
    def __init__(self, inputs, nonlin = True):
        self.w = [Value(random.uniform(-1,1)) for _ in range(inputs)]
        self.b = Value(0.0)
        self.nonlin = nonlin

    def __call__(self, x):
        outputs = sum((w_i*x_i for w_i, x_i in zip(self.w, x)), self.b)
        return outputs.relu() if self.nonlin else outputs
    
    def parameter(self):
        return self.w + [self.b]
    
class Layer:
    def __init__(self, inputs, outputs, nonlin=True):
        self.neurons = [Neuron(inputs, nonlin=nonlin) for _ in range(outputs)]
    
    def __call__(self, x):
        outputs = [neuron(x) for neuron in self.neurons]
        if len(outputs) == 1: return outputs[0]
        else:
             return outputs
    
    def parameter(self):
        params = []
        for neuron in self.neurons:
            params.extend(neuron.parameter())
        return params
    
class MLP:
    def __init__(self, inputs, outputs):
        #here outputs are list of integers that represent the number of nodes in each layer except the first layer.
        size = [inputs] + outputs
        # self.layers = [Layer(size[i], size[i+1]) for i in range(len(outputs))] ## this just means Layer 1 : input -> output[0]; layer2: output[0] -> output[1]... so on
        self.layers = []
        for i in range(len(outputs)):
            is_last_layer = (i == len(outputs) - 1)
            self.layers.append(Layer(size[i], size[i+1], nonlin=not is_last_layer))
    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        return x
    
    def parameter(self):
        params = []
        for layer in self.layers:
            params.extend(layer.parameter())
        return params

In [30]:
## training the network

xs = [
  [2.0, 3.0, -1.0],
  [3.0, -1.0, 0.5],
  [0.5, 1.0, 1.0],
  [1.0, 1.0, -1.0],
]

ys = [1.0, -1.0, -1.0, 1.0]

n = MLP(3, [4,4,1])

learning_rate = 0.05

for k in range(20):
    ypred = [n(x) for x in xs]
    loss = sum((y_out - y_act) ** 2 for y_act, y_out in zip(ys, ypred))

    for p in n.parameter():
        p.grad = 0.0
    
    loss.backward_pass()

    for p in n.parameter():
        p.data += -learning_rate*p.grad
    
    print(f"epoch: {k}, Loss: {loss.data:.4f}")

print("\n Final Prediction vs target:")
for pred, target in zip(ypred, ys):
    print(f"target: {target}, Predicted: {pred.data:.4f}")

epoch: 0, Loss: 3.9718
epoch: 1, Loss: 3.6960
epoch: 2, Loss: 3.4514
epoch: 3, Loss: 3.2177
epoch: 4, Loss: 2.9628
epoch: 5, Loss: 2.8672
epoch: 6, Loss: 2.7369
epoch: 7, Loss: 2.6662
epoch: 8, Loss: 2.5969
epoch: 9, Loss: 2.5307
epoch: 10, Loss: 2.4522
epoch: 11, Loss: 2.3558
epoch: 12, Loss: 2.2277
epoch: 13, Loss: 2.0527
epoch: 14, Loss: 1.8151
epoch: 15, Loss: 1.5045
epoch: 16, Loss: 1.1926
epoch: 17, Loss: 0.8659
epoch: 18, Loss: 0.6646
epoch: 19, Loss: 0.4244

 Final Prediction vs target:
target: 1.0, Predicted: 0.5504
target: -1.0, Predicted: -1.0164
target: -1.0, Predicted: -0.7151
target: 1.0, Predicted: 0.6248
