In [1]:
# Doing the same things in pytorch

In [2]:
import torch
import random
import math
import numpy as np
import matplotlib.pyplot as plt

In [3]:
from graphviz import Digraph

def trace(root):
    # builds a set of all nodes and edges in a graph
    nodes, edges = set(), set()
    def build(v):
        if v not in nodes:
            nodes.add(v)
            for child in v._prev:
                edges.add((child, v))
                build(child)
    build(root)
    return nodes, edges

def draw_dot(root):
    dot = Digraph(format = 'svg', graph_attr = {'rankdir': 'LR'})   # LR = Left to right
    
    nodes, edges = trace(root)
    for n in nodes:
        uid = str(id(n))
        # for any value in the graph, create a rectangular ('record') node for it
        dot.node(name = uid, label = "{%s | data %.4f | grad %.4f}" %(n.label ,n.data, n.grad ), shape = 'record')
        if n._op:
            # if this value is a result of some operation, create an op node for it
            dot.node(name = uid + n._op, label = n._op)
            # and connect this node to it
            dot.edge(uid + n._op, uid)
            
    for n1, n2 in edges:   
        # connect n1 to the op node of n2
        dot.edge(str(id(n1)), str(id(n2)) + n2._op)
        
    return dot

In [28]:
class Value:
    
    def __init__(self, data, _children=(), _op = '', label = ''):
        self.data = data
        self._prev = set(_children)
        self._op = _op
        self.label = label
        self.grad = 0.0         # initial value
        self._backward = lambda : None
    def __repr__(self):
        return f"Value(data = {self.data})"
    
    # Note:
    #out = Class(argument, (who are childrem node, again children node), 'name of operation')
    
    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self,other), '+')

        def _backward():
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad
        out._backward = _backward
        return out
    
    def __radd__(self, other):   # (other * self) ; reverse multi to consider "2*a.data" as well 
        return self+other
    
    #def __neg__(self):
     #   return self * -1
    
    def __sub__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data - other.data, (self, other), '-')
        
        def _backward():
            self.grad += 1.0 * out.grad
            other.grad += -1.0 * out.grad
        out._backward = _backward
        return out #self + (-other)
    
    #def __rsub__(self, other):
     #   return self - other
    
    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')
        
        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward
        return out
    def __rmul__(self, other):   # (other + self) ; reverse multi to consider "2+a.data" as well 
        return self*other
    
    def __pow__(self, other):
        assert isinstance(other, (int, float)),   "asserting only int/float powers for now"
        out = Value(self.data**other,(self,), f'**{other}')
        
        def _backward():
            self.grad += (other)*(self.data**((other - 1.0)))*out.grad
        out._backward = _backward
        return out
        
    def __truediv__(self, other):
        out = Value(self.data * other.data**-1,  (self, other), '/')
        out = Value(self.data / other.data,  (self, other), '/')
        def _backward():
            self.grad += (1.0/other.data)*out.grad
            other.grad += (-1.0*self.data/(other.data**2))*out.grad
        out._backward = _backward
        return out #self * other**-1
    
    def __rtruediv__(self, other):
        return self/other
    
    def tanh(self):
        x = self.data
        t = (np.exp(x) - np.exp(-x))/ (np.exp(x) + np.exp(-x))
        out = Value(t, (self, ), 'tanh')
        
        def _backward():
            self.grad += (1 - t**2) * out.grad
        out._backward = _backward    
        return out
    
    def relu(self):
        out = Value(0 if self.data <0 else self.data, (self,), 'ReLU')
        
        def _backward():
            self.grad += (out.grad > 0) * out.grad
        out._backward= _backward
        return out
    
    def exp(self):
        x = self.data
        out = Value(math.exp(x), (self,), 'exp')
        
        def _backward():
            self.grad = out.data * out.grad
        out._backward = _backward
        return out
    
    def backward(self):
        
        topo = []     # empty list to be appended with the children nodes
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)       
        build_topo(self)
        
        self.grad = 1.0
        for node in reversed(topo):
            node._backward()

In [5]:
# .Double() is for double precision of float numbers, i.e. float64 
x1 = torch.Tensor([2.0]).double()      ;   x1.requires_grad = True
x2 = torch.Tensor([0.0]).double()      ;   x2.requires_grad = True
w1 = torch.Tensor([-3.0]).double()     ;   w1.requires_grad = True
w2 = torch.Tensor([1.0]).double()      ;   w2.requires_grad = True
b = torch.Tensor([6.88137358701954]).double()  ;   b.requires_grad = True
n = x1*w1 + x2*w2 + b
o = torch.tanh(n)

In [6]:
print(o.data.item())     # .item(), is used for extracting the scaler from the tensor
o.backward()             # o is a tensor object

print('--')
print('x2', x2.grad.item())
print('w2', w2.grad.item())
print('x1', x1.grad.item())
print('w1', w1.grad.item())

0.7071066904050358
--
x2 0.5000001283844369
w2 0.0
x1 -1.5000003851533106
w1 1.0000002567688737


### Micrograd is scaler value engine, in pytorch, the input is based on tensors data

## Lets build the micrograd neural network

In [7]:
class Neuron:
    
    def __init__(self, nin):    # nin = size of vector of one neuron
        self.w = [Value(random.uniform(-1,1)) for _ in range(nin)]
        self.b = Value(random.uniform(-1,1))
        
    def __call__(self, x):
        # w*x +b
        act = sum(wi*xi for wi, xi in zip(self.w, x)) + self.b
        out = act.tanh()
        return out
    # how does __call__() works? 
    # x = [2.0, 3.0]
    # n = Neuron(2)
    # n(x)
    
    def parameter(self):
        return self.w + [self.b]      # returning the list parameters of single neuron
    

In [8]:
x = [2.0, 3.0]
n = Neuron(2)
n(x)

Value(data = 0.982765194481908)

In [9]:
class Layer:
    
    def __init__(self,nin, nout):     
        # nout is the number of neurons in present layer,and each of the neurons are defined  
        # as vector, hence, nin as the argument defines the dimension of every neuron.
        
        self.neurons = [Neuron(nin) for _ in range(nout)]
        
    def __call__(self,x):
        outs = [n(x) for n in self.neurons]
        return outs[0] if len(outs) == 1 else outs
    
    def parameters(self):
        params = []
        for neuron in self.neurons:
            ps = neuron.parameter()
            params.extend(ps)
        return params

In [10]:
x = [2.0, 3.0]
n = Layer(2,3)   
# nin = 2, is for the vector dimensionality of every neuron
# nout = 3, is for the number of neurons in every layer
n(x)

[Value(data = -0.6841174159074246),
 Value(data = 0.7007011892762086),
 Value(data = 0.5162170273771025)]

#### Lets design a MLP (multi layer perceptron)

In [11]:
class MLP:
    def __init__(self, nin, nouts): 
        #nouts:is defined with the list; dimension of list would give the idea of number of layer
        # and the magnitude of every element gives the idea of number of neurons in every layer
        sz = [nin] + nouts
        self.layers = [Layer(sz[i], sz[i+1]) for i in range(len(nouts))]
        
    def __call__(self, x):   # calling the layers sequentially
        for layer in self.layers:
            x = layer(x)
        return x
    
    def parameters(self):
        return [p for layer in self.layers for p in layer.parameters()]

In [12]:
x = [2.0, 3.0, -1.0]
n = MLP(3, [4,4,1])      
# this defines a neural network of 4 layers,
# input layer with 3 neurons
# 2 hidden layers with 4 neurons each
# output layer with 1 neuron
n(x)

Value(data = -0.5790961842169915)

In [13]:
# all the paramters weights and biases inside the N.N.
# n.parameters()
len(n.parameters())

41

In [14]:
# creating data-set
xs = [
    [2.0, 3.0, -1.0],
    [3.0, -1.0, 0.5],
    [0.5, 1.0, 1.0],
    [1.0, 1.0, -1.0]
]
ys = [1.0, -1.0,-1.0, 1.0]   # desired targets

In [15]:
y_pred = [n(x) for x in xs]
y_pred

[Value(data = -0.5790961842169915),
 Value(data = -0.17801144150641232),
 Value(data = -0.33318790274278137),
 Value(data = -0.6263184057002686)]

In [16]:
'''# implementing the mean squred loss
loss = [(yout - ygt)**2 for ygt, yout in zip(ys, y_pred)]   # ygt : y ground truth
loss'''

'# implementing the mean squred loss\nloss = [(yout - ygt)**2 for ygt, yout in zip(ys, y_pred)]   # ygt : y ground truth\nloss'

In [17]:
'''# now cost from the given losses would be:
cost = sum(loss)
cost'''

'# now cost from the given losses would be:\ncost = sum(loss)\ncost'

In [18]:
'''# calculating the cost function gradient
cost.backward()'''

'# calculating the cost function gradient\ncost.backward()'

In [19]:
'''# applying the backpropogation helps us in finding the gradient of every parameter,
# weight and bias both. These are helping us in defining the nature of neuron needed for our ygt
# lets look at the gradient of, first element of first neuron of first layer
n.layers[0].neurons[0].w[0].grad'''

'# applying the backpropogation helps us in finding the gradient of every parameter,\n# weight and bias both. These are helping us in defining the nature of neuron needed for our ygt\n# lets look at the gradient of, first element of first neuron of first layer\nn.layers[0].neurons[0].w[0].grad'

In [20]:
'''n.layers[0].neurons[0].w[0].data'''

'n.layers[0].neurons[0].w[0].data'

### Applying the gradient descent to update the parameters according to the grad.

In [21]:
'''for p in n.parameters():
    p.data = p.data - 0.01 * p.grad'''

'for p in n.parameters():\n    p.data = p.data - 0.01 * p.grad'

In [22]:
'''n.layers[0].neurons[0].w[0].data'''

'n.layers[0].neurons[0].w[0].data'

In [23]:
'''# with one step of gradient descent, we are the cost to have goen down
y_pred = [n(x) for x in xs]
loss = [(yout - ygt)**2 for ygt, yout in zip(ys, y_pred)]   # ygt : y ground truth
cost = sum(loss)
cost
# It has gone down'''

'# with one step of gradient descent, we are the cost to have goen down\ny_pred = [n(x) for x in xs]\nloss = [(yout - ygt)**2 for ygt, yout in zip(ys, y_pred)]   # ygt : y ground truth\ncost = sum(loss)\ncost\n# It has gone down'

In [26]:
# we can simply with the gradient and gradient descent function to minimize the cost
for k in range(15):
    
    # forward pass
    y_pred = [n(x) for x in xs]
    
    # calculate the cost 
    loss = [(yout - ygt)**2 for ygt, yout in zip(ys, y_pred)]
    cost = sum(loss)
    print(f"cost in {k} step is:{cost.data}")
    
    # set old gradient = 0.0 for the new iteration, otherwise grad will keep adding up
    for p in n.parameters():
        p.grad = 0.0
        
    # backward pass or backpropogation
    cost.backward()
    
    # update the parameters
    for p in n.parameters():
        p.data = p.data - 0.05 * p.grad

cost in 0 step is:0.14930305346013895
cost in 1 step is:0.13440779504464312
cost in 2 step is:0.12210267089310445
cost in 3 step is:0.11177556720409056
cost in 4 step is:0.10299138931103177
cost in 5 step is:0.09543286308246363
cost in 6 step is:0.08886344579952246
cost in 7 step is:0.08310333667821374
cost in 8 step is:0.07801351802459323
cost in 9 step is:0.07348487102631472
cost in 10 step is:0.06943058612441518
cost in 11 step is:0.0657807646994373
cost in 12 step is:0.06247851030960562
cost in 13 step is:0.059477052530593946
cost in 14 step is:0.05673759947483965


In [27]:
y_pred

[Value(data = 0.9252619093531166),
 Value(data = -0.9218283661494826),
 Value(data = -0.8553330834711083),
 Value(data = 0.8447180106887434)]