In [2]:
import numpy as np
from typing import Tuple

np.set_printoptions(precision = 5)  # this is ugly here

class Tensor():

    def __init__(self, data, _children = ()):
        
        self.data = data if isinstance(data, (np.ndarray, np.generic)) else np.array(data, dtype = np.float32)
        self.shape = self.data.shape
        self._backward = lambda : None
        self.prev = set(_children)
        self.grad = 0.0

    def __repr__(self):
        return f"<Tensor data = {self.data}>"
    
    def shape(self)-> Tuple[int]: return self.shape

    def size(self)-> int: return self.data.size
    #-                                            BINARY                                                 -
    def __add__(self, other )-> 'Tensor': 
        other = other if isinstance(other, Tensor) else Tensor(other)
        output_T  = Tensor(self.data + other.data, (self,other))

        def _backward():
            self.grad += output_T.grad 
            other.grad += output_T.grad

        output_T._backward = _backward
        return output_T

    def __mul__(self, other)-> 'Tensor': 
        other = other if isinstance(other, Tensor) else Tensor(other)
        output_T = Tensor(self.data * other.data,(self, other))

        def _backward():
            self.grad += other * output_T.grad 
            other.grad += self * output_T.grad
    
        output_T._backward = _backward
        return output_T
    
    def __pow__(self, other) -> 'Tensor': #https://testbook.com/learn/maths-derivative-of-exponential-function
        other = other if isinstance(other, Tensor) else Tensor(other)
        output_T = Tensor(self.data ** other.data, (self, other))

        def _backward():
            self.grad += other * (self ** (other - 1)) * output_T.grad
            other.grad += output_T * self.log() * output_T.grad

        output_T._backward = _backward
        return output_T

    def __sub__(self, other)-> 'Tensor':
        other  = other if isinstance(other , Tensor) else Tensor(other)
        output_T = Tensor(self.data - other.data, (self, other))

        def _backward():
            self.grad += output_T.grad
            other.grad += -output_T.grad

        output_T._backward = _backward
        return output_T 

    def __radd__(self, other) -> 'Tensor':
        return self + other
    
    def __rmul__(self, other)-> 'Tensor':
        return self * other 
    
    def __rsub__(self, other)-> 'Tensor':
        return other + (self * -1)
    
    def __truediv__(self, other)-> 'Tensor':
        return self * (other **-1)
    
    def __rtruediv__(self, other)-> 'Tensor':
        return other * (self**-1)
    #-                                             UNARY      math                                     -
    def sum(self) -> 'Tensor':
        output_T = Tensor(self.data.sum(), (self, ))

        def _backward():
            self.grad += Tensor.ones_like(self) * output_T.grad

        output_T._backward = _backward
        return output_T
    
    def log(self)-> 'Tensor':
        output_T = Tensor(np.log(self.data), (self, ))

        def _backward():
            self.grad += Tensor.ones_like(self) / self * output_T.grad
        
        output_T._backward = _backward
        return output_T
    
    def mean(self)-> 'Tensor':
        output_T = Tensor(np.mean(self.data), (self, ))

        def _backward():
            t = Tensor.ones_like(self)  
            self.grad += t / self.size() * output_T.grad

        output_T._backward = _backward
        return output_T
    
    def sqrt(self)-> 'Tensor':
        output_T = Tensor(np.sqrt(self.data), (self, ))

        def _backward():
            self.grad += 1 / (2 * output_T) * output_T.grad

        output_T._backward = _backward
        return output_T
    #                                            UNARY transformation                                          -
    def __neg__(self)-> 'Tensor':      # TODO this may couse errors 
        return self * -1 

    def abs(self) -> 'Tensor':   
        output_T =  Tensor(np.abs(self.data), (self, ))
        def _backward():
            self.grad += Tensor(np.sign(self.data)) * output_T.grad

        output_T._backward = _backward
        return output_T 
    # TODO write T.grad more efficiently               
    def T(self) -> 'Tensor':
        output_T = Tensor(np.transpose(self.data), (self, ))

        def _backward():
            
            self.grad += Tensor(np.transpose(np.inner(output_T.grad.data, np.ones_like(self.data))))   #TODO find a nicer way to do this

        output_T._backward  = _backward
        return output_T
    
    def unsqueeze(self, axis) -> 'Tensor':
        return Tensor(np.expand_dims(self.data, axis = axis))
    
    #                                                DOT                                                     - 
    def dot(self, other) -> 'Tensor':
        other = other if isinstance(other , Tensor) else Tensor(other)

        output_T  = Tensor(np.dot(self.data, other.data), (self, other))

        def _backward():
            self.grad += Tensor(output_T.grad.data.dot(other.data.T))
            other.grad +=  Tensor(self.data.T.dot(output_T.grad.data))

        output_T._backward = _backward
        return output_T
    #                                               Activation functions                                      - 
    def ReLU(self):
        output_T = Tensor(np.maximum(0, self.data), (self, ))

        def _backward():
            self.grad += Tensor(output_T.data > 0) * output_T.grad

        output_T._backward = _backward
        return output_T
    
    def Sigmoid(self):

        exp = np.exp(-self.data)
        output_T = Tensor((1/(1 + exp)), (self, )) # der_sig (1/(1 + np.exp(-input))* 1- 1/(1 + np.exp(-input)))

        def _backwrad():
            self.grad += Tensor(output_T.data - output_T.data**2) * output_T.grad 

        output_T._backward = _backwrad
        return output_T
    
    def Tanh(self):
        output_T = Tensor(np.tanh(self.data), (self, ))

        def _backward():
            self.grad = Tensor(1- output_T.data**2) * output_T.grad
        
        output_T._backward = _backward
        return output_T
    
    def Softmax(self):      # https://stackoverflow.com/questions/42599498/numerically-stable-softmax

        z = self.data - max(self.data)
        o = np.exp(z)
        softmax = o / np.sum(o)

        output_T = Tensor(softmax)
        def _backward():
            self.grad 
        return output_T

    
    @classmethod
    def zeros(cls, shape)-> 'Tensor': return cls(np.zeros(shape))
        
    @classmethod
    def ones(cls, shape)-> 'Tensor': return cls(np.ones(shape))

    @classmethod
    def ones_like(cls, Tensor)-> 'Tensor' : return cls(np.ones(Tensor.shape))
       
    @classmethod
    def zeros_like(cls, Tensor)-> 'Tensor': return cls(np.zeros(Tensor.shape))

    RNG = np.random.default_rng() #https://numpy.org/doc/stable/reference/random/generator.html
    @classmethod
    def randn(cls, shape)-> 'Tensor': return cls(Tensor.RNG.standard_normal(size = shape))
        
    @classmethod
    def uniform(cls, shape)-> 'Tensor': return cls(Tensor.RNG.uniform(low = -1 , high =  1, size = shape))
       
    @classmethod
    def arange(cls, start, stop, step)-> 'Tensor': return cls(np.arange(start = start, stop = stop , step = step ))

    #-                                           ENGINE                                                  -
    def backward(self):
        
        topo = []
        visited = set()
    
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v.prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)
        
        self.grad = Tensor([1.0])

        for node in reversed(topo):
            node._backward()


In [3]:
t = Tensor([1,2,3,4])
p = t.Softmax()
print(p)

<Tensor data = [0.03206 0.08714 0.23688 0.64391]>


In [4]:
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
m = torch.nn.Softmax(dim=0)
input = torch.Tensor([1,2,3,4])
output = m(input)
L = output

tensor([0.0321, 0.0871, 0.2369, 0.6439])


In [42]:
t1 = Tensor([1,2,3,4]) * 0.2
t2 = Tensor([5,6,7,8]) * 0.1
t3 = Tensor([9,10,11,12]) * 0.01

first = t1 + t2
second = first * t3
third = second.Softmax()
L = third.sum()
print(third)
L.backward()

<Tensor data = [0.23482 0.24367 0.25437 0.26715]>


In [630]:

loss_fn = torch.nn.CrossEntropyLoss()

In [631]:
true_hat = torch.tensor([1.,0.,0.,0.])
#true_hat = torch.Tensor([1,0,0,0])

torch.manual_seed(100)
t1_ = torch.randn((4,4))                                        ;t1_.requires_grad = True 
t2_ = torch.randn((4,1))                                        ;t2_.requires_grad = True
t3_ = torch.randn((4,1))  * 0.1                                 ;t3_.requires_grad = True


first_ = t1_.matmul(t2_)                                        ; first_.retain_grad()
second_ = first_ + t3_                                          ; second_.retain_grad()
third_ =second_.softmax(dim = 0)                                ; third_.retain_grad()  
loss = loss_fn(second_.squeeze(), true_hat)                                 ; loss.retain_grad()
loss.backward()                                               

In [674]:
print(third_.grad)

tensor([[-5.3766],
        [-0.0000],
        [-0.0000],
        [-0.0000]])


In [655]:
print(t1_.grad)

tensor([[-1.4230,  0.2246,  0.7941, -0.3899],
        [ 0.8437, -0.1331, -0.4708,  0.2312],
        [ 0.5134, -0.0810, -0.2865,  0.1407],
        [ 0.0659, -0.0104, -0.0368,  0.0181]])


In [633]:
# der of softmax = si(1 - sj)

#n = np.size(self.out)
#ret = np.dot((np.identity(n) - self.out.T) * self.out, o_grad)
n = 4 #third_.size()
der_CLE = third_.squeeze() - true_hat
ret  = torch.matmul(torch.eye(4) - torch.transpose(third_, 0, 1) * third_, der_CLE)
print(der_CLE) # THIS IS IT
#print(ret)
# der of CEL is s - y  (s is the predictied ones)


tensor([-0.8140,  0.4826,  0.2937,  0.0377], grad_fn=<SubBackward0>)
tensor([-0.8455,  0.4010,  0.2440,  0.0313], grad_fn=<MvBackward0>)


In [634]:
print(t1_.grad)

tensor([[-1.4230,  0.2246,  0.7941, -0.3899],
        [ 0.8437, -0.1331, -0.4708,  0.2312],
        [ 0.5134, -0.0810, -0.2865,  0.1407],
        [ 0.0659, -0.0104, -0.0368,  0.0181]])


In [635]:
import math
loss_ = -torch.sum(true_hat * torch.log(third_.squeeze()))
print(loss, loss_)
 





tensor(1.6821, grad_fn=<DivBackward1>) tensor(1.6821, grad_fn=<NegBackward0>)


In [710]:
torch.manual_seed(100)
t1_1 = torch.randn((4,4))                                        ;t1_1.requires_grad = True 
t2_1 = torch.randn((4,1))                                        ;t2_1.requires_grad = True
t3_1 = torch.randn((4,1))  * 0.1                                 ;t3_1.requires_grad = True


first_1 = t1_1.matmul(t2_1)                                        ; first_1.retain_grad()
second_1 = first_1 + t3_1                                          ; second_1.retain_grad()
third_1 =second_1.softmax(dim = 0)                                ; third_1.retain_grad()                     ; loss.retain_grad()
loss_2 = -torch.sum(true_hat * torch.log(third_1.squeeze()))

loss_2.backward()                                                 ; loss_2.retain_grad()

In [757]:
print(loss_2.grad)
print(third_1.grad)
print(second_1.grad)
print(t3_1.grad)

None
tensor([[-5.3766],
        [-0.0000],
        [-0.0000],
        [-0.0000]])
tensor([[-0.8140],
        [ 0.4826],
        [ 0.2937],
        [ 0.0377]])
tensor([[-0.8140],
        [ 0.4826],
        [ 0.2937],
        [ 0.0377]])


In [804]:
der_third  =  (-1 / (third_1.squeeze()) * true_hat.squeeze())  *  1#(loss_grad is one) ## MINUS CUZ THE SUM IS MINUS
print(der_third)
der_second  = torch.matmul(torch.eye(4) - torch.transpose(third_, 0, 1) * third_, der_third) #torch.matmul(torch.eye(4) - torch.transpose(second_1, 0, 1) * second_1.squeeze(), der_third)
print(der_second)

tensor([-5.3766, -0.0000, -0.0000, -0.0000], grad_fn=<MulBackward0>)
tensor([-5.1906,  0.4826,  0.2937,  0.0377], grad_fn=<MvBackward0>)


In [811]:
third_1.squeeze() * (1- third_1.squeeze()) * der_third
print(torch.matmul(torch.eye(4) - torch.transpose(third_, 0 , 1) * third_, der_third))

tensor([-5.1906,  0.4826,  0.2937,  0.0377], grad_fn=<MvBackward0>)


In [780]:
print(t1_1.grad)

tensor([[-1.4230,  0.2246,  0.7941, -0.3899],
        [ 0.8437, -0.1331, -0.4708,  0.2312],
        [ 0.5134, -0.0810, -0.2865,  0.1407],
        [ 0.0659, -0.0104, -0.0368,  0.0181]])


In [748]:
t = torch.eye(4)
k = third_1.squeeze().dot((third_1).squeeze())
(t- k)* der_third

tensor([[-3.4669,  0.0000,  0.0000,  0.0000],
        [ 1.9097, -0.0000,  0.0000,  0.0000],
        [ 1.9097,  0.0000, -0.0000,  0.0000],
        [ 1.9097,  0.0000,  0.0000, -0.0000]], grad_fn=<MulBackward0>)