In [11]:
import numpy as np
from src.tensor import Tensor 
from src.module import Module
from src.functional import Linear, ReLU, Softmax

class MLP(Module):
    def __init__(self):
        self.fc1 = Linear(24, 12, bias=True)
        self.relu = ReLU()
        self.fc2 = Linear(12,6)
        self.softmax = Softmax()

    def forward(self, x):
        out = self.fc1(x)
        # out = self.relu(out)
        out = self.fc2(out)
        # out = self.softmax(out)
        out = out.sum()
        return out

network = MLP()
input = Tensor(np.random.randn(10,24, 1))
output = network(input)

output.backwards()

In [12]:
for name, param in network.parameters().items():
    print(name)
    print(param.shape)
    print(param.grad.shape)

fc1__weights
(12, 24)
(10, 12, 24)
fc1__bias
(12, 1)
(10, 12, 1)
fc2__weights
(6, 12)
(10, 6, 12)
fc2__bias
(6, 1)
(10, 6, 1)


In [19]:
zero_grad = np.zeros(param.shape)
grad = param.grad

In [25]:
zero_grad.shape

(6, 1)

In [2]:
point = output.b.a.a.b.a

print(point.shape, point.grad.shape)
print(type(point.a))
if point.a: print(point.a.shape, point.a.grad.shape)
print(type(point.b))
if point.b: print(point.b.shape, point.b.grad.shape)


(10, 12, 1) (10, 12, 1)
<class 'src.tensor.Tensor'>
(12, 24) (12, 24)
<class 'src.tensor.Tensor'>
(10, 24, 1) (10, 24, 1)


In [6]:
import pytest
import numpy as np
from src.tensor import Tensor
from src.module import Module
from src.functional import Linear, ReLU, Softmax
from tests.util import get_numerical_gradient

class MLP(Module):
    def __init__(self):
        self.fc1 = Linear(24, 12)
        self.relu = ReLU()
        self.fc2 = Linear(12,3)
        self.softmax = Softmax()

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        # out = self.softmax(out)
        out = out.sum()
        return out


network = MLP()
input_tensor = Tensor(np.random.randn(3,24,1))

output = network(input_tensor)
output.backwards()

parameters = network.parameters()

for name, parameter in parameters.items():
    numerical_gradient = get_numerical_gradient(network, input_tensor, parameter)
    analytical_gradient = parameter.grad


    print(
        'Parameter: {}\n'.format(name),
        'Numerical: {}\n'.format(numerical_gradient[0]),
        'Analytical: {}\n'.format(analytical_gradient[1]),
    )
    # assert np.allclose(analytical_gradient, numerical_gradient, rtol=1e-4)


Parameter: fc1__weights
 Numerical: [-0.31714717  0.67218665 -0.15955513  0.93238769 -0.81960036 -1.65244866
  0.15299202 -0.40875772 -2.86273907 -1.65214821 -0.6426733  -1.12724083
  0.14303217  2.01383753 -1.22888828  0.9516711  -2.69486153 -0.12370934
  0.11356418 -0.67005702 -2.07473908  2.08483003  1.26924049 -1.08578765]
 Analytical: [ -1.15204501   2.44173474  -0.57958799   3.38692149  -2.97721873
  -6.00256094   0.5557473   -1.4848226  -10.398971    -6.00146953
  -2.33452679  -4.0947304    0.51956793   7.31531501  -4.46396729
   3.45696898  -9.78915162  -0.44937726   0.4125247   -2.43399881
  -7.53654136   7.57319702   4.61054771  -3.94415068]

Parameter: fc1__bias
 Numerical: [1.87772326]
 Analytical: [[ 0.        ]
 [ 0.        ]
 [-0.66032247]
 [ 2.23605748]
 [ 0.        ]
 [ 0.16552855]
 [-3.12532834]
 [ 0.        ]
 [ 0.        ]
 [-3.07618306]
 [ 0.        ]
 [ 0.        ]]

Parameter: fc2__weights
 Numerical: [ 2.66418218  0.9730659   3.10905489  6.09332695  0.         1

In [10]:
parameter.grad.shape


(3, 3, 1)

In [11]:
parameter.shape

(3, 1)

In [11]:
point.a.value.shape

(12, 24)

In [11]:
point.b.value.shape

(10, 24, 1)

In [90]:
point = output.a

print(point.shape)
print(type(point).__name__)
print('a: ' + type(point.a).__name__)
print('b: ' + type(point.b).__name__)

# point.a.value.T @ point.grad
point.grad @ point.b.value.swapaxes(-2,-1)

(10, 1)
Dot
a: Tensor
b: Tensor


array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 

In [72]:
point.grad.shape

(10, 1)

In [73]:
point.b.value.T.shape

(1, 24)

In [68]:
point.grad.shape

(10, 10, 1)

In [69]:
point.b.value.T.shape

(1, 24, 10)

In [52]:
output.a.shape

(10, 10, 1)

In [74]:
output.a.a.grad.shape

(10, 24)

In [146]:
numerical_gradient

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 

In [125]:
output = network(input)
output.backwards()

In [126]:
np.isclose(network.fc1.weights.grad, numerical_gradient, atol=1e-1)

array([[False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False],
       [ True,  True,  True,  True,

In [117]:
(network.fc1.weights.grad - numerical_gradient)

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e