In [14]:
import numpy as np
from src.tensor import Tensor 
from src.module import Module
from src.functional import Linear, ReLU, Softmax

class MLP(Module):
    def __init__(self):
        self.fc1 = Linear(24, 12, bias=True)
        self.relu = ReLU()
        self.fc2 = Linear(12,6)
        self.softmax = Softmax()

    def forward(self, x):
        out = self.fc1(x)
        # out = self.relu(out)
        out = self.fc2(out)
        # out = self.softmax(out)
        out = out.sum()
        return out

network = MLP()
input = Tensor(np.random.randn(10,24, 1))
output = network(input)

output.backwards()

In [15]:
for name, param in network.parameters().items():
    print(name)
    print(param.shape)
    print(param.grad.shape)

fc1__weights
(12, 24)
(12, 24)
fc1__bias
(12, 1)
(12, 1)
fc2__weights
(6, 12)
(6, 12)
fc2__bias
(6, 1)
(6, 1)


In [19]:
zero_grad = np.zeros(param.shape)
grad = param.grad

In [25]:
zero_grad.shape

(6, 1)

In [2]:
point = output.b.a.a.b.a

print(point.shape, point.grad.shape)
print(type(point.a))
if point.a: print(point.a.shape, point.a.grad.shape)
print(type(point.b))
if point.b: print(point.b.shape, point.b.grad.shape)


(10, 12, 1) (10, 12, 1)
<class 'src.tensor.Tensor'>
(12, 24) (12, 24)
<class 'src.tensor.Tensor'>
(10, 24, 1) (10, 24, 1)


In [1]:
import pytest
import numpy as np
from src.tensor import Tensor
from src.module import Module
from src.functional import Linear, ReLU, Softmax
from tests.util import get_numerical_gradient

class MLP(Module):
    def __init__(self):
        self.fc1 = Linear(24, 12)
        self.relu = ReLU()
        self.fc2 = Linear(12,3)
        self.softmax = Softmax()

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        # out = self.softmax(out)
        out = out.sum()
        return out


network = MLP()
input_tensor = Tensor(np.random.randn(3,24,1))

output = network(input_tensor)
output.backwards()

parameters = network.parameters()

for name, parameter in parameters.items():
    numerical_gradient = get_numerical_gradient(network, input_tensor, parameter)
    analytical_gradient = parameter.grad


    print(
        'Parameter: {}\n'.format(name),
        'Numerical: {}\n'.format(numerical_gradient[0]),
        'Analytical: {}\n'.format(analytical_gradient[1]),
    )
    # assert np.allclose(analytical_gradient, numerical_gradient, rtol=1e-4)


Parameter: fc1__weights
 Numerical: [ 1.12245707 -0.94245557 -1.06058313 -0.52065496  0.53291607 -3.74327464
  2.10633105  0.89070619 -1.8108125  -0.48625615  1.09073584 -2.70428622
  0.19360515  1.23849575 -0.87455583 -1.30768025  0.04221979 -0.73104656
 -0.17754166  0.95969114  1.49005514  0.94041646 -2.51642308  1.13239156]
 Analytical: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]

Parameter: fc1__bias
 Numerical: [-1.28946115]
 Analytical: [0.]

Parameter: fc2__weights
 Numerical: [0.60421479 0.         1.77703772 0.         5.23313441 1.92193524
 0.         9.5745611  4.43730367 0.         2.18493934 7.19739712]
 Analytical: [0.60421478 0.         1.77703772 0.         5.23313441 1.92193524
 0.         9.5745611  4.43730367 0.         2.18493934 7.19739713]

Parameter: fc2__bias
 Numerical: [3.]
 Analytical: [3.]



In [22]:
point = output.a.a.b.a.b
print(type(point))
point.grad

<class 'src.tensor.Tensor'>


array([[[ 13.11325525],
        [ -2.1701625 ],
        [ -1.60704709],
        [  5.91985131],
        [  1.55876407],
        [ -7.76177209],
        [ 12.03278283],
        [ -3.94733159],
        [  7.01277285],
        [ -5.2045574 ],
        [  1.51618728],
        [ -3.63402472],
        [  6.1264434 ],
        [ 12.33180724],
        [  2.78603374],
        [ -5.15317112],
        [ -0.63593963],
        [-10.75349647],
        [  7.13087402],
        [ -4.59450731],
        [  2.99646092],
        [ -1.18050586],
        [ -6.23091185],
        [ -0.72408347]],

       [[ 13.11325525],
        [ -2.1701625 ],
        [ -1.60704709],
        [  5.91985131],
        [  1.55876407],
        [ -7.76177209],
        [ 12.03278283],
        [ -3.94733159],
        [  7.01277285],
        [ -5.2045574 ],
        [  1.51618728],
        [ -3.63402472],
        [  6.1264434 ],
        [ 12.33180724],
        [  2.78603374],
        [ -5.15317112],
        [ -0.63593963],
        [-10.7

In [11]:
parameter.shape

(3, 1)

In [11]:
point.a.value.shape

(12, 24)

In [11]:
point.b.value.shape

(10, 24, 1)

In [90]:
point = output.a

print(point.shape)
print(type(point).__name__)
print('a: ' + type(point.a).__name__)
print('b: ' + type(point.b).__name__)

# point.a.value.T @ point.grad
point.grad @ point.b.value.swapaxes(-2,-1)

(10, 1)
Dot
a: Tensor
b: Tensor


array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 

In [72]:
point.grad.shape

(10, 1)

In [73]:
point.b.value.T.shape

(1, 24)

In [68]:
point.grad.shape

(10, 10, 1)

In [69]:
point.b.value.T.shape

(1, 24, 10)

In [52]:
output.a.shape

(10, 10, 1)

In [74]:
output.a.a.grad.shape

(10, 24)

In [146]:
numerical_gradient

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 

In [125]:
output = network(input)
output.backwards()

In [126]:
np.isclose(network.fc1.weights.grad, numerical_gradient, atol=1e-1)

array([[False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False],
       [ True,  True,  True,  True,

In [117]:
(network.fc1.weights.grad - numerical_gradient)

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e