In [1]:
%config IPCompleter.greedy=True

import numpy as np
import matplotlib.pyplot as plt
import sklearn.preprocessing, sklearn.datasets, sklearn.model_selection

import os
# Hide every CUDA device from this process. Presumably this is so that the
# TensorFlow / PyTorch cross-checks further down run on the CPU — TODO confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [2]:
class CategoricalCrossEntropyLoss:
    """Per-example cross-entropy for integer (sparse) class labels.

    ``target`` is a 1-D sequence of class indices, one per example, and
    ``predicted`` is an ``(examples, classes)`` array of class probabilities.
    """

    # Lower bound applied to the picked probability so that neither log(0)
    # in the loss nor 1/0 in the gradient can occur.
    _EPS = 1e-15

    def __call__(self, target, predicted):
        """Return ``-log(p[target])`` for every example, shape ``(examples,)``."""
        predicted = np.asarray(predicted)
        indices = np.arange(len(target))
        return -np.log(np.maximum(predicted[indices, target], self._EPS))

    def gradient(self, target, predicted):
        """Return d(loss)/d(predicted), same shape as ``predicted``.

        Only the entry of the target class is non-zero (``-1 / p[target]``).
        The number of classes is taken from ``predicted`` rather than being
        fixed, and the denominator is clipped with the same epsilon as the
        forward pass so the result stays finite when a probability is zero.
        """
        predicted = np.asarray(predicted)
        grad = np.zeros_like(predicted, dtype=float)
        indices = np.arange(len(target))
        grad[indices, target] = -1 / np.maximum(predicted[indices, target], self._EPS)
        return grad

In [3]:
class SoftmaxLayer:
    """Row-wise softmax activation without trainable parameters."""

    def __init__(self):
        # Kept for interface parity with layers that do have parameters.
        self.params = []
        self.grads = []

    def __call__(self, inputs):
        """Return softmax over the last axis of ``inputs``.

        The maximum is subtracted per row (not globally): the result is
        mathematically unchanged, but every row then contains a zero, so the
        denominator is at least 1 and a row cannot underflow to 0/0.
        """
        inputs = np.asarray(inputs)
        shifted = inputs - np.max(inputs, axis=-1, keepdims=True)
        exps = np.exp(shifted)
        return exps / np.sum(exps, axis=-1, keepdims=True)

    def gradient(self, inputs, gradients):
        """Back-propagate ``gradients`` (d loss / d outputs) to the inputs.

        With Jacobian ``J[i, j] = o_i * (delta_ij - o_j)`` the product
        ``J @ g`` reduces to ``o * (g - sum(g * o))``, so the full
        (examples, classes, classes) Jacobian never has to be materialised.
        Returns an array shaped like ``inputs``.
        """
        outputs = self(inputs)  # examples, classes
        weighted = np.sum(gradients * outputs, axis=-1, keepdims=True)  # examples, 1
        return outputs * (gradients - weighted)  # examples, classes

In [4]:
class SigmoidLayer:
    """Element-wise logistic sigmoid activation without trainable parameters."""

    def __init__(self):
        # Kept for interface parity with layers that do have parameters.
        self.params = []
        self.grads = []

    def __call__(self, inputs):
        """Return ``1 / (1 + exp(-inputs))`` element-wise.

        Only ``exp(-|x|)`` is evaluated, which lies in (0, 1], so large
        negative inputs no longer overflow ``exp``. For ``x >= 0`` this is the
        textbook formula; for ``x < 0`` the algebraically equal form
        ``exp(x) / (1 + exp(x))`` is used.
        """
        inputs = np.asarray(inputs)
        decay = np.exp(-np.abs(inputs))
        return np.where(inputs >= 0, 1 / (1 + decay), decay / (1 + decay))

    def gradient(self, inputs, gradients):
        """Back-propagate ``gradients`` (d loss / d outputs) to the inputs.

        Uses the identity ``sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x))``.
        """
        outputs = self(inputs)
        my_gradient = outputs * (1 - outputs)
        return my_gradient * gradients

In [5]:
class DenseLayer:
    """Fully connected layer computing ``inputs @ W + b``.

    Parameters
    ----------
    inputs : int
        Number of input features.
    outputs : int
        Number of output features.
    random_seed : int or None
        Seed for the layer's private ``RandomState``; weights and biases are
        drawn uniformly from [-2, 2).

    Attributes
    ----------
    params : list
        ``[W, b]``.
    grads : list
        ``[dW, db]`` accumulators. ``gradient`` adds to them in place and
        never resets them; zeroing them between steps is up to the caller.
    """

    def __init__(self,inputs, outputs, random_seed=None):
        self._random_state = np.random.RandomState(random_seed)
        self._W = self._random_state.uniform(-2,2,size=(inputs, outputs))
        self._b = self._random_state.uniform(-2,2,size=(outputs,))
        self.params = [self._W, self._b]
        self.grads = [np.zeros_like(self._W), np.zeros_like(self._b)]

    def __call__(self, inputs):
        """Return the affine map of ``inputs`` (examples, inputs) -> (examples, outputs)."""
        return inputs @ self._W + self._b[np.newaxis,:]

    def gradient(self, inputs, gradients):
        """Accumulate parameter gradients and return the gradient w.r.t. ``inputs``.

        ``gradients`` is d loss / d outputs with shape (examples, outputs).
        All three results are plain matrix products, so no
        (examples, inputs, outputs) intermediate is needed.
        """
        # In-place adds keep the array objects stored in ``self.grads`` stable.
        # d loss / d W, summed over examples -> (inputs, outputs)
        self.grads[0] += inputs.T @ gradients
        # d loss / d b, summed over examples -> (outputs,)
        self.grads[1] += np.sum(gradients, axis=0)
        # d loss / d inputs -> (examples, inputs)
        return gradients @ self._W.T

In [6]:
# Draw the toy batch from a dedicated, seeded generator so that
# Restart Kernel -> Run All reproduces the same labels and features.
# (The outputs stored in this notebook came from an unseeded run.)
data_rng = np.random.RandomState(0)
target = data_rng.randint(0,10,size=(3,), dtype=int)  # 3 class labels in [0, 10)
vals = data_rng.uniform(size=(3,7))  # 3 examples x 7 features in [0, 1)

In [7]:
# Network under test: 7 -> 13 (sigmoid) -> 10 (softmax), cross-entropy loss.
dense1, dense2 = DenseLayer(7, 13, 42), DenseLayer(13, 10, 42)
sig, soft = SigmoidLayer(), SoftmaxLayer()
loss = CategoricalCrossEntropyLoss()

# Forward pass; every intermediate is kept for the comparisons below.
d1 = dense1(inputs=vals)
a1 = sig(inputs=d1)
d2 = dense2(inputs=a1)
a2 = soft(inputs=d2)
l = loss(target=target, predicted=a2)

# Backward pass: each `X_grad` is what layer X hands to the layer before it,
# i.e. the gradient of the loss with respect to X's own input.
l_grad = loss.gradient(target=target, predicted=a2)
a2_grad = soft.gradient(inputs=d2, gradients=l_grad)
d2_grad = dense2.gradient(inputs=a1, gradients=a2_grad)
a1_grad = sig.gradient(inputs=d1, gradients=d2_grad)
d1_grad = dense1.gradient(inputs=vals, gradients=a1_grad)

In [8]:
# Reference implementation in TensorFlow, used to cross-check the NumPy layers.
import tensorflow as tf

# Reuse the NumPy layers' weights/biases and the same batch so both
# implementations evaluate the same function on the same data.
w1_tf = tf.Variable(dense1._W)
b1_tf = tf.Variable(dense1._b)
w2_tf = tf.Variable(dense2._W)
b2_tf = tf.Variable(dense2._b)
target_tf = tf.Variable(target)
vals_tf = tf.Variable(vals)

# The forward pass must run inside the tape so the intermediates (d1_tf, a1_tf,
# d2_tf, a2_tf) are recorded and can be used as gradient sources below.
with tf.GradientTape() as tape:
    d1_tf = vals_tf @ w1_tf + b1_tf
    a1_tf = tf.math.sigmoid(d1_tf)
    d2_tf = a1_tf @ w2_tf + b2_tf
    a2_tf = tf.nn.softmax(d2_tf)
    l_tf = tf.keras.losses.sparse_categorical_crossentropy(target_tf, a2_tf)

# One gradient per listed source, in the same order as the names on the left.
# Naming mirrors the NumPy side: `X_grad_tf` is the gradient w.r.t. the *input*
# of stage X (l_grad_tf <- a2_tf, a2_grad_tf <- d2_tf, d2_grad_tf <- a1_tf,
# a1_grad_tf <- d1_tf, d1_grad_tf <- vals_tf), followed by the parameter gradients.
# NOTE(review): l_tf is a per-example vector; tape.gradient is expected to
# differentiate the sum of its elements — confirm against the TF docs.
l_grad_tf, a2_grad_tf, d2_grad_tf, a1_grad_tf, d1_grad_tf, w1_grad_tf, b1_grad_tf, w2_grad_tf, b2_grad_tf = tape.gradient(l_tf, [a2_tf, d2_tf, a1_tf, d1_tf, vals_tf, w1_tf, b1_tf, w2_tf, b2_tf])

In [9]:
# Reference implementation in PyTorch, used to cross-check the NumPy layers.
import torch

# Reuse the NumPy layers' weights/biases and the same batch so both
# implementations evaluate the same function on the same data.
w1_t = torch.tensor(dense1._W, requires_grad=True)
b1_t = torch.tensor(dense1._b, requires_grad=True)
w2_t = torch.tensor(dense2._W, requires_grad=True)
b2_t = torch.tensor(dense2._b, requires_grad=True)
target_t = torch.tensor(target, dtype=torch.long)
vals_t = torch.tensor(vals, requires_grad=True)

# Forward pass. retain_grad() is called on every intermediate (non-leaf) tensor
# so that its .grad is still available after backward() for the comparisons.
d1_t = vals_t @ w1_t + b1_t
d1_t.retain_grad()
a1_t = torch.sigmoid(d1_t)
a1_t.retain_grad()
d2_t = a1_t @ w2_t + b2_t
d2_t.retain_grad()
a2_t = torch.nn.functional.softmax(d2_t, dim=1)
a2_t.retain_grad()
# Negative log-likelihood of the softmax probabilities, one value per example
# (reduction='none'), matching the per-example loss of the NumPy version.
l_t = torch.nn.functional.nll_loss(torch.log(a2_t), target_t, reduction='none')
# l_t is not a scalar, so an upstream gradient of ones (one per example) is
# supplied explicitly.
l_t.backward(torch.ones(l_t.size()))

In [10]:
# Forward check: first dense layer output (NumPy, TensorFlow, PyTorch).
print("d1", d1, d1_tf, d1_t, sep="\n")

# Element-wise agreement within an absolute tolerance of 1e-12.
print(
    np.abs(d1 - d1_t.detach().numpy()) < 1e-12,
    np.abs(d1 - d1_tf.numpy()) < 1e-12,
    sep="\n",
)

d1
[[ 0.6932029   0.12440943  1.6350084  -0.4385792   0.29678732 -2.53289151
  -3.62099081 -2.76084936 -0.72306391 -1.33772196  0.21228408  1.0553516
   0.10723124]
 [-0.19377441 -0.53171348  1.24997532 -0.32948138 -0.59951081 -4.39256857
  -4.24204798 -1.88886257 -1.25979112 -1.69652151 -0.02766604  0.49136799
   0.97723431]
 [-0.69778271 -1.45381162  1.64622787 -1.85416627  0.82989938 -3.49434065
  -4.71753211 -2.2721133  -0.24414888 -0.92795472  0.08874479  0.481691
   0.41429437]]
tf.Tensor(
[[ 0.6932029   0.12440943  1.6350084  -0.4385792   0.29678732 -2.53289151
  -3.62099081 -2.76084936 -0.72306391 -1.33772196  0.21228408  1.0553516
   0.10723124]
 [-0.19377441 -0.53171348  1.24997532 -0.32948138 -0.59951081 -4.39256857
  -4.24204798 -1.88886257 -1.25979112 -1.69652151 -0.02766604  0.49136799
   0.97723431]
 [-0.69778271 -1.45381162  1.64622787 -1.85416627  0.82989938 -3.49434065
  -4.71753211 -2.2721133  -0.24414888 -0.92795472  0.08874479  0.481691
   0.41429437]], shape=(3, 1

In [11]:
# Forward check: sigmoid activation (NumPy, TensorFlow, PyTorch).
print("a1", a1, a1_tf, a1_t, sep="\n")

# Element-wise agreement within an absolute tolerance of 1e-12.
print(
    np.abs(a1 - a1_t.detach().numpy()) < 1e-12,
    np.abs(a1 - a1_tf.numpy()) < 1e-12,
    sep="\n",
)

a1
[[0.66667905 0.5310623  0.83685458 0.39207957 0.57365696 0.07358429
  0.02605892 0.05947684 0.32671865 0.20788493 0.55287261 0.74180123
  0.52678215]
 [0.45170741 0.37011734 0.77729559 0.41836682 0.35445562 0.0122178
  0.01417432 0.13137421 0.22100985 0.15492012 0.49308393 0.62042864
  0.7265591 ]
 [0.33230401 0.18941565 0.83838058 0.13538447 0.69633365 0.02947369
  0.00885804 0.09345901 0.43926418 0.28333984 0.52217165 0.6181471
  0.60211714]]
tf.Tensor(
[[0.66667905 0.5310623  0.83685458 0.39207957 0.57365696 0.07358429
  0.02605892 0.05947684 0.32671865 0.20788493 0.55287261 0.74180123
  0.52678215]
 [0.45170741 0.37011734 0.77729559 0.41836682 0.35445562 0.0122178
  0.01417432 0.13137421 0.22100985 0.15492012 0.49308393 0.62042864
  0.7265591 ]
 [0.33230401 0.18941565 0.83838058 0.13538447 0.69633365 0.02947369
  0.00885804 0.09345901 0.43926418 0.28333984 0.52217165 0.6181471
  0.60211714]], shape=(3, 13), dtype=float64)
tensor([[0.6667, 0.5311, 0.8369, 0.3921, 0.5737, 0.0736, 

In [12]:
# Forward check: second dense layer output (NumPy, TensorFlow, PyTorch).
print("d2", d2, d2_tf, d2_t, sep="\n")

# Element-wise agreement within an absolute tolerance of 1e-12.
print(
    np.abs(d2 - d2_t.detach().numpy()) < 1e-12,
    np.abs(d2 - d2_tf.numpy()) < 1e-12,
    sep="\n",
)

d2
[[-2.67020338 -0.19100413 -2.11920083 -0.03316696  0.96553367 -0.14223227
  -1.66501519  1.65931148 -0.28927579 -1.58724065]
 [-1.58243255 -0.61787009 -2.67978935 -0.50374138  1.42582527  0.03625567
  -0.72057143  1.53947271 -0.86776417 -1.15894359]
 [-1.87781046 -0.78600125 -3.07633088 -0.49874809  1.08827325  0.21363996
  -0.49899418  1.80610943 -0.68527891 -1.82621801]]
tf.Tensor(
[[-2.67020338 -0.19100413 -2.11920083 -0.03316696  0.96553367 -0.14223227
  -1.66501519  1.65931148 -0.28927579 -1.58724065]
 [-1.58243255 -0.61787009 -2.67978935 -0.50374138  1.42582527  0.03625567
  -0.72057143  1.53947271 -0.86776417 -1.15894359]
 [-1.87781046 -0.78600125 -3.07633088 -0.49874809  1.08827325  0.21363996
  -0.49899418  1.80610943 -0.68527891 -1.82621801]], shape=(3, 10), dtype=float64)
tensor([[-2.6702, -0.1910, -2.1192, -0.0332,  0.9655, -0.1422, -1.6650,  1.6593,
         -0.2893, -1.5872],
        [-1.5824, -0.6179, -2.6798, -0.5037,  1.4258,  0.0363, -0.7206,  1.5395,
         -0.8

In [13]:
# Forward check: softmax probabilities (NumPy, TensorFlow, PyTorch).
print("a2", a2, a2_tf, a2_t, sep="\n")

# Element-wise agreement within an absolute tolerance of 1e-12.
print(
    np.abs(a2 - a2_t.detach().numpy()) < 1e-12,
    np.abs(a2 - a2_tf.numpy()) < 1e-12,
    sep="\n",
)

a2
[[0.00583075 0.0695708  0.0101163  0.0814657  0.22115918 0.073048
  0.01593207 0.4425974  0.06305916 0.01722064]
 [0.01644069 0.04313445 0.00548712 0.04834925 0.3329583  0.08296739
  0.03892437 0.37303218 0.0335967  0.02510955]
 [0.01192101 0.03552055 0.00359585 0.04734042 0.2314551  0.09652023
  0.04732877 0.47448124 0.03928464 0.01255218]]
tf.Tensor(
[[0.00583075 0.0695708  0.0101163  0.0814657  0.22115918 0.073048
  0.01593207 0.4425974  0.06305916 0.01722064]
 [0.01644069 0.04313445 0.00548712 0.04834925 0.3329583  0.08296739
  0.03892437 0.37303218 0.0335967  0.02510955]
 [0.01192101 0.03552055 0.00359585 0.04734042 0.2314551  0.09652023
  0.04732877 0.47448124 0.03928464 0.01255218]], shape=(3, 10), dtype=float64)
tensor([[0.0058, 0.0696, 0.0101, 0.0815, 0.2212, 0.0730, 0.0159, 0.4426, 0.0631,
         0.0172],
        [0.0164, 0.0431, 0.0055, 0.0483, 0.3330, 0.0830, 0.0389, 0.3730, 0.0336,
         0.0251],
        [0.0119, 0.0355, 0.0036, 0.0473, 0.2315, 0.0965, 0.0473, 0.47

In [14]:
# Forward check: per-example loss (NumPy, TensorFlow, PyTorch).
print("l", l, l_tf, l_t, sep="\n")

# Element-wise agreement within an absolute tolerance of 1e-12.
print(
    np.abs(l - l_t.detach().numpy()) < 1e-12,
    np.abs(l - l_tf.numpy()) < 1e-12,
    sep="\n",
)

l
[2.61663848 0.98609058 5.62797351]
tf.Tensor([2.61663848 0.98609058 5.62797351], shape=(3,), dtype=float64)
tensor([2.6166, 0.9861, 5.6280], dtype=torch.float64,
       grad_fn=<NllLossBackward>)
[ True  True  True]
[ True  True  True]


In [24]:
# Backward check: gradient of the loss w.r.t. the softmax output a2.
# On the PyTorch side that gradient is stored on a2_t.
print("Loss grad", l_grad, l_grad_tf, a2_t.grad, sep="\n")

# PyTorch is compared directly. The TensorFlow gradient is compared after
# adding 1 to the NumPy one: every TF entry is larger by exactly 1 (see the
# printed values). NOTE(review): presumably Keras renormalises the
# probabilities inside sparse_categorical_crossentropy, which adds a constant
# 1 / sum(p) = 1 term to the gradient — confirm.
print(
    np.abs(l_grad - a2_t.grad.detach().numpy()) < 1e-12,
    np.abs(l_grad + 1 - l_grad_tf.numpy()) < 1e-12,
    sep="\n",
)

Loss grad
[[   0.            0.            0.            0.            0.
   -13.6896282     0.            0.            0.            0.        ]
 [   0.            0.            0.            0.            0.
     0.            0.           -2.68073384    0.            0.        ]
 [   0.            0.         -278.0979848     0.            0.
     0.            0.            0.            0.            0.        ]]
tf.Tensor(
[[   1.            1.            1.            1.            1.
   -12.6896282     1.            1.            1.            1.        ]
 [   1.            1.            1.            1.            1.
     1.            1.           -1.68073384    1.            1.        ]
 [   1.            1.         -277.0979848     1.            1.
     1.            1.            1.            1.            1.        ]], shape=(3, 10), dtype=float64)
tensor([[   0.0000,    0.0000,    0.0000,    0.0000,    0.0000,  -13.6896,
            0.0000,    0.0000,    0.0000,    0.00

In [16]:
# Backward check: gradient leaving the softmax layer, i.e. w.r.t. d2.
# On the PyTorch side that gradient is stored on d2_t.
print("Softmax grad", a2_grad, a2_grad_tf, d2_t.grad, sep="\n")

# Element-wise agreement within an absolute tolerance of 1e-12.
print(
    np.abs(a2_grad - d2_t.grad.detach().numpy()) < 1e-12,
    np.abs(a2_grad - a2_grad_tf.numpy()) < 1e-12,
    sep="\n",
)

Softmax grad
[[ 0.00583075  0.0695708   0.0101163   0.0814657   0.22115918 -0.926952
   0.01593207  0.4425974   0.06305916  0.01722064]
 [ 0.01644069  0.04313445  0.00548712  0.04834925  0.3329583   0.08296739
   0.03892437 -0.62696782  0.0335967   0.02510955]
 [ 0.01192101  0.03552055 -0.99640415  0.04734042  0.2314551   0.09652023
   0.04732877  0.47448124  0.03928464  0.01255218]]
tf.Tensor(
[[ 0.00583075  0.0695708   0.0101163   0.0814657   0.22115918 -0.926952
   0.01593207  0.4425974   0.06305916  0.01722064]
 [ 0.01644069  0.04313445  0.00548712  0.04834925  0.3329583   0.08296739
   0.03892437 -0.62696782  0.0335967   0.02510955]
 [ 0.01192101  0.03552055 -0.99640415  0.04734042  0.2314551   0.09652023
   0.04732877  0.47448124  0.03928464  0.01255218]], shape=(3, 10), dtype=float64)
tensor([[ 0.0058,  0.0696,  0.0101,  0.0815,  0.2212, -0.9270,  0.0159,  0.4426,
          0.0631,  0.0172],
        [ 0.0164,  0.0431,  0.0055,  0.0483,  0.3330,  0.0830,  0.0389, -0.6270,
       

In [17]:
# Backward check: gradient leaving the second dense layer, i.e. w.r.t. a1.
# On the PyTorch side that gradient is stored on a1_t.
print("Dense2 grad", d2_grad, d2_grad_tf, a1_t.grad, sep="\n")

# Element-wise agreement within an absolute tolerance of 1e-12.
print(
    np.abs(d2_grad - a1_t.grad.detach().numpy()) < 1e-12,
    np.abs(d2_grad - d2_grad_tf.numpy()) < 1e-12,
    sep="\n",
)

Dense2 grad
[[ 1.795194    0.93151355 -1.24778256 -1.37444831 -0.69800677 -1.93485727
  -0.01335748 -1.45980517  0.72861035  0.07015516  1.66160574 -1.67449081
   0.54782942]
 [-1.43124487 -0.65029031 -0.16471132  1.74729687 -0.32909427  1.0544566
  -1.16690022  1.42045241 -0.61270384  0.47027209 -0.28386317  1.18771925
  -1.29771763]
 [-0.6615301  -1.76361014  0.78857692  1.55238034  1.74792935 -2.09704912
  -1.09831361  1.48203526  0.74766589 -0.99463539  1.42330229 -1.91921736
   1.11034652]]
tf.Tensor(
[[ 1.795194    0.93151355 -1.24778256 -1.37444831 -0.69800677 -1.93485727
  -0.01335748 -1.45980517  0.72861035  0.07015516  1.66160574 -1.67449081
   0.54782942]
 [-1.43124487 -0.65029031 -0.16471132  1.74729687 -0.32909427  1.0544566
  -1.16690022  1.42045241 -0.61270384  0.47027209 -0.28386317  1.18771925
  -1.29771763]
 [-0.6615301  -1.76361014  0.78857692  1.55238034  1.74792935 -2.09704912
  -1.09831361  1.48203526  0.74766589 -0.99463539  1.42330229 -1.91921736
   1.11034652]]

In [18]:
# Backward check: accumulated weight gradient of the second dense layer.
print("W2 grad", dense2.grads[0], w2_grad_tf, w2_t.grad, sep="\n")

# Element-wise agreement within an absolute tolerance of 1e-12.
print(
    np.abs(dense2.grads[0] - w2_t.grad.detach().numpy()) < 1e-12,
    np.abs(dense2.grads[0] - w2_grad_tf.numpy()) < 1e-12,
    sep="\n",
)

W2 grad
[[ 1.52750198e-02  7.76691649e-02 -3.21886198e-01  9.18826025e-02
   3.74755385e-01 -5.48428431e-01  4.39315441e-02  1.69536421e-01
   7.02705416e-02  2.69939495e-02]
 [ 1.14395019e-02  5.96393834e-02 -1.81331273e-01  7.01252747e-02
   2.84524163e-01 -4.43279152e-01  3.18323153e-02  9.28693077e-02
   5.33641881e-02  2.08162915e-02]
 [ 2.76531081e-02  1.21528597e-01 -8.22634905e-01  1.45445894e-01
   6.37932557e-01 -6.30313156e-01  8.32680913e-02  2.80846200e-01
   1.11821394e-01  4.44522202e-02]
 [ 1.07782770e-02  5.01322410e-02 -1.28635625e-01  5.85779157e-02
   2.57346128e-01 -3.15660797e-01  2.89388843e-02 -2.45317404e-02
   4.40984818e-02  1.89562342e-02]
 [ 1.74733446e-02  7.99331747e-02 -6.86081513e-01  9.68357596e-02
   4.06058421e-01 -4.35133927e-01  5.58931221e-02  3.62064067e-01
   7.54380834e-02  2.75194678e-02]
 [ 9.81276683e-04  6.69324736e-03 -2.85562615e-02  7.98061398e-03
   2.71636935e-02 -6.43506192e-02  3.04287336e-03  3.88927608e-02
   6.20850421e-03  1.9439

In [19]:
# Backward check: accumulated bias gradient of the second dense layer.
print("b2 grad", dense2.grads[1], b2_grad_tf, b2_t.grad, sep="\n")

# Element-wise agreement within an absolute tolerance of 1e-12.
print(
    np.abs(dense2.grads[1] - b2_t.grad.detach().numpy()) < 1e-12,
    np.abs(dense2.grads[1] - b2_grad_tf.numpy()) < 1e-12,
    sep="\n",
)

b2 grad
[ 0.03419245  0.1482258  -0.98080073  0.17715537  0.78557258 -0.74746438
  0.10218521  0.29011082  0.1359405   0.05488237]
tf.Tensor(
[ 0.03419245  0.1482258  -0.98080073  0.17715537  0.78557258 -0.74746438
  0.10218521  0.29011082  0.1359405   0.05488237], shape=(10,), dtype=float64)
tensor([ 0.0342,  0.1482, -0.9808,  0.1772,  0.7856, -0.7475,  0.1022,  0.2901,
         0.1359,  0.0549], dtype=torch.float64)
[ True  True  True  True  True  True  True  True  True  True]
[ True  True  True  True  True  True  True  True  True  True]


In [20]:
# Backward check: gradient leaving the sigmoid layer, i.e. w.r.t. d1.
# On the PyTorch side that gradient is stored on d1_t.
print("a1 grad", a1_grad, a1_grad_tf, d1_t.grad, sep="\n")

# Element-wise agreement within an absolute tolerance of 1e-12.
print(
    np.abs(a1_grad - d1_t.grad.detach().numpy()) < 1e-12,
    np.abs(a1_grad - a1_grad_tf.numpy()) < 1e-12,
    sep="\n",
)

a1 grad
[[ 3.98924590e-01  2.31979600e-01 -1.70358492e-01 -3.27604128e-01
  -1.70714762e-01 -1.31898528e-01 -3.39010854e-04 -8.16605402e-02
   1.60275022e-01  1.15523655e-02  4.10756395e-01 -3.20718852e-01
   1.36564406e-01]
 [-3.54473304e-01 -1.51602500e-01 -2.85127074e-02  4.25180273e-01
  -7.53023082e-02  1.27257332e-02 -1.63055686e-02  1.62094968e-01
  -1.05485849e-01  6.15679652e-02 -7.09522149e-02  2.79704251e-01
  -2.57818828e-01]
 [-1.46779012e-01 -2.70780044e-01  1.06851053e-01  1.81714681e-01
   3.69605073e-01 -5.99860644e-02 -9.64272822e-03  1.25564583e-01
   1.84158452e-01 -2.01969047e-01  3.55125904e-01 -4.53014489e-01
   2.66008036e-01]]
tf.Tensor(
[[ 3.98924590e-01  2.31979600e-01 -1.70358492e-01 -3.27604128e-01
  -1.70714762e-01 -1.31898528e-01 -3.39010854e-04 -8.16605402e-02
   1.60275022e-01  1.15523655e-02  4.10756395e-01 -3.20718852e-01
   1.36564406e-01]
 [-3.54473304e-01 -1.51602500e-01 -2.85127074e-02  4.25180273e-01
  -7.53023082e-02  1.27257332e-02 -1.63055686e

In [21]:
# Backward check: gradient leaving the first dense layer, i.e. w.r.t. the
# network input. On the PyTorch side that gradient is stored on vals_t.
print("d1 grad", d1_grad, d1_grad_tf, vals_t.grad, sep="\n")

# Element-wise agreement within an absolute tolerance of 1e-12.
print(
    np.abs(d1_grad - vals_t.grad.detach().numpy()) < 1e-12,
    np.abs(d1_grad - d1_grad_tf.numpy()) < 1e-12,
    sep="\n",
)

d1 grad
[[-0.90645419 -0.54740991  0.62821749 -0.87863711  1.46949315 -0.22028005
   0.02940728]
 [ 0.72578925  0.17482157 -0.7996455  -0.4114022  -0.1683685  -0.43896257
   0.71792223]
 [-1.74109686  0.448638    1.09240371 -0.36257421 -0.34664971  0.04732065
   0.77180218]]
tf.Tensor(
[[-0.90645419 -0.54740991  0.62821749 -0.87863711  1.46949315 -0.22028005
   0.02940728]
 [ 0.72578925  0.17482157 -0.7996455  -0.4114022  -0.1683685  -0.43896257
   0.71792223]
 [-1.74109686  0.448638    1.09240371 -0.36257421 -0.34664971  0.04732065
   0.77180218]], shape=(3, 7), dtype=float64)
tensor([[-0.9065, -0.5474,  0.6282, -0.8786,  1.4695, -0.2203,  0.0294],
        [ 0.7258,  0.1748, -0.7996, -0.4114, -0.1684, -0.4390,  0.7179],
        [-1.7411,  0.4486,  1.0924, -0.3626, -0.3466,  0.0473,  0.7718]],
       dtype=torch.float64)
[[ True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True]]
[[ True  True  True  True  True

In [22]:
# Backward check: accumulated weight gradient of the first dense layer.
print("W1 grad", dense1.grads[0], w1_grad_tf, w1_t.grad, sep="\n")

# Element-wise agreement within an absolute tolerance of 1e-12.
print(
    np.abs(dense1.grads[0] - w1_t.grad.detach().numpy()) < 1e-12,
    np.abs(dense1.grads[0] - w1_grad_tf.numpy()) < 1e-12,
    sep="\n",
)

W1 grad
[[-4.69966628e-02 -7.93710735e-02 -4.96189443e-02  1.30556705e-01
   3.89463792e-02 -8.19961017e-02 -1.22817259e-02  9.42785067e-02
   1.02682363e-01 -5.01196917e-02  3.11530440e-01 -2.07661811e-01
   5.12921236e-02]
 [-2.78820512e-01 -2.81225729e-01 -1.68399980e-04  4.02753089e-01
   1.95388246e-01 -9.81253619e-02 -2.32633057e-02  2.22445723e-01
   1.40018492e-01 -1.22949540e-01  4.29755276e-01 -2.95744807e-01
   6.92175285e-02]
 [-1.48135435e-01 -1.86512024e-01  1.13651319e-02  2.23834970e-01
   1.67334297e-01 -7.17055956e-02 -1.40083551e-02  1.35449939e-01
   1.21334020e-01 -1.05095011e-01  3.22668515e-01 -2.68656253e-01
   1.00730522e-01]
 [-1.41758607e-01 -1.72937080e-01 -6.11886503e-02  2.76827599e-01
   9.29796377e-02 -1.23183241e-01 -2.13697672e-02  1.76618786e-01
   1.54797658e-01 -8.88543564e-02  4.81371573e-01 -3.13424676e-01
   6.85093156e-02]
 [-9.28067378e-02 -1.12475885e-01 -7.68296616e-02  2.14331156e-01
   2.96540493e-02 -1.10114657e-01 -1.82221652e-02  1.41035

In [23]:
# Backward check: accumulated bias gradient of the first dense layer.
print("b1 grad", dense1.grads[1], b1_grad_tf, b1_t.grad, sep="\n")

# Element-wise agreement within an absolute tolerance of 1e-12.
print(
    np.abs(dense1.grads[1] - b1_t.grad.detach().numpy()) < 1e-12,
    np.abs(dense1.grads[1] - b1_grad_tf.numpy()) < 1e-12,
    sep="\n",
)

b1 grad
[-0.10232773 -0.19040294 -0.09202015  0.27929083  0.123588   -0.17915886
 -0.02628731  0.20599901  0.23894762 -0.12884872  0.69493008 -0.49402909
  0.14475361]
tf.Tensor(
[-0.10232773 -0.19040294 -0.09202015  0.27929083  0.123588   -0.17915886
 -0.02628731  0.20599901  0.23894762 -0.12884872  0.69493008 -0.49402909
  0.14475361], shape=(13,), dtype=float64)
tensor([-0.1023, -0.1904, -0.0920,  0.2793,  0.1236, -0.1792, -0.0263,  0.2060,
         0.2389, -0.1288,  0.6949, -0.4940,  0.1448], dtype=torch.float64)
[ True  True  True  True  True  True  True  True  True  True  True  True
  True]
[ True  True  True  True  True  True  True  True  True  True  True  True
  True]
