In [1]:
%config IPCompleter.greedy=True

import numpy as np
import matplotlib.pyplot as plt
import sklearn.preprocessing, sklearn.datasets, sklearn.model_selection

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [2]:
class CategoricalCrossEntropyLoss:
    """Per-example sparse categorical cross-entropy on class probabilities.

    ``target`` is a 1-D array of integer class indices (one per example) and
    ``predicted`` is an ``(examples, classes)`` array of probabilities, such
    as the output of a softmax layer.
    """

    def __call__(self, target, predicted):
        """Return ``-log(predicted[i, target[i]])`` for every example ``i``.

        The selected probability is clipped from below at 1e-15 so that a
        zero probability produces a large finite loss instead of ``inf``.

        Returns:
            Array of shape ``(examples,)`` with one loss value per example.
        """
        indices = np.arange(len(target))
        return -np.log(np.maximum(predicted[indices, target], 1e-15))

    def gradient(self, target, predicted):
        """Return the derivative of each example's loss w.r.t. ``predicted``.

        Only the entry of the target class is non-zero: ``-1 / p_target``.

        Returns:
            Array with the same shape as ``predicted``.
        """
        # Size the result from `predicted` rather than assuming 10 classes,
        # so the loss works for any number of classes.
        grad = np.zeros(predicted.shape)
        indices = np.arange(len(target))
        grad[indices, target] = -1 / predicted[indices, target]
        return grad

In [3]:
class SoftmaxLayer:
    """Parameter-free softmax over the last axis of a batch of logits."""

    def __init__(self):
        # No trainable parameters; the empty lists give this layer the same
        # `params` / `grads` attributes as layers that do carry weights.
        self.params = []
        self.grads = []

    def __call__(self, inputs):
        """Return softmax probabilities, normalised along the last axis.

        Args:
            inputs: array of logits; the last axis indexes the classes.

        Returns:
            Array of the same shape as ``inputs`` whose last axis sums to 1.
        """
        # Shift each row by its OWN maximum. Shifting by the global maximum
        # lets a row that lies far below it underflow to all zeros, which
        # then turns into NaN through 0 / 0.
        shifted = inputs - np.max(inputs, axis=-1, keepdims=True)
        exps = np.exp(shifted)
        return exps / np.sum(exps, axis=-1, keepdims=True)

    def gradient(self, inputs, gradients):
        """Back-propagate ``gradients`` through the softmax.

        Args:
            inputs: logits that were fed to the layer, shape (examples, classes).
            gradients: upstream gradient w.r.t. the layer output,
                shape (examples, classes).

        Returns:
            Gradient w.r.t. ``inputs``, shape (examples, classes).
        """
        outputs = self(inputs)  # examples, classes
        classes = outputs.shape[1]
        # Per-example Jacobian d softmax_i / d input_j = diag(s) - s s^T.
        diag = np.zeros((outputs.shape[0], classes, classes))
        class_idx = np.arange(classes)
        diag[:, class_idx, class_idx] = outputs  # set the diagonal of each example
        jacobian = diag - outputs[:, :, np.newaxis] * outputs[:, np.newaxis, :]
        # Contract the upstream gradient with the (symmetric) Jacobian.
        return np.sum(gradients[:, np.newaxis, :] * jacobian, axis=2)  # examples, classes

In [136]:
class DenseLayer:
    """Fully connected layer computing ``inputs @ W + b``.

    Gradients w.r.t. the parameters are ACCUMULATED into ``self.grads`` on
    every call to :meth:`gradient`; they are never reset by this class.
    """

    def __init__(self, inputs, outputs, random_seed=None):
        """Create the layer with uniformly initialised parameters.

        Args:
            inputs: number of input features.
            outputs: number of output units.
            random_seed: seed for the layer's private ``RandomState``;
                ``None`` gives a non-deterministic initialisation.
        """
        self._random_state = np.random.RandomState(random_seed)
        # Weights and bias are drawn uniformly from [-2, 2).
        self._W = self._random_state.uniform(-2, 2, size=(inputs, outputs))
        self._b = self._random_state.uniform(-2, 2, size=(outputs,))
        # `params` aliases the arrays above; `grads` is kept in the same order.
        self.params = [self._W, self._b]
        self.grads = [np.zeros_like(self._W), np.zeros_like(self._b)]

    def __call__(self, inputs):
        """Return ``inputs @ W + b`` for ``inputs`` of shape (examples, inputs)."""
        return inputs @ self._W + self._b[np.newaxis, :]

    def gradient(self, inputs, gradients):
        """Back-propagate through the layer and accumulate parameter gradients.

        Args:
            inputs: the batch that was fed forward, shape (examples, inputs).
            gradients: upstream gradient w.r.t. the layer output,
                shape (examples, outputs).

        Returns:
            Gradient w.r.t. ``inputs``, shape (examples, inputs).
        """
        # Gradient w.r.t. W, summed over examples: sum_e x_e^T g_e == X^T G.
        np.add(self.grads[0], inputs.T @ gradients, out=self.grads[0])  # inputs, outputs
        # Gradient w.r.t. b: the bias is added to every example, so sum over examples.
        np.add(self.grads[1], np.sum(gradients, axis=0), out=self.grads[1])  # outputs
        # Gradient w.r.t. inputs: d(xW + b)/dx = W, so the result is G W^T.
        # The bias does not depend on the inputs and contributes nothing here.
        return gradients @ self._W.T  # examples, inputs

In [137]:
# Toy batch for the gradient check: 3 examples with 7 features each and
# integer class labels in 0..9.
# A dedicated seeded generator keeps the batch identical on every
# Restart & Run All; the unseeded global np.random state did not.
data_rng = np.random.RandomState(0)
target = data_rng.randint(0, 10, size=(3,), dtype=int)
vals = data_rng.uniform(size=(3, 7))

In [138]:
# NumPy pipeline under test: dense (7 -> 10) -> softmax -> cross-entropy.
loss = CategoricalCrossEntropyLoss()
soft = SoftmaxLayer()
dense = DenseLayer(7, 10, random_seed=42)

# Forward pass; every intermediate is kept so later cells can compare it
# against the TensorFlow and PyTorch results.
first = dense(vals)
predicted = soft(first)
l = loss(target, predicted)

# Backward pass, chaining each stage's gradient into the previous stage.
l_grad = loss.gradient(target, predicted)
s_grad = soft.gradient(first, l_grad)
f_grad = dense.gradient(vals, s_grad)

In [139]:
import tensorflow as tf

# Mirror the toy batch and the NumPy layer's parameters as TensorFlow variables.
vals_tf = tf.Variable(vals)
target_tf = tf.Variable(target)
w_tf = tf.Variable(dense._W)
b_tf = tf.Variable(dense._b)

# Record the same forward pass: affine map -> softmax -> per-example loss.
with tf.GradientTape() as tape:
    first_tf = vals_tf @ w_tf + b_tf
    predicted_tf = tf.nn.softmax(first_tf)
    l_tf = tf.keras.losses.sparse_categorical_crossentropy(target_tf, predicted_tf)

# One tape.gradient call returns the gradient at every stage, in this order.
# NOTE(review): in the recorded output, the gradient w.r.t. predicted_tf is
# offset by +1 from the NumPy value in every visible entry. Presumably Keras
# renormalises the probabilities inside the loss; confirm before treating
# l_grad_tf as a reference for l_grad.
grad_sources = [predicted_tf, first_tf, w_tf, b_tf, vals_tf]
(l_grad_tf, s_grad_tf, w_tf_grad, b_tf_grad, f_tf_grad) = tape.gradient(l_tf, grad_sources)

In [140]:
import torch

# Mirror the toy batch and the NumPy layer's parameters as PyTorch tensors;
# requires_grad is set on those whose .grad is inspected later.
vals_t = torch.tensor(vals, requires_grad=True)
target_t = torch.tensor(target, dtype=torch.long)
w_t = torch.tensor(dense._W, requires_grad=True)
b_t = torch.tensor(dense._b, requires_grad=True)

# Same forward pass; retain_grad() keeps .grad on the intermediate tensors.
first_t = vals_t @ w_t + b_t
first_t.retain_grad()
predicted_t = first_t.softmax(dim=1)
predicted_t.retain_grad()
l_t = torch.nn.functional.nll_loss(predicted_t.log(), target_t, reduction='none')

# l_t holds one loss per example, so backward needs an explicit seed; ones
# gives every example's loss a weight of 1.
l_t.backward(gradient=torch.ones(l_t.shape))

In [141]:
# Dump every intermediate of the NumPy implementation under its own heading.
numpy_report = [
    ("prediction", predicted),
    ("loss", l),
    ("loss grad", l_grad),
    ("softmax grad", s_grad),
    ("W grad", dense.grads[0]),
    ("b grad", dense.grads[1]),
    ("vals grad", f_grad),
]
for heading, value in numpy_report:
    print(heading)
    print(value)

prediction
[[1.12266094e-02 3.40723142e-02 1.47358772e-03 8.03037682e-01
  1.13033016e-02 1.04788833e-01 1.37781349e-03 6.22199035e-03
  2.58615332e-02 6.36335544e-04]
 [6.35036520e-02 4.09659816e-03 4.64408156e-03 4.40106757e-01
  3.58909784e-02 4.03573917e-01 2.27405181e-03 2.21779879e-02
  1.45106324e-02 9.22134397e-03]
 [6.83276413e-02 2.99907419e-03 2.12594883e-03 4.72575321e-01
  6.79847118e-02 3.54894315e-01 7.18306216e-03 5.94326080e-03
  1.21204946e-02 5.84617006e-03]]
loss
[6.52005522 3.32726931 4.41285749]
loss grad
[[   0.            0.         -678.61586049    0.            0.
     0.            0.            0.            0.            0.        ]
 [   0.            0.            0.            0.          -27.86215492
     0.            0.            0.            0.            0.        ]
 [   0.            0.            0.            0.            0.
     0.            0.            0.          -82.50488373    0.        ]]
softmax grad
[[ 1.12266094e-02  3.40723142e-02 

In [142]:
# Dump the TensorFlow counterparts in the same order as the NumPy report.
tensorflow_report = [
    ("prediction", predicted_tf),
    ("loss", l_tf),
    ("loss grad", l_grad_tf),
    ("softmax grad", s_grad_tf),
    ("W grad", w_tf_grad),
    ("b grad", b_tf_grad),
    ("vals grad", f_tf_grad),
]
for heading, value in tensorflow_report:
    print(heading)
    print(value)

prediction
tf.Tensor(
[[1.12266094e-02 3.40723142e-02 1.47358772e-03 8.03037682e-01
  1.13033016e-02 1.04788833e-01 1.37781349e-03 6.22199035e-03
  2.58615332e-02 6.36335544e-04]
 [6.35036520e-02 4.09659816e-03 4.64408156e-03 4.40106757e-01
  3.58909784e-02 4.03573917e-01 2.27405181e-03 2.21779879e-02
  1.45106324e-02 9.22134397e-03]
 [6.83276413e-02 2.99907419e-03 2.12594883e-03 4.72575321e-01
  6.79847118e-02 3.54894315e-01 7.18306216e-03 5.94326080e-03
  1.21204946e-02 5.84617006e-03]], shape=(3, 10), dtype=float64)
loss
tf.Tensor([6.52005522 3.32726931 4.41285749], shape=(3,), dtype=float64)
loss grad
tf.Tensor(
[[   1.            1.         -677.61586049    1.            1.
     1.            1.            1.            1.            1.        ]
 [   1.            1.            1.            1.          -26.86215492
     1.            1.            1.            1.            1.        ]
 [   1.            1.            1.            1.            1.
     1.            1.         

In [143]:
# Dump the PyTorch counterparts in the same order as the NumPy report.
torch_report = [
    ("prediction", predicted_t),
    ("loss", l_t),
    ("loss grad", predicted_t.grad),
    ("softmax grad", first_t.grad),
    ("W grad", w_t.grad),
    ("b grad", b_t.grad),
    ("vals grad", vals_t.grad),
]
for heading, value in torch_report:
    print(heading)
    print(value)

prediction
tensor([[1.1227e-02, 3.4072e-02, 1.4736e-03, 8.0304e-01, 1.1303e-02, 1.0479e-01,
         1.3778e-03, 6.2220e-03, 2.5862e-02, 6.3634e-04],
        [6.3504e-02, 4.0966e-03, 4.6441e-03, 4.4011e-01, 3.5891e-02, 4.0357e-01,
         2.2741e-03, 2.2178e-02, 1.4511e-02, 9.2213e-03],
        [6.8328e-02, 2.9991e-03, 2.1259e-03, 4.7258e-01, 6.7985e-02, 3.5489e-01,
         7.1831e-03, 5.9433e-03, 1.2120e-02, 5.8462e-03]], dtype=torch.float64,
       grad_fn=<SoftmaxBackward>)
loss
tensor([6.5201, 3.3273, 4.4129], dtype=torch.float64,
       grad_fn=<NllLossBackward>)
loss grad
tensor([[   0.0000,    0.0000, -678.6159,    0.0000,    0.0000,    0.0000,
            0.0000,    0.0000,    0.0000,    0.0000],
        [   0.0000,    0.0000,    0.0000,    0.0000,  -27.8622,    0.0000,
            0.0000,    0.0000,    0.0000,    0.0000],
        [   0.0000,    0.0000,    0.0000,    0.0000,    0.0000,    0.0000,
            0.0000,    0.0000,  -82.5049,    0.0000]], dtype=torch.float64)
soft

In [144]:
# Element-wise agreement between the NumPy results and the PyTorch reference:
# True wherever the absolute difference is below 1e-12.
torch_checks = [
    ("prediction", predicted, predicted_t),
    ("loss", l, l_t),
    ("loss grad", l_grad, predicted_t.grad),
    ("softmax grad", s_grad, first_t.grad),
    ("W grad", dense.grads[0], w_t.grad),
    ("b grad", dense.grads[1], b_t.grad),
    ("vals grad", f_grad, vals_t.grad),
]
for heading, ours, reference in torch_checks:
    print(heading)
    print(np.abs(ours - reference.detach().numpy()) < 1e-12)

prediction
[[ True  True  True  True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True  True  True  True]]
loss
[ True  True  True]
loss grad
[[ True  True  True  True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True  True  True  True]]
softmax grad
[[ True  True  True  True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True  True  True  True]]
W grad
[[ True  True  True  True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True  True