In [1]:
%config IPCompleter.greedy=True

import numpy as np
import matplotlib.pyplot as plt
import sklearn.preprocessing, sklearn.datasets, sklearn.model_selection

import os
# Set CUDA_VISIBLE_DEVICES to "-1" before tensorflow/torch are imported in later cells.
# NOTE(review): presumably this hides every GPU so the reference computations run
# on the CPU — confirm against the frameworks' documentation.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [2]:
class CategoricalCrossEntropyLoss:
    """Per-example cross-entropy for integer class targets.

    ``target`` is a 1-D array of class indices, one per example, and
    ``predicted`` is an (examples, classes) array of class probabilities.
    """

    # Floor applied to the probability of the true class so that neither the
    # logarithm nor the reciprocal blows up when that probability is 0.
    _EPSILON = 1e-15

    def __call__(self, target, predicted):
        """Return ``-log(p_true)`` for every example, shape (examples,)."""
        predicted = np.asarray(predicted)
        rows = np.arange(len(target))
        return -np.log(np.maximum(predicted[rows, target], self._EPSILON))

    def gradient(self, target, predicted):
        """Return d(loss)/d(predicted), shape (examples, classes).

        Only the entry of the true class is non-zero: ``-1 / p_true``.
        """
        predicted = np.asarray(predicted)
        # Size the result from ``predicted`` instead of assuming 10 classes.
        grad = np.zeros(predicted.shape)
        rows = np.arange(len(target))
        # Same floor as the forward pass: a zero probability yields a large
        # finite gradient rather than -inf (which would turn into NaN later).
        grad[rows, target] = -1 / np.maximum(predicted[rows, target], self._EPSILON)
        return grad

In [3]:
class SoftmaxLayer:
    """Parameter-free softmax over the last axis of its input."""

    def __init__(self):
        # No trainable parameters; the empty lists keep the same attributes
        # that DenseLayer exposes.
        self.params = []
        self.grads = []

    def __call__(self, inputs):
        """Return softmax probabilities with the same shape as ``inputs``."""
        # Subtract each row's own maximum. Softmax is invariant to a per-row
        # shift, and using the row maximum (not the global one) guarantees at
        # least one exp(0) == 1 per row, so the denominator never underflows
        # to zero.
        shifted = inputs - np.max(inputs, axis=-1, keepdims=True)
        exps = np.exp(shifted)
        return exps / np.sum(exps, axis=-1, keepdims=True)

    def gradient(self, inputs, gradients):
        """Back-propagate ``gradients`` (d loss / d outputs) to the inputs.

        The softmax Jacobian is ``diag(s) - s s^T``. Multiplying it by the
        incoming gradient ``g`` gives ``s * (g - <g, s>)``, which is computed
        directly here instead of materialising a (classes, classes) matrix
        per example.  Returns an array shaped like ``inputs``.
        """
        outputs = self(inputs)
        weighted = np.sum(gradients * outputs, axis=-1, keepdims=True)
        return outputs * (gradients - weighted)

In [123]:
class DenseLayer:
    """Fully connected layer computing ``inputs @ W + b``.

    ``W`` has shape (inputs, outputs) and ``b`` has shape (outputs,). Both are
    drawn uniformly from [-2, 2) by a ``RandomState`` seeded with
    ``random_seed``.  ``params`` is ``[W, b]`` and ``grads`` holds the matching
    gradient accumulators, which ``gradient`` adds to in place.
    """

    def __init__(self, inputs, outputs, random_seed=None):
        self._random_state = np.random.RandomState(random_seed)
        # W is drawn before b so a given seed always yields the same weights.
        self._W = self._random_state.uniform(-2, 2, size=(inputs, outputs))
        self._b = self._random_state.uniform(-2, 2, size=(outputs,))
        self.params = [self._W, self._b]
        self.grads = [np.zeros_like(self._W), np.zeros_like(self._b)]

    def __call__(self, inputs):
        """Return the affine map of ``inputs`` (examples, inputs) -> (examples, outputs)."""
        return inputs @ self._W + self._b

    def gradient(self, inputs, gradients):
        """Accumulate parameter gradients and return the gradient w.r.t. ``inputs``.

        ``inputs`` is the (examples, inputs) array fed to the forward pass and
        ``gradients`` is d(loss)/d(outputs), shape (examples, outputs).
        ``grads[0]`` and ``grads[1]`` are increased in place by the gradients
        summed over examples.  Returns an (examples, inputs) array.
        """
        # dL/dW = sum over examples of outer(input, gradient) = inputs^T @ gradients
        np.add(self.grads[0], inputs.T @ gradients, out=self.grads[0])
        # dL/db = sum of the output gradients over examples
        np.add(self.grads[1], np.sum(gradients, axis=0), out=self.grads[1])
        # dL/dinputs = gradients @ W^T; the bias does not depend on the inputs,
        # so it contributes nothing here.
        return gradients @ self._W.T

In [124]:
# Draw the sample batch from a seeded generator so that Restart & Run All
# reproduces the same targets and inputs (and therefore the same outputs).
_rng = np.random.RandomState(0)
target = _rng.randint(0, 10, size=(3,), dtype=int)  # 3 class indices in [0, 10)
vals = _rng.uniform(size=(3, 7))  # 3 examples with 7 features each, in [0, 1)

In [125]:
# Build the hand-written pipeline: dense layer (7 inputs -> 10 outputs, seed 42),
# softmax, and the cross-entropy loss.
dense = DenseLayer(7,10,42)
soft = SoftmaxLayer()
loss = CategoricalCrossEntropyLoss()

# Forward pass. Every intermediate keeps its own name because the backward
# pass below and the later print/comparison cells read them.
first = dense(vals)
predicted = soft(first)
l = loss(target, predicted)
# Backward pass: each stage receives the gradient produced by the stage after it.
l_grad = loss.gradient(target, predicted)
s_grad = soft.gradient(first, l_grad)
# dense.gradient also adds the W and b gradients into dense.grads in place.
f_grad = dense.gradient(vals, s_grad)

In [132]:
import tensorflow as tf

# Reference computation in TensorFlow, starting from the same weights, bias,
# targets and inputs as the NumPy pipeline.
w_tf = tf.Variable(dense._W)
b_tf = tf.Variable(dense._b)
target_tf = tf.Variable(target)
vals_tf = tf.Variable(vals)

# Record the forward pass so gradients can be requested for the intermediates
# (first_tf, predicted_tf) as well as for the variables.
with tf.GradientTape() as tape:
    first_tf = vals_tf @ w_tf + b_tf
    predicted_tf = tf.nn.softmax(first_tf)
    l_tf = tf.keras.losses.sparse_categorical_crossentropy(target_tf, predicted_tf)

# One tape.gradient call returns all five gradients, in the order of the list passed in.
# NOTE(review): in the recorded output l_grad_tf is the NumPy loss gradient plus 1 in
# every entry; presumably Keras renormalises the probabilities inside the loss — confirm.
l_grad_tf, s_grad_tf, w_tf_grad, b_tf_grad, f_tf_grad = tape.gradient(l_tf, [predicted_tf, first_tf, w_tf, b_tf, vals_tf])

In [127]:
import torch

# Reference computation in PyTorch, starting from the same weights, bias,
# targets and inputs as the NumPy pipeline.
w_t = torch.tensor(dense._W, requires_grad=True)
b_t = torch.tensor(dense._b, requires_grad=True)
target_t = torch.tensor(target, dtype=torch.long)
vals_t = torch.tensor(vals, requires_grad=True)

first_t = vals_t @ w_t + b_t
# retain_grad() is requested on the intermediates because later cells read
# first_t.grad and predicted_t.grad.
first_t.retain_grad()
predicted_t = torch.nn.functional.softmax(first_t, dim=1)
predicted_t.retain_grad()
# reduction='none' keeps one loss value per example, matching the NumPy loss.
l_t = torch.nn.functional.nll_loss(torch.log(predicted_t), target_t, reduction='none')
# l_t is not a scalar, so backward() is given an explicit upstream gradient of
# ones with the same size as l_t.
l_t.backward(torch.ones(l_t.size()))

In [133]:
# Print every intermediate of the NumPy pipeline under its own heading.
for title, array in (
    ("prediction", predicted),
    ("loss", l),
    ("loss grad", l_grad),
    ("softmax grad", s_grad),
    ("W grad", dense.grads[0]),
    ("b grad", dense.grads[1]),
    ("vals grad", f_grad),
):
    print(title)
    print(array)

prediction
[[5.23538903e-02 5.42054255e-02 9.70625661e-03 4.35072983e-01
  4.50116097e-02 3.51086763e-01 1.10968445e-02 8.94194225e-03
  3.14159738e-02 1.10831134e-03]
 [5.71876434e-02 1.85377267e-03 7.47330390e-04 5.50580004e-01
  3.33689323e-02 3.42114524e-01 1.31423331e-03 2.71495398e-03
  9.01596837e-03 1.10263764e-03]
 [3.06704943e-02 1.45364088e-03 1.25016358e-03 7.06644851e-01
  1.50051632e-02 2.40874780e-01 1.27466093e-04 1.15820776e-03
  1.12816896e-03 1.68706444e-03]]
loss
[4.63498459 6.29053243 1.42347807]
loss grad
[[   0.            0.         -103.02633038    0.            0.
     0.            0.            0.            0.            0.        ]
 [   0.         -539.44046884    0.            0.            0.
     0.            0.            0.            0.            0.        ]
 [   0.            0.            0.            0.            0.
    -4.15153467    0.            0.            0.            0.        ]]
softmax grad
[[ 5.23538903e-02  5.42054255e-02 -9.90293

In [134]:
# Print every intermediate of the TensorFlow reference under its own heading.
for title, tensor in (
    ("prediction", predicted_tf),
    ("loss", l_tf),
    ("loss grad", l_grad_tf),
    ("softmax grad", s_grad_tf),
    ("W grad", w_tf_grad),
    ("b grad", b_tf_grad),
    ("vals grad", f_tf_grad),
):
    print(title)
    print(tensor)

prediction
tf.Tensor(
[[5.23538903e-02 5.42054255e-02 9.70625661e-03 4.35072983e-01
  4.50116097e-02 3.51086763e-01 1.10968445e-02 8.94194225e-03
  3.14159738e-02 1.10831134e-03]
 [5.71876434e-02 1.85377267e-03 7.47330390e-04 5.50580004e-01
  3.33689323e-02 3.42114524e-01 1.31423331e-03 2.71495398e-03
  9.01596837e-03 1.10263764e-03]
 [3.06704943e-02 1.45364088e-03 1.25016358e-03 7.06644851e-01
  1.50051632e-02 2.40874780e-01 1.27466093e-04 1.15820776e-03
  1.12816896e-03 1.68706444e-03]], shape=(3, 10), dtype=float64)
loss
tf.Tensor([4.63498459 6.29053243 1.42347807], shape=(3,), dtype=float64)
loss grad
tf.Tensor(
[[   1.            1.         -102.02633038    1.            1.
     1.            1.            1.            1.            1.        ]
 [   1.         -538.44046884    1.            1.            1.
     1.            1.            1.            1.            1.        ]
 [   1.            1.            1.            1.            1.
    -3.15153467    1.            1.   

In [135]:
# Print every intermediate of the PyTorch reference under its own heading.
for title, tensor in (
    ("prediction", predicted_t),
    ("loss", l_t),
    ("loss grad", predicted_t.grad),
    ("softmax grad", first_t.grad),
    ("W grad", w_t.grad),
    ("b grad", b_t.grad),
    ("vals grad", vals_t.grad),
):
    print(title)
    print(tensor)

prediction
tensor([[5.2354e-02, 5.4205e-02, 9.7063e-03, 4.3507e-01, 4.5012e-02, 3.5109e-01,
         1.1097e-02, 8.9419e-03, 3.1416e-02, 1.1083e-03],
        [5.7188e-02, 1.8538e-03, 7.4733e-04, 5.5058e-01, 3.3369e-02, 3.4211e-01,
         1.3142e-03, 2.7150e-03, 9.0160e-03, 1.1026e-03],
        [3.0670e-02, 1.4536e-03, 1.2502e-03, 7.0664e-01, 1.5005e-02, 2.4087e-01,
         1.2747e-04, 1.1582e-03, 1.1282e-03, 1.6871e-03]], dtype=torch.float64,
       grad_fn=<SoftmaxBackward>)
loss
tensor([4.6350, 6.2905, 1.4235], dtype=torch.float64,
       grad_fn=<NllLossBackward>)
loss grad
tensor([[   0.0000,    0.0000, -103.0263,    0.0000,    0.0000,    0.0000,
            0.0000,    0.0000,    0.0000,    0.0000],
        [   0.0000, -539.4405,    0.0000,    0.0000,    0.0000,    0.0000,
            0.0000,    0.0000,    0.0000,    0.0000],
        [   0.0000,    0.0000,    0.0000,    0.0000,    0.0000,   -4.1515,
            0.0000,    0.0000,    0.0000,    0.0000]], dtype=torch.float64)
soft

In [131]:
# Element-wise check that each NumPy result is within 1e-12 (absolute) of the
# corresponding PyTorch value; prints one boolean mask per quantity.
for title, ours, reference in (
    ("prediction", predicted, predicted_t),
    ("loss", l, l_t),
    ("loss grad", l_grad, predicted_t.grad),
    ("softmax grad", s_grad, first_t.grad),
    ("W grad", dense.grads[0], w_t.grad),
    ("b grad", dense.grads[1], b_t.grad),
    ("vals grad", f_grad, vals_t.grad),
):
    print(title)
    print(np.abs(ours - reference.detach().numpy()) < 1e-12)

prediction
[[ True  True  True  True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True  True  True  True]]
loss
[ True  True  True]
loss grad
[[ True  True  True  True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True  True  True  True]]
softmax grad
[[ True  True  True  True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True  True  True  True]]
W grad
[[ True  True  True  True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True  True