In [None]:
%config IPCompleter.greedy=True

import numpy as np
import matplotlib.pyplot as plt
import sklearn.preprocessing, sklearn.datasets, sklearn.model_selection

In [None]:
class CategoricalCrossEntropyLoss:
    """Per-example categorical cross-entropy for integer class labels.

    ``target`` holds one integer class index per example and ``predicted``
    holds one row of class probabilities per example. No reduction over the
    examples is performed.
    """

    def __call__(self, target, predicted):
        """Return the loss ``-log(predicted[i, target[i]])`` for every example.

        The selected probabilities are floored at 1e-15 before the logarithm,
        so a probability of zero gives a large finite loss instead of ``inf``.

        Args:
            target: 1-D integer array of class indices, one per example.
            predicted: array of shape ``(examples, classes)``.

        Returns:
            1-D array with one loss value per example.
        """
        indices = np.arange(len(target))
        return -np.log(np.maximum(predicted[indices, target], 1e-15))

    def gradient(self, target, predicted):
        """Return the derivative of the per-example loss w.r.t. ``predicted``.

        In each row only the entry of the target class is non-zero; it equals
        ``-1 / predicted[i, target[i]]``. The 1e-15 floor used in ``__call__``
        is not applied here, so a target probability of exactly zero produces
        an infinite entry.

        Args:
            target: 1-D integer array of class indices, one per example.
            predicted: array of shape ``(examples, classes)``.

        Returns:
            Float array with the same shape as ``predicted``.
        """
        # Size the gradient from the predictions instead of assuming 10 classes.
        grad = np.zeros(np.shape(predicted), dtype=float)
        indices = np.arange(len(target))
        grad[indices, target] = -1 / predicted[indices, target]
        return grad

In [None]:
class SoftmaxLayer:
    """Softmax activation over the last axis, together with its backward pass.

    The layer stores no parameters: ``params`` and ``grads`` are empty lists.
    """

    def __init__(self):
        self.params = []
        self.grads = []

    def __call__(self, inputs):
        """Return the softmax of ``inputs`` along the last axis.

        Args:
            inputs: array of scores, e.g. shape ``(examples, classes)``.

        Returns:
            Array of the same shape whose entries along the last axis are
            non-negative and sum to 1.
        """
        inputs = np.asarray(inputs)
        # Shift every row by its OWN maximum. Shifting by the global maximum
        # lets a row that lies far below it underflow to all zeros in exp(),
        # which turns the normalisation into 0/0 = NaN.
        shifted = inputs - np.max(inputs, axis=-1, keepdims=True)
        exps = np.exp(shifted)  # computed once and reused for the sum
        return exps / np.sum(exps, axis=-1, keepdims=True)

    def gradient(self, inputs, gradients):
        """Back-propagate ``gradients`` through the softmax.

        Args:
            inputs: the scores that were passed to ``__call__``.
            gradients: derivative of the loss w.r.t. the softmax outputs,
                same shape as ``inputs``.

        Returns:
            Derivative of the loss w.r.t. ``inputs``, same shape as ``inputs``.
        """
        outputs = self(inputs)
        gradients = np.asarray(gradients)
        # The softmax Jacobian is diag(p) - p p^T, so the Jacobian-vector
        # product collapses to p * (g - <g, p>). This avoids materialising an
        # (examples, classes, classes) tensor.
        weighted = np.sum(gradients * outputs, axis=-1, keepdims=True)
        return outputs * (gradients - weighted)

In [None]:
class DenseLayer:
    """Placeholder for a dense layer; no behaviour is implemented yet."""
    # Declared with ``class`` (the former ``def DenseLayer:`` was a syntax
    # error that stopped the notebook from running top to bottom).
    pass

In [None]:
# Fixed seed so that re-running the notebook feeds identical inputs to the
# NumPy, TensorFlow and PyTorch implementations being compared.
SEED = 0
N_EXAMPLES, N_CLASSES = 3, 10
rng = np.random.default_rng(SEED)

# One integer class label per example, drawn from [0, N_CLASSES).
target = rng.integers(0, N_CLASSES, size=(N_EXAMPLES,), dtype=np.int64)
# Raw scores of shape (N_EXAMPLES, N_CLASSES), uniform in [0, 1).
vals = rng.uniform(size=(N_EXAMPLES, N_CLASSES))

In [None]:
# Instantiate the two hand-written components under test.
loss = CategoricalCrossEntropyLoss()
soft = SoftmaxLayer()

# Forward pass: raw scores -> class probabilities -> per-example loss.
predicted = soft(inputs=vals)
l = loss(target=target, predicted=predicted)

# Backward pass: loss gradient w.r.t. the probabilities first, then pushed
# back through the softmax to the raw scores.
l_grad = loss.gradient(target=target, predicted=predicted)
s_grad = soft.gradient(inputs=vals, gradients=l_grad)

In [None]:
import tensorflow as tf

# Wrap the same NumPy inputs for the TensorFlow reference computation.
vals_tf = tf.Variable(vals)
target_tf = tf.Variable(target)

# Record softmax followed by sparse categorical cross-entropy on the tape.
with tf.GradientTape() as tape:
    predicted_tf = tf.nn.softmax(vals_tf)
    l_tf = tf.keras.losses.sparse_categorical_crossentropy(
        y_true=target_tf,
        y_pred=predicted_tf,
    )

# Ask for both gradients in a single call: w.r.t. the probabilities and
# w.r.t. the raw scores.
l_grad_tf, s_grad_tf = tape.gradient(target=l_tf, sources=[predicted_tf, vals_tf])

In [None]:
import torch

# Wrap the same NumPy inputs for the PyTorch reference computation.
vals_t = torch.tensor(vals, requires_grad=True)
target_t = torch.tensor(target, dtype=torch.long)

predicted_t = torch.nn.functional.softmax(vals_t, dim=1)
# Keep the gradient of this intermediate result so it can be read back from
# predicted_t.grad after the backward pass.
predicted_t.retain_grad()

# Per-example negative log-likelihood of the log-probabilities (no reduction).
l_t = torch.nn.functional.nll_loss(
    input=torch.log(predicted_t),
    target=target_t,
    reduction='none',
)
# The loss is a vector, so backward() gets an explicit all-ones upstream gradient.
l_t.backward(gradient=torch.ones(l_t.shape))

In [None]:
# Results of the hand-written NumPy implementation; each call prints the
# label on one line and the value on the next.
print("prediction", predicted, sep="\n")
print("loss", l, sep="\n")
print("loss grad", l_grad, sep="\n")
print("softmax grad", s_grad, sep="\n")

In [None]:
# Results of the TensorFlow reference; each call prints the label on one
# line and the value on the next.
print("prediction", predicted_tf, sep="\n")
print("loss", l_tf, sep="\n")
print("loss grad", l_grad_tf, sep="\n")
print("softmax grad", s_grad_tf, sep="\n")

In [None]:
# Results of the PyTorch reference; each call prints the label on one line
# and the value on the next. Gradients are read from the tensors' .grad.
print("prediction", predicted_t, sep="\n")
print("loss", l_t, sep="\n")
print("loss grad", predicted_t.grad, sep="\n")
print("softmax grad", vals_t.grad, sep="\n")