In [208]:
import numpy as np
import math

## Categorical Cross-Entropy Loss

In [209]:
# Example softmax prediction for one sample and its one-hot ground truth
softmax_output = [0.7, 0.1, 0.2]
one_hot = [1, 0, 0]

# Cross-entropy: negate the sum of the target-weighted log-probabilities
log_probabilities = np.log(softmax_output)
loss = -np.sum(one_hot * log_probabilities)
print(loss)

0.35667494393873245


In [210]:
# With a one-hot target distribution the general formula collapses:
# every incorrect class contributes y * log(y_hat) = 0 * log(y_hat) = 0,
# so only the correct class k remains and the loss is -log(y_hat[k])

correct_index = 0  # k: index of the correct class
loss = -math.log(softmax_output[correct_index])
print(loss)

0.35667494393873245


In [211]:
# The log of a confidence is printed here; the loss is its negation.
# Confidences near 1 give a loss near 0, confidences near 0 give a large loss

for confidence in (1., 0.95, 0.9, 0.8):
    print(math.log(confidence))
print('...')
for confidence in (0.2, 0.1, 0.05, 0.01):
    print(math.log(confidence))

0.0
-0.05129329438755058
-0.10536051565782628
-0.2231435513142097
...
-1.6094379124341003
-2.3025850929940455
-2.995732273553991
-4.605170185988091


In [212]:
# Softmax outputs for a batch of 3 samples (one row per sample)
softmax_outputs = np.array(
    [[0.7, 0.1, 0.2],
     [0.1, 0.5, 0.4],
     [0.02, 0.9, 0.08]]
)

class_targets = [0, 1, 1] # dog, cat, cat

# For every row of softmax_outputs (0, 1, 2), pick the confidence at that
# row's target class. The row count is taken from the batch itself
# (softmax_outputs), not from the single-sample softmax_output list of an
# earlier cell, so this cell also runs correctly on its own.
correct_confidences = softmax_outputs[range(len(softmax_outputs)), class_targets]
print(correct_confidences)

# Average loss per batch
neg_log = -np.log(correct_confidences)
average_loss = np.mean(neg_log)
print(average_loss)

[0.7 0.5 0.9]
0.38506088005216804


In [213]:
# Batch of softmax predictions, one row per sample
softmax_outputs = np.array(
    [[0.7, 0.1, 0.2],
     [0.1, 0.5, 0.4],
     [0.02, 0.9, 0.08]]
)

# One-hot targets; the sparse equivalent would be np.array([0, 1, 1])
class_targets = np.array(
    [[1, 0, 0],
     [0, 1, 0],
     [0, 1, 0]]
)

# Pick the confidence of the correct class according to the label layout
target_rank = class_targets.ndim
if target_rank == 2:
    # One-hot rows: the mask zeroes every incorrect class before summing
    correct_confidences = (softmax_outputs * class_targets).sum(axis=1)
elif target_rank == 1:
    # Sparse labels: index each row at its target class
    sample_indices = range(len(softmax_outputs))
    correct_confidences = softmax_outputs[sample_indices, class_targets]

# Mean negative log-likelihood over the batch
neg_log = -np.log(correct_confidences)
loss = np.mean(neg_log)
print(loss)

0.38506088005216804


In [214]:
# ln(0) is undefined (-inf), so predictions are nudged away from exactly 0
# and, symmetrically, from exactly 1 by a tiny margin before taking the log

y_pred = [0, 1, 0]
clip_margin = 1e-7
y_pred_clipped = np.clip(y_pred, clip_margin, 1 - clip_margin)
print(y_pred_clipped)

[1.000000e-07 9.999999e-01 1.000000e-07]


In [215]:
# All loss classes inherit from this base Loss class.
# Each loss class implements a forward method that returns an array
# of per-sample losses (one loss value per sample).

# Common loss class
class Loss:
    """Base class shared by all loss functions.

    Subclasses supply ``forward(output, y)``, which returns one loss value
    per sample; this class reduces those values to a single batch loss.
    """

    def calculate(self, output, y):
        """Return the mean of the per-sample losses.

        Args:
            output: Model predictions, passed through to ``self.forward``.
            y: Ground-truth targets, passed through to ``self.forward``.

        Returns:
            The mean of the values returned by ``self.forward(output, y)``.
        """
        # Delegate the per-sample computation to the subclass, then average
        per_sample = self.forward(output, y)
        return np.mean(per_sample)

In [216]:
# Cross-entropy loss
class Loss_CategoricalCrossentropy(Loss):
    """Categorical cross-entropy loss for sparse or one-hot class labels."""

    def forward(self, y_pred, y_true):
        """Return the negative log-likelihood of the correct class per sample.

        Args:
            y_pred: Predicted probabilities, one row per sample.
            y_true: Ground-truth labels, either class indices (1-D) or
                one-hot rows (2-D). Plain lists are accepted as well as arrays.

        Returns:
            Array holding one loss value per sample.

        Raises:
            ValueError: If ``y_true`` is neither 1-D nor 2-D.
        """
        # Normalise the labels so lists work and the rank can be inspected
        y_true = np.asarray(y_true)

        # Number of samples in a batch
        samples = len(y_pred)

        # Clip data so log() never receives 0 (log(0) is -inf)
        # Clip both sides to not drag mean towards any value
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        if y_true.ndim == 1:
            # Categorical (sparse) labels: index each row at its target class
            correct_confidences = y_pred_clipped[range(samples), y_true]
        elif y_true.ndim == 2:
            # One-hot labels: mask out the incorrect classes and sum each row
            correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)
        else:
            # Fail loudly instead of hitting an unbound local variable below
            raise ValueError(
                'y_true must be 1-D (class indices) or 2-D (one-hot), '
                'got %d dimension(s)' % y_true.ndim
            )

        # Losses
        negative_log_likelihoods = -np.log(correct_confidences)
        return negative_log_likelihoods

    
# Evaluate the mean batch loss on the softmax_outputs / class_targets
# defined in the earlier cell
loss_function = Loss_CategoricalCrossentropy()
loss = loss_function.calculate(output=softmax_outputs, y=class_targets)
print(loss)

0.38506088005216804


## Combining everything up to this point:

In [217]:
import numpy as np
import nnfs
from nnfs.datasets import spiral_data

# Set up the nnfs helper package before any data or weights are created.
# NOTE(review): presumably fixes the NumPy random seed and default float
# dtype so the outputs below are repeatable; confirm against the nnfs docs.
nnfs.init()

# Dense layer
class Layer_Dense:
    """Fully connected layer computing ``inputs . weights + biases``."""

    def __init__(self, n_inputs, n_neurons):
        """Create small random weights and zero biases.

        Args:
            n_inputs: Number of input features per sample.
            n_neurons: Number of neurons (outputs) in the layer.
        """
        # Gaussian noise scaled down by 0.01; shape (n_inputs, n_neurons)
        gaussian_noise = np.random.randn(n_inputs, n_neurons)
        self.weights = 0.01 * gaussian_noise
        # A single row of biases, one entry per neuron
        self.biases = np.zeros((1, n_neurons))

    def forward(self, inputs):
        """Store the layer output for ``inputs`` in ``self.output``."""
        weighted_sum = np.dot(inputs, self.weights)
        self.output = weighted_sum + self.biases

# ReLU activation
class Activation_ReLU:
    """Rectified linear unit: negative values become 0, the rest pass through."""

    def forward(self, inputs):
        """Store the element-wise maximum of 0 and ``inputs`` in ``self.output``."""
        rectified = np.maximum(0, inputs)
        self.output = rectified

# Softmax activation
class Activation_Softmax:
    """Softmax activation: turns each row of scores into probabilities."""

    def forward(self, inputs):
        """Store the row-wise softmax of ``inputs`` in ``self.output``."""
        # Subtract each row's maximum before exponentiating, so the largest
        # exponent in every row is 0
        row_max = np.max(inputs, axis=1, keepdims=True)
        exp_values = np.exp(inputs - row_max)
        # Divide by the row totals so every row sums to 1
        row_totals = np.sum(exp_values, axis=1, keepdims=True)
        self.output = exp_values / row_totals

# Common loss class
class Loss:
    """Common parent of the loss classes; subclasses provide ``forward``."""

    def calculate(self, output, y):
        """Return the mean of the per-sample losses from ``self.forward``.

        Args:
            output: Model output, handed unchanged to ``self.forward``.
            y: Ground-truth values, handed unchanged to ``self.forward``.
        """
        # forward() yields the per-sample losses; their mean is the data loss
        return np.mean(self.forward(output, y))

# Cross-entropy loss
class Loss_CategoricalCrossentropy(Loss):
    """Categorical cross-entropy loss for sparse or one-hot class labels."""

    # Forward pass
    def forward(self, y_pred, y_true):
        """Return the negative log-likelihood of the correct class per sample.

        Args:
            y_pred: Predicted probabilities, one row per sample.
            y_true: Ground-truth labels, either class indices (1-D) or
                one-hot rows (2-D). Plain lists are accepted as well as arrays.

        Returns:
            Array holding one loss value per sample.

        Raises:
            ValueError: If ``y_true`` is neither 1-D nor 2-D.
        """
        # Normalise the labels so lists work and the rank can be inspected
        y_true = np.asarray(y_true)

        # Number of samples in a batch
        samples = len(y_pred)
        # Clip data so log() never receives 0 (log(0) is -inf)
        # Clip both sides to not drag mean towards any value
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        # Probabilities for target values - only if categorical labels
        if y_true.ndim == 1:
            correct_confidences = y_pred_clipped[range(samples), y_true]
        # Mask values - only for one-hot encoded labels
        elif y_true.ndim == 2:
            correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)
        # Fail loudly instead of hitting an unbound local variable below
        else:
            raise ValueError(
                'y_true must be 1-D (class indices) or 2-D (one-hot), '
                'got %d dimension(s)' % y_true.ndim
            )

        # Losses
        negative_log_likelihoods = -np.log(correct_confidences)
        return negative_log_likelihoods

# Create dataset
# --- Data ---
# Spiral dataset generated with samples=100 and classes=3
X, y = spiral_data(samples=100, classes=3)

# --- Model ---
# Dense layers are created in network order: dense1 first, then dense2
# First Dense layer: 2 input features -> 3 output values
dense1 = Layer_Dense(2, 3)
# Second Dense layer: 3 inputs (outputs of the first layer) -> 3 output values
dense2 = Layer_Dense(3, 3)

# ReLU follows the first Dense layer, Softmax follows the second
activation1 = Activation_ReLU()
activation2 = Activation_Softmax()

# Loss function applied to the Softmax output
loss_function = Loss_CategoricalCrossentropy()

# --- Forward pass ---
# Each stage consumes the stored output of the previous one
dense1.forward(X)
activation1.forward(dense1.output)
dense2.forward(activation1.output)
activation2.forward(dense2.output)

# Class probabilities of the first few samples
print(activation2.output[:5])

# The loss takes the Softmax output together with the targets
# and returns the mean loss over the batch
loss = loss_function.calculate(activation2.output, y)

# Print loss value
print('loss:', loss)


[[0.33333334 0.33333334 0.33333334]
 [0.3333332  0.3333332  0.33333364]
 [0.3333329  0.33333293 0.3333342 ]
 [0.3333326  0.33333263 0.33333477]
 [0.33333233 0.3333324  0.33333528]]
loss: 1.0986104


## Accuracy Calculation

In [218]:
# Accuracy is the fraction of samples whose highest-confidence class
# matches the ground-truth class

# Example probabilities for 3 samples
softmax_outputs = np.array(
    [[0.7, 0.2, 0.1],
     [0.5, 0.1, 0.4],
     [0.02, 0.9, 0.08]]
)

# Example ground-truth labels for those 3 samples
class_targets = np.array([0, 1, 1])

# Predicted class = index of the largest value along axis 1 of each row.
# The network output (activation2.output) is scored here; to score the small
# example instead, use np.argmax(softmax_outputs, axis=1) with class_targets
predictions = np.argmax(activation2.output, axis=1)

# One-hot targets are reduced to class indices so they compare with predictions
if y.ndim == 2:
    y = np.argmax(y, axis=1)

# Mean of the boolean matches: True counts as 1, False as 0
is_correct = predictions == y
accuracy = np.mean(is_correct)
print('acc:', accuracy)

acc: 0.34
