In [7]:
# # Pytorch MNIST digits example from https://machinelearningmastery.com/handwritten-digit-recognition-with-lenet5-model-in-pytorch/
# # THIS IS ME PLAYING WITH a relatively large MLP for fun
# # SEEMS TO WORK WELL > 98% ACCURACY!!!


# # Imports
import torch # Main PyTorch module
import torch.nn as nn # Neural Network module
import torch.optim as optim  # Optimization module
import torchvision # Computer Vision module
import matplotlib.pyplot as plt
from tqdm import tqdm
import sys
import numpy as np
import time
import math
import nnfs



In [8]:
## Calculating loss function

## Categorical cross-entropy loss function L = - log(p) where p is the predicted probability of the true class (natural logarithm to base e)
# Loss for one hot vector: L = - sum(y_true * log(y_pred)) where y_true is the one hot vector and y_pred is the predicted probability vector

# What is one hot vector?
# A one hot vector is a vector where the index of the true class is 1 and all other indices are 0.

In [9]:
softmax_outputs = np.array([0.7, 0.1, 0.2])

target_outputs = np.array([1, 0, 0] ) # One hot vector for class 0

loss = -(math.log(softmax_outputs[0]) * target_outputs[0] + #
            math.log(softmax_outputs[1]) * target_outputs[1] +
            math.log(softmax_outputs[2]) * target_outputs[2])

print('Loss: ', loss)  # Should print 0.35667494393873245


loss = -np.log(softmax_outputs[0])  # Since only the first term is non-zero

print('Loss simplified: ', loss)  # Should print 0.35667494393873245

print('Loss over simplified calculation matches full calculation: ',-math.log(0.7) )

Loss:  0.35667494393873245
Loss simplified:  0.35667494393873245
Loss over simplified calculation matches full calculation:  0.35667494393873245


In [10]:
from nnfs.datasets import spiral_data
np.random.seed(0) ## Set random seed for reproducibility

nnfs.init() ## Initialize nnfs (sets default data type to float32 etc.)

## Create Layer class 
class Layer_Dense: 
    def __init__(self, n_inputs, n_neurons): ## n_inputs = number of inputs to the layer, n_neurons = number of neurons in the layer
        ## Initialize weights and biases
        self.weights = 0.10 * np.random.randn(n_inputs, n_neurons) ## Random weights with small values. This is useful for forward pass and we do not need to transpose weights here.
        self.biases = np.zeros((1, n_neurons)) ## Biases should be initialized to zero values

    def forward(self, inputs):
        ## Forward pass
        self.output = np.dot(inputs, self.weights) + self.biases


class Activation_ReLU:
    def forward(self, inputs):
        ## Apply ReLU activation function
        self.output = np.maximum(0, inputs) ## Element-wise maximum operation

class softmax_Activation:
    def forward(self, inputs):
        ## Apply Softmax activation function
        exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True)) ## Subtract max for numerical stability
        probabilities= exp_values / np.sum(exp_values, axis=1, keepdims=True) ## Normalization step
        self.output = probabilities

class Loss_log:
    def calculate(self, y_pred, y_true):
        ## Calculate categorical cross-entropy loss
        samples = y_pred.shape[0] ## Number of samples
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7) ## Clip predictions to avoid log(0)
        
        ## Probabilities for target values
        correct_confidences = y_pred_clipped[range(samples), y_true]
        
        negative_log_likelihoods = -np.log(correct_confidences) ## Negative log likelihoods
        loss = np.mean(negative_log_likelihoods) ## Mean loss
        return loss

## Create first layer
X, y = spiral_data(samples = 100, classes =3) ## Create dataset with 3 classes
dense1 = Layer_Dense(2,3) ## Create first dense layer with 2 inputs and 3 neurons
# Create activation function
activation1 = Activation_ReLU() ## Create ReLU activation function object


dense2 = Layer_Dense(3,3) ## Create second dense layer with 3 inputs (from previous layer) and 3 neurons
activation2 = softmax_Activation() ## Create Softmax activation function object

dense1.forward(X) ## Forward pass through first layer
dense2.forward(dense1.output) ## Forward pass through second layer

activation1.forward(dense1.output) ## Apply ReLU activation function to output of first layer
activation2.forward(dense2.output) ## Apply Softmax activation function to output of second layer

print("First layer output after ReLU activation = \n", activation1.output[:5]) ## Print output of first layer after ReLU activation
print("\n")
print("Second layer output after Softmax activation = \n", activation2.output[:5]) ## Print output of second layer after Softmax activation
print("\n") 

First layer output after ReLU activation = 
 [[0.         0.         0.        ]
 [0.         0.00113954 0.        ]
 [0.         0.00317292 0.        ]
 [0.         0.00526663 0.        ]
 [0.         0.00714014 0.        ]]


Second layer output after Softmax activation = 
 [[0.33333334 0.33333334 0.33333334]
 [0.33334687 0.33334196 0.3333112 ]
 [0.3333612  0.3333536  0.33328524]
 [0.33336097 0.33335987 0.33327916]
 [0.33337367 0.3333704  0.33325592]]




In [15]:
softmax_outputs = np.array([[0.7, 0.1, 0.2],
                            [0.1, 0.5, 0.4],
                            [0.02, 0.9, 0.08]]) # Predicted probabilities for 3 samples and 3 classes
class_targets = np.array([0, 1, 1]) ## Class labels for each sample

print(softmax_outputs[[0, 1, 2], class_targets]) ## Print predicted probabilities for true classes

print('Losses for each sample: ',-np.log(softmax_outputs[[0, 1, 2], class_targets])) 
## Print losses for each sample. Smaller loss means the model is doing better as it is predicting higher probabilities for the true classes.

print('Mean of losses: ', np.mean(-np.log(softmax_outputs[[0, 1, 2], class_targets])))


[0.7 0.5 0.9]
Losses for each sample:  [0.35667494 0.69314718 0.10536052]
Mean of losses:  0.38506088005216804


In [17]:
from nnfs.datasets import spiral_data
np.random.seed(0) ## Set random seed for reproducibility

nnfs.init() ## Initialize nnfs (sets default data type to float32 etc.)

## Create Layer class 
class Layer_Dense: 
    def __init__(self, n_inputs, n_neurons): ## n_inputs = number of inputs to the layer, n_neurons = number of neurons in the layer
        ## Initialize weights and biases
        self.weights = 0.10 * np.random.randn(n_inputs, n_neurons) ## Random weights with small values. This is useful for forward pass and we do not need to transpose weights here.
        self.biases = np.zeros((1, n_neurons)) ## Biases should be initialized to zero values

    def forward(self, inputs):
        ## Forward pass
        self.output = np.dot(inputs, self.weights) + self.biases


class Activation_ReLU:
    def forward(self, inputs):
        ## Apply ReLU activation function
        self.output = np.maximum(0, inputs) ## Element-wise maximum operation

class softmax_Activation:
    def forward(self, inputs):
        ## Apply Softmax activation function
        exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True)) ## Subtract max for numerical stability
        probabilities= exp_values / np.sum(exp_values, axis=1, keepdims=True) ## Normalization step
        self.output = probabilities

class Loss:
    def calculate(self, output, y):
        ## Calculate loss
        sample_losses = self.forward(output, y) ## Forward pass to calculate losses, it will vary depending on the loss function
        data_loss = np.mean(sample_losses) ## Mean loss
        return data_loss
    
class Loss_CategoricalCrossentropy(Loss):
    def forward(self, y_pred, y_true): # y_pred values from NN, y_true are target training values
        ## Calculate categorical cross-entropy loss
        samples = y_pred.shape[0] ## Number of samples
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7) 
        ## Clip predictions to avoid log(0). We use 1-1e-7 to avoid biasing towards any class which means we do not allow any predicted probability to be exactly 1.
        
        if len(y_true.shape) == 1: ##
            correct_confidences = y_pred_clipped[range(samples), y_true]
        elif len(y_true.shape) == 2: ## One hot encoded vector
            correct_confidences = np.sum(y_pred_clipped * y_true, axis=1) 
        ## Probabilities for target values
        correct_confidences = y_pred_clipped[range(samples), y_true]
        
        negative_log_likelihoods = -np.log(correct_confidences) ## Negative log likelihoods
        return negative_log_likelihoods
    

## Create first layer
X, y = spiral_data(samples = 100, classes =3) ## Create dataset with 3 classes
dense1 = Layer_Dense(2,3) ## Create first dense layer with 2 inputs and 3 neurons
# Create activation function
activation1 = Activation_ReLU() ## Create ReLU activation function object


dense2 = Layer_Dense(3,3) ## Create second dense layer with 3 inputs (from previous layer) and 3 neurons
activation2 = softmax_Activation() ## Create Softmax activation function object

dense1.forward(X) ## Forward pass through first layer
dense2.forward(dense1.output) ## Forward pass through second layer

activation1.forward(dense1.output) ## Apply ReLU activation function to output of first layer
activation2.forward(dense2.output) ## Apply Softmax activation function to output of second layer

print("First layer output after ReLU activation = \n", activation1.output[:5]) ## Print output of first layer after ReLU activation
print("\n")
print("Second layer output after Softmax activation = \n", activation2.output[:5]) ## Print output of second layer after Softmax activation
print("\n") 

loss_function = Loss_CategoricalCrossentropy() ## Create loss function object
loss = loss_function.calculate(activation2.output, y) ## Calculate loss, so you need to pass the y prediction and true labels
print("Loss: ", loss) ## Print loss

# Calculating accuracy not as usueful as loss but good to see
predictions = np.argmax(activation2.output, axis=1) ## this gives the index of the highest predicted probability for each sample
accuracy = np.mean(predictions == y) ## this compares the predictions to the true labels and calculates the mean accuracy
print("Accuracy: ", accuracy) ## Print accuracy

First layer output after ReLU activation = 
 [[0.         0.         0.        ]
 [0.         0.00113954 0.        ]
 [0.         0.00317292 0.        ]
 [0.         0.00526663 0.        ]
 [0.         0.00714014 0.        ]]


Second layer output after Softmax activation = 
 [[0.33333334 0.33333334 0.33333334]
 [0.33334687 0.33334196 0.3333112 ]
 [0.3333612  0.3333536  0.33328524]
 [0.33336097 0.33335987 0.33327916]
 [0.33337367 0.3333704  0.33325592]]


Loss:  1.0984595
Accuracy:  0.35


In [19]:
np.argmax?

[31mSignature:[39m       np.argmax(a, axis=[38;5;28;01mNone[39;00m, out=[38;5;28;01mNone[39;00m, *, keepdims=<no value>)
[31mCall signature:[39m  np.argmax(*args, **kwargs)
[31mType:[39m            _ArrayFunctionDispatcher
[31mString form:[39m     <function argmax at 0x10edab2e0>
[31mFile:[39m            ~/miniconda3/envs/mlproj/lib/python3.12/site-packages/numpy/_core/fromnumeric.py
[31mDocstring:[39m      
Returns the indices of the maximum values along an axis.

Parameters
----------
a : array_like
    Input array.
axis : int, optional
    By default, the index is into the flattened array, otherwise
    along the specified axis.
out : array, optional
    If provided, the result will be inserted into this array. It should
    be of the appropriate shape and dtype.
keepdims : bool, optional
    If this is set to True, the axes which are reduced are left
    in the result as dimensions with size one. With this option,
    the result will broadcast correctly against the 