In [30]:
# # Pytorch MNIST digits example from https://machinelearningmastery.com/handwritten-digit-recognition-with-lenet5-model-in-pytorch/
# # THIS IS ME PLAYING WITH a relatively large MLP for fun
# # SEEMS TO WORK WELL > 98% ACCURACY!!!


# # Imports
import torch # Main PyTorch module
import torch.nn as nn # Neural Network module
import torch.optim as optim  # Optimization module
import torchvision # Computer Vision module
import matplotlib.pyplot as plt
from tqdm import tqdm
import sys
import numpy as np
import time
import math
import nnfs



In [None]:
# Q1 Why do we need a softmask layer here if we are not doing classification yet? 
# Answer1: We don't need it necessarily, but it's useful to see how the outputs look after softmax activation.

# Q2 What is the point of softmax if we are not doing classification yet? 
# Answer2: Softmax converts raw output scores into probabilities, 
# which can be useful for understanding the model's confidence in its predictions, even if we're not using it for classification at this stage.

# Q3 What is the difference between softmax and ReLU?
# Answer3: ReLU (Rectified Linear Unit) is an activation function that outputs the input directly if it is positive; otherwise, it outputs zero. 
# It is used to introduce non-linearity in the model. Softmax, on the other hand, is an activation function that converts a vector of raw scores into probabilities, where the probabilities sum to one.
# It is typically used in the output layer of a classification model to represent the likelihood of each class.

# Q4 Do we need both ReLU and softmax in the same model?
# Answer4: Yes, it is common to use both ReLU and softmax in the same model because one is used for hidden layers (ReLU) and the other for the output layer (softmax).

# Q5 Can we use softmax in hidden layers?
# Answer5: While it is technically possible to use softmax in hidden layers, it is not common practice.


In [32]:
## Implementing softmax now in a neural networks

## Softmask function is exp(value) / sum of all exp(values) 

layer_outputs = [[4.8, 1.21, 2.385],
                 [8.9, -1.81, 0.2],
                 [1.41, 1.051, 0.026]]

# Subtract largest value prior to exponentiation to improve numerical stability and avoid overflow. The output does not change.

exp_values = np.exp(layer_outputs - np.max(layer_outputs, axis=1, keepdims=True))

norm_values = exp_values / np.sum(exp_values, axis=1, keepdims=True) ## Matrix of exact same dimension of layer_outputs

print("Normalised values = ", norm_values)
# print("Sum of normalised values = ", np.sum(norm_values), " (should be 1.0)")



Normalised values =  [[8.95282664e-01 2.47083068e-02 8.00090293e-02]
 [9.99811129e-01 2.23163963e-05 1.66554348e-04]
 [5.13097164e-01 3.58333899e-01 1.28568936e-01]]


In [37]:
from nnfs.datasets import spiral_data
np.random.seed(0) ## Set random seed for reproducibility

nnfs.init() ## Initialize nnfs (sets default data type to float32 etc.)

## Create Layer class 
class Layer_Dense: 
    def __init__(self, n_inputs, n_neurons): ## n_inputs = number of inputs to the layer, n_neurons = number of neurons in the layer
        ## Initialize weights and biases
        self.weights = 0.10 * np.random.randn(n_inputs, n_neurons) ## Random weights with small values. This is useful for forward pass and we do not need to transpose weights here.
        self.biases = np.zeros((1, n_neurons)) ## Biases should be initialized to zero values

    def forward(self, inputs):
        ## Forward pass
        self.output = np.dot(inputs, self.weights) + self.biases


class Activation_ReLU:
    def forward(self, inputs):
        ## Apply ReLU activation function
        self.output = np.maximum(0, inputs) ## Element-wise maximum operation

class softmax_Activation:
    def forward(self, inputs):
        ## Apply Softmax activation function
        exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True)) ## Subtract max for numerical stability
        probabilities= exp_values / np.sum(exp_values, axis=1, keepdims=True) ## Normalization step
        self.output = probabilities


## Create first layer
X, y = spiral_data(samples = 100, classes =3) ## Create dataset with 3 classes
dense1 = Layer_Dense(2,3) ## Create first dense layer with 2 inputs and 3 neurons
# Create activation function
activation1 = Activation_ReLU() ## Create ReLU activation function object


dense2 = Layer_Dense(3,3) ## Create second dense layer with 3 inputs (from previous layer) and 3 neurons
activation2 = softmax_Activation() ## Create Softmax activation function object

dense1.forward(X) ## Forward pass through first layer
dense2.forward(dense1.output) ## Forward pass through second layer

activation1.forward(dense1.output) ## Apply ReLU activation function to output of first layer
activation2.forward(dense2.output) ## Apply Softmax activation function to output of second layer

print("First layer output after ReLU activation = \n", activation1.output[:5]) ## Print output of first layer after ReLU activation
print("\n")
print("Second layer output after Softmax activation = \n", activation2.output[:5]) ## Print output of second layer after Softmax activation


First layer output after ReLU activation = 
 [[0.         0.         0.        ]
 [0.         0.00113954 0.        ]
 [0.         0.00317292 0.        ]
 [0.         0.00526663 0.        ]
 [0.         0.00714014 0.        ]]


Second layer output after Softmax activation = 
 [[0.33333334 0.33333334 0.33333334]
 [0.33334687 0.33334196 0.3333112 ]
 [0.3333612  0.3333536  0.33328524]
 [0.33336097 0.33335987 0.33327916]
 [0.33337367 0.3333704  0.33325592]]
