In [1]:
import numpy as np

In [2]:
# A single neuron that receives its input from 3 neurons of the previous layer.

inputs = [1, 2, 3.5]       # values arriving from the 3 upstream neurons
weights = [2.7, 3.5, 4.2]  # one weight per incoming link
bias = 3                   # bias of the receiving neuron

# Accumulate input * weight link by link, then add the bias once at the end.
weighted_sum = 0
for signal, weight in zip(inputs, weights):
    weighted_sum += signal * weight

output = weighted_sum + bias

In [3]:
# Display the value of `output` computed by the single-neuron cell above.
output

27.4

In [4]:
# A layer of 3 neurons; every neuron receives the same 4 values from the previous 4-neuron layer.

inputs = [1.0, 2.0, 3.0, 2.5]    # shared by all 3 neurons of this layer

weights = [                       # one row per neuron, one weight per incoming link
    [0.2, 0.8, -0.5, 1.0],
    [0.5, -0.91, 0.26, -0.5],
    [-0.26, -0.27, 0.17, 0.87],
]

biases = [2.0, 3.0, 0.5]          # one bias per neuron

output = []

for neuron in range(len(biases)):
    # Weighted sum of all inputs for this neuron, accumulated link by link.
    weighted_sum = 0
    for link in range(len(inputs)):
        weighted_sum += inputs[link] * weights[neuron][link]
    output.append(weighted_sum + biases[neuron])

print(output)


[4.8, 1.21, 2.385]


In [7]:
# Demonstration of a shape mismatch: the operands are in the wrong order.
# inputs is a vector with 4 entries and weights is a 3x4 matrix. np.dot pairs the
# entries of the first operand with the ROWS of the second one, so it would need
# 4 rows in weights but finds only 3. The cell that uses np.dot(weights, inputs)
# shows the working order.
# The error is caught and printed so that "Restart & Run All" does not stop here.
try:
    output = np.dot(inputs, weights) + biases
    print(output)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: shapes (4,) and (3,4) not aligned: 4 (dim 0) != 3 (dim 0)

In [6]:
# Now the dot product: an easier way of doing what the nested loops did.

output = np.dot(weights, inputs) + biases
print(output)

# The dot product here is a matrix-vector multiplication: weights is a 3x4 matrix, inputs is a vector of 4 values.
# For the product to be defined, the number of columns of the first operand (4) must equal the number of entries of the second (4).
# The result has one value per row of the first operand, i.e. 3 values - one per neuron; biases is then added element-wise.
# High school and college math is coming in handy, who would have thought.

[4.8   1.21  2.385]


In [23]:
# A batch of 3 input sets, i.e. inputs coming from 3 different records (4 features each).
# For a batch the layer is computed with a full matrix multiplication.

inputs = [
    [1.0, 2.0, 3.0, 2.5],
    [2.0, 5.0, -1.0, 2.0],
    [-1.5, 2.7, 3.3, -0.8],
]

weights = [
    [0.2, 0.8, -0.5, 1.0],
    [0.5, -0.91, 0.26, -0.5],
    [-0.26, -0.27, 0.17, 0.87],
]

biases = [2.0, 3.0, 0.5]

# inputs is 3x4 and weights is 3x4 as well, so they cannot be multiplied directly:
# the inner dimensions (4 and 3) do not match. Transposing weights to 4x3 gives
# (3x4) . (4x3) -> 3x3, with one row per record and one column per neuron.
weights_transposed = np.array(weights).transpose()
weighted_sums = np.dot(inputs, weights_transposed)

# biases holds one value per neuron (column), so broadcasting adds it to every row.
output = weighted_sums + biases
print(output)

[[ 4.8    1.21   2.385]
 [ 8.9   -1.81   0.2  ]
 [ 1.41   1.051  0.026]]


In [26]:
# Two dense layers chained together: the output of layer 1 is the input of layer 2.

inputs = [
    [1.0, 2.0, 3.0, 2.5],
    [2.0, 5.0, -1.0, 2.0],
    [-1.5, 2.7, 3.3, -0.8],
]

# Layer 1: 3 neurons, each with 4 incoming weights.
weights = [
    [0.2, 0.8, -0.5, 1.0],
    [0.5, -0.91, 0.26, -0.5],
    [-0.26, -0.27, 0.17, 0.87],
]
biases = [2.0, 3.0, 0.5]

# Layer 2: 3 neurons, each with 3 incoming weights (one per layer-1 neuron).
weights2 = [
    [0.1, -0.14, 0.5],
    [-0.5, 0.12, -0.33],
    [-0.44, 0.73, -0.13],
]
biases2 = [-1.0, 2.0, -0.5]

# Each weight matrix is stored one row per neuron, so it is transposed before multiplying.
layer1_weights_t = np.array(weights).transpose()
layer1_output = np.dot(inputs, layer1_weights_t) + biases

layer2_weights_t = np.array(weights2).transpose()
layer2_output = np.dot(layer1_output, layer2_weights_t) + biases2  # second layer fed by the first

print(layer2_output)


[[ 0.5031  -1.04185 -2.03875]
 [ 0.2434  -2.7332  -5.7633 ]
 [-0.99314  1.41254 -0.35655]]


In [34]:
# Now we start building a NN with random weights and biases, using OOP.
# We usually keep the initial weights small (the layer class below scales standard-normal draws by 0.1, so they are
# on the order of +/-0.1) so that the values output by the neurons don't become too large, which is called exploding.
# We start the biases at 0. If with zero biases the output is all zeros, the NN is dead; in that case start the
# biases with a non-zero number.
# We'll use the same input as above.

# Fix the seed of NumPy's global random generator so the random weights are the same on every run.
np.random.seed(0)

X = [[1.0, 2.0, 3.0, 2.5],            # input sample set: 3 samples with 4 features each
     [2.0, 5.0, -1.0, 2.0], 
     [-1.5, 2.7, 3.3, -0.8]]

class Layer_Dense:
    """Fully connected layer: every neuron is linked to every input feature.

    Attributes:
        weights: array of shape (n_inputs, n_neurons); standard-normal draws scaled by 0.1.
        biases: array of shape (1, n_neurons), all zeros initially.
        output: result of the latest forward() call (only exists after forward() has run).
    """

    def __init__(self, n_inputs, n_neurons):
        """Create the layer's parameters.

        Args:
            n_inputs: number of features in a single sample (length of one row of the input).
            n_neurons: number of neurons in this layer.
        """
        # randn takes the dimensions as separate arguments; storing the weights as
        # (n_inputs, n_neurons) means forward() needs no transpose.
        random_weights = np.random.randn(n_inputs, n_neurons)
        self.weights = random_weights * 0.1
        # np.zeros takes the whole shape as one tuple argument.
        self.biases = np.zeros((1, n_neurons))

    def forward(self, inputs):
        """Compute inputs . weights + biases and store the result in self.output."""
        weighted_sums = np.dot(inputs, self.weights)
        self.output = weighted_sums + self.biases
        
# The layers are created in this order on purpose: each constructor draws its weights
# from NumPy's global random generator, so the order decides which numbers each layer gets.
layer1 = Layer_Dense(4, 5)   # n_inputs is 4 because each of the 3 samples in X has 4 features; n_neurons is set to 5
layer2 = Layer_Dense(5, 2)   # n_inputs is 5 because layer 1 produces 5 outputs per sample; n_neurons is set to 2

layer1.forward(X)
print(layer1.output)       # for the 3 samples: three rows with 5 values (one per neuron) each

print('\n\n<------------->\n\n')

layer2.forward(layer1.output)
print(layer2.output)            # for the 3 samples: three rows with 2 values (one per neuron) each

[[ 0.10758131  1.03983522  0.24462411  0.31821498  0.18851053]
 [-0.08349796  0.70846411  0.00293357  0.44701525  0.36360538]
 [-0.50763245  0.55688422  0.07987797 -0.34889573  0.04553042]]


<------------->


[[ 0.148296   -0.08397602]
 [ 0.14100315 -0.01340469]
 [ 0.20124979 -0.07290616]]


## Activation Functions



The input to the activation function is the result of (input * weight) + bias.

Step Function --> Gives output as 1 if input is > 0, else 0

Sigmoid Function --> 
$y = \frac{1}{1 + e^{-x}}$

Why sigmoid over the step function?
The output of the sigmoid is more granular than that of the step function, i.e., with the sigmoid we can tell, for a given input, how close the output gets to 1. One main issue with the sigmoid is the so-called vanishing gradient problem.

Rectified Linear Unit --> If input is > 0, then the output is input. If input <= 0, then output is 0
y = x, if x > 0
y = 0, if x <= 0

Why ReLU over sigmoid?
It is as granular as the sigmoid, but computing ReLU is much simpler, so it is much faster than the sigmoid.
ReLU is the most commonly used activation function.

In [6]:
import numpy as np
import nnfs
from nnfs.datasets import spiral_data  # See for code: https://gist.github.com/Sentdex/454cb20ec5acf0e76ee8ab8448e6266c
                                       #from https://cs231n.github.io/neural-networks-case-study/ 

# nnfs.init() acts similarly to setting a seed value.
# NOTE(review): presumably it also sets NumPy's default dtype - confirm against the nnfs documentation.
nnfs.init()
# spiral_data(100, 3): 100 samples and 3 classes. Each sample is the (x, y) coordinate of a dot,
# so every sample has two features: the x coordinate and the y coordinate.
# NOTE(review): presumably 100 points are generated for each of the 3 classes - confirm against the nnfs documentation.
X, y = spiral_data(100, 3)


class Layer_Dense:
    """Dense (fully connected) layer with small random weights and zero biases."""

    def __init__(self, n_inputs, n_neurons):
        """Allocate the parameters of the layer.

        Args:
            n_inputs: number of features per sample.
            n_neurons: number of neurons in the layer.
        """
        # Weights have shape (n_inputs, n_neurons): standard-normal draws scaled down by 0.1.
        unscaled = np.random.randn(n_inputs, n_neurons)
        self.weights = unscaled * 0.1
        # One bias per neuron, kept as a single row of shape (1, n_neurons).
        self.biases = np.zeros((1, n_neurons))

    def forward(self, inputs):
        """Store inputs . weights + biases in self.output."""
        projected = np.dot(inputs, self.weights)
        self.output = projected + self.biases
        
class Activation_ReLU:
    """Rectified Linear Unit activation: keeps positive values, replaces the rest with 0."""

    def forward(self, inputs):
        """Apply max(0, x) element-wise to inputs and store the result in self.output."""
        rectified = np.maximum(0, inputs)
        self.output = rectified
        
layer1 = Layer_Dense(2, 5)  # 2 because each sample has two features; 5 is the number of neurons
layer1.forward(X)
print(layer1.output)        # raw layer output, before the activation
activation = Activation_ReLU()
activation.forward(layer1.output)
print('\n\n\n<--------------->')
print(activation.output)    # same values after ReLU: every negative entry has been replaced by zero

          

[[ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00]
 [-8.35815910e-04 -7.90404272e-04 -1.33452227e-03  4.65504505e-04
   4.56846210e-05]
 [-2.39994470e-03  5.93469958e-05 -2.24808278e-03  2.03573116e-04
   6.10024377e-04]
 ...
 [ 1.13291524e-01 -1.89262271e-01 -2.06855070e-02  8.11079666e-02
  -6.71350807e-02]
 [ 1.34588361e-01 -1.43197834e-01  3.09493970e-02  5.66337556e-02
  -6.29687458e-02]
 [ 1.07817926e-01 -2.00809643e-01 -3.37579325e-02  8.72561932e-02
  -6.81458861e-02]]



<--------------->
[[0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 4.65504505e-04
  4.56846210e-05]
 [0.00000000e+00 5.93469958e-05 0.00000000e+00 2.03573116e-04
  6.10024377e-04]
 ...
 [1.13291524e-01 0.00000000e+00 0.00000000e+00 8.11079666e-02
  0.00000000e+00]
 [1.34588361e-01 0.00000000e+00 3.09493970e-02 5.66337556e-02
  0.00000000e+00]
 [1.07817926e-01 0.00000000e+00 0.00000000e+00 8.72561932e

## Softmax Activation Function:

Why softmax for last output layer, why not just relu?
Well, while training a neural network it is important to see how close each output is to the actual value, i.e., to calculate the loss and back-propagate it through the NN to increase accuracy. The first reason is that ReLU works on one neuron at a time (one input to a neuron, one output from that neuron), whereas for the output layer we need a probability distribution across all classes/output neurons in order to train the model. The second reason is that ReLU converts all negative outputs to 0. So if one set of inputs gives -0.00056 and another gives -90000.6, even though the actual difference is very large, ReLU will convert both to 0 and we'll never know which one is closer to the actual output. The third reason is: what if all the outputs are negative values? Then everything becomes zero and the NN will be dead.

What's in softmax function?


- In the softmax function, we first exponentiate all the values, to convert negative values to positive ones without losing the meaning of the negative values.

y = e^x, where e is Euler's number, e ≈ 2.718281828459045.
For negative values this function gives a very small positive value, and its output increases exponentially as the input goes up.

- Then we normalize the output values i.e., we divide each output by sum of all the outputs, so as to get the probability distribution among the outputs.

In [9]:
# Implementing softmax

import numpy as np
import nnfs
from nnfs.datasets import spiral_data 
                                       
# NOTE(review): presumably seeds NumPy's random generator and sets the default dtype - confirm against the nnfs documentation.
nnfs.init()        
 
class Layer_Dense:
    """Fully connected layer holding a weight matrix and a row of biases."""

    def __init__(self, n_inputs, n_neurons):
        """Initialise weights with small random numbers and biases with zeros.

        Args:
            n_inputs: number of values each sample feeds into the layer.
            n_neurons: number of neurons, i.e. number of values the layer produces per sample.
        """
        # Shape (n_inputs, n_neurons) lets forward() multiply without transposing.
        draws = np.random.randn(n_inputs, n_neurons)
        self.weights = draws * 0.1
        self.biases = np.zeros((1, n_neurons))

    def forward(self, inputs):
        """Compute the layer output for inputs and keep it in self.output."""
        linear_part = np.dot(inputs, self.weights)
        self.output = linear_part + self.biases
        
class ActivationReLU:
    """ReLU activation: element-wise max(0, x)."""

    def forward(self, inputs):
        """Replace every negative entry of inputs with 0 and store the result in self.output."""
        clipped_at_zero = np.maximum(0, inputs)
        self.output = clipped_at_zero
        
class ActivationSoftmax:
    """Softmax activation: turns each row of scores into a probability distribution."""

    def forward(self, inputs):
        """Compute the row-wise softmax of inputs and store it in self.output.

        inputs is treated as a batch: axis 1 runs over the values of one sample.
        """
        # Largest value of every row. axis=1 takes the max per row (the default
        # axis=None would return a single max over the whole batch), and
        # keepdims=True keeps it as a column so it can be subtracted row by row.
        row_max = np.max(inputs, axis = 1, keepdims = True)

        # Subtracting the row max before exponentiating keeps exp() from exploding:
        # every shifted value is <= 0, so every exponential lies between 0 and 1.
        shifted = inputs - row_max
        exp_values = np.exp(shifted)

        # Normalise each row by its own sum so the values of a row add up to 1.
        row_totals = np.sum(exp_values, axis=1, keepdims = True)
        self.output = exp_values/row_totals
        

# Spiral dataset requested with 100 samples and 3 classes.
X, y = spiral_data(samples = 100, classes = 3)   

# Hidden layer: 2 input features per sample -> 5 neurons, followed by ReLU.
layer1 = Layer_Dense(2, 5)
activation1 = ActivationReLU()

# Output layer: 5 values from the hidden layer -> 3 neurons, one per class, followed by softmax.
layer2 = Layer_Dense(5, 3)  # output is 3 classes
activation2 = ActivationSoftmax()

# Forward pass: each step consumes the output stored by the previous one.
layer1.forward(X)
activation1.forward(layer1.output)
layer2.forward(activation1.output)
activation2.forward(layer2.output)

# Show only the first five rows rather than the whole array.
# The network is untrained, and the printed rows are all close to 1/3 per class.

print(activation2.output[:5])

[[0.33333334 0.33333334 0.33333334]
 [0.33334148 0.33333018 0.33332834]
 [0.33335316 0.33332598 0.3333209 ]
 [0.333332   0.3333076  0.3333604 ]
 [0.33333603 0.33330086 0.33336315]]


## Loss Functions:

Categorical Cross-Entropy -> -Sum(one_hot_vector\[i\] * log(softmax_output\[i\]))

One Hot Vector-> It is a vector of size of number of target classes in the model, with the target class index value 1 and all other index values 0.

Let's say a model has three target classes.

target classes = \[cat, dog, squirrel\]

- for input of image x1 with label cat --> one hot vector = \[1, 0, 0\]
- for input of image x2 with label dog --> one hot vector = \[0, 1, 0\]
- for input of image x3 with label squirrel --> one hot vector = \[0, 0, 1\]


In [11]:
import math

softmax_output = [0.7, 0.1, 0.2]     # predicted probability for each class
target_output = [1, 0, 0]            # one-hot vector: the first class is the correct one

# Sum log(prediction) * target over all classes; only the entry of the correct
# class contributes because every other target is 0.
log_likelihood = 0
for predicted, expected in zip(softmax_output, target_output):
    log_likelihood += math.log(predicted) * expected

loss = -log_likelihood

print(loss)

print(-math.log(0.7))  # the closer the prediction for the correct class is to one, the smaller the loss
print(-math.log(0.5))  # the closer it gets to zero, the larger the loss, as the printed value shows

0.35667494393873245
0.35667494393873245
0.6931471805599453
