In [1]:
import numpy as np
import pandas as pd

# Flow of Training a neural network

1. Initialize Parameters
2. For each epoch:
   - For every sample:
       - Forward Propagation (Predict Output)
       - Calculate Loss
       - Update weights and biases

In [2]:
# Let us create our data

df = pd.DataFrame([[8, 8, 4], [7, 9, 5],[6,10,6],[5,12,7]], columns=['cgpa', 'profile_score', 'package(lpa)'])
df

Unnamed: 0,cgpa,profile_score,package(lpa)
0,8,8,4
1,7,9,5
2,6,10,6
3,5,12,7


In [3]:
# Function to initialize parameters

# This will take in each layer dimensions and return us the weights and biases
# Initialization will be weights = 0.1, biases = 0 
# Initialization can be random as well if we want

# Input is of the form (2, 2, 1) - 2 Input Nodes, 2 Hidden Layer nodes, 1 Output Layer node 

# Output is of the form us 
# Wi which contains the weights going into nodes of Layer i
# bi which contains bias going into the nodes of Layer i

def initialize_parameters(layer_dims):
    
    parameters = {}
    num_layers = len(layer_dims)
    
    for layer_num in range(1, num_layers):
        parameters["W" + str(layer_num)] =  np.ones((layer_dims[layer_num-1], layer_dims[layer_num])) * 0.1
        parameters['b' + str(layer_num)] = np.zeros((layer_dims[layer_num], 1))

    return parameters

In [4]:
initialize_parameters([2,2,1])

{'W1': array([[0.1, 0.1],
        [0.1, 0.1]]),
 'b1': array([[0.],
        [0.]]),
 'W2': array([[0.1],
        [0.1]]),
 'b2': array([[0.]])}

In [5]:
# Forward Propagation at each layer

# Activation function will be linear

# Input takes 
# A_prev - Activation (output) from nodes of previous layer 
# W - Weights going into current layer
# b - Bias going into current layer

# Output will give us the Activations (outputs) going out of the current layer

def linear_forward(A_prev, W, b):
    
    Z = (W.T @ A_prev) + b
    
    return Z

In [6]:
# Forward propagation for N layers

# Takes input
# X - Single input sample from dataset
# parameters - Parameters of the ANN

def N_layer_forward(X, paramaters):

    A = X                             # Activation (output) from input layer
    N = (len(parameters) // 2) + 1    # Number of layers-1. // 2 cuz both weights and biases there in parameters. +1 to include input layer as well               

    
    for layer_num in range(1, N): 
        A_prev = A                                         # A_prev - output of prev layer
        W_curr_layer = parameters["W" + str(layer_num)]    # Current layer weights
        b_curr_layer = parameters["b" + str(layer_num)]    # Current layer biases

        # Printing out stuff
        # print("A"+str(layer_num-1) + ":\n", A_prev)
        # print("W"+str(layer_num) + ":\n", W_curr_layer)
        # print("b"+str(layer_num) + ":\n", b_curr_layer)
        # print("*" * 20)

        A = linear_forward(A_prev, W_curr_layer, b_curr_layer)    # forward propagation through current layer
        
        # print("A" + str(layer_num) + ":\n", A)
        # print("*" * 20)
        # print()
        # print()
        
    # We return both final layer and second final layer output cuz we need it in gradient calculations (see notes)
    return A, A_prev

## Now let us see the training 

### Initialize parameters

In [7]:
parameters = initialize_parameters([2, 2, 1])
parameters

{'W1': array([[0.1, 0.1],
        [0.1, 0.1]]),
 'b1': array([[0.],
        [0.]]),
 'W2': array([[0.1],
        [0.1]]),
 'b2': array([[0.]])}

### Select an input sample

In [8]:
# Selecting the first sample

X = df[['cgpa', 'profile_score']].values[0].reshape(2,1)    # Shape(no of features,1)
y = df[['package(lpa)']].values[0][0]

X, y

(array([[8],
        [8]]),
 4)

### Forward propagation

In [9]:
y_pred, A1 = N_layer_forward(X, parameters)
y_pred 

array([[0.32]])

In [10]:
# A2 - Final layer output (y_pred) 
# A0 - Inputs
# A1 - Hidden layer inputs

### Calculate Loss

In [11]:
loss = (y - y_pred)**2    # MSE
loss

array([[13.5424]])

### Update parameters

In [12]:
parameters['W2'][0].item()

0.1

In [13]:
# Function to update parameters 

# Note - This is for regression. Gradients are calculated using Linear Activation and MSE Loss
# Activation functions of all the nodes are taken as linear here

# From the derivations seen in notes
# A1 - output from the hidden layer - told ya we will need it 

def update_parameters(parameters, y, y_pred, A1, X, lr):
    
    parameters['W2'][0][0] = parameters['W2'][0][0] + (lr * 2 * (y - y_pred)*A1[0][0])
    parameters['W2'][1][0] = parameters['W2'][1][0] + (lr * 2 * (y - y_pred)*A1[1][0])
    parameters['b2'][0][0] = parameters['W2'][1][0] + (lr * 2 * (y - y_pred))
    
    parameters['W1'][0][0] = parameters['W1'][0][0] + (lr * 2 * (y - y_pred)*parameters['W2'][0][0]*X[0][0])
    parameters['W1'][0][1] = parameters['W1'][0][1] + (lr * 2 * (y - y_pred)*parameters['W2'][0][0]*X[1][0])
    parameters['b1'][0][0] = parameters['b1'][0][0] + (lr * 2 * (y - y_pred)*parameters['W2'][0][0])
    
    parameters['W1'][1][0] = parameters['W1'][1][0] + (lr * 2 * (y - y_pred)*parameters['W2'][1][0]*X[0][0])
    parameters['W1'][1][1] = parameters['W1'][1][1] + (lr * 2 * (y - y_pred)*parameters['W2'][1][0]*X[1][0])
    parameters['b1'][1][0] = parameters['b1'][1][0] + (lr * 2 * (y - y_pred)*parameters['W2'][1][0])

In [14]:
y_pred

array([[0.32]])

In [15]:
y_pred = y_pred[0][0]    # extracting element from array
update_parameters(parameters, y, y_pred, A1, X, 0.01)

In [16]:
parameters

{'W1': array([[0.22821709, 0.22821709],
        [0.22821709, 0.22821709]]),
 'b1': array([[0.01602714],
        [0.01602714]]),
 'W2': array([[0.21776],
        [0.21776]]),
 'b2': array([[0.29136]])}

In [17]:
# We can see that the parameters have been updated

# We just have to do the entire thing for all the samples once
# Then we get one epoch 

## Implementing Epochs

In [18]:
# epochs implementation

epochs = 5
parameters = initialize_parameters([2,2,1])

for epoch in range(epochs):

    loss = []     # to calculate average loss of all samples each epoch
    
    # Perform update for each sample
    for i in range(df.shape[0]): 

        # Select one sample 
        X = df[['cgpa', 'profile_score']].values[i].reshape(2, 1) # Shape(no of features, no. of training example)
        y = df[['package(lpa)']].values[i][0]

        # Forward Propagation
        y_pred,A1 = N_layer_forward(X,parameters)
        y_pred = y_pred[0][0]

        # Update Parameters
        update_parameters(parameters, y, y_pred, A1, X, 0.001)
        loss.append((y - y_pred) ** 2)
    
    print('Epoch-', epoch, 'Loss - ', np.array(loss).mean())

Epoch- 0 Loss -  25.321744156025517
Epoch- 1 Loss -  18.320004165722047
Epoch- 2 Loss -  9.473661050729628
Epoch- 3 Loss -  3.2520938634031613
Epoch- 4 Loss -  1.3407132589299962


In [19]:
parameters

{'W1': array([[0.26507636, 0.38558861],
        [0.27800387, 0.40980287]]),
 'b1': array([[0.02749056],
        [0.02974394]]),
 'W2': array([[0.41165744],
        [0.48302736]]),
 'b2': array([[0.48646246]])}

### Welp thats it thats entire backpropagation for you

## For Classification

Exact same process.
Only difference is
- Activation function is Sigmoid
- Loss is Log Loss (Binary Cross Entropy)

In [20]:
# Activation function
def sigmoid(Z):
  A = 1 / (1 + np.exp(-Z))
  return A

# Forward propagation 
def sigmoid_forward(A_prev, W, b):
  
  Z = np.dot(W.T, A_prev) + b
  A = sigmoid(Z)                      # added activation function 
  
  return A

In [None]:
# N Layer forward propagation
# Remains same

def N_layer_forward(X, paramaters):

    A = X                             
    N = (len(parameters) // 2) + 1                                

    for layer_num in range(1, N): 
        A_prev = A                                         
        W_curr_layer = parameters["W" + str(layer_num)]    
        b_curr_layer = parameters["b" + str(layer_num)]    

        A = linear_forward(A_prev, W_curr_layer, b_curr_layer)    

    return A, A_prev

In [None]:
# Function to update weights

# Based on the calculations done in notes
# Activation - Sigmoid, Loss - Binary Cross Entropy

def update_parameters(parameters, y, y_pred, A1, X, lr):
  parameters['W2'][0][0] = parameters['W2'][0][0] + (lr * (y - y_pred)*A1[0][0])
  parameters['W2'][1][0] = parameters['W2'][1][0] + (lr * (y - y_pred)*A1[1][0])
  parameters['b2'][0][0] = parameters['W2'][1][0] + (lr * (y - y_pred))

  parameters['W1'][0][0] = parameters['W1'][0][0] + (lr * (y - y_pred)*parameters['W2'][0][0]*A1[0][0]*(1-A1[0][0])*X[0][0])
  parameters['W1'][0][1] = parameters['W1'][0][1] + (lr * (y - y_pred)*parameters['W2'][0][0]*A1[0][0]*(1-A1[0][0])*X[1][0])
  parameters['b1'][0][0] = parameters['b1'][0][0] + (lr * (y - y_pred)*parameters['W2'][0][0]*A1[0][0]*(1-A1[0][0]))

  parameters['W1'][1][0] = parameters['W1'][1][0] + (lr * (y - y_pred)*parameters['W2'][1][0]*A1[1][0]*(1-A1[1][0])*X[0][0])
  parameters['W1'][1][1] = parameters['W1'][1][1] + (lr * (y - y_pred)*parameters['W2'][1][0]*A1[1][0]*(1-A1[1][0])*X[1][0])
  parameters['b1'][1][0] = parameters['b1'][1][0] + (lr * (y - y_pred)*parameters['W2'][1][0]*A1[1][0]*(1-A1[1][0]))