In [None]:
# import required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
%matplotlib inline

In [None]:
# Read the training data from the file `backprop.csv`
df = pd.read_csv('backprop.csv')

# Predictor and response, each reshaped into a single-column 2-D array
x = np.reshape(df.x.values, (-1, 1))
y = np.reshape(df.y.values, (-1, 1))

In [None]:
# Initialize the weights; the random seed is fixed at 310 for reproducible results

np.random.seed(310)
# Two (1, 1) weights, drawn one after the other from the seeded generator
W = [np.random.randn(1, 1) for _ in range(2)]

## Defining the activation function and the neural network

In [29]:
# Activation function applied to each affine transformation
def A(x):
    '''
    Return the sine of x, computed element-wise by np.sin.
    '''
    activated = np.sin(x)
    return activated

# Derivative of the activation function
def der_A(x):
    '''
    Return the cosine of x, computed element-wise by np.cos
    (the derivative of the sine activation).
    '''
    slope = np.cos(x)
    return slope

In [30]:
# The simple neural network used in the previous exercise

def neural_network(W, x):
    '''
    Forward pass: two affine transformations, each followed by the activation A.

    Parameters
    ----------
    W : indexable holding the first weight at W[0] and the second at W[1]
    x : input combined with W[0] through np.dot

    Returns
    -------
    (a1, a2, y) : the first affine transformation, the second affine
    transformation, and the network output A(a2)
    '''

    # Layer 1: affine transformation, then the activation
    first_affine = np.dot(x, W[0])
    hidden = A(first_affine)

    # Layer 2: affine transformation on the hidden activation, then the activation
    second_affine = np.dot(hidden, W[1])
    output = A(second_affine)

    return first_affine, second_affine, output

## Building the chain-rule components 

### ⏸ Look at the schematic in the instructions. If we consider the first weight, what is $\ \frac{\partial a_1}{\partial w_1}$?

- A. $x$
- B. $f(x)$
- C. $f'(x)$
- D. $w_1x$

In [None]:
### edTest(test_chow1) ###
# Submit an answer choice as a string below (eg. if you choose option C, put 'C')
# '___' is a placeholder: replace it with the letter (A-D) of the option you choose.
answer1 = '___'

In [32]:
# Function to compute the partial derivative of a (particular neuron) with respect to the corresponding weight w

def dadw(x,firstweight=False):
    '''
    Partial derivative of an affine transformation with respect to the weight feeding it.

    That derivative is the quantity the weight multiplies: the activation of the
    previous neuron, or the raw input itself for the first weight, because the
    input layer has no activation associated with it.

    Parameters
    ----------
    x : the network input (when firstweight is truthy), otherwise the affine
        transformation of the previous neuron
    firstweight : truthy when differentiating with respect to the first weight

    Returns
    -------
    x unchanged when firstweight is truthy, otherwise A(x)
    '''
    # Plain truthiness test rather than comparing a boolean to True with ==
    if firstweight:
        return x
    return A(x)

In [33]:
# Function to compute the partial derivative of h with respect to a

def dhda(a):
    '''
    Derivative of a hidden activation output h with respect to its affine
    transformation a: the activation derivative der_A evaluated at a.
    '''
    activation_slope = der_A(a)
    return activation_slope

In [34]:
# Function to compute the partial derivative of y with respect to a

def dyda(a):
    '''
    Derivative of the network output y with respect to the final affine
    transformation a: the activation derivative der_A evaluated at a.
    '''
    output_slope = der_A(a)
    return output_slope

In [35]:
# Function to compute the partial derivative of a with respect to h
def dadh(w):
    '''
    Derivative of an affine transformation with respect to the incoming
    activation h: the weight w that multiplies h, returned unchanged.
    '''
    weight = w
    return weight

In [63]:
# Function to compute the partial derivative of the loss with respect to y
def dldy(y_pred,y):
    '''
    Derivative of the squared-error loss with respect to the prediction.

    Returns 2*(y_pred - y) for every prediction/response pair; no averaging
    is done here.
    '''
    residual = y_pred - y
    return 2 * residual

### ⏸ Look at the schematic in the instructions. What is the difference between $h_i$ and $a_i$ for a given layer $i$?

- A. $h_i$ and $a_i$ are one and the same
- B. $h_i$ is the affine transformation on inputs from layer $i-1$ and $a_i$ is the activation over $h_i$
- C. $a_i$ is the affine transformation on inputs from layer $i-1$ and $h_i$ is the activation over $a_i$
- D. We use $a_i$ in case of linear functions and $h_i$ in case of non-linear functions

In [None]:
### edTest(test_chow2) ###
# Submit an answer choice as a string below (eg. if you choose option C, put 'C')
# '___' is a placeholder: replace it with the letter (A-D) of the option you choose.
answer2 = '___'

In [64]:
# Function to compute the partial derivative of the loss with respect to each weight w

def dldw(W,x,y_true=None):
    '''
    Gradient of the loss with respect to both weights, via the chain rule.

    Combines the component derivatives defined above:
        dL/dw2 = dL/dy * dy/da2 * da2/dw2
        dL/dw1 = dL/dy * dy/da2 * da2/dh1 * dh1/da1 * da1/dw1
    Each product is evaluated for all points and then averaged.

    Parameters
    ----------
    W : pair of weights (w1, w2)
    x : network input
    y_true : response values to compare the predictions against; when omitted,
             the notebook-level variable `y` is used

    Returns
    -------
    list [mean dL/dw1, mean dL/dw2]
    '''
    # Fall back to the notebook-level response so existing dldw(W, x) calls keep working
    if y_true is None:
        y_true = y

    a1,a2,y_pred = neural_network(W,x)
    _,w2 = W

    # Factor shared by both chain-rule products: dL/dy * dy/da2
    upstream = dldy(y_pred,y_true)*dyda(a2)

    dldw2 = upstream*dadw(a1)
    dldw1 = upstream*dadh(w2)*dhda(a1)*dadw(x,firstweight=True)

    return [np.mean(dldw1),np.mean(dldw2)]

### Hint: 

For the above, remember:

$$\frac{\partial L}{\partial w_1}\ =\ \frac{\partial L}{\partial y}\ \frac{\partial y}{\partial a_2}\frac{\partial a_2}{\partial h_1}\ \frac{\partial h_1}{\partial a_1}\frac{\partial a_1}{\partial w_1}$$

$$\frac{\partial L}{\partial w_2}\ =\ \frac{\partial L}{\partial y}\ \frac{\partial y}{\partial a_2}\frac{\partial a_2}{\partial w_2}$$

In [71]:
### edTest(test_gradient) ###

# Compute the gradient of the loss function with respect to the weights using function defined above
# dldw returns a list of two means, ordered [dL/dw1, dL/dw2]
gradW = dldw(W,x)

# Print the list of your gradients below
# NOTE(review): the message says "derivatives of w1 w2 wrt L", but the values are the
# derivatives of L wrt w1 and w2; wording left unchanged to match the recorded output.
print(f'The derivatives of w1 w2 wrt L are {gradW}')


The derivatives of w1 w2 wrt L are [-0.007777900562982125, 0.2788002123149505]


# Mindchow 🍲

1. Compare your computed partial derivatives with those from the previous exercise. Are they the same?

2. This example was just for a simple case of 1 neuron in 1 hidden layer. How could we generalize this idea to compute partial derivatives of all the weights?