In [422]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
%matplotlib inline


In [423]:
# Get the data from the file `backprop.csv`
df = pd.read_csv('backprop.csv')

# Take a look at the data 
df.head()


Unnamed: 0,x,y
0,-5.0,0.006693
1,-4.98999,0.00676
2,-4.97998,0.006827
3,-4.96997,0.006895
4,-4.95996,0.006964


In [424]:
# Assign the predictor and response variables to x and y
# Ensure you reshape the data appropriately to convert vector to a matrix
x = np.array(df.x).reshape(-1, 1)
y = np.array(df.y).reshape(-1, 1)

# Take a look at the predictor and response data post reshaping
print(x)
print(y)


[[-5.        ]
 [-4.98998999]
 [-4.97997998]
 [-4.96996997]
 [-4.95995996]
 [-4.94994995]
 [-4.93993994]
 [-4.92992993]
 [-4.91991992]
 [-4.90990991]
 [-4.8998999 ]
 [-4.88988989]
 [-4.87987988]
 [-4.86986987]
 [-4.85985986]
 [-4.84984985]
 [-4.83983984]
 [-4.82982983]
 [-4.81981982]
 [-4.80980981]
 [-4.7997998 ]
 [-4.78978979]
 [-4.77977978]
 [-4.76976977]
 [-4.75975976]
 [-4.74974975]
 [-4.73973974]
 [-4.72972973]
 [-4.71971972]
 [-4.70970971]
 [-4.6996997 ]
 [-4.68968969]
 [-4.67967968]
 [-4.66966967]
 [-4.65965966]
 [-4.64964965]
 [-4.63963964]
 [-4.62962963]
 [-4.61961962]
 [-4.60960961]
 [-4.5995996 ]
 [-4.58958959]
 [-4.57957958]
 [-4.56956957]
 [-4.55955956]
 [-4.54954955]
 [-4.53953954]
 [-4.52952953]
 [-4.51951952]
 [-4.50950951]
 [-4.4994995 ]
 [-4.48948949]
 [-4.47947948]
 [-4.46946947]
 [-4.45945946]
 [-4.44944945]
 [-4.43943944]
 [-4.42942943]
 [-4.41941942]
 [-4.40940941]
 [-4.3993994 ]
 [-4.38938939]
 [-4.37937938]
 [-4.36936937]
 [-4.35935936]
 [-4.34934935]
 [-4.33933

The network has 2 layers: a hidden layer and an output layer.\
Each layer has a single neuron (i.e., just one weight).\
$w_1$ is the weight for the 1st layer.\
$w_2$ is the weight for the 2nd layer.\
There are no bias terms.

We will use randomly initialized weights.

In [425]:
# Initialize the weights, keeping the random seed as 310 for reproducible results
np.random.seed(310)

# W is a list that contains both w1 and w2
# That is, W = [w1,w2]
W = [np.random.randn(1, 1), np.random.randn(1, 1)]


## Defining the activation function and the neural network

In [426]:
### edTest(test_activation_fn) ###

# Function to compute the sin activation function for backpropagation
def A(x):
    return np.sin(x)

# Function to compute the derivative of the sin activation function
def der_A(x):
    return np.cos(x)
    

In [427]:
### edTest(test_forward) ###

# Function to define the forward pass of neural network
def neural_network(W, x):

    # W is a list of the NN's two weights (w1,w2)
    # x is the input to the neural network
    w1 = W[0]
    w2 = W[1]
    
    '''
    Computes z1, z2, a1, and y_hat based on the image in the description
    '''

    z1 = x * w1
    a1 = A(z1)
    z2 = a1 * w2
    y_hat = A(z2)
    
    return z1, z2, a1, y_hat
    

## Building the chain-rule components 
Construct the individual partial derivatives required for finding the derivative of the loss w.r.t. each weight.

In [428]:
### edTest(test_dzdw) ###

# Function to compute the partial derivate of affine transformation 
# wrt corresponding weight w

def dzdw(layer_input):

    '''
    Take into account that this could be the first layer or an 
    intermediate layer
    This function should be able to handle either case. 
    '''

    return layer_input
    

In [429]:
### edTest(test_dadz) ###

# Function to compute the partial derivate of activation wrt affine

def dadz(z):
    
    return der_A(z)
    

In [430]:
### edTest(test_dzda) ###

# Function to compute the partial derivate of affine
# wrt the previous activation

def dzda(w):

    return w
    

We use `y` rather than `y_hat` in the function names below just to keep things tidy (`dy_hatdz` would be more accurate but also very ugly).

In [431]:
### edTest(test_dydz) ###

# Function to compute the partial derivate of output, y_hat, 
# wrt the affine

def dydz(z):

    return der_A(z)
    

In [432]:
### edTest(test_dldy) ###

# Function to compute the partial derivate of loss with respect to y_hat
# The loss used here is the squared error

def dldy(y, y_hat):

    '''
    y is the grouth truth 
    y_hat is the predicted response
    '''

    return 2 * (y_hat-y)
    

In [433]:
# Function to compute the partial derivate of loss with respect to w

def dldw(W, x, y):

    '''
    ARGS:
    W: List of weights, w1 and w2
    x: input for forward pass
    y: true response variable

    Combine the functions from above and find the derivative wrt the weights
    These will be for all the points, hence take a mean of all values 
    for each partial derivative and return as a list of 2 values
    '''

    z1, z2, a1, y_hat = neural_network(W, x)
    w1, w2 = W
    
    # Derivative of the loss wrt the second weight
    # dldw2 = dldy(y, y_hat).reshape(-1,)*dydz(z2).reshape(-1,)*dzda(w2).reshape(-1,)[0]
    dldw2 = dldy(y, y_hat)*dydz(z2)*dzdw(a1)

    # print("dldy(y, y_hat)", dldy(y, y_hat)[:5])
    # print("dydz(z2)", dydz(z2)[:5])
    # print("dzdw(a1)", dzdw(a1)[:5])


    # Derivative of the loss wrt the first weight
    dldw1 = dldy(y, y_hat)*dydz(z2)*dzda(w2)*dadz(z1)*dzdw(x)
    
    return [np.mean(dldw1), np.mean(dldw2)]
    

In [434]:
### edTest(test_gradient) ###

# Use dldw() defined above to compute the gradient of the loss function 
# with respect to the weights
gradW = dldw(W, x, y)
# gradW = [0.02414, 0.27880]

# Print the list of your gradients below
print(f'The derivatives of w1 and w2 wrt L are {gradW}')


The derivatives of w1 and w2 wrt L are [-0.007777900562982069, 0.27880021231495045]
