In [4]:
import numpy as np
import pandas as pd

# Single input sample, stored as a 3x1 column vector
X = np.array([2, 1, 3]).reshape(-1, 1)

# Layer 1 parameters: a 4x3 weight matrix (one row per unit) and a 4x1 bias column
W1 = np.array([
    [1, -1, 1],
    [1, 1, 0],
    [0, 1, 1],
    [1, 0, 1],
])
b1 = np.array([[-5], [0], [1], [-2]])  # written directly as a column, so no reshape is needed


In [5]:
# Activation function: Rectified Linear Unit (ReLU)
def relu(x):
    """Element-wise ReLU: values below zero are replaced by zero.

    Parameters
    ----------
    x : array_like or scalar
        Pre-activation values.

    Returns
    -------
    Element-wise maximum of 0 and ``x``, as computed by ``np.maximum``.
    """
    floor = 0
    rectified = np.maximum(floor, x)
    return rectified

In [6]:
# Layer 1 forward pass: affine map of the input, then ReLU
z1 = W1 @ X + b1   # pre-activation
a1 = relu(z1)      # activation
print(" z1:\n", z1)
print(" a1:\n", a1)

 z1:
 [[-1]
 [ 3]
 [ 5]
 [ 3]]
 a1:
 [[0]
 [3]
 [5]
 [3]]


In [7]:
# Layer 2 parameters: 2x4 weight matrix and 2x1 bias column
W2 = np.array([
    [1, -1, 1, 0],
    [0, 1, -1, 1],
])
b2 = np.array([[0], [3]])

# Layer 2 forward pass: affine map of the Layer 1 activation, then ReLU
z2 = W2 @ a1 + b2
a2 = relu(z2)

print(" z2:\n", z2)
print(" a2:\n", a2)

 z2:
 [[2]
 [4]]
 a2:
 [[2]
 [4]]


In [8]:
# Layer 3 parameters: 3x2 weight matrix and 3x1 bias column
W3 = np.array([
    [2, 0],
    [0, 2],
    [1, 1],
])
b3 = np.array([[-1], [-5], [-7]])

# Layer 3 pre-activation only; no activation is applied in this cell
z3 = W3 @ a2 + b3

In [9]:
def softmax(values):
    """Return the softmax of ``values``, normalised over every element.

    The largest entry is subtracted before exponentiating. The common
    factor cancels in the ratio, so the result is mathematically unchanged,
    but ``np.exp`` can no longer overflow to ``inf`` (which would turn the
    ratio into ``nan``) when the inputs are large.

    Parameters
    ----------
    values : array_like
        Input scores. The normalising sum runs over all elements, so a
        multi-dimensional input is treated as a single distribution.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``values``; for non-empty finite
        input its entries sum to 1.
    """
    values = np.asarray(values)

    # Nothing to normalise; also keeps np.max from raising on an empty array.
    if values.size == 0:
        return np.exp(values)

    # Computing element wise exponential value, shifted so the largest
    # exponent is exp(0) == 1.
    exp_values = np.exp(values - np.max(values))

    # Computing sum of these values
    exp_values_sum = np.sum(exp_values)

    # Returning the softmax output.
    return exp_values / exp_values_sum


In [13]:
# Output-layer activation: softmax of z3, rounded to one decimal place.
# NOTE(review): the rounded values are what later cells use as Y_pred,
# so the rounding carries over into the backpropagated gradients.
a3 = np.round(softmax(z3), decimals=1)
print(" a3:\n", a3)

 a3:
 [[0.5]
 [0.5]
 [0. ]]


In [14]:
# Network prediction (the rounded softmax output) and the target, both 3x1 columns
Y_pred = a3
Y_target = np.array([[0], [1], [0]])

In [15]:
# Derivative of the ReLU function
def relu_derivative(x):
    """Element-wise derivative of ReLU: 1 where ``x > 0``, otherwise 0.

    Parameters
    ----------
    x : array_like or scalar
        Pre-activation values. Lists and Python scalars are accepted; they
        are converted with ``np.asarray`` first, because plain Python
        objects have no ``dtype`` attribute.

    Returns
    -------
    The 0/1 mask, cast to the dtype of ``x`` after conversion (an integer
    input gives an integer mask, a float input gives a float mask).
    """
    x = np.asarray(x)
    # Cast the boolean mask back to the dtype of x
    return (x > 0).astype(x.dtype)


In [23]:
# Begin backpropagation

# Gradient of the loss with respect to z3 (Layer 3 pre-activation), taken as
# prediction minus target.
# NOTE(review): this form presumably assumes softmax with cross-entropy loss;
# the loss itself is not defined in this notebook -- confirm.
prediction_error = Y_pred - Y_target
dL_dz3 = prediction_error
print(" ∂L_∂z3:\n", dL_dz3)



 ∂L_∂z3:
 [[ 0.5]
 [-0.5]
 [ 0. ]]


In [25]:
# Gradients of the loss with respect to the Layer 3 parameters.
# Weights: (column of dL/dz3) times (row of a2), which has the same shape as W3.
dz3_column = dL_dz3.reshape(-1, 1)
dL_dW3 = dz3_column @ a2.T
# Bias: dL/dz3 itself, kept here as a row vector
dL_db3 = np.transpose(dL_dz3)

# The weight gradient is printed transposed
print(" ∂L/∂W3:\n", dL_dW3.T)
print(" ∂L/∂b3:\n", dL_db3)


 ∂L/∂W3:
 [[ 1. -1.  0.]
 [ 2. -2.  0.]]
 ∂L/∂b3:
 [[ 0.5 -0.5  0. ]]


In [27]:
# Compute the gradient of the loss with respect to a2 (Layer 2 activation)
# by sending dL/dz3 back through the Layer 3 weights.
dL_da2 = np.dot(W3.T, dL_dz3)
# Bare last expression so the cell actually displays the result
dL_da2


array([[ 1.],
       [-1.]])

In [31]:
# Compute the gradient of the loss with respect to z2 (Layer 2 pre-activation).
# Layer 2 applies ReLU (a2 = relu(z2)), so the gradient arriving at a2 must be
# gated by the ReLU derivative: units with z2 <= 0 pass no gradient back.
# Both z2 entries are positive for this sample, so the values are unchanged,
# but skipping the gate would be wrong for any other input.
dL_dz2 = dL_da2 * relu_derivative(z2)

# Compute the gradients of the loss with respect to Layer 2 weights and biases
dL_dW2 = np.dot(dL_dz2.reshape(-1, 1), a1.T)
dL_db2 = dL_dz2

print(" ∂L/∂b2:\n", dL_db2)
print(" ∂L/∂W2:\n", dL_dW2.T)



 ∂L/∂b2:
 [[ 1.]
 [-1.]]
 ∂L/∂W2:
 [[ 0.  0.]
 [ 3. -3.]
 [ 5. -5.]
 [ 3. -3.]]


In [32]:
# Gradient of the loss with respect to a1 (Layer 1 activation):
# dL/dz2 sent back through the Layer 2 weights
dL_da1 = W2.T @ dL_dz2
dL_da1

array([[ 1.],
       [-2.],
       [ 2.],
       [-1.]])

In [33]:
# Gradient of the loss with respect to a1 (Layer 1 activation)
dL_da1 = W2.T @ dL_dz2

# Gate the backpropagated gradient with the ReLU derivative to get the
# gradient with respect to z1 (Layer 1 pre-activation).
# z1 is flattened and dL_da1 transposed, so the result is a 1x4 row vector.
relu_gate = relu_derivative(z1.ravel())
dL_dz1 = dL_da1.T * relu_gate
print(" ∂L/∂z1:\n", dL_dz1)

# With a single sample, the bias gradient is just dL/dz1
dL_db1 = dL_dz1

# Weight gradient as the product of the input column X and the dL/dz1 row.
# NOTE(review): the result is 3x4, i.e. laid out as the transpose of W1 (4x3).
dz1_row = dL_dz1.reshape(1, -1)
dL_dW1 = X @ dz1_row

print(" ∂L/∂b1:\n", dL_db1)
print(" ∂L/∂W1:\n", dL_dW1)

 ∂L/∂z1:
 [[ 0. -2.  2. -1.]]
 ∂L/∂b1:
 [[ 0. -2.  2. -1.]]
 ∂L/∂W1:
 [[ 0. -4.  4. -2.]
 [ 0. -2.  2. -1.]
 [ 0. -6.  6. -3.]]


In [None]:



















# Summary of the forward values and gradients for each layer, last layer first,
# to verify the backpropagation process. Each entry is (label, value) and is
# printed exactly as a two-argument print call.
layer_reports = [
    ("Layer 3 gradients:", [
        (" z3:\n", z3.T),
        (" ∂L/∂b3:\n", dL_db3),
        (" ∂L/∂W3:\n", dL_dW3.T),
    ]),
    ("Layer 2 gradients:", [
        (" z2:\n", z2.T),
        (" a2:\n", a2.T),
        (" ∂L/∂b2:\n", dL_db2),
        (" ∂L/∂W2:\n", dL_dW2.T),
    ]),
    ("Layer 1 gradients:", [
        (" z1:\n", z1.T),
        (" a1:\n", a1.T),
        (" ∂L/∂b1:\n", dL_db1),
        (" ∂L/∂W1:\n", dL_dW1),
    ]),
]

for heading, entries in layer_reports:
    print(heading)
    for label, value in entries:
        print(label, value)