In [None]:
import numpy as np

# Fix the global NumPy seed so every run draws the same parameters
np.random.seed(0)

# A single sample: one row of two input features and its scalar target
x = np.array([[2.0, -1.0]])
y = np.array([[1.0]])

# Hidden-layer parameters (2 inputs -> 3 hidden units).
# The draw order W1, b1, W2, b2 determines the values, so it must not change.
W1 = np.random.standard_normal((2, 3))
b1 = np.random.standard_normal((1, 3))

# Output-layer parameters (3 hidden units -> 1 output)
W2 = np.random.standard_normal((3, 1))
b2 = np.random.standard_normal((1, 1))


# ReLU and its derivative
def relu(z):
    """Apply the rectified linear unit element-wise.

    Entries of ``z`` below zero are replaced by zero; all other entries
    pass through unchanged.
    """
    rectified = np.maximum(0, z)
    return rectified

def relu_derivative(z):
    """Return the element-wise derivative of ReLU evaluated at ``z``.

    The result is 1.0 where ``z`` is strictly positive and 0.0 elsewhere
    (so the derivative at exactly zero is taken to be 0.0).

    Parameters
    ----------
    z : array_like
        Pre-activation values; arrays, lists and plain Python scalars
        are all accepted.

    Returns
    -------
    Float array (or NumPy float scalar for scalar input) with the same
    shape as ``z``.
    """
    # np.greater always yields a NumPy bool array/scalar, so .astype is
    # available even when z is a plain Python number or list, where the
    # bare `z > 0` would produce a bool or raise.
    return np.greater(z, 0).astype(float)


# THE FORWARD PASS
# Hidden layer: affine map of the input followed by ReLU
z1 = x @ W1 + b1
h = relu(z1)

# Output layer: affine map of the hidden activations, no activation function
y_hat = h @ W2 + b2


# Loss: half the squared difference between prediction and target
residual = y_hat - y
loss = 0.5 * residual ** 2


# Show the network output and the resulting loss
print("Prediction:", y_hat)
print("Loss:", loss)

Prediction: [[5.79749605]]
Loss: [[11.50798415]]


In [3]:
# THE BACKWARD PASS
# The loss is 0.5 * (y_hat - y) ** 2, so its derivative w.r.t. the
# prediction is simply the residual.
dL_dyhat = y_hat - y     # shape (1, 1)


# Gradients for output layer
dL_dW2 = h.T @ dL_dyhat   # shape (3, 1)
# A bias is shared by every row, so its gradient is reduced over the row
# axis (just as h.T @ ... does for the weights); keepdims keeps the result
# the same shape as b2.
dL_db2 = dL_dyhat.sum(axis=0, keepdims=True)      # shape (1, 1)


# Gradient flowing into hidden layer
dL_dh = dL_dyhat @ W2.T       # shape (1, 3)
# ReLU passes gradient only where its input was positive
dL_dz1 = dL_dh * relu_derivative(z1)     # shape (1, 3)


# Gradients for first layer
dL_dW1 = x.T @ dL_dz1       # shape (2, 3)
# Reduced over the row axis so the result keeps the shape of b1
dL_db1 = dL_dz1.sum(axis=0, keepdims=True)             # shape (1, 3)

In [4]:
# Display every gradient alongside its label
for label, grad in (
    ("Grad W2:", dL_dW2),
    ("Grad b2:", dL_db2),
    ("Grad W1:", dL_dW1),
    ("Grad b1:", dL_db1),
):
    print(label, grad)

Grad W2: [[10.73343747]
 [ 0.        ]
 [13.58427795]]
Grad b2: [[4.79749605]]
Grad W1: [[ 3.93968938  0.         13.9537428 ]
 [-1.96984469  0.         -6.9768714 ]]
Grad b1: [[1.96984469 0.         6.9768714 ]]
