In [190]:
import numpy as np

In [246]:
# Define the method for the first layer
def calculate_first_layer(X,W,b):
    # Define the weight matrix W (3x2)
    # W = np.array([
    #     [0.4, -0.3],  # w11, w12
    #     [0.2,  0.2],  # w21, w22
    #     [-0.5,  0.1]   # w31, w32
    # ])
    
    # Define the bias vector (1x2)
    # b = np.array([0, -0.4])  # b1, b2
    
    # Calculate Z = XW + b
    Z = np.dot(X, W) + b
    return Z

In [192]:
X1 = np.array([1,0,0.5])
X2 = np.array([0.1, 0.5, 1])
X3 = np.array([0,1,0.7])

In [193]:
W0 = np.array([
    [0.4, -0.3],  # w11, w12
    [0.2,  0.2],  # w21, w22
    [-0.5,  0.1]   # w31, w32
])
z1 = calculate_first_layer(X1,W0)
z2 = calculate_first_layer(X2,W0)
z3 = calculate_first_layer(X3,W0)


In [194]:
print(z1)
print(z2)
print(z3)

[ 0.15 -0.65]
[-0.36 -0.23]
[-0.15 -0.13]


In [195]:
# Define the SiLU activation function
def silu_activation(Z):
    """
    Applies the SiLU activation function to the input Z.
    Z is the output of the first layer (dimension: m x 2).
    """
    # Sigmoid of Z
    sigmoid = 1 / (1 + np.exp(-Z))
    
    # SiLU activation: Z * sigmoid(Z)
    silu = Z * sigmoid
    return silu


In [196]:
silu1 = silu_activation(z1)
silu2 = silu_activation(z2)
silu3 = silu_activation(z3)

In [197]:
print(silu1)
print(silu2)
print(silu3)

[ 0.08061448 -0.2229432 ]
[-0.14794544 -0.10183299]
[-0.06938552 -0.06078094]


In [245]:
# Define the method for the second layer
def calculate_second_layer(Z_activated,W,b):
    """
    Calculates the output of the second layer.
    Input:
      - Z_activated: Activated output from the first layer (m x 2 matrix)
    Output:
      - y: Output of the second layer (m x 1 matrix)
    """
    # Weights and bias for the second layer
    # W = np.array([0.3, -0.1])  # 2x1 vector
    # b = 0.1  # Bias scalar
    
    # Calculate y = Z_activated * W + b
    y = np.dot(Z_activated, W) + b
    return y


In [199]:
U0 = np.array([0.3, -0.1])  # 2x1 vector

f1 = calculate_second_layer(silu1,U0)
f2 = calculate_second_layer(silu2,U0)
f3 = calculate_second_layer(silu3,U0)

In [200]:
print(f1)
print(f2)
print(f3)

0.14647866296669132
0.06579966622256907
0.0852624370574336


In [201]:
# Define the sigmoid activation function
def sigmoid_activation(y):
    """
    Applies the sigmoid activation function to the input y.
    Input:
      - y: Output from the second layer (m x 1 matrix)
    Output:
      - activated_y: Sigmoid-activated output (m x 1 matrix)
    """
    # Apply the sigmoid function
    activated_y = 1 / (1 + np.exp(-y))
    return activated_y


In [202]:
y1 = sigmoid_activation(f1)
y2 = sigmoid_activation(f2)
y3 = sigmoid_activation(f3)

In [203]:
print(y1)
print(y2)
print(y3)

0.536554329964136
0.5164439839996734
0.5213027055010718


In [204]:
# Define the CE loss function with individual losses
def cross_entropy_loss(y_true, y_pred):
    """
    Calculates the binary cross-entropy loss.
    Inputs:
      - y_true: True labels (m x 1 matrix, values 0 or 1)
      - y_pred: Predicted probabilities (m x 1 matrix, from sigmoid activation)
    Outputs:
      - mean_loss: Scalar value representing the average loss over the batch
      - individual_losses: List of losses for each sample in the batch
    """
    # Ensure numerical stability by clipping predictions
    epsilon = 1e-15
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
    
    # Calculate individual losses
    individual_losses = -(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred)).flatten()
    
    # Calculate mean loss
    mean_loss = np.mean(individual_losses)
    
    return mean_loss, individual_losses


In [205]:
y_true = np.array([1, 0, 0])
y_pred = np.array([y1, y2, y3])
loss_iter1,individ_losses = cross_entropy_loss(y_true, y_pred)

In [206]:
print(loss_iter1)
print(individ_losses[0])
print(individ_losses[1])
print(individ_losses[2])

0.6952874681915379
0.6225874546310588
0.7265881155764734
0.7366868343670815


## Wij

In [207]:
# Define the backward pass for calculating gradient of L w.r.t. w_ij
def backward_pass_wij(i, j, x, y_true, U, z, y):
    """
    Computes the gradient of the loss with respect to w_ij in the first layer.
    
    Parameters:
        i (int): Index of the first layer neuron.
        j (int): Index of the input feature.
        x (array): Input vector (1D array).
        y_true (float): Ground truth label.
        U (1D array): Weight vector of the second layer (shape: 2x1).
        z (1D array): Pre-activation output of the first layer (shape: 2).
        y (float): Output of the second layer (sigmoid activation).

    Returns:
        float: Gradient of the loss with respect to w_ij.
    """

    # 1. Compute ∂L/∂y
    dL_dy = -y_true / y + (1 - y_true) / (1 - y)
    print(f"dL/dy: {dL_dy}")

    # 2. Compute ∂y/∂f
    dy_df = y * (1 - y)
    print(f"dy/df: {dy_df}")

    # 3. Compute ∂f/∂h_i
    df_dhi = U[i]
    print(f"df/dh_{i}: {df_dhi}")

    # 4. Compute ∂h_i/∂z_i
    sigmoid_zi = 1 / (1 + np.exp(-z[i]))  # Sigmoid of z_i
    dh_dzi = sigmoid_zi + z[i] * sigmoid_zi * (1 - sigmoid_zi)
    print(f"dh_{i}/dz_{i}: {dh_dzi}")

    # 5. Compute ∂z_i/∂w_ij
    dz_dwij = x[j]
    print(f"dz_{i}/dw_{i}{j}: {dz_dwij}")

    # 6. Chain rule: ∂L/∂w_ij
    dL_dwij = dL_dy * dy_df * df_dhi * dh_dzi * dz_dwij
    print(f"Gradient of L w.r.t. w_{i}{j}: {dL_dwij}")

    return dL_dwij

def calculate_jacobian_dl_dW(x, y_true, U, z, y):
    """
    Computes the Jacobian matrix of gradients (∂L/∂W) for all weights in W.

    Parameters:
        x (array): Input vector (1D array of size 3).
        y_true (float): Ground truth label (scalar).
        U (1D array): Weight vector of the second layer (shape: 2).
        z (1D array): Pre-activation output of the first layer (shape: 2).
        y (float): Output of the second layer (scalar).

    Returns:
        jacobian (2D array): Jacobian matrix (2x3) containing ∂L/∂w_ij.
    """
    # Initialize Jacobian matrix with the same shape as W (2x3)
    jacobian = np.zeros((2, 3))

    # Iterate over all i (neurons) and j (input features)
    for i in range(2):  # 2 neurons in the first layer
        for j in range(3):  # 3 input features
            # Compute the gradient for w_ij using backward_pass_wij
            print(f"$$$w_{i+1}{j+1} iteration")
            gradient = backward_pass_wij(i, j, x, y_true, U, z, y)
            jacobian[i, j] = gradient  # Store the gradient in the Jacobian matrix


    # Print the Jacobian matrix
    print("Jacobian Matrix (∂L/∂W):")
    print(jacobian)

    return jacobian


### x(1)

In [208]:
gradw1 = calculate_jacobian_dl_dW(x=X1, y_true=y_true[0], U=np.array([0.3, -0.1]), z=z1, y=y_pred[0])


$$$w_11 iteration
dL/dy: -1.8637441618760981
dy/df: 0.24866378096087308
df/dh_0: 0.3
dh_0/dz_0: 0.574719696345381
dz_0/dw_00: 1.0
Gradient of L w.r.t. w_00: -0.07990540642667802
$$$w_12 iteration
dL/dy: -1.8637441618760981
dy/df: 0.24866378096087308
df/dh_0: 0.3
dh_0/dz_0: 0.574719696345381
dz_0/dw_01: 0.0
Gradient of L w.r.t. w_01: -0.0
$$$w_13 iteration
dL/dy: -1.8637441618760981
dy/df: 0.24866378096087308
df/dh_0: 0.3
dh_0/dz_0: 0.574719696345381
dz_0/dw_02: 0.5
Gradient of L w.r.t. w_02: -0.03995270321333901
$$$w_21 iteration
dL/dy: -1.8637441618760981
dy/df: 0.24866378096087308
df/dh_1: -0.1
dh_1/dz_1: 0.19651352282931617
dz_1/dw_10: 1.0
Gradient of L w.r.t. w_10: 0.00910733412587405
$$$w_22 iteration
dL/dy: -1.8637441618760981
dy/df: 0.24866378096087308
df/dh_1: -0.1
dh_1/dz_1: 0.19651352282931617
dz_1/dw_11: 0.0
Gradient of L w.r.t. w_11: 0.0
$$$w_23 iteration
dL/dy: -1.8637441618760981
dy/df: 0.24866378096087308
df/dh_1: -0.1
dh_1/dz_1: 0.19651352282931617
dz_1/dw_12: 0.5
Gradi

### x(2)

In [209]:
gradw2 = calculate_jacobian_dl_dW(x=X2, y_true=y_true[1], U=np.array([0.3, -0.1]), z=z2, y=y_pred[1])

$$$w_11 iteration
dL/dy: 2.0680127367070633
dy/df: 0.2497295953902185
df/dh_0: 0.3
dh_0/dz_0: 0.3238137175443828
dz_0/dw_00: 0.1
Gradient of L w.r.t. w_00: 0.00501694939087098
$$$w_12 iteration
dL/dy: 2.0680127367070633
dy/df: 0.2497295953902185
df/dh_0: 0.3
dh_0/dz_0: 0.3238137175443828
dz_0/dw_01: 0.5
Gradient of L w.r.t. w_01: 0.0250847469543549
$$$w_13 iteration
dL/dy: 2.0680127367070633
dy/df: 0.2497295953902185
df/dh_0: 0.3
dh_0/dz_0: 0.3238137175443828
dz_0/dw_02: 1.0
Gradient of L w.r.t. w_02: 0.0501694939087098
$$$w_21 iteration
dL/dy: 2.0680127367070633
dy/df: 0.2497295953902185
df/dh_1: -0.1
dh_1/dz_1: 0.38600592827835595
dz_1/dw_10: 0.1
Gradient of L w.r.t. w_10: -0.0019935043944756636
$$$w_22 iteration
dL/dy: 2.0680127367070633
dy/df: 0.2497295953902185
df/dh_1: -0.1
dh_1/dz_1: 0.38600592827835595
dz_1/dw_11: 0.5
Gradient of L w.r.t. w_11: -0.009967521972378319
$$$w_23 iteration
dL/dy: 2.0680127367070633
dy/df: 0.2497295953902185
df/dh_1: -0.1
dh_1/dz_1: 0.3860059282783559

### x(3)

In [210]:
gradw3 = calculate_jacobian_dl_dW(x=X3, y_true=y_true[2], U=np.array([0.3, -0.1]), z=z3, y=y_pred[2])

$$$w_11 iteration
dL/dy: 2.0890028238967604
dy/df: 0.24954619473833461
df/dh_0: 0.3
dh_0/dz_0: 0.425280303654619
dz_0/dw_00: 0.0
Gradient of L w.r.t. w_00: 0.0
$$$w_12 iteration
dL/dy: 2.0890028238967604
dy/df: 0.24954619473833461
df/dh_0: 0.3
dh_0/dz_0: 0.425280303654619
dz_0/dw_01: 1.0
Gradient of L w.r.t. w_01: 0.06650993186744109
$$$w_13 iteration
dL/dy: 2.0890028238967604
dy/df: 0.24954619473833461
df/dh_0: 0.3
dh_0/dz_0: 0.425280303654619
dz_0/dw_02: 0.7
Gradient of L w.r.t. w_02: 0.04655695230720876
$$$w_21 iteration
dL/dy: 2.0890028238967604
dy/df: 0.24954619473833461
df/dh_1: -0.1
dh_1/dz_1: 0.43518262027308185
dz_1/dw_10: 0.0
Gradient of L w.r.t. w_10: -0.0
$$$w_22 iteration
dL/dy: 2.0890028238967604
dy/df: 0.24954619473833461
df/dh_1: -0.1
dh_1/dz_1: 0.43518262027308185
dz_1/dw_11: 1.0
Gradient of L w.r.t. w_11: -0.022686187733540322
$$$w_23 iteration
dL/dy: 2.0890028238967604
dy/df: 0.24954619473833461
df/dh_1: -0.1
dh_1/dz_1: 0.43518262027308185
dz_1/dw_12: 0.7
Gradient of

### average of gradients

In [211]:
print(gradw2)

[[ 0.00501695  0.02508475  0.05016949]
 [-0.0019935  -0.00996752 -0.01993504]]


In [212]:
stacked = np.stack((gradw1, gradw2, gradw3), axis=0)

# Calculate the mean
grads_w_mean = np.mean(stacked, axis=0)

print("Mean Matrix:")
print(grads_w_mean)

Mean Matrix:
[[-0.02496282  0.03053156  0.01892458]
 [ 0.00237128 -0.01088457 -0.01042057]]


## b(1)

In [213]:
def backward_pass_bi(i, y_true, U, z, y):
    """
    Computes the gradient of the loss with respect to b_i in the first layer.
    
    Parameters:
        i (int): Index of the first layer neuron.
        y_true (float): Ground truth label.
        U (1D array): Weight vector of the second layer (shape: 2).
        z (1D array): Pre-activation output of the first layer (shape: 2).
        y (float): Output of the second layer (sigmoid activation).

    Returns:
        float: Gradient of the loss with respect to b_i.
    """

    # 1. Compute ∂L/∂y
    dL_dy = -y_true / y + (1 - y_true) / (1 - y)
    print(f"dL/dy: {dL_dy}")

    # 2. Compute ∂y/∂f
    dy_df = y * (1 - y)
    print(f"dy/df: {dy_df}")

    # 3. Compute ∂f/∂h_i
    df_dhi = U[i]
    print(f"df/dh_{i}: {df_dhi}")

    # 4. Compute ∂h_i/∂z_i
    sigmoid_zi = 1 / (1 + np.exp(-z[i]))  # Sigmoid of z_i
    dh_dzi = sigmoid_zi + z[i] * sigmoid_zi * (1 - sigmoid_zi)
    print(f"dh_{i}/dz_{i}: {dh_dzi}")

    # 5. Chain rule: ∂L/∂b_i
    dL_dbi = dL_dy * dy_df * df_dhi * dh_dzi
    print(f"Gradient of L w.r.t. b_{i}: {dL_dbi}")

    return dL_dbi


def calculate_bias_gradients_b_first(y_true, U, z, y):
    """
    Computes the gradients of the loss with respect to the biases (b1, b2).
    
    Parameters:
        y_true (float): Ground truth label.
        U (1D array): Weight vector of the second layer (shape: 2).
        z (1D array): Pre-activation output of the first layer (shape: 2).
        y (float): Output of the second layer (sigmoid activation).

    Returns:
        gradients (1D array): Gradients of the loss with respect to biases [b1, b2].
    """
    # Initialize gradient vector for biases (2 biases in the first layer)
    gradients = np.zeros(2)

    # Compute gradient for each bias b_i
    for i in range(2):  # 2 neurons in the first layer
        print(f"$$$b{i+1} iteration")
        gradients[i] = backward_pass_bi(i, y_true, U, z, y)

    # Print the gradients
    print("Gradients of the Loss w.r.t. Biases [b1, b2]:")
    print(gradients)

    return gradients

### x(1)

In [214]:
gradb1_1 = calculate_bias_gradients_b_first(y_true=y_true[0], U=np.array([0.3, -0.1]), z=z1, y=y_pred[0])


$$$b1 iteration
dL/dy: -1.8637441618760981
dy/df: 0.24866378096087308
df/dh_0: 0.3
dh_0/dz_0: 0.574719696345381
Gradient of L w.r.t. b_0: -0.07990540642667802
$$$b2 iteration
dL/dy: -1.8637441618760981
dy/df: 0.24866378096087308
df/dh_1: -0.1
dh_1/dz_1: 0.19651352282931617
Gradient of L w.r.t. b_1: 0.00910733412587405
Gradients of the Loss w.r.t. Biases [b1, b2]:
[-0.07990541  0.00910733]


### x(2)

In [215]:
gradb1_2 = calculate_bias_gradients_b_first(y_true=y_true[1], U=np.array([0.3, -0.1]), z=z2, y=y_pred[1])

$$$b1 iteration
dL/dy: 2.0680127367070633
dy/df: 0.2497295953902185
df/dh_0: 0.3
dh_0/dz_0: 0.3238137175443828
Gradient of L w.r.t. b_0: 0.0501694939087098
$$$b2 iteration
dL/dy: 2.0680127367070633
dy/df: 0.2497295953902185
df/dh_1: -0.1
dh_1/dz_1: 0.38600592827835595
Gradient of L w.r.t. b_1: -0.019935043944756637
Gradients of the Loss w.r.t. Biases [b1, b2]:
[ 0.05016949 -0.01993504]


### x(3)

In [216]:
gradb1_3 = calculate_bias_gradients_b_first(y_true=y_true[2], U=np.array([0.3, -0.1]), z=z3, y=y_pred[2])


$$$b1 iteration
dL/dy: 2.0890028238967604
dy/df: 0.24954619473833461
df/dh_0: 0.3
dh_0/dz_0: 0.425280303654619
Gradient of L w.r.t. b_0: 0.06650993186744109
$$$b2 iteration
dL/dy: 2.0890028238967604
dy/df: 0.24954619473833461
df/dh_1: -0.1
dh_1/dz_1: 0.43518262027308185
Gradient of L w.r.t. b_1: -0.022686187733540322
Gradients of the Loss w.r.t. Biases [b1, b2]:
[ 0.06650993 -0.02268619]


### average of gradients

In [217]:
stacked = np.stack((gradb1_1, gradb1_2, gradb1_3), axis=0)

# Calculate the mean
grads_b1_mean = np.mean(stacked, axis=0)

print("Mean Matrix:")
print(grads_b1_mean)

Mean Matrix:
[ 0.01225801 -0.0111713 ]


## Ui

In [218]:
def backward_pass_U(y_true, y, h):
    """
    Computes the gradient of the loss with respect to U (second-layer weights).
    
    Parameters:
        y_true (float): Ground truth label.
        y (float): Output of the second layer (sigmoid activation).
        h (array): Activated output of the first layer (1D array, size: 2).
    
    Returns:
        gradient_U (array): Gradient of the loss with respect to U (size: 2x1).
    """
    # 1. Compute ∂L/∂y
    dL_dy = -y_true / y + (1 - y_true) / (1 - y)
    print(f"dL/dy: {dL_dy}")

    # 2. Compute ∂y/∂f
    dy_df = y * (1 - y)
    print(f"dy/df: {dy_df}")

    # 3. Chain rule: ∂L/∂U
    dL_dU = dL_dy * dy_df * h
    print(f"Gradient of L w.r.t. U: {dL_dU}")

    # Reshape to 2x1 for consistency
    return dL_dU.reshape(-1, 1)


def calculate_weights_gradients_U(y_true, y, h):
    """
    Computes the gradients of the loss with respect to the weights (u1, u2) of the second layer.
    
    Parameters:
        y_true (float): Ground truth label.
        y (float): Output of the second layer (sigmoid activation).
        h (array): Activated output of the first layer (1D array, size: 2).
    
    Returns:
        gradients_U (array): Gradient of the loss with respect to U (size: 2x1).
    """
    # Compute gradient for second-layer weights
    print("$$$ Computing Gradients for U (Second Layer Weights)")
    gradients_U = backward_pass_U(y_true, y, h)

    # Print the gradients
    print("Gradients of the Loss w.r.t. U (Second Layer Weights):")
    print(gradients_U)

    return gradients_U

### x(1)

In [219]:
gradsU_1 = calculate_weights_gradients_U(y_true[0], y_pred[0], h=silu1)

$$$ Computing Gradients for U (Second Layer Weights)
dL/dy: -1.8637441618760981
dy/df: 0.24866378096087308
Gradient of L w.r.t. U: [-0.03736043  0.10332206]
Gradients of the Loss w.r.t. U (Second Layer Weights):
[[-0.03736043]
 [ 0.10332206]]


### x(2)

In [220]:
gradsU_2 = calculate_weights_gradients_U(y_true[1], y_pred[1], h=silu2)

$$$ Computing Gradients for U (Second Layer Weights)
dL/dy: 2.0680127367070633
dy/df: 0.2497295953902185
Gradient of L w.r.t. U: [-0.07640553 -0.05259104]
Gradients of the Loss w.r.t. U (Second Layer Weights):
[[-0.07640553]
 [-0.05259104]]


### x(3)

In [221]:
gradsU_3 = calculate_weights_gradients_U(y_true[2], y_pred[2], h=silu3)

$$$ Computing Gradients for U (Second Layer Weights)
dL/dy: 2.0890028238967604
dy/df: 0.24954619473833461
Gradient of L w.r.t. U: [-0.03617086 -0.03168527]
Gradients of the Loss w.r.t. U (Second Layer Weights):
[[-0.03617086]
 [-0.03168527]]


### average

In [222]:
stacked = np.stack((gradsU_1, gradsU_2, gradsU_3), axis=0)

# Calculate the mean
grads_u_mean = np.mean(stacked, axis=0)

print("Mean Matrix:")
print(grads_u_mean)

Mean Matrix:
[[-0.04997894]
 [ 0.00634858]]


## b(2)

In [223]:
def backward_pass_b3(y_true, y):
    """
    Computes the gradient of the loss with respect to b3 (second-layer bias).
    
    Parameters:
        y_true (float): Ground truth label.
        y (float): Output of the second layer (sigmoid activation).
    
    Returns:
        gradient_b3 (float): Gradient of the loss with respect to b3.
    """
    # 1. Compute ∂L/∂y
    dL_dy = -y_true / y + (1 - y_true) / (1 - y)
    print(f"dL/dy: {dL_dy}")

    # 2. Compute ∂y/∂f
    dy_df = y * (1 - y)
    print(f"dy/df: {dy_df}")

    # 3. Chain rule: ∂L/∂b3
    dL_db3 = dL_dy * dy_df
    print(f"Gradient of L w.r.t. b3: {dL_db3}")

    return dL_db3


def calculate_bias_gradient_b_second(y_true, y):
    """
    Computes the gradient of the loss with respect to the bias b3 of the second layer.
    
    Parameters:
        y_true (float): Ground truth label.
        y (float): Output of the second layer (sigmoid activation).
    
    Returns:
        gradient_b3 (float): Gradient of the loss with respect to b3.
    """
    print("$$$ Computing Gradient for b3 (Second Layer Bias)")
    gradient_b3 = backward_pass_b3(y_true, y)

    # Print the gradient
    print("Gradient of the Loss w.r.t. b3 (Second Layer Bias):")
    print(gradient_b3)

    return gradient_b3


### X1

In [224]:
gradb3_1 = calculate_bias_gradient_b_second(y_true[0], y_pred[0])

$$$ Computing Gradient for b3 (Second Layer Bias)
dL/dy: -1.8637441618760981
dy/df: 0.24866378096087308
Gradient of L w.r.t. b3: -0.46344567003586407
Gradient of the Loss w.r.t. b3 (Second Layer Bias):
-0.46344567003586407


### X2

In [225]:
gradb3_2 = calculate_bias_gradient_b_second(y_true[1], y_pred[1])

$$$ Computing Gradient for b3 (Second Layer Bias)
dL/dy: 2.0680127367070633
dy/df: 0.2497295953902185
Gradient of L w.r.t. b3: 0.5164439839996734
Gradient of the Loss w.r.t. b3 (Second Layer Bias):
0.5164439839996734


### X3

In [226]:
gradb3_3 = calculate_bias_gradient_b_second(y_true[2], y_pred[2])

$$$ Computing Gradient for b3 (Second Layer Bias)
dL/dy: 2.0890028238967604
dy/df: 0.24954619473833461
Gradient of L w.r.t. b3: 0.5213027055010719
Gradient of the Loss w.r.t. b3 (Second Layer Bias):
0.5213027055010719


### average

In [227]:
stacked = np.stack((gradb3_1, gradb3_2, gradb3_3), axis=0)

# Calculate the mean
grads_b2_mean = np.mean(stacked, axis=0)

print("Mean Matrix:")
print(grads_b2_mean)

Mean Matrix:
0.1914336731549604


## weight updates

In [228]:
W0 =  np.array([
        [0.4, -0.3],  # w11, w12
        [0.2,  0.2],  # w21, w22
        [-0.5,  0.1]   # w31, w32
    ])
U0 = np.array([0.3, -0.1])
b1_0 = np.array([0, -0.4])
b2_0 = 0.1

In [229]:
def update(avg_grad_W, vt_W, W_old,name , gamma=0.9, eta=0.8):
    """
    Updates the first-layer weights (W) using Gradient Descent with Momentum.
    
    Parameters:
        avg_grad_W (2D array): Average gradients for W (shape: 3x2).
        vt_W (2D array): Velocity for W (shape: 3x2).
        W_old (2D array): Current weights W (shape: 3x2).
        gamma (float): Momentum factor.
        eta (float): Learning rate.
    
    Returns:
        W_new (2D array): Updated weights W (shape: 3x2).
        vt_W_new (2D array): Updated velocity for W (shape: 3x2).
    """
    # Compute new velocity
    vt_W_new = gamma * vt_W - eta * avg_grad_W
    print(f"New Velocity for {name}:\n{vt_W_new}")
    
    # Update weights
    W_new = W_old + vt_W_new
    print(f"Updated {name}:\n{W_new}")
    
    return W_new, vt_W_new


In [None]:
W_1, vt_W_1 = update(grads_w_mean, np.zeros([2,3]), W0.T, name="W")

New Velocity for W:
[[ 0.01997026 -0.02442525 -0.01513966]
 [-0.00189702  0.00870766  0.00833646]]
Updated W:
[[ 0.41997026  0.17557475 -0.51513966]
 [-0.30189702  0.20870766  0.10833646]]


In [231]:
b_1_1, vt_b11_1 = update(grads_b1_mean, np.zeros([1,2]), b1_0.T,name = 'b(1)')

New Velocity for b(1):
[[-0.00980641  0.00893704]]
Updated b(1):
[[-0.00980641 -0.39106296]]


In [232]:
U_1, vt_u1_1 = update(grads_u_mean.T, np.zeros([1,2]), U0.T,name = 'U')

New Velocity for U:
[[ 0.03998315 -0.00507887]]
Updated U:
[[ 0.33998315 -0.10507887]]


In [233]:
b_2_1, vt_b21_1 = update(grads_b2_mean.T, np.zeros([1,1]), b2_0,name = 'b(2)')

New Velocity for b(2):
[[-0.15314694]]
Updated b(2):
[[-0.05314694]]


In [234]:
b2_0

0.1

## second iteration

### forward pass

In [235]:
print(b_2_1)


[[-0.05314694]]


In [243]:
W_1

array([[ 0.41997026,  0.17557475, -0.51513966],
       [-0.30189702,  0.20870766,  0.10833646]])

In [247]:
z1 = calculate_first_layer(X1,W_1.T,b_1_1)
z2 = calculate_first_layer(X2,W_1.T,b_1_1)
z3 = calculate_first_layer(X3,W_1.T,b_1_1)
print(z1)
print(z2)
print(z3)
print()
silu1 = silu_activation(z1)
silu2 = silu_activation(z2)
silu3 = silu_activation(z3)
print(silu1)
print(silu2)
print(silu3)
print()
f1 = calculate_second_layer(silu1,U_1.T,b_2_1)
f2 = calculate_second_layer(silu2,U_1.T,b_2_1)
f3 = calculate_second_layer(silu3,U_1.T,b_2_1)
print(f1)
print(f2)
print(f3)
print()
y1 = sigmoid_activation(f1)
y2 = sigmoid_activation(f2)
y3 = sigmoid_activation(f3)
print(y1)
print(y2)
print(y3)

[[ 0.15259402 -0.63879175]]
[[-0.39516167 -0.20856238]]
[[-0.19482942 -0.10651979]]

[[ 0.08210697 -0.22071515]]
[[-0.15904283 -0.09344587]]
[[-0.08795499 -0.05042596]]

[[-0.00203945]]
[[-0.09739964]]
[[-0.07775145]]

[[0.49949014]]
[[0.47566932]]
[[0.48057192]]


In [249]:
y_true = np.array([1, 0, 0])
y_pred = np.array([y1, y2, y3])

In [250]:
def loss (y_true, y_pred):
    individ_losses = -(y_true*(np.log(y_pred)) + (1-y_true)*(np.log(1-y_pred)))
    return np.mean(individ_losses), individ_losses
loss_iter2,individ_losses = loss(y_true, y_pred.T)
print(loss_iter2)
print(individ_losses)

0.6649423612035917
[[[0.69416743 0.64563273 0.65502693]]]


In [240]:
1-y_true

array([0, 1, 1])

## backward

In [259]:
U_1.flatten()
z1

array([[ 0.15259402, -0.63879175]])

In [261]:
gradw1_2 = calculate_jacobian_dl_dW(x=X1, y_true=y_true[0], U=U_1.flatten(), z=z1.flatten(), y=y_pred[0])
print()
gradw2_2 = calculate_jacobian_dl_dW(x=X2, y_true=y_true[1], U=U_1.flatten(), z=z2.flatten(), y=y_pred[1])
print()
gradw3_2 = calculate_jacobian_dl_dW(x=X3, y_true=y_true[2], U=U_1.flatten(), z=z3.flatten(), y=y_pred[2])
print()
stacked = np.stack((gradw1_2, gradw2_2, gradw3_2), axis=0)
grads_w_mean_2 = np.mean(stacked, axis=0)

print("Mean Matrix:")
print(grads_w_mean_2)

$$$w_11 iteration
dL/dy: [[-2.00204153]]
dy/df: [[0.24999974]]
df/dh_0: 0.3399831534829202
dh_0/dz_0: 0.5760019446299496
dz_0/dw_00: 1.0
Gradient of L w.r.t. w_00: [[-0.09801533]]
$$$w_12 iteration
dL/dy: [[-2.00204153]]
dy/df: [[0.24999974]]
df/dh_0: 0.3399831534829202
dh_0/dz_0: 0.5760019446299496
dz_0/dw_01: 0.0
Gradient of L w.r.t. w_01: [[-0.]]
$$$w_13 iteration
dL/dy: [[-2.00204153]]
dy/df: [[0.24999974]]
df/dh_0: 0.3399831534829202
dh_0/dz_0: 0.5760019446299496
dz_0/dw_02: 0.5
Gradient of L w.r.t. w_02: [[-0.04900766]]
$$$w_21 iteration
dL/dy: [[-2.00204153]]
dy/df: [[0.24999974]]
df/dh_1: -0.10507886799272378
dh_1/dz_1: 0.20106600496020557
dz_1/dw_10: 1.0
Gradient of L w.r.t. w_10: [[0.01057467]]
$$$w_22 iteration
dL/dy: [[-2.00204153]]
dy/df: [[0.24999974]]
df/dh_1: -0.10507886799272378
dh_1/dz_1: 0.20106600496020557
dz_1/dw_11: 0.0
Gradient of L w.r.t. w_11: [[0.]]
$$$w_23 iteration
dL/dy: [[-2.00204153]]
dy/df: [[0.24999974]]
df/dh_1: -0.10507886799272378
dh_1/dz_1: 0.201066

  jacobian[i, j] = gradient  # Store the gradient in the Jacobian matrix


In [263]:
gradb1_1_2 = calculate_bias_gradients_b_first(y_true=y_true[0],U=U_1.flatten(), z=z1.flatten(), y=y_pred[0])
gradb1_2_2 = calculate_bias_gradients_b_first(y_true=y_true[1],U=U_1.flatten(), z=z2.flatten(), y=y_pred[1])
gradb1_3_2 = calculate_bias_gradients_b_first(y_true=y_true[2], U=U_1.flatten(), z=z3.flatten(), y=y_pred[2])
print()
stacked = np.stack((gradb1_1_2, gradb1_2_2, gradb1_3_2), axis=0)
grads_b1_mean_2 = np.mean(stacked, axis=0)

print("Mean Matrix:")
print(grads_b1_mean_2)

$$$b1 iteration
dL/dy: [[-2.00204153]]
dy/df: [[0.24999974]]
df/dh_0: 0.3399831534829202
dh_0/dz_0: 0.5760019446299496
Gradient of L w.r.t. b_0: [[-0.09801533]]
$$$b2 iteration
dL/dy: [[-2.00204153]]
dy/df: [[0.24999974]]
df/dh_1: -0.10507886799272378
dh_1/dz_1: 0.20106600496020557
Gradient of L w.r.t. b_1: [[0.01057467]]
Gradients of the Loss w.r.t. Biases [b1, b2]:
[-0.09801533  0.01057467]
$$$b1 iteration
dL/dy: [[1.90719339]]
dy/df: [[0.24940802]]
df/dh_0: 0.3399831534829202
dh_0/dz_0: 0.307443342029572
Gradient of L w.r.t. b_0: [[0.0497196]]
$$$b2 iteration
dL/dy: [[1.90719339]]
dy/df: [[0.24940802]]
df/dh_1: -0.10507886799272378
dh_1/dz_1: 0.3964699148841661
Gradient of L w.r.t. b_1: [[-0.01981667]]
Gradients of the Loss w.r.t. Biases [b1, b2]:
[ 0.0497196  -0.01981667]
$$$b1 iteration
dL/dy: [[1.92519435]]
dy/df: [[0.24962255]]
df/dh_0: 0.3399831534829202
dh_0/dz_0: 0.4031980858623431
Gradient of L w.r.t. b_0: [[0.06587707]]
$$$b2 iteration
dL/dy: [[1.92519435]]
dy/df: [[0.24962

  gradients[i] = backward_pass_bi(i, y_true, U, z, y)


In [265]:
gradsU_1_2 = calculate_weights_gradients_U(y_true[0], y_pred[0], h=silu1)
print()
gradsU_2_2 = calculate_weights_gradients_U(y_true[1], y_pred[1], h=silu2)
print()
gradsU_3_2 = calculate_weights_gradients_U(y_true[2], y_pred[2], h=silu3)
print()
stacked = np.stack((gradsU_1_2, gradsU_2_2, gradsU_3_2), axis=0)

# Calculate the mean
grads_u_mean_2 = np.mean(stacked, axis=0)

print("Mean Matrix:")
print(grads_u_mean_2)

$$$ Computing Gradients for U (Second Layer Weights)
dL/dy: [[-2.00204153]]
dy/df: [[0.24999974]]
Gradient of L w.r.t. U: [[-0.04109535  0.11047011]]
Gradients of the Loss w.r.t. U (Second Layer Weights):
[[-0.04109535]
 [ 0.11047011]]

$$$ Computing Gradients for U (Second Layer Weights)
dL/dy: [[1.90719339]]
dy/df: [[0.24940802]]
Gradient of L w.r.t. U: [[-0.0756518  -0.04444933]]
Gradients of the Loss w.r.t. U (Second Layer Weights):
[[-0.0756518 ]
 [-0.04444933]]

$$$ Computing Gradients for U (Second Layer Weights)
dL/dy: [[1.92519435]]
dy/df: [[0.24962255]]
Gradient of L w.r.t. U: [[-0.0422687 -0.0242333]]
Gradients of the Loss w.r.t. U (Second Layer Weights):
[[-0.0422687]
 [-0.0242333]]

Mean Matrix:
[[-0.05300528]
 [ 0.01392916]]


In [266]:
gradb3_1_2 = calculate_bias_gradient_b_second(y_true[0], y_pred[0])
print()
gradb3_2_2 = calculate_bias_gradient_b_second(y_true[1], y_pred[1])
print()
gradb3_3_2 = calculate_bias_gradient_b_second(y_true[2], y_pred[2])
print()

stacked = np.stack((gradb3_1_2, gradb3_2_2, gradb3_3_2), axis=0)

# Calculate the mean
grads_b2_mean_2 = np.mean(stacked, axis=0)

print("Mean Matrix:")
print(grads_b2_mean_2)

$$$ Computing Gradient for b3 (Second Layer Bias)
dL/dy: [[-2.00204153]]
dy/df: [[0.24999974]]
Gradient of L w.r.t. b3: [[-0.50050986]]
Gradient of the Loss w.r.t. b3 (Second Layer Bias):
[[-0.50050986]]

$$$ Computing Gradient for b3 (Second Layer Bias)
dL/dy: [[1.90719339]]
dy/df: [[0.24940802]]
Gradient of L w.r.t. b3: [[0.47566932]]
Gradient of the Loss w.r.t. b3 (Second Layer Bias):
[[0.47566932]]

$$$ Computing Gradient for b3 (Second Layer Bias)
dL/dy: [[1.92519435]]
dy/df: [[0.24962255]]
Gradient of L w.r.t. b3: [[0.48057192]]
Gradient of the Loss w.r.t. b3 (Second Layer Bias):
[[0.48057192]]

Mean Matrix:
[[0.15191046]]


## weight updates

In [273]:
grads_w_mean_2

array([[-0.03101446,  0.03024562,  0.01560863],
       [ 0.00286433, -0.01082429, -0.01010817]])

In [274]:
W_2, vt_W_2 = update(grads_w_mean_2, vt_W_1, W_1, name="W")

New Velocity for W:
[[ 0.04278479 -0.04617922 -0.0261126 ]
 [-0.00399879  0.01649632  0.01558935]]
Updated W:
[[ 0.46275505  0.12939553 -0.54125227]
 [-0.30589581  0.22520398  0.1239258 ]]


In [277]:
b_1_1

array([[-0.00980641, -0.39106296]])

In [275]:
b_1_2, vt_b11_2 = update(grads_b1_mean_2,vt_b11_1 , b_1_1,name = 'b(1)')

New Velocity for b(1):
[[-0.01351412  0.01652508]]
Updated b(1):
[[-0.02332053 -0.37453788]]


In [286]:
grads_u_mean_2

array([[-0.05300528],
       [ 0.01392916]])

In [284]:
U_2, vt_u1_2 = update(grads_u_mean_2.T, vt_u1_1, U_1,name = 'U')

New Velocity for U:
[[ 0.07838906 -0.01571431]]
Updated U:
[[ 0.41837222 -0.12079318]]


In [280]:
U_1

array([[ 0.33998315, -0.10507887]])

In [None]:
b_2_2, vt_b21_2 = update(grads_b2_mean_2.T, vt_b21_1, b,name = 'b(2)')

New Velocity for b(2):
[[-0.25936061]]
Updated b(2):
[[-0.31250755]]


In [293]:
b_2_1

array([[-0.05314694]])