# With PyTorch

In [6]:
import numpy as np

# Sigmoid activation function and its derivative
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))``, element-wise.

    Args:
        x: A scalar or NumPy array of pre-activation values.

    Returns:
        The sigmoid of ``x``; for array input the result has the same shape.
    """
    # For very negative x, exp(-x) overflows to inf. 1 / (1 + inf) still
    # evaluates to the correct limit 0.0, so only the spurious overflow
    # warning/error is silenced here; the returned values are unchanged.
    with np.errstate(over="ignore"):
        return 1 / (1 + np.exp(-x))

def sigmoid_derivative(x):
    """Return the sigmoid's slope given the sigmoid *output*.

    Args:
        x: An already-activated value ``sigmoid(z)`` (scalar or array),
            not the pre-activation ``z`` itself.

    Returns:
        ``x * (1 - x)``, which equals the derivative of the sigmoid at ``z``
        when ``x = sigmoid(z)``.
    """
    complement = 1 - x
    return x * complement

# XOR truth table: one row per sample, one column per binary input
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])

# Target column for each row of X (1 when exactly one input is 1)
y = np.array([[0], [1], [1], [0]])

# Fix NumPy's global RNG so the weight initialisation below is repeatable
np.random.seed(42)

# Network architecture: 2 inputs -> 3 hidden neurons -> 1 output neuron
input_size, hidden_size, output_size = 2, 3, 1

# Hidden layer parameters: small random weights (2x3) and zero biases (1x3)
W1 = 0.1 * np.random.randn(input_size, hidden_size)
b1 = np.zeros(shape=(1, hidden_size))

# Output layer parameters: small random weights (3x1) and a zero bias (1x1)
W2 = 0.1 * np.random.randn(hidden_size, output_size)
b2 = np.zeros(shape=(1, output_size))

# Training hyperparameters: learning rate and number of passes over the data
lr, epochs = 0.5, 10000

# Training loop: full-batch gradient descent, every epoch uses all rows of X
for epoch in range(epochs):
    # ----- Forward pass -----
    # Input -> hidden: affine map, then sigmoid
    Z1 = X.dot(W1) + b1
    H = sigmoid(Z1)

    # Hidden -> output: affine map, then sigmoid
    Z2 = H.dot(W2) + b2
    Y_pred = sigmoid(Z2)

    # ----- Loss -----
    # Half squared error per sample; the reported loss is its mean over samples
    error = 0.5 * (y - Y_pred) ** 2
    loss = error.sum() / len(X)

    # ----- Backward pass -----
    # Output layer. sigmoid_derivative takes the activation (Y_pred), not Z2.
    dYpred_dZ2 = sigmoid_derivative(Y_pred)  # dŷ/dZ2
    dE_dYpred = Y_pred - y                   # dE/dŷ
    delta2 = dE_dYpred * dYpred_dZ2          # δ2 = dE/dZ2
    dZ2_dW2 = H                              # dZ2/dW2 (kept for reference)

    # The dot products and sums below add the per-sample gradients together
    # (they are not divided by the sample count, unlike the reported loss).
    dE_db2 = delta2.sum(axis=0, keepdims=True)  # dE/db2
    dE_dW2 = H.T.dot(delta2)                    # dE/dW2 = H^T · δ2

    # Hidden layer: push δ2 back through W2, then through the hidden sigmoid
    dZ2_dH = W2.T
    delta1 = delta2.dot(dZ2_dH) * sigmoid_derivative(H)  # δ1 = dE/dZ1
    dE_db1 = delta1.sum(axis=0, keepdims=True)           # dE/db1
    dE_dW1 = X.T.dot(delta1)                             # dE/dW1 = X^T · δ1

    # ----- Parameter updates -----
    # All gradients were computed above from the pre-update weights, so the
    # order of these in-place steps does not matter.
    W1 -= lr * dE_dW1
    b1 -= lr * dE_db1
    W2 -= lr * dE_dW2
    b2 -= lr * dE_db2

    # Progress report every 1000 epochs (including epoch 0)
    if not epoch % 1000:
        print(f"Epoch {epoch}: Loss = {loss:.6f}")

# ----- Testing the trained MLP -----
print("\nFinal predictions after training:")
# Feed each sample through the trained network on its own, paired with its
# label. A single row dotted with W1 is 1-D; adding the (1x3) bias b1 turns it
# into a (1x3) row, so the final prediction is a (1x1) array read via [0][0].
for sample, target in zip(X, y):
    z1 = np.dot(sample, W1) + b1
    h = sigmoid(z1)
    z2 = np.dot(h, W2) + b2
    y_pred = sigmoid(z2)
    print(f"Input: {sample}, Predicted: {y_pred[0][0]:.4f}, Actual: {target[0]}")


Epoch 0: Loss = 0.125295
Epoch 1000: Loss = 0.125000
Epoch 2000: Loss = 0.124999
Epoch 3000: Loss = 0.124997
Epoch 4000: Loss = 0.124984
Epoch 5000: Loss = 0.118427
Epoch 6000: Loss = 0.003653
Epoch 7000: Loss = 0.001049
Epoch 8000: Loss = 0.000583
Epoch 9000: Loss = 0.000397

Final predictions after training:
Input: [0 0], Predicted: 0.0282, Actual: 0
Input: [0 1], Predicted: 0.9769, Actual: 1
Input: [1 0], Predicted: 0.9769, Actual: 1
Input: [1 1], Predicted: 0.0231, Actual: 0
