In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

However, a single perceptron cannot handle data that is not linearly separable, and XOR is the classic example. Let's run our perceptron on XOR and watch it fail to converge.

In [None]:
# XOR truth table: each row of X is one (a, b) input pair,
# and the matching row of y_true holds a XOR b as a column vector.
X = torch.tensor([[0., 0.], [0., 1.], [1., 0.], [1., 1.]])
y_true = torch.tensor([0., 1., 1., 0.]).unsqueeze(1)

In [None]:

# Perceptron parameters start at zero; edit these to try other initial values
w = torch.zeros(2, dtype=torch.float32)
b = torch.zeros(1, dtype=torch.float32)

# Step size applied to every corrective update
lr = 0.1

# Upper bound on passes over the data; raise it as much as you want
for epoch in range(2000):
    errors = 0
    for xi, yi in zip(X, y_true):
        z = torch.dot(w, xi) + b
        # Step activation: fire when the pre-activation is non-negative
        prediction = 1.0 if z >= 0 else 0.0

        # Correctly classified samples leave the parameters untouched
        if prediction == yi:
            continue

        # Perceptron rule: move w and b in the direction of the signed error
        step = lr * (yi - prediction)
        w += step * xi
        b += step
        errors += 1

    print(f"Epoch {epoch+1}, Misclassified: {errors}")
    # A full pass without mistakes means the data has been separated
    if errors == 0:
        break

Epoch 1, Misclassified: 3
Epoch 2, Misclassified: 3
Epoch 3, Misclassified: 4
Epoch 4, Misclassified: 4
Epoch 5, Misclassified: 4
Epoch 6, Misclassified: 4
Epoch 7, Misclassified: 4
Epoch 8, Misclassified: 4
Epoch 9, Misclassified: 4
Epoch 10, Misclassified: 4
Epoch 11, Misclassified: 4
Epoch 12, Misclassified: 4
Epoch 13, Misclassified: 4
Epoch 14, Misclassified: 4
Epoch 15, Misclassified: 4
Epoch 16, Misclassified: 4
Epoch 17, Misclassified: 4
Epoch 18, Misclassified: 4
Epoch 19, Misclassified: 4
Epoch 20, Misclassified: 4
Epoch 21, Misclassified: 4
Epoch 22, Misclassified: 4
Epoch 23, Misclassified: 4
Epoch 24, Misclassified: 4
Epoch 25, Misclassified: 4
Epoch 26, Misclassified: 4
Epoch 27, Misclassified: 4
Epoch 28, Misclassified: 4
Epoch 29, Misclassified: 4
Epoch 30, Misclassified: 4
Epoch 31, Misclassified: 4
Epoch 32, Misclassified: 4
Epoch 33, Misclassified: 4
Epoch 34, Misclassified: 4
Epoch 35, Misclassified: 4
Epoch 36, Misclassified: 4
Epoch 37, Misclassified: 4
Epoch 38, 

# Multi Layer Perceptron From Scratch

The XOR gate is a non-linearly separable problem, which a single-layer perceptron cannot solve. To learn it, we build an MLP with one hidden layer and use non-linear activation functions. We define a custom MLP class using PyTorch's nn.Module. The network has:
* 2 input nodes
* 1 hidden layer with 4 neurons
* 1 output node

We use a non-linear activation function (Tanh in the code below; Sigmoid is left as a commented-out alternative) to introduce non-linearity, which is essential for solving XOR.

In [None]:
'''nn.Module is the base class for all neural networks in PyTorch.

Think of it as:

🎒 A backpack that holds everything your model needs

It manages:

✅ Layers
✅ Weights & biases (parameters)
✅ Gradients
✅ Training/eval modes
✅ Saving & loading models

super(MLP, self).__init__() means "call the parent class's constructor".

So if your class inherits from another class, super() runs the parent's setup code.

'''

class MLP(nn.Module):
    """Two-layer perceptron: 2 inputs -> 4 hidden units -> 1 output.

    The same Tanh activation is applied after the hidden layer and after
    the output layer.
    """

    def __init__(self):
        # Run nn.Module's own setup so the layers below get registered
        super().__init__()
        self.hidden = nn.Linear(in_features=2, out_features=4)  # input -> hidden
        self.output = nn.Linear(in_features=4, out_features=1)  # hidden -> output
        # Tanh non-linearity (nn.Sigmoid() was the alternative tried here)
        self.activation = nn.Tanh()

    def forward(self, x):
        """Return the activated output of the network for input batch ``x``."""
        hidden_act = self.activation(self.hidden(x))
        return self.activation(self.output(hidden_act))

We instantiate the MLP model and set:

* Loss function: MSELoss (Mean Squared Error)
* Optimizer: SGD (Stochastic Gradient Descent)

These are standard components for training a supervised learning model.

In [None]:
# Network to train
model = MLP()

# Mean squared error between the network output and the XOR targets
criterion = nn.MSELoss()

# Plain stochastic gradient descent over every trainable parameter of the model
optimizer = optim.SGD(params=model.parameters(), lr=0.1)

This loop trains the MLP for 20000 epochs, printing the loss every 200 epochs. Each iteration includes:

* Forward pass: model makes predictions
* Loss computation
* Backward pass: compute gradients via autograd
* Optimizer step: update weights

The model learns to approximate the XOR function through non-linear transformation.

In [None]:
# Full-batch training: every epoch uses all four XOR samples at once
for epoch in range(20000):
    # Forward pass and loss on the whole dataset
    outputs = model(X)
    loss = criterion(outputs, y_true)

    # Clear stale gradients, backpropagate, then take one optimizer step
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Report progress only on every 200th epoch
    if (epoch+1) % 200 != 0:
        continue
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

Epoch 200, Loss: 0.2469
Epoch 400, Loss: 0.2320
Epoch 600, Loss: 0.2041
Epoch 800, Loss: 0.0646
Epoch 1000, Loss: 0.0074
Epoch 1200, Loss: 0.0030
Epoch 1400, Loss: 0.0018
Epoch 1600, Loss: 0.0012
Epoch 1800, Loss: 0.0009
Epoch 2000, Loss: 0.0007
Epoch 2200, Loss: 0.0006
Epoch 2400, Loss: 0.0005
Epoch 2600, Loss: 0.0005
Epoch 2800, Loss: 0.0004
Epoch 3000, Loss: 0.0004
Epoch 3200, Loss: 0.0003
Epoch 3400, Loss: 0.0003
Epoch 3600, Loss: 0.0003
Epoch 3800, Loss: 0.0003
Epoch 4000, Loss: 0.0002
Epoch 4200, Loss: 0.0002
Epoch 4400, Loss: 0.0002
Epoch 4600, Loss: 0.0002
Epoch 4800, Loss: 0.0002
Epoch 5000, Loss: 0.0002
Epoch 5200, Loss: 0.0002
Epoch 5400, Loss: 0.0002
Epoch 5600, Loss: 0.0001
Epoch 5800, Loss: 0.0001
Epoch 6000, Loss: 0.0001
Epoch 6200, Loss: 0.0001
Epoch 6400, Loss: 0.0001
Epoch 6600, Loss: 0.0001
Epoch 6800, Loss: 0.0001
Epoch 7000, Loss: 0.0001
Epoch 7200, Loss: 0.0001
Epoch 7400, Loss: 0.0001
Epoch 7600, Loss: 0.0001
Epoch 7800, Loss: 0.0001
Epoch 8000, Loss: 0.0001
Epoc

Finally, we use the trained model to make predictions on the XOR inputs and round the outputs to 0 or 1. A successful model should predict [0, 1, 1, 0] as expected from XOR logic.

In [None]:
# Inference only: no autograd graph is needed for the forward pass
with torch.no_grad():
    predictions = model(X)

# Show the outputs rounded to the nearest integer
print("\nPredictions (rounded):")
print(torch.round(predictions))


Predictions (rounded):
tensor([[0.],
        [1.],
        [1.],
        [0.]])


# MLP solving XOR without torch

In [None]:
import numpy as np

# XOR dataset: one input pair per row, targets as a column vector
X = np.array([
    [0, 0],
    [0, 1],
    [1, 0],
    [1, 1],
])

y = np.array([[0], [1], [1], [0]])


# Sigmoid and derivative
def sigmoid(x):
    """Return the element-wise logistic function 1 / (1 + exp(-x))."""
    return 1 / (1 + np.exp(-x))


def sigmoid_deriv(x):
    """Return the sigmoid derivative, given the sigmoid *output* ``x``.

    The argument must already be an activation s = sigmoid(z); the
    derivative with respect to z is then s * (1 - s).
    """
    return x * (1 - x)


# Initialize weights (fixed seed so every run starts from the same values)
np.random.seed(42)

Wij = np.random.randn(2, 2)  # input -> hidden
bij = np.zeros((1, 2))

Wjk = np.random.randn(2, 1)  # hidden -> output
bjk = np.zeros((1, 1))

lr = 0.1

# Training loop (full batch: all four samples every epoch)
for epoch in range(10000):

    # ---- Forward pass ----
    zij = X @ Wij + bij
    Oij = sigmoid(zij)

    zjk = Oij @ Wjk + bjk
    yk = sigmoid(zjk)

    # ---- Backprop ----
    # error = target - output, so the d_* terms below already point in the
    # descent direction and are *added* in the update step.
    error = y - yk

    d_yk = error * sigmoid_deriv(yk)
    # The output-layer weight gradient pairs each delta with the hidden
    # activation that fed it. Using yk here instead would yield a (1, 1)
    # scalar that silently broadcasts over both rows of Wjk.
    d_Wjk = Oij.T @ d_yk
    d_bjk = np.sum(d_yk, axis=0, keepdims=True)

    # Propagate the output delta back through Wjk to the hidden layer
    d_Oij = d_yk @ Wjk.T * sigmoid_deriv(Oij)
    d_Wij = X.T @ d_Oij
    d_bij = np.sum(d_Oij, axis=0, keepdims=True)

    # ---- Update ----
    Wjk += lr * d_Wjk
    bjk += lr * d_bjk
    Wij += lr * d_Wij
    bij += lr * d_bij

# ---- Test ----
# yk holds the outputs of the final forward pass of the training loop
print("Predictions:")
print(yk.round())


Predictions:
[[1.]
 [0.]
 [1.]
 [0.]]
