In [31]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score

# Generate a random dataset for 3-class classification
X, y = make_classification(n_samples=300, n_features=4, n_classes=3, n_informative=4, n_redundant=0, random_state=42)

# One-hot encoding the target labels
encoder = OneHotEncoder(sparse_output=False)
y_encoded = encoder.fit_transform(y.reshape(-1, 1))

# Initialize network parameters
input_size = 4
hidden1_size = 3
hidden2_size = 4
output_size = 3
np.random.seed(42)

# Common hyperparameters
learning_rate = 0.01
epochs = 10
momentum = 0.9
beta1 = 0.9
beta2 = 0.999
epsilon = 1e-8

def initialize_parameters():
    """ Initialize weights and biases for the network. """
    weights1 = np.random.randn(input_size, hidden1_size) * 0.01
    bias1 = np.zeros((1, hidden1_size))
    weights2 = np.random.randn(hidden1_size, hidden2_size) * 0.01
    bias2 = np.zeros((1, hidden2_size))
    weights3 = np.random.randn(hidden2_size, output_size) * 0.01
    bias3 = np.zeros((1, output_size))
    return weights1, bias1, weights2, bias2, weights3, bias3

def softmax(z):
    exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
    return exp_z / np.sum(exp_z, axis=1, keepdims=True)

def relu(x):
    return np.maximum(0, x)

def relu_derivative(x):
    return (x > 0).astype(float)

def forward_pass(X, weights1, bias1, weights2, bias2, weights3, bias3):
    z1 = np.dot(X, weights1) + bias1
    a1 = relu(z1)
    z2 = np.dot(a1, weights2) + bias2
    a2 = relu(z2)
    z3 = np.dot(a2, weights3) + bias3
    output = softmax(z3)
    return a1, a2, output

def compute_loss(output, y_encoded):
    return -np.mean(np.sum(y_encoded * np.log(output + 1e-8), axis=1))

def backpropagation(X, y_encoded, a1, a2, output, weights3, weights2):
    dz3 = output - y_encoded
    dw3 = np.dot(a2.T, dz3) / X.shape[0]
    db3 = np.sum(dz3, axis=0, keepdims=True) / X.shape[0]

    dz2 = np.dot(dz3, weights3.T) * relu_derivative(a2)
    dw2 = np.dot(a1.T, dz2) / X.shape[0]
    db2 = np.sum(dz2, axis=0, keepdims=True) / X.shape[0]

    dz1 = np.dot(dz2, weights2.T) * relu_derivative(a1)
    dw1 = np.dot(X.T, dz1) / X.shape[0]
    db1 = np.sum(dz1, axis=0, keepdims=True) / X.shape[0]

    return dw1, db1, dw2, db2, dw3, db3

epochs = 500  # Increased from 100 to 500
learning_rate = 0.01  # Default initial learning rate

def train_model(optimizer_name):
    weights1, bias1, weights2, bias2, weights3, bias3 = initialize_parameters()

    # Initialize optimizer-specific parameters
    velocity_w1, velocity_b1 = np.zeros_like(weights1), np.zeros_like(bias1)
    cache_w1, cache_b1 = np.zeros_like(weights1), np.zeros_like(bias1)
    m_w1, v_w1 = np.zeros_like(weights1), np.zeros_like(weights1)

    lr = learning_rate
    if optimizer_name == "SGD":
        lr = 0.1
    elif optimizer_name == "Momentum":
        lr = 0.05
        momentum = 0.95
    elif optimizer_name == "AdaGrad":
        lr = 0.01
    elif optimizer_name == "RMSProp":
        lr = 0.001
        decay_rate = 0.9
    elif optimizer_name == "Adam":
        lr = 0.001
        beta1 = 0.9
        beta2 = 0.999

    for epoch in range(epochs):
        # Forward pass
        a1, a2, output = forward_pass(X, weights1, bias1, weights2, bias2, weights3, bias3)
        loss = compute_loss(output, y_encoded)

        if epoch % 100 == 0:
            print(f"Epoch {epoch}/{epochs} | Loss: {loss:.4f} | Optimizer: {optimizer_name}")

        # Backpropagation
        dw1, db1, dw2, db2, dw3, db3 = backpropagation(X, y_encoded, a1, a2, output, weights3, weights2)

        # Update weights and biases based on optimizer
        if optimizer_name == "SGD":
            weights1 -= lr * dw1
            bias1 -= lr * db1
            weights2 -= lr * dw2
            bias2 -= lr * db2
            weights3 -= lr * dw3
            bias3 -= lr * db3

        elif optimizer_name == "Momentum":
            velocity_w1 = momentum * velocity_w1 - lr * dw1
            weights1 += velocity_w1

        elif optimizer_name == "AdaGrad":
            cache_w1 += dw1 ** 2
            weights1 -= (lr / (np.sqrt(cache_w1 + epsilon))) * dw1

        elif optimizer_name == "RMSProp":
            cache_w1 = decay_rate * cache_w1 + (1 - decay_rate) * dw1 ** 2
            weights1 -= (lr / (np.sqrt(cache_w1 + epsilon))) * dw1

        elif optimizer_name == "Adam":
            m_w1 = beta1 * m_w1 + (1 - beta1) * dw1
            v_w1 = beta2 * v_w1 + (1 - beta2) * (dw1 ** 2)
            m_w1_corrected = m_w1 / (1 - beta1 ** (epoch + 1))
            v_w1_corrected = v_w1 / (1 - beta2 ** (epoch + 1))
            weights1 -= (lr * m_w1_corrected) / (np.sqrt(v_w1_corrected) + epsilon)

    # Evaluate the model
    _, _, final_output = forward_pass(X, weights1, bias1, weights2, bias2, weights3, bias3)
    y_pred = np.argmax(final_output, axis=1)
    accuracy = accuracy_score(y, y_pred)
    return accuracy

# Test all optimizers with increased epochs and hyperparameter tuning
results = {}

for optimizer in optimizers:
    accuracy = train_model(optimizer)
    results[optimizer] = accuracy * 100

# Print updated accuracy for each optimizer
print("\nFinal Accuracy Results (500 Epochs):")
for opt, acc in results.items():
    print(f"Optimizer: {opt}, Accuracy: {acc:.2f}%")



Epoch 0/500 | Loss: 1.0986 | Optimizer: SGD
Epoch 100/500 | Loss: 1.0986 | Optimizer: SGD
Epoch 200/500 | Loss: 1.0986 | Optimizer: SGD
Epoch 300/500 | Loss: 1.0986 | Optimizer: SGD
Epoch 400/500 | Loss: 1.0986 | Optimizer: SGD
Epoch 0/500 | Loss: 1.0986 | Optimizer: Momentum
Epoch 100/500 | Loss: 1.0986 | Optimizer: Momentum
Epoch 200/500 | Loss: 1.0986 | Optimizer: Momentum
Epoch 300/500 | Loss: 1.0986 | Optimizer: Momentum
Epoch 400/500 | Loss: 1.0986 | Optimizer: Momentum
Epoch 0/500 | Loss: 1.0986 | Optimizer: AdaGrad
Epoch 100/500 | Loss: 1.0986 | Optimizer: AdaGrad
Epoch 200/500 | Loss: 1.0986 | Optimizer: AdaGrad
Epoch 300/500 | Loss: 1.0986 | Optimizer: AdaGrad
Epoch 400/500 | Loss: 1.0986 | Optimizer: AdaGrad
Epoch 0/500 | Loss: 1.0986 | Optimizer: RMSProp
Epoch 100/500 | Loss: 1.0986 | Optimizer: RMSProp
Epoch 200/500 | Loss: 1.0986 | Optimizer: RMSProp
Epoch 300/500 | Loss: 1.0986 | Optimizer: RMSProp
Epoch 400/500 | Loss: 1.0986 | Optimizer: RMSProp
Epoch 0/500 | Loss: 1.0