In [1]:
# Nesterov Accelerated Gradient Descent
import numpy as np

In [2]:
# Activation function
def relu(x):
    return np.maximum(0, x)

def relu_derivative(x):
    return np.where(x > 0, 1, 0)

In [3]:
# He initialization
def he_initialization(shape):
    return np.random.randn(*shape) * np.sqrt(2 / shape[0])

In [5]:
# Neural Network Model
class NeuralNet:
    def __init__(self, input_size, hidden_size, output_size):
        self.fc1_weights = he_initialization((input_size, hidden_size))
        self.fc1_bias = np.zeros(hidden_size)
        self.fc2_weights = he_initialization((hidden_size, output_size))
        self.fc2_bias = np.zeros(output_size)
        self.v_fc1_weights = np.zeros_like(self.fc1_weights)
        self.v_fc1_bias = np.zeros_like(self.fc1_bias)
        self.v_fc2_weights = np.zeros_like(self.fc2_weights)
        self.v_fc2_bias = np.zeros_like(self.fc2_bias)

    def forward(self, x):
        self.z1 = np.dot(x, self.fc1_weights) + self.fc1_bias
        self.a1 = relu(self.z1)
        self.z2 = np.dot(self.a1, self.fc2_weights) + self.fc2_bias
        return self.z2

    def backward(self, x, y, output, lr, momentum):
        m = y.shape[0]
        dz2 = output - y
        dw2 = np.dot(self.a1.T, dz2) / m
        db2 = np.sum(dz2, axis=0) / m
        dz1 = np.dot(dz2, self.fc2_weights.T) * relu_derivative(self.z1)
        dw1 = np.dot(x.T, dz1) / m
        db1 = np.sum(dz1, axis=0) / m

        self.v_fc2_weights = momentum * self.v_fc2_weights - lr * dw2
        self.v_fc2_bias = momentum * self.v_fc2_bias - lr * db2
        self.v_fc1_weights = momentum * self.v_fc1_weights - lr * dw1
        self.v_fc1_bias = momentum * self.v_fc1_bias - lr * db1

        self.fc2_weights += momentum * self.v_fc2_weights - lr * dw2
        self.fc2_bias += momentum * self.v_fc2_bias - lr * db2
        self.fc1_weights += momentum * self.v_fc1_weights - lr * dw1
        self.fc1_bias += momentum * self.v_fc1_bias - lr * db1

    def compute_loss(self, y_pred, y_true):
        return np.mean(np.square(y_pred - y_true))

In [6]:
# Training the Neural Network
input_size, hidden_size, output_size = 784, 500, 10
lr, momentum, epochs, batch_size = 0.01, 0.9, 5, 64
model = NeuralNet(input_size, hidden_size, output_size)


In [7]:
# Dummy training data
X_train = np.random.randn(batch_size, input_size)
y_train = np.random.randint(0, output_size, (batch_size, output_size))


In [8]:
for epoch in range(epochs):
    outputs = model.forward(X_train)
    loss = model.compute_loss(outputs, y_train)
    model.backward(X_train, y_train, outputs, lr, momentum)
    print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss:.4f}')

Epoch [1/5], Loss: 30.5899
Epoch [2/5], Loss: 118.2330
Epoch [3/5], Loss: 70.1710
Epoch [4/5], Loss: 32.1415
Epoch [5/5], Loss: 27.9757


In [9]:
print("Training complete.")

Training complete.
