diff --git a/neural_network/optimizers/__init__.py b/neural_network/optimizers/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/neural_network/optimizers/adagrad.py b/neural_network/optimizers/adagrad.py
new file mode 100644
index 000000000000..f767e1a4e390
--- /dev/null
+++ b/neural_network/optimizers/adagrad.py
@@ -0,0 +1,88 @@
+"""
+Adagrad Optimizer
+
+Implements Adagrad (Adaptive Gradient) for neural network training using NumPy.
+Adagrad adapts the learning rate for each parameter based on historical gradients.
+
+Reference: https://en.wikipedia.org/wiki/Stochastic_gradient_descent#AdaGrad
+Author: Adhithya Laxman Ravi Shankar Geetha
+Date: 2025.10.22
+"""
+
+import numpy as np
+
+
+class Adagrad:
+    """
+    Adagrad optimizer.
+
+    Adapts learning rate individually for each parameter:
+        accumulated_grad += gradient^2
+        param = param - (learning_rate / (sqrt(accumulated_grad) + epsilon)) * gradient
+    """
+
+    def __init__(self, learning_rate: float = 0.01, epsilon: float = 1e-8) -> None:
+        """
+        Initialize Adagrad optimizer.
+
+        Args:
+            learning_rate (float): Initial learning rate.
+            epsilon (float): Small constant for numerical stability.
+
+        >>> optimizer = Adagrad(learning_rate=0.01, epsilon=1e-8)
+        >>> optimizer.learning_rate
+        0.01
+        """
+        self.learning_rate = learning_rate
+        self.epsilon = epsilon
+        self.accumulated_grad: dict[int, np.ndarray] = {}
+
+    def update(
+        self, param_id: int, params: np.ndarray, gradients: np.ndarray
+    ) -> np.ndarray:
+        """
+        Update parameters using Adagrad.
+
+        Args:
+            param_id (int): Unique identifier for parameter group.
+            params (np.ndarray): Current parameters.
+            gradients (np.ndarray): Gradients of parameters.
+
+        Returns:
+            np.ndarray: Updated parameters.
+
+        >>> optimizer = Adagrad(learning_rate=0.1)
+        >>> params = np.array([1.0, 2.0])
+        >>> grads = np.array([0.1, 0.2])
+        >>> updated = optimizer.update(0, params, grads)
+        >>> updated.shape
+        (2,)
+        """
+        if param_id not in self.accumulated_grad:
+            self.accumulated_grad[param_id] = np.zeros_like(params)
+
+        self.accumulated_grad[param_id] += gradients**2
+        adjusted_lr = self.learning_rate / (
+            np.sqrt(self.accumulated_grad[param_id]) + self.epsilon
+        )
+        return params - adjusted_lr * gradients
+
+
+# Usage example
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod()
+
+    print("Adagrad Example: Minimizing f(x) = x^2")
+
+    optimizer = Adagrad(learning_rate=1.0)
+    x = np.array([5.0])
+
+    for step in range(20):
+        gradient = 2 * x
+        x = optimizer.update(0, x, gradient)
+        if step % 5 == 0:
+            print(f"Step {step}: x = {x[0]:.4f}, f(x) = {x[0] ** 2:.4f}")
+
+    print(f"Final: x = {x[0]:.4f}, f(x) = {x[0] ** 2:.4f}")
diff --git a/neural_network/optimizers/adam_optimizer.py b/neural_network/optimizers/adam_optimizer.py
new file mode 100644
index 000000000000..edcbda4401b4
--- /dev/null
+++ b/neural_network/optimizers/adam_optimizer.py
@@ -0,0 +1,112 @@
+"""
+Adam Optimizer
+
+Implements Adam (Adaptive Moment Estimation) for neural network training using NumPy.
+Adam combines momentum and adaptive learning rates using first and
+second moment estimates.
+
+Reference: https://arxiv.org/abs/1412.6980
+Author: Adhithya Laxman Ravi Shankar Geetha
+Date: 2025.10.21
+"""
+
+import numpy as np
+
+
+class Adam:
+    """
+    Adam optimizer.
+
+    Combines momentum and RMSProp:
+        m = beta1 * m + (1 - beta1) * gradient
+        v = beta2 * v + (1 - beta2) * gradient^2
+        m_hat = m / (1 - beta1^t)
+        v_hat = v / (1 - beta2^t)
+        param = param - learning_rate * m_hat / (sqrt(v_hat) + epsilon)
+    """
+
+    def __init__(
+        self,
+        learning_rate: float = 0.001,
+        beta1: float = 0.9,
+        beta2: float = 0.999,
+        epsilon: float = 1e-8,
+    ) -> None:
+        """
+        Initialize Adam optimizer.
+
+        Args:
+            learning_rate (float): Learning rate.
+            beta1 (float): Exponential decay rate for first moment.
+            beta2 (float): Exponential decay rate for second moment.
+            epsilon (float): Small constant for numerical stability.
+
+        >>> optimizer = Adam(learning_rate=0.001, beta1=0.9, beta2=0.999)
+        >>> optimizer.beta1
+        0.9
+        """
+        self.learning_rate = learning_rate
+        self.beta1 = beta1
+        self.beta2 = beta2
+        self.epsilon = epsilon
+        self.m: dict[int, np.ndarray] = {}
+        self.v: dict[int, np.ndarray] = {}
+        self.t: dict[int, int] = {}
+
+    def update(
+        self, param_id: int, params: np.ndarray, gradients: np.ndarray
+    ) -> np.ndarray:
+        """
+        Update parameters using Adam.
+
+        Args:
+            param_id (int): Unique identifier for parameter group.
+            params (np.ndarray): Current parameters.
+            gradients (np.ndarray): Gradients of parameters.
+
+        Returns:
+            np.ndarray: Updated parameters.
+
+        >>> optimizer = Adam(learning_rate=0.1)
+        >>> params = np.array([1.0, 2.0])
+        >>> grads = np.array([0.1, 0.2])
+        >>> updated = optimizer.update(0, params, grads)
+        >>> updated.shape
+        (2,)
+        """
+        if param_id not in self.m:
+            self.m[param_id] = np.zeros_like(params)
+            self.v[param_id] = np.zeros_like(params)
+            self.t[param_id] = 0
+
+        self.t[param_id] += 1
+
+        self.m[param_id] = self.beta1 * self.m[param_id] + (1 - self.beta1) * gradients
+        self.v[param_id] = self.beta2 * self.v[param_id] + (1 - self.beta2) * (
+            gradients**2
+        )
+
+        m_hat = self.m[param_id] / (1 - self.beta1 ** self.t[param_id])
+        v_hat = self.v[param_id] / (1 - self.beta2 ** self.t[param_id])
+
+        return params - self.learning_rate * m_hat / (np.sqrt(v_hat) + self.epsilon)
+
+
+# Usage example
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod()
+
+    print("Adam Example: Minimizing f(x) = x^2")
+
+    optimizer = Adam(learning_rate=0.1)
+    x = np.array([5.0])
+
+    for step in range(20):
+        gradient = 2 * x
+        x = optimizer.update(0, x, gradient)
+        if step % 5 == 0:
+            print(f"Step {step}: x = {x[0]:.4f}, f(x) = {x[0] ** 2:.4f}")
+
+    print(f"Final: x = {x[0]:.4f}, f(x) = {x[0] ** 2:.4f}")
diff --git a/neural_network/optimizers/muon.py b/neural_network/optimizers/muon.py
new file mode 100644
index 000000000000..7be037484baf
--- /dev/null
+++ b/neural_network/optimizers/muon.py
@@ -0,0 +1,117 @@
+"""
+Muon Optimizer
+
+Implements Muon optimizer for neural network hidden layers using NumPy.
+Muon uses Newton-Schulz orthogonalization iterations for improved convergence.
+
+Reference: https://kellerjordan.github.io/posts/muon/
+Author: Adhithya Laxman Ravi Shankar Geetha
+Date: 2025.10.21
+"""
+
+import numpy as np
+
+
+class Muon:
+    """
+    Muon optimizer for hidden layer weight matrices.
+
+    Applies Newton-Schulz orthogonalization to gradients before updates.
+    """
+
+    def __init__(
+        self, learning_rate: float = 0.02, momentum: float = 0.95, ns_steps: int = 5
+    ) -> None:
+        """
+        Initialize Muon optimizer.
+
+        Args:
+            learning_rate (float): Learning rate for updates.
+            momentum (float): Momentum factor.
+            ns_steps (int): Number of Newton-Schulz iteration steps.
+
+        >>> optimizer = Muon(learning_rate=0.02, momentum=0.95, ns_steps=5)
+        >>> optimizer.momentum
+        0.95
+        """
+        self.learning_rate = learning_rate
+        self.momentum = momentum
+        self.ns_steps = ns_steps
+        self.velocity: dict[int, np.ndarray] = {}
+
+    def newton_schulz_orthogonalize(self, matrix: np.ndarray) -> np.ndarray:
+        """
+        Orthogonalize matrix using Newton-Schulz iterations.
+
+        Args:
+            matrix (np.ndarray): Input matrix.
+
+        Returns:
+            np.ndarray: Orthogonalized matrix.
+
+        >>> optimizer = Muon()
+        >>> mat = np.array([[1.0, 0.5], [0.5, 1.0]])
+        >>> orth = optimizer.newton_schulz_orthogonalize(mat)
+        >>> orth.shape
+        (2, 2)
+        """
+        if matrix.shape[0] < matrix.shape[1]:
+            matrix = matrix.T
+            transposed = True
+        else:
+            transposed = False
+
+        a = matrix / (np.linalg.norm(matrix) + 1e-8)  # keep singular values < sqrt(3)
+        for _ in range(self.ns_steps):
+            a = 1.5 * a - 0.5 * a @ (a.T @ a)
+
+        return a.T if transposed else a
+
+    def update(
+        self, param_id: int, params: np.ndarray, gradients: np.ndarray
+    ) -> np.ndarray:
+        """
+        Update parameters using Muon.
+
+        Args:
+            param_id (int): Unique identifier for parameter group.
+            params (np.ndarray): Current parameters.
+            gradients (np.ndarray): Gradients of parameters.
+
+        Returns:
+            np.ndarray: Updated parameters.
+
+        >>> optimizer = Muon(learning_rate=0.1, momentum=0.9)
+        >>> params = np.array([[1.0, 2.0], [3.0, 4.0]])
+        >>> grads = np.array([[0.1, 0.2], [0.3, 0.4]])
+        >>> updated = optimizer.update(0, params, grads)
+        >>> updated.shape
+        (2, 2)
+        """
+        if param_id not in self.velocity:
+            self.velocity[param_id] = np.zeros_like(params)
+
+        ortho_grad = self.newton_schulz_orthogonalize(gradients)
+        self.velocity[param_id] = self.momentum * self.velocity[param_id] + ortho_grad
+
+        return params - self.learning_rate * self.velocity[param_id]
+
+
+# Usage example
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod()
+
+    print("Muon Example: Optimizing a 2x2 matrix")
+
+    optimizer = Muon(learning_rate=0.05, momentum=0.9)
+    weights = np.array([[1.0, 2.0], [3.0, 4.0]])
+
+    for step in range(10):
+        gradients = 0.1 * weights  # Simplified gradient
+        weights = optimizer.update(0, weights, gradients)
+        if step % 3 == 0:
+            print(f"Step {step}: weights =\n{weights}")
+
+    print(f"Final weights:\n{weights}")
diff --git a/neural_network/optimizers/nesterov_accelerated_sgd.py b/neural_network/optimizers/nesterov_accelerated_sgd.py
new file mode 100644
index 000000000000..a4f1b63ae683
--- /dev/null
+++ b/neural_network/optimizers/nesterov_accelerated_sgd.py
@@ -0,0 +1,92 @@
+"""
+Nesterov Accelerated Gradient (NAG) Optimizer
+
+Implements Nesterov momentum for neural network training using NumPy.
+NAG looks ahead and computes gradients at the anticipated position.
+
+Reference: https://cs231n.github.io/neural-networks-3/#sgd
+Author: Adhithya Laxman Ravi Shankar Geetha
+Date: 2025.10.21
+"""
+
+import numpy as np
+
+
+class NesterovAcceleratedGradient:
+    """
+    Nesterov Accelerated Gradient (NAG) optimizer.
+
+    Updates parameters using Nesterov momentum:
+        velocity = momentum * velocity - learning_rate * gradient_at_lookahead
+        param = param + velocity
+    """
+
+    def __init__(self, learning_rate: float = 0.01, momentum: float = 0.9) -> None:
+        """
+        Initialize NAG optimizer.
+
+        Args:
+            learning_rate (float): Learning rate for weight updates.
+            momentum (float): Momentum factor.
+
+        >>> optimizer = NesterovAcceleratedGradient(learning_rate=0.01, momentum=0.9)
+        >>> optimizer.momentum
+        0.9
+        """
+        self.learning_rate = learning_rate
+        self.momentum = momentum
+        self.velocity: dict[int, np.ndarray] = {}
+
+    def update(
+        self, param_id: int, params: np.ndarray, gradients: np.ndarray
+    ) -> np.ndarray:
+        """
+        Update parameters using NAG.
+
+        Args:
+            param_id (int): Unique identifier for parameter group.
+            params (np.ndarray): Current parameters.
+            gradients (np.ndarray): Gradients at lookahead position.
+
+        Returns:
+            np.ndarray: Updated parameters.
+
+        >>> optimizer = NesterovAcceleratedGradient(learning_rate=0.1, momentum=0.9)
+        >>> params = np.array([1.0, 2.0])
+        >>> grads = np.array([0.1, 0.2])
+        >>> updated = optimizer.update(0, params, grads)
+        >>> updated.shape
+        (2,)
+        """
+        if param_id not in self.velocity:
+            self.velocity[param_id] = np.zeros_like(params)
+
+        velocity_prev = self.velocity[param_id].copy()
+        self.velocity[param_id] = (
+            self.momentum * self.velocity[param_id] - self.learning_rate * gradients
+        )
+        return (
+            params
+            - self.momentum * velocity_prev
+            + (1 + self.momentum) * self.velocity[param_id]
+        )
+
+
+# Usage example
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod()
+
+    print("NAG Example: Minimizing f(x) = x^2")
+
+    optimizer = NesterovAcceleratedGradient(learning_rate=0.1, momentum=0.9)
+    x = np.array([5.0])
+
+    for step in range(20):
+        gradient = 2 * x
+        x = optimizer.update(0, x, gradient)
+        if step % 5 == 0:
+            print(f"Step {step}: x = {x[0]:.4f}, f(x) = {x[0] ** 2:.4f}")
+
+    print(f"Final: x = {x[0]:.4f}, f(x) = {x[0] ** 2:.4f}")
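All four classes expose the same update(param_id, params, gradients) interface, so they can be swapped freely inside a training loop. The sketch below exercises three of them on the same toy objective, f(x) = x^2, that the files' own __main__ demos use; Muon is left out because its Newton-Schulz step expects a 2-D weight matrix. This driver is illustrative only and not part of the patch: the import paths assume the repository root is on PYTHONPATH, and the helper name minimize_quadratic is hypothetical.

import numpy as np

from neural_network.optimizers.adagrad import Adagrad
from neural_network.optimizers.adam_optimizer import Adam
from neural_network.optimizers.nesterov_accelerated_sgd import (
    NesterovAcceleratedGradient,
)


def minimize_quadratic(optimizer, steps: int = 50) -> float:
    """Run `steps` updates on f(x) = x^2 from x = 5.0 and return the final x."""
    x = np.array([5.0])
    for _ in range(steps):
        gradient = 2 * x  # df/dx for f(x) = x^2
        x = optimizer.update(0, x, gradient)
    return float(x[0])


if __name__ == "__main__":
    for name, opt in [
        ("Adagrad", Adagrad(learning_rate=1.0)),
        ("Adam", Adam(learning_rate=0.1)),
        ("NAG", NesterovAcceleratedGradient(learning_rate=0.1, momentum=0.9)),
    ]:
        print(f"{name}: final x = {minimize_quadratic(opt):.6f}")

Each optimizer keeps its per-parameter state keyed by param_id, so a single instance can drive several parameter groups at once.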