In [1]:
import numpy as np

In [2]:
def act_hid(x):
    """Hidden-layer activation (ReLU): element-wise maximum of 0 and ``x``."""
    floor = 0
    rectified = np.maximum(floor, x)
    return rectified

In [28]:
def der_hid(a):
    """Derivative of the hidden activation: 1.0 where ``a`` is positive, else 0.0.

    NOTE: an identical ``der_hid`` is defined again in a later cell of this
    notebook; whichever cell runs last is the definition in effect.
    """
    return (0 < a).astype(float)

In [4]:
def act_out(x):
    """Output activation: softmax taken along axis 0.

    The per-column maximum is subtracted before exponentiating so that
    ``np.exp`` never receives a large positive argument; the result is then
    normalised by the sum along axis 0.
    """
    column_peak = np.max(x, axis=0, keepdims=True)
    weights = np.exp(x - column_peak)
    normaliser = np.sum(weights, axis=0, keepdims=True)
    return weights / normaliser


In [None]:
def der_hid(a):
    """Return a float mask of ``a``: 1.0 where ``a > 0`` and 0.0 elsewhere.

    NOTE: this repeats the ``der_hid`` defined in an earlier cell of this
    notebook with the same behaviour.
    """
    is_positive = a > 0
    return is_positive.astype(float)

In [None]:
def forward_propagation(X,parameters):
    """Run the network forward and record every pre-activation and activation.

    Arguments:
      X          -- network input; stored in the cache under "h0"
      parameters -- dict holding "W1", "b1", ..., "WL", "bL"
                    (L is taken as ``len(parameters) // 2``)

    Returns:
      (output, cache) -- ``output`` is the last layer's activation and
                         ``cache`` maps "h0", "a1", "h1", ..., "aL", "hL"
                         to the corresponding arrays.
    """
    num_layers = len(parameters) // 2
    cache = {"h0": X}

    for layer in range(1, num_layers + 1):
        weights = parameters[f"W{layer}"]
        bias = parameters[f"b{layer}"]
        pre_activation = np.dot(weights, cache[f"h{layer - 1}"]) + bias
        # Only the final layer uses the output activation.
        activation_fn = act_out if layer == num_layers else act_hid
        post_activation = activation_fn(pre_activation)
        cache[f"a{layer}"] = pre_activation
        cache[f"h{layer}"] = post_activation

    return cache[f"h{num_layers}"], cache
    

    

In [22]:
def backprop(Y,act_preact,parameters):
    """Compute gradients of every weight and bias from a forward-pass cache.

    Arguments:
      Y          -- targets; ``Y.shape[1]`` is used as the batch size
      act_preact -- cache with "h0", "a1", "h1", ..., "aL", "hL"
      parameters -- dict holding "W1", "b1", ..., "WL", "bL"

    Returns:
      dict with "dW{i}" and "db{i}" for every layer, each averaged over the
      batch.

    The output-layer error is taken as ``hL - Y``; presumably this pairs the
    softmax output with a cross-entropy loss -- confirm against the training
    objective. Hidden-layer errors are back-propagated through ``der_hid``.
    """
    num_layers = len(parameters) // 2
    batch = Y.shape[1]
    gradients = {}

    def record(layer, delta):
        # Store batch-averaged weight and bias gradients for one layer.
        gradients[f"dW{layer}"] = (1 / batch) * np.dot(delta, act_preact[f"h{layer - 1}"].T)
        gradients[f"db{layer}"] = (1 / batch) * np.sum(delta, axis=1, keepdims=True)

    # Output layer.
    delta = act_preact[f"h{num_layers}"] - Y
    record(num_layers, delta)

    # Hidden layers, from the last hidden layer down to layer 1.
    for layer in reversed(range(1, num_layers)):
        upstream = np.dot(parameters[f"W{layer + 1}"].T, delta)
        delta = upstream * der_hid(act_preact[f"a{layer}"])
        record(layer, delta)

    return gradients

    

In [16]:
def update_parameters_sgd(parameters,grad_w_b,eta):
    """Plain gradient-descent step: subtract ``eta`` times each gradient.

    Arguments:
      parameters -- dict holding "W1", "b1", ..., "WL", "bL"; updated in place
      grad_w_b   -- dict holding the matching "dW{i}" / "db{i}" gradients
      eta        -- learning rate

    Returns:
      the same ``parameters`` dict.
    """
    layer_count = len(parameters) // 2
    for idx in range(1, layer_count + 1):
        for kind in ("W", "b"):
            parameters[f"{kind}{idx}"] -= eta * grad_w_b[f"d{kind}{idx}"]
    return parameters
    




In [9]:
def update_parameters_momentum(parameters, grad_w_b, state, eta, beta):
    """Momentum step: ``v = beta * v + eta * grad`` followed by ``param -= v``.

    Arguments:
      parameters -- dict holding "W1", "b1", ..., "WL", "bL"; updated in place
      grad_w_b   -- dict holding the matching "dW{i}" / "db{i}" gradients
      state      -- dict holding velocities "vW{i}" / "vb{i}"; missing entries
                    are created as zeros shaped like the parameter
      eta        -- learning rate
      beta       -- momentum coefficient

    Returns:
      (parameters, state) -- the same two dicts that were passed in.
    """
    layer_count = len(parameters) // 2
    for idx in range(1, layer_count + 1):
        # (parameter key, velocity key, gradient key) for weights then biases.
        triples = (
            (f"W{idx}", f"vW{idx}", f"dW{idx}"),
            (f"b{idx}", f"vb{idx}", f"db{idx}"),
        )

        for param_key, vel_key, _ in triples:
            state.setdefault(vel_key, np.zeros_like(parameters[param_key]))

        for _, vel_key, grad_key in triples:
            state[vel_key] = beta * state[vel_key] + eta * grad_w_b[grad_key]

        for param_key, vel_key, _ in triples:
            parameters[param_key] -= state[vel_key]
    return parameters, state

In [10]:
def update_parameters_nesterov(parameters, grad_w_b_w_b, state, eta, beta):
    """Nesterov-momentum step using gradients evaluated at the current parameters.

    Arguments:
      parameters   -- dict holding "W1", "b1", ..., "WL", "bL"; updated in place
      grad_w_b_w_b -- dict holding the matching "dW{i}" / "db{i}" gradients
      state        -- dict holding velocities "vW{i}" / "vb{i}"; a missing
                      entry is treated as zeros shaped like the parameter
      eta          -- learning rate
      beta         -- momentum coefficient

    Returns:
      (parameters, state) -- the same two dicts that were passed in.

    Update rule (per parameter):
        v_new  = beta * v_prev + eta * grad
        param -= beta * v_new + eta * grad

    which equals ``(1 + beta) * v_new - beta * v_prev``, the usual look-ahead
    reformulation of Nesterov momentum. The earlier expression
    ``beta * v_prev + (1 - beta) * v_new`` had the wrong signs and scaled the
    fresh gradient by only ``(1 - beta)``.
    """
    L = len(parameters) // 2
    for l in range(1, L+1):
        for kind in ("W", "b"):
            param_key = f"{kind}{l}"
            vel_key = f"v{kind}{l}"

            v_prev = state.get(vel_key)
            if v_prev is None:
                v_prev = np.zeros_like(parameters[param_key])

            grad_step = eta * grad_w_b_w_b[f"d{kind}{l}"]
            v_new = beta * v_prev + grad_step
            state[vel_key] = v_new

            # Look-ahead correction: momentum applied to the *new* velocity
            # plus the plain gradient step.
            parameters[param_key] -= beta * v_new + grad_step
    return parameters, state


In [11]:
def update_parameters_rmsprop(parameters, grad_w_b_w_b, state, eta, beta, epsilon=1e-8):
    """RMSprop step: scale each gradient by a running RMS of past gradients.

    Arguments:
      parameters   -- dict holding "W1", "b1", ..., "WL", "bL"; updated in place
      grad_w_b_w_b -- dict holding the matching "dW{i}" / "db{i}" gradients
      state        -- dict holding squared-gradient averages "sW{i}" / "sb{i}";
                      missing entries are created as zeros
      eta          -- learning rate
      beta         -- decay rate of the squared-gradient average
      epsilon      -- added to the square root in the denominator

    Returns:
      (parameters, state) -- the same two dicts that were passed in.
    """
    layer_count = len(parameters) // 2
    for idx in range(1, layer_count + 1):
        w_key, b_key = f"W{idx}", f"b{idx}"
        sq_w, sq_b = f"sW{idx}", f"sb{idx}"
        g_w, g_b = f"dW{idx}", f"db{idx}"

        state.setdefault(sq_w, np.zeros_like(parameters[w_key]))
        state.setdefault(sq_b, np.zeros_like(parameters[b_key]))

        # Exponential moving average of the squared gradients.
        state[sq_w] = beta * state[sq_w] + (1-beta) * (grad_w_b_w_b[g_w] ** 2)
        state[sq_b] = beta * state[sq_b] + (1-beta) * (grad_w_b_w_b[g_b] ** 2)

        rms_w = np.sqrt(state[sq_w]) + epsilon
        parameters[w_key] -= eta * grad_w_b_w_b[g_w] / rms_w
        rms_b = np.sqrt(state[sq_b]) + epsilon
        parameters[b_key] -= eta * grad_w_b_w_b[g_b] / rms_b
    return parameters, state

In [12]:
def update_parameters_adam(parameters, grad_w_b_w_b, state, eta, beta1, beta2, epsilon, t):
    """Adam step with bias-corrected first and second moment estimates.

    Arguments:
      parameters   -- dict holding "W1", "b1", ..., "WL", "bL"; updated in place
      grad_w_b_w_b -- dict holding the matching "dW{i}" / "db{i}" gradients
      state        -- dict holding first moments "vW{i}" / "vb{i}" and second
                      moments "sW{i}" / "sb{i}"; missing entries start at zero
      eta          -- learning rate
      beta1, beta2 -- decay rates of the first / second moment averages
      epsilon      -- added to the square root in the denominator
      t            -- step counter used in the bias correction ``1 - beta**t``
                      (``t = 0`` would make that denominator zero)

    Returns:
      (parameters, state) -- the same two dicts that were passed in.
    """
    layer_count = len(parameters) // 2
    for idx in range(1, layer_count + 1):
        w_key, b_key = f"W{idx}", f"b{idx}"
        m_w, m_b = f"vW{idx}", f"vb{idx}"
        s_w, s_b = f"sW{idx}", f"sb{idx}"
        g_w, g_b = f"dW{idx}", f"db{idx}"

        for slot, source in ((m_w, w_key), (m_b, b_key), (s_w, w_key), (s_b, b_key)):
            state.setdefault(slot, np.zeros_like(parameters[source]))

        # Moving averages of the gradient and of its square.
        state[m_w] = beta1 * state[m_w] + (1 - beta1) * grad_w_b_w_b[g_w]
        state[m_b] = beta1 * state[m_b] + (1 - beta1) * grad_w_b_w_b[g_b]
        state[s_w] = beta2 * state[s_w] + (1 - beta2) * (grad_w_b_w_b[g_w] ** 2)
        state[s_b] = beta2 * state[s_b] + (1 - beta2) * (grad_w_b_w_b[g_b] ** 2)

        # Bias correction for the zero-initialised averages.
        m_hat_w = state[m_w] / (1 - beta1**t)
        m_hat_b = state[m_b] / (1 - beta1**t)
        s_hat_w = state[s_w] / (1 - beta2**t)
        s_hat_b = state[s_b] / (1 - beta2**t)

        parameters[w_key] -= eta * m_hat_w / (np.sqrt(s_hat_w) + epsilon)
        parameters[b_key] -= eta * m_hat_b / (np.sqrt(s_hat_b) + epsilon)
    return parameters, state

In [13]:
def update_parameters_nadam(parameters, grad_w_b_w_b, state, eta, beta1, beta2, epsilon, t):
    """Nadam step: Adam with a Nesterov-style blend of momentum and gradient.

    NOTE: ``update_parameters_nadam`` is defined again in a later cell of this
    notebook; the later definition is the one in effect after Run All.

    Arguments:
      parameters   -- dict holding "W1", "b1", ..., "WL", "bL"; updated in place
      grad_w_b_w_b -- dict holding the matching "dW{i}" / "db{i}" gradients
      state        -- dict holding first moments "vW{i}" / "vb{i}" and second
                      moments "sW{i}" / "sb{i}"; missing entries start at zero
      eta          -- learning rate
      beta1, beta2 -- decay rates of the first / second moment averages
      epsilon      -- added to the square root in the denominator
      t            -- step counter, expected to start at 1 (``t = 0`` makes the
                      bias-correction denominator zero)

    Returns:
      (parameters, state) -- the same two dicts that were passed in.

    Update rule (per parameter):
        m_hat = m / (1 - beta1**t),   s_hat = s / (1 - beta2**t)
        param -= eta * (beta1 * m_hat + (1 - beta1) * grad / (1 - beta1**t))
                 / (sqrt(s_hat) + epsilon)

    The raw-gradient term uses the same step-``t`` correction as ``m_hat``;
    it was previously divided by ``1 - beta1**(t + 1)``, which under-weighted
    the current gradient, most visibly in early steps.
    """
    L = len(parameters) // 2
    for l in range(1, L+1):
        for kind in ("W", "b"):
            param_key = f"{kind}{l}"
            m_key, s_key = f"v{kind}{l}", f"s{kind}{l}"
            grad = grad_w_b_w_b[f"d{kind}{l}"]

            state.setdefault(m_key, np.zeros_like(parameters[param_key]))
            state.setdefault(s_key, np.zeros_like(parameters[param_key]))

            state[m_key] = beta1 * state[m_key] + (1-beta1) * grad
            state[s_key] = beta2 * state[s_key] + (1-beta2) * (grad ** 2)

            m_hat = state[m_key] / (1 - beta1**t)
            s_hat = state[s_key] / (1 - beta2**t)

            # Nesterov blend of corrected momentum and corrected gradient.
            blended = beta1 * m_hat + (1-beta1) * grad / (1 - beta1**t)
            parameters[param_key] -= eta * blended / (np.sqrt(s_hat) + epsilon)
    return parameters, state


In [14]:
def update_parameters_nadam(parameters, grad_w_b_w_b, state, eta, beta1, beta2, epsilon, t):
    """
    Updates parameters using Nadam.
    Nadam combines Adam with Nesterov momentum.

    Arguments:
      parameters   -- dict holding "W1", "b1", ..., "WL", "bL"; updated in place
      grad_w_b_w_b -- dict holding the matching "dW{i}" / "db{i}" gradients
      state        -- dict holding first moments "vW{i}" / "vb{i}" and second
                      moments "sW{i}" / "sb{i}"; missing entries start at zero
      eta          -- learning rate
      beta1, beta2 -- decay rates of the first / second moment averages
      epsilon      -- added to the square root in the denominator
      t            -- step counter, expected to start at 1 (``t = 0`` makes the
                      bias-correction denominator zero)

    Returns:
      (parameters, state) -- the same two dicts that were passed in.

    The current gradient is bias-corrected with ``1 - beta1**t``, the same
    factor used for the first-moment estimate. It was previously divided by
    ``1 - beta1**(t + 1)``, which mixed two different step indices in one
    update and under-weighted the gradient term.
    """
    L = len(parameters) // 2
    # Bias-correction denominators depend only on the step, not on the layer.
    m_correction = 1 - beta1**t
    s_correction = 1 - beta2**t
    for l in range(1, L+1):
        state.setdefault(f"vW{l}", np.zeros_like(parameters[f"W{l}"]))
        state.setdefault(f"vb{l}", np.zeros_like(parameters[f"b{l}"]))
        state.setdefault(f"sW{l}", np.zeros_like(parameters[f"W{l}"]))
        state.setdefault(f"sb{l}", np.zeros_like(parameters[f"b{l}"]))

        dW = grad_w_b_w_b[f"dW{l}"]
        db = grad_w_b_w_b[f"db{l}"]

        state[f"vW{l}"] = beta1 * state[f"vW{l}"] + (1-beta1) * dW
        state[f"vb{l}"] = beta1 * state[f"vb{l}"] + (1-beta1) * db
        state[f"sW{l}"] = beta2 * state[f"sW{l}"] + (1-beta2) * (dW ** 2)
        state[f"sb{l}"] = beta2 * state[f"sb{l}"] + (1-beta2) * (db ** 2)

        vW_corr = state[f"vW{l}"] / m_correction
        vb_corr = state[f"vb{l}"] / m_correction
        sW_corr = state[f"sW{l}"] / s_correction
        sb_corr = state[f"sb{l}"] / s_correction

        # Nesterov blend: corrected momentum plus corrected current gradient.
        parameters[f"W{l}"] -= eta * (beta1 * vW_corr + (1-beta1) * dW / m_correction) / (np.sqrt(sW_corr) + epsilon)
        parameters[f"b{l}"] -= eta * (beta1 * vb_corr + (1-beta1) * db / m_correction) / (np.sqrt(sb_corr) + epsilon)
    return parameters, state


In [None]:
def update_parameters(parameters, grads, optimizer, optimizer_state, eta, t=1, hyperparams=None):
    """Dispatch one parameter update to the optimizer named by ``optimizer``.

    Arguments:
      parameters      -- dict holding "W1", "b1", ..., "WL", "bL"
      grads           -- dict holding the matching "dW{i}" / "db{i}" gradients
      optimizer       -- one of "sgd", "momentum", "nesterov", "rmsprop",
                         "adam", "nadam"
      optimizer_state -- dict carried between calls by the stateful optimizers
      eta             -- learning rate
      t               -- step counter forwarded to Adam / Nadam
      hyperparams     -- optional dict; recognised keys and their fallbacks are
                         "beta" (0.9), "beta1" (0.9), "beta2" (0.999) and
                         "epsilon" (1e-8). ``None`` means "use all fallbacks".

    Returns:
      (parameters, optimizer_state)

    Raises:
      ValueError -- if ``optimizer`` is not one of the names listed above.
    """
    # A None default avoids sharing one dict object across all calls.
    if hyperparams is None:
        hyperparams = {}

    if optimizer == "sgd":
        # SGD keeps no state; hand the caller's state back untouched.
        parameters = update_parameters_sgd(parameters, grads, eta)
    elif optimizer == "momentum":
        parameters, optimizer_state = update_parameters_momentum(
            parameters, grads, optimizer_state, eta,
            beta=hyperparams.get("beta", 0.9))
    elif optimizer == "nesterov":
        parameters, optimizer_state = update_parameters_nesterov(
            parameters, grads, optimizer_state, eta,
            beta=hyperparams.get("beta", 0.9))
    elif optimizer == "rmsprop":
        parameters, optimizer_state = update_parameters_rmsprop(
            parameters, grads, optimizer_state, eta,
            beta=hyperparams.get("beta", 0.9),
            epsilon=hyperparams.get("epsilon", 1e-8))
    elif optimizer == "adam":
        parameters, optimizer_state = update_parameters_adam(
            parameters, grads, optimizer_state, eta,
            beta1=hyperparams.get("beta1", 0.9),
            beta2=hyperparams.get("beta2", 0.999),
            epsilon=hyperparams.get("epsilon", 1e-8),
            t=t)
    elif optimizer == "nadam":
        parameters, optimizer_state = update_parameters_nadam(
            parameters, grads, optimizer_state, eta,
            beta1=hyperparams.get("beta1", 0.9),
            beta2=hyperparams.get("beta2", 0.999),
            epsilon=hyperparams.get("epsilon", 1e-8),
            t=t)
    else:
        raise ValueError("Unknown optimizer!")
    return parameters, optimizer_state

In [18]:
def initialize_parameters(layer_dims):
    """
    Builds the weight and bias arrays for a fully connected network.

    Arguments:
      layer_dims -- sizes of the input layer and every following layer,
                    e.g. [784, 128, 10]

    Returns:
      parameters -- dict with "W1", "b1", ..., "WL", "bL", where W{l} has
                    shape (layer_dims[l], layer_dims[l-1]) and is drawn from
                    a standard normal scaled by 0.01, and b{l} is a zero
                    column of shape (layer_dims[l], 1)
    """
    parameters = {}
    consecutive = zip(layer_dims[:-1], layer_dims[1:])
    for idx, (fan_in, fan_out) in enumerate(consecutive, start=1):
        parameters[f"W{idx}"] = np.random.randn(fan_out, fan_in) * 0.01
        parameters[f"b{idx}"] = np.zeros((fan_out, 1))
    return parameters

In [29]:
# Smoke test: one forward pass, one backward pass and one optimizer step on
# random data. Only cache keys and a parameter shape are printed.
# NOTE(review): no random seed is set in the visible cells, so the weights and
# the dummy batch differ between runs.

# Set up network dimensions and parameters for a 2-layer network
# (784 inputs -> 128 hidden units -> 10 outputs)
layer_dims = [784, 128, 10]
parameters = initialize_parameters(layer_dims)
optimizer_state = {}  # Filled in by the optimizer on its first call (moment/velocity arrays)
eta = 0.01           # Learning rate (η)
max_epochs = 500  # NOTE(review): not used anywhere in this cell
t = 1                # Step counter for Adam/Nadam bias correction

# Dummy mini-batch: 32 random input columns with one-hot targets
X_batch = np.random.randn(784, 32)
Y_batch = np.zeros((10, 32))
for i in range(32):
    # Pick a random class row for example i and mark it as the target
    Y_batch[np.random.randint(0, 10), i] = 1

# Forward propagation: returns the output activations and the cache of
# pre-activations ("a{i}") and activations ("h{i}")
AL, act_preact = forward_propagation(X_batch, parameters)

print("Cache keys:", list(act_preact.keys()))
# Backward propagation: compute "dW{i}" / "db{i}" from the cache
grads = backprop(Y_batch ,act_preact,parameters)

# Choose an optimizer: "sgd", "momentum", "nesterov", "rmsprop", "adam" or "nadam"
optimizer = "adam"
hyperparams = {"beta1": 0.9, "beta2": 0.999, "epsilon": 1e-8, "beta": 0.9}  # "beta" is read by momentum/nesterov/rmsprop; "beta1"/"beta2" by adam/nadam

# Apply a single update step with the chosen optimizer
parameters, optimizer_state = update_parameters(parameters, grads, optimizer, optimizer_state, eta, t, hyperparams)

print("Updated W1 shape:", parameters["W1"].shape)

Cache keys: ['h0', 'a1', 'h1', 'a2', 'h2']
Updated W1 shape: (128, 784)
