In [2]:
import numpy as np

# Define the function and its gradients
def f(x, y):
    """Evaluate the objective f(x, y) = x^2 + 2y + 3x.

    Args:
        x: First coordinate.
        y: Second coordinate.

    Returns:
        The value of the objective at (x, y).
    """
    quadratic_x = x**2
    linear_y = 2*y
    linear_x = 3*x
    # Summed left to right in the same order as the formula above.
    return quadratic_x + linear_y + linear_x

# Compute the derivative with respect to x
def f_der_x(x):
    """Partial derivative of f(x, y) = x^2 + 2y + 3x with respect to x.

    Args:
        x: Point at which to evaluate the derivative.

    Returns:
        2*x + 3.
    """
    quadratic_slope = 2*x
    return quadratic_slope + 3

# Compute the derivative with respect to y
def f_der_y(y):
    """Partial derivative of f(x, y) = x^2 + 2y + 3x with respect to y.

    The objective is linear in y, so the derivative is the constant 2.

    Args:
        y: Accepted for symmetry with ``f_der_x``; its value is not used.

    Returns:
        The integer 2.
    """
    constant_slope = 2
    return constant_slope

# RMSProp on f(x, y)

# Hyperparameters:
#   gamma   - decay factor of the running average of squared gradients
#   lrate   - learning rate (numerator of every step)
#   epsilon - added to the denominator so it is never exactly zero
gamma, lrate, epsilon = 0.8, 0.01, 1e-8

# Running averages of the squared gradients, one per coordinate
gt_x = gt_y = 0

# Starting point and iteration budget
x = 10.0
y = -10.0
no_iterations = 100

for i in range(no_iterations):
    # Gradient at the current point
    g_x, g_y = f_der_x(x), f_der_y(y)

    # Exponential moving average of g^2
    gt_x = gamma * gt_x + (1 - gamma) * g_x**2
    gt_y = gamma * gt_y + (1 - gamma) * g_y**2

    # RMSProp step: the gradient is divided by the root of its running
    # mean square, so the step magnitude is governed mostly by lrate.
    x -= (lrate * g_x) / (np.sqrt(gt_x) + epsilon)
    y -= (lrate * g_y) / (np.sqrt(gt_y) + epsilon)

    # Progress report on iterations 0, 10, 20, ...
    if not i % 10:
        print(f"Iteration {i}: x = {x:.4f}, y = {y:.4f}, f(x, y) = {f(x, y):.4f}")

# Final point and objective value
print(f"Final x = {x:.4f}, Final y = {y:.4f}, Final f(x, y) = {f(x, y):.4f}")

Iteration 0: x = 9.9776, y = -10.0224, f(x, y) = 109.4415
Iteration 10: x = 9.8561, y = -10.1442, f(x, y) = 106.4217
Iteration 20: x = 9.7548, y = -10.2458, f(x, y) = 103.9291
Iteration 30: x = 9.6550, y = -10.3459, f(x, y) = 101.4920
Iteration 40: x = 9.5553, y = -10.4460, f(x, y) = 99.0785
Iteration 50: x = 9.4557, y = -10.5460, f(x, y) = 96.6853
Iteration 60: x = 9.3561, y = -10.6460, f(x, y) = 94.3121
Iteration 70: x = 9.2564, y = -10.7460, f(x, y) = 91.9588
Iteration 80: x = 9.1568, y = -10.8460, f(x, y) = 89.6254
Iteration 90: x = 9.0572, y = -10.9460, f(x, y) = 87.3120
Final x = 8.9675, Final y = -11.0360, Final f(x, y) = 85.2469


In [3]:
# 2) [bonus 4 pts] Add L2 regularization. You have to modify the f, f_der_x, and f_der_y functions.

import numpy as np

# Define the function with L2 regularization
def f(x, y, lambda_reg=0.1):
    """Evaluate the L2-regularized objective.

    Computes x^2 + 2y + 3x + lambda_reg * (x^2 + y^2).

    Args:
        x: First coordinate.
        y: Second coordinate.
        lambda_reg: Weight of the L2 penalty (default 0.1).

    Returns:
        The regularized objective value at (x, y).
    """
    base_objective = x**2 + 2*y + 3*x
    l2_penalty = lambda_reg * (x**2 + y**2)
    return base_objective + l2_penalty

# Compute the derivative with respect to x (including L2 regularization term)
def f_der_x(x, lambda_reg=0.1):
    """Partial derivative of the L2-regularized objective with respect to x.

    Args:
        x: Point at which to evaluate the derivative.
        lambda_reg: Weight of the L2 penalty (default 0.1).

    Returns:
        2*x + 3 + 2*lambda_reg*x.
    """
    base_slope = 2*x + 3
    penalty_slope = 2*lambda_reg * x
    return base_slope + penalty_slope

# Compute the derivative with respect to y (including L2 regularization term)
def f_der_y(y, lambda_reg=0.1):
    """Partial derivative of the L2-regularized objective with respect to y.

    Args:
        y: Point at which to evaluate the derivative.
        lambda_reg: Weight of the L2 penalty (default 0.1).

    Returns:
        2 + 2*lambda_reg*y.
    """
    penalty_slope = 2*lambda_reg * y
    return 2 + penalty_slope

# Hyperparameters for RMSProp with an L2 penalty
gamma, lrate = 0.8, 0.01      # decay factor, learning rate
epsilon = 1e-8                # keeps the denominator away from zero
lambda_reg = 0.1              # L2 regularization coefficient

# Running averages of the squared gradients, one per coordinate
gt_x = gt_y = 0

# Starting point and iteration budget
x = 10.0
y = -10.0
no_iterations = 100

for i in range(no_iterations):
    # --- x coordinate: gradient, running mean of g^2, RMSProp step ---
    g_x = f_der_x(x, lambda_reg)
    gt_x = gamma * gt_x + (1 - gamma) * g_x**2
    x -= (lrate * g_x) / (np.sqrt(gt_x) + epsilon)

    # --- y coordinate: same three steps ---
    # With lambda_reg = 0.1 the y-gradient 2 + 2*lambda_reg*y is zero at the
    # starting value y = -10, which is why y does not move in this run.
    g_y = f_der_y(y, lambda_reg)
    gt_y = gamma * gt_y + (1 - gamma) * g_y**2
    y -= (lrate * g_y) / (np.sqrt(gt_y) + epsilon)

    # Report only on every tenth pass (labelled 1, 11, 21, ...)
    if i % 10:
        continue
    print(f"Iteration = {i + 1}: x = {x:.4f}, y = {y:.4f}, f(x, y) = {f(x, y, lambda_reg):.4f}")

# Final point and regularized objective value
print(f"Final x = {x:.4f}, Final y = {y:.4f}, Final f(x, y) = {f(x, y, lambda_reg):.4f}")


Iteration = 1: x = 9.9776, y = -10.0000, f(x, y) = 129.4415
Iteration = 11: x = 9.8561, y = -10.0000, f(x, y) = 126.4243
Iteration = 21: x = 9.7548, y = -10.0000, f(x, y) = 123.9365
Iteration = 31: x = 9.6550, y = -10.0000, f(x, y) = 121.5060
Iteration = 41: x = 9.5553, y = -10.0000, f(x, y) = 119.1012
Iteration = 51: x = 9.4557, y = -10.0000, f(x, y) = 116.7187
Iteration = 61: x = 9.3561, y = -10.0000, f(x, y) = 114.3582
Iteration = 71: x = 9.2565, y = -10.0000, f(x, y) = 112.0195
Iteration = 81: x = 9.1568, y = -10.0000, f(x, y) = 109.7028
Iteration = 91: x = 9.0572, y = -10.0000, f(x, y) = 107.4080
Final x = 8.9676, Final y = -10.0000, Final f(x, y) = 105.3614


In [4]:
# 3) [bonus 8 pts] Implement AdaDelta. You have to define the following and modify the update rule.

import numpy as np

# Define the function with L2 regularization
def f(x, y, lambda_reg=0.1):
    """Return x^2 + 2y + 3x plus the L2 penalty lambda_reg * (x^2 + y^2).

    Args:
        x: First coordinate.
        y: Second coordinate.
        lambda_reg: L2 regularization coefficient (default 0.1).

    Returns:
        The regularized objective value at (x, y).
    """
    squared_norm = x**2 + y**2
    unregularized = x**2 + 2*y + 3*x
    return unregularized + lambda_reg * squared_norm

# Compute the derivative with respect to x (including L2 regularization term)
def f_der_x(x, lambda_reg=0.1):
    """x-derivative of the regularized objective: 2*x + 3 + 2*lambda_reg*x.

    Args:
        x: Point at which to evaluate the derivative.
        lambda_reg: L2 regularization coefficient (default 0.1).

    Returns:
        The derivative value at x.
    """
    data_term = 2*x + 3
    reg_term = 2*lambda_reg * x
    return data_term + reg_term

# Compute the derivative with respect to y (including L2 regularization term)
def f_der_y(y, lambda_reg=0.1):
    """y-derivative of the regularized objective: 2 + 2*lambda_reg*y.

    Args:
        y: Point at which to evaluate the derivative.
        lambda_reg: L2 regularization coefficient (default 0.1).

    Returns:
        The derivative value at y.
    """
    reg_term = 2*lambda_reg * y
    return 2 + reg_term

# Hyperparameters for AdaDelta (no learning rate is used)
gamma = 0.9         # decay factor shared by both running averages
epsilon = 1e-8      # added under both square roots
lambda_reg = 0.1    # L2 regularization coefficient

# Running averages of squared gradients, one per coordinate
E_g2_x = E_g2_y = 0

# Running averages of squared parameter updates, one per coordinate
E_delta2_x = E_delta2_y = 0

# Starting point and iteration budget
x = 10.0
y = -10.0
no_iterations = 100

for i in range(no_iterations):
    # --- x coordinate ---
    g_x = f_der_x(x, lambda_reg)
    E_g2_x = gamma * E_g2_x + (1 - gamma) * g_x**2
    # The step is the gradient scaled by RMS(previous updates) / RMS(gradients).
    # E_delta2_x starts at 0, so the first numerator is sqrt(epsilon) = 1e-4,
    # which makes the initial steps very small.
    delta_x = - (np.sqrt(E_delta2_x + epsilon) / np.sqrt(E_g2_x + epsilon)) * g_x
    E_delta2_x = gamma * E_delta2_x + (1 - gamma) * delta_x**2
    x = x + delta_x

    # --- y coordinate: same sequence ---
    g_y = f_der_y(y, lambda_reg)
    E_g2_y = gamma * E_g2_y + (1 - gamma) * g_y**2
    delta_y = - (np.sqrt(E_delta2_y + epsilon) / np.sqrt(E_g2_y + epsilon)) * g_y
    E_delta2_y = gamma * E_delta2_y + (1 - gamma) * delta_y**2
    y = y + delta_y

    # Report on every tenth pass (labelled 1, 11, 21, ...)
    if not i % 10:
        print(f"Iteration = {i + 1}: x = {x:.4f}, y = {y:.4f}, f(x, y) = {f(x, y, lambda_reg):.4f}")

# Final point and regularized objective value
print(f"Final x = {x:.4f}, Final y = {y:.4f}, Final f(x, y) = {f(x, y, lambda_reg):.4f}")


Iteration = 1: x = 9.9997, y = -10.0000, f(x, y) = 129.9921
Iteration = 11: x = 9.9963, y = -10.0000, f(x, y) = 129.9068
Iteration = 21: x = 9.9926, y = -10.0000, f(x, y) = 129.8162
Iteration = 31: x = 9.9889, y = -10.0000, f(x, y) = 129.7218
Iteration = 41: x = 9.9850, y = -10.0000, f(x, y) = 129.6241
Iteration = 51: x = 9.9809, y = -10.0000, f(x, y) = 129.5232
Iteration = 61: x = 9.9768, y = -10.0000, f(x, y) = 129.4194
Iteration = 71: x = 9.9725, y = -10.0000, f(x, y) = 129.3127
Iteration = 81: x = 9.9681, y = -10.0000, f(x, y) = 129.2033
Iteration = 91: x = 9.9636, y = -10.0000, f(x, y) = 129.0910
Final x = 9.9594, Final y = -10.0000, Final f(x, y) = 128.9878
