In [50]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import line_search

In [51]:
def rosenbrock(x:np.ndarray):
    """Evaluate the two-variable Rosenbrock objective.

    Computes ``100 * (x[1] - x[0]**2)**2 + (1 - x[0])**2`` using the first
    two entries of ``x``.
    """
    first, second = x[0], x[1]
    valley_gap = second - first**2   # distance from the parabola x[1] = x[0]**2
    offset = 1.0 - first
    return 100.0 * valley_gap**2 + offset**2

In [52]:
def grad_rosenbrock(x:np.ndarray):
    """Return the gradient of the Rosenbrock function as a length-2 array.

    The entries are the partial derivatives with respect to ``x[0]`` and
    ``x[1]``, in that order.
    """
    u, v = x[0], x[1]
    residual = v - u**2   # term shared by both partial derivatives
    return np.array([
        -400.0 * u * residual - 2.0 * (1.0 - u),
        200.0 * residual,
    ])

In [None]:
def gradient_descent(x0:np.ndarray, tol=1e-7, max_iter=100000, fun=None, jac=None):
    """Minimise an objective by steepest descent with a SciPy line search.

    Parameters
    ----------
    x0 : array_like
        Starting point. It is copied, so the caller's array is never modified.
    tol : float
        Stop once the Euclidean norm of the gradient drops below this value.
    max_iter : int
        Upper bound on the number of descent steps.
    fun, jac : callable, optional
        Objective and its gradient, each called with the current point.
        When omitted, ``rosenbrock`` and ``grad_rosenbrock`` are used.

    Returns
    -------
    x : np.ndarray
        Last iterate.
    n_iter : int
        Number of steps actually taken.

    Notes
    -----
    The loop also stops early if the line search cannot find a step length
    (``alpha is None``); the current iterate is returned in that case.
    """
    if fun is None:
        fun = rosenbrock
    if jac is None:
        jac = grad_rosenbrock

    x = np.array(x0, dtype=float)
    n_iter = 0

    while n_iter < max_iter:
        grad = jac(x)
        if np.linalg.norm(grad) < tol:
            break

        # Descent direction is the negative gradient
        p = -grad

        # Find alpha that satisfies strong Wolfe conditions
        # Default c1=1e-4, c2=0.9
        alpha = line_search(
            f=fun, myfprime=jac, xk=x, pk=p, gfk=grad, old_fval=fun(x)
        )[0]

        # line_search reports failure by returning alpha=None (it has already
        # emitted a LineSearchWarning); multiplying None by p would raise a
        # TypeError, so stop here and hand back the best point found so far.
        if alpha is None:
            break

        x = x + alpha * p
        n_iter += 1

    return x, n_iter

In [57]:
def bfgs(x0:np.ndarray, tol=1e-7, max_iter=100000, fun=None, jac=None):
    """Minimise an objective with BFGS and a SciPy line search.

    ``B`` is a running approximation of the Hessian itself (not its inverse):
    each search direction is obtained by solving ``B p = -grad``.

    Parameters
    ----------
    x0 : array_like
        Starting point. It is copied, so the caller's array is never modified.
    tol : float
        Stop once the Euclidean norm of the gradient drops below this value.
    max_iter : int
        Upper bound on the number of quasi-Newton steps.
    fun, jac : callable, optional
        Objective and its gradient, each called with the current point.
        When omitted, ``rosenbrock`` and ``grad_rosenbrock`` are used.

    Returns
    -------
    x : np.ndarray
        Last iterate.
    n_iter : int
        Number of steps actually taken.

    Notes
    -----
    The loop also stops early if the line search cannot find a step length
    (``alpha is None``); the current iterate is returned in that case.
    """
    if fun is None:
        fun = rosenbrock
    if jac is None:
        jac = grad_rosenbrock

    x = np.array(x0, dtype=float)
    B = np.eye(x.size)  # initial Hessian approximation
    grad = jac(x)
    n_iter = 0

    while n_iter < max_iter:
        if np.linalg.norm(grad) < tol:
            break

        # Quasi-Newton direction: solve B p = -grad
        p = -np.linalg.solve(B, grad)
        alpha = line_search(
            f=fun, myfprime=jac, xk=x, pk=p, gfk=grad, old_fval=fun(x)
        )[0]

        # line_search reports failure by returning alpha=None (it has already
        # emitted a LineSearchWarning); multiplying None by p would raise a
        # TypeError, so stop here and hand back the best point found so far.
        if alpha is None:
            break

        s = alpha * p
        x_new = x + s
        grad_new = jac(x_new)
        y = grad_new - grad

        # Curvature condition: skip the update when y.s is not safely positive
        ys = np.dot(y, s)
        if ys > 1e-10:
            Bs = B @ s
            d1 = np.inner(s, Bs)
            B = B - (1/d1) * np.outer(Bs, s @ B) + (1/ys) * np.outer(y, y)

        # Carry the new gradient forward instead of re-evaluating it next pass
        x, grad = x_new, grad_new
        n_iter += 1
    return x, n_iter

In [58]:
# Both solvers start from the same point.
x0 = np.array([1.0, 2.0])

sol_gd, iter_gd = gradient_descent(x0)
sol_bfgs, iter_bfgs = bfgs(x0)

# (heading, iteration count, final iterate) for each solver, in report order.
summary = [
    ("Gradient Descent using SciPy line search:", iter_gd, sol_gd),
    ("BFGS using SciPy line search:", iter_bfgs, sol_bfgs),
]
for position, (heading, n_steps, solution) in enumerate(summary):
    if position > 0:
        print()  # blank line between the two reports
    print(heading)
    print("  Iterations            :", n_steps)
    print("  Final Rosenbrock value:", rosenbrock(solution))

Gradient Descent using SciPy line search:
  Iterations            : 12375
  Final Rosenbrock value: 9.525536518353381e-15

BFGS using SciPy line search:
  Iterations            : 17
  Final Rosenbrock value: 1.5503028940076136e-18


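The results show that BFGS converges significantly faster than gradient descent (17 versus 12375 iterations here). As a quasi-Newton method, BFGS builds up an approximation of the Hessian from successive gradient differences and uses this curvature information to choose a much better search direction at each update, whereas gradient descent only follows the negative gradient. In general, Newton and quasi-Newton methods converge in fewer iterations than gradient descent, but at a higher computational cost per iteration.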