# Consider the unconstrained optimisation problem

\begin{align}
\text{min  } f(x_1, x_2) = 4x_1^2 - 4x_1^4 + x_1^{2} +x_1x_2 - 4x_2^2 + 4x_2^4
\end{align}

# Another problem
\begin{align}
\text{min  } f(x_1, x_2) = e^{x_1}(4x_1^2 + 2x_2^2 + 4x_1x_2 + 2x_2 + 1)
\end{align}





In [None]:
import autograd.numpy as np 
import autograd.numpy.linalg as la
from autograd import grad, jacobian

In [None]:
def f(x):
  '''First objective: 4*x1^2 - 4*x1^4 + x1^2 + x1*x2 - 4*x2^2 + 4*x2^4.

  x is anything indexable with x[0] = x1 and x[1] = x2 (list, tuple, array,
  or a pair of arrays for element-wise evaluation on a grid).
  '''
  x1, x2 = x[0], x[1]
  quadratic_x1 = 4*x1**2
  quartic_x1 = 4*x1**4
  mixed = x1**(2) + x1*x2
  quadratic_x2 = 4*x2**2
  quartic_x2 = 4*x2**4
  return quadratic_x1 - quartic_x1 + mixed - quadratic_x2 + quartic_x2

# Sanity check: the objective is zero at the origin.
f([0, 0])

In [None]:
def f1(x):
  '''Second objective: exp(x1) * (4*x1^2 + 2*x2^2 + 4*x1*x2 + 2*x2 + 1).

  x is anything indexable with x[0] = x1 and x[1] = x2.
  '''
  x1, x2 = x[0], x[1]
  polynomial = 4*x1**2 + 2*x2**2 + 4*x1*x2 + 2*x2 + 1
  return np.exp(x1) * polynomial

# Sanity check: at the origin the polynomial is 1 and exp(0) = 1.
f1([0, 0])

# visualise the objective function with Matplotlib
A Colab tutorial for Matplotlib can be found here [Colab tutorial on Matplotlib](https://colab.research.google.com/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/04.00-Introduction-To-Matplotlib.ipynb#scrollTo=AsOD563_e8Ge). We should use the Object-oriented interface as explained in this tutorial. More details can be found in the book [Python Data Science Handbook](https://colab.research.google.com/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/Index.ipynb)

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

# To draw a contour plot of a function of two variables, please read [Contour **plots**](https://colab.research.google.com/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/04.04-Density-and-Contour-Plots.ipynb)

In [None]:
# 100 sample points per axis on [0, 1].
x1, x2 = np.linspace(0, 1, 100), np.linspace(0, 1, 100)

# meshgrid expands the two axes into 2-D coordinate arrays; f indexes its
# argument with [0] and [1], so passing the pair evaluates it element-wise.
X1, X2 = np.meshgrid(x1, x2)
Z = f((X1, X2))


In [None]:
# Z holds one value of f per grid node, so its shape should match X1 and X2.
Z.shape

In [None]:
# Object-oriented Matplotlib interface: one figure with a single axes,
# 50 contour levels of Z over the (X1, X2) grid, and a colour bar for the levels.
fig, ax = plt.subplots(figsize=(14.7, 8.27))
contour_set = ax.contour(X1, X2, Z, 50, cmap='jet')
fig.colorbar(contour_set, ax=ax)

The contour may be misleading if the region is too big. You can create a function to visualise a specified region of the contour. How many local minima can you find?

# We can solve a non-linear programming problem using SciPy's `minimize` function [scipy.optimize](https://docs.scipy.org/doc/scipy/tutorial/optimize.html).

 We use the Broyden-Fletcher-Goldfarb-Shanno algorithm (BFGS), and examples for other methods can be found here [minimize](https://docs.scipy.org/doc/scipy/tutorial/optimize.html#unconstrained-minimization-of-multivariate-scalar-functions-minimize)

 BFGS requires the gradient, which can be calculated automatically by autograd (see the notebook from last week).

In [None]:
from autograd import grad, jacobian
from scipy.optimize import minimize, OptimizeResult


In [None]:
# autograd builds the gradient function of f; taking the Jacobian of that
# gradient function gives the matrix of second derivatives (the Hessian) of f.
grad_f = grad(f)
hessen_f = jacobian(grad_f)


# Try initial point $x_0 = [0.5, 0.5]$

In [None]:
# Initial point for the optimisation runs below.
x0 = np.array([0.5, 0.5])

# Evaluate both derivatives once at x0, then report them.
gradient_at_x0 = grad_f(x0)
hessian_at_x0 = hessen_f(x0)
print(" gradient = ", gradient_at_x0, "\n Hessian = ", hessian_at_x0)

In [None]:
# Run BFGS from x0 with the autograd gradient; 'disp' makes SciPy print its
# convergence report before the result object is shown.
bfgs_options = {'disp': True}
res = minimize(f, x0, method='BFGS', jac=grad_f, options=bfgs_options)
print("-"*80)
res

# Try different initial points and compare with the contour plot

# Line search
The steepest descent method has a line search step to find a better solution along the negative gradient direction. [Line search](https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.line_search.html#scipy.optimize.line_search) is not trivial since convergence needs to be maintained while the computational effort is kept reasonable.  

In [None]:
from scipy.optimize import line_search

In [None]:
x0 = np.array([0.5, 0.5])

# Search along the steepest-descent direction: the negative gradient at x0.
descent_direction = -grad_f(x0)
res = line_search(f, grad_f, x0, descent_direction)
res

A maximum step size is necessary since `line_search` may not converge. Limiting the maximum number of iterations is also worth trying.

In [None]:
# Same negative-gradient direction, but cap the step size at 0.1 and the
# number of line-search iterations at 3.
search_limits = {'amax': 0.1, 'maxiter': 3}
res = line_search(f, grad_f, x0, -grad_f(x0), **search_limits)
res

You may play with other parameters, but you can avoid all these difficulties by using a fixed step size without a line search.

In [None]:
# Compare f at x0 with f after one fixed step of 0.1 along the negative gradient.
fixed_step = 1.e-1
f0 = f(x0)
f_after_step = f(x0 - fixed_step*grad_f(x0))
print("f0 = ", f0, "f1 = ", f_after_step)

# Implementation of the Steepest Descent Method

In [None]:
def steepest_descent(fun, x0, jac, ls = line_search, maxiter = 100, amax = 1000., tol = 1.e-8 ):
  ''' Simple implementation of Steepest Descent for minimising unconstrained nonlinear function.

  Each iteration moves from the current point x_k along the negative gradient
  -g_k by a step length alpha_k chosen by the line search ``ls``.

  Parameters:
    fun (callable) : the function to minimise, called as fun(x)

    x0 (1D array-like) : initial guess; it is copied, never modified

    jac (callable) : the gradient function, called as jac(x)

    ls (callable, line_search) : line search routine, called as
      ls(fun, jac, x_k, p_k, amax=amax). It must return a 6-tuple whose first
      four items are (alpha, number of function evaluations, number of
      gradient evaluations, new function value); alpha or the new function
      value being None signals failure.

    maxiter(int, 100) : maximum number of descent steps

    amax(float, 1000.) : max step size in line search

    tol(float, 1.e-8) : used for all three stopping criteria
      (length of the step, change of fun, norm of the gradient)

  Return:
    res (scipy.optimize.OptimizeResult): optimal solution and value, with
      x, fun     : final point and its function value
      success    : True if one of the stopping criteria was met
      status     : 0 converged, 1 line search failed, 2 maxiter reached
      message    : description of the termination cause
      nfev, njev : number of function / gradient evaluations
      nit        : number of descent steps taken

  Note:
    Follow the style of scipy.optimize;
    scipy.optimize.line_search is used by default

  '''
  x_eps = tol # tolerance for convergence on delta x
  f_eps = tol # tolerance for convergence on delta f
  g_eps = tol # tolerance for convergence on norm of gradient

  x_k = np.array(x0, dtype=float) # private float copy, so lists are accepted too
  f_k = fun(x_k)
  nfev = 1
  g_k = jac(x_k)
  njev = 1
  nit = 0
  n_g_k = la.norm(g_k)

  def _result(x, f, success, status, message):
    # The counters are read when _result is called, so they are up to date.
    return OptimizeResult(x=x, fun=f, success=success, status=status,
                          message=message, nfev=nfev, njev=njev, nit=nit)

  # The starting point may already be stationary: stop before any line search.
  if n_g_k < g_eps:
    return _result(x_k, f_k, True, 0, "norm of gradient is within tolerence")

  while nit < maxiter:
    alpha_k, fc_k, gc_k, f_k1, _, _ = ls(fun, jac, x_k, -g_k, amax=amax)
    nfev += fc_k
    njev += gc_k
    if alpha_k is None or f_k1 is None:
      # No acceptable step was found; report the last accepted point.
      return _result(x_k, f_k, False, 1, "Line search fail: alpha or fun is None")

    nit += 1
    x_k1 = x_k - alpha_k*g_k

    # ||x_k1 - x_k|| = |alpha_k| * ||g_k||
    if abs(alpha_k*n_g_k) < x_eps:
      return _result(x_k1, f_k1, True, 0, "change of x is within tolerence")

    if abs(f_k - f_k1) < f_eps:
      return _result(x_k1, f_k1, True, 0, "change of fun is within tolerence")

    g_k1 = jac(x_k1)
    njev += 1
    n_g_k = la.norm(g_k1)
    if n_g_k < g_eps:
      return _result(x_k1, f_k1, True, 0, "norm of gradient is within tolerence")

    # Carry the new point AND its gradient into the next iteration, so the next
    # search direction is the steepest-descent direction at the new point.
    x_k, f_k, g_k = x_k1, f_k1, g_k1

  return _result(x_k, f_k, False, 2, "Max iter reached")

## Docstring makes documentation easy

In [None]:
# help() prints the signature and the docstring written inside the function.
help(steepest_descent)

In [None]:
# Uses whatever x0 an earlier cell left defined, with the default line search,
# maxiter, amax and tol.
steepest_descent(f, x0, grad_f)

In [None]:
# Start again from [0.5, 0.5] with the default settings.
x0 = np.array([0.5, 0.5])
steepest_descent(f, x0, grad_f)

In [None]:
x0 = np.array([-0.071,  0.71]) # very close to a local optimum
# amax = 0.1 caps the step size the line search is allowed to take.
steepest_descent(f, x0, grad_f, amax = 0.1)

## We can try a different line search method

In [None]:
# line_search_wolfe1 is not part of the public scipy.optimize namespace.
# Current SciPy keeps it in the private module scipy.optimize._linesearch, and
# the old scipy.optimize.linesearch module is deprecated. Try the current
# location first and fall back to the old one for older SciPy releases.
try:
  from scipy.optimize._linesearch import line_search_wolfe1
except ImportError:
  from scipy.optimize.linesearch import line_search_wolfe1


In [None]:
x0 = np.array([-0.071,  0.71]) # very close to a local optimum
# Same start point, but with line_search_wolfe1 as the line search and amax = 1.
steepest_descent(f, x0, grad_f, ls = line_search_wolfe1, amax = 1.)