In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm

import warnings
warnings.filterwarnings('ignore')

from IPython.display import clear_output

# Device on which every tensor in this notebook is created.
# CUDA auto-detection is kept for reference but disabled, so all runs use the CPU.
#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')

In [None]:
# Animated plot
import plotly
import plotly.graph_objs as go

In [None]:
# to put directly the labels on each curve
#! pip install matplotlib-label-lines
from labellines import labelLine, labelLines

In [None]:
# https://github.com/Tikquuss/sag_torch
from src.optim import get_optimizer

## Utils

In [None]:
def gradient(y, x, grad_outputs=None):
    """Return the vector-Jacobian product of ``y`` with respect to ``x``.

    Args:
        y: tensor computed from ``x`` through autograd-tracked operations.
        x: tensor (with ``requires_grad=True``) to differentiate against.
        grad_outputs: weights applied to ``y`` before differentiating;
            defaults to a tensor of ones shaped like ``y``.

    Returns:
        Tensor shaped like ``x``. The graph is kept (``create_graph=True``),
        so the result can itself be differentiated.
    """
    weights = torch.ones_like(y) if grad_outputs is None else grad_outputs
    (grad,) = torch.autograd.grad(y, [x], grad_outputs=weights, create_graph=True)
    return grad

def jacobian(y, x):
    """Build the full Jacobian dy/dx one row at a time.

    Row ``i`` is obtained by calling ``gradient`` with a one-hot
    ``grad_outputs`` selecting ``y[i]``, i.e. [1, 0, ..., 0], [0, 1, 0, ..., 0], ...

    Args:
        y: 1-D tensor of outputs computed from ``x``.
        x: 1-D tensor the outputs are differentiated against.

    Returns:
        Tensor of shape ``(y.shape[0], x.shape[0])`` allocated with the dtype
        and device of ``x``, so float64 gradients are not silently downcast and
        the result stays on the same device as the inputs.
    """
    m, n = y.shape[0], x.shape[0]
    jac = torch.zeros(m, n, dtype=x.dtype, device=x.device)
    for i in range(m):
        # One-hot selector isolating the i-th output
        grad_outputs = torch.zeros_like(y)
        grad_outputs[i] = 1
        jac[i] = gradient(y, x, grad_outputs = grad_outputs)
    return jac

Reference for the Rosenbrock function: https://www.sfu.ca/~ssurjano/rosen.html

In [None]:
def RosenbrockFunction(x):
    """Evaluate the Rosenbrock function at the point ``x``.

    Sums ``100 * (x[i+1] - x[i]**2)**2 + (x[i] - 1)**2`` over consecutive
    coordinate pairs. ``x`` may be any indexable sequence supporting ``len``
    (list, numpy array, torch tensor); a single-element input yields ``0``.
    """
    def pair_term(i):
        # Contribution of the consecutive pair (x[i], x[i+1])
        return 100 * (x[i + 1] - x[i] ** 2) ** 2 + (x[i] - 1) ** 2

    return sum(map(pair_term, range(len(x) - 1)))

def RosenbrockFunction_log(x):
    """Return ``log(RosenbrockFunction(x) + 1e-8)``.

    The objective is evaluated once and the logarithm is dispatched on the type
    of the result: ``torch.log`` for tensors (keeps the autograd graph),
    ``np.log`` for everything else. The previous try/except dispatch evaluated
    the objective twice for every non-tensor input.
    """
    # 1e-8 keeps the logarithm finite at the global minimum, where the value is 0
    value = RosenbrockFunction(x) + 1e-8
    if torch.is_tensor(value):
        return torch.log(value)
    return np.log(value)
    
def RosenbrockDeriv(index):
    """Return a function computing the partial derivative of the Rosenbrock
    function with respect to coordinate ``index``.

    The returned callable takes a point ``x`` and adds up the contributions of
    the (at most two) pair terms that involve ``x[index]``.
    """
    def f(x):
        has_previous = index > 0
        has_next = index < len(x) - 1

        dz = 0
        if has_previous:
            # from the term pairing x[index-1] with x[index]
            dz += 200 * (x[index] - x[index-1]**2)
        if has_next:
            # from the term pairing x[index] with x[index+1]
            dz += -400 * x[index] * (x[index+1] - x[index]**2) + 2 * (x[index] - 1)
        return dz

    return f

def RosenbrockDeriv_log(index):
    """Return a function computing the partial derivative, with respect to
    coordinate ``index``, of ``log(RosenbrockFunction(x) + 1e-8)``.

    By the chain rule this is the plain partial derivative divided by the
    shifted function value.
    """
    partial = RosenbrockDeriv(index)

    def f(x):
        numerator = partial(x)
        denominator = RosenbrockFunction(x) + 1e-8
        return numerator / denominator

    return f

In [None]:
def RastriginFunction(x, A = 10):
    """Evaluate the Rastrigin function ``A*n + sum(x_i**2 - A*cos(2*pi*x_i))``.

    Args:
        x: 1-D numpy array or torch tensor of length ``n``.
        A: amplitude of the cosine modulation.

    Returns:
        A numpy scalar for array input, a 0-d tensor for tensor input.

    The cosine backend is chosen explicitly from the input type. The previous
    version called ``np.cos`` first and fell back to ``torch.cos`` only on
    ``RuntimeError`` (the error raised for tensors that require grad); other
    tensor failures, such as the ``TypeError`` raised for CUDA tensors, were
    not handled.
    """
    n = len(x)
    cos = torch.cos if torch.is_tensor(x) else np.cos
    return A*n + sum( x**2 - A * cos(2*np.pi*x) )

def RastriginFunction_log(x, A = 10):
    """Return ``log(RastriginFunction(x, A) + 1e-8)``.

    The objective is evaluated once and the logarithm is dispatched on the type
    of the result: ``torch.log`` for tensors, ``np.log`` otherwise. The previous
    try/except dispatch evaluated the objective twice for every non-tensor
    input.
    """
    # 1e-8 keeps the logarithm finite where the function value is 0
    value = RastriginFunction(x, A = A) + 1e-8
    if torch.is_tensor(value):
        return torch.log(value)
    return np.log(value)
    
def RastriginDeriv(index, A = 10):
    """Return a function computing the partial derivative of the Rastrigin
    function with respect to coordinate ``index``:
    ``2*x_i + 2*pi*A*sin(2*pi*x_i)``.

    The sine is taken with numpy, so the returned callable is meant for
    numpy inputs.
    """
    two_pi = 2 * np.pi
    amplitude = two_pi * A

    def f(x):
        xi = x[index]
        return 2 * xi + amplitude * np.sin(two_pi * xi)

    return f

def RastriginDeriv_log(index, A = 10):
    """Return a function computing the partial derivative, with respect to
    coordinate ``index``, of ``log(RastriginFunction(x, A) + 1e-8)``.

    ``A`` is forwarded to both the numerator (plain partial derivative) and the
    denominator (shifted function value). Previously the numerator always used
    the default ``A = 10``, which made the result inconsistent for any other
    amplitude.
    """
    deriv = RastriginDeriv(index, A = A)

    def f(x):
        return deriv(x) / (RastriginFunction(x, A = A) + 1e-8)

    return f

In [None]:
def get_data(
    function, deriv_function,
    min_x = -5, max_x = 5, step_x = 0.25, 
    min_y = -5, max_y = 5, step_y = 0.25
) :
    """Sample a 2-D function and its gradient on a regular grid.

    Args:
        function: callable taking a length-2 array ``[x, y]`` and returning a scalar.
        deriv_function: factory called as ``deriv_function(index=i)`` that returns
            the partial derivative with respect to coordinate ``i`` as a callable
            taking the same length-2 array.
        min_x, max_x, step_x: half-open range ``[min_x, max_x)`` and spacing along x.
        min_y, max_y, step_y: half-open range ``[min_y, max_y)`` and spacing along y.

    Returns:
        ``(x, y, z, dz)`` where ``x`` and ``y`` are the meshgrid arrays, ``z`` has
        the same shape and holds the function values, and ``dz`` has one extra
        trailing axis of size 2 holding the two partial derivatives.
    """
    # Builtin `float` replaces the `np.float` alias, which was removed from NumPy
    xs = np.arange(start = min_x, stop = max_x, step = step_x, dtype = float)
    ys = np.arange(start = min_y, stop = max_y, step = step_y, dtype = float)
    x, y = np.meshgrid(xs, ys)

    # One row per grid point, in the same row-major order as the meshgrid arrays
    points = np.stack([x.ravel(), y.ravel()], axis = 1)

    # function values
    z = np.array([function(point) for point in points]).reshape(x.shape)

    # partial derivatives
    grad1 = deriv_function(index = 0)
    grad2 = deriv_function(index = 1)
    dz = np.array([[grad1(point), grad2(point)] for point in points])
    dz = dz.reshape(x.shape + (2,))

    return x, y, z, dz

In [None]:
def plotFunction(name, x, y, z, figsize = (8,8), plot_contours = True) :
    """Show a 3-D surface plot of ``z`` over the grid ``(x, y)`` and, optionally,
    a second figure with filled contours.

    Args:
        name: title of the surface figure; the contour figure is titled
            ``name + ", contours"``.
        x, y, z: grid coordinates and values, as returned by ``get_data``.
        figsize: size used for both figures.
        plot_contours: when False only the surface figure is drawn.
    """
    surface_fig = plt.figure(figsize = figsize)
    surface_ax = plt.axes(projection='3d')
    plt.title(name)
    surface = surface_ax.plot_surface(x, y, z, cmap=cm.coolwarm)
    surface_ax.set(xlabel='x', ylabel='y', zlabel='z')
    surface_fig.colorbar(surface, shrink=0.5, aspect=5)
    plt.show()

    if not plot_contours :
        return

    plt.figure(figsize = figsize)
    contour_ax = plt.axes()
    # Fixed contour levels from 0.1 up to (but excluding) 10, in steps of 0.5
    levels = np.arange(0.1, 10, 0.5)
    contour_ax.contourf(x, y, z, cmap=cm.coolwarm, levels=levels)
    contour_ax.set(xlabel='x', ylabel='y')
    plt.title(name + ", contours")
    plt.show()

def plotGrad(name, x, y, dz, figsize = (8,8)):
    """Show the gradient field ``dz`` as a quiver plot over the grid ``(x, y)``.

    ``dz[:, :, 0]`` and ``dz[:, :, 1]`` are used as the horizontal and vertical
    arrow components; ``name`` is the figure title.
    """
    plt.figure(figsize = figsize)
    plt.title(name)
    horizontal, vertical = dz[:, :, 0], dz[:, :, 1]
    plt.quiver(x, y, horizontal, vertical)
    plt.show()

## Data

In [None]:
# When True, the log-transformed objective and derivative (`*_log` variants) are used.
log_scale = True

# Benchmark to study. Later cells match it by substring ("Rosenbrock" / "Rastrigin").
#function_name = "Rosenbrock Function"
function_name = "Rastrigin Function"

In [None]:
# Resolve the objective (`callable_function`) and the factory of its partial
# derivatives (`callable_function_deriv`) from `function_name` and `log_scale`,
# and build the plot titles.
_LOG_SUFFIX = ' (log)'

# family -> {log_scale: (objective, partial-derivative factory)}
_OBJECTIVES = {
    "Rosenbrock" : {
        True : (RosenbrockFunction_log, RosenbrockDeriv_log),
        False : (RosenbrockFunction, RosenbrockDeriv),
    },
    "Rastrigin" : {
        True : (RastriginFunction_log, RastriginDeriv_log),
        False : (RastriginFunction, RastriginDeriv),
    },
}

# Drop a suffix left by a previous run so that re-running this cell does not
# keep appending ' (log)' to the titles.
if function_name.endswith(_LOG_SUFFIX) :
    function_name = function_name[:-len(_LOG_SUFFIX)]

for _family, _variants in _OBJECTIVES.items() :
    if _family in function_name :
        callable_function, callable_function_deriv = _variants[bool(log_scale)]
        break
else :
    # Fail here rather than with a NameError in a later cell
    raise ValueError(f"Unsupported function_name: {function_name!r}")

grad_name = "Gradient Field of %s" % function_name

if log_scale :
    function_name = function_name + ' (log)'
    grad_name = grad_name + ' (log)'

In [None]:
# Plotting domain and grid resolution for the selected benchmark.
if "Rosenbrock" in function_name :
    # Tighter alternative that was tried: x in (-1.5, 1.5), y in (-0.5, 1.5), step 0.25
    min_x, max_x = -4, 4
    min_y, max_y = -2, 10
    step_x = step_y = 0.05
elif "Rastrigin" in function_name :
    min_x, max_x = -5, 5
    min_y, max_y = -5, 5
    step_x = step_y = 0.05

# Sample the objective and its gradient on that grid.
x, y, z, dz = get_data(
    function = callable_function,
    deriv_function = callable_function_deriv,
    min_x = min_x, max_x = max_x, step_x = step_x,
    min_y = min_y, max_y = max_y, step_y = step_y,
)

In [None]:
# figsize = (2*8,8)

# plotFunction(function_name, x, y, z, figsize = figsize, plot_contours=True)
# # # tmp = z - z.min()
# # # tmp = tmp / tmp.sum()
# # # plotFunction(function_name, x, y, tmp)

# #plotGrad(grad_name, x, y, dz, figsize = (2*3,4))
# plotGrad("", x, y, dz, figsize = (2*3,4))

In [None]:
# Overview figure with up to three panels: 3-D surface, filled contours, gradient field.
lb = 3
if lb == 1 :
    L, C = 1, 1
else : 
    C = 3
    # Ceiling division: number of rows needed for `lb` panels in `C` columns.
    # (`lb // C + lb % C` over-counts, e.g. 5 panels would get 3 rows.)
    L = -(-lb // C)

figsize=(C*6, L*4)

fig = plt.figure(figsize=figsize)

# Panel 1: surface. The (rows, cols, index) form of add_subplot is used because
# the three-digit integer shorthand stops working once any value exceeds 9.
i = 1
ax = fig.add_subplot(L, C, i, projection='3d')
surf = ax.plot_surface(x, y, z, cmap=cm.coolwarm)
ax.set(xlabel='x', ylabel='y', zlabel='z')

# The point (1, 1) and the objective value there; only drawn for Rosenbrock.
a = [1]
b = [1]
z_ = callable_function(np.array([1, 1]))

if "Rosenbrock" in function_name :
    ax.plot(a, b, z_, '-gD', markersize=10)

# Panel 2 (optional): filled contours on fixed levels 0.1, 0.6, ..., < 10
plot_contours = True
if plot_contours :
    i += 1
    ax = fig.add_subplot(L, C, i)
    vmin, vmax, vlevel = 0.1, 10, 0.5
    CS = ax.contourf(x, y, z, cmap=cm.coolwarm, levels=np.arange(vmin, vmax, vlevel))
    ax.set(xlabel='x', ylabel='y')

    if "Rosenbrock" in function_name :
        ax.plot(a, b, '-gD', markersize=10)

# Last panel: gradient field
i += 1
ax = fig.add_subplot(L, C, i)
dz_ = ax.quiver(x, y, dz[:, :, 0], dz[:, :, 1])

plt.show()

Plotly reference for 3D surface plots: https://plotly.com/python/3d-surface-plots/

In [None]:
# Interactive 3-D surface of the sampled objective, with its contours projected along z.
surface = go.Surface(z=z, x=x, y=y)
fig = go.Figure(data=[surface])

z_contours = {
    "show": True,
    "usecolormap": True,
    "highlightcolor": "limegreen",
    "project_z": True,
}
fig.update_traces(contours_z=z_contours)

# Fixed-size square figure with a preset camera position
fig.update_layout(
    title=function_name,
    autosize=False,
    width=500,
    height=500,
    scene_camera_eye={"x": 1.87, "y": 0.88, "z": -0.64},
    margin={"l": 65, "r": 50, "b": 65, "t": 90},
)

fig.show()

## Training

In [None]:
# Dimension of the optimisation variable. init_train sizes the known optimum with it;
# the starting points there are hard-coded 2-element tensors, so keep d = 2.
d = 2

In [None]:
def init_train(optim_name):
    """Create the optimum, the starting point and the optimizer for one run.

    Reads the notebook globals ``function_name``, ``d``, ``device``, ``lr`` and
    ``all_optims``.

    Args:
        optim_name: key of ``all_optims`` selecting the optimizer configuration.

    Returns:
        ``(x_optim, x_hat, optimizer)``: the known minimiser, the trainable
        starting point (``requires_grad=True``) and the optimizer built by
        ``get_optimizer`` over ``x_hat``.

    Raises:
        ValueError: if ``function_name`` names neither supported benchmark.
        KeyError: if ``optim_name`` is not a key of ``all_optims``.
    """
    if "Rosenbrock" in function_name :
        x_optim = torch.ones(d).to(device)
        # Other starting points that were tried: random normal, [0, 8], [-3, 8]
        x_hat = torch.tensor([-3, 6], requires_grad=True, dtype=torch.float, device=device)
    elif "Rastrigin" in function_name :
        x_optim = torch.zeros(d).to(device)
        x_hat = torch.tensor([-4, 2], requires_grad=True, dtype=torch.float, device=device)
    else :
        raise ValueError(f"Unsupported function_name: {function_name!r}")

    params = [{'params':[x_hat], 'lr':lr}]
    v = all_optims[optim_name] + f",lr={lr}"
    # Decide from the argument, not from the global loop variable `k`, so the
    # function also works when called outside the training loop.
    # NOTE(review): n=1,m=1 presumably describe a single-sample dataset for the
    # SAG optimizers; confirm against src.optim.
    if "sag" in optim_name : v += ",n=1,m=1"
    optimizer = get_optimizer(params,  v)

    return x_optim, x_hat, optimizer

In [None]:
def train(n_epochs, callable_function, optim_name):
    """Minimise ``callable_function`` for ``n_epochs`` steps with one optimizer.

    Args:
        n_epochs: number of optimisation steps.
        callable_function: objective taking the current point (a tensor) and
            returning a scalar tensor.
        optim_name: key of ``all_optims`` passed on to ``init_train``.

    Returns:
        ``(all_x, all_z, all_errors)``, each with ``n_epochs + 1`` entries:
        the visited points (starting point first), the objective value at each
        of them, and the Euclidean distance to the known optimum.
    """
    x_optim, x_hat, optimizer = init_train(optim_name)

    def distance_to_optimum():
        # Euclidean distance between the current iterate and the known minimiser
        return ((x_hat - x_optim)**2).sum().sqrt().item()

    # `+ 0.0` produces a copy, so later in-place updates of x_hat do not alter history
    all_x = [x_hat.detach() + 0.0]
    all_z = []
    all_errors = [distance_to_optimum()]

    for epoch in range(n_epochs):
        z_ = callable_function(x_hat)

        optimizer.zero_grad()
        z_.backward()

        try :
            optimizer.step()
        except TypeError :
            # NOTE(review): presumably reached for optimizers whose step() requires
            # batch_idx/indexes (likely the SAG variants); confirm against src.optim.
            optimizer.step(batch_idx=0, indexes=torch.LongTensor([0]))

        error = distance_to_optimum()

        all_x.append(x_hat.detach() + 0.0)
        all_z.append(z_.detach() + 0.0)
        all_errors.append(error)

        if epoch % 1000 == 0 :
            print(epoch, error)

    # Objective value at the final iterate, so all_z lines up with all_x
    all_z.append(callable_function(x_hat).detach() + 0.0)

    return all_x, all_z, all_errors

In [None]:
# Hyper-parameters shared by all optimizers. They are interpolated into the
# configuration strings of `all_optims`; `lr` is appended by init_train.
lr = 0.001
weight_decay = 1.0
# sgd
momentum= 0.9
# adam
beta1=0.9
beta2=0.99
# sag (forwarded verbatim inside the sag* configuration strings)
with_d=True
batch_mode=False
init_y_i=False

In [None]:
# Full catalogue of optimizer configurations: display name -> configuration string
# ("<optimizer>,<key>=<value>,...") passed to get_optimizer by init_train.
# The following cells reassign `all_optims` to smaller groups, so this full
# catalogue is only used if those cells are skipped.
# NOTE(review): "rprop" uses etaplus=0.5 and etaminus=1.2; torch.optim.Rprop's default
# etas are (etaminus=0.5, etaplus=1.2), so the two values look swapped. Confirm how
# get_optimizer maps them.
# NOTE(review): "sag_sgd" lists weight_decay twice; verify how get_optimizer treats
# the duplicated key.
all_optims = {
"sgd" : f"sgd,momentum=0,dampening=0,weight_decay={weight_decay},nesterov=False",
"momentum" : f"sgd,momentum={momentum},dampening=0.9,weight_decay={weight_decay},nesterov=False",
"nesterov" : f"sgd,momentum={momentum},dampening=0,weight_decay={weight_decay},nesterov=True",
"asgd" : f"asgd,lambd=0.0001,alpha=0.75,t0=1000000.0,weight_decay={weight_decay}",
"rmsprop" : f"rmsprop,alpha=0.99,weight_decay={weight_decay},momentum=0,centered=False",
"rmsprop_mom" : f"rmsprop,alpha=0.99,weight_decay={weight_decay},momentum={momentum},centered=False",
"rprop" : f"rprop,etaplus=0.5,etaminus=1.2,step_min=1e-06,step_max=50",
"adadelta" : f"adadelta,rho=0.9,weight_decay={weight_decay}", 
"adagrad" : f"adagrad,lr_decay=0,weight_decay={weight_decay},initial_accumulator_value=0", 
"adam" : f"adam,weight_decay={weight_decay},beta1={beta1},beta2={beta2},amsgrad=False",
"amsgrad" : f"adam,weight_decay={weight_decay},beta1={beta1},beta2={beta2},amsgrad=True",
 "adamax" : f"adamax,weight_decay={weight_decay},beta1={beta1},beta2={beta2}",
"custom_adam" : f"custom_adam,weight_decay={weight_decay},beta1={beta1},beta2={beta2}",
"adam_inverse_sqrt" : f"adam_inverse_sqrt,weight_decay={weight_decay},beta1={beta1},beta2={beta2},warmup_updates=4000,warmup_init_lr=1e-7,exp_factor=0.5",
"adam_cosine" : f"adam_cosine,weight_decay={weight_decay},beta1={beta1},beta2={beta2},warmup_updates=4000,warmup_init_lr=1e-7,min_lr=1e-9,init_period=1000000,period_mult=1,lr_shrink=0.75",
"sag" : f"sag,weight_decay={weight_decay},batch_mode={batch_mode},init_y_i={init_y_i},with_d={with_d}",
"sag_sgd" : f"sag_sgd,weight_decay={weight_decay},batch_mode={batch_mode},init_y_i={init_y_i},with_d={with_d},momentum={momentum},dampening=0.9,weight_decay={weight_decay},nesterov=False",
"sag_adam" : f"sag_adam,weight_decay={weight_decay},batch_mode={batch_mode},init_y_i={init_y_i},with_d={with_d},beta1={beta1},beta2={beta2}",
}

In [None]:
# Optimizer groups used for the Rosenbrock experiments.
# Each assignment below replaces the previous one, so when the whole cell runs only
# the LAST group stays in `all_optims`. To run another group, comment out the
# groups after it (or move the wanted group to the end).

# Group 1: adadelta / adagrad vs. sag
all_optims = {
  "adadelta" : f"adadelta,rho=0.9,weight_decay={weight_decay}", # bad
  "adagrad" : f"adagrad,lr_decay=0,weight_decay={weight_decay},initial_accumulator_value=0", # bad
  "sag" : f"sag,weight_decay={weight_decay},batch_mode={batch_mode},init_y_i={init_y_i},with_d={with_d}",
}

# Group 2: SGD family vs. sag / sag_sgd
all_optims = {
"momentum" : f"sgd,momentum={momentum},dampening=0.9,weight_decay={weight_decay},nesterov=False",
"nesterov" : f"sgd,momentum={momentum},dampening=0,weight_decay={weight_decay},nesterov=True",
"asgd" : f"asgd,lambd=0.0001,alpha=0.75,t0=1000000.0,weight_decay={weight_decay}",
"sag" : f"sag,weight_decay={weight_decay},batch_mode={batch_mode},init_y_i={init_y_i},with_d={with_d}",
"sag_sgd" : f"sag_sgd,weight_decay={weight_decay},batch_mode={batch_mode},init_y_i={init_y_i},with_d={with_d},momentum={momentum},dampening=0.9,weight_decay={weight_decay},nesterov=False",
}

# Group 3: rmsprop / rprop vs. sag / sag_sgd
all_optims = {
"rmsprop" : f"rmsprop,alpha=0.99,weight_decay={weight_decay},momentum=0,centered=False",
"rmsprop_mom" : f"rmsprop,alpha=0.99,weight_decay={weight_decay},momentum={momentum},centered=False",
"rprop" : f"rprop,etaplus=0.5,etaminus=1.2,step_min=1e-06,step_max=50",
"sag" : f"sag,weight_decay={weight_decay},batch_mode={batch_mode},init_y_i={init_y_i},with_d={with_d}",
"sag_sgd" : f"sag_sgd,weight_decay={weight_decay},batch_mode={batch_mode},init_y_i={init_y_i},with_d={with_d},momentum={momentum},dampening=0.9,weight_decay={weight_decay},nesterov=False",
}

# Group 4: adam variants vs. sag / sag_adam
all_optims = {
"adam" : f"adam,weight_decay={weight_decay},beta1={beta1},beta2={beta2},amsgrad=False",
"amsgrad" : f"adam,weight_decay={weight_decay},beta1={beta1},beta2={beta2},amsgrad=True",
"adamax" : f"adamax,weight_decay={weight_decay},beta1={beta1},beta2={beta2}",
"sag" : f"sag,weight_decay={weight_decay},batch_mode={batch_mode},init_y_i={init_y_i},with_d={with_d}",
"sag_adam" : f"sag_adam,weight_decay={weight_decay},batch_mode={batch_mode},init_y_i={init_y_i},with_d={with_d},beta1={beta1},beta2={beta2}",
}

# Group 5 (the one left active by this cell): adam with custom / scheduled variants
all_optims = {
"adam" : f"adam,weight_decay={weight_decay},beta1={beta1},beta2={beta2},amsgrad=False",
"custom_adam" : f"custom_adam,weight_decay={weight_decay},beta1={beta1},beta2={beta2}",
"adam_inverse_sqrt" : f"adam_inverse_sqrt,weight_decay={weight_decay},beta1={beta1},beta2={beta2},warmup_updates=4000,warmup_init_lr=1e-7,exp_factor=0.5",
"adam_cosine" : f"adam_cosine,weight_decay={weight_decay},beta1={beta1},beta2={beta2},warmup_updates=4000,warmup_init_lr=1e-7,min_lr=1e-9,init_period=1000000,period_mult=1,lr_shrink=0.75",
"sag" : f"sag,weight_decay={weight_decay},batch_mode={batch_mode},init_y_i={init_y_i},with_d={with_d}",
"sag_adam" : f"sag_adam,weight_decay={weight_decay},batch_mode={batch_mode},init_y_i={init_y_i},with_d={with_d},beta1={beta1},beta2={beta2}",
}

In [None]:
# Optimizer groups used for the Rastrigin experiments.
# Each assignment below replaces the previous one (and whatever an earlier cell
# stored), so when the whole cell runs only the LAST group stays in `all_optims`.
# To run another group, comment out the groups after it.

# Group 1: adadelta / adagrad / rprop vs. sag
all_optims = {
  "adadelta" : f"adadelta,rho=0.9,weight_decay={weight_decay}", 
  "adagrad" : f"adagrad,lr_decay=0,weight_decay={weight_decay},initial_accumulator_value=0", 
  "rprop" : f"rprop,etaplus=0.5,etaminus=1.2,step_min=1e-06,step_max=50", 
  "sag" : f"sag,weight_decay={weight_decay},batch_mode={batch_mode},init_y_i={init_y_i},with_d={with_d}", 
}

# Group 2: SGD family vs. sag / sag_sgd
all_optims = {
"momentum" : f"sgd,momentum={momentum},dampening=0.9,weight_decay={weight_decay},nesterov=False",
"nesterov" : f"sgd,momentum={momentum},dampening=0,weight_decay={weight_decay},nesterov=True",
"asgd" : f"asgd,lambd=0.0001,alpha=0.75,t0=1000000.0,weight_decay={weight_decay}",
"sag" : f"sag,weight_decay={weight_decay},batch_mode={batch_mode},init_y_i={init_y_i},with_d={with_d}",
"sag_sgd" : f"sag_sgd,weight_decay={weight_decay},batch_mode={batch_mode},init_y_i={init_y_i},with_d={with_d},momentum={momentum},dampening=0.9,weight_decay={weight_decay},nesterov=False",
}

# Group 3: rmsprop vs. sag / sag_sgd
all_optims = {
"rmsprop" : f"rmsprop,alpha=0.99,weight_decay={weight_decay},momentum=0,centered=False",
"rmsprop_mom" : f"rmsprop,alpha=0.99,weight_decay={weight_decay},momentum={momentum},centered=False",
"sag" : f"sag,weight_decay={weight_decay},batch_mode={batch_mode},init_y_i={init_y_i},with_d={with_d}",
"sag_sgd" : f"sag_sgd,weight_decay={weight_decay},batch_mode={batch_mode},init_y_i={init_y_i},with_d={with_d},momentum={momentum},dampening=0.9,weight_decay={weight_decay},nesterov=False",
}

# Group 4: adam variants vs. sag / sag_adam
all_optims = {
"adam" : f"adam,weight_decay={weight_decay},beta1={beta1},beta2={beta2},amsgrad=False",
"amsgrad" : f"adam,weight_decay={weight_decay},beta1={beta1},beta2={beta2},amsgrad=True",
"adamax" : f"adamax,weight_decay={weight_decay},beta1={beta1},beta2={beta2}",
"sag" : f"sag,weight_decay={weight_decay},batch_mode={batch_mode},init_y_i={init_y_i},with_d={with_d}",
"sag_adam" : f"sag_adam,weight_decay={weight_decay},batch_mode={batch_mode},init_y_i={init_y_i},with_d={with_d},beta1={beta1},beta2={beta2}",
}

# Group 5 (the one left active by this cell): adam with custom / scheduled variants
all_optims = {
"adam" : f"adam,weight_decay={weight_decay},beta1={beta1},beta2={beta2},amsgrad=False",
"custom_adam" : f"custom_adam,weight_decay={weight_decay},beta1={beta1},beta2={beta2}",
"adam_inverse_sqrt" : f"adam_inverse_sqrt,weight_decay={weight_decay},beta1={beta1},beta2={beta2},warmup_updates=4000,warmup_init_lr=1e-7,exp_factor=0.5",
"adam_cosine" : f"adam_cosine,weight_decay={weight_decay},beta1={beta1},beta2={beta2},warmup_updates=4000,warmup_init_lr=1e-7,min_lr=1e-9,init_period=1000000,period_mult=1,lr_shrink=0.75",
"sag" : f"sag,weight_decay={weight_decay},batch_mode={batch_mode},init_y_i={init_y_i},with_d={with_d}",
"sag_adam" : f"sag_adam,weight_decay={weight_decay},batch_mode={batch_mode},init_y_i={init_y_i},with_d={with_d},beta1={beta1},beta2={beta2}",
}

In [None]:
# Number of optimisation steps per optimizer.
# Other budgets that were tried: 100000, 50000, 4000, 100, 20.
n_epochs = 10000

# optimizer name -> [visited points, objective values, distances to the optimum]
results = {}

separator = "============"
for k, v in all_optims.items() :
    print(separator)
    print(k, v)
    all_x, all_z, all_errors = train(n_epochs, callable_function, optim_name=k)
    results[k] = [all_x, all_z, all_errors]
    print(separator)

In [None]:
# Distance-to-optimum curves, one line per optimizer whose name does not contain 'sag'.
figsize = (12, 8)
fig, ax = plt.subplots(figsize = figsize)

# Number of leading epochs to show; None shows the full curve (e.g. set to 2000 to zoom in)
p = None

for k in all_optims.keys() :
    if 'sag' not in k :
        all_errors = results[k][2]
        shown = all_errors[:p]
        xs = list(range(len(shown)))
        ax.plot(xs, shown, label=k)
        ax.set(xlabel='epochs', ylabel='error')

ax.legend()

# Write each optimizer name directly on its curve
_ = labelLines(ax.get_lines(), zorder=2.5, fontsize=14, align=True)

In [None]:
# Smallest distance to the optimum reached by each optimizer during its run
for k in all_optims :
    all_errors = results[k][2]
    print(k, min(all_errors))

In [None]:
# Hard-coded summary tables, one pair per benchmark:
#   tmp_errors : optimizer name -> value plotted as "final errors at steady states"
#   tmp_speeds : optimizer name -> value plotted as "speed" and used to pick marker spacing
# NOTE(review): these numbers are not computed in this notebook; presumably they were
# read off earlier runs. Verify before reuse.
# Keys "adam_inv_sqrt" / "adam_cos" are shortened forms of the `all_optims` keys
# "adam_inverse_sqrt" / "adam_cosine".
# INF is the sentinel rendered as "∞" in the speed bar chart.
INF = 1000000000
if "Rosenbrock" in function_name :
    # error
    tmp_errors = {
    "sgd" : 0.53,
    "momentum" : 0.15,
    "nesterov" : 0.55, 
    "asgd" : 0.50,
    "rmsprop" : 0.00, 
    "rmsprop_mom" : 0.01,
    "rprop" : 0.00, 
    "adadelta" : 0.00, 
    "adagrad" : 5.00,
    "adam" : 0.00, 
    "amsgrad" : 0.00, 
    "adamax" : 0.00,
    "custom_adam" : 0.06, 
    "adam_inv_sqrt" : 0.05,
    "adam_cos" : 0.06,
    "sag" : 0.50,
    "sag_sgd" : 0.17,
    "sag_adam" : 0.06,
    }
    # speeds
    tmp_speeds = {
    "sgd" : 2800,
    "momentum" : 2800,
    "nesterov" : 250, 
    "asgd" : 2800,
    "rmsprop" : 9000, 
    "rmsprop_mom" : 700,
    "rprop" : 1400, 
    "adadelta" : 125000, 
    "adagrad" : INF,
    "adam" : 7500, 
    "amsgrad" : 25000, 
    "adamax" : 8000,
    "custom_adam" : 4000, 
    "adam_inv_sqrt" : 6000,
    "adam_cos" : 6000,
    "sag" : 2800,
    "sag_sgd" : 2000,
    "sag_adam" : 4000,
    }
elif "Rastrigin" in function_name :
    # error
    tmp_errors = {
    "sgd" : 2.7469568252563477,
    "momentum" : 2.7258849143981934,
    "nesterov" : 2.6344261169433594, 
    "asgd" :  2.7469568252563477,
    "rmsprop" : 3.0466601848602295, 
    "rmsprop_mom" : 2.8889172077178955,
    "rprop" : 4.448424339294434, 
    "adadelta" : 3.047818660736084, 
    "adagrad" : 3.0598108768463135,
    "adam" : 3.047726631164551, 
    "amsgrad" : 3.0049257278442383, 
    "adamax" : 3.047726631164551,
    "custom_adam" : 1.3765685558319092, 
    "adam_inv_sqrt" : 1.378799557685852,
    "adam_cos" : 1.378799557685852,
    "sag" : 2.746980667114258,
    "sag_sgd" : 2.1404078006744385,
    "sag_adam" : 1.3765685558319092,
    }
    # speeds
    tmp_speeds = {
    "sgd" : 1250, #
    "momentum" : 1200, #
    "nesterov" : 100, # 
    "asgd" : 1250, #
    "rmsprop" : 1200, # 
    "rmsprop_mom" : 150, #
    "rprop" : 10, #
    "adadelta" : 25000, #
    "adagrad" : 400000, #
    "adam" : 1250, #
    "amsgrad" : 2500, # 
    "adamax" : 1250, #
    "custom_adam" : 1500, # 
    "adam_inv_sqrt" : 3500, #
    "adam_cos" : 3500, #
    "sag" : 1250, #
    "sag_sgd" : 375, #
    "sag_adam" : 1500, #
    }

In [None]:
# Bar chart of the final errors, sorted from smallest to largest.
# Values are sorted on their exact value, then rounded to one decimal for display.
tmp = {k: round(v, 1) for k, v in sorted(tmp_errors.items(), key=lambda item: item[1])}

methods = list(tmp.keys())
final_errors = list(tmp.values())

loc = np.arange(len(methods)) # the label locations
width = 0.35 # the width of the bars

figsize=(4*6,4*2)
fig, ax = plt.subplots(1, 1, sharex=False, sharey=False, figsize = figsize)

ax.set_ylabel('final errors at steady states')
ax.set_xticks(loc)
ax.set_xticklabels(methods)
# Bars are centred on the tick positions. They used to be drawn at `loc - width / 2`,
# which left every label under the right edge of its bar.
pps = ax.bar(loc, final_errors, width, label='')
# The loop variable is not called `p`, so the global `p` of the error-curve cell survives.
for bar in pps:
    height = bar.get_height()
    s = "{}".format(height)
    # For Rosenbrock, errors above 1.0 are displayed as "∞"
    if "Rosenbrock" in function_name and height > 1.0 :
        s = "∞"
    ax.text(x=bar.get_x() + bar.get_width() / 2, y=height+.10, s=s, ha='center')
plt.show()

In [None]:
# Bar chart of the speed values, sorted from smallest to largest.
tmp = {k: round(v, 1) for k, v in sorted(tmp_speeds.items(), key=lambda item: item[1])}

methods = list(tmp.keys())
speed = list(tmp.values())

loc = np.arange(len(methods)) # the label locations
width = 0.35 # the width of the bars

figsize=(4*6,4*2)
fig, ax = plt.subplots(1, 1, sharex=False, sharey=False, figsize = figsize)

ax.set_ylabel('speed')
ax.set_xticks(loc)
ax.set_xticklabels(methods)
# Bars are centred on the tick positions. They used to be drawn at `loc - width / 2`,
# which left every label under the right edge of its bar.
pps = ax.bar(loc, speed, width, label='')
# The loop variable is not called `p`, so the global `p` of the error-curve cell survives.
for bar in pps:
    height = bar.get_height()
    s = "{}".format(height)
    # The INF sentinel is displayed as "∞"
    if height >= INF : s = "∞"
    ax.text(x=bar.get_x() + bar.get_width() / 2, y=height+.10, s=s, ha='center')
plt.show()

In [None]:
# Select the run whose trajectory is drawn in the next cells.
# `k` must be a key of `results`, i.e. part of the optimizer group that was trained.
k = "sag"
all_x, all_z, all_errors = results[k]

In [None]:
# Trajectory of the selected run drawn on top of the 3-D surface of the objective.
figsize = (6, 4)
fig = plt.figure(figsize = figsize)
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.set_title(function_name)

surf = ax.plot_surface(x, y, z, cmap=cm.coolwarm)
ax.set(xlabel='x', ylabel='y', zlabel='z')
fig.colorbar(surf, shrink=0.5, aspect=5)

# First and second coordinate of every visited point
a = [point[0] for point in all_x]
b = [point[1] for point in all_x]

ax.plot(a, b, all_z, c='b')

plt.show()

In [None]:
def plot_landscape(ax, x, y, z, all_x, step_size, put_xy_label = True) :
    """Draw an optimisation trajectory over the filled contours of the objective.

    Args:
        ax: matplotlib axes to draw on.
        x, y, z: grid coordinates and objective values (as returned by ``get_data``).
        all_x: sequence of visited 2-D points, starting point first.
        step_size: spacing, in steps, between the labelled markers placed along
            the path; values below 1 are treated as 1.
        put_xy_label: when True, label the axes 'x' and 'y'.

    The start is marked with a diamond labelled '0', the end with a plus labelled
    with its step index, and every ``step_size``-th intermediate point with a
    labelled plus. Marker positions are derived from the trajectory length rather
    than from the global ``n_epochs``, so the function does not depend on
    notebook state. An empty trajectory only draws the contours.
    """
    vmin, vmax, vlevel = 0.1, 10, 0.5
    ax.contourf(x, y, z, cmap=cm.coolwarm, levels=np.arange(vmin, vmax, vlevel))
    if put_xy_label : ax.set(xlabel='x', ylabel='y')

    if len(all_x) == 0 :
        return

    # Plain floats, so matplotlib never has to convert tensor elements itself
    a = [float(point[0]) for point in all_x]
    b = [float(point[1]) for point in all_x]
    last = len(a) - 1

    # Start point, full path, end point
    ax.plot([a[0]], [b[0]], '-gD', markersize=12)
    ax.text(a[0], b[0],'0')
    ax.plot(a, b, c='b')
    ax.plot([a[-1]], [b[-1]], '-gP', markersize=12)
    ax.text(a[-1], b[-1],f'{last}')

    # Intermediate markers; the final index is excluded because it is drawn above
    stride = max(1, int(step_size))
    for t in range(0, last, stride) :
        ax.plot([a[t]], [b[t]], '-gP', markersize=12)
        ax.text(a[t], b[t], f'{t}')

In [None]:
# Contour view of the selected run, with a labelled marker every 1000 steps
figsize = (6, 4)
fig, ax = plt.subplots(figsize = figsize)
plot_landscape(ax, x, y, z, all_x, step_size=1000)

In [None]:
# One landscape panel per trained optimizer, laid out on at most 3 columns.
lb = len(all_optims)
if lb <= 3 :
    L, C = 1, max(lb, 1)
else : 
    C = 3
    # Ceiling division: rows needed for `lb` panels in `C` columns.
    # (`lb // C + lb % C` over-counts, e.g. 5 panels would get 3 rows.)
    L = -(-lb // C)

figsize=(C*6, L*4)
fig = plt.figure(figsize = figsize)

# `tmp_speeds` uses shortened names for two optimizers
speed_keys = {"adam_inverse_sqrt" : "adam_inv_sqrt", "adam_cosine" : "adam_cos"}

for i, k in enumerate(all_optims.keys(), start=1) :
    ax = fig.add_subplot(L, C, i)
    all_x, _, _ = results[k]

    s = tmp_speeds.get(speed_keys.get(k, k))
    print(s)

    # Marker spacing derived from the recorded speed; falls back to 200 steps when
    # the optimizer has no entry in `tmp_speeds`.
    step_size = 200
    if s is not None :
        if "Rosenbrock" in function_name : step_size = int(s / 10)
        elif "Rastrigin" in function_name : step_size = int(2 * s / 5)
    # A spacing of 0 would make the marker loop of plot_landscape fail
    step_size = max(1, step_size)

    plot_landscape(ax, x, y, z, all_x, step_size=step_size, put_xy_label=False)
    ax.set_title(k)