In [455]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [456]:
from lib.optimizers import *
from lib.functions import *
import numpy as np
import matplotlib.pyplot as plt

### Helper functions

In [457]:
_infty = 1e16
_zero = 1e-32

def remove_nans_infs_and_zeros(arr):
    arr = np.array(arr)
    arr[np.isnan(arr)] = _infty
    arr[np.isinf(arr)] = _infty
    arr[arr > _infty] = _infty
    arr[arr == 0] = _zero
    arr[arr < _zero] = _zero
    return arr

# A neural network

In [458]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

In [459]:
def F(x, p, layers):
    assert x.shape == (2,)
    assert p.shape == (23,)
    
    W = []
    b = []
    index = 0
    for i in range(len(layers) - 1):
    # W2, W3, W4, b2, b3, b4 = p[:4], p[4:10], p[10:16], p[16:18], p[18:21], p[21:23]
        W.append(p[index:index + layers[i] * layers[i + 1]].reshape((layers[i + 1], layers[i])))
        index += layers[i] * layers[i + 1]
    for i in range(len(layers) - 1):
        b.append(p[index:index + layers[i + 1]])
        index += layers[i + 1]
    
    a = x
    for i in range(len(layers) - 1):
        a = sigmoid(W[i] @ a + b[i])
    y = a
    
    assert y.shape == (2,)
    
    return y

In [460]:
def f(p, layers, X, Y):
    assert p.shape == (23,)
    
    res = np.array([np.linalg.norm(F(x, p, layers) - y) for x, y in zip(X, Y)])
    return res

In [461]:
def grad_C(p, layers, x, y):
    assert p.shape == (23,)
    assert x.shape == (2,)
    assert y.shape == (2,)
    
    W = []
    b = []
    index = 0
    for i in range(len(layers) - 1):
    # W2, W3, W4, b2, b3, b4 = p[:4], p[4:10], p[10:16], p[16:18], p[18:21], p[21:23]
        W.append(p[index:index + layers[i] * layers[i + 1]].reshape((layers[i + 1], layers[i])))
        index += layers[i] * layers[i + 1]
    for i in range(len(layers) - 1):
        b.append(p[index:index + layers[i + 1]])
        index += layers[i + 1]
    # W2, W3, W4, b2, b3, b4 = p[:4], p[4:10], p[10:16], p[16:18], p[18:21], p[21:23]
    # W2 = W2.reshape((2, 2))
    # W3 = W3.reshape((3, 2))
    # W4 = W4.reshape((2, 3))
    
    # assert W2.shape == (2, 2)
    # assert W3.shape == (3, 2)
    # assert W4.shape == (2, 3)
    # assert b2.shape == (2,)
    # assert b3.shape == (3,)
    # assert b4.shape == (2,)
    
    a = [x]
    z = []
    for i in range(len(layers) - 1):
        z.append(W[i] @ a[i] + b[i])
        a.append(sigmoid(z[i]))
    
    # assert a4.shape == (2,)
    
    delta = [sigmoid(z[-1]) * (1 - sigmoid(z[-1])) * (a[-1] - y)]
    for i in range(len(layers) - 2, 0, -1):
        delta.append(sigmoid(z[i - 1]) * (1 - sigmoid(z[i - 1])) * (W[i].T @ delta[-1]))
    delta = delta[::-1]
    
    # delta_4 = sigmoid(z4) * (1 - sigmoid(z4)) * (a4 - y)
    # delta_3 = sigmoid(z3) * (1 - sigmoid(z3)) * (W4.T @ delta_4)
    # delta_2 = sigmoid(z2) * (1 - sigmoid(z2)) * (W3.T @ delta_3)
    
    # assert delta_4.shape == (2,)
    # assert delta_3.shape == (3,)
    # assert delta_2.shape == (2,)
    
    grad_W = []
    for i in range(len(layers) - 1):
        grad_W.append(delta[i].reshape((layers[i + 1], 1)) @ a[i].reshape((1, layers[i])))
    
    # grad_W2 = delta_2.reshape((2, 1)) @ a1.reshape((1, 2))
    # grad_W3 = delta_3.reshape((3, 1)) @ a2.reshape((1, 2))
    # grad_W4 = delta_4.reshape((2, 1)) @ a3.reshape((1, 3))
    
    # assert grad_W2.shape == (2, 2)
    # assert grad_W3.shape == (3, 2)
    # assert grad_W4.shape == (2, 3)
    
    grad_b = delta
    # grad_b2 = delta_2
    # grad_b3 = delta_3
    # grad_b4 = delta_4
    
    # assert grad_b2.shape == (2,)
    # assert grad_b3.shape == (3,)
    # assert grad_b4.shape == (2,)
    
    grad = np.concatenate(([grad_W[i].flatten() for i in range(len(grad_W))] + [grad_b[i] for i in range(len(grad_b))]))
    
    assert grad.shape == (23,)
    
    return grad

# Data

In [462]:
num_samples = 30
X = np.array([[x, x**2] for x in range(num_samples)])
Y = np.vstack((X[:, 0], X[:, 1])).T
X[:5, :], Y[:5, :]

(array([[ 0,  0],
        [ 1,  1],
        [ 2,  4],
        [ 3,  9],
        [ 4, 16]]),
 array([[ 0,  0],
        [ 1,  1],
        [ 2,  4],
        [ 3,  9],
        [ 4, 16]]))

In [463]:
f(np.random.randn(23), [2, 3, 2], X, Y)

2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2


  return 1 / (1 + np.exp(-z))


array([7.98405578e-01, 9.05857851e-01, 3.74747024e+00, 8.74250637e+00,
       1.57446465e+01, 2.47474823e+01, 3.57500457e+01, 4.87522073e+01,
       6.37540095e+01, 8.07555187e+01, 9.97567939e+01, 1.20757882e+02,
       1.43758820e+02, 1.68759636e+02, 1.95760351e+02, 2.24760983e+02,
       2.55761545e+02, 2.88762048e+02, 3.23762501e+02, 3.60762911e+02,
       3.99763283e+02, 4.40763623e+02, 4.83763935e+02, 5.28764221e+02,
       5.75764485e+02, 6.24764730e+02, 6.75764957e+02, 7.28765168e+02,
       7.83765365e+02, 8.40765549e+02])

In [464]:
layers = [2, 2, 3, 2]

def r(p):
    return f(p, layers, X, Y)

def Dr(p):
    return np.vstack([grad_C(p, layers, x, y) for x, y in zip(X, Y)])

N = X.shape[0]
M = 23
M, N

(23, 30)

In [465]:
R = Function(F=r, DF=Dr, M=23, N=N)

3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)


  return 1 / (1 + np.exp(-z))


# Test different methods

In [466]:
np.random.seed(0)
p0 = 1 / np.sqrt(M) * np.random.randn(M)
max_iter = 1000
alpha_gn = 1e1
max_iter_gd = 1000
alpha_gd = 1e3
p0

array([ 0.36783034,  0.08343855,  0.20408098,  0.46725853,  0.38941276,
       -0.20377652,  0.19810713, -0.03156016, -0.02152262,  0.08561571,
        0.03003516,  0.30323699,  0.15868734,  0.02537099,  0.09255188,
        0.06957591,  0.31153702, -0.04277846,  0.06527913, -0.17809127,
       -0.53233518,  0.1362889 ,  0.18024741])

In [467]:
np.linalg.norm(R(p0))

3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3


2112.431985653707

## Stochastic gradient

In [468]:
p = p0

for epoch in range(max_iter_gd):
    i = np.random.randint(0, len(X))
    x, y = X[i,:], Y[i]
    p -= alpha_gd * grad_C(p, layers, x, y)
    
err = np.linalg.norm(f(p, layers, X, Y))
print(f"{p=}\n{err=}")

3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 

3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3


## Gradient descent

In [469]:
p, err = gradient_descent(R, p0, alpha_gd, max_iter_gd)
print(f"{p=}\n{err=}")

3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3


(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,)
3 3 3 3
(23,

KeyboardInterrupt: 

## Gauss-Newton

In [None]:
p, err = gauss_newton(R=R, p0=p0, max_iter=max_iter, alpha=alpha_gn, step_type="least_squares")
print(f"{p=}\n{err=}")

p=array([ 3.67830341e-01,  8.34385458e-02,  2.04080977e-01,  4.67258532e-01,
        8.50769018e+02,  8.50175829e+02,  3.41371968e+03,  3.41349001e+03,
        2.34160636e+03,  2.34171350e+03,  3.04818188e+03,  2.69460267e+03,
        2.08698922e+03,  7.29863970e+04,  6.45136567e+04,  4.99681146e+04,
        3.11537022e-01, -4.27784552e-02,  8.50444884e+02,  3.41334348e+03,
        2.34109555e+03,  5.42001919e+03,  1.29776388e+05])
err=2110.6003885150785


## Levenberg-Marquardt

#### 1. Constant $\lambda_k$ sequence

In [None]:
lambda_param = 1.
optimizer = LevenbergMarquardt(
    R=R, lambda_param_fun=LevenbergMarquardt.LambdaParamConstant(lambda0=lambda_param)
)

In [None]:
p, err = optimizer.optimize(p0, max_iter, step_type="least_squares")
print(f"{p=}\n{err=}")

p=array([ 3.67830341e-01,  8.34385458e-02,  2.04080977e-01,  4.67258532e-01,
        8.50769018e+02,  8.50175829e+02,  3.41371968e+03,  3.41349001e+03,
        2.34160636e+03,  2.34171350e+03,  3.04818188e+03,  2.69460267e+03,
        2.08698922e+03,  7.29863970e+04,  6.45136567e+04,  4.99681146e+04,
        3.11537022e-01, -4.27784552e-02,  8.50444884e+02,  3.41334348e+03,
        2.34109555e+03,  5.42001919e+03,  1.29776388e+05])
err=2110.6003885150785


In [None]:
p, err = optimizer.optimize(p0, max_iter, step_type="cgnr", step_max_iter=6, step_tol=1e-6)
print(f"{p=}\n{err=}")

p=array([ 3.67830341e-01,  8.34385458e-02,  2.04080977e-01,  4.67258532e-01,
        8.50769018e+02,  8.50175829e+02,  3.41371968e+03,  3.41349001e+03,
        2.34160636e+03,  2.34171350e+03,  3.04818188e+03,  2.69460267e+03,
        2.08698922e+03,  7.29863970e+04,  6.45136567e+04,  4.99681146e+04,
        3.11537022e-01, -4.27784552e-02,  8.50444884e+02,  3.41334348e+03,
        2.34109555e+03,  5.42001919e+03,  1.29776388e+05])
err=2110.6003885150785


In [None]:
p, err = optimizer.optimize(p0, max_iter, step_type="svd")
print(f"{p=}\n{err=}")

p=array([ 3.67830341e-01,  8.34385458e-02,  2.04080977e-01,  4.67258532e-01,
        8.50769018e+02,  8.50175829e+02,  3.41371968e+03,  3.41349001e+03,
        2.34160636e+03,  2.34171350e+03,  3.04818188e+03,  2.69460267e+03,
        2.08698922e+03,  7.29863970e+04,  6.45136567e+04,  4.99681146e+04,
        3.11537022e-01, -4.27784552e-02,  8.50444884e+02,  3.41334348e+03,
        2.34109555e+03,  5.42001919e+03,  1.29776388e+05])
err=2110.6003885150785


#### 2. Sequence $\lambda_k$ decrasing when error is decreasing, and increasing otherwise

In [None]:
lambda_param = 1.
lambda_change = 2.
optimizer = LevenbergMarquardt(
    R=R, lambda_param_fun=LevenbergMarquardt.LambdaParamDefaultModifier(lambda0=lambda_param, lambda_change=lambda_change)
)

In [None]:
p, err = optimizer.optimize(p0, max_iter, step_type="cholesky")
print(f"{p=}\n{err=}")

KeyboardInterrupt: 

In [None]:
p, err = optimizer.optimize(p0, max_iter, step_type="cgnr", step_max_iter=6, step_tol=1e-6)
print(f"{p=}\n{err=}")

p=array([ 0.44364667, -0.34899506,  0.64851184,  1.52386946, -0.24480518,
       -4.18642183,  1.56485549, -2.97968555, -0.45877092,  8.77812663,
        0.20815047,  0.50293939, 13.11306863,  1.0453993 ,  2.47683831,
       50.98316778, -1.03334643,  0.31965958, -4.86292703, -5.16310465,
        9.70650825, 13.09335596, 53.59734248])
err=987.0992857863894


In [None]:
p, err = optimizer.optimize(p0, max_iter, step_type="svd")
print(f"{p=}\n{err=}")

p=array([ 0.44364667, -0.34899506,  0.64851184,  1.52386946, -0.24480518,
       -4.18642183,  1.56485549, -2.97968555, -0.45877092,  8.77812663,
        0.20815047,  0.50293939, 13.11306863,  1.0453993 ,  2.47683831,
       50.98316778, -1.03334643,  0.31965958, -4.86292703, -5.16310465,
        9.70650825, 13.09335596, 53.59734248])
err=987.0992857863894
