Решите простую задачу безусловной оптимизации (минимизации) в двумерном пространстве:  
$$f(\boldsymbol x) = -8x_1 - 16x_2 + x_1^2 + 4x_2^2$$
используя два метода:
 - аналитически (функция квадратичная, выпуклая)
 - методом градиентного спуска, используя один из методов оптимизации torch.optim

In [1]:
import torch
import numpy as np

In [6]:
def f(x1, x2):
    """Evaluate the objective -8*x1 - 16*x2 + x1**2 + 4*x2**2.

    The function is a convex quadratic; setting its gradient
    (2*x1 - 8, 8*x2 - 16) to zero gives the minimiser (4, 2), where the
    value is -32.

    Args:
        x1: First coordinate (number or tensor).
        x2: Second coordinate (number or tensor).

    Returns:
        The objective value, of the type produced by the arithmetic on
        the inputs.
    """
    linear_part = -8 * x1 - 16 * x2
    return linear_part + x1 ** 2 + 4 * x2 ** 2

# Hand-written gradient descent on f, starting from the point (4, 4).
x1 = torch.tensor(4, dtype=torch.float32, requires_grad=True)
x2 = torch.tensor(4, dtype=torch.float32, requires_grad=True)
for i in range(100):
    y = f(x1, x2)
    # Fill x1.grad / x2.grad with the partial derivatives of y.
    y.backward()
    # Logged before the update, so the point, loss and gradients all
    # describe the same iterate.
    print(f'iteration {i}, x1 {x1}, x2 {x2}, loss {y}, grad {x1.grad}, grad {x2.grad}')
    # Step against the gradient in place; no_grad keeps the update itself
    # out of the autograd graph.
    with torch.no_grad():
        x1.sub_(0.01 * x1.grad)
        x2.sub_(0.01 * x2.grad)
    # backward() adds into .grad, so reset both before the next pass.
    x2.grad.zero_()
    x1.grad.zero_()

iteration 0, x1 4.0, x2 4.0, loss -16.0, grad 0.0, grad 16.0
iteration 1, x1 4.0, x2 3.8399999141693115, loss -18.457603454589844, grad 0.0, grad 14.719999313354492
iteration 2, x1 4.0, x2 3.6928000450134277, loss -20.53771209716797, grad 0.0, grad 13.542400360107422
iteration 3, x1 4.0, x2 3.5573761463165283, loss -22.29831314086914, grad 0.0, grad 12.459009170532227
iteration 4, x1 4.0, x2 3.432785987854004, loss -23.788497924804688, grad 0.0, grad 11.462287902832031
iteration 5, x1 4.0, x2 3.3181631565093994, loss -25.04977798461914, grad 0.0, grad 10.545305252075195
iteration 6, x1 4.0, x2 3.21271014213562, loss -26.117340087890625, grad 0.0, grad 9.701681137084961
iteration 7, x1 4.0, x2 3.1156933307647705, loss -27.02090835571289, grad 0.0, grad 8.925546646118164
iteration 8, x1 4.0, x2 3.026437759399414, loss -27.785701751708984, grad 0.0, grad 8.211502075195312
iteration 9, x1 4.0, x2 2.9443228244781494, loss -28.433013916015625, grad 0.0, grad 7.554582595825195
iteration 10, x

In [7]:
import torch

def f(x1, x2):
    """Return the value of -8*x1 - 16*x2 + x1**2 + 4*x2**2 at (x1, x2).

    Works on plain numbers as well as on tensors, since only arithmetic
    operators are applied to the arguments.
    """
    value = -8 * x1 - 16 * x2
    value = value + x1 ** 2
    return value + 4 * x2 ** 2

# Minimise f from the starting point (4, 4), letting torch.optim.SGD
# apply the parameter updates.
x1 = torch.tensor([4], dtype=torch.float32, requires_grad=True)
x2 = torch.tensor([4], dtype=torch.float32, requires_grad=True)

optimizer = torch.optim.SGD([x1, x2], lr=0.1)

for i in range(100):
    # Clear gradients left over from the previous iteration.
    optimizer.zero_grad()
    y = f(x1, x2)
    y.backward()
    # step() updates x1 and x2 from their .grad, replacing a manual
    # `x -= lr * x.grad` under torch.no_grad().
    optimizer.step()
    # NOTE: this runs after step(), so x1/x2 shown here are the updated
    # values, while the loss and gradients belong to the point before
    # the update.
    print(f'iteration {i}, x1 {x1}, x2 {x2}, loss {y}, grad {x1.grad}, grad {x2.grad}')

iteration 0, x1 tensor([4.], requires_grad=True), x2 tensor([2.4000], requires_grad=True), loss tensor([-16.], grad_fn=<AddBackward0>), grad tensor([0.]), grad tensor([16.])
iteration 1, x1 tensor([4.], requires_grad=True), x2 tensor([2.0800], requires_grad=True), loss tensor([-31.3600], grad_fn=<AddBackward0>), grad tensor([0.]), grad tensor([3.2000])
iteration 2, x1 tensor([4.], requires_grad=True), x2 tensor([2.0160], requires_grad=True), loss tensor([-31.9744], grad_fn=<AddBackward0>), grad tensor([0.]), grad tensor([0.6400])
iteration 3, x1 tensor([4.], requires_grad=True), x2 tensor([2.0032], requires_grad=True), loss tensor([-31.9990], grad_fn=<AddBackward0>), grad tensor([0.]), grad tensor([0.1280])
iteration 4, x1 tensor([4.], requires_grad=True), x2 tensor([2.0006], requires_grad=True), loss tensor([-32.0000], grad_fn=<AddBackward0>), grad tensor([0.]), grad tensor([0.0256])
iteration 5, x1 tensor([4.], requires_grad=True), x2 tensor([2.0001], requires_grad=True), loss tensor