In [0]:
from mxnet import autograd, np, npx
# Enable MXNet's NumPy-compatible array semantics for the `np`/`npx` modules imported above.
npx.set_np()

In [2]:
# Input vector [0, 1, 2, 3] that every later cell differentiates with respect to.
x = np.arange(4)
# Attach a gradient buffer to x; backward() calls write their result into x.grad.
x.attach_grad()
# Display the buffer: it holds zeros until a backward pass has run.
x.grad

array([0., 0., 0., 0.])

In [3]:
# Operations executed inside record() are tracked so they can be differentiated later.
with autograd.record():
    y = 2 * np.dot(x, x) # y = 2 * ||x||_2^2, i.e. twice the squared L2 norm of x

# Display the scalar result of the forward pass.
y

array(28.)

In [0]:
# Backpropagate from y; the gradient dy/dx is stored in x.grad.
y.backward()

In [5]:
# For y = 2 * dot(x, x) the gradient is 4 * x.
x.grad

array([ 0.,  4.,  8., 12.])

In [6]:
with autograd.record():
    y = x.sum() # Sum of all elements of x

# d(sum(x))/dx is 1 for every element. The displayed result shows that this
# backward pass replaces the previous contents of x.grad rather than adding to them.
y.backward()
x.grad

array([1., 1., 1., 1.])

In [7]:
with autograd.record():
    y = np.abs(x).sum() # L1 norm of x: sum of absolute values

# The gradient is sign(x) element-wise; at x == 0 the reported value is 0.
y.backward()
x.grad

array([0., 1., 1., 1.])

In [8]:
with autograd.record():
    y = x * x  # y is a vector

# Calling backward() on a non-scalar y gives the same gradient as summing its
# elements first and differentiating that scalar; the comparison at the end of
# this cell verifies this.
y.backward()

# Repeat the computation on an independent copy, this time reducing to a scalar explicitly.
u = x.copy()
u.attach_grad()
with autograd.record():
    v = (u * u).sum()  # v is a scalar

v.backward()

# Element-wise comparison of the two gradients (implicit vs. explicit sum).
x.grad == u.grad

array([ True,  True,  True,  True])

In [9]:
with autograd.record():
    y = x * x
    u = y.detach() # Same values as y, but cut off from the graph: u is treated as a constant
    z = u * x

# Because u is a constant here, dz/dx = u (= x * x), not 3 * x^2 as full
# differentiation of x^3 would give.
z.backward()
x.grad == u

array([ True,  True,  True,  True])

In [10]:
y = np.ones(4) * 2
y.attach_grad()
with autograd.record():
    u = x * y
    # attach_grad() on an intermediate result implicitly runs u = u.detach():
    # gradients stop at u and do not propagate through it to x or y.
    u.attach_grad()
    z = 5 * u - x

z.backward()
# x.grad comes only from the `- x` term (-1), u.grad is 5, and y.grad stays at zero.
x.grad, u.grad, y.grad

(array([-1., -1., -1., -1.]), array([5., 5., 5., 5.]), array([0., 0., 0., 0.]))

In [0]:
# Gradient of an arbitrary function (one that uses data-dependent Python control flow)
def f(a):
    """Scale ``a`` by an input-dependent constant using loops and branches.

    ``a`` is doubled until its norm is at least 1000. The doubled value is
    returned as-is when its elements sum to a positive number, and multiplied
    by 100 otherwise. For any given input the result is therefore ``k * a``
    for some constant ``k``, so the function is piecewise linear in ``a``.

    Args:
        a: Scalar or array supporting ``*``, ``.sum()`` and ``np.linalg.norm``.

    Returns:
        ``a`` multiplied by the input-dependent constant described above.

    Raises:
        ValueError: If ``a`` has zero norm. Doubling zero never reaches the
            norm threshold, so the loop below would otherwise never terminate.
    """
    b = a * 2
    # Guard against the one input for which the doubling loop cannot finish.
    # The `== 0` test deliberately lets NaN through, which skips the loop as before.
    if np.linalg.norm(b) == 0:
        raise ValueError("f(a) requires an input with non-zero norm")
    while np.linalg.norm(b) < 1000:
        b = b * 2
    if b.sum() > 0:
        c = b
    else:
        c = 100 * b
    return c

# Draw a random scalar input. No seed is set in the cells shown, so the value
# (and which branch of f is taken) can differ between runs.
a = np.random.normal()
a.attach_grad()
# Record the call so the Python loop and branch executed inside f are differentiated.
with autograd.record():
    d = f(a)

# Store df/da in a.grad.
d.backward()

In [12]:
# Gradient check: f is piecewise linear in a (f(a) = k * a), so df/da must equal d / a.
# Compare with a relative tolerance rather than `==`: on the `100 * b` branch the
# quotient d / a goes through two floating-point roundings and can differ from
# a.grad in the last bit, which would make an exact comparison spuriously False.
np.abs(a.grad - d / a) <= 1e-5 * np.abs(a.grad)

array(True)

In [13]:
print(autograd.is_training()) # Outside record(): prediction mode, prints False
with autograd.record():
    print(autograd.is_training()) # Inside record(): training mode, prints True

# Author's note: comparable to the training/evaluation mode split in PyTorch.

False
True
