# Gradient Checkpointing with `jax.checkpoint` (`jax.remat`)

In [1]:
import jax
import jax.numpy as jnp

def g(W, x):
  """Apply one layer: take the dot product of `W` and `x`, then its sine."""
  return jnp.sin(jnp.dot(W, x))

def f(W1, W2, W3, x):
  """Chain three `g` layers, i.e. compute `g(W3, g(W2, g(W1, x)))`."""
  for W in (W1, W2, W3):
    x = g(W, x)
  return x

# All-ones weights whose shapes chain together: W1 is (5, 4), W2 is (6, 5), W3 is (7, 6).
W1 = jnp.ones((5, 4))
W2 = jnp.ones((6, 5))
W3 = jnp.ones((7, 6))
# Input vector of length 4, matching W1's second dimension.
x = jnp.ones(4)

# Inspect the 'residual' values to be saved on the forward pass
# if you were to evaluate `jax.grad(f)(W1, W2, W3, x)`
from jax.ad_checkpoint import print_saved_residuals
jax.ad_checkpoint.print_saved_residuals(f, W1, W2, W3, x)

f32[5,4] from the argument W1
f32[6,5] from the argument W2
f32[7,6] from the argument W3
f32[4] from the argument x
f32[5] output of sin from <ipython-input-1-ee597810e1e0>:6:9 (g)
f32[5] output of cos from <ipython-input-1-ee597810e1e0>:6:9 (g)
f32[6] output of sin from <ipython-input-1-ee597810e1e0>:6:9 (g)
f32[6] output of cos from <ipython-input-1-ee597810e1e0>:6:9 (g)
f32[7] output of cos from <ipython-input-1-ee597810e1e0>:6:9 (g)


In [2]:
def f2(W1, W2, W3, x):
  """Chain three `g` layers, wrapping every individual call in `jax.checkpoint`."""
  for W in (W1, W2, W3):
    x = jax.checkpoint(g)(W, x)
  return x

# Same inspection for `f2`, where every call to `g` is wrapped in `jax.checkpoint`.
jax.ad_checkpoint.print_saved_residuals(f2, W1, W2, W3, x)

f32[5,4] from the argument W1
f32[6,5] from the argument W2
f32[7,6] from the argument W3
f32[4] from the argument x
f32[5] output of sin from <ipython-input-1-ee597810e1e0>:6:9 (g)
f32[6] output of sin from <ipython-input-1-ee597810e1e0>:6:9 (g)


In [3]:
# Wrap the whole of `f` in a single `jax.checkpoint`, passing the
# `dots_with_no_batch_dims_saveable` policy, then list the saved residuals.
f3 = jax.checkpoint(f, policy=jax.checkpoint_policies.dots_with_no_batch_dims_saveable)
jax.ad_checkpoint.print_saved_residuals(f3, W1, W2, W3, x)

f32[5,4] from the argument W1
f32[6,5] from the argument W2
f32[7,6] from the argument W3
f32[4] from the argument x
f32[5] output of reduce_precision from <ipython-input-1-ee597810e1e0>:5:6 (g)
f32[6] output of reduce_precision from <ipython-input-1-ee597810e1e0>:5:6 (g)
f32[7] output of reduce_precision from <ipython-input-1-ee597810e1e0>:5:6 (g)


In [4]:
from jax.ad_checkpoint import checkpoint_name

def f4(W1, W2, W3, x):
  """Chain three `g` layers, tagging their outputs with the names 'a', 'b' and 'c'."""
  for W, tag in ((W1, 'a'), (W2, 'b'), (W3, 'c')):
    x = checkpoint_name(g(W, x), name=tag)
  return x

# Rebind `f4` to a checkpointed version of itself whose policy is
# `save_only_these_names('a')`, then list the saved residuals.
f4 = jax.checkpoint(f4, policy=jax.checkpoint_policies.save_only_these_names('a'))
jax.ad_checkpoint.print_saved_residuals(f4, W1, W2, W3, x)

f32[5,4] from the argument W1
f32[6,5] from the argument W2
f32[7,6] from the argument W3
f32[4] from the argument x
f32[5] output of reduce_precision from <ipython-input-4-bdb467781e9f>:4:6 (f4)


In [5]:
from jax.tree_util import tree_flatten, tree_unflatten

from rich.console import Console
from rich.table import Table
import rich.text

def print_fwd_bwd(f, *args, **kwargs) -> None:
  """Print the forward and backward jaxprs of `f` side by side in a rich table.

  The positional and keyword arguments are flattened into a list of leaves so
  that `jax.vjp` and `jax.make_jaxpr` only see flat positional inputs.

  Args:
    f: function to inspect; it is called as `f(*args, **kwargs)`.
    *args: positional arguments forwarded to `f`.
    **kwargs: keyword arguments forwarded to `f`.
  """
  leaves, args_tree = tree_flatten((args, kwargs))

  def flat_f(*flat_args):
    # Rebuild the original (args, kwargs) structure before calling `f`.
    pos, kw = tree_unflatten(args_tree, flat_args)
    return f(*pos, **kw)

  # Forward computation: trace the call to `jax.vjp` itself.
  fwd_jaxpr = jax.make_jaxpr(
      lambda *flat_args: jax.vjp(flat_f, *flat_args))(*leaves).jaxpr

  # Flatten the pullback returned by `jax.vjp` so that its leaves can be fed
  # to `make_jaxpr` as ordinary inputs. A separate name is used for this
  # tree structure so it cannot be confused with the one `flat_f` relies on.
  primal_out, pullback = jax.vjp(flat_f, *leaves)
  residuals, pullback_tree = tree_flatten(pullback)

  def flat_bwd(*flat_inputs):
    # The last input is the cotangent; the ones before it rebuild the pullback.
    *res_leaves, cotangent = flat_inputs
    return tree_unflatten(pullback_tree, res_leaves)(cotangent)

  # Backward computation: the primal output stands in as the example cotangent.
  bwd_jaxpr = jax.make_jaxpr(flat_bwd)(*residuals, primal_out).jaxpr

  table = Table(show_header=False, show_lines=True, padding=(1, 2, 0, 2), box=None)
  table.add_row("[bold green]forward computation:",
                "[bold green]backward computation:")
  rendered = [rich.text.Text.from_ansi(str(jaxpr))
              for jaxpr in (fwd_jaxpr, bwd_jaxpr)]
  table.add_row(*rendered)
  Console(width=240, force_jupyter=True).print(table)

def _renderable_repr(self):
  """Return the `html` attribute of `self`, for use as an HTML repr."""
  return self.html
# Install the function above as `_repr_html_` on rich's `JupyterRenderable` class.
rich.jupyter.JupyterRenderable._repr_html_ = _renderable_repr

In [6]:
# Without using `jax.checkpoint`: forward and backward jaxprs of the plain `f`.
print_fwd_bwd(f, W1, W2, W3, x)

In [7]:
# Using `jax.checkpoint` with policy=jax.checkpoint_policies.dots_with_no_batch_dims_saveable
# (`f3` is `f` wrapped with that policy):
print_fwd_bwd(f3, W1, W2, W3, x)

## `jax.checkpoint`

In [8]:
def sin_vjp(x):
  """Return `sin(x)` and a backward function, computing `cos(x)` up front.

  `cos(x)` is evaluated immediately and captured by the returned closure, so
  the backward function only performs a multiplication.
  """
  cos_x = jnp.cos(x)

  def sin_bwd(y_bar):
    return cos_x * y_bar

  return jnp.sin(x), sin_bwd

In [9]:
def sin_vjp2(x):
  """Return `sin(x)` and a backward function that computes `cos(x)` lazily.

  Only `x` is captured by the returned closure; `cos(x)` is evaluated each
  time the backward function is called.
  """
  def sin_bwd(y_bar):
    return jnp.cos(x) * y_bar

  return jnp.sin(x), sin_bwd

In [10]:
def f(x):
  """Compose two functions: compute `h(g(x))`.

  NOTE(review): `h` is not defined in the visible cells and the `g` defined
  earlier takes two arguments, so this cell reads as an illustrative sketch;
  confirm before running it. It also rebinds the name `f` used by earlier cells.
  """
  y = g(x)
  z = h(y)
  return z

def f_vjp(x):
  """Return `h(g(x))` together with a function mapping `z_bar` to `x_bar`.

  Both `jax.vjp` calls run on the forward pass, so `g_vjp` and `h_vjp` both
  exist before `f_bwd` is ever called.
  """
  y, g_vjp = jax.vjp(g, x)
  z, h_vjp = jax.vjp(h, y)
  def f_bwd(z_bar):
    # Apply the pullbacks in the reverse order of the forward computation.
    y_bar, = h_vjp(z_bar)
    x_bar, = g_vjp(y_bar)
    return x_bar
  return z, f_bwd

In [11]:
def f_vjp_checkpoint(x):
  """Variant of a hand-written VJP for `h(g(x))` that re-runs `g` on the backward pass.

  The forward pass calls `g` directly (no `jax.vjp`), so only `h_vjp` is
  built up front; `g_vjp` is created inside `f_bwd2` from the captured `x`.
  """
  y = g(x)
  z, h_vjp = jax.vjp(h, y)
  def f_bwd2(z_bar):
    y_bar, = h_vjp(z_bar)
    # Recompute g's forward pass here to obtain its pullback.
    _, g_vjp = jax.vjp(g, x)
    x_bar, = g_vjp(y_bar)
    return x_bar
  return z, f_bwd2

In [12]:
def f_checkpoint(x):
  """Compute `h(g(x))` with the call to `g` wrapped in `jax.checkpoint`."""
  y = jax.checkpoint(g)(x)
  z = h(y)
  return z

In [13]:
def f_checkpoint_grad(x):
  """Spell out, step by step, a gradient of `h(g(x))` that re-runs `g`.

  `g` is first evaluated without keeping a pullback; its pullback is only
  built in step 4, after `h` has already been differentiated.

  Args:
    x: input passed to `g`.

  Returns:
    `x_bar`, the result of pulling the cotangent `1.0` back through `h` and
    then through `g`.
  """
  y = g(x)                  # step 1: forward g, keeping only its output
  # `jax.vjp` takes the function and its primal inputs in a single call.
  _, h_vjp = jax.vjp(h, y)  # step 2: forward h, building h's pullback
  y_bar, = h_vjp(1.0)       # step 3: backward h
  _, g_vjp = jax.vjp(g, x)  # step 4: re-run g's forward pass to build its pullback
  x_bar, = g_vjp(y_bar)     # step 5: backward g
  return x_bar

In [14]:
def f_grad_bad(x):
  """Gradient schedule that evaluates `f` twice on the forward pass.

  `f` is called once directly (result discarded) and then again inside
  `jax.vjp` before the pullback is applied to the cotangent `1.0`.
  """
  _ = f(x)                  # step 1
  _, f_vjp = jax.vjp(f, x)  # step 2
  x_bar, = f_vjp(1.0)       # step 3
  return x_bar

In [15]:
def f_grad_bad2(x):
  """Gradient schedule that evaluates `h` twice.

  `h(y)` is computed in step 2 and then again inside `jax.vjp(h, y)` in
  step 3, while `g_vjp` from step 1 is held until it is used in step 5.
  """
  y, g_vjp = jax.vjp(g, x)  # step 1
  z = h(y)                  # step 2
  _, h_vjp = jax.vjp(h, y)  # step 3
  y_bar, = h_vjp(1.0)       # step 4
  x_bar, = g_vjp(y_bar)     # step 5
  return x_bar

## What's saveable

In [16]:
def loss(params, x, y):
  """Sum of squared differences between `predict(params, x)` and `y`."""
  residual = predict(params, x) - y
  return jnp.sum(residual**2)

def predict(params, x):
  """Run `x` through `layer` for every weight but the last, then a final dot product.

  Args:
    params: sequence of weights; the last entry is applied with a plain
      `jnp.dot`, all earlier ones through `layer`.
    x: network input.
  """
  *hidden_weights, output_weight = params
  activation = x
  for weight in hidden_weights:
    activation = layer(weight, activation)
  return jnp.dot(output_weight, activation)

def layer(W, x):
  """One layer: the sine of the dot product of `W` and `x`."""
  pre_activation = jnp.dot(W, x)
  return jnp.sin(pre_activation)

In [17]:
# W1, W2 and W3 all refer to the same (4, 4) array of ones; this rebinds the
# W1/W2/W3 and x used by earlier cells.
W1 = W2 = W3 = jnp.ones((4, 4))
params = [W1, W2, W3]
# Input and target vectors of length 4.
x = jnp.ones(4)
y = jnp.ones(4)

In [18]:
# List the residuals saved for `loss` with no `jax.checkpoint` applied.
print_saved_residuals(loss, params, x, y)

f32[4,4] from the argument params[0]
f32[4,4] from the argument params[1]
f32[4,4] from the argument params[2]
f32[4] from the argument x
f32[4] output of sin from <ipython-input-16-507e2be1527b>:12:9 (layer)
f32[4] output of cos from <ipython-input-16-507e2be1527b>:12:9 (layer)
f32[4] output of sin from <ipython-input-16-507e2be1527b>:12:9 (layer)
f32[4] output of cos from <ipython-input-16-507e2be1527b>:12:9 (layer)
f32[4] output of mul from <ipython-input-16-507e2be1527b>:2:17 (loss)


In [19]:
# Wrap `loss` in `jax.checkpoint` with the `dots_with_no_batch_dims_saveable`
# policy and list the residuals saved for the wrapped function.
loss_checkpoint = jax.checkpoint(loss, policy=jax.checkpoint_policies.dots_with_no_batch_dims_saveable)
print_saved_residuals(loss_checkpoint, params, x, y)

f32[4,4] from the argument params[0]
f32[4,4] from the argument params[1]
f32[4,4] from the argument params[2]
f32[4] from the argument x
f32[4] from the argument y
f32[4] output of reduce_precision from <ipython-input-16-507e2be1527b>:12:17 (layer)
f32[4] output of reduce_precision from <ipython-input-16-507e2be1527b>:12:17 (layer)
f32[4] output of reduce_precision from <ipython-input-16-507e2be1527b>:8:6 (predict)


In [20]:
from jax.ad_checkpoint import checkpoint_name

def predict(params, x):
  """Like the earlier `predict`, but names each hidden layer's output.

  The output of the i-th hidden layer is tagged with `checkpoint_name` as
  'layer{i}_output'; the last weight is applied with a plain `jnp.dot`.
  """
  *hidden_weights, output_weight = params
  for i, weight in enumerate(hidden_weights):
    x = checkpoint_name(layer(weight, x), name=f'layer{i}_output')
  return jnp.dot(output_weight, x)

In [21]:
# `loss` looks up `predict` by name when it is called, so this run uses the
# redefinition of `predict` that names its layer outputs.
print_saved_residuals(loss, params, x, y)

f32[4,4] from the argument params[0]
f32[4,4] from the argument params[1]
f32[4,4] from the argument params[2]
f32[4] from the argument x
f32[4] output of cos from <ipython-input-16-507e2be1527b>:12:9 (layer)
f32[4] named 'layer0_output' from <ipython-input-20-ca54e1f99dd8>:7:8 (predict)
f32[4] output of cos from <ipython-input-16-507e2be1527b>:12:9 (layer)
f32[4] named 'layer1_output' from <ipython-input-20-ca54e1f99dd8>:7:8 (predict)
f32[4] output of mul from <ipython-input-16-507e2be1527b>:2:17 (loss)


In [22]:
# Wrap `loss` in `jax.checkpoint` with a policy built by
# `save_any_names_but_these('layer1_output')`, then list the saved residuals.
loss_checkpoint2 = jax.checkpoint(loss, policy=jax.checkpoint_policies.save_any_names_but_these('layer1_output'))
print_saved_residuals(loss_checkpoint2, params, x, y)

f32[4,4] from the argument params[0]
f32[4,4] from the argument params[1]
f32[4,4] from the argument params[2]
f32[4] from the argument x
f32[4] from the argument y


## Offload

In [23]:
from jax.ad_checkpoint import checkpoint

def checkpoint_offload_dot_with_no_batch_dims(self):
  """Define a checkpointed function that uses an offloading policy.

  The policy is created with `offload_dot_with_no_batch_dims("device",
  "pinned_host")` and applied to a function made of three einsum + sine
  stages followed by a sum.

  NOTE(review): this reads like an excerpt from a test method. `self` is
  unused and the decorated function is neither called nor returned, so
  calling this only constructs the policy and the wrapper and returns None.
  """
  # Imported locally because no visible cell imports `functools`; without it
  # the decorator below raises NameError.
  import functools

  policy = jax.checkpoint_policies.offload_dot_with_no_batch_dims(
      "device", "pinned_host")

  @functools.partial(checkpoint, policy=policy)
  def f(x):
    # Reached through the already-imported `jax` package; no visible cell
    # imports a bare `lax`.
    highest = jax.lax.Precision.HIGHEST
    x = jnp.einsum('ij,jk->ik', x, x, precision=highest)
    x = jnp.sin(x)
    x = jnp.einsum('ij,jk->ik', x, x, precision=highest)
    x = jnp.sin(x)
    x = jnp.einsum('ij,jk->ik', x, x, precision=highest)
    x = jnp.sin(x)
    x = jnp.sum(x)
    return x

In [24]:
from jax.ad_checkpoint import checkpoint, checkpoint_name
from jax._src import test_util as jtu

def checkpoint_names_saved_offloaded_recomputed(self):
  """Define a checkpointed scan whose named values use a save/offload policy.

  The policy lists the name "y" as saveable and "z" as offloadable from
  'device' to 'pinned_host'; the name "w" appears in neither list.

  NOTE(review): this reads like an excerpt from a test method. `self` and
  `inp` are unused and the decorated function is neither called nor
  returned. `jtu.create_mesh((2,), ...)` presumably needs at least two
  devices; confirm before running.
  """
  # Imported locally because no visible cell imports these names; without
  # them the body raises NameError.
  import functools
  import math

  import numpy as np
  from jax.sharding import NamedSharding, PartitionSpec as P

  mesh = jtu.create_mesh((2,), ("x",))
  shape = (256, 128)
  np_inp = np.arange(math.prod(shape), dtype=np.float32).reshape(shape)
  s = NamedSharding(mesh, P("x"))
  inp = jax.device_put(np_inp, s)

  policy = jax.checkpoint_policies.save_and_offload_only_these_names(
      names_which_can_be_saved=["y"], names_which_can_be_offloaded=["z"],
      offload_src='device', offload_dst='pinned_host')

  @functools.partial(checkpoint, policy=policy)
  def f(x):
    def g(ys, _):
      # The carry is a pair; only its first element feeds the computation.
      y, _ = ys
      y = checkpoint_name(jnp.sin(y), "y")
      z = checkpoint_name(jnp.sin(y), "z")
      z = z.T
      w = checkpoint_name(jnp.sin(z), "w")
      return (w.T, jnp.sum(w)), None
    # `[0]` selects the final carry returned by scan; its second element is
    # what `f` returns.
    _, scan_out = jax.lax.scan(g, (x, np.array(1, dtype=np.float32)), [np_inp])[0]
    return scan_out

## Recursive `jax.checkpoint`

In [25]:
def chain_compose(funs):
  """Return a function that applies every function in `funs`, in order.

  The first element of `funs` is applied first. With an empty `funs` the
  returned function gives back its argument unchanged.
  """
  def f(x):
    result = x
    for stage in funs:
      result = stage(result)
    return result
  return f

# Compose eight `jnp.sin` calls and list the residuals saved at the input 3.
f = chain_compose([jnp.sin] * 8)
print_saved_residuals(f, 3.)

f32[] output of cos from <ipython-input-25-154fe1d2f1f8>:4:10 (chain_compose.<locals>.f)
f32[] output of cos from <ipython-input-25-154fe1d2f1f8>:4:10 (chain_compose.<locals>.f)
f32[] output of cos from <ipython-input-25-154fe1d2f1f8>:4:10 (chain_compose.<locals>.f)
f32[] output of cos from <ipython-input-25-154fe1d2f1f8>:4:10 (chain_compose.<locals>.f)
f32[] output of cos from <ipython-input-25-154fe1d2f1f8>:4:10 (chain_compose.<locals>.f)
f32[] output of cos from <ipython-input-25-154fe1d2f1f8>:4:10 (chain_compose.<locals>.f)
f32[] output of cos from <ipython-input-25-154fe1d2f1f8>:4:10 (chain_compose.<locals>.f)
f32[] output of cos from <ipython-input-25-154fe1d2f1f8>:4:10 (chain_compose.<locals>.f)


In [26]:
# Same inspection with sixteen `jnp.sin` calls instead of eight.
f = chain_compose([jnp.sin] * 16)
print_saved_residuals(f, 3.)

f32[] output of cos from <ipython-input-25-154fe1d2f1f8>:4:10 (chain_compose.<locals>.f)
f32[] output of cos from <ipython-input-25-154fe1d2f1f8>:4:10 (chain_compose.<locals>.f)
f32[] output of cos from <ipython-input-25-154fe1d2f1f8>:4:10 (chain_compose.<locals>.f)
f32[] output of cos from <ipython-input-25-154fe1d2f1f8>:4:10 (chain_compose.<locals>.f)
f32[] output of cos from <ipython-input-25-154fe1d2f1f8>:4:10 (chain_compose.<locals>.f)
f32[] output of cos from <ipython-input-25-154fe1d2f1f8>:4:10 (chain_compose.<locals>.f)
f32[] output of cos from <ipython-input-25-154fe1d2f1f8>:4:10 (chain_compose.<locals>.f)
f32[] output of cos from <ipython-input-25-154fe1d2f1f8>:4:10 (chain_compose.<locals>.f)
f32[] output of cos from <ipython-input-25-154fe1d2f1f8>:4:10 (chain_compose.<locals>.f)
f32[] output of cos from <ipython-input-25-154fe1d2f1f8>:4:10 (chain_compose.<locals>.f)
f32[] output of cos from <ipython-input-25-154fe1d2f1f8>:4:10 (chain_compose.<locals>.f)
f32[] output of cos from <ipython-input-25-154fe1d2f1f8>:4:10 (chain_compose.<locals>.f)

In [27]:
def recursive_checkpoint(funs):
  """Compose `funs` by recursive halving, checkpointing one half at each split.

  Lists longer than two elements are split in the middle; the composition of
  the second half is wrapped in `jax.checkpoint` and its result is fed to the
  composition of the first half.

  Note that functions are applied right to left (`funs[-1]` runs first),
  which is the opposite order from `chain_compose`. The two agree when all
  entries of `funs` are the same function.

  Args:
    funs: sequence of single-argument functions; must support `len` and
      slicing.

  Returns:
    A single-argument function. For an empty `funs` this is the identity,
    matching what `chain_compose([])` returns.
  """
  count = len(funs)
  if count == 0:
    # Without this guard the halving below never reaches a base case for an
    # empty sequence and recurses until RecursionError.
    return lambda x: x
  if count == 1:
    return funs[0]
  if count == 2:
    f1, f2 = funs
    return lambda x: f1(f2(x))
  mid = count // 2
  f1 = recursive_checkpoint(funs[:mid])
  f2 = recursive_checkpoint(funs[mid:])
  return lambda x: f1(jax.checkpoint(f2)(x))

In [28]:
# Eight `jnp.sin` calls composed with `recursive_checkpoint`; list the residuals saved at the input 3.
f = recursive_checkpoint([jnp.sin] * 8)
print_saved_residuals(f, 3.)

f32[] from the argument x
f32[] output of sin from <ipython-input-27-6da3e02ee513>:6:21 (recursive_checkpoint.<locals>.<lambda>)
f32[] output of cos from <ipython-input-27-6da3e02ee513>:6:24 (recursive_checkpoint.<locals>.<lambda>)
f32[] output of cos from <ipython-input-27-6da3e02ee513>:6:21 (recursive_checkpoint.<locals>.<lambda>)


In [29]:
# Same inspection with sixteen `jnp.sin` calls instead of eight.
f = recursive_checkpoint([jnp.sin] * 16)
print_saved_residuals(f, 3.)

f32[] from the argument x
f32[] output of sin from <ipython-input-27-6da3e02ee513>:6:21 (recursive_checkpoint.<locals>.<lambda>)
f32[] output of sin from <ipython-input-27-6da3e02ee513>:6:21 (recursive_checkpoint.<locals>.<lambda>)
f32[] output of cos from <ipython-input-27-6da3e02ee513>:6:24 (recursive_checkpoint.<locals>.<lambda>)
f32[] output of cos from <ipython-input-27-6da3e02ee513>:6:21 (recursive_checkpoint.<locals>.<lambda>)


In [30]:
# Forward and backward jaxprs for eight `jnp.sin` calls composed with `chain_compose`.
f = chain_compose([jnp.sin] * 8)
print_fwd_bwd(f, 3.)

In [31]:
# Forward and backward jaxprs for eight `jnp.sin` calls composed with `recursive_checkpoint`.
f = recursive_checkpoint([jnp.sin] * 8)
print_fwd_bwd(f, 3.)