In [None]:
import torch
from torch import nn
from torch.utils.checkpoint import checkpoint_sequential, checkpoint
from tqdm.notebook import tqdm

# The models and tensors whose memory is measured in this notebook are created on this
# device; the measurements themselves read torch.cuda.* counters, so a CUDA GPU is required.
device = torch.device('cuda')

def sizeof_fmt(num, suffix='B'):
    """Format a byte count as a human-readable string with binary (1024-based) prefixes.

    Args:
        num: The quantity to format (e.g. a number of bytes). May be negative.
        suffix: Unit appended after the prefix, ``'B'`` by default.

    Returns:
        A string such as ``"977.0KiB"``: the value is divided by 1024 until its
        magnitude drops below 1024 and is printed with one decimal. Values too
        large for the ``Zi`` prefix are reported with ``Yi``.
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    scaled = num
    idx = 0
    # Step to the next prefix for as long as the magnitude is not yet below 1024.
    while idx < len(prefixes) and not abs(scaled) < 1024.0:
        scaled /= 1024.0
        idx += 1
    if idx < len(prefixes):
        return "%3.1f%s%s" % (scaled, prefixes[idx], suffix)
    # Ran out of prefixes: whatever is left is expressed in Yi.
    return "%.1f%s%s" % (scaled, 'Yi', suffix)

In [None]:
# Smoke test: run a single Linear layer through an activation checkpoint (on CPU).
f = nn.Linear(10,10)
x = torch.randn((1, 10))
# use_reentrant is passed explicitly: relying on the implicit default is deprecated and
# emits a warning. The non-reentrant mode is the one PyTorch recommends, and it also
# works when, as here, the input does not require grad.
checkpoint(f, x, use_reentrant=False)

  return fn(*args, **kwargs)


tensor([[-0.0846,  0.6789,  0.0073,  0.0761, -1.1562, -0.7660, -0.0578,  0.3709,
          0.1660,  0.5099]])

## Simple model

In [None]:
nn_depth = 12

# Build the model: `nn_depth` bias-free 500x500 Linear layers, created directly on `device`.
list_fc = [
    nn.Linear(500, 500, bias=False, device=device)
    for _ in range(nn_depth)
]
fc_nn = nn.Sequential(*list_fc)
fc_nn

Sequential(
  (0): Linear(in_features=500, out_features=500, bias=False)
  (1): Linear(in_features=500, out_features=500, bias=False)
  (2): Linear(in_features=500, out_features=500, bias=False)
  (3): Linear(in_features=500, out_features=500, bias=False)
  (4): Linear(in_features=500, out_features=500, bias=False)
  (5): Linear(in_features=500, out_features=500, bias=False)
  (6): Linear(in_features=500, out_features=500, bias=False)
  (7): Linear(in_features=500, out_features=500, bias=False)
  (8): Linear(in_features=500, out_features=500, bias=False)
  (9): Linear(in_features=500, out_features=500, bias=False)
  (10): Linear(in_features=500, out_features=500, bias=False)
  (11): Linear(in_features=500, out_features=500, bias=False)
)

In [None]:
# Measure how many bytes the CUDA allocator reports for one 500x500 tensor by reading
# the allocation counter immediately before and after creating it.
save_before = torch.cuda.memory_allocated()
x = torch.randn([500,500], device=device)
# NOTE(review): the recorded value (1000448) is slightly above 500*500*4 = 1000000;
# presumably the allocator rounds the request up to its block size - confirm.
x_mem_size = torch.cuda.memory_allocated() - save_before
print(f"memory size of x: {x_mem_size}")

memory size of x: 1000448


In [None]:
def test0():
    """Print how much CUDA memory remains allocated after one forward pass.

    Reads the notebook-level ``fc_nn`` (the network) and ``x`` (its input) and
    reports the growth of ``torch.cuda.memory_allocated()`` across ``fc_nn(x)``.
    """
    # TODO: use nn_depth and x_mem_size to predict the allocation during fwd

    baseline = torch.cuda.memory_allocated()

    # The output stays bound to a name so it is still alive when the counter is read.
    y = fc_nn(x)
    fwd_alloc = torch.cuda.memory_allocated() - baseline
    print(f"memory allocation in fwd: {sizeof_fmt(fwd_alloc)}")

test0()

memory allocation in fwd: 11.4MiB


In [None]:
nn_depth = 1
batch_size = 500

# Build the model: `nn_depth` bias-free 500x500 Linear layers, created directly on `device`.
list_fc = [
    nn.Linear(500, 500, bias=False, device=device)
    for _ in range(nn_depth)
]
fc_nn = nn.Sequential(*list_fc)

def test1(batch_size):
    """Report the CUDA memory allocated by the backward pass of the notebook-level ``fc_nn``.

    A fresh ``[batch_size, 500]`` input is pushed through ``fc_nn``; the memory
    counters are baselined *after* the forward pass, so the two reported numbers
    (net allocation and peak allocation) cover ``loss.backward()`` only.

    Args:
        batch_size: Number of rows of the random input tensor.
    """
    x = torch.randn([batch_size,500], device=device)
    # Byte sizes are derived from the tensors' own element size instead of assuming 4 bytes.
    x_mem_size = x.numel() * x.element_size()
    weight = fc_nn.get_submodule('0').weight
    w_mem_size = weight.numel() * weight.element_size()

    print(f"memory size of x: {x_mem_size}")
    print(f"memory size of w: {w_mem_size}")

    y = fc_nn(x)
    print(f"shape of y: {y.shape}\n")

    # Baseline both counters only now, so the forward pass is excluded from the report.
    torch.cuda.reset_peak_memory_stats()
    max_before = torch.cuda.max_memory_allocated()
    save_before = torch.cuda.memory_allocated()

    loss = y.mean()
    loss.backward()
    # TODO: predict the allocation during bwd
    # predict_mem_in_bwd = ...

    print(f"memory allocation in bwd: {sizeof_fmt(torch.cuda.memory_allocated() - save_before)}")
    # print(f"predict memory allocation in bwd: {sizeof_fmt(predict_mem_in_bwd)}\n")

    # TODO: predict the max allocation during bwd
    # predict_max_in_bwd = ...

    print(f"max memory allocation in bwd: {torch.cuda.max_memory_allocated() - max_before}")
    # print(f"predict max memory allocation in bwd: {predict_max_in_bwd}")

    # Drop the gradients so that a second call allocates them again and reports the
    # same numbers (otherwise .grad persists on fc_nn between calls).
    fc_nn.zero_grad(set_to_none=True)

test1(batch_size)

memory size of x: 1000000
memory size of w: 1000000
shape of y: torch.Size([500, 500])

memory allocation in bwd: 9.1MiB
max memory allocation in bwd: 10521600


## Torch.checkpoint examples

In [None]:
nn_depth = 24
batch_size = 10000

# Build the model: `nn_depth` bias-free 500x500 Linear layers, created directly on `device`.
list_fc = [
    nn.Linear(500, 500, bias=False, device=device)
    for _ in range(nn_depth)
]
fc_nn = nn.Sequential(*list_fc)

# Notebook-level input batch; it is created with requires_grad=True.
x = torch.randn([batch_size,500], device=device, requires_grad=True)

def test2(verbose=True):
    """Run one forward/backward pass of the notebook-level ``fc_nn`` without checkpointing.

    A fresh ``[batch_size, 500]`` input (``batch_size`` is read from the notebook
    namespace) is created, the CUDA memory counters are baselined, and the
    allocation after the forward pass, after the backward pass, and the peak over
    both are reported. Gradients on ``fc_nn`` are cleared before returning.

    Args:
        verbose: When False nothing is printed; the passes still run, which is
            what the timing cells rely on.
    """
    x = torch.randn([batch_size,500], device=device, requires_grad=True)

    # Byte sizes are derived from the tensors' own element size instead of assuming 4 bytes.
    x_mem_size = x.numel() * x.element_size()
    weight = fc_nn.get_submodule('0').weight
    w_mem_size = weight.numel() * weight.element_size()

    # Baseline after x exists, so every number below excludes the input itself.
    torch.cuda.reset_peak_memory_stats()
    max_before = torch.cuda.max_memory_allocated()
    save_before = torch.cuda.memory_allocated()

    if verbose:
        print(f"memory size of x: {sizeof_fmt(x_mem_size)}")
        print(f"memory size of w: {sizeof_fmt(w_mem_size)}")

    y = fc_nn(x)
    if verbose:
        print(f"memory allocation in fwd: {sizeof_fmt(torch.cuda.memory_allocated() - save_before)}\n")

    loss = y.mean()
    loss.backward()

    if verbose:
        print(f"memory allocation in bwd: {sizeof_fmt(torch.cuda.memory_allocated() - save_before)}")
        print(f"max memory allocation in bwd: {sizeof_fmt(torch.cuda.max_memory_allocated() - max_before)}")
    fc_nn.zero_grad()

In [None]:
# The original line was an unfinished statement with a misspelled package name and
# raised SyntaxError. Completed to the checkpoint helpers the following cells use.
from torch.utils.checkpoint import checkpoint, checkpoint_sequential

SyntaxError: invalid syntax (<ipython-input-26-9d1e08adf879>, line 1)

In [None]:
nn_depth = 24
batch_size = 10000
n_segments = 6  # How many checkpoints to insert

# Build the model: `nn_depth` pairs of (bias-free 500x500 Linear, GELU), in that order.
list_fc = [
    layer
    for _ in range(nn_depth)
    for layer in (
        nn.Linear(500, 500, bias=False, device=device),
        nn.GELU().to(device),
    )
]
fc_nn = nn.Sequential(*list_fc)

# Notebook-level input batch; it is created with requires_grad=True.
x = torch.randn([batch_size,500], device=device, requires_grad=True)

def test3(verbose=True):
    """Run one forward/backward pass of ``fc_nn`` with every GELU wrapped in a checkpoint.

    Mirrors ``test2``: a fresh ``[batch_size, 500]`` input is created, the CUDA
    memory counters are baselined, and the allocation after forward, after
    backward, and the peak are reported. The only difference is that ``nn.GELU``
    layers are called through ``torch.utils.checkpoint.checkpoint``; all other
    layers are called directly. Gradients on ``fc_nn`` are cleared before returning.

    Args:
        verbose: When False nothing is printed; the passes still run, which is
            what the timing cells rely on.
    """
    x = torch.randn([batch_size,500], device=device, requires_grad=True)

    # Byte sizes are derived from the tensors' own element size instead of assuming 4 bytes.
    x_mem_size = x.numel() * x.element_size()
    weight = fc_nn.get_submodule('0').weight
    w_mem_size = weight.numel() * weight.element_size()

    # Baseline after x exists, so every number below excludes the input itself.
    torch.cuda.reset_peak_memory_stats()
    max_before = torch.cuda.max_memory_allocated()
    save_before = torch.cuda.memory_allocated()

    if verbose:
        print(f"memory size of x: {sizeof_fmt(x_mem_size)}")
        print(f"memory size of w: {sizeof_fmt(w_mem_size)}")

    # Alternative: checkpoint whole segments of the network instead of single layers.
    #y = checkpoint_sequential(fc_nn, n_segments, x, use_reentrant=False)
    y = x
    for layer in fc_nn:
        if isinstance(layer, nn.GELU):
            # use_reentrant is passed explicitly: omitting it is deprecated and warns.
            # False selects the non-reentrant implementation PyTorch recommends.
            y = checkpoint(layer, y, use_reentrant=False)
        else:
            y = layer(y)

    if verbose:
        print(f"memory allocation in fwd: {sizeof_fmt(torch.cuda.memory_allocated() - save_before)}\n")

    loss = y.mean()
    loss.backward()

    if verbose:
        print(f"memory allocation in bwd: {sizeof_fmt(torch.cuda.memory_allocated() - save_before)}")
        print(f"max memory allocation in bwd: {sizeof_fmt(torch.cuda.max_memory_allocated() - max_before)}")
    fc_nn.zero_grad()

In [None]:
# Side-by-side memory report: plain forward/backward (test2) vs. checkpointed GELUs (test3).
# Both functions read the notebook-level fc_nn when they are called, so they run on
# whichever network was built most recently.
# NOTE(review): test2/test3 each draw their own local x, so this notebook-level x does
# not appear to be read by them - confirm whether it is still needed.
x = torch.randn([batch_size,500], device=device, requires_grad=True)

print("=====original module=====")
test2()

print()
print("=====checkpoint module=====")
test3()


=====original module=====
memory size of x: 19.1MiB
memory size of w: 976.6KiB
memory allocation in fwd: 960.0MiB

memory allocation in bwd: 62.9MiB
max memory allocation in bwd: 1000.0MiB

=====checkpoint module=====
memory size of x: 19.1MiB
memory size of w: 976.6KiB
memory allocation in fwd: 960.0MiB

memory allocation in bwd: 62.9MiB
max memory allocation in bwd: 1020.0MiB


  return fn(*args, **kwargs)


In [None]:
print("training time for the original module")
# CUDA kernels are launched asynchronously and test2(verbose=False) never waits for
# them, so without a synchronisation point tqdm would time kernel *launches*.
# Drain pending work first, then wait for the GPU after every iteration.
torch.cuda.synchronize()
for _ in tqdm(range(100)):
    test2(verbose=False)
    torch.cuda.synchronize()

training time for the original module


  0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
print("training time for the checkpoint module")
# CUDA kernels are launched asynchronously and test3(verbose=False) never waits for
# them, so without a synchronisation point tqdm would time kernel *launches*.
# Drain pending work first, then wait for the GPU after every iteration.
torch.cuda.synchronize()
for _ in tqdm(range(100)):
    test3(verbose=False)
    torch.cuda.synchronize()

training time for the checkpoint module


  0%|          | 0/100 [00:00<?, ?it/s]

## Understand Peak memory (Homework)

In [None]:
batch_size = 5000
nn_depth = 48

# Build the model: `nn_depth` bias-free 500x500 Linear layers, created directly on `device`.
list_fc = [
    nn.Linear(500, 500, bias=False, device=device)
    for _ in range(nn_depth)
]
fc_nn = nn.Sequential(*list_fc)

def max_prediction(nn_depth, w_mem_size, y_mem_size):
    """Homework placeholder: predict the peak CUDA memory of a forward+backward pass.

    Args:
        nn_depth: Number of layers in the network.
        w_mem_size: Size in bytes of one layer's weight.
        y_mem_size: Size in bytes of one layer's output.

    Returns:
        The largest entry of ``mem_by_layer``. Until the TODO is filled in the
        list only holds its initial 0, so the function returns 0.
    """
    # Starts with a single 0 so that max() is defined before any estimate is added.
    mem_by_layer = [0]
    #TODO: predict memory peak of each layer

    peak = max(mem_by_layer)
    return peak

def HW1(batch_size=500, nn_depth = 6, verbose=True):
    """Check the homework peak-memory prediction against a real forward/backward pass.

    Runs the notebook-level ``fc_nn`` on a fresh ``[batch_size, 500]`` input,
    measures allocated and peak CUDA memory (relative to the state right after
    the input was created), and compares the measured peak with
    ``max_prediction``. A mismatch of 10% or more is always reported; the
    success message and all other output are printed only when ``verbose``.

    Args:
        batch_size: Number of rows of the random input.
        nn_depth: Depth used in the printed expectations and passed to
            ``max_prediction``; the network that actually runs is ``fc_nn``.
        verbose: Print the detailed measurements.
    """
    alloc_start = torch.cuda.memory_allocated()
    x = torch.randn([batch_size,500], device=device)
    if verbose:
        x_alloc = torch.cuda.memory_allocated() - alloc_start
        print(f"depth of nn: {nn_depth}")
        print(f"batch size: {batch_size}")
        print(f"size of x: {sizeof_fmt(x_alloc)}\n")

    # Re-baseline both counters so that x itself is excluded from what follows.
    alloc_start = torch.cuda.memory_allocated()
    torch.cuda.reset_peak_memory_stats()
    peak_start = torch.cuda.max_memory_allocated()

    # Running Forward
    y = fc_nn(x)

    first_weight = fc_nn.get_submodule('0').weight
    y_mem_size = 4 * y.numel()
    w_mem_size = 4 * first_weight.numel()

    if verbose:
        # The same estimate (one output per layer) is shown for both forward figures.
        fwd_expected = sizeof_fmt(nn_depth * y_mem_size)
        print(f"save mem after fwd: {sizeof_fmt(torch.cuda.memory_allocated() - alloc_start)}")
        print(f"this should be close to: {fwd_expected}\n")
        print(f"max mem after fwd: {sizeof_fmt(torch.cuda.max_memory_allocated() - peak_start)}")
        print(f"this should be close to: {fwd_expected}\n")

    # Running Backward
    loss = y.mean()
    loss.backward()

    max_real = torch.cuda.max_memory_allocated() - peak_start
    max_pred = max_prediction(nn_depth, w_mem_size, y_mem_size)
    if verbose:
        print(f"save mem after bwd:{sizeof_fmt(torch.cuda.memory_allocated() - alloc_start)}")
        print(f"this should be close to:{sizeof_fmt(nn_depth * w_mem_size + y_mem_size)}\n")
        print(f"max mem after bwd: {sizeof_fmt(max_real)}")
        print(f"this should be close to:{sizeof_fmt(max_pred)}")

    # Relative error of the prediction; anything not below 10% counts as a miss.
    rel_err = abs(max_pred - max_real) / max_real
    if not rel_err < 0.1:
        print(f"Something is wrong with your prediction when batch_size = {batch_size}")
    elif verbose:
        print("Yes they are close")
    fc_nn.zero_grad(set_to_none=True)

HW1(batch_size, nn_depth)

depth of nn: 48
batch size: 5000
size of x: 10.5MiB

save mem after fwd: 480.0MiB
this should be close to: 457.8MiB

max mem after fwd: 480.0MiB
this should be close to: 457.8MiB

save mem after bwd:56.3MiB
this should be close to:55.3MiB

max mem after bwd: 501.0MiB
this should be close to:0.0B
Something is wrong with your prediction when batch_size = 5000


In [None]:
test_batch_sizes = [
    500,
    300, 400, 450, 480,
    520, 550, 600, 700,
    1000, 2000, 3000, 10000,
    10, 20, 30, 100,
]

# Homework 1: if the prediction function is correct, no "Something is wrong" message is printed
for batch_size in test_batch_sizes:
    HW1(batch_size, nn_depth, verbose=False)

## Torch.checkpoint (Homework)

In [None]:
batch_size = 5000
nn_depth = 48
n_segments = 2

# Build the model: `nn_depth` bias-free 500x500 Linear layers, created directly on `device`.
list_fc = [
    nn.Linear(500, 500, bias=False, device=device)
    for _ in range(nn_depth)
]
fc_nn = nn.Sequential(*list_fc)

def max_prediction_fwd_checkpoint(nn_depth, w_mem_size, y_mem_size, n_segments):
    """Homework placeholder: predict the peak CUDA memory of the checkpointed forward pass.

    Args:
        nn_depth: Number of layers in the network.
        w_mem_size: Size in bytes of one layer's weight.
        y_mem_size: Size in bytes of one layer's output.
        n_segments: Number of checkpoint segments; must be non-zero.

    Returns:
        0 until the TODO is filled in.
    """
    # Layers per segment (integer division; any remainder is dropped).
    segment_size = nn_depth // n_segments
    #TODO: predict memory peak of each layer
    predicted_peak = 0
    return predicted_peak

def max_prediction_bwd_checkpoint(nn_depth, w_mem_size, y_mem_size, n_segments):
    """Homework placeholder: predict the peak CUDA memory of the checkpointed backward pass.

    Args:
        nn_depth: Number of layers in the network.
        w_mem_size: Size in bytes of one layer's weight.
        y_mem_size: Size in bytes of one layer's output.
        n_segments: Number of checkpoint segments; must be non-zero.

    Returns:
        The largest entry of ``mem_by_layer``. Until the TODO is filled in the
        list only holds its initial 0, so the function returns 0.
    """
    # Layers per segment (integer division; any remainder is dropped).
    segment_size = nn_depth // n_segments
    # Starts with a single 0 so that max() is defined before any estimate is added.
    mem_by_layer = [0]
    #TODO: predict memory peak of each layer

    peak = max(mem_by_layer)
    return peak

def HW2(batch_size=500, nn_depth = 48, n_segments=1, verbose=True):
    """Check the homework memory predictions for ``fc_nn`` run through ``checkpoint_sequential``.

    A fresh ``[batch_size, 500]`` input is created, the CUDA memory counters are
    baselined after it, and the measured peak after the forward pass and after
    the backward pass are compared with ``max_prediction_fwd_checkpoint`` and
    ``max_prediction_bwd_checkpoint``. A mismatch of 10% or more is always
    reported; the success messages and all other output are printed only when
    ``verbose``. Gradients on ``fc_nn`` are cleared before returning.

    Args:
        batch_size: Number of rows of the random input.
        nn_depth: Depth used in the printed expectations and passed to the
            prediction functions; the network that actually runs is ``fc_nn``.
        n_segments: Number of segments handed to ``checkpoint_sequential``.
        verbose: Print the detailed measurements.
    """
    save_before = torch.cuda.memory_allocated()

    x = torch.randn([batch_size,500], device=device, requires_grad=True)
    x_mem_size = torch.cuda.memory_allocated()-save_before
    if verbose:
        print(f"depth of nn: {nn_depth}")
        print(f"batch size: {batch_size}")
        print(f"size of x: {sizeof_fmt(x_mem_size)}\n")

    # Re-baseline both counters so that x itself is excluded from what follows.
    save_before = torch.cuda.memory_allocated()
    torch.cuda.reset_peak_memory_stats()
    max_before = torch.cuda.max_memory_allocated()

    # Running Forward
    # use_reentrant is passed explicitly because omitting it is deprecated and warns.
    # True is the value the implicit default resolved to, so the memory behaviour this
    # homework asks to predict is unchanged.
    y = checkpoint_sequential(fc_nn, n_segments, x, use_reentrant=True)

    # Byte sizes are derived from the tensors' own element size instead of assuming 4 bytes.
    weight = fc_nn.get_submodule('0').weight
    y_mem_size = y.numel() * y.element_size()
    w_mem_size = weight.numel() * weight.element_size()

    max_real_fwd = torch.cuda.max_memory_allocated()-max_before
    max_pred_fwd = max_prediction_fwd_checkpoint(nn_depth, w_mem_size, y_mem_size, n_segments)
    if verbose:
        print(f"save mem after fwd: {sizeof_fmt(torch.cuda.memory_allocated()-save_before)}")
        print(f"this should be close to: {sizeof_fmt(max_pred_fwd)}\n")
        print(f"max mem after fwd: {sizeof_fmt(max_real_fwd)}")
        print(f"this should be close to: {sizeof_fmt(max_pred_fwd)}")

    # Relative error of the forward prediction; 10% or more counts as a miss.
    if abs(max_pred_fwd-max_real_fwd)/max_real_fwd < 0.1:
        if verbose: print("Yes they are close\n")
    else:
        print(f"Something is wrong with your prediction when batch_size = {batch_size}, n_segments = {n_segments}\n")


    # Running Backward
    loss = y.mean()
    loss.backward()

    # The peak counter was not reset after the forward pass, so this is the peak
    # over forward and backward together.
    max_real_bwd = torch.cuda.max_memory_allocated()-max_before
    max_pred_bwd = max_prediction_bwd_checkpoint(nn_depth, w_mem_size, y_mem_size, n_segments)
    if verbose:
        print(f"save mem after bwd:{sizeof_fmt(torch.cuda.memory_allocated()-save_before)}")
        print(f"this should be close to:{sizeof_fmt(nn_depth*w_mem_size+2*y_mem_size)}\n")
        print(f"max mem after bwd: {sizeof_fmt(max_real_bwd)}")
        print(f"this should be close to:{sizeof_fmt(max_pred_bwd)}")


    # Relative error of the backward prediction; 10% or more counts as a miss.
    if abs(max_pred_bwd-max_real_bwd)/max_real_bwd < 0.1:
        if verbose: print("Yes they are close")
    else:
        print(f"Something is wrong with your prediction when batch_size = {batch_size}, n_segments = {n_segments}")
    fc_nn.zero_grad(set_to_none=True)

HW2(batch_size, nn_depth, n_segments)

depth of nn: 48
batch size: 5000
size of x: 10.5MiB

save mem after fwd: 249.5MiB
this should be close to: 0.0B

max mem after fwd: 249.5MiB
this should be close to: 0.0B
Something is wrong with your prediction when batch_size = 5000, n_segments = 2

save mem after bwd:65.8MiB
this should be close to:64.8MiB

max mem after bwd: 294.3MiB
this should be close to:0.0B
Something is wrong with your prediction when batch_size = 5000, n_segments = 2


In [None]:
test_batch_sizes = [
    500,
    300, 400, 450, 480,
    520, 550, 600, 700,
    1000, 2000, 3000, 10000,
    10, 20, 30, 100,
]

test_n_segments = [1, 2, 4, 6, 8, 12, 16, 24, 48]

# Homework 2: if the prediction functions are correct, no "Something is wrong" message is printed
for batch_size in test_batch_sizes:
    for n_segments in test_n_segments:
        HW2(batch_size, nn_depth, n_segments, verbose=False)