In [None]:
import ctypes
import errno
import os

class Nanosleep:
    """Callable wrapper around libc's ``nanosleep`` invoked through ctypes.

    Call an instance with a duration in nanoseconds, e.g. ``ns(1_000_000_000)``
    for one second. The shared object name ``libc.so.6`` is hard-coded, so
    construction only succeeds where that library can be loaded.
    """

    NANOS_PER_SECOND = 1_000_000_000

    def __init__(self):
        # use_errno=True makes the errno of a failed nanosleep call readable
        # through ctypes.get_errno() in __call__.
        self.libc = ctypes.CDLL('libc.so.6', use_errno=True)

        class timespec(ctypes.Structure):
            _fields_ = [("tv_sec", ctypes.c_long),
                        ("tv_nsec", ctypes.c_long)]

        # Request buffer, reused across calls to avoid re-allocating it.
        self.delay_time = timespec(0, 0)
        # Receives the unslept time when the call is interrupted by a signal.
        self._remaining = timespec(0, 0)
        self.libc.nanosleep.argtypes = [ctypes.POINTER(timespec), ctypes.POINTER(timespec)]
        self.libc.nanosleep.restype = ctypes.c_int

    def __call__(self, nanoseconds):
        """Sleep for ``nanoseconds`` nanoseconds.

        Args:
            nanoseconds: Non-negative duration; converted with ``int()``.

        Raises:
            ValueError: If ``nanoseconds`` is negative.
            OSError: If ``nanosleep`` fails for a reason other than EINTR.
        """
        if nanoseconds < 0:
            # A negative request used to be passed to libc, where it failed
            # without the caller ever noticing.
            raise ValueError("nanoseconds must be non-negative")

        request = self.delay_time
        remaining = self._remaining
        request.tv_sec, request.tv_nsec = divmod(int(nanoseconds), self.NANOS_PER_SECOND)

        # Retry with the remaining time if a signal interrupts the sleep, so
        # the full duration is always honoured.
        while self.libc.nanosleep(ctypes.byref(request), ctypes.byref(remaining)) != 0:
            err = ctypes.get_errno()
            if err != errno.EINTR:
                raise OSError(err, os.strerror(err))
            request.tv_sec = remaining.tv_sec
            request.tv_nsec = remaining.tv_nsec

In [None]:
import time

# Nanosleep is defined in the previous cell.

# create an instance of the Nanosleep class
ns = Nanosleep()

# sleep for 1 second using the Nanosleep class;
# perf_counter is monotonic, so the measured interval cannot be distorted
# by system clock adjustments the way time.time() can
start_time = time.perf_counter()
ns(1000_000_000)
end_time = time.perf_counter()

# print the elapsed time
print("Elapsed time: {:.2f} seconds".format(end_time - start_time))

Elapsed time: 1.00 seconds


In [None]:
import torch
import torchvision.models as models
from torch.profiler import profile, record_function, ProfilerActivity



def no_torch_pure_loop(a_tensor):
    """Baseline: run a bare Python loop once per entry of ``a_tensor``.

    Only the length of the first dimension is read; no element is accessed.
    The counter is discarded, so the function returns ``None``.
    """
    n_iterations = a_tensor.size()[0]
    counter = 1
    for _ in range(n_iterations):
        counter = counter + 1

def my_add(total, val):
    """Return ``total + val``; used to measure the cost of a Python call."""
    result = total + val
    return result

def no_torch_pycall(a_tensor):
    """Baseline: one ``my_add`` call per entry of ``a_tensor``.

    Only the length of the first dimension is read; no element is accessed.
    The running value is discarded, so the function returns ``None``.
    """
    n_iterations = a_tensor.size()[0]
    running = 1
    for _ in range(n_iterations):
        running = my_add(running, 1)

def cpu_no_copy(a_tensor):
    """Index every entry along the first axis of ``a_tensor`` and discard it.

    Works on anything exposing ``.shape`` and integer indexing. Nothing is
    accumulated or returned.
    """
    length = a_tensor.shape[0]
    last_seen = 1
    for idx in range(length):
        last_seen = a_tensor[idx]

def cpu_no_copy_nano_sleep(a_tensor):
    """Index every entry of ``a_tensor`` and call ``ns(0)`` after each one.

    ``ns`` is the module-level Nanosleep instance. Nothing is accumulated or
    returned.
    """
    length = a_tensor.shape[0]
    last_seen = 1
    for idx in range(length):
        last_seen = a_tensor[idx]
        ns(0)

def first_sum(a_tensor):
    """Sum ``a_tensor`` by calling ``.cpu()`` on each element and adding it.

    Profiling notes:
    needs to wait for 0.023 - 0.025 ms for each copy
    then 0.005 ms for add on cpu and python
    => 0.030 ms * 2**12 = 122.88 ms

    Returns the float ``0.0`` for an empty input, otherwise a tensor.
    """
    running = 0.0
    n_elements = a_tensor.size()[0]
    for idx in range(n_elements):
        # One .cpu() call per element: this is the cost being measured.
        on_host = a_tensor[idx].cpu()
        running += on_host
    return running

def first_sum2(a_tensor):
    """Sum ``a_tensor`` by calling ``.to('cuda')`` on each element.

    Each element is added in place to a one-element accumulator created on
    the 'cuda' device, which is returned.
    """
    accumulator = torch.zeros(1, device='cuda')
    n_elements = a_tensor.size()[0]
    for idx in range(n_elements):
        # One .to('cuda') call per element: this is the cost being measured.
        on_gpu = a_tensor[idx].to('cuda')
        accumulator += on_gpu
    return accumulator

def second_sum(a_tensor):
    """Sum ``a_tensor`` element by element into a one-element 'cuda' tensor.

    No explicit device transfer is requested inside the loop.

    Profiling notes:
    add_ 0.004 - 0.006 ms -
    select - 0.001 ms
    then after 1k operations get queued (happens after 4.8k events)
    you have to wait with aten:add_ for 2.025 ms
    """
    accumulator = torch.zeros(1, device='cuda')
    n_elements = a_tensor.size()[0]
    for idx in range(n_elements):
        element = a_tensor[idx]
        accumulator += element
    return accumulator

def third_sum(a_tensor):
    """Call ``.cpu()`` once on the whole tensor, then sum it in a Python loop.

    Returns the float ``0.0`` for an empty input, otherwise a tensor.
    """
    host_copy = a_tensor.cpu()
    running = 0.0
    n_elements = host_copy.size()[0]
    for idx in range(n_elements):
        running += host_copy[idx]
    return running


def third_sum_clone(a_tensor):
    """Like ``third_sum``, but rebuild each element as a new tensor first.

    Each element is read with ``.item()`` and wrapped in ``torch.tensor``
    before being added. Returns the float ``0.0`` for an empty input,
    otherwise a tensor.
    """
    host_copy = a_tensor.cpu()
    running = 0.0
    n_elements = host_copy.size()[0]
    for idx in range(n_elements):
        py_value = host_copy[idx].item()
        running += torch.tensor(py_value)
    return running

torch.manual_seed(145) # Fun fact: 145 = 1! + 4! +5!
# 2**12 random float64 values created on the 'cuda' device.
data = torch.rand(2**12, device='cuda', dtype=torch.float64)
# CPU copy used by the CPU benchmark cells below; defined here so those cells
# work when the notebook is run top to bottom on a fresh kernel.
data_cpu = data.cpu()

In [None]:
%%timeit
# Baseline: Python loop making one my_add call per element, no tensor access.
no_torch_pycall(data)

210 µs ± 2.05 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [None]:
%%timeit
# Baseline: bare Python loop over the element count, no call and no tensor access.
no_torch_pure_loop(data)

94.2 µs ± 1.72 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [None]:
%%timeit 
# Cost of a single .cpu() call on the whole `data` tensor.
tensor_on_cpu = data.cpu()

35.1 µs ± 42.5 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [None]:
import numpy as np
# NumPy array built from the CPU copy of `data`, for timing NumPy indexing.
npdata = np.array(data.cpu())

In [None]:
%%timeit 
# Per-element indexing of the NumPy array.
cpu_no_copy(npdata)

199 µs ± 675 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [None]:
%%timeit 
# Per-element indexing of the CPU tensor.
# NOTE(review): data_cpu must already exist when this cell runs; verify it is assigned in an earlier cell.
cpu_no_copy(data_cpu)

2.9 ms ± 13.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
%%timeit 
# Same indexing loop, plus one ns(0) call (a zero-length sleep request) per element.
cpu_no_copy_nano_sleep(data_cpu)

273 ms ± 635 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
%%timeit
# One .cpu() call on the whole tensor, then a Python loop summing its elements.
third_sum(data)

6.54 ms ± 28.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
%%timeit
# third_sum on an input that is already a CPU tensor.
third_sum(data_cpu)

6.33 ms ± 12.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
# Estimated first_sum runtime in ms: about 0.030 ms per element
# (0.025 ms copy wait + 0.005 ms add, per its docstring) times 2**12 elements.
0.030 * 2**12

122.88

In [None]:
%%timeit
# One .cpu() call per element of the CUDA tensor.
sum1 = first_sum(data).cpu()

123 ms ± 2.54 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
%%timeit
# One .to('cuda') call per element of the CPU tensor, then the result is copied back.
sum1 = first_sum2(data_cpu).cpu()

208 ms ± 2.47 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
%%timeit
# Accumulate on the GPU; the result is left on the device (no .cpu() call).
# NOTE(review): without a .cpu() or explicit synchronize this presumably times mostly kernel queueing; confirm.
sum2 = second_sum(data)

1.65 ms ± 60.6 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [None]:
%%timeit
# Same accumulation, followed by copying the result to the CPU.
# NOTE(review): the .cpu() presumably forces a wait for the queued GPU work; confirm.
sum2 = second_sum(data).cpu()

2.26 ms ± 148 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
%%timeit
# Repeats the earlier third_sum(data) timing, this time binding the result to sum3.
sum3 = third_sum(data)

6.25 ms ± 17.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
%%timeit
# third_sum variant that wraps every element in a new tensor via .item() and torch.tensor.
sum3 = third_sum_clone(data)

16.2 ms ± 76.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
# Record CPU and CUDA profiler activity for first_sum.
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    sum1 = first_sum(data).cpu()

# Write the recording as a Chrome trace file.
prof.export_chrome_trace("trace_first_sum.json")

STAGE:2023-02-16 08:29:29 15854:15854 ActivityProfilerController.cpp:294] Completed Stage: Warm Up
STAGE:2023-02-16 08:29:30 15854:15854 ActivityProfilerController.cpp:300] Completed Stage: Collection
STAGE:2023-02-16 08:29:30 15854:15854 output_json.cpp:417] Completed Stage: Post Processing


In [None]:
# CPU copy of `data`, used as the input whose elements first_sum2 moves to 'cuda' one by one.
data_cpu = data.cpu()
# Record CPU and CUDA profiler activity for first_sum2.
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    sum1 = first_sum2(data_cpu).cpu()

# Write the recording as a Chrome trace file.
prof.export_chrome_trace("trace_first_sum2.json")

STAGE:2023-02-16 08:29:33 15854:15854 ActivityProfilerController.cpp:294] Completed Stage: Warm Up
STAGE:2023-02-16 08:29:33 15854:15854 ActivityProfilerController.cpp:300] Completed Stage: Collection
STAGE:2023-02-16 08:29:34 15854:15854 output_json.cpp:417] Completed Stage: Post Processing


In [None]:
# Record CPU and CUDA profiler activity for second_sum.
# Note: the result is bound to sum1 here, although it comes from second_sum.
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    sum1 = second_sum(data).cpu()

# Write the recording as a Chrome trace file.
prof.export_chrome_trace("trace_second_sum.json")

STAGE:2023-02-16 08:29:57 15854:15854 ActivityProfilerController.cpp:294] Completed Stage: Warm Up
STAGE:2023-02-16 08:29:57 15854:15854 ActivityProfilerController.cpp:300] Completed Stage: Collection
STAGE:2023-02-16 08:29:57 15854:15854 output_json.cpp:417] Completed Stage: Post Processing
