In [None]:
################################################################################
################ Carlos Alberto Vidrios Serrano ################################
################################################################################

!uv pip install -q --system numba-cuda==0.4.0
import numpy as np
from numba import cuda
import time
import os
from numba import config
# Switch on Numba's CUDA_ENABLE_PYNVJITLINK option before any kernel is defined.
# NOTE(review): going by the flag name, this selects the pynvjitlink-based linker
# shipped alongside numba-cuda — confirm against the numba-cuda documentation.
config.CUDA_ENABLE_PYNVJITLINK = 1

In [None]:
#CUDA steps:
#1. Inicializar datos del CPU
#2. Transferir del CPU al GPU
#3. Correr el kernel con un tamaño definido de grid/block (Threads)
#4. Transferir los resultados del GPU al CPU
#5. Limpiar la memoria

#1. CUDA kernel
@cuda.jit
def first_kernel(a, result):
    """Element-wise copy kernel: each thread writes ``a[i]`` into ``result[i]``.

    Parameters
    ----------
    a : 1-D array
        Input values to read.
    result : 1-D array
        Output buffer; written at every index below ``a.size``.
    """
    # Flattened global index of this thread in a 1-D launch.
    i = cuda.grid(1)

    # The grid is rounded up to whole blocks, so threads whose index falls
    # past the end of the input have nothing to do.
    if i >= a.size:
        return

    result[i] = a[i]


def main(n=10_000_000, threads_per_block=128):
    """Benchmark a host-side array copy against the same copy done by ``first_kernel``.

    Builds a float32 ramp of ``n`` elements, copies it on the CPU, then copies
    it on the GPU (host -> device transfer, kernel, device -> host transfer),
    timing each stage separately and printing the timings in milliseconds.

    Parameters
    ----------
    n : int, optional
        Number of elements to copy. Defaults to 10,000,000.
    threads_per_block : int, optional
        Threads per CUDA block used for the kernel launch. Defaults to 128.

    Raises
    ------
    ValueError
        If ``n`` or ``threads_per_block`` is not positive.
    RuntimeError
        If the array copied back from the GPU differs from the CPU reference.
    """
    if n <= 0 or threads_per_block <= 0:
        raise ValueError("n and threads_per_block must be positive")

    # 1. Initialize data on CPU
    a_cpu = np.arange(n, dtype=np.float32)

    # -------------------------------
    # CPU computation
    # -------------------------------
    # Use a real copy as the baseline: a plain `result_cpu = a_cpu` only binds
    # a second name to the same array and therefore measures nothing.
    start = time.perf_counter()
    result_cpu = a_cpu.copy()
    cpu_time = time.perf_counter() - start
    print(f"CPU time: {cpu_time * 1e3:.2f} ms")

    # -------------------------------
    # GPU computation
    # -------------------------------
    # 2. Transfer to GPU (input upload + output buffer allocation)
    start = time.perf_counter()
    a_gpu = cuda.to_device(a_cpu)
    result_gpu = cuda.device_array_like(a_cpu)
    transfer_in_time = time.perf_counter() - start

    # 3. Kernel launch. Ceil-divide so the grid covers all n elements.
    blocks_per_grid = (n + (threads_per_block - 1)) // threads_per_block
    launch = first_kernel[blocks_per_grid, threads_per_block]

    # Untimed warm-up: the first launch of a @cuda.jit kernel triggers JIT
    # compilation, which would otherwise dominate the measured kernel time.
    launch(a_gpu, result_gpu)
    cuda.synchronize()

    start = time.perf_counter()
    launch(a_gpu, result_gpu)
    # Launches are asynchronous; wait for completion before stopping the clock.
    cuda.synchronize()
    kernel_time = time.perf_counter() - start

    # 4. Copy back
    start = time.perf_counter()
    result_from_gpu = result_gpu.copy_to_host()
    cuda.synchronize()
    transfer_out_time = time.perf_counter() - start

    # Timings are only meaningful if the GPU actually produced the right data.
    if not np.array_equal(result_from_gpu, result_cpu):
        raise RuntimeError("GPU result does not match the CPU reference")

    # Report
    print(f"GPU transfer to device: {transfer_in_time*1e3:.2f} ms")
    print(f"GPU kernel execution: {kernel_time*1e3:.2f} ms")
    print(f"GPU transfer to host: {transfer_out_time*1e3:.2f} ms")
    print(f"Total GPU time: {(transfer_in_time + kernel_time + transfer_out_time)* 1e3:.2f} ms")

    # 5. Cleanup: drop the device array references, then close the CUDA context.
    del a_gpu, result_gpu
    cuda.close()

# Run the benchmark only when this code is executed as the main module
# (not when it is imported from elsewhere).
if __name__ == "__main__":
  main()

CPU time: 0.00 ms
GPU transfer to device: 159.93 ms
GPU kernel execution: 1520.26 ms
GPU transfer to host: 16.47 ms
Total GPU time: 1696.65 ms
