<a href="https://colab.research.google.com/github/SalvadorMtz-UTN/EspecializacionIA/blob/main/ECU1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!uv pip install -q --system numba-cuda==0.4.0
import numpy as np
from numba import cuda
import time
import os
from numba import config
config.CUDA_ENABLE_PYNVJITLINK = 1

In [None]:
# CUDA workflow:
#   1. Initialize the data on the CPU (host)
#   2. Transfer the data from CPU to GPU (device)
#   3. Run the kernel with a defined grid/block size (threads)
#   4. Transfer the results from GPU back to CPU
#   5. Free the GPU memory


# CUDA kernel Device
@cuda.jit
def first_kernel(a, result):
    """Copy ``a`` into ``result``, one element per GPU thread.

    Args:
        a: Input array read by the kernel.
        result: Output array; ``result[i]`` is set to ``a[i]`` for every
            index ``i`` below ``a.size``.
    """
    tid = cuda.grid(1)  # this thread's position in the 1-D launch grid
    # The grid is rounded up to whole blocks, so surplus threads past the
    # end of the data must do nothing.
    if tid >= a.size:
        return
    result[tid] = a[tid]


# Host
def main():
    """Time an element-wise array copy on the CPU and on the GPU.

    Builds a 10-million-element float32 array, copies it on the CPU with
    NumPy, then performs the same copy on the GPU with ``first_kernel``.
    The host-to-device transfer, the kernel run and the device-to-host
    transfer are timed separately, every timing is printed in milliseconds,
    and the GPU result is checked against the CPU result.

    Returns:
        None. All results are reported through ``print``.
    """
    # 1.- Initialize data on CPU
    N = 10_000_000
    a_cpu = np.arange(N, dtype=np.float32)

    # -----------------------------
    # CPU computation
    # -----------------------------
    # Make a real copy so the CPU does the same work as the kernel; a plain
    # assignment would only create a second name for the same array.
    start = time.perf_counter()
    result_cpu = a_cpu.copy()
    cpu_time = time.perf_counter() - start
    # Seconds -> milliseconds, matching the "ms" label and the GPU report.
    print(f"CPU time: {cpu_time * 1e3:.2f} ms")

    # -----------------------------
    # GPU computation
    # -----------------------------

    # 2.- Transfer to GPU
    # NOTE(review): if this is the first CUDA call in the process, this
    # interval presumably also contains one-time CUDA start-up cost.
    start = time.perf_counter()
    a_gpu = cuda.to_device(a_cpu)
    result_gpu = cuda.device_array_like(a_cpu)  # allocate device memory for the output
    transfer_in_time = time.perf_counter() - start

    # Kernel launch configuration
    threads_per_block = 128
    # Ceiling division so every element gets a thread:
    # (10_000_000 + 127) // 128 = 78_125 blocks
    blocks_per_grid = (N + threads_per_block - 1) // threads_per_block

    # Warm-up launch (not timed): the first call of a @cuda.jit kernel
    # triggers JIT compilation, which would otherwise dominate kernel_time.
    first_kernel[blocks_per_grid, threads_per_block](a_gpu, result_gpu)
    cuda.synchronize()

    # Timed launch. Kernel launches are asynchronous, so wait for completion
    # before stopping the clock.
    start = time.perf_counter()
    first_kernel[blocks_per_grid, threads_per_block](a_gpu, result_gpu)
    cuda.synchronize()
    kernel_time = time.perf_counter() - start

    # Copy back: transfer the result from GPU to CPU
    start = time.perf_counter()
    result_from_gpu = result_gpu.copy_to_host()
    transfer_out_time = time.perf_counter() - start

    # Report
    print(f"Transfer to GPU time: {transfer_in_time * 1e3:.2f} ms")
    print(f"Kernel execution time: {kernel_time * 1e3:.2f} ms")
    print(f"Transfer from GPU time: {transfer_out_time * 1e3:.2f} ms")
    print(f"Total GPU time:    {(transfer_in_time + kernel_time + transfer_out_time)*1e3:.2f} ms")

    # Cleanup: drop the device array references, then close the CUDA context.
    # result_from_gpu is a host array, so it remains usable afterwards.
    del a_gpu, result_gpu
    cuda.close()

    # Validation
    if np.allclose(result_from_gpu, result_cpu):
        print("✅ GPU and CPU results match!")
    else:
        print("❌ Results differ!")


# Run the benchmark only when this file is executed directly
# (as a script or a notebook cell), not when it is imported.
if __name__ == "__main__":
    main()

CPU time: 1.43 ms
Transfer to GPU time: 90.64 ms
Kernel execution time: 47.88 ms
Transfer from GPU time: 14.83 ms
Total GPU time:    153.36 ms
✅ GPU and CPU results match!
