<a href="https://colab.research.google.com/github/Rontim/GPU-Parallel-Processing-AI/blob/main/gpu_programming/gpu_benchmarking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🚀 Phase 2: GPU Benchmarking with CuPy

In [1]:
# 🔧 Setup
import numpy as np
import cupy as cp
import time

In [2]:
# Confirm GPU availability: report how many CUDA devices the runtime sees
# and print the name of each one.
gpu_count = cp.cuda.runtime.getDeviceCount()
print(f"Number of CUDA-enabled GPUs detected: {gpu_count}")
for i in range(gpu_count):
    # The properties dict stores the device name as bytes, hence the decode().
    device_name = cp.cuda.runtime.getDeviceProperties(i)['name'].decode()
    print(f"GPU {i}: {device_name}")

Number of CUDA-enabled GPUs detected: 1
GPU 0: Tesla T4


## 🔬 Benchmark 1: Matrix Multiplication

In [25]:
# Set size for benchmark: N is the side length of the square matrices built below.
# NOTE(review): at N = 100 the multiplication is tiny, so the measured times are
# likely dominated by timer resolution and call/launch overhead rather than by
# the arithmetic itself — consider a larger N for a meaningful comparison.
N = 100


In [26]:
# Generate random matrices
# A fixed seed makes the inputs (and therefore the CPU-vs-GPU difference that
# is computed from them) reproducible across kernel restarts.
rng = np.random.default_rng(42)
# Draw float32 values directly instead of creating float64 and casting.
A_cpu = rng.random((N, N), dtype=np.float32)
B_cpu = rng.random((N, N), dtype=np.float32)

In [27]:
# NumPy (CPU)
# time.perf_counter() is the monotonic, highest-resolution clock Python offers
# for measuring short intervals; time.time() is wall-clock time and can be too
# coarse (or be adjusted mid-run) for a sub-millisecond measurement like this.
start = time.perf_counter()
C_cpu = np.dot(A_cpu, B_cpu)
cpu_time = time.perf_counter() - start
print(f"CPU (NumPy) time: {cpu_time:.4f} seconds")

CPU (NumPy) time: 0.0004 seconds


In [28]:
# CuPy (GPU): build CuPy copies of both NumPy operands
A_gpu, B_gpu = map(cp.array, (A_cpu, B_cpu))

In [30]:
# Warm-up: run the matrix product once, untimed, before the measured run;
# the result is discarded.
_ = A_gpu.dot(B_gpu)


In [31]:
# Benchmark
# Drain any GPU work that is still queued (e.g. the warm-up) so it is not
# billed to the timed region.
cp.cuda.Device(0).synchronize()
# perf_counter() is monotonic and high-resolution, unlike wall-clock time.time().
start = time.perf_counter()
C_gpu = cp.dot(A_gpu, B_gpu)
# GPU work is launched asynchronously; wait for it to finish before stopping
# the clock, otherwise only the launch would be measured.
cp.cuda.Device(0).synchronize()
gpu_time = time.perf_counter() - start
print(f"GPU (CuPy) time: {gpu_time:.4f} seconds")

GPU (CuPy) time: 0.0004 seconds


In [32]:
# Compare results: copy the GPU product back to the host and report the
# largest absolute element-wise deviation from the NumPy product.
C_gpu_cpu = cp.asnumpy(C_gpu)
abs_error = np.abs(C_cpu - C_gpu_cpu)
max_diff = abs_error.max()
print(f"Max difference between CPU and GPU results: {max_diff}")

Max difference between CPU and GPU results: 1.71661376953125e-05


## 🔬 Benchmark 2: Element-wise Operations

In [21]:
# Large array
# NOTE: this rebinds N, which the matrix-multiplication cells also read;
# re-running those cells after this one would request N x N matrices.
N = 10_000_000
# A fixed seed makes the input (and the CPU-vs-GPU difference computed from
# it) reproducible across kernel restarts.
rng = np.random.default_rng(42)
# Draw float32 values directly instead of creating float64 and casting.
array_cpu = rng.random(N, dtype=np.float32)
array_gpu = cp.array(array_cpu)

In [22]:
# NumPy (CPU)
# time.perf_counter() is the monotonic, highest-resolution clock Python offers
# for measuring short intervals; time.time() is wall-clock time and may be
# coarse or adjusted while the measurement is running.
start = time.perf_counter()
result_cpu = np.sin(array_cpu) + np.exp(array_cpu) * np.log(array_cpu + 1)
cpu_time = time.perf_counter() - start
print(f"Elementwise CPU time: {cpu_time:.4f} seconds")


Elementwise CPU time: 0.0853 seconds


In [23]:
# CuPy (GPU)
# Warm-up: evaluate the expression once, untimed. CuPy compiles and caches its
# kernels on first use, so without this a fresh-kernel run would bill that
# one-off compilation to the benchmark.
_ = cp.sin(array_gpu) + cp.exp(array_gpu) * cp.log(array_gpu + 1)
# Drain the warm-up (and any other queued work) before starting the clock.
cp.cuda.Device(0).synchronize()
# perf_counter() is monotonic and high-resolution, unlike wall-clock time.time().
start = time.perf_counter()
result_gpu = cp.sin(array_gpu) + cp.exp(array_gpu) * cp.log(array_gpu + 1)
# GPU work is launched asynchronously; wait for completion before stopping
# the clock.
cp.cuda.Device(0).synchronize()
gpu_time = time.perf_counter() - start
print(f"Elementwise GPU time: {gpu_time:.4f} seconds")

Elementwise GPU time: 0.0028 seconds


In [24]:
# Compare results: copy the GPU result back to the host and report the
# largest absolute element-wise deviation from the NumPy result.
result_gpu_cpu = cp.asnumpy(result_gpu)
abs_error = np.abs(result_cpu - result_gpu_cpu)
max_diff = abs_error.max()
print(f"Max difference in elementwise operation: {max_diff}")

Max difference in elementwise operation: 7.152557373046875e-07
