In [None]:
!pip install cupy-cuda11x

In [None]:
!nvcc --version

In [None]:
!pip uninstall -y cupy cupy-cuda12x

In [None]:
!apt-get update -y
!apt-get install -y cuda-toolkit-11-8

In [None]:
!pip install cupy-cuda11x

In [None]:
!nvcc --version

In [None]:
!pip install -U cupy-cuda12x

In [None]:
!nvcc --version


In [None]:
!apt-get remove --purge '^cuda.*' -y
!apt-get remove --purge '^nvidia-.*' -y
!apt-get autoremove -y
!apt-get clean
!pip uninstall -y cupy cupy-cuda*


In [None]:
!pip install cupy-cuda12x==13.1.0


In [3]:
import cupy as cp
print(cp.cuda.is_available())
gpu_array = cp.array([1, 2, 3, 4, 5])
print(gpu_array)


True
[1 2 3 4 5]


In [4]:
a = cp.array([1, 2, 3])
b = cp.array([4, 5, 6])

print(a + b)
print(a * b)
print(cp.exp(a))

[5 7 9]
[ 4 10 18]
[ 2.71828183  7.3890561  20.08553692]


In [5]:
import numpy as np
cpu_array = np.array([1, 2, 3])
gpu_array = cp.array(cpu_array)
cpu_result = cp.asnumpy(gpu_array)

print("Numpy (CPU) Array:", cpu_array)
print("CuPy (GPU) Array:", gpu_array)
print("Converted:", cpu_result)

Numpy (CPU) Array: [1 2 3]
CuPy (GPU) Array: [1 2 3]
Converted: [1 2 3]


In [2]:
import cupy as cp
import numpy as np
import time

# Create a large NumPy array (100 million elements)
cpu_array = np.random.rand(100_000_000)

# Move to GPU
start_gpu = time.time()
gpu_array = cp.array(cpu_array)  # Transfer CPU -> GPU
end_gpu = time.time()

# Move back to CPU
start_cpu = time.time()
cpu_result = cp.asnumpy(gpu_array)  # Transfer GPU -> CPU
end_cpu = time.time()

print(f"CPU -> GPU Transfer Time: {end_gpu - start_gpu:.6f} sec")
print(f"GPU -> CPU Transfer Time: {end_cpu - start_cpu:.6f} sec")

# Measure repeated transfers
repeats = 10
start_repeat = time.time()
for _ in range(repeats):
    gpu_array = cp.array(cpu_array)  # CPU -> GPU
    cpu_result = cp.asnumpy(gpu_array)  # GPU -> CPU
end_repeat = time.time()

print(f"Average Transfer Time (CPU ↔ GPU, {repeats} times): {(end_repeat - start_repeat) / repeats:.6f} sec")


CPU -> GPU Transfer Time: 0.839782 sec
GPU -> CPU Transfer Time: 0.362003 sec
Average Transfer Time (CPU ↔ GPU, 10 times): 0.514063 sec


Matrix Multiplication on GPU

In [2]:
import numpy as np
import cupy as cp
import time

N = 4096

A_cpu = np.random.rand(N, N)
B_cpu = np.random.rand(N, N)

start_cpu = time.time()
C_cpu = np.dot(A_cpu, B_cpu)
end_cpu = time.time()

A_gpu = cp.asarray(A_cpu)
B_gpu = cp.asarray(B_cpu)
start_gpu = time.time()
C_gpu = cp.dot(A_gpu, B_gpu)
cp.cuda.Device(0).synchronize()
end_gpu = time.time()

# Compare Execution Times
print(f"CPU Time: {end_cpu - start_cpu:.6f} sec")
print(f"GPU Time: {end_gpu - start_gpu:.6f} sec")

# Verify Correctness
C_check = cp.asnumpy(C_gpu)  # Transfer result back to CPU
print(f"Error between CPU and GPU results: {np.max(np.abs(C_check - C_cpu))}")

CPU Time: 3.395280 sec
GPU Time: 0.768896 sec
Error between CPU and GPU results: 8.640199666842818e-12


In [4]:
batch_size = 10
A_batch = cp.random.rand(batch_size, N, N)
B_batch = cp.random.rand(batch_size, N, N)

start_batched = time.time()
C_batch = cp.matmul(A_batch, B_batch)
cp.cuda.Device(0).synchronize()
end_batched = time.time()

print(f"Batched GPU time: {end_batched - start_batched:.6f} sec")

Batched GPU time: 5.430521 sec


In [7]:
import cupy as cp
free_mem, total_mem = cp.cuda.Device(0).mem_info
print(f"Free memory: {free_mem / 1024**2:.2f} MB")
print(f"Total memory: {total_mem / 1024**2:.2f} MB")


square_kernel = cp.RawKernel(r'''
extern "C" __global__
void square(const float* x, float* y, int n) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n) {
        y[tid] = x[tid] * x[tid];
    }
}
''', 'square')

# Create data on GPU
n_elements = 100000
x_gpu = cp.random.rand(n_elements, dtype=cp.float32)
y_gpu = cp.zeros(n_elements, dtype=cp.float32)

# Run the kernel: using 256 threads per block
threads_per_block = 256
blocks = (n_elements + threads_per_block - 1) // threads_per_block
square_kernel((blocks,), (threads_per_block,), (x_gpu, y_gpu, n_elements))

print("First 10 squared values:", y_gpu[:10])


Free memory: 14596.12 MB
Total memory: 15095.06 MB
First 10 squared values: [8.1160255e-02 1.5683751e-01 1.9388148e-04 4.9947921e-02 5.3757656e-01
 8.1706357e-01 6.5382030e-03 6.3362902e-01 8.4135033e-02 9.4004422e-01]


In [6]:
import cupy as cp
free_mem, total_mem = cp.cuda.Device(0).mem_info
print(f"Free memory: {free_mem / 1024**2:.2f} MB")
print(f"Total memory: {total_mem / 1024**2:.2f} MB")


Free memory: 14596.12 MB
Total memory: 15095.06 MB


In [8]:
import cupy as cp
import time

# Create sample data
n_elements = 10000
x = cp.random.rand(n_elements, dtype=cp.float32)
y = cp.random.rand(n_elements, dtype=cp.float32)

# CUDA events for profiling
start = cp.cuda.Event()
end = cp.cuda.Event()

# Launch Kernel with timing
start.record()
y = x**2 + 2*x + 1  # Example kernel computation
end.record()

# Synchronize and compute time
end.synchronize()
time_taken = cp.cuda.get_elapsed_time(start, end)  # Time in milliseconds

print(f"Kernel Execution Time: {time_taken:.6f} ms")


Kernel Execution Time: 538.838989 ms


In [9]:
# Host (CPU) data
h_data = cp.random.rand(n_elements, dtype=cp.float32)

# Transfer CPU → GPU
start.record()
d_data = cp.array(h_data)
end.record()
end.synchronize()
cpu_to_gpu_time = cp.cuda.get_elapsed_time(start, end)

# Transfer GPU → CPU
start.record()
h_data_copy = d_data.get()
end.record()
end.synchronize()
gpu_to_cpu_time = cp.cuda.get_elapsed_time(start, end)

print(f"CPU → GPU Transfer Time: {cpu_to_gpu_time:.6f} ms")
print(f"GPU → CPU Transfer Time: {gpu_to_cpu_time:.6f} ms")


CPU → GPU Transfer Time: 130.486267 ms
GPU → CPU Transfer Time: 0.212160 ms


In [13]:
import cupy as cp
import cupyx.profiler

n_elements = 10000
x = cp.random.rand(n_elements, dtype=cp.float32)

# Start profiling
with cupyx.profiler.time_range('CustomKernel', color_id=0):
    y = x**2 + 2*x + 1  # Example computation

cp.cuda.Device(0).synchronize()  # Ensure GPU finishes execution
print("Profiling complete.")


Profiling complete.


Why Profiling Matters Before Optimization?
Profiling helps us identify bottlenecks before making any optimizations. Without it, we might optimize the wrong part of the code, wasting effort without real performance gains.

What Profiling Tells Us?
Time Spent on Different Operations:

How much time is spent on CPU-GPU memory transfers?
How much time does the GPU kernel execution take?
Memory Usage & Bandwidth:

How much global memory is accessed?
Is shared memory underutilized?
Kernel Performance:

Are all GPU threads active?
Is register usage limiting performance?
Are we experiencing memory access bottlenecks?

When to Optimize?
If memory transfers take a lot of time → Use pinned memory.
If GPU utilization is low → Improve parallelism and occupancy.
If global memory accesses are slow → Use shared memory.