In [1]:

import cupy as cp
import numpy as np
import time

# Problem size and launch geometry shared by the steps below.
N = 1024                  # length of the A/B vectors and the kernel's N argument
THREADS_PER_BLOCK = 32    # block width for the multi-block launch
# Ceiling division: the grid must still reach index N - 1 when N is not an
# exact multiple of THREADS_PER_BLOCK (floor division would drop the tail).
# For N = 1024 this is the same (32, 1, 1) grid as before.
BLOCKS_N_32 = ((N + THREADS_PER_BLOCK - 1) // THREADS_PER_BLOCK, 1, 1)
THREADS_32 = (THREADS_PER_BLOCK, 1, 1)

# CUDA C source for the thread-hierarchy demo, kept as a raw string so the
# "\n" escapes inside printf reach the CUDA compiler untouched.
#
# kernel_indexing(int N):
#   * every thread computes its global index
#       i = blockIdx.x * blockDim.x + threadIdx.x
#   * only the threads with i == 0, i == N/2 and i == N - 1 print their
#     (global index, block index, thread index) triple
#   * thread 0 of block 0 additionally prints gridDim.x and blockDim.x
# The kernel reads and writes no device memory, so threads with i >= N are
# harmless: they simply match none of the print conditions.
CUDA_KERNELS = r'''
extern "C" __global__ void kernel_indexing(int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i == 0 || i == N - 1 || i == N/2) {
        printf("Thread %d / %d | BlockIdx=%d | ThreadIdx=%d\n",
               i, N, blockIdx.x, threadIdx.x);
    }
    if (blockIdx.x == 0 && threadIdx.x == 0) {
        printf("Launch Metadata: GridDim=%d | BlockDim=%d\n",
               gridDim.x, blockDim.x);
    }
}
'''

# Launchable handle for the `kernel_indexing` entry point of CUDA_KERNELS;
# Step 5 calls it with two different grid/block shapes.
k_index = cp.RawKernel(code=CUDA_KERNELS, name='kernel_indexing')

# --- Step 1: build the host inputs and mirror them on the GPU ---------------
print("Step 1: Memory Allocation and Initialization")

# Host vectors: A = [0, 1, ..., N-1] as int32, and B = 2 * A element-wise.
A_host = np.arange(N, dtype=np.int32)
B_host = A_host * 2

print("First 5 elements of A:", A_host[:5])
print("First 5 elements of B:", B_host[:5])

# Device-side copies of both inputs; every later step reads these.
A_device, B_device = cp.asarray(A_host), cp.asarray(B_host)
print("Data copied to GPU")

# --- Step 2: two dependent element-wise operations, no explicit stream ------
print("Step 2: Serial Execution on Default Stream")

# First operation: element-wise sum of the two device vectors.
C_device = A_device + B_device
print("Kernel1 executed: C = A + B")

# Second operation consumes the first one's output: element-wise square.
D_device = C_device * C_device
print("Kernel2 executed: D = C * C")

# Bring the result back to host memory for inspection.
D_serial = D_device.get()
print("First 5 elements of D:", D_serial[:5])

# Spot check on index 4 only, recomputed on the host: (A[4] + B[4])^2.
expected = (A_host[4] + B_host[4]) ** 2
print("Validation check for D[4]: expected", expected, "got", D_serial[4])

# --- Step 3: the same sum -> square pipeline split across two streams -------
print("Step 3: Parallel Execution with CUDA Streams")

stream1, stream2 = cp.cuda.Stream(), cp.cuda.Stream()
event = cp.cuda.Event()
# D_device is deliberately left alone here: this step writes its results into
# the fresh arrays C_stream / D_stream and never reads D_device, so
# zero-filling it would only add an unrelated kernel launch.

with stream1:
    C_stream = A_device + B_device
    # Mark the point in stream1 at which the sum has been queued; stream2
    # waits on this event below.
    stream1.record(event)
    print("Kernel1 launched on Stream1")

# Cross-stream dependency: stream2 must not start squaring until the event
# recorded on stream1 has completed.
stream2.wait_event(event)
with stream2:
    D_stream = C_stream * C_stream
    print("Kernel2 launched on Stream2")

# Issue the device-to-host copy on stream2 so it is ordered after the square,
# then block the host on that stream before touching the host buffer.
# NOTE(review): newer CuPy releases may already block inside .get(); the
# explicit synchronize is kept so the result is safe to read either way.
D_stream_host = D_stream.get(stream=stream2)
stream2.synchronize()
print("First 5 elements of streamed D:", D_stream_host[:5])

# --- Step 4: explicit vs. implicit host/device synchronization --------------
print("Step 4: Synchronization Scenarios")

# Scenario A - explicit: time how long the host is held until the device has
# finished the queued work. perf_counter() is the monotonic, high-resolution
# clock intended for short intervals like this one; time.time() is wall-clock
# and can be too coarse (or jump) at sub-millisecond scale.
start_time = time.perf_counter()
D_device = (A_device + B_device) * (A_device + B_device)
# Synchronize the *current* device instead of hard-coding GPU 0, so the wait
# targets the device the arrays live on even when another GPU is selected.
cp.cuda.Device().synchronize()
sync_time = time.perf_counter() - start_time
print("With explicit synchronization, CPU blocked for", round(sync_time, 5), "seconds")

# Scenario B - implicit: no synchronize() call; the host simply asks for the
# data. D_device is rebound to a fresh result, so no reset of the previous
# array is needed beforehand.
D_device = (A_device + B_device) * (A_device + B_device)
# Short host-side pause before the read-back (presumably standing in for
# unrelated CPU work while the GPU runs).
time.sleep(0.001)
D_nosync = D_device.get()
print("With implicit synchronization, .get() waited automatically")

# --- Step 5: launch the indexing kernel with two grid/block shapes ----------
print("Step 5: Thread Hierarchy Visualization")

# The kernel parameter is a C `int` (32-bit). RawKernel does not check or
# convert argument types, and a bare Python int is marshalled as a 64-bit
# integer, so N is wrapped in np.int32 to match the signature exactly.
#
# NOTE(review): the recorded cell output shows no "Thread ..." / "Launch
# Metadata" lines; device-side printf presumably goes to the kernel process's
# stdout rather than the notebook cell - check the terminal running Jupyter.

print("Configuration 1: <<<1, N>>> (1 block, N threads)")
# NOTE(review): one block of N threads only launches while N stays within the
# device's per-block thread limit - confirm before raising N.
k_index((1, 1, 1), (N, 1, 1), (np.int32(N),))
# Wait on the current device (not a hard-coded GPU 0) so the kernel has
# finished before the next configuration is announced.
cp.cuda.Device().synchronize()

print("Configuration 2: <<<N/32, 32>>> (multiple blocks)")
k_index(BLOCKS_N_32, THREADS_32, (np.int32(N),))
cp.cuda.Device().synchronize()

print("Done")


Step 1: Memory Allocation and Initialization
First 5 elements of A: [0 1 2 3 4]
First 5 elements of B: [0 2 4 6 8]
Data copied to GPU
Step 2: Serial Execution on Default Stream
Kernel1 executed: C = A + B
Kernel2 executed: D = C * C
First 5 elements of D: [  0   9  36  81 144]
Validation check for D[4]: expected 144 got 144
Step 3: Parallel Execution with CUDA Streams
Kernel1 launched on Stream1
Kernel2 launched on Stream2
First 5 elements of streamed D: [  0   9  36  81 144]
Step 4: Synchronization Scenarios
With explicit synchronization, CPU blocked for 0.00026 seconds
With implicit synchronization, .get() waited automatically
Step 5: Thread Hierarchy Visualization
Configuration 1: <<<1, N>>> (1 block, N threads)
Configuration 2: <<<N/32, 32>>> (multiple blocks)
Done
