In [None]:
import numpy as np
import cupy as cp

In [None]:
# Element-wise integer addition kernel: c[i] = a[i] + b[i].
# Each thread derives one global index from its block and thread coordinates
# (blockDim.x * blockIdx.x + threadIdx.x) and handles that single element.
# The `i < n` guard makes threads whose index falls past the end do nothing,
# so the launch may contain more threads than there are elements.
# The second argument ('add1') selects the CUDA function by name from the source.
kernal_1 = cp.RawKernel(r'''
extern "C" __global__
void add1(int* a, int* b, int* c, int n){
   int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) {
      c[i] = a[i] + b[i];
    }
}''', 'add1')

# Element-wise integer square kernel: d[i] = c[i] * c[i].
# NOTE: the CUDA function is called 'add2', but it multiplies each element by
# itself; it does not add anything.
# Indexing and bounds handling mirror the addition kernel: one element per
# thread, and threads with a global index >= n are skipped by the `i < n` guard.
kernal_2 = cp.RawKernel(r'''
extern "C" __global__
void add2(int* c, int* d, int n){
   int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) {
      d[i] = c[i] * c[i];
    }
}''', 'add2')


# --- Problem size and launch configuration ----------------------------------
i = 1                                  # index of the single element patched below
n = 1024                               # number of elements in every vector
thread_per_block = 256
# Ceiling division: enough blocks to cover all n elements even when n is not
# a multiple of thread_per_block (the kernels' `i < n` guard skips the excess).
block_per_grid = (n + thread_per_block - 1) // thread_per_block

# Both kernels declare `int n` (32-bit). CuPy passes a Python int as a 64-bit
# `long long`, so convert explicitly to match the CUDA signature.
n_arg = np.int32(n)

# --- Host (CPU) inputs -------------------------------------------------------
a_np = np.arange(n, dtype=np.int32)
b_np = np.arange(n, dtype=np.int32)

# --- Device (GPU) buffers ----------------------------------------------------
a = cp.asarray(a_np)                   # host -> device copy
b = cp.asarray(b_np)
c = cp.empty_like(a)                   # filled by kernal_1 (a + b)
d = cp.empty_like(a)                   # filled by kernal_2 (c * c)

# Patch one element so the result visibly differs from plain 2 * arange.
a[i] = i
b[i] = 2 * i

# --- Two-stream pipeline -----------------------------------------------------
stream_1 = cp.cuda.Stream()
stream_2 = cp.cuda.Stream()

with stream_1:
    kernal_1((block_per_grid,), (thread_per_block,), (a, b, c, n_arg))
    # Marks the point in stream_1 at which `c` has been fully written.
    c_ready = stream_1.record()

# kernal_2 reads `c`, which kernal_1 writes. Work queued on different streams
# is not ordered with respect to each other, so without this wait the square
# kernel could read `c` before the addition finished (a data race).
stream_2.wait_event(c_ready)

with stream_2:
    kernal_2((block_per_grid,), (thread_per_block,), (c, d, n_arg))

stream_1.synchronize()
stream_2.synchronize()

# --- Copy results back to the host and show a sample -------------------------
c_cpu = c.get()                        # device -> host copy
print("Element wise addition : ", c_cpu[:20])

d_cpu = d.get()
print("Element wise square : ", d_cpu[:20])













Element wise addition :  [ 0  3  4  6  8 10 12 14 16 18 20 22 24 26 28 30 32 34 36 38]
Element wise square :  [   0    9   16   36   64  100  144  196  256  324  400  484  576  676
  784  900 1024 1156 1296 1444]


In [None]:
# `.get()` exists only on CuPy arrays (it copies device data to the host).
# `i` is an ordinary Python int that already lives on the host, so calling
# `.get()` on it raises AttributeError. Copy back only when the value is
# actually a device array; otherwise use it as-is.
i_cp = i.get() if isinstance(i, cp.ndarray) else i
print(i_cp)

AttributeError: 'int' object has no attribute 'get'

In [None]:
import cupy as cp
import numpy as np

# Diagnostic kernel that records which block and which thread-within-block
# handled each element. For global index i = blockDim.x * blockIdx.x + threadIdx.x
# it stores blockIdx.x in out_block[i] and threadIdx.x in out_thread[i].
# Threads with i >= n write nothing, so both output arrays need at least n
# elements, and entries not covered by the launch keep their previous contents.
kernel = cp.RawKernel(r'''
extern "C" __global__
void write_thread_ids(int* out_block, int* out_thread, int n) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) {
        out_block[i] = blockIdx.x;
        out_thread[i] = threadIdx.x;
    }
}''', 'write_thread_ids')

n = 1024                               # number of elements / logical threads

# Launch geometry: 32 threads per block, and enough blocks to cover all n
# elements. Integer ceiling division avoids the float round-trip of int(n/32)
# and does not drop the tail when n is not a multiple of the block size,
# which matters because the cp.empty outputs start out uninitialised.
id_block_size = 32
id_grid_size = (n + id_block_size - 1) // id_block_size

out_thread = cp.empty(n, dtype=cp.int32)
out_block = cp.empty(n, dtype=cp.int32)

# The kernel declares `int n` (32-bit). CuPy passes a Python int as a 64-bit
# `long long`, so convert explicitly to match the CUDA signature.
kernel((id_grid_size,), (id_block_size,), (out_block, out_thread, np.int32(n)))

# Copy back to host
out_cpu_block = out_block.get()
print("Block IDs:", out_cpu_block)

out_cpu_thread = out_thread.get()
print("Thread IDs:", out_cpu_thread)

Block IDs: [ 0  0  0 ... 31 31 31]
Thread IDs: [ 0  1  2 ... 29 30 31]
