In [145]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

import numpy as np

In [146]:
# Fixed seed so that "Restart Kernel -> Run All" regenerates the same inputs.
RNG_SEED = 0
# 128 * 128 = 16384 elements per array.
ARRAY_SHAPE = (128, 128)

# A private RandomState leaves NumPy's global random state untouched.
_rng = np.random.RandomState(RNG_SEED)

# The arrays are float32 from the start: the CUDA kernel in this notebook
# takes `float*` buffers, so a float64 array uploaded by mistake (for example
# when cells are run out of order) would be misread on the device.
testarr = _rng.randn(*ARRAY_SHAPE).astype(np.float32)
testarr2 = _rng.randn(*ARRAY_SHAPE).astype(np.float32)

# Host-side destination for the element-wise sum computed on the GPU.
resultarr = np.zeros(ARRAY_SHAPE, dtype=np.float32)

In [147]:
# Rebind both host inputs to single-precision copies (astype returns a new
# array); the CUDA kernel below declares its arguments as `float*`.
testarr, testarr2 = (
    host_input.astype(np.float32) for host_input in (testarr, testarr2)
)

In [148]:
def _upload(host_array):
    """Allocate a device buffer of ``host_array.nbytes`` bytes, copy the
    host array into it, and return the device allocation."""
    device_buffer = cuda.mem_alloc(host_array.nbytes)
    cuda.memcpy_htod(device_buffer, host_array)
    return device_buffer


# One device buffer per host array: the two inputs and the result.
testarr_gpu = _upload(testarr)
testarr2_gpu = _upload(testarr2)
resultarr_gpu = _upload(resultarr)

In [149]:
# Compile the CUDA source below; `addtwo` writes the element-wise sum of
# `arr1` and `arr2` into `resultarr`.
#
# Each thread derives one flat index `id`:
#   block_id      - linear index of the block within the (up to 3-D) grid
#   block_offset  - block_id multiplied by the number of threads per block
#   thread_offset - linear index of the thread within its (up to 3-D) block
#   id            - block_offset + thread_offset
#
# All three pointers are indexed with the same `id`, so each output element is
# the sum of the two input elements at that same flat position.
#
# NOTE: the kernel has no bounds check and takes no length argument, so the
# launch must supply exactly one thread per array element; extra threads would
# read and write past the end of the buffers.
kernel = SourceModule("""
    __global__ void addtwo(float* resultarr, float* arr1, float* arr2) {
        int block_id =
        blockIdx.x +
        blockIdx.y * gridDim.x +
        blockIdx.z * gridDim.x * gridDim.y;

        int block_offset =
        block_id *
        blockDim.x * blockDim.y * blockDim.z;

        int thread_offset =
        threadIdx.x +
        threadIdx.y * blockDim.x +
        threadIdx.z * blockDim.x * blockDim.y;

        int id = block_offset + thread_offset;

        resultarr[id] = arr1[id] + arr2[id];
    }
""")

In [150]:
# Launch geometry as (x, y, z) extents:
#   32 * 32 * 1 = 1024 threads in each block,
#    4 *  4 * 1 =   16 blocks in the grid.
threadsPerBlock, blocksPerGrid = (32, 32, 1), (4, 4, 1)

In [151]:
# `addtwo` indexes its buffers without any bounds check, so the launch has to
# provide exactly one thread per element: more threads would touch memory
# past the end of the buffers, fewer would leave elements unprocessed.
_threads_launched = 1
for _extent in threadsPerBlock + blocksPerGrid:
    _threads_launched *= _extent
assert _threads_launched == resultarr.size, (
    "launch covers %d threads but the arrays hold %d elements"
    % (_threads_launched, resultarr.size)
)

# Look up the compiled kernel and launch it; argument order matches the
# kernel signature (result, first input, second input).
func = kernel.get_function("addtwo")
func(
    resultarr_gpu,
    testarr_gpu,
    testarr2_gpu,
    block=threadsPerBlock,
    grid=blocksPerGrid,
)

In [152]:
# Copy the device result buffer back into the existing host array
# `resultarr`, overwriting the zeros it was initialised with.
cuda.memcpy_dtoh(resultarr, resultarr_gpu)

In [153]:
divider = "============="

# Show both inputs and the result copied back from the GPU.
print(testarr)
print(divider)
print(testarr2)
print(divider)
print(resultarr)

print(divider)
print(divider)

# Spot check on the last row: host-side sum first, GPU result second.
print(testarr[127] + testarr2[127])
print(divider)
print(resultarr[127])

# Comparing one row by eye does not prove the kernel covered the whole array,
# so check every element against the sum computed on the host.
np.testing.assert_allclose(resultarr, testarr + testarr2, rtol=1e-6, atol=1e-6)

[[ 2.172797   -1.8913003  -0.13335153 ... -0.56815684 -0.3726319
   2.3859284 ]
 [-0.66520137 -0.36903605 -0.4553281  ...  0.3579886  -0.24931668
   1.2944077 ]
 [ 0.0707363  -0.6346102   1.6502053  ...  0.3886892   1.1047211
  -0.26416543]
 ...
 [ 0.795691    0.15838856  0.296864   ...  0.6569664   0.16506015
  -0.6848216 ]
 [ 0.41907394  1.7448777  -0.22915816 ...  0.00269804  0.68914384
  -1.1716142 ]
 [-1.0447842   0.11919478 -2.4202237  ... -1.2135612  -0.7723315
   0.4883211 ]]
[[ 0.8828912   1.9456795  -0.19567458 ... -2.1858425  -0.22638795
  -0.2584525 ]
 [ 0.988247   -1.2246962   1.5547718  ... -0.85325086  0.32420933
   1.050567  ]
 [-0.60226065 -1.8897116   1.1138006  ...  0.8898077   1.4075462
  -1.1041176 ]
 ...
 [ 0.17017879  0.9980286  -0.94855565 ... -0.6083658  -0.9379222
  -1.2179459 ]
 [ 0.91534954  1.5216018  -0.24705173 ...  1.3609576  -0.3085141
   0.05658213]
 [ 0.7526007   0.43940407 -0.5704486  ...  0.20386623  1.5538568
   1.0900004 ]]
[[ 3.0556881   0.054379