In [13]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

import numpy as np

In [14]:
# draw a 4x4 matrix of standard-normal samples on the host
a = np.random.standard_normal(size=(4, 4))

In [15]:
# display the freshly generated host array
print(a)

[[-1.25807439 -1.49082831  0.42366278  0.23243383]
 [ 0.79992675  0.14990894  0.08045815  0.75484672]
 [ 1.17296669  2.09375653  0.91962065 -0.11099851]
 [ 0.65909774  1.23358011 -1.19015763  1.00887176]]


In [16]:
# each element of a is a numpy.float64 (double precision)
type(a[1,1])

numpy.float64

In [17]:
# the doublify kernel is declared on float* (single precision), so cast the
# host array to float32 to match the element type the kernel reads and writes
a = a.astype(np.float32)

In [18]:
# after the cast, each element is a numpy.float32 (single precision)
type(a[1,1])

numpy.float32

In [19]:
# allocate a device buffer with the same size in bytes as the host array
a_gpu = cuda.mem_alloc(a.nbytes)

# next, copy the host array into that device buffer
# just like CUDA C, htod : host to device
cuda.memcpy_htod(a_gpu,a)

In [20]:
# Create a kernel: CUDA C source that doubles each element of a float array
# in place. Each thread handles one element; idx flattens the 2-D thread index
# (x varies fastest). blockIdx is not used, so only a single block is addressed.
# NOTE: there is no bounds check on idx, so the launch must not use more
# threads than the array has elements.
mod = SourceModule("""
    __global__ void doublify(float *a)
    {
        int idx = threadIdx.x + threadIdx.y*blockDim.x;
        a[idx] *= 2;
    }
""")

In [21]:
# threadsPerBlock: exactly one thread per array element.
# The doublify kernel indexes a[threadIdx.x + threadIdx.y*blockDim.x] with no
# bounds check, so launching more threads than elements (e.g. 32x32 = 1024
# threads for a 16-element array) reads and writes past the end of a_gpu.
# x spans the columns and y the rows, matching the kernel's flattened index.
# This assumes a is 2-D and small enough to fit in one block.
threadsPerBlock = (a.shape[1], a.shape[0], 1)
# blocksPerGrid: a single block, since the kernel ignores blockIdx
blocksPerGrid = (1,1,1)


In [22]:
# look up the compiled kernel by name, then launch it on the device buffer
# with the block and grid dimensions chosen above
func = mod.get_function("doublify")
func(a_gpu, block=threadsPerBlock, grid=blocksPerGrid)

In [23]:
# host array with the same shape and dtype as a, to receive the result
a_doubled = np.empty_like(a)
# dtoh : device to host
cuda.memcpy_dtoh(a_doubled, a_gpu)

In [24]:
# show the host input and the array copied back from the GPU, each under its label
for label, values in (("Before: ", a), ("After: ", a_doubled)):
    print(label)
    print(values)

Before: 
[[-1.2580744  -1.4908283   0.42366278  0.23243383]
 [ 0.79992676  0.14990894  0.08045815  0.7548467 ]
 [ 1.1729667   2.0937564   0.91962063 -0.11099851]
 [ 0.65909773  1.2335801  -1.1901577   1.0088718 ]]
After: 
[[-2.5161488  -2.9816566   0.84732556  0.46486765]
 [ 1.5998535   0.2998179   0.1609163   1.5096934 ]
 [ 2.3459334   4.187513    1.8392413  -0.22199702]
 [ 1.3181955   2.4671602  -2.3803153   2.0177436 ]]
