In [151]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

import numpy as np

In [152]:
# Host-side input: a 4x4 matrix of samples drawn from the standard normal
# distribution (NumPy's global RNG, unseeded, so values differ between runs).
a = np.random.standard_normal((4, 4))

In [153]:
# Show the host array as generated (still float64 at this point).
print(a)

[[ 0.29645637  0.3740677  -0.18154397 -2.56960585]
 [-1.48196668  0.40002586 -0.09616903 -0.23997804]
 [-0.52618869 -0.69273117  1.28962305 -0.04621967]
 [ 0.00796687 -1.026881    1.72005559  0.54959014]]


In [154]:
# The elements of a are numpy.float64 (double precision), as the output shows.
type(a[1,1])

numpy.float64

In [155]:
# The doublify kernel defined below takes a `float *` (single precision), so
# the host array is cast to float32 before it is copied to the device. The
# cast is about matching the kernel's element type and size; it is a choice of
# this example, not a hardware restriction.
a = a.astype(np.float32)

In [156]:
# After the cast, the elements are numpy.float32 (single precision).
type(a[1,1])

numpy.float32

In [157]:
# Reserve a device buffer exactly as large as the host array's raw data.
host_bytes = a.nbytes
a_gpu = cuda.mem_alloc(host_bytes)

# Copy the host array into that buffer.
# As in CUDA C, "htod" stands for host-to-device.
cuda.memcpy_htod(a_gpu, a)

In [158]:
# Compile the CUDA C kernel at runtime.
# doublify multiplies one float by 2, in place, per thread. The element index
# is built from threadIdx and blockDim only (threadIdx.x varies fastest), so:
#   * blockIdx is ignored -> launch a single block;
#   * there is no bounds check -> launch no more threads than the buffer
#     has elements, otherwise threads access memory past its end.
mod = SourceModule("""
    __global__ void doublify(float *a)
    {
        int idx = threadIdx.x + threadIdx.y*blockDim.x;
        a[idx] *= 2;
    }
""")

In [159]:
# Launch geometry: exactly one thread per element of `a`, in a single block.
# The kernel computes idx = threadIdx.x + threadIdx.y*blockDim.x and has no
# bounds check, so a block larger than the array (e.g. 32x32 = 1024 threads
# for the 16 floats of a 4x4 array) would read and write past the end of the
# device buffer. Deriving the block from a.shape keeps the two in sync.
# x spans the columns and y the rows, so idx follows the row-major host layout.
# This works as long as rows*cols fits within a single block's thread limit.
n_rows, n_cols = a.shape

#threadsPerBlock
threadsPerBlock = (n_cols, n_rows, 1)
#blocksPerGrid
blocksPerGrid = (1, 1, 1)


In [160]:
# Fetch a callable handle to the compiled kernel by its __global__ name,
# then launch it on the device buffer with the geometry chosen above.
func = mod.get_function("doublify")
func(
    a_gpu,
    grid=blocksPerGrid,
    block=threadsPerBlock,
)

In [161]:
# Host buffer with the same shape and dtype as a; np.empty_like leaves it
# uninitialized, which is fine because the copy below fills it.
a_doubled = np.empty_like(a)
# dtoh: device to host -- copy the kernel's result back from a_gpu.
cuda.memcpy_dtoh(a_doubled, a_gpu)

In [162]:
# Print the host input and the array copied back from the device, each
# under its own heading, so the doubling can be checked by eye.
for heading, values in (("Before: ", a), ("After: ", a_doubled)):
    print(heading)
    print(values)

Before: 
[[ 0.29645637  0.3740677  -0.18154398 -2.5696058 ]
 [-1.4819667   0.40002587 -0.09616903 -0.23997805]
 [-0.5261887  -0.69273114  1.289623   -0.04621967]
 [ 0.00796687 -1.026881    1.7200556   0.5495901 ]]
After: 
[[ 0.59291273  0.7481354  -0.36308795 -5.1392117 ]
 [-2.9639335   0.80005175 -0.19233806 -0.4799561 ]
 [-1.0523773  -1.3854623   2.579246   -0.09243934]
 [ 0.01593374 -2.053762    3.4401112   1.0991802 ]]
