In [172]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

import numpy as np

In [173]:
# Draw a 4x4 host array of standard-normal samples.
a = np.random.standard_normal(size=(4, 4))

In [174]:
# Show the freshly generated host array.
print(a)

[[-0.51342129  0.26176578 -2.17623851 -0.49214858]
 [ 1.39745188 -1.79311957 -0.84478383 -0.04278923]
 [-1.64701191  1.09121821 -0.42139168 -0.47278904]
 [-1.08904488  0.02165025  2.00204463  1.59899558]]


In [175]:
# The elements of `a` are numpy.float64 (double precision).
type(a[1,1])

numpy.float64

In [176]:
# The kernel defined below takes a `float *` (32-bit floats), so cast the
# float64 host array to float32 so host and device element sizes agree.
# NOTE(review): this is a requirement of the kernel signature, not a hardware
# limit - confirm double-precision support for the target GPU if it is needed.
a = a.astype(np.float32)

In [177]:
# After the cast, the elements are numpy.float32 (single precision).
type(a[1,1])

numpy.float32

In [178]:
# Allocate a device buffer with the same size in bytes as the host array.
a_gpu = cuda.mem_alloc(a.nbytes)

# Copy the host array into that device buffer.
# As in CUDA C, "htod" stands for host to device.
cuda.memcpy_htod(a_gpu,a)

In [179]:
# Compile a kernel that doubles every element of a float array in place.
# Each thread handles one element. The flat index is built from threadIdx
# only and there is no bounds check, so the kernel must be launched as a
# single block whose thread count equals the number of array elements.
mod = SourceModule("""
    __global__ void doublify(float *a)
    {
        // Row-major flattening of the 2-D thread index. blockDim.x is the
        // row width, so this is not tied to a 4-wide block.
        int idx = threadIdx.x + threadIdx.y * blockDim.x;
        a[idx] *= 2.0f;
    }
""")

In [180]:
# One thread per array element in a single block: x spans the columns and
# y spans the rows, so the kernel's threadIdx.x + threadIdx.y * width
# indexing walks the array in row-major order. Derived from the array shape
# rather than repeating the literal 4x4.
bdim = (a.shape[1], a.shape[0], 1)

In [181]:
# Look up the compiled kernel by name and run it on the device buffer with
# a single block (1x1 grid) of `bdim` threads.
func = mod.get_function("doublify")
func(a_gpu, block=bdim, grid=(1, 1))

In [182]:
# Host buffer with the same shape and dtype as `a` to receive the result.
a_doubled = np.empty(a.shape, dtype=a.dtype)
# "dtoh" stands for device to host: copy the kernel's output back.
cuda.memcpy_dtoh(a_doubled, a_gpu)

In [183]:
# Show the host input next to the result copied back from the GPU.
print("Before: ")
print(a)

print("After: ")
print(a_doubled)

# Verify the kernel's output instead of relying on visual inspection:
# every element should be twice the corresponding input element.
np.testing.assert_allclose(a_doubled, 2 * a)

Before: 
[[-0.5134213   0.26176578 -2.1762385  -0.49214858]
 [ 1.3974519  -1.7931195  -0.84478384 -0.04278923]
 [-1.6470119   1.0912182  -0.42139167 -0.47278905]
 [-1.0890449   0.02165025  2.0020447   1.5989956 ]]
After: 
[[-1.0268426   0.52353156 -4.352477   -0.98429716]
 [ 2.7949038  -3.586239   -1.6895677  -0.08557846]
 [-3.2940238   2.1824365  -0.84278333 -0.9455781 ]
 [-2.1780899   0.0433005   4.0040894   3.1979911 ]]
