In [None]:
import pyopencl as cl
from pyopencl import array
import numpy as np

In [None]:
def np_se(a, b):
    """Return the element-wise squared error between ``a`` and ``b``.

    The result is ``(a - b) ** 2``; no averaging is performed, so callers
    that want a mean squared error must reduce the result themselves.
    """
    difference = a - b
    return difference ** 2

In [None]:
def gpu_se(a, b, platform, device, context, program):
    """Compute the element-wise squared error ``(a - b) ** 2`` with OpenCL.

    Parameters
    ----------
    a, b : array_like
        Inputs of identical shape. They are converted to C-contiguous
        ``float32`` arrays before upload, because the kernel built in this
        notebook declares its buffers as ``float *``.
    platform, device
        Unused; accepted so that existing calls keep working.
    context : pyopencl.Context
        Context in which the command queue and the buffers are created.
    program : pyopencl.Program
        Built program exposing a ``mean_squared_error`` kernel that takes two
        input buffers and one output buffer.

    Returns
    -------
    numpy.ndarray
        ``float32`` array with the shape of ``a`` holding the squared errors.

    Raises
    ------
    ValueError
        If ``a`` and ``b`` do not have the same shape.

    Notes
    -----
    Prints the kernel execution time reported by the OpenCL profiler.
    """
    # The device reads raw bytes: anything other than densely packed float32
    # would be silently misinterpreted by the kernel.
    a = np.ascontiguousarray(a, dtype=np.float32)
    b = np.ascontiguousarray(b, dtype=np.float32)
    if a.shape != b.shape:
        # A shorter ``b`` would make the kernel read past the end of its buffer.
        raise ValueError(
            "a and b must have the same shape, got %s and %s"
            % (a.shape, b.shape))

    error = np.empty_like(a)
    if a.size == 0:
        # Nothing to compute; avoid creating zero-sized buffers and
        # launching a kernel with an empty global work size.
        return error

    # Profiling must be enabled on the queue to read the event timestamps.
    queue = cl.CommandQueue(
        context,
        properties=cl.command_queue_properties.PROFILING_ENABLE)

    mem_flags = cl.mem_flags
    read_only_copy = mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR
    a_buf = cl.Buffer(context, read_only_copy, hostbuf=a)
    b_buf = cl.Buffer(context, read_only_copy, hostbuf=b)
    destination_buf = cl.Buffer(context, mem_flags.WRITE_ONLY, error.nbytes)

    # The kernel indexes with get_global_id(0) only, so launch one work-item
    # per element over a flat 1-D range regardless of the input's rank.
    exec_evt = program.mean_squared_error(queue, (a.size,), None,
                                          a_buf, b_buf, destination_buf)
    exec_evt.wait()
    # Profiling timestamps are in nanoseconds.
    elapsed = 1e-9*(exec_evt.profile.end - exec_evt.profile.start)

    print("Execution time of OpenCL: %g s" % elapsed)

    # Device-to-host copy; blocking by default, so ``error`` is fully
    # populated when this call returns.
    cl.enqueue_copy(queue, error, destination_buf)

    return error

In [None]:
np.random.seed(51)
a = np.random.rand(4096).astype(np.float32)
b = np.random.rand(4096).astype(np.float32)


platform = cl.get_platforms()[0]
device = platform.get_devices()[2]
context = cl.Context([device])

program = cl.Program(context, """
    __kernel void mean_squared_error(__global const float *a,
    __global const float *b, __global float *result)
    {
        int gid = get_global_id(0);
        float temp = a[gid] - b[gid];
        result[gid] =  temp * temp;
    }
        """).build()
gpu_error = gpu_se(a, b, platform, device, context, program)

np_error = np_se(a, b)
print('GPU error', np.mean(gpu_error))
print('NumPy error', np.mean(np_error))
%time np_se(a, b)