**Author:** Raoul Malm  
**Description:** Basic example showing that a parallel computation with OpenCL in Python (via PyOpenCL) can be much faster than the corresponding sequential computation written as an element-by-element Python loop.

In [41]:
import pyopencl as cl
import numpy as np
from datetime import datetime

start = datetime.now()

# Seed NumPy's global generator so the same inputs are produced on every
# fresh run of the notebook.
SEED = 0
np.random.seed(SEED)

# Two input vectors of random integers in [0, 1000), shared by the cells below.
# Asking randint for int32 directly avoids first materialising an array of the
# platform's default integer type and then copying it into an int32 array.
array_size = 100000000
num1 = np.random.randint(1000, size=array_size, dtype=np.int32)
num2 = np.random.randint(1000, size=array_size, dtype=np.int32)
print('computation time: ', datetime.now()-start)
print("Number1:", num1)
print("Number2:", num2)

computation time:  0:00:02.802909
Number1: [ 21 291 142 ...,   3 201 968]
Number2: [776 221 609 ..., 697 675 463]


In [42]:
## sequential computation

start = datetime.now()

# Result array: one int32 slot per input element.
n_items = num1.shape[0]
out = np.empty(num1.shape, dtype=np.int32)

# Plain Python loop, one element per iteration: this is the sequential
# baseline that the OpenCL version is compared against.
for idx in range(n_items):
    a = num1[idx]
    b = num2[idx]
    out[idx] = a*a + b*b

elapsed = datetime.now() - start
print('computation time: ', elapsed)
print("Output :", out)

computation time:  0:01:36.485914
Output : [ 602617  133522  391045 ...,  485818  496026 1151393]


In [44]:
## parallel computation

start = datetime.now()

# Host array that receives the kernel's results.
out = np.empty(num1.shape, dtype=np.int32)

# OpenCL C source: each work-item reads its own global index and computes the
# sum of squares for that single element.
code = """
__kernel void frst_prog(__global int* num1, __global int* num2,__global int* out) 
{
    int i = get_global_id(0);
    out[i] = num1[i]*num1[i] + num2[i]*num2[i];
}
"""

# Context and command queue on whichever platform/device PyOpenCL picks.
context = cl.create_some_context()
queue = cl.CommandQueue(context)

# Read-only input buffers, initialised from the host arrays at creation time.
input_flags = cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR
num1_buf = cl.Buffer(context, input_flags, hostbuf=num1)
num2_buf = cl.Buffer(context, input_flags, hostbuf=num2)

# Write-only result buffer with the same byte size as `out`.
out_buf = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, out.nbytes)

# Compile the program, then enqueue the kernel over a 1-D global range equal
# to num1.shape; passing None leaves the local work-group size unspecified.
bld = cl.Program(context, code).build()
launch = bld.frst_prog(queue, num1.shape, None, num1_buf, num2_buf, out_buf)
launch.wait()  # block until the kernel has finished

# Transfer the results from the device buffer into the host array.
cl.enqueue_copy(queue, out, out_buf)

elapsed = datetime.now() - start
print('computation time: ', elapsed)

# Show the result
print("Output :", out)

computation time:  0:00:00.685827
Output : [ 602617  133522  391045 ...,  485818  496026 1151393]
