From 2e3e20c0324d07b60586aaef2dec124e2989b39f Mon Sep 17 00:00:00 2001 From: Nikolay Polyarniy Date: Thu, 4 Apr 2019 21:15:12 +0300 Subject: [PATCH] Initial commit --- CMakeLists.txt | 45 + README.md | 34 + libs/CMakeLists.txt | 3 + libs/clew/CL/cl.h | 1003 ++++++++++++ libs/clew/CL/cl_d3d10.h | 131 ++ libs/clew/CL/cl_ext.h | 278 ++++ libs/clew/CL/cl_gl.h | 160 ++ libs/clew/CL/cl_gl_ext.h | 74 + libs/clew/CL/cl_platform.h | 1203 +++++++++++++++ libs/clew/CL/opencl.h | 59 + libs/clew/CMakeLists.txt | 22 + libs/clew/libclew/ocl_init.cpp | 1142 ++++++++++++++ libs/clew/libclew/ocl_init.h | 3 + libs/gpu/CMakeLists.txt | 79 + libs/gpu/LICENSE | 23 + libs/gpu/libgpu/context.cpp | 310 ++++ libs/gpu/libgpu/context.h | 72 + libs/gpu/libgpu/cuda/cu/common.cu | 535 +++++++ libs/gpu/libgpu/cuda/cu/opencl_translator.cu | 96 ++ libs/gpu/libgpu/cuda/cuda_api.cpp | 158 ++ libs/gpu/libgpu/cuda/cuda_api.h | 29 + libs/gpu/libgpu/cuda/enum.cpp | 94 ++ libs/gpu/libgpu/cuda/enum.h | 45 + libs/gpu/libgpu/cuda/sdk/helper_math.h | 1453 ++++++++++++++++++ libs/gpu/libgpu/cuda/utils.cpp | 14 + libs/gpu/libgpu/cuda/utils.h | 76 + libs/gpu/libgpu/device.cpp | 174 +++ libs/gpu/libgpu/device.h | 51 + libs/gpu/libgpu/gold_helpers.cpp | 96 ++ libs/gpu/libgpu/gold_helpers.h | 36 + libs/gpu/libgpu/hexdumparray.cpp | 74 + libs/gpu/libgpu/opencl/cl/c_template.cl | 8 + libs/gpu/libgpu/opencl/cl/clion_defines.cl | 74 + libs/gpu/libgpu/opencl/cl/common.cl | 427 +++++ libs/gpu/libgpu/opencl/device_info.cpp | 204 +++ libs/gpu/libgpu/opencl/device_info.h | 49 + libs/gpu/libgpu/opencl/engine.cpp | 749 +++++++++ libs/gpu/libgpu/opencl/engine.h | 266 ++++ libs/gpu/libgpu/opencl/enum.cpp | 251 +++ libs/gpu/libgpu/opencl/enum.h | 77 + libs/gpu/libgpu/opencl/utils.cpp | 61 + libs/gpu/libgpu/opencl/utils.h | 69 + libs/gpu/libgpu/shared_device_buffer.cpp | 428 ++++++ libs/gpu/libgpu/shared_device_buffer.h | 87 ++ libs/gpu/libgpu/shared_host_buffer.cpp | 206 +++ libs/gpu/libgpu/shared_host_buffer.h | 54 + 
libs/gpu/libgpu/utils.cpp | 121 ++ libs/gpu/libgpu/utils.h | 44 + libs/gpu/libgpu/work_size.h | 84 + libs/utils/CMakeLists.txt | 37 + libs/utils/libutils/fast_random.h | 38 + libs/utils/libutils/misc.cpp | 74 + libs/utils/libutils/misc.h | 59 + libs/utils/libutils/string_utils.cpp | 158 ++ libs/utils/libutils/string_utils.h | 24 + libs/utils/libutils/thread_mutex.cpp | 119 ++ libs/utils/libutils/thread_mutex.h | 127 ++ libs/utils/libutils/timer.h | 161 ++ src/cl/merge_sort.cl | 32 + src/cl/merge_sort_cl.h | 49 + src/cu/merge_sort.cu | 9 + src/defines.h | 8 + src/io_utils/buffer_reader.cpp | 30 + src/io_utils/buffer_reader.h | 17 + src/io_utils/buffer_writer.cpp | 44 + src/io_utils/buffer_writer.h | 21 + src/io_utils/file_reader.cpp | 71 + src/io_utils/file_reader.h | 25 + src/io_utils/file_writer.cpp | 42 + src/io_utils/file_writer.h | 23 + src/main_generator.cpp | 47 + src/main_sorter.cpp | 168 ++ 72 files changed, 12214 insertions(+) create mode 100644 CMakeLists.txt create mode 100644 README.md create mode 100644 libs/CMakeLists.txt create mode 100644 libs/clew/CL/cl.h create mode 100644 libs/clew/CL/cl_d3d10.h create mode 100644 libs/clew/CL/cl_ext.h create mode 100644 libs/clew/CL/cl_gl.h create mode 100644 libs/clew/CL/cl_gl_ext.h create mode 100644 libs/clew/CL/cl_platform.h create mode 100644 libs/clew/CL/opencl.h create mode 100644 libs/clew/CMakeLists.txt create mode 100644 libs/clew/libclew/ocl_init.cpp create mode 100644 libs/clew/libclew/ocl_init.h create mode 100644 libs/gpu/CMakeLists.txt create mode 100644 libs/gpu/LICENSE create mode 100644 libs/gpu/libgpu/context.cpp create mode 100644 libs/gpu/libgpu/context.h create mode 100644 libs/gpu/libgpu/cuda/cu/common.cu create mode 100644 libs/gpu/libgpu/cuda/cu/opencl_translator.cu create mode 100644 libs/gpu/libgpu/cuda/cuda_api.cpp create mode 100644 libs/gpu/libgpu/cuda/cuda_api.h create mode 100644 libs/gpu/libgpu/cuda/enum.cpp create mode 100644 libs/gpu/libgpu/cuda/enum.h create mode 100644 
libs/gpu/libgpu/cuda/sdk/helper_math.h create mode 100644 libs/gpu/libgpu/cuda/utils.cpp create mode 100644 libs/gpu/libgpu/cuda/utils.h create mode 100644 libs/gpu/libgpu/device.cpp create mode 100644 libs/gpu/libgpu/device.h create mode 100644 libs/gpu/libgpu/gold_helpers.cpp create mode 100644 libs/gpu/libgpu/gold_helpers.h create mode 100644 libs/gpu/libgpu/hexdumparray.cpp create mode 100644 libs/gpu/libgpu/opencl/cl/c_template.cl create mode 100644 libs/gpu/libgpu/opencl/cl/clion_defines.cl create mode 100644 libs/gpu/libgpu/opencl/cl/common.cl create mode 100644 libs/gpu/libgpu/opencl/device_info.cpp create mode 100644 libs/gpu/libgpu/opencl/device_info.h create mode 100644 libs/gpu/libgpu/opencl/engine.cpp create mode 100644 libs/gpu/libgpu/opencl/engine.h create mode 100644 libs/gpu/libgpu/opencl/enum.cpp create mode 100644 libs/gpu/libgpu/opencl/enum.h create mode 100644 libs/gpu/libgpu/opencl/utils.cpp create mode 100644 libs/gpu/libgpu/opencl/utils.h create mode 100644 libs/gpu/libgpu/shared_device_buffer.cpp create mode 100644 libs/gpu/libgpu/shared_device_buffer.h create mode 100644 libs/gpu/libgpu/shared_host_buffer.cpp create mode 100644 libs/gpu/libgpu/shared_host_buffer.h create mode 100644 libs/gpu/libgpu/utils.cpp create mode 100644 libs/gpu/libgpu/utils.h create mode 100644 libs/gpu/libgpu/work_size.h create mode 100644 libs/utils/CMakeLists.txt create mode 100644 libs/utils/libutils/fast_random.h create mode 100644 libs/utils/libutils/misc.cpp create mode 100644 libs/utils/libutils/misc.h create mode 100644 libs/utils/libutils/string_utils.cpp create mode 100644 libs/utils/libutils/string_utils.h create mode 100644 libs/utils/libutils/thread_mutex.cpp create mode 100644 libs/utils/libutils/thread_mutex.h create mode 100644 libs/utils/libutils/timer.h create mode 100644 src/cl/merge_sort.cl create mode 100644 src/cl/merge_sort_cl.h create mode 100644 src/cu/merge_sort.cu create mode 100644 src/defines.h create mode 100644 
src/io_utils/buffer_reader.cpp create mode 100644 src/io_utils/buffer_reader.h create mode 100644 src/io_utils/buffer_writer.cpp create mode 100644 src/io_utils/buffer_writer.h create mode 100644 src/io_utils/file_reader.cpp create mode 100644 src/io_utils/file_reader.h create mode 100644 src/io_utils/file_writer.cpp create mode 100644 src/io_utils/file_writer.h create mode 100644 src/main_generator.cpp create mode 100644 src/main_sorter.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..772d1c1 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,45 @@ +cmake_minimum_required(VERSION 3.1) + +project(external_sort) + +set(CMAKE_CXX_STANDARD 11) + +option(GPU_CUDA_SUPPORT "CUDA support." OFF) + +find_package(OpenMP) +if (OpenMP_CXX_FOUND) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") +else() + message(WARNING "OpenMP not found!") +endif() + +add_subdirectory(libs) + +convertIntoHeader(src/cl/merge_sort.cl src/cl/merge_sort_cl.h merge_sort_kernel) + +set(SOURCES + src/cl/merge_sort_cl.h + src/io_utils/buffer_reader.cpp + src/io_utils/buffer_reader.h + src/io_utils/buffer_writer.cpp + src/io_utils/buffer_writer.h + src/io_utils/file_reader.cpp + src/io_utils/file_reader.h + src/io_utils/file_writer.cpp + src/io_utils/file_writer.h +) + +add_executable(input_generator src/main_generator.cpp ${SOURCES}) +target_link_libraries(input_generator libclew libgpu libutils) + +if (GPU_CUDA_SUPPORT) + find_package(CUDA REQUIRED) + add_definitions(-DCUDA_SUPPORT) + set(SOURCES ${SOURCES} src/cu/merge_sort.cu) + cuda_add_executable(${PROJECT_NAME} src/main_sorter.cpp ${SOURCES}) +else() + add_executable(${PROJECT_NAME} src/main_sorter.cpp ${SOURCES}) +endif() + +target_link_libraries(${PROJECT_NAME} libclew libgpu libutils) diff --git a/README.md b/README.md new file mode 100644 index 0000000..136aef1 --- /dev/null +++ b/README.md @@ -0,0 +1,34 @@ +# 
Generating input data + +``` +./input_generator 1000000000 input.data +``` + +``` +Saving 1000000000 random floats (3814 MB) to file input.data +Done in 9.94176 seconds (383.704 MB/s) +``` + +# Sorting + +``` +./external_sort input.data output.data +``` + +``` +Values number: 1000000000 (3814 MB) +Pass #0: sorting part by part in core... + In core parts number: 60 + Limit for values in core: 16777216 (64 MB) + IO: 51.1294 MB/s + Finished in 149.217 s (9% reading + 82% sorting + 10% writing) +Pass #1: merging groups of 16 parts... + Input parts: 60 with 16777216 values (64 MB) in each + IO: 629.504 MB/s + Finished in 12.1199 s +Pass #2: merging groups of 16 parts... + Input parts: 4 with 268435456 values (1024 MB) in each + IO: 343.152 MB/s + Finished in 22.2333 s +Finished in 183.571 s +``` diff --git a/libs/CMakeLists.txt b/libs/CMakeLists.txt new file mode 100644 index 0000000..a050a26 --- /dev/null +++ b/libs/CMakeLists.txt @@ -0,0 +1,3 @@ +add_subdirectory(clew) +add_subdirectory(gpu) +add_subdirectory(utils) diff --git a/libs/clew/CL/cl.h b/libs/clew/CL/cl.h new file mode 100644 index 0000000..e5662fe --- /dev/null +++ b/libs/clew/CL/cl.h @@ -0,0 +1,1003 @@ +/******************************************************************************* + * Copyright (c) 2008-2015 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. 
+ * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + ******************************************************************************/ + +/* $Revision: 11985 $ on $Date: 2010-07-15 11:16:06 -0700 (Thu, 15 Jul 2010) $ */ + +#ifndef __OPENCL_CL_H +#define __OPENCL_CL_H + +#ifdef __APPLE__ +#include +#else +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/******************************************************************************/ + +typedef struct _cl_platform_id * cl_platform_id; +typedef struct _cl_device_id * cl_device_id; +typedef struct _cl_context * cl_context; +typedef struct _cl_command_queue * cl_command_queue; +typedef struct _cl_mem * cl_mem; +typedef struct _cl_program * cl_program; +typedef struct _cl_kernel * cl_kernel; +typedef struct _cl_event * cl_event; +typedef struct _cl_sampler * cl_sampler; + +typedef cl_uint cl_bool; /* WARNING! Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. 
*/ +typedef cl_ulong cl_bitfield; +typedef cl_bitfield cl_device_type; +typedef cl_uint cl_platform_info; +typedef cl_uint cl_device_info; +typedef cl_bitfield cl_device_fp_config; +typedef cl_uint cl_device_mem_cache_type; +typedef cl_uint cl_device_local_mem_type; +typedef cl_bitfield cl_device_exec_capabilities; +typedef cl_bitfield cl_command_queue_properties; + +typedef intptr_t cl_context_properties; +typedef cl_uint cl_context_info; +typedef cl_uint cl_command_queue_info; +typedef cl_uint cl_channel_order; +typedef cl_uint cl_channel_type; +typedef cl_bitfield cl_mem_flags; +typedef cl_uint cl_mem_object_type; +typedef cl_uint cl_mem_info; +typedef cl_uint cl_image_info; +typedef cl_uint cl_buffer_create_type; +typedef cl_uint cl_addressing_mode; +typedef cl_uint cl_filter_mode; +typedef cl_uint cl_sampler_info; +typedef cl_bitfield cl_map_flags; +typedef cl_uint cl_program_info; +typedef cl_uint cl_program_build_info; +typedef cl_int cl_build_status; +typedef cl_uint cl_kernel_info; +typedef cl_uint cl_kernel_work_group_info; +typedef cl_uint cl_event_info; +typedef cl_uint cl_command_type; +typedef cl_uint cl_profiling_info; + +typedef struct _cl_image_format { + cl_channel_order image_channel_order; + cl_channel_type image_channel_data_type; +} cl_image_format; + + +typedef struct _cl_buffer_region { + size_t origin; + size_t size; +} cl_buffer_region; + +/******************************************************************************/ + +/* Error Codes */ +#define CL_SUCCESS 0 +#define CL_DEVICE_NOT_FOUND -1 +#define CL_DEVICE_NOT_AVAILABLE -2 +#define CL_COMPILER_NOT_AVAILABLE -3 +#define CL_MEM_OBJECT_ALLOCATION_FAILURE -4 +#define CL_OUT_OF_RESOURCES -5 +#define CL_OUT_OF_HOST_MEMORY -6 +#define CL_PROFILING_INFO_NOT_AVAILABLE -7 +#define CL_MEM_COPY_OVERLAP -8 +#define CL_IMAGE_FORMAT_MISMATCH -9 +#define CL_IMAGE_FORMAT_NOT_SUPPORTED -10 +#define CL_BUILD_PROGRAM_FAILURE -11 +#define CL_MAP_FAILURE -12 +#define CL_MISALIGNED_SUB_BUFFER_OFFSET -13 
+#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14 + +#define CL_INVALID_VALUE -30 +#define CL_INVALID_DEVICE_TYPE -31 +#define CL_INVALID_PLATFORM -32 +#define CL_INVALID_DEVICE -33 +#define CL_INVALID_CONTEXT -34 +#define CL_INVALID_QUEUE_PROPERTIES -35 +#define CL_INVALID_COMMAND_QUEUE -36 +#define CL_INVALID_HOST_PTR -37 +#define CL_INVALID_MEM_OBJECT -38 +#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39 +#define CL_INVALID_IMAGE_SIZE -40 +#define CL_INVALID_SAMPLER -41 +#define CL_INVALID_BINARY -42 +#define CL_INVALID_BUILD_OPTIONS -43 +#define CL_INVALID_PROGRAM -44 +#define CL_INVALID_PROGRAM_EXECUTABLE -45 +#define CL_INVALID_KERNEL_NAME -46 +#define CL_INVALID_KERNEL_DEFINITION -47 +#define CL_INVALID_KERNEL -48 +#define CL_INVALID_ARG_INDEX -49 +#define CL_INVALID_ARG_VALUE -50 +#define CL_INVALID_ARG_SIZE -51 +#define CL_INVALID_KERNEL_ARGS -52 +#define CL_INVALID_WORK_DIMENSION -53 +#define CL_INVALID_WORK_GROUP_SIZE -54 +#define CL_INVALID_WORK_ITEM_SIZE -55 +#define CL_INVALID_GLOBAL_OFFSET -56 +#define CL_INVALID_EVENT_WAIT_LIST -57 +#define CL_INVALID_EVENT -58 +#define CL_INVALID_OPERATION -59 +#define CL_INVALID_GL_OBJECT -60 +#define CL_INVALID_BUFFER_SIZE -61 +#define CL_INVALID_MIP_LEVEL -62 +#define CL_INVALID_GLOBAL_WORK_SIZE -63 +#define CL_INVALID_PROPERTY -64 + +/* OpenCL Version */ +#define CL_VERSION_1_0 1 +#define CL_VERSION_1_1 1 + +/* cl_bool */ +#define CL_FALSE 0 +#define CL_TRUE 1 + +/* cl_platform_info */ +#define CL_PLATFORM_PROFILE 0x0900 +#define CL_PLATFORM_VERSION 0x0901 +#define CL_PLATFORM_NAME 0x0902 +#define CL_PLATFORM_VENDOR 0x0903 +#define CL_PLATFORM_EXTENSIONS 0x0904 + +/* cl_device_type - bitfield */ +#define CL_DEVICE_TYPE_DEFAULT (1 << 0) +#define CL_DEVICE_TYPE_CPU (1 << 1) +#define CL_DEVICE_TYPE_GPU (1 << 2) +#define CL_DEVICE_TYPE_ACCELERATOR (1 << 3) +#define CL_DEVICE_TYPE_ALL 0xFFFFFFFF + +/* cl_device_info */ +#define CL_DEVICE_TYPE 0x1000 +#define CL_DEVICE_VENDOR_ID 0x1001 +#define 
CL_DEVICE_MAX_COMPUTE_UNITS 0x1002 +#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003 +#define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004 +#define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B +#define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C +#define CL_DEVICE_ADDRESS_BITS 0x100D +#define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E +#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F +#define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010 +#define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011 +#define CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012 +#define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013 +#define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014 +#define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015 +#define CL_DEVICE_IMAGE_SUPPORT 0x1016 +#define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017 +#define CL_DEVICE_MAX_SAMPLERS 0x1018 +#define CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019 +#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A +#define CL_DEVICE_SINGLE_FP_CONFIG 0x101B +#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C +#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D +#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E +#define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F +#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 0x1020 +#define CL_DEVICE_MAX_CONSTANT_ARGS 0x1021 +#define CL_DEVICE_LOCAL_MEM_TYPE 0x1022 +#define CL_DEVICE_LOCAL_MEM_SIZE 0x1023 +#define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024 +#define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025 +#define CL_DEVICE_ENDIAN_LITTLE 0x1026 +#define CL_DEVICE_AVAILABLE 0x1027 +#define CL_DEVICE_COMPILER_AVAILABLE 0x1028 +#define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029 +#define CL_DEVICE_QUEUE_PROPERTIES 0x102A +#define CL_DEVICE_NAME 0x102B +#define CL_DEVICE_VENDOR 0x102C +#define CL_DRIVER_VERSION 0x102D 
+#define CL_DEVICE_PROFILE 0x102E +#define CL_DEVICE_VERSION 0x102F +#define CL_DEVICE_EXTENSIONS 0x1030 +#define CL_DEVICE_PLATFORM 0x1031 +/* 0x1032 reserved for CL_DEVICE_DOUBLE_FP_CONFIG */ +/* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG */ +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF 0x1034 +#define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR 0x1036 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT 0x1037 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT 0x1038 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG 0x1039 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT 0x103A +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE 0x103B +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF 0x103C +#define CL_DEVICE_OPENCL_C_VERSION 0x103D + +/* cl_device_fp_config - bitfield */ +#define CL_FP_DENORM (1 << 0) +#define CL_FP_INF_NAN (1 << 1) +#define CL_FP_ROUND_TO_NEAREST (1 << 2) +#define CL_FP_ROUND_TO_ZERO (1 << 3) +#define CL_FP_ROUND_TO_INF (1 << 4) +#define CL_FP_FMA (1 << 5) +#define CL_FP_SOFT_FLOAT (1 << 6) + +/* cl_device_mem_cache_type */ +#define CL_NONE 0x0 +#define CL_READ_ONLY_CACHE 0x1 +#define CL_READ_WRITE_CACHE 0x2 + +/* cl_device_local_mem_type */ +#define CL_LOCAL 0x1 +#define CL_GLOBAL 0x2 + +/* cl_device_exec_capabilities - bitfield */ +#define CL_EXEC_KERNEL (1 << 0) +#define CL_EXEC_NATIVE_KERNEL (1 << 1) + +/* cl_command_queue_properties - bitfield */ +#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE (1 << 0) +#define CL_QUEUE_PROFILING_ENABLE (1 << 1) + +/* cl_context_info */ +#define CL_CONTEXT_REFERENCE_COUNT 0x1080 +#define CL_CONTEXT_DEVICES 0x1081 +#define CL_CONTEXT_PROPERTIES 0x1082 +#define CL_CONTEXT_NUM_DEVICES 0x1083 + +/* cl_context_info + cl_context_properties */ +#define CL_CONTEXT_PLATFORM 0x1084 + +/* cl_command_queue_info */ +#define CL_QUEUE_CONTEXT 0x1090 +#define CL_QUEUE_DEVICE 0x1091 +#define CL_QUEUE_REFERENCE_COUNT 0x1092 +#define CL_QUEUE_PROPERTIES 0x1093 + +/* cl_mem_flags - bitfield */ +#define CL_MEM_READ_WRITE 
(1 << 0) +#define CL_MEM_WRITE_ONLY (1 << 1) +#define CL_MEM_READ_ONLY (1 << 2) +#define CL_MEM_USE_HOST_PTR (1 << 3) +#define CL_MEM_ALLOC_HOST_PTR (1 << 4) +#define CL_MEM_COPY_HOST_PTR (1 << 5) + +/* cl_channel_order */ +#define CL_R 0x10B0 +#define CL_A 0x10B1 +#define CL_RG 0x10B2 +#define CL_RA 0x10B3 +#define CL_RGB 0x10B4 +#define CL_RGBA 0x10B5 +#define CL_BGRA 0x10B6 +#define CL_ARGB 0x10B7 +#define CL_INTENSITY 0x10B8 +#define CL_LUMINANCE 0x10B9 +#define CL_Rx 0x10BA +#define CL_RGx 0x10BB +#define CL_RGBx 0x10BC + +/* cl_channel_type */ +#define CL_SNORM_INT8 0x10D0 +#define CL_SNORM_INT16 0x10D1 +#define CL_UNORM_INT8 0x10D2 +#define CL_UNORM_INT16 0x10D3 +#define CL_UNORM_SHORT_565 0x10D4 +#define CL_UNORM_SHORT_555 0x10D5 +#define CL_UNORM_INT_101010 0x10D6 +#define CL_SIGNED_INT8 0x10D7 +#define CL_SIGNED_INT16 0x10D8 +#define CL_SIGNED_INT32 0x10D9 +#define CL_UNSIGNED_INT8 0x10DA +#define CL_UNSIGNED_INT16 0x10DB +#define CL_UNSIGNED_INT32 0x10DC +#define CL_HALF_FLOAT 0x10DD +#define CL_FLOAT 0x10DE + +/* cl_mem_object_type */ +#define CL_MEM_OBJECT_BUFFER 0x10F0 +#define CL_MEM_OBJECT_IMAGE2D 0x10F1 +#define CL_MEM_OBJECT_IMAGE3D 0x10F2 + +/* cl_mem_info */ +#define CL_MEM_TYPE 0x1100 +#define CL_MEM_FLAGS 0x1101 +#define CL_MEM_SIZE 0x1102 +#define CL_MEM_HOST_PTR 0x1103 +#define CL_MEM_MAP_COUNT 0x1104 +#define CL_MEM_REFERENCE_COUNT 0x1105 +#define CL_MEM_CONTEXT 0x1106 +#define CL_MEM_ASSOCIATED_MEMOBJECT 0x1107 +#define CL_MEM_OFFSET 0x1108 + +/* cl_image_info */ +#define CL_IMAGE_FORMAT 0x1110 +#define CL_IMAGE_ELEMENT_SIZE 0x1111 +#define CL_IMAGE_ROW_PITCH 0x1112 +#define CL_IMAGE_SLICE_PITCH 0x1113 +#define CL_IMAGE_WIDTH 0x1114 +#define CL_IMAGE_HEIGHT 0x1115 +#define CL_IMAGE_DEPTH 0x1116 + +/* cl_addressing_mode */ +#define CL_ADDRESS_NONE 0x1130 +#define CL_ADDRESS_CLAMP_TO_EDGE 0x1131 +#define CL_ADDRESS_CLAMP 0x1132 +#define CL_ADDRESS_REPEAT 0x1133 +#define CL_ADDRESS_MIRRORED_REPEAT 0x1134 + +/* cl_filter_mode */ +#define 
CL_FILTER_NEAREST 0x1140 +#define CL_FILTER_LINEAR 0x1141 + +/* cl_sampler_info */ +#define CL_SAMPLER_REFERENCE_COUNT 0x1150 +#define CL_SAMPLER_CONTEXT 0x1151 +#define CL_SAMPLER_NORMALIZED_COORDS 0x1152 +#define CL_SAMPLER_ADDRESSING_MODE 0x1153 +#define CL_SAMPLER_FILTER_MODE 0x1154 + +/* cl_map_flags - bitfield */ +#define CL_MAP_READ (1 << 0) +#define CL_MAP_WRITE (1 << 1) + +/* cl_program_info */ +#define CL_PROGRAM_REFERENCE_COUNT 0x1160 +#define CL_PROGRAM_CONTEXT 0x1161 +#define CL_PROGRAM_NUM_DEVICES 0x1162 +#define CL_PROGRAM_DEVICES 0x1163 +#define CL_PROGRAM_SOURCE 0x1164 +#define CL_PROGRAM_BINARY_SIZES 0x1165 +#define CL_PROGRAM_BINARIES 0x1166 + +/* cl_program_build_info */ +#define CL_PROGRAM_BUILD_STATUS 0x1181 +#define CL_PROGRAM_BUILD_OPTIONS 0x1182 +#define CL_PROGRAM_BUILD_LOG 0x1183 + +/* cl_build_status */ +#define CL_BUILD_SUCCESS 0 +#define CL_BUILD_NONE -1 +#define CL_BUILD_ERROR -2 +#define CL_BUILD_IN_PROGRESS -3 + +/* cl_kernel_info */ +#define CL_KERNEL_FUNCTION_NAME 0x1190 +#define CL_KERNEL_NUM_ARGS 0x1191 +#define CL_KERNEL_REFERENCE_COUNT 0x1192 +#define CL_KERNEL_CONTEXT 0x1193 +#define CL_KERNEL_PROGRAM 0x1194 + +/* cl_kernel_work_group_info */ +#define CL_KERNEL_WORK_GROUP_SIZE 0x11B0 +#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE 0x11B1 +#define CL_KERNEL_LOCAL_MEM_SIZE 0x11B2 +#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3 +#define CL_KERNEL_PRIVATE_MEM_SIZE 0x11B4 + +/* cl_event_info */ +#define CL_EVENT_COMMAND_QUEUE 0x11D0 +#define CL_EVENT_COMMAND_TYPE 0x11D1 +#define CL_EVENT_REFERENCE_COUNT 0x11D2 +#define CL_EVENT_COMMAND_EXECUTION_STATUS 0x11D3 +#define CL_EVENT_CONTEXT 0x11D4 + +/* cl_command_type */ +#define CL_COMMAND_NDRANGE_KERNEL 0x11F0 +#define CL_COMMAND_TASK 0x11F1 +#define CL_COMMAND_NATIVE_KERNEL 0x11F2 +#define CL_COMMAND_READ_BUFFER 0x11F3 +#define CL_COMMAND_WRITE_BUFFER 0x11F4 +#define CL_COMMAND_COPY_BUFFER 0x11F5 +#define CL_COMMAND_READ_IMAGE 0x11F6 +#define CL_COMMAND_WRITE_IMAGE 0x11F7 
+#define CL_COMMAND_COPY_IMAGE 0x11F8 +#define CL_COMMAND_COPY_IMAGE_TO_BUFFER 0x11F9 +#define CL_COMMAND_COPY_BUFFER_TO_IMAGE 0x11FA +#define CL_COMMAND_MAP_BUFFER 0x11FB +#define CL_COMMAND_MAP_IMAGE 0x11FC +#define CL_COMMAND_UNMAP_MEM_OBJECT 0x11FD +#define CL_COMMAND_MARKER 0x11FE +#define CL_COMMAND_ACQUIRE_GL_OBJECTS 0x11FF +#define CL_COMMAND_RELEASE_GL_OBJECTS 0x1200 +#define CL_COMMAND_READ_BUFFER_RECT 0x1201 +#define CL_COMMAND_WRITE_BUFFER_RECT 0x1202 +#define CL_COMMAND_COPY_BUFFER_RECT 0x1203 +#define CL_COMMAND_USER 0x1204 + +/* command execution status */ +#define CL_COMPLETE 0x0 +#define CL_RUNNING 0x1 +#define CL_SUBMITTED 0x2 +#define CL_QUEUED 0x3 + +/* cl_buffer_create_type */ +#define CL_BUFFER_CREATE_TYPE_REGION 0x1220 + +/* cl_profiling_info */ +#define CL_PROFILING_COMMAND_QUEUED 0x1280 +#define CL_PROFILING_COMMAND_SUBMIT 0x1281 +#define CL_PROFILING_COMMAND_START 0x1282 +#define CL_PROFILING_COMMAND_END 0x1283 + +/********************************************************************************************************/ + +/* Platform API */ +extern CL_API_ENTRY cl_int CL_API_CALL +clGetPlatformIDs(cl_uint /* num_entries */, + cl_platform_id * /* platforms */, + cl_uint * /* num_platforms */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetPlatformInfo(cl_platform_id /* platform */, + cl_platform_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Device APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceIDs(cl_platform_id /* platform */, + cl_device_type /* device_type */, + cl_uint /* num_entries */, + cl_device_id * /* devices */, + cl_uint * /* num_devices */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceInfo(cl_device_id /* device */, + cl_device_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* 
param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Context APIs */ +extern CL_API_ENTRY cl_context CL_API_CALL +clCreateContext(const cl_context_properties * /* properties */, + cl_uint /* num_devices */, + const cl_device_id * /* devices */, + void (CL_CALLBACK * /* pfn_notify */)(const char *, const void *, size_t, void *), + void * /* user_data */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_context CL_API_CALL +clCreateContextFromType(const cl_context_properties * /* properties */, + cl_device_type /* device_type */, + void (CL_CALLBACK * /* pfn_notify*/ )(const char *, const void *, size_t, void *), + void * /* user_data */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetContextInfo(cl_context /* context */, + cl_context_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Command Queue APIs */ +extern CL_API_ENTRY cl_command_queue CL_API_CALL +clCreateCommandQueue(cl_context /* context */, + cl_device_id /* device */, + cl_command_queue_properties /* properties */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetCommandQueueInfo(cl_command_queue /* command_queue */, + cl_command_queue_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) 
CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS +#warning CL_USE_DEPRECATED_OPENCL_1_0_APIS is defined. These APIs are unsupported and untested in OpenCL 1.1! +/* + * WARNING: + * This API introduces mutable state into the OpenCL implementation. It has been REMOVED + * to better facilitate thread safety. The 1.0 API is not thread safe. It is not tested by the + * OpenCL 1.1 conformance test, and consequently may not work or may not work dependably. + * It is likely to be non-performant. Use of this API is not advised. Use at your own risk. + * + * Software developers previously relying on this API are instructed to set the command queue + * properties when creating the queue, instead. + */ +extern CL_API_ENTRY cl_int CL_API_CALL +clSetCommandQueueProperty(cl_command_queue /* command_queue */, + cl_command_queue_properties /* properties */, + cl_bool /* enable */, + cl_command_queue_properties * /* old_properties */) CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED; +#endif /* CL_USE_DEPRECATED_OPENCL_1_0_APIS */ + +/* Memory Object APIs */ +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateBuffer(cl_context /* context */, + cl_mem_flags /* flags */, + size_t /* size */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateSubBuffer(cl_mem /* buffer */, + cl_mem_flags /* flags */, + cl_buffer_create_type /* buffer_create_type */, + const void * /* buffer_create_info */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateImage2D(cl_context /* context */, + cl_mem_flags /* flags */, + const cl_image_format * /* image_format */, + size_t /* image_width */, + size_t /* image_height */, + size_t /* image_row_pitch */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateImage3D(cl_context /* context */, + cl_mem_flags /* flags */, + const 
cl_image_format * /* image_format */, + size_t /* image_width */, + size_t /* image_height */, + size_t /* image_depth */, + size_t /* image_row_pitch */, + size_t /* image_slice_pitch */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSupportedImageFormats(cl_context /* context */, + cl_mem_flags /* flags */, + cl_mem_object_type /* image_type */, + cl_uint /* num_entries */, + cl_image_format * /* image_formats */, + cl_uint * /* num_image_formats */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetMemObjectInfo(cl_mem /* memobj */, + cl_mem_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetImageInfo(cl_mem /* image */, + cl_image_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetMemObjectDestructorCallback( cl_mem /* memobj */, + void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), + void * /*user_data */ ) CL_API_SUFFIX__VERSION_1_1; + +/* Sampler APIs */ +extern CL_API_ENTRY cl_sampler CL_API_CALL +clCreateSampler(cl_context /* context */, + cl_bool /* normalized_coords */, + cl_addressing_mode /* addressing_mode */, + cl_filter_mode /* filter_mode */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseSampler(cl_sampler /* sampler */) 
CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSamplerInfo(cl_sampler /* sampler */, + cl_sampler_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Program Object APIs */ +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithSource(cl_context /* context */, + cl_uint /* count */, + const char ** /* strings */, + const size_t * /* lengths */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithBinary(cl_context /* context */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const size_t * /* lengths */, + const unsigned char ** /* binaries */, + cl_int * /* binary_status */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clBuildProgram(cl_program /* program */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* options */, + void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), + void * /* user_data */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clUnloadCompiler(void) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetProgramInfo(cl_program /* program */, + cl_program_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetProgramBuildInfo(cl_program /* program */, + cl_device_id /* device */, + cl_program_build_info /* param_name */, + size_t /* param_value_size */, + void * /* 
param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Kernel Object APIs */ +extern CL_API_ENTRY cl_kernel CL_API_CALL +clCreateKernel(cl_program /* program */, + const char * /* kernel_name */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCreateKernelsInProgram(cl_program /* program */, + cl_uint /* num_kernels */, + cl_kernel * /* kernels */, + cl_uint * /* num_kernels_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelArg(cl_kernel /* kernel */, + cl_uint /* arg_index */, + size_t /* arg_size */, + const void * /* arg_value */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelInfo(cl_kernel /* kernel */, + cl_kernel_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelWorkGroupInfo(cl_kernel /* kernel */, + cl_device_id /* device */, + cl_kernel_work_group_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Event Object APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clWaitForEvents(cl_uint /* num_events */, + const cl_event * /* event_list */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetEventInfo(cl_event /* event */, + cl_event_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_event CL_API_CALL +clCreateUserEvent(cl_context /* context */, + cl_int * /* errcode_ret */) 
CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetUserEventStatus(cl_event /* event */, + cl_int /* execution_status */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetEventCallback( cl_event /* event */, + cl_int /* command_exec_callback_type */, + void (CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *), + void * /* user_data */) CL_API_SUFFIX__VERSION_1_1; + +/* Profiling APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clGetEventProfilingInfo(cl_event /* event */, + cl_profiling_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Flush and Finish APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clFlush(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clFinish(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +/* Enqueued Commands APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_read */, + size_t /* offset */, + size_t /* cb */, + void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadBufferRect(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_read */, + const size_t * /* buffer_origin */, + const size_t * /* host_origin */, + const size_t * /* region */, + size_t /* buffer_row_pitch */, + size_t /* buffer_slice_pitch */, + size_t /* host_row_pitch */, + size_t /* host_slice_pitch */, + void * /* ptr */, + cl_uint /* 
num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_write */, + size_t /* offset */, + size_t /* cb */, + const void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteBufferRect(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_write */, + const size_t * /* buffer_origin */, + const size_t * /* host_origin */, + const size_t * /* region */, + size_t /* buffer_row_pitch */, + size_t /* buffer_slice_pitch */, + size_t /* host_row_pitch */, + size_t /* host_slice_pitch */, + const void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBuffer(cl_command_queue /* command_queue */, + cl_mem /* src_buffer */, + cl_mem /* dst_buffer */, + size_t /* src_offset */, + size_t /* dst_offset */, + size_t /* cb */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBufferRect(cl_command_queue /* command_queue */, + cl_mem /* src_buffer */, + cl_mem /* dst_buffer */, + const size_t * /* src_origin */, + const size_t * /* dst_origin */, + const size_t * /* region */, + size_t /* src_row_pitch */, + size_t /* src_slice_pitch */, + size_t /* dst_row_pitch */, + size_t /* dst_slice_pitch */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL 
+clEnqueueReadImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + cl_bool /* blocking_read */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + size_t /* row_pitch */, + size_t /* slice_pitch */, + void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + cl_bool /* blocking_write */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + size_t /* input_row_pitch */, + size_t /* input_slice_pitch */, + const void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyImage(cl_command_queue /* command_queue */, + cl_mem /* src_image */, + cl_mem /* dst_image */, + const size_t * /* src_origin[3] */, + const size_t * /* dst_origin[3] */, + const size_t * /* region[3] */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyImageToBuffer(cl_command_queue /* command_queue */, + cl_mem /* src_image */, + cl_mem /* dst_buffer */, + const size_t * /* src_origin[3] */, + const size_t * /* region[3] */, + size_t /* dst_offset */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBufferToImage(cl_command_queue /* command_queue */, + cl_mem /* src_buffer */, + cl_mem /* dst_image */, + size_t /* src_offset */, + const size_t * /* dst_origin[3] */, + const size_t * /* region[3] */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* 
event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY void * CL_API_CALL +clEnqueueMapBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_map */, + cl_map_flags /* map_flags */, + size_t /* offset */, + size_t /* cb */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY void * CL_API_CALL +clEnqueueMapImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + cl_bool /* blocking_map */, + cl_map_flags /* map_flags */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + size_t * /* image_row_pitch */, + size_t * /* image_slice_pitch */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueUnmapMemObject(cl_command_queue /* command_queue */, + cl_mem /* memobj */, + void * /* mapped_ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueNDRangeKernel(cl_command_queue /* command_queue */, + cl_kernel /* kernel */, + cl_uint /* work_dim */, + const size_t * /* global_work_offset */, + const size_t * /* global_work_size */, + const size_t * /* local_work_size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueTask(cl_command_queue /* command_queue */, + cl_kernel /* kernel */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueNativeKernel(cl_command_queue /* command_queue */, + void 
(CL_CALLBACK *user_func)(void *), + void * /* args */, + size_t /* cb_args */, + cl_uint /* num_mem_objects */, + const cl_mem * /* mem_list */, + const void ** /* args_mem_loc */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueMarker(cl_command_queue /* command_queue */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWaitForEvents(cl_command_queue /* command_queue */, + cl_uint /* num_events */, + const cl_event * /* event_list */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueBarrier(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +/* Extension function access + * + * Returns the extension function address for the given function name, + * or NULL if a valid function can not be found. The client must + * check to make sure the address is not NULL, before using or + * calling the returned function address. + */ +extern CL_API_ENTRY void * CL_API_CALL clGetExtensionFunctionAddress(const char * /* func_name */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_H */ + diff --git a/libs/clew/CL/cl_d3d10.h b/libs/clew/CL/cl_d3d10.h new file mode 100644 index 0000000..a834615 --- /dev/null +++ b/libs/clew/CL/cl_d3d10.h @@ -0,0 +1,131 @@ +/********************************************************************************** + * Copyright (c) 2008-2015 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+ **********************************************************************************/ + +/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ + +#ifndef __OPENCL_CL_D3D10_H +#define __OPENCL_CL_D3D10_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/****************************************************************************** + * cl_khr_d3d10_sharing */ +#define cl_khr_d3d10_sharing 1 + +typedef cl_uint cl_d3d10_device_source_khr; +typedef cl_uint cl_d3d10_device_set_khr; + +/******************************************************************************/ + +/* Error Codes */ +#define CL_INVALID_D3D10_DEVICE_KHR -1002 +#define CL_INVALID_D3D10_RESOURCE_KHR -1003 +#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR -1004 +#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR -1005 + +/* cl_d3d10_device_source_nv */ +#define CL_D3D10_DEVICE_KHR 0x4010 +#define CL_D3D10_DXGI_ADAPTER_KHR 0x4011 + +/* cl_d3d10_device_set_nv */ +#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR 0x4012 +#define CL_ALL_DEVICES_FOR_D3D10_KHR 0x4013 + +/* cl_context_info */ +#define CL_CONTEXT_D3D10_DEVICE_KHR 0x4014 +#define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C + +/* cl_mem_info */ +#define CL_MEM_D3D10_RESOURCE_KHR 0x4015 + +/* cl_image_info */ +#define CL_IMAGE_D3D10_SUBRESOURCE_KHR 0x4016 + +/* cl_command_type */ +#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR 0x4017 +#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR 0x4018 + +/******************************************************************************/ + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)( + cl_platform_id platform, + cl_d3d10_device_source_khr d3d_device_source, + void * d3d_object, + cl_d3d10_device_set_khr d3d_device_set, + cl_uint num_entries, + cl_device_id * devices, + cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)( + cl_context context, + cl_mem_flags flags, + 
ID3D10Buffer * resource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D10Texture2D * resource, + UINT subresource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D10Texture3D * resource, + UINT subresource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_D3D10_H */ + diff --git a/libs/clew/CL/cl_ext.h b/libs/clew/CL/cl_ext.h new file mode 100644 index 0000000..89b4cb7 --- /dev/null +++ b/libs/clew/CL/cl_ext.h @@ -0,0 +1,278 @@ +/******************************************************************************* + * Copyright (c) 2008-2015 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + ******************************************************************************/ + +/* $Revision: 11928 $ on $Date: 2010-07-13 09:04:56 -0700 (Tue, 13 Jul 2010) $ */ + +/* cl_ext.h contains OpenCL extensions which don't have external */ +/* (OpenGL, D3D) dependencies. 
*/ + +#ifndef __CL_EXT_H +#define __CL_EXT_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __APPLE__ + #include + #include +#else + #include +#endif + +/* cl_khr_fp64 extension - no extension #define since it has no functions */ +#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032 + +/* cl_khr_fp16 extension - no extension #define since it has no functions */ +#define CL_DEVICE_HALF_FP_CONFIG 0x1033 + +/* Memory object destruction + * + * Apple extension for use to manage externally allocated buffers used with cl_mem objects with CL_MEM_USE_HOST_PTR + * + * Registers a user callback function that will be called when the memory object is deleted and its resources + * freed. Each call to clSetMemObjectCallbackFn registers the specified user callback function on a callback + * stack associated with memobj. The registered user callback functions are called in the reverse order in + * which they were registered. The user callback functions are called and then the memory object is deleted + * and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be + * notified when the memory referenced by host_ptr, specified when the memory object is created and used as + * the storage bits for the memory object, can be reused or freed. + * + * The application may not call CL api's with the cl_mem object passed to the pfn_notify. + * + * Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS) + * before using. + */ +#define cl_APPLE_SetMemObjectDestructor 1 +cl_int CL_API_ENTRY clSetMemObjectDestructorAPPLE( cl_mem /* memobj */, + void (* /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), + void * /*user_data */ ) CL_EXT_SUFFIX__VERSION_1_0; + + +/* Context Logging Functions + * + * The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext(). 
+ * Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS) + * before using. + * + * clLogMessagesToSystemLog fowards on all log messages to the Apple System Logger + */ +#define cl_APPLE_ContextLoggingFunctions 1 +extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE( const char * /* errstr */, + const void * /* private_info */, + size_t /* cb */, + void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0; + +/* clLogMessagesToStdout sends all log messages to the file descriptor stdout */ +extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE( const char * /* errstr */, + const void * /* private_info */, + size_t /* cb */, + void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0; + +/* clLogMessagesToStderr sends all log messages to the file descriptor stderr */ +extern void CL_API_ENTRY clLogMessagesToStderrAPPLE( const char * /* errstr */, + const void * /* private_info */, + size_t /* cb */, + void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0; + + +/************************ +* cl_khr_icd extension * +************************/ +#define cl_khr_icd 1 + +/* cl_platform_info */ +#define CL_PLATFORM_ICD_SUFFIX_KHR 0x0920 + +/* Additional Error Codes */ +#define CL_PLATFORM_NOT_FOUND_KHR -1001 + +extern CL_API_ENTRY cl_int CL_API_CALL +clIcdGetPlatformIDsKHR(cl_uint /* num_entries */, + cl_platform_id * /* platforms */, + cl_uint * /* num_platforms */); + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clIcdGetPlatformIDsKHR_fn)( + cl_uint /* num_entries */, + cl_platform_id * /* platforms */, + cl_uint * /* num_platforms */); + + +/****************************************** +* cl_nv_device_attribute_query extension * +******************************************/ +/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */ +#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000 +#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001 +#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002 +#define 
CL_DEVICE_WARP_SIZE_NV 0x4003 +#define CL_DEVICE_GPU_OVERLAP_NV 0x4004 +#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005 +#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006 + +/********************************* +* cl_amd_device_attribute_query * +*********************************/ +#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD 0x4036 + +/********************************* +* cl_arm_printf extension +*********************************/ +#define CL_PRINTF_CALLBACK_ARM 0x40B0 +#define CL_PRINTF_BUFFERSIZE_ARM 0x40B1 + +#ifdef CL_VERSION_1_1 + /*********************************** + * cl_ext_device_fission extension * + ***********************************/ + #define cl_ext_device_fission 1 + + extern CL_API_ENTRY cl_int CL_API_CALL + clReleaseDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; + + typedef CL_API_ENTRY cl_int + (CL_API_CALL *clReleaseDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; + + extern CL_API_ENTRY cl_int CL_API_CALL + clRetainDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; + + typedef CL_API_ENTRY cl_int + (CL_API_CALL *clRetainDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; + + typedef cl_ulong cl_device_partition_property_ext; + extern CL_API_ENTRY cl_int CL_API_CALL + clCreateSubDevicesEXT( cl_device_id /*in_device*/, + const cl_device_partition_property_ext * /* properties */, + cl_uint /*num_entries*/, + cl_device_id * /*out_devices*/, + cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1; + + typedef CL_API_ENTRY cl_int + ( CL_API_CALL * clCreateSubDevicesEXT_fn)( cl_device_id /*in_device*/, + const cl_device_partition_property_ext * /* properties */, + cl_uint /*num_entries*/, + cl_device_id * /*out_devices*/, + cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1; + + /* cl_device_partition_property_ext */ + #define CL_DEVICE_PARTITION_EQUALLY_EXT 0x4050 + #define CL_DEVICE_PARTITION_BY_COUNTS_EXT 0x4051 + #define CL_DEVICE_PARTITION_BY_NAMES_EXT 0x4052 + #define 
CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT 0x4053 + + /* clDeviceGetInfo selectors */ + #define CL_DEVICE_PARENT_DEVICE_EXT 0x4054 + #define CL_DEVICE_PARTITION_TYPES_EXT 0x4055 + #define CL_DEVICE_AFFINITY_DOMAINS_EXT 0x4056 + #define CL_DEVICE_REFERENCE_COUNT_EXT 0x4057 + #define CL_DEVICE_PARTITION_STYLE_EXT 0x4058 + + /* error codes */ + #define CL_DEVICE_PARTITION_FAILED_EXT -1057 + #define CL_INVALID_PARTITION_COUNT_EXT -1058 + #define CL_INVALID_PARTITION_NAME_EXT -1059 + + /* CL_AFFINITY_DOMAINs */ + #define CL_AFFINITY_DOMAIN_L1_CACHE_EXT 0x1 + #define CL_AFFINITY_DOMAIN_L2_CACHE_EXT 0x2 + #define CL_AFFINITY_DOMAIN_L3_CACHE_EXT 0x3 + #define CL_AFFINITY_DOMAIN_L4_CACHE_EXT 0x4 + #define CL_AFFINITY_DOMAIN_NUMA_EXT 0x10 + #define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT 0x100 + + /* cl_device_partition_property_ext list terminators */ + #define CL_PROPERTIES_LIST_END_EXT ((cl_device_partition_property_ext) 0) + #define CL_PARTITION_BY_COUNTS_LIST_END_EXT ((cl_device_partition_property_ext) 0) + #define CL_PARTITION_BY_NAMES_LIST_END_EXT ((cl_device_partition_property_ext) 0 - 1) + +/********************************* +* cl_qcom_ext_host_ptr extension +*********************************/ + +#define CL_MEM_EXT_HOST_PTR_QCOM (1 << 29) + +#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM 0x40A0 +#define CL_DEVICE_PAGE_SIZE_QCOM 0x40A1 +#define CL_IMAGE_ROW_ALIGNMENT_QCOM 0x40A2 +#define CL_IMAGE_SLICE_ALIGNMENT_QCOM 0x40A3 +#define CL_MEM_HOST_UNCACHED_QCOM 0x40A4 +#define CL_MEM_HOST_WRITEBACK_QCOM 0x40A5 +#define CL_MEM_HOST_WRITETHROUGH_QCOM 0x40A6 +#define CL_MEM_HOST_WRITE_COMBINING_QCOM 0x40A7 + +typedef cl_uint cl_image_pitch_info_qcom; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceImageInfoQCOM(cl_device_id device, + size_t image_width, + size_t image_height, + const cl_image_format *image_format, + cl_image_pitch_info_qcom param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret); + +typedef struct 
_cl_mem_ext_host_ptr +{ + /* Type of external memory allocation. */ + /* Legal values will be defined in layered extensions. */ + cl_uint allocation_type; + + /* Host cache policy for this external memory allocation. */ + cl_uint host_cache_policy; + +} cl_mem_ext_host_ptr; + +/********************************* +* cl_qcom_ion_host_ptr extension +*********************************/ + +#define CL_MEM_ION_HOST_PTR_QCOM 0x40A8 + +typedef struct _cl_mem_ion_host_ptr +{ + /* Type of external memory allocation. */ + /* Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations. */ + cl_mem_ext_host_ptr ext_host_ptr; + + /* ION file descriptor */ + int ion_filedesc; + + /* Host pointer to the ION allocated memory */ + void* ion_hostptr; + +} cl_mem_ion_host_ptr; + +#endif /* CL_VERSION_1_1 */ + +#ifdef __cplusplus +} +#endif + + +#endif /* __CL_EXT_H */ diff --git a/libs/clew/CL/cl_gl.h b/libs/clew/CL/cl_gl.h new file mode 100644 index 0000000..92d1be2 --- /dev/null +++ b/libs/clew/CL/cl_gl.h @@ -0,0 +1,160 @@ +/********************************************************************************** + * Copyright (c) 2008-2015 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. 
THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + **********************************************************************************/ + +/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ + +/* + * cl_gl.h contains Khronos-approved (KHR) OpenCL extensions which have + * OpenGL dependencies. The application is responsible for #including + * OpenGL or OpenGL ES headers before #including cl_gl.h. 
+ */ + +#ifndef __OPENCL_CL_GL_H +#define __OPENCL_CL_GL_H + +#ifdef __APPLE__ +#include +#include +#else +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef cl_uint cl_gl_object_type; +typedef cl_uint cl_gl_texture_info; +typedef cl_uint cl_gl_platform_info; +typedef struct __GLsync *cl_GLsync; + +/* cl_gl_object_type */ +#define CL_GL_OBJECT_BUFFER 0x2000 +#define CL_GL_OBJECT_TEXTURE2D 0x2001 +#define CL_GL_OBJECT_TEXTURE3D 0x2002 +#define CL_GL_OBJECT_RENDERBUFFER 0x2003 + +/* cl_gl_texture_info */ +#define CL_GL_TEXTURE_TARGET 0x2004 +#define CL_GL_MIPMAP_LEVEL 0x2005 + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromGLBuffer(cl_context /* context */, + cl_mem_flags /* flags */, + cl_GLuint /* bufobj */, + int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromGLTexture2D(cl_context /* context */, + cl_mem_flags /* flags */, + cl_GLenum /* target */, + cl_GLint /* miplevel */, + cl_GLuint /* texture */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromGLTexture3D(cl_context /* context */, + cl_mem_flags /* flags */, + cl_GLenum /* target */, + cl_GLint /* miplevel */, + cl_GLuint /* texture */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromGLRenderbuffer(cl_context /* context */, + cl_mem_flags /* flags */, + cl_GLuint /* renderbuffer */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetGLObjectInfo(cl_mem /* memobj */, + cl_gl_object_type * /* gl_object_type */, + cl_GLuint * /* gl_object_name */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetGLTextureInfo(cl_mem /* memobj */, + cl_gl_texture_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY 
cl_int CL_API_CALL +clEnqueueAcquireGLObjects(cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem * /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReleaseGLObjects(cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem * /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +/* cl_khr_gl_sharing extension */ + +#define cl_khr_gl_sharing 1 + +typedef cl_uint cl_gl_context_info; + +/* Additional Error Codes */ +#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000 + +/* cl_gl_context_info */ +#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006 +#define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007 + +/* Additional cl_context_properties */ +#define CL_GL_CONTEXT_KHR 0x2008 +#define CL_EGL_DISPLAY_KHR 0x2009 +#define CL_GLX_DISPLAY_KHR 0x200A +#define CL_WGL_HDC_KHR 0x200B +#define CL_CGL_SHAREGROUP_KHR 0x200C + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetGLContextInfoKHR(const cl_context_properties * /* properties */, + cl_gl_context_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)( + const cl_context_properties * properties, + cl_gl_context_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret); + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_GL_H */ diff --git a/libs/clew/CL/cl_gl_ext.h b/libs/clew/CL/cl_gl_ext.h new file mode 100644 index 0000000..12ad713 --- /dev/null +++ b/libs/clew/CL/cl_gl_ext.h @@ -0,0 +1,74 @@ +/********************************************************************************** + * Copyright (c) 2008-2015 The 
Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + **********************************************************************************/ + +/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ + +/* cl_gl_ext.h contains vendor (non-KHR) OpenCL extensions which have */ +/* OpenGL dependencies. */ + +#ifndef __OPENCL_CL_GL_EXT_H +#define __OPENCL_CL_GL_EXT_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __APPLE__ + #include <OpenCL/cl_gl.h> +#else + #include <CL/cl_gl.h> +#endif + +/* + * For each extension, follow this template + * /* cl_VEN_extname extension */ +/* #define cl_VEN_extname 1 + * ... define new types, if any + * ... define new tokens, if any + * ...
define new APIs, if any + * + * If you need GLtypes here, mirror them with a cl_GLtype, rather than including a GL header + * This allows us to avoid having to decide whether to include GL headers or GLES here. + */ + +/* + * cl_khr_gl_event extension + * See section 9.9 in the OpenCL 1.1 spec for more information + */ +#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D + +extern CL_API_ENTRY cl_event CL_API_CALL +clCreateEventFromGLsyncKHR(cl_context /* context */, + cl_GLsync /* cl_GLsync */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_GL_EXT_H */ diff --git a/libs/clew/CL/cl_platform.h b/libs/clew/CL/cl_platform.h new file mode 100644 index 0000000..065aca4 --- /dev/null +++ b/libs/clew/CL/cl_platform.h @@ -0,0 +1,1203 @@ +/********************************************************************************** + * Copyright (c) 2008-2015 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. 
THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + **********************************************************************************/ + +/* $Revision: 11803 $ on $Date: 2010-06-25 10:02:12 -0700 (Fri, 25 Jun 2010) $ */ + +#ifndef __CL_PLATFORM_H +#define __CL_PLATFORM_H + +#ifdef __APPLE__ + /* Contains #defines for AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER below */ + #include <AvailabilityMacros.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_WIN32) + #define CL_API_ENTRY + #define CL_API_CALL __stdcall + #define CL_CALLBACK __stdcall +#else + #define CL_API_ENTRY + #define CL_API_CALL + #define CL_CALLBACK +#endif + +#ifdef __APPLE__ + #define CL_EXTENSION_WEAK_LINK __attribute__((weak_import)) + #define CL_API_SUFFIX__VERSION_1_0 AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_0 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER + #define CL_API_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK + #define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER +#else + #define CL_EXTENSION_WEAK_LINK + #define CL_API_SUFFIX__VERSION_1_0 + #define CL_EXT_SUFFIX__VERSION_1_0 + #define CL_API_SUFFIX__VERSION_1_1 + #define CL_EXT_SUFFIX__VERSION_1_1 + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED +#endif + +#if (defined (_WIN32) &&
defined(_MSC_VER)) + +/* scalar types */ +typedef signed __int8 cl_char; +typedef unsigned __int8 cl_uchar; +typedef signed __int16 cl_short; +typedef unsigned __int16 cl_ushort; +typedef signed __int32 cl_int; +typedef unsigned __int32 cl_uint; +typedef signed __int64 cl_long; +typedef unsigned __int64 cl_ulong; + +typedef unsigned __int16 cl_half; +typedef float cl_float; +typedef double cl_double; + +/* Macro names and corresponding values defined by OpenCL */ +#define CL_CHAR_BIT 8 +#define CL_SCHAR_MAX 127 +#define CL_SCHAR_MIN (-127-1) +#define CL_CHAR_MAX CL_SCHAR_MAX +#define CL_CHAR_MIN CL_SCHAR_MIN +#define CL_UCHAR_MAX 255 +#define CL_SHRT_MAX 32767 +#define CL_SHRT_MIN (-32767-1) +#define CL_USHRT_MAX 65535 +#define CL_INT_MAX 2147483647 +#define CL_INT_MIN (-2147483647-1) +#define CL_UINT_MAX 0xffffffffU +#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) +#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) +#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) + +#define CL_FLT_DIG 6 +#define CL_FLT_MANT_DIG 24 +#define CL_FLT_MAX_10_EXP +38 +#define CL_FLT_MAX_EXP +128 +#define CL_FLT_MIN_10_EXP -37 +#define CL_FLT_MIN_EXP -125 +#define CL_FLT_RADIX 2 +#define CL_FLT_MAX 340282346638528859811704183484516925440.0f +#define CL_FLT_MIN 1.175494350822287507969e-38f +#define CL_FLT_EPSILON 0x1.0p-23f + +#define CL_DBL_DIG 15 +#define CL_DBL_MANT_DIG 53 +#define CL_DBL_MAX_10_EXP +308 +#define CL_DBL_MAX_EXP +1024 +#define CL_DBL_MIN_10_EXP -307 +#define CL_DBL_MIN_EXP -1021 +#define CL_DBL_RADIX 2 +#define CL_DBL_MAX 179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0 +#define CL_DBL_MIN 2.225073858507201383090e-308 +#define CL_DBL_EPSILON 2.220446049250313080847e-16 + +#define CL_M_E 
2.718281828459045090796 +#define CL_M_LOG2E 1.442695040888963387005 +#define CL_M_LOG10E 0.434294481903251816668 +#define CL_M_LN2 0.693147180559945286227 +#define CL_M_LN10 2.302585092994045901094 +#define CL_M_PI 3.141592653589793115998 +#define CL_M_PI_2 1.570796326794896557999 +#define CL_M_PI_4 0.785398163397448278999 +#define CL_M_1_PI 0.318309886183790691216 +#define CL_M_2_PI 0.636619772367581382433 +#define CL_M_2_SQRTPI 1.128379167095512558561 +#define CL_M_SQRT2 1.414213562373095145475 +#define CL_M_SQRT1_2 0.707106781186547572737 + +#define CL_M_E_F 2.71828174591064f +#define CL_M_LOG2E_F 1.44269502162933f +#define CL_M_LOG10E_F 0.43429449200630f +#define CL_M_LN2_F 0.69314718246460f +#define CL_M_LN10_F 2.30258512496948f +#define CL_M_PI_F 3.14159274101257f +#define CL_M_PI_2_F 1.57079637050629f +#define CL_M_PI_4_F 0.78539818525314f +#define CL_M_1_PI_F 0.31830987334251f +#define CL_M_2_PI_F 0.63661974668503f +#define CL_M_2_SQRTPI_F 1.12837922573090f +#define CL_M_SQRT2_F 1.41421353816986f +#define CL_M_SQRT1_2_F 0.70710676908493f + +#define CL_NAN (CL_INFINITY - CL_INFINITY) +#define CL_HUGE_VALF ((cl_float) 1e50) +#define CL_HUGE_VAL ((cl_double) 1e500) +#define CL_MAXFLOAT CL_FLT_MAX +#define CL_INFINITY CL_HUGE_VALF + +#else + +#include <stdint.h> + +/* scalar types */ +typedef int8_t cl_char; +typedef uint8_t cl_uchar; +typedef int16_t cl_short __attribute__((aligned(2))); +typedef uint16_t cl_ushort __attribute__((aligned(2))); +typedef int32_t cl_int __attribute__((aligned(4))); +typedef uint32_t cl_uint __attribute__((aligned(4))); +typedef int64_t cl_long __attribute__((aligned(8))); +typedef uint64_t cl_ulong __attribute__((aligned(8))); + +typedef uint16_t cl_half __attribute__((aligned(2))); +typedef float cl_float __attribute__((aligned(4))); +typedef double cl_double __attribute__((aligned(8))); + +/* Macro names and corresponding values defined by OpenCL */ +#define CL_CHAR_BIT 8 +#define CL_SCHAR_MAX 127 +#define CL_SCHAR_MIN (-127-1) +#define
CL_CHAR_MAX CL_SCHAR_MAX +#define CL_CHAR_MIN CL_SCHAR_MIN +#define CL_UCHAR_MAX 255 +#define CL_SHRT_MAX 32767 +#define CL_SHRT_MIN (-32767-1) +#define CL_USHRT_MAX 65535 +#define CL_INT_MAX 2147483647 +#define CL_INT_MIN (-2147483647-1) +#define CL_UINT_MAX 0xffffffffU +#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) +#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) +#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) + +#define CL_FLT_DIG 6 +#define CL_FLT_MANT_DIG 24 +#define CL_FLT_MAX_10_EXP +38 +#define CL_FLT_MAX_EXP +128 +#define CL_FLT_MIN_10_EXP -37 +#define CL_FLT_MIN_EXP -125 +#define CL_FLT_RADIX 2 +#define CL_FLT_MAX 0x1.fffffep127f +#define CL_FLT_MIN 0x1.0p-126f +#define CL_FLT_EPSILON 0x1.0p-23f + +#define CL_DBL_DIG 15 +#define CL_DBL_MANT_DIG 53 +#define CL_DBL_MAX_10_EXP +308 +#define CL_DBL_MAX_EXP +1024 +#define CL_DBL_MIN_10_EXP -307 +#define CL_DBL_MIN_EXP -1021 +#define CL_DBL_RADIX 2 +#define CL_DBL_MAX 0x1.fffffffffffffp1023 +#define CL_DBL_MIN 0x1.0p-1022 +#define CL_DBL_EPSILON 0x1.0p-52 + +#define CL_M_E 2.718281828459045090796 +#define CL_M_LOG2E 1.442695040888963387005 +#define CL_M_LOG10E 0.434294481903251816668 +#define CL_M_LN2 0.693147180559945286227 +#define CL_M_LN10 2.302585092994045901094 +#define CL_M_PI 3.141592653589793115998 +#define CL_M_PI_2 1.570796326794896557999 +#define CL_M_PI_4 0.785398163397448278999 +#define CL_M_1_PI 0.318309886183790691216 +#define CL_M_2_PI 0.636619772367581382433 +#define CL_M_2_SQRTPI 1.128379167095512558561 +#define CL_M_SQRT2 1.414213562373095145475 +#define CL_M_SQRT1_2 0.707106781186547572737 + +#define CL_M_E_F 2.71828174591064f +#define CL_M_LOG2E_F 1.44269502162933f +#define CL_M_LOG10E_F 0.43429449200630f +#define CL_M_LN2_F 0.69314718246460f +#define CL_M_LN10_F 2.30258512496948f +#define CL_M_PI_F 3.14159274101257f +#define CL_M_PI_2_F 1.57079637050629f +#define CL_M_PI_4_F 0.78539818525314f +#define CL_M_1_PI_F 0.31830987334251f +#define CL_M_2_PI_F 
0.63661974668503f +#define CL_M_2_SQRTPI_F 1.12837922573090f +#define CL_M_SQRT2_F 1.41421353816986f +#define CL_M_SQRT1_2_F 0.70710676908493f + +#if defined( __GNUC__ ) + #define CL_HUGE_VALF __builtin_huge_valf() + #define CL_HUGE_VAL __builtin_huge_val() + #define CL_NAN __builtin_nanf( "" ) +#else + #define CL_HUGE_VALF ((cl_float) 1e50) + #define CL_HUGE_VAL ((cl_double) 1e500) + float nanf( const char * ); + #define CL_NAN nanf( "" ) +#endif +#define CL_MAXFLOAT CL_FLT_MAX +#define CL_INFINITY CL_HUGE_VALF + +#endif + +#include <stddef.h> + +/* Mirror types to GL types. Mirror types allow us to avoid deciding which headers to load based on whether we are using GL or GLES here. */ +typedef unsigned int cl_GLuint; +typedef int cl_GLint; +typedef unsigned int cl_GLenum; + +/* + * Vector types + * + * Note: OpenCL requires that all types be naturally aligned. + * This means that vector types must be naturally aligned. + * For example, a vector of four floats must be aligned to + * a 16 byte boundary (calculated as 4 * the natural 4-byte + * alignment of the float). The alignment qualifiers here + * will only function properly if your compiler supports them + * and if you don't actively work to defeat them. For example, + * in order for a cl_float4 to be 16 byte aligned in a struct, + * the start of the struct must itself be 16-byte aligned. + * + * Maintaining proper alignment is the user's responsibility. + */ + +/* Define basic vector types */ +#if defined( __VEC__ ) + #include <altivec.h> /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required.
*/ + typedef vector unsigned char __cl_uchar16; + typedef vector signed char __cl_char16; + typedef vector unsigned short __cl_ushort8; + typedef vector signed short __cl_short8; + typedef vector unsigned int __cl_uint4; + typedef vector signed int __cl_int4; + typedef vector float __cl_float4; + #define __CL_UCHAR16__ 1 + #define __CL_CHAR16__ 1 + #define __CL_USHORT8__ 1 + #define __CL_SHORT8__ 1 + #define __CL_UINT4__ 1 + #define __CL_INT4__ 1 + #define __CL_FLOAT4__ 1 +#endif + +#if defined( __SSE__ ) + #if defined( __MINGW64__ ) + #include <intrin.h> + #else + #include <xmmintrin.h> + #endif + #if defined( __GNUC__ ) + typedef float __cl_float4 __attribute__((vector_size(16))); + #else + typedef __m128 __cl_float4; + #endif + #define __CL_FLOAT4__ 1 +#endif + +#if defined( __SSE2__ ) + #if defined( __MINGW64__ ) + #include <intrin.h> + #else + #include <emmintrin.h> + #endif + #if defined( __GNUC__ ) + typedef cl_uchar __cl_uchar16 __attribute__((vector_size(16))); + typedef cl_char __cl_char16 __attribute__((vector_size(16))); + typedef cl_ushort __cl_ushort8 __attribute__((vector_size(16))); + typedef cl_short __cl_short8 __attribute__((vector_size(16))); + typedef cl_uint __cl_uint4 __attribute__((vector_size(16))); + typedef cl_int __cl_int4 __attribute__((vector_size(16))); + typedef cl_ulong __cl_ulong2 __attribute__((vector_size(16))); + typedef cl_long __cl_long2 __attribute__((vector_size(16))); + typedef cl_double __cl_double2 __attribute__((vector_size(16))); + #else + typedef __m128i __cl_uchar16; + typedef __m128i __cl_char16; + typedef __m128i __cl_ushort8; + typedef __m128i __cl_short8; + typedef __m128i __cl_uint4; + typedef __m128i __cl_int4; + typedef __m128i __cl_ulong2; + typedef __m128i __cl_long2; + typedef __m128d __cl_double2; + #endif + #define __CL_UCHAR16__ 1 + #define __CL_CHAR16__ 1 + #define __CL_USHORT8__ 1 + #define __CL_SHORT8__ 1 + #define __CL_INT4__ 1 + #define __CL_UINT4__ 1 + #define __CL_ULONG2__ 1 + #define __CL_LONG2__ 1 + #define __CL_DOUBLE2__ 1 +#endif + +#if
defined( __MMX__ ) + #include <mmintrin.h> + #if defined( __GNUC__ ) + typedef cl_uchar __cl_uchar8 __attribute__((vector_size(8))); + typedef cl_char __cl_char8 __attribute__((vector_size(8))); + typedef cl_ushort __cl_ushort4 __attribute__((vector_size(8))); + typedef cl_short __cl_short4 __attribute__((vector_size(8))); + typedef cl_uint __cl_uint2 __attribute__((vector_size(8))); + typedef cl_int __cl_int2 __attribute__((vector_size(8))); + typedef cl_ulong __cl_ulong1 __attribute__((vector_size(8))); + typedef cl_long __cl_long1 __attribute__((vector_size(8))); + typedef cl_float __cl_float2 __attribute__((vector_size(8))); + #else + typedef __m64 __cl_uchar8; + typedef __m64 __cl_char8; + typedef __m64 __cl_ushort4; + typedef __m64 __cl_short4; + typedef __m64 __cl_uint2; + typedef __m64 __cl_int2; + typedef __m64 __cl_ulong1; + typedef __m64 __cl_long1; + typedef __m64 __cl_float2; + #endif + #define __CL_UCHAR8__ 1 + #define __CL_CHAR8__ 1 + #define __CL_USHORT4__ 1 + #define __CL_SHORT4__ 1 + #define __CL_INT2__ 1 + #define __CL_UINT2__ 1 + #define __CL_ULONG1__ 1 + #define __CL_LONG1__ 1 + #define __CL_FLOAT2__ 1 +#endif + +#if defined( __AVX__ ) + #if defined( __MINGW64__ ) + #include <intrin.h> + #else + #include <immintrin.h> + #endif + #if defined( __GNUC__ ) + typedef cl_float __cl_float8 __attribute__((vector_size(32))); + typedef cl_double __cl_double4 __attribute__((vector_size(32))); + #else + typedef __m256 __cl_float8; + typedef __m256d __cl_double4; + #endif + #define __CL_FLOAT8__ 1 + #define __CL_DOUBLE4__ 1 +#endif + +/* Define alignment keys */ +#if defined( __GNUC__ ) + #define CL_ALIGNED(_x) __attribute__ ((aligned(_x))) +#elif defined( _WIN32) && (_MSC_VER) + /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements */ + /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx */ + /* #include <crtdefs.h> */ + /* #define CL_ALIGNED(_x) _CRT_ALIGN(_x) */ + #define CL_ALIGNED(_x) +#else + #warning Need to implement some
method to align data here + #define CL_ALIGNED(_x) +#endif + +/* Indicate whether .xyzw, .s0123 and .hi.lo are supported */ +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + /* .xyzw and .s0123...{f|F} are supported */ + #define CL_HAS_NAMED_VECTOR_FIELDS 1 + /* .hi and .lo are supported */ + #define CL_HAS_HI_LO_VECTOR_FIELDS 1 +#endif + +/* Define cl_vector types */ + +/* ---- cl_charn ---- */ +typedef union +{ + cl_char CL_ALIGNED(2) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_char x, y; }; + __extension__ struct{ cl_char s0, s1; }; + __extension__ struct{ cl_char lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2; +#endif +}cl_char2; + +typedef union +{ + cl_char CL_ALIGNED(4) s[4]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_char x, y, z, w; }; + __extension__ struct{ cl_char s0, s1, s2, s3; }; + __extension__ struct{ cl_char2 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[2]; +#endif +#if defined( __CL_CHAR4__) + __cl_char4 v4; +#endif +}cl_char4; + +/* cl_char3 is identical in size, alignment and behavior to cl_char4. See section 6.1.5. */ +typedef cl_char4 cl_char3; + +typedef union +{ + cl_char CL_ALIGNED(8) s[8]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_char x, y, z, w; }; + __extension__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_char4 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[4]; +#endif +#if defined( __CL_CHAR4__) + __cl_char4 v4[2]; +#endif +#if defined( __CL_CHAR8__ ) + __cl_char8 v8; +#endif +}cl_char8; + +typedef union +{ + cl_char CL_ALIGNED(16) s[16]; +#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_char x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_char8 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[8]; +#endif +#if defined( __CL_CHAR4__) + __cl_char4 v4[4]; +#endif +#if defined( __CL_CHAR8__ ) + __cl_char8 v8[2]; +#endif +#if defined( __CL_CHAR16__ ) + __cl_char16 v16; +#endif +}cl_char16; + + +/* ---- cl_ucharn ---- */ +typedef union +{ + cl_uchar CL_ALIGNED(2) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_uchar x, y; }; + __extension__ struct{ cl_uchar s0, s1; }; + __extension__ struct{ cl_uchar lo, hi; }; +#endif +#if defined( __cl_uchar2__) + __cl_uchar2 v2; +#endif +}cl_uchar2; + +typedef union +{ + cl_uchar CL_ALIGNED(4) s[4]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_uchar x, y, z, w; }; + __extension__ struct{ cl_uchar s0, s1, s2, s3; }; + __extension__ struct{ cl_uchar2 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[2]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4; +#endif +}cl_uchar4; + +/* cl_uchar3 is identical in size, alignment and behavior to cl_uchar4. See section 6.1.5. */ +typedef cl_uchar4 cl_uchar3; + +typedef union +{ + cl_uchar CL_ALIGNED(8) s[8]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_uchar x, y, z, w; }; + __extension__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_uchar4 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[4]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4[2]; +#endif +#if defined( __CL_UCHAR8__ ) + __cl_uchar8 v8; +#endif +}cl_uchar8; + +typedef union +{ + cl_uchar CL_ALIGNED(16) s[16]; +#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_uchar x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_uchar8 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[8]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4[4]; +#endif +#if defined( __CL_UCHAR8__ ) + __cl_uchar8 v8[2]; +#endif +#if defined( __CL_UCHAR16__ ) + __cl_uchar16 v16; +#endif +}cl_uchar16; + + +/* ---- cl_shortn ---- */ +typedef union +{ + cl_short CL_ALIGNED(4) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_short x, y; }; + __extension__ struct{ cl_short s0, s1; }; + __extension__ struct{ cl_short lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2; +#endif +}cl_short2; + +typedef union +{ + cl_short CL_ALIGNED(8) s[4]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_short x, y, z, w; }; + __extension__ struct{ cl_short s0, s1, s2, s3; }; + __extension__ struct{ cl_short2 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[2]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4; +#endif +}cl_short4; + +/* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. */ +typedef cl_short4 cl_short3; + +typedef union +{ + cl_short CL_ALIGNED(16) s[8]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_short x, y, z, w; }; + __extension__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_short4 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[4]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4[2]; +#endif +#if defined( __CL_SHORT8__ ) + __cl_short8 v8; +#endif +}cl_short8; + +typedef union +{ + cl_short CL_ALIGNED(32) s[16]; +#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_short x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_short8 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[8]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4[4]; +#endif +#if defined( __CL_SHORT8__ ) + __cl_short8 v8[2]; +#endif +#if defined( __CL_SHORT16__ ) + __cl_short16 v16; +#endif +}cl_short16; + + +/* ---- cl_ushortn ---- */ +typedef union +{ + cl_ushort CL_ALIGNED(4) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_ushort x, y; }; + __extension__ struct{ cl_ushort s0, s1; }; + __extension__ struct{ cl_ushort lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2; +#endif +}cl_ushort2; + +typedef union +{ + cl_ushort CL_ALIGNED(8) s[4]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_ushort x, y, z, w; }; + __extension__ struct{ cl_ushort s0, s1, s2, s3; }; + __extension__ struct{ cl_ushort2 lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2[2]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4; +#endif +}cl_ushort4; + +/* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. */ +typedef cl_ushort4 cl_ushort3; + +typedef union +{ + cl_ushort CL_ALIGNED(16) s[8]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_ushort x, y, z, w; }; + __extension__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_ushort4 lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2[4]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4[2]; +#endif +#if defined( __CL_USHORT8__ ) + __cl_ushort8 v8; +#endif +}cl_ushort8; + +typedef union +{ + cl_ushort CL_ALIGNED(32) s[16]; +#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_ushort x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_ushort8 lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2[8]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4[4]; +#endif +#if defined( __CL_USHORT8__ ) + __cl_ushort8 v8[2]; +#endif +#if defined( __CL_USHORT16__ ) + __cl_ushort16 v16; +#endif +}cl_ushort16; + +/* ---- cl_intn ---- */ +typedef union +{ + cl_int CL_ALIGNED(8) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_int x, y; }; + __extension__ struct{ cl_int s0, s1; }; + __extension__ struct{ cl_int lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2; +#endif +}cl_int2; + +typedef union +{ + cl_int CL_ALIGNED(16) s[4]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_int x, y, z, w; }; + __extension__ struct{ cl_int s0, s1, s2, s3; }; + __extension__ struct{ cl_int2 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[2]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4; +#endif +}cl_int4; + +/* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. */ +typedef cl_int4 cl_int3; + +typedef union +{ + cl_int CL_ALIGNED(32) s[8]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_int x, y, z, w; }; + __extension__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_int4 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[4]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4[2]; +#endif +#if defined( __CL_INT8__ ) + __cl_int8 v8; +#endif +}cl_int8; + +typedef union +{ + cl_int CL_ALIGNED(64) s[16]; +#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_int x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_int8 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[8]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4[4]; +#endif +#if defined( __CL_INT8__ ) + __cl_int8 v8[2]; +#endif +#if defined( __CL_INT16__ ) + __cl_int16 v16; +#endif +}cl_int16; + + +/* ---- cl_uintn ---- */ +typedef union +{ + cl_uint CL_ALIGNED(8) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_uint x, y; }; + __extension__ struct{ cl_uint s0, s1; }; + __extension__ struct{ cl_uint lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2; +#endif +}cl_uint2; + +typedef union +{ + cl_uint CL_ALIGNED(16) s[4]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_uint x, y, z, w; }; + __extension__ struct{ cl_uint s0, s1, s2, s3; }; + __extension__ struct{ cl_uint2 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[2]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4; +#endif +}cl_uint4; + +/* cl_uint3 is identical in size, alignment and behavior to cl_uint4. See section 6.1.5. */ +typedef cl_uint4 cl_uint3; + +typedef union +{ + cl_uint CL_ALIGNED(32) s[8]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_uint x, y, z, w; }; + __extension__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_uint4 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[4]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4[2]; +#endif +#if defined( __CL_UINT8__ ) + __cl_uint8 v8; +#endif +}cl_uint8; + +typedef union +{ + cl_uint CL_ALIGNED(64) s[16]; +#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_uint x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_uint8 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[8]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4[4]; +#endif +#if defined( __CL_UINT8__ ) + __cl_uint8 v8[2]; +#endif +#if defined( __CL_UINT16__ ) + __cl_uint16 v16; +#endif +}cl_uint16; + +/* ---- cl_longn ---- */ +typedef union +{ + cl_long CL_ALIGNED(16) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_long x, y; }; + __extension__ struct{ cl_long s0, s1; }; + __extension__ struct{ cl_long lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2; +#endif +}cl_long2; + +typedef union +{ + cl_long CL_ALIGNED(32) s[4]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_long x, y, z, w; }; + __extension__ struct{ cl_long s0, s1, s2, s3; }; + __extension__ struct{ cl_long2 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[2]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4; +#endif +}cl_long4; + +/* cl_long3 is identical in size, alignment and behavior to cl_long4. See section 6.1.5. */ +typedef cl_long4 cl_long3; + +typedef union +{ + cl_long CL_ALIGNED(64) s[8]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_long x, y, z, w; }; + __extension__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_long4 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[4]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4[2]; +#endif +#if defined( __CL_LONG8__ ) + __cl_long8 v8; +#endif +}cl_long8; + +typedef union +{ + cl_long CL_ALIGNED(128) s[16]; +#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_long x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_long8 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[8]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4[4]; +#endif +#if defined( __CL_LONG8__ ) + __cl_long8 v8[2]; +#endif +#if defined( __CL_LONG16__ ) + __cl_long16 v16; +#endif +}cl_long16; + + +/* ---- cl_ulongn ---- */ +typedef union +{ + cl_ulong CL_ALIGNED(16) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_ulong x, y; }; + __extension__ struct{ cl_ulong s0, s1; }; + __extension__ struct{ cl_ulong lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2; +#endif +}cl_ulong2; + +typedef union +{ + cl_ulong CL_ALIGNED(32) s[4]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_ulong x, y, z, w; }; + __extension__ struct{ cl_ulong s0, s1, s2, s3; }; + __extension__ struct{ cl_ulong2 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[2]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4; +#endif +}cl_ulong4; + +/* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4. See section 6.1.5. */ +typedef cl_ulong4 cl_ulong3; + +typedef union +{ + cl_ulong CL_ALIGNED(64) s[8]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_ulong x, y, z, w; }; + __extension__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_ulong4 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[4]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4[2]; +#endif +#if defined( __CL_ULONG8__ ) + __cl_ulong8 v8; +#endif +}cl_ulong8; + +typedef union +{ + cl_ulong CL_ALIGNED(128) s[16]; +#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_ulong x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_ulong8 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[8]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4[4]; +#endif +#if defined( __CL_ULONG8__ ) + __cl_ulong8 v8[2]; +#endif +#if defined( __CL_ULONG16__ ) + __cl_ulong16 v16; +#endif +}cl_ulong16; + + +/* --- cl_floatn ---- */ + +typedef union +{ + cl_float CL_ALIGNED(8) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_float x, y; }; + __extension__ struct{ cl_float s0, s1; }; + __extension__ struct{ cl_float lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2; +#endif +}cl_float2; + +typedef union +{ + cl_float CL_ALIGNED(16) s[4]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_float x, y, z, w; }; + __extension__ struct{ cl_float s0, s1, s2, s3; }; + __extension__ struct{ cl_float2 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[2]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4; +#endif +}cl_float4; + +/* cl_float3 is identical in size, alignment and behavior to cl_float4. See section 6.1.5. */ +typedef cl_float4 cl_float3; + +typedef union +{ + cl_float CL_ALIGNED(32) s[8]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_float x, y, z, w; }; + __extension__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_float4 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[4]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4[2]; +#endif +#if defined( __CL_FLOAT8__ ) + __cl_float8 v8; +#endif +}cl_float8; + +typedef union +{ + cl_float CL_ALIGNED(64) s[16]; +#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_float x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_float8 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[8]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4[4]; +#endif +#if defined( __CL_FLOAT8__ ) + __cl_float8 v8[2]; +#endif +#if defined( __CL_FLOAT16__ ) + __cl_float16 v16; +#endif +}cl_float16; + +/* --- cl_doublen ---- */ + +typedef union +{ + cl_double CL_ALIGNED(16) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_double x, y; }; + __extension__ struct{ cl_double s0, s1; }; + __extension__ struct{ cl_double lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2; +#endif +}cl_double2; + +typedef union +{ + cl_double CL_ALIGNED(32) s[4]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_double x, y, z, w; }; + __extension__ struct{ cl_double s0, s1, s2, s3; }; + __extension__ struct{ cl_double2 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[2]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4; +#endif +}cl_double4; + +/* cl_double3 is identical in size, alignment and behavior to cl_double4. See section 6.1.5. */ +typedef cl_double4 cl_double3; + +typedef union +{ + cl_double CL_ALIGNED(64) s[8]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_double x, y, z, w; }; + __extension__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_double4 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[4]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4[2]; +#endif +#if defined( __CL_DOUBLE8__ ) + __cl_double8 v8; +#endif +}cl_double8; + +typedef union +{ + cl_double CL_ALIGNED(128) s[16]; +#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_double x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_double8 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[8]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4[4]; +#endif +#if defined( __CL_DOUBLE8__ ) + __cl_double8 v8[2]; +#endif +#if defined( __CL_DOUBLE16__ ) + __cl_double16 v16; +#endif +}cl_double16; + +/* Macro to facilitate debugging + * Usage: + * Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source. + * The first line ends with: CL_PROGRAM_STRING_BEGIN \" + * Each line thereafter of OpenCL C source must end with: \n\ + * The last line ends in "; + * + * Example: + * + * const char *my_program = CL_PROGRAM_STRING_BEGIN "\ + * kernel void foo( int a, float * b ) \n\ + * { \n\ + * // my comment \n\ + * *b[ get_global_id(0)] = a; \n\ + * } \n\ + * "; + * + * This should correctly set up the line, (column) and file information for your source + * string so you can do source level debugging. + */ +#define __CL_STRINGIFY( _x ) # _x +#define _CL_STRINGIFY( _x ) __CL_STRINGIFY( _x ) +#define CL_PROGRAM_STRING_DEBUG_INFO "#line " _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n" + +#ifdef __cplusplus +} +#endif + +#endif /* __CL_PLATFORM_H */ diff --git a/libs/clew/CL/opencl.h b/libs/clew/CL/opencl.h new file mode 100644 index 0000000..9855cd7 --- /dev/null +++ b/libs/clew/CL/opencl.h @@ -0,0 +1,59 @@ +/******************************************************************************* + * Copyright (c) 2008-2015 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+ ******************************************************************************/ + +/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ + +#ifndef __OPENCL_H +#define __OPENCL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __APPLE__ + +#include +#include +#include +#include + +#else + +#include +#include +#include +#include + +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_H */ + diff --git a/libs/clew/CMakeLists.txt b/libs/clew/CMakeLists.txt new file mode 100644 index 0000000..99e7308 --- /dev/null +++ b/libs/clew/CMakeLists.txt @@ -0,0 +1,22 @@ +cmake_minimum_required(VERSION 3.1) + +project(libclew) + +set(HEADERS + CL/cl.h + CL/cl_d3d10.h + CL/cl_ext.h + CL/cl_gl.h + CL/cl_gl_ext.h + CL/cl_platform.h + CL/opencl.h + libclew/ocl_init.h + ) + +set(SOURCES + libclew/ocl_init.cpp + ) + +add_library(${PROJECT_NAME} ${HEADERS} ${SOURCES}) +target_link_libraries (${PROJECT_NAME} ${CMAKE_DL_LIBS}) +target_include_directories(${PROJECT_NAME} PUBLIC ${PROJECT_SOURCE_DIR}) diff --git a/libs/clew/libclew/ocl_init.cpp b/libs/clew/libclew/ocl_init.cpp new file mode 100644 index 0000000..c1e4be1 --- /dev/null +++ b/libs/clew/libclew/ocl_init.cpp @@ -0,0 +1,1142 @@ +#include + +#ifdef _WIN32 + +#include + +typedef HMODULE OclLibrary; + +HMODULE oclLoadLibrary(void) +{ + return LoadLibraryW(L"OpenCL.dll"); +} + +FARPROC oclGetProcAddress(HMODULE hModule, LPCSTR lpProcName) +{ + return ::GetProcAddress(hModule, lpProcName); +} + +#elif defined(__unix__) || defined(__APPLE__) || defined(__MACOSX) + +#include + +typedef void * OclLibrary; + +OclLibrary oclLoadLibrary(void) +{ +#if defined(__APPLE__) || defined(__MACOSX) + return dlopen("/System/Library/Frameworks/OpenCL.framework/Versions/Current/OpenCL", RTLD_NOW); +#else + OclLibrary lib = dlopen("libOpenCL.so", RTLD_NOW); + if (!lib) { + lib = dlopen("libOpenCL.so.1", RTLD_NOW); + } + return lib; +#endif +} + +void *oclGetProcAddress(void *handle, const char *symbol) +{ + 
return dlsym(handle, symbol); +} + +#else +#error unsupported platform +#endif + +// Platform API + +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clGetPlatformIDs) (cl_uint, cl_platform_id *, cl_uint *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clGetPlatformInfo) (cl_platform_id, cl_platform_info, size_t, void *, size_t *); + +// Device APIs + +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clGetDeviceIDs) (cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clGetDeviceInfo) (cl_device_id, cl_device_info, size_t, void *, size_t *); + +// Context APIs + +typedef cl_context (CL_API_ENTRY CL_API_CALL * p_pfn_clCreateContext) (const cl_context_properties *, cl_uint, const cl_device_id *, void (CL_CALLBACK *)(const char *, const void *, size_t, void *), void *, cl_int *); +typedef cl_context (CL_API_ENTRY CL_API_CALL * p_pfn_clCreateContextFromType) (const cl_context_properties *, cl_device_type, void (CL_CALLBACK *)(const char *, const void *, size_t, void *), void *, cl_int *); + +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clRetainContext) (cl_context); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clReleaseContext) (cl_context); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clGetContextInfo) (cl_context, cl_context_info, size_t, void *, size_t *); + +// Command Queue APIs + +typedef cl_command_queue (CL_API_ENTRY CL_API_CALL * p_pfn_clCreateCommandQueue) (cl_context, cl_device_id, cl_command_queue_properties, cl_int *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clRetainCommandQueue) (cl_command_queue); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clReleaseCommandQueue) (cl_command_queue); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clGetCommandQueueInfo) (cl_command_queue, cl_command_queue_info, size_t, void *, size_t *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clSetCommandQueueProperty) (cl_command_queue, cl_command_queue_properties, cl_bool, 
cl_command_queue_properties *); + +// Memory Object APIs + +typedef cl_mem (CL_API_ENTRY CL_API_CALL * p_pfn_clCreateBuffer) (cl_context, cl_mem_flags, size_t, void *, cl_int *); +typedef cl_mem (CL_API_ENTRY CL_API_CALL * p_pfn_clCreateImage2D) (cl_context, cl_mem_flags, const cl_image_format *, size_t, size_t, size_t, void *, cl_int *); +typedef cl_mem (CL_API_ENTRY CL_API_CALL * p_pfn_clCreateImage3D) (cl_context, cl_mem_flags, const cl_image_format *, size_t, size_t, size_t, size_t, size_t, void *, cl_int *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clRetainMemObject) (cl_mem); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clReleaseMemObject) (cl_mem); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clGetSupportedImageFormats) (cl_context, cl_mem_flags, cl_mem_object_type, cl_uint, cl_image_format *, cl_uint *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clGetMemObjectInfo) (cl_mem, cl_mem_info, size_t, void *, size_t *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clGetImageInfo) (cl_mem, cl_image_info, size_t, void *, size_t *); + +// Sampler APIs + +typedef cl_sampler (CL_API_ENTRY CL_API_CALL * p_pfn_clCreateSampler) (cl_context, cl_bool, cl_addressing_mode, cl_filter_mode, cl_int *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clRetainSampler) (cl_sampler); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clReleaseSampler) (cl_sampler); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clGetSamplerInfo) (cl_sampler, cl_sampler_info, size_t, void *, size_t *); + +// Program Object APIs + +typedef cl_program (CL_API_ENTRY CL_API_CALL * p_pfn_clCreateProgramWithSource) (cl_context, cl_uint, const char **, const size_t *, cl_int *); +typedef cl_program (CL_API_ENTRY CL_API_CALL * p_pfn_clCreateProgramWithBinary) (cl_context, cl_uint, const cl_device_id *, const size_t *, const unsigned char **, cl_int *, cl_int *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clRetainProgram) (cl_program); +typedef cl_int (CL_API_ENTRY 
CL_API_CALL * p_pfn_clReleaseProgram) (cl_program); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clBuildProgram) (cl_program, cl_uint, const cl_device_id *, const char *, void (CL_CALLBACK *)(cl_program, void *), void *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clUnloadCompiler) (void); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clGetProgramInfo) (cl_program, cl_program_info, size_t, void *, size_t *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clGetProgramBuildInfo) (cl_program, cl_device_id, cl_program_build_info, size_t, void *, size_t *); + +// Kernel Object APIs + +typedef cl_kernel (CL_API_ENTRY CL_API_CALL * p_pfn_clCreateKernel) (cl_program, const char *, cl_int *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clCreateKernelsInProgram) (cl_program, cl_uint, cl_kernel *, cl_uint *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clRetainKernel) (cl_kernel); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clReleaseKernel) (cl_kernel); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clSetKernelArg) (cl_kernel, cl_uint, size_t, const void *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clGetKernelInfo) (cl_kernel, cl_kernel_info, size_t, void *, size_t *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clGetKernelWorkGroupInfo) (cl_kernel, cl_device_id, cl_kernel_work_group_info, size_t, void *, size_t *); + +// Event Object APIs + +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clWaitForEvents) (cl_uint, const cl_event *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clGetEventInfo) (cl_event, cl_event_info, size_t, void *, size_t *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clRetainEvent) (cl_event); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clReleaseEvent) (cl_event); + +// Profiling APIs + +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clGetEventProfilingInfo) (cl_event, cl_profiling_info, size_t, void *, size_t *); + +// Flush and Finish APIs + +typedef cl_int (CL_API_ENTRY 
CL_API_CALL * p_pfn_clFlush) (cl_command_queue); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clFinish) (cl_command_queue); + +// Enqueued Commands APIs + +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueReadBuffer) (cl_command_queue, cl_mem, cl_bool, size_t, size_t, void *, cl_uint, const cl_event *, cl_event *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueReadBufferRect) (cl_command_queue, cl_mem, cl_bool, const size_t *, const size_t *, const size_t *, size_t, size_t, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueWriteBuffer) (cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueWriteBufferRect) (cl_command_queue, cl_mem, cl_bool, const size_t *, const size_t *, const size_t *, size_t, size_t, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueCopyBuffer) (cl_command_queue, cl_mem, cl_mem, size_t, size_t, size_t, cl_uint, const cl_event *, cl_event *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueReadImage) (cl_command_queue, cl_mem, cl_bool, const size_t * [], const size_t * [], size_t, size_t, void *, cl_uint, const cl_event *, cl_event *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueWriteImage) (cl_command_queue, cl_mem, cl_bool, const size_t * [], const size_t * [], size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueCopyImage) (cl_command_queue, cl_mem, cl_mem, const size_t * [], const size_t * [], const size_t * [], cl_uint, const cl_event *, cl_event *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueCopyImageToBuffer) (cl_command_queue, cl_mem, cl_mem, const size_t * [], const size_t * [], size_t, cl_uint, const cl_event *, cl_event *); 
+typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueCopyBufferToImage) (cl_command_queue, cl_mem, cl_mem, size_t, const size_t * [], const size_t * [], cl_uint, const cl_event *, cl_event *); +typedef void * (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueMapBuffer) (cl_command_queue, cl_mem, cl_bool, cl_map_flags, size_t, size_t, cl_uint, const cl_event *, cl_event *, cl_int *); +typedef void * (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueMapImage) (cl_command_queue, cl_mem, cl_bool, cl_map_flags, const size_t *, const size_t *, size_t *, size_t *, cl_uint, const cl_event *, cl_event *, cl_int *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueUnmapMemObject) (cl_command_queue, cl_mem, void *, cl_uint, const cl_event *, cl_event *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueNDRangeKernel) (cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueTask) (cl_command_queue, cl_kernel, cl_uint, const cl_event *, cl_event *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueNativeKernel) (cl_command_queue, void (CL_CALLBACK *)(void *), void *, size_t, cl_uint, const cl_mem *, const void **, cl_uint, const cl_event *, cl_event *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueMarker) (cl_command_queue, cl_event *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueWaitForEvents) (cl_command_queue, cl_uint, const cl_event *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueBarrier) (cl_command_queue); + +// Extension function access +// +// Returns the extension function address for the given function name, +// or NULL if a valid function can not be found. The client must +// check to make sure the address is not NULL, before using or +// calling the returned function address. 
+// + +typedef void * (CL_API_ENTRY CL_API_CALL * p_pfn_clGetExtensionFunctionAddress)(const char *); + +p_pfn_clGetPlatformIDs pfn_clGetPlatformIDs = 0; +p_pfn_clGetPlatformInfo pfn_clGetPlatformInfo = 0; +p_pfn_clGetDeviceIDs pfn_clGetDeviceIDs = 0; +p_pfn_clGetDeviceInfo pfn_clGetDeviceInfo = 0; +p_pfn_clCreateContext pfn_clCreateContext = 0; +p_pfn_clCreateContextFromType pfn_clCreateContextFromType = 0; +p_pfn_clRetainContext pfn_clRetainContext = 0; +p_pfn_clReleaseContext pfn_clReleaseContext = 0; +p_pfn_clGetContextInfo pfn_clGetContextInfo = 0; +p_pfn_clCreateCommandQueue pfn_clCreateCommandQueue = 0; +p_pfn_clRetainCommandQueue pfn_clRetainCommandQueue = 0; +p_pfn_clReleaseCommandQueue pfn_clReleaseCommandQueue = 0; +p_pfn_clGetCommandQueueInfo pfn_clGetCommandQueueInfo = 0; +p_pfn_clSetCommandQueueProperty pfn_clSetCommandQueueProperty = 0; +p_pfn_clCreateBuffer pfn_clCreateBuffer = 0; +p_pfn_clCreateImage2D pfn_clCreateImage2D = 0; +p_pfn_clCreateImage3D pfn_clCreateImage3D = 0; +p_pfn_clRetainMemObject pfn_clRetainMemObject = 0; +p_pfn_clReleaseMemObject pfn_clReleaseMemObject = 0; +p_pfn_clGetSupportedImageFormats pfn_clGetSupportedImageFormats = 0; +p_pfn_clGetMemObjectInfo pfn_clGetMemObjectInfo = 0; +p_pfn_clGetImageInfo pfn_clGetImageInfo = 0; +p_pfn_clCreateSampler pfn_clCreateSampler = 0; +p_pfn_clRetainSampler pfn_clRetainSampler = 0; +p_pfn_clReleaseSampler pfn_clReleaseSampler = 0; +p_pfn_clGetSamplerInfo pfn_clGetSamplerInfo = 0; +p_pfn_clCreateProgramWithSource pfn_clCreateProgramWithSource = 0; +p_pfn_clCreateProgramWithBinary pfn_clCreateProgramWithBinary = 0; +p_pfn_clRetainProgram pfn_clRetainProgram = 0; +p_pfn_clReleaseProgram pfn_clReleaseProgram = 0; +p_pfn_clBuildProgram pfn_clBuildProgram = 0; +p_pfn_clUnloadCompiler pfn_clUnloadCompiler = 0; +p_pfn_clGetProgramInfo pfn_clGetProgramInfo = 0; +p_pfn_clGetProgramBuildInfo pfn_clGetProgramBuildInfo = 0; +p_pfn_clCreateKernel pfn_clCreateKernel = 0; +p_pfn_clCreateKernelsInProgram 
pfn_clCreateKernelsInProgram = 0; +p_pfn_clRetainKernel pfn_clRetainKernel = 0; +p_pfn_clReleaseKernel pfn_clReleaseKernel = 0; +p_pfn_clSetKernelArg pfn_clSetKernelArg = 0; +p_pfn_clGetKernelInfo pfn_clGetKernelInfo = 0; +p_pfn_clGetKernelWorkGroupInfo pfn_clGetKernelWorkGroupInfo = 0; +p_pfn_clWaitForEvents pfn_clWaitForEvents = 0; +p_pfn_clGetEventInfo pfn_clGetEventInfo = 0; +p_pfn_clRetainEvent pfn_clRetainEvent = 0; +p_pfn_clReleaseEvent pfn_clReleaseEvent = 0; +p_pfn_clGetEventProfilingInfo pfn_clGetEventProfilingInfo = 0; +p_pfn_clFlush pfn_clFlush = 0; +p_pfn_clFinish pfn_clFinish = 0; +p_pfn_clEnqueueReadBuffer pfn_clEnqueueReadBuffer = 0; +p_pfn_clEnqueueReadBufferRect pfn_clEnqueueReadBufferRect = 0; +p_pfn_clEnqueueWriteBuffer pfn_clEnqueueWriteBuffer = 0; +p_pfn_clEnqueueWriteBufferRect pfn_clEnqueueWriteBufferRect = 0; +p_pfn_clEnqueueCopyBuffer pfn_clEnqueueCopyBuffer = 0; +p_pfn_clEnqueueReadImage pfn_clEnqueueReadImage = 0; +p_pfn_clEnqueueWriteImage pfn_clEnqueueWriteImage = 0; +p_pfn_clEnqueueCopyImage pfn_clEnqueueCopyImage = 0; +p_pfn_clEnqueueCopyImageToBuffer pfn_clEnqueueCopyImageToBuffer = 0; +p_pfn_clEnqueueCopyBufferToImage pfn_clEnqueueCopyBufferToImage = 0; +p_pfn_clEnqueueMapBuffer pfn_clEnqueueMapBuffer = 0; +p_pfn_clEnqueueMapImage pfn_clEnqueueMapImage = 0; +p_pfn_clEnqueueUnmapMemObject pfn_clEnqueueUnmapMemObject = 0; +p_pfn_clEnqueueNDRangeKernel pfn_clEnqueueNDRangeKernel = 0; +p_pfn_clEnqueueTask pfn_clEnqueueTask = 0; +p_pfn_clEnqueueNativeKernel pfn_clEnqueueNativeKernel = 0; +p_pfn_clEnqueueMarker pfn_clEnqueueMarker = 0; +p_pfn_clEnqueueWaitForEvents pfn_clEnqueueWaitForEvents = 0; +p_pfn_clEnqueueBarrier pfn_clEnqueueBarrier = 0; +p_pfn_clGetExtensionFunctionAddress pfn_clGetExtensionFunctionAddress = 0; + +int ocl_init(void) +{ + if (pfn_clGetPlatformIDs) return 1; + + OclLibrary lib = oclLoadLibrary(); + if (!lib) return 0; + + pfn_clGetPlatformIDs = (p_pfn_clGetPlatformIDs) oclGetProcAddress(lib, "clGetPlatformIDs"); + 
pfn_clGetPlatformInfo = (p_pfn_clGetPlatformInfo) oclGetProcAddress(lib, "clGetPlatformInfo"); + pfn_clGetDeviceIDs = (p_pfn_clGetDeviceIDs) oclGetProcAddress(lib, "clGetDeviceIDs"); + pfn_clGetDeviceInfo = (p_pfn_clGetDeviceInfo) oclGetProcAddress(lib, "clGetDeviceInfo"); + pfn_clCreateContext = (p_pfn_clCreateContext) oclGetProcAddress(lib, "clCreateContext"); + pfn_clCreateContextFromType = (p_pfn_clCreateContextFromType) oclGetProcAddress(lib, "clCreateContextFromType"); + pfn_clRetainContext = (p_pfn_clRetainContext) oclGetProcAddress(lib, "clRetainContext"); + pfn_clReleaseContext = (p_pfn_clReleaseContext) oclGetProcAddress(lib, "clReleaseContext"); + pfn_clGetContextInfo = (p_pfn_clGetContextInfo) oclGetProcAddress(lib, "clGetContextInfo"); + pfn_clCreateCommandQueue = (p_pfn_clCreateCommandQueue) oclGetProcAddress(lib, "clCreateCommandQueue"); + pfn_clRetainCommandQueue = (p_pfn_clRetainCommandQueue) oclGetProcAddress(lib, "clRetainCommandQueue"); + pfn_clReleaseCommandQueue = (p_pfn_clReleaseCommandQueue) oclGetProcAddress(lib, "clReleaseCommandQueue"); + pfn_clGetCommandQueueInfo = (p_pfn_clGetCommandQueueInfo) oclGetProcAddress(lib, "clGetCommandQueueInfo"); + pfn_clSetCommandQueueProperty = (p_pfn_clSetCommandQueueProperty) oclGetProcAddress(lib, "clSetCommandQueueProperty"); + pfn_clCreateBuffer = (p_pfn_clCreateBuffer) oclGetProcAddress(lib, "clCreateBuffer"); + pfn_clCreateImage2D = (p_pfn_clCreateImage2D) oclGetProcAddress(lib, "clCreateImage2D"); + pfn_clCreateImage3D = (p_pfn_clCreateImage3D) oclGetProcAddress(lib, "clCreateImage3D"); + pfn_clRetainMemObject = (p_pfn_clRetainMemObject) oclGetProcAddress(lib, "clRetainMemObject"); + pfn_clReleaseMemObject = (p_pfn_clReleaseMemObject) oclGetProcAddress(lib, "clReleaseMemObject"); + pfn_clGetSupportedImageFormats = (p_pfn_clGetSupportedImageFormats) oclGetProcAddress(lib, "clGetSupportedImageFormats"); + pfn_clGetMemObjectInfo = (p_pfn_clGetMemObjectInfo) oclGetProcAddress(lib, 
"clGetMemObjectInfo"); + pfn_clGetImageInfo = (p_pfn_clGetImageInfo) oclGetProcAddress(lib, "clGetImageInfo"); + pfn_clCreateSampler = (p_pfn_clCreateSampler) oclGetProcAddress(lib, "clCreateSampler"); + pfn_clRetainSampler = (p_pfn_clRetainSampler) oclGetProcAddress(lib, "clRetainSampler"); + pfn_clReleaseSampler = (p_pfn_clReleaseSampler) oclGetProcAddress(lib, "clReleaseSampler"); + pfn_clGetSamplerInfo = (p_pfn_clGetSamplerInfo) oclGetProcAddress(lib, "clGetSamplerInfo"); + pfn_clCreateProgramWithSource = (p_pfn_clCreateProgramWithSource) oclGetProcAddress(lib, "clCreateProgramWithSource"); + pfn_clCreateProgramWithBinary = (p_pfn_clCreateProgramWithBinary) oclGetProcAddress(lib, "clCreateProgramWithBinary"); + pfn_clRetainProgram = (p_pfn_clRetainProgram) oclGetProcAddress(lib, "clRetainProgram"); + pfn_clReleaseProgram = (p_pfn_clReleaseProgram) oclGetProcAddress(lib, "clReleaseProgram"); + pfn_clBuildProgram = (p_pfn_clBuildProgram) oclGetProcAddress(lib, "clBuildProgram"); + pfn_clUnloadCompiler = (p_pfn_clUnloadCompiler) oclGetProcAddress(lib, "clUnloadCompiler"); + pfn_clGetProgramInfo = (p_pfn_clGetProgramInfo) oclGetProcAddress(lib, "clGetProgramInfo"); + pfn_clGetProgramBuildInfo = (p_pfn_clGetProgramBuildInfo) oclGetProcAddress(lib, "clGetProgramBuildInfo"); + pfn_clCreateKernel = (p_pfn_clCreateKernel) oclGetProcAddress(lib, "clCreateKernel"); + pfn_clCreateKernelsInProgram = (p_pfn_clCreateKernelsInProgram) oclGetProcAddress(lib, "clCreateKernelsInProgram"); + pfn_clRetainKernel = (p_pfn_clRetainKernel) oclGetProcAddress(lib, "clRetainKernel"); + pfn_clReleaseKernel = (p_pfn_clReleaseKernel) oclGetProcAddress(lib, "clReleaseKernel"); + pfn_clSetKernelArg = (p_pfn_clSetKernelArg) oclGetProcAddress(lib, "clSetKernelArg"); + pfn_clGetKernelInfo = (p_pfn_clGetKernelInfo) oclGetProcAddress(lib, "clGetKernelInfo"); + pfn_clGetKernelWorkGroupInfo = (p_pfn_clGetKernelWorkGroupInfo) oclGetProcAddress(lib, "clGetKernelWorkGroupInfo"); + pfn_clWaitForEvents = 
(p_pfn_clWaitForEvents) oclGetProcAddress(lib, "clWaitForEvents"); + pfn_clGetEventInfo = (p_pfn_clGetEventInfo) oclGetProcAddress(lib, "clGetEventInfo"); + pfn_clRetainEvent = (p_pfn_clRetainEvent) oclGetProcAddress(lib, "clRetainEvent"); + pfn_clReleaseEvent = (p_pfn_clReleaseEvent) oclGetProcAddress(lib, "clReleaseEvent"); + pfn_clGetEventProfilingInfo = (p_pfn_clGetEventProfilingInfo) oclGetProcAddress(lib, "clGetEventProfilingInfo"); + pfn_clFlush = (p_pfn_clFlush) oclGetProcAddress(lib, "clFlush"); + pfn_clFinish = (p_pfn_clFinish) oclGetProcAddress(lib, "clFinish"); + pfn_clEnqueueReadBuffer = (p_pfn_clEnqueueReadBuffer) oclGetProcAddress(lib, "clEnqueueReadBuffer"); + pfn_clEnqueueReadBufferRect = (p_pfn_clEnqueueReadBufferRect) oclGetProcAddress(lib, "clEnqueueReadBufferRect"); + pfn_clEnqueueWriteBuffer = (p_pfn_clEnqueueWriteBuffer) oclGetProcAddress(lib, "clEnqueueWriteBuffer"); + pfn_clEnqueueWriteBufferRect = (p_pfn_clEnqueueWriteBufferRect) oclGetProcAddress(lib, "clEnqueueWriteBufferRect"); + pfn_clEnqueueCopyBuffer = (p_pfn_clEnqueueCopyBuffer) oclGetProcAddress(lib, "clEnqueueCopyBuffer"); + pfn_clEnqueueReadImage = (p_pfn_clEnqueueReadImage) oclGetProcAddress(lib, "clEnqueueReadImage"); + pfn_clEnqueueWriteImage = (p_pfn_clEnqueueWriteImage) oclGetProcAddress(lib, "clEnqueueWriteImage"); + pfn_clEnqueueCopyImage = (p_pfn_clEnqueueCopyImage) oclGetProcAddress(lib, "clEnqueueCopyImage"); + pfn_clEnqueueCopyImageToBuffer = (p_pfn_clEnqueueCopyImageToBuffer) oclGetProcAddress(lib, "clEnqueueCopyImageToBuffer"); + pfn_clEnqueueCopyBufferToImage = (p_pfn_clEnqueueCopyBufferToImage) oclGetProcAddress(lib, "clEnqueueCopyBufferToImage"); + pfn_clEnqueueMapBuffer = (p_pfn_clEnqueueMapBuffer) oclGetProcAddress(lib, "clEnqueueMapBuffer"); + pfn_clEnqueueMapImage = (p_pfn_clEnqueueMapImage) oclGetProcAddress(lib, "clEnqueueMapImage"); + pfn_clEnqueueUnmapMemObject = (p_pfn_clEnqueueUnmapMemObject) oclGetProcAddress(lib, "clEnqueueUnmapMemObject"); + 
pfn_clEnqueueNDRangeKernel = (p_pfn_clEnqueueNDRangeKernel) oclGetProcAddress(lib, "clEnqueueNDRangeKernel"); + pfn_clEnqueueTask = (p_pfn_clEnqueueTask) oclGetProcAddress(lib, "clEnqueueTask"); + pfn_clEnqueueNativeKernel = (p_pfn_clEnqueueNativeKernel) oclGetProcAddress(lib, "clEnqueueNativeKernel"); + pfn_clEnqueueMarker = (p_pfn_clEnqueueMarker) oclGetProcAddress(lib, "clEnqueueMarker"); + pfn_clEnqueueWaitForEvents = (p_pfn_clEnqueueWaitForEvents) oclGetProcAddress(lib, "clEnqueueWaitForEvents"); + pfn_clEnqueueBarrier = (p_pfn_clEnqueueBarrier) oclGetProcAddress(lib, "clEnqueueBarrier"); + pfn_clGetExtensionFunctionAddress = (p_pfn_clGetExtensionFunctionAddress) oclGetProcAddress(lib, "clGetExtensionFunctionAddress"); + + return 1; +} + +// Platform API +extern CL_API_ENTRY cl_int CL_API_CALL +clGetPlatformIDs(cl_uint num_entries, + cl_platform_id * platforms, + cl_uint * num_platforms) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clGetPlatformIDs) return CL_INVALID_OPERATION; + + return pfn_clGetPlatformIDs(num_entries, platforms, num_platforms); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetPlatformInfo(cl_platform_id platform, + cl_platform_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clGetPlatformInfo) return CL_INVALID_OPERATION; + + return pfn_clGetPlatformInfo(platform, param_name, param_value_size, param_value, param_value_size_ret); +} + +// Device APIs +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceIDs(cl_platform_id platform, + cl_device_type device_type, + cl_uint num_entries, + cl_device_id * devices, + cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clGetDeviceIDs) return CL_INVALID_OPERATION; + + return pfn_clGetDeviceIDs(platform, device_type, num_entries, devices, num_devices); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceInfo(cl_device_id device, + cl_device_info param_name, + size_t param_value_size, + void * 
param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clGetDeviceInfo) return CL_INVALID_OPERATION; + + return pfn_clGetDeviceInfo(device, param_name, param_value_size, param_value, param_value_size_ret); +} + +// Context APIs +extern CL_API_ENTRY cl_context CL_API_CALL +clCreateContext(const cl_context_properties * properties, + cl_uint num_devices, + const cl_device_id * devices, + void (CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *), + void * user_data, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clCreateContext) return 0; + + return pfn_clCreateContext(properties, num_devices, devices, pfn_notify, user_data, errcode_ret); +} + +extern CL_API_ENTRY cl_context CL_API_CALL +clCreateContextFromType(const cl_context_properties * properties, + cl_device_type device_type, + void (CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *), + void * user_data, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clCreateContextFromType) return 0; + + return pfn_clCreateContextFromType(properties, device_type, pfn_notify, user_data, errcode_ret); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainContext(cl_context context) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clRetainContext) return CL_INVALID_OPERATION; + + return pfn_clRetainContext(context); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseContext(cl_context context) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clReleaseContext) return CL_INVALID_OPERATION; + + return pfn_clReleaseContext(context); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetContextInfo(cl_context context, + cl_context_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clGetContextInfo) return CL_INVALID_OPERATION; + + return pfn_clGetContextInfo(context, param_name, param_value_size, param_value, param_value_size_ret); +} + +// Command Queue APIs 
+extern CL_API_ENTRY cl_command_queue CL_API_CALL +clCreateCommandQueue(cl_context context, + cl_device_id device, + cl_command_queue_properties properties, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clCreateCommandQueue) return 0; + + return pfn_clCreateCommandQueue(context, device, properties, errcode_ret); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainCommandQueue(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clRetainCommandQueue) return CL_INVALID_OPERATION; + + return pfn_clRetainCommandQueue(command_queue); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseCommandQueue(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clReleaseCommandQueue) return CL_INVALID_OPERATION; + + return pfn_clReleaseCommandQueue(command_queue); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetCommandQueueInfo(cl_command_queue command_queue, + cl_command_queue_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clGetCommandQueueInfo) return CL_INVALID_OPERATION; + + return pfn_clGetCommandQueueInfo(command_queue, param_name, param_value_size, param_value, param_value_size_ret); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetCommandQueueProperty(cl_command_queue command_queue, + cl_command_queue_properties properties, + cl_bool enable, + cl_command_queue_properties * old_properties) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clSetCommandQueueProperty) return CL_INVALID_OPERATION; + + return pfn_clSetCommandQueueProperty(command_queue, properties, enable, old_properties); +} + +// Memory Object APIs +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateBuffer(cl_context context, + cl_mem_flags flags, + size_t size, + void * host_ptr, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clCreateBuffer) return 0; + + return pfn_clCreateBuffer(context, flags, size, host_ptr, errcode_ret); +} + +extern 
CL_API_ENTRY cl_mem CL_API_CALL +clCreateImage2D(cl_context context, + cl_mem_flags flags, + const cl_image_format * image_format, + size_t image_width, + size_t image_height, + size_t image_row_pitch, + void * host_ptr, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clCreateImage2D) return 0; + + return pfn_clCreateImage2D(context, flags, image_format, image_width, image_height, image_row_pitch, host_ptr, errcode_ret); +} + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateImage3D(cl_context context, + cl_mem_flags flags, + const cl_image_format * image_format, + size_t image_width, + size_t image_height, + size_t image_depth, + size_t image_row_pitch, + size_t image_slice_pitch, + void * host_ptr, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clCreateImage3D) return 0; + + return pfn_clCreateImage3D(context, flags, image_format, image_width, image_height, image_depth, image_row_pitch, image_slice_pitch, host_ptr, errcode_ret); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainMemObject(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clRetainMemObject) return CL_INVALID_OPERATION; + + return pfn_clRetainMemObject(memobj); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseMemObject(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clReleaseMemObject) return CL_INVALID_OPERATION; + + return pfn_clReleaseMemObject(memobj); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSupportedImageFormats(cl_context context, + cl_mem_flags flags, + cl_mem_object_type image_type, + cl_uint num_entries, + cl_image_format * image_formats, + cl_uint * num_image_formats) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clGetSupportedImageFormats) return CL_INVALID_OPERATION; + + return pfn_clGetSupportedImageFormats(context, flags, image_type, num_entries, image_formats, num_image_formats); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetMemObjectInfo(cl_mem memobj, + cl_mem_info param_name, + size_t param_value_size, + void * 
param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clGetMemObjectInfo) return CL_INVALID_OPERATION; + + return pfn_clGetMemObjectInfo(memobj, param_name, param_value_size, param_value, param_value_size_ret); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetImageInfo(cl_mem image, + cl_image_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clGetImageInfo) return CL_INVALID_OPERATION; + + return pfn_clGetImageInfo(image, param_name, param_value_size, param_value, param_value_size_ret); +} + + +// Sampler APIs +extern CL_API_ENTRY cl_sampler CL_API_CALL +clCreateSampler(cl_context context, + cl_bool normalized_coords, + cl_addressing_mode addressing_mode, + cl_filter_mode filter_mode, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clCreateSampler) return 0; + + return pfn_clCreateSampler(context, normalized_coords, addressing_mode, filter_mode, errcode_ret); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainSampler(cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clRetainSampler) return CL_INVALID_OPERATION; + + return pfn_clRetainSampler(sampler); +} + + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseSampler(cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clReleaseSampler) return CL_INVALID_OPERATION; + + return pfn_clReleaseSampler(sampler); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSamplerInfo(cl_sampler sampler, + cl_sampler_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clGetSamplerInfo) return CL_INVALID_OPERATION; + + return pfn_clGetSamplerInfo(sampler, param_name, param_value_size, param_value, param_value_size_ret); +} + +// Program Object APIs +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithSource(cl_context context, + cl_uint count, + const char ** strings, + const 
size_t * lengths, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clCreateProgramWithSource) return 0; + + return pfn_clCreateProgramWithSource(context, count, strings, lengths, errcode_ret); +} + +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithBinary(cl_context context, + cl_uint num_devices, + const cl_device_id * device_list, + const size_t * lengths, + const unsigned char ** binaries, + cl_int * binary_status, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clCreateProgramWithBinary) return 0; + + return pfn_clCreateProgramWithBinary(context, num_devices, device_list, lengths, binaries, binary_status, errcode_ret); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainProgram(cl_program program) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clRetainProgram) return CL_INVALID_OPERATION; + + return pfn_clRetainProgram(program); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseProgram(cl_program program) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clReleaseProgram) return CL_INVALID_OPERATION; + + return pfn_clReleaseProgram(program); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clBuildProgram(cl_program program, + cl_uint num_devices, + const cl_device_id * device_list, + const char * options, + void (CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), + void * user_data) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clBuildProgram) return CL_INVALID_OPERATION; + + return pfn_clBuildProgram(program, num_devices, device_list, options, pfn_notify, user_data); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clUnloadCompiler(void) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clUnloadCompiler) return CL_INVALID_OPERATION; + + return pfn_clUnloadCompiler(); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetProgramInfo(cl_program program, + cl_program_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clGetProgramInfo) return 
CL_INVALID_OPERATION; + + return pfn_clGetProgramInfo(program, param_name, param_value_size, param_value, param_value_size_ret); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetProgramBuildInfo(cl_program program, + cl_device_id device, + cl_program_build_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clGetProgramBuildInfo) return CL_INVALID_OPERATION; + + return pfn_clGetProgramBuildInfo(program, device, param_name, param_value_size, param_value, param_value_size_ret); +} + +// Kernel Object APIs +extern CL_API_ENTRY cl_kernel CL_API_CALL +clCreateKernel(cl_program program, + const char * kernel_name, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clCreateKernel) return 0; + + return pfn_clCreateKernel(program, kernel_name, errcode_ret); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clCreateKernelsInProgram(cl_program program, + cl_uint num_kernels, + cl_kernel * kernels, + cl_uint * num_kernels_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clCreateKernelsInProgram) return CL_INVALID_OPERATION; + + return pfn_clCreateKernelsInProgram(program, num_kernels, kernels, num_kernels_ret); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainKernel(cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clRetainKernel) return CL_INVALID_OPERATION; + + return pfn_clRetainKernel(kernel); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseKernel(cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clReleaseKernel) return CL_INVALID_OPERATION; + + return pfn_clReleaseKernel(kernel); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelArg(cl_kernel kernel, + cl_uint arg_index, + size_t arg_size, + const void * arg_value) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clSetKernelArg) return CL_INVALID_OPERATION; + + return pfn_clSetKernelArg(kernel, arg_index, arg_size, arg_value); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelInfo(cl_kernel 
kernel, + cl_kernel_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clGetKernelInfo) return CL_INVALID_OPERATION; + + return pfn_clGetKernelInfo(kernel, param_name, param_value_size, param_value, param_value_size_ret); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelWorkGroupInfo(cl_kernel kernel, + cl_device_id device, + cl_kernel_work_group_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clGetKernelWorkGroupInfo) return CL_INVALID_OPERATION; + + return pfn_clGetKernelWorkGroupInfo(kernel, device, param_name, param_value_size, param_value, param_value_size_ret); +} + +// Event Object APIs +extern CL_API_ENTRY cl_int CL_API_CALL +clWaitForEvents(cl_uint num_events, + const cl_event * event_list) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clWaitForEvents) return CL_INVALID_OPERATION; + + return pfn_clWaitForEvents(num_events, event_list); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetEventInfo(cl_event event, + cl_event_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clGetEventInfo) return CL_INVALID_OPERATION; + + return pfn_clGetEventInfo(event, param_name, param_value_size, param_value, param_value_size_ret); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clRetainEvent) return CL_INVALID_OPERATION; + + return pfn_clRetainEvent(event); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clReleaseEvent) return CL_INVALID_OPERATION; + + return pfn_clReleaseEvent(event); +} + +// Profiling APIs +extern CL_API_ENTRY cl_int CL_API_CALL +clGetEventProfilingInfo(cl_event event, + cl_profiling_info param_name, + size_t param_value_size, + void * 
param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clGetEventProfilingInfo) return CL_INVALID_OPERATION; + + return pfn_clGetEventProfilingInfo(event, param_name, param_value_size, param_value, param_value_size_ret); +} + +// Flush and Finish APIs +extern CL_API_ENTRY cl_int CL_API_CALL +clFlush(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clFlush) return CL_INVALID_OPERATION; + + return pfn_clFlush(command_queue); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clFinish(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clFinish) return CL_INVALID_OPERATION; + + return pfn_clFinish(command_queue); +} + +// Enqueued Commands APIs +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadBuffer(cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_read, + size_t offset, + size_t cb, + void * ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clEnqueueReadBuffer) return CL_INVALID_OPERATION; + + return pfn_clEnqueueReadBuffer(command_queue, buffer, blocking_read, offset, cb, ptr, num_events_in_wait_list, event_wait_list, event); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadBufferRect(cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_read, + const size_t * buffer_origin, + const size_t * host_origin, + const size_t * region, + size_t buffer_row_pitch, + size_t buffer_slice_pitch, + size_t host_row_pitch, + size_t host_slice_pitch, + void * ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_1 +{ + if (!pfn_clEnqueueReadBufferRect) return CL_INVALID_OPERATION; + + return pfn_clEnqueueReadBufferRect(command_queue, buffer, blocking_read, buffer_origin, host_origin, region, buffer_row_pitch, buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, 
event); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteBuffer(cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_write, + size_t offset, + size_t cb, + const void * ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clEnqueueWriteBuffer) return CL_INVALID_OPERATION; + + return pfn_clEnqueueWriteBuffer(command_queue, buffer, blocking_write, offset, cb, ptr, num_events_in_wait_list, event_wait_list, event); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteBufferRect(cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_write, + const size_t * buffer_origin, + const size_t * host_origin, + const size_t * region, + size_t buffer_row_pitch, + size_t buffer_slice_pitch, + size_t host_row_pitch, + size_t host_slice_pitch, + const void * ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_1 +{ + if (!pfn_clEnqueueWriteBufferRect) return CL_INVALID_OPERATION; + + return pfn_clEnqueueWriteBufferRect(command_queue, buffer, blocking_write, buffer_origin, host_origin, region, buffer_row_pitch, buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBuffer(cl_command_queue command_queue, + cl_mem src_buffer, + cl_mem dst_buffer, + size_t src_offset, + size_t dst_offset, + size_t cb, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clEnqueueCopyBuffer) return CL_INVALID_OPERATION; + + return pfn_clEnqueueCopyBuffer(command_queue, src_buffer, dst_buffer, src_offset, dst_offset, cb, num_events_in_wait_list, event_wait_list, event); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadImage(cl_command_queue command_queue, + cl_mem image, + cl_bool blocking_read, + const size_t * 
origin[3], + const size_t * region[3], + size_t row_pitch, + size_t slice_pitch, + void * ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clEnqueueReadImage) return CL_INVALID_OPERATION; + + return pfn_clEnqueueReadImage(command_queue, image, blocking_read, origin, region, row_pitch, slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteImage(cl_command_queue command_queue, + cl_mem image, + cl_bool blocking_write, + const size_t * origin[3], + const size_t * region[3], + size_t input_row_pitch, + size_t input_slice_pitch, + const void * ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clEnqueueWriteImage) return CL_INVALID_OPERATION; + + return pfn_clEnqueueWriteImage(command_queue, image, blocking_write, origin, region, input_row_pitch, input_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyImage(cl_command_queue command_queue, + cl_mem src_image, + cl_mem dst_image, + const size_t * src_origin[3], + const size_t * dst_origin[3], + const size_t * region[3], + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clEnqueueCopyImage) return CL_INVALID_OPERATION; + + return pfn_clEnqueueCopyImage(command_queue, src_image, dst_image, src_origin, dst_origin, region, num_events_in_wait_list, event_wait_list, event); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyImageToBuffer(cl_command_queue command_queue, + cl_mem src_image, + cl_mem dst_buffer, + const size_t * src_origin[3], + const size_t * region[3], + size_t dst_offset, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0 +{ + if 
(!pfn_clEnqueueCopyImageToBuffer) return CL_INVALID_OPERATION; + + return pfn_clEnqueueCopyImageToBuffer(command_queue, src_image, dst_buffer, src_origin, region, dst_offset, num_events_in_wait_list, event_wait_list, event); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBufferToImage(cl_command_queue command_queue, + cl_mem src_buffer, + cl_mem dst_image, + size_t src_offset, + const size_t * dst_origin[3], + const size_t * region[3], + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clEnqueueCopyBufferToImage) return CL_INVALID_OPERATION; + + return pfn_clEnqueueCopyBufferToImage(command_queue, src_buffer, dst_image, src_offset, dst_origin, region, num_events_in_wait_list, event_wait_list, event); +} + +extern CL_API_ENTRY void * CL_API_CALL +clEnqueueMapBuffer(cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_map, + cl_map_flags map_flags, + size_t offset, + size_t cb, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clEnqueueMapBuffer) return 0; + + return pfn_clEnqueueMapBuffer(command_queue, buffer, blocking_map, map_flags, offset, cb, num_events_in_wait_list, event_wait_list, event, errcode_ret); +} + +extern CL_API_ENTRY void * CL_API_CALL +clEnqueueMapImage(cl_command_queue command_queue, + cl_mem image, + cl_bool blocking_map, + cl_map_flags map_flags, + const size_t * origin, + const size_t * region, + size_t * image_row_pitch, + size_t * image_slice_pitch, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clEnqueueMapImage) return 0; + + return pfn_clEnqueueMapImage(command_queue, image, blocking_map, map_flags, origin, region, image_row_pitch, image_slice_pitch, num_events_in_wait_list, event_wait_list, event, errcode_ret); +} + 
+extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueUnmapMemObject(cl_command_queue command_queue, + cl_mem memobj, + void * mapped_ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clEnqueueUnmapMemObject) return CL_INVALID_OPERATION; + + return pfn_clEnqueueUnmapMemObject(command_queue, memobj, mapped_ptr, num_events_in_wait_list, event_wait_list, event); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueNDRangeKernel(cl_command_queue command_queue, + cl_kernel kernel, + cl_uint work_dim, + const size_t * global_work_offset, + const size_t * global_work_size, + const size_t * local_work_size, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clEnqueueNDRangeKernel) return CL_INVALID_OPERATION; + + return pfn_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size, num_events_in_wait_list, event_wait_list, event); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueTask(cl_command_queue command_queue, + cl_kernel kernel, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clEnqueueTask) return CL_INVALID_OPERATION; + + return pfn_clEnqueueTask(command_queue, kernel, num_events_in_wait_list, event_wait_list, event); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueNativeKernel(cl_command_queue command_queue, + void (CL_CALLBACK *user_func)(void *), + void * args, + size_t cb_args, + cl_uint num_mem_objects, + const cl_mem * mem_list, + const void ** args_mem_loc, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clEnqueueNativeKernel) return CL_INVALID_OPERATION; + + return pfn_clEnqueueNativeKernel(command_queue, user_func, args, cb_args, num_mem_objects, mem_list, 
args_mem_loc, num_events_in_wait_list, event_wait_list, event); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueMarker(cl_command_queue command_queue, + cl_event * event) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clEnqueueMarker) return CL_INVALID_OPERATION; + + return pfn_clEnqueueMarker(command_queue, event); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWaitForEvents(cl_command_queue command_queue, + cl_uint num_events, + const cl_event * event_list) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clEnqueueWaitForEvents) return CL_INVALID_OPERATION; + + return pfn_clEnqueueWaitForEvents(command_queue, num_events, event_list); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueBarrier(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clEnqueueBarrier) return CL_INVALID_OPERATION; + + return pfn_clEnqueueBarrier(command_queue); +} + +// Extension function access +// +// Returns the extension function address for the given function name, +// or NULL if a valid function can not be found. The client must +// check to make sure the address is not NULL, before using or +// calling the returned function address. 
+// +extern CL_API_ENTRY void * CL_API_CALL clGetExtensionFunctionAddress(const char * func_name) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clGetExtensionFunctionAddress) return 0; + + return pfn_clGetExtensionFunctionAddress(func_name); +} diff --git a/libs/clew/libclew/ocl_init.h b/libs/clew/libclew/ocl_init.h new file mode 100644 index 0000000..7929b0f --- /dev/null +++ b/libs/clew/libclew/ocl_init.h @@ -0,0 +1,3 @@ +#pragma once + +int ocl_init(void); diff --git a/libs/gpu/CMakeLists.txt b/libs/gpu/CMakeLists.txt new file mode 100644 index 0000000..e0ac49b --- /dev/null +++ b/libs/gpu/CMakeLists.txt @@ -0,0 +1,79 @@ +cmake_minimum_required(VERSION 3.1) + +project(libgpu) + +set(HEADERS + libgpu/opencl/device_info.h + libgpu/opencl/engine.h + libgpu/opencl/enum.h + libgpu/opencl/utils.h + libgpu/context.h + libgpu/device.h + libgpu/gold_helpers.h + libgpu/shared_device_buffer.h + libgpu/shared_host_buffer.h + libgpu/utils.h + libgpu/work_size.h + ) + +set(SOURCES + libgpu/opencl/device_info.cpp + libgpu/opencl/engine.cpp + libgpu/opencl/enum.cpp + libgpu/opencl/utils.cpp + libgpu/context.cpp + libgpu/device.cpp + libgpu/gold_helpers.cpp + libgpu/shared_device_buffer.cpp + libgpu/shared_host_buffer.cpp + libgpu/utils.cpp + ) + +set(CUDA_HEADERS + libgpu/cuda/sdk/helper_math.h + libgpu/cuda/cuda_api.h + libgpu/cuda/enum.h + libgpu/cuda/utils.h + ) + +set(CUDA_SOURCES + libgpu/cuda/cuda_api.cpp + libgpu/cuda/enum.cpp + libgpu/cuda/utils.cpp + ) + +option(GPU_CUDA_SUPPORT "CUDA support." 
OFF) + +set (LIBRARIES + libclew + libutils) + +set(CMAKE_CXX_STANDARD 11) + +if (GPU_CUDA_SUPPORT) + find_package (CUDA REQUIRED) + + set(HEADERS ${HEADERS} ${CUDA_HEADERS}) + set(SOURCES ${SOURCES} ${CUDA_SOURCES}) + set(LIBRARIES ${LIBRARIES} ${CUDA_LIBRARIES}) + + add_definitions(-DCUDA_SUPPORT) + cuda_add_library(${PROJECT_NAME} ${SOURCES} ${HEADERS}) +else () + add_library(${PROJECT_NAME} ${SOURCES} ${HEADERS}) +endif () + +target_include_directories(${PROJECT_NAME} PUBLIC ${PROJECT_SOURCE_DIR}) +target_link_libraries(${PROJECT_NAME} ${LIBRARIES}) + +add_executable(hexdumparray libgpu/hexdumparray.cpp) + +function(convertIntoHeader sourceFile headerFile arrayName) + add_custom_command( + OUTPUT ${PROJECT_SOURCE_DIR}/${headerFile} + + COMMAND hexdumparray ${PROJECT_SOURCE_DIR}/${sourceFile} ${PROJECT_SOURCE_DIR}/${headerFile} ${arrayName} + + DEPENDS ${PROJECT_SOURCE_DIR}/${sourceFile} hexdumparray + ) +endfunction() diff --git a/libs/gpu/LICENSE b/libs/gpu/LICENSE new file mode 100644 index 0000000..6fb2204 --- /dev/null +++ b/libs/gpu/LICENSE @@ -0,0 +1,23 @@ +MIT License + +Copyright (c) 2018 Nikolay Polyarniy +Copyright (c) 2018 GPGPUCourse2018 +Copyright (c) 2018 Agisoft LLC + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/libs/gpu/libgpu/context.cpp b/libs/gpu/libgpu/context.cpp new file mode 100644 index 0000000..2c07b2d --- /dev/null +++ b/libs/gpu/libgpu/context.cpp @@ -0,0 +1,310 @@ +#include "context.h" + +#ifdef CUDA_SUPPORT +#include +#include +#endif + +namespace gpu { + +THREAD_LOCAL Context::Data *Context::data_current_ = 0; + +Context::Data::Data() +{ + type = TypeUndefined; + cuda_device = 0; + cuda_context = 0; + cuda_stream = 0; + ocl_device = 0; + activated = false; +} + +Context::Data::~Data() +{ + if (data_current_ != this) { + if (data_current_ != 0) { + std::cerr << "Another GPU context found on context destruction" << std::endl; + } + } else { + data_current_ = 0; + } + +#ifdef CUDA_SUPPORT + if (cuda_stream) { + cudaError_t err = cudaStreamDestroy(cuda_stream); + if (cudaSuccess != err) + std::cerr << "Warning: cudaStreamDestroy failed: " << cuda::formatError(err) << std::endl; + } + +#ifndef CUDA_USE_PRIMARY_CONTEXT + if (cuda_context) { + CUresult err = cuCtxDestroy(cuda_context); + if (CUDA_SUCCESS != err) + std::cerr << "Warning: cuCtxDestroy failed: " << cuda::formatDriverError(err) << std::endl; + } +#endif +#endif +} + +Context::Context() +{ + data_ = data_current_; +} + +void Context::clear() +{ + data_ = NULL; +} + +void Context::init(int device) +{ +#ifdef CUDA_SUPPORT +#ifndef CUDA_USE_PRIMARY_CONTEXT + if (!cuda_api_init()) + throw cuda::cuda_exception("Can't load nvcuda library"); +#endif + std::shared_ptr data = std::make_shared(); + data->type = TypeCUDA; + data->cuda_device = device; + data_ref_ = data; +#endif +} + +void Context::init(struct _cl_device_id *device) +{ + std::shared_ptr data = std::make_shared(); + data->type = TypeOpenCL; + data->ocl_device = device; + 
data_ref_ = data; +} + +bool Context::isInitialized() +{ + return data_ref_.get() && data_ref_->type != TypeUndefined; +} + +bool Context::isGPU() +{ + return (type() != TypeUndefined); +} + +bool Context::isIntelGPU() +{ + if (type() != TypeOpenCL) { + return false; + } + + return cl()->deviceInfo().isIntelGPU(); +} + +bool Context::isGoldChecksEnabled() +{ + return false; // NOTTODO: Make it switchable +} + +void Context::activate() +{ + if (!data_ref_) + throw std::runtime_error("Unexpected GPU context activate call"); + + // create cuda stream on first activate call + if (!data_ref_->activated) { +#ifdef CUDA_SUPPORT + if (data_ref_->type == TypeCUDA) { +#ifndef CUDA_USE_PRIMARY_CONTEXT + // It is claimed that contexts are thread safe starting from CUDA 4.0. + // Nevertheless, we observe crashes in nvcuda.dll if the same device is used in parallel from 2 threads using its primary context. + // To avoid this problem we create a separate standard context for each processing thread. + // https://devtalk.nvidia.com/default/topic/519087/cuda-programming-and-performance/cuda-context-and-threading/post/3689477/#3689477 + // http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DRIVER.html#axzz4g8KX5QV5 + + CUdevice device = 0; + CU_SAFE_CALL( cuDeviceGet(&device, data_ref_->cuda_device) ); + CU_SAFE_CALL( cuCtxCreate(&data_ref_->cuda_context, 0, device) ); +#else + CUDA_SAFE_CALL( cudaSetDevice(data_ref_->cuda_device) ); +#endif + CUDA_SAFE_CALL( cudaStreamCreate(&data_ref_->cuda_stream) ); + } +#endif + + if (data_ref_->type == TypeOpenCL) { + ocl::sh_ptr_ocl_engine engine = std::make_shared(); + engine->init(data_ref_->ocl_device); + data_ref_->ocl_engine = engine; + } + + data_ref_->activated = true; + } + + if (data_current_ && data_current_ != data_ref_.get()) + throw std::runtime_error("Another GPU context is already active"); + + data_ = data_ref_.get(); + data_current_ = data_; +} + +Context::Data *Context::data() const +{ + if (!data_) + throw 
std::runtime_error("Null context"); + + return data_; +} + +size_t Context::getCoresEstimate() +{ + size_t compute_units = 1; + + switch (type()) { +#ifdef CUDA_SUPPORT + case Context::TypeCUDA: + cudaDeviceProp deviceProp; + CUDA_SAFE_CALL(cudaGetDeviceProperties(&deviceProp, data_->cuda_device)); + compute_units = (size_t) deviceProp.multiProcessorCount; + break; +#endif + case Context::TypeOpenCL: + compute_units = cl()->maxComputeUnits(); + break; + default: + gpu::raiseException(__FILE__, __LINE__, "No GPU context!"); + } + + return compute_units * 256; +} + +size_t Context::getTotalMemory() +{ + size_t total_mem_size = 0; + size_t free_mem_size = 0; + + switch (type()) { +#ifdef CUDA_SUPPORT + case Context::TypeCUDA: + CUDA_SAFE_CALL(cudaMemGetInfo(&free_mem_size, &total_mem_size)); + break; +#endif + case Context::TypeOpenCL: + total_mem_size = cl()->totalMemSize(); + break; + default: + gpu::raiseException(__FILE__, __LINE__, "No GPU context!"); + } + + return total_mem_size; +} + +size_t Context::getFreeMemory() +{ + size_t total_mem_size = 0; + size_t free_mem_size = 0; + + switch (type()) { +#ifdef CUDA_SUPPORT + case Context::TypeCUDA: + CUDA_SAFE_CALL(cudaMemGetInfo(&free_mem_size, &total_mem_size)); + break; +#endif + case Context::TypeOpenCL: + total_mem_size = cl()->totalMemSize(); + free_mem_size = total_mem_size - total_mem_size / 5; + break; + default: + gpu::raiseException(__FILE__, __LINE__, "No GPU context!"); + } + + return free_mem_size; +} + +size_t Context::getMaxMemAlloc() +{ + size_t max_mem_alloc_size = 0; + +#ifdef CUDA_SUPPORT + if (type() == gpu::Context::TypeCUDA) { + size_t total_mem_size = 0; + size_t free_mem_size = 0; + CUDA_SAFE_CALL(cudaMemGetInfo(&free_mem_size, &total_mem_size)); + max_mem_alloc_size = total_mem_size / 2; + } else +#endif + if (type() == gpu::Context::TypeOpenCL) { + max_mem_alloc_size = cl()->maxMemAllocSize(); + } else { + gpu::raiseException(__FILE__, __LINE__, "No GPU context!"); + } + + return 
max_mem_alloc_size; +} + +size_t Context::getMaxWorkgroupSize() +{ + size_t max_workgroup_size = 0; + + switch (type()) { +#ifdef CUDA_SUPPORT + case Context::TypeCUDA: + int value; + CUDA_SAFE_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrMaxThreadsPerBlock, data_->cuda_device)); + max_workgroup_size = value; + break; +#endif + case Context::TypeOpenCL: + max_workgroup_size = cl()->maxWorkgroupSize(); + break; + default: + gpu::raiseException(__FILE__, __LINE__, "No GPU context!"); + } + + return max_workgroup_size; +} + +std::vector Context::getMaxWorkItemSizes() +{ + std::vector work_item_sizes(3); + + switch (type()) { +#ifdef CUDA_SUPPORT + case Context::TypeCUDA: + int value[3]; + CUDA_SAFE_CALL(cudaDeviceGetAttribute(&value[0], cudaDevAttrMaxBlockDimX, data_->cuda_device)); + CUDA_SAFE_CALL(cudaDeviceGetAttribute(&value[1], cudaDevAttrMaxBlockDimY, data_->cuda_device)); + CUDA_SAFE_CALL(cudaDeviceGetAttribute(&value[2], cudaDevAttrMaxBlockDimZ, data_->cuda_device)); + for (int i = 0; i < 3; ++i) { + work_item_sizes[i] = value[i]; + } + break; +#endif + case Context::TypeOpenCL: + for (int i = 0; i < 3; ++i) { + work_item_sizes[i] = cl()->maxWorkItemSizes(i); + } + break; + default: + gpu::raiseException(__FILE__, __LINE__, "No GPU context!"); + } + + return work_item_sizes; +} + +Context::Type Context::type() const +{ + if (data_) + return data_->type; + if (data_ref_) + return data_ref_->type; + return TypeUndefined; +} + +ocl::sh_ptr_ocl_engine Context::cl() const +{ + return data()->ocl_engine; +} + +cudaStream_t Context::cudaStream() const +{ + return data()->cuda_stream; +} + +} diff --git a/libs/gpu/libgpu/context.h b/libs/gpu/libgpu/context.h new file mode 100644 index 0000000..23ce598 --- /dev/null +++ b/libs/gpu/libgpu/context.h @@ -0,0 +1,72 @@ +#pragma once + +#include +#include + +typedef struct CUctx_st *cudaContext_t; +typedef struct CUstream_st *cudaStream_t; + +#ifdef _MSC_VER + #define THREAD_LOCAL __declspec(thread) +#else + #define 
THREAD_LOCAL __thread +#endif + +namespace gpu { + +class Context { +public: + Context(); + + enum Type { + TypeUndefined, + TypeOpenCL, + TypeCUDA + }; + + void clear(); + void init(int device); + void init(struct _cl_device_id *device); + bool isInitialized(); + bool isGPU(); + bool isIntelGPU(); + bool isGoldChecksEnabled(); + + void activate(); + + size_t getCoresEstimate(); + size_t getTotalMemory(); + size_t getFreeMemory(); + size_t getMaxMemAlloc(); + size_t getMaxWorkgroupSize(); + std::vector getMaxWorkItemSizes(); + + Type type() const; + + ocl::sh_ptr_ocl_engine cl() const; + cudaStream_t cudaStream() const; + +protected: + class Data { + public: + Data(); + ~Data(); + + Type type; + + int cuda_device; + cudaContext_t cuda_context; + cudaStream_t cuda_stream; + struct _cl_device_id * ocl_device; + ocl::sh_ptr_ocl_engine ocl_engine; + bool activated; + }; + + Data * data() const; + + Data * data_; + std::shared_ptr data_ref_; + static THREAD_LOCAL Data * data_current_; +}; + +} diff --git a/libs/gpu/libgpu/cuda/cu/common.cu b/libs/gpu/libgpu/cuda/cu/common.cu new file mode 100644 index 0000000..12320bf --- /dev/null +++ b/libs/gpu/libgpu/cuda/cu/common.cu @@ -0,0 +1,535 @@ +#ifndef common_cu // pragma once +#define common_cu + +#include +#include +#include +#include + +using gpu::WorkSize; + +//#define DEBUG + +#ifdef DEBUG +#include +#define printf_assert(condition, message) \ + if (!(condition)) printf("%s Line %d\n", message, __LINE__); +#else +#define printf_assert(condition, message) +#endif +#define assert_isfinite(value) \ + printf_assert(isfinite(value), "Value should be finite!"); + +#define WARP_SIZE 32 // NOTTODO: WHY WARP_SZ IS UNDEFINED? + +#ifndef M_PI +#define M_PI 3.141592654f +#endif + +namespace cuda { + + template + __device__ T max(T a, T b) { + return a > b ? a : b; + } + + template + __device__ T max(T a, T b, T c) { + return max(a, max(b, c)); + } + + template + __device__ T min(T a, T b) { + return a < b ? 
a : b; + } + + template + __device__ T min(T a, T b, T c) { + return min(a, min(b, c)); + } + + inline __device__ uint3 fetch_uint3(const unsigned int* ptr, size_t index) + { + return make_uint3(ptr[3 * index + 0], ptr[3 * index + 1], ptr[3 * index + 2]); + } + + inline __device__ uint4 fetch_uint4(const unsigned int* ptr, size_t index) + { + return make_uint4(ptr[4 * index + 0], ptr[4 * index + 1], ptr[4 * index + 2], ptr[4 * index + 3]); + } + + inline __device__ float2 fetch_float2(const float* ptr, size_t index) + { + return make_float2(ptr[2 * index + 0], ptr[2 * index + 1]); + } + + inline __device__ float3 fetch_float3(const float* ptr, size_t index) + { + return make_float3(ptr[3 * index + 0], ptr[3 * index + 1], ptr[3 * index + 2]); + } + + inline __device__ float4 fetch_float4(const float* ptr, size_t index) + { + return make_float4(ptr[4 * index + 0], ptr[4 * index + 1], ptr[4 * index + 2], ptr[4 * index + 3]); + } + + inline __device__ void set_uint3(unsigned int* ptr, size_t index, uint3 value) + { + ptr[3 * index + 0] = value.x; + ptr[3 * index + 1] = value.y; + ptr[3 * index + 2] = value.z; + } + + inline __device__ void set_float2(float* ptr, size_t index, float2 value) + { + ptr[2 * index + 0] = value.x; + ptr[2 * index + 1] = value.y; + } + + inline __device__ void set_float3(float* ptr, size_t index, float3 value) + { + ptr[3 * index + 0] = value.x; + ptr[3 * index + 1] = value.y; + ptr[3 * index + 2] = value.z; + } + + inline __device__ __host__ float clamp(float f, float a, float b) + { + return fmaxf(a, fminf(f, b)); + } + + inline __device__ float atomicCAS_f32(float *p, float cmp, float val) { + return __int_as_float(atomicCAS((int *) p, __float_as_int(cmp), __float_as_int(val))); + } + + inline __device__ float atomicCAS_32(float* p, float cmp, float val) { + return atomicCAS_f32(p, cmp, val); + } + + inline __device__ unsigned int atomicCAS_32(unsigned int* p, unsigned int cmp, unsigned int val) { + return atomicCAS(p, cmp, val); + } + + 
inline __device__ float3 operator-(const float3 &a) + { + return make_float3(-a.x, -a.y, -a.z); + } + + inline __device__ float2 operator-(float2 a, float2 b) + { + return make_float2(a.x - b.x, a.y - b.y); + } + + inline __device__ void operator-=(float2 &a, float2 b) + { + a.x -= b.x; + a.y -= b.y; + } + + inline __device__ float2 operator+(float2 a, float2 b) + { + return make_float2(a.x + b.x, a.y + b.y); + } + + inline __device__ void operator+=(float2 &a, float2 b) + { + a.x += b.x; + a.y += b.y; + } + + inline __device__ float3 operator-(float3 a, float3 b) + { + return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); + } + + inline __device__ float3 operator+(float3 a, float3 b) + { + return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); + } + + inline __device__ void operator+=(float3 &a, float3 b) + { + a.x += b.x; + a.y += b.y; + a.z += b.z; + } + + inline __device__ float4 operator+(float4 a, float4 b) + { + return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); + } + + inline __device__ float2 operator*(float2 a, float b) + { + return make_float2(a.x * b, a.y * b); + } + + inline __device__ float3 operator*(float3 a, float b) + { + return make_float3(a.x * b, a.y * b, a.z * b); + } + + inline __device__ float4 operator*(float4 a, float b) + { + return make_float4(a.x * b, a.y * b, a.z * b, a.w * b); + } + + inline __device__ float2 operator/(float2 a, float b) + { + return make_float2(a.x / b, a.y / b); + } + + inline __device__ float3 operator/(float3 a, float b) + { + return make_float3(a.x / b, a.y / b, a.z / b); + } + + inline __device__ void operator/=(float3& a, float b) + { + a.x /= b; + a.y /= b; + a.z /= b; + } + + inline __device__ float3 operator*(float3 a, float3 b) + { + return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); + } + + inline __device__ void operator*=(float3 &a, float b) + { + a.x *= b; + a.y *= b; + a.z *= b; + } + + inline __device__ bool operator==(float3 a, float3 b) + { + return a.x == b.x && a.y == b.y && a.z == 
b.z; + } + + inline __device__ bool operator==(float4 a, float4 b) + { + return a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w; + } + + inline __device__ float dot(float2 a, float2 b) + { + return a.x * b.x + a.y * b.y; + } + + inline __device__ float dot(float3 a, float3 b) + { + return a.x * b.x + a.y * b.y + a.z * b.z; + } + + inline __device__ float dot(float4 a, float4 b) + { + return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; + } + + inline __device__ float3 cross(float3 a, float3 b) + { + return make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x); + } + + inline __device__ float norm2(float2 v) + { + return dot(v, v); + } + + inline __device__ float norm2(float3 v) + { + return dot(v, v); + } + + inline __device__ float norm2(float4 v) + { + return dot(v, v); + } + + inline __device__ float norm(float2 v) + { + return sqrtf(norm2(v)); + } + + inline __device__ float norm(float3 v) + { + return sqrtf(norm2(v)); + } + + inline __device__ float norm(float4 v) + { + return sqrtf(norm2(v)); + } + + inline __device__ float3 normalize(float3 v) + { + return v / sqrtf(dot(v, v)); + } + +//______SHARED_STRUCTS__________________________________________________________________________________________________ + + // https://devtalk.nvidia.com/default/topic/673965/are-there-any-cuda-libararies-for-3x3-matrix-amp-vector3-amp-quaternion-operations-/ + typedef struct { + float4 m_row[3]; + } Matrix3x3f; + + typedef struct { + float4 m_row[4]; + } Matrix4x4f; + +//______HOST_CODE_______________________________________________________________________________________________________ + + inline __host__ float3 make_float3_from_vector(const vector3d &v) + { + return make_float3((float) v.x(), (float) v.y(), (float) v.z()); + } + + inline __host__ Matrix3x3f make_matrix_f3x3(const matrix3x3d &a) + { + Matrix3x3f m; + m.m_row[0] = make_float4((float) a(0, 0), (float) a(0, 1), (float) a(0, 2), 0.0f); + m.m_row[1] = make_float4((float) a(1, 0), (float) 
a(1, 1), (float) a(1, 2), 0.0f); + m.m_row[2] = make_float4((float) a(2, 0), (float) a(2, 1), (float) a(2, 2), 0.0f); + return m; + } + + inline __host__ Matrix4x4f make_matrix_f4x4(const matrix4x4d &a) + { + Matrix4x4f m; + m.m_row[0] = make_float4((float) a(0, 0), (float) a(0, 1), (float) a(0, 2), (float) a(0, 3)); + m.m_row[1] = make_float4((float) a(1, 0), (float) a(1, 1), (float) a(1, 2), (float) a(1, 3)); + m.m_row[2] = make_float4((float) a(2, 0), (float) a(2, 1), (float) a(2, 2), (float) a(2, 3)); + m.m_row[3] = make_float4((float) a(3, 0), (float) a(3, 1), (float) a(3, 2), (float) a(3, 3)); + return m; + } + +//______DEVICE_CODE_____________________________________________________________________________________________________ + +#ifdef DEBUG + inline __device__ void print_matrix_f3x3(const Matrix3x3f &m) + { + printf("[\n"); + printf(" [%f, %f, %f],\n", m.m_row[0].x, m.m_row[0].y, m.m_row[0].z); + printf(" [%f, %f, %f],\n", m.m_row[1].x, m.m_row[1].y, m.m_row[1].z); + printf(" [%f, %f, %f],\n", m.m_row[2].x, m.m_row[2].y, m.m_row[2].z); + printf("]\n"); + } +#endif + + inline __device__ Matrix3x3f make_matrix_f3x3(float a00, float a01, float a02, float a10, float a11, float a12, float a20, float a21, float a22) + { + Matrix3x3f m; + m.m_row[0] = make_float4(a00, a01, a02, 0.0f); + m.m_row[1] = make_float4(a10, a11, a12, 0.0f); + m.m_row[2] = make_float4(a20, a21, a22, 0.0f); + return m; + } + + inline __device__ Matrix3x3f make_zero_matrix_f3x3() + { + Matrix3x3f m; + m.m_row[0] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + m.m_row[1] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + m.m_row[2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + return m; + } + + inline __device__ Matrix3x3f make_eye_matrix_f3x3() + { + Matrix3x3f m; + m.m_row[0] = make_float4(1.0f, 0.0f, 0.0f, 0.0f); + m.m_row[1] = make_float4(0.0f, 1.0f, 0.0f, 0.0f); + m.m_row[2] = make_float4(0.0f, 0.0f, 1.0f, 0.0f); + return m; + } + + inline __device__ Matrix3x3f transpose_f3x3(const Matrix3x3f &m) + { + 
Matrix3x3f t; + t.m_row[0] = make_float4(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.0f); + t.m_row[1] = make_float4(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.0f); + t.m_row[2] = make_float4(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.0f); + return t; + } + + inline __device__ Matrix3x3f add_f3x3(const Matrix3x3f& a, const Matrix3x3f& b) + { + Matrix3x3f m; + m.m_row[0] = a.m_row[0] + b.m_row[0]; + m.m_row[1] = a.m_row[1] + b.m_row[1]; + m.m_row[2] = a.m_row[2] + b.m_row[2]; + return m; + } + + inline __device__ Matrix3x3f mul_f3x3(const Matrix3x3f &a, const Matrix3x3f &b) + { + Matrix3x3f bt = transpose_f3x3(b); + Matrix3x3f res; + res.m_row[0] = make_float4(dot(a.m_row[0], bt.m_row[0]), dot(a.m_row[0], bt.m_row[1]), dot(a.m_row[0], bt.m_row[2]), 0.0f); + res.m_row[1] = make_float4(dot(a.m_row[1], bt.m_row[0]), dot(a.m_row[1], bt.m_row[1]), dot(a.m_row[1], bt.m_row[2]), 0.0f); + res.m_row[2] = make_float4(dot(a.m_row[2], bt.m_row[0]), dot(a.m_row[2], bt.m_row[1]), dot(a.m_row[2], bt.m_row[2]), 0.0f); + return res; + } + + inline __device__ Matrix3x3f mul_f_f3x3(float k, const Matrix3x3f& a) + { + Matrix3x3f m; + m.m_row[0] = a.m_row[0] * k; + m.m_row[1] = a.m_row[1] * k; + m.m_row[2] = a.m_row[2] * k; + return m; + } + + inline __device__ float3 mul_f3x3_f3(const Matrix3x3f& a, const float3 &b) + { + return make_float3(a.m_row[0].x * b.x + a.m_row[0].y * b.y + a.m_row[0].z * b.z, + a.m_row[1].x * b.x + a.m_row[1].y * b.y + a.m_row[1].z * b.z, + a.m_row[2].x * b.x + a.m_row[2].y * b.y + a.m_row[2].z * b.z); + } + + inline __device__ float2 transformPoint_f3x3(const Matrix3x3f &m, const float2 &p) + { + float3 temp = mul_f3x3_f3(m, make_float3(p.x, p.y, 1.0f)); + return make_float2(temp.x, temp.y) / temp.z; + } + +#ifdef DEBUG + inline __device__ void print_matrix_f4x4(const Matrix4x4f &m) + { + printf("[\n"); + printf(" [%f, %f, %f, %f],\n", m.m_row[0].x, m.m_row[0].y, m.m_row[0].z, m.m_row[0].w); + printf(" [%f, %f, %f, %f],\n", m.m_row[1].x, m.m_row[1].y, 
m.m_row[1].z, m.m_row[1].w); + printf(" [%f, %f, %f, %f],\n", m.m_row[2].x, m.m_row[2].y, m.m_row[2].z, m.m_row[2].w); + printf(" [%f, %f, %f, %f],\n", m.m_row[3].x, m.m_row[3].y, m.m_row[3].z, m.m_row[3].w); + printf("]\n"); + } +#endif + + inline __device__ Matrix4x4f make_matrix_f4x4(float a00, float a01, float a02, float a03, + float a10, float a11, float a12, float a13, + float a20, float a21, float a22, float a23, + float a30, float a31, float a32, float a33) + { + Matrix4x4f m; + m.m_row[0] = make_float4(a00, a01, a02, a03); + m.m_row[1] = make_float4(a10, a11, a12, a13); + m.m_row[2] = make_float4(a20, a21, a22, a23); + m.m_row[3] = make_float4(a30, a31, a32, a33); + return m; + } + + inline __device__ Matrix4x4f make_translation_f4x4(const float3 &t) + { + return make_matrix_f4x4(1.0f, 0.0f, 0.0f, t.x, + 0.0f, 1.0f, 0.0f, t.y, + 0.0f, 0.0f, 1.0f, t.z, + 0.0f, 0.0f, 0.0f, 1.0f); + } + + inline __device__ Matrix4x4f make_rotation_f4x4(const Matrix3x3f &r) + { + Matrix4x4f m; + m.m_row[0] = r.m_row[0]; + m.m_row[1] = r.m_row[1]; + m.m_row[2] = r.m_row[2]; + + m.m_row[0].w = 0.0f; + m.m_row[1].w = 0.0f; + m.m_row[2].w = 0.0f; + m.m_row[3] = make_float4(0.0f, 0.0f, 0.0f, 1.0f); + return m; + } + + inline __device__ float3 extract_translation_f4x4(const Matrix4x4f &m) + { + float norm = 1.0f / m.m_row[3].w; + return make_float3(m.m_row[0].w, m.m_row[1].w, m.m_row[2].w) * norm; + } + + inline __device__ Matrix3x3f extract_rotation_f4x4(const Matrix4x4f &m) + { + Matrix3x3f R = make_matrix_f3x3( + m.m_row[0].x, m.m_row[0].y, m.m_row[0].z, + m.m_row[1].x, m.m_row[1].y, m.m_row[1].z, + m.m_row[2].x, m.m_row[2].y, m.m_row[2].z); + + // matrix4x4f.scale3() + Matrix3x3f MtM = mul_f3x3(transpose_f3x3(R), R); + + float3 d = make_float3(MtM.m_row[0].x, MtM.m_row[1].y, MtM.m_row[2].z); + + if (d.x > 0) d.x = sqrtf(d.x); + if (d.y > 0) d.y = sqrtf(d.y); + if (d.z > 0) d.z = sqrtf(d.z); + + float3 s = d; + + if (s.x) s.x = 1.0f / s.x; + if (s.y) s.y = 1.0f / s.y; + if (s.z) 
s.z = 1.0f / s.z; + + return mul_f3x3(R, make_matrix_f3x3(s.x, 0.0f, 0.0f, + 0.0f, s.y, 0.0f, + 0.0f, 0.0f, s.z)); + } + + inline __device__ Matrix4x4f transpose_f4x4(const Matrix4x4f &m) + { + Matrix4x4f t; + t.m_row[0] = make_float4(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, m.m_row[3].x); + t.m_row[1] = make_float4(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, m.m_row[3].y); + t.m_row[2] = make_float4(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, m.m_row[3].z); + t.m_row[3] = make_float4(m.m_row[0].w, m.m_row[1].w, m.m_row[2].w, m.m_row[3].w); + return t; + } + + inline __device__ Matrix4x4f mul_f4x4(const Matrix4x4f &a, const Matrix4x4f &b) + { + Matrix4x4f bt = transpose_f4x4(b); + Matrix4x4f res; + res.m_row[0] = make_float4(dot(a.m_row[0], bt.m_row[0]), dot(a.m_row[0], bt.m_row[1]), dot(a.m_row[0], bt.m_row[2]), dot(a.m_row[0], bt.m_row[3])); + res.m_row[1] = make_float4(dot(a.m_row[1], bt.m_row[0]), dot(a.m_row[1], bt.m_row[1]), dot(a.m_row[1], bt.m_row[2]), dot(a.m_row[1], bt.m_row[3])); + res.m_row[2] = make_float4(dot(a.m_row[2], bt.m_row[0]), dot(a.m_row[2], bt.m_row[1]), dot(a.m_row[2], bt.m_row[2]), dot(a.m_row[2], bt.m_row[3])); + res.m_row[3] = make_float4(dot(a.m_row[3], bt.m_row[0]), dot(a.m_row[3], bt.m_row[1]), dot(a.m_row[3], bt.m_row[2]), dot(a.m_row[3], bt.m_row[3])); + return res; + } + + inline __device__ float4 mul_f4x4_f4(const Matrix4x4f& a, const float4 &b) + { + return make_float4(a.m_row[0].x * b.x + a.m_row[0].y * b.y + a.m_row[0].z * b.z + a.m_row[0].w * b.w, + a.m_row[1].x * b.x + a.m_row[1].y * b.y + a.m_row[1].z * b.z + a.m_row[1].w * b.w, + a.m_row[2].x * b.x + a.m_row[2].y * b.y + a.m_row[2].z * b.z + a.m_row[2].w * b.w, + a.m_row[3].x * b.x + a.m_row[3].y * b.y + a.m_row[3].z * b.z + a.m_row[3].w * b.w); + } + + inline __device__ float3 transformPoint(const Matrix4x4f &m, const float3 &p) + { + float4 temp = mul_f4x4_f4(m, make_float4(p.x, p.y, p.z, 1.0f)); + return make_float3(temp.x, temp.y, temp.z) / temp.w; + } + + inline 
__device__ float3 transformVector(const Matrix4x4f &m, const float3 &v) + { + float4 temp = mul_f4x4_f4(m, make_float4(v.x, v.y, v.z, 0.0f)); + return make_float3(temp.x, temp.y, temp.z); + } + + inline __device__ float smootherstep(float edge0, float edge1, float x) + { + if (x < edge0) { + return 0.0f; + } else if (x >= edge1) { + return 1.0f; + } + + // Scale, and clamp x to 0..1 range + x = (x - edge0) / (edge1 - edge0); + // Evaluate polynomial + return x * x * x * (x * (x * 6.0f - 15.0f) + 10.0f); + } + +} + +#endif // pragma once \ No newline at end of file diff --git a/libs/gpu/libgpu/cuda/cu/opencl_translator.cu b/libs/gpu/libgpu/cuda/cu/opencl_translator.cu new file mode 100644 index 0000000..6866fee --- /dev/null +++ b/libs/gpu/libgpu/cuda/cu/opencl_translator.cu @@ -0,0 +1,96 @@ +#ifndef opencl_translator_cu // pragma once +#define opencl_translator_cu + +#ifdef __NVCC__ + +#ifndef STATIC_KEYWORD +#define STATIC_KEYWORD __device__ +#endif + +// See https://www.khronos.org/registry/OpenCL/sdk/1.0/docs/man/xhtml/functionQualifiers.html +#define vec_type_hint(typen) +#define work_group_size_hint(X, Y, Z) +#define reqd_work_group_size(X, Y, Z) + +#define __kernel __global__ +#define __global +#define __local __shared__ +#define __constant __constant__ + +typedef unsigned int uint; + +// https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/barrier.html +enum cl_mem_fence_flags +{ + CLK_LOCAL_MEM_FENCE, + CLK_GLOBAL_MEM_FENCE +}; + +STATIC_KEYWORD void barrier(cl_mem_fence_flags flags) +{ + __syncthreads(); +} + +// https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/workItemFunctions.html +STATIC_KEYWORD size_t getXYZByIndex(dim3 xyz, uint dimindx) +{ + if (dimindx == 2) { + return xyz.z; + } else if (dimindx == 1) { + return xyz.y; + } else { + return xyz.x; + } +} + +STATIC_KEYWORD size_t get_global_size (uint dimindx) { + return getXYZByIndex(gridDim, dimindx) * getXYZByIndex(blockDim, dimindx); +} + +STATIC_KEYWORD size_t 
get_global_id (uint dimindx) { + return getXYZByIndex(blockIdx, dimindx) * getXYZByIndex(blockDim, dimindx) + getXYZByIndex(threadIdx, dimindx); +} + +STATIC_KEYWORD size_t get_local_size (uint dimindx) { + return getXYZByIndex(blockDim, dimindx); +} + +STATIC_KEYWORD size_t get_local_id (uint dimindx) { + return getXYZByIndex(threadIdx, dimindx); +} + +STATIC_KEYWORD size_t get_num_groups (uint dimindx) { + return getXYZByIndex(gridDim, dimindx); +} + +STATIC_KEYWORD size_t get_group_id (uint dimindx) { + return getXYZByIndex(blockIdx, dimindx); +} + +STATIC_KEYWORD uint get_work_dim() +{ + if (get_global_size(2) > 1) { + return 3; + } else if (get_global_size(1) > 1) { + return 2; + } else { + return 1; + } +} + +#define WARP_SIZE 32 + +#endif + +#ifdef __CUDA_ARCH__ +#define DEVICE_CODE +#else +#define HOST_CODE +#endif + +#include +#include +#include +#include + +#endif // pragma once \ No newline at end of file diff --git a/libs/gpu/libgpu/cuda/cuda_api.cpp b/libs/gpu/libgpu/cuda/cuda_api.cpp new file mode 100644 index 0000000..21aaf26 --- /dev/null +++ b/libs/gpu/libgpu/cuda/cuda_api.cpp @@ -0,0 +1,158 @@ +#ifdef CUDA_SUPPORT +#include "cuda_api.h" + +#ifdef _WIN32 + +#include + +typedef HMODULE CudaLibrary; + +static HMODULE cudaLoadLibrary() +{ + return LoadLibraryW(L"nvcuda.dll"); +} + +static FARPROC cudaGetProcAddress(HMODULE hModule, LPCSTR lpProcName) +{ + return ::GetProcAddress(hModule, lpProcName); +} + +#elif defined(__unix__) || defined(__APPLE__) || defined(__MACOSX) + +#include + +typedef void * CudaLibrary; + +static CudaLibrary cudaLoadLibrary() +{ +#if defined(__APPLE__) || defined(__MACOSX) + return dlopen("/Library/Frameworks/CUDA.framework/Versions/Current/CUDA", RTLD_NOW); +#else + return dlopen("libcuda.so", RTLD_NOW); +#endif +} + +static void *cudaGetProcAddress(void *handle, const char *symbol) +{ + return dlsym(handle, symbol); +} + +#else +#error unsupported platform +#endif + +namespace cuda { + +std::string 
driverErrorString(CUresult code) +{ +#define DEFINE_ERROR(value) case value: return #value; + + switch (code) { + DEFINE_ERROR(CUDA_SUCCESS) + DEFINE_ERROR(CUDA_ERROR_INVALID_VALUE) + DEFINE_ERROR(CUDA_ERROR_OUT_OF_MEMORY) + DEFINE_ERROR(CUDA_ERROR_NOT_INITIALIZED) + DEFINE_ERROR(CUDA_ERROR_DEINITIALIZED) + DEFINE_ERROR(CUDA_ERROR_PROFILER_DISABLED) + DEFINE_ERROR(CUDA_ERROR_PROFILER_NOT_INITIALIZED) + DEFINE_ERROR(CUDA_ERROR_PROFILER_ALREADY_STARTED) + DEFINE_ERROR(CUDA_ERROR_PROFILER_ALREADY_STOPPED) + DEFINE_ERROR(CUDA_ERROR_NO_DEVICE) + DEFINE_ERROR(CUDA_ERROR_INVALID_DEVICE) + DEFINE_ERROR(CUDA_ERROR_INVALID_IMAGE) + DEFINE_ERROR(CUDA_ERROR_INVALID_CONTEXT) + DEFINE_ERROR(CUDA_ERROR_CONTEXT_ALREADY_CURRENT) + DEFINE_ERROR(CUDA_ERROR_MAP_FAILED) + DEFINE_ERROR(CUDA_ERROR_UNMAP_FAILED) + DEFINE_ERROR(CUDA_ERROR_ARRAY_IS_MAPPED) + DEFINE_ERROR(CUDA_ERROR_ALREADY_MAPPED) + DEFINE_ERROR(CUDA_ERROR_NO_BINARY_FOR_GPU) + DEFINE_ERROR(CUDA_ERROR_ALREADY_ACQUIRED) + DEFINE_ERROR(CUDA_ERROR_NOT_MAPPED) + DEFINE_ERROR(CUDA_ERROR_NOT_MAPPED_AS_ARRAY) + DEFINE_ERROR(CUDA_ERROR_NOT_MAPPED_AS_POINTER) + DEFINE_ERROR(CUDA_ERROR_ECC_UNCORRECTABLE) + DEFINE_ERROR(CUDA_ERROR_UNSUPPORTED_LIMIT) + DEFINE_ERROR(CUDA_ERROR_CONTEXT_ALREADY_IN_USE) + DEFINE_ERROR(CUDA_ERROR_PEER_ACCESS_UNSUPPORTED) + DEFINE_ERROR(CUDA_ERROR_INVALID_SOURCE) + DEFINE_ERROR(CUDA_ERROR_FILE_NOT_FOUND) + DEFINE_ERROR(CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND) + DEFINE_ERROR(CUDA_ERROR_SHARED_OBJECT_INIT_FAILED) + DEFINE_ERROR(CUDA_ERROR_OPERATING_SYSTEM) + DEFINE_ERROR(CUDA_ERROR_INVALID_HANDLE) + DEFINE_ERROR(CUDA_ERROR_NOT_FOUND) + DEFINE_ERROR(CUDA_ERROR_NOT_READY) + DEFINE_ERROR(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES) + DEFINE_ERROR(CUDA_ERROR_LAUNCH_TIMEOUT) + DEFINE_ERROR(CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING) + DEFINE_ERROR(CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) + DEFINE_ERROR(CUDA_ERROR_PEER_ACCESS_NOT_ENABLED) + DEFINE_ERROR(CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE) + 
DEFINE_ERROR(CUDA_ERROR_CONTEXT_IS_DESTROYED) + DEFINE_ERROR(CUDA_ERROR_ASSERT) + DEFINE_ERROR(CUDA_ERROR_TOO_MANY_PEERS) + DEFINE_ERROR(CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED) + DEFINE_ERROR(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED) + DEFINE_ERROR(CUDA_ERROR_LAUNCH_FAILED) + DEFINE_ERROR(CUDA_ERROR_NOT_PERMITTED) + DEFINE_ERROR(CUDA_ERROR_NOT_SUPPORTED) + DEFINE_ERROR(CUDA_ERROR_UNKNOWN) + default: return "CUDA_ERROR_UNKNOWN_CODE_" + to_string(code); + } + +#undef DEFINE_ERROR +} + +std::string formatDriverError(CUresult code) +{ + return driverErrorString(code) + " (" + to_string(code) + ")"; +} + +} + +typedef CUresult (CUDAAPI * p_pfn_cuDeviceGet) (CUdevice *, int); +typedef CUresult (CUDAAPI * p_pfn_cuCtxCreate) (CUcontext *, unsigned int, CUdevice); +typedef CUresult (CUDAAPI * p_pfn_cuCtxDestroy) (CUcontext); + +p_pfn_cuDeviceGet pfn_cuDeviceGet = 0; +p_pfn_cuCtxCreate pfn_cuCtxCreate = 0; +p_pfn_cuCtxDestroy pfn_cuCtxDestroy = 0; + +bool cuda_api_init() +{ + if (pfn_cuCtxCreate) + return true; + + CudaLibrary lib = cudaLoadLibrary(); + if (!lib) + return false; + + pfn_cuDeviceGet = (p_pfn_cuDeviceGet) cudaGetProcAddress(lib, "cuDeviceGet"); + pfn_cuCtxCreate = (p_pfn_cuCtxCreate) cudaGetProcAddress(lib, "cuCtxCreate_v2"); + pfn_cuCtxDestroy = (p_pfn_cuCtxDestroy) cudaGetProcAddress(lib, "cuCtxDestroy_v2"); + + return true; +} + +CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal) +{ + if (!pfn_cuDeviceGet) return CUDA_ERROR_NOT_INITIALIZED; + + return pfn_cuDeviceGet(device, ordinal); +} + +CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev) +{ + if (!pfn_cuCtxCreate) return CUDA_ERROR_NOT_INITIALIZED; + + return pfn_cuCtxCreate(pctx, flags, dev); +} + +CUresult CUDAAPI cuCtxDestroy(CUcontext ctx) +{ + if (!pfn_cuCtxDestroy) return CUDA_ERROR_NOT_INITIALIZED; + + return pfn_cuCtxDestroy(ctx); +} +#endif diff --git a/libs/gpu/libgpu/cuda/cuda_api.h b/libs/gpu/libgpu/cuda/cuda_api.h new file mode 100644 index 
0000000..6e878cf --- /dev/null +++ b/libs/gpu/libgpu/cuda/cuda_api.h @@ -0,0 +1,29 @@ +#pragma once + +#include +#include + +bool cuda_api_init(); + +namespace cuda { + + std::string formatDriverError(CUresult code); + + static inline void reportErrorCU(CUresult err, int line, std::string prefix="") + { + if (CUDA_SUCCESS == err) + return; + + std::string message = prefix + formatDriverError(err) + " at line " + to_string(line); + + switch (err) { + case CUDA_ERROR_OUT_OF_MEMORY: + throw cuda_bad_alloc(message); + default: + throw cuda_exception(message); + } + } + + #define CU_SAFE_CALL(expr) cuda::reportErrorCU(expr, __LINE__) + +} diff --git a/libs/gpu/libgpu/cuda/enum.cpp b/libs/gpu/libgpu/cuda/enum.cpp new file mode 100644 index 0000000..6e3f243 --- /dev/null +++ b/libs/gpu/libgpu/cuda/enum.cpp @@ -0,0 +1,94 @@ +#ifdef CUDA_SUPPORT +#include +#include +#include +#include +#include "enum.h" +#include "utils.h" + +bool CUDAEnum::printInfo(int id) +{ + cudaError_t status; + + cudaDeviceProp prop; + status = cudaGetDeviceProperties(&prop, id); + if (status != cudaSuccess) + return false; + + int driverVersion = 239; + status = cudaDriverGetVersion(&driverVersion); + if (status != cudaSuccess) + return false; + + int runtimeVersion = 239; + status = cudaRuntimeGetVersion(&runtimeVersion); + if (status != cudaSuccess) + return false; + + std::cout << "Using device: " << prop.name << ", " << prop.multiProcessorCount << " compute units, " << (prop.totalGlobalMem >> 20) << " MB global memory, compute capability " << prop.major << "." 
<< prop.minor << std::endl; + std::cout << " driver version: " << driverVersion << ", runtime version: " << runtimeVersion << std::endl; + std::cout << " max work group size " << prop.maxThreadsPerBlock << std::endl; + std::cout << " max work item sizes [" << prop.maxThreadsDim[0] << ", " << prop.maxThreadsDim[1] << ", " << prop.maxThreadsDim[2] << "]" << std::endl; + return true; +} + +CUDAEnum::CUDAEnum() +{ +} + +CUDAEnum::~CUDAEnum() +{ +} + +bool CUDAEnum::compareDevice(const Device &dev1, const Device &dev2) +{ + if (dev1.name > dev2.name) return false; + if (dev1.name < dev2.name) return true; + if (dev1.id > dev2.id) return false; + return true; +} + +bool CUDAEnum::enumDevices() +{ + int device_count = 0; + + cudaError_t res = cudaGetDeviceCount(&device_count); + if (res == cudaErrorNoDevice || res == cudaErrorInsufficientDriver) + return true; + + if (res != cudaSuccess) { + std::cerr << "cudaGetDeviceCount failed: " << cuda::formatError(res) << std::endl; + return false; + } + + for (int device_index = 0; device_index < device_count; device_index++) { + cudaDeviceProp prop; + + res = cudaGetDeviceProperties(&prop, device_index); + if (res != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties failed: " << cuda::formatError(res) << std::endl; + return false; + } + + // we don't support CUDA devices with compute capability < 2.0 + if (prop.major < 2) + continue; + + Device device; + + device.id = device_index; + device.name = prop.name; + device.compute_units = prop.multiProcessorCount; + device.mem_size = prop.totalGlobalMem; + device.clock = prop.clockRate / 1000; + device.pci_bus_id = prop.pciBusID; + device.pci_device_id = prop.pciDeviceID; + + devices_.push_back(device); + } + + std::sort(devices_.begin(), devices_.end(), compareDevice); + + return true; +} +#endif diff --git a/libs/gpu/libgpu/cuda/enum.h b/libs/gpu/libgpu/cuda/enum.h new file mode 100644 index 0000000..f864e38 --- /dev/null +++ b/libs/gpu/libgpu/cuda/enum.h @@ -0,0 +1,45 @@ 
+#pragma once + +#include +#include + +#ifdef CUDA_SUPPORT + +class CUDAEnum { +public: + CUDAEnum(); + ~CUDAEnum(); + + class Device { + public: + Device() + { + id = 0; + compute_units = 0; + mem_size = 0; + clock = 0; + pci_bus_id = 0; + pci_device_id = 0; + } + + int id; + std::string name; + unsigned int compute_units; + unsigned long long mem_size; + unsigned int clock; + unsigned int pci_bus_id; + unsigned int pci_device_id; + }; + + bool enumDevices(); + std::vector & devices() { return devices_; } + + static bool printInfo(int id); + +protected: + static bool compareDevice(const Device &dev1, const Device &dev2); + + std::vector devices_; +}; + +#endif diff --git a/libs/gpu/libgpu/cuda/sdk/helper_math.h b/libs/gpu/libgpu/cuda/sdk/helper_math.h new file mode 100644 index 0000000..24b79b3 --- /dev/null +++ b/libs/gpu/libgpu/cuda/sdk/helper_math.h @@ -0,0 +1,1453 @@ + /** + * Copyright 1993-2013 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +/* + * This file implements common mathematical operations on vector types + * (float3, float4 etc.) since these are not provided as standard by CUDA. + * + * The syntax is modeled on the Cg standard library. + * + * This is part of the Helper library includes + * + * Thanks to Linh Hah for additions and fixes. 
+ */ + +#ifndef HELPER_MATH_H +#define HELPER_MATH_H + +#include "cuda_runtime.h" + +typedef unsigned int uint; +typedef unsigned short ushort; + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +#ifndef __CUDACC__ +#include + +//////////////////////////////////////////////////////////////////////////////// +// host implementations of CUDA functions +//////////////////////////////////////////////////////////////////////////////// + +inline float fminf(float a, float b) +{ + return a < b ? a : b; +} + +inline float fmaxf(float a, float b) +{ + return a > b ? a : b; +} + +inline int max(int a, int b) +{ + return a > b ? a : b; +} + +inline int min(int a, int b) +{ + return a < b ? a : b; +} + +inline float rsqrtf(float x) +{ + return 1.0f / sqrtf(x); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// constructors +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 make_float2(float s) +{ + return make_float2(s, s); +} +inline __host__ __device__ float2 make_float2(float3 a) +{ + return make_float2(a.x, a.y); +} +inline __host__ __device__ float2 make_float2(int2 a) +{ + return make_float2(float(a.x), float(a.y)); +} +inline __host__ __device__ float2 make_float2(uint2 a) +{ + return make_float2(float(a.x), float(a.y)); +} + +inline __host__ __device__ int2 make_int2(int s) +{ + return make_int2(s, s); +} +inline __host__ __device__ int2 make_int2(int3 a) +{ + return make_int2(a.x, a.y); +} +inline __host__ __device__ int2 make_int2(uint2 a) +{ + return make_int2(int(a.x), int(a.y)); +} +inline __host__ __device__ int2 make_int2(float2 a) +{ + return make_int2(int(a.x), int(a.y)); +} + +inline __host__ __device__ uint2 make_uint2(uint s) +{ + return make_uint2(s, s); +} +inline __host__ __device__ uint2 make_uint2(uint3 a) +{ + return make_uint2(a.x, a.y); +} +inline __host__ __device__ uint2 make_uint2(int2 a) +{ + return make_uint2(uint(a.x), 
uint(a.y)); +} + +inline __host__ __device__ float3 make_float3(float s) +{ + return make_float3(s, s, s); +} +inline __host__ __device__ float3 make_float3(float2 a) +{ + return make_float3(a.x, a.y, 0.0f); +} +inline __host__ __device__ float3 make_float3(float2 a, float s) +{ + return make_float3(a.x, a.y, s); +} +inline __host__ __device__ float3 make_float3(float4 a) +{ + return make_float3(a.x, a.y, a.z); +} +inline __host__ __device__ float3 make_float3(int3 a) +{ + return make_float3(float(a.x), float(a.y), float(a.z)); +} +inline __host__ __device__ float3 make_float3(uint3 a) +{ + return make_float3(float(a.x), float(a.y), float(a.z)); +} + +inline __host__ __device__ int3 make_int3(int s) +{ + return make_int3(s, s, s); +} +inline __host__ __device__ int3 make_int3(int2 a) +{ + return make_int3(a.x, a.y, 0); +} +inline __host__ __device__ int3 make_int3(int2 a, int s) +{ + return make_int3(a.x, a.y, s); +} +inline __host__ __device__ int3 make_int3(uint3 a) +{ + return make_int3(int(a.x), int(a.y), int(a.z)); +} +inline __host__ __device__ int3 make_int3(float3 a) +{ + return make_int3(int(a.x), int(a.y), int(a.z)); +} + +inline __host__ __device__ uint3 make_uint3(uint s) +{ + return make_uint3(s, s, s); +} +inline __host__ __device__ uint3 make_uint3(uint2 a) +{ + return make_uint3(a.x, a.y, 0); +} +inline __host__ __device__ uint3 make_uint3(uint2 a, uint s) +{ + return make_uint3(a.x, a.y, s); +} +inline __host__ __device__ uint3 make_uint3(uint4 a) +{ + return make_uint3(a.x, a.y, a.z); +} +inline __host__ __device__ uint3 make_uint3(int3 a) +{ + return make_uint3(uint(a.x), uint(a.y), uint(a.z)); +} + +inline __host__ __device__ float4 make_float4(float s) +{ + return make_float4(s, s, s, s); +} +inline __host__ __device__ float4 make_float4(float3 a) +{ + return make_float4(a.x, a.y, a.z, 0.0f); +} +inline __host__ __device__ float4 make_float4(float3 a, float w) +{ + return make_float4(a.x, a.y, a.z, w); +} +inline __host__ __device__ float4 
make_float4(int4 a) +{ + return make_float4(float(a.x), float(a.y), float(a.z), float(a.w)); +} +inline __host__ __device__ float4 make_float4(uint4 a) +{ + return make_float4(float(a.x), float(a.y), float(a.z), float(a.w)); +} + +inline __host__ __device__ int4 make_int4(int s) +{ + return make_int4(s, s, s, s); +} +inline __host__ __device__ int4 make_int4(int3 a) +{ + return make_int4(a.x, a.y, a.z, 0); +} +inline __host__ __device__ int4 make_int4(int3 a, int w) +{ + return make_int4(a.x, a.y, a.z, w); +} +inline __host__ __device__ int4 make_int4(uint4 a) +{ + return make_int4(int(a.x), int(a.y), int(a.z), int(a.w)); +} +inline __host__ __device__ int4 make_int4(float4 a) +{ + return make_int4(int(a.x), int(a.y), int(a.z), int(a.w)); +} + + +inline __host__ __device__ uint4 make_uint4(uint s) +{ + return make_uint4(s, s, s, s); +} +inline __host__ __device__ uint4 make_uint4(uint3 a) +{ + return make_uint4(a.x, a.y, a.z, 0); +} +inline __host__ __device__ uint4 make_uint4(uint3 a, uint w) +{ + return make_uint4(a.x, a.y, a.z, w); +} +inline __host__ __device__ uint4 make_uint4(int4 a) +{ + return make_uint4(uint(a.x), uint(a.y), uint(a.z), uint(a.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// negate +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator-(float2 &a) +{ + return make_float2(-a.x, -a.y); +} +inline __host__ __device__ int2 operator-(int2 &a) +{ + return make_int2(-a.x, -a.y); +} +inline __host__ __device__ float3 operator-(float3 &a) +{ + return make_float3(-a.x, -a.y, -a.z); +} +inline __host__ __device__ int3 operator-(int3 &a) +{ + return make_int3(-a.x, -a.y, -a.z); +} +inline __host__ __device__ float4 operator-(float4 &a) +{ + return make_float4(-a.x, -a.y, -a.z, -a.w); +} +inline __host__ __device__ int4 operator-(int4 &a) +{ + return make_int4(-a.x, -a.y, -a.z, -a.w); +} + 
+//////////////////////////////////////////////////////////////////////////////// +// addition +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator+(float2 a, float2 b) +{ + return make_float2(a.x + b.x, a.y + b.y); +} +inline __host__ __device__ void operator+=(float2 &a, float2 b) +{ + a.x += b.x; + a.y += b.y; +} +inline __host__ __device__ float2 operator+(float2 a, float b) +{ + return make_float2(a.x + b, a.y + b); +} +inline __host__ __device__ float2 operator+(float b, float2 a) +{ + return make_float2(a.x + b, a.y + b); +} +inline __host__ __device__ void operator+=(float2 &a, float b) +{ + a.x += b; + a.y += b; +} + +inline __host__ __device__ int2 operator+(int2 a, int2 b) +{ + return make_int2(a.x + b.x, a.y + b.y); +} +inline __host__ __device__ void operator+=(int2 &a, int2 b) +{ + a.x += b.x; + a.y += b.y; +} +inline __host__ __device__ int2 operator+(int2 a, int b) +{ + return make_int2(a.x + b, a.y + b); +} +inline __host__ __device__ int2 operator+(int b, int2 a) +{ + return make_int2(a.x + b, a.y + b); +} +inline __host__ __device__ void operator+=(int2 &a, int b) +{ + a.x += b; + a.y += b; +} + +inline __host__ __device__ uint2 operator+(uint2 a, uint2 b) +{ + return make_uint2(a.x + b.x, a.y + b.y); +} +inline __host__ __device__ void operator+=(uint2 &a, uint2 b) +{ + a.x += b.x; + a.y += b.y; +} +inline __host__ __device__ uint2 operator+(uint2 a, uint b) +{ + return make_uint2(a.x + b, a.y + b); +} +inline __host__ __device__ uint2 operator+(uint b, uint2 a) +{ + return make_uint2(a.x + b, a.y + b); +} +inline __host__ __device__ void operator+=(uint2 &a, uint b) +{ + a.x += b; + a.y += b; +} + + +inline __host__ __device__ float3 operator+(float3 a, float3 b) +{ + return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); +} +inline __host__ __device__ void operator+=(float3 &a, float3 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; +} +inline __host__ __device__ float3 
operator+(float3 a, float b) +{ + return make_float3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ void operator+=(float3 &a, float b) +{ + a.x += b; + a.y += b; + a.z += b; +} + +inline __host__ __device__ int3 operator+(int3 a, int3 b) +{ + return make_int3(a.x + b.x, a.y + b.y, a.z + b.z); +} +inline __host__ __device__ void operator+=(int3 &a, int3 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; +} +inline __host__ __device__ int3 operator+(int3 a, int b) +{ + return make_int3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ void operator+=(int3 &a, int b) +{ + a.x += b; + a.y += b; + a.z += b; +} + +inline __host__ __device__ uint3 operator+(uint3 a, uint3 b) +{ + return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z); +} +inline __host__ __device__ void operator+=(uint3 &a, uint3 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; +} +inline __host__ __device__ uint3 operator+(uint3 a, uint b) +{ + return make_uint3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ void operator+=(uint3 &a, uint b) +{ + a.x += b; + a.y += b; + a.z += b; +} + +inline __host__ __device__ int3 operator+(int b, int3 a) +{ + return make_int3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ uint3 operator+(uint b, uint3 a) +{ + return make_uint3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ float3 operator+(float b, float3 a) +{ + return make_float3(a.x + b, a.y + b, a.z + b); +} + +inline __host__ __device__ float4 operator+(float4 a, float4 b) +{ + return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} +inline __host__ __device__ void operator+=(float4 &a, float4 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; + a.w += b.w; +} +inline __host__ __device__ float4 operator+(float4 a, float b) +{ + return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ float4 operator+(float b, float4 a) +{ + return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ void 
operator+=(float4 &a, float b) +{ + a.x += b; + a.y += b; + a.z += b; + a.w += b; +} + +inline __host__ __device__ int4 operator+(int4 a, int4 b) +{ + return make_int4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} +inline __host__ __device__ void operator+=(int4 &a, int4 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; + a.w += b.w; +} +inline __host__ __device__ int4 operator+(int4 a, int b) +{ + return make_int4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ int4 operator+(int b, int4 a) +{ + return make_int4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ void operator+=(int4 &a, int b) +{ + a.x += b; + a.y += b; + a.z += b; + a.w += b; +} + +inline __host__ __device__ uint4 operator+(uint4 a, uint4 b) +{ + return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} +inline __host__ __device__ void operator+=(uint4 &a, uint4 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; + a.w += b.w; +} +inline __host__ __device__ uint4 operator+(uint4 a, uint b) +{ + return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ uint4 operator+(uint b, uint4 a) +{ + return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ void operator+=(uint4 &a, uint b) +{ + a.x += b; + a.y += b; + a.z += b; + a.w += b; +} + +//////////////////////////////////////////////////////////////////////////////// +// subtract +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator-(float2 a, float2 b) +{ + return make_float2(a.x - b.x, a.y - b.y); +} +inline __host__ __device__ void operator-=(float2 &a, float2 b) +{ + a.x -= b.x; + a.y -= b.y; +} +inline __host__ __device__ float2 operator-(float2 a, float b) +{ + return make_float2(a.x - b, a.y - b); +} +inline __host__ __device__ float2 operator-(float b, float2 a) +{ + return make_float2(b - a.x, b - a.y); +} +inline __host__ __device__ void operator-=(float2 &a, float b) +{ 
+ a.x -= b; + a.y -= b; +} + +inline __host__ __device__ int2 operator-(int2 a, int2 b) +{ + return make_int2(a.x - b.x, a.y - b.y); +} +inline __host__ __device__ void operator-=(int2 &a, int2 b) +{ + a.x -= b.x; + a.y -= b.y; +} +inline __host__ __device__ int2 operator-(int2 a, int b) +{ + return make_int2(a.x - b, a.y - b); +} +inline __host__ __device__ int2 operator-(int b, int2 a) +{ + return make_int2(b - a.x, b - a.y); +} +inline __host__ __device__ void operator-=(int2 &a, int b) +{ + a.x -= b; + a.y -= b; +} + +inline __host__ __device__ uint2 operator-(uint2 a, uint2 b) +{ + return make_uint2(a.x - b.x, a.y - b.y); +} +inline __host__ __device__ void operator-=(uint2 &a, uint2 b) +{ + a.x -= b.x; + a.y -= b.y; +} +inline __host__ __device__ uint2 operator-(uint2 a, uint b) +{ + return make_uint2(a.x - b, a.y - b); +} +inline __host__ __device__ uint2 operator-(uint b, uint2 a) +{ + return make_uint2(b - a.x, b - a.y); +} +inline __host__ __device__ void operator-=(uint2 &a, uint b) +{ + a.x -= b; + a.y -= b; +} + +inline __host__ __device__ float3 operator-(float3 a, float3 b) +{ + return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); +} +inline __host__ __device__ void operator-=(float3 &a, float3 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; +} +inline __host__ __device__ float3 operator-(float3 a, float b) +{ + return make_float3(a.x - b, a.y - b, a.z - b); +} +inline __host__ __device__ float3 operator-(float b, float3 a) +{ + return make_float3(b - a.x, b - a.y, b - a.z); +} +inline __host__ __device__ void operator-=(float3 &a, float b) +{ + a.x -= b; + a.y -= b; + a.z -= b; +} + +inline __host__ __device__ int3 operator-(int3 a, int3 b) +{ + return make_int3(a.x - b.x, a.y - b.y, a.z - b.z); +} +inline __host__ __device__ void operator-=(int3 &a, int3 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; +} +inline __host__ __device__ int3 operator-(int3 a, int b) +{ + return make_int3(a.x - b, a.y - b, a.z - b); +} +inline __host__ __device__ int3 
operator-(int b, int3 a) +{ + return make_int3(b - a.x, b - a.y, b - a.z); +} +inline __host__ __device__ void operator-=(int3 &a, int b) +{ + a.x -= b; + a.y -= b; + a.z -= b; +} + +inline __host__ __device__ uint3 operator-(uint3 a, uint3 b) +{ + return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z); +} +inline __host__ __device__ void operator-=(uint3 &a, uint3 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; +} +inline __host__ __device__ uint3 operator-(uint3 a, uint b) +{ + return make_uint3(a.x - b, a.y - b, a.z - b); +} +inline __host__ __device__ uint3 operator-(uint b, uint3 a) +{ + return make_uint3(b - a.x, b - a.y, b - a.z); +} +inline __host__ __device__ void operator-=(uint3 &a, uint b) +{ + a.x -= b; + a.y -= b; + a.z -= b; +} + +inline __host__ __device__ float4 operator-(float4 a, float4 b) +{ + return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); +} +inline __host__ __device__ void operator-=(float4 &a, float4 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; + a.w -= b.w; +} +inline __host__ __device__ float4 operator-(float4 a, float b) +{ + return make_float4(a.x - b, a.y - b, a.z - b, a.w - b); +} +inline __host__ __device__ void operator-=(float4 &a, float b) +{ + a.x -= b; + a.y -= b; + a.z -= b; + a.w -= b; +} + +inline __host__ __device__ int4 operator-(int4 a, int4 b) +{ + return make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); +} +inline __host__ __device__ void operator-=(int4 &a, int4 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; + a.w -= b.w; +} +inline __host__ __device__ int4 operator-(int4 a, int b) +{ + return make_int4(a.x - b, a.y - b, a.z - b, a.w - b); +} +inline __host__ __device__ int4 operator-(int b, int4 a) +{ + return make_int4(b - a.x, b - a.y, b - a.z, b - a.w); +} +inline __host__ __device__ void operator-=(int4 &a, int b) +{ + a.x -= b; + a.y -= b; + a.z -= b; + a.w -= b; +} + +inline __host__ __device__ uint4 operator-(uint4 a, uint4 b) +{ + return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - 
b.w); +} +inline __host__ __device__ void operator-=(uint4 &a, uint4 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; + a.w -= b.w; +} +inline __host__ __device__ uint4 operator-(uint4 a, uint b) +{ + return make_uint4(a.x - b, a.y - b, a.z - b, a.w - b); +} +inline __host__ __device__ uint4 operator-(uint b, uint4 a) +{ + return make_uint4(b - a.x, b - a.y, b - a.z, b - a.w); +} +inline __host__ __device__ void operator-=(uint4 &a, uint b) +{ + a.x -= b; + a.y -= b; + a.z -= b; + a.w -= b; +} + +//////////////////////////////////////////////////////////////////////////////// +// multiply +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator*(float2 a, float2 b) +{ + return make_float2(a.x * b.x, a.y * b.y); +} +inline __host__ __device__ void operator*=(float2 &a, float2 b) +{ + a.x *= b.x; + a.y *= b.y; +} +inline __host__ __device__ float2 operator*(float2 a, float b) +{ + return make_float2(a.x * b, a.y * b); +} +inline __host__ __device__ float2 operator*(float b, float2 a) +{ + return make_float2(b * a.x, b * a.y); +} +inline __host__ __device__ void operator*=(float2 &a, float b) +{ + a.x *= b; + a.y *= b; +} + +inline __host__ __device__ int2 operator*(int2 a, int2 b) +{ + return make_int2(a.x * b.x, a.y * b.y); +} +inline __host__ __device__ void operator*=(int2 &a, int2 b) +{ + a.x *= b.x; + a.y *= b.y; +} +inline __host__ __device__ int2 operator*(int2 a, int b) +{ + return make_int2(a.x * b, a.y * b); +} +inline __host__ __device__ int2 operator*(int b, int2 a) +{ + return make_int2(b * a.x, b * a.y); +} +inline __host__ __device__ void operator*=(int2 &a, int b) +{ + a.x *= b; + a.y *= b; +} + +inline __host__ __device__ uint2 operator*(uint2 a, uint2 b) +{ + return make_uint2(a.x * b.x, a.y * b.y); +} +inline __host__ __device__ void operator*=(uint2 &a, uint2 b) +{ + a.x *= b.x; + a.y *= b.y; +} +inline __host__ __device__ uint2 operator*(uint2 a, uint b) +{ + return 
make_uint2(a.x * b, a.y * b); +} +inline __host__ __device__ uint2 operator*(uint b, uint2 a) +{ + return make_uint2(b * a.x, b * a.y); +} +inline __host__ __device__ void operator*=(uint2 &a, uint b) +{ + a.x *= b; + a.y *= b; +} + +inline __host__ __device__ float3 operator*(float3 a, float3 b) +{ + return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); +} +inline __host__ __device__ void operator*=(float3 &a, float3 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; +} +inline __host__ __device__ float3 operator*(float3 a, float b) +{ + return make_float3(a.x * b, a.y * b, a.z * b); +} +inline __host__ __device__ float3 operator*(float b, float3 a) +{ + return make_float3(b * a.x, b * a.y, b * a.z); +} +inline __host__ __device__ void operator*=(float3 &a, float b) +{ + a.x *= b; + a.y *= b; + a.z *= b; +} + +inline __host__ __device__ int3 operator*(int3 a, int3 b) +{ + return make_int3(a.x * b.x, a.y * b.y, a.z * b.z); +} +inline __host__ __device__ void operator*=(int3 &a, int3 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; +} +inline __host__ __device__ int3 operator*(int3 a, int b) +{ + return make_int3(a.x * b, a.y * b, a.z * b); +} +inline __host__ __device__ int3 operator*(int b, int3 a) +{ + return make_int3(b * a.x, b * a.y, b * a.z); +} +inline __host__ __device__ void operator*=(int3 &a, int b) +{ + a.x *= b; + a.y *= b; + a.z *= b; +} + +inline __host__ __device__ uint3 operator*(uint3 a, uint3 b) +{ + return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z); +} +inline __host__ __device__ void operator*=(uint3 &a, uint3 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; +} +inline __host__ __device__ uint3 operator*(uint3 a, uint b) +{ + return make_uint3(a.x * b, a.y * b, a.z * b); +} +inline __host__ __device__ uint3 operator*(uint b, uint3 a) +{ + return make_uint3(b * a.x, b * a.y, b * a.z); +} +inline __host__ __device__ void operator*=(uint3 &a, uint b) +{ + a.x *= b; + a.y *= b; + a.z *= b; +} + +inline __host__ __device__ float4 operator*(float4 a, 
float4 b) +{ + return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); +} +inline __host__ __device__ void operator*=(float4 &a, float4 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; + a.w *= b.w; +} +inline __host__ __device__ float4 operator*(float4 a, float b) +{ + return make_float4(a.x * b, a.y * b, a.z * b, a.w * b); +} +inline __host__ __device__ float4 operator*(float b, float4 a) +{ + return make_float4(b * a.x, b * a.y, b * a.z, b * a.w); +} +inline __host__ __device__ void operator*=(float4 &a, float b) +{ + a.x *= b; + a.y *= b; + a.z *= b; + a.w *= b; +} + +inline __host__ __device__ int4 operator*(int4 a, int4 b) +{ + return make_int4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); +} +inline __host__ __device__ void operator*=(int4 &a, int4 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; + a.w *= b.w; +} +inline __host__ __device__ int4 operator*(int4 a, int b) +{ + return make_int4(a.x * b, a.y * b, a.z * b, a.w * b); +} +inline __host__ __device__ int4 operator*(int b, int4 a) +{ + return make_int4(b * a.x, b * a.y, b * a.z, b * a.w); +} +inline __host__ __device__ void operator*=(int4 &a, int b) +{ + a.x *= b; + a.y *= b; + a.z *= b; + a.w *= b; +} + +inline __host__ __device__ uint4 operator*(uint4 a, uint4 b) +{ + return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); +} +inline __host__ __device__ void operator*=(uint4 &a, uint4 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; + a.w *= b.w; +} +inline __host__ __device__ uint4 operator*(uint4 a, uint b) +{ + return make_uint4(a.x * b, a.y * b, a.z * b, a.w * b); +} +inline __host__ __device__ uint4 operator*(uint b, uint4 a) +{ + return make_uint4(b * a.x, b * a.y, b * a.z, b * a.w); +} +inline __host__ __device__ void operator*=(uint4 &a, uint b) +{ + a.x *= b; + a.y *= b; + a.z *= b; + a.w *= b; +} + +//////////////////////////////////////////////////////////////////////////////// +// divide +//////////////////////////////////////////////////////////////////////////////// + 
+inline __host__ __device__ float2 operator/(float2 a, float2 b) +{ + return make_float2(a.x / b.x, a.y / b.y); +} +inline __host__ __device__ void operator/=(float2 &a, float2 b) +{ + a.x /= b.x; + a.y /= b.y; +} +inline __host__ __device__ float2 operator/(float2 a, float b) +{ + return make_float2(a.x / b, a.y / b); +} +inline __host__ __device__ void operator/=(float2 &a, float b) +{ + a.x /= b; + a.y /= b; +} +inline __host__ __device__ float2 operator/(float b, float2 a) +{ + return make_float2(b / a.x, b / a.y); +} + +inline __host__ __device__ float3 operator/(float3 a, float3 b) +{ + return make_float3(a.x / b.x, a.y / b.y, a.z / b.z); +} +inline __host__ __device__ void operator/=(float3 &a, float3 b) +{ + a.x /= b.x; + a.y /= b.y; + a.z /= b.z; +} +inline __host__ __device__ float3 operator/(float3 a, float b) +{ + return make_float3(a.x / b, a.y / b, a.z / b); +} +inline __host__ __device__ void operator/=(float3 &a, float b) +{ + a.x /= b; + a.y /= b; + a.z /= b; +} +inline __host__ __device__ float3 operator/(float b, float3 a) +{ + return make_float3(b / a.x, b / a.y, b / a.z); +} + +inline __host__ __device__ float4 operator/(float4 a, float4 b) +{ + return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); +} +inline __host__ __device__ void operator/=(float4 &a, float4 b) +{ + a.x /= b.x; + a.y /= b.y; + a.z /= b.z; + a.w /= b.w; +} +inline __host__ __device__ float4 operator/(float4 a, float b) +{ + return make_float4(a.x / b, a.y / b, a.z / b, a.w / b); +} +inline __host__ __device__ void operator/=(float4 &a, float b) +{ + a.x /= b; + a.y /= b; + a.z /= b; + a.w /= b; +} +inline __host__ __device__ float4 operator/(float b, float4 a) +{ + return make_float4(b / a.x, b / a.y, b / a.z, b / a.w); +} + +//////////////////////////////////////////////////////////////////////////////// +// min +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 fminf(float2 a, float2 b) +{ + 
return make_float2(fminf(a.x,b.x), fminf(a.y,b.y)); +} +inline __host__ __device__ float3 fminf(float3 a, float3 b) +{ + return make_float3(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z)); +} +inline __host__ __device__ float4 fminf(float4 a, float4 b) +{ + return make_float4(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z), fminf(a.w,b.w)); +} + +inline __host__ __device__ int2 min(int2 a, int2 b) +{ + return make_int2(min(a.x,b.x), min(a.y,b.y)); +} +inline __host__ __device__ int3 min(int3 a, int3 b) +{ + return make_int3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z)); +} +inline __host__ __device__ int4 min(int4 a, int4 b) +{ + return make_int4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w)); +} + +inline __host__ __device__ uint2 min(uint2 a, uint2 b) +{ + return make_uint2(min(a.x,b.x), min(a.y,b.y)); +} +inline __host__ __device__ uint3 min(uint3 a, uint3 b) +{ + return make_uint3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z)); +} +inline __host__ __device__ uint4 min(uint4 a, uint4 b) +{ + return make_uint4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// max +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 fmaxf(float2 a, float2 b) +{ + return make_float2(fmaxf(a.x,b.x), fmaxf(a.y,b.y)); +} +inline __host__ __device__ float3 fmaxf(float3 a, float3 b) +{ + return make_float3(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z)); +} +inline __host__ __device__ float4 fmaxf(float4 a, float4 b) +{ + return make_float4(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z), fmaxf(a.w,b.w)); +} + +inline __host__ __device__ int2 max(int2 a, int2 b) +{ + return make_int2(max(a.x,b.x), max(a.y,b.y)); +} +inline __host__ __device__ int3 max(int3 a, int3 b) +{ + return make_int3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z)); +} +inline __host__ __device__ int4 max(int4 a, int4 b) +{ + return make_int4(max(a.x,b.x), 
max(a.y,b.y), max(a.z,b.z), max(a.w,b.w)); +} + +inline __host__ __device__ uint2 max(uint2 a, uint2 b) +{ + return make_uint2(max(a.x,b.x), max(a.y,b.y)); +} +inline __host__ __device__ uint3 max(uint3 a, uint3 b) +{ + return make_uint3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z)); +} +inline __host__ __device__ uint4 max(uint4 a, uint4 b) +{ + return make_uint4(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z), max(a.w,b.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// lerp +// - linear interpolation between a and b, based on value t in [0, 1] range +//////////////////////////////////////////////////////////////////////////////// + +inline __device__ __host__ float lerp(float a, float b, float t) +{ + return a + t*(b-a); +} +inline __device__ __host__ float2 lerp(float2 a, float2 b, float t) +{ + return a + t*(b-a); +} +inline __device__ __host__ float3 lerp(float3 a, float3 b, float t) +{ + return a + t*(b-a); +} +inline __device__ __host__ float4 lerp(float4 a, float4 b, float t) +{ + return a + t*(b-a); +} + +//////////////////////////////////////////////////////////////////////////////// +// clamp +// - clamp the value v to be in the range [a, b] +//////////////////////////////////////////////////////////////////////////////// + +inline __device__ __host__ float clamp(float f, float a, float b) +{ + return fmaxf(a, fminf(f, b)); +} +inline __device__ __host__ int clamp(int f, int a, int b) +{ + return max(a, min(f, b)); +} +inline __device__ __host__ uint clamp(uint f, uint a, uint b) +{ + return max(a, min(f, b)); +} + +inline __device__ __host__ float2 clamp(float2 v, float a, float b) +{ + return make_float2(clamp(v.x, a, b), clamp(v.y, a, b)); +} +inline __device__ __host__ float2 clamp(float2 v, float2 a, float2 b) +{ + return make_float2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y)); +} +inline __device__ __host__ float3 clamp(float3 v, float a, float b) +{ + return make_float3(clamp(v.x, a, b), clamp(v.y, a, b), 
clamp(v.z, a, b)); +} +inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b) +{ + return make_float3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); +} +inline __device__ __host__ float4 clamp(float4 v, float a, float b) +{ + return make_float4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b)); +} +inline __device__ __host__ float4 clamp(float4 v, float4 a, float4 b) +{ + return make_float4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w)); +} + +inline __device__ __host__ int2 clamp(int2 v, int a, int b) +{ + return make_int2(clamp(v.x, a, b), clamp(v.y, a, b)); +} +inline __device__ __host__ int2 clamp(int2 v, int2 a, int2 b) +{ + return make_int2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y)); +} +inline __device__ __host__ int3 clamp(int3 v, int a, int b) +{ + return make_int3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); +} +inline __device__ __host__ int3 clamp(int3 v, int3 a, int3 b) +{ + return make_int3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); +} +inline __device__ __host__ int4 clamp(int4 v, int a, int b) +{ + return make_int4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b)); +} +inline __device__ __host__ int4 clamp(int4 v, int4 a, int4 b) +{ + return make_int4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w)); +} + +inline __device__ __host__ uint2 clamp(uint2 v, uint a, uint b) +{ + return make_uint2(clamp(v.x, a, b), clamp(v.y, a, b)); +} +inline __device__ __host__ uint2 clamp(uint2 v, uint2 a, uint2 b) +{ + return make_uint2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y)); +} +inline __device__ __host__ uint3 clamp(uint3 v, uint a, uint b) +{ + return make_uint3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); +} +inline __device__ __host__ uint3 clamp(uint3 v, uint3 a, uint3 b) +{ + return make_uint3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, 
b.z)); +} +inline __device__ __host__ uint4 clamp(uint4 v, uint a, uint b) +{ + return make_uint4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b)); +} +inline __device__ __host__ uint4 clamp(uint4 v, uint4 a, uint4 b) +{ + return make_uint4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// dot product +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float dot(float2 a, float2 b) +{ + return a.x * b.x + a.y * b.y; +} +inline __host__ __device__ float dot(float3 a, float3 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z; +} +inline __host__ __device__ float dot(float4 a, float4 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; +} + +inline __host__ __device__ int dot(int2 a, int2 b) +{ + return a.x * b.x + a.y * b.y; +} +inline __host__ __device__ int dot(int3 a, int3 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z; +} +inline __host__ __device__ int dot(int4 a, int4 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; +} + +inline __host__ __device__ uint dot(uint2 a, uint2 b) +{ + return a.x * b.x + a.y * b.y; +} +inline __host__ __device__ uint dot(uint3 a, uint3 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z; +} +inline __host__ __device__ uint dot(uint4 a, uint4 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; +} + +//////////////////////////////////////////////////////////////////////////////// +// length +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float length(float2 v) +{ + return sqrtf(dot(v, v)); +} +inline __host__ __device__ float length(float3 v) +{ + return sqrtf(dot(v, v)); +} +inline __host__ __device__ float length(float4 v) +{ + return sqrtf(dot(v, v)); +} + 
+//////////////////////////////////////////////////////////////////////////////// +// normalize +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 normalize(float2 v) +{ + float invLen = rsqrtf(dot(v, v)); + return v * invLen; +} +inline __host__ __device__ float3 normalize(float3 v) +{ + float invLen = rsqrtf(dot(v, v)); + return v * invLen; +} +inline __host__ __device__ float4 normalize(float4 v) +{ + float invLen = rsqrtf(dot(v, v)); + return v * invLen; +} + +//////////////////////////////////////////////////////////////////////////////// +// floor +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 floorf(float2 v) +{ + return make_float2(floorf(v.x), floorf(v.y)); +} +inline __host__ __device__ float3 floorf(float3 v) +{ + return make_float3(floorf(v.x), floorf(v.y), floorf(v.z)); +} +inline __host__ __device__ float4 floorf(float4 v) +{ + return make_float4(floorf(v.x), floorf(v.y), floorf(v.z), floorf(v.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// frac - returns the fractional portion of a scalar or each vector component +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float fracf(float v) +{ + return v - floorf(v); +} +inline __host__ __device__ float2 fracf(float2 v) +{ + return make_float2(fracf(v.x), fracf(v.y)); +} +inline __host__ __device__ float3 fracf(float3 v) +{ + return make_float3(fracf(v.x), fracf(v.y), fracf(v.z)); +} +inline __host__ __device__ float4 fracf(float4 v) +{ + return make_float4(fracf(v.x), fracf(v.y), fracf(v.z), fracf(v.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// fmod +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 fmodf(float2 a, float2 b) +{ + return 
make_float2(fmodf(a.x, b.x), fmodf(a.y, b.y)); +} +inline __host__ __device__ float3 fmodf(float3 a, float3 b) +{ + return make_float3(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z)); +} +inline __host__ __device__ float4 fmodf(float4 a, float4 b) +{ + return make_float4(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z), fmodf(a.w, b.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// absolute value +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 fabs(float2 v) +{ + return make_float2(fabs(v.x), fabs(v.y)); +} +inline __host__ __device__ float3 fabs(float3 v) +{ + return make_float3(fabs(v.x), fabs(v.y), fabs(v.z)); +} +inline __host__ __device__ float4 fabs(float4 v) +{ + return make_float4(fabs(v.x), fabs(v.y), fabs(v.z), fabs(v.w)); +} + +inline __host__ __device__ int2 abs(int2 v) +{ + return make_int2(abs(v.x), abs(v.y)); +} +inline __host__ __device__ int3 abs(int3 v) +{ + return make_int3(abs(v.x), abs(v.y), abs(v.z)); +} +inline __host__ __device__ int4 abs(int4 v) +{ + return make_int4(abs(v.x), abs(v.y), abs(v.z), abs(v.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// reflect +// - returns reflection of incident ray I around surface normal N +// - N should be normalized, reflected vector's length is equal to length of I +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float3 reflect(float3 i, float3 n) +{ + return i - 2.0f * n * dot(n,i); +} + +//////////////////////////////////////////////////////////////////////////////// +// cross product +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float3 cross(float3 a, float3 b) +{ + return make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x); +} + 
+//////////////////////////////////////////////////////////////////////////////// +// smoothstep +// - returns 0 if x < a +// - returns 1 if x > b +// - otherwise returns smooth interpolation between 0 and 1 based on x +//////////////////////////////////////////////////////////////////////////////// + +inline __device__ __host__ float smoothstep(float a, float b, float x) +{ + float y = clamp((x - a) / (b - a), 0.0f, 1.0f); + return (y*y*(3.0f - (2.0f*y))); +} +inline __device__ __host__ float2 smoothstep(float2 a, float2 b, float2 x) +{ + float2 y = clamp((x - a) / (b - a), 0.0f, 1.0f); + return (y*y*(make_float2(3.0f) - (make_float2(2.0f)*y))); +} +inline __device__ __host__ float3 smoothstep(float3 a, float3 b, float3 x) +{ + float3 y = clamp((x - a) / (b - a), 0.0f, 1.0f); + return (y*y*(make_float3(3.0f) - (make_float3(2.0f)*y))); +} +inline __device__ __host__ float4 smoothstep(float4 a, float4 b, float4 x) +{ + float4 y = clamp((x - a) / (b - a), 0.0f, 1.0f); + return (y*y*(make_float4(3.0f) - (make_float4(2.0f)*y))); +} + +#endif diff --git a/libs/gpu/libgpu/cuda/utils.cpp b/libs/gpu/libgpu/cuda/utils.cpp new file mode 100644 index 0000000..36a3794 --- /dev/null +++ b/libs/gpu/libgpu/cuda/utils.cpp @@ -0,0 +1,14 @@ +#ifdef CUDA_SUPPORT +#include "utils.h" +#include "cuda_api.h" + +namespace cuda { + +std::string formatError(cudaError_t code) +{ + return std::string(cudaGetErrorString(code)) + " (" + to_string(code) + ")"; +} + +} + +#endif diff --git a/libs/gpu/libgpu/cuda/utils.h b/libs/gpu/libgpu/cuda/utils.h new file mode 100644 index 0000000..b7c3bc5 --- /dev/null +++ b/libs/gpu/libgpu/cuda/utils.h @@ -0,0 +1,76 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#define CUDA_KERNELS_ACCURATE_ERRORS_CHECKS true + +#ifndef NDEBUG +#undef CUDA_KERNELS_ACCURATE_ERRORS_CHECKS +#define CUDA_KERNELS_ACCURATE_ERRORS_CHECKS true +#endif + +namespace cuda { + + class cuda_exception : public gpu::gpu_exception { + public: + 
cuda_exception(std::string msg) throw () : gpu_exception(msg) { } + cuda_exception(const char *msg) throw () : gpu_exception(msg) { } + cuda_exception() throw () : gpu_exception("CUDA exception") { } + }; + + class cuda_bad_alloc : public gpu::gpu_bad_alloc { + public: + cuda_bad_alloc(std::string msg) throw () : gpu_bad_alloc(msg) { } + cuda_bad_alloc(const char *msg) throw () : gpu_bad_alloc(msg) { } + cuda_bad_alloc() throw () : gpu_bad_alloc("CUDA exception") { } + }; + + std::string formatError(cudaError_t code); + + static inline void reportError(cudaError_t err, int line, std::string prefix="") + { + if (cudaSuccess == err) + return; + + std::string message = prefix + formatError(err) + " at line " + to_string(line); + + switch (err) { + case cudaErrorMemoryAllocation: + throw cuda_bad_alloc(message); + default: + throw cuda_exception(message); + } + } + + static inline void checkKernelErrors(cudaStream_t stream, int line) + { + reportError(cudaGetLastError(), line, "Kernel failed: "); + if (CUDA_KERNELS_ACCURATE_ERRORS_CHECKS) { + reportError(cudaStreamSynchronize(stream), line, "Kernel failed: "); + } + } + + #define CUDA_SAFE_CALL(expr) cuda::reportError(expr, __LINE__) + #define CUDA_CHECK_KERNEL(stream) cuda::checkKernelErrors(stream, __LINE__) + + template class DataTypeRange { }; + template<> class DataTypeRange { public: static __device__ unsigned char min() { return 0; } static __device__ unsigned char max() { return UCHAR_MAX; }}; + template<> class DataTypeRange { public: static __device__ unsigned short min() { return 0; } static __device__ unsigned short max() { return USHRT_MAX; }}; + template<> class DataTypeRange { public: static __device__ unsigned int min() { return 0; } static __device__ unsigned int max() { return UINT_MAX; }}; + template<> class DataTypeRange { public: static __device__ float min() { return FLT_MIN; } static __device__ float max() { return FLT_MAX; }}; + template<> class DataTypeRange { public: static __device__ double 
min() { return DBL_MIN; } static __device__ double max() { return DBL_MAX; }}; + + template class TypeHelper { }; + template<> class TypeHelper { public: typedef unsigned int type32; }; + template<> class TypeHelper { public: typedef unsigned int type32; }; + template<> class TypeHelper { public: typedef unsigned int type32; }; + template<> class TypeHelper { public: typedef float type32; }; + template<> class TypeHelper { public: typedef float type32; }; + +} diff --git a/libs/gpu/libgpu/device.cpp b/libs/gpu/libgpu/device.cpp new file mode 100644 index 0000000..f663dd4 --- /dev/null +++ b/libs/gpu/libgpu/device.cpp @@ -0,0 +1,174 @@ +#include "device.h" +#include "context.h" +#include +#include +#include + +#ifdef CUDA_SUPPORT +#include +#include +#include +#endif + +namespace gpu { + +std::vector enumDevices() +{ + std::vector devices; + +#ifdef CUDA_SUPPORT + CUDAEnum cuda_enum; + cuda_enum.enumDevices(); + + const std::vector &cuda_devices = cuda_enum.devices(); + for (size_t k = 0; k < cuda_devices.size(); k++) { + const CUDAEnum::Device &cuda_device = cuda_devices[k]; + + Device device; + device.name = cuda_device.name; + device.compute_units = cuda_device.compute_units; + device.clock = cuda_device.clock; + device.mem_size = cuda_device.mem_size; + device.pci_bus_id = cuda_device.pci_bus_id; + device.pci_device_id = cuda_device.pci_device_id; + device.supports_opencl = false; + device.supports_cuda = true; + device.device_id_opencl = 0; + device.device_id_cuda = cuda_device.id; + devices.push_back(device); + } +#endif + + OpenCLEnum opencl_enum; + opencl_enum.enumDevices(); + + const std::vector &opencl_devices = opencl_enum.devices(); + for (size_t k = 0; k < opencl_devices.size(); k++) { + const OpenCLEnum::Device &opencl_device = opencl_devices[k]; + + Device device; + device.name = opencl_device.name; + device.opencl_vendor = opencl_device.vendor; + device.opencl_version = opencl_device.version; + device.compute_units = opencl_device.compute_units; + 
device.clock = opencl_device.clock; + device.mem_size = opencl_device.mem_size; + device.pci_bus_id = opencl_device.nvidia_pci_bus_id; + device.pci_device_id = opencl_device.nvidia_pci_slot_id; + device.supports_opencl = true; + device.supports_cuda = false; + device.device_id_opencl = opencl_device.id; + device.device_id_cuda = 0; + devices.push_back(device); + } + +#ifdef CUDA_SUPPORT + std::sort(devices.begin(), devices.end()); + + // merge corresponding devices + for (size_t k = 0; k + 1 < devices.size(); k++) { + if (devices[k].name != devices[k + 1].name) continue; + if (devices[k].pci_bus_id != devices[k + 1].pci_bus_id) continue; + if (devices[k].pci_device_id != devices[k + 1].pci_device_id) continue; + + if (!devices[k].supports_opencl && !devices[k + 1].supports_cuda) { + devices[k].supports_opencl = true; + devices[k].device_id_opencl = devices[k + 1].device_id_opencl; + devices.erase(devices.begin() + k + 1); + } + } +#endif + + return devices; +} + +bool Device::printInfo() const +{ +#ifdef CUDA_SUPPORT + if (supports_cuda) { + return CUDAEnum::printInfo(device_id_cuda); + } +#endif + + if (supports_opencl) { + ocl::DeviceInfo device_info; + device_info.init(device_id_opencl); + device_info.print(); + return true; + } + + return false; +} + +bool Device::supportsFreeMemoryQuery() const +{ +#ifdef CUDA_SUPPORT + if (supports_cuda) { + return true; + } else +#endif + if (supports_opencl) { + ocl::DeviceInfo device_info; + device_info.init(device_id_opencl); + if (device_info.hasExtension(CL_AMD_DEVICE_ATTRIBUTE_QUERY_EXT)) { + return true; + } + } + + return false; +} + +unsigned long long Device::getFreeMemory() const +{ +#ifdef CUDA_SUPPORT + if (supports_cuda) { + Context context; + context.init(device_id_cuda); + context.activate(); + + size_t total_mem_size = 0; + size_t free_mem_size = 0; + CUDA_SAFE_CALL(cudaMemGetInfo(&free_mem_size, &total_mem_size)); + return free_mem_size; + } else +#endif + if (supports_opencl) { + ocl::DeviceInfo 
device_info; + device_info.init(device_id_opencl); + if (device_info.device_type == CL_DEVICE_TYPE_GPU && device_info.hasExtension(CL_AMD_DEVICE_ATTRIBUTE_QUERY_EXT)) { + cl_ulong free_mem = 0; + OCL_SAFE_CALL(clGetDeviceInfo(device_id_opencl, CL_DEVICE_GLOBAL_FREE_MEMORY_AMD, sizeof(free_mem), &free_mem, NULL)); + return free_mem * 1024; + } else { + size_t free_mem_size = mem_size - mem_size / 5; + return free_mem_size; + } + } else { + return 0x40000000ull * 64; // assume 64GB by default + } +} + +std::vector selectDevices(unsigned int mask, bool silent) +{ + if (!mask) + return std::vector(); + + std::vector devices = enumDevices(); + + std::vector res; + for (size_t k = 0; k < devices.size(); k++) { + if (!(mask & (1 << k))) + continue; + + Device &device = devices[k]; + if (!silent) + if (!device.printInfo()) + continue; + + res.push_back(device); + } + + return res; +} + +} diff --git a/libs/gpu/libgpu/device.h b/libs/gpu/libgpu/device.h new file mode 100644 index 0000000..0ae5e0a --- /dev/null +++ b/libs/gpu/libgpu/device.h @@ -0,0 +1,51 @@ +#pragma once + +#include +#include + +typedef struct _cl_device_id * cl_device_id; + +namespace gpu { + +class Device { +public: + std::string name; + std::string opencl_vendor; + std::string opencl_version; + unsigned int compute_units; + unsigned int clock; + unsigned long long mem_size; + unsigned int pci_bus_id; + unsigned int pci_device_id; + + bool supports_opencl; + bool supports_cuda; + + cl_device_id device_id_opencl; + int device_id_cuda; + + bool printInfo() const; + + bool supportsFreeMemoryQuery() const; + unsigned long long getFreeMemory() const; + + bool operator< (const Device &other) const + { + if (name < other.name) return true; + if (name > other.name) return false; + if (pci_bus_id < other.pci_bus_id) return true; + if (pci_bus_id > other.pci_bus_id) return false; + if (pci_device_id < other.pci_device_id) return true; + if (pci_device_id > other.pci_device_id) return false; + if (supports_opencl < 
other.supports_opencl) return true; + if (supports_opencl > other.supports_opencl) return false; + if (supports_cuda < other.supports_cuda) return true; + if (supports_cuda > other.supports_cuda) return false; + return false; + } +}; + +std::vector enumDevices(); +std::vector selectDevices(unsigned int mask, bool silent=false); + +} diff --git a/libs/gpu/libgpu/gold_helpers.cpp b/libs/gpu/libgpu/gold_helpers.cpp new file mode 100644 index 0000000..14a0d15 --- /dev/null +++ b/libs/gpu/libgpu/gold_helpers.cpp @@ -0,0 +1,96 @@ +#include "gold_helpers.h" + +#include +#include +#include +#include + +namespace gold { + + template + void host_data::init(const gpu::gpu_mem_any &gpu_data) { + size_t n = gpu_data.size() / sizeof(T); + data = std::vector(n); + gpu_data.read(data.data(), gpu_data.size()); + } + + template + void host_data::init(const gpu::shared_device_buffer_typed &gpu_data) { + size_t n = gpu_data.size() / sizeof(T); + data = std::vector(n); + gpu_data.readN(data.data(), n); + } + + template + bool host_data::operator==(const host_data &that) { + for (size_t i = 0; i < data.size(); ++i) { + if (data[i] != that.data[i]) { + return false; + } + } + return true; + } + + template + T diff(T a, T b) { + return std::max(a, b) - std::min(a, b); + } + + float diff(float a, float b) { + if (!std::isnan(a) && std::isnan(b)) { + return std::max(a, b) - std::min(a, b); + } else if (std::isnan(a) &&std::isnan(b)) { + return 0.0f; + } else if (std::isnan(a)) { + return std::abs(b); + } else { + assert(std::isnan(b)); + return std::abs(a); + } + } + + double diff(double a, double b) { + if (std::isnan(a) && std::isnan(b)) { + return std::max(a, b) - std::min(a, b); + } else if (std::isnan(a) &&std::isnan(b)) { + return 0.0; + } else if (std::isnan(a)) { + return std::abs(b); + } else { + assert(std::isnan(b)); + return std::abs(a); + } + } + + void ensure(bool condition, int line) { + if (!condition) { + std::cerr << "GOLD check filed at line " << line << "!" 
<< std::endl; + } + } + + template + void ensure_less(T a, T b, int line) { + if (a < b) { + return; + } else { + std::cerr << "Failed check: " << a << " < " << b << std::endl; + ensure(a < b, line); + } + } + + template class host_data; + template class host_data; + template class host_data; + template class host_data; + template class host_data; + template class host_data; + template class host_data; + template class host_data; + + template void ensure_less(uint8_t a, uint8_t b, int line); + template void ensure_less(uint16_t a, uint16_t b, int line); + template void ensure_less(uint32_t a, uint32_t b, int line); + template void ensure_less(float a, float b, int line); + template void ensure_less(double a, double b, int line); + +} diff --git a/libs/gpu/libgpu/gold_helpers.h b/libs/gpu/libgpu/gold_helpers.h new file mode 100644 index 0000000..5315eef --- /dev/null +++ b/libs/gpu/libgpu/gold_helpers.h @@ -0,0 +1,36 @@ +#pragma once + +#include "shared_device_buffer.h" + +#include + +#define GOLD_CHECK(condition) gold::ensure(condition, __LINE__) +#define GOLD_CHECK_LESS(a, b) gold::ensure_less(a, b, __LINE__) + +namespace gold { + + template + class host_data { + public: + host_data() {} + host_data(const gpu::gpu_mem_any& gpu_data) { init(gpu_data); }; + host_data(const gpu::shared_device_buffer_typed& gpu_data) { init(gpu_data); }; + + void init(const gpu::gpu_mem_any& gpu_data); + void init(const gpu::shared_device_buffer_typed& gpu_data); + + bool operator==(const host_data& that); + bool operator!=(const host_data& that) { return !(*this == that); } + + T* ptr() { return data.data(); } + + private: + std::vector data; + }; + + void ensure(bool condition, int line); + + template + void ensure_less(T a, T b, int line); + +} diff --git a/libs/gpu/libgpu/hexdumparray.cpp b/libs/gpu/libgpu/hexdumparray.cpp new file mode 100644 index 0000000..0c888af --- /dev/null +++ b/libs/gpu/libgpu/hexdumparray.cpp @@ -0,0 +1,74 @@ +#include +#include +#include +#include + +int 
main(int argc, char **argv) +{ + if (argc != 4) { + std::cerr << "Usage: " << argv[0] << " " << std::endl; + return 1; + } + + std::string sourceFilename(argv[1]); + std::string headerFilename(argv[2]); + std::string arrayName(argv[3]); + + std::ifstream fin(sourceFilename, std::ios::binary); + std::ofstream fout(headerFilename); + + if (!fin) { + std::cerr << "Can't open file " << sourceFilename << "!" << std::endl; + return 1; + } + + // #include + // + // static const char [] = { + // /*... hexadecimal data from source file ...*/ + // }; + // + // size_t _length = sizeof() / sizeof(char); + + fout << "#include " << std::endl; + fout << std::endl; + fout << "static const char " << arrayName << "[] = {" << std::endl; + + char buffer[2391]; + const int maxBytesInLine = 120 / 6; + int bytesInLine = 0; + + std::streamsize n; + do { + fin.read(buffer, sizeof(buffer) / sizeof(char)); + n = fin.gcount(); + for (std::streamsize i = 0; i < n; ++i) { + unsigned int value = (unsigned int) buffer[i]; + if (value > 0xff) { + value -= 0xffffff00; + } + if (value >= 128) { + fout << "-"; + value = 256 - value; + } + fout << "0x" << std::setw(2) << std::setfill('0') << std::hex << value << ", "; + ++bytesInLine; + if (bytesInLine == maxBytesInLine) { + fout << std::endl; + bytesInLine = 0; + } + } + } while (n > 0); + + if (bytesInLine > 0) + fout << std::endl; + + fout << "};" << std::endl; + fout << std::endl; + fout << "size_t " << arrayName << "_length = sizeof(" << arrayName << ") / sizeof(char);" << std::endl; + + fin.close(); + fout.close(); + + return 0; +} diff --git a/libs/gpu/libgpu/opencl/cl/c_template.cl b/libs/gpu/libgpu/opencl/cl/c_template.cl new file mode 100644 index 0000000..0970716 --- /dev/null +++ b/libs/gpu/libgpu/opencl/cl/c_template.cl @@ -0,0 +1,8 @@ +#ifndef c_template_cl // pragma once +#define c_template_cl + +#define T_DEPENDENT2(fun, suffix) fun ## _ ## suffix +#define T_DEPENDENT1(fun, suffix) T_DEPENDENT2(fun, suffix) +#define T_DEPENDENT(fun) 
T_DEPENDENT1(fun, T) + +#endif // pragma once diff --git a/libs/gpu/libgpu/opencl/cl/clion_defines.cl b/libs/gpu/libgpu/opencl/cl/clion_defines.cl new file mode 100644 index 0000000..942af70 --- /dev/null +++ b/libs/gpu/libgpu/opencl/cl/clion_defines.cl @@ -0,0 +1,74 @@ +#ifndef clion_defines_cl // pragma once +#define clion_defines_cl + +#ifdef __CLION_IDE__ + +#ifndef STATIC_KEYWORD +#define STATIC_KEYWORD static +#endif + +#define __kernel +#define __global +#define __local +#define __constant +#define __private + +#define half float + +struct float2 { float x; }; +struct float3 { float x, y, z; }; +struct float4 { float x, y, z, w; }; + +// https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/commonFunctions.html +#define gentype float +gentype clamp (gentype x, float minval, float maxval); +gentype degrees (gentype radians); +gentype max (gentype x, gentype y); +gentype min (gentype x, gentype y); +gentype mix (gentype x, gentype y, gentype a); +gentype radians (gentype degrees); +gentype sign (gentype x); +gentype smoothstep (gentype edge0, gentype edge1, gentype x); +gentype step (gentype edge, gentype x); +#undef gentype + +// https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/barrier.html +enum cl_mem_fence_flags +{ + CLK_LOCAL_MEM_FENCE, + CLK_GLOBAL_MEM_FENCE +}; +void barrier(cl_mem_fence_flags flags); + +// https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/vectorDataLoadandStoreFunctions.html +#define gentype float +#define gentypen float4 +gentypen vload4 (size_t offset, const gentype *p); +void vstore4 (gentypen data, size_t offset, gentype *p); +void vstore4 (gentypen data, size_t offset, gentype *p); +#undef gentypen +#undef gentype +float vload_half (size_t offset, const half *p); +float4 vload_half4 (size_t offset, const half *p); +void vstore_half (float data, size_t offset, half *p); +void vstore_half4 (float4 data, size_t offset, half *p); +float4 vloada_half4 (size_t offset, const half *p); +void 
vstorea_half4 (float4 data, size_t offset, half *p); + +// https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/workItemFunctions.html +size_t get_global_size (uint dimindx); +size_t get_global_id (uint dimindx); +size_t get_local_size (uint dimindx); +size_t get_local_id (uint dimindx); +size_t get_num_groups (uint dimindx); +size_t get_group_id (uint dimindx); +size_t get_global_offset (uint dimindx); +uint get_work_dim (); + +// Defined in libs/gpu/libgpu/opencl/engine.cpp:584 +// 64 for AMD, 32 for NVidia, 8 for intel GPUs, 1 for CPU +#define WARP_SIZE 64 + +#endif + +#endif // pragma once diff --git a/libs/gpu/libgpu/opencl/cl/common.cl b/libs/gpu/libgpu/opencl/cl/common.cl new file mode 100644 index 0000000..6f3dac1 --- /dev/null +++ b/libs/gpu/libgpu/opencl/cl/common.cl @@ -0,0 +1,427 @@ +#ifndef common_cl // pragma once +#define common_cl + +#line 5 + +#ifdef HOST_CODE +#include +#include +#else +#include "clion_defines.cl" +#endif + +//#define DEBUG + +#ifndef HOST_CODE +#ifdef DEBUG +#define printf_assert(condition, message) \ + if (!(condition)) printf("%s Line %d\n", message, __LINE__); +#else +#define printf_assert(condition, message) +#endif + +#define assert_isfinite(value) \ + printf_assert(isfinite(value), "Value should be finite!"); +#endif + +#define BOOL_TYPE int +#define BOOL_TRUE 1 +#define BOOL_FALSE 0 + +//______DEVICE_CODE_____________________________________________________________________________________________________ + +#ifndef HOST_CODE + #define make_uint3 (uint3) + #define make_uint4 (uint4) + #define make_float2 (float2) + #define make_float3 (float3) + #define make_float4 (float4) + + #define tanf tan + #define cosf cos + #define sinf sin + #define atan2f atan2 + #define atanf atan + #define asinf asin + + STATIC_KEYWORD float sqrtf(float x) + { + return sqrt(x); + } + + STATIC_KEYWORD float expf(float x) + { + return exp(x); + } + + #define norm(v) length(v) + #define norm2(v) dot(v, v) + + #define min3(x, y, z) 
(min(min(x, y), z)) + #define max3(x, y, z) (max(max(x, y), z)) + + #define cl_float4 float4 + + STATIC_KEYWORD uint3 fetch_uint3(__global const unsigned int* ptr, size_t index) + { + return make_uint3(ptr[3 * index + 0], ptr[3 * index + 1], ptr[3 * index + 2]); + } + + STATIC_KEYWORD uint4 fetch_uint4(__global const unsigned int* ptr, size_t index) + { + return make_uint4(ptr[4 * index + 0], ptr[4 * index + 1], ptr[4 * index + 2], ptr[4 * index + 3]); + } + + STATIC_KEYWORD float2 fetch_float2(__global const float* ptr, size_t index) + { + return make_float2(ptr[2 * index + 0], ptr[2 * index + 1]); + } + + STATIC_KEYWORD float3 fetch_float3(__global const float* ptr, size_t index) + { + return make_float3(ptr[3 * index + 0], ptr[3 * index + 1], ptr[3 * index + 2]); + } + + STATIC_KEYWORD float4 fetch_float4(__global const float* ptr, size_t index) + { + return make_float4(ptr[4 * index + 0], ptr[4 * index + 1], ptr[4 * index + 2], ptr[4 * index + 3]); + } + + STATIC_KEYWORD void set_uint3(__global unsigned int* ptr, size_t index, uint3 value) + { + ptr[3 * index + 0] = value.x; + ptr[3 * index + 1] = value.y; + ptr[3 * index + 2] = value.z; + } + + STATIC_KEYWORD void set_float3(__global float* ptr, size_t index, float3 value) + { + ptr[3 * index + 0] = value.x; + ptr[3 * index + 1] = value.y; + ptr[3 * index + 2] = value.z; + } + + STATIC_KEYWORD void atomic_add_f32(volatile __global float *address, float value) { + float old = value; + while ((old = atomic_xchg(address, atomic_xchg(address, 0.0f)+old))!=0.0f); + } + + STATIC_KEYWORD float atomic_cmpxchg_f32(volatile __global float *p, float cmp, float val) { + union { + unsigned int u32; + float f32; + } cmp_union, val_union, old_union; + + cmp_union.f32 = cmp; + val_union.f32 = val; + old_union.u32 = atomic_cmpxchg((volatile __global unsigned int *) p, cmp_union.u32, val_union.u32); + return old_union.f32; + } + + STATIC_KEYWORD float atomic_cmpxchg_float(volatile __global float *p, float cmp, float val) { + 
return atomic_cmpxchg_f32(p, cmp, val); + } + + STATIC_KEYWORD unsigned int atomic_cmpxchg_uint(volatile __global uint *p, uint cmp, uint val) { + return atomic_cmpxchg(p, cmp, val); + } + + STATIC_KEYWORD unsigned char rounded_cast_uchar(float value) { + return (unsigned char) (value + 0.5f); + } + + STATIC_KEYWORD unsigned short rounded_cast_ushort(float value) { + return (unsigned short) (value + 0.5f); + } + + STATIC_KEYWORD unsigned int rounded_cast_uint(float value) { + return (unsigned int) (value + 0.5f); + } + + STATIC_KEYWORD float rounded_cast_float(float value) { + return value; + } +#endif + +//______SHARED_STRUCTS__________________________________________________________________________________________________ + +// https://devtalk.nvidia.com/default/topic/673965/are-there-any-cuda-libararies-for-3x3-matrix-amp-vector3-amp-quaternion-operations-/ +typedef struct { + cl_float4 m_row[3]; +} Matrix3x3f; + +typedef struct { + cl_float4 m_row[4]; +} Matrix4x4f; + +//______HOST_CODE_______________________________________________________________________________________________________ + +#ifdef HOST_CODE + inline cl_float3 make_float3(const vector3d &a) + { + cl_float3 v = {(float) a.x(), (float) a.y(), (float) a.z()}; + return v; + } + + inline cl_float4 make_float4(const vector4d &a) + { + cl_float4 v = {(float) a.x(), (float) a.y(), (float) a.z(), (float) a.w()}; + return v; + } + + inline cl_float4 make_float4(float x, float y, float z, float w) + { + cl_float4 v = {x, y, z, w}; + return v; + } + + inline Matrix3x3f make_matrix_f3x3(const matrix3x3d &a) + { + Matrix3x3f m; + m.m_row[0] = make_float4((float) a(0, 0), (float) a(0, 1), (float) a(0, 2), 0.0f); + m.m_row[1] = make_float4((float) a(1, 0), (float) a(1, 1), (float) a(1, 2), 0.0f); + m.m_row[2] = make_float4((float) a(2, 0), (float) a(2, 1), (float) a(2, 2), 0.0f); + return m; + } + + inline Matrix4x4f make_matrix_f4x4(const matrix4x4d &a) + { + Matrix4x4f m; + m.m_row[0] = make_float4((float) 
a(0, 0), (float) a(0, 1), (float) a(0, 2), (float) a(0, 3)); + m.m_row[1] = make_float4((float) a(1, 0), (float) a(1, 1), (float) a(1, 2), (float) a(1, 3)); + m.m_row[2] = make_float4((float) a(2, 0), (float) a(2, 1), (float) a(2, 2), (float) a(2, 3)); + m.m_row[3] = make_float4((float) a(3, 0), (float) a(3, 1), (float) a(3, 2), (float) a(3, 3)); + return m; + } +#endif + +//______DEVICE_CODE_____________________________________________________________________________________________________ + +#ifndef HOST_CODE + +#ifdef DEBUG + STATIC_KEYWORD void print_matrix_f3x3(const Matrix3x3f m) + { + printf("[\n"); + printf(" [%f, %f, %f],\n", m.m_row[0].x, m.m_row[0].y, m.m_row[0].z); + printf(" [%f, %f, %f],\n", m.m_row[1].x, m.m_row[1].y, m.m_row[1].z); + printf(" [%f, %f, %f],\n", m.m_row[2].x, m.m_row[2].y, m.m_row[2].z); + printf("]\n"); + } +#endif + + STATIC_KEYWORD Matrix3x3f make_matrix_f3x3(float a00, float a01, float a02, float a10, float a11, float a12, float a20, float a21, float a22) + { + Matrix3x3f m; + m.m_row[0] = make_float4(a00, a01, a02, 0.0f); + m.m_row[1] = make_float4(a10, a11, a12, 0.0f); + m.m_row[2] = make_float4(a20, a21, a22, 0.0f); + return m; + } + + STATIC_KEYWORD Matrix3x3f make_zero_matrix_f3x3() + { + Matrix3x3f m; + m.m_row[0] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + m.m_row[1] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + m.m_row[2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + return m; + } + + STATIC_KEYWORD Matrix3x3f make_eye_matrix_f3x3() + { + Matrix3x3f m; + m.m_row[0] = make_float4(1.0f, 0.0f, 0.0f, 0.0f); + m.m_row[1] = make_float4(0.0f, 1.0f, 0.0f, 0.0f); + m.m_row[2] = make_float4(0.0f, 0.0f, 1.0f, 0.0f); + return m; + } + + STATIC_KEYWORD Matrix3x3f transpose_f3x3(const Matrix3x3f m) + { + Matrix3x3f t; + t.m_row[0] = make_float4(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.0f); + t.m_row[1] = make_float4(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.0f); + t.m_row[2] = make_float4(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.0f); + 
return t; + } + + STATIC_KEYWORD Matrix3x3f add_f3x3(const Matrix3x3f a, const Matrix3x3f b) + { + Matrix3x3f m; + m.m_row[0] = a.m_row[0] + b.m_row[0]; + m.m_row[1] = a.m_row[1] + b.m_row[1]; + m.m_row[2] = a.m_row[2] + b.m_row[2]; + return m; + } + + STATIC_KEYWORD Matrix3x3f mul_f3x3(const Matrix3x3f a, const Matrix3x3f b) + { + Matrix3x3f bt = transpose_f3x3(b); + Matrix3x3f res; + res.m_row[0] = make_float4(dot(a.m_row[0], bt.m_row[0]), dot(a.m_row[0], bt.m_row[1]), dot(a.m_row[0], bt.m_row[2]), 0.0f); + res.m_row[1] = make_float4(dot(a.m_row[1], bt.m_row[0]), dot(a.m_row[1], bt.m_row[1]), dot(a.m_row[1], bt.m_row[2]), 0.0f); + res.m_row[2] = make_float4(dot(a.m_row[2], bt.m_row[0]), dot(a.m_row[2], bt.m_row[1]), dot(a.m_row[2], bt.m_row[2]), 0.0f); + return res; + } + + STATIC_KEYWORD Matrix3x3f mul_f_f3x3(float k, const Matrix3x3f a) + { + Matrix3x3f m; + m.m_row[0] = a.m_row[0] * k; + m.m_row[1] = a.m_row[1] * k; + m.m_row[2] = a.m_row[2] * k; + return m; + } + + STATIC_KEYWORD float3 mul_f3x3_f3(const Matrix3x3f a, const float3 b) + { + return make_float3(a.m_row[0].x * b.x + a.m_row[0].y * b.y + a.m_row[0].z * b.z, + a.m_row[1].x * b.x + a.m_row[1].y * b.y + a.m_row[1].z * b.z, + a.m_row[2].x * b.x + a.m_row[2].y * b.y + a.m_row[2].z * b.z); + } + + STATIC_KEYWORD float2 transformPoint_f3x3(const Matrix3x3f m, const float2 p) + { + float3 temp = mul_f3x3_f3(m, make_float3(p.x, p.y, 1.0f)); + return make_float2(temp.x, temp.y) / temp.z; + } + +#ifdef DEBUG + STATIC_KEYWORD void print_matrix_f4x4(const Matrix4x4f m) + { + printf("[\n"); + printf(" [%f, %f, %f, %f],\n", m.m_row[0].x, m.m_row[0].y, m.m_row[0].z, m.m_row[0].w); + printf(" [%f, %f, %f, %f],\n", m.m_row[1].x, m.m_row[1].y, m.m_row[1].z, m.m_row[1].w); + printf(" [%f, %f, %f, %f],\n", m.m_row[2].x, m.m_row[2].y, m.m_row[2].z, m.m_row[2].w); + printf(" [%f, %f, %f, %f],\n", m.m_row[3].x, m.m_row[3].y, m.m_row[3].z, m.m_row[3].w); + printf("]\n"); + } +#endif + + STATIC_KEYWORD Matrix4x4f 
make_matrix_f4x4(float a00, float a01, float a02, float a03, + float a10, float a11, float a12, float a13, + float a20, float a21, float a22, float a23, + float a30, float a31, float a32, float a33) + { + Matrix4x4f m; + m.m_row[0] = make_float4(a00, a01, a02, a03); + m.m_row[1] = make_float4(a10, a11, a12, a13); + m.m_row[2] = make_float4(a20, a21, a22, a23); + m.m_row[3] = make_float4(a30, a31, a32, a33); + return m; + } + + STATIC_KEYWORD Matrix4x4f make_translation_f4x4(const float3 t) + { + return make_matrix_f4x4(1.0f, 0.0f, 0.0f, t.x, + 0.0f, 1.0f, 0.0f, t.y, + 0.0f, 0.0f, 1.0f, t.z, + 0.0f, 0.0f, 0.0f, 1.0f); + } + + STATIC_KEYWORD Matrix4x4f make_rotation_f4x4(const Matrix3x3f r) + { + Matrix4x4f m; + m.m_row[0] = r.m_row[0]; + m.m_row[1] = r.m_row[1]; + m.m_row[2] = r.m_row[2]; + + m.m_row[0].w = 0.0f; + m.m_row[1].w = 0.0f; + m.m_row[2].w = 0.0f; + m.m_row[3] = make_float4(0.0f, 0.0f, 0.0f, 1.0f); + return m; + } + + STATIC_KEYWORD float3 extract_translation_f4x4(const Matrix4x4f m) + { + float norm = 1.0f / m.m_row[3].w; + return make_float3(m.m_row[0].w, m.m_row[1].w, m.m_row[2].w) * norm; + } + + STATIC_KEYWORD Matrix3x3f extract_rotation_f4x4(const Matrix4x4f m) + { + Matrix3x3f R = make_matrix_f3x3( + m.m_row[0].x, m.m_row[0].y, m.m_row[0].z, + m.m_row[1].x, m.m_row[1].y, m.m_row[1].z, + m.m_row[2].x, m.m_row[2].y, m.m_row[2].z); + + // matrix4x4f.scale3() + Matrix3x3f MtM = mul_f3x3(transpose_f3x3(R), R); + + float3 d = make_float3(MtM.m_row[0].x, MtM.m_row[1].y, MtM.m_row[2].z); + + if (d.x > 0) d.x = sqrtf(d.x); + if (d.y > 0) d.y = sqrtf(d.y); + if (d.z > 0) d.z = sqrtf(d.z); + + float3 s = d; + + if (s.x) s.x = 1.0f / s.x; + if (s.y) s.y = 1.0f / s.y; + if (s.z) s.z = 1.0f / s.z; + + return mul_f3x3(R, make_matrix_f3x3(s.x, 0.0f, 0.0f, + 0.0f, s.y, 0.0f, + 0.0f, 0.0f, s.z)); + } + + STATIC_KEYWORD Matrix4x4f transpose_f4x4(const Matrix4x4f m) + { + Matrix4x4f t; + t.m_row[0] = make_float4(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, m.m_row[3].x); 
+ t.m_row[1] = make_float4(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, m.m_row[3].y); + t.m_row[2] = make_float4(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, m.m_row[3].z); + t.m_row[3] = make_float4(m.m_row[0].w, m.m_row[1].w, m.m_row[2].w, m.m_row[3].w); + return t; + } + + STATIC_KEYWORD Matrix4x4f mul_f4x4(const Matrix4x4f a, const Matrix4x4f b) + { + Matrix4x4f bt = transpose_f4x4(b); + Matrix4x4f res; + res.m_row[0] = make_float4(dot(a.m_row[0], bt.m_row[0]), dot(a.m_row[0], bt.m_row[1]), dot(a.m_row[0], bt.m_row[2]), dot(a.m_row[0], bt.m_row[3])); + res.m_row[1] = make_float4(dot(a.m_row[1], bt.m_row[0]), dot(a.m_row[1], bt.m_row[1]), dot(a.m_row[1], bt.m_row[2]), dot(a.m_row[1], bt.m_row[3])); + res.m_row[2] = make_float4(dot(a.m_row[2], bt.m_row[0]), dot(a.m_row[2], bt.m_row[1]), dot(a.m_row[2], bt.m_row[2]), dot(a.m_row[2], bt.m_row[3])); + res.m_row[3] = make_float4(dot(a.m_row[3], bt.m_row[0]), dot(a.m_row[3], bt.m_row[1]), dot(a.m_row[3], bt.m_row[2]), dot(a.m_row[3], bt.m_row[3])); + return res; + } + + STATIC_KEYWORD float4 mul_f4x4_f4(const Matrix4x4f a, const float4 b) + { + return make_float4(a.m_row[0].x * b.x + a.m_row[0].y * b.y + a.m_row[0].z * b.z + a.m_row[0].w * b.w, + a.m_row[1].x * b.x + a.m_row[1].y * b.y + a.m_row[1].z * b.z + a.m_row[1].w * b.w, + a.m_row[2].x * b.x + a.m_row[2].y * b.y + a.m_row[2].z * b.z + a.m_row[2].w * b.w, + a.m_row[3].x * b.x + a.m_row[3].y * b.y + a.m_row[3].z * b.z + a.m_row[3].w * b.w); + } + + STATIC_KEYWORD float3 transformPoint(const Matrix4x4f m, const float3 p) + { + float4 temp = mul_f4x4_f4(m, make_float4(p.x, p.y, p.z, 1.0f)); + return make_float3(temp.x, temp.y, temp.z) / temp.w; + } + + STATIC_KEYWORD float3 transformVector(const Matrix4x4f m, const float3 v) + { + float4 temp = mul_f4x4_f4(m, make_float4(v.x, v.y, v.z, 0.0f)); + return make_float3(temp.x, temp.y, temp.z); + } + + STATIC_KEYWORD float smootherstep(float edge0, float edge1, float x) + { + if (x < edge0) { + return 0.0f; + } else if (x >= 
edge1) { + return 1.0f; + } + + // Scale, and clamp x to 0..1 range + x = (x - edge0) / (edge1 - edge0); + // Evaluate polynomial + return x * x * x * (x * (x * 6.0f - 15.0f) + 10.0f); + } + +#endif + +#endif // pragma once diff --git a/libs/gpu/libgpu/opencl/device_info.cpp b/libs/gpu/libgpu/opencl/device_info.cpp new file mode 100644 index 0000000..30e875c --- /dev/null +++ b/libs/gpu/libgpu/opencl/device_info.cpp @@ -0,0 +1,204 @@ +#include "device_info.h" +#include "utils.h" +#include +#include + +namespace ocl { + +DeviceInfo::DeviceInfo() +{ + device_type = 0; + max_compute_units = 0; + max_mem_alloc_size = 0; + max_workgroup_size = 0; + max_work_item_sizes[0] = 0; + max_work_item_sizes[1] = 0; + max_work_item_sizes[2] = 0; + global_mem_size = 0; + device_address_bits = 0; + vendor_id = 0; + warp_size = 0; + wavefront_width = 0; + opencl_major_version = 0; + opencl_minor_version = 0; +} + +void DeviceInfo::print() const +{ + std::cout << "Using device: " << device_name << ", " << max_compute_units << " compute units, " << (global_mem_size >> 20) << " MB global memory," + " OpenCL " << opencl_major_version << "." 
<< opencl_minor_version << std::endl; + std::cout << " driver version: " << driver_version << ", platform version: " << platform_version << std::endl; + std::cout << " max work group size " << max_workgroup_size << std::endl; + std::cout << " max work item sizes [" << max_work_item_sizes[0] << ", " << max_work_item_sizes[1] << ", " << max_work_item_sizes[2] << "]" << std::endl; + std::cout << " max mem alloc size " << (max_mem_alloc_size >> 20) << " MB" << std::endl; + if (warp_size != 0) + std::cout << " warp size " << warp_size << std::endl; + if (wavefront_width != 0) + std::cout << " wavefront width " << wavefront_width << std::endl; +} + +void DeviceInfo::init(cl_device_id device_id) +{ + cl_device_type device_type = 0; + cl_uint max_compute_units = 0; // Number of compute units (SM's on NV GPU) + cl_uint max_work_item_dimensions = 0; + size_t max_workgroup_size = 0; + cl_uint vendor_id = 0; + cl_ulong max_mem_alloc_size = 0; + cl_ulong global_mem_size = 0; + cl_uint device_address_bits = 0; + char device_string[1024] = ""; + char vendor_string[1024] = ""; + char driver_version_string[1024] = ""; + char platform_version_string[1024]= ""; + + cl_platform_id platform_id = 0; + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DEVICE_PLATFORM, sizeof(platform_id), &platform_id, NULL)); + + OCL_SAFE_CALL(clGetPlatformInfo(platform_id, CL_PLATFORM_VERSION, sizeof(platform_version_string), &platform_version_string, NULL)); + + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL)); + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DEVICE_VENDOR, sizeof(vendor_string), &vendor_string, NULL)); + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DRIVER_VERSION, sizeof(driver_version_string), &driver_version_string, NULL)); + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DEVICE_TYPE, sizeof(cl_device_type), &device_type, NULL)); + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(max_compute_units), 
&max_compute_units, NULL)); + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(max_mem_alloc_size), &max_mem_alloc_size, NULL)); + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(max_work_item_dimensions), &max_work_item_dimensions, NULL)); + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_workgroup_size), &max_workgroup_size, NULL)); + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(global_mem_size), &global_mem_size, NULL)); + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DEVICE_ADDRESS_BITS, sizeof(device_address_bits), &device_address_bits, NULL)); + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DEVICE_VENDOR_ID, sizeof(vendor_id), &vendor_id, NULL)); + + std::vector max_work_item_sizes(max_work_item_dimensions); + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_ITEM_SIZES, max_work_item_dimensions * sizeof(size_t), max_work_item_sizes.data(), NULL)); + for (int i = 0; i < 3; i++) + this->max_work_item_sizes[i] = max_work_item_sizes[i]; + + this->device_name = std::string(device_string); + this->vendor_name = std::string(vendor_string); + this->device_type = device_type; + this->vendor_id = vendor_id; + this->max_compute_units = max_compute_units; + this->max_mem_alloc_size = max_mem_alloc_size; + this->max_workgroup_size = max_workgroup_size; + this->global_mem_size = global_mem_size; + this->device_address_bits = device_address_bits; + this->max_work_item_dimensions = max_work_item_dimensions; + this->driver_version = std::string(driver_version_string); + this->platform_version = std::string(platform_version_string); + + initExtensions(platform_id, device_id); + initOpenCLVersion(platform_id, device_id); + + if (device_type == CL_DEVICE_TYPE_GPU && vendor_id == ID_AMD && hasExtension(CL_AMD_DEVICE_ATTRIBUTE_QUERY_EXT)) { + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DEVICE_BOARD_NAME_AMD, sizeof(device_string), &device_string, 
NULL)); + this->device_name = std::string(device_string) + " (" + this->device_name + ")"; + } + + cl_uint warp_size = 0; + size_t wavefront_width = 0; + if (device_type == CL_DEVICE_TYPE_GPU) { + if (vendor_id == ID_NVIDIA && hasExtension(CL_NV_DEVICE_ATTRIBUTE_QUERY_EXT)) { + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DEVICE_WARP_SIZE_NV, sizeof(cl_uint), &warp_size, NULL)); + } else if (vendor_id == ID_AMD && hasExtension(CL_AMD_DEVICE_ATTRIBUTE_QUERY_EXT)) { + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DEVICE_WAVEFRONT_WIDTH_AMD, sizeof(cl_uint), &wavefront_width, NULL)); + } + } + + this->warp_size = warp_size; + this->wavefront_width = wavefront_width; +} + +void DeviceInfo::initOpenCLVersion(cl_platform_id platform_id, cl_device_id device_id) +{ + const int buffer_limit = 1024; + char buffer[buffer_limit]; + + int platform_major_version = 0; + int platform_minor_version = 0; + + OCL_SAFE_CALL(clGetPlatformInfo (platform_id, CL_PLATFORM_VERSION, buffer_limit, buffer, NULL)); + parseOpenCLVersion(buffer, buffer_limit, platform_major_version, platform_minor_version); + + int device_major_version = 0; + int device_minor_version = 0; + + OCL_SAFE_CALL(clGetDeviceInfo (device_id, CL_DEVICE_VERSION, buffer_limit, buffer, NULL)); + parseOpenCLVersion(buffer, buffer_limit, device_major_version, device_minor_version); + + if (device_major_version < platform_major_version + || (device_major_version == platform_major_version && device_minor_version < platform_minor_version)) { + opencl_major_version = device_major_version; + opencl_minor_version = device_minor_version; + } else { + opencl_major_version = platform_major_version; + opencl_minor_version = platform_minor_version; + } +} + +void DeviceInfo::parseOpenCLVersion(char* buffer, int buffer_limit, int& major_version, int& minor_verions) +{ + // For platform: + // "OpenCL" + // For device: + // "OpenCL" + int firstSpaceIndex = -1; + int firstDotIndex = -1; + int secondSpaceIndex = -1; + for (int i = 0; i < 
buffer_limit; i++ ) { + if (buffer[i] == ' ') { + if (firstSpaceIndex == -1) { + firstSpaceIndex = i; + } else if (secondSpaceIndex == -1) { + secondSpaceIndex = i; + buffer[i] = 0; + break; + } + } else if (buffer[i] == '.' && firstDotIndex == -1) { + firstDotIndex = i; + buffer[i] = 0; + } + } + + major_version = atoi(buffer + firstSpaceIndex + 1); + minor_verions = atoi(buffer + firstDotIndex + 1); +} + +bool DeviceInfo::isIntelGPU() const +{ + return device_type == CL_DEVICE_TYPE_GPU + && (vendor_id == ocl::ID_INTEL || vendor_name.find("Intel") != std::string::npos); +} + +void DeviceInfo::initExtensions(cl_platform_id platform_id, cl_device_id device_id) +{ + for (int i = 0; i < 2; ++i) { + size_t length; + if (i == 0) { + OCL_SAFE_CALL(clGetPlatformInfo(platform_id, CL_PLATFORM_EXTENSIONS, 0, 0, &length)); + } else { + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, 0, 0, &length)); + } + + std::vector buffer(length); + if (i == 0) { + OCL_SAFE_CALL(clGetPlatformInfo(platform_id, CL_PLATFORM_EXTENSIONS, sizeof(char) * buffer.size(), buffer.data(), NULL)); + } else { + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, sizeof(char) * buffer.size(), buffer.data(), NULL)); + } + + std::string extension = ""; + for (int i = 0; i <= buffer.size(); i++) { + if (i == buffer.size() || buffer[i] == ' ') { + if (extension.length() > 0) { + extensions.insert(extension); + extension = ""; + } + } else { + extension += buffer[i]; + } + } + } +} + +} diff --git a/libs/gpu/libgpu/opencl/device_info.h b/libs/gpu/libgpu/opencl/device_info.h new file mode 100644 index 0000000..979cf00 --- /dev/null +++ b/libs/gpu/libgpu/opencl/device_info.h @@ -0,0 +1,49 @@ +#pragma once + +#include +#include +#include + +typedef struct _cl_platform_id * cl_platform_id; +typedef struct _cl_device_id * cl_device_id; + +namespace ocl { + +class DeviceInfo { +public: + DeviceInfo(); + + void init(cl_device_id device_id); + void print() const; + + bool isIntelGPU() 
const; + bool hasExtension(std::string extension) { return extensions.count(extension) > 0;} + + std::string device_name; + std::string vendor_name; + unsigned int device_type; + unsigned int vendor_id; + size_t max_compute_units; + size_t max_mem_alloc_size; + size_t max_workgroup_size; + size_t max_work_item_sizes[3]; + size_t global_mem_size; + size_t device_address_bits; + size_t max_work_item_dimensions; + unsigned int warp_size; + size_t wavefront_width; + std::string driver_version; + std::string platform_version; + + int opencl_major_version; + int opencl_minor_version; + + std::set extensions; + +protected: + void initExtensions(cl_platform_id platform_id, cl_device_id device_id); + void initOpenCLVersion(cl_platform_id platform_id, cl_device_id device_id); + void parseOpenCLVersion(char* buffer, int buffer_limit, int& major_version, int& minor_verions); +}; + +} diff --git a/libs/gpu/libgpu/opencl/engine.cpp b/libs/gpu/libgpu/opencl/engine.cpp new file mode 100644 index 0000000..28b82ee --- /dev/null +++ b/libs/gpu/libgpu/opencl/engine.cpp @@ -0,0 +1,749 @@ +#include "utils.h" +#include "libutils/thread_mutex.h" + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +#define _SHORT_FILE_ "ocl_engine.cpp" + +#define LOAD_KERNEL_BINARIES_FROM_FILE "" +#define DUMP_KERNEL_BINARIES_TO_FILE "" +#define OCL_VERBOSE_COMPILE_LOG false + +#ifdef _MSC_VER +typedef unsigned long long uint64_t; +#endif + +using namespace ocl; + +// Gets the platform ID for NVIDIA if available, otherwise default +cl_platform_id oclGetPlatformID(void) +{ + char chBuffer[1024]; + cl_uint num_platforms; + + cl_platform_id platform = 0; + + // Get OpenCL platform count + OCL_SAFE_CALL(clGetPlatformIDs (0, NULL, &num_platforms)); + + if (num_platforms == 0) + throw ocl_exception("No OpenCL platforms found"); + + // if there's a platform or more, make space for ID's + std::vector clPlatformIDs(num_platforms); 
+ + // get platform info for each platform and trap the NVIDIA platform if found + OCL_SAFE_CALL(clGetPlatformIDs (num_platforms, clPlatformIDs.data(), NULL)); + for (cl_uint i = 0; i < num_platforms; ++i) { + OCL_SAFE_CALL(clGetPlatformInfo (clPlatformIDs[i], CL_PLATFORM_NAME, 1024, &chBuffer, NULL)); + if (strstr(chBuffer, "NVIDIA") != NULL) { + platform = clPlatformIDs[i]; + break; + } + } + + // default to zeroeth platform if NVIDIA not found + if (platform == 0) + platform = clPlatformIDs[0]; + + return platform; +} + +OpenCLKernel::OpenCLKernel() +{ + kernel_ = 0; + work_group_size_ = 0; +} + +OpenCLKernel::~OpenCLKernel() +{ + if (kernel_) clReleaseKernel(kernel_); +} + +void OpenCLKernel::create(cl_program program, const char *kernel_name, cl_device_id device_id_) +{ + if (device_id_ == NULL) { + gpu::Context context; + GPU_CHECKED_VERBOSE(context.type() == gpu::Context::TypeOpenCL, "Can not link with OpenCL kernel!"); + device_id_ = context.cl()->device(); + } + + cl_int ciErrNum = CL_SUCCESS; + kernel_name_ = std::string(kernel_name); + kernel_ = clCreateKernel(program, kernel_name, &ciErrNum); + + if (ciErrNum != CL_SUCCESS) + throw std::runtime_error("clCreateKernel " + to_string(kernel_name_) + " failed: " + errorString(ciErrNum)); + + size_t kernel_workgroup_size = 0; + + ciErrNum = clGetKernelWorkGroupInfo(kernel_, device_id_, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernel_workgroup_size, NULL); + if (ciErrNum != CL_SUCCESS) + throw std::runtime_error("clGetKernelWorkGroupInfo failed: " + errorString(ciErrNum)); + + work_group_size_ = kernel_workgroup_size; +} + +void OpenCLKernel::setArg(cl_uint arg_index, size_t arg_size, const void *arg_value) +{ + cl_int ciErrNum = clSetKernelArg(kernel_, arg_index, arg_size, arg_value); + + if (ciErrNum != CL_SUCCESS) + throw std::runtime_error("clSetKernelArg " + to_string(kernel_name_) + "#" + to_string(arg_index) + " (" +to_string(arg_size) + " bytes) failed: " + errorString(ciErrNum)); +} + 
+OpenCLEngine::OpenCLEngine() +{ + platform_id_ = 0; + device_id_ = 0; + context_ = 0; + command_queue_ = 0; + total_mem_size_ = 0; +} + +OpenCLEngine::~OpenCLEngine() +{ + for (std::map::iterator it = kernels_.begin(); it != kernels_.end(); ++it) + delete it->second; + + for (std::map::iterator it = programs_.begin(); it != programs_.end(); ++it) + clReleaseProgram(it->second); + + if (command_queue_) clReleaseCommandQueue(command_queue_); + if (context_) clReleaseContext(context_); +} + +void OpenCLEngine::init(cl_device_id device_id, const char *cl_params, bool verbose) +{ + if (!device_id) { + init((cl_platform_id) 0, (cl_device_id) 0, cl_params, verbose); + return; + } + + cl_platform_id platform_id = 0; + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DEVICE_PLATFORM, sizeof(platform_id), &platform_id, NULL)); + + return init(platform_id, device_id, cl_params, verbose); +} + +void OpenCLEngine::init(cl_platform_id platform_id, cl_device_id device_id, const char *cl_params, bool verbose) +{ + if (!ocl_init()) + throw ocl_exception("Can't init OpenCL driver"); + + if (command_queue_) { + clReleaseCommandQueue(command_queue_); + command_queue_ = 0; + } + + if (context_) { + clReleaseContext(context_); + context_ = 0; + } + + if (!platform_id) { + device_id = 0; + platform_id = oclGetPlatformID(); + } + + if (!device_id) { + // Get all the devices + cl_uint uiNumDevices = 0; // Number of devices available + + OCL_SAFE_CALL(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices)); + + if (uiNumDevices < 1) + throw ocl_exception("No OpenCL devices found"); + + std::vector devices(uiNumDevices); + OCL_SAFE_CALL(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, uiNumDevices, devices.data(), NULL)); + device_id = devices[0]; + } + + device_info_.init(device_id); + + if (device_info_.max_work_item_dimensions < 3) + throw ocl_exception("3 dimensional work items not supported"); + + total_mem_size_ = device_info_.global_mem_size; + + cl_context_properties 
context_props[] = { CL_CONTEXT_PLATFORM, (cl_context_properties) platform_id, 0 }; + + cl_int ciErrNum; + context_ = clCreateContext(context_props, 1, &device_id, NULL, NULL, &ciErrNum); + OCL_SAFE_CALL(ciErrNum); + + command_queue_ = clCreateCommandQueue(context_, device_id, 0, &ciErrNum); + OCL_SAFE_CALL(ciErrNum); + + platform_id_ = platform_id; + device_id_ = device_id; + + if (device_info_.device_type == CL_DEVICE_TYPE_GPU) { + if (device_info_.warp_size) { + wavefront_size_ = device_info_.warp_size; + } else if (device_info_.wavefront_width) { + wavefront_size_ = device_info_.wavefront_width; + } else if (device_info_.isIntelGPU()) { + wavefront_size_ = 8; + } else { + wavefront_size_ = 1; + } + } else { + wavefront_size_ = 1; + } + + if (verbose) { + device_info_.print(); + if (device_info_.warp_size == 0 && device_info_.wavefront_width == 0) { + std::cout << " wavefront width " << wavefront_size_ << std::endl; + } + } +} + +void ocl::oclPrintBuildLog(cl_program program) +{ + size_t device_count; + OCL_SAFE_CALL(clGetProgramInfo(program, CL_PROGRAM_DEVICES, 0, NULL, &device_count)); + device_count /= sizeof(cl_device_id); + + std::vector devices(device_count); + + OCL_SAFE_CALL(clGetProgramInfo(program, CL_PROGRAM_DEVICES, device_count * sizeof(cl_device_id), devices.data(), NULL)); + + for (size_t k = 0; k < device_count; k++) { + std::cout << "Device " << k + 1 << std::endl; + size_t log_size = 0; + OCL_SAFE_CALL(clGetProgramBuildInfo(program, devices[k], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); + if (log_size > 0) { + std::cout << "\tProgram build log:" << std::endl; + std::vector log(log_size + 1); + OCL_SAFE_CALL(clGetProgramBuildInfo(program, devices[k], CL_PROGRAM_BUILD_LOG, log_size, log.data(), NULL)); + + log[log_size] = 0; + std::cout << log.data() << std::endl << std::endl; + } else { + std::cout << "\tProgram build log is clear" << std::endl; + } + } +} + +cl_mem OpenCLEngine::createBuffer(cl_mem_flags flags, size_t size) +{ +// if (size > 
device_info_.max_mem_alloc_size) { +// throw ocl_bad_alloc("Can't allocate " + to_string(size) + " bytes, because max allocation size is " + to_string(device_info_.max_mem_alloc_size) + "!"); +// } + + cl_int status = CL_SUCCESS; + cl_mem res = clCreateBuffer(context_, flags, size, NULL, &status); + OCL_SAFE_CALL(status); + + // forcing buffer allocation by fictive write + size_t data_size = (size >= 8) ? 4 : 1; + assert (size >= 2 * data_size); + + int test_data = 239; + try { + writeBuffer(res, CL_TRUE, 0, data_size, &test_data); + } catch (ocl_exception& e) { + releaseMemObject(res); + throw; + } + + return res; +} + +void OpenCLEngine::writeBuffer(cl_mem buffer, cl_bool blocking_write, size_t offset, size_t cb, const void *ptr) +{ + if (cb == 0) + return; + OCL_SAFE_CALL(clEnqueueWriteBuffer(queue(), buffer, blocking_write, offset, cb, ptr, 0, NULL, NULL)); +} + +void OpenCLEngine::writeBufferRect(cl_mem buffer, cl_bool blocking_write, const size_t buffer_origin[3], const size_t host_origin[3], const size_t region[3], + size_t buffer_row_pitch, size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, const void *ptr) +{ + if (region[0] == 0 || region[1] == 0 || region[2] == 0) + return; + OCL_SAFE_CALL(clEnqueueWriteBufferRect(queue(), buffer, blocking_write, buffer_origin, host_origin, region, + buffer_row_pitch, buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, 0, NULL, NULL)); +} + +void OpenCLEngine::readBuffer(cl_mem buffer, cl_bool blocking_read, size_t offset, size_t cb, void *ptr) +{ + if (cb == 0) + return; + OCL_SAFE_CALL(clEnqueueReadBuffer(queue(), buffer, blocking_read, offset, cb, ptr, 0, NULL, NULL)); +} + +void OpenCLEngine::readBufferRect(cl_mem buffer, cl_bool blocking_write, const size_t buffer_origin[3], const size_t host_origin[3], const size_t region[3], + size_t buffer_row_pitch, size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, void *ptr) +{ + if (region[0] == 0 || region[1] == 0 || 
region[2] == 0) + return; + OCL_SAFE_CALL(clEnqueueReadBufferRect(queue(), buffer, blocking_write, buffer_origin, host_origin, region, + buffer_row_pitch, buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, 0, NULL, NULL)); +} + +void OpenCLEngine::copyBuffer(cl_mem src_buffer, cl_mem dst_buffer, size_t src_offset, size_t dst_offset, size_t cb) +{ + if (cb == 0) + return; + cl_event ev = NULL; + OCL_SAFE_CALL(clEnqueueCopyBuffer(queue(), src_buffer, dst_buffer, src_offset, dst_offset, cb, 0, NULL, &ev)); + trackEvent(ev); +} + +void OpenCLEngine::releaseMemObject(cl_mem memobj) +{ + if (memobj == NULL) + return; + + OCL_SAFE_CALL(clReleaseMemObject(memobj)); +} + +void OpenCLEngine::ndRangeKernel(OpenCLKernel &kernel, cl_uint work_dim, const size_t *global_work_offset, + const size_t *global_work_size, const size_t *local_work_size) +{ + if (work_dim < 1 || work_dim > 3) + throw ocl_exception("Wrong work dimension size: " + to_string(work_dim) + "!"); + + // check workgroup size + if (local_work_size) { + size_t workgroup_size = 1; + for (cl_uint dim = 0; dim < work_dim; dim++) { + if (local_work_size[dim] > device_info_.max_work_item_sizes[dim]) + throw ocl_exception("Wrong work_size[" + to_string(dim) + "] value: " + to_string(local_work_size[dim]) + "!"); + workgroup_size *= local_work_size[dim]; + } + if (workgroup_size > device_info_.max_workgroup_size) + throw ocl_exception("Too big workgroup size: " + to_string(workgroup_size) + "!"); + if (workgroup_size > kernel.workGroupSize()) + throw ocl_exception("Too big workgroup size for this kernel: " + to_string(workgroup_size) + "!"); + } + + // If, for example, CL_DEVICE_ADDRESS_BITS = 32, i.e. the device uses a 32-bit address space, + // size_t is a 32-bit unsigned integer and global_work_size values must be in the range 1 .. 2^32 - 1. + // Values outside this range return a CL_OUT_OF_RESOURCES error. 
+ uint64_t max_global_work_size = (size_t) 1 << (device_info_.device_address_bits - 1); + max_global_work_size = max_global_work_size + (max_global_work_size - 1); + for (size_t d = 0; d < work_dim; ++d) { + if (global_work_size[d] == 0) { + std::cerr << "Global work size is zero!" << std::endl; + throw ocl_exception("Global work_size[" + to_string(d) + "] value is zero!"); + } else if (global_work_size[d] > max_global_work_size && device_info_.device_address_bits <= 64) { + throw ocl_exception("Global work_size[" + to_string(d) + "] value is too big for this device address bits: " + + to_string(global_work_size[d]) + ", while device has " + to_string(device_info_.device_address_bits) + " address bits!"); + } + } + + cl_event ev = NULL; + OCL_SAFE_CALL(clEnqueueNDRangeKernel(queue(), kernel.kernel(), work_dim, global_work_offset, global_work_size, local_work_size, 0, NULL, &ev)); + trackEvent(ev, "Kernel " + kernel.kernelName() + ": "); +} + +void OpenCLEngine::trackEvent(cl_event ev, std::string message) +{ + cl_int ciErrNum = CL_SUCCESS; + cl_int result = CL_SUCCESS; + + try { + OCL_SAFE_CALL_MESSAGE(clFlush(queue()), message); + OCL_SAFE_CALL_MESSAGE(clWaitForEvents(1, &ev), message); + OCL_SAFE_CALL_MESSAGE(clGetEventInfo(ev, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &result, 0), message); + + if (result != CL_COMPLETE) { + throw ocl_exception("Wait for event succeed, but it is still is not complete with execution status: " + to_string(result) + "!"); + } + } catch (...) 
{ + OCL_SAFE_CALL_MESSAGE(clReleaseEvent(ev), message); + throw; + } + + OCL_SAFE_CALL_MESSAGE(clReleaseEvent(ev), message); +} + +cl_program OpenCLEngine::findProgram(int id) const +{ + std::map::const_iterator it = programs_.find(id); + if (it != programs_.end()) + return it->second; + return 0; +} + +OpenCLKernel *OpenCLEngine::findKernel(int id) const +{ + std::map::const_iterator it = kernels_.find(id); + if (it != kernels_.end()) + return it->second; + return 0; +} + +VersionedBinary::VersionedBinary(const char *data, const size_t size, + int bits, const int opencl_major_version, const int opencl_minor_version) + : data_(data), size_(size), device_address_bits_(bits), opencl_major_version_(opencl_major_version), opencl_minor_version_(opencl_minor_version) +{} + +ProgramBinaries::ProgramBinaries(std::vector binaries, std::string defines, std::string program_name) : binaries_(binaries) +{ + static int next_program_id = 0; + program_name_ = program_name; + id_ = next_program_id++; + defines_ = defines; +} + +ProgramBinaries::ProgramBinaries(const char *source_code, size_t source_code_length, std::string defines, std::string program_name) : binaries_({VersionedBinary(source_code, source_code_length, 0, 1, 2)}) +{ + static int next_program_id = 0; + program_name_ = program_name; + id_ = next_program_id++; + defines_ = defines; +} + +const VersionedBinary* ProgramBinaries::getBinary(const std::shared_ptr &cl) const +{ + for (int i = 0; i < binaries_.size(); ++i) { + const VersionedBinary* binary = &binaries_[i]; + + if (binary->deviceAddressBits() && binary->deviceAddressBits() != cl->deviceAddressBits()) + continue; + + if (binary->openclMajorVersion() > cl->deviceInfo().opencl_major_version) + continue; + + if (binary->openclMajorVersion() == cl->deviceInfo().opencl_major_version && binary->openclMinorVersion() > cl->deviceInfo().opencl_minor_version) + continue; + + return binary; + } + + throw ocl_exception("No SPIR version for " + 
to_string(cl->deviceAddressBits()) + "-bit device with OpenCL " + + to_string(cl->deviceInfo().opencl_major_version) + "." + to_string(cl->deviceInfo().opencl_minor_version) + "!"); +} + +KernelSource::KernelSource(std::shared_ptr program, const char *name) : program_(program) +{ + id_ = getNextKernelId(); + name_ = std::string(name); +} + +KernelSource::KernelSource(std::shared_ptr program, const std::string &name) : program_(program) +{ + id_ = getNextKernelId(); + name_ = name; +} + +int KernelSource::getNextKernelId() +{ + static int next_kernel_id = 0; + return next_kernel_id++; +} + +namespace ocl { + typedef std::map, std::vector> binaries_by_device; + static std::map cached_kernels_binaries; + static Mutex cached_kernels_mutex; + + std::vector* getCachedBinary(int programId, cl_platform_id platform, cl_device_id device) + { + auto programCacheIt = cached_kernels_binaries.find(programId); + if (programCacheIt == cached_kernels_binaries.end()) + cached_kernels_binaries[programId] = binaries_by_device(); + auto binaryIt = cached_kernels_binaries[programId].find(std::make_pair(platform, device)); + if (binaryIt != cached_kernels_binaries[programId].end()) { + return &binaryIt->second; + } else { + return NULL; + } + } + + void setCachedBinary(int programId, cl_platform_id platform, cl_device_id device, std::vector binaries) + { + auto programCacheIt = cached_kernels_binaries.find(programId); + if (programCacheIt == cached_kernels_binaries.end()) + cached_kernels_binaries[programId] = binaries_by_device(); + cached_kernels_binaries[programId][std::make_pair(platform, device)] = binaries; + } + + std::vector getProgramBinaries(cl_program program) + { + size_t binaries_size; + OCL_SAFE_CALL(clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, + sizeof(size_t), &binaries_size, NULL)); + + std::vector binaries(binaries_size); + unsigned char *data = binaries.data(); + OCL_SAFE_CALL(clGetProgramInfo(program, CL_PROGRAM_BINARIES, + sizeof(unsigned char *), &data, 
NULL)); + + return binaries; + } +} + +OpenCLKernel *KernelSource::getKernel(const std::shared_ptr &cl, bool printLog) +{ + OpenCLKernel *kernel = cl->findKernel(id_); + if (kernel) + return kernel; + + cl_program program = cl->findProgram(program_->id()); + + if (!program) { + Lock lock(cached_kernels_mutex); + + bool verbose = printLog || OCL_VERBOSE_COMPILE_LOG; + + const VersionedBinary* binary = program_->getBinary(cl); + const std::vector* cachedCompiledBinary = getCachedBinary(program_->id(), cl->platform(), cl->device()); + + cl_int ciErrNum = CL_SUCCESS; + + std::string options = program_->defines(); + + std::vector loaded_from_file_binaries; + std::string binaries_to_load_filename = LOAD_KERNEL_BINARIES_FROM_FILE; + + if (!binaries_to_load_filename.empty()){ + std::ifstream program_binaries_file; + program_binaries_file.open(binaries_to_load_filename); + + std::string binaries_string((std::istreambuf_iterator(program_binaries_file)), std::istreambuf_iterator()); + + loaded_from_file_binaries = std::vector(binaries_string.size()); + for (int i = 0; i < binaries_string.size(); ++i) { + loaded_from_file_binaries[i] = (unsigned char) binaries_string[i]; + } + cachedCompiledBinary = &loaded_from_file_binaries; + + program_binaries_file.close(); + } + + if (cachedCompiledBinary != NULL) { + std::vector kernel_ptrs; + std::vector kernel_sizes; + + kernel_ptrs.push_back(cachedCompiledBinary->data()); + kernel_sizes.push_back(cachedCompiledBinary->size()); + + cl_device_id device = cl->device(); + cl_int binary_status; + + program = clCreateProgramWithBinary(cl->context(), 1, &device, &kernel_sizes[0], &kernel_ptrs[0], &binary_status, &ciErrNum); + OCL_SAFE_CALL(binary_status); + OCL_SAFE_CALL(ciErrNum); + } else if (binary->deviceAddressBits() == 0) { + std::vector kernel_ptrs; + std::vector kernel_sizes; + + kernel_ptrs.push_back(binary->data()); + kernel_sizes.push_back(binary->size()); + + program = clCreateProgramWithSource(cl->context(), kernel_ptrs.size(), 
&kernel_ptrs[0], &kernel_sizes[0], &ciErrNum); + OCL_SAFE_CALL(ciErrNum); + } else { + std::vector kernel_ptrs; + std::vector kernel_sizes; + + kernel_ptrs.push_back((unsigned char*) binary->data()); + kernel_sizes.push_back(binary->size()); + + cl_device_id device = cl->device(); + cl_int binary_status; + + program = clCreateProgramWithBinary(cl->context(), 1, &device, &kernel_sizes[0], &kernel_ptrs[0], &binary_status, &ciErrNum); + OCL_SAFE_CALL(binary_status); + OCL_SAFE_CALL(ciErrNum); + + if (cl->deviceInfo().extensions.count("cl_khr_spir") == 0) + throw ocl_exception("Device does not support SPIR!"); + + options += " -x spir"; + } + + options += " -D WARP_SIZE=" + to_string(cl->wavefrontSize()); + + timer tm; + tm.start(); + + if (cachedCompiledBinary == NULL && verbose) { + if (program_->programName() == "") { + std::cout << "Building kernels for " << cl->deviceName() << "... " << std::endl; + } +// else { +// std::cout << "Building kernel " << program_->programName() << " for " << cl->deviceName() << "... 
" << std::endl; +// } + } + + ciErrNum = clBuildProgram(program, 0, NULL, options.c_str(), NULL, NULL); + + if (ciErrNum == CL_SUCCESS && cachedCompiledBinary == NULL) { + if (program_->programName() == "" && verbose) { + std::cout << "Kernels compilation done in " << tm.elapsed() << " seconds" << std::endl; + } +// else { +// std::cout << "Kernel " << program_->programName() << " compilation done in " << tm.elapsed() << " seconds" << std::endl; +// } + + std::vector binaries = getProgramBinaries(program); + setCachedBinary(program_->id(), cl->platform(), cl->device(), binaries); + } + + if (ciErrNum != CL_SUCCESS || verbose) { + ocl::oclPrintBuildLog(program); + + std::string binaries_filename = DUMP_KERNEL_BINARIES_TO_FILE; + if (!binaries_filename.empty()) { + std::vector binaries = getProgramBinaries(program); + std::string binaries_string((char*) binaries.data(), binaries.size()); + + std::ofstream program_binaries_file; + program_binaries_file.open(binaries_filename + "_platform" + to_string(cl->platform()) + "_device" + to_string(cl->device()) + "_program" + to_string(program_->id())); + + program_binaries_file << binaries_string; + program_binaries_file.close(); + } + } + + if (ciErrNum != CL_SUCCESS) { + clReleaseProgram(program); + program = 0; + } + + OCL_SAFE_CALL(ciErrNum); + cl->programs()[program_->id()] = program; + } + + kernel = new OpenCLKernel; + kernel->create(program, name_.c_str()); + + cl->kernels()[id_] = kernel; + + return kernel; +} + +void KernelSource::exec(const gpu::WorkSize &ws, const Arg &arg0, const Arg &arg1, const Arg &arg2, const Arg &arg3, const Arg &arg4, const Arg &arg5, const Arg &arg6, const Arg &arg7, const Arg &arg8, const Arg &arg9, const Arg &arg10, const Arg &arg11, const Arg &arg12, const Arg &arg13, const Arg &arg14, const Arg &arg15, const Arg &arg16, const Arg &arg17, const Arg &arg18, const Arg &arg19, const Arg &arg20, const Arg &arg21, const Arg &arg22, const Arg &arg23, const Arg &arg24, const Arg &arg25, const 
Arg &arg26, const Arg &arg27, const Arg &arg28, const Arg &arg29, const Arg &arg30, const Arg &arg31, const Arg &arg32, const Arg &arg33, const Arg &arg34, const Arg &arg35, const Arg &arg36, const Arg &arg37, const Arg &arg38, const Arg &arg39, const Arg &arg40) +{ + gpu::Context context; + + OpenCLKernel *kernel = getKernel(context.cl()); + + kernel->setArgs(arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13, arg14, arg15, arg16, arg17, arg18, arg19, arg20, arg21, arg22, arg23, arg24, arg25, arg26, arg27, arg28, arg29, arg30, arg31, arg32, arg33, arg34, arg35, arg36, arg37, arg38, arg39, arg40); + + context.cl()->ndRangeKernel(*kernel, 3, NULL, ws.clGlobalSize(), ws.clLocalSize()); +} + +void KernelSource::execSubdivided(const gpu::WorkSize &ws, const Arg &arg0, const Arg &arg1, const Arg &arg2, const Arg &arg3, const Arg &arg4, const Arg &arg5, const Arg &arg6, const Arg &arg7, const Arg &arg8, const Arg &arg9, const Arg &arg10, const Arg &arg11, const Arg &arg12, const Arg &arg13, const Arg &arg14, const Arg &arg15, const Arg &arg16, const Arg &arg17, const Arg &arg18, const Arg &arg19, const Arg &arg20, const Arg &arg21, const Arg &arg22, const Arg &arg23, const Arg &arg24, const Arg &arg25, const Arg &arg26, const Arg &arg27, const Arg &arg28, const Arg &arg29, const Arg &arg30, const Arg &arg31, const Arg &arg32, const Arg &arg33, const Arg &arg34, const Arg &arg35, const Arg &arg36, const Arg &arg37, const Arg &arg38, const Arg &arg39, const Arg &arg40) +{ + const size_t max_total_size = 1000000; + + const size_t local_x = ws.clLocalSize()[0]; + const size_t local_y = ws.clLocalSize()[1]; + const size_t local_z = ws.clLocalSize()[2]; + + const size_t total_x = ws.clGlobalSize()[0]; + const size_t total_y = ws.clGlobalSize()[1]; + const size_t total_z = ws.clGlobalSize()[2]; + + size_t nparts_x = 1; + size_t nparts_y = 1; + size_t nparts_z = 1; + + size_t part_x = total_x; + size_t part_y = total_y; + size_t part_z = 
total_z; + + while (part_x * part_y * part_z > max_total_size && part_x > local_x) { + nparts_x *= 2; + part_x = local_x * gpu::divup(gpu::divup(total_x, nparts_x), local_x); + } + while (part_x * part_y * part_z > max_total_size && part_y > local_y) { + nparts_y *= 2; + part_y = local_y * gpu::divup(gpu::divup(total_y, nparts_y), local_y); + } + while (part_x * part_y * part_z > max_total_size && part_z > local_z) { + nparts_z *= 2; + part_z = local_z * gpu::divup(gpu::divup(total_z, nparts_z), local_z); + } + + gpu::Context context; + + OpenCLKernel *kernel = getKernel(context.cl()); + + kernel->setArgs(arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13, arg14, arg15, arg16, arg17, arg18, arg19, arg20, arg21, arg22, arg23, arg24, arg25, arg26, arg27, arg28, arg29, arg30, arg31, arg32, arg33, arg34, arg35, arg36, arg37, arg38, arg39, arg40); + + for (size_t offset_x = 0; offset_x < total_x; offset_x += part_x) { + for (size_t offset_y = 0; offset_y < total_y; offset_y += part_y) { + for (size_t offset_z = 0; offset_z < total_z; offset_z += part_z) { + size_t offset[3]; + offset[0] = offset_x; + offset[1] = offset_y; + offset[2] = offset_z; + + size_t current_x = std::min(part_x, total_x - offset_x); + size_t current_y = std::min(part_y, total_y - offset_y); + size_t current_z = std::min(part_z, total_z - offset_z); + + gpu::WorkSize ws_part(local_x, local_y, local_z, current_x, current_y, current_z); + + // NOTTODO: generalize this logic, apply it to CUDA, make so that ndRangeKernel is called only in one place in codebase, remove all get_group_id/get_num_groups calls, etc. 
+ context.cl()->ndRangeKernel(*kernel, 3, offset, ws_part.clGlobalSize(), ws_part.clLocalSize()); + } + } + } +} + +void KernelSource::precompile(bool printLog) { + gpu::Context context; + + precompile(context.cl(), printLog); +} + +void KernelSource::precompile(const std::shared_ptr &cl, bool printLog) { + getKernel(cl, printLog); +} + +OpenCLKernelArg::OpenCLKernelArg(const gpu::shared_device_buffer &arg) +{ + is_null = false; + size = sizeof(cl_mem); + cl_mem_storage = arg.clmem(); + value = &cl_mem_storage; + if (arg.cloffset() != 0) { + ocl_exception("Offset is not zero, but ignored!"); + } +} + +template +OpenCLKernelArg::OpenCLKernelArg(const gpu::shared_device_buffer_typed &arg) +{ + is_null = false; + size = sizeof(cl_mem); + cl_mem_storage = arg.clmem(); + value = &cl_mem_storage; + if (arg.cloffset() != 0) { + ocl_exception("Offset is not zero, but ignored!"); + } +} + +template OpenCLKernelArg::OpenCLKernelArg(const gpu::shared_device_buffer_typed &arg); +template OpenCLKernelArg::OpenCLKernelArg(const gpu::shared_device_buffer_typed &arg); +template OpenCLKernelArg::OpenCLKernelArg(const gpu::shared_device_buffer_typed &arg); +template OpenCLKernelArg::OpenCLKernelArg(const gpu::shared_device_buffer_typed &arg); +template OpenCLKernelArg::OpenCLKernelArg(const gpu::shared_device_buffer_typed &arg); +template OpenCLKernelArg::OpenCLKernelArg(const gpu::shared_device_buffer_typed &arg); +template OpenCLKernelArg::OpenCLKernelArg(const gpu::shared_device_buffer_typed &arg); +template OpenCLKernelArg::OpenCLKernelArg(const gpu::shared_device_buffer_typed &arg); \ No newline at end of file diff --git a/libs/gpu/libgpu/opencl/engine.h b/libs/gpu/libgpu/opencl/engine.h new file mode 100644 index 0000000..dbca5d6 --- /dev/null +++ b/libs/gpu/libgpu/opencl/engine.h @@ -0,0 +1,266 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace gpu { + class 
WorkSize; + class shared_device_buffer; + + template + class shared_device_buffer_typed; +} + +namespace ocl { + + template + struct OpenCLType; + + template<> struct OpenCLType { typedef cl_char type; static std::string name() { return "char"; } static int8_t max() { return CL_CHAR_MAX; } static int8_t min() { return CL_CHAR_MIN; } }; + template<> struct OpenCLType { typedef cl_short type; static std::string name() { return "short"; } static int16_t max() { return CL_SHRT_MAX; } static int16_t min() { return CL_SHRT_MIN; } }; + template<> struct OpenCLType { typedef cl_int type; static std::string name() { return "int"; } static int32_t max() { return CL_INT_MAX; } static int32_t min() { return CL_INT_MIN; } }; + template<> struct OpenCLType { typedef cl_uchar type; static std::string name() { return "uchar"; } static uint8_t max() { return CL_UCHAR_MAX; } static uint8_t min() { return 0; } }; + template<> struct OpenCLType { typedef cl_ushort type; static std::string name() { return "ushort"; } static uint16_t max() { return CL_CHAR_MAX; } static uint16_t min() { return 0; } }; + template<> struct OpenCLType { typedef cl_uint type; static std::string name() { return "uint"; } static uint32_t max() { return CL_UINT_MAX; } static uint32_t min() { return 0; } }; + template<> struct OpenCLType { typedef cl_float type; static std::string name() { return "float"; } static float max() { return CL_FLT_MAX; } static float min() { return CL_FLT_MIN; } }; + template<> struct OpenCLType { typedef cl_double type; static std::string name() { return "double"; } static double max() { return std::numeric_limits::max(); } static double min() { return CL_DBL_MIN; } }; + + class OpenCLEngine; + + typedef std::shared_ptr sh_ptr_ocl_engine; + + class LocalMem { + public: + LocalMem(size_t size) : size(size) { } + + ::size_t size; + }; + + class OpenCLKernelArg { + public: + OpenCLKernelArg() : is_null(true), size(0), value(0), cl_mem_storage(NULL) { } + + template + 
OpenCLKernelArg(const T &arg) : is_null(false), size(sizeof(arg)), value(&arg), cl_mem_storage(NULL) { } + + OpenCLKernelArg(const LocalMem &arg) : is_null(false), size(arg.size), value(0), cl_mem_storage(NULL) { } + + OpenCLKernelArg(const gpu::shared_device_buffer &arg); + + template + OpenCLKernelArg(const gpu::shared_device_buffer_typed &arg); + + bool is_null; + size_t size; + const void * value; + protected: + cl_mem cl_mem_storage; + }; + + class OpenCLKernel { + public: + OpenCLKernel(); + ~OpenCLKernel(); + + void create(cl_program program, const char *kernel_name, cl_device_id device_id_=NULL); + + cl_kernel kernel(void) { return kernel_; } + std::string kernelName(void) { return kernel_name_; } + size_t workGroupSize(void) { return work_group_size_; } + + typedef OpenCLKernelArg Arg; + + void setArgs(const Arg &arg0 = Arg(), const Arg &arg1 = Arg(), const Arg &arg2 = Arg(), const Arg &arg3 = Arg(), const Arg &arg4 = Arg(), const Arg &arg5 = Arg(), const Arg &arg6 = Arg(), const Arg &arg7 = Arg(), const Arg &arg8 = Arg(), const Arg &arg9 = Arg(), const Arg &arg10 = Arg(), const Arg &arg11 = Arg(), const Arg &arg12 = Arg(), const Arg &arg13 = Arg(), const Arg &arg14 = Arg(), const Arg &arg15 = Arg(), const Arg &arg16 = Arg(), const Arg &arg17 = Arg(), const Arg &arg18 = Arg(), const Arg &arg19 = Arg(), const Arg &arg20 = Arg(), const Arg &arg21 = Arg(), const Arg &arg22 = Arg(), const Arg &arg23 = Arg(), const Arg &arg24 = Arg(), const Arg &arg25 = Arg(), const Arg &arg26 = Arg(), const Arg &arg27 = Arg(), const Arg &arg28 = Arg(), const Arg &arg29 = Arg(), const Arg &arg30 = Arg(), const Arg &arg31 = Arg(), const Arg &arg32 = Arg(), const Arg &arg33 = Arg(), const Arg &arg34 = Arg(), const Arg &arg35 = Arg(), const Arg &arg36 = Arg(), const Arg &arg37 = Arg(), const Arg &arg38 = Arg(), const Arg &arg39 = Arg(), const Arg &arg40 = Arg()) + { + setArg( 0, arg0); + setArg( 1, arg1); + setArg( 2, arg2); + setArg( 3, arg3); + setArg( 4, arg4); + setArg( 5, 
arg5); + setArg( 6, arg6); + setArg( 7, arg7); + setArg( 8, arg8); + setArg( 9, arg9); + setArg(10, arg10); + setArg(11, arg11); + setArg(12, arg12); + setArg(13, arg13); + setArg(14, arg14); + setArg(15, arg15); + setArg(16, arg16); + setArg(17, arg17); + setArg(18, arg18); + setArg(19, arg19); + setArg(20, arg20); + setArg(21, arg21); + setArg(22, arg22); + setArg(23, arg23); + setArg(24, arg24); + setArg(25, arg25); + setArg(26, arg26); + setArg(27, arg27); + setArg(28, arg28); + setArg(29, arg29); + setArg(30, arg30); + setArg(31, arg31); + setArg(32, arg32); + setArg(33, arg33); + setArg(34, arg34); + setArg(35, arg35); + setArg(36, arg36); + setArg(37, arg37); + setArg(38, arg38); + setArg(39, arg39); + setArg(40, arg40); + } + + protected: + void setArg(cl_uint arg_index, size_t arg_size, const void *arg_value); + + void setArg(cl_uint arg_index, const Arg &arg) + { + if (!arg.is_null) + setArg(arg_index, arg.size, arg.value); + } + + cl_kernel kernel_; + size_t work_group_size_; + std::string kernel_name_; + }; + + class OpenCLEngine { + public: + OpenCLEngine(); + ~OpenCLEngine(); + + void init(cl_device_id device_id = 0, const char *cl_params = 0, bool verbose = false); + void init(cl_platform_id platform_id = 0, cl_device_id device_id = 0, const char *cl_params = 0, bool verbose = false); + cl_mem createBuffer(cl_mem_flags flags, size_t size); + void writeBuffer(cl_mem buffer, cl_bool blocking_write, size_t offset, size_t cb, const void *ptr); + void writeBufferRect(cl_mem buffer, cl_bool blocking_write, const size_t buffer_origin[3], const size_t host_origin[3], const size_t region[3], + size_t buffer_row_pitch, size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, const void *ptr); + void readBuffer(cl_mem buffer, cl_bool blocking_read, size_t offset, size_t cb, void *ptr); + void readBufferRect(cl_mem buffer, cl_bool blocking_write, const size_t buffer_origin[3], const size_t host_origin[3], const size_t region[3], + size_t 
buffer_row_pitch, size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, void *ptr); + void copyBuffer(cl_mem src_buffer, cl_mem dst_buffer, size_t src_offset, size_t dst_offset, size_t cb); + void ndRangeKernel(OpenCLKernel &kernel, cl_uint work_dim, const size_t *global_work_offset, + const size_t *global_work_size, const size_t *local_work_size); + void releaseMemObject(cl_mem memobj); + + const DeviceInfo & deviceInfo() const { return device_info_; } + + cl_platform_id platform() { return platform_id_; } + cl_device_id device() { return device_id_; } + cl_context context() { return context_; } + cl_command_queue queue() { return command_queue_; } + + const std::string & deviceName() { return device_info_.device_name; } + size_t maxComputeUnits() const { return device_info_.max_compute_units; } + size_t maxWorkgroupSize() const { return device_info_.max_workgroup_size; } + size_t maxWorkItemSizes(int dim) { return device_info_.max_work_item_sizes[dim]; } + size_t maxMemAllocSize() { return device_info_.max_mem_alloc_size; } + size_t globalMemSize() { return device_info_.global_mem_size; } + size_t deviceAddressBits() { return device_info_.device_address_bits; } + size_t wavefrontSize() { return wavefront_size_; } + size_t totalMemSize() { return total_mem_size_; } + + std::map & programs() { return programs_; } + std::map & kernels() { return kernels_; } + + cl_program findProgram(int id) const; + OpenCLKernel * findKernel(int id) const; + + protected: + void trackEvent(cl_event ev, std::string message=""); + + cl_platform_id platform_id_; + cl_device_id device_id_; + cl_context context_; + cl_command_queue command_queue_; + + size_t wavefront_size_; + + DeviceInfo device_info_; + size_t total_mem_size_; + + std::map programs_; + std::map kernels_; + }; + + void oclPrintBuildLog(cl_program program); + +class VersionedBinary { + +public: + VersionedBinary(const char *data, const size_t size, + int bits, const int opencl_major_version, const 
int opencl_minor_version); + + const char * data() const { return data_; } + size_t size() const { return size_; } + int deviceAddressBits() const { return device_address_bits_; } + int openclMajorVersion() const { return opencl_major_version_; } + int openclMinorVersion() const { return opencl_minor_version_; } + +protected: + const char * data_; + const size_t size_; + int device_address_bits_; + const int opencl_major_version_; + const int opencl_minor_version_; +}; + +class ProgramBinaries { +public: + ProgramBinaries(std::vector binaries, std::string defines = std::string(), std::string program_name = std::string()); + ProgramBinaries(const char *source_code, size_t source_code_length, std::string defines = std::string(), std::string program_name = std::string()); + + int id() const { return id_; } + std::string defines() const { return defines_; } + const VersionedBinary* getBinary(const std::shared_ptr &cl) const; + const std::string & programName() const { return program_name_; }; + +protected: + int id_; + std::vector binaries_; + std::string program_name_; + std::string defines_; +}; + +class KernelSource { +public: + KernelSource(std::shared_ptr program, const char *name); + KernelSource(std::shared_ptr program, const std::string &name); + + typedef OpenCLKernel::Arg Arg; + + void exec(const gpu::WorkSize &ws, const Arg &arg0 = Arg(), const Arg &arg1 = Arg(), const Arg &arg2 = Arg(), const Arg &arg3 = Arg(), const Arg &arg4 = Arg(), const Arg &arg5 = Arg(), const Arg &arg6 = Arg(), const Arg &arg7 = Arg(), const Arg &arg8 = Arg(), const Arg &arg9 = Arg(), const Arg &arg10 = Arg(), const Arg &arg11 = Arg(), const Arg &arg12 = Arg(), const Arg &arg13 = Arg(), const Arg &arg14 = Arg(), const Arg &arg15 = Arg(), const Arg &arg16 = Arg(), const Arg &arg17 = Arg(), const Arg &arg18 = Arg(), const Arg &arg19 = Arg(), const Arg &arg20 = Arg(), const Arg &arg21 = Arg(), const Arg &arg22 = Arg(), const Arg &arg23 = Arg(), const Arg &arg24 = Arg(), const Arg &arg25 
= Arg(), const Arg &arg26 = Arg(), const Arg &arg27 = Arg(), const Arg &arg28 = Arg(), const Arg &arg29 = Arg(), const Arg &arg30 = Arg(), const Arg &arg31 = Arg(), const Arg &arg32 = Arg(), const Arg &arg33 = Arg(), const Arg &arg34 = Arg(), const Arg &arg35 = Arg(), const Arg &arg36 = Arg(), const Arg &arg37 = Arg(), const Arg &arg38 = Arg(), const Arg &arg39 = Arg(), const Arg &arg40 = Arg()); + void execSubdivided(const gpu::WorkSize &ws, const Arg &arg0 = Arg(), const Arg &arg1 = Arg(), const Arg &arg2 = Arg(), const Arg &arg3 = Arg(), const Arg &arg4 = Arg(), const Arg &arg5 = Arg(), const Arg &arg6 = Arg(), const Arg &arg7 = Arg(), const Arg &arg8 = Arg(), const Arg &arg9 = Arg(), const Arg &arg10 = Arg(), const Arg &arg11 = Arg(), const Arg &arg12 = Arg(), const Arg &arg13 = Arg(), const Arg &arg14 = Arg(), const Arg &arg15 = Arg(), const Arg &arg16 = Arg(), const Arg &arg17 = Arg(), const Arg &arg18 = Arg(), const Arg &arg19 = Arg(), const Arg &arg20 = Arg(), const Arg &arg21 = Arg(), const Arg &arg22 = Arg(), const Arg &arg23 = Arg(), const Arg &arg24 = Arg(), const Arg &arg25 = Arg(), const Arg &arg26 = Arg(), const Arg &arg27 = Arg(), const Arg &arg28 = Arg(), const Arg &arg29 = Arg(), const Arg &arg30 = Arg(), const Arg &arg31 = Arg(), const Arg &arg32 = Arg(), const Arg &arg33 = Arg(), const Arg &arg34 = Arg(), const Arg &arg35 = Arg(), const Arg &arg36 = Arg(), const Arg &arg37 = Arg(), const Arg &arg38 = Arg(), const Arg &arg39 = Arg(), const Arg &arg40 = Arg()); + + void precompile(bool printLog=false); + void precompile(const std::shared_ptr &cl, bool printLog=false); + +protected: + int getNextKernelId(); + + OpenCLKernel *getKernel(const std::shared_ptr &cl, bool printLog=false); + + std::shared_ptr program_; + + int id_; + std::string name_; +}; + +} diff --git a/libs/gpu/libgpu/opencl/enum.cpp b/libs/gpu/libgpu/opencl/enum.cpp new file mode 100644 index 0000000..61bc967 --- /dev/null +++ b/libs/gpu/libgpu/opencl/enum.cpp @@ -0,0 +1,251 @@ 
+#include +#include +#include +#include +#include +#include +#include +#include "enum.h" + +#define OCL_CPU_DEVICES_ENABLED true + +bool OpenCLEnum::Device::printInfo() const +{ + ocl::DeviceInfo device_info; + device_info.init(id); + device_info.print(); + return true; +} + +OpenCLEnum::OpenCLEnum() +{ +} + +OpenCLEnum::~OpenCLEnum() +{ +} + +bool OpenCLEnum::enumPlatforms() +{ + cl_uint num_platforms; + cl_int ciErrNum; + + // Get OpenCL platform count + ciErrNum = clGetPlatformIDs (0, NULL, &num_platforms); + if (ciErrNum != CL_SUCCESS) { + std::cerr << "clGetPlatformIDs failed: " << ocl::errorString(ciErrNum) << std::endl; + return false; + } + + if (num_platforms == 0) + return true; + + std::vector clPlatformIDs(num_platforms); + + // get platform info for each platform and trap the NVIDIA platform if found + ciErrNum = clGetPlatformIDs (num_platforms, clPlatformIDs.data(), NULL); + if (ciErrNum != CL_SUCCESS) { + std::cerr << "clGetPlatformIDs for " << num_platforms << " platforms failed: " << ocl::errorString(ciErrNum) << std::endl; + return false; + } + + for (cl_uint i = 0; i < num_platforms; ++i) { + Platform platform; + + cl_platform_id platform_id = clPlatformIDs[i]; + platform.id = platform_id; + + queryPlatformInfo(platform_id, CL_PLATFORM_NAME, platform.name, "CL_PLATFORM_NAME", 1024); + queryPlatformInfo(platform_id, CL_PLATFORM_VENDOR, platform.vendor, "CL_PLATFORM_VENDOR", 1024); + queryPlatformInfo(platform_id, CL_PLATFORM_VERSION, platform.version, "CL_PLATFORM_VERSION", 1024); + + platforms_.push_back(platform); + } + + return true; +} + +bool OpenCLEnum::queryDeviceInfo(Device &device) +{ + cl_device_id device_id = device.id; + + queryDeviceInfo(device_id, CL_DEVICE_TYPE, device.device_type, "CL_DEVICE_TYPE"); + queryDeviceInfo(device_id, CL_DEVICE_NAME, device.name, "CL_DEVICE_NAME", 1024); + queryDeviceInfo(device_id, CL_DEVICE_VENDOR, device.vendor, "CL_DEVICE_VENDOR", 1024); + queryDeviceInfo(device_id, CL_DEVICE_VENDOR_ID, 
device.vendor_id, "CL_DEVICE_VENDOR_ID"); + queryDeviceInfo(device_id, CL_DEVICE_VERSION, device.version, "CL_DEVICE_VERSION", 1024); + queryDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, device.compute_units, "CL_DEVICE_MAX_COMPUTE_UNITS"); + queryDeviceInfo(device_id, CL_DEVICE_GLOBAL_MEM_SIZE, device.mem_size, "CL_DEVICE_GLOBAL_MEM_SIZE"); + queryDeviceInfo(device_id, CL_DEVICE_MAX_CLOCK_FREQUENCY, device.clock, "CL_DEVICE_MAX_CLOCK_FREQUENCY"); + + std::set< std::string > extensions; + queryExtensionList(device_id, extensions); + + if (extensions.count("cl_nv_device_attribute_query")) { + queryDeviceInfo(device_id, CL_DEVICE_PCI_BUS_ID_NV, device.nvidia_pci_bus_id, "CL_DEVICE_PCI_BUS_ID_NV"); + queryDeviceInfo(device_id, CL_DEVICE_PCI_SLOT_ID_NV, device.nvidia_pci_slot_id, "CL_DEVICE_PCI_SLOT_ID_NV"); + } + + device.has_cl_khr_spir = (extensions.count("cl_khr_spir") != 0); + + return true; +} + +template +bool OpenCLEnum::queryDeviceInfo(cl_device_id device_id, unsigned int param, T &value, const std::string ¶m_name) +{ + cl_int res = clGetDeviceInfo(device_id, param, sizeof(value), &value, NULL); + if (res != CL_SUCCESS) { + std::cerr << "clGetDeviceInfo(" << param_name << ") failed: " << ocl::errorString(res) << std::endl; + return false; + } + return true; +} + +bool OpenCLEnum::queryDeviceInfo(cl_device_id device_id, unsigned int param, std::string &value, const std::string ¶m_name, size_t max_size) +{ + cl_int res; + if (max_size == 0) { + res = clGetDeviceInfo(device_id, param, 0, NULL, &max_size); + if (res != CL_SUCCESS) { + std::cerr << "clGetDeviceInfo(" << param_name << ") failed: " << ocl::errorString(res) << std::endl; + return false; + } + } + + std::vector data(max_size); + res = clGetDeviceInfo(device_id, param, max_size, data.data(), &max_size); + if (res != CL_SUCCESS) { + std::cerr << "clGetDeviceInfo(" << param_name << ") failed: " << ocl::errorString(res) << std::endl; + return false; + } + + value.assign(data.begin(), data.begin() + 
max_size); + value = value.c_str(); // remove trailing null chars + return true; +} + +bool OpenCLEnum::queryPlatformInfo(cl_platform_id platform_id, unsigned int param, std::string &value, const std::string ¶m_name, size_t max_size) +{ + cl_int res; + + std::vector data(max_size); + res = clGetPlatformInfo(platform_id, param, max_size, data.data(), &max_size); + if (res != CL_SUCCESS) { + std::cerr << "clGetPlatformInfo(" << param_name << ") failed: " << ocl::errorString(res) << std::endl; + return false; + } + + value.assign(data.begin(), data.begin() + max_size); + value = value.c_str(); // remove trailing null chars + return true; +} + +bool OpenCLEnum::queryExtensionList(cl_device_id device_id, std::set &extensions) +{ + std::string extensions_string; + if (!queryDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, extensions_string, "CL_DEVICE_EXTENSIONS")) + return false; + + std::vector tokens = split(extensions_string, " "); + extensions.insert(tokens.begin(), tokens.end()); + return true; +} + +bool OpenCLEnum::enumDevices(cl_platform_id platform_id) +{ + cl_int ciErrNum; + + cl_uint uiNumDevices = 0; // number of devices available + cl_device_type device_type = CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_ACCELERATOR; + if (OCL_CPU_DEVICES_ENABLED) { + device_type |= CL_DEVICE_TYPE_CPU; + } + + ciErrNum = clGetDeviceIDs(platform_id, device_type, 0, NULL, &uiNumDevices); + if (ciErrNum != CL_SUCCESS && ciErrNum != CL_DEVICE_NOT_FOUND) { + std::cerr << "clGetDeviceIDs failed: " << ocl::errorString(ciErrNum) << std::endl; + return false; + } + + if (ciErrNum == CL_DEVICE_NOT_FOUND || uiNumDevices == 0) + return true; + + std::vector cdDevices(uiNumDevices); + + ciErrNum = clGetDeviceIDs(platform_id, device_type, uiNumDevices, cdDevices.data(), NULL); + if (ciErrNum != CL_SUCCESS) { + std::cerr << "clGetDeviceIDs for " << uiNumDevices << " devices failed: " << ocl::errorString(ciErrNum) << std::endl; + return false; + } + + for (cl_uint i = 0; i < uiNumDevices; i++) { + 
Device device; + + device.id = cdDevices[i]; + device.platform_id = platform_id; + + if (!queryDeviceInfo(device)) { + std::cerr << device.name << ": can't query device info" << std::endl; + continue; + } + +#ifdef SPIR_SUPPORT + if (!device.has_cl_khr_spir) { + #ifdef CUDA_SUPPORT + if (device.vendor_id != ocl::ID_NVIDIA && device.vendor.find("NVIDIA") == std::string::npos) { + #endif + std::cerr << device.name << ": no SPIR support" << std::endl; + #ifdef CUDA_SUPPORT + } + #endif + continue; + } +#endif + +#ifdef CUDA_SUPPORT + if (device.vendor_id == ocl::ID_NVIDIA || device.vendor.find("NVIDIA") != std::string::npos) { + continue; + } +#endif + + devices_.push_back(device); + } + + return true; +} + +bool OpenCLEnum::compareDevice(const Device &dev1, const Device &dev2) +{ + if (dev1.name > dev2.name) return false; + if (dev1.name < dev2.name) return true; + if (dev1.id > dev2.id) return false; + return true; +} + +bool OpenCLEnum::enumDevices() +{ + if (!ocl_init()) { + std::cerr << "Can't load OpenCL library" << std::endl; + return false; + } + + if (!enumPlatforms()) + return false; + + for (size_t k = 0; k < platforms_.size(); k++) { + if (!enumDevices(platforms_[k].id)) { + std::cerr << platforms_[k].name << ": can't enumerate devices" << std::endl; + } + } + + std::sort(devices_.begin(), devices_.end(), compareDevice); + + return true; +} + +std::shared_ptr OpenCLEnum::Device::createEngine(bool printInfo) { + std::shared_ptr engine(new ocl::OpenCLEngine()); + engine->init(platform_id, id, 0, printInfo); + return engine; +} diff --git a/libs/gpu/libgpu/opencl/enum.h b/libs/gpu/libgpu/opencl/enum.h new file mode 100644 index 0000000..f7db09d --- /dev/null +++ b/libs/gpu/libgpu/opencl/enum.h @@ -0,0 +1,77 @@ +#pragma once + +#include +#include +#include + +typedef struct _cl_platform_id * cl_platform_id; +typedef struct _cl_device_id * cl_device_id; + +class OpenCLEnum { +public: + OpenCLEnum(); + ~OpenCLEnum(); + + class Device { + public: + Device() + { + 
device_type = 0; + compute_units = 0; + mem_size = 0; + clock = 0; + nvidia_pci_bus_id = 0; + nvidia_pci_slot_id = 0; + has_cl_khr_spir = false; + } + + cl_device_id id; + unsigned int vendor_id; + cl_platform_id platform_id; + cl_device_type device_type; + std::string name; + std::string vendor; + std::string version; + unsigned int compute_units; + unsigned long long mem_size; + unsigned int clock; + unsigned int nvidia_pci_bus_id; + unsigned int nvidia_pci_slot_id; + bool has_cl_khr_spir; + + ocl::sh_ptr_ocl_engine createEngine(bool printInfo=false); + + bool isCPU(void) { return device_type == CL_DEVICE_TYPE_CPU; } + bool isGPU(void) { return device_type == CL_DEVICE_TYPE_GPU; } + + bool printInfo() const; + }; + + class Platform { + public: + cl_platform_id id; + std::string name; + std::string vendor; + std::string version; + }; + + bool enumDevices(); + std::vector & devices() { return devices_; } + std::vector & platforms() { return platforms_; } + +protected: + bool enumPlatforms(); + bool enumDevices(cl_platform_id platform_id); + + bool queryDeviceInfo(Device &device); + bool queryDeviceInfo(cl_device_id device_id, unsigned int param, std::string &value, const std::string ¶m_name, size_t max_size = 0); + template + bool queryDeviceInfo(cl_device_id device_id, unsigned int param, T &value, const std::string ¶m_name); + bool queryPlatformInfo(cl_platform_id platform_id, unsigned int param, std::string &value, const std::string ¶m_name, size_t max_size); + bool queryExtensionList(cl_device_id device_id, std::set &extensions); + + static bool compareDevice(const Device &dev1, const Device &dev2); + + std::vector devices_; + std::vector platforms_; +}; diff --git a/libs/gpu/libgpu/opencl/utils.cpp b/libs/gpu/libgpu/opencl/utils.cpp new file mode 100644 index 0000000..a2d8214 --- /dev/null +++ b/libs/gpu/libgpu/opencl/utils.cpp @@ -0,0 +1,61 @@ +#include +#include "utils.h" + +namespace ocl { + +std::string errorString(cl_int code) +{ + switch (code) { + case 
CL_SUCCESS: return "CL_SUCCESS"; + case CL_DEVICE_NOT_FOUND: return "CL_DEVICE_NOT_FOUND"; + case CL_DEVICE_NOT_AVAILABLE: return "CL_DEVICE_NOT_AVAILABLE"; + case CL_COMPILER_NOT_AVAILABLE: return "CL_COMPILER_NOT_AVAILABLE"; + case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + case CL_OUT_OF_RESOURCES: return "CL_OUT_OF_RESOURCES"; + case CL_OUT_OF_HOST_MEMORY: return "CL_OUT_OF_HOST_MEMORY"; + case CL_PROFILING_INFO_NOT_AVAILABLE: return "CL_PROFILING_INFO_NOT_AVAILABLE"; + case CL_MEM_COPY_OVERLAP: return "CL_MEM_COPY_OVERLAP"; + case CL_IMAGE_FORMAT_MISMATCH: return "CL_IMAGE_FORMAT_MISMATCH"; + case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; + case CL_BUILD_PROGRAM_FAILURE: return "CL_BUILD_PROGRAM_FAILURE"; + case CL_MAP_FAILURE: return "CL_MAP_FAILURE"; + + case CL_INVALID_VALUE: return "CL_INVALID_VALUE"; + case CL_INVALID_DEVICE_TYPE: return "CL_INVALID_DEVICE_TYPE"; + case CL_INVALID_PLATFORM: return "CL_INVALID_PLATFORM"; + case CL_INVALID_DEVICE: return "CL_INVALID_DEVICE"; + case CL_INVALID_CONTEXT: return "CL_INVALID_CONTEXT"; + case CL_INVALID_QUEUE_PROPERTIES: return "CL_INVALID_QUEUE_PROPERTIES"; + case CL_INVALID_COMMAND_QUEUE: return "CL_INVALID_COMMAND_QUEUE"; + case CL_INVALID_HOST_PTR: return "CL_INVALID_HOST_PTR"; + case CL_INVALID_MEM_OBJECT: return "CL_INVALID_MEM_OBJECT"; + case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; + case CL_INVALID_IMAGE_SIZE: return "CL_INVALID_IMAGE_SIZE"; + case CL_INVALID_SAMPLER: return "CL_INVALID_SAMPLER"; + case CL_INVALID_BINARY: return "CL_INVALID_BINARY"; + case CL_INVALID_BUILD_OPTIONS: return "CL_INVALID_BUILD_OPTIONS"; + case CL_INVALID_PROGRAM: return "CL_INVALID_PROGRAM"; + case CL_INVALID_PROGRAM_EXECUTABLE: return "CL_INVALID_PROGRAM_EXECUTABLE"; + case CL_INVALID_KERNEL_NAME: return "CL_INVALID_KERNEL_NAME"; + case CL_INVALID_KERNEL_DEFINITION: return "CL_INVALID_KERNEL_DEFINITION"; + case 
CL_INVALID_KERNEL: return "CL_INVALID_KERNEL"; + case CL_INVALID_ARG_INDEX: return "CL_INVALID_ARG_INDEX"; + case CL_INVALID_ARG_VALUE: return "CL_INVALID_ARG_VALUE"; + case CL_INVALID_ARG_SIZE: return "CL_INVALID_ARG_SIZE"; + case CL_INVALID_KERNEL_ARGS: return "CL_INVALID_KERNEL_ARGS"; + case CL_INVALID_WORK_DIMENSION: return "CL_INVALID_WORK_DIMENSION"; + case CL_INVALID_WORK_GROUP_SIZE: return "CL_INVALID_WORK_GROUP_SIZE"; + case CL_INVALID_WORK_ITEM_SIZE: return "CL_INVALID_WORK_ITEM_SIZE"; + case CL_INVALID_GLOBAL_OFFSET: return "CL_INVALID_GLOBAL_OFFSET"; + case CL_INVALID_EVENT_WAIT_LIST: return "CL_INVALID_EVENT_WAIT_LIST"; + case CL_INVALID_EVENT: return "CL_INVALID_EVENT"; + case CL_INVALID_OPERATION: return "CL_INVALID_OPERATION"; + case CL_INVALID_GL_OBJECT: return "CL_INVALID_GL_OBJECT"; + case CL_INVALID_BUFFER_SIZE: return "CL_INVALID_BUFFER_SIZE"; + case CL_INVALID_MIP_LEVEL: return "CL_INVALID_MIP_LEVEL"; + case CL_INVALID_GLOBAL_WORK_SIZE: return "CL_INVALID_GLOBAL_WORK_SIZE"; + default: return "CL_UNKNOWN_ERROR_CODE_" + to_string(code); + } +} + +} diff --git a/libs/gpu/libgpu/opencl/utils.h b/libs/gpu/libgpu/opencl/utils.h new file mode 100644 index 0000000..f866b9c --- /dev/null +++ b/libs/gpu/libgpu/opencl/utils.h @@ -0,0 +1,69 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace ocl { + +#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3 // since OpenCL 1.1 + +#define CL_NV_DEVICE_ATTRIBUTE_QUERY_EXT "cl_nv_device_attribute_query" +#define CL_AMD_DEVICE_ATTRIBUTE_QUERY_EXT "cl_amd_device_attribute_query" + +#ifndef CL_DEVICE_PCI_BUS_ID_NV +#define CL_DEVICE_PCI_BUS_ID_NV 0x4008 +#endif + +#ifndef CL_DEVICE_PCI_SLOT_ID_NV +#define CL_DEVICE_PCI_SLOT_ID_NV 0x4009 +#endif + +#define CL_DEVICE_TOPOLOGY_AMD 0x4037 +#define CL_DEVICE_BOARD_NAME_AMD 0x4038 +#define CL_DEVICE_GLOBAL_FREE_MEMORY_AMD 0x4039 +#define CL_DEVICE_WAVEFRONT_WIDTH_AMD 0x4043 + +enum VENDOR { + ID_AMD = 0x1002, + ID_INTEL = 0x8086, + 
ID_NVIDIA = 0x10de, +}; + +class ocl_exception : public gpu::gpu_exception { +public: + ocl_exception(std::string msg) throw () : gpu_exception(msg) { } + ocl_exception(const char *msg) throw () : gpu_exception(msg) { } + ocl_exception() throw () : gpu_exception("OpenCL exception") { } +}; + +class ocl_bad_alloc : public gpu::gpu_bad_alloc { +public: + ocl_bad_alloc(std::string msg) throw () : gpu_bad_alloc(msg) { } + ocl_bad_alloc(const char *msg) throw () : gpu_bad_alloc(msg) { } + ocl_bad_alloc() throw () : gpu_bad_alloc("OpenCL exception") { } +}; + +std::string errorString(cl_int code); + +static inline void reportError(cl_int err, int line, std::string prefix="") +{ + if (CL_SUCCESS == err) + return; + + std::string message = prefix + errorString(err) + " (" + to_string(err) + ")" + " at line " + to_string(line); + + switch (err) { + case CL_MEM_OBJECT_ALLOCATION_FAILURE: + throw ocl_bad_alloc(message); + default: + throw ocl_exception(message); + } +} + +#define OCL_SAFE_CALL(expr) ocl::reportError(expr, __LINE__, "") +#define OCL_SAFE_CALL_MESSAGE(expr, message) ocl::reportError(expr, __LINE__, message) + +} diff --git a/libs/gpu/libgpu/shared_device_buffer.cpp b/libs/gpu/libgpu/shared_device_buffer.cpp new file mode 100644 index 0000000..4d164c6 --- /dev/null +++ b/libs/gpu/libgpu/shared_device_buffer.cpp @@ -0,0 +1,428 @@ +#include "shared_device_buffer.h" +#include "context.h" +#include +#include +#include + +#ifdef CUDA_SUPPORT +#include +#include +#endif + +#ifdef _WIN32 +#include +#endif + +namespace gpu { + +shared_device_buffer::shared_device_buffer() +{ + buffer_ = 0; + data_ = 0; + type_ = Context::TypeUndefined; + size_ = 0; + offset_ = 0; +} + +shared_device_buffer::~shared_device_buffer() +{ + decref(); +} + +shared_device_buffer::shared_device_buffer(const shared_device_buffer &other, size_t offset) +{ + buffer_ = other.buffer_; + data_ = other.data_; + type_ = other.type_; + size_ = other.size_; + offset_ = other.offset_ + offset; + incref(); 
+} + +shared_device_buffer &shared_device_buffer::operator= (const shared_device_buffer &other) +{ + if (this != &other) { + decref(); + buffer_ = other.buffer_; + data_ = other.data_; + type_ = other.type_; + size_ = other.size_; + offset_ = other.offset_; + incref(); + } + + return *this; +} + +void shared_device_buffer::swap(shared_device_buffer &other) +{ + std::swap(buffer_, other.buffer_); + std::swap(data_, other.data_); + std::swap(type_, other.type_); + std::swap(size_, other.size_); + std::swap(offset_, other.offset_); +} + +void shared_device_buffer::incref() +{ + if (!buffer_) + return; + +#if defined(_WIN64) + InterlockedIncrement64((LONGLONG *) buffer_); +#elif defined(_WIN32) + InterlockedIncrement((LONG *) buffer_); +#else + __sync_add_and_fetch((long long *) buffer_, 1); +#endif +} + +void shared_device_buffer::decref() +{ + if (!buffer_) + return; + + long long count = 0; + +#if defined(_WIN64) + count = InterlockedDecrement64((LONGLONG *) buffer_); +#elif defined(_WIN32) + count = InterlockedDecrement((LONG *) buffer_); +#else + count = __sync_sub_and_fetch((long long *) buffer_, 1); +#endif + + if (!count) { + switch (type_) { +#ifdef CUDA_SUPPORT + case Context::TypeCUDA: + cudaFree(data_); + break; +#endif + case Context::TypeOpenCL: + clReleaseMemObject((cl_mem) data_); + break; + default: + gpu::raiseException(__FILE__, __LINE__, "No GPU context!"); + } + + delete [] buffer_; + } + + buffer_ = 0; + data_ = 0; + type_ = Context::TypeUndefined; + size_ = 0; + offset_ = 0; +} + +shared_device_buffer shared_device_buffer::create(size_t size) +{ + shared_device_buffer res; + res.resize(size); + return res; +} + +void *shared_device_buffer::cuptr() const +{ + if (type_ == Context::TypeOpenCL) + throw gpu_exception("GPU buffer type mismatch"); + + return (char *) data_ + offset_; +} + +cl_mem shared_device_buffer::clmem() const +{ + if (type_ == Context::TypeCUDA) + throw gpu_exception("GPU buffer type mismatch"); + + return (cl_mem) data_; +} + 
+size_t shared_device_buffer::cloffset() const +{ + if (type_ == Context::TypeCUDA) + throw gpu_exception("GPU buffer type mismatch"); + + return offset_; +} + +size_t shared_device_buffer::size() const +{ + return size_; +} + +bool shared_device_buffer::isNull() const +{ + return data_ == NULL; +} + +void shared_device_buffer::reset() +{ + decref(); +} + +void shared_device_buffer::resize(size_t size) +{ + if (size == size_) + return; + + decref(); + + Context context; + Context::Type type = context.type(); + + switch (type) { +#ifdef CUDA_SUPPORT + case Context::TypeCUDA: + CUDA_SAFE_CALL( cudaMalloc(&data_, size) ); + break; +#endif + case Context::TypeOpenCL: + data_ = context.cl()->createBuffer(CL_MEM_READ_WRITE, size); + break; + default: + gpu::raiseException(__FILE__, __LINE__, "No GPU context!"); + } + + buffer_ = new unsigned char [8]; + * (long long *) buffer_ = 0; + incref(); + + type_ = type; + size_ = size; + offset_ = 0; +} + +void shared_device_buffer::grow(size_t size, float reserveMultiplier) +{ + if (size > size_) + resize((size_t) (size * reserveMultiplier)); +} + +void shared_device_buffer::write(const void *data, size_t size) +{ + if (size == 0) + return; + + if (size > size_) + throw gpu_exception("Too many data for this device buffer: " + to_string(size) + " > " + to_string(size_)); + + Context context; + switch (context.type()) { +#ifdef CUDA_SUPPORT + case Context::TypeCUDA: + CUDA_SAFE_CALL(cudaMemcpy(cuptr(), data, size, cudaMemcpyHostToDevice)); + break; +#endif + case Context::TypeOpenCL: + context.cl()->writeBuffer((cl_mem) data_, CL_TRUE, offset_, size, data); + break; + default: + gpu::raiseException(__FILE__, __LINE__, "No GPU context!"); + } +} + +void shared_device_buffer::write(const shared_device_buffer &buffer, size_t size) +{ + if (!size) + return; + + if (size > size_) + throw gpu_exception("Too many data for this device buffer: " + to_string(size) + " > " + to_string(size_)); + + Context context; + switch (context.type()) { 
+#ifdef CUDA_SUPPORT + case Context::TypeCUDA: + CUDA_SAFE_CALL(cudaMemcpy(cuptr(), buffer.cuptr(), size, cudaMemcpyDeviceToDevice)); + break; +#endif + case Context::TypeOpenCL: + context.cl()->copyBuffer(buffer.clmem(), clmem(), buffer.cloffset(), cloffset(), size); + break; + default: + gpu::raiseException(__FILE__, __LINE__, "No GPU context!"); + } +} + +void shared_device_buffer::write(const shared_host_buffer &buffer, size_t size) +{ + if (!size) + return; + + if (size > size_) + throw gpu_exception("Too many data for this device buffer: " + to_string(size) + " > " + to_string(size_)); + + Context context; + switch (context.type()) { +#ifdef CUDA_SUPPORT + case Context::TypeCUDA: + CUDA_SAFE_CALL(cudaMemcpy(cuptr(), buffer.get(), size, cudaMemcpyHostToDevice)); + break; +#endif + case Context::TypeOpenCL: + context.cl()->writeBuffer((cl_mem) data_, CL_TRUE, offset_, size, buffer.get()); + break; + default: + gpu::raiseException(__FILE__, __LINE__, "No GPU context!"); + } +} + +void shared_device_buffer::write2D(size_t dpitch, const void *src, size_t spitch, size_t width, size_t height) +{ + if (spitch == width && dpitch == width) { + write(src, width * height); + return; + } + + Context context; + switch (context.type()) { +#ifdef CUDA_SUPPORT + case Context::TypeCUDA: + CUDA_SAFE_CALL(cudaMemcpy2D(cuptr(), dpitch, src, spitch, width, height, cudaMemcpyHostToDevice)); + break; +#endif + case Context::TypeOpenCL: + { + size_t buffer_origin[3] = { offset_, 0, 0 }; + size_t host_origin[3] = { 0, 0, 0 }; + size_t region[3] = { width, height, 1 }; + context.cl()->writeBufferRect((cl_mem) data_, CL_TRUE, buffer_origin, host_origin, region, dpitch, 0, spitch, 0, src); + } + break; + default: + gpu::raiseException(__FILE__, __LINE__, "No GPU context!"); + } +} + +void shared_device_buffer::read(void *data, size_t size, size_t offset) const +{ + if (size == 0) + return; + if (size > size_) + throw gpu_exception("Not enough data in this device buffer: " + 
to_string(size) + " > " + to_string(size_)); + + Context context; + switch (context.type()) { +#ifdef CUDA_SUPPORT + case Context::TypeCUDA: + CUDA_SAFE_CALL(cudaMemcpy(data, (char *) cuptr() + offset, size, cudaMemcpyDeviceToHost)); + break; +#endif + case Context::TypeOpenCL: + context.cl()->readBuffer((cl_mem) data_, CL_TRUE, offset_ + offset, size, data); + break; + default: + gpu::raiseException(__FILE__, __LINE__, "No GPU context!"); + } +} + +void shared_device_buffer::read2D(size_t spitch, void *dst, size_t dpitch, size_t width, size_t height) const +{ + if (spitch == width && dpitch == width) { + read(dst, width * height); + return; + } + + Context context; + switch (context.type()) { +#ifdef CUDA_SUPPORT + case Context::TypeCUDA: + CUDA_SAFE_CALL(cudaMemcpy2D(dst, dpitch, cuptr(), spitch, width, height, cudaMemcpyDeviceToHost)); + break; +#endif + case Context::TypeOpenCL: + { + size_t buffer_origin[3] = { offset_, 0, 0 }; + size_t host_origin[3] = { 0, 0, 0 }; + size_t region[3] = { width, height, 1 }; + context.cl()->readBufferRect((cl_mem) data_, CL_TRUE, buffer_origin, host_origin, region, spitch, 0, dpitch, 0, dst); + } + break; + default: + gpu::raiseException(__FILE__, __LINE__, "No GPU context!"); + } +} + +void shared_device_buffer::copyTo(shared_device_buffer &that, size_t size) const +{ + if (size == 0) + return; + if (size > size_) + throw gpu_exception("Not enough data in this device buffer: " + to_string(size) + " > " + to_string(size_)); + + Context context; + switch (context.type()) { +#ifdef CUDA_SUPPORT + case Context::TypeCUDA: + CUDA_SAFE_CALL(cudaMemcpy((char *) that.cuptr(), (char *) cuptr(), size, cudaMemcpyDeviceToDevice)); + break; +#endif + case Context::TypeOpenCL: + context.cl()->copyBuffer(clmem(), that.clmem(), offset_, that.offset_, size); + break; + default: + gpu::raiseException(__FILE__, __LINE__, "No GPU context!"); + } +} + +template +shared_device_buffer_typed shared_device_buffer_typed::createN(size_t number) +{ + 
shared_device_buffer_typed res; + res.resizeN(number); + return res; +} + +template +size_t shared_device_buffer_typed::number() const +{ + return size_ / sizeof(T); +} + +template +void shared_device_buffer_typed::resizeN(size_t number) +{ + this->resize(number * sizeof(T)); +} + +template +void shared_device_buffer_typed::growN(size_t number, float reserveMultiplier) +{ + this->grow(number * sizeof(T), reserveMultiplier); +} + +template +T *shared_device_buffer_typed::cuptr() const +{ + return (T *) shared_device_buffer::cuptr(); +} + +template +void shared_device_buffer_typed::writeN(const T* data, size_t number) { + this->write(data, number * sizeof(T)); +} + +template +void shared_device_buffer_typed::readN(T* data, size_t number, size_t offset) const +{ + this->read(data, number * sizeof(T), offset * sizeof(T)); +} + +template +void shared_device_buffer_typed::copyToN(shared_device_buffer_typed &that, size_t number) const +{ + this->copyTo(that, number * sizeof(T)); +} + +template class shared_device_buffer_typed; +template class shared_device_buffer_typed; +template class shared_device_buffer_typed; +template class shared_device_buffer_typed; +template class shared_device_buffer_typed; +template class shared_device_buffer_typed; +template class shared_device_buffer_typed; +template class shared_device_buffer_typed; + +} diff --git a/libs/gpu/libgpu/shared_device_buffer.h b/libs/gpu/libgpu/shared_device_buffer.h new file mode 100644 index 0000000..1885fac --- /dev/null +++ b/libs/gpu/libgpu/shared_device_buffer.h @@ -0,0 +1,87 @@ +#pragma once + +#include +#include "shared_host_buffer.h" + +typedef struct _cl_mem *cl_mem; + +namespace gpu { + +class shared_device_buffer { +public: + shared_device_buffer(); + ~shared_device_buffer(); + shared_device_buffer(const shared_device_buffer &other, size_t offset = 0); + shared_device_buffer &operator= (const shared_device_buffer &other); + + void swap(shared_device_buffer &other); + void reset(); + size_t size() 
const; + void resize(size_t size); + void grow(size_t size, float reserveMultiplier=1.1f); + bool isNull() const; + + void * cuptr() const; + cl_mem clmem() const; + size_t cloffset() const; + + void write(const void *data, size_t size); + void write(const shared_device_buffer &buffer, size_t size); + void write(const shared_host_buffer &buffer, size_t size); + void write2D(size_t dpitch, const void *src, size_t spitch, size_t width, size_t height); + + void read(void *data, size_t size, size_t offset = 0) const; + void read2D(size_t spitch, void *dst, size_t dpitch, size_t width, size_t height) const; + + void copyTo(shared_device_buffer &that, size_t size) const; + + static shared_device_buffer create(size_t size); + +protected: + void incref(); + void decref(); + + unsigned char * buffer_; + void * data_; + int type_; + size_t size_; + size_t offset_; +}; + +template +class shared_device_buffer_typed : public shared_device_buffer { +public: + shared_device_buffer_typed() : shared_device_buffer() {} + shared_device_buffer_typed(const shared_device_buffer_typed &other, size_t offset) : shared_device_buffer(other, offset * sizeof(T)) {} + explicit shared_device_buffer_typed(const shared_device_buffer &other) : shared_device_buffer(other) {} + + size_t number() const; + + void resizeN(size_t number); + void growN(size_t number, float reserveMultiplier=1.1f); + + T * cuptr() const; + + void writeN(const T* data, size_t number); + + void readN(T* data, size_t number, size_t offset = 0) const; + + void copyToN(shared_device_buffer_typed &that, size_t number) const; + + static shared_device_buffer_typed createN(size_t number); +}; + +typedef shared_device_buffer gpu_mem_any; + +typedef shared_device_buffer_typed gpu_mem_8i; +typedef shared_device_buffer_typed gpu_mem_16i; +typedef shared_device_buffer_typed gpu_mem_32i; +typedef shared_device_buffer_typed gpu_mem_8u; +typedef shared_device_buffer_typed gpu_mem_16u; +typedef shared_device_buffer_typed gpu_mem_32u; 
+typedef shared_device_buffer_typed gpu_mem_32f; +typedef shared_device_buffer_typed gpu_mem_64f; + +#define gpu_mem shared_device_buffer_typed + +} diff --git a/libs/gpu/libgpu/shared_host_buffer.cpp b/libs/gpu/libgpu/shared_host_buffer.cpp new file mode 100644 index 0000000..cbdfbac --- /dev/null +++ b/libs/gpu/libgpu/shared_host_buffer.cpp @@ -0,0 +1,206 @@ +#include "shared_host_buffer.h" +#include "context.h" +#include +#include + +#ifdef CUDA_SUPPORT +#include +#endif + +#ifdef _WIN32 +#include +#endif + +namespace gpu { + +shared_host_buffer::shared_host_buffer() +{ + buffer_ = 0; + data_ = 0; + type_ = Context::TypeUndefined; + size_ = 0; +} + +shared_host_buffer::~shared_host_buffer() +{ + decref(); +} + +shared_host_buffer::shared_host_buffer(const shared_host_buffer &other) +{ + buffer_ = other.buffer_; + data_ = other.data_; + type_ = other.type_; + size_ = other.size_; + incref(); +} + +shared_host_buffer &shared_host_buffer::operator= (const shared_host_buffer &other) +{ + if (this != &other) { + decref(); + buffer_ = other.buffer_; + data_ = other.data_; + type_ = other.type_; + size_ = other.size_; + incref(); + } + + return *this; +} + +void shared_host_buffer::swap(shared_host_buffer &other) +{ + std::swap(buffer_, other.buffer_); + std::swap(data_, other.data_); + std::swap(type_, other.type_); + std::swap(size_, other.size_); +} + +void shared_host_buffer::incref() +{ + if (!buffer_) + return; + +#if defined(_WIN64) + InterlockedIncrement64((LONGLONG *) buffer_); +#elif defined(_WIN32) + InterlockedIncrement((LONG *) buffer_); +#else + __sync_add_and_fetch((long long *) buffer_, 1); +#endif +} + +void shared_host_buffer::decref() +{ + if (!buffer_) + return; + + long long count = 0; + +#if defined(_WIN64) + count = InterlockedDecrement64((LONGLONG *) buffer_); +#elif defined(_WIN32) + count = InterlockedDecrement((LONG *) buffer_); +#else + count = __sync_sub_and_fetch((long long *) buffer_, 1); +#endif + + if (count) + return; + + switch 
(type_) { +#ifdef CUDA_SUPPORT + case Context::TypeCUDA: + cudaFreeHost(data_); + break; +#endif + case Context::TypeOpenCL: + free(data_); + break; + default: + gpu::raiseException(__FILE__, __LINE__, "No GPU context!"); + } + + delete [] buffer_; + + buffer_ = 0; + data_ = 0; + type_ = Context::TypeUndefined; + size_ = 0; +} + +shared_host_buffer shared_host_buffer::create(size_t size) +{ + shared_host_buffer res; + res.resize(size); + return res; +} + +void *shared_host_buffer::get() const +{ + return data_; +} + +size_t shared_host_buffer::size() const +{ + return size_; +} + +void shared_host_buffer::resize(size_t size) +{ + if (size == size_) + return; + + decref(); + + buffer_ = new unsigned char [8]; + * (long long *) buffer_ = 0; + incref(); + + Context context; + Context::Type type = context.type(); + + switch (type) { +#ifdef CUDA_SUPPORT + case Context::TypeCUDA: + CUDA_SAFE_CALL( cudaMallocHost(&data_, size) ); + break; +#endif + case Context::TypeOpenCL: + // NOTTODO: implement pinned memory in opencl + // currently we use a plain paged memory buffer + data_ = malloc(size); + if (!data_) + throw std::bad_alloc(); + break; + default: + gpu::raiseException(__FILE__, __LINE__, "No GPU context!"); + } + + type_ = type; + size_ = size; +} + +void shared_host_buffer::grow(size_t size) +{ + if (size > size_) + resize(size); +} + +template +shared_host_buffer_typed shared_host_buffer_typed::createN(size_t number) +{ + shared_host_buffer_typed res; + res.resizeN(number); + return res; +} + +template +void shared_host_buffer_typed::resizeN(size_t number) +{ + this->resize(number * sizeof(T)); +} + +template +T *shared_host_buffer_typed::get() const +{ + return (T*) data_; +} + +template +size_t shared_host_buffer_typed::number() const +{ + return this->size_ / sizeof(T); +} + +template class shared_host_buffer_typed; +template class shared_host_buffer_typed; +template class shared_host_buffer_typed; +template class shared_host_buffer_typed; +template class 
shared_host_buffer_typed; +template class shared_host_buffer_typed; +template class shared_host_buffer_typed; +template class shared_host_buffer_typed; + +} diff --git a/libs/gpu/libgpu/shared_host_buffer.h b/libs/gpu/libgpu/shared_host_buffer.h new file mode 100644 index 0000000..c2a74d6 --- /dev/null +++ b/libs/gpu/libgpu/shared_host_buffer.h @@ -0,0 +1,54 @@ +#pragma once + +#include +#include + +namespace gpu { + +class shared_host_buffer { +public: + shared_host_buffer(); + ~shared_host_buffer(); + shared_host_buffer(const shared_host_buffer &other); + shared_host_buffer &operator= (const shared_host_buffer &other); + + void swap(shared_host_buffer &other); + void * get() const; + size_t size() const; + void resize(size_t size); + void grow(size_t size); + + static shared_host_buffer create(size_t size); + +protected: + void incref(); + void decref(); + + unsigned char * buffer_; + void * data_; + int type_; + size_t size_; +}; + +template +class shared_host_buffer_typed : public shared_host_buffer { +public: + void resizeN(size_t number); + + T * get() const; + + size_t number() const; + + static shared_host_buffer_typed createN(size_t number); +}; + +typedef shared_host_buffer gpu_host_mem_any; + +typedef shared_host_buffer_typed gpu_host_mem_16i; +typedef shared_host_buffer_typed gpu_host_mem_32i; +typedef shared_host_buffer_typed gpu_host_mem_8u; +typedef shared_host_buffer_typed gpu_host_mem_16u; +typedef shared_host_buffer_typed gpu_host_mem_32u; +typedef shared_host_buffer_typed gpu_host_mem_32f; + +} diff --git a/libs/gpu/libgpu/utils.cpp b/libs/gpu/libgpu/utils.cpp new file mode 100644 index 0000000..f33f625 --- /dev/null +++ b/libs/gpu/libgpu/utils.cpp @@ -0,0 +1,121 @@ +#include "utils.h" +#include "context.h" + +#include +#include + + +void gpu::raiseException(std::string file, int line, std::string message) { + if (message.length() > 0) { + throw gpu_exception("Failure at " + file + ":" + to_string(line) + ": " + message); + } else { + throw 
gpu_exception("Failure at " + file + ":" + to_string(line)); + } +} + +template +size_t gpu::deviceTypeSize() { + Context context; +#ifdef CUDA_SUPPORT + if (context.type() == Context::TypeCUDA) { + return sizeof(T); + } else +#endif + if (context.type() == Context::TypeOpenCL) { + return sizeof(typename ocl::OpenCLType::type); + } else { + throw gpu_exception("No GPU active context!"); + } +} + +template +T gpu::deviceTypeMax() { + Context context; +#ifdef CUDA_SUPPORT + if (context.type() == Context::TypeCUDA) { + return std::numeric_limits::max(); + } else +#endif + if (context.type() == Context::TypeOpenCL) { + return ocl::OpenCLType::max(); + } else { + throw gpu_exception("No GPU active context!"); + } +} + +template +T gpu::deviceTypeMin() { + Context context; +#ifdef CUDA_SUPPORT + if (context.type() == Context::TypeCUDA) { + return std::numeric_limits::min(); + } else +#endif + if (context.type() == Context::TypeOpenCL) { + return ocl::OpenCLType::min(); + } else { + throw gpu_exception("No GPU active context!"); + } +} + +unsigned int gpu::calcNChunk(size_t n, size_t group_size, size_t max_size) +{ + if (n == 0) + return group_size; + + size_t work_parts_n = (n + max_size - 1) / max_size; + size_t exec_n = (n + work_parts_n - 1) / work_parts_n; + exec_n = (exec_n + group_size - 1) / group_size * group_size; + return (unsigned int) exec_n; +} + +unsigned int gpu::calcColsChunk(size_t width, size_t height, size_t group_size_x, size_t max_size) +{ + size_t work_parts_n = (width * height + max_size - 1) / max_size; + size_t ncols = (width + work_parts_n - 1) / work_parts_n; + ncols = (ncols + group_size_x - 1) / group_size_x * group_size_x; + return (unsigned int) ncols; +} + +unsigned int gpu::calcRowsChunk(size_t width, size_t height, size_t group_size_y, size_t max_size) +{ + size_t work_parts_n = (width * height + max_size - 1) / max_size; + size_t nrows = (height + work_parts_n - 1) / work_parts_n; + nrows = (nrows + group_size_y - 1) / group_size_y * 
group_size_y; + return (unsigned int) nrows; +} + +unsigned int gpu::calcZSlicesChunk(size_t x, size_t y, size_t z, size_t group_size_z, size_t max_size) +{ + size_t work_parts_n = (z * y * x + max_size - 1) / max_size; + size_t z_slices = (z + work_parts_n - 1) / work_parts_n; + z_slices = (z_slices + group_size_z - 1) / group_size_z * group_size_z; + return (unsigned int) z_slices; +} + +template size_t gpu::deviceTypeSize(); +template size_t gpu::deviceTypeSize(); +template size_t gpu::deviceTypeSize(); +template size_t gpu::deviceTypeSize(); +template size_t gpu::deviceTypeSize(); +template size_t gpu::deviceTypeSize(); +template size_t gpu::deviceTypeSize(); +template size_t gpu::deviceTypeSize(); + +template int8_t gpu::deviceTypeMax(); +template int16_t gpu::deviceTypeMax(); +template int32_t gpu::deviceTypeMax(); +template uint8_t gpu::deviceTypeMax(); +template uint16_t gpu::deviceTypeMax(); +template uint32_t gpu::deviceTypeMax(); +template float gpu::deviceTypeMax(); +template double gpu::deviceTypeMax(); + +template int8_t gpu::deviceTypeMin(); +template int16_t gpu::deviceTypeMin(); +template int32_t gpu::deviceTypeMin(); +template uint8_t gpu::deviceTypeMin(); +template uint16_t gpu::deviceTypeMin(); +template uint32_t gpu::deviceTypeMin(); +template float gpu::deviceTypeMin(); +template double gpu::deviceTypeMin(); diff --git a/libs/gpu/libgpu/utils.h b/libs/gpu/libgpu/utils.h new file mode 100644 index 0000000..1d6f88d --- /dev/null +++ b/libs/gpu/libgpu/utils.h @@ -0,0 +1,44 @@ +#pragma once + +#include +#include + +namespace gpu { + + class gpu_exception : public std::runtime_error { + public: + gpu_exception(std::string msg) throw () : runtime_error(msg) { } + gpu_exception(const char *msg) throw () : runtime_error(msg) { } + gpu_exception() throw () : runtime_error("GPU exception") { } + }; + + class gpu_bad_alloc : public gpu_exception { + public: + gpu_bad_alloc(std::string msg) throw () : gpu_exception(msg) { } + gpu_bad_alloc(const char 
*msg) throw () : gpu_exception(msg) { } + gpu_bad_alloc() throw () : gpu_exception("GPU exception") { } + }; + + void raiseException(std::string file, int line, std::string message); + + template + size_t deviceTypeSize(); + + template + T deviceTypeMax(); + + template + T deviceTypeMin(); + + inline unsigned int divup(unsigned int num, unsigned int denom) { + return (num + denom - 1) / denom; + } + + unsigned int calcNChunk(size_t n, size_t group_size, size_t max_size=1000*1000); + unsigned int calcColsChunk(size_t width, size_t height, size_t group_size_x, size_t max_size=1000*1000); + unsigned int calcRowsChunk(size_t width, size_t height, size_t group_size_y, size_t max_size=1000*1000); + unsigned int calcZSlicesChunk(size_t x, size_t y, size_t z, size_t group_size_z, size_t max_size=1000*1000); +} + +#define GPU_CHECKED_VERBOSE(x, message) if (!(x)) {gpu::raiseException(__FILE__, __LINE__, message);} +#define GPU_CHECKED(x) if (!(x)) {gpu::raiseException(__FILE__, __LINE__, "");} diff --git a/libs/gpu/libgpu/work_size.h b/libs/gpu/libgpu/work_size.h new file mode 100644 index 0000000..36eac11 --- /dev/null +++ b/libs/gpu/libgpu/work_size.h @@ -0,0 +1,84 @@ +#pragma once + +#include "utils.h" + +#ifdef CUDA_SUPPORT + #include +#endif + +namespace gpu { + class WorkSize { + public: + WorkSize(unsigned int groupSizeX, unsigned int workSizeX) + { + init(1, groupSizeX, 1, 1, workSizeX, 1, 1); + } + + WorkSize(unsigned int groupSizeX, unsigned int groupSizeY, unsigned int workSizeX, unsigned int workSizeY) + { + init(2, groupSizeX, groupSizeY, 1, workSizeX, workSizeY, 1); + } + + WorkSize(unsigned int groupSizeX, unsigned int groupSizeY, unsigned int groupSizeZ,unsigned int workSizeX, unsigned int workSizeY, unsigned int workSizeZ) + { + init(3, groupSizeX, groupSizeY, groupSizeZ, workSizeX, workSizeY, workSizeZ); + } + +#ifdef CUDA_SUPPORT + const dim3 &cuBlockSize() const { + return blockSize; + } + + const dim3 &cuGridSize() const { + return gridSize; + } +#endif 
+ + const size_t *clLocalSize() const { + return localWorkSize; + } + + const size_t *clGlobalSize() const { + return globalWorkSize; + } + + int clWorkDim() const { + return workDims; + } + + private: + void init(int workDims, unsigned int groupSizeX, unsigned int groupSizeY, unsigned int groupSizeZ, unsigned int workSizeX, unsigned int workSizeY, unsigned int workSizeZ) + { + this->workDims = workDims; + + localWorkSize[0] = groupSizeX; + localWorkSize[1] = groupSizeY; + localWorkSize[2] = groupSizeZ; + + workSizeX = gpu::divup(workSizeX, groupSizeX) * groupSizeX; + workSizeY = gpu::divup(workSizeY, groupSizeY) * groupSizeY; + workSizeZ = gpu::divup(workSizeZ, groupSizeZ) * groupSizeZ; + + globalWorkSize[0] = workSizeX; + globalWorkSize[1] = workSizeY; + globalWorkSize[2] = workSizeZ; + +#ifdef CUDA_SUPPORT + blockSize = dim3(groupSizeX, groupSizeY, groupSizeZ); + gridSize = dim3(gpu::divup(workSizeX, groupSizeX), + gpu::divup(workSizeY, groupSizeY), + gpu::divup(workSizeZ, groupSizeZ)); +#endif + } + + private: + size_t localWorkSize[3]; + size_t globalWorkSize[3]; + int workDims; + +#ifdef CUDA_SUPPORT + dim3 blockSize; + dim3 gridSize; +#endif + }; +} \ No newline at end of file diff --git a/libs/utils/CMakeLists.txt b/libs/utils/CMakeLists.txt new file mode 100644 index 0000000..ddf23a3 --- /dev/null +++ b/libs/utils/CMakeLists.txt @@ -0,0 +1,37 @@ +cmake_minimum_required(VERSION 3.1) + +project(libutils) + +set(HEADERS + libutils/fast_random.h + libutils/misc.h + libutils/string_utils.h + libutils/thread_mutex.h + libutils/timer.h + ) + +set(SOURCES + libutils/misc.cpp + libutils/string_utils.cpp + libutils/thread_mutex.cpp + ) + +option(GPU_CUDA_SUPPORT "CUDA support." 
OFF) + +set(CMAKE_CXX_STANDARD 11) + +set(THREADS_PREFER_PTHREAD_FLAG ON) +find_package(Threads REQUIRED) + +if (GPU_CUDA_SUPPORT) + find_package (CUDA REQUIRED) + + add_definitions(-DCUDA_SUPPORT) + + cuda_add_library(${PROJECT_NAME} ${SOURCES} ${HEADERS}) +else() + add_library(${PROJECT_NAME} ${SOURCES} ${HEADERS}) +endif() + +target_link_libraries(${PROJECT_NAME} Threads::Threads libgpu) +target_include_directories(${PROJECT_NAME} PUBLIC ${PROJECT_SOURCE_DIR}) diff --git a/libs/utils/libutils/fast_random.h b/libs/utils/libutils/fast_random.h new file mode 100644 index 0000000..a03a077 --- /dev/null +++ b/libs/utils/libutils/fast_random.h @@ -0,0 +1,38 @@ +#pragma once + +#include + +// See https://stackoverflow.com/a/1640399 +class FastRandom { +public: + FastRandom(unsigned long seed=123456789) { + reset(seed); + } + + void reset(unsigned long seed=123456789) { + x = seed; + y = 362436069; + z = 521288629; + } + + // Returns pseudo-random value in range [min; max] (inclusive) + int next(int min=0, int max=std::numeric_limits::max()) { + x ^= x << 16; + x ^= x >> 5; + x ^= x << 1; + + unsigned long t = x; + x = y; + y = z; + z = t ^ x ^ y; + + return min + (unsigned int) (z % (((unsigned long) max) - min + 1)); + } + + float nextf() { + return (next() * 2000.0f / std::numeric_limits::max()) - 1000.0f; + } + +private: + unsigned long x, y, z; +}; diff --git a/libs/utils/libutils/misc.cpp b/libs/utils/libutils/misc.cpp new file mode 100644 index 0000000..bf29d01 --- /dev/null +++ b/libs/utils/libutils/misc.cpp @@ -0,0 +1,74 @@ +#include "misc.h" + +#ifdef CUDA_SUPPORT +#include +#endif + +void gpu::printDeviceInfo(gpu::Device &device) +{ +#ifdef CUDA_SUPPORT + if (device.supports_cuda) { + int driverVersion = 239; + cudaDriverGetVersion(&driverVersion); + std::cout << "GPU. 
" << device.name << " (CUDA " << driverVersion << ")."; + } else +#endif + { + ocl::DeviceInfo info; + info.init(device.device_id_opencl); + if (info.device_type == CL_DEVICE_TYPE_GPU) { + std::cout << "GPU."; + } else if (info.device_type == CL_DEVICE_TYPE_CPU) { + std::cout << "CPU."; + } else { + throw std::runtime_error( + "Only CPU and GPU supported! But type=" + to_string(info.device_type) + " encountered!"); + } + std::cout << " " << info.device_name << "."; + if (info.device_type == CL_DEVICE_TYPE_CPU) { + std::cout << " " << info.vendor_name << "."; + } + } + + if (device.supportsFreeMemoryQuery()) { + std::cout << " Free memory: " << (device.getFreeMemory() >> 20) << "/" << (device.mem_size >> 20) << " Mb"; + } else { + std::cout << " Total memory: " << (device.mem_size >> 20) << " Mb"; + } + std::cout << std::endl; +} + + +gpu::Device gpu::chooseGPUDevice(int argc, char **argv) +{ + std::vector devices = gpu::enumDevices(); + unsigned int device_index = std::numeric_limits::max(); + + if (devices.size() == 0) { + throw std::runtime_error("No OpenCL devices found!"); + } else { + std::cout << "OpenCL devices:" << std::endl; + for (int i = 0; i < devices.size(); ++i) { + std::cout << " Device #" << i << ": "; + gpu::printDeviceInfo(devices[i]); + } + if (devices.size() == 1) { + device_index = 0; + } else { + if (argc != 2) { + std::cerr << "Usage: " << std::endl; + std::cerr << " Where should be from 0 to " << (devices.size() - 1) << " (inclusive)" << std::endl; + throw std::runtime_error("Illegal arguments!"); + } else { + device_index = atoi(argv[1]); + if (device_index >= devices.size()) { + std::cerr << " should be from 0 to " << (devices.size() - 1) << " (inclusive)! But " << argv[1] << " provided!" 
<< std::endl; + throw std::runtime_error("Illegal arguments!"); + } + } + } + std::cout << "Using device #" << device_index << ": "; + gpu::printDeviceInfo(devices[device_index]); + } + return devices[device_index]; +} diff --git a/libs/utils/libutils/misc.h b/libs/utils/libutils/misc.h new file mode 100644 index 0000000..678ec00 --- /dev/null +++ b/libs/utils/libutils/misc.h @@ -0,0 +1,59 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace gpu { + void printDeviceInfo(gpu::Device &device); + + gpu::Device chooseGPUDevice(int argc, char **argv); +} + +namespace ocl { + + class Kernel { + public: + Kernel() {} + + Kernel(const char *source_code, size_t source_code_length, std::string kernel_name, + std::string defines = std::string()) + { + init(source_code, source_code_length, kernel_name, defines); + } + + void init(const char *source_code, size_t source_code_length, std::string kernel_name, + std::string defines = std::string()) + { + program_ = std::make_shared(source_code, source_code_length, defines); + kernel_ = std::make_shared(program_, kernel_name); + } + + void compile(bool printLog=false) + { + if (!kernel_) + throw std::runtime_error("Null kernel!"); + kernel_->precompile(printLog); + } + + typedef ocl::OpenCLKernel::Arg Arg; + + void exec(const gpu::WorkSize &ws, const Arg &arg0 = Arg(), const Arg &arg1 = Arg(), const Arg &arg2 = Arg(), const Arg &arg3 = Arg(), const Arg &arg4 = Arg(), const Arg &arg5 = Arg(), const Arg &arg6 = Arg(), const Arg &arg7 = Arg(), const Arg &arg8 = Arg(), const Arg &arg9 = Arg(), const Arg &arg10 = Arg(), const Arg &arg11 = Arg(), const Arg &arg12 = Arg(), const Arg &arg13 = Arg(), const Arg &arg14 = Arg(), const Arg &arg15 = Arg(), const Arg &arg16 = Arg(), const Arg &arg17 = Arg(), const Arg &arg18 = Arg(), const Arg &arg19 = Arg(), const Arg &arg20 = Arg(), const Arg &arg21 = Arg(), const Arg &arg22 = Arg(), const Arg &arg23 = Arg(), const Arg &arg24 
= Arg(), const Arg &arg25 = Arg(), const Arg &arg26 = Arg(), const Arg &arg27 = Arg(), const Arg &arg28 = Arg(), const Arg &arg29 = Arg(), const Arg &arg30 = Arg(), const Arg &arg31 = Arg(), const Arg &arg32 = Arg(), const Arg &arg33 = Arg(), const Arg &arg34 = Arg(), const Arg &arg35 = Arg(), const Arg &arg36 = Arg(), const Arg &arg37 = Arg(), const Arg &arg38 = Arg(), const Arg &arg39 = Arg(), const Arg &arg40 = Arg()) + { + if (!kernel_) + throw std::runtime_error("Null kernel!"); + kernel_->exec(ws, arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13, arg14, arg15, arg16, arg17, arg18, arg19, arg20, arg21, arg22, arg23, arg24, arg25, arg26, arg27, arg28, arg29, arg30, arg31, arg32, arg33, arg34, arg35, arg36, arg37, arg38, arg39, arg40); + } + + private: + std::shared_ptr program_; + std::shared_ptr kernel_; + }; +} diff --git a/libs/utils/libutils/string_utils.cpp b/libs/utils/libutils/string_utils.cpp new file mode 100644 index 0000000..ab82898 --- /dev/null +++ b/libs/utils/libutils/string_utils.cpp @@ -0,0 +1,158 @@ +#include "string_utils.h" +#include + +std::vector split(const std::string &string, const std::string &separator, bool keep_empty_parts) +{ + std::vector result; + size_t p = 0; + + while (true) { + size_t s = string.find(separator, p); + if (s == std::string::npos) + break; + std::string token = string.substr(p, s - p); + if (keep_empty_parts || token.size()) + result.push_back(token); + p = s + separator.size(); + } + + std::string token = string.substr(p); + if (keep_empty_parts || token.size()) + result.push_back(token); + return result; +} + +std::string join(const std::vector &tokens, const std::string &separator) +{ + std::string res; + for (size_t i = 0; i < tokens.size(); i++) { + if (i) + res += separator; + res += tokens[i]; + } + return res; +} + +std::istream &getline(std::istream &is, std::string &str) +{ + std::string::size_type nread = 0; + + if (std::istream::sentry(is, true)) { + 
double atof(const std::string &s)
{
	// Locale-independent parse: the classic ("C") locale guarantees '.' as
	// the decimal separator regardless of the process-global locale.
	std::stringstream ss(s);
	ss.imbue(std::locale::classic());

	double value = 0;
	ss >> value;
	return value;
}

int atoi(const std::string &s)
{
	// Locale-independent integer parse; returns 0 on failure (like ::atoi).
	std::stringstream ss(s);
	ss.imbue(std::locale::classic());

	int value = 0;
	ss >> value;
	return value;
}

std::string tolower(const std::string &str)
{
	std::string res = str;
	for (size_t k = 0; k < res.size(); k++) res[k] = ::tolower(res[k]);
	return res;
}

std::string trimmed(const std::string &s)
{
	// Strip leading/trailing spaces only (not tabs/newlines), matching the
	// original contract; an all-space string trims to empty.
	const size_t p1 = s.find_first_not_of(' ');
	const size_t p2 = s.find_last_not_of(' ');

	if (p1 == std::string::npos)
		return std::string();

	return s.substr(p1, p2 - p1 + 1);
}

// base 64 encoding/decoding
// http://stackoverflow.com/questions/180947/base64-decode-snippet-in-c

// Single shared RFC 4648 alphabet keeps encode/decode in sync
// (previously duplicated as two identical 64-char literals).
static const char BASE64_ALPHABET[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

std::string base64_encode(const std::string &in)
{
	std::string out;

	int val = 0, valb = -6;
	for (std::string::const_iterator it = in.begin(); it != in.end(); ++it) {
		unsigned char c = *it;

		val = (val << 8) + c;
		valb += 8;
		while (valb >= 0) {
			out.push_back(BASE64_ALPHABET[(val >> valb) & 0x3F]);
			valb -= 6;
		}
	}

	// Flush the remaining bits when the input length is not a multiple of 3,
	// then pad the output to a multiple of 4 with '='.
	if (valb > -6) out.push_back(BASE64_ALPHABET[((val << 8) >> (valb + 8)) & 0x3F]);
	while (out.size() % 4) out.push_back('=');
	return out;
}

std::string base64_decode(const std::string &in)
{
	std::string out;

	// Reverse lookup table: -1 marks characters outside the alphabet.
	std::vector<int> T(256, -1);
	for (int i = 0; i < 64; i++) T[(unsigned char) BASE64_ALPHABET[i]] = i;

	int val = 0, valb = -8;
	for (std::string::const_iterator it = in.begin(); it != in.end(); ++it) {
		unsigned char c = *it;
		if (isspace(c))
			continue;

		// '=' padding (or any foreign character) terminates the payload.
		if (T[c] == -1)
			break;

		val = (val << 6) + T[c];
		valb += 6;
		if (valb >= 0) {
			out.push_back(char((val >> valb) & 0xFF));
			valb -= 8;
		}
	}

	return out;
}
assert(_mutex.RecursionCount == 1); + ::LeaveCriticalSection(&_mutex); +} + +bool Mutex::tryLock () const +{ + return (::TryEnterCriticalSection(&_mutex) != 0); +} + +#else + +// Posix threads + +#include + +Mutex::Mutex () +{ + int error = ::pthread_mutex_init(&_mutex, 0); + assert(error == 0); +} + +Mutex::~Mutex () +{ + int error = ::pthread_mutex_destroy(&_mutex); + assert(error == 0); +} + +void Mutex::lock() const +{ + int error = ::pthread_mutex_lock(&_mutex); + assert(error == 0); +} + +void Mutex::unlock() const +{ + int error = ::pthread_mutex_unlock(&_mutex); + assert(error == 0); +} + +bool Mutex::tryLock () const +{ + int error = ::pthread_mutex_trylock(&_mutex); + if (error == EBUSY) return false; + assert(error == 0); + return true; +} + +#endif + +MutexPool global_mutexpool; + +MutexPool::MutexPool(size_t size) +{ + size_ = size; + mutexes_ = new MutexPtr[size]; + for (size_t k = 0; k < size; k++) + mutexes_[k] = 0; +} + +MutexPool::~MutexPool() +{ + for (size_t k = 0; k < size_; k++) { + delete mutexes_[k]; + mutexes_[k] = 0; + } + delete[] mutexes_; +} + +MutexPool *MutexPool::instance() +{ + return &global_mutexpool; +} + +Mutex &MutexPool::get(const void *address) +{ + Lock lock(mutex_); + + size_t index = int(((size_t)(void *)(address) >> (sizeof(address) >> 1)) % size_); + +#if MUTEX_POOL_CHECK_FOR_DEADLOCKS + index = 0; +#endif + + Mutex *m = mutexes_[index]; + + if (!m) { + mutexes_[index] = new Mutex; + m = mutexes_[index]; + } + + return *m; +} diff --git a/libs/utils/libutils/thread_mutex.h b/libs/utils/libutils/thread_mutex.h new file mode 100644 index 0000000..25f4e84 --- /dev/null +++ b/libs/utils/libutils/thread_mutex.h @@ -0,0 +1,127 @@ +#pragma once + +#if defined _WIN32 || defined _WIN64 +#ifndef NOMINMAX +#define NOMINMAX +#endif +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif +#include +#else +#include +#endif + +class Lock; + +class Mutex { +public: + Mutex (); + virtual ~Mutex (); + + void lock () const; + 
void unlock () const; + bool tryLock () const; + +private: +#if defined _WIN32 || defined _WIN64 + mutable CRITICAL_SECTION _mutex; +#else + mutable pthread_mutex_t _mutex; +#endif + + void operator = (const Mutex& M); + Mutex (const Mutex& M); + + friend class Lock; +}; + +class Lock { +public: + Lock (const Mutex& m, bool autoLock = true) : _mutex (m), _locked (false) + { + if (autoLock) { + _mutex.lock(); + _locked = true; + } + } + + ~Lock () + { + if (_locked) + _mutex.unlock(); + } + + void acquire () + { + _mutex.lock(); + _locked = true; + } + + void release () + { + _mutex.unlock(); + _locked = false; + } + + bool locked () + { + return _locked; + } + +private: + const Mutex & _mutex; + bool _locked; +}; + +class TryLock { +public: + TryLock (const Mutex& m, bool autoLock = true) : _mutex (m), _locked (false) + { + if (autoLock) + _locked = _mutex.tryLock(); + } + + ~TryLock () + { + if (_locked) + _mutex.unlock(); + } + + bool acquire () + { + _locked = _mutex.tryLock(); + } + + void release () + { + _mutex.unlock(); + _locked = false; + } + + bool locked () + { + return _locked; + } + +private: + const Mutex & _mutex; + bool _locked; +}; + +class MutexPool { +public: + MutexPool(size_t size = 256); + ~MutexPool(); + + Mutex &get(const void *address); + + static MutexPool *instance(); + +private: + typedef Mutex * MutexPtr; + + Mutex mutex_; + MutexPtr * mutexes_; + size_t size_; +}; diff --git a/libs/utils/libutils/timer.h b/libs/utils/libutils/timer.h new file mode 100644 index 0000000..fddf6d1 --- /dev/null +++ b/libs/utils/libutils/timer.h @@ -0,0 +1,161 @@ +#pragma once + +#ifdef _WIN32 +#include +#else +#include +#endif + +#include +#include +#include + +class timer { +protected: +#ifdef _WIN32 + typedef clock_t timer_type; +#else + typedef struct timeval timer_type; +#endif + + double counter_; + timer_type start_; + int is_running_; + + std::vector laps_; + +public: + timer(bool paused = false) + { + counter_ = 0; + is_running_ = 0; + if (!paused) 
+ start(); + } + + void start() + { + if (is_running_) return; + + start_ = measure(); + is_running_ = 1; + } + + void stop() + { + if (!is_running_) return; + + counter_ += diff(start_, measure()); + is_running_ = 0; + } + + double nextLap() + { + double lap_time = elapsed(); + laps_.push_back(lap_time); + restart(); + return lap_time; + } + + void reset() + { + counter_ = 0; + is_running_ = 0; + } + + void restart() + { + reset(); + start(); + } + + double elapsed() const + { + double tm = counter_; + + if (is_running_) + tm += diff(start_, measure()); + + if (tm < 0) + tm = 0; + + return tm; + } + + const std::vector& laps() const + { + return laps_; + } + + // Note that this is not true averaging, if there is at least 5 laps - averaging made from 20% percentile to 80% percentile (See lapsFiltered) + double lapAvg() const + { + std::vector laps = lapsFiltered(); + + double sum = 0.0; + for (int i = 0; i < laps.size(); ++i) { + sum += laps[i]; + } + if (laps.size() > 0) { + sum /= laps.size(); + } + return sum; + } + + // Note that this is not true averaging, if there is at least 5 laps - averaging made from 20% percentile to 80% percentile (See lapsFiltered) + double lapStd() const + { + double avg = lapAvg(); + + std::vector laps = lapsFiltered(); + + double sum2 = 0.0; + for (int i = 0; i < laps.size(); ++i) { + sum2 += laps[i] * laps[i]; + } + if (laps.size() > 0) { + sum2 /= laps.size(); + } + return sqrt(std::max(0.0, sum2 - avg * avg)); + } + +protected: + + std::vector lapsFiltered() const + { + std::vector laps = laps_; + std::sort(laps.begin(), laps.end()); + + unsigned int nlaps = laps.size(); + if (nlaps >= 5) { + // Removing last 20% of measures + laps.erase(laps.end() - nlaps/5, laps.end()); + // Removing first 20% of measures + laps.erase(laps.begin(), laps.begin() + nlaps/5); + } + return laps; + } + + static timer_type measure() + { + timer_type tm; +#ifdef _WIN32 + tm = clock(); +#else + ::gettimeofday(&tm, 0); +#endif + return tm; + } + + 
static double diff(const timer_type &start, const timer_type &end) + { +#ifdef _WIN32 + return (double) (end - start) / (double) CLOCKS_PER_SEC; +#else + long secs = end.tv_sec - start.tv_sec; + long usecs = end.tv_usec - start.tv_usec; + + return (double) secs + (double) usecs / 1000000.0; +#endif + } +}; diff --git a/src/cl/merge_sort.cl b/src/cl/merge_sort.cl new file mode 100644 index 0000000..0b7c27a --- /dev/null +++ b/src/cl/merge_sort.cl @@ -0,0 +1,32 @@ +#ifdef __CLION_IDE__ +#include +#define WORKGROUP_SIZE 256 +#endif + +#line 6 + +__attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1))) +__kernel void merge_sort(__global const float* as, + __global float* bs, + unsigned int n, + unsigned int sorted_chunks_size) +{ + const unsigned int global_index = get_global_id(0); + const unsigned int local_index = get_local_id(0); + + __local float as_local[WORKGROUP_SIZE]; + __local float bs_local[WORKGROUP_SIZE]; + + if (2 * sorted_chunks_size <= WORKGROUP_SIZE) { + as_local[local_index] = (global_index < n) ? 
as[global_index] : FLT_MAX; + barrier(CLK_LOCAL_MEM_FENCE); + if (local_index % (2 * sorted_chunks_size) == 0) { + // Merging two sorted chunks in new one + int i = 0; + int j = 0; + for (int _ = 0; _ < 2 * sorted_chunks_size; ++_) { + + } + } + } +} diff --git a/src/cl/merge_sort_cl.h b/src/cl/merge_sort_cl.h new file mode 100644 index 0000000..8ef7c42 --- /dev/null +++ b/src/cl/merge_sort_cl.h @@ -0,0 +1,49 @@ +#include + +static const char merge_sort_kernel[] = { +0x23, 0x69, 0x66, 0x64, 0x65, 0x66, 0x20, 0x5f, 0x5f, 0x43, 0x4c, 0x49, 0x4f, 0x4e, 0x5f, 0x49, 0x44, 0x45, 0x5f, 0x5f, +0x0a, 0x23, 0x69, 0x6e, 0x63, 0x6c, 0x75, 0x64, 0x65, 0x20, 0x3c, 0x6c, 0x69, 0x62, 0x67, 0x70, 0x75, 0x2f, 0x6f, 0x70, +0x65, 0x6e, 0x63, 0x6c, 0x2f, 0x63, 0x6c, 0x2f, 0x63, 0x6c, 0x69, 0x6f, 0x6e, 0x5f, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x73, 0x2e, 0x63, 0x6c, 0x3e, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x57, 0x4f, 0x52, 0x4b, 0x47, 0x52, +0x4f, 0x55, 0x50, 0x5f, 0x53, 0x49, 0x5a, 0x45, 0x20, 0x32, 0x35, 0x36, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, +0x0a, 0x23, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x36, 0x0a, 0x0a, 0x5f, 0x5f, 0x61, 0x74, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, +0x65, 0x5f, 0x5f, 0x28, 0x28, 0x72, 0x65, 0x71, 0x64, 0x5f, 0x77, 0x6f, 0x72, 0x6b, 0x5f, 0x67, 0x72, 0x6f, 0x75, 0x70, +0x5f, 0x73, 0x69, 0x7a, 0x65, 0x28, 0x57, 0x4f, 0x52, 0x4b, 0x47, 0x52, 0x4f, 0x55, 0x50, 0x5f, 0x53, 0x49, 0x5a, 0x45, +0x2c, 0x20, 0x31, 0x2c, 0x20, 0x31, 0x29, 0x29, 0x29, 0x0a, 0x5f, 0x5f, 0x6b, 0x65, 0x72, 0x6e, 0x65, 0x6c, 0x20, 0x76, +0x6f, 0x69, 0x64, 0x20, 0x6d, 0x65, 0x72, 0x67, 0x65, 0x5f, 0x73, 0x6f, 0x72, 0x74, 0x28, 0x5f, 0x5f, 0x67, 0x6c, 0x6f, +0x62, 0x61, 0x6c, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2a, 0x20, 0x61, 0x73, 0x2c, +0x0a, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x20, 0x5f, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2a, 0x20, 0x62, 0x73, 
0x2c, 0x0a, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, +0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x6e, 0x2c, 0x0a, 0x09, 0x09, 0x09, +0x09, 0x09, 0x09, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x73, 0x6f, 0x72, +0x74, 0x65, 0x64, 0x5f, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x73, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x0a, 0x7b, 0x0a, 0x09, +0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x67, +0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x5f, 0x69, 0x6e, 0x64, 0x65, 0x78, 0x20, 0x3d, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, 0x6c, +0x6f, 0x62, 0x61, 0x6c, 0x5f, 0x69, 0x64, 0x28, 0x30, 0x29, 0x3b, 0x0a, 0x09, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, +0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x69, 0x6e, +0x64, 0x65, 0x78, 0x20, 0x3d, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x69, 0x64, 0x28, 0x30, +0x29, 0x3b, 0x0a, 0x0a, 0x09, 0x5f, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x61, +0x73, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5b, 0x57, 0x4f, 0x52, 0x4b, 0x47, 0x52, 0x4f, 0x55, 0x50, 0x5f, 0x53, 0x49, +0x5a, 0x45, 0x5d, 0x3b, 0x0a, 0x09, 0x5f, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, +0x62, 0x73, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5b, 0x57, 0x4f, 0x52, 0x4b, 0x47, 0x52, 0x4f, 0x55, 0x50, 0x5f, 0x53, +0x49, 0x5a, 0x45, 0x5d, 0x3b, 0x0a, 0x0a, 0x09, 0x69, 0x66, 0x20, 0x28, 0x32, 0x20, 0x2a, 0x20, 0x73, 0x6f, 0x72, 0x74, +0x65, 0x64, 0x5f, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x73, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x3c, 0x3d, 0x20, 0x57, 0x4f, +0x52, 0x4b, 0x47, 0x52, 0x4f, 0x55, 0x50, 0x5f, 0x53, 0x49, 0x5a, 0x45, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x61, 0x73, +0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5b, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x69, 0x6e, 0x64, 0x65, 0x78, 0x5d, 0x20, +0x3d, 0x20, 
0x28, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x5f, 0x69, 0x6e, 0x64, 0x65, 0x78, 0x20, 0x3c, 0x20, 0x6e, 0x29, +0x20, 0x3f, 0x20, 0x61, 0x73, 0x5b, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x5f, 0x69, 0x6e, 0x64, 0x65, 0x78, 0x5d, 0x20, +0x3a, 0x20, 0x46, 0x4c, 0x54, 0x5f, 0x4d, 0x41, 0x58, 0x3b, 0x0a, 0x09, 0x09, 0x62, 0x61, 0x72, 0x72, 0x69, 0x65, 0x72, +0x28, 0x43, 0x4c, 0x4b, 0x5f, 0x4c, 0x4f, 0x43, 0x41, 0x4c, 0x5f, 0x4d, 0x45, 0x4d, 0x5f, 0x46, 0x45, 0x4e, 0x43, 0x45, +0x29, 0x3b, 0x0a, 0x09, 0x09, 0x69, 0x66, 0x20, 0x28, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x69, 0x6e, 0x64, 0x65, 0x78, +0x20, 0x25, 0x20, 0x28, 0x32, 0x20, 0x2a, 0x20, 0x73, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x5f, 0x63, 0x68, 0x75, 0x6e, 0x6b, +0x73, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x20, 0x3d, 0x3d, 0x20, 0x30, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x09, 0x2f, +0x2f, 0x20, 0x4d, 0x65, 0x72, 0x67, 0x69, 0x6e, 0x67, 0x20, 0x74, 0x77, 0x6f, 0x20, 0x73, 0x6f, 0x72, 0x74, 0x65, 0x64, +0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x73, 0x20, 0x69, 0x6e, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x6f, 0x6e, 0x65, 0x0a, 0x09, +0x09, 0x09, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x20, 0x3d, 0x20, 0x30, 0x3b, 0x0a, 0x09, 0x09, 0x09, 0x69, 0x6e, 0x74, 0x20, +0x6a, 0x20, 0x3d, 0x20, 0x30, 0x3b, 0x0a, 0x09, 0x09, 0x09, 0x66, 0x6f, 0x72, 0x20, 0x28, 0x69, 0x6e, 0x74, 0x20, 0x5f, +0x20, 0x3d, 0x20, 0x30, 0x3b, 0x20, 0x5f, 0x20, 0x3c, 0x20, 0x32, 0x20, 0x2a, 0x20, 0x73, 0x6f, 0x72, 0x74, 0x65, 0x64, +0x5f, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x73, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x3b, 0x20, 0x2b, 0x2b, 0x5f, 0x29, 0x20, 0x7b, +0x0a, 0x09, 0x09, 0x09, 0x09, 0x0a, 0x09, 0x09, 0x09, 0x7d, 0x0a, 0x09, 0x09, 0x7d, 0x0a, 0x09, 0x7d, 0x0a, 0x7d, 0x0a, +}; + +size_t merge_sort_kernel_length = sizeof(merge_sort_kernel) / sizeof(char); diff --git a/src/cu/merge_sort.cu b/src/cu/merge_sort.cu new file mode 100644 index 0000000..b0f4dc0 --- /dev/null +++ b/src/cu/merge_sort.cu @@ -0,0 +1,9 @@ +#include + +#include "../cl/merge_sort.cl" + +void cuda_merge_sort(const 
gpu::WorkSize &workSize, cudaStream_t stream, + const float* as, float* bs, unsigned int n, unsigned int sorted_chunks_size) { + merge_sort<<>>(as, bs, n, sorted_chunks_size); + CUDA_CHECK_KERNEL(stream); +} diff --git a/src/defines.h b/src/defines.h new file mode 100644 index 0000000..ada5956 --- /dev/null +++ b/src/defines.h @@ -0,0 +1,8 @@ +#pragma once + +#define MAX_IN_MEMORY_VALUES (16*1024*1024) + +#define PARTS_MERGED_PER_PASS 16 + +// BYTES_PER_BUFFER = 512 KB +#define BYTES_PER_BUFFER ((size_t) (128 * 1024 * sizeof(float))) diff --git a/src/io_utils/buffer_reader.cpp b/src/io_utils/buffer_reader.cpp new file mode 100644 index 0000000..b37eebf --- /dev/null +++ b/src/io_utils/buffer_reader.cpp @@ -0,0 +1,30 @@ +#include "buffer_reader.h" + +#include +#include + + +BufferReader::BufferReader(const std::vector &data) : data(data), offset(0) +{ + +} + +bool BufferReader::isEmpty() +{ + if (offset >= data.size()) { + assert(offset == data.size()); + return true; + } else { + return false; + } +} + +float BufferReader::next() +{ + if (isEmpty()) { + throw std::runtime_error("Empty buffer!"); + } + float value = *((float*) (data.data() + offset)); + offset += sizeof(float); + return value; +} diff --git a/src/io_utils/buffer_reader.h b/src/io_utils/buffer_reader.h new file mode 100644 index 0000000..d416a82 --- /dev/null +++ b/src/io_utils/buffer_reader.h @@ -0,0 +1,17 @@ +#pragma once + +#include +#include + + +class BufferReader { +public: + explicit BufferReader(const std::vector &data); + + float next(); + bool isEmpty(); + +private: + std::vector data; + size_t offset; +}; diff --git a/src/io_utils/buffer_writer.cpp b/src/io_utils/buffer_writer.cpp new file mode 100644 index 0000000..a20236c --- /dev/null +++ b/src/io_utils/buffer_writer.cpp @@ -0,0 +1,44 @@ +#include "buffer_writer.h" + +#include +#include + + +BufferWriter::BufferWriter(size_t size) : data(std::vector(size, 0)), offset(0) +{ + +} + +void BufferWriter::write(float value) +{ + 
assert(!isFull()); + *((float*) (data.data() + offset)) = value; + offset += sizeof(float); +} + +bool BufferWriter::isFull() +{ + if (offset >= data.size()) { + assert(offset == data.size()); + return true; + } else { + return false; + } +} + +char* BufferWriter::ptr() +{ + return data.data(); +} + + +size_t BufferWriter::valuesNumber() +{ + return offset / sizeof(float); +} + +void BufferWriter::clear() +{ + memset(data.data(), 0, data.size()); + offset = 0; +} diff --git a/src/io_utils/buffer_writer.h b/src/io_utils/buffer_writer.h new file mode 100644 index 0000000..a721b8c --- /dev/null +++ b/src/io_utils/buffer_writer.h @@ -0,0 +1,21 @@ +#pragma once + +#include +#include + + +class BufferWriter { +public: + explicit BufferWriter(size_t size); + + void write(float value); + bool isFull(); + + char* ptr(); + size_t valuesNumber(); + void clear(); + +private: + std::vector data; + size_t offset; +}; diff --git a/src/io_utils/file_reader.cpp b/src/io_utils/file_reader.cpp new file mode 100644 index 0000000..3454b84 --- /dev/null +++ b/src/io_utils/file_reader.cpp @@ -0,0 +1,71 @@ +#include "file_reader.h" +#include "../defines.h" + +#include +#include + + +FileReader::FileReader(const std::string &filename) : + filename(filename), + file(std::fstream(filename, std::ios::in | std::ios::binary | std::ios::ate)), + buffer(BufferReader(std::vector())) +{ + ptrdiff_t file_size = file.tellg(); + if (file_size < 0) { + throw std::runtime_error("Failed to get size of file " + filename + "!"); + } + file.seekg(0); + + size = (size_t) file_size; + offset = 0; + assert(size % sizeof(float) == 0); +} + +FileReader::~FileReader() +{ + file.close(); +} + +size_t FileReader::valuesNumber() +{ + return size / sizeof(float); +} + +bool FileReader::isEmpty() +{ + if (offset >= size) { + assert(buffer.isEmpty() && file.tellg() == size); + return true; + } else { + return false; + } +} + +float FileReader::next() +{ + if (isEmpty()) { + throw std::runtime_error("Empty file!"); + } 
+ if (buffer.isEmpty()) { + size_t buffer_size = std::min(BYTES_PER_BUFFER, size - offset); + std::vector data(buffer_size, 0); + file.read(data.data(), buffer_size); + buffer = BufferReader(data); + } + assert(!buffer.isEmpty()); + offset += sizeof(float); + return buffer.next(); +} + +void FileReader::seek(size_t index) +{ + buffer = BufferReader(std::vector()); + offset = index * sizeof(float); + assert(offset < size); + file.seekg(offset); +} + +std::string FileReader::getFilename() +{ + return filename; +} diff --git a/src/io_utils/file_reader.h b/src/io_utils/file_reader.h new file mode 100644 index 0000000..2e3553b --- /dev/null +++ b/src/io_utils/file_reader.h @@ -0,0 +1,25 @@ +#pragma once + +#include "buffer_reader.h" + +#include + + +class FileReader { +public: + explicit FileReader(const std::string &filename); + ~FileReader(); + + size_t valuesNumber(); + bool isEmpty(); + float next(); + void seek(size_t index); + std::string getFilename(); + +private: + const std::string filename; + std::basic_fstream file; + BufferReader buffer; + size_t size; + size_t offset; +}; diff --git a/src/io_utils/file_writer.cpp b/src/io_utils/file_writer.cpp new file mode 100644 index 0000000..b2c56e8 --- /dev/null +++ b/src/io_utils/file_writer.cpp @@ -0,0 +1,42 @@ +#include "file_writer.h" +#include "../defines.h" + +#include + + +FileWriter::FileWriter(const std::string &filename) + : filename(filename), + file(std::fstream(filename, std::ios::out | std::ios::binary)), + buffer(BYTES_PER_BUFFER) +{ + +} + +FileWriter::~FileWriter() +{ + flushBuffer(); + file.close(); +} + +size_t FileWriter::valuesNumber() +{ + return offset / sizeof(float); +} + +void FileWriter::write(float value) +{ + if (buffer.isFull()) { + flushBuffer(); + } + assert(!buffer.isFull()); + buffer.write(value); +} + +void FileWriter::flushBuffer() +{ + size_t values_number = buffer.valuesNumber(); + if (values_number > 0) { + file.write(buffer.ptr(), values_number * sizeof(float)); + buffer.clear(); 
+ } +} diff --git a/src/io_utils/file_writer.h b/src/io_utils/file_writer.h new file mode 100644 index 0000000..9ddcc19 --- /dev/null +++ b/src/io_utils/file_writer.h @@ -0,0 +1,23 @@ +#pragma once + +#include "buffer_writer.h" + +#include + + +class FileWriter { +public: + explicit FileWriter(const std::string &filename); + ~FileWriter(); + + size_t valuesNumber(); + void write(float value); + +private: + void flushBuffer(); + + const std::string filename; + std::basic_fstream file; + BufferWriter buffer; + size_t offset; +}; diff --git a/src/main_generator.cpp b/src/main_generator.cpp new file mode 100644 index 0000000..8173fa3 --- /dev/null +++ b/src/main_generator.cpp @@ -0,0 +1,47 @@ +#include +#include + +#include +#include + +#include "io_utils/file_reader.h" +#include "io_utils/file_writer.h" + + +int main(int argc, char* argv[]) +{ + if (argc != 3) { + std::cout << "Usage: " << argv[0] << " " << std::endl; + return 1; + } + + long int n = strtol(argv[1], nullptr, 10); + if (n <= 0) { + std::cout << "Number of values to generate should be positive integer value, but " << argv[1] << " found!" 
<< std::endl; + return 1; + } + + std::string outputFilename(argv[2]); + std::cout << "Saving " << n << " random floats (" << (n * sizeof(float) / 1024 / 1024) << " MB) to file " << outputFilename << std::endl; + + timer t; + + { + FastRandom r; + FileWriter out(outputFilename); + for (size_t i = 0; i < n; ++i) { + float v = r.nextf(); + out.write(v); + } + } + + double dt = t.elapsed(); + + std::cout << "Done in " << dt << " seconds"; + if (dt > 0.0) { + std::cout << " (" << ((double) n * sizeof(float) / dt / 1024 / 1024) << " MB/s)"; + } + std::cout << std::endl; + + return 0; +} diff --git a/src/main_sorter.cpp b/src/main_sorter.cpp new file mode 100644 index 0000000..10e205c --- /dev/null +++ b/src/main_sorter.cpp @@ -0,0 +1,168 @@ +#include "io_utils/file_reader.h" +#include "defines.h" +#include "io_utils/file_writer.h" + +#include +#include +#include +#include +#include +#include +#include + + +std::string getFilename(const std::string &outputFilename, size_t pass, size_t part_index) +{ + return outputFilename + "." + to_string(pass) + "." + to_string(part_index) + ".tmp"; +} + + +std::string toPercent(double part, double total) +{ + if (total == 0.0) + return "0"; + return to_string((int) std::floor(100.0 * part / total + 0.5)); +} + + +int main(int argc, char* argv[]) +{ + if (argc != 3) { + std::cout << "Usage: " << argv[0] << " " << std::endl; + return 1; + } + + std::string inputFilename(argv[1]); + std::string outputFilename(argv[2]); + + size_t n; + { + FileReader reader(inputFilename); + n = reader.valuesNumber(); + } + std::cout << "Values number: " << n << " (" << (n * sizeof(float) / 1024 / 1024) << " MB)" << std::endl; + + size_t pass = 0; + size_t prevpass_nparts = 0; + size_t prevpass_nvalues = 0; + + timer full_t; + + { + size_t in_core_parts = (n + MAX_IN_MEMORY_VALUES - 1) / MAX_IN_MEMORY_VALUES; + std::cout << "Pass #" << pass << ": sorting part by part in core..." 
<< std::endl; + std::cout << " In core parts number: " << in_core_parts << std::endl; + std::cout << " Limit for values in core: " << MAX_IN_MEMORY_VALUES << " (" << (MAX_IN_MEMORY_VALUES * sizeof(float) / 1024 / 1024) << " MB)" << std::endl; + double reading_time = 0.0; + double sorting_time = 0.0; + double writing_time = 0.0; + timer total_t; + #pragma omp parallel reduction(+:reading_time,sorting_time,writing_time) + { + FileReader reader(inputFilename); + std::vector data(MAX_IN_MEMORY_VALUES, 0.0f); + #pragma omp parallel for schedule(dynamic, 1) + for (size_t part_index = 0; part_index < in_core_parts; ++part_index) { + size_t from = part_index * MAX_IN_MEMORY_VALUES; + size_t to = std::min(n, (part_index + 1) * MAX_IN_MEMORY_VALUES); + + timer reading_t; + { + reader.seek(from); + for (size_t i = from; i < to; ++i) { + data[i - from] = reader.next(); + } + } + reading_time += reading_t.elapsed(); + + timer sorting_t; + // TODO: implement sort on GPU + std::sort(data.begin(), data.begin() + (to - from)); + sorting_time += sorting_t.elapsed(); + + timer writing_t; + { + FileWriter writer(in_core_parts == 1 ? 
outputFilename : getFilename(outputFilename, pass, part_index)); + for (size_t i = 0; i < to - from; ++i) { + writer.write(data[i]); + } + } + writing_time += writing_t.elapsed(); + } + } + double sum_time = reading_time + sorting_time + writing_time; + size_t total_values = 2 * n; + std::cout << " IO: " << (total_values / total_t.elapsed() / 1024 / 1024 * sizeof(float)) << " MB/s" << std::endl; + std::cout << " Finished in " << total_t.elapsed() << " s (" << toPercent(reading_time, sum_time) << "% reading + " << toPercent(sorting_time, sum_time) << "% sorting + " << toPercent(writing_time, sum_time) << "% writing)" << std::endl; + prevpass_nparts = in_core_parts; + prevpass_nvalues = MAX_IN_MEMORY_VALUES; + } + + while (prevpass_nparts > 1) { + size_t prevpass = pass; + ++pass; + size_t merged_parts = (prevpass_nparts + PARTS_MERGED_PER_PASS - 1) / PARTS_MERGED_PER_PASS; + size_t merged_nvalues = prevpass_nvalues * PARTS_MERGED_PER_PASS; + std::cout << "Pass #" << pass << ": merging groups of " << PARTS_MERGED_PER_PASS << " parts..." << std::endl; + std::cout << " Input parts: " << prevpass_nparts << " with " << prevpass_nvalues << " values (" << (prevpass_nvalues * sizeof(float) / 1024 / 1024) << " MB) in each" << std::endl; + + timer total_t; + #pragma omp parallel for schedule(dynamic, 1) + for (size_t part_index = 0; part_index < merged_parts; ++part_index) { + std::vector> readers; + for (size_t i = 0; i < std::min((size_t) PARTS_MERGED_PER_PASS, prevpass_nparts - PARTS_MERGED_PER_PASS * part_index); ++i) { + size_t prevpass_part_index = PARTS_MERGED_PER_PASS * part_index + i; + readers.emplace_back(new FileReader(getFilename(outputFilename, prevpass, prevpass_part_index))); + } + FileWriter writer(merged_parts == 1 ? 
outputFilename : getFilename(outputFilename, pass, part_index)); + const float NO_VALUE = std::numeric_limits::max(); + float min_values[PARTS_MERGED_PER_PASS]; + bool is_empty[PARTS_MERGED_PER_PASS]; + for (size_t i = 0; i < readers.size(); ++i) { + min_values[i] = readers[i]->next(); + is_empty[i] = false; + } + while (true) { + float min = NO_VALUE; + ptrdiff_t min_reader = -1; + for (size_t i = 0; i < readers.size(); ++i) { + if (is_empty[i]) continue; + if (min_reader == -1 || min_values[i] < min) { + min = min_values[i]; + min_reader = i; + } + } + if (min_reader == -1) { + // i.e. all readers are empty + for (size_t i = 0; i < readers.size(); ++i) { + assert(min_values[i] == NO_VALUE); + assert(is_empty[i]); + assert(readers[i]->isEmpty()); + } + break; + } + assert(min_values[min_reader] == min); + if (readers[min_reader]->isEmpty()) { + is_empty[min_reader] = true; + min_values[min_reader] = NO_VALUE; + } else { + min_values[min_reader] = readers[min_reader]->next(); + } + writer.write(min); + } + for (size_t i = 0; i < readers.size(); ++i) { + std::remove(readers[i]->getFilename().c_str()); + } + } + size_t total_values = 2 * n; + std::cout << " IO: " << (total_values / total_t.elapsed() / 1024 / 1024 * sizeof(float)) << " MB/s" << std::endl; + std::cout << " Finished in " << total_t.elapsed() << " s" << std::endl; + + prevpass_nparts = merged_parts; + prevpass_nvalues = merged_nvalues; + } + + std::cout << "Finished in " << full_t.elapsed() << " s" << std::endl; + + return 0; +}