From 2e3e20c0324d07b60586aaef2dec124e2989b39f Mon Sep 17 00:00:00 2001 From: Nikolay Polyarniy Date: Thu, 4 Apr 2019 21:15:12 +0300 Subject: [PATCH] Initial commit --- CMakeLists.txt | 45 + README.md | 34 + libs/CMakeLists.txt | 3 + libs/clew/CL/cl.h | 1003 ++++++++++++ libs/clew/CL/cl_d3d10.h | 131 ++ libs/clew/CL/cl_ext.h | 278 ++++ libs/clew/CL/cl_gl.h | 160 ++ libs/clew/CL/cl_gl_ext.h | 74 + libs/clew/CL/cl_platform.h | 1203 +++++++++++++++ libs/clew/CL/opencl.h | 59 + libs/clew/CMakeLists.txt | 22 + libs/clew/libclew/ocl_init.cpp | 1142 ++++++++++++++ libs/clew/libclew/ocl_init.h | 3 + libs/gpu/CMakeLists.txt | 79 + libs/gpu/LICENSE | 23 + libs/gpu/libgpu/context.cpp | 310 ++++ libs/gpu/libgpu/context.h | 72 + libs/gpu/libgpu/cuda/cu/common.cu | 535 +++++++ libs/gpu/libgpu/cuda/cu/opencl_translator.cu | 96 ++ libs/gpu/libgpu/cuda/cuda_api.cpp | 158 ++ libs/gpu/libgpu/cuda/cuda_api.h | 29 + libs/gpu/libgpu/cuda/enum.cpp | 94 ++ libs/gpu/libgpu/cuda/enum.h | 45 + libs/gpu/libgpu/cuda/sdk/helper_math.h | 1453 ++++++++++++++++++ libs/gpu/libgpu/cuda/utils.cpp | 14 + libs/gpu/libgpu/cuda/utils.h | 76 + libs/gpu/libgpu/device.cpp | 174 +++ libs/gpu/libgpu/device.h | 51 + libs/gpu/libgpu/gold_helpers.cpp | 96 ++ libs/gpu/libgpu/gold_helpers.h | 36 + libs/gpu/libgpu/hexdumparray.cpp | 74 + libs/gpu/libgpu/opencl/cl/c_template.cl | 8 + libs/gpu/libgpu/opencl/cl/clion_defines.cl | 74 + libs/gpu/libgpu/opencl/cl/common.cl | 427 +++++ libs/gpu/libgpu/opencl/device_info.cpp | 204 +++ libs/gpu/libgpu/opencl/device_info.h | 49 + libs/gpu/libgpu/opencl/engine.cpp | 749 +++++++++ libs/gpu/libgpu/opencl/engine.h | 266 ++++ libs/gpu/libgpu/opencl/enum.cpp | 251 +++ libs/gpu/libgpu/opencl/enum.h | 77 + libs/gpu/libgpu/opencl/utils.cpp | 61 + libs/gpu/libgpu/opencl/utils.h | 69 + libs/gpu/libgpu/shared_device_buffer.cpp | 428 ++++++ libs/gpu/libgpu/shared_device_buffer.h | 87 ++ libs/gpu/libgpu/shared_host_buffer.cpp | 206 +++ libs/gpu/libgpu/shared_host_buffer.h | 54 + 
libs/gpu/libgpu/utils.cpp | 121 ++ libs/gpu/libgpu/utils.h | 44 + libs/gpu/libgpu/work_size.h | 84 + libs/utils/CMakeLists.txt | 37 + libs/utils/libutils/fast_random.h | 38 + libs/utils/libutils/misc.cpp | 74 + libs/utils/libutils/misc.h | 59 + libs/utils/libutils/string_utils.cpp | 158 ++ libs/utils/libutils/string_utils.h | 24 + libs/utils/libutils/thread_mutex.cpp | 119 ++ libs/utils/libutils/thread_mutex.h | 127 ++ libs/utils/libutils/timer.h | 161 ++ src/cl/merge_sort.cl | 32 + src/cl/merge_sort_cl.h | 49 + src/cu/merge_sort.cu | 9 + src/defines.h | 8 + src/io_utils/buffer_reader.cpp | 30 + src/io_utils/buffer_reader.h | 17 + src/io_utils/buffer_writer.cpp | 44 + src/io_utils/buffer_writer.h | 21 + src/io_utils/file_reader.cpp | 71 + src/io_utils/file_reader.h | 25 + src/io_utils/file_writer.cpp | 42 + src/io_utils/file_writer.h | 23 + src/main_generator.cpp | 47 + src/main_sorter.cpp | 168 ++ 72 files changed, 12214 insertions(+) create mode 100644 CMakeLists.txt create mode 100644 README.md create mode 100644 libs/CMakeLists.txt create mode 100644 libs/clew/CL/cl.h create mode 100644 libs/clew/CL/cl_d3d10.h create mode 100644 libs/clew/CL/cl_ext.h create mode 100644 libs/clew/CL/cl_gl.h create mode 100644 libs/clew/CL/cl_gl_ext.h create mode 100644 libs/clew/CL/cl_platform.h create mode 100644 libs/clew/CL/opencl.h create mode 100644 libs/clew/CMakeLists.txt create mode 100644 libs/clew/libclew/ocl_init.cpp create mode 100644 libs/clew/libclew/ocl_init.h create mode 100644 libs/gpu/CMakeLists.txt create mode 100644 libs/gpu/LICENSE create mode 100644 libs/gpu/libgpu/context.cpp create mode 100644 libs/gpu/libgpu/context.h create mode 100644 libs/gpu/libgpu/cuda/cu/common.cu create mode 100644 libs/gpu/libgpu/cuda/cu/opencl_translator.cu create mode 100644 libs/gpu/libgpu/cuda/cuda_api.cpp create mode 100644 libs/gpu/libgpu/cuda/cuda_api.h create mode 100644 libs/gpu/libgpu/cuda/enum.cpp create mode 100644 libs/gpu/libgpu/cuda/enum.h create mode 100644 
libs/gpu/libgpu/cuda/sdk/helper_math.h create mode 100644 libs/gpu/libgpu/cuda/utils.cpp create mode 100644 libs/gpu/libgpu/cuda/utils.h create mode 100644 libs/gpu/libgpu/device.cpp create mode 100644 libs/gpu/libgpu/device.h create mode 100644 libs/gpu/libgpu/gold_helpers.cpp create mode 100644 libs/gpu/libgpu/gold_helpers.h create mode 100644 libs/gpu/libgpu/hexdumparray.cpp create mode 100644 libs/gpu/libgpu/opencl/cl/c_template.cl create mode 100644 libs/gpu/libgpu/opencl/cl/clion_defines.cl create mode 100644 libs/gpu/libgpu/opencl/cl/common.cl create mode 100644 libs/gpu/libgpu/opencl/device_info.cpp create mode 100644 libs/gpu/libgpu/opencl/device_info.h create mode 100644 libs/gpu/libgpu/opencl/engine.cpp create mode 100644 libs/gpu/libgpu/opencl/engine.h create mode 100644 libs/gpu/libgpu/opencl/enum.cpp create mode 100644 libs/gpu/libgpu/opencl/enum.h create mode 100644 libs/gpu/libgpu/opencl/utils.cpp create mode 100644 libs/gpu/libgpu/opencl/utils.h create mode 100644 libs/gpu/libgpu/shared_device_buffer.cpp create mode 100644 libs/gpu/libgpu/shared_device_buffer.h create mode 100644 libs/gpu/libgpu/shared_host_buffer.cpp create mode 100644 libs/gpu/libgpu/shared_host_buffer.h create mode 100644 libs/gpu/libgpu/utils.cpp create mode 100644 libs/gpu/libgpu/utils.h create mode 100644 libs/gpu/libgpu/work_size.h create mode 100644 libs/utils/CMakeLists.txt create mode 100644 libs/utils/libutils/fast_random.h create mode 100644 libs/utils/libutils/misc.cpp create mode 100644 libs/utils/libutils/misc.h create mode 100644 libs/utils/libutils/string_utils.cpp create mode 100644 libs/utils/libutils/string_utils.h create mode 100644 libs/utils/libutils/thread_mutex.cpp create mode 100644 libs/utils/libutils/thread_mutex.h create mode 100644 libs/utils/libutils/timer.h create mode 100644 src/cl/merge_sort.cl create mode 100644 src/cl/merge_sort_cl.h create mode 100644 src/cu/merge_sort.cu create mode 100644 src/defines.h create mode 100644 
src/io_utils/buffer_reader.cpp create mode 100644 src/io_utils/buffer_reader.h create mode 100644 src/io_utils/buffer_writer.cpp create mode 100644 src/io_utils/buffer_writer.h create mode 100644 src/io_utils/file_reader.cpp create mode 100644 src/io_utils/file_reader.h create mode 100644 src/io_utils/file_writer.cpp create mode 100644 src/io_utils/file_writer.h create mode 100644 src/main_generator.cpp create mode 100644 src/main_sorter.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..772d1c1 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,45 @@ +cmake_minimum_required(VERSION 3.1) + +project(external_sort) + +set(CMAKE_CXX_STANDARD 11) + +option(GPU_CUDA_SUPPORT "CUDA support." OFF) + +find_package(OpenMP) +if (OpenMP_CXX_FOUND) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") +else() + message(WARNING "OpenMP not found!") +endif() + +add_subdirectory(libs) + +convertIntoHeader(src/cl/merge_sort.cl src/cl/merge_sort_cl.h merge_sort_kernel) + +set(SOURCES + src/cl/merge_sort_cl.h + src/io_utils/buffer_reader.cpp + src/io_utils/buffer_reader.h + src/io_utils/buffer_writer.cpp + src/io_utils/buffer_writer.h + src/io_utils/file_reader.cpp + src/io_utils/file_reader.h + src/io_utils/file_writer.cpp + src/io_utils/file_writer.h +) + +add_executable(input_generator src/main_generator.cpp ${SOURCES}) +target_link_libraries(input_generator libclew libgpu libutils) + +if (GPU_CUDA_SUPPORT) + find_package(CUDA REQUIRED) + add_definitions(-DCUDA_SUPPORT) + set(SOURCES ${SOURCES} src/cu/merge_sort.cu) + cuda_add_executable(${PROJECT_NAME} src/main_sorter.cpp ${SOURCES}) +else() + add_executable(${PROJECT_NAME} src/main_sorter.cpp ${SOURCES}) +endif() + +target_link_libraries(${PROJECT_NAME} libclew libgpu libutils) diff --git a/README.md b/README.md new file mode 100644 index 0000000..136aef1 --- /dev/null +++ b/README.md @@ -0,0 +1,34 @@ +# 
Generating input data + +``` +./input_generator 1000000000 input.data +``` + +``` +Saving 1000000000 random floats (3814 MB) to file input.data +Done in 9.94176 seconds (383.704 MB/s) +``` + +# Sorting + +``` +./external_sort input.data output.data +``` + +``` +Values number: 1000000000 (3814 MB) +Pass #0: sorting part by part in core... + In core parts number: 60 + Limit for values in core: 16777216 (64 MB) + IO: 51.1294 MB/s + Finished in 149.217 s (9% reading + 82% sorting + 10% writing) +Pass #1: merging groups of 16 parts... + Input parts: 60 with 16777216 values (64 MB) in each + IO: 629.504 MB/s + Finished in 12.1199 s +Pass #2: merging groups of 16 parts... + Input parts: 4 with 268435456 values (1024 MB) in each + IO: 343.152 MB/s + Finished in 22.2333 s +Finished in 183.571 s +``` diff --git a/libs/CMakeLists.txt b/libs/CMakeLists.txt new file mode 100644 index 0000000..a050a26 --- /dev/null +++ b/libs/CMakeLists.txt @@ -0,0 +1,3 @@ +add_subdirectory(clew) +add_subdirectory(gpu) +add_subdirectory(utils) diff --git a/libs/clew/CL/cl.h b/libs/clew/CL/cl.h new file mode 100644 index 0000000..e5662fe --- /dev/null +++ b/libs/clew/CL/cl.h @@ -0,0 +1,1003 @@ +/******************************************************************************* + * Copyright (c) 2008-2015 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. 
+ * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + ******************************************************************************/ + +/* $Revision: 11985 $ on $Date: 2010-07-15 11:16:06 -0700 (Thu, 15 Jul 2010) $ */ + +#ifndef __OPENCL_CL_H +#define __OPENCL_CL_H + +#ifdef __APPLE__ +#include +#else +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/******************************************************************************/ + +typedef struct _cl_platform_id * cl_platform_id; +typedef struct _cl_device_id * cl_device_id; +typedef struct _cl_context * cl_context; +typedef struct _cl_command_queue * cl_command_queue; +typedef struct _cl_mem * cl_mem; +typedef struct _cl_program * cl_program; +typedef struct _cl_kernel * cl_kernel; +typedef struct _cl_event * cl_event; +typedef struct _cl_sampler * cl_sampler; + +typedef cl_uint cl_bool; /* WARNING! Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. 
*/ +typedef cl_ulong cl_bitfield; +typedef cl_bitfield cl_device_type; +typedef cl_uint cl_platform_info; +typedef cl_uint cl_device_info; +typedef cl_bitfield cl_device_fp_config; +typedef cl_uint cl_device_mem_cache_type; +typedef cl_uint cl_device_local_mem_type; +typedef cl_bitfield cl_device_exec_capabilities; +typedef cl_bitfield cl_command_queue_properties; + +typedef intptr_t cl_context_properties; +typedef cl_uint cl_context_info; +typedef cl_uint cl_command_queue_info; +typedef cl_uint cl_channel_order; +typedef cl_uint cl_channel_type; +typedef cl_bitfield cl_mem_flags; +typedef cl_uint cl_mem_object_type; +typedef cl_uint cl_mem_info; +typedef cl_uint cl_image_info; +typedef cl_uint cl_buffer_create_type; +typedef cl_uint cl_addressing_mode; +typedef cl_uint cl_filter_mode; +typedef cl_uint cl_sampler_info; +typedef cl_bitfield cl_map_flags; +typedef cl_uint cl_program_info; +typedef cl_uint cl_program_build_info; +typedef cl_int cl_build_status; +typedef cl_uint cl_kernel_info; +typedef cl_uint cl_kernel_work_group_info; +typedef cl_uint cl_event_info; +typedef cl_uint cl_command_type; +typedef cl_uint cl_profiling_info; + +typedef struct _cl_image_format { + cl_channel_order image_channel_order; + cl_channel_type image_channel_data_type; +} cl_image_format; + + +typedef struct _cl_buffer_region { + size_t origin; + size_t size; +} cl_buffer_region; + +/******************************************************************************/ + +/* Error Codes */ +#define CL_SUCCESS 0 +#define CL_DEVICE_NOT_FOUND -1 +#define CL_DEVICE_NOT_AVAILABLE -2 +#define CL_COMPILER_NOT_AVAILABLE -3 +#define CL_MEM_OBJECT_ALLOCATION_FAILURE -4 +#define CL_OUT_OF_RESOURCES -5 +#define CL_OUT_OF_HOST_MEMORY -6 +#define CL_PROFILING_INFO_NOT_AVAILABLE -7 +#define CL_MEM_COPY_OVERLAP -8 +#define CL_IMAGE_FORMAT_MISMATCH -9 +#define CL_IMAGE_FORMAT_NOT_SUPPORTED -10 +#define CL_BUILD_PROGRAM_FAILURE -11 +#define CL_MAP_FAILURE -12 +#define CL_MISALIGNED_SUB_BUFFER_OFFSET -13 
+#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14 + +#define CL_INVALID_VALUE -30 +#define CL_INVALID_DEVICE_TYPE -31 +#define CL_INVALID_PLATFORM -32 +#define CL_INVALID_DEVICE -33 +#define CL_INVALID_CONTEXT -34 +#define CL_INVALID_QUEUE_PROPERTIES -35 +#define CL_INVALID_COMMAND_QUEUE -36 +#define CL_INVALID_HOST_PTR -37 +#define CL_INVALID_MEM_OBJECT -38 +#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39 +#define CL_INVALID_IMAGE_SIZE -40 +#define CL_INVALID_SAMPLER -41 +#define CL_INVALID_BINARY -42 +#define CL_INVALID_BUILD_OPTIONS -43 +#define CL_INVALID_PROGRAM -44 +#define CL_INVALID_PROGRAM_EXECUTABLE -45 +#define CL_INVALID_KERNEL_NAME -46 +#define CL_INVALID_KERNEL_DEFINITION -47 +#define CL_INVALID_KERNEL -48 +#define CL_INVALID_ARG_INDEX -49 +#define CL_INVALID_ARG_VALUE -50 +#define CL_INVALID_ARG_SIZE -51 +#define CL_INVALID_KERNEL_ARGS -52 +#define CL_INVALID_WORK_DIMENSION -53 +#define CL_INVALID_WORK_GROUP_SIZE -54 +#define CL_INVALID_WORK_ITEM_SIZE -55 +#define CL_INVALID_GLOBAL_OFFSET -56 +#define CL_INVALID_EVENT_WAIT_LIST -57 +#define CL_INVALID_EVENT -58 +#define CL_INVALID_OPERATION -59 +#define CL_INVALID_GL_OBJECT -60 +#define CL_INVALID_BUFFER_SIZE -61 +#define CL_INVALID_MIP_LEVEL -62 +#define CL_INVALID_GLOBAL_WORK_SIZE -63 +#define CL_INVALID_PROPERTY -64 + +/* OpenCL Version */ +#define CL_VERSION_1_0 1 +#define CL_VERSION_1_1 1 + +/* cl_bool */ +#define CL_FALSE 0 +#define CL_TRUE 1 + +/* cl_platform_info */ +#define CL_PLATFORM_PROFILE 0x0900 +#define CL_PLATFORM_VERSION 0x0901 +#define CL_PLATFORM_NAME 0x0902 +#define CL_PLATFORM_VENDOR 0x0903 +#define CL_PLATFORM_EXTENSIONS 0x0904 + +/* cl_device_type - bitfield */ +#define CL_DEVICE_TYPE_DEFAULT (1 << 0) +#define CL_DEVICE_TYPE_CPU (1 << 1) +#define CL_DEVICE_TYPE_GPU (1 << 2) +#define CL_DEVICE_TYPE_ACCELERATOR (1 << 3) +#define CL_DEVICE_TYPE_ALL 0xFFFFFFFF + +/* cl_device_info */ +#define CL_DEVICE_TYPE 0x1000 +#define CL_DEVICE_VENDOR_ID 0x1001 +#define 
CL_DEVICE_MAX_COMPUTE_UNITS 0x1002 +#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003 +#define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004 +#define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B +#define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C +#define CL_DEVICE_ADDRESS_BITS 0x100D +#define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E +#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F +#define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010 +#define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011 +#define CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012 +#define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013 +#define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014 +#define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015 +#define CL_DEVICE_IMAGE_SUPPORT 0x1016 +#define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017 +#define CL_DEVICE_MAX_SAMPLERS 0x1018 +#define CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019 +#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A +#define CL_DEVICE_SINGLE_FP_CONFIG 0x101B +#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C +#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D +#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E +#define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F +#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 0x1020 +#define CL_DEVICE_MAX_CONSTANT_ARGS 0x1021 +#define CL_DEVICE_LOCAL_MEM_TYPE 0x1022 +#define CL_DEVICE_LOCAL_MEM_SIZE 0x1023 +#define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024 +#define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025 +#define CL_DEVICE_ENDIAN_LITTLE 0x1026 +#define CL_DEVICE_AVAILABLE 0x1027 +#define CL_DEVICE_COMPILER_AVAILABLE 0x1028 +#define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029 +#define CL_DEVICE_QUEUE_PROPERTIES 0x102A +#define CL_DEVICE_NAME 0x102B +#define CL_DEVICE_VENDOR 0x102C +#define CL_DRIVER_VERSION 0x102D 
+#define CL_DEVICE_PROFILE 0x102E +#define CL_DEVICE_VERSION 0x102F +#define CL_DEVICE_EXTENSIONS 0x1030 +#define CL_DEVICE_PLATFORM 0x1031 +/* 0x1032 reserved for CL_DEVICE_DOUBLE_FP_CONFIG */ +/* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG */ +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF 0x1034 +#define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR 0x1036 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT 0x1037 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT 0x1038 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG 0x1039 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT 0x103A +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE 0x103B +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF 0x103C +#define CL_DEVICE_OPENCL_C_VERSION 0x103D + +/* cl_device_fp_config - bitfield */ +#define CL_FP_DENORM (1 << 0) +#define CL_FP_INF_NAN (1 << 1) +#define CL_FP_ROUND_TO_NEAREST (1 << 2) +#define CL_FP_ROUND_TO_ZERO (1 << 3) +#define CL_FP_ROUND_TO_INF (1 << 4) +#define CL_FP_FMA (1 << 5) +#define CL_FP_SOFT_FLOAT (1 << 6) + +/* cl_device_mem_cache_type */ +#define CL_NONE 0x0 +#define CL_READ_ONLY_CACHE 0x1 +#define CL_READ_WRITE_CACHE 0x2 + +/* cl_device_local_mem_type */ +#define CL_LOCAL 0x1 +#define CL_GLOBAL 0x2 + +/* cl_device_exec_capabilities - bitfield */ +#define CL_EXEC_KERNEL (1 << 0) +#define CL_EXEC_NATIVE_KERNEL (1 << 1) + +/* cl_command_queue_properties - bitfield */ +#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE (1 << 0) +#define CL_QUEUE_PROFILING_ENABLE (1 << 1) + +/* cl_context_info */ +#define CL_CONTEXT_REFERENCE_COUNT 0x1080 +#define CL_CONTEXT_DEVICES 0x1081 +#define CL_CONTEXT_PROPERTIES 0x1082 +#define CL_CONTEXT_NUM_DEVICES 0x1083 + +/* cl_context_info + cl_context_properties */ +#define CL_CONTEXT_PLATFORM 0x1084 + +/* cl_command_queue_info */ +#define CL_QUEUE_CONTEXT 0x1090 +#define CL_QUEUE_DEVICE 0x1091 +#define CL_QUEUE_REFERENCE_COUNT 0x1092 +#define CL_QUEUE_PROPERTIES 0x1093 + +/* cl_mem_flags - bitfield */ +#define CL_MEM_READ_WRITE 
(1 << 0) +#define CL_MEM_WRITE_ONLY (1 << 1) +#define CL_MEM_READ_ONLY (1 << 2) +#define CL_MEM_USE_HOST_PTR (1 << 3) +#define CL_MEM_ALLOC_HOST_PTR (1 << 4) +#define CL_MEM_COPY_HOST_PTR (1 << 5) + +/* cl_channel_order */ +#define CL_R 0x10B0 +#define CL_A 0x10B1 +#define CL_RG 0x10B2 +#define CL_RA 0x10B3 +#define CL_RGB 0x10B4 +#define CL_RGBA 0x10B5 +#define CL_BGRA 0x10B6 +#define CL_ARGB 0x10B7 +#define CL_INTENSITY 0x10B8 +#define CL_LUMINANCE 0x10B9 +#define CL_Rx 0x10BA +#define CL_RGx 0x10BB +#define CL_RGBx 0x10BC + +/* cl_channel_type */ +#define CL_SNORM_INT8 0x10D0 +#define CL_SNORM_INT16 0x10D1 +#define CL_UNORM_INT8 0x10D2 +#define CL_UNORM_INT16 0x10D3 +#define CL_UNORM_SHORT_565 0x10D4 +#define CL_UNORM_SHORT_555 0x10D5 +#define CL_UNORM_INT_101010 0x10D6 +#define CL_SIGNED_INT8 0x10D7 +#define CL_SIGNED_INT16 0x10D8 +#define CL_SIGNED_INT32 0x10D9 +#define CL_UNSIGNED_INT8 0x10DA +#define CL_UNSIGNED_INT16 0x10DB +#define CL_UNSIGNED_INT32 0x10DC +#define CL_HALF_FLOAT 0x10DD +#define CL_FLOAT 0x10DE + +/* cl_mem_object_type */ +#define CL_MEM_OBJECT_BUFFER 0x10F0 +#define CL_MEM_OBJECT_IMAGE2D 0x10F1 +#define CL_MEM_OBJECT_IMAGE3D 0x10F2 + +/* cl_mem_info */ +#define CL_MEM_TYPE 0x1100 +#define CL_MEM_FLAGS 0x1101 +#define CL_MEM_SIZE 0x1102 +#define CL_MEM_HOST_PTR 0x1103 +#define CL_MEM_MAP_COUNT 0x1104 +#define CL_MEM_REFERENCE_COUNT 0x1105 +#define CL_MEM_CONTEXT 0x1106 +#define CL_MEM_ASSOCIATED_MEMOBJECT 0x1107 +#define CL_MEM_OFFSET 0x1108 + +/* cl_image_info */ +#define CL_IMAGE_FORMAT 0x1110 +#define CL_IMAGE_ELEMENT_SIZE 0x1111 +#define CL_IMAGE_ROW_PITCH 0x1112 +#define CL_IMAGE_SLICE_PITCH 0x1113 +#define CL_IMAGE_WIDTH 0x1114 +#define CL_IMAGE_HEIGHT 0x1115 +#define CL_IMAGE_DEPTH 0x1116 + +/* cl_addressing_mode */ +#define CL_ADDRESS_NONE 0x1130 +#define CL_ADDRESS_CLAMP_TO_EDGE 0x1131 +#define CL_ADDRESS_CLAMP 0x1132 +#define CL_ADDRESS_REPEAT 0x1133 +#define CL_ADDRESS_MIRRORED_REPEAT 0x1134 + +/* cl_filter_mode */ +#define 
CL_FILTER_NEAREST 0x1140 +#define CL_FILTER_LINEAR 0x1141 + +/* cl_sampler_info */ +#define CL_SAMPLER_REFERENCE_COUNT 0x1150 +#define CL_SAMPLER_CONTEXT 0x1151 +#define CL_SAMPLER_NORMALIZED_COORDS 0x1152 +#define CL_SAMPLER_ADDRESSING_MODE 0x1153 +#define CL_SAMPLER_FILTER_MODE 0x1154 + +/* cl_map_flags - bitfield */ +#define CL_MAP_READ (1 << 0) +#define CL_MAP_WRITE (1 << 1) + +/* cl_program_info */ +#define CL_PROGRAM_REFERENCE_COUNT 0x1160 +#define CL_PROGRAM_CONTEXT 0x1161 +#define CL_PROGRAM_NUM_DEVICES 0x1162 +#define CL_PROGRAM_DEVICES 0x1163 +#define CL_PROGRAM_SOURCE 0x1164 +#define CL_PROGRAM_BINARY_SIZES 0x1165 +#define CL_PROGRAM_BINARIES 0x1166 + +/* cl_program_build_info */ +#define CL_PROGRAM_BUILD_STATUS 0x1181 +#define CL_PROGRAM_BUILD_OPTIONS 0x1182 +#define CL_PROGRAM_BUILD_LOG 0x1183 + +/* cl_build_status */ +#define CL_BUILD_SUCCESS 0 +#define CL_BUILD_NONE -1 +#define CL_BUILD_ERROR -2 +#define CL_BUILD_IN_PROGRESS -3 + +/* cl_kernel_info */ +#define CL_KERNEL_FUNCTION_NAME 0x1190 +#define CL_KERNEL_NUM_ARGS 0x1191 +#define CL_KERNEL_REFERENCE_COUNT 0x1192 +#define CL_KERNEL_CONTEXT 0x1193 +#define CL_KERNEL_PROGRAM 0x1194 + +/* cl_kernel_work_group_info */ +#define CL_KERNEL_WORK_GROUP_SIZE 0x11B0 +#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE 0x11B1 +#define CL_KERNEL_LOCAL_MEM_SIZE 0x11B2 +#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3 +#define CL_KERNEL_PRIVATE_MEM_SIZE 0x11B4 + +/* cl_event_info */ +#define CL_EVENT_COMMAND_QUEUE 0x11D0 +#define CL_EVENT_COMMAND_TYPE 0x11D1 +#define CL_EVENT_REFERENCE_COUNT 0x11D2 +#define CL_EVENT_COMMAND_EXECUTION_STATUS 0x11D3 +#define CL_EVENT_CONTEXT 0x11D4 + +/* cl_command_type */ +#define CL_COMMAND_NDRANGE_KERNEL 0x11F0 +#define CL_COMMAND_TASK 0x11F1 +#define CL_COMMAND_NATIVE_KERNEL 0x11F2 +#define CL_COMMAND_READ_BUFFER 0x11F3 +#define CL_COMMAND_WRITE_BUFFER 0x11F4 +#define CL_COMMAND_COPY_BUFFER 0x11F5 +#define CL_COMMAND_READ_IMAGE 0x11F6 +#define CL_COMMAND_WRITE_IMAGE 0x11F7 
+#define CL_COMMAND_COPY_IMAGE 0x11F8 +#define CL_COMMAND_COPY_IMAGE_TO_BUFFER 0x11F9 +#define CL_COMMAND_COPY_BUFFER_TO_IMAGE 0x11FA +#define CL_COMMAND_MAP_BUFFER 0x11FB +#define CL_COMMAND_MAP_IMAGE 0x11FC +#define CL_COMMAND_UNMAP_MEM_OBJECT 0x11FD +#define CL_COMMAND_MARKER 0x11FE +#define CL_COMMAND_ACQUIRE_GL_OBJECTS 0x11FF +#define CL_COMMAND_RELEASE_GL_OBJECTS 0x1200 +#define CL_COMMAND_READ_BUFFER_RECT 0x1201 +#define CL_COMMAND_WRITE_BUFFER_RECT 0x1202 +#define CL_COMMAND_COPY_BUFFER_RECT 0x1203 +#define CL_COMMAND_USER 0x1204 + +/* command execution status */ +#define CL_COMPLETE 0x0 +#define CL_RUNNING 0x1 +#define CL_SUBMITTED 0x2 +#define CL_QUEUED 0x3 + +/* cl_buffer_create_type */ +#define CL_BUFFER_CREATE_TYPE_REGION 0x1220 + +/* cl_profiling_info */ +#define CL_PROFILING_COMMAND_QUEUED 0x1280 +#define CL_PROFILING_COMMAND_SUBMIT 0x1281 +#define CL_PROFILING_COMMAND_START 0x1282 +#define CL_PROFILING_COMMAND_END 0x1283 + +/********************************************************************************************************/ + +/* Platform API */ +extern CL_API_ENTRY cl_int CL_API_CALL +clGetPlatformIDs(cl_uint /* num_entries */, + cl_platform_id * /* platforms */, + cl_uint * /* num_platforms */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetPlatformInfo(cl_platform_id /* platform */, + cl_platform_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Device APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceIDs(cl_platform_id /* platform */, + cl_device_type /* device_type */, + cl_uint /* num_entries */, + cl_device_id * /* devices */, + cl_uint * /* num_devices */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceInfo(cl_device_id /* device */, + cl_device_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* 
param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Context APIs */ +extern CL_API_ENTRY cl_context CL_API_CALL +clCreateContext(const cl_context_properties * /* properties */, + cl_uint /* num_devices */, + const cl_device_id * /* devices */, + void (CL_CALLBACK * /* pfn_notify */)(const char *, const void *, size_t, void *), + void * /* user_data */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_context CL_API_CALL +clCreateContextFromType(const cl_context_properties * /* properties */, + cl_device_type /* device_type */, + void (CL_CALLBACK * /* pfn_notify*/ )(const char *, const void *, size_t, void *), + void * /* user_data */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetContextInfo(cl_context /* context */, + cl_context_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Command Queue APIs */ +extern CL_API_ENTRY cl_command_queue CL_API_CALL +clCreateCommandQueue(cl_context /* context */, + cl_device_id /* device */, + cl_command_queue_properties /* properties */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetCommandQueueInfo(cl_command_queue /* command_queue */, + cl_command_queue_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) 
CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS +#warning CL_USE_DEPRECATED_OPENCL_1_0_APIS is defined. These APIs are unsupported and untested in OpenCL 1.1! +/* + * WARNING: + * This API introduces mutable state into the OpenCL implementation. It has been REMOVED + * to better facilitate thread safety. The 1.0 API is not thread safe. It is not tested by the + * OpenCL 1.1 conformance test, and consequently may not work or may not work dependably. + * It is likely to be non-performant. Use of this API is not advised. Use at your own risk. + * + * Software developers previously relying on this API are instructed to set the command queue + * properties when creating the queue, instead. + */ +extern CL_API_ENTRY cl_int CL_API_CALL +clSetCommandQueueProperty(cl_command_queue /* command_queue */, + cl_command_queue_properties /* properties */, + cl_bool /* enable */, + cl_command_queue_properties * /* old_properties */) CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED; +#endif /* CL_USE_DEPRECATED_OPENCL_1_0_APIS */ + +/* Memory Object APIs */ +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateBuffer(cl_context /* context */, + cl_mem_flags /* flags */, + size_t /* size */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateSubBuffer(cl_mem /* buffer */, + cl_mem_flags /* flags */, + cl_buffer_create_type /* buffer_create_type */, + const void * /* buffer_create_info */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateImage2D(cl_context /* context */, + cl_mem_flags /* flags */, + const cl_image_format * /* image_format */, + size_t /* image_width */, + size_t /* image_height */, + size_t /* image_row_pitch */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateImage3D(cl_context /* context */, + cl_mem_flags /* flags */, + const 
cl_image_format * /* image_format */, + size_t /* image_width */, + size_t /* image_height */, + size_t /* image_depth */, + size_t /* image_row_pitch */, + size_t /* image_slice_pitch */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSupportedImageFormats(cl_context /* context */, + cl_mem_flags /* flags */, + cl_mem_object_type /* image_type */, + cl_uint /* num_entries */, + cl_image_format * /* image_formats */, + cl_uint * /* num_image_formats */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetMemObjectInfo(cl_mem /* memobj */, + cl_mem_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetImageInfo(cl_mem /* image */, + cl_image_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetMemObjectDestructorCallback( cl_mem /* memobj */, + void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), + void * /*user_data */ ) CL_API_SUFFIX__VERSION_1_1; + +/* Sampler APIs */ +extern CL_API_ENTRY cl_sampler CL_API_CALL +clCreateSampler(cl_context /* context */, + cl_bool /* normalized_coords */, + cl_addressing_mode /* addressing_mode */, + cl_filter_mode /* filter_mode */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseSampler(cl_sampler /* sampler */) 
CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSamplerInfo(cl_sampler /* sampler */, + cl_sampler_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Program Object APIs */ +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithSource(cl_context /* context */, + cl_uint /* count */, + const char ** /* strings */, + const size_t * /* lengths */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithBinary(cl_context /* context */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const size_t * /* lengths */, + const unsigned char ** /* binaries */, + cl_int * /* binary_status */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clBuildProgram(cl_program /* program */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* options */, + void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), + void * /* user_data */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clUnloadCompiler(void) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetProgramInfo(cl_program /* program */, + cl_program_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetProgramBuildInfo(cl_program /* program */, + cl_device_id /* device */, + cl_program_build_info /* param_name */, + size_t /* param_value_size */, + void * /* 
param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Kernel Object APIs */ +extern CL_API_ENTRY cl_kernel CL_API_CALL +clCreateKernel(cl_program /* program */, + const char * /* kernel_name */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCreateKernelsInProgram(cl_program /* program */, + cl_uint /* num_kernels */, + cl_kernel * /* kernels */, + cl_uint * /* num_kernels_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelArg(cl_kernel /* kernel */, + cl_uint /* arg_index */, + size_t /* arg_size */, + const void * /* arg_value */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelInfo(cl_kernel /* kernel */, + cl_kernel_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelWorkGroupInfo(cl_kernel /* kernel */, + cl_device_id /* device */, + cl_kernel_work_group_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Event Object APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clWaitForEvents(cl_uint /* num_events */, + const cl_event * /* event_list */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetEventInfo(cl_event /* event */, + cl_event_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_event CL_API_CALL +clCreateUserEvent(cl_context /* context */, + cl_int * /* errcode_ret */) 
CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetUserEventStatus(cl_event /* event */, + cl_int /* execution_status */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetEventCallback( cl_event /* event */, + cl_int /* command_exec_callback_type */, + void (CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *), + void * /* user_data */) CL_API_SUFFIX__VERSION_1_1; + +/* Profiling APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clGetEventProfilingInfo(cl_event /* event */, + cl_profiling_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Flush and Finish APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clFlush(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clFinish(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +/* Enqueued Commands APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_read */, + size_t /* offset */, + size_t /* cb */, + void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadBufferRect(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_read */, + const size_t * /* buffer_origin */, + const size_t * /* host_origin */, + const size_t * /* region */, + size_t /* buffer_row_pitch */, + size_t /* buffer_slice_pitch */, + size_t /* host_row_pitch */, + size_t /* host_slice_pitch */, + void * /* ptr */, + cl_uint /* 
num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_write */, + size_t /* offset */, + size_t /* cb */, + const void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteBufferRect(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_write */, + const size_t * /* buffer_origin */, + const size_t * /* host_origin */, + const size_t * /* region */, + size_t /* buffer_row_pitch */, + size_t /* buffer_slice_pitch */, + size_t /* host_row_pitch */, + size_t /* host_slice_pitch */, + const void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBuffer(cl_command_queue /* command_queue */, + cl_mem /* src_buffer */, + cl_mem /* dst_buffer */, + size_t /* src_offset */, + size_t /* dst_offset */, + size_t /* cb */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBufferRect(cl_command_queue /* command_queue */, + cl_mem /* src_buffer */, + cl_mem /* dst_buffer */, + const size_t * /* src_origin */, + const size_t * /* dst_origin */, + const size_t * /* region */, + size_t /* src_row_pitch */, + size_t /* src_slice_pitch */, + size_t /* dst_row_pitch */, + size_t /* dst_slice_pitch */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL 
+clEnqueueReadImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + cl_bool /* blocking_read */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + size_t /* row_pitch */, + size_t /* slice_pitch */, + void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + cl_bool /* blocking_write */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + size_t /* input_row_pitch */, + size_t /* input_slice_pitch */, + const void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyImage(cl_command_queue /* command_queue */, + cl_mem /* src_image */, + cl_mem /* dst_image */, + const size_t * /* src_origin[3] */, + const size_t * /* dst_origin[3] */, + const size_t * /* region[3] */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyImageToBuffer(cl_command_queue /* command_queue */, + cl_mem /* src_image */, + cl_mem /* dst_buffer */, + const size_t * /* src_origin[3] */, + const size_t * /* region[3] */, + size_t /* dst_offset */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBufferToImage(cl_command_queue /* command_queue */, + cl_mem /* src_buffer */, + cl_mem /* dst_image */, + size_t /* src_offset */, + const size_t * /* dst_origin[3] */, + const size_t * /* region[3] */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* 
event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY void * CL_API_CALL +clEnqueueMapBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_map */, + cl_map_flags /* map_flags */, + size_t /* offset */, + size_t /* cb */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY void * CL_API_CALL +clEnqueueMapImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + cl_bool /* blocking_map */, + cl_map_flags /* map_flags */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + size_t * /* image_row_pitch */, + size_t * /* image_slice_pitch */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueUnmapMemObject(cl_command_queue /* command_queue */, + cl_mem /* memobj */, + void * /* mapped_ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueNDRangeKernel(cl_command_queue /* command_queue */, + cl_kernel /* kernel */, + cl_uint /* work_dim */, + const size_t * /* global_work_offset */, + const size_t * /* global_work_size */, + const size_t * /* local_work_size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueTask(cl_command_queue /* command_queue */, + cl_kernel /* kernel */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueNativeKernel(cl_command_queue /* command_queue */, + void 
(CL_CALLBACK *user_func)(void *), + void * /* args */, + size_t /* cb_args */, + cl_uint /* num_mem_objects */, + const cl_mem * /* mem_list */, + const void ** /* args_mem_loc */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueMarker(cl_command_queue /* command_queue */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWaitForEvents(cl_command_queue /* command_queue */, + cl_uint /* num_events */, + const cl_event * /* event_list */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueBarrier(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +/* Extension function access + * + * Returns the extension function address for the given function name, + * or NULL if a valid function can not be found. The client must + * check to make sure the address is not NULL, before using or + * calling the returned function address. + */ +extern CL_API_ENTRY void * CL_API_CALL clGetExtensionFunctionAddress(const char * /* func_name */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_H */ + diff --git a/libs/clew/CL/cl_d3d10.h b/libs/clew/CL/cl_d3d10.h new file mode 100644 index 0000000..a834615 --- /dev/null +++ b/libs/clew/CL/cl_d3d10.h @@ -0,0 +1,131 @@ +/********************************************************************************** + * Copyright (c) 2008-2015 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+ **********************************************************************************/ + +/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ + +#ifndef __OPENCL_CL_D3D10_H +#define __OPENCL_CL_D3D10_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/****************************************************************************** + * cl_khr_d3d10_sharing */ +#define cl_khr_d3d10_sharing 1 + +typedef cl_uint cl_d3d10_device_source_khr; +typedef cl_uint cl_d3d10_device_set_khr; + +/******************************************************************************/ + +/* Error Codes */ +#define CL_INVALID_D3D10_DEVICE_KHR -1002 +#define CL_INVALID_D3D10_RESOURCE_KHR -1003 +#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR -1004 +#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR -1005 + +/* cl_d3d10_device_source_nv */ +#define CL_D3D10_DEVICE_KHR 0x4010 +#define CL_D3D10_DXGI_ADAPTER_KHR 0x4011 + +/* cl_d3d10_device_set_nv */ +#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR 0x4012 +#define CL_ALL_DEVICES_FOR_D3D10_KHR 0x4013 + +/* cl_context_info */ +#define CL_CONTEXT_D3D10_DEVICE_KHR 0x4014 +#define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C + +/* cl_mem_info */ +#define CL_MEM_D3D10_RESOURCE_KHR 0x4015 + +/* cl_image_info */ +#define CL_IMAGE_D3D10_SUBRESOURCE_KHR 0x4016 + +/* cl_command_type */ +#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR 0x4017 +#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR 0x4018 + +/******************************************************************************/ + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)( + cl_platform_id platform, + cl_d3d10_device_source_khr d3d_device_source, + void * d3d_object, + cl_d3d10_device_set_khr d3d_device_set, + cl_uint num_entries, + cl_device_id * devices, + cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)( + cl_context context, + cl_mem_flags flags, + 
ID3D10Buffer * resource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D10Texture2D * resource, + UINT subresource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D10Texture3D * resource, + UINT subresource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_D3D10_H */ + diff --git a/libs/clew/CL/cl_ext.h b/libs/clew/CL/cl_ext.h new file mode 100644 index 0000000..89b4cb7 --- /dev/null +++ b/libs/clew/CL/cl_ext.h @@ -0,0 +1,278 @@ +/******************************************************************************* + * Copyright (c) 2008-2015 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + ******************************************************************************/ + +/* $Revision: 11928 $ on $Date: 2010-07-13 09:04:56 -0700 (Tue, 13 Jul 2010) $ */ + +/* cl_ext.h contains OpenCL extensions which don't have external */ +/* (OpenGL, D3D) dependencies. 
*/ + +#ifndef __CL_EXT_H +#define __CL_EXT_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __APPLE__ + #include + #include +#else + #include +#endif + +/* cl_khr_fp64 extension - no extension #define since it has no functions */ +#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032 + +/* cl_khr_fp16 extension - no extension #define since it has no functions */ +#define CL_DEVICE_HALF_FP_CONFIG 0x1033 + +/* Memory object destruction + * + * Apple extension for use to manage externally allocated buffers used with cl_mem objects with CL_MEM_USE_HOST_PTR + * + * Registers a user callback function that will be called when the memory object is deleted and its resources + * freed. Each call to clSetMemObjectCallbackFn registers the specified user callback function on a callback + * stack associated with memobj. The registered user callback functions are called in the reverse order in + * which they were registered. The user callback functions are called and then the memory object is deleted + * and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be + * notified when the memory referenced by host_ptr, specified when the memory object is created and used as + * the storage bits for the memory object, can be reused or freed. + * + * The application may not call CL api's with the cl_mem object passed to the pfn_notify. + * + * Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS) + * before using. + */ +#define cl_APPLE_SetMemObjectDestructor 1 +cl_int CL_API_ENTRY clSetMemObjectDestructorAPPLE( cl_mem /* memobj */, + void (* /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), + void * /*user_data */ ) CL_EXT_SUFFIX__VERSION_1_0; + + +/* Context Logging Functions + * + * The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext(). 
+ * Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS) + * before using. + * + * clLogMessagesToSystemLog fowards on all log messages to the Apple System Logger + */ +#define cl_APPLE_ContextLoggingFunctions 1 +extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE( const char * /* errstr */, + const void * /* private_info */, + size_t /* cb */, + void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0; + +/* clLogMessagesToStdout sends all log messages to the file descriptor stdout */ +extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE( const char * /* errstr */, + const void * /* private_info */, + size_t /* cb */, + void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0; + +/* clLogMessagesToStderr sends all log messages to the file descriptor stderr */ +extern void CL_API_ENTRY clLogMessagesToStderrAPPLE( const char * /* errstr */, + const void * /* private_info */, + size_t /* cb */, + void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0; + + +/************************ +* cl_khr_icd extension * +************************/ +#define cl_khr_icd 1 + +/* cl_platform_info */ +#define CL_PLATFORM_ICD_SUFFIX_KHR 0x0920 + +/* Additional Error Codes */ +#define CL_PLATFORM_NOT_FOUND_KHR -1001 + +extern CL_API_ENTRY cl_int CL_API_CALL +clIcdGetPlatformIDsKHR(cl_uint /* num_entries */, + cl_platform_id * /* platforms */, + cl_uint * /* num_platforms */); + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clIcdGetPlatformIDsKHR_fn)( + cl_uint /* num_entries */, + cl_platform_id * /* platforms */, + cl_uint * /* num_platforms */); + + +/****************************************** +* cl_nv_device_attribute_query extension * +******************************************/ +/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */ +#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000 +#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001 +#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002 +#define 
CL_DEVICE_WARP_SIZE_NV 0x4003 +#define CL_DEVICE_GPU_OVERLAP_NV 0x4004 +#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005 +#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006 + +/********************************* +* cl_amd_device_attribute_query * +*********************************/ +#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD 0x4036 + +/********************************* +* cl_arm_printf extension +*********************************/ +#define CL_PRINTF_CALLBACK_ARM 0x40B0 +#define CL_PRINTF_BUFFERSIZE_ARM 0x40B1 + +#ifdef CL_VERSION_1_1 + /*********************************** + * cl_ext_device_fission extension * + ***********************************/ + #define cl_ext_device_fission 1 + + extern CL_API_ENTRY cl_int CL_API_CALL + clReleaseDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; + + typedef CL_API_ENTRY cl_int + (CL_API_CALL *clReleaseDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; + + extern CL_API_ENTRY cl_int CL_API_CALL + clRetainDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; + + typedef CL_API_ENTRY cl_int + (CL_API_CALL *clRetainDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; + + typedef cl_ulong cl_device_partition_property_ext; + extern CL_API_ENTRY cl_int CL_API_CALL + clCreateSubDevicesEXT( cl_device_id /*in_device*/, + const cl_device_partition_property_ext * /* properties */, + cl_uint /*num_entries*/, + cl_device_id * /*out_devices*/, + cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1; + + typedef CL_API_ENTRY cl_int + ( CL_API_CALL * clCreateSubDevicesEXT_fn)( cl_device_id /*in_device*/, + const cl_device_partition_property_ext * /* properties */, + cl_uint /*num_entries*/, + cl_device_id * /*out_devices*/, + cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1; + + /* cl_device_partition_property_ext */ + #define CL_DEVICE_PARTITION_EQUALLY_EXT 0x4050 + #define CL_DEVICE_PARTITION_BY_COUNTS_EXT 0x4051 + #define CL_DEVICE_PARTITION_BY_NAMES_EXT 0x4052 + #define 
CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT 0x4053 + + /* clDeviceGetInfo selectors */ + #define CL_DEVICE_PARENT_DEVICE_EXT 0x4054 + #define CL_DEVICE_PARTITION_TYPES_EXT 0x4055 + #define CL_DEVICE_AFFINITY_DOMAINS_EXT 0x4056 + #define CL_DEVICE_REFERENCE_COUNT_EXT 0x4057 + #define CL_DEVICE_PARTITION_STYLE_EXT 0x4058 + + /* error codes */ + #define CL_DEVICE_PARTITION_FAILED_EXT -1057 + #define CL_INVALID_PARTITION_COUNT_EXT -1058 + #define CL_INVALID_PARTITION_NAME_EXT -1059 + + /* CL_AFFINITY_DOMAINs */ + #define CL_AFFINITY_DOMAIN_L1_CACHE_EXT 0x1 + #define CL_AFFINITY_DOMAIN_L2_CACHE_EXT 0x2 + #define CL_AFFINITY_DOMAIN_L3_CACHE_EXT 0x3 + #define CL_AFFINITY_DOMAIN_L4_CACHE_EXT 0x4 + #define CL_AFFINITY_DOMAIN_NUMA_EXT 0x10 + #define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT 0x100 + + /* cl_device_partition_property_ext list terminators */ + #define CL_PROPERTIES_LIST_END_EXT ((cl_device_partition_property_ext) 0) + #define CL_PARTITION_BY_COUNTS_LIST_END_EXT ((cl_device_partition_property_ext) 0) + #define CL_PARTITION_BY_NAMES_LIST_END_EXT ((cl_device_partition_property_ext) 0 - 1) + +/********************************* +* cl_qcom_ext_host_ptr extension +*********************************/ + +#define CL_MEM_EXT_HOST_PTR_QCOM (1 << 29) + +#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM 0x40A0 +#define CL_DEVICE_PAGE_SIZE_QCOM 0x40A1 +#define CL_IMAGE_ROW_ALIGNMENT_QCOM 0x40A2 +#define CL_IMAGE_SLICE_ALIGNMENT_QCOM 0x40A3 +#define CL_MEM_HOST_UNCACHED_QCOM 0x40A4 +#define CL_MEM_HOST_WRITEBACK_QCOM 0x40A5 +#define CL_MEM_HOST_WRITETHROUGH_QCOM 0x40A6 +#define CL_MEM_HOST_WRITE_COMBINING_QCOM 0x40A7 + +typedef cl_uint cl_image_pitch_info_qcom; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceImageInfoQCOM(cl_device_id device, + size_t image_width, + size_t image_height, + const cl_image_format *image_format, + cl_image_pitch_info_qcom param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret); + +typedef struct 
_cl_mem_ext_host_ptr +{ + /* Type of external memory allocation. */ + /* Legal values will be defined in layered extensions. */ + cl_uint allocation_type; + + /* Host cache policy for this external memory allocation. */ + cl_uint host_cache_policy; + +} cl_mem_ext_host_ptr; + +/********************************* +* cl_qcom_ion_host_ptr extension +*********************************/ + +#define CL_MEM_ION_HOST_PTR_QCOM 0x40A8 + +typedef struct _cl_mem_ion_host_ptr +{ + /* Type of external memory allocation. */ + /* Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations. */ + cl_mem_ext_host_ptr ext_host_ptr; + + /* ION file descriptor */ + int ion_filedesc; + + /* Host pointer to the ION allocated memory */ + void* ion_hostptr; + +} cl_mem_ion_host_ptr; + +#endif /* CL_VERSION_1_1 */ + +#ifdef __cplusplus +} +#endif + + +#endif /* __CL_EXT_H */ diff --git a/libs/clew/CL/cl_gl.h b/libs/clew/CL/cl_gl.h new file mode 100644 index 0000000..92d1be2 --- /dev/null +++ b/libs/clew/CL/cl_gl.h @@ -0,0 +1,160 @@ +/********************************************************************************** + * Copyright (c) 2008-2015 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. 
THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + **********************************************************************************/ + +/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ + +/* + * cl_gl.h contains Khronos-approved (KHR) OpenCL extensions which have + * OpenGL dependencies. The application is responsible for #including + * OpenGL or OpenGL ES headers before #including cl_gl.h. 
+ */ + +#ifndef __OPENCL_CL_GL_H +#define __OPENCL_CL_GL_H + +#ifdef __APPLE__ +#include +#include +#else +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef cl_uint cl_gl_object_type; +typedef cl_uint cl_gl_texture_info; +typedef cl_uint cl_gl_platform_info; +typedef struct __GLsync *cl_GLsync; + +/* cl_gl_object_type */ +#define CL_GL_OBJECT_BUFFER 0x2000 +#define CL_GL_OBJECT_TEXTURE2D 0x2001 +#define CL_GL_OBJECT_TEXTURE3D 0x2002 +#define CL_GL_OBJECT_RENDERBUFFER 0x2003 + +/* cl_gl_texture_info */ +#define CL_GL_TEXTURE_TARGET 0x2004 +#define CL_GL_MIPMAP_LEVEL 0x2005 + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromGLBuffer(cl_context /* context */, + cl_mem_flags /* flags */, + cl_GLuint /* bufobj */, + int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromGLTexture2D(cl_context /* context */, + cl_mem_flags /* flags */, + cl_GLenum /* target */, + cl_GLint /* miplevel */, + cl_GLuint /* texture */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromGLTexture3D(cl_context /* context */, + cl_mem_flags /* flags */, + cl_GLenum /* target */, + cl_GLint /* miplevel */, + cl_GLuint /* texture */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromGLRenderbuffer(cl_context /* context */, + cl_mem_flags /* flags */, + cl_GLuint /* renderbuffer */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetGLObjectInfo(cl_mem /* memobj */, + cl_gl_object_type * /* gl_object_type */, + cl_GLuint * /* gl_object_name */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetGLTextureInfo(cl_mem /* memobj */, + cl_gl_texture_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY 
cl_int CL_API_CALL +clEnqueueAcquireGLObjects(cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem * /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReleaseGLObjects(cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem * /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +/* cl_khr_gl_sharing extension */ + +#define cl_khr_gl_sharing 1 + +typedef cl_uint cl_gl_context_info; + +/* Additional Error Codes */ +#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000 + +/* cl_gl_context_info */ +#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006 +#define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007 + +/* Additional cl_context_properties */ +#define CL_GL_CONTEXT_KHR 0x2008 +#define CL_EGL_DISPLAY_KHR 0x2009 +#define CL_GLX_DISPLAY_KHR 0x200A +#define CL_WGL_HDC_KHR 0x200B +#define CL_CGL_SHAREGROUP_KHR 0x200C + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetGLContextInfoKHR(const cl_context_properties * /* properties */, + cl_gl_context_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)( + const cl_context_properties * properties, + cl_gl_context_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret); + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_GL_H */ diff --git a/libs/clew/CL/cl_gl_ext.h b/libs/clew/CL/cl_gl_ext.h new file mode 100644 index 0000000..12ad713 --- /dev/null +++ b/libs/clew/CL/cl_gl_ext.h @@ -0,0 +1,74 @@ +/********************************************************************************** + * Copyright (c) 2008-2015 The 
Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + **********************************************************************************/ + +/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ + +/* cl_gl_ext.h contains vendor (non-KHR) OpenCL extensions which have */ +/* OpenGL dependencies. */ + +#ifndef __OPENCL_CL_GL_EXT_H +#define __OPENCL_CL_GL_EXT_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __APPLE__ + #include <OpenCL/cl_gl.h> +#else + #include <CL/cl_gl.h> +#endif + +/* + * For each extension, follow this template + * /* cl_VEN_extname extension */ +/* #define cl_VEN_extname 1 + * ... define new types, if any + * ... define new tokens, if any + * ...
define new APIs, if any + * + * If you need GLtypes here, mirror them with a cl_GLtype, rather than including a GL header + * This allows us to avoid having to decide whether to include GL headers or GLES here. + */ + +/* + * cl_khr_gl_event extension + * See section 9.9 in the OpenCL 1.1 spec for more information + */ +#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D + +extern CL_API_ENTRY cl_event CL_API_CALL +clCreateEventFromGLsyncKHR(cl_context /* context */, + cl_GLsync /* cl_GLsync */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_GL_EXT_H */ diff --git a/libs/clew/CL/cl_platform.h b/libs/clew/CL/cl_platform.h new file mode 100644 index 0000000..065aca4 --- /dev/null +++ b/libs/clew/CL/cl_platform.h @@ -0,0 +1,1203 @@ +/********************************************************************************** + * Copyright (c) 2008-2015 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. 
THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + **********************************************************************************/ + +/* $Revision: 11803 $ on $Date: 2010-06-25 10:02:12 -0700 (Fri, 25 Jun 2010) $ */ + +#ifndef __CL_PLATFORM_H +#define __CL_PLATFORM_H + +#ifdef __APPLE__ + /* Contains #defines for AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER below */ + #include <AvailabilityMacros.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_WIN32) + #define CL_API_ENTRY + #define CL_API_CALL __stdcall + #define CL_CALLBACK __stdcall +#else + #define CL_API_ENTRY + #define CL_API_CALL + #define CL_CALLBACK +#endif + +#ifdef __APPLE__ + #define CL_EXTENSION_WEAK_LINK __attribute__((weak_import)) + #define CL_API_SUFFIX__VERSION_1_0 AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_0 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER + #define CL_API_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK + #define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER +#else + #define CL_EXTENSION_WEAK_LINK + #define CL_API_SUFFIX__VERSION_1_0 + #define CL_EXT_SUFFIX__VERSION_1_0 + #define CL_API_SUFFIX__VERSION_1_1 + #define CL_EXT_SUFFIX__VERSION_1_1 + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED +#endif + +#if (defined (_WIN32) &&
defined(_MSC_VER)) + +/* scalar types */ +typedef signed __int8 cl_char; +typedef unsigned __int8 cl_uchar; +typedef signed __int16 cl_short; +typedef unsigned __int16 cl_ushort; +typedef signed __int32 cl_int; +typedef unsigned __int32 cl_uint; +typedef signed __int64 cl_long; +typedef unsigned __int64 cl_ulong; + +typedef unsigned __int16 cl_half; +typedef float cl_float; +typedef double cl_double; + +/* Macro names and corresponding values defined by OpenCL */ +#define CL_CHAR_BIT 8 +#define CL_SCHAR_MAX 127 +#define CL_SCHAR_MIN (-127-1) +#define CL_CHAR_MAX CL_SCHAR_MAX +#define CL_CHAR_MIN CL_SCHAR_MIN +#define CL_UCHAR_MAX 255 +#define CL_SHRT_MAX 32767 +#define CL_SHRT_MIN (-32767-1) +#define CL_USHRT_MAX 65535 +#define CL_INT_MAX 2147483647 +#define CL_INT_MIN (-2147483647-1) +#define CL_UINT_MAX 0xffffffffU +#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) +#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) +#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) + +#define CL_FLT_DIG 6 +#define CL_FLT_MANT_DIG 24 +#define CL_FLT_MAX_10_EXP +38 +#define CL_FLT_MAX_EXP +128 +#define CL_FLT_MIN_10_EXP -37 +#define CL_FLT_MIN_EXP -125 +#define CL_FLT_RADIX 2 +#define CL_FLT_MAX 340282346638528859811704183484516925440.0f +#define CL_FLT_MIN 1.175494350822287507969e-38f +#define CL_FLT_EPSILON 0x1.0p-23f + +#define CL_DBL_DIG 15 +#define CL_DBL_MANT_DIG 53 +#define CL_DBL_MAX_10_EXP +308 +#define CL_DBL_MAX_EXP +1024 +#define CL_DBL_MIN_10_EXP -307 +#define CL_DBL_MIN_EXP -1021 +#define CL_DBL_RADIX 2 +#define CL_DBL_MAX 179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0 +#define CL_DBL_MIN 2.225073858507201383090e-308 +#define CL_DBL_EPSILON 2.220446049250313080847e-16 + +#define CL_M_E 
2.718281828459045090796 +#define CL_M_LOG2E 1.442695040888963387005 +#define CL_M_LOG10E 0.434294481903251816668 +#define CL_M_LN2 0.693147180559945286227 +#define CL_M_LN10 2.302585092994045901094 +#define CL_M_PI 3.141592653589793115998 +#define CL_M_PI_2 1.570796326794896557999 +#define CL_M_PI_4 0.785398163397448278999 +#define CL_M_1_PI 0.318309886183790691216 +#define CL_M_2_PI 0.636619772367581382433 +#define CL_M_2_SQRTPI 1.128379167095512558561 +#define CL_M_SQRT2 1.414213562373095145475 +#define CL_M_SQRT1_2 0.707106781186547572737 + +#define CL_M_E_F 2.71828174591064f +#define CL_M_LOG2E_F 1.44269502162933f +#define CL_M_LOG10E_F 0.43429449200630f +#define CL_M_LN2_F 0.69314718246460f +#define CL_M_LN10_F 2.30258512496948f +#define CL_M_PI_F 3.14159274101257f +#define CL_M_PI_2_F 1.57079637050629f +#define CL_M_PI_4_F 0.78539818525314f +#define CL_M_1_PI_F 0.31830987334251f +#define CL_M_2_PI_F 0.63661974668503f +#define CL_M_2_SQRTPI_F 1.12837922573090f +#define CL_M_SQRT2_F 1.41421353816986f +#define CL_M_SQRT1_2_F 0.70710676908493f + +#define CL_NAN (CL_INFINITY - CL_INFINITY) +#define CL_HUGE_VALF ((cl_float) 1e50) +#define CL_HUGE_VAL ((cl_double) 1e500) +#define CL_MAXFLOAT CL_FLT_MAX +#define CL_INFINITY CL_HUGE_VALF + +#else + +#include <stdint.h> + +/* scalar types */ +typedef int8_t cl_char; +typedef uint8_t cl_uchar; +typedef int16_t cl_short __attribute__((aligned(2))); +typedef uint16_t cl_ushort __attribute__((aligned(2))); +typedef int32_t cl_int __attribute__((aligned(4))); +typedef uint32_t cl_uint __attribute__((aligned(4))); +typedef int64_t cl_long __attribute__((aligned(8))); +typedef uint64_t cl_ulong __attribute__((aligned(8))); + +typedef uint16_t cl_half __attribute__((aligned(2))); +typedef float cl_float __attribute__((aligned(4))); +typedef double cl_double __attribute__((aligned(8))); + +/* Macro names and corresponding values defined by OpenCL */ +#define CL_CHAR_BIT 8 +#define CL_SCHAR_MAX 127 +#define CL_SCHAR_MIN (-127-1) +#define
CL_CHAR_MAX CL_SCHAR_MAX +#define CL_CHAR_MIN CL_SCHAR_MIN +#define CL_UCHAR_MAX 255 +#define CL_SHRT_MAX 32767 +#define CL_SHRT_MIN (-32767-1) +#define CL_USHRT_MAX 65535 +#define CL_INT_MAX 2147483647 +#define CL_INT_MIN (-2147483647-1) +#define CL_UINT_MAX 0xffffffffU +#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) +#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) +#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) + +#define CL_FLT_DIG 6 +#define CL_FLT_MANT_DIG 24 +#define CL_FLT_MAX_10_EXP +38 +#define CL_FLT_MAX_EXP +128 +#define CL_FLT_MIN_10_EXP -37 +#define CL_FLT_MIN_EXP -125 +#define CL_FLT_RADIX 2 +#define CL_FLT_MAX 0x1.fffffep127f +#define CL_FLT_MIN 0x1.0p-126f +#define CL_FLT_EPSILON 0x1.0p-23f + +#define CL_DBL_DIG 15 +#define CL_DBL_MANT_DIG 53 +#define CL_DBL_MAX_10_EXP +308 +#define CL_DBL_MAX_EXP +1024 +#define CL_DBL_MIN_10_EXP -307 +#define CL_DBL_MIN_EXP -1021 +#define CL_DBL_RADIX 2 +#define CL_DBL_MAX 0x1.fffffffffffffp1023 +#define CL_DBL_MIN 0x1.0p-1022 +#define CL_DBL_EPSILON 0x1.0p-52 + +#define CL_M_E 2.718281828459045090796 +#define CL_M_LOG2E 1.442695040888963387005 +#define CL_M_LOG10E 0.434294481903251816668 +#define CL_M_LN2 0.693147180559945286227 +#define CL_M_LN10 2.302585092994045901094 +#define CL_M_PI 3.141592653589793115998 +#define CL_M_PI_2 1.570796326794896557999 +#define CL_M_PI_4 0.785398163397448278999 +#define CL_M_1_PI 0.318309886183790691216 +#define CL_M_2_PI 0.636619772367581382433 +#define CL_M_2_SQRTPI 1.128379167095512558561 +#define CL_M_SQRT2 1.414213562373095145475 +#define CL_M_SQRT1_2 0.707106781186547572737 + +#define CL_M_E_F 2.71828174591064f +#define CL_M_LOG2E_F 1.44269502162933f +#define CL_M_LOG10E_F 0.43429449200630f +#define CL_M_LN2_F 0.69314718246460f +#define CL_M_LN10_F 2.30258512496948f +#define CL_M_PI_F 3.14159274101257f +#define CL_M_PI_2_F 1.57079637050629f +#define CL_M_PI_4_F 0.78539818525314f +#define CL_M_1_PI_F 0.31830987334251f +#define CL_M_2_PI_F 
0.63661974668503f +#define CL_M_2_SQRTPI_F 1.12837922573090f +#define CL_M_SQRT2_F 1.41421353816986f +#define CL_M_SQRT1_2_F 0.70710676908493f + +#if defined( __GNUC__ ) + #define CL_HUGE_VALF __builtin_huge_valf() + #define CL_HUGE_VAL __builtin_huge_val() + #define CL_NAN __builtin_nanf( "" ) +#else + #define CL_HUGE_VALF ((cl_float) 1e50) + #define CL_HUGE_VAL ((cl_double) 1e500) + float nanf( const char * ); + #define CL_NAN nanf( "" ) +#endif +#define CL_MAXFLOAT CL_FLT_MAX +#define CL_INFINITY CL_HUGE_VALF + +#endif + +#include <stddef.h> + +/* Mirror types to GL types. Mirror types allow us to avoid deciding which headers to load based on whether we are using GL or GLES here. */ +typedef unsigned int cl_GLuint; +typedef int cl_GLint; +typedef unsigned int cl_GLenum; + +/* + * Vector types + * + * Note: OpenCL requires that all types be naturally aligned. + * This means that vector types must be naturally aligned. + * For example, a vector of four floats must be aligned to + * a 16 byte boundary (calculated as 4 * the natural 4-byte + * alignment of the float). The alignment qualifiers here + * will only function properly if your compiler supports them + * and if you don't actively work to defeat them. For example, + * in order for a cl_float4 to be 16 byte aligned in a struct, + * the start of the struct must itself be 16-byte aligned. + * + * Maintaining proper alignment is the user's responsibility. + */ + +/* Define basic vector types */ +#if defined( __VEC__ ) + #include <altivec.h> /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required.
*/ + typedef vector unsigned char __cl_uchar16; + typedef vector signed char __cl_char16; + typedef vector unsigned short __cl_ushort8; + typedef vector signed short __cl_short8; + typedef vector unsigned int __cl_uint4; + typedef vector signed int __cl_int4; + typedef vector float __cl_float4; + #define __CL_UCHAR16__ 1 + #define __CL_CHAR16__ 1 + #define __CL_USHORT8__ 1 + #define __CL_SHORT8__ 1 + #define __CL_UINT4__ 1 + #define __CL_INT4__ 1 + #define __CL_FLOAT4__ 1 +#endif + +#if defined( __SSE__ ) + #if defined( __MINGW64__ ) + #include <intrin.h> + #else + #include <xmmintrin.h> + #endif + #if defined( __GNUC__ ) + typedef float __cl_float4 __attribute__((vector_size(16))); + #else + typedef __m128 __cl_float4; + #endif + #define __CL_FLOAT4__ 1 +#endif + +#if defined( __SSE2__ ) + #if defined( __MINGW64__ ) + #include <intrin.h> + #else + #include <emmintrin.h> + #endif + #if defined( __GNUC__ ) + typedef cl_uchar __cl_uchar16 __attribute__((vector_size(16))); + typedef cl_char __cl_char16 __attribute__((vector_size(16))); + typedef cl_ushort __cl_ushort8 __attribute__((vector_size(16))); + typedef cl_short __cl_short8 __attribute__((vector_size(16))); + typedef cl_uint __cl_uint4 __attribute__((vector_size(16))); + typedef cl_int __cl_int4 __attribute__((vector_size(16))); + typedef cl_ulong __cl_ulong2 __attribute__((vector_size(16))); + typedef cl_long __cl_long2 __attribute__((vector_size(16))); + typedef cl_double __cl_double2 __attribute__((vector_size(16))); + #else + typedef __m128i __cl_uchar16; + typedef __m128i __cl_char16; + typedef __m128i __cl_ushort8; + typedef __m128i __cl_short8; + typedef __m128i __cl_uint4; + typedef __m128i __cl_int4; + typedef __m128i __cl_ulong2; + typedef __m128i __cl_long2; + typedef __m128d __cl_double2; + #endif + #define __CL_UCHAR16__ 1 + #define __CL_CHAR16__ 1 + #define __CL_USHORT8__ 1 + #define __CL_SHORT8__ 1 + #define __CL_INT4__ 1 + #define __CL_UINT4__ 1 + #define __CL_ULONG2__ 1 + #define __CL_LONG2__ 1 + #define __CL_DOUBLE2__ 1 +#endif + +#if
defined( __MMX__ ) + #include <mmintrin.h> + #if defined( __GNUC__ ) + typedef cl_uchar __cl_uchar8 __attribute__((vector_size(8))); + typedef cl_char __cl_char8 __attribute__((vector_size(8))); + typedef cl_ushort __cl_ushort4 __attribute__((vector_size(8))); + typedef cl_short __cl_short4 __attribute__((vector_size(8))); + typedef cl_uint __cl_uint2 __attribute__((vector_size(8))); + typedef cl_int __cl_int2 __attribute__((vector_size(8))); + typedef cl_ulong __cl_ulong1 __attribute__((vector_size(8))); + typedef cl_long __cl_long1 __attribute__((vector_size(8))); + typedef cl_float __cl_float2 __attribute__((vector_size(8))); + #else + typedef __m64 __cl_uchar8; + typedef __m64 __cl_char8; + typedef __m64 __cl_ushort4; + typedef __m64 __cl_short4; + typedef __m64 __cl_uint2; + typedef __m64 __cl_int2; + typedef __m64 __cl_ulong1; + typedef __m64 __cl_long1; + typedef __m64 __cl_float2; + #endif + #define __CL_UCHAR8__ 1 + #define __CL_CHAR8__ 1 + #define __CL_USHORT4__ 1 + #define __CL_SHORT4__ 1 + #define __CL_INT2__ 1 + #define __CL_UINT2__ 1 + #define __CL_ULONG1__ 1 + #define __CL_LONG1__ 1 + #define __CL_FLOAT2__ 1 +#endif + +#if defined( __AVX__ ) + #if defined( __MINGW64__ ) + #include <intrin.h> + #else + #include <immintrin.h> + #endif + #if defined( __GNUC__ ) + typedef cl_float __cl_float8 __attribute__((vector_size(32))); + typedef cl_double __cl_double4 __attribute__((vector_size(32))); + #else + typedef __m256 __cl_float8; + typedef __m256d __cl_double4; + #endif + #define __CL_FLOAT8__ 1 + #define __CL_DOUBLE4__ 1 +#endif + +/* Define alignment keys */ +#if defined( __GNUC__ ) + #define CL_ALIGNED(_x) __attribute__ ((aligned(_x))) +#elif defined( _WIN32) && (_MSC_VER) + /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements */ + /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx */ + /* #include <crtdefs.h> */ + /* #define CL_ALIGNED(_x) _CRT_ALIGN(_x) */ + #define CL_ALIGNED(_x) +#else + #warning Need to implement some
method to align data here + #define CL_ALIGNED(_x) +#endif + +/* Indicate whether .xyzw, .s0123 and .hi.lo are supported */ +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + /* .xyzw and .s0123...{f|F} are supported */ + #define CL_HAS_NAMED_VECTOR_FIELDS 1 + /* .hi and .lo are supported */ + #define CL_HAS_HI_LO_VECTOR_FIELDS 1 +#endif + +/* Define cl_vector types */ + +/* ---- cl_charn ---- */ +typedef union +{ + cl_char CL_ALIGNED(2) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_char x, y; }; + __extension__ struct{ cl_char s0, s1; }; + __extension__ struct{ cl_char lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2; +#endif +}cl_char2; + +typedef union +{ + cl_char CL_ALIGNED(4) s[4]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_char x, y, z, w; }; + __extension__ struct{ cl_char s0, s1, s2, s3; }; + __extension__ struct{ cl_char2 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[2]; +#endif +#if defined( __CL_CHAR4__) + __cl_char4 v4; +#endif +}cl_char4; + +/* cl_char3 is identical in size, alignment and behavior to cl_char4. See section 6.1.5. */ +typedef cl_char4 cl_char3; + +typedef union +{ + cl_char CL_ALIGNED(8) s[8]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_char x, y, z, w; }; + __extension__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_char4 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[4]; +#endif +#if defined( __CL_CHAR4__) + __cl_char4 v4[2]; +#endif +#if defined( __CL_CHAR8__ ) + __cl_char8 v8; +#endif +}cl_char8; + +typedef union +{ + cl_char CL_ALIGNED(16) s[16]; +#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_char x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_char8 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[8]; +#endif +#if defined( __CL_CHAR4__) + __cl_char4 v4[4]; +#endif +#if defined( __CL_CHAR8__ ) + __cl_char8 v8[2]; +#endif +#if defined( __CL_CHAR16__ ) + __cl_char16 v16; +#endif +}cl_char16; + + +/* ---- cl_ucharn ---- */ +typedef union +{ + cl_uchar CL_ALIGNED(2) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_uchar x, y; }; + __extension__ struct{ cl_uchar s0, s1; }; + __extension__ struct{ cl_uchar lo, hi; }; +#endif +#if defined( __cl_uchar2__) + __cl_uchar2 v2; +#endif +}cl_uchar2; + +typedef union +{ + cl_uchar CL_ALIGNED(4) s[4]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_uchar x, y, z, w; }; + __extension__ struct{ cl_uchar s0, s1, s2, s3; }; + __extension__ struct{ cl_uchar2 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[2]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4; +#endif +}cl_uchar4; + +/* cl_uchar3 is identical in size, alignment and behavior to cl_uchar4. See section 6.1.5. */ +typedef cl_uchar4 cl_uchar3; + +typedef union +{ + cl_uchar CL_ALIGNED(8) s[8]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_uchar x, y, z, w; }; + __extension__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_uchar4 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[4]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4[2]; +#endif +#if defined( __CL_UCHAR8__ ) + __cl_uchar8 v8; +#endif +}cl_uchar8; + +typedef union +{ + cl_uchar CL_ALIGNED(16) s[16]; +#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_uchar x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_uchar8 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[8]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4[4]; +#endif +#if defined( __CL_UCHAR8__ ) + __cl_uchar8 v8[2]; +#endif +#if defined( __CL_UCHAR16__ ) + __cl_uchar16 v16; +#endif +}cl_uchar16; + + +/* ---- cl_shortn ---- */ +typedef union +{ + cl_short CL_ALIGNED(4) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_short x, y; }; + __extension__ struct{ cl_short s0, s1; }; + __extension__ struct{ cl_short lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2; +#endif +}cl_short2; + +typedef union +{ + cl_short CL_ALIGNED(8) s[4]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_short x, y, z, w; }; + __extension__ struct{ cl_short s0, s1, s2, s3; }; + __extension__ struct{ cl_short2 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[2]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4; +#endif +}cl_short4; + +/* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. */ +typedef cl_short4 cl_short3; + +typedef union +{ + cl_short CL_ALIGNED(16) s[8]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_short x, y, z, w; }; + __extension__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_short4 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[4]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4[2]; +#endif +#if defined( __CL_SHORT8__ ) + __cl_short8 v8; +#endif +}cl_short8; + +typedef union +{ + cl_short CL_ALIGNED(32) s[16]; +#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_short x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_short8 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[8]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4[4]; +#endif +#if defined( __CL_SHORT8__ ) + __cl_short8 v8[2]; +#endif +#if defined( __CL_SHORT16__ ) + __cl_short16 v16; +#endif +}cl_short16; + + +/* ---- cl_ushortn ---- */ +typedef union +{ + cl_ushort CL_ALIGNED(4) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_ushort x, y; }; + __extension__ struct{ cl_ushort s0, s1; }; + __extension__ struct{ cl_ushort lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2; +#endif +}cl_ushort2; + +typedef union +{ + cl_ushort CL_ALIGNED(8) s[4]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_ushort x, y, z, w; }; + __extension__ struct{ cl_ushort s0, s1, s2, s3; }; + __extension__ struct{ cl_ushort2 lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2[2]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4; +#endif +}cl_ushort4; + +/* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. */ +typedef cl_ushort4 cl_ushort3; + +typedef union +{ + cl_ushort CL_ALIGNED(16) s[8]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_ushort x, y, z, w; }; + __extension__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_ushort4 lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2[4]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4[2]; +#endif +#if defined( __CL_USHORT8__ ) + __cl_ushort8 v8; +#endif +}cl_ushort8; + +typedef union +{ + cl_ushort CL_ALIGNED(32) s[16]; +#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_ushort x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_ushort8 lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2[8]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4[4]; +#endif +#if defined( __CL_USHORT8__ ) + __cl_ushort8 v8[2]; +#endif +#if defined( __CL_USHORT16__ ) + __cl_ushort16 v16; +#endif +}cl_ushort16; + +/* ---- cl_intn ---- */ +typedef union +{ + cl_int CL_ALIGNED(8) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_int x, y; }; + __extension__ struct{ cl_int s0, s1; }; + __extension__ struct{ cl_int lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2; +#endif +}cl_int2; + +typedef union +{ + cl_int CL_ALIGNED(16) s[4]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_int x, y, z, w; }; + __extension__ struct{ cl_int s0, s1, s2, s3; }; + __extension__ struct{ cl_int2 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[2]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4; +#endif +}cl_int4; + +/* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. */ +typedef cl_int4 cl_int3; + +typedef union +{ + cl_int CL_ALIGNED(32) s[8]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_int x, y, z, w; }; + __extension__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_int4 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[4]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4[2]; +#endif +#if defined( __CL_INT8__ ) + __cl_int8 v8; +#endif +}cl_int8; + +typedef union +{ + cl_int CL_ALIGNED(64) s[16]; +#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_int x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_int8 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[8]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4[4]; +#endif +#if defined( __CL_INT8__ ) + __cl_int8 v8[2]; +#endif +#if defined( __CL_INT16__ ) + __cl_int16 v16; +#endif +}cl_int16; + + +/* ---- cl_uintn ---- */ +typedef union +{ + cl_uint CL_ALIGNED(8) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_uint x, y; }; + __extension__ struct{ cl_uint s0, s1; }; + __extension__ struct{ cl_uint lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2; +#endif +}cl_uint2; + +typedef union +{ + cl_uint CL_ALIGNED(16) s[4]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_uint x, y, z, w; }; + __extension__ struct{ cl_uint s0, s1, s2, s3; }; + __extension__ struct{ cl_uint2 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[2]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4; +#endif +}cl_uint4; + +/* cl_uint3 is identical in size, alignment and behavior to cl_uint4. See section 6.1.5. */ +typedef cl_uint4 cl_uint3; + +typedef union +{ + cl_uint CL_ALIGNED(32) s[8]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_uint x, y, z, w; }; + __extension__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_uint4 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[4]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4[2]; +#endif +#if defined( __CL_UINT8__ ) + __cl_uint8 v8; +#endif +}cl_uint8; + +typedef union +{ + cl_uint CL_ALIGNED(64) s[16]; +#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_uint x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_uint8 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[8]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4[4]; +#endif +#if defined( __CL_UINT8__ ) + __cl_uint8 v8[2]; +#endif +#if defined( __CL_UINT16__ ) + __cl_uint16 v16; +#endif +}cl_uint16; + +/* ---- cl_longn ---- */ +typedef union +{ + cl_long CL_ALIGNED(16) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_long x, y; }; + __extension__ struct{ cl_long s0, s1; }; + __extension__ struct{ cl_long lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2; +#endif +}cl_long2; + +typedef union +{ + cl_long CL_ALIGNED(32) s[4]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_long x, y, z, w; }; + __extension__ struct{ cl_long s0, s1, s2, s3; }; + __extension__ struct{ cl_long2 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[2]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4; +#endif +}cl_long4; + +/* cl_long3 is identical in size, alignment and behavior to cl_long4. See section 6.1.5. */ +typedef cl_long4 cl_long3; + +typedef union +{ + cl_long CL_ALIGNED(64) s[8]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_long x, y, z, w; }; + __extension__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_long4 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[4]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4[2]; +#endif +#if defined( __CL_LONG8__ ) + __cl_long8 v8; +#endif +}cl_long8; + +typedef union +{ + cl_long CL_ALIGNED(128) s[16]; +#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_long x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_long8 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[8]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4[4]; +#endif +#if defined( __CL_LONG8__ ) + __cl_long8 v8[2]; +#endif +#if defined( __CL_LONG16__ ) + __cl_long16 v16; +#endif +}cl_long16; + + +/* ---- cl_ulongn ---- */ +typedef union +{ + cl_ulong CL_ALIGNED(16) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_ulong x, y; }; + __extension__ struct{ cl_ulong s0, s1; }; + __extension__ struct{ cl_ulong lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2; +#endif +}cl_ulong2; + +typedef union +{ + cl_ulong CL_ALIGNED(32) s[4]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_ulong x, y, z, w; }; + __extension__ struct{ cl_ulong s0, s1, s2, s3; }; + __extension__ struct{ cl_ulong2 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[2]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4; +#endif +}cl_ulong4; + +/* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4. See section 6.1.5. */ +typedef cl_ulong4 cl_ulong3; + +typedef union +{ + cl_ulong CL_ALIGNED(64) s[8]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_ulong x, y, z, w; }; + __extension__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_ulong4 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[4]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4[2]; +#endif +#if defined( __CL_ULONG8__ ) + __cl_ulong8 v8; +#endif +}cl_ulong8; + +typedef union +{ + cl_ulong CL_ALIGNED(128) s[16]; +#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_ulong x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_ulong8 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[8]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4[4]; +#endif +#if defined( __CL_ULONG8__ ) + __cl_ulong8 v8[2]; +#endif +#if defined( __CL_ULONG16__ ) + __cl_ulong16 v16; +#endif +}cl_ulong16; + + +/* --- cl_floatn ---- */ + +typedef union +{ + cl_float CL_ALIGNED(8) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_float x, y; }; + __extension__ struct{ cl_float s0, s1; }; + __extension__ struct{ cl_float lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2; +#endif +}cl_float2; + +typedef union +{ + cl_float CL_ALIGNED(16) s[4]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_float x, y, z, w; }; + __extension__ struct{ cl_float s0, s1, s2, s3; }; + __extension__ struct{ cl_float2 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[2]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4; +#endif +}cl_float4; + +/* cl_float3 is identical in size, alignment and behavior to cl_float4. See section 6.1.5. */ +typedef cl_float4 cl_float3; + +typedef union +{ + cl_float CL_ALIGNED(32) s[8]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_float x, y, z, w; }; + __extension__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_float4 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[4]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4[2]; +#endif +#if defined( __CL_FLOAT8__ ) + __cl_float8 v8; +#endif +}cl_float8; + +typedef union +{ + cl_float CL_ALIGNED(64) s[16]; +#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_float x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_float8 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[8]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4[4]; +#endif +#if defined( __CL_FLOAT8__ ) + __cl_float8 v8[2]; +#endif +#if defined( __CL_FLOAT16__ ) + __cl_float16 v16; +#endif +}cl_float16; + +/* --- cl_doublen ---- */ + +typedef union +{ + cl_double CL_ALIGNED(16) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_double x, y; }; + __extension__ struct{ cl_double s0, s1; }; + __extension__ struct{ cl_double lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2; +#endif +}cl_double2; + +typedef union +{ + cl_double CL_ALIGNED(32) s[4]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_double x, y, z, w; }; + __extension__ struct{ cl_double s0, s1, s2, s3; }; + __extension__ struct{ cl_double2 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[2]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4; +#endif +}cl_double4; + +/* cl_double3 is identical in size, alignment and behavior to cl_double4. See section 6.1.5. */ +typedef cl_double4 cl_double3; + +typedef union +{ + cl_double CL_ALIGNED(64) s[8]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_double x, y, z, w; }; + __extension__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_double4 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[4]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4[2]; +#endif +#if defined( __CL_DOUBLE8__ ) + __cl_double8 v8; +#endif +}cl_double8; + +typedef union +{ + cl_double CL_ALIGNED(128) s[16]; +#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_double x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_double8 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[8]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4[4]; +#endif +#if defined( __CL_DOUBLE8__ ) + __cl_double8 v8[2]; +#endif +#if defined( __CL_DOUBLE16__ ) + __cl_double16 v16; +#endif +}cl_double16; + +/* Macro to facilitate debugging + * Usage: + * Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source. + * The first line ends with: CL_PROGRAM_STRING_BEGIN \" + * Each line thereafter of OpenCL C source must end with: \n\ + * The last line ends in "; + * + * Example: + * + * const char *my_program = CL_PROGRAM_STRING_BEGIN "\ + * kernel void foo( int a, float * b ) \n\ + * { \n\ + * // my comment \n\ + * *b[ get_global_id(0)] = a; \n\ + * } \n\ + * "; + * + * This should correctly set up the line, (column) and file information for your source + * string so you can do source level debugging. + */ +#define __CL_STRINGIFY( _x ) # _x +#define _CL_STRINGIFY( _x ) __CL_STRINGIFY( _x ) +#define CL_PROGRAM_STRING_DEBUG_INFO "#line " _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n" + +#ifdef __cplusplus +} +#endif + +#endif /* __CL_PLATFORM_H */ diff --git a/libs/clew/CL/opencl.h b/libs/clew/CL/opencl.h new file mode 100644 index 0000000..9855cd7 --- /dev/null +++ b/libs/clew/CL/opencl.h @@ -0,0 +1,59 @@ +/******************************************************************************* + * Copyright (c) 2008-2015 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+ ******************************************************************************/ + +/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ + +#ifndef __OPENCL_H +#define __OPENCL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __APPLE__ + +#include +#include +#include +#include + +#else + +#include +#include +#include +#include + +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_H */ + diff --git a/libs/clew/CMakeLists.txt b/libs/clew/CMakeLists.txt new file mode 100644 index 0000000..99e7308 --- /dev/null +++ b/libs/clew/CMakeLists.txt @@ -0,0 +1,22 @@ +cmake_minimum_required(VERSION 3.1) + +project(libclew) + +set(HEADERS + CL/cl.h + CL/cl_d3d10.h + CL/cl_ext.h + CL/cl_gl.h + CL/cl_gl_ext.h + CL/cl_platform.h + CL/opencl.h + libclew/ocl_init.h + ) + +set(SOURCES + libclew/ocl_init.cpp + ) + +add_library(${PROJECT_NAME} ${HEADERS} ${SOURCES}) +target_link_libraries (${PROJECT_NAME} ${CMAKE_DL_LIBS}) +target_include_directories(${PROJECT_NAME} PUBLIC ${PROJECT_SOURCE_DIR}) diff --git a/libs/clew/libclew/ocl_init.cpp b/libs/clew/libclew/ocl_init.cpp new file mode 100644 index 0000000..c1e4be1 --- /dev/null +++ b/libs/clew/libclew/ocl_init.cpp @@ -0,0 +1,1142 @@ +#include + +#ifdef _WIN32 + +#include + +typedef HMODULE OclLibrary; + +HMODULE oclLoadLibrary(void) +{ + return LoadLibraryW(L"OpenCL.dll"); +} + +FARPROC oclGetProcAddress(HMODULE hModule, LPCSTR lpProcName) +{ + return ::GetProcAddress(hModule, lpProcName); +} + +#elif defined(__unix__) || defined(__APPLE__) || defined(__MACOSX) + +#include + +typedef void * OclLibrary; + +OclLibrary oclLoadLibrary(void) +{ +#if defined(__APPLE__) || defined(__MACOSX) + return dlopen("/System/Library/Frameworks/OpenCL.framework/Versions/Current/OpenCL", RTLD_NOW); +#else + OclLibrary lib = dlopen("libOpenCL.so", RTLD_NOW); + if (!lib) { + lib = dlopen("libOpenCL.so.1", RTLD_NOW); + } + return lib; +#endif +} + +void *oclGetProcAddress(void *handle, const char *symbol) +{ + 
return dlsym(handle, symbol); +} + +#else +#error unsupported platform +#endif + +// Platform API + +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clGetPlatformIDs) (cl_uint, cl_platform_id *, cl_uint *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clGetPlatformInfo) (cl_platform_id, cl_platform_info, size_t, void *, size_t *); + +// Device APIs + +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clGetDeviceIDs) (cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clGetDeviceInfo) (cl_device_id, cl_device_info, size_t, void *, size_t *); + +// Context APIs + +typedef cl_context (CL_API_ENTRY CL_API_CALL * p_pfn_clCreateContext) (const cl_context_properties *, cl_uint, const cl_device_id *, void (CL_CALLBACK *)(const char *, const void *, size_t, void *), void *, cl_int *); +typedef cl_context (CL_API_ENTRY CL_API_CALL * p_pfn_clCreateContextFromType) (const cl_context_properties *, cl_device_type, void (CL_CALLBACK *)(const char *, const void *, size_t, void *), void *, cl_int *); + +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clRetainContext) (cl_context); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clReleaseContext) (cl_context); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clGetContextInfo) (cl_context, cl_context_info, size_t, void *, size_t *); + +// Command Queue APIs + +typedef cl_command_queue (CL_API_ENTRY CL_API_CALL * p_pfn_clCreateCommandQueue) (cl_context, cl_device_id, cl_command_queue_properties, cl_int *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clRetainCommandQueue) (cl_command_queue); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clReleaseCommandQueue) (cl_command_queue); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clGetCommandQueueInfo) (cl_command_queue, cl_command_queue_info, size_t, void *, size_t *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clSetCommandQueueProperty) (cl_command_queue, cl_command_queue_properties, cl_bool, 
cl_command_queue_properties *); + +// Memory Object APIs + +typedef cl_mem (CL_API_ENTRY CL_API_CALL * p_pfn_clCreateBuffer) (cl_context, cl_mem_flags, size_t, void *, cl_int *); +typedef cl_mem (CL_API_ENTRY CL_API_CALL * p_pfn_clCreateImage2D) (cl_context, cl_mem_flags, const cl_image_format *, size_t, size_t, size_t, void *, cl_int *); +typedef cl_mem (CL_API_ENTRY CL_API_CALL * p_pfn_clCreateImage3D) (cl_context, cl_mem_flags, const cl_image_format *, size_t, size_t, size_t, size_t, size_t, void *, cl_int *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clRetainMemObject) (cl_mem); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clReleaseMemObject) (cl_mem); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clGetSupportedImageFormats) (cl_context, cl_mem_flags, cl_mem_object_type, cl_uint, cl_image_format *, cl_uint *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clGetMemObjectInfo) (cl_mem, cl_mem_info, size_t, void *, size_t *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clGetImageInfo) (cl_mem, cl_image_info, size_t, void *, size_t *); + +// Sampler APIs + +typedef cl_sampler (CL_API_ENTRY CL_API_CALL * p_pfn_clCreateSampler) (cl_context, cl_bool, cl_addressing_mode, cl_filter_mode, cl_int *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clRetainSampler) (cl_sampler); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clReleaseSampler) (cl_sampler); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clGetSamplerInfo) (cl_sampler, cl_sampler_info, size_t, void *, size_t *); + +// Program Object APIs + +typedef cl_program (CL_API_ENTRY CL_API_CALL * p_pfn_clCreateProgramWithSource) (cl_context, cl_uint, const char **, const size_t *, cl_int *); +typedef cl_program (CL_API_ENTRY CL_API_CALL * p_pfn_clCreateProgramWithBinary) (cl_context, cl_uint, const cl_device_id *, const size_t *, const unsigned char **, cl_int *, cl_int *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clRetainProgram) (cl_program); +typedef cl_int (CL_API_ENTRY 
CL_API_CALL * p_pfn_clReleaseProgram) (cl_program); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clBuildProgram) (cl_program, cl_uint, const cl_device_id *, const char *, void (CL_CALLBACK *)(cl_program, void *), void *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clUnloadCompiler) (void); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clGetProgramInfo) (cl_program, cl_program_info, size_t, void *, size_t *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clGetProgramBuildInfo) (cl_program, cl_device_id, cl_program_build_info, size_t, void *, size_t *); + +// Kernel Object APIs + +typedef cl_kernel (CL_API_ENTRY CL_API_CALL * p_pfn_clCreateKernel) (cl_program, const char *, cl_int *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clCreateKernelsInProgram) (cl_program, cl_uint, cl_kernel *, cl_uint *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clRetainKernel) (cl_kernel); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clReleaseKernel) (cl_kernel); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clSetKernelArg) (cl_kernel, cl_uint, size_t, const void *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clGetKernelInfo) (cl_kernel, cl_kernel_info, size_t, void *, size_t *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clGetKernelWorkGroupInfo) (cl_kernel, cl_device_id, cl_kernel_work_group_info, size_t, void *, size_t *); + +// Event Object APIs + +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clWaitForEvents) (cl_uint, const cl_event *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clGetEventInfo) (cl_event, cl_event_info, size_t, void *, size_t *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clRetainEvent) (cl_event); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clReleaseEvent) (cl_event); + +// Profiling APIs + +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clGetEventProfilingInfo) (cl_event, cl_profiling_info, size_t, void *, size_t *); + +// Flush and Finish APIs + +typedef cl_int (CL_API_ENTRY 
CL_API_CALL * p_pfn_clFlush) (cl_command_queue); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clFinish) (cl_command_queue); + +// Enqueued Commands APIs + +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueReadBuffer) (cl_command_queue, cl_mem, cl_bool, size_t, size_t, void *, cl_uint, const cl_event *, cl_event *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueReadBufferRect) (cl_command_queue, cl_mem, cl_bool, const size_t *, const size_t *, const size_t *, size_t, size_t, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueWriteBuffer) (cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueWriteBufferRect) (cl_command_queue, cl_mem, cl_bool, const size_t *, const size_t *, const size_t *, size_t, size_t, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueCopyBuffer) (cl_command_queue, cl_mem, cl_mem, size_t, size_t, size_t, cl_uint, const cl_event *, cl_event *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueReadImage) (cl_command_queue, cl_mem, cl_bool, const size_t * [], const size_t * [], size_t, size_t, void *, cl_uint, const cl_event *, cl_event *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueWriteImage) (cl_command_queue, cl_mem, cl_bool, const size_t * [], const size_t * [], size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueCopyImage) (cl_command_queue, cl_mem, cl_mem, const size_t * [], const size_t * [], const size_t * [], cl_uint, const cl_event *, cl_event *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueCopyImageToBuffer) (cl_command_queue, cl_mem, cl_mem, const size_t * [], const size_t * [], size_t, cl_uint, const cl_event *, cl_event *); 
+typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueCopyBufferToImage) (cl_command_queue, cl_mem, cl_mem, size_t, const size_t * [], const size_t * [], cl_uint, const cl_event *, cl_event *); +typedef void * (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueMapBuffer) (cl_command_queue, cl_mem, cl_bool, cl_map_flags, size_t, size_t, cl_uint, const cl_event *, cl_event *, cl_int *); +typedef void * (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueMapImage) (cl_command_queue, cl_mem, cl_bool, cl_map_flags, const size_t *, const size_t *, size_t *, size_t *, cl_uint, const cl_event *, cl_event *, cl_int *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueUnmapMemObject) (cl_command_queue, cl_mem, void *, cl_uint, const cl_event *, cl_event *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueNDRangeKernel) (cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueTask) (cl_command_queue, cl_kernel, cl_uint, const cl_event *, cl_event *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueNativeKernel) (cl_command_queue, void (CL_CALLBACK *)(void *), void *, size_t, cl_uint, const cl_mem *, const void **, cl_uint, const cl_event *, cl_event *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueMarker) (cl_command_queue, cl_event *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueWaitForEvents) (cl_command_queue, cl_uint, const cl_event *); +typedef cl_int (CL_API_ENTRY CL_API_CALL * p_pfn_clEnqueueBarrier) (cl_command_queue); + +// Extension function access +// +// Returns the extension function address for the given function name, +// or NULL if a valid function can not be found. The client must +// check to make sure the address is not NULL, before using or +// calling the returned function address. 
+// + +typedef void * (CL_API_ENTRY CL_API_CALL * p_pfn_clGetExtensionFunctionAddress)(const char *); + +p_pfn_clGetPlatformIDs pfn_clGetPlatformIDs = 0; +p_pfn_clGetPlatformInfo pfn_clGetPlatformInfo = 0; +p_pfn_clGetDeviceIDs pfn_clGetDeviceIDs = 0; +p_pfn_clGetDeviceInfo pfn_clGetDeviceInfo = 0; +p_pfn_clCreateContext pfn_clCreateContext = 0; +p_pfn_clCreateContextFromType pfn_clCreateContextFromType = 0; +p_pfn_clRetainContext pfn_clRetainContext = 0; +p_pfn_clReleaseContext pfn_clReleaseContext = 0; +p_pfn_clGetContextInfo pfn_clGetContextInfo = 0; +p_pfn_clCreateCommandQueue pfn_clCreateCommandQueue = 0; +p_pfn_clRetainCommandQueue pfn_clRetainCommandQueue = 0; +p_pfn_clReleaseCommandQueue pfn_clReleaseCommandQueue = 0; +p_pfn_clGetCommandQueueInfo pfn_clGetCommandQueueInfo = 0; +p_pfn_clSetCommandQueueProperty pfn_clSetCommandQueueProperty = 0; +p_pfn_clCreateBuffer pfn_clCreateBuffer = 0; +p_pfn_clCreateImage2D pfn_clCreateImage2D = 0; +p_pfn_clCreateImage3D pfn_clCreateImage3D = 0; +p_pfn_clRetainMemObject pfn_clRetainMemObject = 0; +p_pfn_clReleaseMemObject pfn_clReleaseMemObject = 0; +p_pfn_clGetSupportedImageFormats pfn_clGetSupportedImageFormats = 0; +p_pfn_clGetMemObjectInfo pfn_clGetMemObjectInfo = 0; +p_pfn_clGetImageInfo pfn_clGetImageInfo = 0; +p_pfn_clCreateSampler pfn_clCreateSampler = 0; +p_pfn_clRetainSampler pfn_clRetainSampler = 0; +p_pfn_clReleaseSampler pfn_clReleaseSampler = 0; +p_pfn_clGetSamplerInfo pfn_clGetSamplerInfo = 0; +p_pfn_clCreateProgramWithSource pfn_clCreateProgramWithSource = 0; +p_pfn_clCreateProgramWithBinary pfn_clCreateProgramWithBinary = 0; +p_pfn_clRetainProgram pfn_clRetainProgram = 0; +p_pfn_clReleaseProgram pfn_clReleaseProgram = 0; +p_pfn_clBuildProgram pfn_clBuildProgram = 0; +p_pfn_clUnloadCompiler pfn_clUnloadCompiler = 0; +p_pfn_clGetProgramInfo pfn_clGetProgramInfo = 0; +p_pfn_clGetProgramBuildInfo pfn_clGetProgramBuildInfo = 0; +p_pfn_clCreateKernel pfn_clCreateKernel = 0; +p_pfn_clCreateKernelsInProgram 
pfn_clCreateKernelsInProgram = 0; +p_pfn_clRetainKernel pfn_clRetainKernel = 0; +p_pfn_clReleaseKernel pfn_clReleaseKernel = 0; +p_pfn_clSetKernelArg pfn_clSetKernelArg = 0; +p_pfn_clGetKernelInfo pfn_clGetKernelInfo = 0; +p_pfn_clGetKernelWorkGroupInfo pfn_clGetKernelWorkGroupInfo = 0; +p_pfn_clWaitForEvents pfn_clWaitForEvents = 0; +p_pfn_clGetEventInfo pfn_clGetEventInfo = 0; +p_pfn_clRetainEvent pfn_clRetainEvent = 0; +p_pfn_clReleaseEvent pfn_clReleaseEvent = 0; +p_pfn_clGetEventProfilingInfo pfn_clGetEventProfilingInfo = 0; +p_pfn_clFlush pfn_clFlush = 0; +p_pfn_clFinish pfn_clFinish = 0; +p_pfn_clEnqueueReadBuffer pfn_clEnqueueReadBuffer = 0; +p_pfn_clEnqueueReadBufferRect pfn_clEnqueueReadBufferRect = 0; +p_pfn_clEnqueueWriteBuffer pfn_clEnqueueWriteBuffer = 0; +p_pfn_clEnqueueWriteBufferRect pfn_clEnqueueWriteBufferRect = 0; +p_pfn_clEnqueueCopyBuffer pfn_clEnqueueCopyBuffer = 0; +p_pfn_clEnqueueReadImage pfn_clEnqueueReadImage = 0; +p_pfn_clEnqueueWriteImage pfn_clEnqueueWriteImage = 0; +p_pfn_clEnqueueCopyImage pfn_clEnqueueCopyImage = 0; +p_pfn_clEnqueueCopyImageToBuffer pfn_clEnqueueCopyImageToBuffer = 0; +p_pfn_clEnqueueCopyBufferToImage pfn_clEnqueueCopyBufferToImage = 0; +p_pfn_clEnqueueMapBuffer pfn_clEnqueueMapBuffer = 0; +p_pfn_clEnqueueMapImage pfn_clEnqueueMapImage = 0; +p_pfn_clEnqueueUnmapMemObject pfn_clEnqueueUnmapMemObject = 0; +p_pfn_clEnqueueNDRangeKernel pfn_clEnqueueNDRangeKernel = 0; +p_pfn_clEnqueueTask pfn_clEnqueueTask = 0; +p_pfn_clEnqueueNativeKernel pfn_clEnqueueNativeKernel = 0; +p_pfn_clEnqueueMarker pfn_clEnqueueMarker = 0; +p_pfn_clEnqueueWaitForEvents pfn_clEnqueueWaitForEvents = 0; +p_pfn_clEnqueueBarrier pfn_clEnqueueBarrier = 0; +p_pfn_clGetExtensionFunctionAddress pfn_clGetExtensionFunctionAddress = 0; + +int ocl_init(void) +{ + if (pfn_clGetPlatformIDs) return 1; + + OclLibrary lib = oclLoadLibrary(); + if (!lib) return 0; + + pfn_clGetPlatformIDs = (p_pfn_clGetPlatformIDs) oclGetProcAddress(lib, "clGetPlatformIDs"); + 
pfn_clGetPlatformInfo = (p_pfn_clGetPlatformInfo) oclGetProcAddress(lib, "clGetPlatformInfo"); + pfn_clGetDeviceIDs = (p_pfn_clGetDeviceIDs) oclGetProcAddress(lib, "clGetDeviceIDs"); + pfn_clGetDeviceInfo = (p_pfn_clGetDeviceInfo) oclGetProcAddress(lib, "clGetDeviceInfo"); + pfn_clCreateContext = (p_pfn_clCreateContext) oclGetProcAddress(lib, "clCreateContext"); + pfn_clCreateContextFromType = (p_pfn_clCreateContextFromType) oclGetProcAddress(lib, "clCreateContextFromType"); + pfn_clRetainContext = (p_pfn_clRetainContext) oclGetProcAddress(lib, "clRetainContext"); + pfn_clReleaseContext = (p_pfn_clReleaseContext) oclGetProcAddress(lib, "clReleaseContext"); + pfn_clGetContextInfo = (p_pfn_clGetContextInfo) oclGetProcAddress(lib, "clGetContextInfo"); + pfn_clCreateCommandQueue = (p_pfn_clCreateCommandQueue) oclGetProcAddress(lib, "clCreateCommandQueue"); + pfn_clRetainCommandQueue = (p_pfn_clRetainCommandQueue) oclGetProcAddress(lib, "clRetainCommandQueue"); + pfn_clReleaseCommandQueue = (p_pfn_clReleaseCommandQueue) oclGetProcAddress(lib, "clReleaseCommandQueue"); + pfn_clGetCommandQueueInfo = (p_pfn_clGetCommandQueueInfo) oclGetProcAddress(lib, "clGetCommandQueueInfo"); + pfn_clSetCommandQueueProperty = (p_pfn_clSetCommandQueueProperty) oclGetProcAddress(lib, "clSetCommandQueueProperty"); + pfn_clCreateBuffer = (p_pfn_clCreateBuffer) oclGetProcAddress(lib, "clCreateBuffer"); + pfn_clCreateImage2D = (p_pfn_clCreateImage2D) oclGetProcAddress(lib, "clCreateImage2D"); + pfn_clCreateImage3D = (p_pfn_clCreateImage3D) oclGetProcAddress(lib, "clCreateImage3D"); + pfn_clRetainMemObject = (p_pfn_clRetainMemObject) oclGetProcAddress(lib, "clRetainMemObject"); + pfn_clReleaseMemObject = (p_pfn_clReleaseMemObject) oclGetProcAddress(lib, "clReleaseMemObject"); + pfn_clGetSupportedImageFormats = (p_pfn_clGetSupportedImageFormats) oclGetProcAddress(lib, "clGetSupportedImageFormats"); + pfn_clGetMemObjectInfo = (p_pfn_clGetMemObjectInfo) oclGetProcAddress(lib, 
"clGetMemObjectInfo"); + pfn_clGetImageInfo = (p_pfn_clGetImageInfo) oclGetProcAddress(lib, "clGetImageInfo"); + pfn_clCreateSampler = (p_pfn_clCreateSampler) oclGetProcAddress(lib, "clCreateSampler"); + pfn_clRetainSampler = (p_pfn_clRetainSampler) oclGetProcAddress(lib, "clRetainSampler"); + pfn_clReleaseSampler = (p_pfn_clReleaseSampler) oclGetProcAddress(lib, "clReleaseSampler"); + pfn_clGetSamplerInfo = (p_pfn_clGetSamplerInfo) oclGetProcAddress(lib, "clGetSamplerInfo"); + pfn_clCreateProgramWithSource = (p_pfn_clCreateProgramWithSource) oclGetProcAddress(lib, "clCreateProgramWithSource"); + pfn_clCreateProgramWithBinary = (p_pfn_clCreateProgramWithBinary) oclGetProcAddress(lib, "clCreateProgramWithBinary"); + pfn_clRetainProgram = (p_pfn_clRetainProgram) oclGetProcAddress(lib, "clRetainProgram"); + pfn_clReleaseProgram = (p_pfn_clReleaseProgram) oclGetProcAddress(lib, "clReleaseProgram"); + pfn_clBuildProgram = (p_pfn_clBuildProgram) oclGetProcAddress(lib, "clBuildProgram"); + pfn_clUnloadCompiler = (p_pfn_clUnloadCompiler) oclGetProcAddress(lib, "clUnloadCompiler"); + pfn_clGetProgramInfo = (p_pfn_clGetProgramInfo) oclGetProcAddress(lib, "clGetProgramInfo"); + pfn_clGetProgramBuildInfo = (p_pfn_clGetProgramBuildInfo) oclGetProcAddress(lib, "clGetProgramBuildInfo"); + pfn_clCreateKernel = (p_pfn_clCreateKernel) oclGetProcAddress(lib, "clCreateKernel"); + pfn_clCreateKernelsInProgram = (p_pfn_clCreateKernelsInProgram) oclGetProcAddress(lib, "clCreateKernelsInProgram"); + pfn_clRetainKernel = (p_pfn_clRetainKernel) oclGetProcAddress(lib, "clRetainKernel"); + pfn_clReleaseKernel = (p_pfn_clReleaseKernel) oclGetProcAddress(lib, "clReleaseKernel"); + pfn_clSetKernelArg = (p_pfn_clSetKernelArg) oclGetProcAddress(lib, "clSetKernelArg"); + pfn_clGetKernelInfo = (p_pfn_clGetKernelInfo) oclGetProcAddress(lib, "clGetKernelInfo"); + pfn_clGetKernelWorkGroupInfo = (p_pfn_clGetKernelWorkGroupInfo) oclGetProcAddress(lib, "clGetKernelWorkGroupInfo"); + pfn_clWaitForEvents = 
(p_pfn_clWaitForEvents) oclGetProcAddress(lib, "clWaitForEvents"); + pfn_clGetEventInfo = (p_pfn_clGetEventInfo) oclGetProcAddress(lib, "clGetEventInfo"); + pfn_clRetainEvent = (p_pfn_clRetainEvent) oclGetProcAddress(lib, "clRetainEvent"); + pfn_clReleaseEvent = (p_pfn_clReleaseEvent) oclGetProcAddress(lib, "clReleaseEvent"); + pfn_clGetEventProfilingInfo = (p_pfn_clGetEventProfilingInfo) oclGetProcAddress(lib, "clGetEventProfilingInfo"); + pfn_clFlush = (p_pfn_clFlush) oclGetProcAddress(lib, "clFlush"); + pfn_clFinish = (p_pfn_clFinish) oclGetProcAddress(lib, "clFinish"); + pfn_clEnqueueReadBuffer = (p_pfn_clEnqueueReadBuffer) oclGetProcAddress(lib, "clEnqueueReadBuffer"); + pfn_clEnqueueReadBufferRect = (p_pfn_clEnqueueReadBufferRect) oclGetProcAddress(lib, "clEnqueueReadBufferRect"); + pfn_clEnqueueWriteBuffer = (p_pfn_clEnqueueWriteBuffer) oclGetProcAddress(lib, "clEnqueueWriteBuffer"); + pfn_clEnqueueWriteBufferRect = (p_pfn_clEnqueueWriteBufferRect) oclGetProcAddress(lib, "clEnqueueWriteBufferRect"); + pfn_clEnqueueCopyBuffer = (p_pfn_clEnqueueCopyBuffer) oclGetProcAddress(lib, "clEnqueueCopyBuffer"); + pfn_clEnqueueReadImage = (p_pfn_clEnqueueReadImage) oclGetProcAddress(lib, "clEnqueueReadImage"); + pfn_clEnqueueWriteImage = (p_pfn_clEnqueueWriteImage) oclGetProcAddress(lib, "clEnqueueWriteImage"); + pfn_clEnqueueCopyImage = (p_pfn_clEnqueueCopyImage) oclGetProcAddress(lib, "clEnqueueCopyImage"); + pfn_clEnqueueCopyImageToBuffer = (p_pfn_clEnqueueCopyImageToBuffer) oclGetProcAddress(lib, "clEnqueueCopyImageToBuffer"); + pfn_clEnqueueCopyBufferToImage = (p_pfn_clEnqueueCopyBufferToImage) oclGetProcAddress(lib, "clEnqueueCopyBufferToImage"); + pfn_clEnqueueMapBuffer = (p_pfn_clEnqueueMapBuffer) oclGetProcAddress(lib, "clEnqueueMapBuffer"); + pfn_clEnqueueMapImage = (p_pfn_clEnqueueMapImage) oclGetProcAddress(lib, "clEnqueueMapImage"); + pfn_clEnqueueUnmapMemObject = (p_pfn_clEnqueueUnmapMemObject) oclGetProcAddress(lib, "clEnqueueUnmapMemObject"); + 
pfn_clEnqueueNDRangeKernel = (p_pfn_clEnqueueNDRangeKernel) oclGetProcAddress(lib, "clEnqueueNDRangeKernel"); + pfn_clEnqueueTask = (p_pfn_clEnqueueTask) oclGetProcAddress(lib, "clEnqueueTask"); + pfn_clEnqueueNativeKernel = (p_pfn_clEnqueueNativeKernel) oclGetProcAddress(lib, "clEnqueueNativeKernel"); + pfn_clEnqueueMarker = (p_pfn_clEnqueueMarker) oclGetProcAddress(lib, "clEnqueueMarker"); + pfn_clEnqueueWaitForEvents = (p_pfn_clEnqueueWaitForEvents) oclGetProcAddress(lib, "clEnqueueWaitForEvents"); + pfn_clEnqueueBarrier = (p_pfn_clEnqueueBarrier) oclGetProcAddress(lib, "clEnqueueBarrier"); + pfn_clGetExtensionFunctionAddress = (p_pfn_clGetExtensionFunctionAddress) oclGetProcAddress(lib, "clGetExtensionFunctionAddress"); + + return 1; +} + +// Platform API +extern CL_API_ENTRY cl_int CL_API_CALL +clGetPlatformIDs(cl_uint num_entries, + cl_platform_id * platforms, + cl_uint * num_platforms) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clGetPlatformIDs) return CL_INVALID_OPERATION; + + return pfn_clGetPlatformIDs(num_entries, platforms, num_platforms); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetPlatformInfo(cl_platform_id platform, + cl_platform_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clGetPlatformInfo) return CL_INVALID_OPERATION; + + return pfn_clGetPlatformInfo(platform, param_name, param_value_size, param_value, param_value_size_ret); +} + +// Device APIs +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceIDs(cl_platform_id platform, + cl_device_type device_type, + cl_uint num_entries, + cl_device_id * devices, + cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clGetDeviceIDs) return CL_INVALID_OPERATION; + + return pfn_clGetDeviceIDs(platform, device_type, num_entries, devices, num_devices); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceInfo(cl_device_id device, + cl_device_info param_name, + size_t param_value_size, + void * 
param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clGetDeviceInfo) return CL_INVALID_OPERATION; + + return pfn_clGetDeviceInfo(device, param_name, param_value_size, param_value, param_value_size_ret); +} + +// Context APIs +extern CL_API_ENTRY cl_context CL_API_CALL +clCreateContext(const cl_context_properties * properties, + cl_uint num_devices, + const cl_device_id * devices, + void (CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *), + void * user_data, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clCreateContext) return 0; + + return pfn_clCreateContext(properties, num_devices, devices, pfn_notify, user_data, errcode_ret); +} + +extern CL_API_ENTRY cl_context CL_API_CALL +clCreateContextFromType(const cl_context_properties * properties, + cl_device_type device_type, + void (CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *), + void * user_data, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clCreateContextFromType) return 0; + + return pfn_clCreateContextFromType(properties, device_type, pfn_notify, user_data, errcode_ret); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainContext(cl_context context) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clRetainContext) return CL_INVALID_OPERATION; + + return pfn_clRetainContext(context); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseContext(cl_context context) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clReleaseContext) return CL_INVALID_OPERATION; + + return pfn_clReleaseContext(context); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetContextInfo(cl_context context, + cl_context_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clGetContextInfo) return CL_INVALID_OPERATION; + + return pfn_clGetContextInfo(context, param_name, param_value_size, param_value, param_value_size_ret); +} + +// Command Queue APIs 
+extern CL_API_ENTRY cl_command_queue CL_API_CALL +clCreateCommandQueue(cl_context context, + cl_device_id device, + cl_command_queue_properties properties, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clCreateCommandQueue) return 0; + + return pfn_clCreateCommandQueue(context, device, properties, errcode_ret); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainCommandQueue(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clRetainCommandQueue) return CL_INVALID_OPERATION; + + return pfn_clRetainCommandQueue(command_queue); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseCommandQueue(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clReleaseCommandQueue) return CL_INVALID_OPERATION; + + return pfn_clReleaseCommandQueue(command_queue); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetCommandQueueInfo(cl_command_queue command_queue, + cl_command_queue_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clGetCommandQueueInfo) return CL_INVALID_OPERATION; + + return pfn_clGetCommandQueueInfo(command_queue, param_name, param_value_size, param_value, param_value_size_ret); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetCommandQueueProperty(cl_command_queue command_queue, + cl_command_queue_properties properties, + cl_bool enable, + cl_command_queue_properties * old_properties) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clSetCommandQueueProperty) return CL_INVALID_OPERATION; + + return pfn_clSetCommandQueueProperty(command_queue, properties, enable, old_properties); +} + +// Memory Object APIs +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateBuffer(cl_context context, + cl_mem_flags flags, + size_t size, + void * host_ptr, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clCreateBuffer) return 0; + + return pfn_clCreateBuffer(context, flags, size, host_ptr, errcode_ret); +} + +extern 
CL_API_ENTRY cl_mem CL_API_CALL +clCreateImage2D(cl_context context, + cl_mem_flags flags, + const cl_image_format * image_format, + size_t image_width, + size_t image_height, + size_t image_row_pitch, + void * host_ptr, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clCreateImage2D) return 0; + + return pfn_clCreateImage2D(context, flags, image_format, image_width, image_height, image_row_pitch, host_ptr, errcode_ret); +} + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateImage3D(cl_context context, + cl_mem_flags flags, + const cl_image_format * image_format, + size_t image_width, + size_t image_height, + size_t image_depth, + size_t image_row_pitch, + size_t image_slice_pitch, + void * host_ptr, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clCreateImage3D) return 0; + + return pfn_clCreateImage3D(context, flags, image_format, image_width, image_height, image_depth, image_row_pitch, image_slice_pitch, host_ptr, errcode_ret); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainMemObject(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clRetainMemObject) return CL_INVALID_OPERATION; + + return pfn_clRetainMemObject(memobj); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseMemObject(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clReleaseMemObject) return CL_INVALID_OPERATION; + + return pfn_clReleaseMemObject(memobj); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSupportedImageFormats(cl_context context, + cl_mem_flags flags, + cl_mem_object_type image_type, + cl_uint num_entries, + cl_image_format * image_formats, + cl_uint * num_image_formats) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clGetSupportedImageFormats) return CL_INVALID_OPERATION; + + return pfn_clGetSupportedImageFormats(context, flags, image_type, num_entries, image_formats, num_image_formats); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetMemObjectInfo(cl_mem memobj, + cl_mem_info param_name, + size_t param_value_size, + void * 
param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clGetMemObjectInfo) return CL_INVALID_OPERATION; + + return pfn_clGetMemObjectInfo(memobj, param_name, param_value_size, param_value, param_value_size_ret); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetImageInfo(cl_mem image, + cl_image_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clGetImageInfo) return CL_INVALID_OPERATION; + + return pfn_clGetImageInfo(image, param_name, param_value_size, param_value, param_value_size_ret); +} + + +// Sampler APIs +extern CL_API_ENTRY cl_sampler CL_API_CALL +clCreateSampler(cl_context context, + cl_bool normalized_coords, + cl_addressing_mode addressing_mode, + cl_filter_mode filter_mode, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clCreateSampler) return 0; + + return pfn_clCreateSampler(context, normalized_coords, addressing_mode, filter_mode, errcode_ret); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainSampler(cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clRetainSampler) return CL_INVALID_OPERATION; + + return pfn_clRetainSampler(sampler); +} + + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseSampler(cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clReleaseSampler) return CL_INVALID_OPERATION; + + return pfn_clReleaseSampler(sampler); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSamplerInfo(cl_sampler sampler, + cl_sampler_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clGetSamplerInfo) return CL_INVALID_OPERATION; + + return pfn_clGetSamplerInfo(sampler, param_name, param_value_size, param_value, param_value_size_ret); +} + +// Program Object APIs +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithSource(cl_context context, + cl_uint count, + const char ** strings, + const 
size_t * lengths, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clCreateProgramWithSource) return 0; + + return pfn_clCreateProgramWithSource(context, count, strings, lengths, errcode_ret); +} + +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithBinary(cl_context context, + cl_uint num_devices, + const cl_device_id * device_list, + const size_t * lengths, + const unsigned char ** binaries, + cl_int * binary_status, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clCreateProgramWithBinary) return 0; + + return pfn_clCreateProgramWithBinary(context, num_devices, device_list, lengths, binaries, binary_status, errcode_ret); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainProgram(cl_program program) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clRetainProgram) return CL_INVALID_OPERATION; + + return pfn_clRetainProgram(program); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseProgram(cl_program program) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clReleaseProgram) return CL_INVALID_OPERATION; + + return pfn_clReleaseProgram(program); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clBuildProgram(cl_program program, + cl_uint num_devices, + const cl_device_id * device_list, + const char * options, + void (CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), + void * user_data) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clBuildProgram) return CL_INVALID_OPERATION; + + return pfn_clBuildProgram(program, num_devices, device_list, options, pfn_notify, user_data); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clUnloadCompiler(void) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clUnloadCompiler) return CL_INVALID_OPERATION; + + return pfn_clUnloadCompiler(); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetProgramInfo(cl_program program, + cl_program_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clGetProgramInfo) return 
CL_INVALID_OPERATION; + + return pfn_clGetProgramInfo(program, param_name, param_value_size, param_value, param_value_size_ret); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetProgramBuildInfo(cl_program program, + cl_device_id device, + cl_program_build_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clGetProgramBuildInfo) return CL_INVALID_OPERATION; + + return pfn_clGetProgramBuildInfo(program, device, param_name, param_value_size, param_value, param_value_size_ret); +} + +// Kernel Object APIs +extern CL_API_ENTRY cl_kernel CL_API_CALL +clCreateKernel(cl_program program, + const char * kernel_name, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clCreateKernel) return 0; + + return pfn_clCreateKernel(program, kernel_name, errcode_ret); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clCreateKernelsInProgram(cl_program program, + cl_uint num_kernels, + cl_kernel * kernels, + cl_uint * num_kernels_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clCreateKernelsInProgram) return CL_INVALID_OPERATION; + + return pfn_clCreateKernelsInProgram(program, num_kernels, kernels, num_kernels_ret); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainKernel(cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clRetainKernel) return CL_INVALID_OPERATION; + + return pfn_clRetainKernel(kernel); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseKernel(cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clReleaseKernel) return CL_INVALID_OPERATION; + + return pfn_clReleaseKernel(kernel); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelArg(cl_kernel kernel, + cl_uint arg_index, + size_t arg_size, + const void * arg_value) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clSetKernelArg) return CL_INVALID_OPERATION; + + return pfn_clSetKernelArg(kernel, arg_index, arg_size, arg_value); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelInfo(cl_kernel 
kernel, + cl_kernel_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clGetKernelInfo) return CL_INVALID_OPERATION; + + return pfn_clGetKernelInfo(kernel, param_name, param_value_size, param_value, param_value_size_ret); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelWorkGroupInfo(cl_kernel kernel, + cl_device_id device, + cl_kernel_work_group_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clGetKernelWorkGroupInfo) return CL_INVALID_OPERATION; + + return pfn_clGetKernelWorkGroupInfo(kernel, device, param_name, param_value_size, param_value, param_value_size_ret); +} + +// Event Object APIs +extern CL_API_ENTRY cl_int CL_API_CALL +clWaitForEvents(cl_uint num_events, + const cl_event * event_list) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clWaitForEvents) return CL_INVALID_OPERATION; + + return pfn_clWaitForEvents(num_events, event_list); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetEventInfo(cl_event event, + cl_event_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clGetEventInfo) return CL_INVALID_OPERATION; + + return pfn_clGetEventInfo(event, param_name, param_value_size, param_value, param_value_size_ret); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clRetainEvent) return CL_INVALID_OPERATION; + + return pfn_clRetainEvent(event); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clReleaseEvent) return CL_INVALID_OPERATION; + + return pfn_clReleaseEvent(event); +} + +// Profiling APIs +extern CL_API_ENTRY cl_int CL_API_CALL +clGetEventProfilingInfo(cl_event event, + cl_profiling_info param_name, + size_t param_value_size, + void * 
param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clGetEventProfilingInfo) return CL_INVALID_OPERATION; + + return pfn_clGetEventProfilingInfo(event, param_name, param_value_size, param_value, param_value_size_ret); +} + +// Flush and Finish APIs +extern CL_API_ENTRY cl_int CL_API_CALL +clFlush(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clFlush) return CL_INVALID_OPERATION; + + return pfn_clFlush(command_queue); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clFinish(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clFinish) return CL_INVALID_OPERATION; + + return pfn_clFinish(command_queue); +} + +// Enqueued Commands APIs +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadBuffer(cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_read, + size_t offset, + size_t cb, + void * ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clEnqueueReadBuffer) return CL_INVALID_OPERATION; + + return pfn_clEnqueueReadBuffer(command_queue, buffer, blocking_read, offset, cb, ptr, num_events_in_wait_list, event_wait_list, event); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadBufferRect(cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_read, + const size_t * buffer_origin, + const size_t * host_origin, + const size_t * region, + size_t buffer_row_pitch, + size_t buffer_slice_pitch, + size_t host_row_pitch, + size_t host_slice_pitch, + void * ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_1 +{ + if (!pfn_clEnqueueReadBufferRect) return CL_INVALID_OPERATION; + + return pfn_clEnqueueReadBufferRect(command_queue, buffer, blocking_read, buffer_origin, host_origin, region, buffer_row_pitch, buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, 
event); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteBuffer(cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_write, + size_t offset, + size_t cb, + const void * ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clEnqueueWriteBuffer) return CL_INVALID_OPERATION; + + return pfn_clEnqueueWriteBuffer(command_queue, buffer, blocking_write, offset, cb, ptr, num_events_in_wait_list, event_wait_list, event); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteBufferRect(cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_write, + const size_t * buffer_origin, + const size_t * host_origin, + const size_t * region, + size_t buffer_row_pitch, + size_t buffer_slice_pitch, + size_t host_row_pitch, + size_t host_slice_pitch, + const void * ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_1 +{ + if (!pfn_clEnqueueWriteBufferRect) return CL_INVALID_OPERATION; + + return pfn_clEnqueueWriteBufferRect(command_queue, buffer, blocking_write, buffer_origin, host_origin, region, buffer_row_pitch, buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBuffer(cl_command_queue command_queue, + cl_mem src_buffer, + cl_mem dst_buffer, + size_t src_offset, + size_t dst_offset, + size_t cb, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clEnqueueCopyBuffer) return CL_INVALID_OPERATION; + + return pfn_clEnqueueCopyBuffer(command_queue, src_buffer, dst_buffer, src_offset, dst_offset, cb, num_events_in_wait_list, event_wait_list, event); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadImage(cl_command_queue command_queue, + cl_mem image, + cl_bool blocking_read, + const size_t * 
origin[3], + const size_t * region[3], + size_t row_pitch, + size_t slice_pitch, + void * ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clEnqueueReadImage) return CL_INVALID_OPERATION; + + return pfn_clEnqueueReadImage(command_queue, image, blocking_read, origin, region, row_pitch, slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteImage(cl_command_queue command_queue, + cl_mem image, + cl_bool blocking_write, + const size_t * origin[3], + const size_t * region[3], + size_t input_row_pitch, + size_t input_slice_pitch, + const void * ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clEnqueueWriteImage) return CL_INVALID_OPERATION; + + return pfn_clEnqueueWriteImage(command_queue, image, blocking_write, origin, region, input_row_pitch, input_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyImage(cl_command_queue command_queue, + cl_mem src_image, + cl_mem dst_image, + const size_t * src_origin[3], + const size_t * dst_origin[3], + const size_t * region[3], + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clEnqueueCopyImage) return CL_INVALID_OPERATION; + + return pfn_clEnqueueCopyImage(command_queue, src_image, dst_image, src_origin, dst_origin, region, num_events_in_wait_list, event_wait_list, event); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyImageToBuffer(cl_command_queue command_queue, + cl_mem src_image, + cl_mem dst_buffer, + const size_t * src_origin[3], + const size_t * region[3], + size_t dst_offset, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0 +{ + if 
(!pfn_clEnqueueCopyImageToBuffer) return CL_INVALID_OPERATION; + + return pfn_clEnqueueCopyImageToBuffer(command_queue, src_image, dst_buffer, src_origin, region, dst_offset, num_events_in_wait_list, event_wait_list, event); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBufferToImage(cl_command_queue command_queue, + cl_mem src_buffer, + cl_mem dst_image, + size_t src_offset, + const size_t * dst_origin[3], + const size_t * region[3], + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clEnqueueCopyBufferToImage) return CL_INVALID_OPERATION; + + return pfn_clEnqueueCopyBufferToImage(command_queue, src_buffer, dst_image, src_offset, dst_origin, region, num_events_in_wait_list, event_wait_list, event); +} + +extern CL_API_ENTRY void * CL_API_CALL +clEnqueueMapBuffer(cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_map, + cl_map_flags map_flags, + size_t offset, + size_t cb, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clEnqueueMapBuffer) return 0; + + return pfn_clEnqueueMapBuffer(command_queue, buffer, blocking_map, map_flags, offset, cb, num_events_in_wait_list, event_wait_list, event, errcode_ret); +} + +extern CL_API_ENTRY void * CL_API_CALL +clEnqueueMapImage(cl_command_queue command_queue, + cl_mem image, + cl_bool blocking_map, + cl_map_flags map_flags, + const size_t * origin, + const size_t * region, + size_t * image_row_pitch, + size_t * image_slice_pitch, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clEnqueueMapImage) return 0; + + return pfn_clEnqueueMapImage(command_queue, image, blocking_map, map_flags, origin, region, image_row_pitch, image_slice_pitch, num_events_in_wait_list, event_wait_list, event, errcode_ret); +} + 
+extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueUnmapMemObject(cl_command_queue command_queue, + cl_mem memobj, + void * mapped_ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clEnqueueUnmapMemObject) return CL_INVALID_OPERATION; + + return pfn_clEnqueueUnmapMemObject(command_queue, memobj, mapped_ptr, num_events_in_wait_list, event_wait_list, event); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueNDRangeKernel(cl_command_queue command_queue, + cl_kernel kernel, + cl_uint work_dim, + const size_t * global_work_offset, + const size_t * global_work_size, + const size_t * local_work_size, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clEnqueueNDRangeKernel) return CL_INVALID_OPERATION; + + return pfn_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size, num_events_in_wait_list, event_wait_list, event); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueTask(cl_command_queue command_queue, + cl_kernel kernel, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clEnqueueTask) return CL_INVALID_OPERATION; + + return pfn_clEnqueueTask(command_queue, kernel, num_events_in_wait_list, event_wait_list, event); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueNativeKernel(cl_command_queue command_queue, + void (CL_CALLBACK *user_func)(void *), + void * args, + size_t cb_args, + cl_uint num_mem_objects, + const cl_mem * mem_list, + const void ** args_mem_loc, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clEnqueueNativeKernel) return CL_INVALID_OPERATION; + + return pfn_clEnqueueNativeKernel(command_queue, user_func, args, cb_args, num_mem_objects, mem_list, 
args_mem_loc, num_events_in_wait_list, event_wait_list, event); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueMarker(cl_command_queue command_queue, + cl_event * event) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clEnqueueMarker) return CL_INVALID_OPERATION; + + return pfn_clEnqueueMarker(command_queue, event); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWaitForEvents(cl_command_queue command_queue, + cl_uint num_events, + const cl_event * event_list) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clEnqueueWaitForEvents) return CL_INVALID_OPERATION; + + return pfn_clEnqueueWaitForEvents(command_queue, num_events, event_list); +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueBarrier(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clEnqueueBarrier) return CL_INVALID_OPERATION; + + return pfn_clEnqueueBarrier(command_queue); +} + +// Extension function access +// +// Returns the extension function address for the given function name, +// or NULL if a valid function can not be found. The client must +// check to make sure the address is not NULL, before using or +// calling the returned function address. 
+// +extern CL_API_ENTRY void * CL_API_CALL clGetExtensionFunctionAddress(const char * func_name) CL_API_SUFFIX__VERSION_1_0 +{ + if (!pfn_clGetExtensionFunctionAddress) return 0; + + return pfn_clGetExtensionFunctionAddress(func_name); +} diff --git a/libs/clew/libclew/ocl_init.h b/libs/clew/libclew/ocl_init.h new file mode 100644 index 0000000..7929b0f --- /dev/null +++ b/libs/clew/libclew/ocl_init.h @@ -0,0 +1,3 @@ +#pragma once + +int ocl_init(void); diff --git a/libs/gpu/CMakeLists.txt b/libs/gpu/CMakeLists.txt new file mode 100644 index 0000000..e0ac49b --- /dev/null +++ b/libs/gpu/CMakeLists.txt @@ -0,0 +1,79 @@ +cmake_minimum_required(VERSION 3.1) + +project(libgpu) + +set(HEADERS + libgpu/opencl/device_info.h + libgpu/opencl/engine.h + libgpu/opencl/enum.h + libgpu/opencl/utils.h + libgpu/context.h + libgpu/device.h + libgpu/gold_helpers.h + libgpu/shared_device_buffer.h + libgpu/shared_host_buffer.h + libgpu/utils.h + libgpu/work_size.h + ) + +set(SOURCES + libgpu/opencl/device_info.cpp + libgpu/opencl/engine.cpp + libgpu/opencl/enum.cpp + libgpu/opencl/utils.cpp + libgpu/context.cpp + libgpu/device.cpp + libgpu/gold_helpers.cpp + libgpu/shared_device_buffer.cpp + libgpu/shared_host_buffer.cpp + libgpu/utils.cpp + ) + +set(CUDA_HEADERS + libgpu/cuda/sdk/helper_math.h + libgpu/cuda/cuda_api.h + libgpu/cuda/enum.h + libgpu/cuda/utils.h + ) + +set(CUDA_SOURCES + libgpu/cuda/cuda_api.cpp + libgpu/cuda/enum.cpp + libgpu/cuda/utils.cpp + ) + +option(GPU_CUDA_SUPPORT "CUDA support." 
OFF) + +set (LIBRARIES + libclew + libutils) + +set(CMAKE_CXX_STANDARD 11) + +if (GPU_CUDA_SUPPORT) + find_package (CUDA REQUIRED) + + set(HEADERS ${HEADERS} ${CUDA_HEADERS}) + set(SOURCES ${SOURCES} ${CUDA_SOURCES}) + set(LIBRARIES ${LIBRARIES} ${CUDA_LIBRARIES}) + + add_definitions(-DCUDA_SUPPORT) + cuda_add_library(${PROJECT_NAME} ${SOURCES} ${HEADERS}) +else () + add_library(${PROJECT_NAME} ${SOURCES} ${HEADERS}) +endif () + +target_include_directories(${PROJECT_NAME} PUBLIC ${PROJECT_SOURCE_DIR}) +target_link_libraries(${PROJECT_NAME} ${LIBRARIES}) + +add_executable(hexdumparray libgpu/hexdumparray.cpp) + +function(convertIntoHeader sourceFile headerFile arrayName) + add_custom_command( + OUTPUT ${PROJECT_SOURCE_DIR}/${headerFile} + + COMMAND hexdumparray ${PROJECT_SOURCE_DIR}/${sourceFile} ${PROJECT_SOURCE_DIR}/${headerFile} ${arrayName} + + DEPENDS ${PROJECT_SOURCE_DIR}/${sourceFile} hexdumparray + ) +endfunction() diff --git a/libs/gpu/LICENSE b/libs/gpu/LICENSE new file mode 100644 index 0000000..6fb2204 --- /dev/null +++ b/libs/gpu/LICENSE @@ -0,0 +1,23 @@ +MIT License + +Copyright (c) 2018 Nikolay Polyarniy +Copyright (c) 2018 GPGPUCourse2018 +Copyright (c) 2018 Agisoft LLC + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/libs/gpu/libgpu/context.cpp b/libs/gpu/libgpu/context.cpp new file mode 100644 index 0000000..2c07b2d --- /dev/null +++ b/libs/gpu/libgpu/context.cpp @@ -0,0 +1,310 @@ +#include "context.h" + +#ifdef CUDA_SUPPORT +#include +#include +#endif + +namespace gpu { + +THREAD_LOCAL Context::Data *Context::data_current_ = 0; + +Context::Data::Data() +{ + type = TypeUndefined; + cuda_device = 0; + cuda_context = 0; + cuda_stream = 0; + ocl_device = 0; + activated = false; +} + +Context::Data::~Data() +{ + if (data_current_ != this) { + if (data_current_ != 0) { + std::cerr << "Another GPU context found on context destruction" << std::endl; + } + } else { + data_current_ = 0; + } + +#ifdef CUDA_SUPPORT + if (cuda_stream) { + cudaError_t err = cudaStreamDestroy(cuda_stream); + if (cudaSuccess != err) + std::cerr << "Warning: cudaStreamDestroy failed: " << cuda::formatError(err) << std::endl; + } + +#ifndef CUDA_USE_PRIMARY_CONTEXT + if (cuda_context) { + CUresult err = cuCtxDestroy(cuda_context); + if (CUDA_SUCCESS != err) + std::cerr << "Warning: cuCtxDestroy failed: " << cuda::formatDriverError(err) << std::endl; + } +#endif +#endif +} + +Context::Context() +{ + data_ = data_current_; +} + +void Context::clear() +{ + data_ = NULL; +} + +void Context::init(int device) +{ +#ifdef CUDA_SUPPORT +#ifndef CUDA_USE_PRIMARY_CONTEXT + if (!cuda_api_init()) + throw cuda::cuda_exception("Can't load nvcuda library"); +#endif + std::shared_ptr data = std::make_shared(); + data->type = TypeCUDA; + data->cuda_device = device; + data_ref_ = data; +#endif +} + +void Context::init(struct _cl_device_id *device) +{ + std::shared_ptr data = std::make_shared(); + data->type = TypeOpenCL; + data->ocl_device = device; + 
data_ref_ = data; +} + +bool Context::isInitialized() +{ + return data_ref_.get() && data_ref_->type != TypeUndefined; +} + +bool Context::isGPU() +{ + return (type() != TypeUndefined); +} + +bool Context::isIntelGPU() +{ + if (type() != TypeOpenCL) { + return false; + } + + return cl()->deviceInfo().isIntelGPU(); +} + +bool Context::isGoldChecksEnabled() +{ + return false; // NOTTODO: Make it switchable +} + +void Context::activate() +{ + if (!data_ref_) + throw std::runtime_error("Unexpected GPU context activate call"); + + // create cuda stream on first activate call + if (!data_ref_->activated) { +#ifdef CUDA_SUPPORT + if (data_ref_->type == TypeCUDA) { +#ifndef CUDA_USE_PRIMARY_CONTEXT + // It is claimed that contexts are thread safe starting from CUDA 4.0. + // Nevertheless, we observe crashes in nvcuda.dll if the same device is used in parallel from 2 threads using its primary context. + // To avoid this problem we create a separate standard context for each processing thread. + // https://devtalk.nvidia.com/default/topic/519087/cuda-programming-and-performance/cuda-context-and-threading/post/3689477/#3689477 + // http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DRIVER.html#axzz4g8KX5QV5 + + CUdevice device = 0; + CU_SAFE_CALL( cuDeviceGet(&device, data_ref_->cuda_device) ); + CU_SAFE_CALL( cuCtxCreate(&data_ref_->cuda_context, 0, device) ); +#else + CUDA_SAFE_CALL( cudaSetDevice(data_ref_->cuda_device) ); +#endif + CUDA_SAFE_CALL( cudaStreamCreate(&data_ref_->cuda_stream) ); + } +#endif + + if (data_ref_->type == TypeOpenCL) { + ocl::sh_ptr_ocl_engine engine = std::make_shared(); + engine->init(data_ref_->ocl_device); + data_ref_->ocl_engine = engine; + } + + data_ref_->activated = true; + } + + if (data_current_ && data_current_ != data_ref_.get()) + throw std::runtime_error("Another GPU context is already active"); + + data_ = data_ref_.get(); + data_current_ = data_; +} + +Context::Data *Context::data() const +{ + if (!data_) + throw 
std::runtime_error("Null context"); + + return data_; +} + +size_t Context::getCoresEstimate() +{ + size_t compute_units = 1; + + switch (type()) { +#ifdef CUDA_SUPPORT + case Context::TypeCUDA: + cudaDeviceProp deviceProp; + CUDA_SAFE_CALL(cudaGetDeviceProperties(&deviceProp, data_->cuda_device)); + compute_units = (size_t) deviceProp.multiProcessorCount; + break; +#endif + case Context::TypeOpenCL: + compute_units = cl()->maxComputeUnits(); + break; + default: + gpu::raiseException(__FILE__, __LINE__, "No GPU context!"); + } + + return compute_units * 256; +} + +size_t Context::getTotalMemory() +{ + size_t total_mem_size = 0; + size_t free_mem_size = 0; + + switch (type()) { +#ifdef CUDA_SUPPORT + case Context::TypeCUDA: + CUDA_SAFE_CALL(cudaMemGetInfo(&free_mem_size, &total_mem_size)); + break; +#endif + case Context::TypeOpenCL: + total_mem_size = cl()->totalMemSize(); + break; + default: + gpu::raiseException(__FILE__, __LINE__, "No GPU context!"); + } + + return total_mem_size; +} + +size_t Context::getFreeMemory() +{ + size_t total_mem_size = 0; + size_t free_mem_size = 0; + + switch (type()) { +#ifdef CUDA_SUPPORT + case Context::TypeCUDA: + CUDA_SAFE_CALL(cudaMemGetInfo(&free_mem_size, &total_mem_size)); + break; +#endif + case Context::TypeOpenCL: + total_mem_size = cl()->totalMemSize(); + free_mem_size = total_mem_size - total_mem_size / 5; + break; + default: + gpu::raiseException(__FILE__, __LINE__, "No GPU context!"); + } + + return free_mem_size; +} + +size_t Context::getMaxMemAlloc() +{ + size_t max_mem_alloc_size = 0; + +#ifdef CUDA_SUPPORT + if (type() == gpu::Context::TypeCUDA) { + size_t total_mem_size = 0; + size_t free_mem_size = 0; + CUDA_SAFE_CALL(cudaMemGetInfo(&free_mem_size, &total_mem_size)); + max_mem_alloc_size = total_mem_size / 2; + } else +#endif + if (type() == gpu::Context::TypeOpenCL) { + max_mem_alloc_size = cl()->maxMemAllocSize(); + } else { + gpu::raiseException(__FILE__, __LINE__, "No GPU context!"); + } + + return 
max_mem_alloc_size; +} + +size_t Context::getMaxWorkgroupSize() +{ + size_t max_workgroup_size = 0; + + switch (type()) { +#ifdef CUDA_SUPPORT + case Context::TypeCUDA: + int value; + CUDA_SAFE_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrMaxThreadsPerBlock, data_->cuda_device)); + max_workgroup_size = value; + break; +#endif + case Context::TypeOpenCL: + max_workgroup_size = cl()->maxWorkgroupSize(); + break; + default: + gpu::raiseException(__FILE__, __LINE__, "No GPU context!"); + } + + return max_workgroup_size; +} + +std::vector Context::getMaxWorkItemSizes() +{ + std::vector work_item_sizes(3); + + switch (type()) { +#ifdef CUDA_SUPPORT + case Context::TypeCUDA: + int value[3]; + CUDA_SAFE_CALL(cudaDeviceGetAttribute(&value[0], cudaDevAttrMaxBlockDimX, data_->cuda_device)); + CUDA_SAFE_CALL(cudaDeviceGetAttribute(&value[1], cudaDevAttrMaxBlockDimY, data_->cuda_device)); + CUDA_SAFE_CALL(cudaDeviceGetAttribute(&value[2], cudaDevAttrMaxBlockDimZ, data_->cuda_device)); + for (int i = 0; i < 3; ++i) { + work_item_sizes[i] = value[i]; + } + break; +#endif + case Context::TypeOpenCL: + for (int i = 0; i < 3; ++i) { + work_item_sizes[i] = cl()->maxWorkItemSizes(i); + } + break; + default: + gpu::raiseException(__FILE__, __LINE__, "No GPU context!"); + } + + return work_item_sizes; +} + +Context::Type Context::type() const +{ + if (data_) + return data_->type; + if (data_ref_) + return data_ref_->type; + return TypeUndefined; +} + +ocl::sh_ptr_ocl_engine Context::cl() const +{ + return data()->ocl_engine; +} + +cudaStream_t Context::cudaStream() const +{ + return data()->cuda_stream; +} + +} diff --git a/libs/gpu/libgpu/context.h b/libs/gpu/libgpu/context.h new file mode 100644 index 0000000..23ce598 --- /dev/null +++ b/libs/gpu/libgpu/context.h @@ -0,0 +1,72 @@ +#pragma once + +#include +#include + +typedef struct CUctx_st *cudaContext_t; +typedef struct CUstream_st *cudaStream_t; + +#ifdef _MSC_VER + #define THREAD_LOCAL __declspec(thread) +#else + #define 
THREAD_LOCAL __thread +#endif + +namespace gpu { + +class Context { +public: + Context(); + + enum Type { + TypeUndefined, + TypeOpenCL, + TypeCUDA + }; + + void clear(); + void init(int device); + void init(struct _cl_device_id *device); + bool isInitialized(); + bool isGPU(); + bool isIntelGPU(); + bool isGoldChecksEnabled(); + + void activate(); + + size_t getCoresEstimate(); + size_t getTotalMemory(); + size_t getFreeMemory(); + size_t getMaxMemAlloc(); + size_t getMaxWorkgroupSize(); + std::vector getMaxWorkItemSizes(); + + Type type() const; + + ocl::sh_ptr_ocl_engine cl() const; + cudaStream_t cudaStream() const; + +protected: + class Data { + public: + Data(); + ~Data(); + + Type type; + + int cuda_device; + cudaContext_t cuda_context; + cudaStream_t cuda_stream; + struct _cl_device_id * ocl_device; + ocl::sh_ptr_ocl_engine ocl_engine; + bool activated; + }; + + Data * data() const; + + Data * data_; + std::shared_ptr data_ref_; + static THREAD_LOCAL Data * data_current_; +}; + +} diff --git a/libs/gpu/libgpu/cuda/cu/common.cu b/libs/gpu/libgpu/cuda/cu/common.cu new file mode 100644 index 0000000..12320bf --- /dev/null +++ b/libs/gpu/libgpu/cuda/cu/common.cu @@ -0,0 +1,535 @@ +#ifndef common_cu // pragma once +#define common_cu + +#include +#include +#include +#include + +using gpu::WorkSize; + +//#define DEBUG + +#ifdef DEBUG +#include +#define printf_assert(condition, message) \ + if (!(condition)) printf("%s Line %d\n", message, __LINE__); +#else +#define printf_assert(condition, message) +#endif +#define assert_isfinite(value) \ + printf_assert(isfinite(value), "Value should be finite!"); + +#define WARP_SIZE 32 // NOTTODO: WHY WARP_SZ IS UNDEFINED? + +#ifndef M_PI +#define M_PI 3.141592654f +#endif + +namespace cuda { + + template + __device__ T max(T a, T b) { + return a > b ? a : b; + } + + template + __device__ T max(T a, T b, T c) { + return max(a, max(b, c)); + } + + template + __device__ T min(T a, T b) { + return a < b ? 
a : b; + } + + template + __device__ T min(T a, T b, T c) { + return min(a, min(b, c)); + } + + inline __device__ uint3 fetch_uint3(const unsigned int* ptr, size_t index) + { + return make_uint3(ptr[3 * index + 0], ptr[3 * index + 1], ptr[3 * index + 2]); + } + + inline __device__ uint4 fetch_uint4(const unsigned int* ptr, size_t index) + { + return make_uint4(ptr[4 * index + 0], ptr[4 * index + 1], ptr[4 * index + 2], ptr[4 * index + 3]); + } + + inline __device__ float2 fetch_float2(const float* ptr, size_t index) + { + return make_float2(ptr[2 * index + 0], ptr[2 * index + 1]); + } + + inline __device__ float3 fetch_float3(const float* ptr, size_t index) + { + return make_float3(ptr[3 * index + 0], ptr[3 * index + 1], ptr[3 * index + 2]); + } + + inline __device__ float4 fetch_float4(const float* ptr, size_t index) + { + return make_float4(ptr[4 * index + 0], ptr[4 * index + 1], ptr[4 * index + 2], ptr[4 * index + 3]); + } + + inline __device__ void set_uint3(unsigned int* ptr, size_t index, uint3 value) + { + ptr[3 * index + 0] = value.x; + ptr[3 * index + 1] = value.y; + ptr[3 * index + 2] = value.z; + } + + inline __device__ void set_float2(float* ptr, size_t index, float2 value) + { + ptr[2 * index + 0] = value.x; + ptr[2 * index + 1] = value.y; + } + + inline __device__ void set_float3(float* ptr, size_t index, float3 value) + { + ptr[3 * index + 0] = value.x; + ptr[3 * index + 1] = value.y; + ptr[3 * index + 2] = value.z; + } + + inline __device__ __host__ float clamp(float f, float a, float b) + { + return fmaxf(a, fminf(f, b)); + } + + inline __device__ float atomicCAS_f32(float *p, float cmp, float val) { + return __int_as_float(atomicCAS((int *) p, __float_as_int(cmp), __float_as_int(val))); + } + + inline __device__ float atomicCAS_32(float* p, float cmp, float val) { + return atomicCAS_f32(p, cmp, val); + } + + inline __device__ unsigned int atomicCAS_32(unsigned int* p, unsigned int cmp, unsigned int val) { + return atomicCAS(p, cmp, val); + } + + 
inline __device__ float3 operator-(const float3 &a) + { + return make_float3(-a.x, -a.y, -a.z); + } + + inline __device__ float2 operator-(float2 a, float2 b) + { + return make_float2(a.x - b.x, a.y - b.y); + } + + inline __device__ void operator-=(float2 &a, float2 b) + { + a.x -= b.x; + a.y -= b.y; + } + + inline __device__ float2 operator+(float2 a, float2 b) + { + return make_float2(a.x + b.x, a.y + b.y); + } + + inline __device__ void operator+=(float2 &a, float2 b) + { + a.x += b.x; + a.y += b.y; + } + + inline __device__ float3 operator-(float3 a, float3 b) + { + return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); + } + + inline __device__ float3 operator+(float3 a, float3 b) + { + return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); + } + + inline __device__ void operator+=(float3 &a, float3 b) + { + a.x += b.x; + a.y += b.y; + a.z += b.z; + } + + inline __device__ float4 operator+(float4 a, float4 b) + { + return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); + } + + inline __device__ float2 operator*(float2 a, float b) + { + return make_float2(a.x * b, a.y * b); + } + + inline __device__ float3 operator*(float3 a, float b) + { + return make_float3(a.x * b, a.y * b, a.z * b); + } + + inline __device__ float4 operator*(float4 a, float b) + { + return make_float4(a.x * b, a.y * b, a.z * b, a.w * b); + } + + inline __device__ float2 operator/(float2 a, float b) + { + return make_float2(a.x / b, a.y / b); + } + + inline __device__ float3 operator/(float3 a, float b) + { + return make_float3(a.x / b, a.y / b, a.z / b); + } + + inline __device__ void operator/=(float3& a, float b) + { + a.x /= b; + a.y /= b; + a.z /= b; + } + + inline __device__ float3 operator*(float3 a, float3 b) + { + return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); + } + + inline __device__ void operator*=(float3 &a, float b) + { + a.x *= b; + a.y *= b; + a.z *= b; + } + + inline __device__ bool operator==(float3 a, float3 b) + { + return a.x == b.x && a.y == b.y && a.z == 
b.z; + } + + inline __device__ bool operator==(float4 a, float4 b) + { + return a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w; + } + + inline __device__ float dot(float2 a, float2 b) + { + return a.x * b.x + a.y * b.y; + } + + inline __device__ float dot(float3 a, float3 b) + { + return a.x * b.x + a.y * b.y + a.z * b.z; + } + + inline __device__ float dot(float4 a, float4 b) + { + return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; + } + + inline __device__ float3 cross(float3 a, float3 b) + { + return make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x); + } + + inline __device__ float norm2(float2 v) + { + return dot(v, v); + } + + inline __device__ float norm2(float3 v) + { + return dot(v, v); + } + + inline __device__ float norm2(float4 v) + { + return dot(v, v); + } + + inline __device__ float norm(float2 v) + { + return sqrtf(norm2(v)); + } + + inline __device__ float norm(float3 v) + { + return sqrtf(norm2(v)); + } + + inline __device__ float norm(float4 v) + { + return sqrtf(norm2(v)); + } + + inline __device__ float3 normalize(float3 v) + { + return v / sqrtf(dot(v, v)); + } + +//______SHARED_STRUCTS__________________________________________________________________________________________________ + + // https://devtalk.nvidia.com/default/topic/673965/are-there-any-cuda-libararies-for-3x3-matrix-amp-vector3-amp-quaternion-operations-/ + typedef struct { + float4 m_row[3]; + } Matrix3x3f; + + typedef struct { + float4 m_row[4]; + } Matrix4x4f; + +//______HOST_CODE_______________________________________________________________________________________________________ + + inline __host__ float3 make_float3_from_vector(const vector3d &v) + { + return make_float3((float) v.x(), (float) v.y(), (float) v.z()); + } + + inline __host__ Matrix3x3f make_matrix_f3x3(const matrix3x3d &a) + { + Matrix3x3f m; + m.m_row[0] = make_float4((float) a(0, 0), (float) a(0, 1), (float) a(0, 2), 0.0f); + m.m_row[1] = make_float4((float) a(1, 0), (float) 
a(1, 1), (float) a(1, 2), 0.0f); + m.m_row[2] = make_float4((float) a(2, 0), (float) a(2, 1), (float) a(2, 2), 0.0f); + return m; + } + + inline __host__ Matrix4x4f make_matrix_f4x4(const matrix4x4d &a) + { + Matrix4x4f m; + m.m_row[0] = make_float4((float) a(0, 0), (float) a(0, 1), (float) a(0, 2), (float) a(0, 3)); + m.m_row[1] = make_float4((float) a(1, 0), (float) a(1, 1), (float) a(1, 2), (float) a(1, 3)); + m.m_row[2] = make_float4((float) a(2, 0), (float) a(2, 1), (float) a(2, 2), (float) a(2, 3)); + m.m_row[3] = make_float4((float) a(3, 0), (float) a(3, 1), (float) a(3, 2), (float) a(3, 3)); + return m; + } + +//______DEVICE_CODE_____________________________________________________________________________________________________ + +#ifdef DEBUG + inline __device__ void print_matrix_f3x3(const Matrix3x3f &m) + { + printf("[\n"); + printf(" [%f, %f, %f],\n", m.m_row[0].x, m.m_row[0].y, m.m_row[0].z); + printf(" [%f, %f, %f],\n", m.m_row[1].x, m.m_row[1].y, m.m_row[1].z); + printf(" [%f, %f, %f],\n", m.m_row[2].x, m.m_row[2].y, m.m_row[2].z); + printf("]\n"); + } +#endif + + inline __device__ Matrix3x3f make_matrix_f3x3(float a00, float a01, float a02, float a10, float a11, float a12, float a20, float a21, float a22) + { + Matrix3x3f m; + m.m_row[0] = make_float4(a00, a01, a02, 0.0f); + m.m_row[1] = make_float4(a10, a11, a12, 0.0f); + m.m_row[2] = make_float4(a20, a21, a22, 0.0f); + return m; + } + + inline __device__ Matrix3x3f make_zero_matrix_f3x3() + { + Matrix3x3f m; + m.m_row[0] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + m.m_row[1] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + m.m_row[2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + return m; + } + + inline __device__ Matrix3x3f make_eye_matrix_f3x3() + { + Matrix3x3f m; + m.m_row[0] = make_float4(1.0f, 0.0f, 0.0f, 0.0f); + m.m_row[1] = make_float4(0.0f, 1.0f, 0.0f, 0.0f); + m.m_row[2] = make_float4(0.0f, 0.0f, 1.0f, 0.0f); + return m; + } + + inline __device__ Matrix3x3f transpose_f3x3(const Matrix3x3f &m) + { + 
Matrix3x3f t; + t.m_row[0] = make_float4(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.0f); + t.m_row[1] = make_float4(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.0f); + t.m_row[2] = make_float4(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.0f); + return t; + } + + inline __device__ Matrix3x3f add_f3x3(const Matrix3x3f& a, const Matrix3x3f& b) + { + Matrix3x3f m; + m.m_row[0] = a.m_row[0] + b.m_row[0]; + m.m_row[1] = a.m_row[1] + b.m_row[1]; + m.m_row[2] = a.m_row[2] + b.m_row[2]; + return m; + } + + inline __device__ Matrix3x3f mul_f3x3(const Matrix3x3f &a, const Matrix3x3f &b) + { + Matrix3x3f bt = transpose_f3x3(b); + Matrix3x3f res; + res.m_row[0] = make_float4(dot(a.m_row[0], bt.m_row[0]), dot(a.m_row[0], bt.m_row[1]), dot(a.m_row[0], bt.m_row[2]), 0.0f); + res.m_row[1] = make_float4(dot(a.m_row[1], bt.m_row[0]), dot(a.m_row[1], bt.m_row[1]), dot(a.m_row[1], bt.m_row[2]), 0.0f); + res.m_row[2] = make_float4(dot(a.m_row[2], bt.m_row[0]), dot(a.m_row[2], bt.m_row[1]), dot(a.m_row[2], bt.m_row[2]), 0.0f); + return res; + } + + inline __device__ Matrix3x3f mul_f_f3x3(float k, const Matrix3x3f& a) + { + Matrix3x3f m; + m.m_row[0] = a.m_row[0] * k; + m.m_row[1] = a.m_row[1] * k; + m.m_row[2] = a.m_row[2] * k; + return m; + } + + inline __device__ float3 mul_f3x3_f3(const Matrix3x3f& a, const float3 &b) + { + return make_float3(a.m_row[0].x * b.x + a.m_row[0].y * b.y + a.m_row[0].z * b.z, + a.m_row[1].x * b.x + a.m_row[1].y * b.y + a.m_row[1].z * b.z, + a.m_row[2].x * b.x + a.m_row[2].y * b.y + a.m_row[2].z * b.z); + } + + inline __device__ float2 transformPoint_f3x3(const Matrix3x3f &m, const float2 &p) + { + float3 temp = mul_f3x3_f3(m, make_float3(p.x, p.y, 1.0f)); + return make_float2(temp.x, temp.y) / temp.z; + } + +#ifdef DEBUG + inline __device__ void print_matrix_f4x4(const Matrix4x4f &m) + { + printf("[\n"); + printf(" [%f, %f, %f, %f],\n", m.m_row[0].x, m.m_row[0].y, m.m_row[0].z, m.m_row[0].w); + printf(" [%f, %f, %f, %f],\n", m.m_row[1].x, m.m_row[1].y, 
m.m_row[1].z, m.m_row[1].w); + printf(" [%f, %f, %f, %f],\n", m.m_row[2].x, m.m_row[2].y, m.m_row[2].z, m.m_row[2].w); + printf(" [%f, %f, %f, %f],\n", m.m_row[3].x, m.m_row[3].y, m.m_row[3].z, m.m_row[3].w); + printf("]\n"); + } +#endif + + inline __device__ Matrix4x4f make_matrix_f4x4(float a00, float a01, float a02, float a03, + float a10, float a11, float a12, float a13, + float a20, float a21, float a22, float a23, + float a30, float a31, float a32, float a33) + { + Matrix4x4f m; + m.m_row[0] = make_float4(a00, a01, a02, a03); + m.m_row[1] = make_float4(a10, a11, a12, a13); + m.m_row[2] = make_float4(a20, a21, a22, a23); + m.m_row[3] = make_float4(a30, a31, a32, a33); + return m; + } + + inline __device__ Matrix4x4f make_translation_f4x4(const float3 &t) + { + return make_matrix_f4x4(1.0f, 0.0f, 0.0f, t.x, + 0.0f, 1.0f, 0.0f, t.y, + 0.0f, 0.0f, 1.0f, t.z, + 0.0f, 0.0f, 0.0f, 1.0f); + } + + inline __device__ Matrix4x4f make_rotation_f4x4(const Matrix3x3f &r) + { + Matrix4x4f m; + m.m_row[0] = r.m_row[0]; + m.m_row[1] = r.m_row[1]; + m.m_row[2] = r.m_row[2]; + + m.m_row[0].w = 0.0f; + m.m_row[1].w = 0.0f; + m.m_row[2].w = 0.0f; + m.m_row[3] = make_float4(0.0f, 0.0f, 0.0f, 1.0f); + return m; + } + + inline __device__ float3 extract_translation_f4x4(const Matrix4x4f &m) + { + float norm = 1.0f / m.m_row[3].w; + return make_float3(m.m_row[0].w, m.m_row[1].w, m.m_row[2].w) * norm; + } + + inline __device__ Matrix3x3f extract_rotation_f4x4(const Matrix4x4f &m) + { + Matrix3x3f R = make_matrix_f3x3( + m.m_row[0].x, m.m_row[0].y, m.m_row[0].z, + m.m_row[1].x, m.m_row[1].y, m.m_row[1].z, + m.m_row[2].x, m.m_row[2].y, m.m_row[2].z); + + // matrix4x4f.scale3() + Matrix3x3f MtM = mul_f3x3(transpose_f3x3(R), R); + + float3 d = make_float3(MtM.m_row[0].x, MtM.m_row[1].y, MtM.m_row[2].z); + + if (d.x > 0) d.x = sqrtf(d.x); + if (d.y > 0) d.y = sqrtf(d.y); + if (d.z > 0) d.z = sqrtf(d.z); + + float3 s = d; + + if (s.x) s.x = 1.0f / s.x; + if (s.y) s.y = 1.0f / s.y; + if (s.z) 
s.z = 1.0f / s.z; + + return mul_f3x3(R, make_matrix_f3x3(s.x, 0.0f, 0.0f, + 0.0f, s.y, 0.0f, + 0.0f, 0.0f, s.z)); + } + + inline __device__ Matrix4x4f transpose_f4x4(const Matrix4x4f &m) + { + Matrix4x4f t; + t.m_row[0] = make_float4(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, m.m_row[3].x); + t.m_row[1] = make_float4(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, m.m_row[3].y); + t.m_row[2] = make_float4(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, m.m_row[3].z); + t.m_row[3] = make_float4(m.m_row[0].w, m.m_row[1].w, m.m_row[2].w, m.m_row[3].w); + return t; + } + + inline __device__ Matrix4x4f mul_f4x4(const Matrix4x4f &a, const Matrix4x4f &b) + { + Matrix4x4f bt = transpose_f4x4(b); + Matrix4x4f res; + res.m_row[0] = make_float4(dot(a.m_row[0], bt.m_row[0]), dot(a.m_row[0], bt.m_row[1]), dot(a.m_row[0], bt.m_row[2]), dot(a.m_row[0], bt.m_row[3])); + res.m_row[1] = make_float4(dot(a.m_row[1], bt.m_row[0]), dot(a.m_row[1], bt.m_row[1]), dot(a.m_row[1], bt.m_row[2]), dot(a.m_row[1], bt.m_row[3])); + res.m_row[2] = make_float4(dot(a.m_row[2], bt.m_row[0]), dot(a.m_row[2], bt.m_row[1]), dot(a.m_row[2], bt.m_row[2]), dot(a.m_row[2], bt.m_row[3])); + res.m_row[3] = make_float4(dot(a.m_row[3], bt.m_row[0]), dot(a.m_row[3], bt.m_row[1]), dot(a.m_row[3], bt.m_row[2]), dot(a.m_row[3], bt.m_row[3])); + return res; + } + + inline __device__ float4 mul_f4x4_f4(const Matrix4x4f& a, const float4 &b) + { + return make_float4(a.m_row[0].x * b.x + a.m_row[0].y * b.y + a.m_row[0].z * b.z + a.m_row[0].w * b.w, + a.m_row[1].x * b.x + a.m_row[1].y * b.y + a.m_row[1].z * b.z + a.m_row[1].w * b.w, + a.m_row[2].x * b.x + a.m_row[2].y * b.y + a.m_row[2].z * b.z + a.m_row[2].w * b.w, + a.m_row[3].x * b.x + a.m_row[3].y * b.y + a.m_row[3].z * b.z + a.m_row[3].w * b.w); + } + + inline __device__ float3 transformPoint(const Matrix4x4f &m, const float3 &p) + { + float4 temp = mul_f4x4_f4(m, make_float4(p.x, p.y, p.z, 1.0f)); + return make_float3(temp.x, temp.y, temp.z) / temp.w; + } + + inline 
__device__ float3 transformVector(const Matrix4x4f &m, const float3 &v) + { + float4 temp = mul_f4x4_f4(m, make_float4(v.x, v.y, v.z, 0.0f)); + return make_float3(temp.x, temp.y, temp.z); + } + + inline __device__ float smootherstep(float edge0, float edge1, float x) + { + if (x < edge0) { + return 0.0f; + } else if (x >= edge1) { + return 1.0f; + } + + // Scale, and clamp x to 0..1 range + x = (x - edge0) / (edge1 - edge0); + // Evaluate polynomial + return x * x * x * (x * (x * 6.0f - 15.0f) + 10.0f); + } + +} + +#endif // pragma once \ No newline at end of file diff --git a/libs/gpu/libgpu/cuda/cu/opencl_translator.cu b/libs/gpu/libgpu/cuda/cu/opencl_translator.cu new file mode 100644 index 0000000..6866fee --- /dev/null +++ b/libs/gpu/libgpu/cuda/cu/opencl_translator.cu @@ -0,0 +1,96 @@ +#ifndef opencl_translator_cu // pragma once +#define opencl_translator_cu + +#ifdef __NVCC__ + +#ifndef STATIC_KEYWORD +#define STATIC_KEYWORD __device__ +#endif + +// See https://www.khronos.org/registry/OpenCL/sdk/1.0/docs/man/xhtml/functionQualifiers.html +#define vec_type_hint(typen) +#define work_group_size_hint(X, Y, Z) +#define reqd_work_group_size(X, Y, Z) + +#define __kernel __global__ +#define __global +#define __local __shared__ +#define __constant __constant__ + +typedef unsigned int uint; + +// https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/barrier.html +enum cl_mem_fence_flags +{ + CLK_LOCAL_MEM_FENCE, + CLK_GLOBAL_MEM_FENCE +}; + +STATIC_KEYWORD void barrier(cl_mem_fence_flags flags) +{ + __syncthreads(); +} + +// https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/workItemFunctions.html +STATIC_KEYWORD size_t getXYZByIndex(dim3 xyz, uint dimindx) +{ + if (dimindx == 2) { + return xyz.z; + } else if (dimindx == 1) { + return xyz.y; + } else { + return xyz.x; + } +} + +STATIC_KEYWORD size_t get_global_size (uint dimindx) { + return getXYZByIndex(gridDim, dimindx) * getXYZByIndex(blockDim, dimindx); +} + +STATIC_KEYWORD size_t 
get_global_id (uint dimindx) { + return getXYZByIndex(blockIdx, dimindx) * getXYZByIndex(blockDim, dimindx) + getXYZByIndex(threadIdx, dimindx); +} + +STATIC_KEYWORD size_t get_local_size (uint dimindx) { + return getXYZByIndex(blockDim, dimindx); +} + +STATIC_KEYWORD size_t get_local_id (uint dimindx) { + return getXYZByIndex(threadIdx, dimindx); +} + +STATIC_KEYWORD size_t get_num_groups (uint dimindx) { + return getXYZByIndex(gridDim, dimindx); +} + +STATIC_KEYWORD size_t get_group_id (uint dimindx) { + return getXYZByIndex(blockIdx, dimindx); +} + +STATIC_KEYWORD uint get_work_dim() +{ + if (get_global_size(2) > 1) { + return 3; + } else if (get_global_size(1) > 1) { + return 2; + } else { + return 1; + } +} + +#define WARP_SIZE 32 + +#endif + +#ifdef __CUDA_ARCH__ +#define DEVICE_CODE +#else +#define HOST_CODE +#endif + +#include +#include +#include +#include + +#endif // pragma once \ No newline at end of file diff --git a/libs/gpu/libgpu/cuda/cuda_api.cpp b/libs/gpu/libgpu/cuda/cuda_api.cpp new file mode 100644 index 0000000..21aaf26 --- /dev/null +++ b/libs/gpu/libgpu/cuda/cuda_api.cpp @@ -0,0 +1,158 @@ +#ifdef CUDA_SUPPORT +#include "cuda_api.h" + +#ifdef _WIN32 + +#include + +typedef HMODULE CudaLibrary; + +static HMODULE cudaLoadLibrary() +{ + return LoadLibraryW(L"nvcuda.dll"); +} + +static FARPROC cudaGetProcAddress(HMODULE hModule, LPCSTR lpProcName) +{ + return ::GetProcAddress(hModule, lpProcName); +} + +#elif defined(__unix__) || defined(__APPLE__) || defined(__MACOSX) + +#include + +typedef void * CudaLibrary; + +static CudaLibrary cudaLoadLibrary() +{ +#if defined(__APPLE__) || defined(__MACOSX) + return dlopen("/Library/Frameworks/CUDA.framework/Versions/Current/CUDA", RTLD_NOW); +#else + return dlopen("libcuda.so", RTLD_NOW); +#endif +} + +static void *cudaGetProcAddress(void *handle, const char *symbol) +{ + return dlsym(handle, symbol); +} + +#else +#error unsupported platform +#endif + +namespace cuda { + +std::string 
driverErrorString(CUresult code) +{ +#define DEFINE_ERROR(value) case value: return #value; + + switch (code) { + DEFINE_ERROR(CUDA_SUCCESS) + DEFINE_ERROR(CUDA_ERROR_INVALID_VALUE) + DEFINE_ERROR(CUDA_ERROR_OUT_OF_MEMORY) + DEFINE_ERROR(CUDA_ERROR_NOT_INITIALIZED) + DEFINE_ERROR(CUDA_ERROR_DEINITIALIZED) + DEFINE_ERROR(CUDA_ERROR_PROFILER_DISABLED) + DEFINE_ERROR(CUDA_ERROR_PROFILER_NOT_INITIALIZED) + DEFINE_ERROR(CUDA_ERROR_PROFILER_ALREADY_STARTED) + DEFINE_ERROR(CUDA_ERROR_PROFILER_ALREADY_STOPPED) + DEFINE_ERROR(CUDA_ERROR_NO_DEVICE) + DEFINE_ERROR(CUDA_ERROR_INVALID_DEVICE) + DEFINE_ERROR(CUDA_ERROR_INVALID_IMAGE) + DEFINE_ERROR(CUDA_ERROR_INVALID_CONTEXT) + DEFINE_ERROR(CUDA_ERROR_CONTEXT_ALREADY_CURRENT) + DEFINE_ERROR(CUDA_ERROR_MAP_FAILED) + DEFINE_ERROR(CUDA_ERROR_UNMAP_FAILED) + DEFINE_ERROR(CUDA_ERROR_ARRAY_IS_MAPPED) + DEFINE_ERROR(CUDA_ERROR_ALREADY_MAPPED) + DEFINE_ERROR(CUDA_ERROR_NO_BINARY_FOR_GPU) + DEFINE_ERROR(CUDA_ERROR_ALREADY_ACQUIRED) + DEFINE_ERROR(CUDA_ERROR_NOT_MAPPED) + DEFINE_ERROR(CUDA_ERROR_NOT_MAPPED_AS_ARRAY) + DEFINE_ERROR(CUDA_ERROR_NOT_MAPPED_AS_POINTER) + DEFINE_ERROR(CUDA_ERROR_ECC_UNCORRECTABLE) + DEFINE_ERROR(CUDA_ERROR_UNSUPPORTED_LIMIT) + DEFINE_ERROR(CUDA_ERROR_CONTEXT_ALREADY_IN_USE) + DEFINE_ERROR(CUDA_ERROR_PEER_ACCESS_UNSUPPORTED) + DEFINE_ERROR(CUDA_ERROR_INVALID_SOURCE) + DEFINE_ERROR(CUDA_ERROR_FILE_NOT_FOUND) + DEFINE_ERROR(CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND) + DEFINE_ERROR(CUDA_ERROR_SHARED_OBJECT_INIT_FAILED) + DEFINE_ERROR(CUDA_ERROR_OPERATING_SYSTEM) + DEFINE_ERROR(CUDA_ERROR_INVALID_HANDLE) + DEFINE_ERROR(CUDA_ERROR_NOT_FOUND) + DEFINE_ERROR(CUDA_ERROR_NOT_READY) + DEFINE_ERROR(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES) + DEFINE_ERROR(CUDA_ERROR_LAUNCH_TIMEOUT) + DEFINE_ERROR(CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING) + DEFINE_ERROR(CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) + DEFINE_ERROR(CUDA_ERROR_PEER_ACCESS_NOT_ENABLED) + DEFINE_ERROR(CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE) + 
DEFINE_ERROR(CUDA_ERROR_CONTEXT_IS_DESTROYED) + DEFINE_ERROR(CUDA_ERROR_ASSERT) + DEFINE_ERROR(CUDA_ERROR_TOO_MANY_PEERS) + DEFINE_ERROR(CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED) + DEFINE_ERROR(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED) + DEFINE_ERROR(CUDA_ERROR_LAUNCH_FAILED) + DEFINE_ERROR(CUDA_ERROR_NOT_PERMITTED) + DEFINE_ERROR(CUDA_ERROR_NOT_SUPPORTED) + DEFINE_ERROR(CUDA_ERROR_UNKNOWN) + default: return "CUDA_ERROR_UNKNOWN_CODE_" + to_string(code); + } + +#undef DEFINE_ERROR +} + +std::string formatDriverError(CUresult code) +{ + return driverErrorString(code) + " (" + to_string(code) + ")"; +} + +} + +typedef CUresult (CUDAAPI * p_pfn_cuDeviceGet) (CUdevice *, int); +typedef CUresult (CUDAAPI * p_pfn_cuCtxCreate) (CUcontext *, unsigned int, CUdevice); +typedef CUresult (CUDAAPI * p_pfn_cuCtxDestroy) (CUcontext); + +p_pfn_cuDeviceGet pfn_cuDeviceGet = 0; +p_pfn_cuCtxCreate pfn_cuCtxCreate = 0; +p_pfn_cuCtxDestroy pfn_cuCtxDestroy = 0; + +bool cuda_api_init() +{ + if (pfn_cuCtxCreate) + return true; + + CudaLibrary lib = cudaLoadLibrary(); + if (!lib) + return false; + + pfn_cuDeviceGet = (p_pfn_cuDeviceGet) cudaGetProcAddress(lib, "cuDeviceGet"); + pfn_cuCtxCreate = (p_pfn_cuCtxCreate) cudaGetProcAddress(lib, "cuCtxCreate_v2"); + pfn_cuCtxDestroy = (p_pfn_cuCtxDestroy) cudaGetProcAddress(lib, "cuCtxDestroy_v2"); + + return true; +} + +CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal) +{ + if (!pfn_cuDeviceGet) return CUDA_ERROR_NOT_INITIALIZED; + + return pfn_cuDeviceGet(device, ordinal); +} + +CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev) +{ + if (!pfn_cuCtxCreate) return CUDA_ERROR_NOT_INITIALIZED; + + return pfn_cuCtxCreate(pctx, flags, dev); +} + +CUresult CUDAAPI cuCtxDestroy(CUcontext ctx) +{ + if (!pfn_cuCtxDestroy) return CUDA_ERROR_NOT_INITIALIZED; + + return pfn_cuCtxDestroy(ctx); +} +#endif diff --git a/libs/gpu/libgpu/cuda/cuda_api.h b/libs/gpu/libgpu/cuda/cuda_api.h new file mode 100644 index 
0000000..6e878cf --- /dev/null +++ b/libs/gpu/libgpu/cuda/cuda_api.h @@ -0,0 +1,29 @@ +#pragma once + +#include +#include + +bool cuda_api_init(); + +namespace cuda { + + std::string formatDriverError(CUresult code); + + static inline void reportErrorCU(CUresult err, int line, std::string prefix="") + { + if (CUDA_SUCCESS == err) + return; + + std::string message = prefix + formatDriverError(err) + " at line " + to_string(line); + + switch (err) { + case CUDA_ERROR_OUT_OF_MEMORY: + throw cuda_bad_alloc(message); + default: + throw cuda_exception(message); + } + } + + #define CU_SAFE_CALL(expr) cuda::reportErrorCU(expr, __LINE__) + +} diff --git a/libs/gpu/libgpu/cuda/enum.cpp b/libs/gpu/libgpu/cuda/enum.cpp new file mode 100644 index 0000000..6e3f243 --- /dev/null +++ b/libs/gpu/libgpu/cuda/enum.cpp @@ -0,0 +1,94 @@ +#ifdef CUDA_SUPPORT +#include +#include +#include +#include +#include "enum.h" +#include "utils.h" + +bool CUDAEnum::printInfo(int id) +{ + cudaError_t status; + + cudaDeviceProp prop; + status = cudaGetDeviceProperties(&prop, id); + if (status != cudaSuccess) + return false; + + int driverVersion = 239; + status = cudaDriverGetVersion(&driverVersion); + if (status != cudaSuccess) + return false; + + int runtimeVersion = 239; + status = cudaRuntimeGetVersion(&runtimeVersion); + if (status != cudaSuccess) + return false; + + std::cout << "Using device: " << prop.name << ", " << prop.multiProcessorCount << " compute units, " << (prop.totalGlobalMem >> 20) << " MB global memory, compute capability " << prop.major << "." 
<< prop.minor << std::endl; + std::cout << " driver version: " << driverVersion << ", runtime version: " << runtimeVersion << std::endl; + std::cout << " max work group size " << prop.maxThreadsPerBlock << std::endl; + std::cout << " max work item sizes [" << prop.maxThreadsDim[0] << ", " << prop.maxThreadsDim[1] << ", " << prop.maxThreadsDim[2] << "]" << std::endl; + return true; +} + +CUDAEnum::CUDAEnum() +{ +} + +CUDAEnum::~CUDAEnum() +{ +} + +bool CUDAEnum::compareDevice(const Device &dev1, const Device &dev2) +{ + if (dev1.name > dev2.name) return false; + if (dev1.name < dev2.name) return true; + if (dev1.id > dev2.id) return false; + return true; +} + +bool CUDAEnum::enumDevices() +{ + int device_count = 0; + + cudaError_t res = cudaGetDeviceCount(&device_count); + if (res == cudaErrorNoDevice || res == cudaErrorInsufficientDriver) + return true; + + if (res != cudaSuccess) { + std::cerr << "cudaGetDeviceCount failed: " << cuda::formatError(res) << std::endl; + return false; + } + + for (int device_index = 0; device_index < device_count; device_index++) { + cudaDeviceProp prop; + + res = cudaGetDeviceProperties(&prop, device_index); + if (res != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties failed: " << cuda::formatError(res) << std::endl; + return false; + } + + // we don't support CUDA devices with compute capability < 2.0 + if (prop.major < 2) + continue; + + Device device; + + device.id = device_index; + device.name = prop.name; + device.compute_units = prop.multiProcessorCount; + device.mem_size = prop.totalGlobalMem; + device.clock = prop.clockRate / 1000; + device.pci_bus_id = prop.pciBusID; + device.pci_device_id = prop.pciDeviceID; + + devices_.push_back(device); + } + + std::sort(devices_.begin(), devices_.end(), compareDevice); + + return true; +} +#endif diff --git a/libs/gpu/libgpu/cuda/enum.h b/libs/gpu/libgpu/cuda/enum.h new file mode 100644 index 0000000..f864e38 --- /dev/null +++ b/libs/gpu/libgpu/cuda/enum.h @@ -0,0 +1,45 @@ 
+#pragma once + +#include +#include + +#ifdef CUDA_SUPPORT + +class CUDAEnum { +public: + CUDAEnum(); + ~CUDAEnum(); + + class Device { + public: + Device() + { + id = 0; + compute_units = 0; + mem_size = 0; + clock = 0; + pci_bus_id = 0; + pci_device_id = 0; + } + + int id; + std::string name; + unsigned int compute_units; + unsigned long long mem_size; + unsigned int clock; + unsigned int pci_bus_id; + unsigned int pci_device_id; + }; + + bool enumDevices(); + std::vector & devices() { return devices_; } + + static bool printInfo(int id); + +protected: + static bool compareDevice(const Device &dev1, const Device &dev2); + + std::vector devices_; +}; + +#endif diff --git a/libs/gpu/libgpu/cuda/sdk/helper_math.h b/libs/gpu/libgpu/cuda/sdk/helper_math.h new file mode 100644 index 0000000..24b79b3 --- /dev/null +++ b/libs/gpu/libgpu/cuda/sdk/helper_math.h @@ -0,0 +1,1453 @@ + /** + * Copyright 1993-2013 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +/* + * This file implements common mathematical operations on vector types + * (float3, float4 etc.) since these are not provided as standard by CUDA. + * + * The syntax is modeled on the Cg standard library. + * + * This is part of the Helper library includes + * + * Thanks to Linh Hah for additions and fixes. 
+ */ + +#ifndef HELPER_MATH_H +#define HELPER_MATH_H + +#include "cuda_runtime.h" + +typedef unsigned int uint; +typedef unsigned short ushort; + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +#ifndef __CUDACC__ +#include + +//////////////////////////////////////////////////////////////////////////////// +// host implementations of CUDA functions +//////////////////////////////////////////////////////////////////////////////// + +inline float fminf(float a, float b) +{ + return a < b ? a : b; +} + +inline float fmaxf(float a, float b) +{ + return a > b ? a : b; +} + +inline int max(int a, int b) +{ + return a > b ? a : b; +} + +inline int min(int a, int b) +{ + return a < b ? a : b; +} + +inline float rsqrtf(float x) +{ + return 1.0f / sqrtf(x); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// constructors +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 make_float2(float s) +{ + return make_float2(s, s); +} +inline __host__ __device__ float2 make_float2(float3 a) +{ + return make_float2(a.x, a.y); +} +inline __host__ __device__ float2 make_float2(int2 a) +{ + return make_float2(float(a.x), float(a.y)); +} +inline __host__ __device__ float2 make_float2(uint2 a) +{ + return make_float2(float(a.x), float(a.y)); +} + +inline __host__ __device__ int2 make_int2(int s) +{ + return make_int2(s, s); +} +inline __host__ __device__ int2 make_int2(int3 a) +{ + return make_int2(a.x, a.y); +} +inline __host__ __device__ int2 make_int2(uint2 a) +{ + return make_int2(int(a.x), int(a.y)); +} +inline __host__ __device__ int2 make_int2(float2 a) +{ + return make_int2(int(a.x), int(a.y)); +} + +inline __host__ __device__ uint2 make_uint2(uint s) +{ + return make_uint2(s, s); +} +inline __host__ __device__ uint2 make_uint2(uint3 a) +{ + return make_uint2(a.x, a.y); +} +inline __host__ __device__ uint2 make_uint2(int2 a) +{ + return make_uint2(uint(a.x), 
uint(a.y)); +} + +inline __host__ __device__ float3 make_float3(float s) +{ + return make_float3(s, s, s); +} +inline __host__ __device__ float3 make_float3(float2 a) +{ + return make_float3(a.x, a.y, 0.0f); +} +inline __host__ __device__ float3 make_float3(float2 a, float s) +{ + return make_float3(a.x, a.y, s); +} +inline __host__ __device__ float3 make_float3(float4 a) +{ + return make_float3(a.x, a.y, a.z); +} +inline __host__ __device__ float3 make_float3(int3 a) +{ + return make_float3(float(a.x), float(a.y), float(a.z)); +} +inline __host__ __device__ float3 make_float3(uint3 a) +{ + return make_float3(float(a.x), float(a.y), float(a.z)); +} + +inline __host__ __device__ int3 make_int3(int s) +{ + return make_int3(s, s, s); +} +inline __host__ __device__ int3 make_int3(int2 a) +{ + return make_int3(a.x, a.y, 0); +} +inline __host__ __device__ int3 make_int3(int2 a, int s) +{ + return make_int3(a.x, a.y, s); +} +inline __host__ __device__ int3 make_int3(uint3 a) +{ + return make_int3(int(a.x), int(a.y), int(a.z)); +} +inline __host__ __device__ int3 make_int3(float3 a) +{ + return make_int3(int(a.x), int(a.y), int(a.z)); +} + +inline __host__ __device__ uint3 make_uint3(uint s) +{ + return make_uint3(s, s, s); +} +inline __host__ __device__ uint3 make_uint3(uint2 a) +{ + return make_uint3(a.x, a.y, 0); +} +inline __host__ __device__ uint3 make_uint3(uint2 a, uint s) +{ + return make_uint3(a.x, a.y, s); +} +inline __host__ __device__ uint3 make_uint3(uint4 a) +{ + return make_uint3(a.x, a.y, a.z); +} +inline __host__ __device__ uint3 make_uint3(int3 a) +{ + return make_uint3(uint(a.x), uint(a.y), uint(a.z)); +} + +inline __host__ __device__ float4 make_float4(float s) +{ + return make_float4(s, s, s, s); +} +inline __host__ __device__ float4 make_float4(float3 a) +{ + return make_float4(a.x, a.y, a.z, 0.0f); +} +inline __host__ __device__ float4 make_float4(float3 a, float w) +{ + return make_float4(a.x, a.y, a.z, w); +} +inline __host__ __device__ float4 
make_float4(int4 a) +{ + return make_float4(float(a.x), float(a.y), float(a.z), float(a.w)); +} +inline __host__ __device__ float4 make_float4(uint4 a) +{ + return make_float4(float(a.x), float(a.y), float(a.z), float(a.w)); +} + +inline __host__ __device__ int4 make_int4(int s) +{ + return make_int4(s, s, s, s); +} +inline __host__ __device__ int4 make_int4(int3 a) +{ + return make_int4(a.x, a.y, a.z, 0); +} +inline __host__ __device__ int4 make_int4(int3 a, int w) +{ + return make_int4(a.x, a.y, a.z, w); +} +inline __host__ __device__ int4 make_int4(uint4 a) +{ + return make_int4(int(a.x), int(a.y), int(a.z), int(a.w)); +} +inline __host__ __device__ int4 make_int4(float4 a) +{ + return make_int4(int(a.x), int(a.y), int(a.z), int(a.w)); +} + + +inline __host__ __device__ uint4 make_uint4(uint s) +{ + return make_uint4(s, s, s, s); +} +inline __host__ __device__ uint4 make_uint4(uint3 a) +{ + return make_uint4(a.x, a.y, a.z, 0); +} +inline __host__ __device__ uint4 make_uint4(uint3 a, uint w) +{ + return make_uint4(a.x, a.y, a.z, w); +} +inline __host__ __device__ uint4 make_uint4(int4 a) +{ + return make_uint4(uint(a.x), uint(a.y), uint(a.z), uint(a.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// negate +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator-(float2 &a) +{ + return make_float2(-a.x, -a.y); +} +inline __host__ __device__ int2 operator-(int2 &a) +{ + return make_int2(-a.x, -a.y); +} +inline __host__ __device__ float3 operator-(float3 &a) +{ + return make_float3(-a.x, -a.y, -a.z); +} +inline __host__ __device__ int3 operator-(int3 &a) +{ + return make_int3(-a.x, -a.y, -a.z); +} +inline __host__ __device__ float4 operator-(float4 &a) +{ + return make_float4(-a.x, -a.y, -a.z, -a.w); +} +inline __host__ __device__ int4 operator-(int4 &a) +{ + return make_int4(-a.x, -a.y, -a.z, -a.w); +} + 
+//////////////////////////////////////////////////////////////////////////////// +// addition +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator+(float2 a, float2 b) +{ + return make_float2(a.x + b.x, a.y + b.y); +} +inline __host__ __device__ void operator+=(float2 &a, float2 b) +{ + a.x += b.x; + a.y += b.y; +} +inline __host__ __device__ float2 operator+(float2 a, float b) +{ + return make_float2(a.x + b, a.y + b); +} +inline __host__ __device__ float2 operator+(float b, float2 a) +{ + return make_float2(a.x + b, a.y + b); +} +inline __host__ __device__ void operator+=(float2 &a, float b) +{ + a.x += b; + a.y += b; +} + +inline __host__ __device__ int2 operator+(int2 a, int2 b) +{ + return make_int2(a.x + b.x, a.y + b.y); +} +inline __host__ __device__ void operator+=(int2 &a, int2 b) +{ + a.x += b.x; + a.y += b.y; +} +inline __host__ __device__ int2 operator+(int2 a, int b) +{ + return make_int2(a.x + b, a.y + b); +} +inline __host__ __device__ int2 operator+(int b, int2 a) +{ + return make_int2(a.x + b, a.y + b); +} +inline __host__ __device__ void operator+=(int2 &a, int b) +{ + a.x += b; + a.y += b; +} + +inline __host__ __device__ uint2 operator+(uint2 a, uint2 b) +{ + return make_uint2(a.x + b.x, a.y + b.y); +} +inline __host__ __device__ void operator+=(uint2 &a, uint2 b) +{ + a.x += b.x; + a.y += b.y; +} +inline __host__ __device__ uint2 operator+(uint2 a, uint b) +{ + return make_uint2(a.x + b, a.y + b); +} +inline __host__ __device__ uint2 operator+(uint b, uint2 a) +{ + return make_uint2(a.x + b, a.y + b); +} +inline __host__ __device__ void operator+=(uint2 &a, uint b) +{ + a.x += b; + a.y += b; +} + + +inline __host__ __device__ float3 operator+(float3 a, float3 b) +{ + return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); +} +inline __host__ __device__ void operator+=(float3 &a, float3 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; +} +inline __host__ __device__ float3 
operator+(float3 a, float b) +{ + return make_float3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ void operator+=(float3 &a, float b) +{ + a.x += b; + a.y += b; + a.z += b; +} + +inline __host__ __device__ int3 operator+(int3 a, int3 b) +{ + return make_int3(a.x + b.x, a.y + b.y, a.z + b.z); +} +inline __host__ __device__ void operator+=(int3 &a, int3 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; +} +inline __host__ __device__ int3 operator+(int3 a, int b) +{ + return make_int3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ void operator+=(int3 &a, int b) +{ + a.x += b; + a.y += b; + a.z += b; +} + +inline __host__ __device__ uint3 operator+(uint3 a, uint3 b) +{ + return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z); +} +inline __host__ __device__ void operator+=(uint3 &a, uint3 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; +} +inline __host__ __device__ uint3 operator+(uint3 a, uint b) +{ + return make_uint3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ void operator+=(uint3 &a, uint b) +{ + a.x += b; + a.y += b; + a.z += b; +} + +inline __host__ __device__ int3 operator+(int b, int3 a) +{ + return make_int3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ uint3 operator+(uint b, uint3 a) +{ + return make_uint3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ float3 operator+(float b, float3 a) +{ + return make_float3(a.x + b, a.y + b, a.z + b); +} + +inline __host__ __device__ float4 operator+(float4 a, float4 b) +{ + return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} +inline __host__ __device__ void operator+=(float4 &a, float4 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; + a.w += b.w; +} +inline __host__ __device__ float4 operator+(float4 a, float b) +{ + return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ float4 operator+(float b, float4 a) +{ + return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ void 
operator+=(float4 &a, float b) +{ + a.x += b; + a.y += b; + a.z += b; + a.w += b; +} + +inline __host__ __device__ int4 operator+(int4 a, int4 b) +{ + return make_int4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} +inline __host__ __device__ void operator+=(int4 &a, int4 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; + a.w += b.w; +} +inline __host__ __device__ int4 operator+(int4 a, int b) +{ + return make_int4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ int4 operator+(int b, int4 a) +{ + return make_int4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ void operator+=(int4 &a, int b) +{ + a.x += b; + a.y += b; + a.z += b; + a.w += b; +} + +inline __host__ __device__ uint4 operator+(uint4 a, uint4 b) +{ + return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} +inline __host__ __device__ void operator+=(uint4 &a, uint4 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; + a.w += b.w; +} +inline __host__ __device__ uint4 operator+(uint4 a, uint b) +{ + return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ uint4 operator+(uint b, uint4 a) +{ + return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ void operator+=(uint4 &a, uint b) +{ + a.x += b; + a.y += b; + a.z += b; + a.w += b; +} + +//////////////////////////////////////////////////////////////////////////////// +// subtract +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator-(float2 a, float2 b) +{ + return make_float2(a.x - b.x, a.y - b.y); +} +inline __host__ __device__ void operator-=(float2 &a, float2 b) +{ + a.x -= b.x; + a.y -= b.y; +} +inline __host__ __device__ float2 operator-(float2 a, float b) +{ + return make_float2(a.x - b, a.y - b); +} +inline __host__ __device__ float2 operator-(float b, float2 a) +{ + return make_float2(b - a.x, b - a.y); +} +inline __host__ __device__ void operator-=(float2 &a, float b) +{ 
+ a.x -= b; + a.y -= b; +} + +inline __host__ __device__ int2 operator-(int2 a, int2 b) +{ + return make_int2(a.x - b.x, a.y - b.y); +} +inline __host__ __device__ void operator-=(int2 &a, int2 b) +{ + a.x -= b.x; + a.y -= b.y; +} +inline __host__ __device__ int2 operator-(int2 a, int b) +{ + return make_int2(a.x - b, a.y - b); +} +inline __host__ __device__ int2 operator-(int b, int2 a) +{ + return make_int2(b - a.x, b - a.y); +} +inline __host__ __device__ void operator-=(int2 &a, int b) +{ + a.x -= b; + a.y -= b; +} + +inline __host__ __device__ uint2 operator-(uint2 a, uint2 b) +{ + return make_uint2(a.x - b.x, a.y - b.y); +} +inline __host__ __device__ void operator-=(uint2 &a, uint2 b) +{ + a.x -= b.x; + a.y -= b.y; +} +inline __host__ __device__ uint2 operator-(uint2 a, uint b) +{ + return make_uint2(a.x - b, a.y - b); +} +inline __host__ __device__ uint2 operator-(uint b, uint2 a) +{ + return make_uint2(b - a.x, b - a.y); +} +inline __host__ __device__ void operator-=(uint2 &a, uint b) +{ + a.x -= b; + a.y -= b; +} + +inline __host__ __device__ float3 operator-(float3 a, float3 b) +{ + return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); +} +inline __host__ __device__ void operator-=(float3 &a, float3 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; +} +inline __host__ __device__ float3 operator-(float3 a, float b) +{ + return make_float3(a.x - b, a.y - b, a.z - b); +} +inline __host__ __device__ float3 operator-(float b, float3 a) +{ + return make_float3(b - a.x, b - a.y, b - a.z); +} +inline __host__ __device__ void operator-=(float3 &a, float b) +{ + a.x -= b; + a.y -= b; + a.z -= b; +} + +inline __host__ __device__ int3 operator-(int3 a, int3 b) +{ + return make_int3(a.x - b.x, a.y - b.y, a.z - b.z); +} +inline __host__ __device__ void operator-=(int3 &a, int3 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; +} +inline __host__ __device__ int3 operator-(int3 a, int b) +{ + return make_int3(a.x - b, a.y - b, a.z - b); +} +inline __host__ __device__ int3 
operator-(int b, int3 a) +{ + return make_int3(b - a.x, b - a.y, b - a.z); +} +inline __host__ __device__ void operator-=(int3 &a, int b) +{ + a.x -= b; + a.y -= b; + a.z -= b; +} + +inline __host__ __device__ uint3 operator-(uint3 a, uint3 b) +{ + return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z); +} +inline __host__ __device__ void operator-=(uint3 &a, uint3 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; +} +inline __host__ __device__ uint3 operator-(uint3 a, uint b) +{ + return make_uint3(a.x - b, a.y - b, a.z - b); +} +inline __host__ __device__ uint3 operator-(uint b, uint3 a) +{ + return make_uint3(b - a.x, b - a.y, b - a.z); +} +inline __host__ __device__ void operator-=(uint3 &a, uint b) +{ + a.x -= b; + a.y -= b; + a.z -= b; +} + +inline __host__ __device__ float4 operator-(float4 a, float4 b) +{ + return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); +} +inline __host__ __device__ void operator-=(float4 &a, float4 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; + a.w -= b.w; +} +inline __host__ __device__ float4 operator-(float4 a, float b) +{ + return make_float4(a.x - b, a.y - b, a.z - b, a.w - b); +} +inline __host__ __device__ void operator-=(float4 &a, float b) +{ + a.x -= b; + a.y -= b; + a.z -= b; + a.w -= b; +} + +inline __host__ __device__ int4 operator-(int4 a, int4 b) +{ + return make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); +} +inline __host__ __device__ void operator-=(int4 &a, int4 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; + a.w -= b.w; +} +inline __host__ __device__ int4 operator-(int4 a, int b) +{ + return make_int4(a.x - b, a.y - b, a.z - b, a.w - b); +} +inline __host__ __device__ int4 operator-(int b, int4 a) +{ + return make_int4(b - a.x, b - a.y, b - a.z, b - a.w); +} +inline __host__ __device__ void operator-=(int4 &a, int b) +{ + a.x -= b; + a.y -= b; + a.z -= b; + a.w -= b; +} + +inline __host__ __device__ uint4 operator-(uint4 a, uint4 b) +{ + return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - 
b.w); +} +inline __host__ __device__ void operator-=(uint4 &a, uint4 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; + a.w -= b.w; +} +inline __host__ __device__ uint4 operator-(uint4 a, uint b) +{ + return make_uint4(a.x - b, a.y - b, a.z - b, a.w - b); +} +inline __host__ __device__ uint4 operator-(uint b, uint4 a) +{ + return make_uint4(b - a.x, b - a.y, b - a.z, b - a.w); +} +inline __host__ __device__ void operator-=(uint4 &a, uint b) +{ + a.x -= b; + a.y -= b; + a.z -= b; + a.w -= b; +} + +//////////////////////////////////////////////////////////////////////////////// +// multiply +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator*(float2 a, float2 b) +{ + return make_float2(a.x * b.x, a.y * b.y); +} +inline __host__ __device__ void operator*=(float2 &a, float2 b) +{ + a.x *= b.x; + a.y *= b.y; +} +inline __host__ __device__ float2 operator*(float2 a, float b) +{ + return make_float2(a.x * b, a.y * b); +} +inline __host__ __device__ float2 operator*(float b, float2 a) +{ + return make_float2(b * a.x, b * a.y); +} +inline __host__ __device__ void operator*=(float2 &a, float b) +{ + a.x *= b; + a.y *= b; +} + +inline __host__ __device__ int2 operator*(int2 a, int2 b) +{ + return make_int2(a.x * b.x, a.y * b.y); +} +inline __host__ __device__ void operator*=(int2 &a, int2 b) +{ + a.x *= b.x; + a.y *= b.y; +} +inline __host__ __device__ int2 operator*(int2 a, int b) +{ + return make_int2(a.x * b, a.y * b); +} +inline __host__ __device__ int2 operator*(int b, int2 a) +{ + return make_int2(b * a.x, b * a.y); +} +inline __host__ __device__ void operator*=(int2 &a, int b) +{ + a.x *= b; + a.y *= b; +} + +inline __host__ __device__ uint2 operator*(uint2 a, uint2 b) +{ + return make_uint2(a.x * b.x, a.y * b.y); +} +inline __host__ __device__ void operator*=(uint2 &a, uint2 b) +{ + a.x *= b.x; + a.y *= b.y; +} +inline __host__ __device__ uint2 operator*(uint2 a, uint b) +{ + return 
make_uint2(a.x * b, a.y * b); +} +inline __host__ __device__ uint2 operator*(uint b, uint2 a) +{ + return make_uint2(b * a.x, b * a.y); +} +inline __host__ __device__ void operator*=(uint2 &a, uint b) +{ + a.x *= b; + a.y *= b; +} + +inline __host__ __device__ float3 operator*(float3 a, float3 b) +{ + return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); +} +inline __host__ __device__ void operator*=(float3 &a, float3 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; +} +inline __host__ __device__ float3 operator*(float3 a, float b) +{ + return make_float3(a.x * b, a.y * b, a.z * b); +} +inline __host__ __device__ float3 operator*(float b, float3 a) +{ + return make_float3(b * a.x, b * a.y, b * a.z); +} +inline __host__ __device__ void operator*=(float3 &a, float b) +{ + a.x *= b; + a.y *= b; + a.z *= b; +} + +inline __host__ __device__ int3 operator*(int3 a, int3 b) +{ + return make_int3(a.x * b.x, a.y * b.y, a.z * b.z); +} +inline __host__ __device__ void operator*=(int3 &a, int3 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; +} +inline __host__ __device__ int3 operator*(int3 a, int b) +{ + return make_int3(a.x * b, a.y * b, a.z * b); +} +inline __host__ __device__ int3 operator*(int b, int3 a) +{ + return make_int3(b * a.x, b * a.y, b * a.z); +} +inline __host__ __device__ void operator*=(int3 &a, int b) +{ + a.x *= b; + a.y *= b; + a.z *= b; +} + +inline __host__ __device__ uint3 operator*(uint3 a, uint3 b) +{ + return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z); +} +inline __host__ __device__ void operator*=(uint3 &a, uint3 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; +} +inline __host__ __device__ uint3 operator*(uint3 a, uint b) +{ + return make_uint3(a.x * b, a.y * b, a.z * b); +} +inline __host__ __device__ uint3 operator*(uint b, uint3 a) +{ + return make_uint3(b * a.x, b * a.y, b * a.z); +} +inline __host__ __device__ void operator*=(uint3 &a, uint b) +{ + a.x *= b; + a.y *= b; + a.z *= b; +} + +inline __host__ __device__ float4 operator*(float4 a, 
float4 b) +{ + return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); +} +inline __host__ __device__ void operator*=(float4 &a, float4 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; + a.w *= b.w; +} +inline __host__ __device__ float4 operator*(float4 a, float b) +{ + return make_float4(a.x * b, a.y * b, a.z * b, a.w * b); +} +inline __host__ __device__ float4 operator*(float b, float4 a) +{ + return make_float4(b * a.x, b * a.y, b * a.z, b * a.w); +} +inline __host__ __device__ void operator*=(float4 &a, float b) +{ + a.x *= b; + a.y *= b; + a.z *= b; + a.w *= b; +} + +inline __host__ __device__ int4 operator*(int4 a, int4 b) +{ + return make_int4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); +} +inline __host__ __device__ void operator*=(int4 &a, int4 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; + a.w *= b.w; +} +inline __host__ __device__ int4 operator*(int4 a, int b) +{ + return make_int4(a.x * b, a.y * b, a.z * b, a.w * b); +} +inline __host__ __device__ int4 operator*(int b, int4 a) +{ + return make_int4(b * a.x, b * a.y, b * a.z, b * a.w); +} +inline __host__ __device__ void operator*=(int4 &a, int b) +{ + a.x *= b; + a.y *= b; + a.z *= b; + a.w *= b; +} + +inline __host__ __device__ uint4 operator*(uint4 a, uint4 b) +{ + return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); +} +inline __host__ __device__ void operator*=(uint4 &a, uint4 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; + a.w *= b.w; +} +inline __host__ __device__ uint4 operator*(uint4 a, uint b) +{ + return make_uint4(a.x * b, a.y * b, a.z * b, a.w * b); +} +inline __host__ __device__ uint4 operator*(uint b, uint4 a) +{ + return make_uint4(b * a.x, b * a.y, b * a.z, b * a.w); +} +inline __host__ __device__ void operator*=(uint4 &a, uint b) +{ + a.x *= b; + a.y *= b; + a.z *= b; + a.w *= b; +} + +//////////////////////////////////////////////////////////////////////////////// +// divide +//////////////////////////////////////////////////////////////////////////////// + 
+inline __host__ __device__ float2 operator/(float2 a, float2 b) +{ + return make_float2(a.x / b.x, a.y / b.y); +} +inline __host__ __device__ void operator/=(float2 &a, float2 b) +{ + a.x /= b.x; + a.y /= b.y; +} +inline __host__ __device__ float2 operator/(float2 a, float b) +{ + return make_float2(a.x / b, a.y / b); +} +inline __host__ __device__ void operator/=(float2 &a, float b) +{ + a.x /= b; + a.y /= b; +} +inline __host__ __device__ float2 operator/(float b, float2 a) +{ + return make_float2(b / a.x, b / a.y); +} + +inline __host__ __device__ float3 operator/(float3 a, float3 b) +{ + return make_float3(a.x / b.x, a.y / b.y, a.z / b.z); +} +inline __host__ __device__ void operator/=(float3 &a, float3 b) +{ + a.x /= b.x; + a.y /= b.y; + a.z /= b.z; +} +inline __host__ __device__ float3 operator/(float3 a, float b) +{ + return make_float3(a.x / b, a.y / b, a.z / b); +} +inline __host__ __device__ void operator/=(float3 &a, float b) +{ + a.x /= b; + a.y /= b; + a.z /= b; +} +inline __host__ __device__ float3 operator/(float b, float3 a) +{ + return make_float3(b / a.x, b / a.y, b / a.z); +} + +inline __host__ __device__ float4 operator/(float4 a, float4 b) +{ + return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); +} +inline __host__ __device__ void operator/=(float4 &a, float4 b) +{ + a.x /= b.x; + a.y /= b.y; + a.z /= b.z; + a.w /= b.w; +} +inline __host__ __device__ float4 operator/(float4 a, float b) +{ + return make_float4(a.x / b, a.y / b, a.z / b, a.w / b); +} +inline __host__ __device__ void operator/=(float4 &a, float b) +{ + a.x /= b; + a.y /= b; + a.z /= b; + a.w /= b; +} +inline __host__ __device__ float4 operator/(float b, float4 a) +{ + return make_float4(b / a.x, b / a.y, b / a.z, b / a.w); +} + +//////////////////////////////////////////////////////////////////////////////// +// min +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 fminf(float2 a, float2 b) +{ + 
return make_float2(fminf(a.x,b.x), fminf(a.y,b.y)); +} +inline __host__ __device__ float3 fminf(float3 a, float3 b) +{ + return make_float3(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z)); +} +inline __host__ __device__ float4 fminf(float4 a, float4 b) +{ + return make_float4(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z), fminf(a.w,b.w)); +} + +inline __host__ __device__ int2 min(int2 a, int2 b) +{ + return make_int2(min(a.x,b.x), min(a.y,b.y)); +} +inline __host__ __device__ int3 min(int3 a, int3 b) +{ + return make_int3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z)); +} +inline __host__ __device__ int4 min(int4 a, int4 b) +{ + return make_int4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w)); +} + +inline __host__ __device__ uint2 min(uint2 a, uint2 b) +{ + return make_uint2(min(a.x,b.x), min(a.y,b.y)); +} +inline __host__ __device__ uint3 min(uint3 a, uint3 b) +{ + return make_uint3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z)); +} +inline __host__ __device__ uint4 min(uint4 a, uint4 b) +{ + return make_uint4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// max +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 fmaxf(float2 a, float2 b) +{ + return make_float2(fmaxf(a.x,b.x), fmaxf(a.y,b.y)); +} +inline __host__ __device__ float3 fmaxf(float3 a, float3 b) +{ + return make_float3(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z)); +} +inline __host__ __device__ float4 fmaxf(float4 a, float4 b) +{ + return make_float4(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z), fmaxf(a.w,b.w)); +} + +inline __host__ __device__ int2 max(int2 a, int2 b) +{ + return make_int2(max(a.x,b.x), max(a.y,b.y)); +} +inline __host__ __device__ int3 max(int3 a, int3 b) +{ + return make_int3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z)); +} +inline __host__ __device__ int4 max(int4 a, int4 b) +{ + return make_int4(max(a.x,b.x), 
max(a.y,b.y), max(a.z,b.z), max(a.w,b.w)); +} + +inline __host__ __device__ uint2 max(uint2 a, uint2 b) +{ + return make_uint2(max(a.x,b.x), max(a.y,b.y)); +} +inline __host__ __device__ uint3 max(uint3 a, uint3 b) +{ + return make_uint3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z)); +} +inline __host__ __device__ uint4 max(uint4 a, uint4 b) +{ + return make_uint4(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z), max(a.w,b.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// lerp +// - linear interpolation between a and b, based on value t in [0, 1] range +//////////////////////////////////////////////////////////////////////////////// + +inline __device__ __host__ float lerp(float a, float b, float t) +{ + return a + t*(b-a); +} +inline __device__ __host__ float2 lerp(float2 a, float2 b, float t) +{ + return a + t*(b-a); +} +inline __device__ __host__ float3 lerp(float3 a, float3 b, float t) +{ + return a + t*(b-a); +} +inline __device__ __host__ float4 lerp(float4 a, float4 b, float t) +{ + return a + t*(b-a); +} + +//////////////////////////////////////////////////////////////////////////////// +// clamp +// - clamp the value v to be in the range [a, b] +//////////////////////////////////////////////////////////////////////////////// + +inline __device__ __host__ float clamp(float f, float a, float b) +{ + return fmaxf(a, fminf(f, b)); +} +inline __device__ __host__ int clamp(int f, int a, int b) +{ + return max(a, min(f, b)); +} +inline __device__ __host__ uint clamp(uint f, uint a, uint b) +{ + return max(a, min(f, b)); +} + +inline __device__ __host__ float2 clamp(float2 v, float a, float b) +{ + return make_float2(clamp(v.x, a, b), clamp(v.y, a, b)); +} +inline __device__ __host__ float2 clamp(float2 v, float2 a, float2 b) +{ + return make_float2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y)); +} +inline __device__ __host__ float3 clamp(float3 v, float a, float b) +{ + return make_float3(clamp(v.x, a, b), clamp(v.y, a, b), 
clamp(v.z, a, b)); +} +inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b) +{ + return make_float3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); +} +inline __device__ __host__ float4 clamp(float4 v, float a, float b) +{ + return make_float4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b)); +} +inline __device__ __host__ float4 clamp(float4 v, float4 a, float4 b) +{ + return make_float4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w)); +} + +inline __device__ __host__ int2 clamp(int2 v, int a, int b) +{ + return make_int2(clamp(v.x, a, b), clamp(v.y, a, b)); +} +inline __device__ __host__ int2 clamp(int2 v, int2 a, int2 b) +{ + return make_int2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y)); +} +inline __device__ __host__ int3 clamp(int3 v, int a, int b) +{ + return make_int3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); +} +inline __device__ __host__ int3 clamp(int3 v, int3 a, int3 b) +{ + return make_int3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); +} +inline __device__ __host__ int4 clamp(int4 v, int a, int b) +{ + return make_int4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b)); +} +inline __device__ __host__ int4 clamp(int4 v, int4 a, int4 b) +{ + return make_int4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w)); +} + +inline __device__ __host__ uint2 clamp(uint2 v, uint a, uint b) +{ + return make_uint2(clamp(v.x, a, b), clamp(v.y, a, b)); +} +inline __device__ __host__ uint2 clamp(uint2 v, uint2 a, uint2 b) +{ + return make_uint2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y)); +} +inline __device__ __host__ uint3 clamp(uint3 v, uint a, uint b) +{ + return make_uint3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); +} +inline __device__ __host__ uint3 clamp(uint3 v, uint3 a, uint3 b) +{ + return make_uint3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, 
b.z)); +} +inline __device__ __host__ uint4 clamp(uint4 v, uint a, uint b) +{ + return make_uint4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b)); +} +inline __device__ __host__ uint4 clamp(uint4 v, uint4 a, uint4 b) +{ + return make_uint4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// dot product +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float dot(float2 a, float2 b) +{ + return a.x * b.x + a.y * b.y; +} +inline __host__ __device__ float dot(float3 a, float3 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z; +} +inline __host__ __device__ float dot(float4 a, float4 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; +} + +inline __host__ __device__ int dot(int2 a, int2 b) +{ + return a.x * b.x + a.y * b.y; +} +inline __host__ __device__ int dot(int3 a, int3 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z; +} +inline __host__ __device__ int dot(int4 a, int4 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; +} + +inline __host__ __device__ uint dot(uint2 a, uint2 b) +{ + return a.x * b.x + a.y * b.y; +} +inline __host__ __device__ uint dot(uint3 a, uint3 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z; +} +inline __host__ __device__ uint dot(uint4 a, uint4 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; +} + +//////////////////////////////////////////////////////////////////////////////// +// length +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float length(float2 v) +{ + return sqrtf(dot(v, v)); +} +inline __host__ __device__ float length(float3 v) +{ + return sqrtf(dot(v, v)); +} +inline __host__ __device__ float length(float4 v) +{ + return sqrtf(dot(v, v)); +} + 
+//////////////////////////////////////////////////////////////////////////////// +// normalize +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 normalize(float2 v) +{ + float invLen = rsqrtf(dot(v, v)); + return v * invLen; +} +inline __host__ __device__ float3 normalize(float3 v) +{ + float invLen = rsqrtf(dot(v, v)); + return v * invLen; +} +inline __host__ __device__ float4 normalize(float4 v) +{ + float invLen = rsqrtf(dot(v, v)); + return v * invLen; +} + +//////////////////////////////////////////////////////////////////////////////// +// floor +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 floorf(float2 v) +{ + return make_float2(floorf(v.x), floorf(v.y)); +} +inline __host__ __device__ float3 floorf(float3 v) +{ + return make_float3(floorf(v.x), floorf(v.y), floorf(v.z)); +} +inline __host__ __device__ float4 floorf(float4 v) +{ + return make_float4(floorf(v.x), floorf(v.y), floorf(v.z), floorf(v.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// frac - returns the fractional portion of a scalar or each vector component +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float fracf(float v) +{ + return v - floorf(v); +} +inline __host__ __device__ float2 fracf(float2 v) +{ + return make_float2(fracf(v.x), fracf(v.y)); +} +inline __host__ __device__ float3 fracf(float3 v) +{ + return make_float3(fracf(v.x), fracf(v.y), fracf(v.z)); +} +inline __host__ __device__ float4 fracf(float4 v) +{ + return make_float4(fracf(v.x), fracf(v.y), fracf(v.z), fracf(v.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// fmod +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 fmodf(float2 a, float2 b) +{ + return 
make_float2(fmodf(a.x, b.x), fmodf(a.y, b.y)); +} +inline __host__ __device__ float3 fmodf(float3 a, float3 b) +{ + return make_float3(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z)); +} +inline __host__ __device__ float4 fmodf(float4 a, float4 b) +{ + return make_float4(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z), fmodf(a.w, b.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// absolute value +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 fabs(float2 v) +{ + return make_float2(fabs(v.x), fabs(v.y)); +} +inline __host__ __device__ float3 fabs(float3 v) +{ + return make_float3(fabs(v.x), fabs(v.y), fabs(v.z)); +} +inline __host__ __device__ float4 fabs(float4 v) +{ + return make_float4(fabs(v.x), fabs(v.y), fabs(v.z), fabs(v.w)); +} + +inline __host__ __device__ int2 abs(int2 v) +{ + return make_int2(abs(v.x), abs(v.y)); +} +inline __host__ __device__ int3 abs(int3 v) +{ + return make_int3(abs(v.x), abs(v.y), abs(v.z)); +} +inline __host__ __device__ int4 abs(int4 v) +{ + return make_int4(abs(v.x), abs(v.y), abs(v.z), abs(v.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// reflect +// - returns reflection of incident ray I around surface normal N +// - N should be normalized, reflected vector's length is equal to length of I +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float3 reflect(float3 i, float3 n) +{ + return i - 2.0f * n * dot(n,i); +} + +//////////////////////////////////////////////////////////////////////////////// +// cross product +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float3 cross(float3 a, float3 b) +{ + return make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x); +} + 
+//////////////////////////////////////////////////////////////////////////////// +// smoothstep +// - returns 0 if x < a +// - returns 1 if x > b +// - otherwise returns smooth interpolation between 0 and 1 based on x +//////////////////////////////////////////////////////////////////////////////// + +inline __device__ __host__ float smoothstep(float a, float b, float x) +{ + float y = clamp((x - a) / (b - a), 0.0f, 1.0f); + return (y*y*(3.0f - (2.0f*y))); +} +inline __device__ __host__ float2 smoothstep(float2 a, float2 b, float2 x) +{ + float2 y = clamp((x - a) / (b - a), 0.0f, 1.0f); + return (y*y*(make_float2(3.0f) - (make_float2(2.0f)*y))); +} +inline __device__ __host__ float3 smoothstep(float3 a, float3 b, float3 x) +{ + float3 y = clamp((x - a) / (b - a), 0.0f, 1.0f); + return (y*y*(make_float3(3.0f) - (make_float3(2.0f)*y))); +} +inline __device__ __host__ float4 smoothstep(float4 a, float4 b, float4 x) +{ + float4 y = clamp((x - a) / (b - a), 0.0f, 1.0f); + return (y*y*(make_float4(3.0f) - (make_float4(2.0f)*y))); +} + +#endif diff --git a/libs/gpu/libgpu/cuda/utils.cpp b/libs/gpu/libgpu/cuda/utils.cpp new file mode 100644 index 0000000..36a3794 --- /dev/null +++ b/libs/gpu/libgpu/cuda/utils.cpp @@ -0,0 +1,14 @@ +#ifdef CUDA_SUPPORT +#include "utils.h" +#include "cuda_api.h" + +namespace cuda { + +std::string formatError(cudaError_t code) +{ + return std::string(cudaGetErrorString(code)) + " (" + to_string(code) + ")"; +} + +} + +#endif diff --git a/libs/gpu/libgpu/cuda/utils.h b/libs/gpu/libgpu/cuda/utils.h new file mode 100644 index 0000000..b7c3bc5 --- /dev/null +++ b/libs/gpu/libgpu/cuda/utils.h @@ -0,0 +1,76 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#define CUDA_KERNELS_ACCURATE_ERRORS_CHECKS true + +#ifndef NDEBUG +#undef CUDA_KERNELS_ACCURATE_ERRORS_CHECKS +#define CUDA_KERNELS_ACCURATE_ERRORS_CHECKS true +#endif + +namespace cuda { + + class cuda_exception : public gpu::gpu_exception { + public: + 
cuda_exception(std::string msg) throw () : gpu_exception(msg) { } + cuda_exception(const char *msg) throw () : gpu_exception(msg) { } + cuda_exception() throw () : gpu_exception("CUDA exception") { } + }; + + class cuda_bad_alloc : public gpu::gpu_bad_alloc { + public: + cuda_bad_alloc(std::string msg) throw () : gpu_bad_alloc(msg) { } + cuda_bad_alloc(const char *msg) throw () : gpu_bad_alloc(msg) { } + cuda_bad_alloc() throw () : gpu_bad_alloc("CUDA exception") { } + }; + + std::string formatError(cudaError_t code); + + static inline void reportError(cudaError_t err, int line, std::string prefix="") + { + if (cudaSuccess == err) + return; + + std::string message = prefix + formatError(err) + " at line " + to_string(line); + + switch (err) { + case cudaErrorMemoryAllocation: + throw cuda_bad_alloc(message); + default: + throw cuda_exception(message); + } + } + + static inline void checkKernelErrors(cudaStream_t stream, int line) + { + reportError(cudaGetLastError(), line, "Kernel failed: "); + if (CUDA_KERNELS_ACCURATE_ERRORS_CHECKS) { + reportError(cudaStreamSynchronize(stream), line, "Kernel failed: "); + } + } + + #define CUDA_SAFE_CALL(expr) cuda::reportError(expr, __LINE__) + #define CUDA_CHECK_KERNEL(stream) cuda::checkKernelErrors(stream, __LINE__) + + template class DataTypeRange { }; + template<> class DataTypeRange { public: static __device__ unsigned char min() { return 0; } static __device__ unsigned char max() { return UCHAR_MAX; }}; + template<> class DataTypeRange { public: static __device__ unsigned short min() { return 0; } static __device__ unsigned short max() { return USHRT_MAX; }}; + template<> class DataTypeRange { public: static __device__ unsigned int min() { return 0; } static __device__ unsigned int max() { return UINT_MAX; }}; + template<> class DataTypeRange { public: static __device__ float min() { return FLT_MIN; } static __device__ float max() { return FLT_MAX; }}; + template<> class DataTypeRange { public: static __device__ double 
min() { return DBL_MIN; } static __device__ double max() { return DBL_MAX; }}; + + template class TypeHelper { }; + template<> class TypeHelper { public: typedef unsigned int type32; }; + template<> class TypeHelper { public: typedef unsigned int type32; }; + template<> class TypeHelper { public: typedef unsigned int type32; }; + template<> class TypeHelper { public: typedef float type32; }; + template<> class TypeHelper { public: typedef float type32; }; + +} diff --git a/libs/gpu/libgpu/device.cpp b/libs/gpu/libgpu/device.cpp new file mode 100644 index 0000000..f663dd4 --- /dev/null +++ b/libs/gpu/libgpu/device.cpp @@ -0,0 +1,174 @@ +#include "device.h" +#include "context.h" +#include +#include +#include + +#ifdef CUDA_SUPPORT +#include +#include +#include +#endif + +namespace gpu { + +std::vector enumDevices() +{ + std::vector devices; + +#ifdef CUDA_SUPPORT + CUDAEnum cuda_enum; + cuda_enum.enumDevices(); + + const std::vector &cuda_devices = cuda_enum.devices(); + for (size_t k = 0; k < cuda_devices.size(); k++) { + const CUDAEnum::Device &cuda_device = cuda_devices[k]; + + Device device; + device.name = cuda_device.name; + device.compute_units = cuda_device.compute_units; + device.clock = cuda_device.clock; + device.mem_size = cuda_device.mem_size; + device.pci_bus_id = cuda_device.pci_bus_id; + device.pci_device_id = cuda_device.pci_device_id; + device.supports_opencl = false; + device.supports_cuda = true; + device.device_id_opencl = 0; + device.device_id_cuda = cuda_device.id; + devices.push_back(device); + } +#endif + + OpenCLEnum opencl_enum; + opencl_enum.enumDevices(); + + const std::vector &opencl_devices = opencl_enum.devices(); + for (size_t k = 0; k < opencl_devices.size(); k++) { + const OpenCLEnum::Device &opencl_device = opencl_devices[k]; + + Device device; + device.name = opencl_device.name; + device.opencl_vendor = opencl_device.vendor; + device.opencl_version = opencl_device.version; + device.compute_units = opencl_device.compute_units; + 
device.clock = opencl_device.clock; + device.mem_size = opencl_device.mem_size; + device.pci_bus_id = opencl_device.nvidia_pci_bus_id; + device.pci_device_id = opencl_device.nvidia_pci_slot_id; + device.supports_opencl = true; + device.supports_cuda = false; + device.device_id_opencl = opencl_device.id; + device.device_id_cuda = 0; + devices.push_back(device); + } + +#ifdef CUDA_SUPPORT + std::sort(devices.begin(), devices.end()); + + // merge corresponding devices + for (size_t k = 0; k + 1 < devices.size(); k++) { + if (devices[k].name != devices[k + 1].name) continue; + if (devices[k].pci_bus_id != devices[k + 1].pci_bus_id) continue; + if (devices[k].pci_device_id != devices[k + 1].pci_device_id) continue; + + if (!devices[k].supports_opencl && !devices[k + 1].supports_cuda) { + devices[k].supports_opencl = true; + devices[k].device_id_opencl = devices[k + 1].device_id_opencl; + devices.erase(devices.begin() + k + 1); + } + } +#endif + + return devices; +} + +bool Device::printInfo() const +{ +#ifdef CUDA_SUPPORT + if (supports_cuda) { + return CUDAEnum::printInfo(device_id_cuda); + } +#endif + + if (supports_opencl) { + ocl::DeviceInfo device_info; + device_info.init(device_id_opencl); + device_info.print(); + return true; + } + + return false; +} + +bool Device::supportsFreeMemoryQuery() const +{ +#ifdef CUDA_SUPPORT + if (supports_cuda) { + return true; + } else +#endif + if (supports_opencl) { + ocl::DeviceInfo device_info; + device_info.init(device_id_opencl); + if (device_info.hasExtension(CL_AMD_DEVICE_ATTRIBUTE_QUERY_EXT)) { + return true; + } + } + + return false; +} + +unsigned long long Device::getFreeMemory() const +{ +#ifdef CUDA_SUPPORT + if (supports_cuda) { + Context context; + context.init(device_id_cuda); + context.activate(); + + size_t total_mem_size = 0; + size_t free_mem_size = 0; + CUDA_SAFE_CALL(cudaMemGetInfo(&free_mem_size, &total_mem_size)); + return free_mem_size; + } else +#endif + if (supports_opencl) { + ocl::DeviceInfo 
device_info; + device_info.init(device_id_opencl); + if (device_info.device_type == CL_DEVICE_TYPE_GPU && device_info.hasExtension(CL_AMD_DEVICE_ATTRIBUTE_QUERY_EXT)) { + cl_ulong free_mem = 0; + OCL_SAFE_CALL(clGetDeviceInfo(device_id_opencl, CL_DEVICE_GLOBAL_FREE_MEMORY_AMD, sizeof(free_mem), &free_mem, NULL)); + return free_mem * 1024; + } else { + size_t free_mem_size = mem_size - mem_size / 5; + return free_mem_size; + } + } else { + return 0x40000000ull * 64; // assume 64GB by default + } +} + +std::vector selectDevices(unsigned int mask, bool silent) +{ + if (!mask) + return std::vector(); + + std::vector devices = enumDevices(); + + std::vector res; + for (size_t k = 0; k < devices.size(); k++) { + if (!(mask & (1 << k))) + continue; + + Device &device = devices[k]; + if (!silent) + if (!device.printInfo()) + continue; + + res.push_back(device); + } + + return res; +} + +} diff --git a/libs/gpu/libgpu/device.h b/libs/gpu/libgpu/device.h new file mode 100644 index 0000000..0ae5e0a --- /dev/null +++ b/libs/gpu/libgpu/device.h @@ -0,0 +1,51 @@ +#pragma once + +#include +#include + +typedef struct _cl_device_id * cl_device_id; + +namespace gpu { + +class Device { +public: + std::string name; + std::string opencl_vendor; + std::string opencl_version; + unsigned int compute_units; + unsigned int clock; + unsigned long long mem_size; + unsigned int pci_bus_id; + unsigned int pci_device_id; + + bool supports_opencl; + bool supports_cuda; + + cl_device_id device_id_opencl; + int device_id_cuda; + + bool printInfo() const; + + bool supportsFreeMemoryQuery() const; + unsigned long long getFreeMemory() const; + + bool operator< (const Device &other) const + { + if (name < other.name) return true; + if (name > other.name) return false; + if (pci_bus_id < other.pci_bus_id) return true; + if (pci_bus_id > other.pci_bus_id) return false; + if (pci_device_id < other.pci_device_id) return true; + if (pci_device_id > other.pci_device_id) return false; + if (supports_opencl < 
other.supports_opencl) return true; + if (supports_opencl > other.supports_opencl) return false; + if (supports_cuda < other.supports_cuda) return true; + if (supports_cuda > other.supports_cuda) return false; + return false; + } +}; + +std::vector enumDevices(); +std::vector selectDevices(unsigned int mask, bool silent=false); + +} diff --git a/libs/gpu/libgpu/gold_helpers.cpp b/libs/gpu/libgpu/gold_helpers.cpp new file mode 100644 index 0000000..14a0d15 --- /dev/null +++ b/libs/gpu/libgpu/gold_helpers.cpp @@ -0,0 +1,96 @@ +#include "gold_helpers.h" + +#include +#include +#include +#include + +namespace gold { + + template + void host_data::init(const gpu::gpu_mem_any &gpu_data) { + size_t n = gpu_data.size() / sizeof(T); + data = std::vector(n); + gpu_data.read(data.data(), gpu_data.size()); + } + + template + void host_data::init(const gpu::shared_device_buffer_typed &gpu_data) { + size_t n = gpu_data.size() / sizeof(T); + data = std::vector(n); + gpu_data.readN(data.data(), n); + } + + template + bool host_data::operator==(const host_data &that) { + for (size_t i = 0; i < data.size(); ++i) { + if (data[i] != that.data[i]) { + return false; + } + } + return true; + } + + template + T diff(T a, T b) { + return std::max(a, b) - std::min(a, b); + } + + float diff(float a, float b) { + if (!std::isnan(a) && std::isnan(b)) { + return std::max(a, b) - std::min(a, b); + } else if (std::isnan(a) &&std::isnan(b)) { + return 0.0f; + } else if (std::isnan(a)) { + return std::abs(b); + } else { + assert(std::isnan(b)); + return std::abs(a); + } + } + + double diff(double a, double b) { + if (std::isnan(a) && std::isnan(b)) { + return std::max(a, b) - std::min(a, b); + } else if (std::isnan(a) &&std::isnan(b)) { + return 0.0; + } else if (std::isnan(a)) { + return std::abs(b); + } else { + assert(std::isnan(b)); + return std::abs(a); + } + } + + void ensure(bool condition, int line) { + if (!condition) { + std::cerr << "GOLD check filed at line " << line << "!" 
<< std::endl; + } + } + + template + void ensure_less(T a, T b, int line) { + if (a < b) { + return; + } else { + std::cerr << "Failed check: " << a << " < " << b << std::endl; + ensure(a < b, line); + } + } + + template class host_data; + template class host_data; + template class host_data; + template class host_data; + template class host_data; + template class host_data; + template class host_data; + template class host_data; + + template void ensure_less(uint8_t a, uint8_t b, int line); + template void ensure_less(uint16_t a, uint16_t b, int line); + template void ensure_less(uint32_t a, uint32_t b, int line); + template void ensure_less(float a, float b, int line); + template void ensure_less(double a, double b, int line); + +} diff --git a/libs/gpu/libgpu/gold_helpers.h b/libs/gpu/libgpu/gold_helpers.h new file mode 100644 index 0000000..5315eef --- /dev/null +++ b/libs/gpu/libgpu/gold_helpers.h @@ -0,0 +1,36 @@ +#pragma once + +#include "shared_device_buffer.h" + +#include + +#define GOLD_CHECK(condition) gold::ensure(condition, __LINE__) +#define GOLD_CHECK_LESS(a, b) gold::ensure_less(a, b, __LINE__) + +namespace gold { + + template + class host_data { + public: + host_data() {} + host_data(const gpu::gpu_mem_any& gpu_data) { init(gpu_data); }; + host_data(const gpu::shared_device_buffer_typed& gpu_data) { init(gpu_data); }; + + void init(const gpu::gpu_mem_any& gpu_data); + void init(const gpu::shared_device_buffer_typed& gpu_data); + + bool operator==(const host_data& that); + bool operator!=(const host_data& that) { return !(*this == that); } + + T* ptr() { return data.data(); } + + private: + std::vector data; + }; + + void ensure(bool condition, int line); + + template + void ensure_less(T a, T b, int line); + +} diff --git a/libs/gpu/libgpu/hexdumparray.cpp b/libs/gpu/libgpu/hexdumparray.cpp new file mode 100644 index 0000000..0c888af --- /dev/null +++ b/libs/gpu/libgpu/hexdumparray.cpp @@ -0,0 +1,74 @@ +#include +#include +#include +#include + +int 
main(int argc, char **argv) +{ + if (argc != 4) { + std::cerr << "Usage: " << argv[0] << " " << std::endl; + return 1; + } + + std::string sourceFilename(argv[1]); + std::string headerFilename(argv[2]); + std::string arrayName(argv[3]); + + std::ifstream fin(sourceFilename, std::ios::binary); + std::ofstream fout(headerFilename); + + if (!fin) { + std::cerr << "Can't open file " << sourceFilename << "!" << std::endl; + return 1; + } + + // #include + // + // static const char [] = { + // /*... hexadecimal data from source file ...*/ + // }; + // + // size_t _length = sizeof() / sizeof(char); + + fout << "#include " << std::endl; + fout << std::endl; + fout << "static const char " << arrayName << "[] = {" << std::endl; + + char buffer[2391]; + const int maxBytesInLine = 120 / 6; + int bytesInLine = 0; + + std::streamsize n; + do { + fin.read(buffer, sizeof(buffer) / sizeof(char)); + n = fin.gcount(); + for (std::streamsize i = 0; i < n; ++i) { + unsigned int value = (unsigned int) buffer[i]; + if (value > 0xff) { + value -= 0xffffff00; + } + if (value >= 128) { + fout << "-"; + value = 256 - value; + } + fout << "0x" << std::setw(2) << std::setfill('0') << std::hex << value << ", "; + ++bytesInLine; + if (bytesInLine == maxBytesInLine) { + fout << std::endl; + bytesInLine = 0; + } + } + } while (n > 0); + + if (bytesInLine > 0) + fout << std::endl; + + fout << "};" << std::endl; + fout << std::endl; + fout << "size_t " << arrayName << "_length = sizeof(" << arrayName << ") / sizeof(char);" << std::endl; + + fin.close(); + fout.close(); + + return 0; +} diff --git a/libs/gpu/libgpu/opencl/cl/c_template.cl b/libs/gpu/libgpu/opencl/cl/c_template.cl new file mode 100644 index 0000000..0970716 --- /dev/null +++ b/libs/gpu/libgpu/opencl/cl/c_template.cl @@ -0,0 +1,8 @@ +#ifndef c_template_cl // pragma once +#define c_template_cl + +#define T_DEPENDENT2(fun, suffix) fun ## _ ## suffix +#define T_DEPENDENT1(fun, suffix) T_DEPENDENT2(fun, suffix) +#define T_DEPENDENT(fun) 
T_DEPENDENT1(fun, T) + +#endif // pragma once diff --git a/libs/gpu/libgpu/opencl/cl/clion_defines.cl b/libs/gpu/libgpu/opencl/cl/clion_defines.cl new file mode 100644 index 0000000..942af70 --- /dev/null +++ b/libs/gpu/libgpu/opencl/cl/clion_defines.cl @@ -0,0 +1,74 @@ +#ifndef clion_defines_cl // pragma once +#define clion_defines_cl + +#ifdef __CLION_IDE__ + +#ifndef STATIC_KEYWORD +#define STATIC_KEYWORD static +#endif + +#define __kernel +#define __global +#define __local +#define __constant +#define __private + +#define half float + +struct float2 { float x; }; +struct float3 { float x, y, z; }; +struct float4 { float x, y, z, w; }; + +// https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/commonFunctions.html +#define gentype float +gentype clamp (gentype x, float minval, float maxval); +gentype degrees (gentype radians); +gentype max (gentype x, gentype y); +gentype min (gentype x, gentype y); +gentype mix (gentype x, gentype y, gentype a); +gentype radians (gentype degrees); +gentype sign (gentype x); +gentype smoothstep (gentype edge0, gentype edge1, gentype x); +gentype step (gentype edge, gentype x); +#undef gentype + +// https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/barrier.html +enum cl_mem_fence_flags +{ + CLK_LOCAL_MEM_FENCE, + CLK_GLOBAL_MEM_FENCE +}; +void barrier(cl_mem_fence_flags flags); + +// https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/vectorDataLoadandStoreFunctions.html +#define gentype float +#define gentypen float4 +gentypen vload4 (size_t offset, const gentype *p); +void vstore4 (gentypen data, size_t offset, gentype *p); +void vstore4 (gentypen data, size_t offset, gentype *p); +#undef gentypen +#undef gentype +float vload_half (size_t offset, const half *p); +float4 vload_half4 (size_t offset, const half *p); +void vstore_half (float data, size_t offset, half *p); +void vstore_half4 (float4 data, size_t offset, half *p); +float4 vloada_half4 (size_t offset, const half *p); +void 
vstorea_half4 (float4 data, size_t offset, half *p); + +// https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/workItemFunctions.html +size_t get_global_size (uint dimindx); +size_t get_global_id (uint dimindx); +size_t get_local_size (uint dimindx); +size_t get_local_id (uint dimindx); +size_t get_num_groups (uint dimindx); +size_t get_group_id (uint dimindx); +size_t get_global_offset (uint dimindx); +uint get_work_dim (); + +// Defined in libs/gpu/libgpu/opencl/engine.cpp:584 +// 64 for AMD, 32 for NVidia, 8 for intel GPUs, 1 for CPU +#define WARP_SIZE 64 + +#endif + +#endif // pragma once diff --git a/libs/gpu/libgpu/opencl/cl/common.cl b/libs/gpu/libgpu/opencl/cl/common.cl new file mode 100644 index 0000000..6f3dac1 --- /dev/null +++ b/libs/gpu/libgpu/opencl/cl/common.cl @@ -0,0 +1,427 @@ +#ifndef common_cl // pragma once +#define common_cl + +#line 5 + +#ifdef HOST_CODE +#include +#include +#else +#include "clion_defines.cl" +#endif + +//#define DEBUG + +#ifndef HOST_CODE +#ifdef DEBUG +#define printf_assert(condition, message) \ + if (!(condition)) printf("%s Line %d\n", message, __LINE__); +#else +#define printf_assert(condition, message) +#endif + +#define assert_isfinite(value) \ + printf_assert(isfinite(value), "Value should be finite!"); +#endif + +#define BOOL_TYPE int +#define BOOL_TRUE 1 +#define BOOL_FALSE 0 + +//______DEVICE_CODE_____________________________________________________________________________________________________ + +#ifndef HOST_CODE + #define make_uint3 (uint3) + #define make_uint4 (uint4) + #define make_float2 (float2) + #define make_float3 (float3) + #define make_float4 (float4) + + #define tanf tan + #define cosf cos + #define sinf sin + #define atan2f atan2 + #define atanf atan + #define asinf asin + + STATIC_KEYWORD float sqrtf(float x) + { + return sqrt(x); + } + + STATIC_KEYWORD float expf(float x) + { + return exp(x); + } + + #define norm(v) length(v) + #define norm2(v) dot(v, v) + + #define min3(x, y, z) 
(min(min(x, y), z)) + #define max3(x, y, z) (max(max(x, y), z)) + + #define cl_float4 float4 + + STATIC_KEYWORD uint3 fetch_uint3(__global const unsigned int* ptr, size_t index) + { + return make_uint3(ptr[3 * index + 0], ptr[3 * index + 1], ptr[3 * index + 2]); + } + + STATIC_KEYWORD uint4 fetch_uint4(__global const unsigned int* ptr, size_t index) + { + return make_uint4(ptr[4 * index + 0], ptr[4 * index + 1], ptr[4 * index + 2], ptr[4 * index + 3]); + } + + STATIC_KEYWORD float2 fetch_float2(__global const float* ptr, size_t index) + { + return make_float2(ptr[2 * index + 0], ptr[2 * index + 1]); + } + + STATIC_KEYWORD float3 fetch_float3(__global const float* ptr, size_t index) + { + return make_float3(ptr[3 * index + 0], ptr[3 * index + 1], ptr[3 * index + 2]); + } + + STATIC_KEYWORD float4 fetch_float4(__global const float* ptr, size_t index) + { + return make_float4(ptr[4 * index + 0], ptr[4 * index + 1], ptr[4 * index + 2], ptr[4 * index + 3]); + } + + STATIC_KEYWORD void set_uint3(__global unsigned int* ptr, size_t index, uint3 value) + { + ptr[3 * index + 0] = value.x; + ptr[3 * index + 1] = value.y; + ptr[3 * index + 2] = value.z; + } + + STATIC_KEYWORD void set_float3(__global float* ptr, size_t index, float3 value) + { + ptr[3 * index + 0] = value.x; + ptr[3 * index + 1] = value.y; + ptr[3 * index + 2] = value.z; + } + + STATIC_KEYWORD void atomic_add_f32(volatile __global float *address, float value) { + float old = value; + while ((old = atomic_xchg(address, atomic_xchg(address, 0.0f)+old))!=0.0f); + } + + STATIC_KEYWORD float atomic_cmpxchg_f32(volatile __global float *p, float cmp, float val) { + union { + unsigned int u32; + float f32; + } cmp_union, val_union, old_union; + + cmp_union.f32 = cmp; + val_union.f32 = val; + old_union.u32 = atomic_cmpxchg((volatile __global unsigned int *) p, cmp_union.u32, val_union.u32); + return old_union.f32; + } + + STATIC_KEYWORD float atomic_cmpxchg_float(volatile __global float *p, float cmp, float val) { + 
return atomic_cmpxchg_f32(p, cmp, val); + } + + STATIC_KEYWORD unsigned int atomic_cmpxchg_uint(volatile __global uint *p, uint cmp, uint val) { + return atomic_cmpxchg(p, cmp, val); + } + + STATIC_KEYWORD unsigned char rounded_cast_uchar(float value) { + return (unsigned char) (value + 0.5f); + } + + STATIC_KEYWORD unsigned short rounded_cast_ushort(float value) { + return (unsigned short) (value + 0.5f); + } + + STATIC_KEYWORD unsigned int rounded_cast_uint(float value) { + return (unsigned int) (value + 0.5f); + } + + STATIC_KEYWORD float rounded_cast_float(float value) { + return value; + } +#endif + +//______SHARED_STRUCTS__________________________________________________________________________________________________ + +// https://devtalk.nvidia.com/default/topic/673965/are-there-any-cuda-libararies-for-3x3-matrix-amp-vector3-amp-quaternion-operations-/ +typedef struct { + cl_float4 m_row[3]; +} Matrix3x3f; + +typedef struct { + cl_float4 m_row[4]; +} Matrix4x4f; + +//______HOST_CODE_______________________________________________________________________________________________________ + +#ifdef HOST_CODE + inline cl_float3 make_float3(const vector3d &a) + { + cl_float3 v = {(float) a.x(), (float) a.y(), (float) a.z()}; + return v; + } + + inline cl_float4 make_float4(const vector4d &a) + { + cl_float4 v = {(float) a.x(), (float) a.y(), (float) a.z(), (float) a.w()}; + return v; + } + + inline cl_float4 make_float4(float x, float y, float z, float w) + { + cl_float4 v = {x, y, z, w}; + return v; + } + + inline Matrix3x3f make_matrix_f3x3(const matrix3x3d &a) + { + Matrix3x3f m; + m.m_row[0] = make_float4((float) a(0, 0), (float) a(0, 1), (float) a(0, 2), 0.0f); + m.m_row[1] = make_float4((float) a(1, 0), (float) a(1, 1), (float) a(1, 2), 0.0f); + m.m_row[2] = make_float4((float) a(2, 0), (float) a(2, 1), (float) a(2, 2), 0.0f); + return m; + } + + inline Matrix4x4f make_matrix_f4x4(const matrix4x4d &a) + { + Matrix4x4f m; + m.m_row[0] = make_float4((float) 
a(0, 0), (float) a(0, 1), (float) a(0, 2), (float) a(0, 3)); + m.m_row[1] = make_float4((float) a(1, 0), (float) a(1, 1), (float) a(1, 2), (float) a(1, 3)); + m.m_row[2] = make_float4((float) a(2, 0), (float) a(2, 1), (float) a(2, 2), (float) a(2, 3)); + m.m_row[3] = make_float4((float) a(3, 0), (float) a(3, 1), (float) a(3, 2), (float) a(3, 3)); + return m; + } +#endif + +//______DEVICE_CODE_____________________________________________________________________________________________________ + +#ifndef HOST_CODE + +#ifdef DEBUG + STATIC_KEYWORD void print_matrix_f3x3(const Matrix3x3f m) + { + printf("[\n"); + printf(" [%f, %f, %f],\n", m.m_row[0].x, m.m_row[0].y, m.m_row[0].z); + printf(" [%f, %f, %f],\n", m.m_row[1].x, m.m_row[1].y, m.m_row[1].z); + printf(" [%f, %f, %f],\n", m.m_row[2].x, m.m_row[2].y, m.m_row[2].z); + printf("]\n"); + } +#endif + + STATIC_KEYWORD Matrix3x3f make_matrix_f3x3(float a00, float a01, float a02, float a10, float a11, float a12, float a20, float a21, float a22) + { + Matrix3x3f m; + m.m_row[0] = make_float4(a00, a01, a02, 0.0f); + m.m_row[1] = make_float4(a10, a11, a12, 0.0f); + m.m_row[2] = make_float4(a20, a21, a22, 0.0f); + return m; + } + + STATIC_KEYWORD Matrix3x3f make_zero_matrix_f3x3() + { + Matrix3x3f m; + m.m_row[0] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + m.m_row[1] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + m.m_row[2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + return m; + } + + STATIC_KEYWORD Matrix3x3f make_eye_matrix_f3x3() + { + Matrix3x3f m; + m.m_row[0] = make_float4(1.0f, 0.0f, 0.0f, 0.0f); + m.m_row[1] = make_float4(0.0f, 1.0f, 0.0f, 0.0f); + m.m_row[2] = make_float4(0.0f, 0.0f, 1.0f, 0.0f); + return m; + } + + STATIC_KEYWORD Matrix3x3f transpose_f3x3(const Matrix3x3f m) + { + Matrix3x3f t; + t.m_row[0] = make_float4(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.0f); + t.m_row[1] = make_float4(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.0f); + t.m_row[2] = make_float4(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.0f); + 
return t; + } + + STATIC_KEYWORD Matrix3x3f add_f3x3(const Matrix3x3f a, const Matrix3x3f b) + { + Matrix3x3f m; + m.m_row[0] = a.m_row[0] + b.m_row[0]; + m.m_row[1] = a.m_row[1] + b.m_row[1]; + m.m_row[2] = a.m_row[2] + b.m_row[2]; + return m; + } + + STATIC_KEYWORD Matrix3x3f mul_f3x3(const Matrix3x3f a, const Matrix3x3f b) + { + Matrix3x3f bt = transpose_f3x3(b); + Matrix3x3f res; + res.m_row[0] = make_float4(dot(a.m_row[0], bt.m_row[0]), dot(a.m_row[0], bt.m_row[1]), dot(a.m_row[0], bt.m_row[2]), 0.0f); + res.m_row[1] = make_float4(dot(a.m_row[1], bt.m_row[0]), dot(a.m_row[1], bt.m_row[1]), dot(a.m_row[1], bt.m_row[2]), 0.0f); + res.m_row[2] = make_float4(dot(a.m_row[2], bt.m_row[0]), dot(a.m_row[2], bt.m_row[1]), dot(a.m_row[2], bt.m_row[2]), 0.0f); + return res; + } + + STATIC_KEYWORD Matrix3x3f mul_f_f3x3(float k, const Matrix3x3f a) + { + Matrix3x3f m; + m.m_row[0] = a.m_row[0] * k; + m.m_row[1] = a.m_row[1] * k; + m.m_row[2] = a.m_row[2] * k; + return m; + } + + STATIC_KEYWORD float3 mul_f3x3_f3(const Matrix3x3f a, const float3 b) + { + return make_float3(a.m_row[0].x * b.x + a.m_row[0].y * b.y + a.m_row[0].z * b.z, + a.m_row[1].x * b.x + a.m_row[1].y * b.y + a.m_row[1].z * b.z, + a.m_row[2].x * b.x + a.m_row[2].y * b.y + a.m_row[2].z * b.z); + } + + STATIC_KEYWORD float2 transformPoint_f3x3(const Matrix3x3f m, const float2 p) + { + float3 temp = mul_f3x3_f3(m, make_float3(p.x, p.y, 1.0f)); + return make_float2(temp.x, temp.y) / temp.z; + } + +#ifdef DEBUG + STATIC_KEYWORD void print_matrix_f4x4(const Matrix4x4f m) + { + printf("[\n"); + printf(" [%f, %f, %f, %f],\n", m.m_row[0].x, m.m_row[0].y, m.m_row[0].z, m.m_row[0].w); + printf(" [%f, %f, %f, %f],\n", m.m_row[1].x, m.m_row[1].y, m.m_row[1].z, m.m_row[1].w); + printf(" [%f, %f, %f, %f],\n", m.m_row[2].x, m.m_row[2].y, m.m_row[2].z, m.m_row[2].w); + printf(" [%f, %f, %f, %f],\n", m.m_row[3].x, m.m_row[3].y, m.m_row[3].z, m.m_row[3].w); + printf("]\n"); + } +#endif + + STATIC_KEYWORD Matrix4x4f 
make_matrix_f4x4(float a00, float a01, float a02, float a03, + float a10, float a11, float a12, float a13, + float a20, float a21, float a22, float a23, + float a30, float a31, float a32, float a33) + { + Matrix4x4f m; + m.m_row[0] = make_float4(a00, a01, a02, a03); + m.m_row[1] = make_float4(a10, a11, a12, a13); + m.m_row[2] = make_float4(a20, a21, a22, a23); + m.m_row[3] = make_float4(a30, a31, a32, a33); + return m; + } + + STATIC_KEYWORD Matrix4x4f make_translation_f4x4(const float3 t) + { + return make_matrix_f4x4(1.0f, 0.0f, 0.0f, t.x, + 0.0f, 1.0f, 0.0f, t.y, + 0.0f, 0.0f, 1.0f, t.z, + 0.0f, 0.0f, 0.0f, 1.0f); + } + + STATIC_KEYWORD Matrix4x4f make_rotation_f4x4(const Matrix3x3f r) + { + Matrix4x4f m; + m.m_row[0] = r.m_row[0]; + m.m_row[1] = r.m_row[1]; + m.m_row[2] = r.m_row[2]; + + m.m_row[0].w = 0.0f; + m.m_row[1].w = 0.0f; + m.m_row[2].w = 0.0f; + m.m_row[3] = make_float4(0.0f, 0.0f, 0.0f, 1.0f); + return m; + } + + STATIC_KEYWORD float3 extract_translation_f4x4(const Matrix4x4f m) + { + float norm = 1.0f / m.m_row[3].w; + return make_float3(m.m_row[0].w, m.m_row[1].w, m.m_row[2].w) * norm; + } + + STATIC_KEYWORD Matrix3x3f extract_rotation_f4x4(const Matrix4x4f m) + { + Matrix3x3f R = make_matrix_f3x3( + m.m_row[0].x, m.m_row[0].y, m.m_row[0].z, + m.m_row[1].x, m.m_row[1].y, m.m_row[1].z, + m.m_row[2].x, m.m_row[2].y, m.m_row[2].z); + + // matrix4x4f.scale3() + Matrix3x3f MtM = mul_f3x3(transpose_f3x3(R), R); + + float3 d = make_float3(MtM.m_row[0].x, MtM.m_row[1].y, MtM.m_row[2].z); + + if (d.x > 0) d.x = sqrtf(d.x); + if (d.y > 0) d.y = sqrtf(d.y); + if (d.z > 0) d.z = sqrtf(d.z); + + float3 s = d; + + if (s.x) s.x = 1.0f / s.x; + if (s.y) s.y = 1.0f / s.y; + if (s.z) s.z = 1.0f / s.z; + + return mul_f3x3(R, make_matrix_f3x3(s.x, 0.0f, 0.0f, + 0.0f, s.y, 0.0f, + 0.0f, 0.0f, s.z)); + } + + STATIC_KEYWORD Matrix4x4f transpose_f4x4(const Matrix4x4f m) + { + Matrix4x4f t; + t.m_row[0] = make_float4(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, m.m_row[3].x); 
+ t.m_row[1] = make_float4(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, m.m_row[3].y); + t.m_row[2] = make_float4(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, m.m_row[3].z); + t.m_row[3] = make_float4(m.m_row[0].w, m.m_row[1].w, m.m_row[2].w, m.m_row[3].w); + return t; + } + + STATIC_KEYWORD Matrix4x4f mul_f4x4(const Matrix4x4f a, const Matrix4x4f b) + { + Matrix4x4f bt = transpose_f4x4(b); + Matrix4x4f res; + res.m_row[0] = make_float4(dot(a.m_row[0], bt.m_row[0]), dot(a.m_row[0], bt.m_row[1]), dot(a.m_row[0], bt.m_row[2]), dot(a.m_row[0], bt.m_row[3])); + res.m_row[1] = make_float4(dot(a.m_row[1], bt.m_row[0]), dot(a.m_row[1], bt.m_row[1]), dot(a.m_row[1], bt.m_row[2]), dot(a.m_row[1], bt.m_row[3])); + res.m_row[2] = make_float4(dot(a.m_row[2], bt.m_row[0]), dot(a.m_row[2], bt.m_row[1]), dot(a.m_row[2], bt.m_row[2]), dot(a.m_row[2], bt.m_row[3])); + res.m_row[3] = make_float4(dot(a.m_row[3], bt.m_row[0]), dot(a.m_row[3], bt.m_row[1]), dot(a.m_row[3], bt.m_row[2]), dot(a.m_row[3], bt.m_row[3])); + return res; + } + + STATIC_KEYWORD float4 mul_f4x4_f4(const Matrix4x4f a, const float4 b) + { + return make_float4(a.m_row[0].x * b.x + a.m_row[0].y * b.y + a.m_row[0].z * b.z + a.m_row[0].w * b.w, + a.m_row[1].x * b.x + a.m_row[1].y * b.y + a.m_row[1].z * b.z + a.m_row[1].w * b.w, + a.m_row[2].x * b.x + a.m_row[2].y * b.y + a.m_row[2].z * b.z + a.m_row[2].w * b.w, + a.m_row[3].x * b.x + a.m_row[3].y * b.y + a.m_row[3].z * b.z + a.m_row[3].w * b.w); + } + + STATIC_KEYWORD float3 transformPoint(const Matrix4x4f m, const float3 p) + { + float4 temp = mul_f4x4_f4(m, make_float4(p.x, p.y, p.z, 1.0f)); + return make_float3(temp.x, temp.y, temp.z) / temp.w; + } + + STATIC_KEYWORD float3 transformVector(const Matrix4x4f m, const float3 v) + { + float4 temp = mul_f4x4_f4(m, make_float4(v.x, v.y, v.z, 0.0f)); + return make_float3(temp.x, temp.y, temp.z); + } + + STATIC_KEYWORD float smootherstep(float edge0, float edge1, float x) + { + if (x < edge0) { + return 0.0f; + } else if (x >= 
edge1) { + return 1.0f; + } + + // Scale, and clamp x to 0..1 range + x = (x - edge0) / (edge1 - edge0); + // Evaluate polynomial + return x * x * x * (x * (x * 6.0f - 15.0f) + 10.0f); + } + +#endif + +#endif // pragma once diff --git a/libs/gpu/libgpu/opencl/device_info.cpp b/libs/gpu/libgpu/opencl/device_info.cpp new file mode 100644 index 0000000..30e875c --- /dev/null +++ b/libs/gpu/libgpu/opencl/device_info.cpp @@ -0,0 +1,204 @@ +#include "device_info.h" +#include "utils.h" +#include +#include + +namespace ocl { + +DeviceInfo::DeviceInfo() +{ + device_type = 0; + max_compute_units = 0; + max_mem_alloc_size = 0; + max_workgroup_size = 0; + max_work_item_sizes[0] = 0; + max_work_item_sizes[1] = 0; + max_work_item_sizes[2] = 0; + global_mem_size = 0; + device_address_bits = 0; + vendor_id = 0; + warp_size = 0; + wavefront_width = 0; + opencl_major_version = 0; + opencl_minor_version = 0; +} + +void DeviceInfo::print() const +{ + std::cout << "Using device: " << device_name << ", " << max_compute_units << " compute units, " << (global_mem_size >> 20) << " MB global memory," + " OpenCL " << opencl_major_version << "." 
<< opencl_minor_version << std::endl; + std::cout << " driver version: " << driver_version << ", platform version: " << platform_version << std::endl; + std::cout << " max work group size " << max_workgroup_size << std::endl; + std::cout << " max work item sizes [" << max_work_item_sizes[0] << ", " << max_work_item_sizes[1] << ", " << max_work_item_sizes[2] << "]" << std::endl; + std::cout << " max mem alloc size " << (max_mem_alloc_size >> 20) << " MB" << std::endl; + if (warp_size != 0) + std::cout << " warp size " << warp_size << std::endl; + if (wavefront_width != 0) + std::cout << " wavefront width " << wavefront_width << std::endl; +} + +void DeviceInfo::init(cl_device_id device_id) +{ + cl_device_type device_type = 0; + cl_uint max_compute_units = 0; // Number of compute units (SM's on NV GPU) + cl_uint max_work_item_dimensions = 0; + size_t max_workgroup_size = 0; + cl_uint vendor_id = 0; + cl_ulong max_mem_alloc_size = 0; + cl_ulong global_mem_size = 0; + cl_uint device_address_bits = 0; + char device_string[1024] = ""; + char vendor_string[1024] = ""; + char driver_version_string[1024] = ""; + char platform_version_string[1024]= ""; + + cl_platform_id platform_id = 0; + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DEVICE_PLATFORM, sizeof(platform_id), &platform_id, NULL)); + + OCL_SAFE_CALL(clGetPlatformInfo(platform_id, CL_PLATFORM_VERSION, sizeof(platform_version_string), &platform_version_string, NULL)); + + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL)); + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DEVICE_VENDOR, sizeof(vendor_string), &vendor_string, NULL)); + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DRIVER_VERSION, sizeof(driver_version_string), &driver_version_string, NULL)); + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DEVICE_TYPE, sizeof(cl_device_type), &device_type, NULL)); + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(max_compute_units), 
&max_compute_units, NULL)); + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(max_mem_alloc_size), &max_mem_alloc_size, NULL)); + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(max_work_item_dimensions), &max_work_item_dimensions, NULL)); + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_workgroup_size), &max_workgroup_size, NULL)); + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(global_mem_size), &global_mem_size, NULL)); + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DEVICE_ADDRESS_BITS, sizeof(device_address_bits), &device_address_bits, NULL)); + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DEVICE_VENDOR_ID, sizeof(vendor_id), &vendor_id, NULL)); + + std::vector max_work_item_sizes(max_work_item_dimensions); + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_ITEM_SIZES, max_work_item_dimensions * sizeof(size_t), max_work_item_sizes.data(), NULL)); + for (int i = 0; i < 3; i++) + this->max_work_item_sizes[i] = max_work_item_sizes[i]; + + this->device_name = std::string(device_string); + this->vendor_name = std::string(vendor_string); + this->device_type = device_type; + this->vendor_id = vendor_id; + this->max_compute_units = max_compute_units; + this->max_mem_alloc_size = max_mem_alloc_size; + this->max_workgroup_size = max_workgroup_size; + this->global_mem_size = global_mem_size; + this->device_address_bits = device_address_bits; + this->max_work_item_dimensions = max_work_item_dimensions; + this->driver_version = std::string(driver_version_string); + this->platform_version = std::string(platform_version_string); + + initExtensions(platform_id, device_id); + initOpenCLVersion(platform_id, device_id); + + if (device_type == CL_DEVICE_TYPE_GPU && vendor_id == ID_AMD && hasExtension(CL_AMD_DEVICE_ATTRIBUTE_QUERY_EXT)) { + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DEVICE_BOARD_NAME_AMD, sizeof(device_string), &device_string, 
NULL)); + this->device_name = std::string(device_string) + " (" + this->device_name + ")"; + } + + cl_uint warp_size = 0; + size_t wavefront_width = 0; + if (device_type == CL_DEVICE_TYPE_GPU) { + if (vendor_id == ID_NVIDIA && hasExtension(CL_NV_DEVICE_ATTRIBUTE_QUERY_EXT)) { + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DEVICE_WARP_SIZE_NV, sizeof(cl_uint), &warp_size, NULL)); + } else if (vendor_id == ID_AMD && hasExtension(CL_AMD_DEVICE_ATTRIBUTE_QUERY_EXT)) { + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DEVICE_WAVEFRONT_WIDTH_AMD, sizeof(cl_uint), &wavefront_width, NULL)); + } + } + + this->warp_size = warp_size; + this->wavefront_width = wavefront_width; +} + +void DeviceInfo::initOpenCLVersion(cl_platform_id platform_id, cl_device_id device_id) +{ + const int buffer_limit = 1024; + char buffer[buffer_limit]; + + int platform_major_version = 0; + int platform_minor_version = 0; + + OCL_SAFE_CALL(clGetPlatformInfo (platform_id, CL_PLATFORM_VERSION, buffer_limit, buffer, NULL)); + parseOpenCLVersion(buffer, buffer_limit, platform_major_version, platform_minor_version); + + int device_major_version = 0; + int device_minor_version = 0; + + OCL_SAFE_CALL(clGetDeviceInfo (device_id, CL_DEVICE_VERSION, buffer_limit, buffer, NULL)); + parseOpenCLVersion(buffer, buffer_limit, device_major_version, device_minor_version); + + if (device_major_version < platform_major_version + || (device_major_version == platform_major_version && device_minor_version < platform_minor_version)) { + opencl_major_version = device_major_version; + opencl_minor_version = device_minor_version; + } else { + opencl_major_version = platform_major_version; + opencl_minor_version = platform_minor_version; + } +} + +void DeviceInfo::parseOpenCLVersion(char* buffer, int buffer_limit, int& major_version, int& minor_verions) +{ + // For platform: + // "OpenCL" + // For device: + // "OpenCL" + int firstSpaceIndex = -1; + int firstDotIndex = -1; + int secondSpaceIndex = -1; + for (int i = 0; i < 
buffer_limit; i++ ) { + if (buffer[i] == ' ') { + if (firstSpaceIndex == -1) { + firstSpaceIndex = i; + } else if (secondSpaceIndex == -1) { + secondSpaceIndex = i; + buffer[i] = 0; + break; + } + } else if (buffer[i] == '.' && firstDotIndex == -1) { + firstDotIndex = i; + buffer[i] = 0; + } + } + + major_version = atoi(buffer + firstSpaceIndex + 1); + minor_verions = atoi(buffer + firstDotIndex + 1); +} + +bool DeviceInfo::isIntelGPU() const +{ + return device_type == CL_DEVICE_TYPE_GPU + && (vendor_id == ocl::ID_INTEL || vendor_name.find("Intel") != std::string::npos); +} + +void DeviceInfo::initExtensions(cl_platform_id platform_id, cl_device_id device_id) +{ + for (int i = 0; i < 2; ++i) { + size_t length; + if (i == 0) { + OCL_SAFE_CALL(clGetPlatformInfo(platform_id, CL_PLATFORM_EXTENSIONS, 0, 0, &length)); + } else { + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, 0, 0, &length)); + } + + std::vector buffer(length); + if (i == 0) { + OCL_SAFE_CALL(clGetPlatformInfo(platform_id, CL_PLATFORM_EXTENSIONS, sizeof(char) * buffer.size(), buffer.data(), NULL)); + } else { + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, sizeof(char) * buffer.size(), buffer.data(), NULL)); + } + + std::string extension = ""; + for (int i = 0; i <= buffer.size(); i++) { + if (i == buffer.size() || buffer[i] == ' ') { + if (extension.length() > 0) { + extensions.insert(extension); + extension = ""; + } + } else { + extension += buffer[i]; + } + } + } +} + +} diff --git a/libs/gpu/libgpu/opencl/device_info.h b/libs/gpu/libgpu/opencl/device_info.h new file mode 100644 index 0000000..979cf00 --- /dev/null +++ b/libs/gpu/libgpu/opencl/device_info.h @@ -0,0 +1,49 @@ +#pragma once + +#include +#include +#include + +typedef struct _cl_platform_id * cl_platform_id; +typedef struct _cl_device_id * cl_device_id; + +namespace ocl { + +class DeviceInfo { +public: + DeviceInfo(); + + void init(cl_device_id device_id); + void print() const; + + bool isIntelGPU() 
const; + bool hasExtension(std::string extension) { return extensions.count(extension) > 0;} + + std::string device_name; + std::string vendor_name; + unsigned int device_type; + unsigned int vendor_id; + size_t max_compute_units; + size_t max_mem_alloc_size; + size_t max_workgroup_size; + size_t max_work_item_sizes[3]; + size_t global_mem_size; + size_t device_address_bits; + size_t max_work_item_dimensions; + unsigned int warp_size; + size_t wavefront_width; + std::string driver_version; + std::string platform_version; + + int opencl_major_version; + int opencl_minor_version; + + std::set extensions; + +protected: + void initExtensions(cl_platform_id platform_id, cl_device_id device_id); + void initOpenCLVersion(cl_platform_id platform_id, cl_device_id device_id); + void parseOpenCLVersion(char* buffer, int buffer_limit, int& major_version, int& minor_verions); +}; + +} diff --git a/libs/gpu/libgpu/opencl/engine.cpp b/libs/gpu/libgpu/opencl/engine.cpp new file mode 100644 index 0000000..28b82ee --- /dev/null +++ b/libs/gpu/libgpu/opencl/engine.cpp @@ -0,0 +1,749 @@ +#include "utils.h" +#include "libutils/thread_mutex.h" + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +#define _SHORT_FILE_ "ocl_engine.cpp" + +#define LOAD_KERNEL_BINARIES_FROM_FILE "" +#define DUMP_KERNEL_BINARIES_TO_FILE "" +#define OCL_VERBOSE_COMPILE_LOG false + +#ifdef _MSC_VER +typedef unsigned long long uint64_t; +#endif + +using namespace ocl; + +// Gets the platform ID for NVIDIA if available, otherwise default +cl_platform_id oclGetPlatformID(void) +{ + char chBuffer[1024]; + cl_uint num_platforms; + + cl_platform_id platform = 0; + + // Get OpenCL platform count + OCL_SAFE_CALL(clGetPlatformIDs (0, NULL, &num_platforms)); + + if (num_platforms == 0) + throw ocl_exception("No OpenCL platforms found"); + + // if there's a platform or more, make space for ID's + std::vector clPlatformIDs(num_platforms); 
+ + // get platform info for each platform and trap the NVIDIA platform if found + OCL_SAFE_CALL(clGetPlatformIDs (num_platforms, clPlatformIDs.data(), NULL)); + for (cl_uint i = 0; i < num_platforms; ++i) { + OCL_SAFE_CALL(clGetPlatformInfo (clPlatformIDs[i], CL_PLATFORM_NAME, 1024, &chBuffer, NULL)); + if (strstr(chBuffer, "NVIDIA") != NULL) { + platform = clPlatformIDs[i]; + break; + } + } + + // default to zeroeth platform if NVIDIA not found + if (platform == 0) + platform = clPlatformIDs[0]; + + return platform; +} + +OpenCLKernel::OpenCLKernel() +{ + kernel_ = 0; + work_group_size_ = 0; +} + +OpenCLKernel::~OpenCLKernel() +{ + if (kernel_) clReleaseKernel(kernel_); +} + +void OpenCLKernel::create(cl_program program, const char *kernel_name, cl_device_id device_id_) +{ + if (device_id_ == NULL) { + gpu::Context context; + GPU_CHECKED_VERBOSE(context.type() == gpu::Context::TypeOpenCL, "Can not link with OpenCL kernel!"); + device_id_ = context.cl()->device(); + } + + cl_int ciErrNum = CL_SUCCESS; + kernel_name_ = std::string(kernel_name); + kernel_ = clCreateKernel(program, kernel_name, &ciErrNum); + + if (ciErrNum != CL_SUCCESS) + throw std::runtime_error("clCreateKernel " + to_string(kernel_name_) + " failed: " + errorString(ciErrNum)); + + size_t kernel_workgroup_size = 0; + + ciErrNum = clGetKernelWorkGroupInfo(kernel_, device_id_, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernel_workgroup_size, NULL); + if (ciErrNum != CL_SUCCESS) + throw std::runtime_error("clGetKernelWorkGroupInfo failed: " + errorString(ciErrNum)); + + work_group_size_ = kernel_workgroup_size; +} + +void OpenCLKernel::setArg(cl_uint arg_index, size_t arg_size, const void *arg_value) +{ + cl_int ciErrNum = clSetKernelArg(kernel_, arg_index, arg_size, arg_value); + + if (ciErrNum != CL_SUCCESS) + throw std::runtime_error("clSetKernelArg " + to_string(kernel_name_) + "#" + to_string(arg_index) + " (" +to_string(arg_size) + " bytes) failed: " + errorString(ciErrNum)); +} + 
+OpenCLEngine::OpenCLEngine() +{ + platform_id_ = 0; + device_id_ = 0; + context_ = 0; + command_queue_ = 0; + total_mem_size_ = 0; +} + +OpenCLEngine::~OpenCLEngine() +{ + for (std::map::iterator it = kernels_.begin(); it != kernels_.end(); ++it) + delete it->second; + + for (std::map::iterator it = programs_.begin(); it != programs_.end(); ++it) + clReleaseProgram(it->second); + + if (command_queue_) clReleaseCommandQueue(command_queue_); + if (context_) clReleaseContext(context_); +} + +void OpenCLEngine::init(cl_device_id device_id, const char *cl_params, bool verbose) +{ + if (!device_id) { + init((cl_platform_id) 0, (cl_device_id) 0, cl_params, verbose); + return; + } + + cl_platform_id platform_id = 0; + OCL_SAFE_CALL(clGetDeviceInfo(device_id, CL_DEVICE_PLATFORM, sizeof(platform_id), &platform_id, NULL)); + + return init(platform_id, device_id, cl_params, verbose); +} + +void OpenCLEngine::init(cl_platform_id platform_id, cl_device_id device_id, const char *cl_params, bool verbose) +{ + if (!ocl_init()) + throw ocl_exception("Can't init OpenCL driver"); + + if (command_queue_) { + clReleaseCommandQueue(command_queue_); + command_queue_ = 0; + } + + if (context_) { + clReleaseContext(context_); + context_ = 0; + } + + if (!platform_id) { + device_id = 0; + platform_id = oclGetPlatformID(); + } + + if (!device_id) { + // Get all the devices + cl_uint uiNumDevices = 0; // Number of devices available + + OCL_SAFE_CALL(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices)); + + if (uiNumDevices < 1) + throw ocl_exception("No OpenCL devices found"); + + std::vector devices(uiNumDevices); + OCL_SAFE_CALL(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, uiNumDevices, devices.data(), NULL)); + device_id = devices[0]; + } + + device_info_.init(device_id); + + if (device_info_.max_work_item_dimensions < 3) + throw ocl_exception("3 dimensional work items not supported"); + + total_mem_size_ = device_info_.global_mem_size; + + cl_context_properties 
context_props[] = { CL_CONTEXT_PLATFORM, (cl_context_properties) platform_id, 0 }; + + cl_int ciErrNum; + context_ = clCreateContext(context_props, 1, &device_id, NULL, NULL, &ciErrNum); + OCL_SAFE_CALL(ciErrNum); + + command_queue_ = clCreateCommandQueue(context_, device_id, 0, &ciErrNum); + OCL_SAFE_CALL(ciErrNum); + + platform_id_ = platform_id; + device_id_ = device_id; + + if (device_info_.device_type == CL_DEVICE_TYPE_GPU) { + if (device_info_.warp_size) { + wavefront_size_ = device_info_.warp_size; + } else if (device_info_.wavefront_width) { + wavefront_size_ = device_info_.wavefront_width; + } else if (device_info_.isIntelGPU()) { + wavefront_size_ = 8; + } else { + wavefront_size_ = 1; + } + } else { + wavefront_size_ = 1; + } + + if (verbose) { + device_info_.print(); + if (device_info_.warp_size == 0 && device_info_.wavefront_width == 0) { + std::cout << " wavefront width " << wavefront_size_ << std::endl; + } + } +} + +void ocl::oclPrintBuildLog(cl_program program) +{ + size_t device_count; + OCL_SAFE_CALL(clGetProgramInfo(program, CL_PROGRAM_DEVICES, 0, NULL, &device_count)); + device_count /= sizeof(cl_device_id); + + std::vector devices(device_count); + + OCL_SAFE_CALL(clGetProgramInfo(program, CL_PROGRAM_DEVICES, device_count * sizeof(cl_device_id), devices.data(), NULL)); + + for (size_t k = 0; k < device_count; k++) { + std::cout << "Device " << k + 1 << std::endl; + size_t log_size = 0; + OCL_SAFE_CALL(clGetProgramBuildInfo(program, devices[k], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); + if (log_size > 0) { + std::cout << "\tProgram build log:" << std::endl; + std::vector log(log_size + 1); + OCL_SAFE_CALL(clGetProgramBuildInfo(program, devices[k], CL_PROGRAM_BUILD_LOG, log_size, log.data(), NULL)); + + log[log_size] = 0; + std::cout << log.data() << std::endl << std::endl; + } else { + std::cout << "\tProgram build log is clear" << std::endl; + } + } +} + +cl_mem OpenCLEngine::createBuffer(cl_mem_flags flags, size_t size) +{ +// if (size > 
device_info_.max_mem_alloc_size) { +// throw ocl_bad_alloc("Can't allocate " + to_string(size) + " bytes, because max allocation size is " + to_string(device_info_.max_mem_alloc_size) + "!"); +// } + + cl_int status = CL_SUCCESS; + cl_mem res = clCreateBuffer(context_, flags, size, NULL, &status); + OCL_SAFE_CALL(status); + + // forcing buffer allocation by fictive write + size_t data_size = (size >= 8) ? 4 : 1; + assert (size >= 2 * data_size); + + int test_data = 239; + try { + writeBuffer(res, CL_TRUE, 0, data_size, &test_data); + } catch (ocl_exception& e) { + releaseMemObject(res); + throw; + } + + return res; +} + +void OpenCLEngine::writeBuffer(cl_mem buffer, cl_bool blocking_write, size_t offset, size_t cb, const void *ptr) +{ + if (cb == 0) + return; + OCL_SAFE_CALL(clEnqueueWriteBuffer(queue(), buffer, blocking_write, offset, cb, ptr, 0, NULL, NULL)); +} + +void OpenCLEngine::writeBufferRect(cl_mem buffer, cl_bool blocking_write, const size_t buffer_origin[3], const size_t host_origin[3], const size_t region[3], + size_t buffer_row_pitch, size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, const void *ptr) +{ + if (region[0] == 0 || region[1] == 0 || region[2] == 0) + return; + OCL_SAFE_CALL(clEnqueueWriteBufferRect(queue(), buffer, blocking_write, buffer_origin, host_origin, region, + buffer_row_pitch, buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, 0, NULL, NULL)); +} + +void OpenCLEngine::readBuffer(cl_mem buffer, cl_bool blocking_read, size_t offset, size_t cb, void *ptr) +{ + if (cb == 0) + return; + OCL_SAFE_CALL(clEnqueueReadBuffer(queue(), buffer, blocking_read, offset, cb, ptr, 0, NULL, NULL)); +} + +void OpenCLEngine::readBufferRect(cl_mem buffer, cl_bool blocking_write, const size_t buffer_origin[3], const size_t host_origin[3], const size_t region[3], + size_t buffer_row_pitch, size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, void *ptr) +{ + if (region[0] == 0 || region[1] == 0 || 
region[2] == 0) + return; + OCL_SAFE_CALL(clEnqueueReadBufferRect(queue(), buffer, blocking_write, buffer_origin, host_origin, region, + buffer_row_pitch, buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, 0, NULL, NULL)); +} + +void OpenCLEngine::copyBuffer(cl_mem src_buffer, cl_mem dst_buffer, size_t src_offset, size_t dst_offset, size_t cb) +{ + if (cb == 0) + return; + cl_event ev = NULL; + OCL_SAFE_CALL(clEnqueueCopyBuffer(queue(), src_buffer, dst_buffer, src_offset, dst_offset, cb, 0, NULL, &ev)); + trackEvent(ev); +} + +void OpenCLEngine::releaseMemObject(cl_mem memobj) +{ + if (memobj == NULL) + return; + + OCL_SAFE_CALL(clReleaseMemObject(memobj)); +} + +void OpenCLEngine::ndRangeKernel(OpenCLKernel &kernel, cl_uint work_dim, const size_t *global_work_offset, + const size_t *global_work_size, const size_t *local_work_size) +{ + if (work_dim < 1 || work_dim > 3) + throw ocl_exception("Wrong work dimension size: " + to_string(work_dim) + "!"); + + // check workgroup size + if (local_work_size) { + size_t workgroup_size = 1; + for (cl_uint dim = 0; dim < work_dim; dim++) { + if (local_work_size[dim] > device_info_.max_work_item_sizes[dim]) + throw ocl_exception("Wrong work_size[" + to_string(dim) + "] value: " + to_string(local_work_size[dim]) + "!"); + workgroup_size *= local_work_size[dim]; + } + if (workgroup_size > device_info_.max_workgroup_size) + throw ocl_exception("Too big workgroup size: " + to_string(workgroup_size) + "!"); + if (workgroup_size > kernel.workGroupSize()) + throw ocl_exception("Too big workgroup size for this kernel: " + to_string(workgroup_size) + "!"); + } + + // If, for example, CL_DEVICE_ADDRESS_BITS = 32, i.e. the device uses a 32-bit address space, + // size_t is a 32-bit unsigned integer and global_work_size values must be in the range 1 .. 2^32 - 1. + // Values outside this range return a CL_OUT_OF_RESOURCES error. 
+ uint64_t max_global_work_size = (size_t) 1 << (device_info_.device_address_bits - 1); + max_global_work_size = max_global_work_size + (max_global_work_size - 1); + for (size_t d = 0; d < work_dim; ++d) { + if (global_work_size[d] == 0) { + std::cerr << "Global work size is zero!" << std::endl; + throw ocl_exception("Global work_size[" + to_string(d) + "] value is zero!"); + } else if (global_work_size[d] > max_global_work_size && device_info_.device_address_bits <= 64) { + throw ocl_exception("Global work_size[" + to_string(d) + "] value is too big for this device address bits: " + + to_string(global_work_size[d]) + ", while device has " + to_string(device_info_.device_address_bits) + " address bits!"); + } + } + + cl_event ev = NULL; + OCL_SAFE_CALL(clEnqueueNDRangeKernel(queue(), kernel.kernel(), work_dim, global_work_offset, global_work_size, local_work_size, 0, NULL, &ev)); + trackEvent(ev, "Kernel " + kernel.kernelName() + ": "); +} + +void OpenCLEngine::trackEvent(cl_event ev, std::string message) +{ + cl_int ciErrNum = CL_SUCCESS; + cl_int result = CL_SUCCESS; + + try { + OCL_SAFE_CALL_MESSAGE(clFlush(queue()), message); + OCL_SAFE_CALL_MESSAGE(clWaitForEvents(1, &ev), message); + OCL_SAFE_CALL_MESSAGE(clGetEventInfo(ev, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &result, 0), message); + + if (result != CL_COMPLETE) { + throw ocl_exception("Wait for event succeed, but it is still is not complete with execution status: " + to_string(result) + "!"); + } + } catch (...) 
{ + OCL_SAFE_CALL_MESSAGE(clReleaseEvent(ev), message); + throw; + } + + OCL_SAFE_CALL_MESSAGE(clReleaseEvent(ev), message); +} + +cl_program OpenCLEngine::findProgram(int id) const +{ + std::map::const_iterator it = programs_.find(id); + if (it != programs_.end()) + return it->second; + return 0; +} + +OpenCLKernel *OpenCLEngine::findKernel(int id) const +{ + std::map::const_iterator it = kernels_.find(id); + if (it != kernels_.end()) + return it->second; + return 0; +} + +VersionedBinary::VersionedBinary(const char *data, const size_t size, + int bits, const int opencl_major_version, const int opencl_minor_version) + : data_(data), size_(size), device_address_bits_(bits), opencl_major_version_(opencl_major_version), opencl_minor_version_(opencl_minor_version) +{} + +ProgramBinaries::ProgramBinaries(std::vector binaries, std::string defines, std::string program_name) : binaries_(binaries) +{ + static int next_program_id = 0; + program_name_ = program_name; + id_ = next_program_id++; + defines_ = defines; +} + +ProgramBinaries::ProgramBinaries(const char *source_code, size_t source_code_length, std::string defines, std::string program_name) : binaries_({VersionedBinary(source_code, source_code_length, 0, 1, 2)}) +{ + static int next_program_id = 0; + program_name_ = program_name; + id_ = next_program_id++; + defines_ = defines; +} + +const VersionedBinary* ProgramBinaries::getBinary(const std::shared_ptr &cl) const +{ + for (int i = 0; i < binaries_.size(); ++i) { + const VersionedBinary* binary = &binaries_[i]; + + if (binary->deviceAddressBits() && binary->deviceAddressBits() != cl->deviceAddressBits()) + continue; + + if (binary->openclMajorVersion() > cl->deviceInfo().opencl_major_version) + continue; + + if (binary->openclMajorVersion() == cl->deviceInfo().opencl_major_version && binary->openclMinorVersion() > cl->deviceInfo().opencl_minor_version) + continue; + + return binary; + } + + throw ocl_exception("No SPIR version for " + 
to_string(cl->deviceAddressBits()) + "-bit device with OpenCL " + + to_string(cl->deviceInfo().opencl_major_version) + "." + to_string(cl->deviceInfo().opencl_minor_version) + "!"); +} + +KernelSource::KernelSource(std::shared_ptr program, const char *name) : program_(program) +{ + id_ = getNextKernelId(); + name_ = std::string(name); +} + +KernelSource::KernelSource(std::shared_ptr program, const std::string &name) : program_(program) +{ + id_ = getNextKernelId(); + name_ = name; +} + +int KernelSource::getNextKernelId() +{ + static int next_kernel_id = 0; + return next_kernel_id++; +} + +namespace ocl { + typedef std::map, std::vector> binaries_by_device; + static std::map cached_kernels_binaries; + static Mutex cached_kernels_mutex; + + std::vector* getCachedBinary(int programId, cl_platform_id platform, cl_device_id device) + { + auto programCacheIt = cached_kernels_binaries.find(programId); + if (programCacheIt == cached_kernels_binaries.end()) + cached_kernels_binaries[programId] = binaries_by_device(); + auto binaryIt = cached_kernels_binaries[programId].find(std::make_pair(platform, device)); + if (binaryIt != cached_kernels_binaries[programId].end()) { + return &binaryIt->second; + } else { + return NULL; + } + } + + void setCachedBinary(int programId, cl_platform_id platform, cl_device_id device, std::vector binaries) + { + auto programCacheIt = cached_kernels_binaries.find(programId); + if (programCacheIt == cached_kernels_binaries.end()) + cached_kernels_binaries[programId] = binaries_by_device(); + cached_kernels_binaries[programId][std::make_pair(platform, device)] = binaries; + } + + std::vector getProgramBinaries(cl_program program) + { + size_t binaries_size; + OCL_SAFE_CALL(clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, + sizeof(size_t), &binaries_size, NULL)); + + std::vector binaries(binaries_size); + unsigned char *data = binaries.data(); + OCL_SAFE_CALL(clGetProgramInfo(program, CL_PROGRAM_BINARIES, + sizeof(unsigned char *), &data, 
NULL)); + + return binaries; + } +} + +OpenCLKernel *KernelSource::getKernel(const std::shared_ptr &cl, bool printLog) +{ + OpenCLKernel *kernel = cl->findKernel(id_); + if (kernel) + return kernel; + + cl_program program = cl->findProgram(program_->id()); + + if (!program) { + Lock lock(cached_kernels_mutex); + + bool verbose = printLog || OCL_VERBOSE_COMPILE_LOG; + + const VersionedBinary* binary = program_->getBinary(cl); + const std::vector* cachedCompiledBinary = getCachedBinary(program_->id(), cl->platform(), cl->device()); + + cl_int ciErrNum = CL_SUCCESS; + + std::string options = program_->defines(); + + std::vector loaded_from_file_binaries; + std::string binaries_to_load_filename = LOAD_KERNEL_BINARIES_FROM_FILE; + + if (!binaries_to_load_filename.empty()){ + std::ifstream program_binaries_file; + program_binaries_file.open(binaries_to_load_filename); + + std::string binaries_string((std::istreambuf_iterator(program_binaries_file)), std::istreambuf_iterator()); + + loaded_from_file_binaries = std::vector(binaries_string.size()); + for (int i = 0; i < binaries_string.size(); ++i) { + loaded_from_file_binaries[i] = (unsigned char) binaries_string[i]; + } + cachedCompiledBinary = &loaded_from_file_binaries; + + program_binaries_file.close(); + } + + if (cachedCompiledBinary != NULL) { + std::vector kernel_ptrs; + std::vector kernel_sizes; + + kernel_ptrs.push_back(cachedCompiledBinary->data()); + kernel_sizes.push_back(cachedCompiledBinary->size()); + + cl_device_id device = cl->device(); + cl_int binary_status; + + program = clCreateProgramWithBinary(cl->context(), 1, &device, &kernel_sizes[0], &kernel_ptrs[0], &binary_status, &ciErrNum); + OCL_SAFE_CALL(binary_status); + OCL_SAFE_CALL(ciErrNum); + } else if (binary->deviceAddressBits() == 0) { + std::vector kernel_ptrs; + std::vector kernel_sizes; + + kernel_ptrs.push_back(binary->data()); + kernel_sizes.push_back(binary->size()); + + program = clCreateProgramWithSource(cl->context(), kernel_ptrs.size(), 
&kernel_ptrs[0], &kernel_sizes[0], &ciErrNum); + OCL_SAFE_CALL(ciErrNum); + } else { + std::vector kernel_ptrs; + std::vector kernel_sizes; + + kernel_ptrs.push_back((unsigned char*) binary->data()); + kernel_sizes.push_back(binary->size()); + + cl_device_id device = cl->device(); + cl_int binary_status; + + program = clCreateProgramWithBinary(cl->context(), 1, &device, &kernel_sizes[0], &kernel_ptrs[0], &binary_status, &ciErrNum); + OCL_SAFE_CALL(binary_status); + OCL_SAFE_CALL(ciErrNum); + + if (cl->deviceInfo().extensions.count("cl_khr_spir") == 0) + throw ocl_exception("Device does not support SPIR!"); + + options += " -x spir"; + } + + options += " -D WARP_SIZE=" + to_string(cl->wavefrontSize()); + + timer tm; + tm.start(); + + if (cachedCompiledBinary == NULL && verbose) { + if (program_->programName() == "") { + std::cout << "Building kernels for " << cl->deviceName() << "... " << std::endl; + } +// else { +// std::cout << "Building kernel " << program_->programName() << " for " << cl->deviceName() << "... 
" << std::endl; +// } + } + + ciErrNum = clBuildProgram(program, 0, NULL, options.c_str(), NULL, NULL); + + if (ciErrNum == CL_SUCCESS && cachedCompiledBinary == NULL) { + if (program_->programName() == "" && verbose) { + std::cout << "Kernels compilation done in " << tm.elapsed() << " seconds" << std::endl; + } +// else { +// std::cout << "Kernel " << program_->programName() << " compilation done in " << tm.elapsed() << " seconds" << std::endl; +// } + + std::vector binaries = getProgramBinaries(program); + setCachedBinary(program_->id(), cl->platform(), cl->device(), binaries); + } + + if (ciErrNum != CL_SUCCESS || verbose) { + ocl::oclPrintBuildLog(program); + + std::string binaries_filename = DUMP_KERNEL_BINARIES_TO_FILE; + if (!binaries_filename.empty()) { + std::vector binaries = getProgramBinaries(program); + std::string binaries_string((char*) binaries.data(), binaries.size()); + + std::ofstream program_binaries_file; + program_binaries_file.open(binaries_filename + "_platform" + to_string(cl->platform()) + "_device" + to_string(cl->device()) + "_program" + to_string(program_->id())); + + program_binaries_file << binaries_string; + program_binaries_file.close(); + } + } + + if (ciErrNum != CL_SUCCESS) { + clReleaseProgram(program); + program = 0; + } + + OCL_SAFE_CALL(ciErrNum); + cl->programs()[program_->id()] = program; + } + + kernel = new OpenCLKernel; + kernel->create(program, name_.c_str()); + + cl->kernels()[id_] = kernel; + + return kernel; +} + +void KernelSource::exec(const gpu::WorkSize &ws, const Arg &arg0, const Arg &arg1, const Arg &arg2, const Arg &arg3, const Arg &arg4, const Arg &arg5, const Arg &arg6, const Arg &arg7, const Arg &arg8, const Arg &arg9, const Arg &arg10, const Arg &arg11, const Arg &arg12, const Arg &arg13, const Arg &arg14, const Arg &arg15, const Arg &arg16, const Arg &arg17, const Arg &arg18, const Arg &arg19, const Arg &arg20, const Arg &arg21, const Arg &arg22, const Arg &arg23, const Arg &arg24, const Arg &arg25, const 
Arg &arg26, const Arg &arg27, const Arg &arg28, const Arg &arg29, const Arg &arg30, const Arg &arg31, const Arg &arg32, const Arg &arg33, const Arg &arg34, const Arg &arg35, const Arg &arg36, const Arg &arg37, const Arg &arg38, const Arg &arg39, const Arg &arg40) +{ + gpu::Context context; + + OpenCLKernel *kernel = getKernel(context.cl()); + + kernel->setArgs(arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13, arg14, arg15, arg16, arg17, arg18, arg19, arg20, arg21, arg22, arg23, arg24, arg25, arg26, arg27, arg28, arg29, arg30, arg31, arg32, arg33, arg34, arg35, arg36, arg37, arg38, arg39, arg40); + + context.cl()->ndRangeKernel(*kernel, 3, NULL, ws.clGlobalSize(), ws.clLocalSize()); +} + +void KernelSource::execSubdivided(const gpu::WorkSize &ws, const Arg &arg0, const Arg &arg1, const Arg &arg2, const Arg &arg3, const Arg &arg4, const Arg &arg5, const Arg &arg6, const Arg &arg7, const Arg &arg8, const Arg &arg9, const Arg &arg10, const Arg &arg11, const Arg &arg12, const Arg &arg13, const Arg &arg14, const Arg &arg15, const Arg &arg16, const Arg &arg17, const Arg &arg18, const Arg &arg19, const Arg &arg20, const Arg &arg21, const Arg &arg22, const Arg &arg23, const Arg &arg24, const Arg &arg25, const Arg &arg26, const Arg &arg27, const Arg &arg28, const Arg &arg29, const Arg &arg30, const Arg &arg31, const Arg &arg32, const Arg &arg33, const Arg &arg34, const Arg &arg35, const Arg &arg36, const Arg &arg37, const Arg &arg38, const Arg &arg39, const Arg &arg40) +{ + const size_t max_total_size = 1000000; + + const size_t local_x = ws.clLocalSize()[0]; + const size_t local_y = ws.clLocalSize()[1]; + const size_t local_z = ws.clLocalSize()[2]; + + const size_t total_x = ws.clGlobalSize()[0]; + const size_t total_y = ws.clGlobalSize()[1]; + const size_t total_z = ws.clGlobalSize()[2]; + + size_t nparts_x = 1; + size_t nparts_y = 1; + size_t nparts_z = 1; + + size_t part_x = total_x; + size_t part_y = total_y; + size_t part_z = 
total_z; + + while (part_x * part_y * part_z > max_total_size && part_x > local_x) { + nparts_x *= 2; + part_x = local_x * gpu::divup(gpu::divup(total_x, nparts_x), local_x); + } + while (part_x * part_y * part_z > max_total_size && part_y > local_y) { + nparts_y *= 2; + part_y = local_y * gpu::divup(gpu::divup(total_y, nparts_y), local_y); + } + while (part_x * part_y * part_z > max_total_size && part_z > local_z) { + nparts_z *= 2; + part_z = local_z * gpu::divup(gpu::divup(total_z, nparts_z), local_z); + } + + gpu::Context context; + + OpenCLKernel *kernel = getKernel(context.cl()); + + kernel->setArgs(arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13, arg14, arg15, arg16, arg17, arg18, arg19, arg20, arg21, arg22, arg23, arg24, arg25, arg26, arg27, arg28, arg29, arg30, arg31, arg32, arg33, arg34, arg35, arg36, arg37, arg38, arg39, arg40); + + for (size_t offset_x = 0; offset_x < total_x; offset_x += part_x) { + for (size_t offset_y = 0; offset_y < total_y; offset_y += part_y) { + for (size_t offset_z = 0; offset_z < total_z; offset_z += part_z) { + size_t offset[3]; + offset[0] = offset_x; + offset[1] = offset_y; + offset[2] = offset_z; + + size_t current_x = std::min(part_x, total_x - offset_x); + size_t current_y = std::min(part_y, total_y - offset_y); + size_t current_z = std::min(part_z, total_z - offset_z); + + gpu::WorkSize ws_part(local_x, local_y, local_z, current_x, current_y, current_z); + + // NOTTODO: generalize this logic, apply it to CUDA, make so that ndRangeKernel is called only in one place in codebase, remove all get_group_id/get_num_groups calls, etc. 
+ context.cl()->ndRangeKernel(*kernel, 3, offset, ws_part.clGlobalSize(), ws_part.clLocalSize()); + } + } + } +} + +void KernelSource::precompile(bool printLog) { + gpu::Context context; + + precompile(context.cl(), printLog); +} + +void KernelSource::precompile(const std::shared_ptr &cl, bool printLog) { + getKernel(cl, printLog); +} + +OpenCLKernelArg::OpenCLKernelArg(const gpu::shared_device_buffer &arg) +{ + is_null = false; + size = sizeof(cl_mem); + cl_mem_storage = arg.clmem(); + value = &cl_mem_storage; + if (arg.cloffset() != 0) { + ocl_exception("Offset is not zero, but ignored!"); + } +} + +template +OpenCLKernelArg::OpenCLKernelArg(const gpu::shared_device_buffer_typed &arg) +{ + is_null = false; + size = sizeof(cl_mem); + cl_mem_storage = arg.clmem(); + value = &cl_mem_storage; + if (arg.cloffset() != 0) { + ocl_exception("Offset is not zero, but ignored!"); + } +} + +template OpenCLKernelArg::OpenCLKernelArg(const gpu::shared_device_buffer_typed &arg); +template OpenCLKernelArg::OpenCLKernelArg(const gpu::shared_device_buffer_typed &arg); +template OpenCLKernelArg::OpenCLKernelArg(const gpu::shared_device_buffer_typed &arg); +template OpenCLKernelArg::OpenCLKernelArg(const gpu::shared_device_buffer_typed &arg); +template OpenCLKernelArg::OpenCLKernelArg(const gpu::shared_device_buffer_typed &arg); +template OpenCLKernelArg::OpenCLKernelArg(const gpu::shared_device_buffer_typed &arg); +template OpenCLKernelArg::OpenCLKernelArg(const gpu::shared_device_buffer_typed &arg); +template OpenCLKernelArg::OpenCLKernelArg(const gpu::shared_device_buffer_typed &arg); \ No newline at end of file diff --git a/libs/gpu/libgpu/opencl/engine.h b/libs/gpu/libgpu/opencl/engine.h new file mode 100644 index 0000000..dbca5d6 --- /dev/null +++ b/libs/gpu/libgpu/opencl/engine.h @@ -0,0 +1,266 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace gpu { + class 
WorkSize; + class shared_device_buffer; + + template + class shared_device_buffer_typed; +} + +namespace ocl { + + template + struct OpenCLType; + + template<> struct OpenCLType { typedef cl_char type; static std::string name() { return "char"; } static int8_t max() { return CL_CHAR_MAX; } static int8_t min() { return CL_CHAR_MIN; } }; + template<> struct OpenCLType { typedef cl_short type; static std::string name() { return "short"; } static int16_t max() { return CL_SHRT_MAX; } static int16_t min() { return CL_SHRT_MIN; } }; + template<> struct OpenCLType { typedef cl_int type; static std::string name() { return "int"; } static int32_t max() { return CL_INT_MAX; } static int32_t min() { return CL_INT_MIN; } }; + template<> struct OpenCLType { typedef cl_uchar type; static std::string name() { return "uchar"; } static uint8_t max() { return CL_UCHAR_MAX; } static uint8_t min() { return 0; } }; + template<> struct OpenCLType { typedef cl_ushort type; static std::string name() { return "ushort"; } static uint16_t max() { return CL_CHAR_MAX; } static uint16_t min() { return 0; } }; + template<> struct OpenCLType { typedef cl_uint type; static std::string name() { return "uint"; } static uint32_t max() { return CL_UINT_MAX; } static uint32_t min() { return 0; } }; + template<> struct OpenCLType { typedef cl_float type; static std::string name() { return "float"; } static float max() { return CL_FLT_MAX; } static float min() { return CL_FLT_MIN; } }; + template<> struct OpenCLType { typedef cl_double type; static std::string name() { return "double"; } static double max() { return std::numeric_limits::max(); } static double min() { return CL_DBL_MIN; } }; + + class OpenCLEngine; + + typedef std::shared_ptr sh_ptr_ocl_engine; + + class LocalMem { + public: + LocalMem(size_t size) : size(size) { } + + ::size_t size; + }; + + class OpenCLKernelArg { + public: + OpenCLKernelArg() : is_null(true), size(0), value(0), cl_mem_storage(NULL) { } + + template + 
OpenCLKernelArg(const T &arg) : is_null(false), size(sizeof(arg)), value(&arg), cl_mem_storage(NULL) { } + + OpenCLKernelArg(const LocalMem &arg) : is_null(false), size(arg.size), value(0), cl_mem_storage(NULL) { } + + OpenCLKernelArg(const gpu::shared_device_buffer &arg); + + template + OpenCLKernelArg(const gpu::shared_device_buffer_typed &arg); + + bool is_null; + size_t size; + const void * value; + protected: + cl_mem cl_mem_storage; + }; + + class OpenCLKernel { + public: + OpenCLKernel(); + ~OpenCLKernel(); + + void create(cl_program program, const char *kernel_name, cl_device_id device_id_=NULL); + + cl_kernel kernel(void) { return kernel_; } + std::string kernelName(void) { return kernel_name_; } + size_t workGroupSize(void) { return work_group_size_; } + + typedef OpenCLKernelArg Arg; + + void setArgs(const Arg &arg0 = Arg(), const Arg &arg1 = Arg(), const Arg &arg2 = Arg(), const Arg &arg3 = Arg(), const Arg &arg4 = Arg(), const Arg &arg5 = Arg(), const Arg &arg6 = Arg(), const Arg &arg7 = Arg(), const Arg &arg8 = Arg(), const Arg &arg9 = Arg(), const Arg &arg10 = Arg(), const Arg &arg11 = Arg(), const Arg &arg12 = Arg(), const Arg &arg13 = Arg(), const Arg &arg14 = Arg(), const Arg &arg15 = Arg(), const Arg &arg16 = Arg(), const Arg &arg17 = Arg(), const Arg &arg18 = Arg(), const Arg &arg19 = Arg(), const Arg &arg20 = Arg(), const Arg &arg21 = Arg(), const Arg &arg22 = Arg(), const Arg &arg23 = Arg(), const Arg &arg24 = Arg(), const Arg &arg25 = Arg(), const Arg &arg26 = Arg(), const Arg &arg27 = Arg(), const Arg &arg28 = Arg(), const Arg &arg29 = Arg(), const Arg &arg30 = Arg(), const Arg &arg31 = Arg(), const Arg &arg32 = Arg(), const Arg &arg33 = Arg(), const Arg &arg34 = Arg(), const Arg &arg35 = Arg(), const Arg &arg36 = Arg(), const Arg &arg37 = Arg(), const Arg &arg38 = Arg(), const Arg &arg39 = Arg(), const Arg &arg40 = Arg()) + { + setArg( 0, arg0); + setArg( 1, arg1); + setArg( 2, arg2); + setArg( 3, arg3); + setArg( 4, arg4); + setArg( 5, 
arg5); + setArg( 6, arg6); + setArg( 7, arg7); + setArg( 8, arg8); + setArg( 9, arg9); + setArg(10, arg10); + setArg(11, arg11); + setArg(12, arg12); + setArg(13, arg13); + setArg(14, arg14); + setArg(15, arg15); + setArg(16, arg16); + setArg(17, arg17); + setArg(18, arg18); + setArg(19, arg19); + setArg(20, arg20); + setArg(21, arg21); + setArg(22, arg22); + setArg(23, arg23); + setArg(24, arg24); + setArg(25, arg25); + setArg(26, arg26); + setArg(27, arg27); + setArg(28, arg28); + setArg(29, arg29); + setArg(30, arg30); + setArg(31, arg31); + setArg(32, arg32); + setArg(33, arg33); + setArg(34, arg34); + setArg(35, arg35); + setArg(36, arg36); + setArg(37, arg37); + setArg(38, arg38); + setArg(39, arg39); + setArg(40, arg40); + } + + protected: + void setArg(cl_uint arg_index, size_t arg_size, const void *arg_value); + + void setArg(cl_uint arg_index, const Arg &arg) + { + if (!arg.is_null) + setArg(arg_index, arg.size, arg.value); + } + + cl_kernel kernel_; + size_t work_group_size_; + std::string kernel_name_; + }; + + class OpenCLEngine { + public: + OpenCLEngine(); + ~OpenCLEngine(); + + void init(cl_device_id device_id = 0, const char *cl_params = 0, bool verbose = false); + void init(cl_platform_id platform_id = 0, cl_device_id device_id = 0, const char *cl_params = 0, bool verbose = false); + cl_mem createBuffer(cl_mem_flags flags, size_t size); + void writeBuffer(cl_mem buffer, cl_bool blocking_write, size_t offset, size_t cb, const void *ptr); + void writeBufferRect(cl_mem buffer, cl_bool blocking_write, const size_t buffer_origin[3], const size_t host_origin[3], const size_t region[3], + size_t buffer_row_pitch, size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, const void *ptr); + void readBuffer(cl_mem buffer, cl_bool blocking_read, size_t offset, size_t cb, void *ptr); + void readBufferRect(cl_mem buffer, cl_bool blocking_write, const size_t buffer_origin[3], const size_t host_origin[3], const size_t region[3], + size_t 
buffer_row_pitch, size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, void *ptr); + void copyBuffer(cl_mem src_buffer, cl_mem dst_buffer, size_t src_offset, size_t dst_offset, size_t cb); + void ndRangeKernel(OpenCLKernel &kernel, cl_uint work_dim, const size_t *global_work_offset, + const size_t *global_work_size, const size_t *local_work_size); + void releaseMemObject(cl_mem memobj); + + const DeviceInfo & deviceInfo() const { return device_info_; } + + cl_platform_id platform() { return platform_id_; } + cl_device_id device() { return device_id_; } + cl_context context() { return context_; } + cl_command_queue queue() { return command_queue_; } + + const std::string & deviceName() { return device_info_.device_name; } + size_t maxComputeUnits() const { return device_info_.max_compute_units; } + size_t maxWorkgroupSize() const { return device_info_.max_workgroup_size; } + size_t maxWorkItemSizes(int dim) { return device_info_.max_work_item_sizes[dim]; } + size_t maxMemAllocSize() { return device_info_.max_mem_alloc_size; } + size_t globalMemSize() { return device_info_.global_mem_size; } + size_t deviceAddressBits() { return device_info_.device_address_bits; } + size_t wavefrontSize() { return wavefront_size_; } + size_t totalMemSize() { return total_mem_size_; } + + std::map & programs() { return programs_; } + std::map & kernels() { return kernels_; } + + cl_program findProgram(int id) const; + OpenCLKernel * findKernel(int id) const; + + protected: + void trackEvent(cl_event ev, std::string message=""); + + cl_platform_id platform_id_; + cl_device_id device_id_; + cl_context context_; + cl_command_queue command_queue_; + + size_t wavefront_size_; + + DeviceInfo device_info_; + size_t total_mem_size_; + + std::map programs_; + std::map kernels_; + }; + + void oclPrintBuildLog(cl_program program); + +class VersionedBinary { + +public: + VersionedBinary(const char *data, const size_t size, + int bits, const int opencl_major_version, const 
int opencl_minor_version); + + const char * data() const { return data_; } + size_t size() const { return size_; } + int deviceAddressBits() const { return device_address_bits_; } + int openclMajorVersion() const { return opencl_major_version_; } + int openclMinorVersion() const { return opencl_minor_version_; } + +protected: + const char * data_; + const size_t size_; + int device_address_bits_; + const int opencl_major_version_; + const int opencl_minor_version_; +}; + +class ProgramBinaries { +public: + ProgramBinaries(std::vector binaries, std::string defines = std::string(), std::string program_name = std::string()); + ProgramBinaries(const char *source_code, size_t source_code_length, std::string defines = std::string(), std::string program_name = std::string()); + + int id() const { return id_; } + std::string defines() const { return defines_; } + const VersionedBinary* getBinary(const std::shared_ptr &cl) const; + const std::string & programName() const { return program_name_; }; + +protected: + int id_; + std::vector binaries_; + std::string program_name_; + std::string defines_; +}; + +class KernelSource { +public: + KernelSource(std::shared_ptr program, const char *name); + KernelSource(std::shared_ptr program, const std::string &name); + + typedef OpenCLKernel::Arg Arg; + + void exec(const gpu::WorkSize &ws, const Arg &arg0 = Arg(), const Arg &arg1 = Arg(), const Arg &arg2 = Arg(), const Arg &arg3 = Arg(), const Arg &arg4 = Arg(), const Arg &arg5 = Arg(), const Arg &arg6 = Arg(), const Arg &arg7 = Arg(), const Arg &arg8 = Arg(), const Arg &arg9 = Arg(), const Arg &arg10 = Arg(), const Arg &arg11 = Arg(), const Arg &arg12 = Arg(), const Arg &arg13 = Arg(), const Arg &arg14 = Arg(), const Arg &arg15 = Arg(), const Arg &arg16 = Arg(), const Arg &arg17 = Arg(), const Arg &arg18 = Arg(), const Arg &arg19 = Arg(), const Arg &arg20 = Arg(), const Arg &arg21 = Arg(), const Arg &arg22 = Arg(), const Arg &arg23 = Arg(), const Arg &arg24 = Arg(), const Arg &arg25 
= Arg(), const Arg &arg26 = Arg(), const Arg &arg27 = Arg(), const Arg &arg28 = Arg(), const Arg &arg29 = Arg(), const Arg &arg30 = Arg(), const Arg &arg31 = Arg(), const Arg &arg32 = Arg(), const Arg &arg33 = Arg(), const Arg &arg34 = Arg(), const Arg &arg35 = Arg(), const Arg &arg36 = Arg(), const Arg &arg37 = Arg(), const Arg &arg38 = Arg(), const Arg &arg39 = Arg(), const Arg &arg40 = Arg()); + void execSubdivided(const gpu::WorkSize &ws, const Arg &arg0 = Arg(), const Arg &arg1 = Arg(), const Arg &arg2 = Arg(), const Arg &arg3 = Arg(), const Arg &arg4 = Arg(), const Arg &arg5 = Arg(), const Arg &arg6 = Arg(), const Arg &arg7 = Arg(), const Arg &arg8 = Arg(), const Arg &arg9 = Arg(), const Arg &arg10 = Arg(), const Arg &arg11 = Arg(), const Arg &arg12 = Arg(), const Arg &arg13 = Arg(), const Arg &arg14 = Arg(), const Arg &arg15 = Arg(), const Arg &arg16 = Arg(), const Arg &arg17 = Arg(), const Arg &arg18 = Arg(), const Arg &arg19 = Arg(), const Arg &arg20 = Arg(), const Arg &arg21 = Arg(), const Arg &arg22 = Arg(), const Arg &arg23 = Arg(), const Arg &arg24 = Arg(), const Arg &arg25 = Arg(), const Arg &arg26 = Arg(), const Arg &arg27 = Arg(), const Arg &arg28 = Arg(), const Arg &arg29 = Arg(), const Arg &arg30 = Arg(), const Arg &arg31 = Arg(), const Arg &arg32 = Arg(), const Arg &arg33 = Arg(), const Arg &arg34 = Arg(), const Arg &arg35 = Arg(), const Arg &arg36 = Arg(), const Arg &arg37 = Arg(), const Arg &arg38 = Arg(), const Arg &arg39 = Arg(), const Arg &arg40 = Arg()); + + void precompile(bool printLog=false); + void precompile(const std::shared_ptr &cl, bool printLog=false); + +protected: + int getNextKernelId(); + + OpenCLKernel *getKernel(const std::shared_ptr &cl, bool printLog=false); + + std::shared_ptr program_; + + int id_; + std::string name_; +}; + +} diff --git a/libs/gpu/libgpu/opencl/enum.cpp b/libs/gpu/libgpu/opencl/enum.cpp new file mode 100644 index 0000000..61bc967 --- /dev/null +++ b/libs/gpu/libgpu/opencl/enum.cpp @@ -0,0 +1,251 @@ 
+#include +#include +#include +#include +#include +#include +#include +#include "enum.h" + +#define OCL_CPU_DEVICES_ENABLED true + +bool OpenCLEnum::Device::printInfo() const +{ + ocl::DeviceInfo device_info; + device_info.init(id); + device_info.print(); + return true; +} + +OpenCLEnum::OpenCLEnum() +{ +} + +OpenCLEnum::~OpenCLEnum() +{ +} + +bool OpenCLEnum::enumPlatforms() +{ + cl_uint num_platforms; + cl_int ciErrNum; + + // Get OpenCL platform count + ciErrNum = clGetPlatformIDs (0, NULL, &num_platforms); + if (ciErrNum != CL_SUCCESS) { + std::cerr << "clGetPlatformIDs failed: " << ocl::errorString(ciErrNum) << std::endl; + return false; + } + + if (num_platforms == 0) + return true; + + std::vector clPlatformIDs(num_platforms); + + // get platform info for each platform and trap the NVIDIA platform if found + ciErrNum = clGetPlatformIDs (num_platforms, clPlatformIDs.data(), NULL); + if (ciErrNum != CL_SUCCESS) { + std::cerr << "clGetPlatformIDs for " << num_platforms << " platforms failed: " << ocl::errorString(ciErrNum) << std::endl; + return false; + } + + for (cl_uint i = 0; i < num_platforms; ++i) { + Platform platform; + + cl_platform_id platform_id = clPlatformIDs[i]; + platform.id = platform_id; + + queryPlatformInfo(platform_id, CL_PLATFORM_NAME, platform.name, "CL_PLATFORM_NAME", 1024); + queryPlatformInfo(platform_id, CL_PLATFORM_VENDOR, platform.vendor, "CL_PLATFORM_VENDOR", 1024); + queryPlatformInfo(platform_id, CL_PLATFORM_VERSION, platform.version, "CL_PLATFORM_VERSION", 1024); + + platforms_.push_back(platform); + } + + return true; +} + +bool OpenCLEnum::queryDeviceInfo(Device &device) +{ + cl_device_id device_id = device.id; + + queryDeviceInfo(device_id, CL_DEVICE_TYPE, device.device_type, "CL_DEVICE_TYPE"); + queryDeviceInfo(device_id, CL_DEVICE_NAME, device.name, "CL_DEVICE_NAME", 1024); + queryDeviceInfo(device_id, CL_DEVICE_VENDOR, device.vendor, "CL_DEVICE_VENDOR", 1024); + queryDeviceInfo(device_id, CL_DEVICE_VENDOR_ID, 
device.vendor_id, "CL_DEVICE_VENDOR_ID"); + queryDeviceInfo(device_id, CL_DEVICE_VERSION, device.version, "CL_DEVICE_VERSION", 1024); + queryDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, device.compute_units, "CL_DEVICE_MAX_COMPUTE_UNITS"); + queryDeviceInfo(device_id, CL_DEVICE_GLOBAL_MEM_SIZE, device.mem_size, "CL_DEVICE_GLOBAL_MEM_SIZE"); + queryDeviceInfo(device_id, CL_DEVICE_MAX_CLOCK_FREQUENCY, device.clock, "CL_DEVICE_MAX_CLOCK_FREQUENCY"); + + std::set< std::string > extensions; + queryExtensionList(device_id, extensions); + + if (extensions.count("cl_nv_device_attribute_query")) { + queryDeviceInfo(device_id, CL_DEVICE_PCI_BUS_ID_NV, device.nvidia_pci_bus_id, "CL_DEVICE_PCI_BUS_ID_NV"); + queryDeviceInfo(device_id, CL_DEVICE_PCI_SLOT_ID_NV, device.nvidia_pci_slot_id, "CL_DEVICE_PCI_SLOT_ID_NV"); + } + + device.has_cl_khr_spir = (extensions.count("cl_khr_spir") != 0); + + return true; +} + +template +bool OpenCLEnum::queryDeviceInfo(cl_device_id device_id, unsigned int param, T &value, const std::string ¶m_name) +{ + cl_int res = clGetDeviceInfo(device_id, param, sizeof(value), &value, NULL); + if (res != CL_SUCCESS) { + std::cerr << "clGetDeviceInfo(" << param_name << ") failed: " << ocl::errorString(res) << std::endl; + return false; + } + return true; +} + +bool OpenCLEnum::queryDeviceInfo(cl_device_id device_id, unsigned int param, std::string &value, const std::string ¶m_name, size_t max_size) +{ + cl_int res; + if (max_size == 0) { + res = clGetDeviceInfo(device_id, param, 0, NULL, &max_size); + if (res != CL_SUCCESS) { + std::cerr << "clGetDeviceInfo(" << param_name << ") failed: " << ocl::errorString(res) << std::endl; + return false; + } + } + + std::vector data(max_size); + res = clGetDeviceInfo(device_id, param, max_size, data.data(), &max_size); + if (res != CL_SUCCESS) { + std::cerr << "clGetDeviceInfo(" << param_name << ") failed: " << ocl::errorString(res) << std::endl; + return false; + } + + value.assign(data.begin(), data.begin() + 
max_size); + value = value.c_str(); // remove trailing null chars + return true; +} + +bool OpenCLEnum::queryPlatformInfo(cl_platform_id platform_id, unsigned int param, std::string &value, const std::string ¶m_name, size_t max_size) +{ + cl_int res; + + std::vector data(max_size); + res = clGetPlatformInfo(platform_id, param, max_size, data.data(), &max_size); + if (res != CL_SUCCESS) { + std::cerr << "clGetPlatformInfo(" << param_name << ") failed: " << ocl::errorString(res) << std::endl; + return false; + } + + value.assign(data.begin(), data.begin() + max_size); + value = value.c_str(); // remove trailing null chars + return true; +} + +bool OpenCLEnum::queryExtensionList(cl_device_id device_id, std::set &extensions) +{ + std::string extensions_string; + if (!queryDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, extensions_string, "CL_DEVICE_EXTENSIONS")) + return false; + + std::vector tokens = split(extensions_string, " "); + extensions.insert(tokens.begin(), tokens.end()); + return true; +} + +bool OpenCLEnum::enumDevices(cl_platform_id platform_id) +{ + cl_int ciErrNum; + + cl_uint uiNumDevices = 0; // number of devices available + cl_device_type device_type = CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_ACCELERATOR; + if (OCL_CPU_DEVICES_ENABLED) { + device_type |= CL_DEVICE_TYPE_CPU; + } + + ciErrNum = clGetDeviceIDs(platform_id, device_type, 0, NULL, &uiNumDevices); + if (ciErrNum != CL_SUCCESS && ciErrNum != CL_DEVICE_NOT_FOUND) { + std::cerr << "clGetDeviceIDs failed: " << ocl::errorString(ciErrNum) << std::endl; + return false; + } + + if (ciErrNum == CL_DEVICE_NOT_FOUND || uiNumDevices == 0) + return true; + + std::vector cdDevices(uiNumDevices); + + ciErrNum = clGetDeviceIDs(platform_id, device_type, uiNumDevices, cdDevices.data(), NULL); + if (ciErrNum != CL_SUCCESS) { + std::cerr << "clGetDeviceIDs for " << uiNumDevices << " devices failed: " << ocl::errorString(ciErrNum) << std::endl; + return false; + } + + for (cl_uint i = 0; i < uiNumDevices; i++) { + 
Device device; + + device.id = cdDevices[i]; + device.platform_id = platform_id; + + if (!queryDeviceInfo(device)) { + std::cerr << device.name << ": can't query device info" << std::endl; + continue; + } + +#ifdef SPIR_SUPPORT + if (!device.has_cl_khr_spir) { + #ifdef CUDA_SUPPORT + if (device.vendor_id != ocl::ID_NVIDIA && device.vendor.find("NVIDIA") == std::string::npos) { + #endif + std::cerr << device.name << ": no SPIR support" << std::endl; + #ifdef CUDA_SUPPORT + } + #endif + continue; + } +#endif + +#ifdef CUDA_SUPPORT + if (device.vendor_id == ocl::ID_NVIDIA || device.vendor.find("NVIDIA") != std::string::npos) { + continue; + } +#endif + + devices_.push_back(device); + } + + return true; +} + +bool OpenCLEnum::compareDevice(const Device &dev1, const Device &dev2) +{ + if (dev1.name > dev2.name) return false; + if (dev1.name < dev2.name) return true; + if (dev1.id > dev2.id) return false; + return true; +} + +bool OpenCLEnum::enumDevices() +{ + if (!ocl_init()) { + std::cerr << "Can't load OpenCL library" << std::endl; + return false; + } + + if (!enumPlatforms()) + return false; + + for (size_t k = 0; k < platforms_.size(); k++) { + if (!enumDevices(platforms_[k].id)) { + std::cerr << platforms_[k].name << ": can't enumerate devices" << std::endl; + } + } + + std::sort(devices_.begin(), devices_.end(), compareDevice); + + return true; +} + +std::shared_ptr OpenCLEnum::Device::createEngine(bool printInfo) { + std::shared_ptr engine(new ocl::OpenCLEngine()); + engine->init(platform_id, id, 0, printInfo); + return engine; +} diff --git a/libs/gpu/libgpu/opencl/enum.h b/libs/gpu/libgpu/opencl/enum.h new file mode 100644 index 0000000..f7db09d --- /dev/null +++ b/libs/gpu/libgpu/opencl/enum.h @@ -0,0 +1,77 @@ +#pragma once + +#include +#include +#include + +typedef struct _cl_platform_id * cl_platform_id; +typedef struct _cl_device_id * cl_device_id; + +class OpenCLEnum { +public: + OpenCLEnum(); + ~OpenCLEnum(); + + class Device { + public: + Device() + { + 
device_type = 0; + compute_units = 0; + mem_size = 0; + clock = 0; + nvidia_pci_bus_id = 0; + nvidia_pci_slot_id = 0; + has_cl_khr_spir = false; + } + + cl_device_id id; + unsigned int vendor_id; + cl_platform_id platform_id; + cl_device_type device_type; + std::string name; + std::string vendor; + std::string version; + unsigned int compute_units; + unsigned long long mem_size; + unsigned int clock; + unsigned int nvidia_pci_bus_id; + unsigned int nvidia_pci_slot_id; + bool has_cl_khr_spir; + + ocl::sh_ptr_ocl_engine createEngine(bool printInfo=false); + + bool isCPU(void) { return device_type == CL_DEVICE_TYPE_CPU; } + bool isGPU(void) { return device_type == CL_DEVICE_TYPE_GPU; } + + bool printInfo() const; + }; + + class Platform { + public: + cl_platform_id id; + std::string name; + std::string vendor; + std::string version; + }; + + bool enumDevices(); + std::vector & devices() { return devices_; } + std::vector & platforms() { return platforms_; } + +protected: + bool enumPlatforms(); + bool enumDevices(cl_platform_id platform_id); + + bool queryDeviceInfo(Device &device); + bool queryDeviceInfo(cl_device_id device_id, unsigned int param, std::string &value, const std::string ¶m_name, size_t max_size = 0); + template + bool queryDeviceInfo(cl_device_id device_id, unsigned int param, T &value, const std::string ¶m_name); + bool queryPlatformInfo(cl_platform_id platform_id, unsigned int param, std::string &value, const std::string ¶m_name, size_t max_size); + bool queryExtensionList(cl_device_id device_id, std::set &extensions); + + static bool compareDevice(const Device &dev1, const Device &dev2); + + std::vector devices_; + std::vector platforms_; +}; diff --git a/libs/gpu/libgpu/opencl/utils.cpp b/libs/gpu/libgpu/opencl/utils.cpp new file mode 100644 index 0000000..a2d8214 --- /dev/null +++ b/libs/gpu/libgpu/opencl/utils.cpp @@ -0,0 +1,61 @@ +#include +#include "utils.h" + +namespace ocl { + +std::string errorString(cl_int code) +{ + switch (code) { + case 
CL_SUCCESS: return "CL_SUCCESS"; + case CL_DEVICE_NOT_FOUND: return "CL_DEVICE_NOT_FOUND"; + case CL_DEVICE_NOT_AVAILABLE: return "CL_DEVICE_NOT_AVAILABLE"; + case CL_COMPILER_NOT_AVAILABLE: return "CL_COMPILER_NOT_AVAILABLE"; + case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + case CL_OUT_OF_RESOURCES: return "CL_OUT_OF_RESOURCES"; + case CL_OUT_OF_HOST_MEMORY: return "CL_OUT_OF_HOST_MEMORY"; + case CL_PROFILING_INFO_NOT_AVAILABLE: return "CL_PROFILING_INFO_NOT_AVAILABLE"; + case CL_MEM_COPY_OVERLAP: return "CL_MEM_COPY_OVERLAP"; + case CL_IMAGE_FORMAT_MISMATCH: return "CL_IMAGE_FORMAT_MISMATCH"; + case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; + case CL_BUILD_PROGRAM_FAILURE: return "CL_BUILD_PROGRAM_FAILURE"; + case CL_MAP_FAILURE: return "CL_MAP_FAILURE"; + + case CL_INVALID_VALUE: return "CL_INVALID_VALUE"; + case CL_INVALID_DEVICE_TYPE: return "CL_INVALID_DEVICE_TYPE"; + case CL_INVALID_PLATFORM: return "CL_INVALID_PLATFORM"; + case CL_INVALID_DEVICE: return "CL_INVALID_DEVICE"; + case CL_INVALID_CONTEXT: return "CL_INVALID_CONTEXT"; + case CL_INVALID_QUEUE_PROPERTIES: return "CL_INVALID_QUEUE_PROPERTIES"; + case CL_INVALID_COMMAND_QUEUE: return "CL_INVALID_COMMAND_QUEUE"; + case CL_INVALID_HOST_PTR: return "CL_INVALID_HOST_PTR"; + case CL_INVALID_MEM_OBJECT: return "CL_INVALID_MEM_OBJECT"; + case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; + case CL_INVALID_IMAGE_SIZE: return "CL_INVALID_IMAGE_SIZE"; + case CL_INVALID_SAMPLER: return "CL_INVALID_SAMPLER"; + case CL_INVALID_BINARY: return "CL_INVALID_BINARY"; + case CL_INVALID_BUILD_OPTIONS: return "CL_INVALID_BUILD_OPTIONS"; + case CL_INVALID_PROGRAM: return "CL_INVALID_PROGRAM"; + case CL_INVALID_PROGRAM_EXECUTABLE: return "CL_INVALID_PROGRAM_EXECUTABLE"; + case CL_INVALID_KERNEL_NAME: return "CL_INVALID_KERNEL_NAME"; + case CL_INVALID_KERNEL_DEFINITION: return "CL_INVALID_KERNEL_DEFINITION"; + case 
CL_INVALID_KERNEL: return "CL_INVALID_KERNEL"; + case CL_INVALID_ARG_INDEX: return "CL_INVALID_ARG_INDEX"; + case CL_INVALID_ARG_VALUE: return "CL_INVALID_ARG_VALUE"; + case CL_INVALID_ARG_SIZE: return "CL_INVALID_ARG_SIZE"; + case CL_INVALID_KERNEL_ARGS: return "CL_INVALID_KERNEL_ARGS"; + case CL_INVALID_WORK_DIMENSION: return "CL_INVALID_WORK_DIMENSION"; + case CL_INVALID_WORK_GROUP_SIZE: return "CL_INVALID_WORK_GROUP_SIZE"; + case CL_INVALID_WORK_ITEM_SIZE: return "CL_INVALID_WORK_ITEM_SIZE"; + case CL_INVALID_GLOBAL_OFFSET: return "CL_INVALID_GLOBAL_OFFSET"; + case CL_INVALID_EVENT_WAIT_LIST: return "CL_INVALID_EVENT_WAIT_LIST"; + case CL_INVALID_EVENT: return "CL_INVALID_EVENT"; + case CL_INVALID_OPERATION: return "CL_INVALID_OPERATION"; + case CL_INVALID_GL_OBJECT: return "CL_INVALID_GL_OBJECT"; + case CL_INVALID_BUFFER_SIZE: return "CL_INVALID_BUFFER_SIZE"; + case CL_INVALID_MIP_LEVEL: return "CL_INVALID_MIP_LEVEL"; + case CL_INVALID_GLOBAL_WORK_SIZE: return "CL_INVALID_GLOBAL_WORK_SIZE"; + default: return "CL_UNKNOWN_ERROR_CODE_" + to_string(code); + } +} + +} diff --git a/libs/gpu/libgpu/opencl/utils.h b/libs/gpu/libgpu/opencl/utils.h new file mode 100644 index 0000000..f866b9c --- /dev/null +++ b/libs/gpu/libgpu/opencl/utils.h @@ -0,0 +1,69 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace ocl { + +#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3 // since OpenCL 1.1 + +#define CL_NV_DEVICE_ATTRIBUTE_QUERY_EXT "cl_nv_device_attribute_query" +#define CL_AMD_DEVICE_ATTRIBUTE_QUERY_EXT "cl_amd_device_attribute_query" + +#ifndef CL_DEVICE_PCI_BUS_ID_NV +#define CL_DEVICE_PCI_BUS_ID_NV 0x4008 +#endif + +#ifndef CL_DEVICE_PCI_SLOT_ID_NV +#define CL_DEVICE_PCI_SLOT_ID_NV 0x4009 +#endif + +#define CL_DEVICE_TOPOLOGY_AMD 0x4037 +#define CL_DEVICE_BOARD_NAME_AMD 0x4038 +#define CL_DEVICE_GLOBAL_FREE_MEMORY_AMD 0x4039 +#define CL_DEVICE_WAVEFRONT_WIDTH_AMD 0x4043 + +enum VENDOR { + ID_AMD = 0x1002, + ID_INTEL = 0x8086, + 
ID_NVIDIA = 0x10de, +}; + +class ocl_exception : public gpu::gpu_exception { +public: + ocl_exception(std::string msg) throw () : gpu_exception(msg) { } + ocl_exception(const char *msg) throw () : gpu_exception(msg) { } + ocl_exception() throw () : gpu_exception("OpenCL exception") { } +}; + +class ocl_bad_alloc : public gpu::gpu_bad_alloc { +public: + ocl_bad_alloc(std::string msg) throw () : gpu_bad_alloc(msg) { } + ocl_bad_alloc(const char *msg) throw () : gpu_bad_alloc(msg) { } + ocl_bad_alloc() throw () : gpu_bad_alloc("OpenCL exception") { } +}; + +std::string errorString(cl_int code); + +static inline void reportError(cl_int err, int line, std::string prefix="") +{ + if (CL_SUCCESS == err) + return; + + std::string message = prefix + errorString(err) + " (" + to_string(err) + ")" + " at line " + to_string(line); + + switch (err) { + case CL_MEM_OBJECT_ALLOCATION_FAILURE: + throw ocl_bad_alloc(message); + default: + throw ocl_exception(message); + } +} + +#define OCL_SAFE_CALL(expr) ocl::reportError(expr, __LINE__, "") +#define OCL_SAFE_CALL_MESSAGE(expr, message) ocl::reportError(expr, __LINE__, message) + +} diff --git a/libs/gpu/libgpu/shared_device_buffer.cpp b/libs/gpu/libgpu/shared_device_buffer.cpp new file mode 100644 index 0000000..4d164c6 --- /dev/null +++ b/libs/gpu/libgpu/shared_device_buffer.cpp @@ -0,0 +1,428 @@ +#include "shared_device_buffer.h" +#include "context.h" +#include +#include +#include + +#ifdef CUDA_SUPPORT +#include +#include +#endif + +#ifdef _WIN32 +#include +#endif + +namespace gpu { + +shared_device_buffer::shared_device_buffer() +{ + buffer_ = 0; + data_ = 0; + type_ = Context::TypeUndefined; + size_ = 0; + offset_ = 0; +} + +shared_device_buffer::~shared_device_buffer() +{ + decref(); +} + +shared_device_buffer::shared_device_buffer(const shared_device_buffer &other, size_t offset) +{ + buffer_ = other.buffer_; + data_ = other.data_; + type_ = other.type_; + size_ = other.size_; + offset_ = other.offset_ + offset; + incref(); 
+} + +shared_device_buffer &shared_device_buffer::operator= (const shared_device_buffer &other) +{ + if (this != &other) { + decref(); + buffer_ = other.buffer_; + data_ = other.data_; + type_ = other.type_; + size_ = other.size_; + offset_ = other.offset_; + incref(); + } + + return *this; +} + +void shared_device_buffer::swap(shared_device_buffer &other) +{ + std::swap(buffer_, other.buffer_); + std::swap(data_, other.data_); + std::swap(type_, other.type_); + std::swap(size_, other.size_); + std::swap(offset_, other.offset_); +} + +void shared_device_buffer::incref() +{ + if (!buffer_) + return; + +#if defined(_WIN64) + InterlockedIncrement64((LONGLONG *) buffer_); +#elif defined(_WIN32) + InterlockedIncrement((LONG *) buffer_); +#else + __sync_add_and_fetch((long long *) buffer_, 1); +#endif +} + +void shared_device_buffer::decref() +{ + if (!buffer_) + return; + + long long count = 0; + +#if defined(_WIN64) + count = InterlockedDecrement64((LONGLONG *) buffer_); +#elif defined(_WIN32) + count = InterlockedDecrement((LONG *) buffer_); +#else + count = __sync_sub_and_fetch((long long *) buffer_, 1); +#endif + + if (!count) { + switch (type_) { +#ifdef CUDA_SUPPORT + case Context::TypeCUDA: + cudaFree(data_); + break; +#endif + case Context::TypeOpenCL: + clReleaseMemObject((cl_mem) data_); + break; + default: + gpu::raiseException(__FILE__, __LINE__, "No GPU context!"); + } + + delete [] buffer_; + } + + buffer_ = 0; + data_ = 0; + type_ = Context::TypeUndefined; + size_ = 0; + offset_ = 0; +} + +shared_device_buffer shared_device_buffer::create(size_t size) +{ + shared_device_buffer res; + res.resize(size); + return res; +} + +void *shared_device_buffer::cuptr() const +{ + if (type_ == Context::TypeOpenCL) + throw gpu_exception("GPU buffer type mismatch"); + + return (char *) data_ + offset_; +} + +cl_mem shared_device_buffer::clmem() const +{ + if (type_ == Context::TypeCUDA) + throw gpu_exception("GPU buffer type mismatch"); + + return (cl_mem) data_; +} + 
+size_t shared_device_buffer::cloffset() const +{ + if (type_ == Context::TypeCUDA) + throw gpu_exception("GPU buffer type mismatch"); + + return offset_; +} + +size_t shared_device_buffer::size() const +{ + return size_; +} + +bool shared_device_buffer::isNull() const +{ + return data_ == NULL; +} + +void shared_device_buffer::reset() +{ + decref(); +} + +void shared_device_buffer::resize(size_t size) +{ + if (size == size_) + return; + + decref(); + + Context context; + Context::Type type = context.type(); + + switch (type) { +#ifdef CUDA_SUPPORT + case Context::TypeCUDA: + CUDA_SAFE_CALL( cudaMalloc(&data_, size) ); + break; +#endif + case Context::TypeOpenCL: + data_ = context.cl()->createBuffer(CL_MEM_READ_WRITE, size); + break; + default: + gpu::raiseException(__FILE__, __LINE__, "No GPU context!"); + } + + buffer_ = new unsigned char [8]; + * (long long *) buffer_ = 0; + incref(); + + type_ = type; + size_ = size; + offset_ = 0; +} + +void shared_device_buffer::grow(size_t size, float reserveMultiplier) +{ + if (size > size_) + resize((size_t) (size * reserveMultiplier)); +} + +void shared_device_buffer::write(const void *data, size_t size) +{ + if (size == 0) + return; + + if (size > size_) + throw gpu_exception("Too many data for this device buffer: " + to_string(size) + " > " + to_string(size_)); + + Context context; + switch (context.type()) { +#ifdef CUDA_SUPPORT + case Context::TypeCUDA: + CUDA_SAFE_CALL(cudaMemcpy(cuptr(), data, size, cudaMemcpyHostToDevice)); + break; +#endif + case Context::TypeOpenCL: + context.cl()->writeBuffer((cl_mem) data_, CL_TRUE, offset_, size, data); + break; + default: + gpu::raiseException(__FILE__, __LINE__, "No GPU context!"); + } +} + +void shared_device_buffer::write(const shared_device_buffer &buffer, size_t size) +{ + if (!size) + return; + + if (size > size_) + throw gpu_exception("Too many data for this device buffer: " + to_string(size) + " > " + to_string(size_)); + + Context context; + switch (context.type()) { 
+#ifdef CUDA_SUPPORT + case Context::TypeCUDA: + CUDA_SAFE_CALL(cudaMemcpy(cuptr(), buffer.cuptr(), size, cudaMemcpyDeviceToDevice)); + break; +#endif + case Context::TypeOpenCL: + context.cl()->copyBuffer(buffer.clmem(), clmem(), buffer.cloffset(), cloffset(), size); + break; + default: + gpu::raiseException(__FILE__, __LINE__, "No GPU context!"); + } +} + +void shared_device_buffer::write(const shared_host_buffer &buffer, size_t size) +{ + if (!size) + return; + + if (size > size_) + throw gpu_exception("Too many data for this device buffer: " + to_string(size) + " > " + to_string(size_)); + + Context context; + switch (context.type()) { +#ifdef CUDA_SUPPORT + case Context::TypeCUDA: + CUDA_SAFE_CALL(cudaMemcpy(cuptr(), buffer.get(), size, cudaMemcpyHostToDevice)); + break; +#endif + case Context::TypeOpenCL: + context.cl()->writeBuffer((cl_mem) data_, CL_TRUE, offset_, size, buffer.get()); + break; + default: + gpu::raiseException(__FILE__, __LINE__, "No GPU context!"); + } +} + +void shared_device_buffer::write2D(size_t dpitch, const void *src, size_t spitch, size_t width, size_t height) +{ + if (spitch == width && dpitch == width) { + write(src, width * height); + return; + } + + Context context; + switch (context.type()) { +#ifdef CUDA_SUPPORT + case Context::TypeCUDA: + CUDA_SAFE_CALL(cudaMemcpy2D(cuptr(), dpitch, src, spitch, width, height, cudaMemcpyHostToDevice)); + break; +#endif + case Context::TypeOpenCL: + { + size_t buffer_origin[3] = { offset_, 0, 0 }; + size_t host_origin[3] = { 0, 0, 0 }; + size_t region[3] = { width, height, 1 }; + context.cl()->writeBufferRect((cl_mem) data_, CL_TRUE, buffer_origin, host_origin, region, dpitch, 0, spitch, 0, src); + } + break; + default: + gpu::raiseException(__FILE__, __LINE__, "No GPU context!"); + } +} + +void shared_device_buffer::read(void *data, size_t size, size_t offset) const +{ + if (size == 0) + return; + if (size > size_) + throw gpu_exception("Not enough data in this device buffer: " + 
to_string(size) + " > " + to_string(size_)); + + Context context; + switch (context.type()) { +#ifdef CUDA_SUPPORT + case Context::TypeCUDA: + CUDA_SAFE_CALL(cudaMemcpy(data, (char *) cuptr() + offset, size, cudaMemcpyDeviceToHost)); + break; +#endif + case Context::TypeOpenCL: + context.cl()->readBuffer((cl_mem) data_, CL_TRUE, offset_ + offset, size, data); + break; + default: + gpu::raiseException(__FILE__, __LINE__, "No GPU context!"); + } +} + +void shared_device_buffer::read2D(size_t spitch, void *dst, size_t dpitch, size_t width, size_t height) const +{ + if (spitch == width && dpitch == width) { + read(dst, width * height); + return; + } + + Context context; + switch (context.type()) { +#ifdef CUDA_SUPPORT + case Context::TypeCUDA: + CUDA_SAFE_CALL(cudaMemcpy2D(dst, dpitch, cuptr(), spitch, width, height, cudaMemcpyDeviceToHost)); + break; +#endif + case Context::TypeOpenCL: + { + size_t buffer_origin[3] = { offset_, 0, 0 }; + size_t host_origin[3] = { 0, 0, 0 }; + size_t region[3] = { width, height, 1 }; + context.cl()->readBufferRect((cl_mem) data_, CL_TRUE, buffer_origin, host_origin, region, spitch, 0, dpitch, 0, dst); + } + break; + default: + gpu::raiseException(__FILE__, __LINE__, "No GPU context!"); + } +} + +void shared_device_buffer::copyTo(shared_device_buffer &that, size_t size) const +{ + if (size == 0) + return; + if (size > size_) + throw gpu_exception("Not enough data in this device buffer: " + to_string(size) + " > " + to_string(size_)); + + Context context; + switch (context.type()) { +#ifdef CUDA_SUPPORT + case Context::TypeCUDA: + CUDA_SAFE_CALL(cudaMemcpy((char *) that.cuptr(), (char *) cuptr(), size, cudaMemcpyDeviceToDevice)); + break; +#endif + case Context::TypeOpenCL: + context.cl()->copyBuffer(clmem(), that.clmem(), offset_, that.offset_, size); + break; + default: + gpu::raiseException(__FILE__, __LINE__, "No GPU context!"); + } +} + +template +shared_device_buffer_typed shared_device_buffer_typed::createN(size_t number) +{ + 
shared_device_buffer_typed res; + res.resizeN(number); + return res; +} + +template +size_t shared_device_buffer_typed::number() const +{ + return size_ / sizeof(T); +} + +template +void shared_device_buffer_typed::resizeN(size_t number) +{ + this->resize(number * sizeof(T)); +} + +template +void shared_device_buffer_typed::growN(size_t number, float reserveMultiplier) +{ + this->grow(number * sizeof(T), reserveMultiplier); +} + +template +T *shared_device_buffer_typed::cuptr() const +{ + return (T *) shared_device_buffer::cuptr(); +} + +template +void shared_device_buffer_typed::writeN(const T* data, size_t number) { + this->write(data, number * sizeof(T)); +} + +template +void shared_device_buffer_typed::readN(T* data, size_t number, size_t offset) const +{ + this->read(data, number * sizeof(T), offset * sizeof(T)); +} + +template +void shared_device_buffer_typed::copyToN(shared_device_buffer_typed &that, size_t number) const +{ + this->copyTo(that, number * sizeof(T)); +} + +template class shared_device_buffer_typed; +template class shared_device_buffer_typed; +template class shared_device_buffer_typed; +template class shared_device_buffer_typed; +template class shared_device_buffer_typed; +template class shared_device_buffer_typed; +template class shared_device_buffer_typed; +template class shared_device_buffer_typed; + +} diff --git a/libs/gpu/libgpu/shared_device_buffer.h b/libs/gpu/libgpu/shared_device_buffer.h new file mode 100644 index 0000000..1885fac --- /dev/null +++ b/libs/gpu/libgpu/shared_device_buffer.h @@ -0,0 +1,87 @@ +#pragma once + +#include +#include "shared_host_buffer.h" + +typedef struct _cl_mem *cl_mem; + +namespace gpu { + +class shared_device_buffer { +public: + shared_device_buffer(); + ~shared_device_buffer(); + shared_device_buffer(const shared_device_buffer &other, size_t offset = 0); + shared_device_buffer &operator= (const shared_device_buffer &other); + + void swap(shared_device_buffer &other); + void reset(); + size_t size() 
const; + void resize(size_t size); + void grow(size_t size, float reserveMultiplier=1.1f); + bool isNull() const; + + void * cuptr() const; + cl_mem clmem() const; + size_t cloffset() const; + + void write(const void *data, size_t size); + void write(const shared_device_buffer &buffer, size_t size); + void write(const shared_host_buffer &buffer, size_t size); + void write2D(size_t dpitch, const void *src, size_t spitch, size_t width, size_t height); + + void read(void *data, size_t size, size_t offset = 0) const; + void read2D(size_t spitch, void *dst, size_t dpitch, size_t width, size_t height) const; + + void copyTo(shared_device_buffer &that, size_t size) const; + + static shared_device_buffer create(size_t size); + +protected: + void incref(); + void decref(); + + unsigned char * buffer_; + void * data_; + int type_; + size_t size_; + size_t offset_; +}; + +template +class shared_device_buffer_typed : public shared_device_buffer { +public: + shared_device_buffer_typed() : shared_device_buffer() {} + shared_device_buffer_typed(const shared_device_buffer_typed &other, size_t offset) : shared_device_buffer(other, offset * sizeof(T)) {} + explicit shared_device_buffer_typed(const shared_device_buffer &other) : shared_device_buffer(other) {} + + size_t number() const; + + void resizeN(size_t number); + void growN(size_t number, float reserveMultiplier=1.1f); + + T * cuptr() const; + + void writeN(const T* data, size_t number); + + void readN(T* data, size_t number, size_t offset = 0) const; + + void copyToN(shared_device_buffer_typed &that, size_t number) const; + + static shared_device_buffer_typed createN(size_t number); +}; + +typedef shared_device_buffer gpu_mem_any; + +typedef shared_device_buffer_typed gpu_mem_8i; +typedef shared_device_buffer_typed gpu_mem_16i; +typedef shared_device_buffer_typed gpu_mem_32i; +typedef shared_device_buffer_typed gpu_mem_8u; +typedef shared_device_buffer_typed gpu_mem_16u; +typedef shared_device_buffer_typed gpu_mem_32u; 
+typedef shared_device_buffer_typed gpu_mem_32f; +typedef shared_device_buffer_typed gpu_mem_64f; + +#define gpu_mem shared_device_buffer_typed + +} diff --git a/libs/gpu/libgpu/shared_host_buffer.cpp b/libs/gpu/libgpu/shared_host_buffer.cpp new file mode 100644 index 0000000..cbdfbac --- /dev/null +++ b/libs/gpu/libgpu/shared_host_buffer.cpp @@ -0,0 +1,206 @@ +#include "shared_host_buffer.h" +#include "context.h" +#include +#include + +#ifdef CUDA_SUPPORT +#include +#endif + +#ifdef _WIN32 +#include +#endif + +namespace gpu { + +shared_host_buffer::shared_host_buffer() +{ + buffer_ = 0; + data_ = 0; + type_ = Context::TypeUndefined; + size_ = 0; +} + +shared_host_buffer::~shared_host_buffer() +{ + decref(); +} + +shared_host_buffer::shared_host_buffer(const shared_host_buffer &other) +{ + buffer_ = other.buffer_; + data_ = other.data_; + type_ = other.type_; + size_ = other.size_; + incref(); +} + +shared_host_buffer &shared_host_buffer::operator= (const shared_host_buffer &other) +{ + if (this != &other) { + decref(); + buffer_ = other.buffer_; + data_ = other.data_; + type_ = other.type_; + size_ = other.size_; + incref(); + } + + return *this; +} + +void shared_host_buffer::swap(shared_host_buffer &other) +{ + std::swap(buffer_, other.buffer_); + std::swap(data_, other.data_); + std::swap(type_, other.type_); + std::swap(size_, other.size_); +} + +void shared_host_buffer::incref() +{ + if (!buffer_) + return; + +#if defined(_WIN64) + InterlockedIncrement64((LONGLONG *) buffer_); +#elif defined(_WIN32) + InterlockedIncrement((LONG *) buffer_); +#else + __sync_add_and_fetch((long long *) buffer_, 1); +#endif +} + +void shared_host_buffer::decref() +{ + if (!buffer_) + return; + + long long count = 0; + +#if defined(_WIN64) + count = InterlockedDecrement64((LONGLONG *) buffer_); +#elif defined(_WIN32) + count = InterlockedDecrement((LONG *) buffer_); +#else + count = __sync_sub_and_fetch((long long *) buffer_, 1); +#endif + + if (count) + return; + + switch 
(type_) { +#ifdef CUDA_SUPPORT + case Context::TypeCUDA: + cudaFreeHost(data_); + break; +#endif + case Context::TypeOpenCL: + free(data_); + break; + default: + gpu::raiseException(__FILE__, __LINE__, "No GPU context!"); + } + + delete [] buffer_; + + buffer_ = 0; + data_ = 0; + type_ = Context::TypeUndefined; + size_ = 0; +} + +shared_host_buffer shared_host_buffer::create(size_t size) +{ + shared_host_buffer res; + res.resize(size); + return res; +} + +void *shared_host_buffer::get() const +{ + return data_; +} + +size_t shared_host_buffer::size() const +{ + return size_; +} + +void shared_host_buffer::resize(size_t size) +{ + if (size == size_) + return; + + decref(); + + buffer_ = new unsigned char [8]; + * (long long *) buffer_ = 0; + incref(); + + Context context; + Context::Type type = context.type(); + + switch (type) { +#ifdef CUDA_SUPPORT + case Context::TypeCUDA: + CUDA_SAFE_CALL( cudaMallocHost(&data_, size) ); + break; +#endif + case Context::TypeOpenCL: + // NOTTODO: implement pinned memory in opencl + // currently we use a plain paged memory buffer + data_ = malloc(size); + if (!data_) + throw std::bad_alloc(); + break; + default: + gpu::raiseException(__FILE__, __LINE__, "No GPU context!"); + } + + type_ = type; + size_ = size; +} + +void shared_host_buffer::grow(size_t size) +{ + if (size > size_) + resize(size); +} + +template +shared_host_buffer_typed shared_host_buffer_typed::createN(size_t number) +{ + shared_host_buffer_typed res; + res.resizeN(number); + return res; +} + +template +void shared_host_buffer_typed::resizeN(size_t number) +{ + this->resize(number * sizeof(T)); +} + +template +T *shared_host_buffer_typed::get() const +{ + return (T*) data_; +} + +template +size_t shared_host_buffer_typed::number() const +{ + return this->size_ / sizeof(T); +} + +template class shared_host_buffer_typed; +template class shared_host_buffer_typed; +template class shared_host_buffer_typed; +template class shared_host_buffer_typed; +template class 
shared_host_buffer_typed; +template class shared_host_buffer_typed; +template class shared_host_buffer_typed; +template class shared_host_buffer_typed; + +} diff --git a/libs/gpu/libgpu/shared_host_buffer.h b/libs/gpu/libgpu/shared_host_buffer.h new file mode 100644 index 0000000..c2a74d6 --- /dev/null +++ b/libs/gpu/libgpu/shared_host_buffer.h @@ -0,0 +1,54 @@ +#pragma once + +#include +#include + +namespace gpu { + +class shared_host_buffer { +public: + shared_host_buffer(); + ~shared_host_buffer(); + shared_host_buffer(const shared_host_buffer &other); + shared_host_buffer &operator= (const shared_host_buffer &other); + + void swap(shared_host_buffer &other); + void * get() const; + size_t size() const; + void resize(size_t size); + void grow(size_t size); + + static shared_host_buffer create(size_t size); + +protected: + void incref(); + void decref(); + + unsigned char * buffer_; + void * data_; + int type_; + size_t size_; +}; + +template +class shared_host_buffer_typed : public shared_host_buffer { +public: + void resizeN(size_t number); + + T * get() const; + + size_t number() const; + + static shared_host_buffer_typed createN(size_t number); +}; + +typedef shared_host_buffer gpu_host_mem_any; + +typedef shared_host_buffer_typed gpu_host_mem_16i; +typedef shared_host_buffer_typed gpu_host_mem_32i; +typedef shared_host_buffer_typed gpu_host_mem_8u; +typedef shared_host_buffer_typed gpu_host_mem_16u; +typedef shared_host_buffer_typed gpu_host_mem_32u; +typedef shared_host_buffer_typed gpu_host_mem_32f; + +} diff --git a/libs/gpu/libgpu/utils.cpp b/libs/gpu/libgpu/utils.cpp new file mode 100644 index 0000000..f33f625 --- /dev/null +++ b/libs/gpu/libgpu/utils.cpp @@ -0,0 +1,121 @@ +#include "utils.h" +#include "context.h" + +#include +#include + + +void gpu::raiseException(std::string file, int line, std::string message) { + if (message.length() > 0) { + throw gpu_exception("Failure at " + file + ":" + to_string(line) + ": " + message); + } else { + throw 
gpu_exception("Failure at " + file + ":" + to_string(line)); + } +} + +template +size_t gpu::deviceTypeSize() { + Context context; +#ifdef CUDA_SUPPORT + if (context.type() == Context::TypeCUDA) { + return sizeof(T); + } else +#endif + if (context.type() == Context::TypeOpenCL) { + return sizeof(typename ocl::OpenCLType::type); + } else { + throw gpu_exception("No GPU active context!"); + } +} + +template +T gpu::deviceTypeMax() { + Context context; +#ifdef CUDA_SUPPORT + if (context.type() == Context::TypeCUDA) { + return std::numeric_limits::max(); + } else +#endif + if (context.type() == Context::TypeOpenCL) { + return ocl::OpenCLType::max(); + } else { + throw gpu_exception("No GPU active context!"); + } +} + +template +T gpu::deviceTypeMin() { + Context context; +#ifdef CUDA_SUPPORT + if (context.type() == Context::TypeCUDA) { + return std::numeric_limits::min(); + } else +#endif + if (context.type() == Context::TypeOpenCL) { + return ocl::OpenCLType::min(); + } else { + throw gpu_exception("No GPU active context!"); + } +} + +unsigned int gpu::calcNChunk(size_t n, size_t group_size, size_t max_size) +{ + if (n == 0) + return group_size; + + size_t work_parts_n = (n + max_size - 1) / max_size; + size_t exec_n = (n + work_parts_n - 1) / work_parts_n; + exec_n = (exec_n + group_size - 1) / group_size * group_size; + return (unsigned int) exec_n; +} + +unsigned int gpu::calcColsChunk(size_t width, size_t height, size_t group_size_x, size_t max_size) +{ + size_t work_parts_n = (width * height + max_size - 1) / max_size; + size_t ncols = (width + work_parts_n - 1) / work_parts_n; + ncols = (ncols + group_size_x - 1) / group_size_x * group_size_x; + return (unsigned int) ncols; +} + +unsigned int gpu::calcRowsChunk(size_t width, size_t height, size_t group_size_y, size_t max_size) +{ + size_t work_parts_n = (width * height + max_size - 1) / max_size; + size_t nrows = (height + work_parts_n - 1) / work_parts_n; + nrows = (nrows + group_size_y - 1) / group_size_y * 
group_size_y; + return (unsigned int) nrows; +} + +unsigned int gpu::calcZSlicesChunk(size_t x, size_t y, size_t z, size_t group_size_z, size_t max_size) +{ + size_t work_parts_n = (z * y * x + max_size - 1) / max_size; + size_t z_slices = (z + work_parts_n - 1) / work_parts_n; + z_slices = (z_slices + group_size_z - 1) / group_size_z * group_size_z; + return (unsigned int) z_slices; +} + +template size_t gpu::deviceTypeSize(); +template size_t gpu::deviceTypeSize(); +template size_t gpu::deviceTypeSize(); +template size_t gpu::deviceTypeSize(); +template size_t gpu::deviceTypeSize(); +template size_t gpu::deviceTypeSize(); +template size_t gpu::deviceTypeSize(); +template size_t gpu::deviceTypeSize(); + +template int8_t gpu::deviceTypeMax(); +template int16_t gpu::deviceTypeMax(); +template int32_t gpu::deviceTypeMax(); +template uint8_t gpu::deviceTypeMax(); +template uint16_t gpu::deviceTypeMax(); +template uint32_t gpu::deviceTypeMax(); +template float gpu::deviceTypeMax(); +template double gpu::deviceTypeMax(); + +template int8_t gpu::deviceTypeMin(); +template int16_t gpu::deviceTypeMin(); +template int32_t gpu::deviceTypeMin(); +template uint8_t gpu::deviceTypeMin(); +template uint16_t gpu::deviceTypeMin(); +template uint32_t gpu::deviceTypeMin(); +template float gpu::deviceTypeMin(); +template double gpu::deviceTypeMin(); diff --git a/libs/gpu/libgpu/utils.h b/libs/gpu/libgpu/utils.h new file mode 100644 index 0000000..1d6f88d --- /dev/null +++ b/libs/gpu/libgpu/utils.h @@ -0,0 +1,44 @@ +#pragma once + +#include +#include + +namespace gpu { + + class gpu_exception : public std::runtime_error { + public: + gpu_exception(std::string msg) throw () : runtime_error(msg) { } + gpu_exception(const char *msg) throw () : runtime_error(msg) { } + gpu_exception() throw () : runtime_error("GPU exception") { } + }; + + class gpu_bad_alloc : public gpu_exception { + public: + gpu_bad_alloc(std::string msg) throw () : gpu_exception(msg) { } + gpu_bad_alloc(const char 
*msg) throw () : gpu_exception(msg) { } + gpu_bad_alloc() throw () : gpu_exception("GPU exception") { } + }; + + void raiseException(std::string file, int line, std::string message); + + template + size_t deviceTypeSize(); + + template + T deviceTypeMax(); + + template + T deviceTypeMin(); + + inline unsigned int divup(unsigned int num, unsigned int denom) { + return (num + denom - 1) / denom; + } + + unsigned int calcNChunk(size_t n, size_t group_size, size_t max_size=1000*1000); + unsigned int calcColsChunk(size_t width, size_t height, size_t group_size_x, size_t max_size=1000*1000); + unsigned int calcRowsChunk(size_t width, size_t height, size_t group_size_y, size_t max_size=1000*1000); + unsigned int calcZSlicesChunk(size_t x, size_t y, size_t z, size_t group_size_z, size_t max_size=1000*1000); +} + +#define GPU_CHECKED_VERBOSE(x, message) if (!(x)) {gpu::raiseException(__FILE__, __LINE__, message);} +#define GPU_CHECKED(x) if (!(x)) {gpu::raiseException(__FILE__, __LINE__, "");} diff --git a/libs/gpu/libgpu/work_size.h b/libs/gpu/libgpu/work_size.h new file mode 100644 index 0000000..36eac11 --- /dev/null +++ b/libs/gpu/libgpu/work_size.h @@ -0,0 +1,84 @@ +#pragma once + +#include "utils.h" + +#ifdef CUDA_SUPPORT + #include +#endif + +namespace gpu { + class WorkSize { + public: + WorkSize(unsigned int groupSizeX, unsigned int workSizeX) + { + init(1, groupSizeX, 1, 1, workSizeX, 1, 1); + } + + WorkSize(unsigned int groupSizeX, unsigned int groupSizeY, unsigned int workSizeX, unsigned int workSizeY) + { + init(2, groupSizeX, groupSizeY, 1, workSizeX, workSizeY, 1); + } + + WorkSize(unsigned int groupSizeX, unsigned int groupSizeY, unsigned int groupSizeZ,unsigned int workSizeX, unsigned int workSizeY, unsigned int workSizeZ) + { + init(3, groupSizeX, groupSizeY, groupSizeZ, workSizeX, workSizeY, workSizeZ); + } + +#ifdef CUDA_SUPPORT + const dim3 &cuBlockSize() const { + return blockSize; + } + + const dim3 &cuGridSize() const { + return gridSize; + } +#endif 
+ + const size_t *clLocalSize() const { + return localWorkSize; + } + + const size_t *clGlobalSize() const { + return globalWorkSize; + } + + int clWorkDim() const { + return workDims; + } + + private: + void init(int workDims, unsigned int groupSizeX, unsigned int groupSizeY, unsigned int groupSizeZ, unsigned int workSizeX, unsigned int workSizeY, unsigned int workSizeZ) + { + this->workDims = workDims; + + localWorkSize[0] = groupSizeX; + localWorkSize[1] = groupSizeY; + localWorkSize[2] = groupSizeZ; + + workSizeX = gpu::divup(workSizeX, groupSizeX) * groupSizeX; + workSizeY = gpu::divup(workSizeY, groupSizeY) * groupSizeY; + workSizeZ = gpu::divup(workSizeZ, groupSizeZ) * groupSizeZ; + + globalWorkSize[0] = workSizeX; + globalWorkSize[1] = workSizeY; + globalWorkSize[2] = workSizeZ; + +#ifdef CUDA_SUPPORT + blockSize = dim3(groupSizeX, groupSizeY, groupSizeZ); + gridSize = dim3(gpu::divup(workSizeX, groupSizeX), + gpu::divup(workSizeY, groupSizeY), + gpu::divup(workSizeZ, groupSizeZ)); +#endif + } + + private: + size_t localWorkSize[3]; + size_t globalWorkSize[3]; + int workDims; + +#ifdef CUDA_SUPPORT + dim3 blockSize; + dim3 gridSize; +#endif + }; +} \ No newline at end of file diff --git a/libs/utils/CMakeLists.txt b/libs/utils/CMakeLists.txt new file mode 100644 index 0000000..ddf23a3 --- /dev/null +++ b/libs/utils/CMakeLists.txt @@ -0,0 +1,37 @@ +cmake_minimum_required(VERSION 3.1) + +project(libutils) + +set(HEADERS + libutils/fast_random.h + libutils/misc.h + libutils/string_utils.h + libutils/thread_mutex.h + libutils/timer.h + ) + +set(SOURCES + libutils/misc.cpp + libutils/string_utils.cpp + libutils/thread_mutex.cpp + ) + +option(GPU_CUDA_SUPPORT "CUDA support." 
OFF) + +set(CMAKE_CXX_STANDARD 11) + +set(THREADS_PREFER_PTHREAD_FLAG ON) +find_package(Threads REQUIRED) + +if (GPU_CUDA_SUPPORT) + find_package (CUDA REQUIRED) + + add_definitions(-DCUDA_SUPPORT) + + cuda_add_library(${PROJECT_NAME} ${SOURCES} ${HEADERS}) +else() + add_library(${PROJECT_NAME} ${SOURCES} ${HEADERS}) +endif() + +target_link_libraries(${PROJECT_NAME} Threads::Threads libgpu) +target_include_directories(${PROJECT_NAME} PUBLIC ${PROJECT_SOURCE_DIR}) diff --git a/libs/utils/libutils/fast_random.h b/libs/utils/libutils/fast_random.h new file mode 100644 index 0000000..a03a077 --- /dev/null +++ b/libs/utils/libutils/fast_random.h @@ -0,0 +1,38 @@ +#pragma once + +#include + +// See https://stackoverflow.com/a/1640399 +class FastRandom { +public: + FastRandom(unsigned long seed=123456789) { + reset(seed); + } + + void reset(unsigned long seed=123456789) { + x = seed; + y = 362436069; + z = 521288629; + } + + // Returns pseudo-random value in range [min; max] (inclusive) + int next(int min=0, int max=std::numeric_limits::max()) { + x ^= x << 16; + x ^= x >> 5; + x ^= x << 1; + + unsigned long t = x; + x = y; + y = z; + z = t ^ x ^ y; + + return min + (unsigned int) (z % (((unsigned long) max) - min + 1)); + } + + float nextf() { + return (next() * 2000.0f / std::numeric_limits::max()) - 1000.0f; + } + +private: + unsigned long x, y, z; +}; diff --git a/libs/utils/libutils/misc.cpp b/libs/utils/libutils/misc.cpp new file mode 100644 index 0000000..bf29d01 --- /dev/null +++ b/libs/utils/libutils/misc.cpp @@ -0,0 +1,74 @@ +#include "misc.h" + +#ifdef CUDA_SUPPORT +#include +#endif + +void gpu::printDeviceInfo(gpu::Device &device) +{ +#ifdef CUDA_SUPPORT + if (device.supports_cuda) { + int driverVersion = 239; + cudaDriverGetVersion(&driverVersion); + std::cout << "GPU. 
" << device.name << " (CUDA " << driverVersion << ")."; + } else +#endif + { + ocl::DeviceInfo info; + info.init(device.device_id_opencl); + if (info.device_type == CL_DEVICE_TYPE_GPU) { + std::cout << "GPU."; + } else if (info.device_type == CL_DEVICE_TYPE_CPU) { + std::cout << "CPU."; + } else { + throw std::runtime_error( + "Only CPU and GPU supported! But type=" + to_string(info.device_type) + " encountered!"); + } + std::cout << " " << info.device_name << "."; + if (info.device_type == CL_DEVICE_TYPE_CPU) { + std::cout << " " << info.vendor_name << "."; + } + } + + if (device.supportsFreeMemoryQuery()) { + std::cout << " Free memory: " << (device.getFreeMemory() >> 20) << "/" << (device.mem_size >> 20) << " Mb"; + } else { + std::cout << " Total memory: " << (device.mem_size >> 20) << " Mb"; + } + std::cout << std::endl; +} + + +gpu::Device gpu::chooseGPUDevice(int argc, char **argv) +{ + std::vector devices = gpu::enumDevices(); + unsigned int device_index = std::numeric_limits::max(); + + if (devices.size() == 0) { + throw std::runtime_error("No OpenCL devices found!"); + } else { + std::cout << "OpenCL devices:" << std::endl; + for (int i = 0; i < devices.size(); ++i) { + std::cout << " Device #" << i << ": "; + gpu::printDeviceInfo(devices[i]); + } + if (devices.size() == 1) { + device_index = 0; + } else { + if (argc != 2) { + std::cerr << "Usage: " << std::endl; + std::cerr << " Where should be from 0 to " << (devices.size() - 1) << " (inclusive)" << std::endl; + throw std::runtime_error("Illegal arguments!"); + } else { + device_index = atoi(argv[1]); + if (device_index >= devices.size()) { + std::cerr << " should be from 0 to " << (devices.size() - 1) << " (inclusive)! But " << argv[1] << " provided!" 
<< std::endl; + throw std::runtime_error("Illegal arguments!"); + } + } + } + std::cout << "Using device #" << device_index << ": "; + gpu::printDeviceInfo(devices[device_index]); + } + return devices[device_index]; +} diff --git a/libs/utils/libutils/misc.h b/libs/utils/libutils/misc.h new file mode 100644 index 0000000..678ec00 --- /dev/null +++ b/libs/utils/libutils/misc.h @@ -0,0 +1,59 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace gpu { + void printDeviceInfo(gpu::Device &device); + + gpu::Device chooseGPUDevice(int argc, char **argv); +} + +namespace ocl { + + class Kernel { + public: + Kernel() {} + + Kernel(const char *source_code, size_t source_code_length, std::string kernel_name, + std::string defines = std::string()) + { + init(source_code, source_code_length, kernel_name, defines); + } + + void init(const char *source_code, size_t source_code_length, std::string kernel_name, + std::string defines = std::string()) + { + program_ = std::make_shared(source_code, source_code_length, defines); + kernel_ = std::make_shared(program_, kernel_name); + } + + void compile(bool printLog=false) + { + if (!kernel_) + throw std::runtime_error("Null kernel!"); + kernel_->precompile(printLog); + } + + typedef ocl::OpenCLKernel::Arg Arg; + + void exec(const gpu::WorkSize &ws, const Arg &arg0 = Arg(), const Arg &arg1 = Arg(), const Arg &arg2 = Arg(), const Arg &arg3 = Arg(), const Arg &arg4 = Arg(), const Arg &arg5 = Arg(), const Arg &arg6 = Arg(), const Arg &arg7 = Arg(), const Arg &arg8 = Arg(), const Arg &arg9 = Arg(), const Arg &arg10 = Arg(), const Arg &arg11 = Arg(), const Arg &arg12 = Arg(), const Arg &arg13 = Arg(), const Arg &arg14 = Arg(), const Arg &arg15 = Arg(), const Arg &arg16 = Arg(), const Arg &arg17 = Arg(), const Arg &arg18 = Arg(), const Arg &arg19 = Arg(), const Arg &arg20 = Arg(), const Arg &arg21 = Arg(), const Arg &arg22 = Arg(), const Arg &arg23 = Arg(), const Arg &arg24 
= Arg(), const Arg &arg25 = Arg(), const Arg &arg26 = Arg(), const Arg &arg27 = Arg(), const Arg &arg28 = Arg(), const Arg &arg29 = Arg(), const Arg &arg30 = Arg(), const Arg &arg31 = Arg(), const Arg &arg32 = Arg(), const Arg &arg33 = Arg(), const Arg &arg34 = Arg(), const Arg &arg35 = Arg(), const Arg &arg36 = Arg(), const Arg &arg37 = Arg(), const Arg &arg38 = Arg(), const Arg &arg39 = Arg(), const Arg &arg40 = Arg()) + { + if (!kernel_) + throw std::runtime_error("Null kernel!"); + kernel_->exec(ws, arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13, arg14, arg15, arg16, arg17, arg18, arg19, arg20, arg21, arg22, arg23, arg24, arg25, arg26, arg27, arg28, arg29, arg30, arg31, arg32, arg33, arg34, arg35, arg36, arg37, arg38, arg39, arg40); + } + + private: + std::shared_ptr program_; + std::shared_ptr kernel_; + }; +} diff --git a/libs/utils/libutils/string_utils.cpp b/libs/utils/libutils/string_utils.cpp new file mode 100644 index 0000000..ab82898 --- /dev/null +++ b/libs/utils/libutils/string_utils.cpp @@ -0,0 +1,158 @@ +#include "string_utils.h" +#include + +std::vector split(const std::string &string, const std::string &separator, bool keep_empty_parts) +{ + std::vector result; + size_t p = 0; + + while (true) { + size_t s = string.find(separator, p); + if (s == std::string::npos) + break; + std::string token = string.substr(p, s - p); + if (keep_empty_parts || token.size()) + result.push_back(token); + p = s + separator.size(); + } + + std::string token = string.substr(p); + if (keep_empty_parts || token.size()) + result.push_back(token); + return result; +} + +std::string join(const std::vector &tokens, const std::string &separator) +{ + std::string res; + for (size_t i = 0; i < tokens.size(); i++) { + if (i) + res += separator; + res += tokens[i]; + } + return res; +} + +std::istream &getline(std::istream &is, std::string &str) +{ + std::string::size_type nread = 0; + + if (std::istream::sentry(is, true)) { + 
double atof(const std::string &s)
{
	// Locale-independent parse: the classic ("C") locale guarantees '.' as
	// the decimal separator regardless of the process-global locale.
	std::stringstream ss(s);
	ss.imbue(std::locale::classic());

	double value = 0;
	ss >> value;
	return value;
}

int atoi(const std::string &s)
{
	// Locale-independent integer parse; returns 0 on failure (like ::atoi).
	std::stringstream ss(s);
	ss.imbue(std::locale::classic());

	int value = 0;
	ss >> value;
	return value;
}

std::string tolower(const std::string &str)
{
	std::string res = str;
	for (size_t k = 0; k < res.size(); k++) res[k] = ::tolower(res[k]);
	return res;
}

std::string trimmed(const std::string &s)
{
	// Strip leading/trailing spaces only (not tabs/newlines), matching the
	// original contract; an all-space string trims to empty.
	const size_t p1 = s.find_first_not_of(' ');
	const size_t p2 = s.find_last_not_of(' ');

	if (p1 == std::string::npos)
		return std::string();

	return s.substr(p1, p2 - p1 + 1);
}

// base 64 encoding/decoding
// http://stackoverflow.com/questions/180947/base64-decode-snippet-in-c

// Single shared RFC 4648 alphabet keeps encode/decode in sync
// (previously duplicated as two identical 64-char literals).
static const char BASE64_ALPHABET[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

std::string base64_encode(const std::string &in)
{
	std::string out;

	int val = 0, valb = -6;
	for (std::string::const_iterator it = in.begin(); it != in.end(); ++it) {
		unsigned char c = *it;

		val = (val << 8) + c;
		valb += 8;
		while (valb >= 0) {
			out.push_back(BASE64_ALPHABET[(val >> valb) & 0x3F]);
			valb -= 6;
		}
	}

	// Flush the remaining bits when the input length is not a multiple of 3,
	// then pad the output to a multiple of 4 with '='.
	if (valb > -6) out.push_back(BASE64_ALPHABET[((val << 8) >> (valb + 8)) & 0x3F]);
	while (out.size() % 4) out.push_back('=');
	return out;
}

std::string base64_decode(const std::string &in)
{
	std::string out;

	// Reverse lookup table: -1 marks characters outside the alphabet.
	std::vector<int> T(256, -1);
	for (int i = 0; i < 64; i++) T[(unsigned char) BASE64_ALPHABET[i]] = i;

	int val = 0, valb = -8;
	for (std::string::const_iterator it = in.begin(); it != in.end(); ++it) {
		unsigned char c = *it;
		if (isspace(c))
			continue;

		// '=' padding (or any foreign character) terminates the payload.
		if (T[c] == -1)
			break;

		val = (val << 6) + T[c];
		valb += 6;
		if (valb >= 0) {
			out.push_back(char((val >> valb) & 0xFF));
			valb -= 8;
		}
	}

	return out;
}
assert(_mutex.RecursionCount == 1); + ::LeaveCriticalSection(&_mutex); +} + +bool Mutex::tryLock () const +{ + return (::TryEnterCriticalSection(&_mutex) != 0); +} + +#else + +// Posix threads + +#include + +Mutex::Mutex () +{ + int error = ::pthread_mutex_init(&_mutex, 0); + assert(error == 0); +} + +Mutex::~Mutex () +{ + int error = ::pthread_mutex_destroy(&_mutex); + assert(error == 0); +} + +void Mutex::lock() const +{ + int error = ::pthread_mutex_lock(&_mutex); + assert(error == 0); +} + +void Mutex::unlock() const +{ + int error = ::pthread_mutex_unlock(&_mutex); + assert(error == 0); +} + +bool Mutex::tryLock () const +{ + int error = ::pthread_mutex_trylock(&_mutex); + if (error == EBUSY) return false; + assert(error == 0); + return true; +} + +#endif + +MutexPool global_mutexpool; + +MutexPool::MutexPool(size_t size) +{ + size_ = size; + mutexes_ = new MutexPtr[size]; + for (size_t k = 0; k < size; k++) + mutexes_[k] = 0; +} + +MutexPool::~MutexPool() +{ + for (size_t k = 0; k < size_; k++) { + delete mutexes_[k]; + mutexes_[k] = 0; + } + delete[] mutexes_; +} + +MutexPool *MutexPool::instance() +{ + return &global_mutexpool; +} + +Mutex &MutexPool::get(const void *address) +{ + Lock lock(mutex_); + + size_t index = int(((size_t)(void *)(address) >> (sizeof(address) >> 1)) % size_); + +#if MUTEX_POOL_CHECK_FOR_DEADLOCKS + index = 0; +#endif + + Mutex *m = mutexes_[index]; + + if (!m) { + mutexes_[index] = new Mutex; + m = mutexes_[index]; + } + + return *m; +} diff --git a/libs/utils/libutils/thread_mutex.h b/libs/utils/libutils/thread_mutex.h new file mode 100644 index 0000000..25f4e84 --- /dev/null +++ b/libs/utils/libutils/thread_mutex.h @@ -0,0 +1,127 @@ +#pragma once + +#if defined _WIN32 || defined _WIN64 +#ifndef NOMINMAX +#define NOMINMAX +#endif +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif +#include +#else +#include +#endif + +class Lock; + +class Mutex { +public: + Mutex (); + virtual ~Mutex (); + + void lock () const; + 
void unlock () const; + bool tryLock () const; + +private: +#if defined _WIN32 || defined _WIN64 + mutable CRITICAL_SECTION _mutex; +#else + mutable pthread_mutex_t _mutex; +#endif + + void operator = (const Mutex& M); + Mutex (const Mutex& M); + + friend class Lock; +}; + +class Lock { +public: + Lock (const Mutex& m, bool autoLock = true) : _mutex (m), _locked (false) + { + if (autoLock) { + _mutex.lock(); + _locked = true; + } + } + + ~Lock () + { + if (_locked) + _mutex.unlock(); + } + + void acquire () + { + _mutex.lock(); + _locked = true; + } + + void release () + { + _mutex.unlock(); + _locked = false; + } + + bool locked () + { + return _locked; + } + +private: + const Mutex & _mutex; + bool _locked; +}; + +class TryLock { +public: + TryLock (const Mutex& m, bool autoLock = true) : _mutex (m), _locked (false) + { + if (autoLock) + _locked = _mutex.tryLock(); + } + + ~TryLock () + { + if (_locked) + _mutex.unlock(); + } + + bool acquire () + { + _locked = _mutex.tryLock(); + } + + void release () + { + _mutex.unlock(); + _locked = false; + } + + bool locked () + { + return _locked; + } + +private: + const Mutex & _mutex; + bool _locked; +}; + +class MutexPool { +public: + MutexPool(size_t size = 256); + ~MutexPool(); + + Mutex &get(const void *address); + + static MutexPool *instance(); + +private: + typedef Mutex * MutexPtr; + + Mutex mutex_; + MutexPtr * mutexes_; + size_t size_; +}; diff --git a/libs/utils/libutils/timer.h b/libs/utils/libutils/timer.h new file mode 100644 index 0000000..fddf6d1 --- /dev/null +++ b/libs/utils/libutils/timer.h @@ -0,0 +1,161 @@ +#pragma once + +#ifdef _WIN32 +#include +#else +#include +#endif + +#include +#include +#include + +class timer { +protected: +#ifdef _WIN32 + typedef clock_t timer_type; +#else + typedef struct timeval timer_type; +#endif + + double counter_; + timer_type start_; + int is_running_; + + std::vector laps_; + +public: + timer(bool paused = false) + { + counter_ = 0; + is_running_ = 0; + if (!paused) 
+ start(); + } + + void start() + { + if (is_running_) return; + + start_ = measure(); + is_running_ = 1; + } + + void stop() + { + if (!is_running_) return; + + counter_ += diff(start_, measure()); + is_running_ = 0; + } + + double nextLap() + { + double lap_time = elapsed(); + laps_.push_back(lap_time); + restart(); + return lap_time; + } + + void reset() + { + counter_ = 0; + is_running_ = 0; + } + + void restart() + { + reset(); + start(); + } + + double elapsed() const + { + double tm = counter_; + + if (is_running_) + tm += diff(start_, measure()); + + if (tm < 0) + tm = 0; + + return tm; + } + + const std::vector& laps() const + { + return laps_; + } + + // Note that this is not true averaging, if there is at least 5 laps - averaging made from 20% percentile to 80% percentile (See lapsFiltered) + double lapAvg() const + { + std::vector laps = lapsFiltered(); + + double sum = 0.0; + for (int i = 0; i < laps.size(); ++i) { + sum += laps[i]; + } + if (laps.size() > 0) { + sum /= laps.size(); + } + return sum; + } + + // Note that this is not true averaging, if there is at least 5 laps - averaging made from 20% percentile to 80% percentile (See lapsFiltered) + double lapStd() const + { + double avg = lapAvg(); + + std::vector laps = lapsFiltered(); + + double sum2 = 0.0; + for (int i = 0; i < laps.size(); ++i) { + sum2 += laps[i] * laps[i]; + } + if (laps.size() > 0) { + sum2 /= laps.size(); + } + return sqrt(std::max(0.0, sum2 - avg * avg)); + } + +protected: + + std::vector lapsFiltered() const + { + std::vector laps = laps_; + std::sort(laps.begin(), laps.end()); + + unsigned int nlaps = laps.size(); + if (nlaps >= 5) { + // Removing last 20% of measures + laps.erase(laps.end() - nlaps/5, laps.end()); + // Removing first 20% of measures + laps.erase(laps.begin(), laps.begin() + nlaps/5); + } + return laps; + } + + static timer_type measure() + { + timer_type tm; +#ifdef _WIN32 + tm = clock(); +#else + ::gettimeofday(&tm, 0); +#endif + return tm; + } + + 
static double diff(const timer_type &start, const timer_type &end) + { +#ifdef _WIN32 + return (double) (end - start) / (double) CLOCKS_PER_SEC; +#else + long secs = end.tv_sec - start.tv_sec; + long usecs = end.tv_usec - start.tv_usec; + + return (double) secs + (double) usecs / 1000000.0; +#endif + } +}; diff --git a/src/cl/merge_sort.cl b/src/cl/merge_sort.cl new file mode 100644 index 0000000..0b7c27a --- /dev/null +++ b/src/cl/merge_sort.cl @@ -0,0 +1,32 @@ +#ifdef __CLION_IDE__ +#include +#define WORKGROUP_SIZE 256 +#endif + +#line 6 + +__attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1))) +__kernel void merge_sort(__global const float* as, + __global float* bs, + unsigned int n, + unsigned int sorted_chunks_size) +{ + const unsigned int global_index = get_global_id(0); + const unsigned int local_index = get_local_id(0); + + __local float as_local[WORKGROUP_SIZE]; + __local float bs_local[WORKGROUP_SIZE]; + + if (2 * sorted_chunks_size <= WORKGROUP_SIZE) { + as_local[local_index] = (global_index < n) ? 
as[global_index] : FLT_MAX; + barrier(CLK_LOCAL_MEM_FENCE); + if (local_index % (2 * sorted_chunks_size) == 0) { + // Merging two sorted chunks in new one + int i = 0; + int j = 0; + for (int _ = 0; _ < 2 * sorted_chunks_size; ++_) { + + } + } + } +} diff --git a/src/cl/merge_sort_cl.h b/src/cl/merge_sort_cl.h new file mode 100644 index 0000000..8ef7c42 --- /dev/null +++ b/src/cl/merge_sort_cl.h @@ -0,0 +1,49 @@ +#include + +static const char merge_sort_kernel[] = { +0x23, 0x69, 0x66, 0x64, 0x65, 0x66, 0x20, 0x5f, 0x5f, 0x43, 0x4c, 0x49, 0x4f, 0x4e, 0x5f, 0x49, 0x44, 0x45, 0x5f, 0x5f, +0x0a, 0x23, 0x69, 0x6e, 0x63, 0x6c, 0x75, 0x64, 0x65, 0x20, 0x3c, 0x6c, 0x69, 0x62, 0x67, 0x70, 0x75, 0x2f, 0x6f, 0x70, +0x65, 0x6e, 0x63, 0x6c, 0x2f, 0x63, 0x6c, 0x2f, 0x63, 0x6c, 0x69, 0x6f, 0x6e, 0x5f, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x73, 0x2e, 0x63, 0x6c, 0x3e, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x57, 0x4f, 0x52, 0x4b, 0x47, 0x52, +0x4f, 0x55, 0x50, 0x5f, 0x53, 0x49, 0x5a, 0x45, 0x20, 0x32, 0x35, 0x36, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, +0x0a, 0x23, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x36, 0x0a, 0x0a, 0x5f, 0x5f, 0x61, 0x74, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, +0x65, 0x5f, 0x5f, 0x28, 0x28, 0x72, 0x65, 0x71, 0x64, 0x5f, 0x77, 0x6f, 0x72, 0x6b, 0x5f, 0x67, 0x72, 0x6f, 0x75, 0x70, +0x5f, 0x73, 0x69, 0x7a, 0x65, 0x28, 0x57, 0x4f, 0x52, 0x4b, 0x47, 0x52, 0x4f, 0x55, 0x50, 0x5f, 0x53, 0x49, 0x5a, 0x45, +0x2c, 0x20, 0x31, 0x2c, 0x20, 0x31, 0x29, 0x29, 0x29, 0x0a, 0x5f, 0x5f, 0x6b, 0x65, 0x72, 0x6e, 0x65, 0x6c, 0x20, 0x76, +0x6f, 0x69, 0x64, 0x20, 0x6d, 0x65, 0x72, 0x67, 0x65, 0x5f, 0x73, 0x6f, 0x72, 0x74, 0x28, 0x5f, 0x5f, 0x67, 0x6c, 0x6f, +0x62, 0x61, 0x6c, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2a, 0x20, 0x61, 0x73, 0x2c, +0x0a, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x20, 0x5f, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2a, 0x20, 0x62, 0x73, 
0x2c, 0x0a, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, +0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x6e, 0x2c, 0x0a, 0x09, 0x09, 0x09, +0x09, 0x09, 0x09, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x73, 0x6f, 0x72, +0x74, 0x65, 0x64, 0x5f, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x73, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x0a, 0x7b, 0x0a, 0x09, +0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x67, +0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x5f, 0x69, 0x6e, 0x64, 0x65, 0x78, 0x20, 0x3d, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, 0x6c, +0x6f, 0x62, 0x61, 0x6c, 0x5f, 0x69, 0x64, 0x28, 0x30, 0x29, 0x3b, 0x0a, 0x09, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, +0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x69, 0x6e, +0x64, 0x65, 0x78, 0x20, 0x3d, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x69, 0x64, 0x28, 0x30, +0x29, 0x3b, 0x0a, 0x0a, 0x09, 0x5f, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x61, +0x73, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5b, 0x57, 0x4f, 0x52, 0x4b, 0x47, 0x52, 0x4f, 0x55, 0x50, 0x5f, 0x53, 0x49, +0x5a, 0x45, 0x5d, 0x3b, 0x0a, 0x09, 0x5f, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, +0x62, 0x73, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5b, 0x57, 0x4f, 0x52, 0x4b, 0x47, 0x52, 0x4f, 0x55, 0x50, 0x5f, 0x53, +0x49, 0x5a, 0x45, 0x5d, 0x3b, 0x0a, 0x0a, 0x09, 0x69, 0x66, 0x20, 0x28, 0x32, 0x20, 0x2a, 0x20, 0x73, 0x6f, 0x72, 0x74, +0x65, 0x64, 0x5f, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x73, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x3c, 0x3d, 0x20, 0x57, 0x4f, +0x52, 0x4b, 0x47, 0x52, 0x4f, 0x55, 0x50, 0x5f, 0x53, 0x49, 0x5a, 0x45, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x61, 0x73, +0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5b, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x69, 0x6e, 0x64, 0x65, 0x78, 0x5d, 0x20, +0x3d, 0x20, 
0x28, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x5f, 0x69, 0x6e, 0x64, 0x65, 0x78, 0x20, 0x3c, 0x20, 0x6e, 0x29, +0x20, 0x3f, 0x20, 0x61, 0x73, 0x5b, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x5f, 0x69, 0x6e, 0x64, 0x65, 0x78, 0x5d, 0x20, +0x3a, 0x20, 0x46, 0x4c, 0x54, 0x5f, 0x4d, 0x41, 0x58, 0x3b, 0x0a, 0x09, 0x09, 0x62, 0x61, 0x72, 0x72, 0x69, 0x65, 0x72, +0x28, 0x43, 0x4c, 0x4b, 0x5f, 0x4c, 0x4f, 0x43, 0x41, 0x4c, 0x5f, 0x4d, 0x45, 0x4d, 0x5f, 0x46, 0x45, 0x4e, 0x43, 0x45, +0x29, 0x3b, 0x0a, 0x09, 0x09, 0x69, 0x66, 0x20, 0x28, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x69, 0x6e, 0x64, 0x65, 0x78, +0x20, 0x25, 0x20, 0x28, 0x32, 0x20, 0x2a, 0x20, 0x73, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x5f, 0x63, 0x68, 0x75, 0x6e, 0x6b, +0x73, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x20, 0x3d, 0x3d, 0x20, 0x30, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x09, 0x2f, +0x2f, 0x20, 0x4d, 0x65, 0x72, 0x67, 0x69, 0x6e, 0x67, 0x20, 0x74, 0x77, 0x6f, 0x20, 0x73, 0x6f, 0x72, 0x74, 0x65, 0x64, +0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x73, 0x20, 0x69, 0x6e, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x6f, 0x6e, 0x65, 0x0a, 0x09, +0x09, 0x09, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x20, 0x3d, 0x20, 0x30, 0x3b, 0x0a, 0x09, 0x09, 0x09, 0x69, 0x6e, 0x74, 0x20, +0x6a, 0x20, 0x3d, 0x20, 0x30, 0x3b, 0x0a, 0x09, 0x09, 0x09, 0x66, 0x6f, 0x72, 0x20, 0x28, 0x69, 0x6e, 0x74, 0x20, 0x5f, +0x20, 0x3d, 0x20, 0x30, 0x3b, 0x20, 0x5f, 0x20, 0x3c, 0x20, 0x32, 0x20, 0x2a, 0x20, 0x73, 0x6f, 0x72, 0x74, 0x65, 0x64, +0x5f, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x73, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x3b, 0x20, 0x2b, 0x2b, 0x5f, 0x29, 0x20, 0x7b, +0x0a, 0x09, 0x09, 0x09, 0x09, 0x0a, 0x09, 0x09, 0x09, 0x7d, 0x0a, 0x09, 0x09, 0x7d, 0x0a, 0x09, 0x7d, 0x0a, 0x7d, 0x0a, +}; + +size_t merge_sort_kernel_length = sizeof(merge_sort_kernel) / sizeof(char); diff --git a/src/cu/merge_sort.cu b/src/cu/merge_sort.cu new file mode 100644 index 0000000..b0f4dc0 --- /dev/null +++ b/src/cu/merge_sort.cu @@ -0,0 +1,9 @@ +#include + +#include "../cl/merge_sort.cl" + +void cuda_merge_sort(const 
gpu::WorkSize &workSize, cudaStream_t stream, + const float* as, float* bs, unsigned int n, unsigned int sorted_chunks_size) { + merge_sort<<>>(as, bs, n, sorted_chunks_size); + CUDA_CHECK_KERNEL(stream); +} diff --git a/src/defines.h b/src/defines.h new file mode 100644 index 0000000..ada5956 --- /dev/null +++ b/src/defines.h @@ -0,0 +1,8 @@ +#pragma once + +#define MAX_IN_MEMORY_VALUES (16*1024*1024) + +#define PARTS_MERGED_PER_PASS 16 + +// BYTES_PER_BUFFER = 512 KB +#define BYTES_PER_BUFFER ((size_t) (128 * 1024 * sizeof(float))) diff --git a/src/io_utils/buffer_reader.cpp b/src/io_utils/buffer_reader.cpp new file mode 100644 index 0000000..b37eebf --- /dev/null +++ b/src/io_utils/buffer_reader.cpp @@ -0,0 +1,30 @@ +#include "buffer_reader.h" + +#include +#include + + +BufferReader::BufferReader(const std::vector &data) : data(data), offset(0) +{ + +} + +bool BufferReader::isEmpty() +{ + if (offset >= data.size()) { + assert(offset == data.size()); + return true; + } else { + return false; + } +} + +float BufferReader::next() +{ + if (isEmpty()) { + throw std::runtime_error("Empty buffer!"); + } + float value = *((float*) (data.data() + offset)); + offset += sizeof(float); + return value; +} diff --git a/src/io_utils/buffer_reader.h b/src/io_utils/buffer_reader.h new file mode 100644 index 0000000..d416a82 --- /dev/null +++ b/src/io_utils/buffer_reader.h @@ -0,0 +1,17 @@ +#pragma once + +#include +#include + + +class BufferReader { +public: + explicit BufferReader(const std::vector &data); + + float next(); + bool isEmpty(); + +private: + std::vector data; + size_t offset; +}; diff --git a/src/io_utils/buffer_writer.cpp b/src/io_utils/buffer_writer.cpp new file mode 100644 index 0000000..a20236c --- /dev/null +++ b/src/io_utils/buffer_writer.cpp @@ -0,0 +1,44 @@ +#include "buffer_writer.h" + +#include +#include + + +BufferWriter::BufferWriter(size_t size) : data(std::vector(size, 0)), offset(0) +{ + +} + +void BufferWriter::write(float value) +{ + 
assert(!isFull()); + *((float*) (data.data() + offset)) = value; + offset += sizeof(float); +} + +bool BufferWriter::isFull() +{ + if (offset >= data.size()) { + assert(offset == data.size()); + return true; + } else { + return false; + } +} + +char* BufferWriter::ptr() +{ + return data.data(); +} + + +size_t BufferWriter::valuesNumber() +{ + return offset / sizeof(float); +} + +void BufferWriter::clear() +{ + memset(data.data(), 0, data.size()); + offset = 0; +} diff --git a/src/io_utils/buffer_writer.h b/src/io_utils/buffer_writer.h new file mode 100644 index 0000000..a721b8c --- /dev/null +++ b/src/io_utils/buffer_writer.h @@ -0,0 +1,21 @@ +#pragma once + +#include +#include + + +class BufferWriter { +public: + explicit BufferWriter(size_t size); + + void write(float value); + bool isFull(); + + char* ptr(); + size_t valuesNumber(); + void clear(); + +private: + std::vector data; + size_t offset; +}; diff --git a/src/io_utils/file_reader.cpp b/src/io_utils/file_reader.cpp new file mode 100644 index 0000000..3454b84 --- /dev/null +++ b/src/io_utils/file_reader.cpp @@ -0,0 +1,71 @@ +#include "file_reader.h" +#include "../defines.h" + +#include +#include + + +FileReader::FileReader(const std::string &filename) : + filename(filename), + file(std::fstream(filename, std::ios::in | std::ios::binary | std::ios::ate)), + buffer(BufferReader(std::vector())) +{ + ptrdiff_t file_size = file.tellg(); + if (file_size < 0) { + throw std::runtime_error("Failed to get size of file " + filename + "!"); + } + file.seekg(0); + + size = (size_t) file_size; + offset = 0; + assert(size % sizeof(float) == 0); +} + +FileReader::~FileReader() +{ + file.close(); +} + +size_t FileReader::valuesNumber() +{ + return size / sizeof(float); +} + +bool FileReader::isEmpty() +{ + if (offset >= size) { + assert(buffer.isEmpty() && file.tellg() == size); + return true; + } else { + return false; + } +} + +float FileReader::next() +{ + if (isEmpty()) { + throw std::runtime_error("Empty file!"); + } 
+ if (buffer.isEmpty()) { + size_t buffer_size = std::min(BYTES_PER_BUFFER, size - offset); + std::vector data(buffer_size, 0); + file.read(data.data(), buffer_size); + buffer = BufferReader(data); + } + assert(!buffer.isEmpty()); + offset += sizeof(float); + return buffer.next(); +} + +void FileReader::seek(size_t index) +{ + buffer = BufferReader(std::vector()); + offset = index * sizeof(float); + assert(offset < size); + file.seekg(offset); +} + +std::string FileReader::getFilename() +{ + return filename; +} diff --git a/src/io_utils/file_reader.h b/src/io_utils/file_reader.h new file mode 100644 index 0000000..2e3553b --- /dev/null +++ b/src/io_utils/file_reader.h @@ -0,0 +1,25 @@ +#pragma once + +#include "buffer_reader.h" + +#include + + +class FileReader { +public: + explicit FileReader(const std::string &filename); + ~FileReader(); + + size_t valuesNumber(); + bool isEmpty(); + float next(); + void seek(size_t index); + std::string getFilename(); + +private: + const std::string filename; + std::basic_fstream file; + BufferReader buffer; + size_t size; + size_t offset; +}; diff --git a/src/io_utils/file_writer.cpp b/src/io_utils/file_writer.cpp new file mode 100644 index 0000000..b2c56e8 --- /dev/null +++ b/src/io_utils/file_writer.cpp @@ -0,0 +1,42 @@ +#include "file_writer.h" +#include "../defines.h" + +#include + + +FileWriter::FileWriter(const std::string &filename) + : filename(filename), + file(std::fstream(filename, std::ios::out | std::ios::binary)), + buffer(BYTES_PER_BUFFER) +{ + +} + +FileWriter::~FileWriter() +{ + flushBuffer(); + file.close(); +} + +size_t FileWriter::valuesNumber() +{ + return offset / sizeof(float); +} + +void FileWriter::write(float value) +{ + if (buffer.isFull()) { + flushBuffer(); + } + assert(!buffer.isFull()); + buffer.write(value); +} + +void FileWriter::flushBuffer() +{ + size_t values_number = buffer.valuesNumber(); + if (values_number > 0) { + file.write(buffer.ptr(), values_number * sizeof(float)); + buffer.clear(); 
+ } +} diff --git a/src/io_utils/file_writer.h b/src/io_utils/file_writer.h new file mode 100644 index 0000000..9ddcc19 --- /dev/null +++ b/src/io_utils/file_writer.h @@ -0,0 +1,23 @@ +#pragma once + +#include "buffer_writer.h" + +#include + + +class FileWriter { +public: + explicit FileWriter(const std::string &filename); + ~FileWriter(); + + size_t valuesNumber(); + void write(float value); + +private: + void flushBuffer(); + + const std::string filename; + std::basic_fstream file; + BufferWriter buffer; + size_t offset; +}; diff --git a/src/main_generator.cpp b/src/main_generator.cpp new file mode 100644 index 0000000..8173fa3 --- /dev/null +++ b/src/main_generator.cpp @@ -0,0 +1,47 @@ +#include +#include + +#include +#include + +#include "io_utils/file_reader.h" +#include "io_utils/file_writer.h" + + +int main(int argc, char* argv[]) +{ + if (argc != 3) { + std::cout << "Usage: " << argv[0] << " " << std::endl; + return 1; + } + + long int n = strtol(argv[1], nullptr, 10); + if (n <= 0) { + std::cout << "Number of values to generate should be positive integer value, but " << argv[1] << " found!" 
<< std::endl; + return 1; + } + + std::string outputFilename(argv[2]); + std::cout << "Saving " << n << " random floats (" << (n * sizeof(float) / 1024 / 1024) << " MB) to file " << outputFilename << std::endl; + + timer t; + + { + FastRandom r; + FileWriter out(outputFilename); + for (size_t i = 0; i < n; ++i) { + float v = r.nextf(); + out.write(v); + } + } + + double dt = t.elapsed(); + + std::cout << "Done in " << dt << " seconds"; + if (dt > 0.0) { + std::cout << " (" << ((double) n * sizeof(float) / dt / 1024 / 1024) << " MB/s)"; + } + std::cout << std::endl; + + return 0; +} diff --git a/src/main_sorter.cpp b/src/main_sorter.cpp new file mode 100644 index 0000000..10e205c --- /dev/null +++ b/src/main_sorter.cpp @@ -0,0 +1,168 @@ +#include "io_utils/file_reader.h" +#include "defines.h" +#include "io_utils/file_writer.h" + +#include +#include +#include +#include +#include +#include +#include + + +std::string getFilename(const std::string &outputFilename, size_t pass, size_t part_index) +{ + return outputFilename + "." + to_string(pass) + "." + to_string(part_index) + ".tmp"; +} + + +std::string toPercent(double part, double total) +{ + if (total == 0.0) + return "0"; + return to_string((int) std::floor(100.0 * part / total + 0.5)); +} + + +int main(int argc, char* argv[]) +{ + if (argc != 3) { + std::cout << "Usage: " << argv[0] << " " << std::endl; + return 1; + } + + std::string inputFilename(argv[1]); + std::string outputFilename(argv[2]); + + size_t n; + { + FileReader reader(inputFilename); + n = reader.valuesNumber(); + } + std::cout << "Values number: " << n << " (" << (n * sizeof(float) / 1024 / 1024) << " MB)" << std::endl; + + size_t pass = 0; + size_t prevpass_nparts = 0; + size_t prevpass_nvalues = 0; + + timer full_t; + + { + size_t in_core_parts = (n + MAX_IN_MEMORY_VALUES - 1) / MAX_IN_MEMORY_VALUES; + std::cout << "Pass #" << pass << ": sorting part by part in core..." 
<< std::endl; + std::cout << " In core parts number: " << in_core_parts << std::endl; + std::cout << " Limit for values in core: " << MAX_IN_MEMORY_VALUES << " (" << (MAX_IN_MEMORY_VALUES * sizeof(float) / 1024 / 1024) << " MB)" << std::endl; + double reading_time = 0.0; + double sorting_time = 0.0; + double writing_time = 0.0; + timer total_t; + #pragma omp parallel reduction(+:reading_time,sorting_time,writing_time) + { + FileReader reader(inputFilename); + std::vector data(MAX_IN_MEMORY_VALUES, 0.0f); + #pragma omp parallel for schedule(dynamic, 1) + for (size_t part_index = 0; part_index < in_core_parts; ++part_index) { + size_t from = part_index * MAX_IN_MEMORY_VALUES; + size_t to = std::min(n, (part_index + 1) * MAX_IN_MEMORY_VALUES); + + timer reading_t; + { + reader.seek(from); + for (size_t i = from; i < to; ++i) { + data[i - from] = reader.next(); + } + } + reading_time += reading_t.elapsed(); + + timer sorting_t; + // TODO: implement sort on GPU + std::sort(data.begin(), data.begin() + (to - from)); + sorting_time += sorting_t.elapsed(); + + timer writing_t; + { + FileWriter writer(in_core_parts == 1 ? 
outputFilename : getFilename(outputFilename, pass, part_index)); + for (size_t i = 0; i < to - from; ++i) { + writer.write(data[i]); + } + } + writing_time += writing_t.elapsed(); + } + } + double sum_time = reading_time + sorting_time + writing_time; + size_t total_values = 2 * n; + std::cout << " IO: " << (total_values / total_t.elapsed() / 1024 / 1024 * sizeof(float)) << " MB/s" << std::endl; + std::cout << " Finished in " << total_t.elapsed() << " s (" << toPercent(reading_time, sum_time) << "% reading + " << toPercent(sorting_time, sum_time) << "% sorting + " << toPercent(writing_time, sum_time) << "% writing)" << std::endl; + prevpass_nparts = in_core_parts; + prevpass_nvalues = MAX_IN_MEMORY_VALUES; + } + + while (prevpass_nparts > 1) { + size_t prevpass = pass; + ++pass; + size_t merged_parts = (prevpass_nparts + PARTS_MERGED_PER_PASS - 1) / PARTS_MERGED_PER_PASS; + size_t merged_nvalues = prevpass_nvalues * PARTS_MERGED_PER_PASS; + std::cout << "Pass #" << pass << ": merging groups of " << PARTS_MERGED_PER_PASS << " parts..." << std::endl; + std::cout << " Input parts: " << prevpass_nparts << " with " << prevpass_nvalues << " values (" << (prevpass_nvalues * sizeof(float) / 1024 / 1024) << " MB) in each" << std::endl; + + timer total_t; + #pragma omp parallel for schedule(dynamic, 1) + for (size_t part_index = 0; part_index < merged_parts; ++part_index) { + std::vector> readers; + for (size_t i = 0; i < std::min((size_t) PARTS_MERGED_PER_PASS, prevpass_nparts - PARTS_MERGED_PER_PASS * part_index); ++i) { + size_t prevpass_part_index = PARTS_MERGED_PER_PASS * part_index + i; + readers.emplace_back(new FileReader(getFilename(outputFilename, prevpass, prevpass_part_index))); + } + FileWriter writer(merged_parts == 1 ? 
outputFilename : getFilename(outputFilename, pass, part_index)); + const float NO_VALUE = std::numeric_limits::max(); + float min_values[PARTS_MERGED_PER_PASS]; + bool is_empty[PARTS_MERGED_PER_PASS]; + for (size_t i = 0; i < readers.size(); ++i) { + min_values[i] = readers[i]->next(); + is_empty[i] = false; + } + while (true) { + float min = NO_VALUE; + ptrdiff_t min_reader = -1; + for (size_t i = 0; i < readers.size(); ++i) { + if (is_empty[i]) continue; + if (min_reader == -1 || min_values[i] < min) { + min = min_values[i]; + min_reader = i; + } + } + if (min_reader == -1) { + // i.e. all readers are empty + for (size_t i = 0; i < readers.size(); ++i) { + assert(min_values[i] == NO_VALUE); + assert(is_empty[i]); + assert(readers[i]->isEmpty()); + } + break; + } + assert(min_values[min_reader] == min); + if (readers[min_reader]->isEmpty()) { + is_empty[min_reader] = true; + min_values[min_reader] = NO_VALUE; + } else { + min_values[min_reader] = readers[min_reader]->next(); + } + writer.write(min); + } + for (size_t i = 0; i < readers.size(); ++i) { + std::remove(readers[i]->getFilename().c_str()); + } + } + size_t total_values = 2 * n; + std::cout << " IO: " << (total_values / total_t.elapsed() / 1024 / 1024 * sizeof(float)) << " MB/s" << std::endl; + std::cout << " Finished in " << total_t.elapsed() << " s" << std::endl; + + prevpass_nparts = merged_parts; + prevpass_nvalues = merged_nvalues; + } + + std::cout << "Finished in " << full_t.elapsed() << " s" << std::endl; + + return 0; +}