runtime/utils/flags.hpp

//
// Copyright (c) 2009 Advanced Micro Devices, Inc. All rights reserved.
//

#ifndef FLAGS_HPP_
#define FLAGS_HPP_


#define RUNTIME_FLAGS(debug,release,release_on_stg)                           \
                                                                              \
debug(int, LOG_LEVEL, 0,                                                      \
        "The default log level")                                              \
debug(uint, DEBUG_GPU_FLAGS, 0,                                               \
        "The debug options for GPU device")                                   \
release(uint, GPU_MAX_COMMAND_QUEUES, IF(IS_HIP, 300, 70),                    \
        "The maximum number of concurrent Virtual GPUs")                      \
release(size_t, CQ_THREAD_STACK_SIZE, 256*Ki, /* @todo: that much! */         \
        "The default command queue thread stack size")                        \
release(int, GPU_MAX_WORKGROUP_SIZE, 0,                                       \
        "Maximum number of workitems in a workgroup for GPU, 0 -use default") \
release(int, GPU_MAX_WORKGROUP_SIZE_2D_X, 0,                                  \
        "Maximum number of workitems in a 2D workgroup for GPU, x component, 0 -use default") \
release(int, GPU_MAX_WORKGROUP_SIZE_2D_Y, 0,                                  \
        "Maximum number of workitems in a 2D workgroup for GPU, y component, 0 -use default") \
release(int, GPU_MAX_WORKGROUP_SIZE_3D_X, 0,                                  \
        "Maximum number of workitems in a 3D workgroup for GPU, x component, 0 -use default") \
release(int, GPU_MAX_WORKGROUP_SIZE_3D_Y, 0,                                  \
        "Maximum number of workitems in a 3D workgroup for GPU, y component, 0 -use default") \
release(int, GPU_MAX_WORKGROUP_SIZE_3D_Z, 0,                                  \
        "Maximum number of workitems in a 3D workgroup for GPU, z component, 0 -use default") \
debug(bool, CPU_MEMORY_GUARD_PAGES, false,                                    \
        "Use guard pages for CPU memory")                                     \
debug(size_t, CPU_MEMORY_GUARD_PAGE_SIZE, 64,                                 \
        "Size in KB of CPU memory guard page")                                \
debug(size_t, CPU_MEMORY_ALIGNMENT_SIZE, 256,                                 \
        "Size in bytes for the default alignment for guarded memory on CPU")  \
debug(size_t, PARAMETERS_MIN_ALIGNMENT, 16,                                   \
        "Minimum alignment required for the abstract parameters stack")       \
debug(size_t, MEMOBJ_BASE_ADDR_ALIGN, 4*Ki,                                   \
        "Alignment of the base address of any allocate memory object")        \
release(cstring, GPU_DEVICE_NAME, "",                                         \
        "Select the device ordinal (will only report a single device)")       \
release(cstring, GPU_DEVICE_ORDINAL, "",                                      \
        "Select the device ordinal (comma seperated list of available devices)") \
release(bool, REMOTE_ALLOC, false,                                            \
        "Use remote memory for the global heap allocation")                   \
release(uint, GPU_MAX_HEAP_SIZE, 100,                                         \
        "Set maximum size of the GPU heap to % of board memory")              \
release(uint, GPU_STAGING_BUFFER_SIZE, 512,                                   \
        "Size of the GPU staging buffer in KiB")                              \
release(bool, GPU_DUMP_BLIT_KERNELS, false,                                   \
        "Dump the kernels for blit manager")                                  \
release(uint, GPU_BLIT_ENGINE_TYPE, 0x0,                                      \
        "Blit engine type: 0 - Default, 1 - Host, 2 - CAL, 3 - Kernel")       \
release(bool, GPU_FLUSH_ON_EXECUTION, false,                                  \
        "Submit commands to HW on every operation. 0 - Disable, 1 - Enable")  \
release(bool, GPU_USE_SYNC_OBJECTS, true,                                     \
        "If enabled, use sync objects instead of polling")                    \
release(bool, CL_KHR_FP64, true,                                              \
        "Enable/Disable support for double precision")                        \
release(cstring, AMD_OCL_BUILD_OPTIONS, 0,                                    \
        "Set clBuildProgram() and clCompileProgram()'s options (override)")   \
release(cstring, AMD_OCL_BUILD_OPTIONS_APPEND, 0,                             \
        "Append clBuildProgram() and clCompileProgram()'s options")           \
release(cstring, AMD_OCL_LINK_OPTIONS, 0,                                     \
        "Set clLinkProgram()'s options (override)")                           \
release(cstring, AMD_OCL_LINK_OPTIONS_APPEND, 0,                              \
        "Append clLinkProgram()'s options")                                   \
release(cstring, AMD_OCL_SC_LIB, 0,                                           \
        "Set shader compiler shared library name or path")                    \
debug(bool, AMD_OCL_ENABLE_MESSAGE_BOX, false,                                \
        "Enable the error dialog on Windows")                                 \
release(size_t, GPU_PINNED_XFER_SIZE, 16,                                     \
        "The pinned buffer size for pinning in read/write transfers")         \
release(size_t, GPU_PINNED_MIN_XFER_SIZE, 512,                                \
        "The minimal buffer size for pinned read/write transfers in KBytes")  \
release(size_t, GPU_RESOURCE_CACHE_SIZE, 64,                                  \
        "The resource cache size in MB")                                      \
release(size_t, GPU_MAX_SUBALLOC_SIZE, 4096,                                  \
        "The maximum size accepted for suballocaitons in KB")                 \
release(bool, GPU_FORCE_64BIT_PTR, 0,                                         \
        "Forces 64 bit pointers on GPU")                                      \
release(bool, GPU_FORCE_OCL20_32BIT, 0,                                       \
        "Forces 32 bit apps to take CLANG\HSAIL path")                        \
release(bool, GPU_RAW_TIMESTAMP, 0,                                           \
        "Reports GPU raw timestamps in GPU timeline")                         \
release(size_t, GPU_NUM_MEM_DEPENDENCY, 256,                                  \
        "Number of memory objects for dependency tracking")                   \
release(size_t, GPU_XFER_BUFFER_SIZE, 0,                                      \
        "Transfer buffer size for image copy optimization in KB")             \
release(bool, GPU_IMAGE_DMA, true,                                            \
        "Enable DRM DMA for image transfers")                                 \
release(uint, GPU_SINGLE_ALLOC_PERCENT, 85,                                   \
        "Maximum size of a single allocation as percentage of total")         \
release(uint, GPU_NUM_COMPUTE_RINGS, 2,                                       \
        "GPU number of compute rings. 0 - disabled, 1 , 2,.. - the number of compute rings") \
release(int, GPU_SELECT_COMPUTE_RINGS_ID, -1,                                 \
        "GPU select the compute rings ID -1 - disabled, 0 , 1,.. - the forced compute rings ID for submission") \
release(uint, GPU_WORKLOAD_SPLIT, 22,                                         \
        "Workload split size")                                                \
release(bool, GPU_USE_SINGLE_SCRATCH, false,                                  \
        "Use single scratch buffer per device instead of per HW ring")        \
release(bool, AMD_OCL_WAIT_COMMAND, false,                                    \
        "1 = Enable a wait for every submitted command")                      \
/* HSAIL is by default, except Linux 32bit, because of known Catalyst 32bit issue */  \
release(bool, GPU_HSAIL_ENABLE, LP64_SWITCH(LINUX_SWITCH(false,true),true),   \
        "Enable HSAIL on dGPU stack (requires CI+ HW)")                       \
release(uint, GPU_PRINT_CHILD_KERNEL, 0,                                      \
        "Prints the specified number of the child kernels")                   \
release(bool, GPU_USE_DEVICE_QUEUE, false,                                    \
        "Use a dedicated device queue for the actual submissions")            \
release(bool, GPU_ENABLE_LARGE_ALLOCATION, true,                              \
        "Enable >4GB single allocations")                                     \
release(bool, AMD_THREAD_TRACE_ENABLE, true,                                  \
        "Enable thread trace extension")                                      \
release(uint, OPENCL_VERSION, (IS_BRAHMA ? 120 : 200),                        \
        "Force GPU opencl verison")                                           \
release(bool, HSA_LOCAL_MEMORY_ENABLE, true,                                  \
        "Enable HSA device local memory usage")                               \
release(uint, HSA_KERNARG_POOL_SIZE, 2 * 1024 * 1024,                         \
        "Kernarg pool size")                                                  \
release(bool, HSA_ENABLE_COARSE_GRAIN_SVM, true,                              \
        "Enable device memory for coarse grain SVM allocations")              \
release(bool, GPU_IFH_MODE, false,                                            \
        "1 = Enable GPU IFH (infinitely fast hardware) mode. Any other value keeps setting disabled.") \
release(bool, GPU_MIPMAP, true,                                               \
        "Enables GPU mipmap extension")                                       \
release(uint, GPU_ENABLE_PAL, IF(IS_LIGHTNING,1,2),                           \
        "Enables PAL backend. 0 - GSL(default), 1 - PAL, 2 - GSL and PAL")    \
release(bool, DISABLE_DEFERRED_ALLOC, false,                                  \
        "Disables deferred memory allocation on device")                      \
release(int, AMD_GPU_FORCE_SINGLE_FP_DENORM, -1,                              \
        "Force denorm for single precision: -1 - don't force, 0 - disable, 1 - enable") \
release(uint, OCL_SET_SVM_SIZE, IF(IS_HIP, 16384, 4096),                      \
        "set SVM space size for discrete GPU")                                \
debug(uint, OCL_SYSMEM_REQUIREMENT, 2,                                        \
        "Use flag to change the minimum requirement of system memory not to downgrade")        \
debug(bool, GPU_ENABLE_HW_DEBUG, false,                                       \
        "Enable HW DEBUG for GPU")                                            \
release(uint, GPU_WAVES_PER_SIMD, 0,                                          \
        "Force the number of waves per SIMD (1-10)")                          \
release(bool, GPU_WAVE_LIMIT_ENABLE, false,                                   \
        "1 = Enable adaptive wave limiter")                                   \
release(bool, OCL_STUB_PROGRAMS, false,                                       \
        "1 = Enables OCL programs stubing")                                   \
release(bool, GPU_ANALYZE_HANG, false,                                        \
        "1 = Enables GPU hang analysis")                                      \
release(uint, GPU_MAX_REMOTE_MEM_SIZE, 2,                                     \
        "Maximum size (in Ki) that allows device memory substitution with system") \
release(bool, GPU_ADD_HBCC_SIZE, false,                                        \
        "Add HBCC size to the reported device memory")                        \
release_on_stg(uint, GPU_WAVE_LIMIT_CU_PER_SH, 0,                             \
        "Assume the number of CU per SH for wave limiter")                    \
release_on_stg(uint, GPU_WAVE_LIMIT_MAX_WAVE, 10,                             \
        "Set maximum waves per SIMD to try for wave limiter")                 \
release_on_stg(uint, GPU_WAVE_LIMIT_RUN, 20,                                  \
        "Set running factor for wave limiter")                                \
release_on_stg(cstring, GPU_WAVE_LIMIT_DUMP, "",                              \
        "File path prefix for dumping wave limiter output")                   \
release_on_stg(cstring, GPU_WAVE_LIMIT_TRACE, "",                             \
        "File path prefix for tracing wave limiter")                          \
release(bool, OCL_CODE_CACHE_ENABLE, false,                                   \
        "1 = Enable compiler code cache")                                     \
release(bool, OCL_CODE_CACHE_RESET, false,                                    \
        "1 =  Reset the compiler code cache storage")                         \
release(bool, GPU_VEGA10_ONLY, VEGA10_ONLY,                                   \
        "1 = Report vega10 only on OCL/ROCR")                                 \
release_on_stg(bool, PAL_DISABLE_SDMA, false,                                 \
        "1 = Disable SDMA for PAL")                                           \
release(uint, PAL_RGP_DISP_COUNT, 50,                                         \
        "The number of dispatches for RGP capture with SQTT")                 \
release(bool, GPU_ENABLE_LC, IS_LIGHTNING,                                    \
        "Enables LC path")                                                    \
release(uint, GPU_MAX_COMMAND_BUFFERS, 8,                                     \
         "The maximum number of command buffers allocated per queue")         \
release(cstring, HIP_VISIBLE_DEVICES, "",                                             \
        "Only devices whose index is present in the sequence are visible to HIP")    \
release(cstring, CUDA_VISIBLE_DEVICES, "",                                            \
        "Only devices whose index is present in the sequence are visible to HIP")    \

namespace amd {

//! \addtogroup Utils
//  @{

struct Flag {
  enum Type {
    Tinvalid = 0,
    Tbool,    //!< A boolean type flag (true, false).
    Tint,     //!< An integer type flag (signed).
    Tuint,    //!< An integer type flag (unsigned).
    Tsize_t,  //!< A size_t type flag.
    Tcstring  //!< A string type flag.
  };

#define DEFINE_FLAG_NAME(type, name, value, help) k##name,
  enum Name {
    RUNTIME_FLAGS(DEFINE_FLAG_NAME, DEFINE_FLAG_NAME, DEFINE_FLAG_NAME)
    numFlags_
  };
#undef DEFINE_FLAG_NAME

#define CAN_SET(type, name, v, h)    static const bool cannotSet##name = false;
#define CANNOT_SET(type, name, v, h) static const bool cannotSet##name = true;

#ifdef DEBUG
  RUNTIME_FLAGS(CAN_SET, CAN_SET, CAN_SET)
#else // !DEBUG
# ifdef OPENCL_MAINLINE
  RUNTIME_FLAGS(CANNOT_SET, CAN_SET, CANNOT_SET)
# else // OPENCL_STAGING
  RUNTIME_FLAGS(CANNOT_SET, CAN_SET, CAN_SET)
# endif // OPENCL_STAGING
#endif // !DEBUG

#undef CAN_SET
#undef CANNOT_SET

 private:
  static Flag flags_[];

 public:
  static char* envstr_;
  const char* name_;
  const void* value_;
  Type type_;
  bool isDefault_;

 public:
  static bool init();

  static void tearDown();

  bool setValue(const char* value);

  static bool isDefault(Name name) { return flags_[name].isDefault_; }
};

#define flagIsDefault(name) \
  (amd::Flag::cannotSet##name || amd::Flag::isDefault(amd::Flag::k##name))

#define setIfNotDefault(var, opt, other) \
  if (!flagIsDefault(opt)) \
    var = (opt);\
  else \
    var = (other);

//  @}

} // namespace amd

#ifdef _WIN32
# define EXPORT_FLAG extern "C" __declspec(dllexport)
#else // !_WIN32
# define EXPORT_FLAG extern "C"
#endif // !_WIN32

#define DECLARE_RELEASE_FLAG(type, name, value, help) EXPORT_FLAG type name;
#ifdef DEBUG
# define DECLARE_DEBUG_FLAG(type, name, value, help) EXPORT_FLAG type name;
#else // !DEBUG
# define DECLARE_DEBUG_FLAG(type, name, value, help) const type name = value;
#endif // !DEBUG

#ifdef OPENCL_MAINLINE
RUNTIME_FLAGS(DECLARE_DEBUG_FLAG, DECLARE_RELEASE_FLAG, DECLARE_DEBUG_FLAG);
#else
RUNTIME_FLAGS(DECLARE_DEBUG_FLAG, DECLARE_RELEASE_FLAG, DECLARE_RELEASE_FLAG);
#endif

#undef DECLARE_DEBUG_FLAG
#undef DECLARE_RELEASE_FLAG

#endif /*FLAGS_HPP_*/