diff --git a/CMakeLists.txt b/CMakeLists.txt index a97e6561f8..7bc524337b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -175,6 +175,9 @@ add_feature_info(TA_RANGEV3 TA_RANGEV3 "Range-V3 ranges library") option(TA_TTG "Enable search/build of TTG library" OFF) add_feature_info(TA_TTG TA_TTG "TTG library") +option(IntelMKL_FAIR_DISPATCH "Enable fair dispatch in Intel MKL" OFF) +add_feature_info(IntelMKL_FAIR_DISPATCH IntelMKL_FAIR_DISPATCH "Use of fair dispatch in Intel MKL") + # Enable shared library support options redefaultable_option(TA_ASSUMES_ASLR_DISABLED "TiledArray assumes the Address Space Layout Randomization (ASLR) to be disabled" OFF) add_feature_info(ASSUMES_ASLR_DISABLED TA_ASSUMES_ASLR_DISABLED diff --git a/INSTALL.md b/INSTALL.md index 6e7c6fc746..3f669073f0 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -423,6 +423,7 @@ support may be added. * `TA_TENSOR_MEM_PROFILE` -- Set to `ON` to profile host memory allocations used by TA::Tensor. This causes the use of Umpire for host memory allocation. This also enables additional tracing facilities provided by Umpire; these can be controlled via [environment variable `UMPIRE_LOG_LEVEL`](https://umpire.readthedocs.io/en/develop/sphinx/features/logging_and_replay.html), but note that the default is to log Umpire info into a file rather than stdout. * `TA_TENSOR_MEM_TRACE` -- Set to `ON` to *trace* host memory allocations used by TA::Tensor. This turns on support for tracking memory used by `Tensor` objects; such tracking must be enabled programmatically. This can greatly increase memory consumption by the application and is only intended for expert developers troubleshooting memory use by TiledArray. * `TA_UT_CTEST_TIMEOUT` -- The value (in seconds) of the timeout to use for running the TA unit tests via CTest when building the `check`/`check-tiledarray` targets. The default timeout is 1500s. 
+* `IntelMKL_FAIR_DISPATCH` -- If you want to use the Intel MKL library on non-Intel (e.g., AMD) CPUs, set to `ON` to use fair kernel dispatch. [Default=OFF]. # Build TiledArray diff --git a/examples/dgemm/ta_blas.cpp b/examples/dgemm/ta_blas.cpp index 0a4feff383..aeefaae908 100644 --- a/examples/dgemm/ta_blas.cpp +++ b/examples/dgemm/ta_blas.cpp @@ -69,7 +69,7 @@ int main(int argc, char** argv) { // Start clock const double wall_time_start = madness::wall_time(); - // Do matrix multiplcation + // Do matrix multiplication // Note: If TiledArray has not been configured with blas, this will be an // eigen call. for (int i = 0; i < repeat; ++i) { diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9bb82bf537..0167aab636 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -313,6 +313,16 @@ if( TARGET ttg-parsec ) list(APPEND _TILEDARRAY_DEPENDENCIES ttg-parsec) endif() +if (IntelMKL_FAIR_DISPATCH AND BLAS_IS_MKL) + message(STATUS "created tiledarray_mkl_dispatch") + add_library(tiledarray_mkl_dispatch OBJECT + TiledArray/external/agnerfog/intel_mkl_cpuid_patch.c + TiledArray/external/agnerfog/intel_mkl_feature_patch.c + ) + # N.B. --allow-multiple-definition is a GNU linker extension (presumably needed because the patch objects redefine mkl_serv_* symbols that MKL also provides) + list(APPEND _TILEDARRAY_DEPENDENCIES $<TARGET_OBJECTS:tiledarray_mkl_dispatch> -Wl,--allow-multiple-definition) +endif() + # cache deps as TILEDARRAY_PRIVATE_LINK_LIBRARIES set(TILEDARRAY_PRIVATE_LINK_LIBRARIES ${_TILEDARRAY_DEPENDENCIES} CACHE STRING "List of libraries on which TiledArray depends on") diff --git a/src/TiledArray/config.h.in b/src/TiledArray/config.h.in index 79f9f0932a..483847067f 100644 --- a/src/TiledArray/config.h.in +++ b/src/TiledArray/config.h.in @@ -113,6 +113,8 @@ #endif // !defined(TILEDARRAY_HAS_BTAS) #if defined(TILEDARRAY_HAS_BTAS) && defined(BTAS_HAS_INTEL_MKL) # define TILEDARRAY_HAS_INTEL_MKL +/* use fair dispatch in Intel MKL? */ +#cmakedefine IntelMKL_FAIR_DISPATCH #endif /* Add macro TILEDARRAY_FORCE_INLINE which does as the name implies. 
*/ diff --git a/src/TiledArray/external/agnerfog/intel_cpu_feature_patch.c b/src/TiledArray/external/agnerfog/intel_cpu_feature_patch.c new file mode 100644 index 0000000000..f3706ef1fa --- /dev/null +++ b/src/TiledArray/external/agnerfog/intel_cpu_feature_patch.c @@ -0,0 +1,48 @@ +/*********************** intel_cpu_feature_patch.c ************************** + * Author: Agner Fog + * Date created: 2014-07-30 + * Last modified: 2019-12-29 + * Source URL: https://www.agner.org/optimize/intel_dispatch_patch.zip + * Language: C or C++ + * + * Description: + * Patch for Intel compiler version 13.0 and later, including the general + * libraries, LIBM and SVML, but not MKL and VML. + * + * Example of how to patch Intel's CPU feature dispatcher in order to improve + * compatibility of generated code with non-Intel processors. + * In Windows: Use the static link libraries (*.lib), not the dynamic link + * libraries (*.DLL). + * In Linux and Mac: use static linking (*.a) or dynamic linking (*.so). + * + * Include this code in your C or C++ program and call intel_cpu_patch(); + * before any call to the library functions. + * + * Copyright (c) 2014-2019. 
BSD License 2.0 + ******************************************************************************/ +#include <stdint.h> + +#ifdef __cplusplus // use C-style linking +extern "C" { +#endif + +// link to Intel libraries +extern int64_t __intel_cpu_feature_indicator; // CPU feature bits +extern int64_t __intel_cpu_feature_indicator_x; // CPU feature bits +void __intel_cpu_features_init(void); // unfair dispatcher: checks CPU features for + // Intel CPU's only +void __intel_cpu_features_init_x(void); // fair dispatcher: checks CPU features + // without discriminating by CPU brand + +#ifdef __cplusplus +} // end of extern "C" +#endif + +void intel_cpu_patch(void) { + // force a re-evaluation of the CPU features without discriminating by CPU + // brand + __intel_cpu_feature_indicator = 0; + __intel_cpu_feature_indicator_x = 0; + __intel_cpu_features_init_x(); + __intel_cpu_feature_indicator = __intel_cpu_feature_indicator_x; // mirror the _x indicator (presumably filled by the fair initializer above) into the non-_x one +} diff --git a/src/TiledArray/external/agnerfog/intel_mkl_cpuid_patch.c b/src/TiledArray/external/agnerfog/intel_mkl_cpuid_patch.c new file mode 100644 index 0000000000..b88a1807f7 --- /dev/null +++ b/src/TiledArray/external/agnerfog/intel_mkl_cpuid_patch.c @@ -0,0 +1,61 @@ +/*********************** intel_mkl_cpuid_patch.c ************************** + * Author: Agner Fog + * Date created: 2019-12-29 + * Source URL: https://www.agner.org/optimize/intel_dispatch_patch.zip + * Language: C or C++ + * + * Description: + * Patch for Intel Math Kernel Library (MKL) version 14.0 and later, except + * the Vector Math Library (VML). + * + * Example of how to override Intel's CPU feature dispatcher in order to improve + * compatibility of Intel function libraries with non-Intel processors. + * + * Include this code in your C or C++ program and make sure it is linked before + * any Intel libraries. You may need to include intel_mkl_feature_patch.c as + *well. + * + * Copyright (c) 2019. 
BSD License 2.0 + ******************************************************************************/ +#include <stdint.h> + +#ifdef __cplusplus // use C-style linking +extern "C" { +#endif + +// detect if Intel CPU: always reports 1 regardless of the actual CPU brand +int mkl_serv_intel_cpu(void) { return 1; } + +// detect if Intel CPU +int mkl_serv_intel_cpu_true(void) { return 1; } + +int mkl_serv_cpuhaspnr_true(void) { return 1; } + +int mkl_serv_cpuhaspnr(void) { return 1; } + +int mkl_serv_cpuhasnhm(void) { return 1; } + +int mkl_serv_cpuisbulldozer(void) { return 0; } + +int mkl_serv_cpuiszen(void) { return 0; } + +int mkl_serv_cpuisatomsse4_2(void) { return 0; } + +int mkl_serv_cpuisatomssse3(void) { return 0; } + +int mkl_serv_cpuisitbarcelona(void) { return 0; } + +int mkl_serv_cpuisskl(void) { return 0; } + +int mkl_serv_cpuisknm(void) { return 0; } + +int mkl_serv_cpuisclx(void) { return 0; } + +int mkl_serv_get_microarchitecture(void) { + // I don't know what this number means + return 33; +} + +#ifdef __cplusplus +} // end of extern "C" +#endif diff --git a/src/TiledArray/external/agnerfog/intel_mkl_feature_patch.c b/src/TiledArray/external/agnerfog/intel_mkl_feature_patch.c new file mode 100644 index 0000000000..4844f2621d --- /dev/null +++ b/src/TiledArray/external/agnerfog/intel_mkl_feature_patch.c @@ -0,0 +1,49 @@ +/*********************** intel_mkl_feature_patch.c ************************** + * Author: Agner Fog + * Date created: 2014-07-30 + * Last modified: 2019-12-29 + * Source URL: https://www.agner.org/optimize/intel_dispatch_patch.zip + * Language: C or C++ + * + * Description: + * Patch for Intel Math Kernel Library (MKL) version 14.0 and later, except + * the Vector Math Library (VML). + * + * Example of how to patch Intel's CPU feature dispatcher in order to improve + * compatibility of Intel function libraries with non-Intel processors. + * In Windows: Use the static link libraries (*.lib), not the dynamic link + * libraries (*.DLL). + * In Linux and Mac: use static linking (*.a) or dynamic linking (*.so). 
+ * + * Include this code in your C or C++ program and call intel_mkl_use_fair_dispatch(); + * before any call to the MKL functions. You may need to include + * intel_mkl_cpuid_patch.c as well. + * + * Copyright (c) 2014-2019. BSD License 2.0 + ******************************************************************************/ +#include <stdint.h> + +#ifdef __cplusplus // use C-style linking +extern "C" { +#endif + +// link to MKL libraries +extern int64_t __intel_mkl_feature_indicator; // CPU feature bits +extern int64_t __intel_mkl_feature_indicator_x; // CPU feature bits +void __intel_mkl_features_init(void); // unfair dispatcher: checks CPU features for + // Intel CPU's only +void __intel_mkl_features_init_x(void); // fair dispatcher: checks CPU features + // without discriminating by CPU brand + +#ifdef __cplusplus +} // end of extern "C" +#endif + +void intel_mkl_use_fair_dispatch(void) { + // force a re-evaluation of the CPU features without discriminating by CPU + // brand + __intel_mkl_feature_indicator = 0; + __intel_mkl_feature_indicator_x = 0; + __intel_mkl_features_init_x(); + __intel_mkl_feature_indicator = __intel_mkl_feature_indicator_x; // mirror the _x indicator (presumably filled by the fair initializer above) into the non-_x one +} diff --git a/src/TiledArray/external/agnerfog/readme.txt b/src/TiledArray/external/agnerfog/readme.txt new file mode 100644 index 0000000000..0f891c9ed3 --- /dev/null +++ b/src/TiledArray/external/agnerfog/readme.txt @@ -0,0 +1,84 @@ + intel_dispatch_patch.zip + ======================== + +By Agner Fog, Technical University of Denmark, 2019. + +Intel's compilers are generating code that will run slower than necessary when +the code is executed on a CPU that is not produced by Intel. This has been +observed with Intel C, C++, and Fortran compilers. + +The same happens when certain function libraries produced by Intel are used, +even if the code is compiled with another compiler, such as Microsoft, Gnu +or Clang compilers. 
+ +This problem is affecting several commonly used software programs such as +Matlab, because they are using Intel software libraries. + +The library code and the code generated by an Intel compiler may contain +multiple versions, each optimized for a particular instruction set extension. +A so-called CPU dispatcher is choosing the optimal version of the code at +runtime, based on which CPU it is running on. + +CPU dispatchers can be fair or unfair. A fair CPU dispatcher is choosing the +optimal code based only on which instruction set extensions are supported +by the CPU. An unfair dispatcher first checks the CPU brand. If the brand +is not Intel, then the unfair dispatcher will choose the "generic" version +of the code, i.e. the slowest version that is compatible with old CPUs +without the relevant instruction set extensions. + +The CPU dispatchers in many Intel function libraries have two versions, a +fair and an unfair one. It is not clear when the fair dispatcher is used +and when the unfair dispatcher is used. My observations about fair and +unfair CPU dispatching are as follows: + +* Code compiled with an Intel compiler will usually have unfair CPU dispatching. + +* The SVML (Short Vector Math Library) and IPP (Intel Performance Primitives) + function libraries from Intel are using the fair CPU dispatcher when used + with a non-Intel compiler. + +* The MKL (Math Kernel Library) library contains both fair and unfair + dispatchers. It is not clear which dispatcher is used on each function. + +The code examples contained herein may be used for circumventing unfair CPU +dispatching in order to improve compatibility with non-Intel CPUs. + +The following files are contained: + +intel_cpu_feature_patch.c +------------------------- +This code makes sure the fair dispatcher is called instead of the unfair +one for code generated with an Intel compiler and for general Intel +function libraries. 
+ +intel_mkl_feature_patch.c +------------------------- +This does the same for the Intel MKL library. + +intel_mkl_cpuid_patch.c +----------------------- +This code example is overriding CPU detection functions in Intel's MKL +function library. The mkl_serv_intel_cpu() function in MKL is returning +1 when running on an Intel CPU and 0 when running on any other brand of +CPU. You may include this code to replace this function in MKL with a +function that returns 1 regardless of CPU brand. + +It may be necessary to use both intel_mkl_feature_patch.c and +intel_mkl_cpuid_patch.c when using the MKL library in software that +may run on any brand of CPU. + +An alternative method is to set the environment variable + MKL_DEBUG_CPU_TYPE=5 +when running on an AMD processor. This may be useful when you do not have +access to the source code, for example when running Matlab software. + +The patches provided here are based on undocumented features in Intel +function libraries. Use them at your own risk, and make sure to test your +code properly to make sure it works as intended. + +The most reliable solution is, of course, to avoid Intel compilers and +Intel function libraries in code that may run on other CPU brands such +as AMD and VIA. You may find other function libraries on the web, or +you may make your own functions. My vector class library (VCL) is useful +for making mathematical functions that process multiple data in parallel, +using the vector processing features of modern CPUs. 
diff --git a/src/TiledArray/tiledarray.cpp b/src/TiledArray/tiledarray.cpp index 38bf61e86e..2a4b3d1199 100644 --- a/src/TiledArray/tiledarray.cpp +++ b/src/TiledArray/tiledarray.cpp @@ -16,6 +16,10 @@ #include #endif +#ifdef IntelMKL_FAIR_DISPATCH +extern "C" void intel_mkl_use_fair_dispatch(); +#endif + #include #include #include @@ -100,6 +104,9 @@ TiledArray::World& TiledArray::initialize(int& argc, char**& argv, TiledArray::set_default_world(default_world); #ifdef TILEDARRAY_HAS_DEVICE TiledArray::device_initialize(); +#endif +#ifdef IntelMKL_FAIR_DISPATCH + intel_mkl_use_fair_dispatch(); #endif TiledArray::max_threads = TiledArray::get_num_threads(); TiledArray::set_num_threads(1);