diff --git a/CMakeLists.txt b/CMakeLists.txt index 32db779d..ee91a89d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -444,6 +444,71 @@ endif() +# ============================ +# Patch CPU performance +# ============================ + + +# Patch performance of CPU std::complex arithmetic operator overloads. +# The cpu_subroutines.cpp file makes extensive use of std::complex operator +# overloads, and alas these are significantly slower than hand-rolled +# arithmetic, due to their NaN and inf checks, and interference with SIMD. +# It is crucial to pass additional optimisation flags to this file to restore +# hand-rolled performance (else QuEST v3 is faster than v4 eep). In theory, +# we can achieve this with specific, relatively 'safe' flags such as LLVM's: +# -ffinite-math-only -fno-signed-zeros -ffp-contract=fast +# However, it is a nuisance to find equivalent flags for different compilers +# and monitor their performance vs accuracy trade-offs. So instead, we use the +# much more aggressive and ubiquitous -Ofast flag to guarantee performance. +# This introduces many potentially dangerous optimisations, such as asserting +# associativity of flops, which would break techniques like Kahan summation. +# The cpu_subroutines.cpp must ergo be very conscious of these optimisations. +# We here also explicitly inform the file cpu_subroutines.cpp whether or not +# we are passing the flags, so it can detect/error when flags are forgotten. 
+ +if (CMAKE_BUILD_TYPE STREQUAL "Release") + + # Release build will pass -Ofast when known for the given compiler, and + # fallback to giving a performance warning and proceeding with compilation (HP is matched separately below because it spells the flag +Ofast) + + if (CMAKE_CXX_COMPILER_ID MATCHES "AppleClang|Clang|Cray|CrayClang|GNU|Intel|IntelLLVM|NVHPC|NVIDIA|XL|XLClang") + set(patch_flags "-Ofast") + set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=1") + elseif (CMAKE_CXX_COMPILER_ID MATCHES "^HP$") + set(patch_flags "+Ofast") + set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=1") + elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC") + set(patch_flags "/fp:fast") + set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=1") + else() + message(WARNING + "The compiler (${CMAKE_CXX_COMPILER_ID}) is unrecognised and so crucial optimisation flags have not been " + "passed to the CPU backend. These flags are necessary for full performance when performing complex algebra, " + "otherwise a slowdown of 3-50x may be observed. Please edit the root CMakeLists.txt to include flags which are " + "equivalent to GNU's -Ofast flag for your compiler (search this warning), or contact the QuEST developers for help." + ) + set(patch_flags "") + set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=0") + endif() + +else() + + # Non-release builds (e.g. 
Debug) will pass no optimisation flags, and will + # communicate to cpu_subroutines.cpp that this is intentional via a macro + + set(patch_flags "") + set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=0") + +endif() + +set_source_files_properties( + quest/src/cpu/cpu_subroutines.cpp + PROPERTIES + COMPILE_FLAGS "${patch_flags} ${patch_macro}" +) + + + # ============================ # Pass files to library # ============================ diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp index 26c80dab..a853bc9b 100644 --- a/quest/src/cpu/cpu_subroutines.cpp +++ b/quest/src/cpu/cpu_subroutines.cpp @@ -2,6 +2,12 @@ * CPU OpenMP-accelerated definitions of the main backend simulation routines, * as mirrored by gpu_subroutines.cpp, and called by accelerator.cpp. * + * BEWARE that this specific file receives additional compiler optimisation flags + * in order to counteract a performance issue in the use of std::complex operator + * overloads. These flags (like -Ofast) may induce assumed associativity of qcomp + * algebra, breaking techniques like Kahan summation. As such, this file CANNOT + * assume IEEE floating-point behaviour. + * * Some of these definitions are templated, defining multiple versions optimised * (at compile-time) for handling different numbers of input qubits; such functions * are proceeded by macro INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS(), to force the @@ -40,6 +46,28 @@ using std::vector; +/* + * Beware that this file makes extensive use of std::complex (qcomp) operator + * overloads and so requires additional compiler flags to achieve hand-rolled + * arithmetic performance; otherwise a 3-50x slowdown may be observed. We here + * enforce that these flags were not forgotten (but may be deliberately avoided). + * Beware these flags may induce associativity and break e.g. Kahan summation. 
+ */ + +#if !defined(COMPLEX_OVERLOADS_PATCHED) + #error "Crucial, bespoke optimisation flags were not passed (or acknowledged) to cpu_subroutines.cpp which are necessary for full complex arithmetic performance." + +#elif !COMPLEX_OVERLOADS_PATCHED + + #if defined(_MSC_VER) + #pragma message("Warning: The CPU backend is being deliberately compiled without the necessary flags to obtain full complex arithmetic performance.") + #else + #warning "The CPU backend is being deliberately compiled without the necessary flags to obtain full complex arithmetic performance." + #endif + +#endif + + /* * GETTERS @@ -568,6 +596,9 @@ void cpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector ctrls, ve /// qureg.cpuAmps[i] is being serially updated by only this thread, /// so is a candidate for Kahan summation for improved numerical /// stability. Explore whether this is time-free and worthwhile! + /// + /// BEWARE that Kahan summation is incompatible with the optimisation + /// flags currently passed to this file } } } @@ -1758,6 +1789,9 @@ qreal cpu_statevec_calcTotalProb_sub(Qureg qureg) { /// final serial combination). This invokes several times /// as many arithmetic operations (4x?) but we are anyway /// memory-bandwidth bound + /// + /// BEWARE that Kahan summation is incompatible with the optimisation + /// flags currently passed to this file qreal prob = 0; @@ -1783,6 +1817,9 @@ qreal cpu_densmatr_calcTotalProb_sub(Qureg qureg) { /// final serial combination). This invokes several times /// as many arithmetic operations (4x?) but we are anyway /// memory-bandwidth bound + /// + /// BEWARE that Kahan summation is incompatible with the optimisation + /// flags currently passed to this file qreal prob = 0;