From 2ede422db0f555ff7f84f8062783079c31788876 Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Sun, 7 Sep 2025 20:29:08 -0400 Subject: [PATCH 1/3] patching MacOS performance as per #638 --- CMakeLists.txt | 27 +++++++++++++++++++++++++++ quest/src/cpu/cpu_subroutines.cpp | 26 ++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index e2e52d85e..c813e207b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -382,6 +382,33 @@ else() target_compile_definitions(QuEST PRIVATE INCLUDE_DEPRECATED_FUNCTIONS=0) endif() +# Patch Clang/LLVM performance. +# std::complex arithmetic overloads are painfully slow in Clang/LLVM, and +# further greatly sabotage multithreaded performance. We patch this by +# passing additional optimisation flags only to the single affected file +# (cpu_subroutines.cpp), which should not worsen accuracy nor e.g. assert +# associativity (which would break e.g. Kahan summation). We pass these flags +# only in Release mode, and also pass defensive 'PATCHED' macros so that +# cpu_subroutines.cpp can detect whether the build forgot these flags, which +# would otherwise silently ruin performance + +if (CMAKE_CXX_COMPILER_ID MATCHES "Clang|AppleClang") + + if (CMAKE_BUILD_TYPE STREQUAL "Release") + message(WARNING "Building with Clang, the CPU backend will receive additional optimisation flags to counteract a std::complex performance issue.") + endif() + + set(patch_flags "-ffinite-math-only -fno-signed-zeros -ffp-contract=fast -DCLANG_COMPLEX_PERFORMANCE_PATCHED=1") + set(no_patch_flags "-DCLANG_COMPLEX_PERFORMANCE_PATCHED=0") + + set_source_files_properties( + quest/src/cpu/cpu_subroutines.cpp + PROPERTIES + COMPILE_FLAGS + "$,${patch_flags},${no_patch_flags}>" + ) + +endif() # add math library if (NOT MSVC) diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp index 26c80dabc..5416346ed 100644 --- a/quest/src/cpu/cpu_subroutines.cpp +++ b/quest/src/cpu/cpu_subroutines.cpp @@ 
-41,6 +41,32 @@ using std::vector; +/* + * Beware that this file makes extensive use of std::complex arithmetic + * overloads which, on Clang/LLVM, have enormous performance issues and sabotage + * multithreading using LLVM's OpenMP runtime library (libomp). We counteract + * this pitfall by specifying compiler flags + * -ffinite-math-only + * -fno-signed-zeros + * -ffp-contract=fast + * which restores performance to that of manual complex arithmetic. We here + * defensively check the build correctly passed these flags. Note that value + * CLANG_COMPLEX_PERFORMANCE_PATCHED=0 is permitted which communicates that the + * flags were deliberately not passed because the CMake build type is not "Release". + */ + +#if defined(__clang__) + + #if !defined(CLANG_COMPLEX_PERFORMANCE_PATCHED) + #error "Additional optimisation flags were not passed (or acknowledged) to cpu_subroutines.cpp which is necessary with Clang to counteract a performance issue." + + #elif !CLANG_COMPLEX_PERFORMANCE_PATCHED + #warning "The CPU backend is being compiled without the necessary flags to counteract a Clang-specific performance issue." + #endif +#endif + + + /* * GETTERS */ From 5c50fe1e98f184bb0ae9963a03a620822536d724 Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Fri, 12 Sep 2025 15:14:28 -0400 Subject: [PATCH 2/3] passed -Ofast to all compiler types since the std::complex performance issue was seen in other compilers too, grr! --- CMakeLists.txt | 76 ++++++++++++++++++++++--------- quest/src/cpu/cpu_subroutines.cpp | 44 ++++++++++-------- 2 files changed, 78 insertions(+), 42 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c813e207b..95c17dfa6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -382,34 +382,66 @@ else() target_compile_definitions(QuEST PRIVATE INCLUDE_DEPRECATED_FUNCTIONS=0) endif() -# Patch Clang/LLVM performance. -# std::complex arithmetic overloads are painfully slow in Clang/LLVM, and -# further greatly sabotage multithreaded performance. 
We patch this by -# passing additional optimisation flags only to the single affected file -# (cpu_subroutines.cpp), which should not worsen accuracy nor e.g. assert -# associativity (which would break e.g. Kahan summation). We pass these flags -# only in Release mode, and also pass defensive 'PATCHED' macros so that -# cpu_subroutines.cpp can detect whether the build forgot these flags, which -# would otherwise silently ruin performance - -if (CMAKE_CXX_COMPILER_ID MATCHES "Clang|AppleClang") - - if (CMAKE_BUILD_TYPE STREQUAL "Release") - message(WARNING "Building with Clang, the CPU backend will receive additional optimisation flags to counteract a std::complex performance issue.") + +# Patch performance of CPU std::complex arithmetic operator overloads. +# The cpu_subroutines.cpp file makes extensive use of std::complex operator +# overloads, and alas these are significantly slower than hand-rolled +# arithmetic, due to their NaN and inf checks, and interference with SIMD. +# It is crucial to pass additional optimisation flags to this file to restore +# hand-rolled performance (else QuEST v3 is faster than v4 eep). In theory, +# we can achieve this with specific, relatively 'safe' flags such as LLVM's: +# -ffinite-math-only -fno-signed-zeros -ffp-contract=fast +# However, it is a nuisance to find equivalent flags for different compilers +# and monitor their performance vs accuracy trade-offs. So instead, we use the +# much more aggressive and ubiquitous -Ofast flag to guarantee performance. +# This introduces many potentially dangerous optimisations, such as asserting +# associativity of flops, which would break techniques like Kahan summation. +# The cpu_subroutines.cpp must ergo be very conscious of these optimisations. +# We here also explicitly inform the file cpu_subroutines.cpp whether or not +# we are passing the flags, so it can detect/error when flags are forgotten. 
+ +if (CMAKE_BUILD_TYPE STREQUAL "Release") + + # Release build will pass -Ofast (or the nearest equivalent chosen below: +Ofast for HP, /fp:fast for MSVC) when known for the given compiler, and + # fall back to giving a performance warning and proceeding with compilation. HP is deliberately absent from the -Ofast list and is tested by exact name, else its +Ofast branch would be unreachable (and "HP" would also substring-match "NVHPC") + + if (CMAKE_CXX_COMPILER_ID MATCHES "AppleClang|Clang|Cray|CrayClang|GNU|Intel|IntelLLVM|NVHPC|NVIDIA|XL|XLClang") + set(patch_flags "-Ofast") + set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=1") + elseif (CMAKE_CXX_COMPILER_ID STREQUAL "HP") + set(patch_flags "+Ofast") + set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=1") + elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC") + set(patch_flags "/fp:fast") + set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=1") + else() + message(WARNING + "The compiler (${CMAKE_CXX_COMPILER_ID}) is unrecognised and so crucial optimisation flags have not been " + "passed to the CPU backend. These flags are necessary for full performance when performing complex algebra, " + "otherwise a slowdown of 3-50x may be observed. Please edit the root CMakeLists.txt to include flags which are " + "equivalent to GNU's -Ofast flag for your compiler (search this warning), or contact the QuEST developers for help." + ) + set(patch_flags "") + set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=0") endif() + +else() - set(patch_flags "-ffinite-math-only -fno-signed-zeros -ffp-contract=fast -DCLANG_COMPLEX_PERFORMANCE_PATCHED=1") - set(no_patch_flags "-DCLANG_COMPLEX_PERFORMANCE_PATCHED=0") + # Non-release builds (e.g. 
Debug) will pass no optimisation flags, and will + # communicate to cpu_subroutines.cpp that this is intentional via a macro - set_source_files_properties( - quest/src/cpu/cpu_subroutines.cpp - PROPERTIES - COMPILE_FLAGS - "$,${patch_flags},${no_patch_flags}>" - ) + set(patch_flags "") + set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=0") endif() +set_source_files_properties( + quest/src/cpu/cpu_subroutines.cpp + PROPERTIES + COMPILE_FLAGS "${patch_flags} ${patch_macro}" +) + + # add math library if (NOT MSVC) target_link_libraries(QuEST PRIVATE ${MATH_LIBRARY}) diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp index 5416346ed..c24be37c4 100644 --- a/quest/src/cpu/cpu_subroutines.cpp +++ b/quest/src/cpu/cpu_subroutines.cpp @@ -2,6 +2,12 @@ * CPU OpenMP-accelerated definitions of the main backend simulation routines, * as mirrored by gpu_subroutines.cpp, and called by accelerator.cpp. * + * BEWARE that this specific file receives additional compiler optimisation flags + * in order to counteract a performance issue in the use of std::complex operator + * overloads. These flags (like -Ofast) may induce assumed associativity of qcomp + * algebra, breaking techniques like Kahan summation. As such, this file CANNOT + * assume IEEE floating-point behaviour. + * * Some of these definitions are templated, defining multiple versions optimised * (at compile-time) for handling different numbers of input qubits; such functions * are proceeded by macro INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS(), to force the @@ -40,29 +46,18 @@ using std::vector; - /* - * Beware that this file makes extensive use of std::complex arithmetic - * overloads which, on Clang/LLVM, have enormous performance issues and sabotage - * multithreading using LLVM's OpenMP runtime library (libomp). 
We counteract - * this pitfall by specifying compiler flags - * -ffinite-math-only - * -fno-signed-zeros - * -ffp-contract=fast - * which restores performance to that of manual complex arithmetic. We here - * defensively check the build correctly passed these flags. Note that value - * CLANG_COMPLEX_PERFORMANCE_PATCHED=0 is permitted which communicates that the - * flags were deliberately not passed because the CMake build type is not "Release". + * Beware that this file makes extensive use of std::complex (qcomp) operator + * overloads and so requires additional compiler flags to achieve hand-rolled + * arithmetic performance; otherwise a 3-50x slowdown may be observed. We here + * enforce that these flags were not forgotten (but may be deliberately avoided). + * Beware these flags may induce associativity and break e.g. Kahan summation. */ -#if defined(__clang__) - - #if !defined(CLANG_COMPLEX_PERFORMANCE_PATCHED) - #error "Additional optimisation flags were not passed (or acknowledged) to cpu_subroutines.cpp which is necessary with Clang to counteract a performance issue." - - #elif !CLANG_COMPLEX_PERFORMANCE_PATCHED - #warning "The CPU backend is being compiled without the necessary flags to counteract a Clang-specific performance issue." - #endif +#if !defined(COMPLEX_OVERLOADS_PATCHED) + #error "Crucial, bespoke optimisation flags were not passed (or acknowledged) to cpu_subroutines.cpp which are necessary for full complex arithmetic performance." +#elif !COMPLEX_OVERLOADS_PATCHED + #warning "The CPU backend is being deliberately compiled without the necessary flags to obtain full complex arithmetic performance." #endif @@ -594,6 +589,9 @@ void cpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector ctrls, ve /// qureg.cpuAmps[i] is being serially updated by only this thread, /// so is a candidate for Kahan summation for improved numerical /// stability. Explore whether this is time-free and worthwhile! 
+ /// + /// BEWARE that Kahan summation is incompatible with the optimisation + /// flags currently passed to this file } } } @@ -1784,6 +1782,9 @@ qreal cpu_statevec_calcTotalProb_sub(Qureg qureg) { /// final serial combination). This invokes several times /// as many arithmetic operations (4x?) but we are anyway /// memory-bandwidth bound + /// + /// BEWARE that Kahan summation is incompatible with the optimisation + /// flags currently passed to this file qreal prob = 0; @@ -1809,6 +1810,9 @@ qreal cpu_densmatr_calcTotalProb_sub(Qureg qureg) { /// final serial combination). This invokes several times /// as many arithmetic operations (4x?) but we are anyway /// memory-bandwidth bound + /// + /// BEWARE that Kahan summation is incompatible with the optimisation + /// flags currently passed to this file qreal prob = 0; From 7b36505b5a8eeca7437f0b443ee93a592ff4054e Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Fri, 12 Sep 2025 15:25:31 -0400 Subject: [PATCH 3/3] patching MSVC warning --- quest/src/cpu/cpu_subroutines.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp index c24be37c4..a853bc9be 100644 --- a/quest/src/cpu/cpu_subroutines.cpp +++ b/quest/src/cpu/cpu_subroutines.cpp @@ -56,8 +56,15 @@ using std::vector; #if !defined(COMPLEX_OVERLOADS_PATCHED) #error "Crucial, bespoke optimisation flags were not passed (or acknowledged) to cpu_subroutines.cpp which are necessary for full complex arithmetic performance." + #elif !COMPLEX_OVERLOADS_PATCHED - #warning "The CPU backend is being deliberately compiled without the necessary flags to obtain full complex arithmetic performance." 
+ + #if defined(_MSC_VER) + #pragma message("Warning: The CPU backend is being deliberately compiled without the necessary flags to obtain full complex arithmetic performance.") + #else + #warning "The CPU backend is being deliberately compiled without the necessary flags to obtain full complex arithmetic performance." + #endif + #endif