diff --git a/cmake/FindSIMDe.cmake b/cmake/FindSIMDe.cmake new file mode 100644 index 0000000000..a4cf479858 --- /dev/null +++ b/cmake/FindSIMDe.cmake @@ -0,0 +1,62 @@ +# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +include(FindPackageHandleStandardArgs) + +find_package(PkgConfig REQUIRED) +if(PkgConfig_FOUND) + pkg_check_modules(simde IMPORTED_TARGET simde) +endif() + +if(PkgConfig_FOUND AND simde_FOUND) + message(STATUS "Found SIMDe via pkg-config") + set(SIMDE_TARGET PkgConfig::SIMDE) +else() + message(STATUS "SIMDe not found via pkg-config. Falling back to find_path...") + + if(WIN32) + find_path(SIMDE_INCLUDE_DIR + NAMES simde/simde-common.h + PATHS + "C:/simde" + ENV INCLUDE + ) + elseif(UNIX) + find_path(SIMDE_INCLUDE_DIR + NAMES simde/simde-common.h + PATHS + /usr/include + /usr/local/include + NO_DEFAULT_PATH + ) + endif() + + find_package_handle_standard_args(SIMDe + REQUIRED_VARS SIMDE_INCLUDE_DIR) + if(SIMDE_FOUND) + message(STATUS "Found SIMDe headers at: ${SIMDE_INCLUDE_DIR}") + if(NOT TARGET SIMDE) + add_library(SIMDE INTERFACE) + target_include_directories(SIMDE INTERFACE ${SIMDE_INCLUDE_DIR}) + endif() + else() + message(WARNING "could not find simde") + endif() +endif() \ No newline at end of file diff --git a/hipamd/include/hip/amd_detail/amd_hip_vector_types.h b/hipamd/include/hip/amd_detail/amd_hip_vector_types.h index 702c3f85a6..531303309f 100644 --- a/hipamd/include/hip/amd_detail/amd_hip_vector_types.h +++ b/hipamd/include/hip/amd_detail/amd_hip_vector_types.h @@ -655,15 +655,13 @@ get_native_pointer(const HIP_vector_base& base_vec) { return make_vector_type(x) /= y; } - template - __HOST_DEVICE__ - inline - #if __cplusplus >= 201402L && !defined(__HIPCC_RTC__) - constexpr - #endif - bool operator==( - const HIP_vector_type& x, const HIP_vector_type& y) noexcept - { + template + __HOST_DEVICE__ inline +#if __cplusplus >= 201402L && !defined(__HIPCC_RTC__) + constexpr +#endif + bool + operator==(const HIP_vector_type& x, const HIP_vector_type& y) noexcept { bool isTrue = true; const auto& native_x = get_native_vector(x); const auto& native_y = get_native_vector(y); diff --git a/hipamd/src/CMakeLists.txt b/hipamd/src/CMakeLists.txt index 46ca17212f..a49105de8d 100644 --- a/hipamd/src/CMakeLists.txt +++ b/hipamd/src/CMakeLists.txt @@ -54,6 +54,7 @@ option(DISABLE_DIRECT_DISPATCH "Disable Direct Dispatch" OFF) option(BUILD_SHARED_LIBS "Build the shared library" ON) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake") +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/../../cmake") if(BUILD_SHARED_LIBS) add_library(amdhip64 SHARED) @@ -156,6 +157,13 @@ target_include_directories(amdhip64 target_compile_definitions(amdhip64 PRIVATE __HIP_PLATFORM_AMD__) target_link_libraries(amdhip64 PRIVATE ${OPENGL_LIBRARIES}) target_link_libraries(amdhip64 PRIVATE ${CMAKE_DL_LIBS}) + +find_package(SIMDe REQUIRED) + +if(SIMDE_FOUND) + target_link_libraries(amdhip64 PRIVATE ${SIMDE_TARGET}) +endif() + # Add link to comgr, hsa-runtime and other required libraries in target files # This is required for static libraries if(NOT BUILD_SHARED_LIBS) diff --git a/hipamd/src/hip_embed_pch.sh b/hipamd/src/hip_embed_pch.sh index 6c92d43884..4593f76444 100755 --- a/hipamd/src/hip_embed_pch.sh +++ b/hipamd/src/hip_embed_pch.sh @@ -142,19 +142,20 @@ __hip_pch_wave64_size: .long __hip_pch_wave64_size - __hip_pch_wave64 EOF + host_triple="$(uname -m)" set -x $LLVM_DIR/bin/clang -O3 --hip-path=$HIP_INC_DIR/.. -std=c++17 -nogpulib -isystem $HIP_INC_DIR -isystem $HIP_BUILD_INC_DIR -isystem $HIP_AMD_INC_DIR --cuda-device-only --cuda-gpu-arch=gfx1030 -x hip $tmp/hip_pch.h -E >$tmp/pch_wave32.cui && cat $tmp/hip_macros.h >> $tmp/pch_wave32.cui && - $LLVM_DIR/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip_wave32.pch -x hip-cpp-output - <$tmp/pch_wave32.cui && + $LLVM_DIR/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple "$host_triple" -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip_wave32.pch -x hip-cpp-output - <$tmp/pch_wave32.cui && $LLVM_DIR/bin/clang -O3 --hip-path=$HIP_INC_DIR/.. -std=c++17 -nogpulib -isystem $HIP_INC_DIR -isystem $HIP_BUILD_INC_DIR -isystem $HIP_AMD_INC_DIR --cuda-device-only -x hip $tmp/hip_pch.h -E >$tmp/pch_wave64.cui && cat $tmp/hip_macros.h >> $tmp/pch_wave64.cui && - $LLVM_DIR/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip_wave64.pch -x hip-cpp-output - <$tmp/pch_wave64.cui && + $LLVM_DIR/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple "$host_triple" -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip_wave64.pch -x hip-cpp-output - <$tmp/pch_wave64.cui && $LLVM_DIR/bin/llvm-mc -o hip_pch.o $tmp/hip_pch.mcin --filetype=obj && diff --git a/hipamd/src/hip_graph_internal.cpp b/hipamd/src/hip_graph_internal.cpp index 4839b18af3..f884e96f65 100644 --- a/hipamd/src/hip_graph_internal.cpp +++ b/hipamd/src/hip_graph_internal.cpp @@ -19,6 +19,9 @@ THE SOFTWARE. */ #include "hip_graph_internal.hpp" +#include +#include + #include #define CASE_STRING(X, C) \ @@ -804,9 +807,9 @@ void GraphKernelArgManager::ReadBackOrFlush() { address dev_ptr = kernarg_graph_.back().kernarg_pool_addr_ + kernarg_graph_.back().kernarg_pool_size_; auto kSentinel = *reinterpret_cast(dev_ptr - 1); - _mm_sfence(); + simde_mm_sfence(); *(dev_ptr - 1) = kSentinel; - _mm_mfence(); + simde_mm_mfence(); kSentinel = *reinterpret_cast(dev_ptr - 1); } } diff --git a/opencl/amdocl/CMakeLists.txt b/opencl/amdocl/CMakeLists.txt index cbe814a6e5..3d7d635053 100644 --- a/opencl/amdocl/CMakeLists.txt +++ b/opencl/amdocl/CMakeLists.txt @@ -42,6 +42,7 @@ if(ADDRESS_SANITIZER) endif() list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake") +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/../../cmake") if(BUILD_SHARED_LIBS) add_library(amdocl SHARED) @@ -126,6 +127,11 @@ endif() target_link_libraries(amdocl PUBLIC rocclr) +find_package(SIMDe REQUIRED) +if(SIMDE_FOUND) + target_link_libraries(amdocl PRIVATE ${SIMDE_TARGET}) +endif() + INSTALL(TARGETS amdocl COMPONENT MAIN RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp index 6834ad8d31..a4836d3467 100644 --- a/rocclr/device/rocm/rocvirtual.cpp +++ b/rocclr/device/rocm/rocvirtual.cpp @@ -38,6 +38,15 @@ #include "hsa/amd_hsa_queue.h" #include "hsa/amd_hsa_signal.h" +#include +#include +#if defined(SIMDE_VERSION_MAJOR) && \ + ((SIMDE_VERSION_MAJOR > 0) || (SIMDE_VERSION_MAJOR == 0 && SIMDE_VERSION_MINOR >= 7)) + +#include +#endif + + #include #include #include @@ -47,14 +56,6 @@ #include #include -#if defined(__AVX__) -#if defined(__MINGW64__) -#include -#else -#include -#endif -#endif - /** * HSA image object size in bytes (see HSAIL spec) */ @@ -3235,49 +3236,44 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize) __attribute__((optimize("unroll-all-loops"), always_inline)) static inline void nontemporalMemcpy( void* __restrict dst, const void* __restrict src, size_t size) { -#if defined(ATI_ARCH_X86) -#if defined(__AVX512F__) - for (auto i = 0u; i != size / sizeof(__m512i); ++i) { - _mm512_stream_si512(reinterpret_cast<__m512i* __restrict&>(dst)++, - *reinterpret_cast(src)++); +#if defined(__AVX512F__) && false // Disable until SIMDe adds support. + for (auto i = 0u; i != size / sizeof(simde__m512i); ++i) { + simde_mm512_stream_si512(reinterpret_cast(dst)++, + *reinterpret_cast(src)++); } - size = size % sizeof(__m512i); + size = size % sizeof(simde__m512i); #endif #if defined(__AVX__) - for (auto i = 0u; i != size / sizeof(__m256i); ++i) { - _mm256_stream_si256(reinterpret_cast<__m256i* __restrict&>(dst)++, - *reinterpret_cast(src)++); + for (auto i = 0u; i != size / sizeof(simde__m256i); ++i) { + simde_mm256_stream_si256(reinterpret_cast(dst)++, + *reinterpret_cast(src)++); } - size = size % sizeof(__m256i); + size = size % sizeof(simde__m256i); #endif - - for (auto i = 0u; i != size / sizeof(__m128i); ++i) { - _mm_stream_si128(reinterpret_cast<__m128i* __restrict&>(dst)++, - *(reinterpret_cast(src)++)); + for (auto i = 0u; i != size / sizeof(simde__m128i); ++i) { + simde_mm_stream_si128(reinterpret_cast(dst)++, + *(reinterpret_cast(src)++)); } - size = size % sizeof(__m128i); + size = size % sizeof(simde__m128i); - for (auto i = 0u; i != size / sizeof(long long); ++i) { - _mm_stream_si64(reinterpret_cast(dst)++, - *reinterpret_cast(src)++); + for (auto i = 0u; i != size / sizeof(int64_t); ++i) { + simde_mm_stream_si64(reinterpret_cast(dst)++, + *reinterpret_cast(src)++); } - size = size % sizeof(long long); + size = size % sizeof(int64_t); - for (auto i = 0u; i != size / sizeof(int); ++i) { - _mm_stream_si32(reinterpret_cast(dst)++, - *reinterpret_cast(src)++); + for (auto i = 0u; i != size / sizeof(int32_t); ++i) { + simde_mm_stream_si32(reinterpret_cast(dst)++, + *reinterpret_cast(src)++); } - size = size % sizeof(int); + size = size % sizeof(int32_t); // Copy remaining bytes for unaligned size std::memcpy(dst, src, size); // Add memory fence - _mm_sfence(); -#else - std::memcpy(dst, src, size); -#endif + simde_mm_sfence(); } #else static inline void nontemporalMemcpy(void* __restrict dst, const void* __restrict src, @@ -3533,9 +3529,9 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, *dev().info().hdpMemFlushCntl = 1u; auto kSentinel = *reinterpret_cast(dev().info().hdpMemFlushCntl); } else if (kernArgImpl == KernelArgImpl::DeviceKernelArgsReadback && argSize != 0) { - _mm_sfence(); + simde_mm_sfence(); *(argBuffer + argSize - 1) = *(parameters + argSize - 1); - _mm_mfence(); + simde_mm_mfence(); auto kSentinel = *reinterpret_cast(argBuffer + argSize - 1); } } diff --git a/rocclr/os/os.cpp b/rocclr/os/os.cpp index e7a316e477..ea2cec94b8 100644 --- a/rocclr/os/os.cpp +++ b/rocclr/os/os.cpp @@ -30,10 +30,8 @@ #include #include #endif // !_WIN32 - -#if defined(ATI_ARCH_X86) -#include // for _mm_pause -#endif // ATI_ARCH_X86 +#include +#include namespace amd { @@ -120,13 +118,7 @@ size_t Os::pageSize_ = 0; int Os::processorCount_ = 0; -void Os::spinPause() { -#if defined(ATI_ARCH_X86) - _mm_pause(); -#elif defined(ATI_ARCH_ARM) - __asm__ __volatile__("yield"); -#endif -} +void Os::spinPause() { simde_mm_pause(); } void Os::sleep(long n) { // FIXME_lmoriche: Should be nano-seconds not seconds.