Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 62 additions & 0 deletions cmake/FindSIMDe.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

include(FindPackageHandleStandardArgs)

find_package(PkgConfig REQUIRED)
if(PkgConfig_FOUND)
pkg_check_modules(simde IMPORTED_TARGET simde)
endif()

if(PkgConfig_FOUND AND simde_FOUND)
message(STATUS "Found SIMDe via pkg-config")
set(SIMDE_TARGET PkgConfig::SIMDE)
else()
message(STATUS "SIMDe not found via pkg-config. Falling back to find_path...")

if(WIN32)
find_path(SIMDE_INCLUDE_DIR
NAMES simde/simde-common.h
PATHS
"C:/simde"
ENV INCLUDE
)
elseif(UNIX)
find_path(SIMDE_INCLUDE_DIR
NAMES simde/simde-common.h
PATHS
/usr/include
/usr/local/include
NO_DEFAULT_PATH
)
endif()

find_package_handle_standard_args(SIMDe
REQUIRED_VARS SIMDE_INCLUDE_DIR)
if(SIMDE_FOUND)
message(STATUS "Found SIMDe headers at: ${SIMDE_INCLUDE_DIR}")
if(NOT TARGET SIMDE)
add_library(SIMDE INTERFACE)
target_include_directories(SIMDE INTERFACE ${SIMDE_INCLUDE_DIR})
endif()
else()
message(WARNING "could not find simde")
endif()
endif()
16 changes: 7 additions & 9 deletions hipamd/include/hip/amd_detail/amd_hip_vector_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -655,15 +655,13 @@ get_native_pointer(const HIP_vector_base<T, n>& base_vec) {
return make_vector_type<T, n>(x) /= y;
}

template<typename T, unsigned int n>
__HOST_DEVICE__
inline
#if __cplusplus >= 201402L && !defined(__HIPCC_RTC__)
constexpr
#endif
bool operator==(
const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
{
template <typename T, unsigned int n>
__HOST_DEVICE__ inline
#if __cplusplus >= 201402L && !defined(__HIPCC_RTC__)
constexpr
#endif
bool
operator==(const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept {
bool isTrue = true;
const auto& native_x = get_native_vector(x);
const auto& native_y = get_native_vector(y);
Expand Down
8 changes: 8 additions & 0 deletions hipamd/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ option(DISABLE_DIRECT_DISPATCH "Disable Direct Dispatch" OFF)
option(BUILD_SHARED_LIBS "Build the shared library" ON)

list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake")
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/../../cmake")

if(BUILD_SHARED_LIBS)
add_library(amdhip64 SHARED)
Expand Down Expand Up @@ -156,6 +157,13 @@ target_include_directories(amdhip64
target_compile_definitions(amdhip64 PRIVATE __HIP_PLATFORM_AMD__)
target_link_libraries(amdhip64 PRIVATE ${OPENGL_LIBRARIES})
target_link_libraries(amdhip64 PRIVATE ${CMAKE_DL_LIBS})

find_package(SIMDe REQUIRED)

if(SIMDE_FOUND)
target_link_libraries(amdhip64 PRIVATE ${SIMDE_TARGET})
endif()

# Add link to comgr, hsa-runtime and other required libraries in target files
# This is required for static libraries
if(NOT BUILD_SHARED_LIBS)
Expand Down
5 changes: 3 additions & 2 deletions hipamd/src/hip_embed_pch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -142,19 +142,20 @@ __hip_pch_wave64_size:
.long __hip_pch_wave64_size - __hip_pch_wave64
EOF

host_triple="$(uname -m)"
set -x

$LLVM_DIR/bin/clang -O3 --hip-path=$HIP_INC_DIR/.. -std=c++17 -nogpulib -isystem $HIP_INC_DIR -isystem $HIP_BUILD_INC_DIR -isystem $HIP_AMD_INC_DIR --cuda-device-only --cuda-gpu-arch=gfx1030 -x hip $tmp/hip_pch.h -E >$tmp/pch_wave32.cui &&

cat $tmp/hip_macros.h >> $tmp/pch_wave32.cui &&

$LLVM_DIR/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip_wave32.pch -x hip-cpp-output - <$tmp/pch_wave32.cui &&
$LLVM_DIR/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple "$host_triple" -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip_wave32.pch -x hip-cpp-output - <$tmp/pch_wave32.cui &&

$LLVM_DIR/bin/clang -O3 --hip-path=$HIP_INC_DIR/.. -std=c++17 -nogpulib -isystem $HIP_INC_DIR -isystem $HIP_BUILD_INC_DIR -isystem $HIP_AMD_INC_DIR --cuda-device-only -x hip $tmp/hip_pch.h -E >$tmp/pch_wave64.cui &&

cat $tmp/hip_macros.h >> $tmp/pch_wave64.cui &&

$LLVM_DIR/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip_wave64.pch -x hip-cpp-output - <$tmp/pch_wave64.cui &&
$LLVM_DIR/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple "$host_triple" -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip_wave64.pch -x hip-cpp-output - <$tmp/pch_wave64.cui &&

$LLVM_DIR/bin/llvm-mc -o hip_pch.o $tmp/hip_pch.mcin --filetype=obj &&

Expand Down
7 changes: 5 additions & 2 deletions hipamd/src/hip_graph_internal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@
THE SOFTWARE. */

#include "hip_graph_internal.hpp"
#include <cmath>
#include <simde/x86/sse2.h>

#include <queue>

#define CASE_STRING(X, C) \
Expand Down Expand Up @@ -804,9 +807,9 @@ void GraphKernelArgManager::ReadBackOrFlush() {
address dev_ptr =
kernarg_graph_.back().kernarg_pool_addr_ + kernarg_graph_.back().kernarg_pool_size_;
auto kSentinel = *reinterpret_cast<volatile unsigned char*>(dev_ptr - 1);
_mm_sfence();
simde_mm_sfence();
*(dev_ptr - 1) = kSentinel;
_mm_mfence();
simde_mm_mfence();
kSentinel = *reinterpret_cast<volatile unsigned char*>(dev_ptr - 1);
}
}
Expand Down
6 changes: 6 additions & 0 deletions opencl/amdocl/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ if(ADDRESS_SANITIZER)
endif()

list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake")
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/../../cmake")

if(BUILD_SHARED_LIBS)
add_library(amdocl SHARED)
Expand Down Expand Up @@ -126,6 +127,11 @@ endif()

target_link_libraries(amdocl PUBLIC rocclr)

find_package(SIMDe REQUIRED)
if(SIMDE_FOUND)
target_link_libraries(amdocl PRIVATE ${SIMDE_TARGET})
endif()

INSTALL(TARGETS amdocl
COMPONENT MAIN
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
Expand Down
70 changes: 33 additions & 37 deletions rocclr/device/rocm/rocvirtual.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,15 @@
#include "hsa/amd_hsa_queue.h"
#include "hsa/amd_hsa_signal.h"

#include <simde/x86/avx.h>
#include <simde/x86/sse2.h>
#if defined(SIMDE_VERSION_MAJOR) && \
((SIMDE_VERSION_MAJOR > 0) || (SIMDE_VERSION_MAJOR == 0 && SIMDE_VERSION_MINOR >= 7))

#include <simde/x86/avx512.h>
#endif


#include <fstream>
#include <limits>
#include <memory>
Expand All @@ -47,14 +56,6 @@
#include <atomic>
#include <cinttypes>

#if defined(__AVX__)
#if defined(__MINGW64__)
#include <intrin.h>
#else
#include <immintrin.h>
#endif
#endif

/**
* HSA image object size in bytes (see HSAIL spec)
*/
Expand Down Expand Up @@ -3235,49 +3236,44 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize)
__attribute__((optimize("unroll-all-loops"), always_inline))
static inline void nontemporalMemcpy(
void* __restrict dst, const void* __restrict src, size_t size) {
#if defined(ATI_ARCH_X86)
#if defined(__AVX512F__)
for (auto i = 0u; i != size / sizeof(__m512i); ++i) {
_mm512_stream_si512(reinterpret_cast<__m512i* __restrict&>(dst)++,
*reinterpret_cast<const __m512i* __restrict&>(src)++);
#if defined(__AVX512F__) && false // Disable until SIMDe adds support.
for (auto i = 0u; i != size / sizeof(simde__m512i); ++i) {
simde_mm512_stream_si512(reinterpret_cast<simde__m512i* __restrict&>(dst)++,
*reinterpret_cast<const simde__m512i* __restrict&>(src)++);
}
size = size % sizeof(__m512i);
size = size % sizeof(simde__m512i);
#endif

#if defined(__AVX__)
for (auto i = 0u; i != size / sizeof(__m256i); ++i) {
_mm256_stream_si256(reinterpret_cast<__m256i* __restrict&>(dst)++,
*reinterpret_cast<const __m256i* __restrict&>(src)++);
for (auto i = 0u; i != size / sizeof(simde__m256i); ++i) {
simde_mm256_stream_si256(reinterpret_cast<simde__m256i* __restrict&>(dst)++,
*reinterpret_cast<const simde__m256i* __restrict&>(src)++);
}
size = size % sizeof(__m256i);
size = size % sizeof(simde__m256i);
#endif

for (auto i = 0u; i != size / sizeof(__m128i); ++i) {
_mm_stream_si128(reinterpret_cast<__m128i* __restrict&>(dst)++,
*(reinterpret_cast<const __m128i* __restrict&>(src)++));
for (auto i = 0u; i != size / sizeof(simde__m128i); ++i) {
simde_mm_stream_si128(reinterpret_cast<simde__m128i* __restrict&>(dst)++,
*(reinterpret_cast<const simde__m128i* __restrict&>(src)++));
}
size = size % sizeof(__m128i);
size = size % sizeof(simde__m128i);

for (auto i = 0u; i != size / sizeof(long long); ++i) {
_mm_stream_si64(reinterpret_cast<long long* __restrict&>(dst)++,
*reinterpret_cast<const long long* __restrict&>(src)++);
for (auto i = 0u; i != size / sizeof(int64_t); ++i) {
simde_mm_stream_si64(reinterpret_cast<int64_t* __restrict&>(dst)++,
*reinterpret_cast<const int64_t* __restrict&>(src)++);
}
size = size % sizeof(long long);
size = size % sizeof(int64_t);

for (auto i = 0u; i != size / sizeof(int); ++i) {
_mm_stream_si32(reinterpret_cast<int* __restrict&>(dst)++,
*reinterpret_cast<const int* __restrict&>(src)++);
for (auto i = 0u; i != size / sizeof(int32_t); ++i) {
simde_mm_stream_si32(reinterpret_cast<int32_t* __restrict&>(dst)++,
*reinterpret_cast<const int32_t* __restrict&>(src)++);
}

size = size % sizeof(int);
size = size % sizeof(int32_t);
// Copy remaining bytes for unaligned size
std::memcpy(dst, src, size);

// Add memory fence
_mm_sfence();
#else
std::memcpy(dst, src, size);
#endif
simde_mm_sfence();
}
#else
static inline void nontemporalMemcpy(void* __restrict dst, const void* __restrict src,
Expand Down Expand Up @@ -3533,9 +3529,9 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
*dev().info().hdpMemFlushCntl = 1u;
auto kSentinel = *reinterpret_cast<volatile int*>(dev().info().hdpMemFlushCntl);
} else if (kernArgImpl == KernelArgImpl::DeviceKernelArgsReadback && argSize != 0) {
_mm_sfence();
simde_mm_sfence();
*(argBuffer + argSize - 1) = *(parameters + argSize - 1);
_mm_mfence();
simde_mm_mfence();
auto kSentinel = *reinterpret_cast<volatile unsigned char*>(argBuffer + argSize - 1);
}
}
Expand Down
14 changes: 3 additions & 11 deletions rocclr/os/os.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,8 @@
#include <time.h>
#include <unistd.h>
#endif // !_WIN32

#if defined(ATI_ARCH_X86)
#include <xmmintrin.h> // for _mm_pause
#endif // ATI_ARCH_X86
#include <cmath>
#include <simde/x86/sse2.h>

namespace amd {

Expand Down Expand Up @@ -120,13 +118,7 @@ size_t Os::pageSize_ = 0;

int Os::processorCount_ = 0;

void Os::spinPause() {
#if defined(ATI_ARCH_X86)
_mm_pause();
#elif defined(ATI_ARCH_ARM)
__asm__ __volatile__("yield");
#endif
}
void Os::spinPause() { simde_mm_pause(); }

void Os::sleep(long n) {
// FIXME_lmoriche: Should be nano-seconds not seconds.
Expand Down
Loading