Skip to content

Commit

Permalink
Only translate SSE4.1 to VSX if _mm_packus_epi32 available
Browse files Browse the repository at this point in the history
_mm_packus_epi32 was added in GCC v12.1.
  • Loading branch information
Jeremy Rand committed Jun 17, 2023
1 parent 14fa906 commit 1f96f60
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 1 deletion.
12 changes: 12 additions & 0 deletions CMakeLists.txt
Expand Up @@ -361,6 +361,18 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)")
set(NCNN_SSE2 ON)
endif()

# Probe whether <smmintrin.h> provides _mm_packus_epi32 when compiled with the
# same defines that src/CMakeLists.txt adds for the SSE4.1 -> VSX translation.
# The result is stored in NCNN_COMPILER_SUPPORT_PPC64LE_SSE4_1.
#
# The check state is pushed/popped so that any CMAKE_REQUIRED_FLAGS value set by
# an enclosing scope is restored afterwards instead of being discarded.
include(CMakePushCheckState)
cmake_push_check_state()
set(CMAKE_REQUIRED_FLAGS "-DNO_WARN_X86_INTRINSICS -D__SSE4_1__")
check_cxx_source_compiles("#include <smmintrin.h>\nint main() { __m128i _v, _a, _b; _v = _mm_packus_epi32(_a, _b); return 0; }" NCNN_COMPILER_SUPPORT_PPC64LE_SSE4_1)
cmake_pop_check_state()

if(NCNN_COMPILER_SUPPORT_PPC64LE_SSE4_1)
    # The SSE4.1 translation is only offered when VSX is enabled.
    if(NCNN_VSX)
        option(NCNN_SSE4_1 "optimize ppc64le platform with sse4.1 extension" ON)
    endif()
else()
    message(WARNING "The compiler does not support sse4.1 extension. NCNN_SSE4_1 will be OFF.")
    # Make the warning true: shadow any user-supplied or cached NCNN_SSE4_1=ON
    # so that later if(NCNN_SSE4_1) tests do not enable an unsupported code path.
    set(NCNN_SSE4_1 OFF)
endif()

check_cxx_compiler_flag("-mcpu=native -mtune=native" NCNN_COMPILER_SUPPORT_CPU_NATIVE)
if(NCNN_COMPILER_SUPPORT_CPU_NATIVE)
set(NCNN_PPC64LE_DEFAULT_MACHINE_TYPE "native")
Expand Down
7 changes: 6 additions & 1 deletion src/CMakeLists.txt
Expand Up @@ -513,7 +513,12 @@ if(NCNN_TARGET_ARCH STREQUAL "powerpc")
# Auto-translate SSE2 to VSX
if(NCNN_SSE2)
    # Don't define __SSE3__ because it triggers usage of _MM_DENORMALS_ZERO_OFF, which isn't supported on ppc64le yet.
    # __SSE4_1__ is deliberately not defined here; it is added only when
    # NCNN_SSE4_1 is enabled, so it must not leak in through the SSE2 path.
    target_compile_definitions(ncnn PRIVATE
        NO_WARN_X86_INTRINSICS
        __MMX__
        __SSE__
        __SSE2__
        __SSSE3__
    )
endif()

# Auto-translate SSE4.1 to VSX
if(NCNN_SSE4_1)
    # NO_WARN_X86_INTRINSICS is repeated here so this branch carries the define
    # on its own, independently of the NCNN_SSE2 branch.
    target_compile_definitions(ncnn PRIVATE
        NO_WARN_X86_INTRINSICS
        __SSE4_1__
    )
endif()
endif()

Expand Down

0 comments on commit 1f96f60

Please sign in to comment.