Skip to content

Commit

Permalink
Add support for ARM dot product instructions
Browse files Browse the repository at this point in the history
  • Loading branch information
UniQP committed Feb 20, 2023
1 parent 037ef3e commit 655ceb0
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 1 deletion.
19 changes: 18 additions & 1 deletion src/Makefile
Expand Up @@ -95,6 +95,7 @@ VPATH = syzygy:nnue:nnue/features
# vnni256 = yes/no --- -mavx512vnni --- Use Intel Vector Neural Network Instructions 256
# vnni512 = yes/no --- -mavx512vnni --- Use Intel Vector Neural Network Instructions 512
# neon = yes/no --- -DUSE_NEON --- Use ARM SIMD architecture
# dotprod = yes/no --- -DUSE_DOTPROD --- Use ARM advanced SIMD Int8 dot product instructions
#
# Note that Makefile is space sensitive, so when adding new architectures
# or modifying existing flags, you have to make sure there are no extra spaces
Expand All @@ -116,7 +117,7 @@ ifeq ($(ARCH), $(filter $(ARCH), \
x86-64-vnni512 x86-64-vnni256 x86-64-avx512 x86-64-avxvnni x86-64-bmi2 \
x86-64-avx2 x86-64-sse41-popcnt x86-64-modern x86-64-ssse3 x86-64-sse3-popcnt \
x86-64 x86-32-sse41-popcnt x86-32-sse2 x86-32 ppc-64 ppc-32 e2k \
armv7 armv7-neon armv8 apple-silicon general-64 general-32 riscv64))
armv7 armv7-neon armv8 armv8-dotprod apple-silicon general-64 general-32 riscv64))
SUPPORTED_ARCH=true
else
SUPPORTED_ARCH=false
Expand All @@ -140,6 +141,7 @@ avx512 = no
vnni256 = no
vnni512 = no
neon = no
dotprod = no
arm_version = 0
STRIP = strip

Expand Down Expand Up @@ -308,11 +310,21 @@ ifeq ($(ARCH),armv8)
arm_version = 8
endif

# armv8-dotprod: same as the armv8 target (64-bit, prefetch, popcnt, neon)
# but additionally sets dotprod=yes, which later adds
# -march=armv8.2-a+dotprod -DUSE_DOTPROD to CXXFLAGS.
ifeq ($(ARCH),armv8-dotprod)
arch = armv8
prefetch = yes
popcnt = yes
neon = yes
dotprod = yes
arm_version = 8
endif

# apple-silicon: Apple ARM64 target. Enables dotprod unconditionally,
# since this commit treats Apple silicon as supporting the int8 dot
# product instructions alongside neon.
ifeq ($(ARCH),apple-silicon)
arch = arm64
prefetch = yes
popcnt = yes
neon = yes
dotprod = yes
arm_version = 8
endif

Expand Down Expand Up @@ -675,6 +687,10 @@ ifeq ($(neon),yes)
endif
endif

# When dotprod is enabled, target the ARMv8.2-A dot product extension and
# define USE_DOTPROD so the NNUE layers compile their vdotq_s32 code paths.
ifeq ($(dotprod),yes)
CXXFLAGS += -march=armv8.2-a+dotprod -DUSE_DOTPROD
endif

### 3.7 pext
ifeq ($(pext),yes)
CXXFLAGS += -DUSE_PEXT
Expand Down Expand Up @@ -776,6 +792,7 @@ help:
@echo "armv7 > ARMv7 32-bit"
@echo "armv7-neon > ARMv7 32-bit with popcnt and neon"
@echo "armv8 > ARMv8 64-bit with popcnt and neon"
@echo "armv8-dotprod > ARMv8 64-bit with popcnt, neon and dot product support"
@echo "e2k > Elbrus 2000"
@echo "apple-silicon > Apple silicon ARM64"
@echo "general-64 > unspecified 64-bit"
Expand Down
24 changes: 24 additions & 0 deletions src/nnue/layers/affine_transform.h
Expand Up @@ -72,6 +72,10 @@ namespace Stockfish::Eval::NNUE::Layers {
const __m64 Zeros = _mm_setzero_si64();
const auto inputVector = reinterpret_cast<const __m64*>(input);

# elif defined(USE_DOTPROD)
constexpr IndexType NumChunks = ceil_to_multiple<IndexType>(InputDimensions, 16) / 16;
const auto inputVector = reinterpret_cast<const int8x16_t*>(input);

# elif defined(USE_NEON)
constexpr IndexType NumChunks = ceil_to_multiple<IndexType>(InputDimensions, 16) / 16;
const auto inputVector = reinterpret_cast<const int8x8_t*>(input);
Expand Down Expand Up @@ -123,6 +127,14 @@ namespace Stockfish::Eval::NNUE::Layers {
sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum));
output[i] = _mm_cvtsi64_si32(sum);

# elif defined(USE_DOTPROD)
int32x4_t sum = {biases[i]};
const auto row = reinterpret_cast<const int8x16_t*>(&weights[offset]);
for (IndexType j = 0; j < NumChunks; ++j) {
sum = vdotq_s32(sum, inputVector[j], row[j]);
}
output[i] = vaddvq_s32(sum);

# elif defined(USE_NEON)
int32x4_t sum = {biases[i]};
const auto row = reinterpret_cast<const int8x8_t*>(&weights[offset]);
Expand Down Expand Up @@ -187,6 +199,9 @@ namespace Stockfish::Eval::NNUE::Layers {
#elif defined (USE_SSSE3)
static constexpr IndexType InputSimdWidth = 16;
static constexpr IndexType MaxNumOutputRegs = 8;
#elif defined (USE_DOTPROD)
static constexpr IndexType InputSimdWidth = 16;
static constexpr IndexType MaxNumOutputRegs = 8;
#elif defined (USE_NEON)
static constexpr IndexType InputSimdWidth = 8;
static constexpr IndexType MaxNumOutputRegs = 8;
Expand Down Expand Up @@ -292,6 +307,15 @@ namespace Stockfish::Eval::NNUE::Layers {
#define vec_add_dpbusd_32x2 Simd::m128_add_dpbusd_epi32x2
#define vec_hadd Simd::m128_hadd
#define vec_haddx4 Simd::m128_haddx4
#elif defined (USE_DOTPROD)
using acc_vec_t = int32x4_t;
using bias_vec_t = int32x4_t;
using weight_vec_t = int8x16_t;
using in_vec_t = int8x16_t;
#define vec_zero {0}
#define vec_add_dpbusd_32x2 Simd::dotprod_m128_add_dpbusd_epi32x2
#define vec_hadd Simd::neon_m128_hadd
#define vec_haddx4 Simd::neon_m128_haddx4
#elif defined (USE_NEON)
using acc_vec_t = int32x4_t;
using bias_vec_t = int32x4_t;
Expand Down
13 changes: 13 additions & 0 deletions src/nnue/layers/simd.h
Expand Up @@ -346,6 +346,19 @@ namespace Stockfish::Simd {

#endif

#if defined (USE_DOTPROD)

// Accumulate two int8 dot products into a 128-bit int32 accumulator using
// the ARMv8.2 SDOT instruction (vdotq_s32): each 32-bit lane of acc gains
// the sum of four adjacent int8 products from (a0, b0) and then (a1, b1).
[[maybe_unused]] static void dotprod_m128_add_dpbusd_epi32x2(
    int32x4_t& acc,
    int8x16_t a0, int8x16_t b0,
    int8x16_t a1, int8x16_t b1) {

    // Chain the two accumulating dot products in a single expression;
    // identical to updating acc twice in sequence.
    acc = vdotq_s32(vdotq_s32(acc, a0, b0), a1, b1);
}

#endif

#if defined (USE_NEON)

[[maybe_unused]] static int neon_m128_reduce_add_epi32(int32x4_t s) {
Expand Down

0 comments on commit 655ceb0

Please sign in to comment.