Skip to content

Commit

Permalink
Add support for ARM dot product instructions
Browse files Browse the repository at this point in the history
  • Loading branch information
UniQP committed Feb 20, 2023
1 parent 037ef3e commit 655ceb0
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 1 deletion.
19 changes: 18 additions & 1 deletion src/Makefile
Expand Up @@ -95,6 +95,7 @@ VPATH = syzygy:nnue:nnue/features
# vnni256 = yes/no --- -mavx512vnni --- Use Intel Vector Neural Network Instructions 256
# vnni512 = yes/no --- -mavx512vnni --- Use Intel Vector Neural Network Instructions 512
# neon = yes/no --- -DUSE_NEON --- Use ARM SIMD architecture
# dotprod = yes/no --- -DUSE_DOTPROD --- Use ARM advanced SIMD Int8 dot product instructions
#
# Note that Makefile is space sensitive, so when adding new architectures
# or modifying existing flags, you have to make sure there are no extra spaces
Expand All @@ -116,7 +117,7 @@ ifeq ($(ARCH), $(filter $(ARCH), \
x86-64-vnni512 x86-64-vnni256 x86-64-avx512 x86-64-avxvnni x86-64-bmi2 \
x86-64-avx2 x86-64-sse41-popcnt x86-64-modern x86-64-ssse3 x86-64-sse3-popcnt \
x86-64 x86-32-sse41-popcnt x86-32-sse2 x86-32 ppc-64 ppc-32 e2k \
armv7 armv7-neon armv8 apple-silicon general-64 general-32 riscv64))
armv7 armv7-neon armv8 armv8-dotprod apple-silicon general-64 general-32 riscv64))
SUPPORTED_ARCH=true
else
SUPPORTED_ARCH=false
Expand All @@ -140,6 +141,7 @@ avx512 = no
vnni256 = no
vnni512 = no
neon = no
dotprod = no
arm_version = 0
STRIP = strip

Expand Down Expand Up @@ -308,11 +310,21 @@ ifeq ($(ARCH),armv8)
arm_version = 8
endif

# armv8-dotprod: same as the armv8 target (64-bit, prefetch, popcnt, neon)
# but additionally sets dotprod=yes, which later adds
# -march=armv8.2-a+dotprod -DUSE_DOTPROD to CXXFLAGS.
ifeq ($(ARCH),armv8-dotprod)
arch = armv8
prefetch = yes
popcnt = yes
neon = yes
dotprod = yes
arm_version = 8
endif

# apple-silicon: Apple ARM64 target. Enables dotprod unconditionally,
# since this commit treats Apple silicon as supporting the int8 dot
# product instructions alongside neon.
ifeq ($(ARCH),apple-silicon)
arch = arm64
prefetch = yes
popcnt = yes
neon = yes
dotprod = yes
arm_version = 8
endif

Expand Down Expand Up @@ -675,6 +687,10 @@ ifeq ($(neon),yes)
endif
endif

# When dotprod is enabled, target the ARMv8.2-A dot product extension and
# define USE_DOTPROD so the NNUE layers compile their vdotq_s32 code paths.
ifeq ($(dotprod),yes)
CXXFLAGS += -march=armv8.2-a+dotprod -DUSE_DOTPROD
endif

### 3.7 pext
ifeq ($(pext),yes)
CXXFLAGS += -DUSE_PEXT
Expand Down Expand Up @@ -776,6 +792,7 @@ help:
@echo "armv7 > ARMv7 32-bit"
@echo "armv7-neon > ARMv7 32-bit with popcnt and neon"
@echo "armv8 > ARMv8 64-bit with popcnt and neon"
@echo "armv8-dotprod > ARMv8 64-bit with popcnt, neon and dot product support"
@echo "e2k > Elbrus 2000"
@echo "apple-silicon > Apple silicon ARM64"
@echo "general-64 > unspecified 64-bit"
Expand Down
24 changes: 24 additions & 0 deletions src/nnue/layers/affine_transform.h
Expand Up @@ -72,6 +72,10 @@ namespace Stockfish::Eval::NNUE::Layers {
const __m64 Zeros = _mm_setzero_si64();
const auto inputVector = reinterpret_cast<const __m64*>(input);

# elif defined(USE_DOTPROD)
constexpr IndexType NumChunks = ceil_to_multiple<IndexType>(InputDimensions, 16) / 16;
const auto inputVector = reinterpret_cast<const int8x16_t*>(input);

# elif defined(USE_NEON)
constexpr IndexType NumChunks = ceil_to_multiple<IndexType>(InputDimensions, 16) / 16;
const auto inputVector = reinterpret_cast<const int8x8_t*>(input);
Expand Down Expand Up @@ -123,6 +127,14 @@ namespace Stockfish::Eval::NNUE::Layers {
sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum));
output[i] = _mm_cvtsi64_si32(sum);

# elif defined(USE_DOTPROD)
int32x4_t sum = {biases[i]};
const auto row = reinterpret_cast<const int8x16_t*>(&weights[offset]);
for (IndexType j = 0; j < NumChunks; ++j) {
sum = vdotq_s32(sum, inputVector[j], row[j]);
}
output[i] = vaddvq_s32(sum);

# elif defined(USE_NEON)
int32x4_t sum = {biases[i]};
const auto row = reinterpret_cast<const int8x8_t*>(&weights[offset]);
Expand Down Expand Up @@ -187,6 +199,9 @@ namespace Stockfish::Eval::NNUE::Layers {
#elif defined (USE_SSSE3)
static constexpr IndexType InputSimdWidth = 16;
static constexpr IndexType MaxNumOutputRegs = 8;
#elif defined (USE_DOTPROD)
static constexpr IndexType InputSimdWidth = 16;
static constexpr IndexType MaxNumOutputRegs = 8;
#elif defined (USE_NEON)
static constexpr IndexType InputSimdWidth = 8;
static constexpr IndexType MaxNumOutputRegs = 8;
Expand Down Expand Up @@ -292,6 +307,15 @@ namespace Stockfish::Eval::NNUE::Layers {
#define vec_add_dpbusd_32x2 Simd::m128_add_dpbusd_epi32x2
#define vec_hadd Simd::m128_hadd
#define vec_haddx4 Simd::m128_haddx4
#elif defined (USE_DOTPROD)
using acc_vec_t = int32x4_t;
using bias_vec_t = int32x4_t;
using weight_vec_t = int8x16_t;
using in_vec_t = int8x16_t;
#define vec_zero {0}
#define vec_add_dpbusd_32x2 Simd::dotprod_m128_add_dpbusd_epi32x2
#define vec_hadd Simd::neon_m128_hadd
#define vec_haddx4 Simd::neon_m128_haddx4
#elif defined (USE_NEON)
using acc_vec_t = int32x4_t;
using bias_vec_t = int32x4_t;
Expand Down
13 changes: 13 additions & 0 deletions src/nnue/layers/simd.h
Expand Up @@ -346,6 +346,19 @@ namespace Stockfish::Simd {

#endif

#if defined (USE_DOTPROD)

// Accumulate two int8 dot products into a 128-bit int32 accumulator using
// the ARMv8.2 SDOT instruction (vdotq_s32): each 32-bit lane of acc gains
// the sum of four adjacent int8 products from (a0, b0) and then (a1, b1).
[[maybe_unused]] static void dotprod_m128_add_dpbusd_epi32x2(
    int32x4_t& acc,
    int8x16_t a0, int8x16_t b0,
    int8x16_t a1, int8x16_t b1) {

    // Chain the two accumulating dot products in a single expression;
    // identical to updating acc twice in sequence.
    acc = vdotq_s32(vdotq_s32(acc, a0, b0), a1, b1);
}

#endif

#if defined (USE_NEON)

[[maybe_unused]] static int neon_m128_reduce_add_epi32(int32x4_t s) {
Expand Down

0 comments on commit 655ceb0

Please sign in to comment.