4 changes: 4 additions & 0 deletions cmake/aarch64InstructionFlags.cmake
@@ -6,6 +6,7 @@ message(STATUS "Building for ARM aarch64")
# Check what compiler flags are supported
CHECK_CXX_COMPILER_FLAG("-march=armv7-a+neon" CXX_ARMV7_NEON)
CHECK_CXX_COMPILER_FLAG("-march=armv8-a" CXX_ARMV8A)
CHECK_CXX_COMPILER_FLAG("-march=armv8.2-a+dotprod" CXX_NEON_DOTPROD)
CHECK_CXX_COMPILER_FLAG("-march=armv8-a+sve" CXX_SVE)
CHECK_CXX_COMPILER_FLAG("-march=armv9-a+sve2" CXX_SVE2)
CHECK_CXX_COMPILER_FLAG("-march=armv8.2-a+fp16fml" CXX_NEON_HP)
@@ -17,6 +18,9 @@ if(CXX_SVE2)
message(STATUS "Using ARMv9.0-a with SVE2 (supported by CPU)")
add_compile_definitions(OPT_SVE2)
endif()
if (CXX_NEON_DOTPROD)
add_compile_definitions(OPT_NEON_DOTPROD)
endif()
if (CXX_ARMV8A OR CXX_ARMV7_NEON)
message(STATUS "Using ARMv8.0-a with NEON")
add_compile_definitions(OPT_NEON)
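The OPT_NEON_DOTPROD definition added above only records that the compiler accepts -march=armv8.2-a+dotprod; whether the CPU running the binary actually implements the extension still needs a runtime check before the dotprod kernels are dispatched. A minimal sketch of such a check on Linux, assuming getauxval-based detection (this PR does not show the library's actual dispatch code):

#if defined(__aarch64__) && defined(__linux__)
#include <sys/auxv.h>  // getauxval, AT_HWCAP
#include <asm/hwcap.h> // HWCAP_ASIMDDP
#endif

// True if the running CPU implements the Armv8.2 dot-product extension;
// conservatively false on platforms where we cannot detect it.
static bool cpu_has_dotprod() {
#if defined(__aarch64__) && defined(__linux__) && defined(HWCAP_ASIMDDP)
    return (getauxval(AT_HWCAP) & HWCAP_ASIMDDP) != 0;
#else
    return false;
#endif
}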
5 changes: 5 additions & 0 deletions src/VecSim/spaces/CMakeLists.txt
@@ -85,6 +85,11 @@ if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(aarch64)|(arm64)|(ARM64)|(armv.*)")
include(${root}/cmake/aarch64InstructionFlags.cmake)

# Create different optimization implementations for ARM architecture
if (CXX_NEON_DOTPROD)
message("Building with ARMV8.2 with dotprod")
set_source_files_properties(functions/NEON_DOTPROD.cpp PROPERTIES COMPILE_FLAGS "-march=armv8.2-a+dotprod")
list(APPEND OPTIMIZATIONS functions/NEON_DOTPROD.cpp)
endif()
if (CXX_ARMV8A)
message("Building with ARMV8A")
set_source_files_properties(functions/NEON.cpp PROPERTIES COMPILE_FLAGS "-march=armv8-a")
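The per-source COMPILE_FLAGS property above is what lets a single translation unit be built with +dotprod while the rest of the library keeps the baseline -march; the dotprod intrinsics must therefore stay confined to that file. A hypothetical sketch of what functions/NEON_DOTPROD.cpp might contain (its contents are not part of this diff):

#include "VecSim/spaces/IP/IP_NEON_DOTPROD_INT8.h"
#include "VecSim/spaces/IP/IP_NEON_DOTPROD_UINT8.h"

// Illustrative explicit instantiations for a single residual value; the
// real file presumably covers all residuals 0..63 through the library's
// dispatch machinery.
template float INT8_InnerProductSIMD16_NEON_DOTPROD<0>(const void *, const void *, size_t);
template float UINT8_InnerProductSIMD16_NEON_DOTPROD<0>(const void *, const void *, size_t);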
119 changes: 119 additions & 0 deletions src/VecSim/spaces/IP/IP_NEON_DOTPROD_INT8.h
@@ -0,0 +1,119 @@
/*
 * Copyright Redis Ltd. 2021 - present
 * Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
 * the Server Side Public License v1 (SSPLv1).
 */

#include "VecSim/spaces/space_includes.h"
#include <arm_neon.h>

__attribute__((always_inline)) static inline void InnerProductOp(int8x16_t &v1, int8x16_t &v2,
int32x4_t &sum) {
sum = vdotq_s32(sum, v1, v2);
}

__attribute__((always_inline)) static inline void InnerProductStep(int8_t *&pVect1, int8_t *&pVect2,
int32x4_t &sum) {
// Load 16 int8 elements (16 bytes) into NEON registers
int8x16_t v1 = vld1q_s8(pVect1);
int8x16_t v2 = vld1q_s8(pVect2);

InnerProductOp(v1, v2, sum);

pVect1 += 16;
pVect2 += 16;
}

template <unsigned char residual> // 0..63
float INT8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) {
int8_t *pVect1 = (int8_t *)pVect1v;
int8_t *pVect2 = (int8_t *)pVect2v;

// Initialize multiple sum accumulators for better parallelism
int32x4_t sum0 = vdupq_n_s32(0);
int32x4_t sum1 = vdupq_n_s32(0);

constexpr size_t final_residual = residual % 16;
if constexpr (final_residual > 0) {
// Define a compile-time constant mask based on final_residual
constexpr uint8x16_t mask = {
0xFF,
(final_residual >= 2) ? 0xFF : 0,
(final_residual >= 3) ? 0xFF : 0,
(final_residual >= 4) ? 0xFF : 0,
(final_residual >= 5) ? 0xFF : 0,
(final_residual >= 6) ? 0xFF : 0,
(final_residual >= 7) ? 0xFF : 0,
(final_residual >= 8) ? 0xFF : 0,
(final_residual >= 9) ? 0xFF : 0,
(final_residual >= 10) ? 0xFF : 0,
(final_residual >= 11) ? 0xFF : 0,
(final_residual >= 12) ? 0xFF : 0,
(final_residual >= 13) ? 0xFF : 0,
(final_residual >= 14) ? 0xFF : 0,
(final_residual >= 15) ? 0xFF : 0,
0,
};

// Load data directly from input vectors
int8x16_t v1 = vld1q_s8(pVect1);
int8x16_t v2 = vld1q_s8(pVect2);

// Zero vector for replacement
int8x16_t zeros = vdupq_n_s8(0);

// Apply bit select to zero out irrelevant elements
v1 = vbslq_s8(mask, v1, zeros);
v2 = vbslq_s8(mask, v2, zeros);
InnerProductOp(v1, v2, sum0);
pVect1 += final_residual;
pVect2 += final_residual;
}

// Process 64 elements at a time in the main loop
const size_t num_of_chunks = dimension / 64;

for (size_t i = 0; i < num_of_chunks; i++) {
InnerProductStep(pVect1, pVect2, sum0);
InnerProductStep(pVect1, pVect2, sum1);
InnerProductStep(pVect1, pVect2, sum0);
InnerProductStep(pVect1, pVect2, sum1);
}

constexpr size_t residual_chunks = residual / 16;

if constexpr (residual_chunks > 0) {
if constexpr (residual_chunks >= 1) {
InnerProductStep(pVect1, pVect2, sum0);
}
if constexpr (residual_chunks >= 2) {
InnerProductStep(pVect1, pVect2, sum1);
}
if constexpr (residual_chunks >= 3) {
InnerProductStep(pVect1, pVect2, sum0);
}
}

// Combine the two sum accumulators
int32x4_t total_sum = vaddq_s32(sum0, sum1);
// Horizontal sum of the 4 elements in the combined sum register
int32_t result = vaddvq_s32(total_sum);

return static_cast<float>(result);
}

template <unsigned char residual> // 0..63
float INT8_InnerProductSIMD16_NEON_DOTPROD(const void *pVect1v, const void *pVect2v,
size_t dimension) {
return 1.0f - INT8_InnerProductImp<residual>(pVect1v, pVect2v, dimension);
}

template <unsigned char residual> // 0..63
float INT8_CosineSIMD_NEON_DOTPROD(const void *pVect1v, const void *pVect2v, size_t dimension) {
float ip = INT8_InnerProductImp<residual>(pVect1v, pVect2v, dimension);
float norm_v1 =
*reinterpret_cast<const float *>(static_cast<const uint8_t *>(pVect1v) + dimension);
float norm_v2 =
*reinterpret_cast<const float *>(static_cast<const uint8_t *>(pVect2v) + dimension);
return 1.0f - ip / (norm_v1 * norm_v2);
}
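For sanity-checking the kernel above on any machine, here is a portable scalar reference that computes the same int8 inner-product distance; this helper is illustrative and not part of the PR:

#include <cstddef>
#include <cstdint>

// Scalar reference: 1 - dot(v1, v2) over `dimension` int8 elements.
// int32 accumulation is safe: each product fits in 16 bits, so overflow
// would require dimensions far beyond realistic vector sizes.
static float INT8_InnerProduct_Scalar(const void *pVect1v, const void *pVect2v,
                                      size_t dimension) {
    const int8_t *v1 = static_cast<const int8_t *>(pVect1v);
    const int8_t *v2 = static_cast<const int8_t *>(pVect2v);
    int32_t dot = 0;
    for (size_t i = 0; i < dimension; i++)
        dot += static_cast<int32_t>(v1[i]) * static_cast<int32_t>(v2[i]);
    return 1.0f - static_cast<float>(dot);
}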
116 changes: 116 additions & 0 deletions src/VecSim/spaces/IP/IP_NEON_DOTPROD_UINT8.h
@@ -0,0 +1,116 @@
/*
 * Copyright Redis Ltd. 2021 - present
 * Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
 * the Server Side Public License v1 (SSPLv1).
 */

#include "VecSim/spaces/space_includes.h"
#include <arm_neon.h>

__attribute__((always_inline)) static inline void InnerProductOp(uint8x16_t &v1, uint8x16_t &v2,
uint32x4_t &sum) {
sum = vdotq_u32(sum, v1, v2);
}

__attribute__((always_inline)) static inline void
InnerProductStep(uint8_t *&pVect1, uint8_t *&pVect2, uint32x4_t &sum) {
// Load 16 uint8 elements (16 bytes) into NEON registers
uint8x16_t v1 = vld1q_u8(pVect1);
uint8x16_t v2 = vld1q_u8(pVect2);
InnerProductOp(v1, v2, sum);

pVect1 += 16;
pVect2 += 16;
}

template <unsigned char residual> // 0..63
float UINT8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) {
uint8_t *pVect1 = (uint8_t *)pVect1v;
uint8_t *pVect2 = (uint8_t *)pVect2v;

// Initialize multiple sum accumulators for better parallelism
uint32x4_t sum0 = vdupq_n_u32(0);
uint32x4_t sum1 = vdupq_n_u32(0);

constexpr size_t final_residual = residual % 16;
if constexpr (final_residual > 0) {
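// Define a compile-time constant mask based on final_residual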
constexpr uint8x16_t mask = {
0xFF,
(final_residual >= 2) ? 0xFF : 0,
(final_residual >= 3) ? 0xFF : 0,
(final_residual >= 4) ? 0xFF : 0,
(final_residual >= 5) ? 0xFF : 0,
(final_residual >= 6) ? 0xFF : 0,
(final_residual >= 7) ? 0xFF : 0,
(final_residual >= 8) ? 0xFF : 0,
(final_residual >= 9) ? 0xFF : 0,
(final_residual >= 10) ? 0xFF : 0,
(final_residual >= 11) ? 0xFF : 0,
(final_residual >= 12) ? 0xFF : 0,
(final_residual >= 13) ? 0xFF : 0,
(final_residual >= 14) ? 0xFF : 0,
(final_residual >= 15) ? 0xFF : 0,
0,
};

// Load data directly from input vectors
uint8x16_t v1 = vld1q_u8(pVect1);
uint8x16_t v2 = vld1q_u8(pVect2);

// Zero vector for replacement
uint8x16_t zeros = vdupq_n_u8(0);

// Apply bit select to zero out irrelevant elements
v1 = vbslq_u8(mask, v1, zeros);
v2 = vbslq_u8(mask, v2, zeros);
InnerProductOp(v1, v2, sum1);
pVect1 += final_residual;
pVect2 += final_residual;
}

// Process 64 elements at a time in the main loop
const size_t num_of_chunks = dimension / 64;

for (size_t i = 0; i < num_of_chunks; i++) {
InnerProductStep(pVect1, pVect2, sum0);
InnerProductStep(pVect1, pVect2, sum1);
InnerProductStep(pVect1, pVect2, sum0);
InnerProductStep(pVect1, pVect2, sum1);
}

constexpr size_t residual_chunks = residual / 16;

if constexpr (residual_chunks > 0) {
if constexpr (residual_chunks >= 1) {
InnerProductStep(pVect1, pVect2, sum0);
}
if constexpr (residual_chunks >= 2) {
InnerProductStep(pVect1, pVect2, sum1);
}
if constexpr (residual_chunks >= 3) {
InnerProductStep(pVect1, pVect2, sum0);
}
}

// Combine the two sum accumulators
uint32x4_t total_sum = vaddq_u32(sum0, sum1);
// Horizontal sum of the 4 lanes in the combined sum register
uint32_t result = vaddvq_u32(total_sum);

return static_cast<float>(result);
}

template <unsigned char residual> // 0..63
float UINT8_InnerProductSIMD16_NEON_DOTPROD(const void *pVect1v, const void *pVect2v,
size_t dimension) {
return 1.0f - UINT8_InnerProductImp<residual>(pVect1v, pVect2v, dimension);
}

template <unsigned char residual> // 0..63
float UINT8_CosineSIMD_NEON_DOTPROD(const void *pVect1v, const void *pVect2v, size_t dimension) {
float ip = UINT8_InnerProductImp<residual>(pVect1v, pVect2v, dimension);
float norm_v1 =
*reinterpret_cast<const float *>(static_cast<const uint8_t *>(pVect1v) + dimension);
float norm_v2 =
*reinterpret_cast<const float *>(static_cast<const uint8_t *>(pVect2v) + dimension);
return 1.0f - ip / (norm_v1 * norm_v2);
}
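The cosine variants above assume a specific storage layout: `dimension` element bytes followed immediately by a precomputed float norm. A minimal sketch of building such a blob, assuming the stored value is the L2 norm of the elements (the exact value the library precomputes is not shown in this diff):

#include <cmath>
#include <cstdint>
#include <cstring>
#include <vector>

// Lay out `dimension` uint8 elements followed by a 4-byte float norm,
// matching the offset arithmetic in UINT8_CosineSIMD_NEON_DOTPROD.
std::vector<uint8_t> make_uint8_blob(const std::vector<uint8_t> &elems) {
    std::vector<uint8_t> blob(elems.size() + sizeof(float));
    std::memcpy(blob.data(), elems.data(), elems.size());
    float norm = 0.0f;
    for (uint8_t e : elems)
        norm += static_cast<float>(e) * static_cast<float>(e);
    norm = std::sqrt(norm);
    std::memcpy(blob.data() + elems.size(), &norm, sizeof(float));
    return blob;
}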
125 changes: 125 additions & 0 deletions src/VecSim/spaces/IP/IP_NEON_INT8.h
@@ -0,0 +1,125 @@
/*
 * Copyright Redis Ltd. 2021 - present
 * Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
 * the Server Side Public License v1 (SSPLv1).
 */

#include "VecSim/spaces/space_includes.h"
#include <arm_neon.h>

__attribute__((always_inline)) static inline void InnerProductOp(int8x16_t &v1, int8x16_t &v2,
int32x4_t &sum) {
// Multiply low 8 elements (first half)
int16x8_t prod_low = vmull_s8(vget_low_s8(v1), vget_low_s8(v2));

// Multiply high 8 elements (second half) using vmull_high_s8
int16x8_t prod_high = vmull_high_s8(v1, v2);

// Pairwise add adjacent elements to 32-bit accumulators
sum = vpadalq_s16(sum, prod_low);
sum = vpadalq_s16(sum, prod_high);
}

__attribute__((always_inline)) static inline void InnerProductStep(int8_t *&pVect1, int8_t *&pVect2,
int32x4_t &sum) {
// Load 16 int8 elements (16 bytes) into NEON registers
int8x16_t v1 = vld1q_s8(pVect1);
int8x16_t v2 = vld1q_s8(pVect2);
InnerProductOp(v1, v2, sum);

pVect1 += 16;
pVect2 += 16;
}

template <unsigned char residual> // 0..63
float INT8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) {
int8_t *pVect1 = (int8_t *)pVect1v;
int8_t *pVect2 = (int8_t *)pVect2v;

// Initialize multiple sum accumulators for better parallelism
int32x4_t sum0 = vdupq_n_s32(0);
int32x4_t sum1 = vdupq_n_s32(0);

constexpr size_t final_residual = residual % 16;
if constexpr (final_residual > 0) {
// Define a compile-time constant mask based on final_residual
constexpr uint8x16_t mask = {
0xFF,
(final_residual >= 2) ? 0xFF : 0,
(final_residual >= 3) ? 0xFF : 0,
(final_residual >= 4) ? 0xFF : 0,
(final_residual >= 5) ? 0xFF : 0,
(final_residual >= 6) ? 0xFF : 0,
(final_residual >= 7) ? 0xFF : 0,
(final_residual >= 8) ? 0xFF : 0,
(final_residual >= 9) ? 0xFF : 0,
(final_residual >= 10) ? 0xFF : 0,
(final_residual >= 11) ? 0xFF : 0,
(final_residual >= 12) ? 0xFF : 0,
(final_residual >= 13) ? 0xFF : 0,
(final_residual >= 14) ? 0xFF : 0,
(final_residual >= 15) ? 0xFF : 0,
0,
};

// Load data directly from input vectors
int8x16_t v1 = vld1q_s8(pVect1);
int8x16_t v2 = vld1q_s8(pVect2);

// Zero vector for replacement
int8x16_t zeros = vdupq_n_s8(0);

// Apply bit select to zero out irrelevant elements
v1 = vbslq_s8(mask, v1, zeros);
v2 = vbslq_s8(mask, v2, zeros);
InnerProductOp(v1, v2, sum0);
pVect1 += final_residual;
pVect2 += final_residual;
}

// Process 64 elements at a time in the main loop
const size_t num_of_chunks = dimension / 64;

for (size_t i = 0; i < num_of_chunks; i++) {
InnerProductStep(pVect1, pVect2, sum0);
InnerProductStep(pVect1, pVect2, sum1);
InnerProductStep(pVect1, pVect2, sum0);
InnerProductStep(pVect1, pVect2, sum1);
}

constexpr size_t residual_chunks = residual / 16;

if constexpr (residual_chunks > 0) {
if constexpr (residual_chunks >= 1) {
InnerProductStep(pVect1, pVect2, sum0);
}
if constexpr (residual_chunks >= 2) {
InnerProductStep(pVect1, pVect2, sum1);
}
if constexpr (residual_chunks >= 3) {
InnerProductStep(pVect1, pVect2, sum0);
}
}

// Combine the two sum accumulators
int32x4_t total_sum = vaddq_s32(sum0, sum1);
// Horizontal sum of the 4 elements in the combined sum register
int32_t result = vaddvq_s32(total_sum);

return static_cast<float>(result);
}

template <unsigned char residual> // 0..63
float INT8_InnerProductSIMD16_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) {
return 1.0f - INT8_InnerProductImp<residual>(pVect1v, pVect2v, dimension);
}

template <unsigned char residual> // 0..63
float INT8_CosineSIMD_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) {
float ip = INT8_InnerProductImp<residual>(pVect1v, pVect2v, dimension);
float norm_v1 =
*reinterpret_cast<const float *>(static_cast<const uint8_t *>(pVect1v) + dimension);
float norm_v2 =
*reinterpret_cast<const float *>(static_cast<const uint8_t *>(pVect2v) + dimension);
return 1.0f - ip / (norm_v1 * norm_v2);
}
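Without the dotprod extension, the kernel above reaches the same totals through a widening multiply (vmull_s8 / vmull_high_s8) followed by a pairwise add-accumulate (vpadalq_s16). A scalar model of that pipeline, for intuition only:

#include <cstddef>
#include <cstdint>

// Model of one 16-element step: widen each int8 product to 16 bits, then
// fold adjacent pairs into four 32-bit accumulator lanes, as vpadalq_s16
// does. The sum of the lanes equals a plain dot product.
static int32_t dot16_widening_model(const int8_t a[16], const int8_t b[16]) {
    int32_t lanes[4] = {0, 0, 0, 0};
    for (size_t i = 0; i < 16; i++) {
        int16_t prod = static_cast<int16_t>(a[i]) * static_cast<int16_t>(b[i]);
        lanes[(i / 2) % 4] += prod; // pairs (0,1)->lane0, (2,3)->lane1, ...
    }
    return lanes[0] + lanes[1] + lanes[2] + lanes[3];
}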