diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h b/src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h new file mode 100644 index 000000000..f14c2289f --- /dev/null +++ b/src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include "VecSim/types/sq8.h" +#include "VecSim/types/float16.h" +#include + +using sq8 = vecsim_types::sq8; +using float16 = vecsim_types::float16; + +/* + * Asymmetric SQ8 (storage) <-> FP16 (query) inner product using algebraic identity: + * IP(x, y) ~= min * y_sum + delta * Σ(q_i * y_i) + * + * FP16 query lanes are widened to FP32 via vcvt_f32_f16 per 16-lane chunk. + */ + +// Helper: 16 lanes per call, four FP32 accumulators (one per quarter). +static inline void SQ8_FP16_InnerProductStep_NEON_HP(const uint8_t *&pVect1, const float16 *&pVect2, + float32x4_t &sum0, float32x4_t &sum1, + float32x4_t &sum2, float32x4_t &sum3) { + uint8x16_t v1_u8 = vld1q_u8(pVect1); + // SQ8 values 0..255 are exact in FP16, so widen uint8 -> uint16 -> fp16 -> fp32. + // This drops two integer-widening ops per chunk versus the uint8 -> u16 -> u32 -> f32 + // chain while producing bit-identical FP32 lane values. + float16x8_t v1_h_lo = vcvtq_f16_u16(vmovl_u8(vget_low_u8(v1_u8))); + float16x8_t v1_h_hi = vcvtq_f16_u16(vmovl_u8(vget_high_u8(v1_u8))); + float32x4_t v1_0 = vcvt_f32_f16(vget_low_f16(v1_h_lo)); + float32x4_t v1_1 = vcvt_f32_f16(vget_high_f16(v1_h_lo)); + float32x4_t v1_2 = vcvt_f32_f16(vget_low_f16(v1_h_hi)); + float32x4_t v1_3 = vcvt_f32_f16(vget_high_f16(v1_h_hi)); + + const float16_t *q = reinterpret_cast(pVect2); + float16x8_t q_lo = vld1q_f16(q); + float16x8_t q_hi = vld1q_f16(q + 8); + float32x4_t v2_0 = vcvt_f32_f16(vget_low_f16(q_lo)); + float32x4_t v2_1 = vcvt_f32_f16(vget_high_f16(q_lo)); + float32x4_t v2_2 = vcvt_f32_f16(vget_low_f16(q_hi)); + float32x4_t v2_3 = vcvt_f32_f16(vget_high_f16(q_hi)); + + sum0 = vfmaq_f32(sum0, v1_0, v2_0); + sum1 = vfmaq_f32(sum1, v1_1, v2_1); + sum2 = vfmaq_f32(sum2, v1_2, v2_2); + sum3 = vfmaq_f32(sum3, v1_3, v2_3); + + pVect1 += 16; + pVect2 += 16; +} + +// pVect1v = SQ8 storage, pVect2v = FP16 query. Precondition: dim >= 16 (enforced by dispatcher). +template // 0..15 +float SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(const void *pVect1v, const void *pVect2v, + size_t dimension) { + const uint8_t *pVect1 = static_cast(pVect1v); + const float16 *pVect2 = static_cast(pVect2v); + + float32x4_t sum0 = vdupq_n_f32(0.0f); + float32x4_t sum1 = vdupq_n_f32(0.0f); + float32x4_t sum2 = vdupq_n_f32(0.0f); + float32x4_t sum3 = vdupq_n_f32(0.0f); + + const size_t num_of_chunks = dimension / 16; + for (size_t i = 0; i < num_of_chunks; i++) { + SQ8_FP16_InnerProductStep_NEON_HP(pVect1, pVect2, sum0, sum1, sum2, sum3); + } + + // Residual: up to three independent 4-lane sub-steps, leaving at most 3 elements + // for scalar — mirrors the SQ8_FP32 NEON sister pattern. + // vld1_f16 (4 FP16 = 8 bytes) is safe for any residual: FP16 metadata follows + // the lane data so there is always enough headroom. + constexpr unsigned char r = residual; + if constexpr (r >= 4) { + uint8x8_t v1_u8 = vld1_u8(pVect1); + float32x4_t v1_a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(v1_u8)))); + float32x4_t v2_a = vcvt_f32_f16(vld1_f16(reinterpret_cast(pVect2))); + sum0 = vfmaq_f32(sum0, v1_a, v2_a); + pVect1 += 4; + pVect2 += 4; + } + if constexpr (r >= 8) { + uint8x8_t v1_u8 = vld1_u8(pVect1); + float32x4_t v1_b = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(v1_u8)))); + float32x4_t v2_b = vcvt_f32_f16(vld1_f16(reinterpret_cast(pVect2))); + sum1 = vfmaq_f32(sum1, v1_b, v2_b); + pVect1 += 4; + pVect2 += 4; + } + if constexpr (r >= 12) { + uint8x8_t v1_u8 = vld1_u8(pVect1); + float32x4_t v1_c = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(v1_u8)))); + float32x4_t v2_c = vcvt_f32_f16(vld1_f16(reinterpret_cast(pVect2))); + sum2 = vfmaq_f32(sum2, v1_c, v2_c); + pVect1 += 4; + pVect2 += 4; + } + constexpr unsigned char tail = r & 3; + float scalar_dot = 0.0f; + for (unsigned char k = 0; k < tail; ++k) { + scalar_dot += static_cast(pVect1[k]) * vecsim_types::FP16_to_FP32(pVect2[k]); + } + + float32x4_t sum_lo = vaddq_f32(sum0, sum1); + float32x4_t sum_hi = vaddq_f32(sum2, sum3); + float quantized_dot = vaddvq_f32(vaddq_f32(sum_lo, sum_hi)) + scalar_dot; + + const uint8_t *params_bytes = static_cast(pVect1v) + dimension; + const float min_val = load_unaligned(params_bytes + sq8::MIN_VAL * sizeof(float)); + const float delta = load_unaligned(params_bytes + sq8::DELTA * sizeof(float)); + const uint8_t *query_meta_bytes = + reinterpret_cast(static_cast(pVect2v) + dimension); + const float y_sum = load_unaligned(query_meta_bytes + sq8::SUM_QUERY * sizeof(float)); + + return min_val * y_sum + delta * quantized_dot; +} + +template +float SQ8_FP16_InnerProductSIMD16_NEON_HP(const void *pVect1v, const void *pVect2v, + size_t dimension) { + return 1.0f - SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(pVect1v, pVect2v, dimension); +} + +template +float SQ8_FP16_CosineSIMD16_NEON_HP(const void *pVect1v, const void *pVect2v, size_t dimension) { + return SQ8_FP16_InnerProductSIMD16_NEON_HP(pVect1v, pVect2v, dimension); +} diff --git a/src/VecSim/spaces/IP/IP_SVE2_SQ8_FP16.h b/src/VecSim/spaces/IP/IP_SVE2_SQ8_FP16.h new file mode 100644 index 000000000..a36627e80 --- /dev/null +++ b/src/VecSim/spaces/IP/IP_SVE2_SQ8_FP16.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include "VecSim/types/sq8.h" +#include "VecSim/types/float16.h" +#include + +using sq8 = vecsim_types::sq8; +using float16 = vecsim_types::float16; + +/* + * SVE2 asymmetric SQ8 (storage) <-> FP16 (query) inner product using the identity: + * IP(x, y) ~= min * y_sum + delta * Σ(q_i * y_i) + * + * SVE2-only fast path: the storage bytes (0..255, exact in FP16) and the FP16 query + * lanes stay 16-bit, and the FP16->FP32 widening multiply-accumulate is done by the + * FMLALB/FMLALT pair (svmlalb_f32 / svmlalt_f32). Each pair widens the even/odd + * half-precision lanes to single precision and multiplies/accumulates in FP32 WITHOUT + * intermediate rounding, so the per-lane products match the SVE svmla path exactly while + * processing svcnth() lanes per step (twice the base-SVE svcntw() granularity) and halving + * the number of loads and explicit conversions. The even/odd accumulator split groups the + * FP32 additions differently than the base SVE kernel, so the reduced result is numerically + * equivalent (well within the test tolerance) rather than bit-identical. + */ + +// Helper: one svcnth()-wide FP16 step feeding an even/odd FP32 accumulator pair. +static inline void SQ8_FP16_InnerProductStep_SVE2(const uint8_t *pVect1, const float16 *pVect2, + size_t &offset, svfloat32_t &sum_even, + svfloat32_t &sum_odd, svbool_t pg16, + size_t chunk) { + svuint16_t v1_u16 = svld1ub_u16(pg16, pVect1 + offset); + svfloat16_t v1_f16 = svcvt_f16_u16_x(pg16, v1_u16); + svfloat16_t q_f16 = svld1_f16(pg16, reinterpret_cast(pVect2 + offset)); + // FMLALB/FMLALT are unpredicated; inactive lanes were zeroed by the loads above so + // their contribution is 0 and walking all lanes is safe. + sum_even = svmlalb_f32(sum_even, v1_f16, q_f16); + sum_odd = svmlalt_f32(sum_odd, v1_f16, q_f16); + offset += chunk; +} + +// pVect1v = SQ8 storage, pVect2v = FP16 query. Precondition: dim >= 16 (enforced by dispatcher). +template +float SQ8_FP16_InnerProductSIMD_SVE2_IMP(const void *pVect1v, const void *pVect2v, + size_t dimension) { + const uint8_t *pVect1 = static_cast(pVect1v); + const float16 *pVect2 = static_cast(pVect2v); + size_t offset = 0; + const svbool_t pg16 = svptrue_b16(); + const size_t chunk = svcnth(); + + svfloat32_t sum0e = svdup_f32(0.0f), sum0o = svdup_f32(0.0f); + svfloat32_t sum1e = svdup_f32(0.0f), sum1o = svdup_f32(0.0f); + svfloat32_t sum2e = svdup_f32(0.0f), sum2o = svdup_f32(0.0f); + svfloat32_t sum3e = svdup_f32(0.0f), sum3o = svdup_f32(0.0f); + + // Partial chunk for dim % chunk FP16 lanes. Zeroing loads (_z convert) leave inactive + // lanes at 0 so the unpredicated FMLALB/FMLALT below ignore them. + if constexpr (partial_chunk) { + size_t remaining = dimension % chunk; + if (remaining > 0) { + svbool_t pg_partial = svwhilelt_b16(uint64_t(0), uint64_t(remaining)); + svuint16_t v1_u16 = svld1ub_u16(pg_partial, pVect1 + offset); + svfloat16_t v1_f16 = svcvt_f16_u16_z(pg_partial, v1_u16); + svfloat16_t q_f16 = + svld1_f16(pg_partial, reinterpret_cast(pVect2 + offset)); + sum0e = svmlalb_f32(sum0e, v1_f16, q_f16); + sum0o = svmlalt_f32(sum0o, v1_f16, q_f16); + offset += remaining; + } + } + + // Main loop: 4 steps per iteration, one even/odd accumulator pair per step. + const size_t chunk_size = 4 * chunk; + const size_t number_of_chunks = + (dimension - (partial_chunk ? dimension % chunk : 0)) / chunk_size; + for (size_t i = 0; i < number_of_chunks; i++) { + SQ8_FP16_InnerProductStep_SVE2(pVect1, pVect2, offset, sum0e, sum0o, pg16, chunk); + SQ8_FP16_InnerProductStep_SVE2(pVect1, pVect2, offset, sum1e, sum1o, pg16, chunk); + SQ8_FP16_InnerProductStep_SVE2(pVect1, pVect2, offset, sum2e, sum2o, pg16, chunk); + SQ8_FP16_InnerProductStep_SVE2(pVect1, pVect2, offset, sum3e, sum3o, pg16, chunk); + } + + if constexpr (additional_steps > 0) + SQ8_FP16_InnerProductStep_SVE2(pVect1, pVect2, offset, sum0e, sum0o, pg16, chunk); + if constexpr (additional_steps > 1) + SQ8_FP16_InnerProductStep_SVE2(pVect1, pVect2, offset, sum1e, sum1o, pg16, chunk); + if constexpr (additional_steps > 2) + SQ8_FP16_InnerProductStep_SVE2(pVect1, pVect2, offset, sum2e, sum2o, pg16, chunk); + + const svbool_t pg32 = svptrue_b32(); + svfloat32_t sum = svadd_f32_z(pg32, sum0e, sum0o); + sum = svadd_f32_x(pg32, sum, svadd_f32_x(pg32, sum1e, sum1o)); + sum = svadd_f32_x(pg32, sum, svadd_f32_x(pg32, sum2e, sum2o)); + sum = svadd_f32_x(pg32, sum, svadd_f32_x(pg32, sum3e, sum3o)); + float quantized_dot = svaddv_f32(pg32, sum); + + const uint8_t *params_bytes = static_cast(pVect1v) + dimension; + const float min_val = load_unaligned(params_bytes + sq8::MIN_VAL * sizeof(float)); + const float delta = load_unaligned(params_bytes + sq8::DELTA * sizeof(float)); + const uint8_t *query_meta_bytes = + reinterpret_cast(static_cast(pVect2v) + dimension); + const float y_sum = load_unaligned(query_meta_bytes + sq8::SUM_QUERY * sizeof(float)); + + return min_val * y_sum + delta * quantized_dot; +} + +template +float SQ8_FP16_InnerProductSIMD_SVE2(const void *pVect1v, const void *pVect2v, size_t dimension) { + return 1.0f - SQ8_FP16_InnerProductSIMD_SVE2_IMP( + pVect1v, pVect2v, dimension); +} + +template +float SQ8_FP16_CosineSIMD_SVE2(const void *pVect1v, const void *pVect2v, size_t dimension) { + return SQ8_FP16_InnerProductSIMD_SVE2(pVect1v, pVect2v, + dimension); +} diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h b/src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h new file mode 100644 index 000000000..1408e0880 --- /dev/null +++ b/src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include "VecSim/types/sq8.h" +#include "VecSim/types/float16.h" +#include + +using sq8 = vecsim_types::sq8; +using float16 = vecsim_types::float16; + +/* + * Asymmetric SQ8 (storage) <-> FP16 (query) inner product using algebraic identity: + * IP(x, y) ~= min * y_sum + delta * Σ(q_i * y_i) + * + * FP16 query lanes are widened to FP32 per step via svld1uh_u32 + svcvt_f32_f16_x. + * svld1uh_u32 zero-extends each FP16 halfword into a 32-bit lane so that + * svcvt_f32_f16_x reads the correct bits directly without any interleaving. + */ + +// Helper: one SVE-vector-width-of-FP32 step. +static inline void SQ8_FP16_InnerProductStep_SVE(const uint8_t *pVect1, const float16 *pVect2, + size_t &offset, svfloat32_t &sum, svbool_t pg, + size_t chunk) { + svuint32_t v1_u32 = svld1ub_u32(pg, pVect1 + offset); + svfloat32_t v1_f = svcvt_f32_u32_x(pg, v1_u32); + svuint32_t q_u32 = svld1uh_u32(pg, reinterpret_cast(pVect2 + offset)); + svfloat32_t v2_f = svcvt_f32_f16_x(pg, svreinterpret_f16_u32(q_u32)); + sum = svmla_f32_x(pg, sum, v1_f, v2_f); + offset += chunk; +} + +// pVect1v = SQ8 storage, pVect2v = FP16 query. Precondition: dim >= 16 (enforced by dispatcher). +template +float SQ8_FP16_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, + size_t dimension) { + const uint8_t *pVect1 = static_cast(pVect1v); + const float16 *pVect2 = static_cast(pVect2v); + size_t offset = 0; + svbool_t pg = svptrue_b32(); + const size_t chunk = svcntw(); + + svfloat32_t sum0 = svdup_f32(0.0f); + svfloat32_t sum1 = svdup_f32(0.0f); + svfloat32_t sum2 = svdup_f32(0.0f); + svfloat32_t sum3 = svdup_f32(0.0f); + + // Partial chunk for dim % chunk lanes. Use _z form so inactive lanes are zero; + // the final reduction walks all lanes via svptrue_b32(). + if constexpr (partial_chunk) { + size_t remaining = dimension % chunk; + if (remaining > 0) { + svbool_t pg_partial = svwhilelt_b32(uint32_t(0), uint32_t(remaining)); + svuint32_t v1_u32 = svld1ub_u32(pg_partial, pVect1 + offset); + svfloat32_t v1_f = svcvt_f32_u32_z(pg_partial, v1_u32); + svuint32_t q_u32 = + svld1uh_u32(pg_partial, reinterpret_cast(pVect2 + offset)); + svfloat32_t v2_f = svcvt_f32_f16_z(pg_partial, svreinterpret_f16_u32(q_u32)); + sum0 = svmla_f32_z(pg_partial, sum0, v1_f, v2_f); + offset += remaining; + } + } + + // Main loop: 4 chunks per iteration, one chunk per accumulator. + const size_t chunk_size = 4 * chunk; + const size_t number_of_chunks = + (dimension - (partial_chunk ? dimension % chunk : 0)) / chunk_size; + for (size_t i = 0; i < number_of_chunks; i++) { + SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum0, pg, chunk); + SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum1, pg, chunk); + SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum2, pg, chunk); + SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum3, pg, chunk); + } + + if constexpr (additional_steps > 0) + SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum0, pg, chunk); + if constexpr (additional_steps > 1) + SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum1, pg, chunk); + if constexpr (additional_steps > 2) + SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum2, pg, chunk); + + svfloat32_t sum = svadd_f32_z(pg, sum0, sum1); + sum = svadd_f32_z(pg, sum, sum2); + sum = svadd_f32_z(pg, sum, sum3); + float quantized_dot = svaddv_f32(pg, sum); + + const uint8_t *params_bytes = static_cast(pVect1v) + dimension; + const float min_val = load_unaligned(params_bytes + sq8::MIN_VAL * sizeof(float)); + const float delta = load_unaligned(params_bytes + sq8::DELTA * sizeof(float)); + const uint8_t *query_meta_bytes = + reinterpret_cast(static_cast(pVect2v) + dimension); + const float y_sum = load_unaligned(query_meta_bytes + sq8::SUM_QUERY * sizeof(float)); + + return min_val * y_sum + delta * quantized_dot; +} + +template +float SQ8_FP16_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { + return 1.0f - SQ8_FP16_InnerProductSIMD_SVE_IMP( + pVect1v, pVect2v, dimension); +} + +template +float SQ8_FP16_CosineSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { + return SQ8_FP16_InnerProductSIMD_SVE(pVect1v, pVect2v, + dimension); +} diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index b57971b60..9366d3144 100644 --- a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -225,6 +225,26 @@ dist_func_t IP_SQ8_FP16_GetDistFunc(size_t dim, unsigned char *alignment, #endif #endif // OPT_F16C #endif // x86_64 +#ifdef CPU_FEATURES_ARCH_AARCH64 + if (dim < 16) { + return ret_dist_func; + } +#ifdef OPT_SVE2 + if (features.sve2) { + return Choose_SQ8_FP16_IP_implementation_SVE2(dim); + } +#endif +#ifdef OPT_SVE + if (features.sve) { + return Choose_SQ8_FP16_IP_implementation_SVE(dim); + } +#endif +#ifdef OPT_NEON_HP + if (features.asimdhp) { + return Choose_SQ8_FP16_IP_implementation_NEON_HP(dim); + } +#endif +#endif // CPU_FEATURES_ARCH_AARCH64 return ret_dist_func; } @@ -274,6 +294,26 @@ dist_func_t Cosine_SQ8_FP16_GetDistFunc(size_t dim, unsigned char *alignm #endif #endif // OPT_F16C #endif // x86_64 +#ifdef CPU_FEATURES_ARCH_AARCH64 + if (dim < 16) { + return ret_dist_func; + } +#ifdef OPT_SVE2 + if (features.sve2) { + return Choose_SQ8_FP16_Cosine_implementation_SVE2(dim); + } +#endif +#ifdef OPT_SVE + if (features.sve) { + return Choose_SQ8_FP16_Cosine_implementation_SVE(dim); + } +#endif +#ifdef OPT_NEON_HP + if (features.asimdhp) { + return Choose_SQ8_FP16_Cosine_implementation_NEON_HP(dim); + } +#endif +#endif // CPU_FEATURES_ARCH_AARCH64 return ret_dist_func; } diff --git a/src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h b/src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h new file mode 100644 index 000000000..70367d7fe --- /dev/null +++ b/src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include "VecSim/spaces/IP/IP_NEON_SQ8_FP16.h" + +/* + * Optimised asymmetric SQ8<->FP16 L2 squared distance using the algebraic identity: + * + * ||x - y||^2 = sum(x_i^2) - 2 * IP(x, y) + sum(y_i^2) + * = x_sum_squares - 2 * IP(x, y) + y_sum_squares + * + * IP is computed by SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP; metadata is FP32. + */ + +template // 0..15 +float SQ8_FP16_L2SqrSIMD16_NEON_HP(const void *pVect1v, const void *pVect2v, size_t dimension) { + const float ip = SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(pVect1v, pVect2v, dimension); + + const uint8_t *params_bytes = static_cast(pVect1v) + dimension; + const float x_sum_sq = load_unaligned(params_bytes + sq8::SUM_SQUARES * sizeof(float)); + + const uint8_t *query_meta_bytes = + reinterpret_cast(static_cast(pVect2v) + dimension); + const float y_sum_sq = + load_unaligned(query_meta_bytes + sq8::SUM_SQUARES_QUERY * sizeof(float)); + + return x_sum_sq + y_sum_sq - 2.0f * ip; +} diff --git a/src/VecSim/spaces/L2/L2_SVE2_SQ8_FP16.h b/src/VecSim/spaces/L2/L2_SVE2_SQ8_FP16.h new file mode 100644 index 000000000..d9451fe2a --- /dev/null +++ b/src/VecSim/spaces/L2/L2_SVE2_SQ8_FP16.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include "VecSim/spaces/IP/IP_SVE2_SQ8_FP16.h" + +/* + * SVE2 SQ8<->FP16 L2 squared distance: + * ||x - y||^2 = x_sum_squares - 2 * IP(x, y) + y_sum_squares + * IP is computed by SQ8_FP16_InnerProductSIMD_SVE2_IMP; metadata is FP32. + */ + +template +float SQ8_FP16_L2SqrSIMD_SVE2(const void *pVect1v, const void *pVect2v, size_t dimension) { + const float ip = SQ8_FP16_InnerProductSIMD_SVE2_IMP( + pVect1v, pVect2v, dimension); + + const uint8_t *params_bytes = static_cast(pVect1v) + dimension; + const float x_sum_sq = load_unaligned(params_bytes + sq8::SUM_SQUARES * sizeof(float)); + const uint8_t *query_meta_bytes = + reinterpret_cast(static_cast(pVect2v) + dimension); + const float y_sum_sq = + load_unaligned(query_meta_bytes + sq8::SUM_SQUARES_QUERY * sizeof(float)); + + return x_sum_sq + y_sum_sq - 2.0f * ip; +} diff --git a/src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h b/src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h new file mode 100644 index 000000000..f70ef493d --- /dev/null +++ b/src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include "VecSim/spaces/IP/IP_SVE_SQ8_FP16.h" + +/* + * SVE SQ8<->FP16 L2 squared distance: + * ||x - y||^2 = x_sum_squares - 2 * IP(x, y) + y_sum_squares + * IP is computed by SQ8_FP16_InnerProductSIMD_SVE_IMP; metadata is FP32. + */ + +template +float SQ8_FP16_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { + const float ip = SQ8_FP16_InnerProductSIMD_SVE_IMP( + pVect1v, pVect2v, dimension); + + const uint8_t *params_bytes = static_cast(pVect1v) + dimension; + const float x_sum_sq = load_unaligned(params_bytes + sq8::SUM_SQUARES * sizeof(float)); + const uint8_t *query_meta_bytes = + reinterpret_cast(static_cast(pVect2v) + dimension); + const float y_sum_sq = + load_unaligned(query_meta_bytes + sq8::SUM_SQUARES_QUERY * sizeof(float)); + + return x_sum_sq + y_sum_sq - 2.0f * ip; +} diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp index 43020399f..7d65814e0 100644 --- a/src/VecSim/spaces/L2_space.cpp +++ b/src/VecSim/spaces/L2_space.cpp @@ -156,6 +156,26 @@ dist_func_t L2_SQ8_FP16_GetDistFunc(size_t dim, unsigned char *alignment, #endif #endif // OPT_F16C #endif // x86_64 +#ifdef CPU_FEATURES_ARCH_AARCH64 + if (dim < 16) { + return ret_dist_func; + } +#ifdef OPT_SVE2 + if (features.sve2) { + return Choose_SQ8_FP16_L2_implementation_SVE2(dim); + } +#endif +#ifdef OPT_SVE + if (features.sve) { + return Choose_SQ8_FP16_L2_implementation_SVE(dim); + } +#endif +#ifdef OPT_NEON_HP + if (features.asimdhp) { + return Choose_SQ8_FP16_L2_implementation_NEON_HP(dim); + } +#endif +#endif // CPU_FEATURES_ARCH_AARCH64 return ret_dist_func; } diff --git a/src/VecSim/spaces/functions/NEON_HP.cpp b/src/VecSim/spaces/functions/NEON_HP.cpp index 2dea94934..20d93a517 100644 --- a/src/VecSim/spaces/functions/NEON_HP.cpp +++ b/src/VecSim/spaces/functions/NEON_HP.cpp @@ -10,6 +10,8 @@ #include "VecSim/spaces/L2/L2_NEON_FP16.h" #include "VecSim/spaces/IP/IP_NEON_FP16.h" +#include "VecSim/spaces/IP/IP_NEON_SQ8_FP16.h" +#include "VecSim/spaces/L2/L2_NEON_SQ8_FP16.h" namespace spaces { @@ -27,6 +29,24 @@ dist_func_t Choose_FP16_IP_implementation_NEON_HP(size_t dim) { return ret_dist_func; } +dist_func_t Choose_SQ8_FP16_IP_implementation_NEON_HP(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_InnerProductSIMD16_NEON_HP); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_FP16_L2_implementation_NEON_HP(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_L2SqrSIMD16_NEON_HP); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_FP16_Cosine_implementation_NEON_HP(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_CosineSIMD16_NEON_HP); + return ret_dist_func; +} + #include "implementation_chooser_cleanup.h" } // namespace spaces diff --git a/src/VecSim/spaces/functions/NEON_HP.h b/src/VecSim/spaces/functions/NEON_HP.h index c65bd6948..889eb0919 100644 --- a/src/VecSim/spaces/functions/NEON_HP.h +++ b/src/VecSim/spaces/functions/NEON_HP.h @@ -16,4 +16,8 @@ dist_func_t Choose_FP16_IP_implementation_NEON_HP(size_t dim); dist_func_t Choose_FP16_L2_implementation_NEON_HP(size_t dim); +dist_func_t Choose_SQ8_FP16_IP_implementation_NEON_HP(size_t dim); +dist_func_t Choose_SQ8_FP16_L2_implementation_NEON_HP(size_t dim); +dist_func_t Choose_SQ8_FP16_Cosine_implementation_NEON_HP(size_t dim); + } // namespace spaces diff --git a/src/VecSim/spaces/functions/SVE.cpp b/src/VecSim/spaces/functions/SVE.cpp index fde853db2..bd197c84c 100644 --- a/src/VecSim/spaces/functions/SVE.cpp +++ b/src/VecSim/spaces/functions/SVE.cpp @@ -25,6 +25,9 @@ #include "VecSim/spaces/IP/IP_SVE_SQ8_FP32.h" #include "VecSim/spaces/L2/L2_SVE_SQ8_FP32.h" +#include "VecSim/spaces/IP/IP_SVE_SQ8_FP16.h" +#include "VecSim/spaces/L2/L2_SVE_SQ8_FP16.h" + #include "VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h" #include "VecSim/spaces/L2/L2_SVE_SQ8_SQ8.h" @@ -119,6 +122,24 @@ dist_func_t Choose_SQ8_FP32_L2_implementation_SVE(size_t dim) { return ret_dist_func; } +dist_func_t Choose_SQ8_FP16_IP_implementation_SVE(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_InnerProductSIMD_SVE, dim, svcntw); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_FP16_Cosine_implementation_SVE(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_CosineSIMD_SVE, dim, svcntw); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_FP16_L2_implementation_SVE(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_L2SqrSIMD_SVE, dim, svcntw); + return ret_dist_func; +} + // SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum) // Note: Use svcntb for uint8 elements (not svcntw which is for 32-bit elements) dist_func_t Choose_SQ8_SQ8_IP_implementation_SVE(size_t dim) { diff --git a/src/VecSim/spaces/functions/SVE.h b/src/VecSim/spaces/functions/SVE.h index bd3bc97c3..43b3b22cd 100644 --- a/src/VecSim/spaces/functions/SVE.h +++ b/src/VecSim/spaces/functions/SVE.h @@ -33,6 +33,10 @@ dist_func_t Choose_SQ8_FP32_IP_implementation_SVE(size_t dim); dist_func_t Choose_SQ8_FP32_Cosine_implementation_SVE(size_t dim); dist_func_t Choose_SQ8_FP32_L2_implementation_SVE(size_t dim); +dist_func_t Choose_SQ8_FP16_IP_implementation_SVE(size_t dim); +dist_func_t Choose_SQ8_FP16_Cosine_implementation_SVE(size_t dim); +dist_func_t Choose_SQ8_FP16_L2_implementation_SVE(size_t dim); + // SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum) dist_func_t Choose_SQ8_SQ8_IP_implementation_SVE(size_t dim); dist_func_t Choose_SQ8_SQ8_Cosine_implementation_SVE(size_t dim); diff --git a/src/VecSim/spaces/functions/SVE2.cpp b/src/VecSim/spaces/functions/SVE2.cpp index 4215d79cf..9eea81523 100644 --- a/src/VecSim/spaces/functions/SVE2.cpp +++ b/src/VecSim/spaces/functions/SVE2.cpp @@ -16,14 +16,16 @@ #include "VecSim/spaces/IP/IP_SVE_FP64.h" #include "VecSim/spaces/L2/L2_SVE_FP64.h" -#include "VecSim/spaces/L2/L2_SVE_INT8.h" // SVE2 implementation is identical to SVE -#include "VecSim/spaces/IP/IP_SVE_INT8.h" // SVE2 implementation is identical to SVE -#include "VecSim/spaces/L2/L2_SVE_UINT8.h" // SVE2 implementation is identical to SVE -#include "VecSim/spaces/IP/IP_SVE_UINT8.h" // SVE2 implementation is identical to SVE -#include "VecSim/spaces/IP/IP_SVE_SQ8_FP32.h" // SVE2 implementation is identical to SVE -#include "VecSim/spaces/L2/L2_SVE_SQ8_FP32.h" // SVE2 implementation is identical to SVE -#include "VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h" // SVE2 implementation is identical to SVE -#include "VecSim/spaces/L2/L2_SVE_SQ8_SQ8.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/L2/L2_SVE_INT8.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/IP/IP_SVE_INT8.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/L2/L2_SVE_UINT8.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/IP/IP_SVE_UINT8.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/IP/IP_SVE_SQ8_FP32.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/L2/L2_SVE_SQ8_FP32.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/IP/IP_SVE2_SQ8_FP16.h" // SVE2 fast path: FMLALB/FMLALT widening +#include "VecSim/spaces/L2/L2_SVE2_SQ8_FP16.h" // SVE2 fast path: FMLALB/FMLALT widening +#include "VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/L2/L2_SVE_SQ8_SQ8.h" // SVE2 implementation is identical to SVE namespace spaces { @@ -116,6 +118,24 @@ dist_func_t Choose_SQ8_FP32_L2_implementation_SVE2(size_t dim) { return ret_dist_func; } +dist_func_t Choose_SQ8_FP16_IP_implementation_SVE2(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_InnerProductSIMD_SVE2, dim, svcnth); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_FP16_Cosine_implementation_SVE2(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_CosineSIMD_SVE2, dim, svcnth); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_FP16_L2_implementation_SVE2(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_L2SqrSIMD_SVE2, dim, svcnth); + return ret_dist_func; +} + // SQ8-to-SQ8 distance functions (both vectors are uint8 quantized) // Note: Use svcntb for uint8 elements (not svcntw which is for 32-bit elements) dist_func_t Choose_SQ8_SQ8_IP_implementation_SVE2(size_t dim) { diff --git a/src/VecSim/spaces/functions/SVE2.h b/src/VecSim/spaces/functions/SVE2.h index 04078a91e..2c1bfbac3 100644 --- a/src/VecSim/spaces/functions/SVE2.h +++ b/src/VecSim/spaces/functions/SVE2.h @@ -33,6 +33,10 @@ dist_func_t Choose_SQ8_FP32_IP_implementation_SVE2(size_t dim); dist_func_t Choose_SQ8_FP32_Cosine_implementation_SVE2(size_t dim); dist_func_t Choose_SQ8_FP32_L2_implementation_SVE2(size_t dim); +dist_func_t Choose_SQ8_FP16_IP_implementation_SVE2(size_t dim); +dist_func_t Choose_SQ8_FP16_Cosine_implementation_SVE2(size_t dim); +dist_func_t Choose_SQ8_FP16_L2_implementation_SVE2(size_t dim); + // SQ8-to-SQ8 distance functions (both vectors are uint8 quantized) dist_func_t Choose_SQ8_SQ8_IP_implementation_SVE2(size_t dim); dist_func_t Choose_SQ8_SQ8_Cosine_implementation_SVE2(size_t dim); diff --git a/tests/benchmark/benchmarks.sh b/tests/benchmark/benchmarks.sh index 91ba49448..115a4cac9 100755 --- a/tests/benchmark/benchmarks.sh +++ b/tests/benchmark/benchmarks.sh @@ -21,6 +21,7 @@ if [ -z "$BM_TYPE" ] || [ "$BM_TYPE" = "benchmarks-all" ]; then echo spaces_int8 echo spaces_uint8 echo spaces_sq8_fp32 + echo spaces_sq8_fp16 echo spaces_sq8_sq8 elif [ "$BM_TYPE" = "benchmarks-default" ]; then @@ -33,6 +34,7 @@ elif [ "$BM_TYPE" = "benchmarks-default" ]; then echo spaces_int8 echo spaces_uint8 echo spaces_sq8_fp32 + echo spaces_sq8_fp16 echo spaces_sq8_sq8 @@ -106,6 +108,7 @@ elif [ "$BM_TYPE" = "bm-basics-svs-fp32-single" ] ; then echo basics_svs_single_fp32_LVQ8 elif [ "$BM_TYPE" = "bm-spaces-sq8-full" ] ; then echo spaces_sq8_fp32 + echo spaces_sq8_fp16 echo spaces_sq8_sq8 @@ -118,6 +121,7 @@ elif [ "$BM_TYPE" = "bm-spaces" ] ; then echo spaces_int8 echo spaces_uint8 echo spaces_sq8_fp32 + echo spaces_sq8_fp16 echo spaces_sq8_sq8 elif [ "$BM_TYPE" = "bm-spaces-fp32" ] ; then @@ -134,6 +138,8 @@ elif [ "$BM_TYPE" = "bm-spaces-uint8" ] ; then echo spaces_uint8 elif [ "$BM_TYPE" = "bm-spaces-sq8-fp32" ] ; then echo spaces_sq8_fp32 +elif [ "$BM_TYPE" = "bm-spaces-sq8-fp16" ] ; then + echo spaces_sq8_fp16 elif [ "$BM_TYPE" = "bm-spaces-sq8-sq8" ] ; then echo spaces_sq8_sq8 fi diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp index ba3030064..cc5d040cb 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp @@ -16,8 +16,8 @@ using float16 = vecsim_types::float16; /** * SQ8-to-FP16 benchmarks: SQ8 quantized storage with FP16 query. * Registers the naive (scalar) baseline plus per-ISA SIMD variants (x86: AVX-512 / AVX2+FMA / - * AVX2 / SSE4 — gated on the matching OPT_* defines and runtime CPU features). ARM kernels - * land via MOD-14972. + * AVX2 / SSE4 — gated on the matching OPT_* defines and runtime CPU features). ARM kernels (NEON_HP + * / SVE / SVE2) are registered below. */ class BM_VecSimSpaces_SQ8_FP16 : public benchmark::Fixture { protected: @@ -85,6 +85,29 @@ INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SSE4, 16, s #endif // OPT_F16C #endif // x86_64 +#ifdef CPU_FEATURES_ARCH_AARCH64 +cpu_features::Aarch64Features arm_opt = cpu_features::GetAarch64Info().features; + +#ifdef OPT_SVE2 +bool sve2_supported = arm_opt.sve2; +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE2, 16, sve2_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE2, 16, sve2_supported); +#endif + +#ifdef OPT_SVE +bool sve_supported = arm_opt.sve; +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE, 16, sve_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE, 16, sve_supported); +#endif + +#ifdef OPT_NEON_HP +bool neon_hp_supported = arm_opt.asimdhp; +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, NEON_HP, 16, neon_hp_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, NEON_HP, 16, + neon_hp_supported); +#endif +#endif // CPU_FEATURES_ARCH_AARCH64 + // Naive (scalar) baseline — always registered as the comparison anchor. INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, InnerProduct, 16); diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 474ac5c75..ce8605565 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -3149,6 +3149,45 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_L2SqrTest) { #endif #endif // OPT_F16C +#ifdef CPU_FEATURES_ARCH_AARCH64 +#ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_L2_implementation_SVE2(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) + << "SVE2 with dim " << dim; + ASSERT_EQ(alignment, 0) << "No alignment SVE2 with dim " << dim; + optimization.sve2 = 0; + } +#endif +#ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_L2_implementation_SVE(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) + << "SVE with dim " << dim; + ASSERT_EQ(alignment, 0) << "No alignment SVE with dim " << dim; + optimization.sve = 0; + } +#endif +#ifdef OPT_NEON_HP + if (optimization.asimdhp) { + unsigned char alignment = 0; + arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_L2_implementation_NEON_HP(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) + << "NEON_HP with dim " << dim; + ASSERT_EQ(alignment, 0) << "No alignment NEON_HP with dim " << dim; + optimization.asimdhp = 0; + } +#endif +#endif // CPU_FEATURES_ARCH_AARCH64 + unsigned char alignment = 0; arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, SQ8_FP16_L2Sqr) @@ -3224,6 +3263,45 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_InnerProductTest) { #endif #endif // OPT_F16C +#ifdef CPU_FEATURES_ARCH_AARCH64 +#ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + arch_opt_func = IP_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_IP_implementation_SVE2(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) + << "SVE2 with dim " << dim; + ASSERT_EQ(alignment, 0) << "No alignment SVE2 with dim " << dim; + optimization.sve2 = 0; + } +#endif +#ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + arch_opt_func = IP_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_IP_implementation_SVE(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) + << "SVE with dim " << dim; + ASSERT_EQ(alignment, 0) << "No alignment SVE with dim " << dim; + optimization.sve = 0; + } +#endif +#ifdef OPT_NEON_HP + if (optimization.asimdhp) { + unsigned char alignment = 0; + arch_opt_func = IP_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_IP_implementation_NEON_HP(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) + << "NEON_HP with dim " << dim; + ASSERT_EQ(alignment, 0) << "No alignment NEON_HP with dim " << dim; + optimization.asimdhp = 0; + } +#endif +#endif // CPU_FEATURES_ARCH_AARCH64 + unsigned char alignment = 0; arch_opt_func = IP_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, SQ8_FP16_InnerProduct) @@ -3299,6 +3377,45 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_CosineTest) { #endif #endif // OPT_F16C +#ifdef CPU_FEATURES_ARCH_AARCH64 +#ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + arch_opt_func = Cosine_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_Cosine_implementation_SVE2(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) + << "SVE2 with dim " << dim; + ASSERT_EQ(alignment, 0) << "No alignment SVE2 with dim " << dim; + optimization.sve2 = 0; + } +#endif +#ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + arch_opt_func = Cosine_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_Cosine_implementation_SVE(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) + << "SVE with dim " << dim; + ASSERT_EQ(alignment, 0) << "No alignment SVE with dim " << dim; + optimization.sve = 0; + } +#endif +#ifdef OPT_NEON_HP + if (optimization.asimdhp) { + unsigned char alignment = 0; + arch_opt_func = Cosine_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_Cosine_implementation_NEON_HP(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) + << "NEON_HP with dim " << dim; + ASSERT_EQ(alignment, 0) << "No alignment NEON_HP with dim " << dim; + optimization.asimdhp = 0; + } +#endif +#endif // CPU_FEATURES_ARCH_AARCH64 + unsigned char alignment = 0; arch_opt_func = Cosine_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, SQ8_FP16_Cosine)