Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
10ea6df
Add design spec for SQ8↔FP16 ARM SIMD kernels [MOD-14972]
dor-forer May 28, 2026
c061da9
Add implementation plan for SQ8↔FP16 ARM SIMD kernels [MOD-14972]
dor-forer May 28, 2026
4f0534c
Add NEON_HP SQ8↔FP16 IP kernel header [MOD-14972]
dor-forer May 28, 2026
d3c6415
Add NEON_HP SQ8↔FP16 L2 kernel header [MOD-14972]
dor-forer May 28, 2026
69cee3d
Wire NEON_HP SQ8↔FP16 choosers [MOD-14972]
dor-forer May 28, 2026
1b36b38
Dispatch SQ8↔FP16 to NEON_HP tier on AArch64 [MOD-14972]
dor-forer May 28, 2026
1af4812
Extend SQ8↔FP16 tier-walk tests with NEON_HP [MOD-14972]
dor-forer May 28, 2026
0ce0bce
Add SVE SQ8↔FP16 IP kernel header [MOD-14972]
dor-forer May 28, 2026
eb4952a
Add SVE SQ8↔FP16 L2 kernel header [MOD-14972]
dor-forer May 28, 2026
fcb01bb
Wire SVE/SVE2 SQ8↔FP16 choosers [MOD-14972]
dor-forer May 28, 2026
15fca69
Dispatch SQ8↔FP16 to SVE/SVE2 tiers on AArch64 [MOD-14972]
dor-forer May 28, 2026
0fcd7d0
Extend SQ8↔FP16 tier-walk tests with SVE/SVE2 [MOD-14972]
dor-forer May 28, 2026
6a783f8
Register ARM SQ8↔FP16 microbenchmarks [MOD-14972]
dor-forer May 28, 2026
a2a1b24
Add missing alignment=0 assertions to SQ8↔FP16 ARM tier-walk tests [M…
May 31, 2026
284ad69
Fix SVE SQ8↔FP16 kernel: use svzip1 to correct FP16→FP32 widening [MO…
May 31, 2026
3754f76
Optimize ARM SQ8↔FP16 kernels and align with codebase conventions [MO…
May 31, 2026
10c03aa
Apply clang-format [MOD-14972]
May 31, 2026
9741cfb
Trim PR churn: remove docs, dispatcher comments, and test verbosity […
May 31, 2026
e1647dc
Apply clang-format 18.1.8 (matches CI) [MOD-14972]
May 31, 2026
3602e5c
bench: register spaces_sq8_fp16 in benchmark setups
lerman25 Jun 1, 2026
c6f1363
perf(arm): optimize SQ8<->FP16 NEON_HP widening and add SVE2 FMLALB/F…
lerman25 Jun 1, 2026
68103a6
style: clang-format SVE2.cpp
lerman25 Jun 1, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
132 changes: 132 additions & 0 deletions src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
/*
* Copyright (c) 2006-Present, Redis Ltd.
* All rights reserved.
*
* Licensed under your choice of the Redis Source Available License 2.0
* (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
* GNU Affero General Public License v3 (AGPLv3).
*/
#pragma once
#include "VecSim/spaces/space_includes.h"
#include "VecSim/types/sq8.h"
#include "VecSim/types/float16.h"
#include <arm_neon.h>

using sq8 = vecsim_types::sq8;
using float16 = vecsim_types::float16;

/*
* Asymmetric SQ8 (storage) <-> FP16 (query) inner product using algebraic identity:
* IP(x, y) ~= min * y_sum + delta * Σ(q_i * y_i)
*
* FP16 query lanes are widened to FP32 via vcvt_f32_f16 per 16-lane chunk.
*/

// Helper: 16 lanes per call, four FP32 accumulators (one per quarter).
static inline void SQ8_FP16_InnerProductStep_NEON_HP(const uint8_t *&pVect1, const float16 *&pVect2,
float32x4_t &sum0, float32x4_t &sum1,
float32x4_t &sum2, float32x4_t &sum3) {
uint8x16_t v1_u8 = vld1q_u8(pVect1);
// SQ8 values 0..255 are exact in FP16, so widen uint8 -> uint16 -> fp16 -> fp32.
// This drops two integer-widening ops per chunk versus the uint8 -> u16 -> u32 -> f32
// chain while producing bit-identical FP32 lane values.
float16x8_t v1_h_lo = vcvtq_f16_u16(vmovl_u8(vget_low_u8(v1_u8)));
float16x8_t v1_h_hi = vcvtq_f16_u16(vmovl_u8(vget_high_u8(v1_u8)));
float32x4_t v1_0 = vcvt_f32_f16(vget_low_f16(v1_h_lo));
float32x4_t v1_1 = vcvt_f32_f16(vget_high_f16(v1_h_lo));
float32x4_t v1_2 = vcvt_f32_f16(vget_low_f16(v1_h_hi));
float32x4_t v1_3 = vcvt_f32_f16(vget_high_f16(v1_h_hi));

const float16_t *q = reinterpret_cast<const float16_t *>(pVect2);
float16x8_t q_lo = vld1q_f16(q);
float16x8_t q_hi = vld1q_f16(q + 8);
float32x4_t v2_0 = vcvt_f32_f16(vget_low_f16(q_lo));
float32x4_t v2_1 = vcvt_f32_f16(vget_high_f16(q_lo));
float32x4_t v2_2 = vcvt_f32_f16(vget_low_f16(q_hi));
float32x4_t v2_3 = vcvt_f32_f16(vget_high_f16(q_hi));

sum0 = vfmaq_f32(sum0, v1_0, v2_0);
sum1 = vfmaq_f32(sum1, v1_1, v2_1);
sum2 = vfmaq_f32(sum2, v1_2, v2_2);
sum3 = vfmaq_f32(sum3, v1_3, v2_3);

pVect1 += 16;
pVect2 += 16;
}

// pVect1v = SQ8 storage, pVect2v = FP16 query. Precondition: dim >= 16 (enforced by dispatcher).
template <unsigned char residual> // 0..15
float SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(const void *pVect1v, const void *pVect2v,
size_t dimension) {
const uint8_t *pVect1 = static_cast<const uint8_t *>(pVect1v);
const float16 *pVect2 = static_cast<const float16 *>(pVect2v);

float32x4_t sum0 = vdupq_n_f32(0.0f);
float32x4_t sum1 = vdupq_n_f32(0.0f);
float32x4_t sum2 = vdupq_n_f32(0.0f);
float32x4_t sum3 = vdupq_n_f32(0.0f);

const size_t num_of_chunks = dimension / 16;
for (size_t i = 0; i < num_of_chunks; i++) {
SQ8_FP16_InnerProductStep_NEON_HP(pVect1, pVect2, sum0, sum1, sum2, sum3);
}

// Residual: up to three independent 4-lane sub-steps, leaving at most 3 elements
// for scalar — mirrors the SQ8_FP32 NEON sister pattern.
// vld1_f16 (4 FP16 = 8 bytes) is safe for any residual: FP16 metadata follows
// the lane data so there is always enough headroom.
constexpr unsigned char r = residual;
if constexpr (r >= 4) {
uint8x8_t v1_u8 = vld1_u8(pVect1);
float32x4_t v1_a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(v1_u8))));
float32x4_t v2_a = vcvt_f32_f16(vld1_f16(reinterpret_cast<const float16_t *>(pVect2)));
sum0 = vfmaq_f32(sum0, v1_a, v2_a);
pVect1 += 4;
pVect2 += 4;
}
if constexpr (r >= 8) {
uint8x8_t v1_u8 = vld1_u8(pVect1);
float32x4_t v1_b = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(v1_u8))));
float32x4_t v2_b = vcvt_f32_f16(vld1_f16(reinterpret_cast<const float16_t *>(pVect2)));
sum1 = vfmaq_f32(sum1, v1_b, v2_b);
pVect1 += 4;
pVect2 += 4;
}
if constexpr (r >= 12) {
uint8x8_t v1_u8 = vld1_u8(pVect1);
float32x4_t v1_c = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(v1_u8))));
float32x4_t v2_c = vcvt_f32_f16(vld1_f16(reinterpret_cast<const float16_t *>(pVect2)));
sum2 = vfmaq_f32(sum2, v1_c, v2_c);
pVect1 += 4;
pVect2 += 4;
}
constexpr unsigned char tail = r & 3;
float scalar_dot = 0.0f;
for (unsigned char k = 0; k < tail; ++k) {
scalar_dot += static_cast<float>(pVect1[k]) * vecsim_types::FP16_to_FP32(pVect2[k]);
}

float32x4_t sum_lo = vaddq_f32(sum0, sum1);
float32x4_t sum_hi = vaddq_f32(sum2, sum3);
float quantized_dot = vaddvq_f32(vaddq_f32(sum_lo, sum_hi)) + scalar_dot;

const uint8_t *params_bytes = static_cast<const uint8_t *>(pVect1v) + dimension;
const float min_val = load_unaligned<float>(params_bytes + sq8::MIN_VAL * sizeof(float));
const float delta = load_unaligned<float>(params_bytes + sq8::DELTA * sizeof(float));
const uint8_t *query_meta_bytes =
reinterpret_cast<const uint8_t *>(static_cast<const float16 *>(pVect2v) + dimension);
const float y_sum = load_unaligned<float>(query_meta_bytes + sq8::SUM_QUERY * sizeof(float));

return min_val * y_sum + delta * quantized_dot;
}

template <unsigned char residual>
float SQ8_FP16_InnerProductSIMD16_NEON_HP(const void *pVect1v, const void *pVect2v,
size_t dimension) {
return 1.0f - SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP<residual>(pVect1v, pVect2v, dimension);
}

template <unsigned char residual>
float SQ8_FP16_CosineSIMD16_NEON_HP(const void *pVect1v, const void *pVect2v, size_t dimension) {
return SQ8_FP16_InnerProductSIMD16_NEON_HP<residual>(pVect1v, pVect2v, dimension);
}
124 changes: 124 additions & 0 deletions src/VecSim/spaces/IP/IP_SVE2_SQ8_FP16.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
/*
* Copyright (c) 2006-Present, Redis Ltd.
* All rights reserved.
*
* Licensed under your choice of the Redis Source Available License 2.0
* (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
* GNU Affero General Public License v3 (AGPLv3).
*/
#pragma once
#include "VecSim/spaces/space_includes.h"
#include "VecSim/types/sq8.h"
#include "VecSim/types/float16.h"
#include <arm_sve.h>

using sq8 = vecsim_types::sq8;
using float16 = vecsim_types::float16;

/*
* SVE2 asymmetric SQ8 (storage) <-> FP16 (query) inner product using the identity:
* IP(x, y) ~= min * y_sum + delta * Σ(q_i * y_i)
*
* SVE2-only fast path: the storage bytes (0..255, exact in FP16) and the FP16 query
* lanes stay 16-bit, and the FP16->FP32 widening multiply-accumulate is done by the
* FMLALB/FMLALT pair (svmlalb_f32 / svmlalt_f32). Each pair widens the even/odd
* half-precision lanes to single precision and multiplies/accumulates in FP32 WITHOUT
* intermediate rounding, so the per-lane products match the SVE svmla path exactly while
* processing svcnth() lanes per step (twice the base-SVE svcntw() granularity) and halving
* the number of loads and explicit conversions. The even/odd accumulator split groups the
* FP32 additions differently than the base SVE kernel, so the reduced result is numerically
* equivalent (well within the test tolerance) rather than bit-identical.
*/

// Helper: one svcnth()-wide FP16 step feeding an even/odd FP32 accumulator pair.
static inline void SQ8_FP16_InnerProductStep_SVE2(const uint8_t *pVect1, const float16 *pVect2,
size_t &offset, svfloat32_t &sum_even,
svfloat32_t &sum_odd, svbool_t pg16,
size_t chunk) {
svuint16_t v1_u16 = svld1ub_u16(pg16, pVect1 + offset);
svfloat16_t v1_f16 = svcvt_f16_u16_x(pg16, v1_u16);
svfloat16_t q_f16 = svld1_f16(pg16, reinterpret_cast<const float16_t *>(pVect2 + offset));
// FMLALB/FMLALT are unpredicated; inactive lanes were zeroed by the loads above so
// their contribution is 0 and walking all lanes is safe.
sum_even = svmlalb_f32(sum_even, v1_f16, q_f16);
sum_odd = svmlalt_f32(sum_odd, v1_f16, q_f16);
offset += chunk;
}

// pVect1v = SQ8 storage, pVect2v = FP16 query. Precondition: dim >= 16 (enforced by dispatcher).
template <bool partial_chunk, unsigned char additional_steps>
float SQ8_FP16_InnerProductSIMD_SVE2_IMP(const void *pVect1v, const void *pVect2v,
size_t dimension) {
const uint8_t *pVect1 = static_cast<const uint8_t *>(pVect1v);
const float16 *pVect2 = static_cast<const float16 *>(pVect2v);
size_t offset = 0;
const svbool_t pg16 = svptrue_b16();
const size_t chunk = svcnth();

svfloat32_t sum0e = svdup_f32(0.0f), sum0o = svdup_f32(0.0f);
svfloat32_t sum1e = svdup_f32(0.0f), sum1o = svdup_f32(0.0f);
svfloat32_t sum2e = svdup_f32(0.0f), sum2o = svdup_f32(0.0f);
svfloat32_t sum3e = svdup_f32(0.0f), sum3o = svdup_f32(0.0f);

// Partial chunk for dim % chunk FP16 lanes. Zeroing loads (_z convert) leave inactive
// lanes at 0 so the unpredicated FMLALB/FMLALT below ignore them.
if constexpr (partial_chunk) {
size_t remaining = dimension % chunk;
if (remaining > 0) {
svbool_t pg_partial = svwhilelt_b16(uint64_t(0), uint64_t(remaining));
svuint16_t v1_u16 = svld1ub_u16(pg_partial, pVect1 + offset);
svfloat16_t v1_f16 = svcvt_f16_u16_z(pg_partial, v1_u16);
svfloat16_t q_f16 =
svld1_f16(pg_partial, reinterpret_cast<const float16_t *>(pVect2 + offset));
sum0e = svmlalb_f32(sum0e, v1_f16, q_f16);
sum0o = svmlalt_f32(sum0o, v1_f16, q_f16);
offset += remaining;
}
}

// Main loop: 4 steps per iteration, one even/odd accumulator pair per step.
const size_t chunk_size = 4 * chunk;
const size_t number_of_chunks =
(dimension - (partial_chunk ? dimension % chunk : 0)) / chunk_size;
for (size_t i = 0; i < number_of_chunks; i++) {
SQ8_FP16_InnerProductStep_SVE2(pVect1, pVect2, offset, sum0e, sum0o, pg16, chunk);
SQ8_FP16_InnerProductStep_SVE2(pVect1, pVect2, offset, sum1e, sum1o, pg16, chunk);
SQ8_FP16_InnerProductStep_SVE2(pVect1, pVect2, offset, sum2e, sum2o, pg16, chunk);
SQ8_FP16_InnerProductStep_SVE2(pVect1, pVect2, offset, sum3e, sum3o, pg16, chunk);
}

if constexpr (additional_steps > 0)
SQ8_FP16_InnerProductStep_SVE2(pVect1, pVect2, offset, sum0e, sum0o, pg16, chunk);
if constexpr (additional_steps > 1)
SQ8_FP16_InnerProductStep_SVE2(pVect1, pVect2, offset, sum1e, sum1o, pg16, chunk);
if constexpr (additional_steps > 2)
SQ8_FP16_InnerProductStep_SVE2(pVect1, pVect2, offset, sum2e, sum2o, pg16, chunk);

const svbool_t pg32 = svptrue_b32();
svfloat32_t sum = svadd_f32_z(pg32, sum0e, sum0o);
sum = svadd_f32_x(pg32, sum, svadd_f32_x(pg32, sum1e, sum1o));
sum = svadd_f32_x(pg32, sum, svadd_f32_x(pg32, sum2e, sum2o));
sum = svadd_f32_x(pg32, sum, svadd_f32_x(pg32, sum3e, sum3o));
float quantized_dot = svaddv_f32(pg32, sum);

const uint8_t *params_bytes = static_cast<const uint8_t *>(pVect1v) + dimension;
const float min_val = load_unaligned<float>(params_bytes + sq8::MIN_VAL * sizeof(float));
const float delta = load_unaligned<float>(params_bytes + sq8::DELTA * sizeof(float));
const uint8_t *query_meta_bytes =
reinterpret_cast<const uint8_t *>(static_cast<const float16 *>(pVect2v) + dimension);
const float y_sum = load_unaligned<float>(query_meta_bytes + sq8::SUM_QUERY * sizeof(float));

return min_val * y_sum + delta * quantized_dot;
}

template <bool partial_chunk, unsigned char additional_steps>
float SQ8_FP16_InnerProductSIMD_SVE2(const void *pVect1v, const void *pVect2v, size_t dimension) {
return 1.0f - SQ8_FP16_InnerProductSIMD_SVE2_IMP<partial_chunk, additional_steps>(
pVect1v, pVect2v, dimension);
}

template <bool partial_chunk, unsigned char additional_steps>
float SQ8_FP16_CosineSIMD_SVE2(const void *pVect1v, const void *pVect2v, size_t dimension) {
return SQ8_FP16_InnerProductSIMD_SVE2<partial_chunk, additional_steps>(pVect1v, pVect2v,
dimension);
}
113 changes: 113 additions & 0 deletions src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
/*
* Copyright (c) 2006-Present, Redis Ltd.
* All rights reserved.
*
* Licensed under your choice of the Redis Source Available License 2.0
* (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
* GNU Affero General Public License v3 (AGPLv3).
*/
#pragma once
#include "VecSim/spaces/space_includes.h"
#include "VecSim/types/sq8.h"
#include "VecSim/types/float16.h"
#include <arm_sve.h>

using sq8 = vecsim_types::sq8;
using float16 = vecsim_types::float16;

/*
* Asymmetric SQ8 (storage) <-> FP16 (query) inner product using algebraic identity:
* IP(x, y) ~= min * y_sum + delta * Σ(q_i * y_i)
*
* FP16 query lanes are widened to FP32 per step via svld1uh_u32 + svcvt_f32_f16_x.
* svld1uh_u32 zero-extends each FP16 halfword into a 32-bit lane so that
* svcvt_f32_f16_x reads the correct bits directly without any interleaving.
*/

// Helper: one SVE-vector-width-of-FP32 step.
static inline void SQ8_FP16_InnerProductStep_SVE(const uint8_t *pVect1, const float16 *pVect2,
size_t &offset, svfloat32_t &sum, svbool_t pg,
size_t chunk) {
svuint32_t v1_u32 = svld1ub_u32(pg, pVect1 + offset);
svfloat32_t v1_f = svcvt_f32_u32_x(pg, v1_u32);
svuint32_t q_u32 = svld1uh_u32(pg, reinterpret_cast<const uint16_t *>(pVect2 + offset));
svfloat32_t v2_f = svcvt_f32_f16_x(pg, svreinterpret_f16_u32(q_u32));
sum = svmla_f32_x(pg, sum, v1_f, v2_f);
offset += chunk;
}

// pVect1v = SQ8 storage, pVect2v = FP16 query. Precondition: dim >= 16 (enforced by dispatcher).
template <bool partial_chunk, unsigned char additional_steps>
float SQ8_FP16_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v,
size_t dimension) {
const uint8_t *pVect1 = static_cast<const uint8_t *>(pVect1v);
const float16 *pVect2 = static_cast<const float16 *>(pVect2v);
size_t offset = 0;
svbool_t pg = svptrue_b32();
const size_t chunk = svcntw();

svfloat32_t sum0 = svdup_f32(0.0f);
svfloat32_t sum1 = svdup_f32(0.0f);
svfloat32_t sum2 = svdup_f32(0.0f);
svfloat32_t sum3 = svdup_f32(0.0f);

// Partial chunk for dim % chunk lanes. Use _z form so inactive lanes are zero;
// the final reduction walks all lanes via svptrue_b32().
if constexpr (partial_chunk) {
size_t remaining = dimension % chunk;
if (remaining > 0) {
svbool_t pg_partial = svwhilelt_b32(uint32_t(0), uint32_t(remaining));
svuint32_t v1_u32 = svld1ub_u32(pg_partial, pVect1 + offset);
svfloat32_t v1_f = svcvt_f32_u32_z(pg_partial, v1_u32);
svuint32_t q_u32 =
svld1uh_u32(pg_partial, reinterpret_cast<const uint16_t *>(pVect2 + offset));
svfloat32_t v2_f = svcvt_f32_f16_z(pg_partial, svreinterpret_f16_u32(q_u32));
sum0 = svmla_f32_z(pg_partial, sum0, v1_f, v2_f);
offset += remaining;
}
}

// Main loop: 4 chunks per iteration, one chunk per accumulator.
const size_t chunk_size = 4 * chunk;
const size_t number_of_chunks =
(dimension - (partial_chunk ? dimension % chunk : 0)) / chunk_size;
for (size_t i = 0; i < number_of_chunks; i++) {
SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum0, pg, chunk);
SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum1, pg, chunk);
SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum2, pg, chunk);
SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum3, pg, chunk);
}

if constexpr (additional_steps > 0)
SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum0, pg, chunk);
if constexpr (additional_steps > 1)
SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum1, pg, chunk);
if constexpr (additional_steps > 2)
SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum2, pg, chunk);

svfloat32_t sum = svadd_f32_z(pg, sum0, sum1);
sum = svadd_f32_z(pg, sum, sum2);
sum = svadd_f32_z(pg, sum, sum3);
float quantized_dot = svaddv_f32(pg, sum);

const uint8_t *params_bytes = static_cast<const uint8_t *>(pVect1v) + dimension;
const float min_val = load_unaligned<float>(params_bytes + sq8::MIN_VAL * sizeof(float));
const float delta = load_unaligned<float>(params_bytes + sq8::DELTA * sizeof(float));
const uint8_t *query_meta_bytes =
reinterpret_cast<const uint8_t *>(static_cast<const float16 *>(pVect2v) + dimension);
const float y_sum = load_unaligned<float>(query_meta_bytes + sq8::SUM_QUERY * sizeof(float));

return min_val * y_sum + delta * quantized_dot;
}

template <bool partial_chunk, unsigned char additional_steps>
float SQ8_FP16_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
return 1.0f - SQ8_FP16_InnerProductSIMD_SVE_IMP<partial_chunk, additional_steps>(
pVect1v, pVect2v, dimension);
}

template <bool partial_chunk, unsigned char additional_steps>
float SQ8_FP16_CosineSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
return SQ8_FP16_InnerProductSIMD_SVE<partial_chunk, additional_steps>(pVect1v, pVect2v,
dimension);
}
Loading