From 166ddcae6077beee449886ff63ab1da03a0c86cc Mon Sep 17 00:00:00 2001 From: meiravgri <109056284+meiravgri@users.noreply.github.com> Date: Wed, 15 Oct 2025 18:37:37 +0300 Subject: [PATCH 1/2] [0.8] Replace Valgrind with address-sanitizer [MOD-10409] (#803) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Replace Valgrind with address-sanitizer [MOD-10409] (#734) * add asan using augment - wip (code with svs cannot compile with llvm clang currently) * rename run-valgrind to asan * fix buffer overflow in test * fix bad testSizeEstimation in int8 and uint8 + cleanup * fix cmake install for macos * ran sanitizer as part of every flow + remove from non latest OSs + only run debian11 with gcc11 (alignment with search) * revert 18 from clang format * fix syntax errors * * rename dataSize -> storedDataSize *refine distinction between storedDataSize and inputBlobSize: inputBlobSize not necessarly dim *sizeof(type) * vecsimAbstract: move data members to private when possible and not too complicated * remove effective_input_size check from VecSimIndexAbstract::preprocessQuery: That's logically wrong. The previous logic incorrectly used force_copy to determine blob size: - force_copy only indicates whether to copy the blob for memory management - It doesn't determine the actual size of the input blob In tiered indexes, when the backend creates a batch iterator, it receives the query blob from the frontend's batch iterator (already preprocessed). This preprocessed blob has the processed size (e.g., with cosine norm appended), not the original dim * sizeof(type) size. The old logic accidentally worked when inputBlobSize == dim * sizeof(type) because using dataSize (processed size) happened to match the preprocessed blob size from the frontend. However, this was a coincidence that broke when inputBlobSize was updated to reflect actual input blob sizes. Now preprocessQuery correctly uses inputBlobSize, which represents the actual size of the input blob being processed. BruteForce Factory Changes: • Templated NewAbstractInitParams to be reusable across different algorithm parameter types All factories changes: storedDataSize: Size after preprocessing (e.g., with cosine norm appended) inputBlobSize: Size of input blobs the index expects to receive (depends on whether vectors are pre-normalized) • Added inputBlobSize calculation based on normalization status: uses storedDataSize if normalized, otherwise dim * sizeof(type) *svs tests testSizeEstimation: using constexpr to eliminate compiler warnings about potential division by zero when quantBits could be 0 (VecSimSvsQuant_NONE) * fix merge * another fix * have leak * run first san * build with debug * upload san srtifacts * fix condition * fix path * clean up * run wuth san macos * real leak * fix pah * fix security issue * remove san from event pr run cov with san run mac with san * trigger again * unit test flow name * add space * split sanitizer in intel machine * fix cpu info * fix alpine pre deps * add to needs * unify codecov and sanitizer again * disable flow temp add cov and sanitize to nightly * revert unnecessary changes * remove bin before building sanitizer in codecov to use clanmg * install clang * install clang 18 * instsll clang 18 * remove tools * install llvm on cov * fix deps --------- Co-authored-by: meiravgri (cherry picked from commit 2fcc6690aa465b35855c11d199e182f34b483b46) * fix debian and nightly * remove slack notify on nightly * change cov machine * revert cov changes - no sanitizer * no pre deps * remove --------- Co-authored-by: alonre24 (cherry picked from commit 44df9db329c10e8fe6abb1db5a9285ece6e53ba0) --- .github/workflows/arm.yml | 1 - .github/workflows/debian11.yml | 4 +- .github/workflows/event-merge-to-queue.yml | 4 +- .github/workflows/event-nightly.yml | 32 ++++-------- .github/workflows/event-pull_request.yml | 8 ++- .github/workflows/macos.yml | 6 ++- .github/workflows/mariner2.yml | 1 - .github/workflows/task-unit-test.yml | 30 ++++++----- .install/ubuntu_22.04.sh | 2 +- Makefile | 59 ++++++++-------------- cmake/san.cmake | 20 ++------ 11 files changed, 67 insertions(+), 100 deletions(-) diff --git a/.github/workflows/arm.yml b/.github/workflows/arm.yml index e1fad3af1..1362b57b2 100644 --- a/.github/workflows/arm.yml +++ b/.github/workflows/arm.yml @@ -34,7 +34,6 @@ jobs: uses: ./.github/workflows/task-unit-test.yml with: env: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner - run-valgrind: false # run the job without valgrind stop-runner: name: Stop self-hosted EC2 runner diff --git a/.github/workflows/debian11.yml b/.github/workflows/debian11.yml index b4cdaea7f..50d8d3826 100644 --- a/.github/workflows/debian11.yml +++ b/.github/workflows/debian11.yml @@ -1,9 +1,9 @@ name: debian bullseye flow -on: [workflow_dispatch] +on: [workflow_dispatch, workflow_call] jobs: bullseye: uses: ./.github/workflows/task-unit-test.yml with: - container: debian:bullseye + container: gcc:11-bullseye diff --git a/.github/workflows/event-merge-to-queue.yml b/.github/workflows/event-merge-to-queue.yml index d54e9920f..eb99c280c 100644 --- a/.github/workflows/event-merge-to-queue.yml +++ b/.github/workflows/event-merge-to-queue.yml @@ -29,9 +29,7 @@ jobs: # with: # container: ubuntu:xenial bullseye: - uses: ./.github/workflows/task-unit-test.yml - with: - container: debian:bullseye + uses: ./.github/workflows/debian11.yml # amazonlinux2: # needs: [check-if-docs-only] # if: ${{ needs.check-if-docs-only.outputs.only-docs-changed == 'false' }} diff --git a/.github/workflows/event-nightly.yml b/.github/workflows/event-nightly.yml index 9a378c441..97ff189e0 100644 --- a/.github/workflows/event-nightly.yml +++ b/.github/workflows/event-nightly.yml @@ -16,48 +16,38 @@ jobs: uses: ./.github/workflows/task-unit-test.yml with: container: ubuntu:jammy - run-valgrind: false focal: uses: ./.github/workflows/task-unit-test.yml with: container: ubuntu:focal - run-valgrind: false # bionic: # uses: ./.github/workflows/task-unit-test.yml # with: # container: ubuntu:bionic -# run-valgrind: false -# xenial: -# uses: ./.github/workflows/task-unit-test.yml -# with: -# container: ubuntu:xenial -# run-valgrind: false bullseye: - uses: ./.github/workflows/task-unit-test.yml - with: - container: debian:bullseye - run-valgrind: false -# centos7: -# uses: ./.github/workflows/task-unit-test.yml -# with: -# container: centos:7 -# run-valgrind: false -# amazonlinux2: -# uses: ./.github/workflows/amazon2.yml + uses: ./.github/workflows/debian11.yml + # amazonlinux2: + # uses: ./.github/workflows/amazon2.yml mariner2: uses: ./.github/workflows/mariner2.yml rocky8: uses: ./.github/workflows/task-unit-test.yml with: container: rockylinux:8 - run-valgrind: false rocky9: uses: ./.github/workflows/task-unit-test.yml with: container: rockylinux:9 - run-valgrind: false macos: uses: ./.github/workflows/macos.yml arm: uses: ./.github/workflows/arm.yml secrets: inherit + coverage: + uses: ./.github/workflows/coverage.yml + secrets: inherit + sanitizer: + uses: ./.github/workflows/task-unit-test.yml + with: + san: address + secrets: inherit diff --git a/.github/workflows/event-pull_request.yml b/.github/workflows/event-pull_request.yml index 190520812..c39eb7f69 100644 --- a/.github/workflows/event-pull_request.yml +++ b/.github/workflows/event-pull_request.yml @@ -30,8 +30,13 @@ jobs: - name: flow tests run: make flow_test + sanitizer: + uses: ./.github/workflows/task-unit-test.yml + with: + san: address + secrets: inherit + coverage: - needs: [basic-tests] if: ${{ !github.event.pull_request.draft}} uses: ./.github/workflows/coverage.yml secrets: inherit @@ -54,6 +59,7 @@ jobs: pr-validation: needs: - basic-tests + - sanitizer - coverage - codeql-analysis - spellcheck diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml index 64ab0224e..ec93149aa 100644 --- a/.github/workflows/macos.yml +++ b/.github/workflows/macos.yml @@ -7,4 +7,8 @@ jobs: uses: ./.github/workflows/task-unit-test.yml with: env: macos-latest - run-valgrind: false + macos-san: + uses: ./.github/workflows/task-unit-test.yml + with: + env: macos-latest + san: address diff --git a/.github/workflows/mariner2.yml b/.github/workflows/mariner2.yml index 109c8c178..4bc582596 100644 --- a/.github/workflows/mariner2.yml +++ b/.github/workflows/mariner2.yml @@ -8,4 +8,3 @@ jobs: with: container: mcr.microsoft.com/cbl-mariner/base/core:2.0 pre-checkout-script: tdnf install -y --noplugins --skipsignature tar gzip - run-valgrind: false # TODO: enable valgrind? (requires to install valgrind) diff --git a/.github/workflows/task-unit-test.yml b/.github/workflows/task-unit-test.yml index 49ccaa295..b2a011e0e 100644 --- a/.github/workflows/task-unit-test.yml +++ b/.github/workflows/task-unit-test.yml @@ -13,14 +13,15 @@ on: pre-checkout-script: description: 'Script to run before checkout' type: string - run-valgrind: - description: 'Run valgrind tests' - type: boolean - default: true + san: + type: string + +env: + SAN_VALUE: ${{ inputs.san }} jobs: test: - name: Test ${{ inputs.container && format('{0} (on {1})', inputs.container, inputs.env) || inputs.env }} + name: Test ${{ inputs.container && format('{0} (on {1})', inputs.container, inputs.env) || inputs.env }}${{ inputs.san && ' sanitizer' || '' }} runs-on: ${{ inputs.env }} container: ${{ inputs.container || null }} defaults: @@ -47,13 +48,16 @@ jobs: echo "name=$NAME" >> $GITHUB_OUTPUT - name: unit tests - run: make unit_test - - name: valgrind - if: ${{ inputs.run-valgrind }} - run: make valgrind - - name: Archive valgrind tests reports - if: ${{ inputs.run-valgrind && failure() }} + run: make unit_test SAN="$SAN_VALUE" + - name: Set test path + if: ${{ inputs.san == 'address' && failure() }} + id: tests-artifact-path + run: | + FULL_VARIANT=$(uname)-$(uname -m)-debug-asan + echo "path=bin/${FULL_VARIANT}/unit_tests/Testing/Temporary/" >> $GITHUB_OUTPUT + - name: Archive san tests reports + if: ${{ inputs.san == 'address' && failure() }} uses: actions/upload-artifact@v4 with: - name: valgrind tests reports on ${{ steps.artifact-name.outputs.name }} - path: bin/Linux-x86_64-debug/unit_tests/Testing/Temporary/ + name: san tests reports on ${{ steps.artifact-name.outputs.name }} + path: ${{ steps.tests-artifact-path.outputs.path }} diff --git a/.install/ubuntu_22.04.sh b/.install/ubuntu_22.04.sh index 399708c1c..0a5f1af96 100755 --- a/.install/ubuntu_22.04.sh +++ b/.install/ubuntu_22.04.sh @@ -4,5 +4,5 @@ export DEBIAN_FRONTEND=noninteractive MODE=$1 # whether to install using sudo or not $MODE apt-get update -qq -$MODE apt-get install -yqq git wget build-essential valgrind lcov +$MODE apt-get install -yqq git wget build-essential lcov source install_cmake.sh $MODE diff --git a/Makefile b/Makefile index 55c39d647..2b2727baf 100644 --- a/Makefile +++ b/Makefile @@ -8,14 +8,6 @@ ifneq ($(filter coverage show-cov upload-cov,$(MAKECMDGOALS)),) COV=1 endif -ifneq ($(VG),) -VALGRIND=$(VG) -endif - -ifeq ($(VALGRIND),1) -override DEBUG ?= 1 -endif - ifeq ($(COV),1) override DEBUG ?= 1 CMAKE_COV += -DUSE_COVERAGE=ON @@ -26,30 +18,18 @@ CMAKE_TESTS += -DVECSIM_BUILD_TESTS=off endif ifneq ($(SAN),) -override DEBUG ?= 1 -export ASAN_OPTIONS=detect_odr_violation=0:allocator_may_return_null=1 -export MSAN_OPTIONS=allocator_may_return_null=1 -ifeq ($(SAN),mem) -override SAN=memory -else ifeq ($(SAN),addr) +ifeq ($(SAN),address) override SAN=address -endif - -ifeq ($(SAN),memory) -CMAKE_SAN=-DUSE_MSAN=ON -override CTEST_ARGS += --exclude-regex BruteForceTest.sanity_rinsert_1280 - -else ifeq ($(SAN),address) +DEBUG=1 +export ASAN_OPTIONS=detect_odr_violation=0:allocator_may_return_null=1 CMAKE_SAN=-DUSE_ASAN=ON -else ifeq ($(SAN),leak) -else ifeq ($(SAN),thread) else -$(error SAN=mem|addr|leak|thread) +$(error SAN=address is currently the only supported option) endif export SAN -endif # SAN +endif # SAN != '' ROOT=. export ROOT @@ -60,18 +40,18 @@ make build DEBUG=1 # build debug variant COV=1 # build for code coverage VERBOSE=1 # print detailed build info - VG|VALGRIND=1 # build for Valgrind - SAN=type # build with LLVM sanitizer (type=address|memory|leak|thread) - SLOW=1 # don't run build in parallel (for diagnostics) - PROFILE=1 # enable profiling compile flags (and debug symbols) for release type. + SAN=address # build with AddressSanitizer (clang) + SLOW=1 # don't run build in parallel (for diagnostics) + PROFILE=1 # enable profiling compile flags (and debug symbols) for release type. make pybind # build Python bindings make clean # remove binary files ALL=1 # remove binary directories make unit_test # run unit tests CTEST_ARGS=args # extra CTest arguments - VG|VALGRIND=1 # run tests with valgrind -make valgrind # build for Valgrind and run tests + SAN=address # run tests with AddressSanitizer + FP_64=1 # run tests with 64-bit floating point +make asan # build with AddressSanitizer and run unit tests make flow_test # run flow tests (with pytest) TEST=file::name # run specific test make mod_test # run Redis module intergration tests (with RLTest) @@ -94,6 +74,11 @@ FLAVOR=debug else FLAVOR=release endif + +ifeq ($(SAN),address) +FLAVOR := ${FLAVOR}-asan +endif + FULL_VARIANT:=$(shell uname)-$(shell uname -m)-$(FLAVOR) BINROOT=$(ROOT)/bin/$(FULL_VARIANT) BINDIR=$(BINROOT) @@ -166,11 +151,7 @@ ifeq ($(VERBOSE),1) _CTEST_ARGS += -V endif -ifeq ($(VALGRIND),1) -_CTEST_ARGS += \ - -T memcheck \ - --overwrite MemoryCheckCommandOptions="--leak-check=full --error-exitcode=255" -endif +# AddressSanitizer is handled via SAN=address in cmake/san.cmake unit_test: $(SHOW)mkdir -p $(BINDIR) @@ -178,10 +159,10 @@ unit_test: @make --no-print-directory -C $(BINDIR) $(MAKE_J) $(SHOW)cd $(TESTDIR) && GTEST_COLOR=1 ctest $(_CTEST_ARGS) -valgrind: - $(SHOW)$(MAKE) VG=1 unit_test +asan: + $(SHOW)$(MAKE) SAN=address unit_test -.PHONY: unit_test valgrind +.PHONY: unit_test asan #---------------------------------------------------------------------------------------------- diff --git a/cmake/san.cmake b/cmake/san.cmake index 4e4aa049b..9415be8d7 100644 --- a/cmake/san.cmake +++ b/cmake/san.cmake @@ -1,23 +1,9 @@ option(USE_ASAN "Use AddressSanitizer (clang)" OFF) -option(USE_MSAN "Use MemorySanitizer (clang)" OFF) -if (USE_ASAN OR USE_MSAN) +if (USE_ASAN) # define this before project() find_file(CMAKE_C_COMPILER "clang") find_file(CMAKE_CXX_COMPILER "clang++") set(CMAKE_LINKER "${CMAKE_C_COMPILER}") + set(CLANG_SAN_FLAGS "-fno-omit-frame-pointer -fsanitize=address -fsized-deallocation") - if (USE_ASAN) - set(CLANG_SAN_FLAGS "-fno-omit-frame-pointer -fsanitize=address") - - elseif (USE_MSAN) - set(CLANG_SAN_FLAGS "-fno-omit-frame-pointer -fsanitize=memory -fsanitize-memory-track-origins=2") - if (NOT MSAN_PREFIX) - set(MSAN_PREFIX "/opt/llvm-project/build-msan") - if (NOT EXISTS ${MSAN_PREFIX}) - message(FATAL_ERROR "LLVM/MSAN stdlibc++ directory '${MSAN_PREFIX}' is missing") - endif() - endif() - set(LLVM_CXX_FLAGS "-stdlib=libc++ -I${MSAN_PREFIX}/include -I${MSAN_PREFIX}/include/c++/v1") - set(LLVM_LD_FLAGS "-stdlib=libc++ -Wl,-rpath=${MSAN_PREFIX}/lib -L${MSAN_PREFIX}/lib -lc++abi") - endif() -endif() \ No newline at end of file +endif() From 5fcf6ee7f78b85c962d30d4da94b9f8ba1dd2abd Mon Sep 17 00:00:00 2001 From: meiravgri <109056284+meiravgri@users.noreply.github.com> Date: Thu, 16 Oct 2025 15:11:23 +0300 Subject: [PATCH 2/2] [MOD-11881] Fix HNSW undefined behavior in container access (#806) fix invalid reads --- src/VecSim/algorithms/hnsw/hnsw.h | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/src/VecSim/algorithms/hnsw/hnsw.h b/src/VecSim/algorithms/hnsw/hnsw.h index 6098fc99d..a6418c550 100644 --- a/src/VecSim/algorithms/hnsw/hnsw.h +++ b/src/VecSim/algorithms/hnsw/hnsw.h @@ -389,7 +389,8 @@ void HNSWIndex::removeExtraLinks( size_t removed_idx = 0; size_t link_idx = 0; - while (orig_candidates.size() > 0) { + // candidates <= orig_candidates + while (candidates.size() > 0) { if (orig_candidates.top().second != candidates.top().second) { if (neighbors_bitmap[orig_candidates.top().second]) { removed_links[removed_idx++] = orig_candidates.top().second; @@ -401,6 +402,15 @@ void HNSWIndex::removeExtraLinks( orig_candidates.pop(); } } + + assert(candidates.empty() && "candidates should be empty"); + // Handle remaining elements in orig_candidates that were rejected by heuristic + while (orig_candidates.size() > 0) { + if (neighbors_bitmap[orig_candidates.top().second]) { + removed_links[removed_idx++] = orig_candidates.top().second; + } + orig_candidates.pop(); + } setListCount(node_ll, link_idx); *removed_links_num = removed_idx; } @@ -466,7 +476,10 @@ DistType HNSWIndex::processCandidate( } // Pre-fetch the neighbours list of the top candidate (the one that is going // to be processed in the next iteration) into memory cache, to improve performance. - __builtin_prefetch(get_linklist_at_level(candidate_set.top().second, layer)); + if (!candidate_set.empty()) { + assert(links_num); + __builtin_prefetch(get_linklist_at_level(candidate_set.top().second, layer)); + } return lowerBound; } @@ -515,7 +528,10 @@ void HNSWIndex::processCandidate_RangeSearch( } // Pre-fetch the neighbours list of the top candidate (the one that is going // to be processed in the next iteration) into memory cache, to improve performance. - __builtin_prefetch(get_linklist_at_level(candidate_set.top().second, layer)); + if (!candidate_set.empty()) { + assert(links_num); + __builtin_prefetch(get_linklist_at_level(candidate_set.top().second, layer)); + } } template