From 16068f59a72fd3e91fdcd7300f404c1adf2895df Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Mon, 6 Oct 2025 15:12:25 +0200 Subject: [PATCH 01/19] First attempt for compression in serialization (not tested and to be cleaned up) --- CMakeLists.txt | 11 ++- cmake/Findlz4.cmake | 94 +++++++++++++++++++++ cmake/external_dependencies.cmake | 26 +++++- conanfile.py | 2 + environment-dev.yml | 2 + include/sparrow_ipc/compression.hpp | 23 +++++ include/sparrow_ipc/serialize.hpp | 4 +- include/sparrow_ipc/serialize_utils.hpp | 6 +- src/compression.cpp | 98 ++++++++++++++++++++++ src/deserialize.cpp | 3 +- src/serialize_utils.cpp | 2 + tests/test_de_serialization_with_files.cpp | 61 ++++++++++++++ tests/test_serialize_utils.cpp | 2 +- 13 files changed, 324 insertions(+), 10 deletions(-) create mode 100644 cmake/Findlz4.cmake create mode 100644 include/sparrow_ipc/compression.hpp create mode 100644 src/compression.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 02d2f1e..9781f32 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,6 +18,8 @@ set(SPARROW_IPC_COMPILE_DEFINITIONS "" CACHE STRING "List of public compile defi set(SPARROW_IPC_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include) set(SPARROW_IPC_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src) +# set(ENV{PKG_CONFIG_PATH} "$ENV{CONDA_PREFIX}/lib/pkgconfig:$ENV{PKG_CONFIG_PATH}") + # Linter options # ============= OPTION(ACTIVATE_LINTER "Create targets to run clang-format" OFF) @@ -108,11 +110,11 @@ set(SPARROW_IPC_HEADERS ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/chunk_memory_serializer.hpp ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/config/config.hpp ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/config/sparrow_ipc_version.hpp + ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/compression.hpp ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/deserialize_fixedsizebinary_array.hpp ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/deserialize_primitive_array.hpp ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/deserialize_utils.hpp ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/deserialize_variable_size_binary_array.hpp - ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/deserialize_variable_size_binary_array.hpp ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/deserialize.hpp ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/encapsulated_message.hpp ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/flatbuffer_utils.hpp @@ -132,6 +134,7 @@ set(SPARROW_IPC_SRC ${SPARROW_IPC_SOURCE_DIR}/arrow_interface/arrow_schema.cpp ${SPARROW_IPC_SOURCE_DIR}/arrow_interface/arrow_schema/private_data.cpp ${SPARROW_IPC_SOURCE_DIR}/chunk_memory_serializer.cpp + ${SPARROW_IPC_SOURCE_DIR}/compression.cpp ${SPARROW_IPC_SOURCE_DIR}/deserialize_fixedsizebinary_array.cpp ${SPARROW_IPC_SOURCE_DIR}/deserialize_utils.cpp ${SPARROW_IPC_SOURCE_DIR}/deserialize.cpp @@ -253,8 +256,14 @@ target_link_libraries(sparrow-ipc PUBLIC sparrow::sparrow flatbuffers::flatbuffers + PRIVATE + lz4::lz4 ) +target_include_directories(sparrow-ipc PRIVATE ${LZ4_INCLUDE_DIRS}) +target_link_libraries(sparrow-ipc PRIVATE ${LZ4_LIBRARIES}) + + # Ensure generated headers are available when building sparrow-ipc add_dependencies(sparrow-ipc generate_flatbuffers_headers) diff --git a/cmake/Findlz4.cmake b/cmake/Findlz4.cmake new file mode 100644 index 0000000..e3d3e3b --- /dev/null +++ b/cmake/Findlz4.cmake @@ -0,0 +1,94 @@ +# Find LZ4 library and headers + +# This module defines: +# LZ4_FOUND - True if lz4 is found +# LZ4_INCLUDE_DIRS - LZ4 include directories +# LZ4_LIBRARIES - Libraries needed to use LZ4 +# LZ4_VERSION - LZ4 version number +# + +find_package(PkgConfig) +if(PKG_CONFIG_FOUND) + pkg_check_modules(LZ4 QUIET liblz4) + if(NOT LZ4_FOUND) + message(STATUS "Did not find 'liblz4.pc', trying 'lz4.pc'") + pkg_check_modules(LZ4 QUIET lz4) + endif() +endif() + +find_path(LZ4_INCLUDE_DIR lz4.h) +# HINTS ${LZ4_INCLUDEDIR} ${LZ4_INCLUDE_DIRS}) +find_library(LZ4_LIBRARY NAMES lz4 liblz4) +# HINTS ${LZ4_LIBDIR} ${LZ4_LIBRARY_DIRS}) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(lz4 DEFAULT_MSG + LZ4_LIBRARY LZ4_INCLUDE_DIR) +mark_as_advanced(LZ4_INCLUDE_DIR LZ4_LIBRARY) + +set(LZ4_LIBRARIES ${LZ4_LIBRARY}) +set(LZ4_INCLUDE_DIRS ${LZ4_INCLUDE_DIR}) + +if(LZ4_FOUND AND NOT TARGET lz4::lz4) + add_library(lz4::lz4 UNKNOWN IMPORTED) + set_target_properties(lz4::lz4 PROPERTIES + IMPORTED_LOCATION "${LZ4_LIBRARIES}" + INTERFACE_INCLUDE_DIRECTORIES "${LZ4_INCLUDE_DIRS}") + if (NOT TARGET LZ4::LZ4 AND TARGET lz4::lz4) + add_library(LZ4::LZ4 ALIAS lz4::lz4) + endif () +endif() + +#TODO add version? +########### + +# find_path(LZ4_INCLUDE_DIR +# NAMES lz4.h +# DOC "lz4 include directory") +# mark_as_advanced(LZ4_INCLUDE_DIR) +# find_library(LZ4_LIBRARY +# NAMES lz4 liblz4 +# DOC "lz4 library") +# mark_as_advanced(LZ4_LIBRARY) +# +# if (LZ4_INCLUDE_DIR) +# file(STRINGS "${LZ4_INCLUDE_DIR}/lz4.h" _lz4_version_lines +# REGEX "#define[ \t]+LZ4_VERSION_(MAJOR|MINOR|RELEASE)") +# string(REGEX REPLACE ".*LZ4_VERSION_MAJOR *\([0-9]*\).*" "\\1" _lz4_version_major "${_lz4_version_lines}") +# string(REGEX REPLACE ".*LZ4_VERSION_MINOR *\([0-9]*\).*" "\\1" _lz4_version_minor "${_lz4_version_lines}") +# string(REGEX REPLACE ".*LZ4_VERSION_RELEASE *\([0-9]*\).*" "\\1" _lz4_version_release "${_lz4_version_lines}") +# set(LZ4_VERSION "${_lz4_version_major}.${_lz4_version_minor}.${_lz4_version_release}") +# unset(_lz4_version_major) +# unset(_lz4_version_minor) +# unset(_lz4_version_release) +# unset(_lz4_version_lines) +# endif () +# +# include(FindPackageHandleStandardArgs) +# find_package_handle_standard_args(LZ4 +# REQUIRED_VARS LZ4_LIBRARY LZ4_INCLUDE_DIR +# VERSION_VAR LZ4_VERSION) +# +# if (LZ4_FOUND) +# # `lz4_FOUND` needs to be defined because: +# # - Other dependencies also resolve LZ4 using `find_package(lz4 ...)` +# # - CMake's syntax is case-sensitive +# # +# # See: +# # - https://github.com/facebook/rocksdb/blob/0836a2b26dfbbe30c15e8cebf47771917d55e760/cmake/RocksDBConfig.cmake.in#L36 +# # - https://github.com/facebook/rocksdb/blob/0836a2b26dfbbe30c15e8cebf47771917d55e760/cmake/modules/Findlz4.cmake#L17 +# # - https://github.com/man-group/ArcticDB/pull/961 +# set(lz4_FOUND TRUE) +# set(LZ4_INCLUDE_DIRS "${LZ4_INCLUDE_DIR}") +# set(LZ4_LIBRARIES "${LZ4_LIBRARY}") +# +# if (NOT TARGET LZ4::LZ4) +# add_library(LZ4::LZ4 UNKNOWN IMPORTED) +# set_target_properties(LZ4::LZ4 PROPERTIES +# IMPORTED_LOCATION "${LZ4_LIBRARY}" +# INTERFACE_INCLUDE_DIRECTORIES "${LZ4_INCLUDE_DIR}") +# if (NOT TARGET lz4::lz4 AND TARGET LZ4::LZ4) +# add_library(lz4::lz4 ALIAS LZ4::LZ4) +# endif () +# endif () +# endif () diff --git a/cmake/external_dependencies.cmake b/cmake/external_dependencies.cmake index a59bf05..ee10c15 100644 --- a/cmake/external_dependencies.cmake +++ b/cmake/external_dependencies.cmake @@ -93,6 +93,19 @@ if(NOT TARGET flatbuffers::flatbuffers) endif() unset(FLATBUFFERS_BUILD_TESTS CACHE) +############################### +#TODO need to add fetch for zstd +# find_package_or_fetch( +# PACKAGE_NAME lz4 +# GIT_REPOSITORY https://github.com/lz4/lz4.git +# TAG v1.10.0 +# ) +find_package(lz4 REQUIRED) +# if(NOT TARGET lz4::lz4) +# add_library(lz4::lz4 ALIAS lz4) +# endif() + +############################### if(SPARROW_IPC_BUILD_TESTS) find_package_or_fetch( PACKAGE_NAME doctest @@ -123,10 +136,18 @@ if(SPARROW_IPC_BUILD_TESTS) ) message(STATUS "\t✅ Fetched arrow-testing") - # Iterate over all the files in the arrow-testing-data source directiory. When it's a gz, extract in place. - file(GLOB_RECURSE arrow_testing_data_targz_files CONFIGURE_DEPENDS + # Fetch all the files in the cpp-21.0.0 directory + file(GLOB_RECURSE arrow_testing_data_targz_files_cpp_21 CONFIGURE_DEPENDS "${arrow-testing_SOURCE_DIR}/data/arrow-ipc-stream/integration/cpp-21.0.0/*.json.gz" ) + # Fetch all the files in the 2.0.0-compression directory + file(GLOB_RECURSE arrow_testing_data_targz_files_compression CONFIGURE_DEPENDS + "${arrow-testing_SOURCE_DIR}/data/arrow-ipc-stream/integration/2.0.0-compression/*.json.gz" + ) + + # Combine lists of files + list(APPEND arrow_testing_data_targz_files ${arrow_testing_data_targz_files_cpp_21} ${arrow_testing_data_targz_files_compression}) + # Iterate over all the files in the arrow-testing-data source directory. When it's a gz, extract in place. foreach(file_path IN LISTS arrow_testing_data_targz_files) cmake_path(GET file_path PARENT_PATH parent_dir) cmake_path(GET file_path STEM filename) @@ -142,5 +163,4 @@ if(SPARROW_IPC_BUILD_TESTS) endif() endif() endforeach() - endif() diff --git a/conanfile.py b/conanfile.py index 59916f8..1783a32 100644 --- a/conanfile.py +++ b/conanfile.py @@ -45,6 +45,8 @@ def configure(self): def requirements(self): self.requires("sparrow/1.0.0") self.requires(f"flatbuffers/{self._flatbuffers_version}") + self.requires("lz4/1.9.4") + self.requires("zstd/1.5.5") if self.options.get_safe("build_tests"): self.test_requires("doctest/2.4.12") diff --git a/environment-dev.yml b/environment-dev.yml index 2d71c4d..17971e8 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -11,6 +11,8 @@ dependencies: - nlohmann_json - sparrow-devel - sparrow-json-reader + - lz4 + #- zstd - doctest # Documentation dependencies - doxygen diff --git a/include/sparrow_ipc/compression.hpp b/include/sparrow_ipc/compression.hpp new file mode 100644 index 0000000..9bf51e7 --- /dev/null +++ b/include/sparrow_ipc/compression.hpp @@ -0,0 +1,23 @@ +#pragma once + +#include +#include +#include + +#include "Message_generated.h" + +namespace sparrow_ipc +{ +// enum class CompressionType // TODO class ? enum cf. mamba? +// { +// NONE, +// LZ4, +// ZSTD +// }; + +// CompressionType to_compression_type(org::apache::arrow::flatbuf::CompressionType compression_type); + + std::vector compress(org::apache::arrow::flatbuf::CompressionType compression_type, std::span data); +// std::vector decompress(CompressionType type, std::span data); + +} diff --git a/include/sparrow_ipc/serialize.hpp b/include/sparrow_ipc/serialize.hpp index 4a18e57..67da9af 100644 --- a/include/sparrow_ipc/serialize.hpp +++ b/include/sparrow_ipc/serialize.hpp @@ -35,7 +35,7 @@ namespace sparrow_ipc */ template requires std::same_as, sparrow::record_batch> - void serialize_record_batches_to_ipc_stream(const R& record_batches, any_output_stream& stream) + void serialize_record_batches_to_ipc_stream(const R& record_batches, any_output_stream& stream, std::optional compression) { if (record_batches.empty()) { @@ -51,7 +51,7 @@ namespace sparrow_ipc serialize_schema_message(record_batches[0], stream); for (const auto& rb : record_batches) { - serialize_record_batch(rb, stream); + serialize_record_batch(rb, stream, compression); } stream.write(end_of_stream); } diff --git a/include/sparrow_ipc/serialize_utils.hpp b/include/sparrow_ipc/serialize_utils.hpp index ae881a5..2ec9dc2 100644 --- a/include/sparrow_ipc/serialize_utils.hpp +++ b/include/sparrow_ipc/serialize_utils.hpp @@ -9,6 +9,7 @@ #include "sparrow_ipc/any_output_stream.hpp" #include "sparrow_ipc/config/config.hpp" #include "sparrow_ipc/utils.hpp" +#include "compression.hpp" namespace sparrow_ipc { @@ -39,10 +40,11 @@ namespace sparrow_ipc * consists of a metadata section followed by a body section containing the actual data. * * @param record_batch The sparrow record batch to be serialized + * TODO add parameter compression here and every place it was added to * @param stream The output stream where the serialized record batch will be written */ SPARROW_IPC_API void - serialize_record_batch(const sparrow::record_batch& record_batch, any_output_stream& stream); + serialize_record_batch(const sparrow::record_batch& record_batch, any_output_stream& stream, std::optional compression); /** * @brief Calculates the total serialized size of a schema message. @@ -115,6 +117,8 @@ namespace sparrow_ipc return total_size; } + void fill_body_and_get_buffers_compressed(const sparrow::arrow_proxy& arrow_proxy, std::vector& body, std::vector& flatbuf_buffers, int64_t& offset, org::apache::arrow::flatbuf::CompressionType compression_type); + /** * @brief Fills the body vector with serialized data from an arrow proxy and its children. * diff --git a/src/compression.cpp b/src/compression.cpp new file mode 100644 index 0000000..41fdb89 --- /dev/null +++ b/src/compression.cpp @@ -0,0 +1,98 @@ +#include +#include +#include +#include + +#include "sparrow_ipc/compression.hpp" + +namespace sparrow_ipc +{ + // TODO not sure we need this unless if we need it to hide flatbuffers dependency +// CompressionType to_compression_type(org::apache::arrow::flatbuf::CompressionType compression_type) +// { +// switch (compression_type) +// { +// case org::apache::arrow::flatbuf::CompressionType::LZ4_FRAME: +// return CompressionType::LZ4; +// // case org::apache::arrow::flatbuf::CompressionType::ZSTD: +// // // TODO: Add ZSTD support +// // break; +// default: +// return CompressionType::NONE; +// } +// } + + std::vector compress(org::apache::arrow::flatbuf::CompressionType compression_type, std::span data) + { + if (data.empty()) + { + return {}; + } + switch (compression_type) + { + case org::apache::arrow::flatbuf::CompressionType::LZ4_FRAME: + { + int64_t uncompressed_size = data.size(); + const size_t max_compressed_size = LZ4F_compressFrameBound(uncompressed_size, nullptr); + std::vector compressed_data(max_compressed_size); + const size_t compressed_size = LZ4F_compressFrame(compressed_data.data(), max_compressed_size, data.data(), uncompressed_size, nullptr); + if (LZ4F_isError(compressed_size)) + { + throw std::runtime_error("Failed to compress data with LZ4 frame format"); + } + compressed_data.resize(compressed_size); + return compressed_data; + } +// case CompressionType::NONE: + default: + return {data.begin(), data.end()}; + } + } + +// std::vector decompress(CompressionType type, std::span data) +// { +// switch (type) +// { +// case CompressionType::LZ4: +// { +// if (data.empty()) +// { +// return {}; +// } +// if (data.size() < sizeof(int64_t)) +// { +// throw std::runtime_error("Invalid LZ4 compressed data: missing uncompressed size"); +// } +// const int64_t uncompressed_size = *reinterpret_cast(data.data()); +// if (uncompressed_size == -1) +// { +// return {data.begin() + sizeof(uncompressed_size), data.end()}; +// } +// +// std::vector decompressed_data(uncompressed_size); +// LZ4F_dctx *dctx; +// if (LZ4F_isError(LZ4F_createDecompressionContext(&dctx, LZ4F_VERSION))) +// { +// throw std::runtime_error("Failed to create LZ4 decompression context"); +// } +// +// size_t decompressed_size = uncompressed_size; +// size_t src_size = data.size() - sizeof(uncompressed_size); +// const size_t result = LZ4F_decompress(dctx, decompressed_data.data(), &decompressed_size, data.data() + sizeof(uncompressed_size), &src_size, nullptr); +// +// LZ4F_freeDecompressionContext(dctx); +// +// if (LZ4F_isError(result) || decompressed_size != (size_t)uncompressed_size) +// { +// throw std::runtime_error("Failed to decompress data with LZ4 frame format"); +// } +// +// return decompressed_data; +// } +// case CompressionType::NONE: +// default: +// return {data.begin(), data.end()}; +// } +// } + +} diff --git a/src/deserialize.cpp b/src/deserialize.cpp index 5779ca9..55f863d 100644 --- a/src/deserialize.cpp +++ b/src/deserialize.cpp @@ -49,7 +49,6 @@ namespace sparrow_ipc const std::vector>>& field_metadata ) { - const size_t length = static_cast(record_batch.length()); size_t buffer_index = 0; std::vector arrays; @@ -277,4 +276,4 @@ namespace sparrow_ipc } while (!data.empty()); return record_batches; } -} \ No newline at end of file +} diff --git a/src/serialize_utils.cpp b/src/serialize_utils.cpp index 8545927..545f17d 100644 --- a/src/serialize_utils.cpp +++ b/src/serialize_utils.cpp @@ -1,3 +1,5 @@ +#include + #include "sparrow_ipc/flatbuffer_utils.hpp" #include "sparrow_ipc/magic_values.hpp" #include "sparrow_ipc/serialize.hpp" diff --git a/tests/test_de_serialization_with_files.cpp b/tests/test_de_serialization_with_files.cpp index 3bcb79c..8bf23a4 100644 --- a/tests/test_de_serialization_with_files.cpp +++ b/tests/test_de_serialization_with_files.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include @@ -22,6 +23,9 @@ const std::filesystem::path arrow_testing_data_dir = ARROW_TESTING_DATA_DIR; const std::filesystem::path tests_resources_files_path = arrow_testing_data_dir / "data" / "arrow-ipc-stream" / "integration" / "cpp-21.0.0"; +const std::filesystem::path tests_resources_files_path_with_compression = arrow_testing_data_dir / "data" / "arrow-ipc-stream" + / "integration" / "2.0.0-compression"; + const std::vector files_paths_to_test = { tests_resources_files_path / "generated_primitive", // tests_resources_files_path / "generated_primitive_large_offsets", @@ -29,6 +33,14 @@ const std::vector files_paths_to_test = { // tests_resources_files_path / "generated_primitive_no_batches" }; +const std::vector files_paths_to_test_with_compression = { + tests_resources_files_path_with_compression / "generated_lz4" +// tests_resources_files_path_with_compression/ "generated_uncompressible_lz4" +// tests_resources_files_path_with_compression / "generated_zstd" +// tests_resources_files_path_with_compression/ "generated_uncompressible_zstd" +}; + + size_t get_number_of_batches(const std::filesystem::path& json_path) { std::ifstream json_file(json_path); @@ -174,4 +186,53 @@ TEST_SUITE("Integration tests") } } } + + TEST_CASE("Serialization with LZ4 compression") + { + for (const auto& file_path : files_paths_to_test_with_compression) + { + std::filesystem::path json_path = file_path; + json_path.replace_extension(".json"); + const std::string test_name = "Testing LZ4 compression with " + file_path.filename().string(); + SUBCASE(test_name.c_str()) + { + // Load the JSON file + auto json_data = load_json_file(json_path); + CHECK(json_data != nullptr); + + const size_t num_batches = get_number_of_batches(json_path); + std::vector record_batches_from_json; + for (size_t batch_idx = 0; batch_idx < num_batches; ++batch_idx) + { + INFO("Processing batch " << batch_idx << " of " << num_batches); + record_batches_from_json.emplace_back( + sparrow::json_reader::build_record_batch_from_json(json_data, batch_idx) + ); + } + + // Load stream file + std::filesystem::path stream_file_path = file_path; + stream_file_path.replace_extension(".stream"); + std::ifstream stream_file(stream_file_path, std::ios::in | std::ios::binary); + REQUIRE(stream_file.is_open()); + const std::vector stream_data( + (std::istreambuf_iterator(stream_file)), + (std::istreambuf_iterator()) + ); + stream_file.close(); + + // Process the stream file + const auto record_batches_from_stream = sparrow_ipc::deserialize_stream( + std::span(stream_data) + ); + + const auto serialized_data = sparrow_ipc::serialize(record_batches_from_json, std::nullopt); +// const auto deserialized_serialized_data = sparrow_ipc::deserialize_stream( +// std::span(serialized_data) +// ); +// compare_record_batches(record_batches_from_stream, deserialized_serialized_data); + } + + } + } } diff --git a/tests/test_serialize_utils.cpp b/tests/test_serialize_utils.cpp index ea4011e..6d08be6 100644 --- a/tests/test_serialize_utils.cpp +++ b/tests/test_serialize_utils.cpp @@ -284,4 +284,4 @@ namespace sparrow_ipc } } } -} \ No newline at end of file +} From 4efa40c6a9c37a7de83f287c5bc167ecec510942 Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Mon, 6 Oct 2025 16:11:50 +0200 Subject: [PATCH 02/19] Clean up and fetch --- cmake/Findlz4.cmake | 52 --------------------- cmake/external_dependencies.cmake | 14 +++--- include/sparrow_ipc/compression.hpp | 7 ++- include/sparrow_ipc/serialize.hpp | 2 + include/sparrow_ipc/serialize_utils.hpp | 2 +- src/compression.cpp | 60 +++---------------------- 6 files changed, 16 insertions(+), 121 deletions(-) diff --git a/cmake/Findlz4.cmake b/cmake/Findlz4.cmake index e3d3e3b..2b9e9c0 100644 --- a/cmake/Findlz4.cmake +++ b/cmake/Findlz4.cmake @@ -40,55 +40,3 @@ if(LZ4_FOUND AND NOT TARGET lz4::lz4) endif() #TODO add version? -########### - -# find_path(LZ4_INCLUDE_DIR -# NAMES lz4.h -# DOC "lz4 include directory") -# mark_as_advanced(LZ4_INCLUDE_DIR) -# find_library(LZ4_LIBRARY -# NAMES lz4 liblz4 -# DOC "lz4 library") -# mark_as_advanced(LZ4_LIBRARY) -# -# if (LZ4_INCLUDE_DIR) -# file(STRINGS "${LZ4_INCLUDE_DIR}/lz4.h" _lz4_version_lines -# REGEX "#define[ \t]+LZ4_VERSION_(MAJOR|MINOR|RELEASE)") -# string(REGEX REPLACE ".*LZ4_VERSION_MAJOR *\([0-9]*\).*" "\\1" _lz4_version_major "${_lz4_version_lines}") -# string(REGEX REPLACE ".*LZ4_VERSION_MINOR *\([0-9]*\).*" "\\1" _lz4_version_minor "${_lz4_version_lines}") -# string(REGEX REPLACE ".*LZ4_VERSION_RELEASE *\([0-9]*\).*" "\\1" _lz4_version_release "${_lz4_version_lines}") -# set(LZ4_VERSION "${_lz4_version_major}.${_lz4_version_minor}.${_lz4_version_release}") -# unset(_lz4_version_major) -# unset(_lz4_version_minor) -# unset(_lz4_version_release) -# unset(_lz4_version_lines) -# endif () -# -# include(FindPackageHandleStandardArgs) -# find_package_handle_standard_args(LZ4 -# REQUIRED_VARS LZ4_LIBRARY LZ4_INCLUDE_DIR -# VERSION_VAR LZ4_VERSION) -# -# if (LZ4_FOUND) -# # `lz4_FOUND` needs to be defined because: -# # - Other dependencies also resolve LZ4 using `find_package(lz4 ...)` -# # - CMake's syntax is case-sensitive -# # -# # See: -# # - https://github.com/facebook/rocksdb/blob/0836a2b26dfbbe30c15e8cebf47771917d55e760/cmake/RocksDBConfig.cmake.in#L36 -# # - https://github.com/facebook/rocksdb/blob/0836a2b26dfbbe30c15e8cebf47771917d55e760/cmake/modules/Findlz4.cmake#L17 -# # - https://github.com/man-group/ArcticDB/pull/961 -# set(lz4_FOUND TRUE) -# set(LZ4_INCLUDE_DIRS "${LZ4_INCLUDE_DIR}") -# set(LZ4_LIBRARIES "${LZ4_LIBRARY}") -# -# if (NOT TARGET LZ4::LZ4) -# add_library(LZ4::LZ4 UNKNOWN IMPORTED) -# set_target_properties(LZ4::LZ4 PROPERTIES -# IMPORTED_LOCATION "${LZ4_LIBRARY}" -# INTERFACE_INCLUDE_DIRECTORIES "${LZ4_INCLUDE_DIR}") -# if (NOT TARGET lz4::lz4 AND TARGET LZ4::LZ4) -# add_library(lz4::lz4 ALIAS LZ4::LZ4) -# endif () -# endif () -# endif () diff --git a/cmake/external_dependencies.cmake b/cmake/external_dependencies.cmake index ee10c15..fc34099 100644 --- a/cmake/external_dependencies.cmake +++ b/cmake/external_dependencies.cmake @@ -93,19 +93,15 @@ if(NOT TARGET flatbuffers::flatbuffers) endif() unset(FLATBUFFERS_BUILD_TESTS CACHE) -############################### -#TODO need to add fetch for zstd -# find_package_or_fetch( -# PACKAGE_NAME lz4 -# GIT_REPOSITORY https://github.com/lz4/lz4.git -# TAG v1.10.0 -# ) -find_package(lz4 REQUIRED) +find_package_or_fetch( + PACKAGE_NAME lz4 + GIT_REPOSITORY https://github.com/lz4/lz4.git + TAG v1.10.0 +) # if(NOT TARGET lz4::lz4) # add_library(lz4::lz4 ALIAS lz4) # endif() -############################### if(SPARROW_IPC_BUILD_TESTS) find_package_or_fetch( PACKAGE_NAME doctest diff --git a/include/sparrow_ipc/compression.hpp b/include/sparrow_ipc/compression.hpp index 9bf51e7..14403a7 100644 --- a/include/sparrow_ipc/compression.hpp +++ b/include/sparrow_ipc/compression.hpp @@ -8,7 +8,8 @@ namespace sparrow_ipc { -// enum class CompressionType // TODO class ? enum cf. mamba? +// TODO use these later if needed for wrapping purposes (flatbuffers/lz4) +// enum class CompressionType // { // NONE, // LZ4, @@ -17,7 +18,5 @@ namespace sparrow_ipc // CompressionType to_compression_type(org::apache::arrow::flatbuf::CompressionType compression_type); - std::vector compress(org::apache::arrow::flatbuf::CompressionType compression_type, std::span data); -// std::vector decompress(CompressionType type, std::span data); - + std::vector compress(org::apache::arrow::flatbuf::CompressionType compression_type, std::span data); } diff --git a/include/sparrow_ipc/serialize.hpp b/include/sparrow_ipc/serialize.hpp index 67da9af..9c16122 100644 --- a/include/sparrow_ipc/serialize.hpp +++ b/include/sparrow_ipc/serialize.hpp @@ -25,6 +25,8 @@ namespace sparrow_ipc * @tparam R Container type that holds record batches (must support empty(), operator[], begin(), end()) * @param record_batches Collection of record batches to serialize. All batches must have identical * schemas. + * @param compression The compression type to use when serializing + * * @param stream The output stream where the serialized data will be written. * * @throws std::invalid_argument If record batches have inconsistent schemas or if the collection diff --git a/include/sparrow_ipc/serialize_utils.hpp b/include/sparrow_ipc/serialize_utils.hpp index 2ec9dc2..110a393 100644 --- a/include/sparrow_ipc/serialize_utils.hpp +++ b/include/sparrow_ipc/serialize_utils.hpp @@ -40,7 +40,7 @@ namespace sparrow_ipc * consists of a metadata section followed by a body section containing the actual data. * * @param record_batch The sparrow record batch to be serialized - * TODO add parameter compression here and every place it was added to + * @param compression The compression type to use when serializing * @param stream The output stream where the serialized record batch will be written */ SPARROW_IPC_API void diff --git a/src/compression.cpp b/src/compression.cpp index 41fdb89..964d304 100644 --- a/src/compression.cpp +++ b/src/compression.cpp @@ -1,13 +1,11 @@ -#include -#include #include -#include + +#include #include "sparrow_ipc/compression.hpp" namespace sparrow_ipc { - // TODO not sure we need this unless if we need it to hide flatbuffers dependency // CompressionType to_compression_type(org::apache::arrow::flatbuf::CompressionType compression_type) // { // switch (compression_type) @@ -22,7 +20,7 @@ namespace sparrow_ipc // } // } - std::vector compress(org::apache::arrow::flatbuf::CompressionType compression_type, std::span data) + std::vector compress(org::apache::arrow::flatbuf::CompressionType compression_type, std::span data) { if (data.empty()) { @@ -32,9 +30,9 @@ namespace sparrow_ipc { case org::apache::arrow::flatbuf::CompressionType::LZ4_FRAME: { - int64_t uncompressed_size = data.size(); + std::int64_t uncompressed_size = data.size(); const size_t max_compressed_size = LZ4F_compressFrameBound(uncompressed_size, nullptr); - std::vector compressed_data(max_compressed_size); + std::vector compressed_data(max_compressed_size); const size_t compressed_size = LZ4F_compressFrame(compressed_data.data(), max_compressed_size, data.data(), uncompressed_size, nullptr); if (LZ4F_isError(compressed_size)) { @@ -43,56 +41,8 @@ namespace sparrow_ipc compressed_data.resize(compressed_size); return compressed_data; } -// case CompressionType::NONE: default: return {data.begin(), data.end()}; } } - -// std::vector decompress(CompressionType type, std::span data) -// { -// switch (type) -// { -// case CompressionType::LZ4: -// { -// if (data.empty()) -// { -// return {}; -// } -// if (data.size() < sizeof(int64_t)) -// { -// throw std::runtime_error("Invalid LZ4 compressed data: missing uncompressed size"); -// } -// const int64_t uncompressed_size = *reinterpret_cast(data.data()); -// if (uncompressed_size == -1) -// { -// return {data.begin() + sizeof(uncompressed_size), data.end()}; -// } -// -// std::vector decompressed_data(uncompressed_size); -// LZ4F_dctx *dctx; -// if (LZ4F_isError(LZ4F_createDecompressionContext(&dctx, LZ4F_VERSION))) -// { -// throw std::runtime_error("Failed to create LZ4 decompression context"); -// } -// -// size_t decompressed_size = uncompressed_size; -// size_t src_size = data.size() - sizeof(uncompressed_size); -// const size_t result = LZ4F_decompress(dctx, decompressed_data.data(), &decompressed_size, data.data() + sizeof(uncompressed_size), &src_size, nullptr); -// -// LZ4F_freeDecompressionContext(dctx); -// -// if (LZ4F_isError(result) || decompressed_size != (size_t)uncompressed_size) -// { -// throw std::runtime_error("Failed to decompress data with LZ4 frame format"); -// } -// -// return decompressed_data; -// } -// case CompressionType::NONE: -// default: -// return {data.begin(), data.end()}; -// } -// } - } From 5ecfe01721f0972a00d1cf49af4191e5ad222bdb Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Mon, 6 Oct 2025 17:38:00 +0200 Subject: [PATCH 03/19] More cleanup --- CMakeLists.txt | 3 --- conanfile.py | 2 +- environment-dev.yml | 2 +- include/sparrow_ipc/serialize_utils.hpp | 2 +- tests/test_de_serialization_with_files.cpp | 1 - 5 files changed, 3 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9781f32..7a51e72 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,8 +18,6 @@ set(SPARROW_IPC_COMPILE_DEFINITIONS "" CACHE STRING "List of public compile defi set(SPARROW_IPC_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include) set(SPARROW_IPC_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src) -# set(ENV{PKG_CONFIG_PATH} "$ENV{CONDA_PREFIX}/lib/pkgconfig:$ENV{PKG_CONFIG_PATH}") - # Linter options # ============= OPTION(ACTIVATE_LINTER "Create targets to run clang-format" OFF) @@ -263,7 +261,6 @@ target_link_libraries(sparrow-ipc target_include_directories(sparrow-ipc PRIVATE ${LZ4_INCLUDE_DIRS}) target_link_libraries(sparrow-ipc PRIVATE ${LZ4_LIBRARIES}) - # Ensure generated headers are available when building sparrow-ipc add_dependencies(sparrow-ipc generate_flatbuffers_headers) diff --git a/conanfile.py b/conanfile.py index 1783a32..e2f251a 100644 --- a/conanfile.py +++ b/conanfile.py @@ -46,7 +46,7 @@ def requirements(self): self.requires("sparrow/1.0.0") self.requires(f"flatbuffers/{self._flatbuffers_version}") self.requires("lz4/1.9.4") - self.requires("zstd/1.5.5") + #self.requires("zstd/1.5.5") if self.options.get_safe("build_tests"): self.test_requires("doctest/2.4.12") diff --git a/environment-dev.yml b/environment-dev.yml index 17971e8..3d21831 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -12,7 +12,7 @@ dependencies: - sparrow-devel - sparrow-json-reader - lz4 - #- zstd + # Testing dependencies - doctest # Documentation dependencies - doxygen diff --git a/include/sparrow_ipc/serialize_utils.hpp b/include/sparrow_ipc/serialize_utils.hpp index 110a393..3c825b4 100644 --- a/include/sparrow_ipc/serialize_utils.hpp +++ b/include/sparrow_ipc/serialize_utils.hpp @@ -8,8 +8,8 @@ #include "Message_generated.h" #include "sparrow_ipc/any_output_stream.hpp" #include "sparrow_ipc/config/config.hpp" +#include "sparrow_ipc/compression.hpp" #include "sparrow_ipc/utils.hpp" -#include "compression.hpp" namespace sparrow_ipc { diff --git a/tests/test_de_serialization_with_files.cpp b/tests/test_de_serialization_with_files.cpp index 8bf23a4..140c545 100644 --- a/tests/test_de_serialization_with_files.cpp +++ b/tests/test_de_serialization_with_files.cpp @@ -3,7 +3,6 @@ #include #include #include -#include #include From 9be57adfad02f925030890a4ab8f544fecd7eee8 Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Tue, 7 Oct 2025 10:17:31 +0200 Subject: [PATCH 04/19] Fix fetching lz4 --- CMakeLists.txt | 3 --- cmake/external_dependencies.cmake | 33 +++++++++++++++++++++++++------ 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7a51e72..bee1ff8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -258,9 +258,6 @@ target_link_libraries(sparrow-ipc lz4::lz4 ) -target_include_directories(sparrow-ipc PRIVATE ${LZ4_INCLUDE_DIRS}) -target_link_libraries(sparrow-ipc PRIVATE ${LZ4_LIBRARIES}) - # Ensure generated headers are available when building sparrow-ipc add_dependencies(sparrow-ipc generate_flatbuffers_headers) diff --git a/cmake/external_dependencies.cmake b/cmake/external_dependencies.cmake index fc34099..897b48f 100644 --- a/cmake/external_dependencies.cmake +++ b/cmake/external_dependencies.cmake @@ -11,8 +11,8 @@ endif() function(find_package_or_fetch) set(options) - set(oneValueArgs CONAN_PKG_NAME PACKAGE_NAME GIT_REPOSITORY TAG) - set(multiValueArgs) + set(oneValueArgs CONAN_PKG_NAME PACKAGE_NAME GIT_REPOSITORY TAG SOURCE_SUBDIR) + set(multiValueArgs CMAKE_ARGS) cmake_parse_arguments(PARSE_ARGV 0 arg "${options}" "${oneValueArgs}" "${multiValueArgs}" ) @@ -30,7 +30,14 @@ function(find_package_or_fetch) if(FETCH_DEPENDENCIES_WITH_CMAKE STREQUAL "ON" OR FETCH_DEPENDENCIES_WITH_CMAKE STREQUAL "MISSING") if(NOT ${actual_pkg_name}_FOUND) message(STATUS "📦 Fetching ${arg_PACKAGE_NAME}") - FetchContent_Declare( + # Apply CMAKE_ARGS before fetching + foreach(cmake_arg ${arg_CMAKE_ARGS}) + string(REGEX MATCH "^([^=]+)=(.*)$" _ ${cmake_arg}) + if(CMAKE_MATCH_1) + set(${CMAKE_MATCH_1} ${CMAKE_MATCH_2} CACHE BOOL "" FORCE) + endif() + endforeach() + set(fetch_args ${arg_PACKAGE_NAME} GIT_SHALLOW TRUE GIT_REPOSITORY ${arg_GIT_REPOSITORY} @@ -38,6 +45,10 @@ function(find_package_or_fetch) GIT_PROGRESS TRUE SYSTEM EXCLUDE_FROM_ALL) + if(arg_SOURCE_SUBDIR) + list(APPEND fetch_args SOURCE_SUBDIR ${arg_SOURCE_SUBDIR}) + endif() + FetchContent_Declare(${fetch_args}) FetchContent_MakeAvailable(${arg_PACKAGE_NAME}) message(STATUS "\t✅ Fetched ${arg_PACKAGE_NAME}") else() @@ -93,14 +104,24 @@ if(NOT TARGET flatbuffers::flatbuffers) endif() unset(FLATBUFFERS_BUILD_TESTS CACHE) +# Fetching lz4 +# Disable bundled mode to allow shared libraries if needed +# lz4 is built as static by default if bundled +# set(LZ4_BUNDLED_MODE OFF CACHE BOOL "" FORCE) +# set(BUILD_SHARED_LIBS ON CACHE BOOL "" FORCE) find_package_or_fetch( PACKAGE_NAME lz4 GIT_REPOSITORY https://github.com/lz4/lz4.git TAG v1.10.0 + SOURCE_SUBDIR build/cmake + CMAKE_ARGS + "LZ4_BUILD_CLI=OFF" + "LZ4_BUILD_LEGACY_LZ4C=OFF" ) -# if(NOT TARGET lz4::lz4) -# add_library(lz4::lz4 ALIAS lz4) -# endif() + +if(NOT TARGET lz4::lz4) + add_library(lz4::lz4 ALIAS lz4) +endif() if(SPARROW_IPC_BUILD_TESTS) find_package_or_fetch( From da7b9f6ec962108f4c98a21c1543cd74b6cf1411 Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Wed, 8 Oct 2025 13:53:23 +0200 Subject: [PATCH 05/19] Add lz4 targets to be exported --- CMakeLists.txt | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index bee1ff8..5769f6b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -321,6 +321,25 @@ if (TARGET flatbuffers) endif() endif() +if (TARGET lz4) + get_target_property(is_imported lz4 IMPORTED) + if(NOT is_imported) + # This means `lz4` was fetched using FetchContent + # We need to export `lz4` target explicitly + list(APPEND SPARROW_IPC_EXPORTED_TARGETS lz4) + endif() +endif() + +if (TARGET lz4_static) + get_target_property(is_imported lz4_static IMPORTED) + if(NOT is_imported) + # `lz4_static` is needed as this is the actual library + # and `lz4` is an interface pointing to it. + # If `lz4_shared` is used instead for some reason, modify this accordingly + list(APPEND SPARROW_IPC_EXPORTED_TARGETS lz4_static) + endif() +endif() + install(TARGETS ${SPARROW_IPC_EXPORTED_TARGETS} EXPORT ${PROJECT_NAME}-targets) From e52ccf294f4fd997eb90a84bddad7ed055df013b Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Fri, 10 Oct 2025 11:50:21 +0200 Subject: [PATCH 06/19] Add lz4 compression in deserialization --- include/sparrow_ipc/compression.hpp | 1 + .../deserialize_primitive_array.hpp | 81 ++++++++++++-- ...deserialize_variable_size_binary_array.hpp | 103 +++++++++++++++--- src/compression.cpp | 40 +++++++ tests/test_de_serialization_with_files.cpp | 22 ++-- 5 files changed, 207 insertions(+), 40 deletions(-) diff --git a/include/sparrow_ipc/compression.hpp b/include/sparrow_ipc/compression.hpp index 14403a7..b21a8b1 100644 --- a/include/sparrow_ipc/compression.hpp +++ b/include/sparrow_ipc/compression.hpp @@ -19,4 +19,5 @@ namespace sparrow_ipc // CompressionType to_compression_type(org::apache::arrow::flatbuf::CompressionType compression_type); std::vector compress(org::apache::arrow::flatbuf::CompressionType compression_type, std::span data); + std::vector decompress(org::apache::arrow::flatbuf::CompressionType compression_type, std::span data); } diff --git a/include/sparrow_ipc/deserialize_primitive_array.hpp b/include/sparrow_ipc/deserialize_primitive_array.hpp index a1c5dad..13ab4b9 100644 --- a/include/sparrow_ipc/deserialize_primitive_array.hpp +++ b/include/sparrow_ipc/deserialize_primitive_array.hpp @@ -9,10 +9,30 @@ #include "Message_generated.h" #include "sparrow_ipc/arrow_interface/arrow_array.hpp" #include "sparrow_ipc/arrow_interface/arrow_schema.hpp" +#include "sparrow_ipc/compression.hpp" #include "sparrow_ipc/deserialize_utils.hpp" namespace sparrow_ipc { + namespace + { + struct DecompressedBuffers + { + std::vector validity_buffer; + std::vector data_buffer; + }; + + void release_decompressed_buffers(ArrowArray* array) + { + if (array->private_data) + { + delete static_cast(array->private_data); + array->private_data = nullptr; + } + array->release = nullptr; + } + } + template [[nodiscard]] sparrow::primitive_array deserialize_non_owning_primitive_array( const org::apache::arrow::flatbuf::RecordBatch& record_batch, @@ -22,6 +42,46 @@ namespace sparrow_ipc size_t& buffer_index ) { + const auto compression = record_batch.compression(); + DecompressedBuffers* decompressed_buffers_owner = nullptr; + + // Validity buffer + const auto validity_buffer_metadata = record_batch.buffers()->Get(buffer_index++); + auto validity_buffer_span = body.subspan(validity_buffer_metadata->offset(), validity_buffer_metadata->length()); + if (compression) + { + if (!decompressed_buffers_owner) + { + decompressed_buffers_owner = new DecompressedBuffers(); + } + decompressed_buffers_owner->validity_buffer = decompress(compression->codec(), validity_buffer_span); + validity_buffer_span = decompressed_buffers_owner->validity_buffer; + } + auto bitmap_ptr = const_cast(validity_buffer_span.data()); + const sparrow::dynamic_bitset_view bitmap_view{ + bitmap_ptr, + static_cast(record_batch.length())}; + auto null_count = bitmap_view.null_count(); + if (validity_buffer_metadata->length() == 0) + { + bitmap_ptr = nullptr; + null_count = 0; + } + + // Data buffer + const auto primitive_buffer_metadata = record_batch.buffers()->Get(buffer_index++); + auto data_buffer_span = body.subspan(primitive_buffer_metadata->offset(), primitive_buffer_metadata->length()); + if (compression) + { + if (!decompressed_buffers_owner) + { + decompressed_buffers_owner = new DecompressedBuffers(); + } + decompressed_buffers_owner->data_buffer = decompress(compression->codec(), data_buffer_span); + data_buffer_span = decompressed_buffers_owner->data_buffer; + } + auto primitives_ptr = const_cast(data_buffer_span.data()); + const std::string_view format = data_type_to_format( sparrow::detail::get_data_type_from_array>::get() ); @@ -34,17 +94,7 @@ namespace sparrow_ipc nullptr, nullptr ); - const auto [bitmap_ptr, null_count] = utils::get_bitmap_pointer_and_null_count( - record_batch, - body, - buffer_index++ - ); - const auto primitive_buffer_metadata = record_batch.buffers()->Get(buffer_index++); - if (body.size() < (primitive_buffer_metadata->offset() + primitive_buffer_metadata->length())) - { - throw std::runtime_error("Primitive buffer exceeds body size"); - } - auto primitives_ptr = const_cast(body.data() + primitive_buffer_metadata->offset()); + std::vector buffers = {bitmap_ptr, primitives_ptr}; ArrowArray array = make_non_owning_arrow_array( record_batch.length(), @@ -55,7 +105,14 @@ namespace sparrow_ipc nullptr, nullptr ); + + if (decompressed_buffers_owner) + { + array.private_data = decompressed_buffers_owner; + array.release = release_decompressed_buffers; + } + sparrow::arrow_proxy ap{std::move(array), std::move(schema)}; return sparrow::primitive_array{std::move(ap)}; } -} \ No newline at end of file +} diff --git a/include/sparrow_ipc/deserialize_variable_size_binary_array.hpp b/include/sparrow_ipc/deserialize_variable_size_binary_array.hpp index f6a5729..36c99f9 100644 --- a/include/sparrow_ipc/deserialize_variable_size_binary_array.hpp +++ b/include/sparrow_ipc/deserialize_variable_size_binary_array.hpp @@ -8,10 +8,32 @@ #include "Message_generated.h" #include "sparrow_ipc/arrow_interface/arrow_array.hpp" #include "sparrow_ipc/arrow_interface/arrow_schema.hpp" +#include "sparrow_ipc/compression.hpp" #include "sparrow_ipc/deserialize_utils.hpp" namespace sparrow_ipc { + // TODO after handling deserialize_primitive_array, do the same here and then in other data types + namespace + { + struct DecompressedBinaryBuffers + { + std::vector validity_buffer; + std::vector offset_buffer; + std::vector data_buffer; + }; + + void release_decompressed_binary_buffers(ArrowArray* array) + { + if (array->private_data) + { + delete static_cast(array->private_data); + array->private_data = nullptr; + } + array->release = nullptr; + } + } + template [[nodiscard]] T deserialize_non_owning_variable_size_binary( const org::apache::arrow::flatbuf::RecordBatch& record_batch, @@ -21,6 +43,61 @@ namespace sparrow_ipc size_t& buffer_index ) { + const auto compression = record_batch.compression(); + DecompressedBinaryBuffers* decompressed_buffers_owner = nullptr; + + // Validity buffer + const auto validity_buffer_metadata = record_batch.buffers()->Get(buffer_index++); + auto validity_buffer_span = body.subspan(validity_buffer_metadata->offset(), validity_buffer_metadata->length()); + if (compression && validity_buffer_metadata->length() > 0) + { + if (!decompressed_buffers_owner) + { + decompressed_buffers_owner = new DecompressedBinaryBuffers(); + } + decompressed_buffers_owner->validity_buffer = decompress(compression->codec(), validity_buffer_span); + validity_buffer_span = decompressed_buffers_owner->validity_buffer; + } + + uint8_t* bitmap_ptr = nullptr; + int64_t null_count = 0; + if (validity_buffer_metadata->length() > 0) + { + bitmap_ptr = const_cast(validity_buffer_span.data()); + const sparrow::dynamic_bitset_view bitmap_view{ + bitmap_ptr, + static_cast(record_batch.length())}; + null_count = bitmap_view.null_count(); + } + + // Offset buffer + const auto offset_metadata = record_batch.buffers()->Get(buffer_index++); + auto offset_buffer_span = body.subspan(offset_metadata->offset(), offset_metadata->length()); + if (compression) + { + if (!decompressed_buffers_owner) + { + decompressed_buffers_owner = new DecompressedBinaryBuffers(); + } + decompressed_buffers_owner->offset_buffer = decompress(compression->codec(), offset_buffer_span); + offset_buffer_span = decompressed_buffers_owner->offset_buffer; + } + auto offset_ptr = const_cast(offset_buffer_span.data()); + + // Data buffer + const auto buffer_metadata = record_batch.buffers()->Get(buffer_index++); + auto data_buffer_span = body.subspan(buffer_metadata->offset(), buffer_metadata->length()); + if (compression) + { + if (!decompressed_buffers_owner) + { + decompressed_buffers_owner = new DecompressedBinaryBuffers(); + } + decompressed_buffers_owner->data_buffer = decompress(compression->codec(), data_buffer_span); + data_buffer_span = decompressed_buffers_owner->data_buffer; + } + auto buffer_ptr = const_cast(data_buffer_span.data()); + const std::string_view format = data_type_to_format(sparrow::detail::get_data_type_from_array::get()); ArrowSchema schema = make_non_owning_arrow_schema( format, @@ -31,24 +108,7 @@ namespace sparrow_ipc nullptr, nullptr ); - const auto [bitmap_ptr, null_count] = utils::get_bitmap_pointer_and_null_count( - record_batch, - body, - buffer_index++ - ); - const auto offset_metadata = record_batch.buffers()->Get(buffer_index++); - if ((offset_metadata->offset() + offset_metadata->length()) > body.size()) - { - throw std::runtime_error("Offset buffer exceeds body size"); - } - auto offset_ptr = const_cast(body.data() + offset_metadata->offset()); - const auto buffer_metadata = record_batch.buffers()->Get(buffer_index++); - if ((buffer_metadata->offset() + buffer_metadata->length()) > body.size()) - { - throw std::runtime_error("Data buffer exceeds body size"); - } - auto buffer_ptr = const_cast(body.data() + buffer_metadata->offset()); std::vector buffers = {bitmap_ptr, offset_ptr, buffer_ptr}; ArrowArray array = make_non_owning_arrow_array( record_batch.length(), @@ -59,7 +119,14 @@ namespace sparrow_ipc nullptr, nullptr ); + + if (decompressed_buffers_owner) + { + array.private_data = decompressed_buffers_owner; + array.release = release_decompressed_binary_buffers; + } + sparrow::arrow_proxy ap{std::move(array), std::move(schema)}; return T{std::move(ap)}; } -} \ No newline at end of file +} diff --git a/src/compression.cpp b/src/compression.cpp index 964d304..59d6d56 100644 --- a/src/compression.cpp +++ b/src/compression.cpp @@ -45,4 +45,44 @@ namespace sparrow_ipc return {data.begin(), data.end()}; } } + + std::vector decompress(org::apache::arrow::flatbuf::CompressionType compression_type, std::span data) + { + if (data.empty()) + { + return {}; + } + switch (compression_type) + { + case org::apache::arrow::flatbuf::CompressionType::LZ4_FRAME: + { + if (data.size() < 8) + { + throw std::runtime_error("Invalid compressed data: missing decompressed size"); + } + const std::int64_t decompressed_size = *reinterpret_cast(data.data()); + const auto compressed_data = data.subspan(8); + + if (decompressed_size == -1) + { + return {compressed_data.begin(), compressed_data.end()}; + } + + std::vector decompressed_data(decompressed_size); + LZ4F_dctx* dctx = nullptr; + LZ4F_createDecompressionContext(&dctx, LZ4F_VERSION); + size_t compressed_size_in_out = compressed_data.size(); + size_t decompressed_size_in_out = decompressed_size; + size_t result = LZ4F_decompress(dctx, decompressed_data.data(), &decompressed_size_in_out, compressed_data.data(), &compressed_size_in_out, nullptr); + if (LZ4F_isError(result)) + { + throw std::runtime_error("Failed to decompress data with LZ4 frame format"); + } + LZ4F_freeDecompressionContext(dctx); + return decompressed_data; + } + default: + return {data.begin(), data.end()}; + } + } } diff --git a/tests/test_de_serialization_with_files.cpp b/tests/test_de_serialization_with_files.cpp index 140c545..d46f7bf 100644 --- a/tests/test_de_serialization_with_files.cpp +++ b/tests/test_de_serialization_with_files.cpp @@ -33,8 +33,8 @@ const std::vector files_paths_to_test = { }; const std::vector files_paths_to_test_with_compression = { - tests_resources_files_path_with_compression / "generated_lz4" -// tests_resources_files_path_with_compression/ "generated_uncompressible_lz4" + tests_resources_files_path_with_compression / "generated_lz4", + tests_resources_files_path_with_compression/ "generated_uncompressible_lz4" // tests_resources_files_path_with_compression / "generated_zstd" // tests_resources_files_path_with_compression/ "generated_uncompressible_zstd" }; @@ -67,10 +67,12 @@ void compare_record_batches( ) { REQUIRE_EQ(record_batches_1.size(), record_batches_2.size()); +// std::cout << "record_batches1 size: " << record_batches_1.size() << " record_batches2 size: " << record_batches_2.size() << std::endl; for (size_t i = 0; i < record_batches_1.size(); ++i) { for (size_t y = 0; y < record_batches_1[i].nb_columns(); y++) { +// std::cout << "record_batches1 nb cols: " << record_batches_1[i].nb_columns() << " record_batches2 nb cols: " << record_batches_2[i].nb_columns() << std::endl; const auto& column_1 = record_batches_1[i].get_column(y); const auto& column_2 = record_batches_2[i].get_column(y); REQUIRE_EQ(column_1.size(), column_2.size()); @@ -79,9 +81,11 @@ void compare_record_batches( { const auto col_name = column_1.name().value_or("NA"); INFO("Comparing batch " << i << ", column " << y << " named :" << col_name << " , row " << z); +// std::cout << "Comparing batch " << i << ", column " << y << " named :" << col_name << " , row " << z << std::endl; REQUIRE_EQ(column_1.data_type(), column_2.data_type()); const auto& column_1_value = column_1[z]; const auto& column_2_value = column_2[z]; +// std::cout << "column_1_value :" << column_1_value << " and " << column_2_value << std::endl; CHECK_EQ(column_1_value, column_2_value); } } @@ -186,7 +190,7 @@ TEST_SUITE("Integration tests") } } - TEST_CASE("Serialization with LZ4 compression") + TEST_CASE("Compare record_batch serialization with stream file using LZ4 compression") { for (const auto& file_path : files_paths_to_test_with_compression) { @@ -224,14 +228,12 @@ TEST_SUITE("Integration tests") const auto record_batches_from_stream = sparrow_ipc::deserialize_stream( std::span(stream_data) ); - - const auto serialized_data = sparrow_ipc::serialize(record_batches_from_json, std::nullopt); -// const auto deserialized_serialized_data = sparrow_ipc::deserialize_stream( -// std::span(serialized_data) -// ); -// compare_record_batches(record_batches_from_stream, deserialized_serialized_data); + const auto serialized_data = sparrow_ipc::serialize(record_batches_from_json, org::apache::arrow::flatbuf::CompressionType::LZ4_FRAME); + const auto deserialized_serialized_data = sparrow_ipc::deserialize_stream( + std::span(serialized_data) + ); + compare_record_batches(record_batches_from_stream, deserialized_serialized_data); } - } } } From 4d52af9e13d8b5abca5f297d865e59bb7b1311d0 Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Mon, 13 Oct 2025 11:46:07 +0200 Subject: [PATCH 07/19] Add owning_arrow_array_private_data --- CMakeLists.txt | 1 + .../arrow_interface/arrow_array.hpp | 11 ++- .../arrow_array/private_data.hpp | 30 ++++++++ .../deserialize_primitive_array.hpp | 76 ++++++++----------- src/arrow_interface/arrow_array.cpp | 58 ++++++++++++++ 5 files changed, 129 insertions(+), 47 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5769f6b..5bbc612 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -133,6 +133,7 @@ set(SPARROW_IPC_SRC ${SPARROW_IPC_SOURCE_DIR}/arrow_interface/arrow_schema/private_data.cpp ${SPARROW_IPC_SOURCE_DIR}/chunk_memory_serializer.cpp ${SPARROW_IPC_SOURCE_DIR}/compression.cpp + #${SPARROW_IPC_SOURCE_DIR}/decompressed_buffers.cpp ${SPARROW_IPC_SOURCE_DIR}/deserialize_fixedsizebinary_array.cpp ${SPARROW_IPC_SOURCE_DIR}/deserialize_utils.cpp ${SPARROW_IPC_SOURCE_DIR}/deserialize.cpp diff --git a/include/sparrow_ipc/arrow_interface/arrow_array.hpp b/include/sparrow_ipc/arrow_interface/arrow_array.hpp index 2f1f72d..9348c44 100644 --- a/include/sparrow_ipc/arrow_interface/arrow_array.hpp +++ b/include/sparrow_ipc/arrow_interface/arrow_array.hpp @@ -1,4 +1,3 @@ - #pragma once #include @@ -9,6 +8,16 @@ namespace sparrow_ipc { + [[nodiscard]] SPARROW_IPC_API ArrowArray make_owning_arrow_array( + int64_t length, + int64_t null_count, + int64_t offset, + std::vector>&& buffers, + size_t children_count, + ArrowArray** children, + ArrowArray* dictionary + ); + [[nodiscard]] SPARROW_IPC_API ArrowArray make_non_owning_arrow_array( int64_t length, int64_t null_count, diff --git a/include/sparrow_ipc/arrow_interface/arrow_array/private_data.hpp b/include/sparrow_ipc/arrow_interface/arrow_array/private_data.hpp index 90e633f..265d9e2 100644 --- a/include/sparrow_ipc/arrow_interface/arrow_array/private_data.hpp +++ b/include/sparrow_ipc/arrow_interface/arrow_array/private_data.hpp @@ -7,6 +7,36 @@ namespace sparrow_ipc { + class owning_arrow_array_private_data + { + public: + + explicit owning_arrow_array_private_data(std::vector>&& buffers) + : m_buffers(std::move(buffers)) + { + m_buffer_pointers.reserve(m_buffers.size()); + for (const auto& buffer : m_buffers) + { + m_buffer_pointers.push_back(buffer.data()); + } + } + + const void** buffers_ptrs() noexcept + { + return m_buffer_pointers.data(); + } + + std::size_t n_buffers() const noexcept + { + return m_buffers.size(); + } + + private: + + std::vector> m_buffers; + std::vector m_buffer_pointers; + }; + class non_owning_arrow_array_private_data { public: diff --git a/include/sparrow_ipc/deserialize_primitive_array.hpp b/include/sparrow_ipc/deserialize_primitive_array.hpp index 13ab4b9..6578565 100644 --- a/include/sparrow_ipc/deserialize_primitive_array.hpp +++ b/include/sparrow_ipc/deserialize_primitive_array.hpp @@ -14,25 +14,6 @@ namespace sparrow_ipc { - namespace - { - struct DecompressedBuffers - { - std::vector validity_buffer; - std::vector data_buffer; - }; - - void release_decompressed_buffers(ArrowArray* array) - { - if (array->private_data) - { - delete static_cast(array->private_data); - array->private_data = nullptr; - } - array->release = nullptr; - } - } - template [[nodiscard]] sparrow::primitive_array deserialize_non_owning_primitive_array( const org::apache::arrow::flatbuf::RecordBatch& record_batch, @@ -43,19 +24,15 @@ namespace sparrow_ipc ) { const auto compression = record_batch.compression(); - DecompressedBuffers* decompressed_buffers_owner = nullptr; // Validity buffer const auto validity_buffer_metadata = record_batch.buffers()->Get(buffer_index++); auto validity_buffer_span = body.subspan(validity_buffer_metadata->offset(), validity_buffer_metadata->length()); + std::vector> decompressed_buffers; if (compression) { - if (!decompressed_buffers_owner) - { - decompressed_buffers_owner = new DecompressedBuffers(); - } - decompressed_buffers_owner->validity_buffer = decompress(compression->codec(), validity_buffer_span); - validity_buffer_span = decompressed_buffers_owner->validity_buffer; + decompressed_buffers.emplace_back(decompress(compression->codec(), validity_buffer_span)); + validity_buffer_span = decompressed_buffers.back(); } auto bitmap_ptr = const_cast(validity_buffer_span.data()); const sparrow::dynamic_bitset_view bitmap_view{ @@ -73,12 +50,8 @@ namespace sparrow_ipc auto data_buffer_span = body.subspan(primitive_buffer_metadata->offset(), primitive_buffer_metadata->length()); if (compression) { - if (!decompressed_buffers_owner) - { - decompressed_buffers_owner = new DecompressedBuffers(); - } - decompressed_buffers_owner->data_buffer = decompress(compression->codec(), data_buffer_span); - data_buffer_span = decompressed_buffers_owner->data_buffer; + decompressed_buffers.emplace_back(decompress(compression->codec(), data_buffer_span)); + data_buffer_span = decompressed_buffers.back(); } auto primitives_ptr = const_cast(data_buffer_span.data()); @@ -95,24 +68,35 @@ namespace sparrow_ipc nullptr ); - std::vector buffers = {bitmap_ptr, primitives_ptr}; - ArrowArray array = make_non_owning_arrow_array( - record_batch.length(), - null_count, - 0, - std::move(buffers), - 0, - nullptr, - nullptr - ); - - if (decompressed_buffers_owner) + ArrowArray array; + if (compression) + { + array = make_owning_arrow_array( + record_batch.length(), + null_count, + 0, + std::move(decompressed_buffers), + 0, + nullptr, + nullptr + ); + } + else { - array.private_data = decompressed_buffers_owner; - array.release = release_decompressed_buffers; + std::vector buffers = {bitmap_ptr, primitives_ptr}; + array = make_non_owning_arrow_array( + record_batch.length(), + null_count, + 0, + std::move(buffers), + 0, + nullptr, + nullptr + ); } sparrow::arrow_proxy ap{std::move(array), std::move(schema)}; return sparrow::primitive_array{std::move(ap)}; } } + diff --git a/src/arrow_interface/arrow_array.cpp b/src/arrow_interface/arrow_array.cpp index ed0a0f2..7afe070 100644 --- a/src/arrow_interface/arrow_array.cpp +++ b/src/arrow_interface/arrow_array.cpp @@ -10,6 +10,64 @@ namespace sparrow_ipc { + void release_owning_arrow_array(ArrowArray* array) + { + SPARROW_ASSERT_FALSE(array == nullptr) + SPARROW_ASSERT_TRUE(array->release == std::addressof(release_owning_arrow_array)) + + if (array->private_data) + { + delete static_cast(array->private_data); + array->private_data = nullptr; + } + + for (int64_t i = 0; i < array->n_children; ++i) + { + if (array->children[i] && array->children[i]->release) + { + array->children[i]->release(array->children[i]); + } + } + delete[] array->children; + array->children = nullptr; + + if (array->dictionary && array->dictionary->release) + { + array->dictionary->release(array->dictionary); + } + delete array->dictionary; + array->dictionary = nullptr; + + array->release = nullptr; + } + + ArrowArray make_owning_arrow_array( + int64_t length, + int64_t null_count, + int64_t offset, + std::vector>&& buffers, + size_t children_count, + ArrowArray** children, + ArrowArray* dictionary + ) + { + ArrowArray array{}; + array.length = length; + array.null_count = null_count; + array.offset = offset; + + auto private_data = new owning_arrow_array_private_data(std::move(buffers)); + array.private_data = private_data; + array.n_buffers = private_data->n_buffers(); + array.buffers = private_data->buffers_ptrs(); + + array.n_children = static_cast(children_count); + array.children = children; + array.dictionary = dictionary; + array.release = release_owning_arrow_array; + return array; + } + void release_non_owning_arrow_array(ArrowArray* array) { SPARROW_ASSERT_FALSE(array == nullptr) From 7c59a7de19fae90f99e629e14d242c230da8d283 Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Tue, 14 Oct 2025 10:07:07 +0200 Subject: [PATCH 08/19] Factorize --- .../arrow_interface/arrow_array.hpp | 85 ++++++++--- .../arrow_array/private_data.hpp | 27 +++- .../deserialize_primitive_array.hpp | 66 ++++----- include/sparrow_ipc/deserialize_utils.hpp | 29 +++- ...deserialize_variable_size_binary_array.hpp | 126 +++++----------- src/arrow_interface/arrow_array.cpp | 137 +++--------------- .../arrow_array/private_data.cpp | 9 -- src/deserialize_fixedsizebinary_array.cpp | 7 +- src/deserialize_utils.cpp | 21 +++ 9 files changed, 226 insertions(+), 281 deletions(-) diff --git a/include/sparrow_ipc/arrow_interface/arrow_array.hpp b/include/sparrow_ipc/arrow_interface/arrow_array.hpp index 9348c44..726ecb9 100644 --- a/include/sparrow_ipc/arrow_interface/arrow_array.hpp +++ b/include/sparrow_ipc/arrow_interface/arrow_array.hpp @@ -1,43 +1,86 @@ #pragma once -#include +#include #include +#include #include "sparrow_ipc/config/config.hpp" +#include "sparrow_ipc/arrow_interface/arrow_array/private_data.hpp" namespace sparrow_ipc { - [[nodiscard]] SPARROW_IPC_API ArrowArray make_owning_arrow_array( - int64_t length, - int64_t null_count, - int64_t offset, - std::vector>&& buffers, - size_t children_count, - ArrowArray** children, - ArrowArray* dictionary - ); + SPARROW_IPC_API void release_arrow_array_children_and_dictionary(ArrowArray* array); + + template + void arrow_array_release(ArrowArray* array) + { + SPARROW_ASSERT_TRUE(array != nullptr) + SPARROW_ASSERT_TRUE(array->release == std::addressof(arrow_array_release)) + + SPARROW_ASSERT_TRUE(array->private_data != nullptr); - [[nodiscard]] SPARROW_IPC_API ArrowArray make_non_owning_arrow_array( + delete static_cast(array->private_data); + array->private_data = nullptr; + array->buffers = nullptr; // The buffers were deleted with the private data + + release_arrow_array_children_and_dictionary(array); + array->release = nullptr; + } + + template + SPARROW_IPC_API void fill_arrow_array( + ArrowArray& array, int64_t length, int64_t null_count, int64_t offset, - std::vector&& buffers, size_t children_count, ArrowArray** children, - ArrowArray* dictionary - ); + ArrowArray* dictionary, + Arg&& private_data_arg + ) + { + SPARROW_ASSERT_TRUE(length >= 0); + SPARROW_ASSERT_TRUE(null_count >= -1); + SPARROW_ASSERT_TRUE(offset >= 0); - SPARROW_IPC_API void release_non_owning_arrow_array(ArrowArray* array); + array.length = length; + array.null_count = null_count; + array.offset = offset; + array.n_children = static_cast(children_count); + array.children = children; + array.dictionary = dictionary; - SPARROW_IPC_API void fill_non_owning_arrow_array( - ArrowArray& array, + auto private_data = new T(std::forward(private_data_arg)); + array.private_data = private_data; + array.n_buffers = private_data->n_buffers(); + array.buffers = private_data->buffers_ptrs(); + + array.release = &arrow_array_release; + } + + template + [[nodiscard]] SPARROW_IPC_API ArrowArray make_arrow_array( int64_t length, int64_t null_count, int64_t offset, - std::vector&& buffers, size_t children_count, ArrowArray** children, - ArrowArray* dictionary - ); -} \ No newline at end of file + ArrowArray* dictionary, + Arg&& private_data_arg + ) + { + ArrowArray array{}; + fill_arrow_array( + array, + length, + null_count, + offset, + children_count, + children, + dictionary, + std::forward(private_data_arg) + ); + return array; + } +} diff --git a/include/sparrow_ipc/arrow_interface/arrow_array/private_data.hpp b/include/sparrow_ipc/arrow_interface/arrow_array/private_data.hpp index 265d9e2..2a9386f 100644 --- a/include/sparrow_ipc/arrow_interface/arrow_array/private_data.hpp +++ b/include/sparrow_ipc/arrow_interface/arrow_array/private_data.hpp @@ -1,5 +1,5 @@ #pragma once - +#include #include #include @@ -7,6 +7,13 @@ namespace sparrow_ipc { + template + concept ArrowPrivateData = requires(T& t) + { + { t.buffers_ptrs() } -> std::same_as; + { t.n_buffers() } -> std::convertible_to; + }; + class owning_arrow_array_private_data { public: @@ -21,12 +28,12 @@ namespace sparrow_ipc } } - const void** buffers_ptrs() noexcept + [[nodiscard]] SPARROW_IPC_API const void** buffers_ptrs() noexcept { return m_buffer_pointers.data(); } - std::size_t n_buffers() const noexcept + [[nodiscard]] SPARROW_IPC_API std::size_t n_buffers() const noexcept { return m_buffers.size(); } @@ -42,14 +49,22 @@ namespace sparrow_ipc public: explicit constexpr non_owning_arrow_array_private_data(std::vector&& buffers_pointers) - : m_buffers_pointers(std::move(buffers_pointers)) + : m_buffer_pointers(std::move(buffers_pointers)) { } - [[nodiscard]] SPARROW_IPC_API const void** buffers_ptrs() noexcept; + [[nodiscard]] SPARROW_IPC_API const void** buffers_ptrs() noexcept + { + return const_cast(reinterpret_cast(m_buffer_pointers.data())); + } + + [[nodiscard]] SPARROW_IPC_API std::size_t n_buffers() const noexcept + { + return m_buffer_pointers.size(); + } private: - std::vector m_buffers_pointers; + std::vector m_buffer_pointers; }; } diff --git a/include/sparrow_ipc/deserialize_primitive_array.hpp b/include/sparrow_ipc/deserialize_primitive_array.hpp index 6578565..1808789 100644 --- a/include/sparrow_ipc/deserialize_primitive_array.hpp +++ b/include/sparrow_ipc/deserialize_primitive_array.hpp @@ -9,7 +9,6 @@ #include "Message_generated.h" #include "sparrow_ipc/arrow_interface/arrow_array.hpp" #include "sparrow_ipc/arrow_interface/arrow_schema.hpp" -#include "sparrow_ipc/compression.hpp" #include "sparrow_ipc/deserialize_utils.hpp" namespace sparrow_ipc @@ -23,38 +22,6 @@ namespace sparrow_ipc size_t& buffer_index ) { - const auto compression = record_batch.compression(); - - // Validity buffer - const auto validity_buffer_metadata = record_batch.buffers()->Get(buffer_index++); - auto validity_buffer_span = body.subspan(validity_buffer_metadata->offset(), validity_buffer_metadata->length()); - std::vector> decompressed_buffers; - if (compression) - { - decompressed_buffers.emplace_back(decompress(compression->codec(), validity_buffer_span)); - validity_buffer_span = decompressed_buffers.back(); - } - auto bitmap_ptr = const_cast(validity_buffer_span.data()); - const sparrow::dynamic_bitset_view bitmap_view{ - bitmap_ptr, - static_cast(record_batch.length())}; - auto null_count = bitmap_view.null_count(); - if (validity_buffer_metadata->length() == 0) - { - bitmap_ptr = nullptr; - null_count = 0; - } - - // Data buffer - const auto primitive_buffer_metadata = record_batch.buffers()->Get(buffer_index++); - auto data_buffer_span = body.subspan(primitive_buffer_metadata->offset(), primitive_buffer_metadata->length()); - if (compression) - { - decompressed_buffers.emplace_back(decompress(compression->codec(), data_buffer_span)); - data_buffer_span = decompressed_buffers.back(); - } - auto primitives_ptr = const_cast(data_buffer_span.data()); - const std::string_view format = data_type_to_format( sparrow::detail::get_data_type_from_array>::get() ); @@ -68,30 +35,50 @@ namespace sparrow_ipc nullptr ); + const auto compression = record_batch.compression(); + std::vector> decompressed_buffers; + + auto validity_buffer_span = utils::get_and_decompress_buffer(record_batch, body, buffer_index, compression, decompressed_buffers); + + uint8_t* bitmap_ptr = nullptr; + int64_t null_count = 0; + + if (validity_buffer_span.size() > 0) + { + bitmap_ptr = const_cast(validity_buffer_span.data()); + const sparrow::dynamic_bitset_view bitmap_view{ + bitmap_ptr, + static_cast(record_batch.length())}; + null_count = bitmap_view.null_count(); + } + + auto data_buffer_span = utils::get_and_decompress_buffer(record_batch, body, buffer_index, compression, decompressed_buffers); + ArrowArray array; if (compression) { - array = make_owning_arrow_array( + array = make_arrow_array( record_batch.length(), null_count, 0, - std::move(decompressed_buffers), 0, nullptr, - nullptr + nullptr, + std::move(decompressed_buffers) ); } else { + auto primitives_ptr = const_cast(data_buffer_span.data()); std::vector buffers = {bitmap_ptr, primitives_ptr}; - array = make_non_owning_arrow_array( + array = make_arrow_array( record_batch.length(), null_count, 0, - std::move(buffers), 0, nullptr, - nullptr + nullptr, + std::move(buffers) ); } @@ -99,4 +86,3 @@ namespace sparrow_ipc return sparrow::primitive_array{std::move(ap)}; } } - diff --git a/include/sparrow_ipc/deserialize_utils.hpp b/include/sparrow_ipc/deserialize_utils.hpp index fc1ca05..cbc52b3 100644 --- a/include/sparrow_ipc/deserialize_utils.hpp +++ b/include/sparrow_ipc/deserialize_utils.hpp @@ -2,12 +2,12 @@ #include #include +#include #include #include #include "Message_generated.h" -#include "Schema_generated.h" namespace sparrow_ipc::utils { @@ -33,4 +33,29 @@ namespace sparrow_ipc::utils std::span body, size_t index ); -} \ No newline at end of file + + /** + * @brief Extracts a buffer from a RecordBatch and decompresses it if necessary. + * + * This function retrieves a buffer span from the specified index, increments the index, + * and applies decompression if specified. If the buffer is decompressed, the new + * data is stored in `decompressed_storage` and the returned span will point to this new data. + * + * @param record_batch The Arrow RecordBatch containing buffer metadata. + * @param body The raw buffer data as a byte span. + * @param buffer_index The index of the buffer to retrieve. This value is incremented by the function. + * @param compression The compression algorithm to use. If nullptr, no decompression is performed. + * @param decompressed_storage A vector that will be used to store the data of any decompressed buffers. + * + * @return A span viewing the resulting buffer data. This will be a view of the original + * `body` if no decompression occurs, or a view of the newly added buffer in + * `decompressed_storage` if decompression occurs. + */ + [[nodiscard]] std::span get_and_decompress_buffer( + const org::apache::arrow::flatbuf::RecordBatch& record_batch, + std::span body, + size_t& buffer_index, + const org::apache::arrow::flatbuf::BodyCompression* compression, + std::vector>& decompressed_storage + ); +} diff --git a/include/sparrow_ipc/deserialize_variable_size_binary_array.hpp b/include/sparrow_ipc/deserialize_variable_size_binary_array.hpp index 36c99f9..9895b97 100644 --- a/include/sparrow_ipc/deserialize_variable_size_binary_array.hpp +++ b/include/sparrow_ipc/deserialize_variable_size_binary_array.hpp @@ -8,32 +8,10 @@ #include "Message_generated.h" #include "sparrow_ipc/arrow_interface/arrow_array.hpp" #include "sparrow_ipc/arrow_interface/arrow_schema.hpp" -#include "sparrow_ipc/compression.hpp" #include "sparrow_ipc/deserialize_utils.hpp" namespace sparrow_ipc { - // TODO after handling deserialize_primitive_array, do the same here and then in other data types - namespace - { - struct DecompressedBinaryBuffers - { - std::vector validity_buffer; - std::vector offset_buffer; - std::vector data_buffer; - }; - - void release_decompressed_binary_buffers(ArrowArray* array) - { - if (array->private_data) - { - delete static_cast(array->private_data); - array->private_data = nullptr; - } - array->release = nullptr; - } - } - template [[nodiscard]] T deserialize_non_owning_variable_size_binary( const org::apache::arrow::flatbuf::RecordBatch& record_batch, @@ -43,25 +21,25 @@ namespace sparrow_ipc size_t& buffer_index ) { - const auto compression = record_batch.compression(); - DecompressedBinaryBuffers* decompressed_buffers_owner = nullptr; + const std::string_view format = data_type_to_format(sparrow::detail::get_data_type_from_array::get()); + ArrowSchema schema = make_non_owning_arrow_schema( + format, + name.data(), + metadata, + std::nullopt, + 0, + nullptr, + nullptr + ); - // Validity buffer - const auto validity_buffer_metadata = record_batch.buffers()->Get(buffer_index++); - auto validity_buffer_span = body.subspan(validity_buffer_metadata->offset(), validity_buffer_metadata->length()); - if (compression && validity_buffer_metadata->length() > 0) - { - if (!decompressed_buffers_owner) - { - decompressed_buffers_owner = new DecompressedBinaryBuffers(); - } - decompressed_buffers_owner->validity_buffer = decompress(compression->codec(), validity_buffer_span); - validity_buffer_span = decompressed_buffers_owner->validity_buffer; - } + const auto compression = record_batch.compression(); + std::vector> decompressed_buffers; + // TODO add another fct here + auto validity_buffer_span = utils::get_and_decompress_buffer(record_batch, body, buffer_index, compression, decompressed_buffers); uint8_t* bitmap_ptr = nullptr; int64_t null_count = 0; - if (validity_buffer_metadata->length() > 0) + if (validity_buffer_span.size() > 0) { bitmap_ptr = const_cast(validity_buffer_span.data()); const sparrow::dynamic_bitset_view bitmap_view{ @@ -70,60 +48,36 @@ namespace sparrow_ipc null_count = bitmap_view.null_count(); } - // Offset buffer - const auto offset_metadata = record_batch.buffers()->Get(buffer_index++); - auto offset_buffer_span = body.subspan(offset_metadata->offset(), offset_metadata->length()); - if (compression) - { - if (!decompressed_buffers_owner) - { - decompressed_buffers_owner = new DecompressedBinaryBuffers(); - } - decompressed_buffers_owner->offset_buffer = decompress(compression->codec(), offset_buffer_span); - offset_buffer_span = decompressed_buffers_owner->offset_buffer; - } - auto offset_ptr = const_cast(offset_buffer_span.data()); + auto offset_buffer_span = utils::get_and_decompress_buffer(record_batch, body, buffer_index, compression, decompressed_buffers); + auto data_buffer_span = utils::get_and_decompress_buffer(record_batch, body, buffer_index, compression, decompressed_buffers); - // Data buffer - const auto buffer_metadata = record_batch.buffers()->Get(buffer_index++); - auto data_buffer_span = body.subspan(buffer_metadata->offset(), buffer_metadata->length()); + ArrowArray array; if (compression) { - if (!decompressed_buffers_owner) - { - decompressed_buffers_owner = new DecompressedBinaryBuffers(); - } - decompressed_buffers_owner->data_buffer = decompress(compression->codec(), data_buffer_span); - data_buffer_span = decompressed_buffers_owner->data_buffer; + array = make_arrow_array( + record_batch.length(), + null_count, + 0, + 0, + nullptr, + nullptr, + std::move(decompressed_buffers) + ); } - auto buffer_ptr = const_cast(data_buffer_span.data()); - - const std::string_view format = data_type_to_format(sparrow::detail::get_data_type_from_array::get()); - ArrowSchema schema = make_non_owning_arrow_schema( - format, - name.data(), - metadata, - std::nullopt, - 0, - nullptr, - nullptr - ); - - std::vector buffers = {bitmap_ptr, offset_ptr, buffer_ptr}; - ArrowArray array = make_non_owning_arrow_array( - record_batch.length(), - null_count, - 0, - std::move(buffers), - 0, - nullptr, - nullptr - ); - - if (decompressed_buffers_owner) + else { - array.private_data = decompressed_buffers_owner; - array.release = release_decompressed_binary_buffers; + auto offset_ptr = const_cast(offset_buffer_span.data()); + auto buffer_ptr = const_cast(data_buffer_span.data()); + std::vector buffers = {bitmap_ptr, offset_ptr, buffer_ptr}; + array = make_arrow_array( + record_batch.length(), + null_count, + 0, + 0, + nullptr, + nullptr, + std::move(buffers) + ); } sparrow::arrow_proxy ap{std::move(array), std::move(schema)}; diff --git a/src/arrow_interface/arrow_array.cpp b/src/arrow_interface/arrow_array.cpp index 7afe070..a01006b 100644 --- a/src/arrow_interface/arrow_array.cpp +++ b/src/arrow_interface/arrow_array.cpp @@ -1,131 +1,40 @@ #include "sparrow_ipc/arrow_interface/arrow_array.hpp" -#include - #include -#include - -#include "sparrow_ipc/arrow_interface/arrow_array/private_data.hpp" -#include "sparrow_ipc/arrow_interface/arrow_array_schema_common_release.hpp" namespace sparrow_ipc { - void release_owning_arrow_array(ArrowArray* array) + void release_arrow_array_children_and_dictionary(ArrowArray* array) { - SPARROW_ASSERT_FALSE(array == nullptr) - SPARROW_ASSERT_TRUE(array->release == std::addressof(release_owning_arrow_array)) - - if (array->private_data) - { - delete static_cast(array->private_data); - array->private_data = nullptr; - } + SPARROW_ASSERT_TRUE(array != nullptr) - for (int64_t i = 0; i < array->n_children; ++i) + if (array->children) { - if (array->children[i] && array->children[i]->release) + for (int64_t i = 0; i < array->n_children; ++i) { - array->children[i]->release(array->children[i]); + ArrowArray* child = array->children[i]; + if (child) + { + if (child->release) + { + child->release(child); + } + delete child; + child = nullptr; + } } + delete[] array->children; + array->children = nullptr; } - delete[] array->children; - array->children = nullptr; - if (array->dictionary && array->dictionary->release) + if (array->dictionary) { - array->dictionary->release(array->dictionary); + if (array->dictionary->release) + { + array->dictionary->release(array->dictionary); + } + delete array->dictionary; + array->dictionary = nullptr; } - delete array->dictionary; - array->dictionary = nullptr; - - array->release = nullptr; - } - - ArrowArray make_owning_arrow_array( - int64_t length, - int64_t null_count, - int64_t offset, - std::vector>&& buffers, - size_t children_count, - ArrowArray** children, - ArrowArray* dictionary - ) - { - ArrowArray array{}; - array.length = length; - array.null_count = null_count; - array.offset = offset; - - auto private_data = new owning_arrow_array_private_data(std::move(buffers)); - array.private_data = private_data; - array.n_buffers = private_data->n_buffers(); - array.buffers = private_data->buffers_ptrs(); - - array.n_children = static_cast(children_count); - array.children = children; - array.dictionary = dictionary; - array.release = release_owning_arrow_array; - return array; - } - - void release_non_owning_arrow_array(ArrowArray* array) - { - SPARROW_ASSERT_FALSE(array == nullptr) - SPARROW_ASSERT_TRUE(array->release == std::addressof(release_non_owning_arrow_array)) - - release_common_non_owning_arrow(*array); - array->buffers = nullptr; // The buffers were deleted with the private data - } - - void fill_non_owning_arrow_array( - ArrowArray& array, - int64_t length, - int64_t null_count, - int64_t offset, - std::vector&& buffers, - size_t children_count, - ArrowArray** children, - ArrowArray* dictionary - ) - { - SPARROW_ASSERT_TRUE(length >= 0); - SPARROW_ASSERT_TRUE(null_count >= -1); - SPARROW_ASSERT_TRUE(offset >= 0); - - array.length = length; - array.null_count = null_count; - array.offset = offset; - array.n_buffers = static_cast(buffers.size()); - array.private_data = new non_owning_arrow_array_private_data(std::move(buffers)); - const auto private_data = static_cast(array.private_data); - array.buffers = private_data->buffers_ptrs(); - array.n_children = static_cast(children_count); - array.children = children; - array.dictionary = dictionary; - array.release = release_non_owning_arrow_array; - } - - ArrowArray make_non_owning_arrow_array( - int64_t length, - int64_t null_count, - int64_t offset, - std::vector&& buffers, - size_t children_count, - ArrowArray** children, - ArrowArray* dictionary - ) - { - ArrowArray array{}; - fill_non_owning_arrow_array( - array, - length, - null_count, - offset, - std::move(buffers), - children_count, - children, - dictionary - ); - return array; } } diff --git a/src/arrow_interface/arrow_array/private_data.cpp b/src/arrow_interface/arrow_array/private_data.cpp index b133c8e..e69de29 100644 --- a/src/arrow_interface/arrow_array/private_data.cpp +++ b/src/arrow_interface/arrow_array/private_data.cpp @@ -1,9 +0,0 @@ -#include "sparrow_ipc/arrow_interface/arrow_array/private_data.hpp" - -namespace sparrow_ipc -{ - const void** non_owning_arrow_array_private_data::buffers_ptrs() noexcept - { - return const_cast(reinterpret_cast(m_buffers_pointers.data())); - } -} \ No newline at end of file diff --git a/src/deserialize_fixedsizebinary_array.cpp b/src/deserialize_fixedsizebinary_array.cpp index 63ea213..427f600 100644 --- a/src/deserialize_fixedsizebinary_array.cpp +++ b/src/deserialize_fixedsizebinary_array.cpp @@ -2,6 +2,7 @@ namespace sparrow_ipc { + // TODO add compression here and tests (not available for this type in apache arrow integration tests files) sparrow::fixed_width_binary_array deserialize_non_owning_fixedwidthbinary( const org::apache::arrow::flatbuf::RecordBatch& record_batch, std::span body, @@ -33,14 +34,14 @@ namespace sparrow_ipc } auto buffer_ptr = const_cast(body.data() + buffer_metadata->offset()); std::vector buffers = {bitmap_ptr, buffer_ptr}; - ArrowArray array = make_non_owning_arrow_array( + ArrowArray array = make_arrow_array( record_batch.length(), null_count, 0, - std::move(buffers), 0, nullptr, - nullptr + nullptr, + std::move(buffers) ); sparrow::arrow_proxy ap{std::move(array), std::move(schema)}; return sparrow::fixed_width_binary_array{std::move(ap)}; diff --git a/src/deserialize_utils.cpp b/src/deserialize_utils.cpp index d89be6c..d42ddca 100644 --- a/src/deserialize_utils.cpp +++ b/src/deserialize_utils.cpp @@ -1,5 +1,7 @@ #include "sparrow_ipc/deserialize_utils.hpp" +#include "sparrow_ipc/compression.hpp" + namespace sparrow_ipc::utils { std::pair get_bitmap_pointer_and_null_count( @@ -24,4 +26,23 @@ namespace sparrow_ipc::utils }; return {ptr, bitmap_view.null_count()}; } + + std::span get_and_decompress_buffer( + const org::apache::arrow::flatbuf::RecordBatch& record_batch, + std::span body, + size_t& buffer_index, + const org::apache::arrow::flatbuf::BodyCompression* compression, + std::vector>& decompressed_storage + ) + { + const auto buffer_metadata = record_batch.buffers()->Get(buffer_index++); + auto buffer_span = body.subspan(buffer_metadata->offset(), buffer_metadata->length()); + + if (compression) + { + decompressed_storage.emplace_back(decompress(compression->codec(), buffer_span)); + buffer_span = decompressed_storage.back(); + } + return buffer_span; + } } \ No newline at end of file From 8bc880c1dfc197f6515da2bc4bcf27e71721673f Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Tue, 14 Oct 2025 10:19:05 +0200 Subject: [PATCH 09/19] Remove macro from template --- include/sparrow_ipc/arrow_interface/arrow_array.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/sparrow_ipc/arrow_interface/arrow_array.hpp b/include/sparrow_ipc/arrow_interface/arrow_array.hpp index 726ecb9..4faecf4 100644 --- a/include/sparrow_ipc/arrow_interface/arrow_array.hpp +++ b/include/sparrow_ipc/arrow_interface/arrow_array.hpp @@ -29,7 +29,7 @@ namespace sparrow_ipc } template - SPARROW_IPC_API void fill_arrow_array( + void fill_arrow_array( ArrowArray& array, int64_t length, int64_t null_count, @@ -60,7 +60,7 @@ namespace sparrow_ipc } template - [[nodiscard]] SPARROW_IPC_API ArrowArray make_arrow_array( + [[nodiscard]] ArrowArray make_arrow_array( int64_t length, int64_t null_count, int64_t offset, From 10fcf73487a02c2098d8a367854eca0ae7cee837 Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Tue, 14 Oct 2025 11:46:15 +0200 Subject: [PATCH 10/19] Add fct and remove cout --- .../deserialize_primitive_array.hpp | 12 +---------- include/sparrow_ipc/deserialize_utils.hpp | 21 +++++++++++++++++++ ...deserialize_variable_size_binary_array.hpp | 13 ++---------- src/deserialize_utils.cpp | 19 ++++++++++++++++- tests/test_de_serialization_with_files.cpp | 4 ---- 5 files changed, 42 insertions(+), 27 deletions(-) diff --git a/include/sparrow_ipc/deserialize_primitive_array.hpp b/include/sparrow_ipc/deserialize_primitive_array.hpp index 1808789..76f7212 100644 --- a/include/sparrow_ipc/deserialize_primitive_array.hpp +++ b/include/sparrow_ipc/deserialize_primitive_array.hpp @@ -40,17 +40,7 @@ namespace sparrow_ipc auto validity_buffer_span = utils::get_and_decompress_buffer(record_batch, body, buffer_index, compression, decompressed_buffers); - uint8_t* bitmap_ptr = nullptr; - int64_t null_count = 0; - - if (validity_buffer_span.size() > 0) - { - bitmap_ptr = const_cast(validity_buffer_span.data()); - const sparrow::dynamic_bitset_view bitmap_view{ - bitmap_ptr, - static_cast(record_batch.length())}; - null_count = bitmap_view.null_count(); - } + const auto [bitmap_ptr, null_count] = utils::get_bitmap_pointer_and_null_count(validity_buffer_span, record_batch.length()); auto data_buffer_span = utils::get_and_decompress_buffer(record_batch, body, buffer_index, compression, decompressed_buffers); diff --git a/include/sparrow_ipc/deserialize_utils.hpp b/include/sparrow_ipc/deserialize_utils.hpp index cbc52b3..36f93ad 100644 --- a/include/sparrow_ipc/deserialize_utils.hpp +++ b/include/sparrow_ipc/deserialize_utils.hpp @@ -11,6 +11,26 @@ namespace sparrow_ipc::utils { + /** + * @brief Extracts bitmap pointer and null count from a validity buffer span. + * + * This function calculates the number of null values represented by the bitmap. + * + * @param validity_buffer_span The validity buffer as a byte span. + * @param length The Arrow RecordBatch length (number of values in the array). + * + * @return A pair containing: + * - First: Pointer to the bitmap data (nullptr if buffer is empty) + * - Second: Count of null values in the bitmap (0 if buffer is empty) + * + * @note If the bitmap buffer is empty, returns {nullptr, 0} + * @note The returned pointer is a non-const cast of the original const data + */ + [[nodiscard]] std::pair get_bitmap_pointer_and_null_count( + std::span validity_buffer_span, + const int64_t length + ); + /** * @brief Extracts bitmap pointer and null count from a RecordBatch buffer. * @@ -28,6 +48,7 @@ namespace sparrow_ipc::utils * @note If the bitmap buffer has zero length, returns {nullptr, 0} * @note The returned pointer is a non-const cast of the original const data */ + // TODO to be removed when not used anymore (after adding compression to deserialize_fixedsizebinary_array) [[nodiscard]] std::pair get_bitmap_pointer_and_null_count( const org::apache::arrow::flatbuf::RecordBatch& record_batch, std::span body, diff --git a/include/sparrow_ipc/deserialize_variable_size_binary_array.hpp b/include/sparrow_ipc/deserialize_variable_size_binary_array.hpp index 9895b97..623776d 100644 --- a/include/sparrow_ipc/deserialize_variable_size_binary_array.hpp +++ b/include/sparrow_ipc/deserialize_variable_size_binary_array.hpp @@ -35,18 +35,9 @@ namespace sparrow_ipc const auto compression = record_batch.compression(); std::vector> decompressed_buffers; - // TODO add another fct here auto validity_buffer_span = utils::get_and_decompress_buffer(record_batch, body, buffer_index, compression, decompressed_buffers); - uint8_t* bitmap_ptr = nullptr; - int64_t null_count = 0; - if (validity_buffer_span.size() > 0) - { - bitmap_ptr = const_cast(validity_buffer_span.data()); - const sparrow::dynamic_bitset_view bitmap_view{ - bitmap_ptr, - static_cast(record_batch.length())}; - null_count = bitmap_view.null_count(); - } + + const auto [bitmap_ptr, null_count] = utils::get_bitmap_pointer_and_null_count(validity_buffer_span, record_batch.length()); auto offset_buffer_span = utils::get_and_decompress_buffer(record_batch, body, buffer_index, compression, decompressed_buffers); auto data_buffer_span = utils::get_and_decompress_buffer(record_batch, body, buffer_index, compression, decompressed_buffers); diff --git a/src/deserialize_utils.cpp b/src/deserialize_utils.cpp index d42ddca..1d01029 100644 --- a/src/deserialize_utils.cpp +++ b/src/deserialize_utils.cpp @@ -4,6 +4,23 @@ namespace sparrow_ipc::utils { + std::pair get_bitmap_pointer_and_null_count( + std::span validity_buffer_span, + const int64_t length + ) + { + if (validity_buffer_span.empty()) + { + return {nullptr, 0}; + } + auto ptr = const_cast(validity_buffer_span.data()); + const sparrow::dynamic_bitset_view bitmap_view{ + ptr, + static_cast(length) + }; + return {ptr, bitmap_view.null_count()}; + } + std::pair get_bitmap_pointer_and_null_count( const org::apache::arrow::flatbuf::RecordBatch& record_batch, std::span body, @@ -45,4 +62,4 @@ namespace sparrow_ipc::utils } return buffer_span; } -} \ No newline at end of file +} diff --git a/tests/test_de_serialization_with_files.cpp b/tests/test_de_serialization_with_files.cpp index d46f7bf..52984fe 100644 --- a/tests/test_de_serialization_with_files.cpp +++ b/tests/test_de_serialization_with_files.cpp @@ -67,12 +67,10 @@ void compare_record_batches( ) { REQUIRE_EQ(record_batches_1.size(), record_batches_2.size()); -// std::cout << "record_batches1 size: " << record_batches_1.size() << " record_batches2 size: " << record_batches_2.size() << std::endl; for (size_t i = 0; i < record_batches_1.size(); ++i) { for (size_t y = 0; y < record_batches_1[i].nb_columns(); y++) { -// std::cout << "record_batches1 nb cols: " << record_batches_1[i].nb_columns() << " record_batches2 nb cols: " << record_batches_2[i].nb_columns() << std::endl; const auto& column_1 = record_batches_1[i].get_column(y); const auto& column_2 = record_batches_2[i].get_column(y); REQUIRE_EQ(column_1.size(), column_2.size()); @@ -81,11 +79,9 @@ void compare_record_batches( { const auto col_name = column_1.name().value_or("NA"); INFO("Comparing batch " << i << ", column " << y << " named :" << col_name << " , row " << z); -// std::cout << "Comparing batch " << i << ", column " << y << " named :" << col_name << " , row " << z << std::endl; REQUIRE_EQ(column_1.data_type(), column_2.data_type()); const auto& column_1_value = column_1[z]; const auto& column_2_value = column_2[z]; -// std::cout << "column_1_value :" << column_1_value << " and " << column_2_value << std::endl; CHECK_EQ(column_1_value, column_2_value); } } From c667acd89c6790edd912c535cd4a658e8b0c2daf Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Tue, 14 Oct 2025 12:16:14 +0200 Subject: [PATCH 11/19] Move implementation details to cpp and add check boundaries --- CMakeLists.txt | 2 +- .../arrow_array/private_data.hpp | 34 +++---------------- .../arrow_array/private_data.cpp | 34 +++++++++++++++++++ src/deserialize_utils.cpp | 4 +++ 4 files changed, 44 insertions(+), 30 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5bbc612..857389c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -109,6 +109,7 @@ set(SPARROW_IPC_HEADERS ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/config/config.hpp ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/config/sparrow_ipc_version.hpp ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/compression.hpp + ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/deserialize_variable_size_binary_array.hpp ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/deserialize_fixedsizebinary_array.hpp ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/deserialize_primitive_array.hpp ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/deserialize_utils.hpp @@ -133,7 +134,6 @@ set(SPARROW_IPC_SRC ${SPARROW_IPC_SOURCE_DIR}/arrow_interface/arrow_schema/private_data.cpp ${SPARROW_IPC_SOURCE_DIR}/chunk_memory_serializer.cpp ${SPARROW_IPC_SOURCE_DIR}/compression.cpp - #${SPARROW_IPC_SOURCE_DIR}/decompressed_buffers.cpp ${SPARROW_IPC_SOURCE_DIR}/deserialize_fixedsizebinary_array.cpp ${SPARROW_IPC_SOURCE_DIR}/deserialize_utils.cpp ${SPARROW_IPC_SOURCE_DIR}/deserialize.cpp diff --git a/include/sparrow_ipc/arrow_interface/arrow_array/private_data.hpp b/include/sparrow_ipc/arrow_interface/arrow_array/private_data.hpp index 2a9386f..5ad6c90 100644 --- a/include/sparrow_ipc/arrow_interface/arrow_array/private_data.hpp +++ b/include/sparrow_ipc/arrow_interface/arrow_array/private_data.hpp @@ -18,28 +18,12 @@ namespace sparrow_ipc { public: - explicit owning_arrow_array_private_data(std::vector>&& buffers) - : m_buffers(std::move(buffers)) - { - m_buffer_pointers.reserve(m_buffers.size()); - for (const auto& buffer : m_buffers) - { - m_buffer_pointers.push_back(buffer.data()); - } - } + explicit owning_arrow_array_private_data(std::vector>&& buffers); - [[nodiscard]] SPARROW_IPC_API const void** buffers_ptrs() noexcept - { - return m_buffer_pointers.data(); - } - - [[nodiscard]] SPARROW_IPC_API std::size_t n_buffers() const noexcept - { - return m_buffers.size(); - } + [[nodiscard]] SPARROW_IPC_API const void** buffers_ptrs() noexcept; + [[nodiscard]] SPARROW_IPC_API std::size_t n_buffers() const noexcept; private: - std::vector> m_buffers; std::vector m_buffer_pointers; }; @@ -53,18 +37,10 @@ namespace sparrow_ipc { } - [[nodiscard]] SPARROW_IPC_API const void** buffers_ptrs() noexcept - { - return const_cast(reinterpret_cast(m_buffer_pointers.data())); - } - - [[nodiscard]] SPARROW_IPC_API std::size_t n_buffers() const noexcept - { - return m_buffer_pointers.size(); - } + [[nodiscard]] SPARROW_IPC_API const void** buffers_ptrs() noexcept; + [[nodiscard]] SPARROW_IPC_API std::size_t n_buffers() const noexcept; private: - std::vector m_buffer_pointers; }; } diff --git a/src/arrow_interface/arrow_array/private_data.cpp b/src/arrow_interface/arrow_array/private_data.cpp index e69de29..9c3738b 100644 --- a/src/arrow_interface/arrow_array/private_data.cpp +++ b/src/arrow_interface/arrow_array/private_data.cpp @@ -0,0 +1,34 @@ +#include "sparrow_ipc/arrow_interface/arrow_array/private_data.hpp" + +namespace sparrow_ipc +{ + owning_arrow_array_private_data::owning_arrow_array_private_data(std::vector>&& buffers) + : m_buffers(std::move(buffers)) + { + m_buffer_pointers.reserve(m_buffers.size()); + for (const auto& buffer : m_buffers) + { + m_buffer_pointers.push_back(buffer.data()); + } + } + + const void** owning_arrow_array_private_data::buffers_ptrs() noexcept + { + return m_buffer_pointers.data(); + } + + std::size_t owning_arrow_array_private_data::n_buffers() const noexcept + { + return m_buffers.size(); + } + + const void** non_owning_arrow_array_private_data::buffers_ptrs() noexcept + { + return const_cast(reinterpret_cast(m_buffer_pointers.data())); + } + + std::size_t non_owning_arrow_array_private_data::n_buffers() const noexcept + { + return m_buffer_pointers.size(); + } +} diff --git a/src/deserialize_utils.cpp b/src/deserialize_utils.cpp index 1d01029..3eba4cc 100644 --- a/src/deserialize_utils.cpp +++ b/src/deserialize_utils.cpp @@ -53,6 +53,10 @@ namespace sparrow_ipc::utils ) { const auto buffer_metadata = record_batch.buffers()->Get(buffer_index++); + if (body.size() < (buffer_metadata->offset() + buffer_metadata->length())) + { + throw std::runtime_error("Buffer metadata exceeds body size"); + } auto buffer_span = body.subspan(buffer_metadata->offset(), buffer_metadata->length()); if (compression) From 568a8e7056867f3b78dd8ea2faef3d03a96e29b4 Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Wed, 15 Oct 2025 10:42:31 +0200 Subject: [PATCH 12/19] Factorize more --- include/sparrow_ipc/compression.hpp | 4 ++-- include/sparrow_ipc/serialize_utils.hpp | 18 +++++++++++++++++- src/compression.cpp | 6 +++--- src/serialize_utils.cpp | 1 + 4 files changed, 23 insertions(+), 6 deletions(-) diff --git a/include/sparrow_ipc/compression.hpp b/include/sparrow_ipc/compression.hpp index b21a8b1..47d0cc3 100644 --- a/include/sparrow_ipc/compression.hpp +++ b/include/sparrow_ipc/compression.hpp @@ -18,6 +18,6 @@ namespace sparrow_ipc // CompressionType to_compression_type(org::apache::arrow::flatbuf::CompressionType compression_type); - std::vector compress(org::apache::arrow::flatbuf::CompressionType compression_type, std::span data); - std::vector decompress(org::apache::arrow::flatbuf::CompressionType compression_type, std::span data); + std::vector compress(const org::apache::arrow::flatbuf::CompressionType compression_type, std::span data); + std::vector decompress(const org::apache::arrow::flatbuf::CompressionType compression_type, std::span data); } diff --git a/include/sparrow_ipc/serialize_utils.hpp b/include/sparrow_ipc/serialize_utils.hpp index 3c825b4..e8de1b9 100644 --- a/include/sparrow_ipc/serialize_utils.hpp +++ b/include/sparrow_ipc/serialize_utils.hpp @@ -117,7 +117,23 @@ namespace sparrow_ipc return total_size; } - void fill_body_and_get_buffers_compressed(const sparrow::arrow_proxy& arrow_proxy, std::vector& body, std::vector& flatbuf_buffers, int64_t& offset, org::apache::arrow::flatbuf::CompressionType compression_type); + /** + * @brief Generates the compressed message body and buffer metadata for a record batch. + * + * This function traverses the record batch, compresses each buffer using the specified + * compression algorithm, and constructs the message body. For each compressed buffer, + * it is prefixed by its 8-byte uncompressed size. Padding is added after each + * compressed buffer to ensure 8-byte alignment. + * + * @param record_batch The record batch to serialize. + * @param compression_type The compression algorithm to use (e.g., LZ4_FRAME, ZSTD). + * @return A std::pair containing: + * - first: A vector of bytes representing the complete compressed message body. + * - second: A vector of FlatBuffer Buffer objects describing the offset and + * size of each buffer within the compressed body. + */ + [[nodiscard]] SPARROW_IPC_API std::pair, std::vector> + generate_compressed_body_and_buffers(const sparrow::record_batch& record_batch, const org::apache::arrow::flatbuf::CompressionType compression_type); /** * @brief Fills the body vector with serialized data from an arrow proxy and its children. diff --git a/src/compression.cpp b/src/compression.cpp index 59d6d56..5170903 100644 --- a/src/compression.cpp +++ b/src/compression.cpp @@ -20,7 +20,7 @@ namespace sparrow_ipc // } // } - std::vector compress(org::apache::arrow::flatbuf::CompressionType compression_type, std::span data) + std::vector compress(const org::apache::arrow::flatbuf::CompressionType compression_type, std::span data) { if (data.empty()) { @@ -30,7 +30,7 @@ namespace sparrow_ipc { case org::apache::arrow::flatbuf::CompressionType::LZ4_FRAME: { - std::int64_t uncompressed_size = data.size(); + const std::int64_t uncompressed_size = data.size(); const size_t max_compressed_size = LZ4F_compressFrameBound(uncompressed_size, nullptr); std::vector compressed_data(max_compressed_size); const size_t compressed_size = LZ4F_compressFrame(compressed_data.data(), max_compressed_size, data.data(), uncompressed_size, nullptr); @@ -46,7 +46,7 @@ namespace sparrow_ipc } } - std::vector decompress(org::apache::arrow::flatbuf::CompressionType compression_type, std::span data) + std::vector decompress(const org::apache::arrow::flatbuf::CompressionType compression_type, std::span data) { if (data.empty()) { diff --git a/src/serialize_utils.cpp b/src/serialize_utils.cpp index 545f17d..18eb854 100644 --- a/src/serialize_utils.cpp +++ b/src/serialize_utils.cpp @@ -1,3 +1,4 @@ +#include #include #include "sparrow_ipc/flatbuffer_utils.hpp" From d410d151b600ba4d3ba6bef3412c1db1e7368a2b Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Thu, 16 Oct 2025 11:38:23 +0200 Subject: [PATCH 13/19] Fix conflicts and rework serialization with compression --- CMakeLists.txt | 1 - .../sparrow_ipc/chunk_memory_serializer.hpp | 37 +++++++++-- include/sparrow_ipc/flatbuffer_utils.hpp | 15 +++-- include/sparrow_ipc/serialize.hpp | 6 +- include/sparrow_ipc/serialize_utils.hpp | 11 ++-- include/sparrow_ipc/serializer.hpp | 9 +-- src/chunk_memory_serializer.cpp | 4 +- src/flatbuffer_utils.cpp | 13 ++-- src/serialize.cpp | 25 +++++-- src/serialize_utils.cpp | 66 ++++++++++++++++--- tests/test_de_serialization_with_files.cpp | 6 +- tests/test_serialize_utils.cpp | 12 ++-- 12 files changed, 154 insertions(+), 51 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 857389c..5769f6b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -109,7 +109,6 @@ set(SPARROW_IPC_HEADERS ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/config/config.hpp ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/config/sparrow_ipc_version.hpp ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/compression.hpp - ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/deserialize_variable_size_binary_array.hpp ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/deserialize_fixedsizebinary_array.hpp ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/deserialize_primitive_array.hpp ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/deserialize_utils.hpp diff --git a/include/sparrow_ipc/chunk_memory_serializer.hpp b/include/sparrow_ipc/chunk_memory_serializer.hpp index 3b241f8..99c5acf 100644 --- a/include/sparrow_ipc/chunk_memory_serializer.hpp +++ b/include/sparrow_ipc/chunk_memory_serializer.hpp @@ -1,7 +1,16 @@ #pragma once +#include +#include +#include +#include +#include + #include +#include "Message_generated.h" + +#include "sparrow_ipc/any_output_stream.hpp" #include "sparrow_ipc/chunk_memory_output_stream.hpp" #include "sparrow_ipc/config/config.hpp" #include "sparrow_ipc/memory_output_stream.hpp" @@ -33,8 +42,9 @@ namespace sparrow_ipc * @brief Constructs a chunk serializer with a reference to a chunked memory output stream. * * @param stream Reference to a chunked memory output stream that will receive the serialized chunks + * @param compression Optional: The compression type to use for record batch bodies. */ - chunk_serializer(chunked_memory_output_stream>>& stream); + chunk_serializer(chunked_memory_output_stream>>& stream, std::optional compression = std::nullopt); /** * @brief Writes a single record batch to the chunked stream. @@ -120,6 +130,7 @@ namespace sparrow_ipc std::vector m_dtypes; chunked_memory_output_stream>>* m_pstream; bool m_ended{false}; + std::optional m_compression; }; // Implementation @@ -133,7 +144,21 @@ namespace sparrow_ipc throw std::runtime_error("Cannot append record batches to a serializer that has been ended"); } - m_pstream->reserve((m_schema_received ? 0 : 1) + m_pstream->size() + record_batches.size()); + const auto reserve_function = [&record_batches, this]() + { + return std::accumulate( + record_batches.begin(), + record_batches.end(), + m_pstream->size(), + [this](size_t acc, const sparrow::record_batch& rb) + { + return acc + calculate_record_batch_message_size(rb, m_compression); + } + ) + + (m_schema_received ? 0 : calculate_schema_message_size(*record_batches.begin())); + }; + + m_pstream->reserve(reserve_function); if (!m_schema_received) { @@ -148,10 +173,14 @@ namespace sparrow_ipc for (const auto& rb : record_batches) { + if (get_column_dtypes(rb) != m_dtypes) + { + throw std::invalid_argument("Record batch schema does not match serializer schema"); + } std::vector buffer; memory_output_stream stream(buffer); any_output_stream astream(stream); - serialize_record_batch(rb, astream); + serialize_record_batch(rb, astream, m_compression); m_pstream->write(std::move(buffer)); } } @@ -169,4 +198,4 @@ namespace sparrow_ipc write(record_batches); return *this; } -} \ No newline at end of file +} diff --git a/include/sparrow_ipc/flatbuffer_utils.hpp b/include/sparrow_ipc/flatbuffer_utils.hpp index 87c322d..b26f0ed 100644 --- a/include/sparrow_ipc/flatbuffer_utils.hpp +++ b/include/sparrow_ipc/flatbuffer_utils.hpp @@ -213,15 +213,20 @@ namespace sparrow_ipc * format that conforms to the Arrow IPC specification. * * @param record_batch The source record batch containing the data to be serialized - * + * @param compression Optional: The compression algorithm to be used for the message body + * @param body_size Optional: An override for the total size of the message body + * If not provided, the size is calculated from the uncompressed buffers + * This is required when using compression + * @param compressed_buffers Optional: A pointer to a vector of buffer metadata. + * If provided, this metadata is used instead of generating it from the + * uncompressed record batch. This is required when using compression. * @return A FlatBufferBuilder containing the complete serialized message ready for * transmission or storage. The builder is finished and ready to be accessed * via GetBufferPointer() and GetSize(). * * @note The returned message uses Arrow IPC format version V5 - * @note Compression and variadic buffer counts are not currently implemented (set to 0) - * @note The body size is automatically calculated based on the record batch contents + * @note Variadic buffer counts is not currently implemented (set to 0) */ [[nodiscard]] flatbuffers::FlatBufferBuilder - get_record_batch_message_builder(const sparrow::record_batch& record_batch); -} \ No newline at end of file + get_record_batch_message_builder(const sparrow::record_batch& record_batch, std::optional compression = std::nullopt, std::optional body_size = std::nullopt, const std::vector* compressed_buffers = nullptr); +} diff --git a/include/sparrow_ipc/serialize.hpp b/include/sparrow_ipc/serialize.hpp index 9c16122..ab47646 100644 --- a/include/sparrow_ipc/serialize.hpp +++ b/include/sparrow_ipc/serialize.hpp @@ -25,9 +25,8 @@ namespace sparrow_ipc * @tparam R Container type that holds record batches (must support empty(), operator[], begin(), end()) * @param record_batches Collection of record batches to serialize. All batches must have identical * schemas. - * @param compression The compression type to use when serializing - * * @param stream The output stream where the serialized data will be written. + * @param compression The compression type to use when serializing. * * @throws std::invalid_argument If record batches have inconsistent schemas or if the collection * contains batches that cannot be serialized together. @@ -70,13 +69,14 @@ namespace sparrow_ipc * * @param record_batch The sparrow record batch to serialize * @param stream The output stream where the serialized record batch will be written + * @param compression The compression type to use when serializing. * * @note The output follows Arrow IPC message format with proper alignment and * includes both metadata and data portions of the record batch */ SPARROW_IPC_API void - serialize_record_batch(const sparrow::record_batch& record_batch, any_output_stream& stream); + serialize_record_batch(const sparrow::record_batch& record_batch, any_output_stream& stream, std::optional compression); /** * @brief Serializes a schema message for a record batch into a byte buffer. diff --git a/include/sparrow_ipc/serialize_utils.hpp b/include/sparrow_ipc/serialize_utils.hpp index e8de1b9..5129b73 100644 --- a/include/sparrow_ipc/serialize_utils.hpp +++ b/include/sparrow_ipc/serialize_utils.hpp @@ -8,7 +8,6 @@ #include "Message_generated.h" #include "sparrow_ipc/any_output_stream.hpp" #include "sparrow_ipc/config/config.hpp" -#include "sparrow_ipc/compression.hpp" #include "sparrow_ipc/utils.hpp" namespace sparrow_ipc @@ -40,8 +39,8 @@ namespace sparrow_ipc * consists of a metadata section followed by a body section containing the actual data. * * @param record_batch The sparrow record batch to be serialized - * @param compression The compression type to use when serializing * @param stream The output stream where the serialized record batch will be written + * @param compression The compression type to use when serializing */ SPARROW_IPC_API void serialize_record_batch(const sparrow::record_batch& record_batch, any_output_stream& stream, std::optional compression); @@ -74,10 +73,11 @@ namespace sparrow_ipc * - Body data with 8-byte alignment between buffers * * @param record_batch The record batch to be measured + * @param compression The compression type to use when serializing * @return The total size in bytes that the serialized record batch would occupy */ [[nodiscard]] SPARROW_IPC_API std::size_t - calculate_record_batch_message_size(const sparrow::record_batch& record_batch); + calculate_record_batch_message_size(const sparrow::record_batch& record_batch, std::optional compression = std::nullopt); /** * @brief Calculates the total serialized size for a collection of record batches. @@ -87,12 +87,13 @@ namespace sparrow_ipc * * @tparam R Range type containing sparrow::record_batch objects * @param record_batches Collection of record batches to be measured + * @param compression The compression type to use when serializing * @return The total size in bytes for the complete serialized output * @throws std::invalid_argument if record batches have inconsistent schemas */ template requires std::same_as, sparrow::record_batch> - [[nodiscard]] std::size_t calculate_total_serialized_size(const R& record_batches) + [[nodiscard]] std::size_t calculate_total_serialized_size(const R& record_batches, std::optional compression = std::nullopt) { if (record_batches.empty()) { @@ -111,7 +112,7 @@ namespace sparrow_ipc // Calculate record batch message sizes for (const auto& record_batch : record_batches) { - total_size += calculate_record_batch_message_size(record_batch); + total_size += calculate_record_batch_message_size(record_batch, compression); } return total_size; diff --git a/include/sparrow_ipc/serializer.hpp b/include/sparrow_ipc/serializer.hpp index 9a8c1e0..f0ebcb9 100644 --- a/include/sparrow_ipc/serializer.hpp +++ b/include/sparrow_ipc/serializer.hpp @@ -41,8 +41,8 @@ namespace sparrow_ipc * The serializer stores a pointer to this stream for later use. */ template - serializer(TStream& stream) - : m_stream(stream) + serializer(TStream& stream, std::optional compression = std::nullopt) + : m_stream(stream), m_compression(compression) { } @@ -94,7 +94,7 @@ namespace sparrow_ipc m_stream.size(), [this](size_t acc, const sparrow::record_batch& rb) { - return acc + calculate_record_batch_message_size(rb); + return acc + calculate_record_batch_message_size(rb, m_compression); } ) + (m_schema_received ? 0 : calculate_schema_message_size(*record_batches.begin())); @@ -115,7 +115,7 @@ namespace sparrow_ipc { throw std::invalid_argument("Record batch schema does not match serializer schema"); } - serialize_record_batch(rb, m_stream); + serialize_record_batch(rb, m_stream, m_compression); } } @@ -206,6 +206,7 @@ namespace sparrow_ipc std::vector m_dtypes; any_output_stream m_stream; bool m_ended{false}; + std::optional m_compression; }; inline serializer& end_stream(serializer& serializer) diff --git a/src/chunk_memory_serializer.cpp b/src/chunk_memory_serializer.cpp index cbdfb4a..db2c8a2 100644 --- a/src/chunk_memory_serializer.cpp +++ b/src/chunk_memory_serializer.cpp @@ -6,8 +6,8 @@ namespace sparrow_ipc { - chunk_serializer::chunk_serializer(chunked_memory_output_stream>>& stream) - : m_pstream(&stream) + chunk_serializer::chunk_serializer(chunked_memory_output_stream>>& stream, std::optional compression) + : m_pstream(&stream), m_compression(compression) { } diff --git a/src/flatbuffer_utils.cpp b/src/flatbuffer_utils.cpp index 9f510b7..1e580f3 100644 --- a/src/flatbuffer_utils.cpp +++ b/src/flatbuffer_utils.cpp @@ -562,23 +562,28 @@ namespace sparrow_ipc return buffers; } - flatbuffers::FlatBufferBuilder get_record_batch_message_builder(const sparrow::record_batch& record_batch) + flatbuffers::FlatBufferBuilder get_record_batch_message_builder(const sparrow::record_batch& record_batch, std::optional compression, std::optional body_size_override, const std::vector* compressed_buffers) { const std::vector nodes = create_fieldnodes(record_batch); - const std::vector buffers = get_buffers(record_batch); + const std::vector& buffers = compressed_buffers ? *compressed_buffers : get_buffers(record_batch); flatbuffers::FlatBufferBuilder record_batch_builder; auto nodes_offset = record_batch_builder.CreateVectorOfStructs(nodes); auto buffers_offset = record_batch_builder.CreateVectorOfStructs(buffers); + flatbuffers::Offset compression_offset = 0; + if (compression) + { + compression_offset = org::apache::arrow::flatbuf::CreateBodyCompression(record_batch_builder, compression.value(), org::apache::arrow::flatbuf::BodyCompressionMethod::BUFFER); + } const auto record_batch_offset = org::apache::arrow::flatbuf::CreateRecordBatch( record_batch_builder, static_cast(record_batch.nb_rows()), nodes_offset, buffers_offset, - 0, // TODO: Compression + compression_offset, 0 // TODO :variadic buffer Counts ); - const int64_t body_size = calculate_body_size(record_batch); + const int64_t body_size = body_size_override.value_or(calculate_body_size(record_batch)); const auto record_batch_message_offset = org::apache::arrow::flatbuf::CreateMessage( record_batch_builder, org::apache::arrow::flatbuf::MetadataVersion::V5, diff --git a/src/serialize.cpp b/src/serialize.cpp index a4e797d..644acf3 100644 --- a/src/serialize.cpp +++ b/src/serialize.cpp @@ -1,11 +1,11 @@ -#include "sparrow_ipc/serialize.hpp" +#include +#include "sparrow_ipc/serialize.hpp" #include "sparrow_ipc/flatbuffer_utils.hpp" namespace sparrow_ipc { void common_serialize( - const sparrow::record_batch& record_batch, const flatbuffers::FlatBufferBuilder& builder, any_output_stream& stream ) @@ -20,12 +20,23 @@ namespace sparrow_ipc void serialize_schema_message(const sparrow::record_batch& record_batch, any_output_stream& stream) { - common_serialize(record_batch, get_schema_message_builder(record_batch), stream); + common_serialize(get_schema_message_builder(record_batch), stream); } - void serialize_record_batch(const sparrow::record_batch& record_batch, any_output_stream& stream) + void serialize_record_batch(const sparrow::record_batch& record_batch, any_output_stream& stream, std::optional compression) { - common_serialize(record_batch, get_record_batch_message_builder(record_batch), stream); - generate_body(record_batch, stream); + if (compression.has_value()) + { + // TODO Handle this inside get_record_batch_message_builder + auto [compressed_body, compressed_buffers] = generate_compressed_body_and_buffers(record_batch, compression.value()); + common_serialize(get_record_batch_message_builder(record_batch, compression, compressed_body.size(), &compressed_buffers), stream); + // TODO Use something equivalent to generate_body (stream wise, handling children etc) + stream.write(std::span(compressed_body.data(), compressed_body.size())); + } + else + { + common_serialize(get_record_batch_message_builder(record_batch, compression), stream); + generate_body(record_batch, stream); + } } -} \ No newline at end of file +} diff --git a/src/serialize_utils.cpp b/src/serialize_utils.cpp index 18eb854..f3c972d 100644 --- a/src/serialize_utils.cpp +++ b/src/serialize_utils.cpp @@ -1,10 +1,9 @@ -#include -#include - +#include "sparrow_ipc/compression.hpp" #include "sparrow_ipc/flatbuffer_utils.hpp" #include "sparrow_ipc/magic_values.hpp" #include "sparrow_ipc/serialize.hpp" -#include "sparrow_ipc/utils.hpp" +#include "sparrow_ipc/serialize_utils.hpp" + namespace sparrow_ipc { @@ -73,14 +72,24 @@ namespace sparrow_ipc return utils::align_to_8(total_size); } - std::size_t calculate_record_batch_message_size(const sparrow::record_batch& record_batch) + std::size_t calculate_record_batch_message_size(const sparrow::record_batch& record_batch, std::optional compression) { // Build the record batch message to get its exact metadata size - flatbuffers::FlatBufferBuilder record_batch_builder = get_record_batch_message_builder(record_batch); + flatbuffers::FlatBufferBuilder record_batch_builder = get_record_batch_message_builder(record_batch, compression); const flatbuffers::uoffset_t record_batch_len = record_batch_builder.GetSize(); - // Calculate body size (already includes 8-byte alignment for each buffer) - const int64_t body_size = calculate_body_size(record_batch); + std::size_t actual_body_size = 0; + if (compression.has_value()) + { + // If compressed, the body size is the sum of compressed buffer sizes + original size prefixes + padding + auto [compressed_body, compressed_buffers] = generate_compressed_body_and_buffers(record_batch, compression.value()); + actual_body_size = compressed_body.size(); + } + else + { + // If not compressed, the body size is the sum of uncompressed buffer sizes with padding + actual_body_size = static_cast(calculate_body_size(record_batch)); + } // Calculate total size: // - Continuation bytes (4) @@ -91,7 +100,46 @@ namespace sparrow_ipc std::size_t metadata_size = continuation.size() + sizeof(uint32_t) + record_batch_len; metadata_size = utils::align_to_8(metadata_size); - return metadata_size + static_cast(body_size); + return metadata_size + actual_body_size; + } + + [[nodiscard]] SPARROW_IPC_API std::pair, std::vector> + generate_compressed_body_and_buffers(const sparrow::record_batch& record_batch, const org::apache::arrow::flatbuf::CompressionType compression_type) + { + std::vector compressed_body; + std::vector compressed_buffers; + int64_t current_offset = 0; + + for (const auto& column : record_batch.columns()) + { + const auto& arrow_proxy = sparrow::detail::array_access::get_arrow_proxy(column); + for (const auto& buffer : arrow_proxy.buffers()) + { + // Original buffer size + const int64_t original_size = static_cast(buffer.size()); + + // Compress the buffer + std::vector compressed_buffer_data = compress(compression_type, std::span(buffer.data(), original_size)); + const int64_t compressed_data_size = static_cast(compressed_buffer_data.size()); + + // Calculate total size of this compressed chunk (original size prefix + compressed data + padding) + const int64_t total_chunk_size = sizeof(int64_t) + compressed_data_size; + const size_t aligned_chunk_size = utils::align_to_8(total_chunk_size); + const size_t padding_needed = aligned_chunk_size - total_chunk_size; + + // Write original size (8 bytes) followed by compressed data + compressed_body.insert(compressed_body.end(), reinterpret_cast(&original_size), reinterpret_cast(&original_size) + sizeof(int64_t)); + compressed_body.insert(compressed_body.end(), compressed_buffer_data.begin(), compressed_buffer_data.end()); + + // Add padding to the compressed data + compressed_body.insert(compressed_body.end(), padding_needed, 0); + + // Update compressed buffer metadata + compressed_buffers.emplace_back(current_offset, aligned_chunk_size); + current_offset += aligned_chunk_size; + } + } + return {compressed_body, compressed_buffers}; } std::vector get_column_dtypes(const sparrow::record_batch& rb) diff --git a/tests/test_de_serialization_with_files.cpp b/tests/test_de_serialization_with_files.cpp index 52984fe..7b7d236 100644 --- a/tests/test_de_serialization_with_files.cpp +++ b/tests/test_de_serialization_with_files.cpp @@ -224,7 +224,11 @@ TEST_SUITE("Integration tests") const auto record_batches_from_stream = sparrow_ipc::deserialize_stream( std::span(stream_data) ); - const auto serialized_data = sparrow_ipc::serialize(record_batches_from_json, org::apache::arrow::flatbuf::CompressionType::LZ4_FRAME); + + std::vector serialized_data; + sparrow_ipc::memory_output_stream stream(serialized_data); + sparrow_ipc::serializer serializer(stream, org::apache::arrow::flatbuf::CompressionType::LZ4_FRAME); + serializer << record_batches_from_json << sparrow_ipc::end_stream; const auto deserialized_serialized_data = sparrow_ipc::deserialize_stream( std::span(serialized_data) ); diff --git a/tests/test_serialize_utils.cpp b/tests/test_serialize_utils.cpp index 6d08be6..3aebbbb 100644 --- a/tests/test_serialize_utils.cpp +++ b/tests/test_serialize_utils.cpp @@ -111,7 +111,7 @@ namespace sparrow_ipc std::vector serialized; memory_output_stream stream(serialized); any_output_stream astream(stream); - serialize_schema_message(record_batch, astream ); + serialize_schema_message(record_batch, astream); CHECK_EQ(estimated_size, serialized.size()); } @@ -147,7 +147,7 @@ namespace sparrow_ipc std::vector serialized; memory_output_stream stream(serialized); any_output_stream astream(stream); - serialize_record_batch(record_batch, astream); + serialize_record_batch(record_batch, astream, std::nullopt); CHECK_EQ(estimated_size, serialized.size()); } @@ -164,7 +164,7 @@ namespace sparrow_ipc std::vector serialized; memory_output_stream stream(serialized); any_output_stream astream(stream); - serialize_record_batch(record_batch, astream); + serialize_record_batch(record_batch, astream, std::nullopt); CHECK_EQ(estimated_size, serialized.size()); } @@ -243,8 +243,8 @@ namespace sparrow_ipc std::vector serialized; memory_output_stream stream(serialized); any_output_stream astream(stream); - serialize_record_batch(record_batch, astream); - CHECK_GT(serialized.size(), 0); + serialize_record_batch(record_batch, astream, std::nullopt); + CHECK_GT(serialized.size(), 0); // Check that it starts with continuation bytes CHECK_GE(serialized.size(), continuation.size()); @@ -278,7 +278,7 @@ namespace sparrow_ipc std::vector serialized; memory_output_stream stream(serialized); any_output_stream astream(stream); - serialize_record_batch(empty_batch, astream); + serialize_record_batch(empty_batch, astream, std::nullopt); CHECK_GT(serialized.size(), 0); CHECK_GE(serialized.size(), continuation.size()); } From 78d015008d9cf8465cccd9659e1fb07f5a7e336c Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Thu, 16 Oct 2025 15:55:19 +0200 Subject: [PATCH 14/19] Add compression test --- include/sparrow_ipc/compression.hpp | 6 +++-- tests/CMakeLists.txt | 1 + tests/test_compression.cpp | 35 +++++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 2 deletions(-) create mode 100644 tests/test_compression.cpp diff --git a/include/sparrow_ipc/compression.hpp b/include/sparrow_ipc/compression.hpp index 47d0cc3..1b4d786 100644 --- a/include/sparrow_ipc/compression.hpp +++ b/include/sparrow_ipc/compression.hpp @@ -6,6 +6,8 @@ #include "Message_generated.h" +#include "sparrow_ipc/config/config.hpp" + namespace sparrow_ipc { // TODO use these later if needed for wrapping purposes (flatbuffers/lz4) @@ -18,6 +20,6 @@ namespace sparrow_ipc // CompressionType to_compression_type(org::apache::arrow::flatbuf::CompressionType compression_type); - std::vector compress(const org::apache::arrow::flatbuf::CompressionType compression_type, std::span data); - std::vector decompress(const org::apache::arrow::flatbuf::CompressionType compression_type, std::span data); + SPARROW_IPC_API std::vector compress(const org::apache::arrow::flatbuf::CompressionType compression_type, std::span data); + SPARROW_IPC_API std::vector decompress(const org::apache::arrow::flatbuf::CompressionType compression_type, std::span data); } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 11c2f9f..d1de914 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -10,6 +10,7 @@ set(SPARROW_IPC_TESTS_SRC test_arrow_schema.cpp test_chunk_memory_output_stream.cpp test_chunk_memory_serializer.cpp + test_compression.cpp test_de_serialization_with_files.cpp $<$>:test_flatbuffer_utils.cpp> test_memory_output_streams.cpp diff --git a/tests/test_compression.cpp b/tests/test_compression.cpp new file mode 100644 index 0000000..cfda32f --- /dev/null +++ b/tests/test_compression.cpp @@ -0,0 +1,35 @@ +#include +#include + +#include + +#include + +namespace sparrow_ipc +{ + TEST_CASE("Compression and Decompression Round-trip") + { + std::string original_string = "hello world, this is a test of compression and decompression!!"; + std::vector original_data(original_string.begin(), original_string.end()); + const int64_t original_size = original_data.size(); + + // Compress data + auto compression_type = org::apache::arrow::flatbuf::CompressionType::LZ4_FRAME; + std::vector compressed_frame = compress(compression_type, original_data); + + CHECK_GT(compressed_frame.size(), 0); + CHECK_NE(compressed_frame, original_data); + + // Manually create the IPC-formatted compressed buffer by adding the 8-byte prefix + std::vector ipc_formatted_buffer; + ipc_formatted_buffer.reserve(sizeof(int64_t) + compressed_frame.size()); + ipc_formatted_buffer.insert(ipc_formatted_buffer.end(), reinterpret_cast(&original_size), reinterpret_cast(&original_size) + sizeof(int64_t)); + ipc_formatted_buffer.insert(ipc_formatted_buffer.end(), compressed_frame.begin(), compressed_frame.end()); + + // Decompress + std::vector decompressed_data = decompress(compression_type, ipc_formatted_buffer); + + CHECK_EQ(decompressed_data.size(), original_data.size()); + CHECK_EQ(decompressed_data, original_data); + } +} From bed642108ca8beb48fdb0c28c58228a41b20f71b Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Fri, 17 Oct 2025 10:17:32 +0200 Subject: [PATCH 15/19] Minor changes --- environment-dev.yml | 2 +- .../sparrow_ipc/chunk_memory_serializer.hpp | 17 +--- src/compression.cpp | 89 ++++++++++++------- tests/test_compression.cpp | 66 ++++++++++---- 4 files changed, 106 insertions(+), 68 deletions(-) diff --git a/environment-dev.yml b/environment-dev.yml index 3d21831..c9d941b 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -8,10 +8,10 @@ dependencies: - cxx-compiler # Libraries dependencies - flatbuffers + - lz4-c - nlohmann_json - sparrow-devel - sparrow-json-reader - - lz4 # Testing dependencies - doctest # Documentation dependencies diff --git a/include/sparrow_ipc/chunk_memory_serializer.hpp b/include/sparrow_ipc/chunk_memory_serializer.hpp index 99c5acf..4868a42 100644 --- a/include/sparrow_ipc/chunk_memory_serializer.hpp +++ b/include/sparrow_ipc/chunk_memory_serializer.hpp @@ -44,6 +44,7 @@ namespace sparrow_ipc * @param stream Reference to a chunked memory output stream that will receive the serialized chunks * @param compression Optional: The compression type to use for record batch bodies. */ + // TODO Use enums and such to avoid including flatbuffers headers chunk_serializer(chunked_memory_output_stream>>& stream, std::optional compression = std::nullopt); /** @@ -144,21 +145,7 @@ namespace sparrow_ipc throw std::runtime_error("Cannot append record batches to a serializer that has been ended"); } - const auto reserve_function = [&record_batches, this]() - { - return std::accumulate( - record_batches.begin(), - record_batches.end(), - m_pstream->size(), - [this](size_t acc, const sparrow::record_batch& rb) - { - return acc + calculate_record_batch_message_size(rb, m_compression); - } - ) - + (m_schema_received ? 0 : calculate_schema_message_size(*record_batches.begin())); - }; - - m_pstream->reserve(reserve_function); + m_pstream->reserve((m_schema_received ? 0 : 1) + m_pstream->size() + record_batches.size()); if (!m_schema_received) { diff --git a/src/compression.cpp b/src/compression.cpp index 5170903..ea9983c 100644 --- a/src/compression.cpp +++ b/src/compression.cpp @@ -20,6 +20,49 @@ namespace sparrow_ipc // } // } + std::vector lz4_compress(std::span data) + { + const std::int64_t uncompressed_size = data.size(); + const size_t max_compressed_size = LZ4F_compressFrameBound(uncompressed_size, nullptr); + std::vector compressed_data(max_compressed_size); + const size_t compressed_size = LZ4F_compressFrame(compressed_data.data(), max_compressed_size, data.data(), uncompressed_size, nullptr); + if (LZ4F_isError(compressed_size)) + { + throw std::runtime_error("Failed to compress data with LZ4 frame format"); + } + compressed_data.resize(compressed_size); + return compressed_data; + } + + std::vector lz4_decompress(std::span data) + { + if (data.size() < 8) + { + throw std::runtime_error("Invalid compressed data: missing decompressed size"); + } + const std::int64_t decompressed_size = *reinterpret_cast(data.data()); + const auto compressed_data = data.subspan(8); + + if (decompressed_size == -1) + { + // TODO think of avoiding copy here + return {compressed_data.begin(), compressed_data.end()}; + } + + std::vector decompressed_data(decompressed_size); + LZ4F_dctx* dctx = nullptr; + LZ4F_createDecompressionContext(&dctx, LZ4F_VERSION); + size_t compressed_size_in_out = compressed_data.size(); + size_t decompressed_size_in_out = decompressed_size; + size_t result = LZ4F_decompress(dctx, decompressed_data.data(), &decompressed_size_in_out, compressed_data.data(), &compressed_size_in_out, nullptr); + if (LZ4F_isError(result) || (decompressed_size_in_out != (size_t)decompressed_size)) + { + throw std::runtime_error("Failed to decompress data with LZ4 frame format"); + } + LZ4F_freeDecompressionContext(dctx); + return decompressed_data; + } + std::vector compress(const org::apache::arrow::flatbuf::CompressionType compression_type, std::span data) { if (data.empty()) @@ -30,18 +73,14 @@ namespace sparrow_ipc { case org::apache::arrow::flatbuf::CompressionType::LZ4_FRAME: { - const std::int64_t uncompressed_size = data.size(); - const size_t max_compressed_size = LZ4F_compressFrameBound(uncompressed_size, nullptr); - std::vector compressed_data(max_compressed_size); - const size_t compressed_size = LZ4F_compressFrame(compressed_data.data(), max_compressed_size, data.data(), uncompressed_size, nullptr); - if (LZ4F_isError(compressed_size)) - { - throw std::runtime_error("Failed to compress data with LZ4 frame format"); - } - compressed_data.resize(compressed_size); - return compressed_data; + return lz4_compress(data); + } + case org::apache::arrow::flatbuf::CompressionType::ZSTD: + { + throw std::runtime_error("Compression using zstd is not supported yet."); } default: + // TODO think of avoiding copy here return {data.begin(), data.end()}; } } @@ -56,32 +95,14 @@ namespace sparrow_ipc { case org::apache::arrow::flatbuf::CompressionType::LZ4_FRAME: { - if (data.size() < 8) - { - throw std::runtime_error("Invalid compressed data: missing decompressed size"); - } - const std::int64_t decompressed_size = *reinterpret_cast(data.data()); - const auto compressed_data = data.subspan(8); - - if (decompressed_size == -1) - { - return {compressed_data.begin(), compressed_data.end()}; - } - - std::vector decompressed_data(decompressed_size); - LZ4F_dctx* dctx = nullptr; - LZ4F_createDecompressionContext(&dctx, LZ4F_VERSION); - size_t compressed_size_in_out = compressed_data.size(); - size_t decompressed_size_in_out = decompressed_size; - size_t result = LZ4F_decompress(dctx, decompressed_data.data(), &decompressed_size_in_out, compressed_data.data(), &compressed_size_in_out, nullptr); - if (LZ4F_isError(result)) - { - throw std::runtime_error("Failed to decompress data with LZ4 frame format"); - } - LZ4F_freeDecompressionContext(dctx); - return decompressed_data; + return lz4_decompress(data); + } + case org::apache::arrow::flatbuf::CompressionType::ZSTD: + { + throw std::runtime_error("Decompression using zstd is not supported yet."); } default: + // TODO think of avoiding copy here return {data.begin(), data.end()}; } } diff --git a/tests/test_compression.cpp b/tests/test_compression.cpp index cfda32f..8af026f 100644 --- a/tests/test_compression.cpp +++ b/tests/test_compression.cpp @@ -7,29 +7,59 @@ namespace sparrow_ipc { - TEST_CASE("Compression and Decompression Round-trip") + TEST_SUITE("De/Compression") { - std::string original_string = "hello world, this is a test of compression and decompression!!"; - std::vector original_data(original_string.begin(), original_string.end()); - const int64_t original_size = original_data.size(); + TEST_CASE("Unsupported ZSTD de/compression") + { + std::string original_string = "some data to compress"; + std::vector original_data(original_string.begin(), original_string.end()); + const auto compression_type = org::apache::arrow::flatbuf::CompressionType::ZSTD; - // Compress data - auto compression_type = org::apache::arrow::flatbuf::CompressionType::LZ4_FRAME; - std::vector compressed_frame = compress(compression_type, original_data); + // Test compression with ZSTD + CHECK_THROWS_WITH_AS(sparrow_ipc::compress(compression_type, original_data), "Compression using zstd is not supported yet.", std::runtime_error); - CHECK_GT(compressed_frame.size(), 0); - CHECK_NE(compressed_frame, original_data); + // Test decompression with ZSTD + CHECK_THROWS_WITH_AS(sparrow_ipc::decompress(compression_type, original_data), "Decompression using zstd is not supported yet.", std::runtime_error); + } - // Manually create the IPC-formatted compressed buffer by adding the 8-byte prefix - std::vector ipc_formatted_buffer; - ipc_formatted_buffer.reserve(sizeof(int64_t) + compressed_frame.size()); - ipc_formatted_buffer.insert(ipc_formatted_buffer.end(), reinterpret_cast(&original_size), reinterpret_cast(&original_size) + sizeof(int64_t)); - ipc_formatted_buffer.insert(ipc_formatted_buffer.end(), compressed_frame.begin(), compressed_frame.end()); + TEST_CASE("Empty data") + { + const std::vector empty_data; + const auto compression_type = org::apache::arrow::flatbuf::CompressionType::LZ4_FRAME; - // Decompress - std::vector decompressed_data = decompress(compression_type, ipc_formatted_buffer); + // Test compression of empty data + auto compressed = compress(compression_type, empty_data); + CHECK(compressed.empty()); - CHECK_EQ(decompressed_data.size(), original_data.size()); - CHECK_EQ(decompressed_data, original_data); + // Test decompression of empty data + auto decompressed = decompress(compression_type, empty_data); + CHECK(decompressed.empty()); + } + + TEST_CASE("Data compression and decompression round-trip") + { + std::string original_string = "hello world, this is a test of compression and decompression!!"; + std::vector original_data(original_string.begin(), original_string.end()); + const int64_t original_size = original_data.size(); + + // Compress data + auto compression_type = org::apache::arrow::flatbuf::CompressionType::LZ4_FRAME; + std::vector compressed_frame = compress(compression_type, original_data); + + CHECK_GT(compressed_frame.size(), 0); + CHECK_NE(compressed_frame, original_data); + + // Manually create the IPC-formatted compressed buffer by adding the 8-byte prefix + std::vector ipc_formatted_buffer; + ipc_formatted_buffer.reserve(sizeof(int64_t) + compressed_frame.size()); + ipc_formatted_buffer.insert(ipc_formatted_buffer.end(), reinterpret_cast(&original_size), reinterpret_cast(&original_size) + sizeof(int64_t)); + ipc_formatted_buffer.insert(ipc_formatted_buffer.end(), compressed_frame.begin(), compressed_frame.end()); + + // Decompress + std::vector decompressed_data = decompress(compression_type, ipc_formatted_buffer); + + CHECK_EQ(decompressed_data.size(), original_data.size()); + CHECK_EQ(decompressed_data, original_data); + } } } From 4b171cca3f93b48e246090137f59fb0dad7a29c2 Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Fri, 17 Oct 2025 16:49:29 +0200 Subject: [PATCH 16/19] Rework compression and add tests --- include/sparrow_ipc/compression.hpp | 7 +- src/compression.cpp | 125 +++++++++++++++++++--------- src/deserialize_utils.cpp | 24 +++++- src/serialize_utils.cpp | 23 ++--- tests/test_compression.cpp | 62 ++++++++++---- 5 files changed, 164 insertions(+), 77 deletions(-) diff --git a/include/sparrow_ipc/compression.hpp b/include/sparrow_ipc/compression.hpp index 1b4d786..96b47ec 100644 --- a/include/sparrow_ipc/compression.hpp +++ b/include/sparrow_ipc/compression.hpp @@ -2,6 +2,7 @@ #include #include +#include #include #include "Message_generated.h" @@ -20,6 +21,8 @@ namespace sparrow_ipc // CompressionType to_compression_type(org::apache::arrow::flatbuf::CompressionType compression_type); - SPARROW_IPC_API std::vector compress(const org::apache::arrow::flatbuf::CompressionType compression_type, std::span data); - SPARROW_IPC_API std::vector decompress(const org::apache::arrow::flatbuf::CompressionType compression_type, std::span data); + constexpr auto CompressionHeaderSize = sizeof(std::int64_t); + + [[nodiscard]] SPARROW_IPC_API std::vector compress(const org::apache::arrow::flatbuf::CompressionType compression_type, std::span data); + [[nodiscard]] SPARROW_IPC_API std::variant, std::span> decompress(const org::apache::arrow::flatbuf::CompressionType compression_type, std::span data); } diff --git a/src/compression.cpp b/src/compression.cpp index ea9983c..c8ad598 100644 --- a/src/compression.cpp +++ b/src/compression.cpp @@ -20,90 +20,135 @@ namespace sparrow_ipc // } // } - std::vector lz4_compress(std::span data) + namespace { - const std::int64_t uncompressed_size = data.size(); - const size_t max_compressed_size = LZ4F_compressFrameBound(uncompressed_size, nullptr); - std::vector compressed_data(max_compressed_size); - const size_t compressed_size = LZ4F_compressFrame(compressed_data.data(), max_compressed_size, data.data(), uncompressed_size, nullptr); - if (LZ4F_isError(compressed_size)) + std::vector lz4_compress(std::span data) { - throw std::runtime_error("Failed to compress data with LZ4 frame format"); + const std::int64_t uncompressed_size = data.size(); + const size_t max_compressed_size = LZ4F_compressFrameBound(uncompressed_size, nullptr); + std::vector compressed_data(max_compressed_size); + const size_t compressed_size = LZ4F_compressFrame(compressed_data.data(), max_compressed_size, data.data(), uncompressed_size, nullptr); + if (LZ4F_isError(compressed_size)) + { + throw std::runtime_error("Failed to compress data with LZ4 frame format"); + } + compressed_data.resize(compressed_size); + return compressed_data; } - compressed_data.resize(compressed_size); - return compressed_data; - } - std::vector lz4_decompress(std::span data) - { - if (data.size() < 8) + std::vector lz4_decompress(std::span data, const std::int64_t decompressed_size) { - throw std::runtime_error("Invalid compressed data: missing decompressed size"); + std::vector decompressed_data(decompressed_size); + LZ4F_dctx* dctx = nullptr; + LZ4F_createDecompressionContext(&dctx, LZ4F_VERSION); + size_t compressed_size_in_out = data.size(); + size_t decompressed_size_in_out = decompressed_size; + size_t result = LZ4F_decompress(dctx, decompressed_data.data(), &decompressed_size_in_out, data.data(), &compressed_size_in_out, nullptr); + if (LZ4F_isError(result) || (decompressed_size_in_out != (size_t)decompressed_size)) + { + throw std::runtime_error("Failed to decompress data with LZ4 frame format"); + } + LZ4F_freeDecompressionContext(dctx); + return decompressed_data; } - const std::int64_t decompressed_size = *reinterpret_cast(data.data()); - const auto compressed_data = data.subspan(8); - if (decompressed_size == -1) + // TODO These functions could be moved to serialize_utils and deserialize_utils if preferred + // as they are handling the header size + std::vector uncompressed_data_with_header(std::span data) { - // TODO think of avoiding copy here - return {compressed_data.begin(), compressed_data.end()}; + std::vector result; + result.reserve(CompressionHeaderSize + data.size()); + const std::int64_t header = -1; + result.insert(result.end(), reinterpret_cast(&header), reinterpret_cast(&header) + sizeof(header)); + result.insert(result.end(), data.begin(), data.end()); + return result; } - std::vector decompressed_data(decompressed_size); - LZ4F_dctx* dctx = nullptr; - LZ4F_createDecompressionContext(&dctx, LZ4F_VERSION); - size_t compressed_size_in_out = compressed_data.size(); - size_t decompressed_size_in_out = decompressed_size; - size_t result = LZ4F_decompress(dctx, decompressed_data.data(), &decompressed_size_in_out, compressed_data.data(), &compressed_size_in_out, nullptr); - if (LZ4F_isError(result) || (decompressed_size_in_out != (size_t)decompressed_size)) + std::vector lz4_compress_with_header(std::span data) { - throw std::runtime_error("Failed to decompress data with LZ4 frame format"); + const std::int64_t original_size = data.size(); + auto compressed_body = lz4_compress(data); + + if (compressed_body.size() >= static_cast(original_size)) + { + return uncompressed_data_with_header(data); + } + + std::vector result; + result.reserve(CompressionHeaderSize + compressed_body.size()); + result.insert(result.end(), reinterpret_cast(&original_size), reinterpret_cast(&original_size) + sizeof(original_size)); + result.insert(result.end(), compressed_body.begin(), compressed_body.end()); + return result; + } + + std::variant, std::span> lz4_decompress_with_header(std::span data) + { + if (data.size() < CompressionHeaderSize) + { + throw std::runtime_error("Invalid compressed data: missing decompressed size"); + } + const std::int64_t decompressed_size = *reinterpret_cast(data.data()); + const auto compressed_data = data.subspan(CompressionHeaderSize); + + if (decompressed_size == -1) + { + return compressed_data; + } + + return lz4_decompress(compressed_data, decompressed_size); + } + + std::span get_body_from_uncompressed_data(std::span data) + { + if (data.size() < CompressionHeaderSize) + { + throw std::runtime_error("Invalid data: missing header"); + } + return data.subspan(CompressionHeaderSize); } - LZ4F_freeDecompressionContext(dctx); - return decompressed_data; } std::vector compress(const org::apache::arrow::flatbuf::CompressionType compression_type, std::span data) { - if (data.empty()) - { - return {}; - } switch (compression_type) { case org::apache::arrow::flatbuf::CompressionType::LZ4_FRAME: { - return lz4_compress(data); + return lz4_compress_with_header(data); } case org::apache::arrow::flatbuf::CompressionType::ZSTD: { throw std::runtime_error("Compression using zstd is not supported yet."); } default: - // TODO think of avoiding copy here - return {data.begin(), data.end()}; + return uncompressed_data_with_header(data); } } - std::vector decompress(const org::apache::arrow::flatbuf::CompressionType compression_type, std::span data) + std::variant, std::span> decompress(const org::apache::arrow::flatbuf::CompressionType compression_type, std::span data) { + // Handle empty input: an empty span is a valid representation for an empty buffer + // (e.g., a validity bitmap for a column with no nulls) and should decompress to an empty output. + // TODO if we don't call this fct anymore on validity buffers, remove this empty data handling if (data.empty()) { return {}; } + switch (compression_type) { case org::apache::arrow::flatbuf::CompressionType::LZ4_FRAME: { - return lz4_decompress(data); + return lz4_decompress_with_header(data); } case org::apache::arrow::flatbuf::CompressionType::ZSTD: { throw std::runtime_error("Decompression using zstd is not supported yet."); } default: - // TODO think of avoiding copy here - return {data.begin(), data.end()}; + { + return get_body_from_uncompressed_data(data); + } } } } diff --git a/src/deserialize_utils.cpp b/src/deserialize_utils.cpp index 3eba4cc..a3ea3a0 100644 --- a/src/deserialize_utils.cpp +++ b/src/deserialize_utils.cpp @@ -61,8 +61,28 @@ namespace sparrow_ipc::utils if (compression) { - decompressed_storage.emplace_back(decompress(compression->codec(), buffer_span)); - buffer_span = decompressed_storage.back(); + auto decompressed_result = decompress(compression->codec(), buffer_span); + return std::visit( + [&decompressed_storage](auto&& arg) -> std::span + { + using T = std::decay_t; + if constexpr (std::is_same_v>) + { + // Decompression happened + decompressed_storage.emplace_back(std::move(arg)); + return decompressed_storage.back(); + } + else // It's a std::span + { + // No decompression, but we are in a compression context, so we must copy the buffer + // to ensure the owning ArrowArray has access to all its data. + // TODO think about other strategies + decompressed_storage.emplace_back(arg.begin(), arg.end()); + return decompressed_storage.back(); + } + }, + decompressed_result + ); } return buffer_span; } diff --git a/src/serialize_utils.cpp b/src/serialize_utils.cpp index f3c972d..cbab29f 100644 --- a/src/serialize_utils.cpp +++ b/src/serialize_utils.cpp @@ -103,7 +103,7 @@ namespace sparrow_ipc return metadata_size + actual_body_size; } - [[nodiscard]] SPARROW_IPC_API std::pair, std::vector> + std::pair, std::vector> generate_compressed_body_and_buffers(const sparrow::record_batch& record_batch, const org::apache::arrow::flatbuf::CompressionType compression_type) { std::vector compressed_body; @@ -115,23 +115,16 @@ namespace sparrow_ipc const auto& arrow_proxy = sparrow::detail::array_access::get_arrow_proxy(column); for (const auto& buffer : arrow_proxy.buffers()) { - // Original buffer size - const int64_t original_size = static_cast(buffer.size()); + // Compress the buffer. The returned buffer already has the correct size header. + std::vector compressed_buffer_with_header = compress(compression_type, std::span(buffer.data(), buffer.size())); - // Compress the buffer - std::vector compressed_buffer_data = compress(compression_type, std::span(buffer.data(), original_size)); - const int64_t compressed_data_size = static_cast(compressed_buffer_data.size()); + const size_t aligned_chunk_size = utils::align_to_8(compressed_buffer_with_header.size()); + const size_t padding_needed = aligned_chunk_size - compressed_buffer_with_header.size(); - // Calculate total size of this compressed chunk (original size prefix + compressed data + padding) - const int64_t total_chunk_size = sizeof(int64_t) + compressed_data_size; - const size_t aligned_chunk_size = utils::align_to_8(total_chunk_size); - const size_t padding_needed = aligned_chunk_size - total_chunk_size; + // Write compressed data with header + compressed_body.insert(compressed_body.end(), compressed_buffer_with_header.begin(), compressed_buffer_with_header.end()); - // Write original size (8 bytes) followed by compressed data - compressed_body.insert(compressed_body.end(), reinterpret_cast(&original_size), reinterpret_cast(&original_size) + sizeof(int64_t)); - compressed_body.insert(compressed_body.end(), compressed_buffer_data.begin(), compressed_buffer_data.end()); - - // Add padding to the compressed data + // Add padding compressed_body.insert(compressed_body.end(), padding_needed, 0); // Update compressed buffer metadata diff --git a/tests/test_compression.cpp b/tests/test_compression.cpp index 8af026f..81140ba 100644 --- a/tests/test_compression.cpp +++ b/tests/test_compression.cpp @@ -16,10 +16,10 @@ namespace sparrow_ipc const auto compression_type = org::apache::arrow::flatbuf::CompressionType::ZSTD; // Test compression with ZSTD - CHECK_THROWS_WITH_AS(sparrow_ipc::compress(compression_type, original_data), "Compression using zstd is not supported yet.", std::runtime_error); + CHECK_THROWS_WITH_AS(compress(compression_type, original_data), "Compression using zstd is not supported yet.", std::runtime_error); // Test decompression with ZSTD - CHECK_THROWS_WITH_AS(sparrow_ipc::decompress(compression_type, original_data), "Decompression using zstd is not supported yet.", std::runtime_error); + CHECK_THROWS_WITH_AS(decompress(compression_type, original_data), "Decompression using zstd is not supported yet.", std::runtime_error); } TEST_CASE("Empty data") @@ -29,37 +29,63 @@ namespace sparrow_ipc // Test compression of empty data auto compressed = compress(compression_type, empty_data); - CHECK(compressed.empty()); + CHECK_EQ(compressed.size(), CompressionHeaderSize); + const std::int64_t header = *reinterpret_cast(compressed.data()); + CHECK_EQ(header, -1); // Test decompression of empty data - auto decompressed = decompress(compression_type, empty_data); - CHECK(decompressed.empty()); + auto decompressed = decompress(compression_type, compressed); + std::visit([](const auto& value) { CHECK(value.empty()); }, decompressed); } TEST_CASE("Data compression and decompression round-trip") { - std::string original_string = "hello world, this is a test of compression and decompression!!"; + std::string original_string = "Hello world, this is a test of compression and decompression. But we need more words to make this compression worth it!"; std::vector original_data(original_string.begin(), original_string.end()); - const int64_t original_size = original_data.size(); // Compress data auto compression_type = org::apache::arrow::flatbuf::CompressionType::LZ4_FRAME; - std::vector compressed_frame = compress(compression_type, original_data); + std::vector compressed_data = compress(compression_type, original_data); - CHECK_GT(compressed_frame.size(), 0); - CHECK_NE(compressed_frame, original_data); + // Decompress + auto decompressed_result = decompress(compression_type, compressed_data); + std::visit( + [&original_data](const auto& decompressed_data) + { + CHECK_EQ(decompressed_data.size(), original_data.size()); + const std::vector vec(decompressed_data.begin(), decompressed_data.end()); + CHECK_EQ(vec, original_data); + }, + decompressed_result + ); + } - // Manually create the IPC-formatted compressed buffer by adding the 8-byte prefix - std::vector ipc_formatted_buffer; - ipc_formatted_buffer.reserve(sizeof(int64_t) + compressed_frame.size()); - ipc_formatted_buffer.insert(ipc_formatted_buffer.end(), reinterpret_cast(&original_size), reinterpret_cast(&original_size) + sizeof(int64_t)); - ipc_formatted_buffer.insert(ipc_formatted_buffer.end(), compressed_frame.begin(), compressed_frame.end()); + TEST_CASE("Data compression with incompressible data") + { + std::string original_string = "abc"; + std::vector original_data(original_string.begin(), original_string.end()); + + // Compress data + auto compression_type = org::apache::arrow::flatbuf::CompressionType::LZ4_FRAME; + std::vector compressed_data = compress(compression_type, original_data); // Decompress - std::vector decompressed_data = decompress(compression_type, ipc_formatted_buffer); + auto decompressed_result = decompress(compression_type, compressed_data); + std::visit( + [&original_data](const auto& decompressed_data) + { + CHECK_EQ(decompressed_data.size(), original_data.size()); + const std::vector vec(decompressed_data.begin(), decompressed_data.end()); + CHECK_EQ(vec, original_data); + }, + decompressed_result + ); - CHECK_EQ(decompressed_data.size(), original_data.size()); - CHECK_EQ(decompressed_data, original_data); + // Check that the compressed data is just the original data with a -1 header + const std::int64_t header = *reinterpret_cast(compressed_data.data()); + CHECK_EQ(header, -1); + std::vector body(compressed_data.begin() + sizeof(header), compressed_data.end()); + CHECK_EQ(body, original_data); } } } From 7b594368ea91d8ec9c2af8c9e0b41a7c7a03c931 Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Fri, 17 Oct 2025 18:08:23 +0200 Subject: [PATCH 17/19] Add missing header in test --- tests/test_compression.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_compression.cpp b/tests/test_compression.cpp index 81140ba..4c5946e 100644 --- a/tests/test_compression.cpp +++ b/tests/test_compression.cpp @@ -1,3 +1,4 @@ +#include #include #include From 5c1b6ba2f6db09cbb94a562d18326743ed967446 Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Mon, 20 Oct 2025 17:25:12 +0200 Subject: [PATCH 18/19] Simplify --- .../deserialize_primitive_array.hpp | 1 + include/sparrow_ipc/serialize_utils.hpp | 22 ++--- src/flatbuffer_utils.cpp | 2 +- src/serialize.cpp | 11 ++- src/serialize_utils.cpp | 90 +++++++++---------- tests/test_serialize_utils.cpp | 27 +++++- 6 files changed, 86 insertions(+), 67 deletions(-) diff --git a/include/sparrow_ipc/deserialize_primitive_array.hpp b/include/sparrow_ipc/deserialize_primitive_array.hpp index 76f7212..af24b7c 100644 --- a/include/sparrow_ipc/deserialize_primitive_array.hpp +++ b/include/sparrow_ipc/deserialize_primitive_array.hpp @@ -38,6 +38,7 @@ namespace sparrow_ipc const auto compression = record_batch.compression(); std::vector> decompressed_buffers; + // TODO do not decompress validity buffers? auto validity_buffer_span = utils::get_and_decompress_buffer(record_batch, body, buffer_index, compression, decompressed_buffers); const auto [bitmap_ptr, null_count] = utils::get_bitmap_pointer_and_null_count(validity_buffer_span, record_batch.length()); diff --git a/include/sparrow_ipc/serialize_utils.hpp b/include/sparrow_ipc/serialize_utils.hpp index 5129b73..25f9a1e 100644 --- a/include/sparrow_ipc/serialize_utils.hpp +++ b/include/sparrow_ipc/serialize_utils.hpp @@ -128,13 +128,11 @@ namespace sparrow_ipc * * @param record_batch The record batch to serialize. * @param compression_type The compression algorithm to use (e.g., LZ4_FRAME, ZSTD). - * @return A std::pair containing: - * - first: A vector of bytes representing the complete compressed message body. - * - second: A vector of FlatBuffer Buffer objects describing the offset and - * size of each buffer within the compressed body. + * @return A vector of FlatBuffer Buffer objects describing the offset and + * size of each buffer within the compressed body. */ - [[nodiscard]] SPARROW_IPC_API std::pair, std::vector> - generate_compressed_body_and_buffers(const sparrow::record_batch& record_batch, const org::apache::arrow::flatbuf::CompressionType compression_type); + [[nodiscard]] SPARROW_IPC_API std::vector + generate_compressed_buffers(const sparrow::record_batch& record_batch, const org::apache::arrow::flatbuf::CompressionType compression_type); /** * @brief Fills the body vector with serialized data from an arrow proxy and its children. @@ -150,8 +148,9 @@ namespace sparrow_ipc * * @param arrow_proxy The arrow proxy containing buffers and potential child proxies to serialize * @param stream The output stream where the serialized body data will be written + * @param compression The compression type to use when serializing */ - SPARROW_IPC_API void fill_body(const sparrow::arrow_proxy& arrow_proxy, any_output_stream& stream); + SPARROW_IPC_API void fill_body(const sparrow::arrow_proxy& arrow_proxy, any_output_stream& stream, std::optional compression = std::nullopt); /** * @brief Generates a serialized body from a record batch. @@ -162,8 +161,9 @@ namespace sparrow_ipc * * @param record_batch The record batch containing columns to be serialized * @param stream The output stream where the serialized body will be written + * @param compression The compression type to use when serializing */ - SPARROW_IPC_API void generate_body(const sparrow::record_batch& record_batch, any_output_stream& stream); + SPARROW_IPC_API void generate_body(const sparrow::record_batch& record_batch, any_output_stream& stream, std::optional compression = std::nullopt); /** * @brief Calculates the total size of the body section for an Arrow array. @@ -173,9 +173,10 @@ namespace sparrow_ipc * buffer size is aligned to 8-byte boundaries as required by the Arrow format. * * @param arrow_proxy The Arrow array proxy containing buffers and child arrays + * @param compression The compression type to use when serializing * @return int64_t The total aligned size in bytes of all buffers in the array hierarchy */ - [[nodiscard]] SPARROW_IPC_API int64_t calculate_body_size(const sparrow::arrow_proxy& arrow_proxy); + [[nodiscard]] SPARROW_IPC_API int64_t calculate_body_size(const sparrow::arrow_proxy& arrow_proxy, std::optional compression = std::nullopt); /** * @brief Calculates the total body size of a record batch by summing the body sizes of all its columns. @@ -185,9 +186,10 @@ namespace sparrow_ipc * the total memory required for the serialized data content of the record batch. * * @param record_batch The sparrow record batch containing columns to calculate size for + * @param compression The compression type to use when serializing * @return int64_t The total body size in bytes of all columns in the record batch */ - [[nodiscard]] SPARROW_IPC_API int64_t calculate_body_size(const sparrow::record_batch& record_batch); + [[nodiscard]] SPARROW_IPC_API int64_t calculate_body_size(const sparrow::record_batch& record_batch, std::optional compression = std::nullopt); SPARROW_IPC_API std::vector get_column_dtypes(const sparrow::record_batch& rb); } diff --git a/src/flatbuffer_utils.cpp b/src/flatbuffer_utils.cpp index 1e580f3..3b2a2d2 100644 --- a/src/flatbuffer_utils.cpp +++ b/src/flatbuffer_utils.cpp @@ -583,7 +583,7 @@ namespace sparrow_ipc 0 // TODO :variadic buffer Counts ); - const int64_t body_size = body_size_override.value_or(calculate_body_size(record_batch)); + const int64_t body_size = body_size_override.value_or(calculate_body_size(record_batch, compression)); const auto record_batch_message_offset = org::apache::arrow::flatbuf::CreateMessage( record_batch_builder, org::apache::arrow::flatbuf::MetadataVersion::V5, diff --git a/src/serialize.cpp b/src/serialize.cpp index 644acf3..99ae36f 100644 --- a/src/serialize.cpp +++ b/src/serialize.cpp @@ -28,15 +28,14 @@ namespace sparrow_ipc if (compression.has_value()) { // TODO Handle this inside get_record_batch_message_builder - auto [compressed_body, compressed_buffers] = generate_compressed_body_and_buffers(record_batch, compression.value()); - common_serialize(get_record_batch_message_builder(record_batch, compression, compressed_body.size(), &compressed_buffers), stream); - // TODO Use something equivalent to generate_body (stream wise, handling children etc) - stream.write(std::span(compressed_body.data(), compressed_body.size())); + auto compressed_buffers = generate_compressed_buffers(record_batch, compression.value()); + auto body_size_override = calculate_body_size(record_batch, compression); + common_serialize(get_record_batch_message_builder(record_batch, compression, body_size_override, &compressed_buffers), stream); } else { - common_serialize(get_record_batch_message_builder(record_batch, compression), stream); - generate_body(record_batch, stream); + common_serialize(get_record_batch_message_builder(record_batch, compression, std::nullopt, nullptr), stream); } + generate_body(record_batch, stream, compression); } } diff --git a/src/serialize_utils.cpp b/src/serialize_utils.cpp index cbab29f..875e8be 100644 --- a/src/serialize_utils.cpp +++ b/src/serialize_utils.cpp @@ -7,52 +7,69 @@ namespace sparrow_ipc { - void fill_body(const sparrow::arrow_proxy& arrow_proxy, any_output_stream& stream) + void fill_body(const sparrow::arrow_proxy& arrow_proxy, any_output_stream& stream, std::optional compression) { - for (const auto& buffer : arrow_proxy.buffers()) - { - stream.write(buffer); + std::for_each(arrow_proxy.buffers().begin(), arrow_proxy.buffers().end(), [&](const auto& buffer) { + if (compression.has_value()) + { + auto compressed_buffer_with_header = compress(compression.value(), std::span(buffer.data(), buffer.size())); + stream.write(std::span(compressed_buffer_with_header.data(), compressed_buffer_with_header.size())); + } + else + { + stream.write(buffer); + } stream.add_padding(); - } - for (const auto& child : arrow_proxy.children()) - { - fill_body(child, stream); - } + }); + + std::for_each(arrow_proxy.children().begin(), arrow_proxy.children().end(), [&](const auto& child) { + fill_body(child, stream, compression); + }); } - void generate_body(const sparrow::record_batch& record_batch, any_output_stream& stream) + void generate_body(const sparrow::record_batch& record_batch, any_output_stream& stream, std::optional compression) { - for (const auto& column : record_batch.columns()) - { + std::for_each(record_batch.columns().begin(), record_batch.columns().end(), [&](const auto& column) { const auto& arrow_proxy = sparrow::detail::array_access::get_arrow_proxy(column); - fill_body(arrow_proxy, stream); - } + fill_body(arrow_proxy, stream, compression); + }); } - int64_t calculate_body_size(const sparrow::arrow_proxy& arrow_proxy) + int64_t calculate_body_size(const sparrow::arrow_proxy& arrow_proxy, std::optional compression) { int64_t total_size = 0; - for (const auto& buffer : arrow_proxy.buffers()) + if (compression.has_value()) + { + for (const auto& buffer : arrow_proxy.buffers()) + { + total_size += utils::align_to_8(compress(compression.value(), std::span(buffer.data(), buffer.size())).size()); + } + } + else { - total_size += utils::align_to_8(buffer.size()); + for (const auto& buffer : arrow_proxy.buffers()) + { + total_size += utils::align_to_8(buffer.size()); + } } + for (const auto& child : arrow_proxy.children()) { - total_size += calculate_body_size(child); + total_size += calculate_body_size(child, compression); } return total_size; } - int64_t calculate_body_size(const sparrow::record_batch& record_batch) + int64_t calculate_body_size(const sparrow::record_batch& record_batch, std::optional compression) { return std::accumulate( record_batch.columns().begin(), record_batch.columns().end(), int64_t{0}, - [](int64_t acc, const sparrow::array& arr) + [&](int64_t acc, const sparrow::array& arr) { const auto& arrow_proxy = sparrow::detail::array_access::get_arrow_proxy(arr); - return acc + calculate_body_size(arrow_proxy); + return acc + calculate_body_size(arrow_proxy, compression); } ); } @@ -78,18 +95,7 @@ namespace sparrow_ipc flatbuffers::FlatBufferBuilder record_batch_builder = get_record_batch_message_builder(record_batch, compression); const flatbuffers::uoffset_t record_batch_len = record_batch_builder.GetSize(); - std::size_t actual_body_size = 0; - if (compression.has_value()) - { - // If compressed, the body size is the sum of compressed buffer sizes + original size prefixes + padding - auto [compressed_body, compressed_buffers] = generate_compressed_body_and_buffers(record_batch, compression.value()); - actual_body_size = compressed_body.size(); - } - else - { - // If not compressed, the body size is the sum of uncompressed buffer sizes with padding - actual_body_size = static_cast(calculate_body_size(record_batch)); - } + const std::size_t actual_body_size = static_cast(calculate_body_size(record_batch, compression)); // Calculate total size: // - Continuation bytes (4) @@ -103,10 +109,9 @@ namespace sparrow_ipc return metadata_size + actual_body_size; } - std::pair, std::vector> - generate_compressed_body_and_buffers(const sparrow::record_batch& record_batch, const org::apache::arrow::flatbuf::CompressionType compression_type) + std::vector + generate_compressed_buffers(const sparrow::record_batch& record_batch, const org::apache::arrow::flatbuf::CompressionType compression_type) { - std::vector compressed_body; std::vector compressed_buffers; int64_t current_offset = 0; @@ -115,24 +120,13 @@ namespace sparrow_ipc const auto& arrow_proxy = sparrow::detail::array_access::get_arrow_proxy(column); for (const auto& buffer : arrow_proxy.buffers()) { - // Compress the buffer. The returned buffer already has the correct size header. std::vector compressed_buffer_with_header = compress(compression_type, std::span(buffer.data(), buffer.size())); - const size_t aligned_chunk_size = utils::align_to_8(compressed_buffer_with_header.size()); - const size_t padding_needed = aligned_chunk_size - compressed_buffer_with_header.size(); - - // Write compressed data with header - compressed_body.insert(compressed_body.end(), compressed_buffer_with_header.begin(), compressed_buffer_with_header.end()); - - // Add padding - compressed_body.insert(compressed_body.end(), padding_needed, 0); - - // Update compressed buffer metadata compressed_buffers.emplace_back(current_offset, aligned_chunk_size); current_offset += aligned_chunk_size; } } - return {compressed_body, compressed_buffers}; + return compressed_buffers; } std::vector get_column_dtypes(const sparrow::record_batch& rb) diff --git a/tests/test_serialize_utils.cpp b/tests/test_serialize_utils.cpp index 3aebbbb..194f958 100644 --- a/tests/test_serialize_utils.cpp +++ b/tests/test_serialize_utils.cpp @@ -40,20 +40,43 @@ namespace sparrow_ipc } } + // TODO after the used fcts are stable regarding compression, add tests for fcts having it as an additional argument + // cf. fill_body example TEST_CASE("fill_body") { - SUBCASE("Simple primitive array") + SUBCASE("Simple primitive array (uncompressed)") { auto array = sp::primitive_array({1, 2, 3, 4, 5}); auto proxy = sp::detail::array_access::get_arrow_proxy(array); std::vector body; sparrow_ipc::memory_output_stream stream(body); sparrow_ipc::any_output_stream astream(stream); - fill_body(proxy, astream); + fill_body(proxy, astream, std::nullopt); CHECK_GT(body.size(), 0); // Body size should be aligned CHECK_EQ(body.size() % 8, 0); } + + SUBCASE("Simple primitive array (compressible)") + { + std::vector data(1000, 12345); // Repeating values, should be very compressible + auto array = sp::primitive_array(data); + auto proxy = sp::detail::array_access::get_arrow_proxy(array); + + // Compressed + std::vector body_compressed; + sparrow_ipc::memory_output_stream stream_compressed(body_compressed); + sparrow_ipc::any_output_stream astream_compressed(stream_compressed); + fill_body(proxy, astream_compressed, org::apache::arrow::flatbuf::CompressionType::LZ4_FRAME); + + // Uncompressed + std::vector body_uncompressed; + sparrow_ipc::memory_output_stream stream_uncompressed(body_uncompressed); + sparrow_ipc::any_output_stream astream_uncompressed(stream_uncompressed); + fill_body(proxy, astream_uncompressed, std::nullopt); + // Check that compressed size is smaller than uncompressed size + CHECK_LT(body_compressed.size(), body_uncompressed.size()); + } } TEST_CASE("generate_body") From b5cc68173ca5202d45a79e4abbda7cd3822f7a0c Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Tue, 21 Oct 2025 10:22:47 +0200 Subject: [PATCH 19/19] Simplify get_record_batch_message_builder --- include/sparrow_ipc/flatbuffer_utils.hpp | 8 +------- src/flatbuffer_utils.cpp | 14 ++++++++------ src/serialize.cpp | 12 +----------- 3 files changed, 10 insertions(+), 24 deletions(-) diff --git a/include/sparrow_ipc/flatbuffer_utils.hpp b/include/sparrow_ipc/flatbuffer_utils.hpp index b26f0ed..78094bf 100644 --- a/include/sparrow_ipc/flatbuffer_utils.hpp +++ b/include/sparrow_ipc/flatbuffer_utils.hpp @@ -214,12 +214,6 @@ namespace sparrow_ipc * * @param record_batch The source record batch containing the data to be serialized * @param compression Optional: The compression algorithm to be used for the message body - * @param body_size Optional: An override for the total size of the message body - * If not provided, the size is calculated from the uncompressed buffers - * This is required when using compression - * @param compressed_buffers Optional: A pointer to a vector of buffer metadata. - * If provided, this metadata is used instead of generating it from the - * uncompressed record batch. This is required when using compression. * @return A FlatBufferBuilder containing the complete serialized message ready for * transmission or storage. The builder is finished and ready to be accessed * via GetBufferPointer() and GetSize(). @@ -228,5 +222,5 @@ namespace sparrow_ipc * @note Variadic buffer counts is not currently implemented (set to 0) */ [[nodiscard]] flatbuffers::FlatBufferBuilder - get_record_batch_message_builder(const sparrow::record_batch& record_batch, std::optional compression = std::nullopt, std::optional body_size = std::nullopt, const std::vector* compressed_buffers = nullptr); + get_record_batch_message_builder(const sparrow::record_batch& record_batch, std::optional compression = std::nullopt); } diff --git a/src/flatbuffer_utils.cpp b/src/flatbuffer_utils.cpp index 3b2a2d2..f3e418c 100644 --- a/src/flatbuffer_utils.cpp +++ b/src/flatbuffer_utils.cpp @@ -562,18 +562,20 @@ namespace sparrow_ipc return buffers; } - flatbuffers::FlatBufferBuilder get_record_batch_message_builder(const sparrow::record_batch& record_batch, std::optional compression, std::optional body_size_override, const std::vector* compressed_buffers) + flatbuffers::FlatBufferBuilder get_record_batch_message_builder(const sparrow::record_batch& record_batch, std::optional compression) { - const std::vector nodes = create_fieldnodes(record_batch); - const std::vector& buffers = compressed_buffers ? *compressed_buffers : get_buffers(record_batch); flatbuffers::FlatBufferBuilder record_batch_builder; - auto nodes_offset = record_batch_builder.CreateVectorOfStructs(nodes); - auto buffers_offset = record_batch_builder.CreateVectorOfStructs(buffers); flatbuffers::Offset compression_offset = 0; + std::optional> compressed_buffers; if (compression) { + compressed_buffers = generate_compressed_buffers(record_batch, compression.value()); compression_offset = org::apache::arrow::flatbuf::CreateBodyCompression(record_batch_builder, compression.value(), org::apache::arrow::flatbuf::BodyCompressionMethod::BUFFER); } + const auto& buffers = compressed_buffers ? *compressed_buffers : get_buffers(record_batch); + const std::vector nodes = create_fieldnodes(record_batch); + auto nodes_offset = record_batch_builder.CreateVectorOfStructs(nodes); + auto buffers_offset = record_batch_builder.CreateVectorOfStructs(buffers); const auto record_batch_offset = org::apache::arrow::flatbuf::CreateRecordBatch( record_batch_builder, static_cast(record_batch.nb_rows()), @@ -583,7 +585,7 @@ namespace sparrow_ipc 0 // TODO :variadic buffer Counts ); - const int64_t body_size = body_size_override.value_or(calculate_body_size(record_batch, compression)); + const int64_t body_size = calculate_body_size(record_batch, compression); const auto record_batch_message_offset = org::apache::arrow::flatbuf::CreateMessage( record_batch_builder, org::apache::arrow::flatbuf::MetadataVersion::V5, diff --git a/src/serialize.cpp b/src/serialize.cpp index 99ae36f..016068a 100644 --- a/src/serialize.cpp +++ b/src/serialize.cpp @@ -25,17 +25,7 @@ namespace sparrow_ipc void serialize_record_batch(const sparrow::record_batch& record_batch, any_output_stream& stream, std::optional compression) { - if (compression.has_value()) - { - // TODO Handle this inside get_record_batch_message_builder - auto compressed_buffers = generate_compressed_buffers(record_batch, compression.value()); - auto body_size_override = calculate_body_size(record_batch, compression); - common_serialize(get_record_batch_message_builder(record_batch, compression, body_size_override, &compressed_buffers), stream); - } - else - { - common_serialize(get_record_batch_message_builder(record_batch, compression, std::nullopt, nullptr), stream); - } + common_serialize(get_record_batch_message_builder(record_batch, compression), stream); generate_body(record_batch, stream, compression); } }