From df9279538c6017c37bfe1cabd7900a74b4a9cd23 Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Mon, 1 Sep 2025 09:48:56 +0200 Subject: [PATCH 01/22] Rework serialization wip wip wip WIP wip wip wip wip wip --- CMakeLists.txt | 87 +- cmake/external_dependencies.cmake | 9 + cmake/sparrow-ipcConfig.cmake.in | 27 + include/serialize.hpp | 25 - .../arrow_interface/arrow_array.hpp | 32 + .../arrow_array/private_data.hpp | 22 + .../arrow_array_schema_common_release.hpp | 60 + .../arrow_interface/arrow_schema.hpp | 70 + .../arrow_schema/private_data.hpp | 25 + include/{ => sparrow_ipc}/config/config.hpp | 0 .../config/sparrow_ipc_version.hpp | 12 + include/sparrow_ipc/deserialize.hpp | 26 + .../deserialize_fixedsizebinary_array.hpp | 41 + .../deserialize_primitive_array.hpp | 42 + include/sparrow_ipc/deserialize_utils.hpp | 32 + ...deserialize_variable_size_binary_array.hpp | 46 + include/sparrow_ipc/encapsulated_message.hpp | 43 + include/sparrow_ipc/magic_values.hpp | 28 + include/sparrow_ipc/metadata.hpp | 16 + include/sparrow_ipc/serialize.hpp | 25 + .../serialize_null_array.hpp | 0 .../serialize_primitive_array.hpp | 11 +- include/sparrow_ipc/utils.hpp | 21 + include/utils.hpp | 24 - src/arrow_interface/arrow_array.cpp | 71 + .../arrow_array/private_data.cpp | 9 + src/arrow_interface/arrow_schema.cpp | 20 + .../arrow_schema/private_data.cpp | 30 + src/deserialize.cpp | 362 ++ src/deserialize_utils.cpp | 17 + src/encapsulated_message.cpp | 105 + src/magic_values.cpp | 19 + src/metadata.cpp | 21 + src/serialize.cpp | 100 +- src/serialize_null_array.cpp | 8 +- src/utils.cpp | 177 +- tests/CMakeLists.txt | 12 +- tests/metadata_sample.hpp | 75 + .../resources/generated_primitive.arrow_file | Bin 0 -> 22298 bytes tests/resources/generated_primitive.json | 3170 +++++++++++++++++ tests/resources/generated_primitive.stream | Bin 0 -> 20280 bytes ...nerated_primitive_large_offsets.arrow_file | Bin 0 -> 3578 bytes .../generated_primitive_large_offsets.json | 582 +++ 
.../generated_primitive_large_offsets.stream | Bin 0 -> 3160 bytes .../generated_primitive_no_batches.arrow_file | Bin 0 -> 3914 bytes .../generated_primitive_no_batches.json | 287 ++ .../generated_primitive_no_batches.stream | Bin 0 -> 1944 bytes .../generated_primitive_zerolength.arrow_file | Bin 0 -> 8858 bytes .../generated_primitive_zerolength.json | 879 +++++ .../generated_primitive_zerolength.stream | Bin 0 -> 6816 bytes tests/test_arrow_array.cpp | 0 tests/test_arrow_schema.cpp | 276 ++ tests/test_null_array_serialization.cpp | 7 +- tests/test_primitive_array_serialization.cpp | 27 +- tests/test_primitive_array_with_files.cpp | 103 + tests/test_utils.cpp | 322 +- 56 files changed, 7155 insertions(+), 248 deletions(-) create mode 100644 cmake/sparrow-ipcConfig.cmake.in delete mode 100644 include/serialize.hpp create mode 100644 include/sparrow_ipc/arrow_interface/arrow_array.hpp create mode 100644 include/sparrow_ipc/arrow_interface/arrow_array/private_data.hpp create mode 100644 include/sparrow_ipc/arrow_interface/arrow_array_schema_common_release.hpp create mode 100644 include/sparrow_ipc/arrow_interface/arrow_schema.hpp create mode 100644 include/sparrow_ipc/arrow_interface/arrow_schema/private_data.hpp rename include/{ => sparrow_ipc}/config/config.hpp (100%) create mode 100644 include/sparrow_ipc/config/sparrow_ipc_version.hpp create mode 100644 include/sparrow_ipc/deserialize.hpp create mode 100644 include/sparrow_ipc/deserialize_fixedsizebinary_array.hpp create mode 100644 include/sparrow_ipc/deserialize_primitive_array.hpp create mode 100644 include/sparrow_ipc/deserialize_utils.hpp create mode 100644 include/sparrow_ipc/deserialize_variable_size_binary_array.hpp create mode 100644 include/sparrow_ipc/encapsulated_message.hpp create mode 100644 include/sparrow_ipc/magic_values.hpp create mode 100644 include/sparrow_ipc/metadata.hpp create mode 100644 include/sparrow_ipc/serialize.hpp rename include/{ => sparrow_ipc}/serialize_null_array.hpp (100%) 
rename include/{ => sparrow_ipc}/serialize_primitive_array.hpp (93%) create mode 100644 include/sparrow_ipc/utils.hpp delete mode 100644 include/utils.hpp create mode 100644 src/arrow_interface/arrow_array.cpp create mode 100644 src/arrow_interface/arrow_array/private_data.cpp create mode 100644 src/arrow_interface/arrow_schema.cpp create mode 100644 src/arrow_interface/arrow_schema/private_data.cpp create mode 100644 src/deserialize.cpp create mode 100644 src/deserialize_utils.cpp create mode 100644 src/encapsulated_message.cpp create mode 100644 src/magic_values.cpp create mode 100644 src/metadata.cpp create mode 100644 tests/metadata_sample.hpp create mode 100644 tests/resources/generated_primitive.arrow_file create mode 100644 tests/resources/generated_primitive.json create mode 100644 tests/resources/generated_primitive.stream create mode 100644 tests/resources/generated_primitive_large_offsets.arrow_file create mode 100644 tests/resources/generated_primitive_large_offsets.json create mode 100644 tests/resources/generated_primitive_large_offsets.stream create mode 100644 tests/resources/generated_primitive_no_batches.arrow_file create mode 100644 tests/resources/generated_primitive_no_batches.json create mode 100644 tests/resources/generated_primitive_no_batches.stream create mode 100644 tests/resources/generated_primitive_zerolength.arrow_file create mode 100644 tests/resources/generated_primitive_zerolength.json create mode 100644 tests/resources/generated_primitive_zerolength.stream create mode 100644 tests/test_arrow_array.cpp create mode 100644 tests/test_arrow_schema.cpp create mode 100644 tests/test_primitive_array_with_files.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index a30fa81..6fd7294 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.28) -project(sparrow-ipc CXX) +project(sparrow-ipc LANGUAGES CXX) set(CMAKE_CXX_STANDARD 20 CACHE STRING "C++ Standard") set(CMAKE_CXX_STANDARD_REQUIRED ON CACHE 
BOOL "C++ Standard Required") @@ -15,6 +15,9 @@ include(external_dependencies) set(SPARROW_IPC_COMPILE_DEFINITIONS "" CACHE STRING "List of public compile definitions of the sparrow-ipc target") +set(SPARROW_IPC_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include) +set(SPARROW_IPC_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src) + # Linter options # ============= OPTION(ACTIVATE_LINTER "Create targets to run clang-format" OFF) @@ -26,6 +29,38 @@ if(ACTIVATE_LINTER) include(clang-tidy) endif() +# Versioning +# =========== +file(STRINGS "${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/config/sparrow_ipc_version.hpp" sparrow_ipc_version_defines + REGEX "constexpr int SPARROW_IPC_VERSION_(MAJOR|MINOR|PATCH)") + +foreach(ver ${sparrow_ipc_version_defines}) + if(ver MATCHES "constexpr int SPARROW_IPC_VERSION_(MAJOR|MINOR|PATCH) = ([0-9]+);$") + set(PROJECT_VERSION_${CMAKE_MATCH_1} "${CMAKE_MATCH_2}" CACHE INTERNAL "") + endif() +endforeach() + +set(CMAKE_PROJECT_VERSION + ${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH}) + +message(STATUS "Building sparrow_ipc v${CMAKE_PROJECT_VERSION}") + +# Binary version +# See the following URL for explanations about the binary versioning +# https://www.gnu.org/software/libtool/manual/html_node/Updating-version-info.html#Updating-version-info +file(STRINGS "${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/config/sparrow_ipc_version.hpp" sparrow_ipc_version_defines + REGEX "constexpr int SPARROW_IPC_BINARY_(CURRENT|REVISION|AGE)") + +foreach(ver ${sparrow_ipc_version_defines}) + if(ver MATCHES "constexpr int SPARROW_IPC_BINARY_(CURRENT|REVISION|AGE) = ([0-9]+);$") + set(SPARROW_IPC_BINARY_${CMAKE_MATCH_1} "${CMAKE_MATCH_2}" CACHE INTERNAL "") + endif() +endforeach() + +set(SPARROW_IPC_BINARY_VERSION + ${SPARROW_IPC_BINARY_CURRENT}.${SPARROW_IPC_BINARY_REVISION}.${SPARROW_IPC_BINARY_AGE}) + +message(STATUS "sparrow_ipc binary version: v${SPARROW_IPC_BINARY_VERSION}") # Build options # ============= @@ -51,17 +86,39 @@
set(SPARROW_IPC_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include) set(SPARROW_IPC_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src) set(SPARROW_IPC_HEADERS - ${SPARROW_IPC_INCLUDE_DIR}/config/config.hpp - ${SPARROW_IPC_INCLUDE_DIR}/serialize.hpp - ${SPARROW_IPC_INCLUDE_DIR}/serialize_primitive_array.hpp - ${SPARROW_IPC_INCLUDE_DIR}/serialize_null_array.hpp - ${SPARROW_IPC_INCLUDE_DIR}/utils.hpp + ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/arrow_interface/arrow_array_schema_common_release.hpp + ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/arrow_interface/arrow_array.hpp + ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/arrow_interface/arrow_array/private_data.hpp + ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/arrow_interface/arrow_schema.hpp + ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/arrow_interface/arrow_schema/private_data.hpp + ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/config/config.hpp + ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/deserialize_variable_size_binary_array.hpp + ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/deserialize_fixedsizebinary_array.hpp + ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/deserialize_primitive_array.hpp + ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/deserialize_utils.hpp + ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/deserialize.hpp + ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/encapsulated_message.hpp + ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/magic_values.hpp + ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/metadata.hpp + ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/serialize_null_array.hpp + ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/serialize_primitive_array.hpp + ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/serialize.hpp + ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/utils.hpp ) set(SPARROW_IPC_SRC - ${SPARROW_IPC_SOURCE_DIR}/serialize.cpp + ${SPARROW_IPC_SOURCE_DIR}/arrow_interface/arrow_array.cpp + ${SPARROW_IPC_SOURCE_DIR}/arrow_interface/arrow_array/private_data.cpp + ${SPARROW_IPC_SOURCE_DIR}/arrow_interface/arrow_schema.cpp + ${SPARROW_IPC_SOURCE_DIR}/arrow_interface/arrow_schema/private_data.cpp + 
${SPARROW_IPC_SOURCE_DIR}/deserialize.cpp + ${SPARROW_IPC_SOURCE_DIR}/encapsulated_message.cpp ${SPARROW_IPC_SOURCE_DIR}/serialize_null_array.cpp + ${SPARROW_IPC_SOURCE_DIR}/serialize.cpp ${SPARROW_IPC_SOURCE_DIR}/utils.cpp + ${SPARROW_IPC_SOURCE_DIR}/magic_values.cpp + ${SPARROW_IPC_SOURCE_DIR}/metadata.cpp + ${SPARROW_IPC_SOURCE_DIR}/deserialize_utils.cpp ) # Fetch schemas from apache arrow @@ -117,11 +174,14 @@ add_custom_command( add_custom_target(generate_flatbuffers_headers DEPENDS ${FLATBUFFERS_GENERATED_HEADERS} + COMMENT "Ensuring FlatBuffers headers are generated" ) # Interface target for generated headers add_library(flatbuffers_interface INTERFACE) -target_include_directories(flatbuffers_interface INTERFACE ${FLATBUFFERS_GENERATED_DIR}) +target_include_directories(flatbuffers_interface INTERFACE + $ + $) add_dependencies(flatbuffers_interface generate_flatbuffers_headers) add_library(sparrow-ipc ${SPARROW_IPC_LIBRARY_TYPE} ${SPARROW_IPC_SRC} ${SPARROW_IPC_HEADERS}) @@ -141,19 +201,20 @@ else() target_compile_definitions(sparrow-ipc PRIVATE SPARROW_IPC_EXPORTS) endif() -target_include_directories(sparrow-ipc +target_include_directories(sparrow-ipc PUBLIC - ${SPARROW_IPC_INCLUDE_DIR} + $ + $ PRIVATE - ${SPARROW_IPC_SOURCE_DIR} ) + $) target_link_libraries(sparrow-ipc PUBLIC sparrow::sparrow flatbuffers::flatbuffers - PRIVATE - flatbuffers_interface) + ) +# Ensure generated headers are available when building sparrow-ipc add_dependencies(sparrow-ipc generate_flatbuffers_headers) # Tests diff --git a/cmake/external_dependencies.cmake b/cmake/external_dependencies.cmake index ad93282..6139f43 100644 --- a/cmake/external_dependencies.cmake +++ b/cmake/external_dependencies.cmake @@ -46,15 +46,24 @@ function(find_package_or_fetch) endfunction() set(SPARROW_BUILD_SHARED ${SPARROW_IPC_BUILD_SHARED}) +if(${SPARROW_IPC_BUILD_TESTS}) + set(CREATE_JSON_READER_TARGET ON) +endif() find_package_or_fetch( PACKAGE_NAME sparrow GIT_REPOSITORY 
https://github.com/man-group/sparrow.git TAG 1.1.0 ) +unset(CREATE_JSON_READER_TARGET) if(NOT TARGET sparrow::sparrow) add_library(sparrow::sparrow ALIAS sparrow) endif() +if(${SPARROW_IPC_BUILD_TESTS}) + if(NOT TARGET sparrow::json_reader) + add_library(sparrow::json_reader ALIAS json_reader) + endif() +endif() set(FLATBUFFERS_BUILD_TESTS OFF) set(FLATBUFFERS_BUILD_SHAREDLIB ${SPARROW_IPC_BUILD_SHARED}) diff --git a/cmake/sparrow-ipcConfig.cmake.in b/cmake/sparrow-ipcConfig.cmake.in new file mode 100644 index 0000000..3bd4e21 --- /dev/null +++ b/cmake/sparrow-ipcConfig.cmake.in @@ -0,0 +1,27 @@ +# sparrow-ipc cmake module +# This module sets the following variables in your project:: +# +# sparrow-ipc_FOUND - true if sparrow-ipc found on the system +# sparrow-ipc_INCLUDE_DIRS - the directory containing sparrow-ipc headers +# sparrow-ipc_LIBRARY - empty + +@PACKAGE_INIT@ + +include(CMakeFindDependencyMacro) + +if("@USE_DATE_POLYFILL@") + find_dependency(date) +endif() + +if("@CREATE_JSON_READER_TARGET@") + find_dependency(nlohmann_json) +endif() + +find_dependency(sparrow) +find_dependency(FlatBuffers) + +if(NOT TARGET sparrow-ipc::sparrow-ipc) + include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@Targets.cmake") + get_target_property(@PROJECT_NAME@_INCLUDE_DIRS sparrow-ipc::sparrow-ipc INTERFACE_INCLUDE_DIRECTORIES) + get_target_property(@PROJECT_NAME@_LIBRARY sparrow-ipc::sparrow-ipc LOCATION) +endif() diff --git a/include/serialize.hpp b/include/serialize.hpp deleted file mode 100644 index 2dbf148..0000000 --- a/include/serialize.hpp +++ /dev/null @@ -1,25 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -#include "sparrow.hpp" - -#include "Message_generated.h" -#include "Schema_generated.h" - -#include "config/config.hpp" - -namespace sparrow_ipc -{ - namespace details - { - SPARROW_IPC_API std::vector serialize_schema_message(const ArrowSchema& arrow_schema); - SPARROW_IPC_API void serialize_record_batch_message(const ArrowArray& arrow_arr, const 
std::vector& buffers_sizes, std::vector& final_buffer); - - SPARROW_IPC_API void deserialize_schema_message(const uint8_t* buf_ptr, size_t& current_offset, std::optional& name, std::optional>& metadata); - SPARROW_IPC_API const org::apache::arrow::flatbuf::RecordBatch* deserialize_record_batch_message(const uint8_t* buf_ptr, size_t& current_offset); - } -} diff --git a/include/sparrow_ipc/arrow_interface/arrow_array.hpp b/include/sparrow_ipc/arrow_interface/arrow_array.hpp new file mode 100644 index 0000000..21044f6 --- /dev/null +++ b/include/sparrow_ipc/arrow_interface/arrow_array.hpp @@ -0,0 +1,32 @@ + +#pragma once + +#include + +#include + +namespace sparrow_ipc +{ + [[nodiscard]] ArrowArray make_arrow_array( + int64_t length, + int64_t null_count, + int64_t offset, + std::vector&& buffers, + size_t children_count, + ArrowArray** children, + ArrowArray* dictionary + ); + + void release_arrow_array(ArrowArray* array); + + void fill_arrow_array( + ArrowArray& array, + int64_t length, + int64_t null_count, + int64_t offset, + std::vector&& buffers, + size_t children_count, + ArrowArray** children, + ArrowArray* dictionary + ); +} \ No newline at end of file diff --git a/include/sparrow_ipc/arrow_interface/arrow_array/private_data.hpp b/include/sparrow_ipc/arrow_interface/arrow_array/private_data.hpp new file mode 100644 index 0000000..4472f13 --- /dev/null +++ b/include/sparrow_ipc/arrow_interface/arrow_array/private_data.hpp @@ -0,0 +1,22 @@ +#pragma once + +#include + +namespace sparrow_ipc +{ + class arrow_array_private_data + { + public: + + explicit constexpr arrow_array_private_data(std::vector&& buffers_pointers) + : m_buffers_pointers(std::move(buffers_pointers)) + { + } + + [[nodiscard]] const void** buffers_ptrs() noexcept; + + private: + + std::vector m_buffers_pointers; + }; +} diff --git a/include/sparrow_ipc/arrow_interface/arrow_array_schema_common_release.hpp b/include/sparrow_ipc/arrow_interface/arrow_array_schema_common_release.hpp new file mode 
100644 index 0000000..96ec8e7 --- /dev/null +++ b/include/sparrow_ipc/arrow_interface/arrow_array_schema_common_release.hpp @@ -0,0 +1,60 @@ + +#pragma once + +#include + +#include "arrow_array/private_data.hpp" +#include "arrow_schema/private_data.hpp" + +namespace sparrow_ipc +{ + /** + * Release the children and dictionary of an `ArrowArray` or `ArrowSchema`. + * + * @tparam T `ArrowArray` or `ArrowSchema` + * @param t The `ArrowArray` or `ArrowSchema` to release. + */ + template + requires std::same_as || std::same_as + void release_common_arrow(T& t) + { + using private_data_type = std:: + conditional_t, arrow_array_private_data, arrow_schema_private_data>; + if (t.release == nullptr) + { + return; + } + SPARROW_ASSERT_TRUE(t.private_data != nullptr); + const auto private_data = static_cast(t.private_data); + + if (t.dictionary) + { + if (t.dictionary->release) + { + t.dictionary->release(t.dictionary); + } + delete t.dictionary; + t.dictionary = nullptr; + } + + if (t.children) + { + for (int64_t i = 0; i < t.n_children; ++i) + { + T* child = t.children[i]; + if (child) + { + if (child->release) + { + child->release(child); + } + delete child; + child = nullptr; + } + } + delete[] t.children; + t.children = nullptr; + } + t.release = nullptr; + } +} diff --git a/include/sparrow_ipc/arrow_interface/arrow_schema.hpp b/include/sparrow_ipc/arrow_interface/arrow_schema.hpp new file mode 100644 index 0000000..f41fc26 --- /dev/null +++ b/include/sparrow_ipc/arrow_interface/arrow_schema.hpp @@ -0,0 +1,70 @@ +#pragma once + +#include +#include + +#include +#include +#include + +#include "arrow_schema/private_data.hpp" + +namespace sparrow_ipc +{ + void release_arrow_schema(ArrowSchema* schema); + + template > + void fill_arrow_schema( + ArrowSchema& schema, + std::string_view format, + const char* name, + std::optional metadata, + std::optional> flags, + size_t children_count, + ArrowSchema** children, + ArrowSchema* dictionary + ) + { + schema.flags = 0; + if
(flags.has_value()) + { + for (const auto& flag : *flags) + { + schema.flags |= static_cast(flag); + } + } + schema.n_children = static_cast(children_count); + + std::optional metadata_str = metadata.has_value() + ? std::make_optional( + sparrow::get_metadata_from_key_values(*metadata) + ) + : std::nullopt; + + schema.private_data = new arrow_schema_private_data(format, name, std::move(metadata_str)); + + const auto private_data = static_cast(schema.private_data); + schema.format = private_data->format_ptr(); + schema.name = private_data->name_ptr(); + schema.metadata = private_data->metadata_ptr(); + schema.children = children; + schema.dictionary = dictionary; + schema.release = release_arrow_schema; + } + + template > + [[nodiscard]] ArrowSchema make_arrow_schema( + std::string_view format, + const char* name, + std::optional metadata, + std::optional> flags, + size_t children_count, + ArrowSchema** children, + ArrowSchema* dictionary + ) + { + ArrowSchema schema{}; + fill_arrow_schema(schema, format, name, metadata, flags, children_count, children, dictionary); + return schema; + } +} \ No newline at end of file diff --git a/include/sparrow_ipc/arrow_interface/arrow_schema/private_data.hpp b/include/sparrow_ipc/arrow_interface/arrow_schema/private_data.hpp new file mode 100644 index 0000000..edd0412 --- /dev/null +++ b/include/sparrow_ipc/arrow_interface/arrow_schema/private_data.hpp @@ -0,0 +1,25 @@ + +#pragma once + +#include +#include + +namespace sparrow_ipc +{ + class arrow_schema_private_data + { + public: + + arrow_schema_private_data(std::string_view format, const char* name, std::optional metadata); + + [[nodiscard]] const char* format_ptr() const noexcept; + [[nodiscard]] const char* name_ptr() const noexcept; + [[nodiscard]] const char* metadata_ptr() const noexcept; + + private: + + std::string m_format; + const char* m_name; + std::optional m_metadata; + }; +} diff --git a/include/config/config.hpp b/include/sparrow_ipc/config/config.hpp similarity 
index 100% rename from include/config/config.hpp rename to include/sparrow_ipc/config/config.hpp diff --git a/include/sparrow_ipc/config/sparrow_ipc_version.hpp b/include/sparrow_ipc/config/sparrow_ipc_version.hpp new file mode 100644 index 0000000..0c2b9bb --- /dev/null +++ b/include/sparrow_ipc/config/sparrow_ipc_version.hpp @@ -0,0 +1,12 @@ +#pragma once + +namespace sparrow_ipc +{ + constexpr int SPARROW_IPC_VERSION_MAJOR = 0; + constexpr int SPARROW_IPC_VERSION_MINOR = 1; + constexpr int SPARROW_IPC_VERSION_PATCH = 0; + + constexpr int SPARROW_IPC_BINARY_CURRENT = 9; + constexpr int SPARROW_IPC_BINARY_REVISION = 0; + constexpr int SPARROW_IPC_BINARY_AGE = 0; +} diff --git a/include/sparrow_ipc/deserialize.hpp b/include/sparrow_ipc/deserialize.hpp new file mode 100644 index 0000000..3784bb4 --- /dev/null +++ b/include/sparrow_ipc/deserialize.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include +#include + +#include + +#include "config/config.hpp" +#include "Message_generated.h" +#include "sparrow_ipc/encapsulated_message.hpp" +#include "SparseTensor_generated.h" + + +namespace sparrow_ipc +{ + SPARROW_IPC_API void deserialize_schema_message( + const uint8_t* buf_ptr, + size_t& current_offset, + std::optional& name, + std::optional>& metadata + ); + SPARROW_IPC_API [[nodiscard]] const org::apache::arrow::flatbuf::RecordBatch* + deserialize_record_batch_message(const uint8_t* buf_ptr, size_t& current_offset); + + SPARROW_IPC_API [[nodiscard]] std::vector deserialize_stream(const uint8_t* buf_ptr); +} \ No newline at end of file diff --git a/include/sparrow_ipc/deserialize_fixedsizebinary_array.hpp b/include/sparrow_ipc/deserialize_fixedsizebinary_array.hpp new file mode 100644 index 0000000..81c6629 --- /dev/null +++ b/include/sparrow_ipc/deserialize_fixedsizebinary_array.hpp @@ -0,0 +1,41 @@ +#pragma once + +#include +#include + +#include "Message_generated.h" +#include "sparrow_ipc/arrow_interface/arrow_array.hpp" +#include 
"sparrow_ipc/arrow_interface/arrow_schema.hpp" + + +namespace sparrow_ipc +{ + + [[nodiscard]] sparrow::fixed_width_binary_array deserialize_fixedwidthbinary( + const org::apache::arrow::flatbuf::RecordBatch& record_batch, + std::span body, + std::string_view name, + const std::optional>& metadata, + size_t& buffer_index, + int32_t byte_width + ) + { + const std::string format = "w:" + std::to_string(byte_width); + ArrowSchema schema = make_arrow_schema(format, name.data(), metadata, std::nullopt, 0, nullptr, nullptr); + + const auto buffer_metadata = record_batch.buffers()->Get(buffer_index++); + auto buffer_ptr = const_cast(body.data() + buffer_metadata->offset()); + const size_t buffer_size = buffer_metadata->length(); + + const auto bitmap_buffer_metadata = record_batch.buffers()->Get(buffer_index++); + auto bitmap_ptr = const_cast(body.data() + bitmap_buffer_metadata->offset()); + + const sparrow::dynamic_bitset_view bitmap_view{bitmap_ptr, static_cast(record_batch.length())}; + std::vector buffers = {buffer_ptr, bitmap_ptr}; + + ArrowArray array = make_arrow_array(record_batch.length(), bitmap_view.null_count(), 0, std::move(buffers), 0, nullptr, nullptr); + + sparrow::arrow_proxy ap{std::move(array), std::move(schema)}; + return sparrow::fixed_width_binary_array{std::move(ap)}; + } +} \ No newline at end of file diff --git a/include/sparrow_ipc/deserialize_primitive_array.hpp b/include/sparrow_ipc/deserialize_primitive_array.hpp new file mode 100644 index 0000000..5bf7624 --- /dev/null +++ b/include/sparrow_ipc/deserialize_primitive_array.hpp @@ -0,0 +1,42 @@ +#pragma once + +#include +#include + +#include +#include + +#include "Message_generated.h" +#include "sparrow_ipc/arrow_interface/arrow_array.hpp" +#include "sparrow_ipc/arrow_interface/arrow_schema.hpp" + +namespace sparrow_ipc +{ + template + [[nodiscard]] sparrow::primitive_array deserialize_primitive_array_bis( + const org::apache::arrow::flatbuf::RecordBatch& record_batch, + std::span body, + 
std::string_view name, + const std::optional>& metadata, + size_t& buffer_index + ) + { + const std::string_view format = data_type_to_format( + sparrow::detail::get_data_type_from_array>::get() + ); + ArrowSchema schema = make_arrow_schema(format, name.data(), metadata, std::nullopt, 0, nullptr, nullptr); + + const auto bitmap_buffer_metadata = record_batch.buffers()->Get(buffer_index++); + auto bitmap_ptr = const_cast(body.data() + bitmap_buffer_metadata->offset()); + + const auto primitive_buffer_metadata = record_batch.buffers()->Get(buffer_index++); + auto primitives_ptr = const_cast(body.data() + primitive_buffer_metadata->offset()); + + const sparrow::dynamic_bitset_view bitmap_view{bitmap_ptr, static_cast(record_batch.length())}; + std::vector buffers = {bitmap_ptr, primitives_ptr}; + ArrowArray array = make_arrow_array(record_batch.length(), bitmap_view.null_count(), 0, std::move(buffers), 0, nullptr, nullptr); + + sparrow::arrow_proxy ap{std::move(array), std::move(schema)}; + return sparrow::primitive_array{std::move(ap)}; + } +} \ No newline at end of file diff --git a/include/sparrow_ipc/deserialize_utils.hpp b/include/sparrow_ipc/deserialize_utils.hpp new file mode 100644 index 0000000..b43f5d7 --- /dev/null +++ b/include/sparrow_ipc/deserialize_utils.hpp @@ -0,0 +1,32 @@ +#pragma once + +#include + +#include +#include + +#include "Message_generated.h" +#include "Schema_generated.h" + +namespace sparrow_ipc +{ + template + [[nodiscard]] sparrow::u8_buffer message_buffer_to_u8buffer( + const org::apache::arrow::flatbuf::RecordBatch* record_batch, + std::span body, + size_t index + ) + { + const auto buffer_metadata = record_batch->buffers()->Get(index); + auto ptr = const_cast(body.data() + buffer_metadata->offset()); + auto casted_ptr = reinterpret_cast(ptr); + const std::size_t count = static_cast(buffer_metadata->length() / sizeof(T)); + return sparrow::u8_buffer{casted_ptr, count}; + } + + [[nodiscard]] const sparrow::dynamic_bitset_view 
message_buffer_to_validity_bitmap( + const org::apache::arrow::flatbuf::RecordBatch* record_batch, + std::span body, + size_t index + ); +} \ No newline at end of file diff --git a/include/sparrow_ipc/deserialize_variable_size_binary_array.hpp b/include/sparrow_ipc/deserialize_variable_size_binary_array.hpp new file mode 100644 index 0000000..2309861 --- /dev/null +++ b/include/sparrow_ipc/deserialize_variable_size_binary_array.hpp @@ -0,0 +1,46 @@ +#pragma once + +#include + +#include +#include + +#include "Message_generated.h" +#include "sparrow_ipc/arrow_interface/arrow_array.hpp" +#include "sparrow_ipc/arrow_interface/arrow_schema.hpp" + +namespace sparrow_ipc +{ + template + [[nodiscard]] T deserialize_variable_size_binary( + const org::apache::arrow::flatbuf::RecordBatch& record_batch, + std::span body, + std::string_view name, + const std::optional>& metadata, + size_t& buffer_index + ) + { + const std::string_view format = data_type_to_format( + sparrow::detail::get_data_type_from_array::get() + ); + ArrowSchema schema = make_arrow_schema(format, name.data(), metadata, std::nullopt, 0, nullptr, nullptr); + + const auto bitmap_buffer_metadata = record_batch.buffers()->Get(buffer_index++); + auto bitmap_ptr = const_cast(body.data() + bitmap_buffer_metadata->offset()); + + const auto offset_metadata = record_batch.buffers()->Get(buffer_index++); + auto offset_ptr = const_cast(body.data() + offset_metadata->offset()); + const size_t offset_size = offset_metadata->length(); + + const auto buffer_metadata = record_batch.buffers()->Get(buffer_index++); + auto buffer_ptr = const_cast(body.data() + buffer_metadata->offset()); + const size_t buffer_size = buffer_metadata->length(); + + const sparrow::dynamic_bitset_view bitmap_view{bitmap_ptr, static_cast(record_batch.length())}; + std::vector buffers = {bitmap_ptr, offset_ptr, buffer_ptr}; + ArrowArray array = make_arrow_array(record_batch.length(), bitmap_view.null_count(), 0, std::move(buffers), 0, nullptr, 
nullptr); + + sparrow::arrow_proxy ap{std::move(array), std::move(schema)}; + return T{std::move(ap)}; + } +} \ No newline at end of file diff --git a/include/sparrow_ipc/encapsulated_message.hpp b/include/sparrow_ipc/encapsulated_message.hpp new file mode 100644 index 0000000..ac9aab2 --- /dev/null +++ b/include/sparrow_ipc/encapsulated_message.hpp @@ -0,0 +1,43 @@ +#include +#include + +#include "Message_generated.h" + +namespace sparrow_ipc +{ + class EncapsulatedMessage + { + public: + + EncapsulatedMessage(const uint8_t* buf_ptr); + + [[nodiscard]] const org::apache::arrow::flatbuf::Message* flat_buffer_message() const; + + [[nodiscard]] size_t metadata_length() const; + + [[nodiscard]] std::variant< + const org::apache::arrow::flatbuf::Schema*, + const org::apache::arrow::flatbuf::RecordBatch*, + const org::apache::arrow::flatbuf::Tensor*, + const org::apache::arrow::flatbuf::DictionaryBatch*, + const org::apache::arrow::flatbuf::SparseTensor*> + metadata() const; + + [[nodiscard]] const ::flatbuffers::Vector<::flatbuffers::Offset>* + custom_metadata() const; + + [[nodiscard]] size_t body_length() const; + + [[nodiscard]] std::span body() const; + + [[nodiscard]] size_t total_length() const; + + [[nodiscard]] std::span as_span() const; + + private: + + const uint8_t* m_buf_ptr; + }; + + [[nodiscard]] EncapsulatedMessage create_encapsulated_message(const uint8_t* buf_ptr); +} \ No newline at end of file diff --git a/include/sparrow_ipc/magic_values.hpp b/include/sparrow_ipc/magic_values.hpp new file mode 100644 index 0000000..94b4cc2 --- /dev/null +++ b/include/sparrow_ipc/magic_values.hpp @@ -0,0 +1,28 @@ +#pragma once + +#include +#include +#include + +namespace sparrow_ipc +{ + constexpr std::array continuation = {0xFF, 0xFF, 0xFF, 0xFF}; + + constexpr std::array end_of_stream = {0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00}; + + template + [[nodiscard]] bool is_continuation(const R& buf) + { + return std::ranges::equal(buf, continuation); + } + + 
[[nodiscard]] bool is_continuation(std::istream& stream); + + template + [[nodiscard]] bool is_end_of_stream(const R& buf) + { + return std::ranges::equal(buf, end_of_stream); + } + + [[nodiscard]] bool is_end_of_stream(std::istream& stream); +} \ No newline at end of file diff --git a/include/sparrow_ipc/metadata.hpp b/include/sparrow_ipc/metadata.hpp new file mode 100644 index 0000000..bab7290 --- /dev/null +++ b/include/sparrow_ipc/metadata.hpp @@ -0,0 +1,16 @@ +#pragma once + +#include + +#include + +#include + +#include "Schema_generated.h" + +namespace sparrow_ipc +{ + std::vector to_sparrow_metadata( + const ::flatbuffers::Vector<::flatbuffers::Offset>& metadata + ); +} \ No newline at end of file diff --git a/include/sparrow_ipc/serialize.hpp b/include/sparrow_ipc/serialize.hpp new file mode 100644 index 0000000..a6896f8 --- /dev/null +++ b/include/sparrow_ipc/serialize.hpp @@ -0,0 +1,25 @@ +#pragma once + + +#include +#include + +#include +#include +#include + +#include "config/config.hpp" + + +namespace sparrow_ipc +{ + namespace details + { + SPARROW_IPC_API std::vector serialize_schema_message(const ArrowSchema& arrow_schema); + SPARROW_IPC_API void serialize_record_batch_message( + const ArrowArray& arrow_arr, + const std::vector& buffers_sizes, + std::vector& final_buffer + ); + } +} diff --git a/include/serialize_null_array.hpp b/include/sparrow_ipc/serialize_null_array.hpp similarity index 100% rename from include/serialize_null_array.hpp rename to include/sparrow_ipc/serialize_null_array.hpp diff --git a/include/serialize_primitive_array.hpp b/include/sparrow_ipc/serialize_primitive_array.hpp similarity index 93% rename from include/serialize_primitive_array.hpp rename to include/sparrow_ipc/serialize_primitive_array.hpp index e3fa799..b06c64b 100644 --- a/include/serialize_primitive_array.hpp +++ b/include/sparrow_ipc/serialize_primitive_array.hpp @@ -2,9 +2,13 @@ #include +#include + +#include "deserialize.hpp" #include "serialize.hpp" #include 
"utils.hpp" + namespace sparrow_ipc { // TODO Use `arr` as const after fixing the issue upstream in sparrow::get_arrow_structures @@ -50,18 +54,19 @@ namespace sparrow_ipc } template - sparrow::primitive_array deserialize_primitive_array(const std::vector& buffer) { + sparrow::primitive_array deserialize_primitive_array(const std::vector& buffer) + { const uint8_t* buf_ptr = buffer.data(); size_t current_offset = 0; // I - Deserialize the Schema message std::optional name; std::optional> metadata; - details::deserialize_schema_message(buf_ptr, current_offset, name, metadata); + deserialize_schema_message(buf_ptr, current_offset, name, metadata); // II - Deserialize the RecordBatch message const uint32_t batch_meta_len = *(reinterpret_cast(buf_ptr + current_offset)); - const auto* record_batch = details::deserialize_record_batch_message(buf_ptr, current_offset); + const auto* record_batch = deserialize_record_batch_message(buf_ptr, current_offset); current_offset += utils::align_to_8(batch_meta_len); const uint8_t* body_ptr = buf_ptr + current_offset; diff --git a/include/sparrow_ipc/utils.hpp b/include/sparrow_ipc/utils.hpp new file mode 100644 index 0000000..c83c60c --- /dev/null +++ b/include/sparrow_ipc/utils.hpp @@ -0,0 +1,21 @@ +#pragma once + +#include +#include +#include +#include + +#include "config/config.hpp" +#include "Schema_generated.h" + +namespace sparrow_ipc::utils +{ + // Aligns a value to the next multiple of 8, as required by the Arrow IPC format for message bodies + SPARROW_IPC_API int64_t align_to_8(const int64_t n); + + // Creates a Flatbuffers type from a format string + // This function maps a sparrow data type to the corresponding Flatbuffers type + SPARROW_IPC_API std::pair> + get_flatbuffer_type(flatbuffers::FlatBufferBuilder& builder, std::string_view format_str); + +} diff --git a/include/utils.hpp b/include/utils.hpp deleted file mode 100644 index 60eae81..0000000 --- a/include/utils.hpp +++ /dev/null @@ -1,24 +0,0 @@ -#pragma once - 
-#include -#include -#include -#include - -#include "Schema_generated.h" - -#include "config/config.hpp" - -namespace sparrow_ipc -{ - namespace utils - { - // Aligns a value to the next multiple of 8, as required by the Arrow IPC format for message bodies - SPARROW_IPC_API int64_t align_to_8(const int64_t n); - - // Creates a Flatbuffers type from a format string - // This function maps a sparrow data type to the corresponding Flatbuffers type - SPARROW_IPC_API std::pair> - get_flatbuffer_type(flatbuffers::FlatBufferBuilder& builder, std::string_view format_str); - } -} diff --git a/src/arrow_interface/arrow_array.cpp b/src/arrow_interface/arrow_array.cpp new file mode 100644 index 0000000..fa6a599 --- /dev/null +++ b/src/arrow_interface/arrow_array.cpp @@ -0,0 +1,71 @@ +#include "sparrow_ipc/arrow_interface/arrow_array.hpp" + +#include + +#include +#include + +#include "sparrow_ipc/arrow_interface/arrow_array/private_data.hpp" +#include "sparrow_ipc/arrow_interface/arrow_array_schema_common_release.hpp" + +namespace sparrow_ipc +{ + void release_arrow_array(ArrowArray* array) + { + SPARROW_ASSERT_FALSE(array == nullptr) + SPARROW_ASSERT_TRUE(array->release == std::addressof(release_arrow_array)) + + release_common_arrow(*array); + if (array->private_data != nullptr) + { + const auto private_data = static_cast(array->private_data); + delete private_data; + array->private_data = nullptr; + } + array->buffers = nullptr; // The buffers were deleted with the private data + } + + void fill_arrow_array( + ArrowArray& array, + int64_t length, + int64_t null_count, + int64_t offset, + std::vector&& buffers, + size_t children_count, + ArrowArray** children, + ArrowArray* dictionary + ) + { + SPARROW_ASSERT_TRUE(length >= 0); + SPARROW_ASSERT_TRUE(null_count >= -1); + SPARROW_ASSERT_TRUE(offset >= 0); + + array.length = length; + array.null_count = null_count; + array.offset = offset; + array.n_buffers = static_cast(buffers.size()); + array.private_data = new 
arrow_array_private_data(std::move(buffers)); + const auto private_data = static_cast(array.private_data); + array.buffers = private_data->buffers_ptrs(); + array.n_children = static_cast(children_count); + array.children = children; + array.dictionary = dictionary; + array.release = release_arrow_array; + } + + ArrowArray make_arrow_array( + int64_t length, + int64_t null_count, + int64_t offset, + std::vector&& buffers, + size_t children_count, + ArrowArray** children, + ArrowArray* dictionary + ) + { + ArrowArray array{}; + fill_arrow_array(array, length, null_count, offset, std::move(buffers), children_count, children, dictionary); + return array; + } + +} diff --git a/src/arrow_interface/arrow_array/private_data.cpp b/src/arrow_interface/arrow_array/private_data.cpp new file mode 100644 index 0000000..fac1ced --- /dev/null +++ b/src/arrow_interface/arrow_array/private_data.cpp @@ -0,0 +1,9 @@ +#include "sparrow_ipc/arrow_interface/arrow_array/private_data.hpp" + +namespace sparrow_ipc +{ + const void** arrow_array_private_data::buffers_ptrs() noexcept + { + return const_cast(reinterpret_cast(m_buffers_pointers.data())); + } +} \ No newline at end of file diff --git a/src/arrow_interface/arrow_schema.cpp b/src/arrow_interface/arrow_schema.cpp new file mode 100644 index 0000000..522dd9e --- /dev/null +++ b/src/arrow_interface/arrow_schema.cpp @@ -0,0 +1,20 @@ +#include "sparrow_ipc/arrow_interface/arrow_schema.hpp" + +#include "sparrow_ipc/arrow_interface/arrow_array_schema_common_release.hpp" + +namespace sparrow_ipc +{ + void release_arrow_schema(ArrowSchema* schema) + { + SPARROW_ASSERT_FALSE(schema == nullptr); + SPARROW_ASSERT_TRUE(schema->release == std::addressof(release_arrow_schema)); + release_common_arrow(*schema); + if (schema->private_data != nullptr) + { + const auto private_data = static_cast(schema->private_data); + delete private_data; + schema->private_data = nullptr; + } + *schema = {}; + } +} \ No newline at end of file diff --git 
a/src/arrow_interface/arrow_schema/private_data.cpp b/src/arrow_interface/arrow_schema/private_data.cpp new file mode 100644 index 0000000..9534e6d --- /dev/null +++ b/src/arrow_interface/arrow_schema/private_data.cpp @@ -0,0 +1,30 @@ +#include "sparrow_ipc/arrow_interface/arrow_schema/private_data.hpp" + +namespace sparrow_ipc +{ + arrow_schema_private_data::arrow_schema_private_data( + std::string_view format, + const char* name, + std::optional metadata + ) + : m_format(format) + , m_name(name) + , m_metadata(std::move(metadata)) + { + } + + const char* arrow_schema_private_data::format_ptr() const noexcept + { + return m_format.data(); + } + + const char* arrow_schema_private_data::name_ptr() const noexcept + { + return m_name; + } + + const char* arrow_schema_private_data::metadata_ptr() const noexcept + { + return m_metadata.has_value() ? m_metadata->c_str() : nullptr; + } +} \ No newline at end of file diff --git a/src/deserialize.cpp b/src/deserialize.cpp new file mode 100644 index 0000000..7b29f3b --- /dev/null +++ b/src/deserialize.cpp @@ -0,0 +1,362 @@ +#include + +#include "sparrow_ipc/deserialize_variable_size_binary_array.hpp" +#include "sparrow_ipc/deserialize_fixedsizebinary_array.hpp" +#include "sparrow_ipc/deserialize_primitive_array.hpp" +#include "sparrow_ipc/magic_values.hpp" +#include "sparrow_ipc/metadata.hpp" + +namespace sparrow_ipc +{ + void deserialize_schema_message( + const uint8_t* buf_ptr, + size_t& current_offset, + std::optional& name, + std::optional>& metadata + ) + { + const uint32_t schema_meta_len = *(reinterpret_cast(buf_ptr + current_offset)); + current_offset += sizeof(uint32_t); + const auto schema_message = org::apache::arrow::flatbuf::GetMessage(buf_ptr + current_offset); + if (schema_message->header_type() != org::apache::arrow::flatbuf::MessageHeader::Schema) + { + throw std::runtime_error("Expected Schema message at the start of the buffer."); + } + const auto flatbuffer_schema = static_cast( + schema_message->header() 
+ ); + const auto fields = flatbuffer_schema->fields(); + if (fields->size() != 1) + { + throw std::runtime_error("Expected schema with exactly one field."); + } + + const auto field = fields->Get(0); + + // Get name + if (const auto fb_name = field->name()) + { + name = fb_name->str(); + } + + // Handle metadata + const auto fb_metadata = field->custom_metadata(); + if (fb_metadata && !fb_metadata->empty()) + { + metadata = std::vector(); + metadata->reserve(fb_metadata->size()); + for (const auto& kv : *fb_metadata) + { + metadata->emplace_back(kv->key()->str(), kv->value()->str()); + } + } + current_offset += schema_meta_len; + } + + const org::apache::arrow::flatbuf::RecordBatch* + deserialize_record_batch_message(const uint8_t* buf_ptr, size_t& current_offset) + { + current_offset += sizeof(uint32_t); + const auto batch_message = org::apache::arrow::flatbuf::GetMessage(buf_ptr + current_offset); + if (batch_message->header_type() != org::apache::arrow::flatbuf::MessageHeader::RecordBatch) + { + throw std::runtime_error("Expected RecordBatch message, but got a different type."); + } + return static_cast(batch_message->header()); + } + + std::vector deserialize_stream(const uint8_t* buf_ptr) + { + const org::apache::arrow::flatbuf::Schema* schema = nullptr; + std::vector record_batches; + std::vector field_names; + std::vector fields_nullable; + std::vector field_types; + do + { + const EncapsulatedMessage encapsulated_message = create_encapsulated_message(buf_ptr); + const org::apache::arrow::flatbuf::Message* message = encapsulated_message.flat_buffer_message(); + switch (message->header_type()) + { + case org::apache::arrow::flatbuf::MessageHeader::Schema: + { + schema = message->header_as_Schema(); + const size_t size = static_cast(schema->fields()->size()); + field_names.reserve(size); + fields_nullable.reserve(size); + + for (const auto field : *(schema->fields())) + { + field_names.emplace_back(field->name()->string_view()); + 
fields_nullable.push_back(field->nullable()); + } + } + break; + case org::apache::arrow::flatbuf::MessageHeader::RecordBatch: + { + const auto lol = message->header_type(); + if (schema == nullptr) + { + throw std::runtime_error("Schema message is missing."); + } + const auto record_batch = message->header_as_RecordBatch(); + if (record_batch == nullptr) + { + throw std::runtime_error("RecordBatch message is missing."); + } + const size_t length = static_cast(record_batch->length()); + size_t buffer_index = 0; + + std::vector arrays; + arrays.reserve(schema->fields()->size()); + + for (const auto field : *(schema->fields())) + { + const ::flatbuffers::Vector<::flatbuffers::Offset>* + fb_custom_metadata = field->custom_metadata(); + const std::optional> + metadata = fb_custom_metadata == nullptr + ? std::nullopt + : std::make_optional(to_sparrow_metadata(*fb_custom_metadata)); + const auto name = field->name()->string_view(); + const auto field_type = field->type_type(); + switch (field_type) + { + case org::apache::arrow::flatbuf::Type::Bool: + arrays.emplace_back( + deserialize_primitive_array_bis( + *record_batch, + encapsulated_message.body(), + name, + metadata, + buffer_index + ) + ); + break; + case org::apache::arrow::flatbuf::Type::Int: + { + const auto int_type = field->type_as_Int(); + + if (int_type->is_signed()) + { + switch (int_type->bitWidth()) + { + case 8: + arrays.emplace_back( + deserialize_primitive_array_bis( + *record_batch, + encapsulated_message.body(), + name, + metadata, + buffer_index + ) + ); + break; + case 16: + arrays.emplace_back( + deserialize_primitive_array_bis( + *record_batch, + encapsulated_message.body(), + name, + metadata, + buffer_index + ) + ); + break; + case 32: + arrays.emplace_back( + deserialize_primitive_array_bis( + *record_batch, + encapsulated_message.body(), + name, + metadata, + buffer_index + ) + ); + break; + case 64: + arrays.emplace_back( + deserialize_primitive_array_bis( + *record_batch, + 
encapsulated_message.body(), + name, + metadata, + buffer_index + ) + ); + break; + default: + throw std::runtime_error("Unsupported integer bit width."); + } + } + else + { + switch (int_type->bitWidth()) + { + case 8: + arrays.emplace_back( + deserialize_primitive_array_bis( + *record_batch, + encapsulated_message.body(), + name, + metadata, + buffer_index + ) + ); + break; + case 16: + arrays.emplace_back( + deserialize_primitive_array_bis( + *record_batch, + encapsulated_message.body(), + name, + metadata, + buffer_index + ) + ); + break; + case 32: + arrays.emplace_back( + deserialize_primitive_array_bis( + *record_batch, + encapsulated_message.body(), + name, + metadata, + buffer_index + ) + ); + break; + case 64: + arrays.emplace_back( + deserialize_primitive_array_bis( + *record_batch, + encapsulated_message.body(), + name, + metadata, + buffer_index + ) + ); + break; + default: + throw std::runtime_error("Unsupported integer bit width."); + } + } + } + break; + case org::apache::arrow::flatbuf::Type::FloatingPoint: + { + const auto float_type = field->type_as_FloatingPoint(); + switch (float_type->precision()) + { + case org::apache::arrow::flatbuf::Precision::HALF: + arrays.emplace_back( + deserialize_primitive_array_bis( + *record_batch, + encapsulated_message.body(), + name, + metadata, + buffer_index + ) + ); + break; + case org::apache::arrow::flatbuf::Precision::SINGLE: + arrays.emplace_back( + deserialize_primitive_array_bis( + *record_batch, + encapsulated_message.body(), + name, + metadata, + buffer_index + ) + ); + break; + case org::apache::arrow::flatbuf::Precision::DOUBLE: + arrays.emplace_back( + deserialize_primitive_array_bis( + *record_batch, + encapsulated_message.body(), + name, + metadata, + buffer_index + ) + ); + break; + default: + throw std::runtime_error("Unsupported floating point precision."); + } + break; + } + case org::apache::arrow::flatbuf::Type::FixedSizeBinary: + { + const auto fixed_size_binary_field = 
field->type_as_FixedSizeBinary(); + arrays.emplace_back(deserialize_fixedwidthbinary(*record_batch, + encapsulated_message.body(), + name, + metadata, + buffer_index, + fixed_size_binary_field->byteWidth())); + break; + } + case org::apache::arrow::flatbuf::Type::Binary: + arrays.emplace_back(deserialize_variable_size_binary( + *record_batch, + encapsulated_message.body(), + name, + metadata, + buffer_index + )); + break; + case org::apache::arrow::flatbuf::Type::LargeBinary: + arrays.emplace_back(deserialize_variable_size_binary( + *record_batch, + encapsulated_message.body(), + name, + metadata, + buffer_index + )); + break; + case org::apache::arrow::flatbuf::Type::Utf8: + arrays.emplace_back(deserialize_variable_size_binary( + *record_batch, + encapsulated_message.body(), + name, + metadata, + buffer_index + )); + break; + case org::apache::arrow::flatbuf::Type::LargeUtf8: + arrays.emplace_back(deserialize_variable_size_binary( + *record_batch, + encapsulated_message.body(), + name, + metadata, + buffer_index + )); + break; + default: + throw std::runtime_error("Unsupported type."); + } + } + + std::vector field_names_str(field_names.cbegin(), field_names.cend()); + record_batches.emplace_back(std::move(field_names_str), std::move(arrays), "test"); + } + break; + case org::apache::arrow::flatbuf::MessageHeader::Tensor: + case org::apache::arrow::flatbuf::MessageHeader::DictionaryBatch: + case org::apache::arrow::flatbuf::MessageHeader::SparseTensor: + throw std::runtime_error("Not supported"); + default: + throw std::runtime_error("Unknown message header type."); + } + const size_t encapsulated_message_total_length = encapsulated_message.total_length(); + buf_ptr += encapsulated_message_total_length; + if (is_end_of_stream(std::span{buf_ptr, 8})) + { + break; + } + } while (true); + return record_batches; + } + +} \ No newline at end of file diff --git a/src/deserialize_utils.cpp b/src/deserialize_utils.cpp new file mode 100644 index 0000000..f1e5b3b --- 
/dev/null +++ b/src/deserialize_utils.cpp @@ -0,0 +1,17 @@ +#include "sparrow_ipc/deserialize_utils.hpp" + +namespace sparrow_ipc +{ + const sparrow::dynamic_bitset_view message_buffer_to_validity_bitmap( + const org::apache::arrow::flatbuf::RecordBatch* record_batch, + std::span body, + size_t index + ) + { + const auto buffer_metadata = record_batch->buffers()->Get(index); + return sparrow::dynamic_bitset_view{ + body.data() + buffer_metadata->offset(), + static_cast(buffer_metadata->length()) + }; + } +} \ No newline at end of file diff --git a/src/encapsulated_message.cpp b/src/encapsulated_message.cpp new file mode 100644 index 0000000..eb6f5a4 --- /dev/null +++ b/src/encapsulated_message.cpp @@ -0,0 +1,105 @@ +#include "sparrow_ipc/encapsulated_message.hpp" + +#include + +#include "sparrow_ipc/magic_values.hpp" + +namespace sparrow_ipc +{ + EncapsulatedMessage::EncapsulatedMessage(const uint8_t* buf_ptr) + : m_buf_ptr(buf_ptr) + { + } + + const org::apache::arrow::flatbuf::Message* EncapsulatedMessage::flat_buffer_message() const + { + const uint8_t* message_ptr = m_buf_ptr + (sizeof(uint32_t) * 2); // 4 bytes continuation + 4 bytes + // metadata size + return org::apache::arrow::flatbuf::GetMessage(message_ptr); + } + + size_t EncapsulatedMessage::metadata_length() const + { + return *(reinterpret_cast(m_buf_ptr + sizeof(uint32_t))); + } + + [[nodiscard]] std::variant< + const org::apache::arrow::flatbuf::Schema*, + const org::apache::arrow::flatbuf::RecordBatch*, + const org::apache::arrow::flatbuf::Tensor*, + const org::apache::arrow::flatbuf::DictionaryBatch*, + const org::apache::arrow::flatbuf::SparseTensor*> + EncapsulatedMessage::metadata() const + { + const auto schema_message = flat_buffer_message(); + switch (schema_message->header_type()) + { + case org::apache::arrow::flatbuf::MessageHeader::Schema: + { + return schema_message->header_as_Schema(); + } + case org::apache::arrow::flatbuf::MessageHeader::RecordBatch: + { + return 
schema_message->header_as_RecordBatch(); + } + case org::apache::arrow::flatbuf::MessageHeader::Tensor: + { + return schema_message->header_as_Tensor(); + } + case org::apache::arrow::flatbuf::MessageHeader::DictionaryBatch: + { + return schema_message->header_as_DictionaryBatch(); + } + case org::apache::arrow::flatbuf::MessageHeader::SparseTensor: + { + return schema_message->header_as_SparseTensor(); + } + default: + throw std::runtime_error("Unknown message header type."); + } + } + + const ::flatbuffers::Vector<::flatbuffers::Offset>* + EncapsulatedMessage::custom_metadata() const + { + return flat_buffer_message()->custom_metadata(); + } + + size_t EncapsulatedMessage::body_length() const + { + return static_cast(flat_buffer_message()->bodyLength()); + } + + std::span EncapsulatedMessage::body() const + { + const uint8_t* body_ptr = m_buf_ptr + (sizeof(uint32_t) * 2) // 4 bytes continuation + 4 bytes + // metadata size + + metadata_length(); + return {body_ptr, body_length()}; + } + + size_t EncapsulatedMessage::total_length() const + { + return sizeof(uint32_t) * 2 // 4 bytes continuation + 4 bytes metadata size + + metadata_length() + body_length(); + } + + std::span EncapsulatedMessage::as_span() const + { + return {m_buf_ptr, total_length()}; + } + + EncapsulatedMessage create_encapsulated_message(const uint8_t* buf_ptr) + { + if (!buf_ptr) + { + throw std::invalid_argument("Buffer pointer cannot be null."); + } + const std::span continuation_span(buf_ptr, 4); + if (!is_continuation(continuation_span)) + { + throw std::runtime_error("Buffer starts with continuation bytes, expected a valid message."); + } + return {buf_ptr}; + } +} \ No newline at end of file diff --git a/src/magic_values.cpp b/src/magic_values.cpp new file mode 100644 index 0000000..154021f --- /dev/null +++ b/src/magic_values.cpp @@ -0,0 +1,19 @@ +#include "sparrow_ipc/magic_values.hpp" + +namespace sparrow_ipc +{ + bool is_continuation(std::istream& stream) + { + std::array buf; + 
stream.read(reinterpret_cast(buf.data()), 4); + if (stream.gcount() < 4) + { + if (stream.eof()) + { + return false; // End of file reached, not a continuation + } + throw std::runtime_error("Failed to read enough bytes from stream."); + } + return is_continuation(buf); + } +} \ No newline at end of file diff --git a/src/metadata.cpp b/src/metadata.cpp new file mode 100644 index 0000000..a07f216 --- /dev/null +++ b/src/metadata.cpp @@ -0,0 +1,21 @@ +#include "sparrow_ipc/metadata.hpp" + +#include + +namespace sparrow_ipc +{ + std::vector to_sparrow_metadata( + const ::flatbuffers::Vector<::flatbuffers::Offset>& metadata + ) + { + std::vector sparrow_metadata; + sparrow_metadata.reserve(metadata.size()); + + for (const auto& kv : metadata) + { + sparrow_metadata.emplace_back(kv->key()->str(), kv->value()->str()); + } + + return sparrow_metadata; + } +} \ No newline at end of file diff --git a/src/serialize.cpp b/src/serialize.cpp index 0c76678..723ef53 100644 --- a/src/serialize.cpp +++ b/src/serialize.cpp @@ -1,8 +1,10 @@ +#include "sparrow_ipc/serialize.hpp" + #include #include -#include "serialize.hpp" -#include "utils.hpp" +#include "Message_generated.h" +#include "sparrow_ipc/utils.hpp" namespace sparrow_ipc { @@ -36,7 +38,8 @@ namespace sparrow_ipc const auto key_offset = schema_builder.CreateString(std::string(key)); const auto value_offset = schema_builder.CreateString(std::string(value)); kv_offsets.push_back( - org::apache::arrow::flatbuf::CreateKeyValue(schema_builder, key_offset, value_offset)); + org::apache::arrow::flatbuf::CreateKeyValue(schema_builder, key_offset, value_offset) + ); } fb_metadata_offset = schema_builder.CreateVector(kv_offsets); } @@ -50,14 +53,19 @@ namespace sparrow_ipc type_offset, 0, // dictionary 0, // children - fb_metadata_offset); + fb_metadata_offset + ); // A Schema contains a vector of fields const std::vector> fields_vec = {fb_field}; const auto fb_fields = schema_builder.CreateVector(fields_vec); // Build the Schema 
object from the vector of fields - const auto schema_offset = org::apache::arrow::flatbuf::CreateSchema(schema_builder, org::apache::arrow::flatbuf::Endianness::Little, fb_fields); + const auto schema_offset = org::apache::arrow::flatbuf::CreateSchema( + schema_builder, + org::apache::arrow::flatbuf::Endianness::Little, + fb_fields + ); // Wrap the Schema in a top-level Message, which is the standard IPC envelope const auto schema_message_offset = org::apache::arrow::flatbuf::CreateMessage( @@ -70,10 +78,10 @@ namespace sparrow_ipc schema_builder.Finish(schema_message_offset); // Assemble the Schema message bytes - const uint32_t schema_len = schema_builder.GetSize(); // Get the size of the serialized metadata + const uint32_t schema_len = schema_builder.GetSize(); // Get the size of the serialized metadata // This will be the final buffer holding the complete IPC stream. std::vector final_buffer; - final_buffer.resize(sizeof(uint32_t) + schema_len); // Resize the buffer to hold the message + final_buffer.resize(sizeof(uint32_t) + schema_len); // Resize the buffer to hold the message // Copy the metadata into the buffer, after the 4-byte length prefix memcpy(final_buffer.data() + sizeof(uint32_t), schema_builder.GetBufferPointer(), schema_len); // Write the 4-byte metadata length at the beginning of the message @@ -81,14 +89,18 @@ namespace sparrow_ipc return final_buffer; } - void serialize_record_batch_message(const ArrowArray& arrow_arr, const std::vector& buffers_sizes, std::vector& final_buffer) + void serialize_record_batch_message( + const ArrowArray& arrow_arr, + const std::vector& buffers_sizes, + std::vector& final_buffer + ) { // Create a new builder for the RecordBatch message's metadata flatbuffers::FlatBufferBuilder batch_builder; std::vector buffers_vec; int64_t current_offset = 0; - int64_t body_len = 0; // The total size of the message body + int64_t body_len = 0; // The total size of the message body for (const auto& size : buffers_sizes) { 
buffers_vec.emplace_back(current_offset, size); @@ -103,7 +115,12 @@ namespace sparrow_ipc const auto fb_buffers_vector = batch_builder.CreateVectorOfStructs(buffers_vec); // Build the RecordBatch metadata object - const auto record_batch_offset = org::apache::arrow::flatbuf::CreateRecordBatch(batch_builder, arrow_arr.length, fb_nodes_vector, fb_buffers_vector); + const auto record_batch_offset = org::apache::arrow::flatbuf::CreateRecordBatch( + batch_builder, + arrow_arr.length, + fb_nodes_vector, + fb_buffers_vector + ); // Wrap the RecordBatch in a top-level Message const auto batch_message_offset = org::apache::arrow::flatbuf::CreateMessage( @@ -116,13 +133,16 @@ namespace sparrow_ipc batch_builder.Finish(batch_message_offset); // Append the RecordBatch message to the final buffer - const uint32_t batch_meta_len = batch_builder.GetSize(); // Get the size of the batch metadata - const int64_t aligned_batch_meta_len = utils::align_to_8(batch_meta_len); // Calculate the padded length + const uint32_t batch_meta_len = batch_builder.GetSize(); // Get the size of the batch metadata + const int64_t aligned_batch_meta_len = utils::align_to_8(batch_meta_len); // Calculate the padded + // length - const size_t current_size = final_buffer.size(); // Get the current size (which is the end of the Schema message) + const size_t current_size = final_buffer.size(); // Get the current size (which is the end of the + // Schema message) // Resize the buffer to append the new message final_buffer.resize(current_size + sizeof(uint32_t) + aligned_batch_meta_len + body_len); - uint8_t* dst = final_buffer.data() + current_size; // Get a pointer to where the new message will start + uint8_t* dst = final_buffer.data() + current_size; // Get a pointer to where the new message will + // start // Write the 4-byte metadata length for the RecordBatch message *(reinterpret_cast(dst)) = batch_meta_len; @@ -155,54 +175,6 @@ namespace sparrow_ipc } } - void deserialize_schema_message(const 
uint8_t* buf_ptr, size_t& current_offset, std::optional& name, std::optional>& metadata) - { - const uint32_t schema_meta_len = *(reinterpret_cast(buf_ptr + current_offset)); - current_offset += sizeof(uint32_t); - const auto schema_message = org::apache::arrow::flatbuf::GetMessage(buf_ptr + current_offset); - if (schema_message->header_type() != org::apache::arrow::flatbuf::MessageHeader::Schema) - { - throw std::runtime_error("Expected Schema message at the start of the buffer."); - } - const auto flatbuffer_schema = static_cast(schema_message->header()); - const auto fields = flatbuffer_schema->fields(); - if (fields->size() != 1) - { - throw std::runtime_error("Expected schema with exactly one field."); - } - - const auto field = fields->Get(0); - - // Get name - if (const auto fb_name = field->name()) - { - name = fb_name->str(); - } - - // Handle metadata - const auto fb_metadata = field->custom_metadata(); - if (fb_metadata && !fb_metadata->empty()) - { - metadata = std::vector(); - metadata->reserve(fb_metadata->size()); - for (const auto& kv : *fb_metadata) - { - metadata->emplace_back(kv->key()->str(), kv->value()->str()); - } - } - current_offset += schema_meta_len; - } - - const org::apache::arrow::flatbuf::RecordBatch* deserialize_record_batch_message(const uint8_t* buf_ptr, size_t& current_offset) - { - current_offset += sizeof(uint32_t); - const auto batch_message = org::apache::arrow::flatbuf::GetMessage(buf_ptr + current_offset); - if (batch_message->header_type() != org::apache::arrow::flatbuf::MessageHeader::RecordBatch) - { - throw std::runtime_error("Expected RecordBatch message, but got a different type."); - } - return static_cast(batch_message->header()); - } - } // namespace details -} // namespace sparrow-ipc + } // namespace details +} // namespace sparrow-ipc diff --git a/src/serialize_null_array.cpp b/src/serialize_null_array.cpp index 230a9db..96b68a3 100644 --- a/src/serialize_null_array.cpp +++ b/src/serialize_null_array.cpp @@ -1,4 
+1,6 @@ -#include "serialize_null_array.hpp" +#include "sparrow_ipc/serialize_null_array.hpp" + +#include "sparrow_ipc/deserialize.hpp" namespace sparrow_ipc { @@ -30,10 +32,10 @@ namespace sparrow_ipc // I - Deserialize the Schema message std::optional name; std::optional> metadata; - details::deserialize_schema_message(buf_ptr, current_offset, name, metadata); + deserialize_schema_message(buf_ptr, current_offset, name, metadata); // II - Deserialize the RecordBatch message - const auto* record_batch = details::deserialize_record_batch_message(buf_ptr, current_offset); + const auto* record_batch = deserialize_record_batch_message(buf_ptr, current_offset); // The body is empty, so we don't need to read any further. // Construct the null_array from the deserialized metadata. diff --git a/src/utils.cpp b/src/utils.cpp index a538700..d22288a 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -1,10 +1,11 @@ +#include "sparrow_ipc/utils.hpp" + #include #include #include #include "sparrow.hpp" -#include "utils.hpp" namespace sparrow_ipc { @@ -24,7 +25,11 @@ namespace sparrow_ipc std::string_view substr_str(format_str.data() + sep_pos + 1, format_str.size() - sep_pos - 1); int32_t substr_size = 0; - const auto [ptr, ec] = std::from_chars(substr_str.data(), substr_str.data() + substr_str.size(), substr_size); + const auto [ptr, ec] = std::from_chars( + substr_str.data(), + substr_str.data() + substr_str.size(), + substr_size + ); if (ec != std::errc() || ptr != substr_str.data() + substr_str.size()) { @@ -35,23 +40,37 @@ namespace sparrow_ipc // Creates a Flatbuffers Decimal type from a format string // The format string is expected to be in the format "d:precision,scale" - std::pair> - get_flatbuffer_decimal_type(flatbuffers::FlatBufferBuilder& builder, std::string_view format_str, const int32_t bitWidth) + std::pair> get_flatbuffer_decimal_type( + flatbuffers::FlatBufferBuilder& builder, + std::string_view format_str, + const int32_t bitWidth + ) { // Decimal requires 
precision and scale. We need to parse the format_str. // Format: "d:precision,scale" - const auto scale = parse_format(format_str, ","); + const auto scale = parse_format(format_str, ","); if (!scale.has_value()) { - throw std::runtime_error("Failed to parse Decimal " + std::to_string(bitWidth) + " scale from format string: " + std::string(format_str)); + throw std::runtime_error( + "Failed to parse Decimal " + std::to_string(bitWidth) + + " scale from format string: " + std::string(format_str) + ); } const size_t comma_pos = format_str.find(','); - const auto precision = parse_format(format_str.substr(0, comma_pos), ":"); + const auto precision = parse_format(format_str.substr(0, comma_pos), ":"); if (!precision.has_value()) { - throw std::runtime_error("Failed to parse Decimal " + std::to_string(bitWidth) + " precision from format string: " + std::string(format_str)); + throw std::runtime_error( + "Failed to parse Decimal " + std::to_string(bitWidth) + + " precision from format string: " + std::string(format_str) + ); } - const auto decimal_type = org::apache::arrow::flatbuf::CreateDecimal(builder, precision.value(), scale.value(), bitWidth); + const auto decimal_type = org::apache::arrow::flatbuf::CreateDecimal( + builder, + precision.value(), + scale.value(), + bitWidth + ); return {org::apache::arrow::flatbuf::Type::Decimal, decimal_type.Union()}; } } @@ -122,19 +141,25 @@ namespace sparrow_ipc case sparrow::data_type::HALF_FLOAT: { const auto fp_type = org::apache::arrow::flatbuf::CreateFloatingPoint( - builder, org::apache::arrow::flatbuf::Precision::HALF); + builder, + org::apache::arrow::flatbuf::Precision::HALF + ); return {org::apache::arrow::flatbuf::Type::FloatingPoint, fp_type.Union()}; } case sparrow::data_type::FLOAT: { const auto fp_type = org::apache::arrow::flatbuf::CreateFloatingPoint( - builder, org::apache::arrow::flatbuf::Precision::SINGLE); + builder, + org::apache::arrow::flatbuf::Precision::SINGLE + ); return 
{org::apache::arrow::flatbuf::Type::FloatingPoint, fp_type.Union()}; } case sparrow::data_type::DOUBLE: { const auto fp_type = org::apache::arrow::flatbuf::CreateFloatingPoint( - builder, org::apache::arrow::flatbuf::Precision::DOUBLE); + builder, + org::apache::arrow::flatbuf::Precision::DOUBLE + ); return {org::apache::arrow::flatbuf::Type::FloatingPoint, fp_type.Union()}; } case sparrow::data_type::STRING: @@ -169,87 +194,142 @@ namespace sparrow_ipc } case sparrow::data_type::DATE_DAYS: { - const auto date_type = org::apache::arrow::flatbuf::CreateDate(builder, org::apache::arrow::flatbuf::DateUnit::DAY); + const auto date_type = org::apache::arrow::flatbuf::CreateDate( + builder, + org::apache::arrow::flatbuf::DateUnit::DAY + ); return {org::apache::arrow::flatbuf::Type::Date, date_type.Union()}; } case sparrow::data_type::DATE_MILLISECONDS: { - const auto date_type = org::apache::arrow::flatbuf::CreateDate(builder, org::apache::arrow::flatbuf::DateUnit::MILLISECOND); + const auto date_type = org::apache::arrow::flatbuf::CreateDate( + builder, + org::apache::arrow::flatbuf::DateUnit::MILLISECOND + ); return {org::apache::arrow::flatbuf::Type::Date, date_type.Union()}; } case sparrow::data_type::TIMESTAMP_SECONDS: { - const auto timestamp_type = org::apache::arrow::flatbuf::CreateTimestamp(builder, org::apache::arrow::flatbuf::TimeUnit::SECOND); + const auto timestamp_type = org::apache::arrow::flatbuf::CreateTimestamp( + builder, + org::apache::arrow::flatbuf::TimeUnit::SECOND + ); return {org::apache::arrow::flatbuf::Type::Timestamp, timestamp_type.Union()}; } case sparrow::data_type::TIMESTAMP_MILLISECONDS: { - const auto timestamp_type = org::apache::arrow::flatbuf::CreateTimestamp(builder, org::apache::arrow::flatbuf::TimeUnit::MILLISECOND); + const auto timestamp_type = org::apache::arrow::flatbuf::CreateTimestamp( + builder, + org::apache::arrow::flatbuf::TimeUnit::MILLISECOND + ); return {org::apache::arrow::flatbuf::Type::Timestamp, 
timestamp_type.Union()}; } case sparrow::data_type::TIMESTAMP_MICROSECONDS: { - const auto timestamp_type = org::apache::arrow::flatbuf::CreateTimestamp(builder, org::apache::arrow::flatbuf::TimeUnit::MICROSECOND); + const auto timestamp_type = org::apache::arrow::flatbuf::CreateTimestamp( + builder, + org::apache::arrow::flatbuf::TimeUnit::MICROSECOND + ); return {org::apache::arrow::flatbuf::Type::Timestamp, timestamp_type.Union()}; } case sparrow::data_type::TIMESTAMP_NANOSECONDS: { - const auto timestamp_type = org::apache::arrow::flatbuf::CreateTimestamp(builder, org::apache::arrow::flatbuf::TimeUnit::NANOSECOND); + const auto timestamp_type = org::apache::arrow::flatbuf::CreateTimestamp( + builder, + org::apache::arrow::flatbuf::TimeUnit::NANOSECOND + ); return {org::apache::arrow::flatbuf::Type::Timestamp, timestamp_type.Union()}; } case sparrow::data_type::DURATION_SECONDS: { - const auto duration_type = org::apache::arrow::flatbuf::CreateDuration(builder, org::apache::arrow::flatbuf::TimeUnit::SECOND); + const auto duration_type = org::apache::arrow::flatbuf::CreateDuration( + builder, + org::apache::arrow::flatbuf::TimeUnit::SECOND + ); return {org::apache::arrow::flatbuf::Type::Duration, duration_type.Union()}; } case sparrow::data_type::DURATION_MILLISECONDS: { - const auto duration_type = org::apache::arrow::flatbuf::CreateDuration(builder, org::apache::arrow::flatbuf::TimeUnit::MILLISECOND); + const auto duration_type = org::apache::arrow::flatbuf::CreateDuration( + builder, + org::apache::arrow::flatbuf::TimeUnit::MILLISECOND + ); return {org::apache::arrow::flatbuf::Type::Duration, duration_type.Union()}; } case sparrow::data_type::DURATION_MICROSECONDS: { - const auto duration_type = org::apache::arrow::flatbuf::CreateDuration(builder, org::apache::arrow::flatbuf::TimeUnit::MICROSECOND); + const auto duration_type = org::apache::arrow::flatbuf::CreateDuration( + builder, + org::apache::arrow::flatbuf::TimeUnit::MICROSECOND + ); return 
{org::apache::arrow::flatbuf::Type::Duration, duration_type.Union()}; } case sparrow::data_type::DURATION_NANOSECONDS: { - const auto duration_type = org::apache::arrow::flatbuf::CreateDuration(builder, org::apache::arrow::flatbuf::TimeUnit::NANOSECOND); + const auto duration_type = org::apache::arrow::flatbuf::CreateDuration( + builder, + org::apache::arrow::flatbuf::TimeUnit::NANOSECOND + ); return {org::apache::arrow::flatbuf::Type::Duration, duration_type.Union()}; } case sparrow::data_type::INTERVAL_MONTHS: { - const auto interval_type = org::apache::arrow::flatbuf::CreateInterval(builder, org::apache::arrow::flatbuf::IntervalUnit::YEAR_MONTH); + const auto interval_type = org::apache::arrow::flatbuf::CreateInterval( + builder, + org::apache::arrow::flatbuf::IntervalUnit::YEAR_MONTH + ); return {org::apache::arrow::flatbuf::Type::Interval, interval_type.Union()}; } case sparrow::data_type::INTERVAL_DAYS_TIME: { - const auto interval_type = org::apache::arrow::flatbuf::CreateInterval(builder, org::apache::arrow::flatbuf::IntervalUnit::DAY_TIME); + const auto interval_type = org::apache::arrow::flatbuf::CreateInterval( + builder, + org::apache::arrow::flatbuf::IntervalUnit::DAY_TIME + ); return {org::apache::arrow::flatbuf::Type::Interval, interval_type.Union()}; } case sparrow::data_type::INTERVAL_MONTHS_DAYS_NANOSECONDS: { - const auto interval_type = org::apache::arrow::flatbuf::CreateInterval(builder, org::apache::arrow::flatbuf::IntervalUnit::MONTH_DAY_NANO); + const auto interval_type = org::apache::arrow::flatbuf::CreateInterval( + builder, + org::apache::arrow::flatbuf::IntervalUnit::MONTH_DAY_NANO + ); return {org::apache::arrow::flatbuf::Type::Interval, interval_type.Union()}; } case sparrow::data_type::TIME_SECONDS: { - const auto time_type = org::apache::arrow::flatbuf::CreateTime(builder, org::apache::arrow::flatbuf::TimeUnit::SECOND, 32); + const auto time_type = org::apache::arrow::flatbuf::CreateTime( + builder, + 
org::apache::arrow::flatbuf::TimeUnit::SECOND, + 32 + ); return {org::apache::arrow::flatbuf::Type::Time, time_type.Union()}; } case sparrow::data_type::TIME_MILLISECONDS: { - const auto time_type = org::apache::arrow::flatbuf::CreateTime(builder, org::apache::arrow::flatbuf::TimeUnit::MILLISECOND, 32); + const auto time_type = org::apache::arrow::flatbuf::CreateTime( + builder, + org::apache::arrow::flatbuf::TimeUnit::MILLISECOND, + 32 + ); return {org::apache::arrow::flatbuf::Type::Time, time_type.Union()}; } case sparrow::data_type::TIME_MICROSECONDS: { - const auto time_type = org::apache::arrow::flatbuf::CreateTime(builder, org::apache::arrow::flatbuf::TimeUnit::MICROSECOND, 64); + const auto time_type = org::apache::arrow::flatbuf::CreateTime( + builder, + org::apache::arrow::flatbuf::TimeUnit::MICROSECOND, + 64 + ); return {org::apache::arrow::flatbuf::Type::Time, time_type.Union()}; } case sparrow::data_type::TIME_NANOSECONDS: { - const auto time_type = org::apache::arrow::flatbuf::CreateTime(builder, org::apache::arrow::flatbuf::TimeUnit::NANOSECOND, 64); + const auto time_type = org::apache::arrow::flatbuf::CreateTime( + builder, + org::apache::arrow::flatbuf::TimeUnit::NANOSECOND, + 64 + ); return {org::apache::arrow::flatbuf::Type::Time, time_type.Union()}; } case sparrow::data_type::LIST: @@ -276,13 +356,18 @@ namespace sparrow_ipc { // FixedSizeList requires listSize. We need to parse the format_str. 
// Format: "+w:size" - const auto list_size = parse_format(format_str, ":"); + const auto list_size = parse_format(format_str, ":"); if (!list_size.has_value()) { - throw std::runtime_error("Failed to parse FixedSizeList size from format string: " + std::string(format_str)); + throw std::runtime_error( + "Failed to parse FixedSizeList size from format string: " + std::string(format_str) + ); } - const auto fixed_size_list_type = org::apache::arrow::flatbuf::CreateFixedSizeList(builder, list_size.value()); + const auto fixed_size_list_type = org::apache::arrow::flatbuf::CreateFixedSizeList( + builder, + list_size.value() + ); return {org::apache::arrow::flatbuf::Type::FixedSizeList, fixed_size_list_type.Union()}; } case sparrow::data_type::STRUCT: @@ -292,17 +377,27 @@ namespace sparrow_ipc } case sparrow::data_type::MAP: { - const auto map_type = org::apache::arrow::flatbuf::CreateMap(builder, false); // not sorted keys + const auto map_type = org::apache::arrow::flatbuf::CreateMap(builder, false); // not + // sorted + // keys return {org::apache::arrow::flatbuf::Type::Map, map_type.Union()}; } case sparrow::data_type::DENSE_UNION: { - const auto union_type = org::apache::arrow::flatbuf::CreateUnion(builder, org::apache::arrow::flatbuf::UnionMode::Dense, 0); + const auto union_type = org::apache::arrow::flatbuf::CreateUnion( + builder, + org::apache::arrow::flatbuf::UnionMode::Dense, + 0 + ); return {org::apache::arrow::flatbuf::Type::Union, union_type.Union()}; } case sparrow::data_type::SPARSE_UNION: { - const auto union_type = org::apache::arrow::flatbuf::CreateUnion(builder, org::apache::arrow::flatbuf::UnionMode::Sparse, 0); + const auto union_type = org::apache::arrow::flatbuf::CreateUnion( + builder, + org::apache::arrow::flatbuf::UnionMode::Sparse, + 0 + ); return {org::apache::arrow::flatbuf::Type::Union, union_type.Union()}; } case sparrow::data_type::RUN_ENCODED: @@ -330,13 +425,19 @@ namespace sparrow_ipc { // FixedSizeBinary requires byteWidth. 
We need to parse the format_str. // Format: "w:size" - const auto byte_width = parse_format(format_str, ":"); + const auto byte_width = parse_format(format_str, ":"); if (!byte_width.has_value()) { - throw std::runtime_error("Failed to parse FixedWidthBinary size from format string: " + std::string(format_str)); + throw std::runtime_error( + "Failed to parse FixedWidthBinary size from format string: " + + std::string(format_str) + ); } - const auto fixed_width_binary_type = org::apache::arrow::flatbuf::CreateFixedSizeBinary(builder, byte_width.value()); + const auto fixed_width_binary_type = org::apache::arrow::flatbuf::CreateFixedSizeBinary( + builder, + byte_width.value() + ); return {org::apache::arrow::flatbuf::Type::FixedSizeBinary, fixed_width_binary_type.Union()}; } default: diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e184974..2e5921f 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -7,9 +7,12 @@ set( include/sparrow_ipc_tests_helpers.hpp # TODO move all the files below under src? main.cpp - test_utils.cpp - test_primitive_array_serialization.cpp + test_arrow_array.cpp + test_arrow_schema.cpp test_null_array_serialization.cpp + test_primitive_array_serialization.cpp + test_primitive_array_with_files.cpp + test_utils.cpp ) add_executable(${test_target} ${SPARROW_IPC_TESTS_SRC}) @@ -17,8 +20,13 @@ target_link_libraries(${test_target} PRIVATE sparrow-ipc doctest::doctest + sparrow::json_reader ) +target_compile_definitions(${test_target} + PRIVATE + TESTS_RESOURCES_FILES_PATH="${CMAKE_CURRENT_SOURCE_DIR}/resources/") + if(WIN32) add_custom_command( TARGET ${test_target} POST_BUILD diff --git a/tests/metadata_sample.hpp b/tests/metadata_sample.hpp new file mode 100644 index 0000000..5991e6b --- /dev/null +++ b/tests/metadata_sample.hpp @@ -0,0 +1,75 @@ +// Man Group Operations Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include + +#include "sparrow/utils/metadata.hpp" + +namespace sparrow_ipc +{ + static const std::string metadata_buffer = []() + { + if constexpr (std::endian::native == std::endian::big) + { + return std::string{ + 0x00, 0x00, 0x00, 0x02, // Number of keys/values + 0x00, 0x00, 0x00, 0x04, // Length of key1 + 'k', 'e', 'y', '1', // Key 1 + 0x00, 0x00, 0x00, 0x04, // Length of value1 + 'v', 'a', 'l', '1', // Value 1 + 0x00, 0x00, 0x00, 0x04, // Length of key2 + 'k', 'e', 'y', '2', // Key 2 + 0x00, 0x00, 0x00, 0x04, // Length of value2 + 'v', 'a', 'l', '2' // Value 2 + }; + } + else if constexpr (std::endian::native == std::endian::little) + { + return std::string{ + 0x02, 0x00, 0x00, 0x00, // Number of keys/values + 0x04, 0x00, 0x00, 0x00, // Length of key1 + 'k', 'e', 'y', '1', // Key 1 + 0x04, 0x00, 0x00, 0x00, // Length of value1 + 'v', 'a', 'l', '1', // Value 1 + 0x04, 0x00, 0x00, 0x00, // Length of key2 + 'k', 'e', 'y', '2', // Key 2 + 0x04, 0x00, 0x00, 0x00, // Length of value2 + 'v', 'a', 'l', '2' // Value 2 + }; + } + }(); + + static const std::vector metadata_sample = {{"key1", "val1"}, {"key2", "val2"}}; + static const std::optional> metadata_sample_opt = metadata_sample; + + inline void + test_metadata(const std::vector& metadata_1, const sparrow::key_value_view& metadata_2) + { + REQUIRE_EQ(metadata_1.size(), metadata_2.size()); + + auto it = metadata_2.cbegin(); + for (const auto& [key, value] : metadata_1) + { + CHECK_EQ(key, (*it).first); + CHECK_EQ(value, (*it).second); + ++it; + } 
+ } +} diff --git a/tests/resources/generated_primitive.arrow_file b/tests/resources/generated_primitive.arrow_file new file mode 100644 index 0000000000000000000000000000000000000000..5854eb145badfd88847c2939fae33e914c9a5ba8 GIT binary patch literal 22298 zcmeIa1ymJn+cvyGx}_xqNkO_n5Tp?hq$HK@+>+8MA&Q8Egp^8$v?y#!rBe`TK|pCF z1>~FA?D6*Ye*B;J`~Po!?^^%+oNJwP&2gP|)m$^|Iho5^S{m2+VX$L>BC%mG5*SE$ zFd`Th3=@U}oB@P}@w1))fEcLylXFl320IHdHDKR|1%vqkED12+0*1hZ!NLHR2N(z7 zcNj2OB*1C_lLP$W6bzOLuqD7;0I#FNU_}7C0xSgZ9vTeR3h-Tk^#NuCcpPw@0I(Xs zfD2gV{_!zT5K{w(U)%KN;P}`ZsAB-w+Rod`@|N9gD|0(%Gq}%9egVFl&MwZ*9*&M? z=8jeV~gYHj0AP0b4^83GT$5p@w86XG8=pXG6|64OqKK|K`c$5#?Klp%} zfjDUcCOM$@>sUAf_2ArkxLb?;Jt7QNhO|NZhfO9>Px#XYiU5PL0OJZ&WBeYY56~wC z1gIhYb$DUqNE@_&*kAzl#J|`;Q6f)boceuE^?-E>gQ3H~@w9eyF>@CY`dfGWPU8H- z?jFhx9D|>BP%xlw?+@|bAnnlpiYF-Ww|L8_c#MD#a6G{EctTjfRe}Xx0sx^QuODPw z(EgXWuE4wk3h2N0Gvvqdu?!#p$J4{k`S)(ZERl9-|FC-$42HehT(J1Ew#T~j{#*ZndCY`{5y`o%Bux8a0IBJKX!uXvOl6brP2`c?Ud zSm4~@0I^O3435R`3qKDPixH6jbM7#Za|hZJ{bdh@gq;V*45&s2_k~}tGuxATG#Q{C zjQcO94#tSoLHmbo?k~1rOqA`o zNKWMcXsag54xCF=KYyP}7!FeBpZ&c1i!CbVuhR(&`$NoDlpQ!HVBCK(r7%BKOcp@? z>zv{MvxEb~1{g_bUbg~A( zQ!x~V3J8O11SHLq;-|en3NxX~AXEtz5QgM=C;=3|429)U>&)v`G99Y zkRW?86o&kR&x^kjgdw@|$qn_Vy(S7Pql}VKm=cA{P#E$FVMu=uB^N;{@Bz=TApL9~ z3PbuqC~SYC23tpA6#R)HeKDL9Nis^I4}~HBA5d5hweA>^58RM9;CUB?A-OI7i43xL zMd3YEJ8Z!RqhAQZkQ@|&&!NzIM1}u}AyY^WO$UU3m*b);{v?N%!=L2P@cl{t|26)< z^Z)PUsNwy8JwE@=|G(=Ww0!?GK7W$8)|^ZX2J1;V2|p>1%}*9uP$2EI52g;`ur8p! 
zAs;4iB8CM8hWpxX?qAYT+HWf+-BMOX)jx@Z6pjNWXFb2NBJVr?b0@VasMV`(hth(< zf{p`oSTVZXDN=D%=WCDC@2YCf7R=v_TaU3z1kz~Mymb?MIyHwdh$sUc0A(^qYC z#81v0z2Bq))}6s4+%hfl=9^i+`p9@l)##E4Mm$*W3AY~t%grwC^67GX=n@VBZk>Vq zMP1XkRZI^nICU8$S+{nG9VMhZu4ddZ_Z+la;(Wz_dP5ZF?uRJRd+YZxR+fe&V61YF zQx{3UvLjfZAboCv$G-^l`4{@@%%G0^3W!Q@q920yY_5PS1z_3nhUht1LYlVOh6%r7 zsQ#*?#spZOXrPx5mbk2&NV_zDMMp>nRf7Ej3E7TE5+4-7`q(A3uV7_=QL`k7DGE|( zkoAj$Mqp`Pw|fCBwL*5h!IF5PQ4cH$X}&FjcFEZHl|Zbq_?{XpH?AklgY|Qfcv2v~ zCvS~h0##&6;N#b^?*?kVj$r`0I|s~%i@Qd z_9_FfF8f;;p=RUoF{FP|4$KBvO3$3Ozj--LsXFVb+9-Cpm_V2oGbm-_8kGghfSbe; zaC-r0$-}T`KJbI}5CPo`3>z3^^sIjCZR+_Pt1i7*jC{r6qQE6rzRUThaufxZUHl@L zMU;Dbw4#h1x%D4MX!pLL(E9W`j8DKI5V;-%>VNGo^7oa0xqgPr=k>w3QVO!pQ2bxZ z+df&yTW8_QUB6%J>lK2KBXw{-&=Rb7oX~`Bc}O(^>qnoM&b z*8v;W2rr0vW3MCcH1{04kn=tDg!>#=U*+q7w0D$vfZVDml01=e?|?bv`iQ&li5y4Q zxF=Si{@3Tz3Sgs0T1*~Hm6ImKst^1=NZ^6aC6t-UqR zwo6$owuJR5vI2z?Jy3o*4*Z9}|k8H?+Js<>N0CX#dLnp;0^n@a}i)kSp^& z0H}x#M_e^0(HHl>gRGbS6ndgZ8IO>{8_97M4wadGn%c^^bYJESh8_|3r!bkB+vFum zYEcMee6za{&@Nw?M~luOsz}L*ivvv4S7TEr!Yng7h&6@!jn2>lj48GI)Oj zp8>$dga0oW+!w${5|F_KYO5h+^rZAW=NU4#Q`0N+9WPru1E{G00YKn04and<4P?+S z=#v0o&=yn!Hz{y~1D|2Qy$56-fWZq`1YnYPE|_YuE75N{qOw*`$GXRJ&IX?666ag24jH? 
zZYbade+^*pLN*839$@&VkRp3wr{*dP_`;KHyOzr4GD|CG5x9ULP-hN@3wjBdJK9_E zySNCqR9dx!07`HXGY7y*(7d?{K-QMcIhGc1N4TRIzX%-eWi4baCU$E^98c z^|EaVfm^^`e(qu?V+FOD4QK;6E}{Tm1{iFk4!|Y=gBPkJz_$PfHv{n29tUtTz!ok* zYynr`;N;a30yM|W0&eMGZ^>`z*iz|gXATG~g}p?aon5_JDs7xC?VEFKn{$L(DqAY8 zh0L0(Y#abHpvD^BTxKI|33kW|919C`Crf8netwIVN^=oGdvi-6LBJ1O#M)7$rP31c z>H-cCygA2CP}r)u%oPYG;srOeZwc|T^0KyZP5hU;C(vLREGyk&BjEAcy#h0a)g^4L zb7is>Iy79>TYbJYj&}DlveFu>+qa#Ns((Mc8H20G+cAImaw)G=`5o2n?N8tc{1*u8 zo{MGGxU_CzO3APRCYh(+ZgNgZfaPU*?|dr!z~q8dwng@PdwbT>Pbs=hk1D=rQyfOb zDXD%qkf;h$ob*sxHo5ZbXdUkr_Ded^>Xuv1raV|!5h3GcZ4c)%K1p0Z#%(E^S~ynX zqjxgc>X65APjCwaTcr*Yol>8`5n1;lX+FZB{Ua}3_QZrl=XXVExW!=mYWRE%dUSF%@FPVyEfYk@mT zHK`6~=>=lBd`tfm_uHkeTv}tE!qalc;ya7}Zyfcbzr)WEy@=)16$=nPo9^RVe2-zO z_iMBI;VAa<`l*g{yLe|;`edAJND`H6nlG4Uem7QsecVi#5oB{+sf(M^&mxobAZj&T z;HE*{_BHjQwr$KPHaYA#iNY(6(c-+@Nv}(aZp9FDugZ(s)z=$MD>er{Vyk%-vBuh4 zsP&w$;&cpG+b2(|`_vW@uM`K|tguB_m+6CWPpK+8#f3_RPv}VZl=$Dt=ah#>d$=gb zE!R%Q5K*&Ak}vC1*|mt1+Z?Pt6_0t8BD7(b80|cr*(sqL&3VrxposLbnJv-izSzzB z&#A{@gYdS@3+7j1Tx{3~iI9(3Fsj3oeM2B+`#5adDD_MpsV#?KfEo!JMw;0^lRVw1 z5>>U9McsR^!g;G|EJglrANB6SoS42!dx_TlAbur4kvB%Vvqkx0PS3?_XI7Hj1jz#S zBQXcxWDj-DQFutp=UDe*#rrfDf0*0QNAF^ArEq$&*^UQ4%WT&Sy!)E1T!NPzonyYb z(u7oZtEg^9dOF{SHk94Vwvb|NrphZc;*MdbBpAqaNFH0y&Sz~F`nmZ`wlyvTscl0QFs!8R?sN^G` zSO4$y@IO)hCyjvcfTIqd#Fr!i)vT?hpf}AdTvwWsCGreZs>?7;zARLJOy3~YN9U{{ zuqJF}7k?Ey{bZ!pm_Rq0ngaVtdZ;cF%$FpaZUFnr`|7uP_EKe%#NCN)^&Ql?TmGXN zj*&ad=Y$Wagz)(b(X%eXo}vfPDRz8icwhRFW%rIz%jiv`e6z#YQ?m0fehBeHbU_t#FujU5DS;;lwsR!`7`-X?y_rHC#n;+^TmY`6h zmnc1OPU#d2HlvJEAenVJ%XGf%+r=$^(b!bxOWA7}B9qY^{Q+!DI#i>_=#9o9BGw!@ zy0s67S2^+Rs$L3?j*}^WL0r?mcxP4nn|+*x^t=Z}$t(%kbw${-c*F6PdqwKZ!%=F_ zD7nb@<-Uu&l#JPICKI30xvC};Z{XNGXwOgmHlZWM&x*>Ww&!cW#}>!BnUF+beFB$P zDh>9Lr*GO@ zPk$Lhi(S}xomj5?`0iL^UD{q=Lz7rv^ZCoPDF7jH_Yl?sQNZhsDKE!KDB5n$cL zIajz9xI1s{vyvyeZr(jJ%x_sqm9y)+79;ZIOg#{Zc36@9MofT$o_#3O6;*XB+(r}BERi&PGApOzR=@{CS zK*uI(b$#kwp`cuHPwmLjEj|N|@yD>t!|FNv57SX^l}&IUlfWApQ8(nNM6&Og)45H11Xgr|O)uR9&9f^1{0L 
zic4&XNO^vK9jzxW{mgp_;XIP-Yu68BDk-lTxypAQm!r9%y!x?C~50jN#@{U$i7-tA(|Q4vqK*Nmb;SU2U9Be{5}Gq*mBs!rW7y=Xp4z#O179XlWgaXG$;lUodnnmLzyAtCE?Iwk04a?83Kw=`7nw62OADBnDks?HV4kmD+DnmRT=K0K zcZ|Ey#OOFt6o0nBhNjGt)hL+e~p081!nyakA1-& z{LN^U!e6I<4B}%JW6${++pL1z+P-E& zlNmg+1ay8YTlZXQWgVY><_liJt%n>7x5l>~4W}7B$*5u>A334*3giMCn2_=a+iun_C`VSpt_noY{p}IFCj}`gSN_*>kZArYa&%4QT zoUR|(KJt1*Yq00`ib{IdwD=bAM^{~#a&CP*jew^OX0{AgbUM8%#BfGO z=Q#ERwz{Ha%cHO3(pn0$@MU^|!)5i{T%lcGE z4fZ^Xwx0A+5dOr52DwtrlPA ztPh@^Cgxn(j^X&6;kG}!gMaR!+AUG-#7TIXi+u_Gbf4s<(>9+7&xDqXG=w~-VESS+ z)3WuROOBf&)rt8w<*cIjP8@G@>$e>nc^X(l&7AM4D1w|>i|hFXmrFT6Qr#`V6242t z%((AA-PFSJYWFTIh%mqSw6anoj4v%*#+|D;J;nNsT(x9Dx`E*x{Fk~K?z~NrFt}r0ov`F$ob+y! zBaZH|%$QR=#angB_Ver>@j?p6U)DN`g{n54(lT^twcQj{uf(E`04pw@t%&FBa`Q=73MB;IF&tRPQwVme;ZMC<2 zS?Qy6?y60^)C}HTGKWlHoIQisId$z4KEw5gbPeVL(RYAmn7waCPjd}I8r*A{Ke{`H= ze5r<^=A2k~d`X4S+qTd_-i7R8cZ1VS*_Vw*=PVJNmc+i;1?kM>mKjQ0H$qZF#uXer z`IR>JWu>0sCwFjfcjuN(tRA^5>b>w}ccZ&1n%OonKe~!{{M3&-YuVlB2dI zl*VcbUd8(I&La{9ZzTI)vE>e-MLg)ziP}`!los8rqA&9wcv1SWeuf~8e(_+A74ydx zJI9ex>V`DJfs%!DE_jT^9KH!>2p+eb9+xb_^Cq)W(sB+CI6^xdju&;+;!!+Usk=J$ zzVu<#x|oZIm6rdt8=4cR7rY!xF1kN`Jg2~-M;^BqCr^@Ts`5z2GKDV`^J*Hi3)|GO z51Vns1a5fB0QVWgGn;|4te;JVEIn^IBJe1{cOn0k{0syG^jF2d&d(r+#6T(Y%Q^Tv zC`i!zSqPJyRR26e7?M+?BoJmqVJME+Ps@`N2t#sBlpM0xN8$6RG6+>d1%x4a9!db& zm!UA^7y08cWDJ(5pIsrjCJICLV9qB)yze*r4=4=v?`L8FW)J4af|Q9o z0{J8usFU9?B&Wta5kO%WQ5f>?io%c=2t#tjZ}#s{81iom{xJUwK{yjthl6_I4aFxx zVI7p*7KOP{I2nae5C&mLAIh&lz6b`CWN=P2A^*xK4C!a0upG+aA__zPAq?rufG-&U zDmW=%>nIHQ-$P+P)H?G9Ur78y5QgM?;L8etAbVtT7l2JseoVm^8NU#OAvq`l-))ei zbns9mgi-LP9O~YmHOpP z{5$!d{PXpqdaiIX;>2m-n=%-TW9h^q2nI_+E(B5-C(MAPPuEooDGv$^{eF~L$9vvP zEo?*YDew2SPg9`4g&Z|fK;A{C;N)!>Owj-v)p-mq7_74!X%3S`=7rSyfcY6u7`D5c zMb)QV(JpM(NY-n_#NoehoW(m9o8_Q%Jmni39@*wH+7HbTEa=1o3|428)zab_bq~Zz z?701Pxw+TTC*o0W@=vs)c`h}9XAShzc4;Q z3C2U6iCuOf_>?`jMkz(fqx9@CKemp;^?i8Ubn&JFLSyd1GNb&0jnmU%g^h?{Gv50l zEW$HRqWv~Db69g{UK_gnyyB3~q4(TK02~Ouig|^fUHcV6dF`;Q8@!TPWLij_C4f@vGgszOGDWr8?Q6 
z+Bgzr3fI|jCNTvK@TNUU4rc$_aDnCpf|l2^ws>m3!pNg5ulR6OM~jKX#?oF<&h`n3 z-Zul~5XGjd1e#@98LZX%UCyHVd;t#!8+?5W%TjtJ1eTz`t0M|9Zg@MFP^hT#S?$Nz zyn}HZu@CayMPARssqLE341E^$pRYQfakHVt;$!g;%yAUMXBjC{F(+?9zk7N>Rg`8) z717Qbupj^T^#}U@mCl(N7=W0tBLbPH^9|8s4PvSJ^T_$@zqJRx!v0Bdgleik!7nE= z-hP6=z@Miz^81V+*xOs+1Ir8Bi_T#ACxXUuyx|9O3nkVJN5-pgp|AmMdaYKF8D7Kr zR><|~%Izcv)@Kq(=z*ocHuF`m)aa8#UeA-FYlY(#PU#{{$pATmRXy8L3zF0EsWOI zHZikSR!U}hQ2dNGLN0O15qW<9-bCvR4LR~0l#N(qH~aV~%WHkWmqYJl1oVhneTPcr zjBm!sKU@U-W`xPh8y(AWzp!s@c!JZq984y^tWYO!FWDtaQX3{mZ>T22&&Sjz3~z3o znu?OYP>LsS&Y0EuJp&;(UHAVy-(dV-3H`zy%F%+rle~@Y2$@wsY2h0zzA|8X#*69) zh+lQVb7X$Wi)aLprt$oZP%y$5NZ**z5B;e(+Ow-Y>wXte`OdT9$^9UiA4A?>*ftn{ z%0V9>T_FG>-+C+qWOc+iR>2W%jAih}d&x5GfEmq*pNt!rFhUfLK<2d!JwSlzFGYe6 z5y<=z`gjB~AB4~y0c5)vTB1FWLB_jw7cr#~o8*k{kDvw8VX*Ln5MVltp;C$rkWC^j z93F)LvQVV(lALAKM1c$#zXSv_e@4;!B#&k$S>`>^Tzvskpzd;5^ z`&R~Y=AqmSuw(%k+=#&EN{E3|7}sTmPS@lZ(hal2-f;tu{+H(-Y~Tws9G2NEN8fXi zxa?-n*V)NzZQ<{4nQ#d-00@i&w(2y%X8>jZm=RzO6odYG00u8aF@WU&Rsa~h0Ki2I zvNDQQfD$AJyvvoL2dV^P2fK(H>DW8=mRvpFbVu1m2SmH7QUUnHCRcJbXBXvg?i8lE zaeJFfd^>aHJeA?P#ehi8r~c9^=V6C>!xlCsFy{m$x(L+${sbbA;IkHZ(b4=1J_po+ zH;CUE+;qf$VX!YC!JS$vg@lD&EbS~^ft(ZJ<}wkuT}w!FnVq1yu#>$2KaimT7iz8& zw6hixvN1O^cNT{8+c#H<*nrt7LKe+A<|mGTBh)Tx;G7ZQ^8kaJ6u5E70(=GFYXE~A zBxq$1FdSeo;VlB-`v9i_oC9zHz#v(=3OQN;X)ydimJD1#$i>PP-dx2m03_3xc{#Z{ znOh3j0eLpIR)PY;HjY9qA(lc8{I)huK%xnlsv;;LC~WU)EdmGh`GtjT1kGGUY=z8i zT0*SNfOHgBbHSDnC;sL#Yq*_DbCrviuss~^40J39NNf=Ta&XElg+%OutQI>#o0du- zp+^W9p5`*3!o~)O02i^hwQ;ruE z3Vd(f@2u*gIoh6hRKvp8%-F6D-`aNGeB4%nN7nmgR=I_bBgJn$vi97MfW41xJ_I$x zd8xb-1uxI|ye)Aj&QiY;e6O+jDuLQZQl2Q-q=f;@(`%>0m{YoGg0`imm@*psbx|wl zdOy*zl5k8QdDR~Fa}CXMK7-l5fOU6lCi4d4vZQJX^aqxWE6Zn!dm3yIsTSXv?v1AT z?SzP>PRV6TTUAxEG4Y$n95C z<|HxSn%Zn~tF5H%?Q7ykU0YsTQ608Mti7x;>dOyP>x^DSbMH~;I=L*rHi$(3k?58{ zPASV?5{rky9c&oYC0}gXS4=gTUabR8oL~UBbHiY?M0@!}pErYUx&+|l*p;Yg|MR!ZxEHy$ zS@QUnFV`EG7qBtNUo*9O(o_RCD)f0yJB_hOENAmM>g+<4ckzRb7Fm(gq1uvjmh+kC zDg?qOi&?(&xHyfP_o;OwdhtjGlcWHk3oS)W|dOpzMb`WD6Ni_E9`FQOg8UuHy0(;F8jKOisK 
zA6}=^@r$r0(e=B-;iHaEL6JIg>y?-c{}frdafGwoy8y%t=%KL4y@=or)pg8G0&gI@Wb*;r?f+1)TqgJDNZ!y>&QFKPT zS;tPbAY|rY;KOcovp* z$>uSczH=0Lw+8+G7*p7u!NHQ5;c2Jl)1w}-90$rhTgHXmui~o5>@@rrzYZ`e$`#5a z9WeE(a9?8My~{I8p0Ysw#~|BSzWvFXu5dZG#m=@s;og zrX$oR67+A2XE3%uALiFyNKKNGL6^rPW&m4i&t2%mJJ#<$lr9y2rRCQ$@*HJYXCPmfIPHQ01COPx^v3sz0QSjCNOP{LI8_fk6 ziYrMIrj^!5G3hv&+)hi7BCPgei7?$%1^W9C(u*`pbf4Oj0{eYTzHxpy;>^KHY_~F! ziOn*g&wP7BCw04)JL5bZ-mQbF3QAOS+yi@8H=K~l8YqwZqjqxyFZB!c5AGD{=AySDe^-Ei4-K8a}mOAuE9o4#7-uWdo{`*)KS#WJ!!-FDu+J-q&1LomN zy7e#Jw^a+OHfG^C-&XeJ^fvuWklxnl22KBKaeaTI5g4*GgJ`|b*-oFzrKN^eUU ziFLyq$~n6+5p_C;Y16%ra8J(Fv-L&xV!}bG_Pm2g?aS*`;S0+p;?Wcs>7fD@`U;$ z2fAxGw?)xiqH!iIgOW{Bq4hbw{Y{QS60)YW5;aUhn_X*|v3zxM?u{g?0OFu@qrs07 z&ms@2+>4F2!)2?+Q)1SQrkgr#yT5&P+oqn^eNjg8%;kZRq~^Osqj}O(EoVI#g(p0^ zJ!6uFj-@rNBw6GHr}&lA9?q*vlOItwPgvB(k)kUIf4{mug`+t~(BA#Tc!1nd+2BpE z;?R$hvTvlxr#`$Q$}3f=k;AQTZ2s~214;hH(c#uwMehAF z%DfGEsR=Rp4<$SvuX{fBsF!cl zX#GpL-2*`~R(0E;51&7{mzcn(2iHZyZ@2y zVRf>|PYz%8r7QE8j%wK&nlNwJco`z+;#s@VtkN0cFzLC>DtoR_hqyP5SKltcJyGl6 zB~6k#_mzl3R$7fvNx`nXU58GIOY&?(*oeK+kZ~ts2Y9j0)rw@JA-3!@J7GIPwY&xr zYzo7Q7FMMpt~j*Y;=0$0G92OEcdYK6Me|y~6sH{LUi!poo3Z5Iji@YL@IHS0Fjo84 zum(=r`7z&~rMVPcK3aKN*pa%Zi8_VKXx&ItSL``11QBN+!ll?m`jy;g$@b?0BFd+p z^|TzOQk1h9Nvj?jzAvPQUsjADXohPzB%3$9UQwJVEOgtW9N;4^#^-(Du~bzW$`X)m ziF<9;dHvy8|1Dgf59u-Qv*sr4LuH%BXEda;In~~!C#UvrIxsqU~e%#y%8o*CsXVp3JghApQP;60R99q$yGCAVGC-TLm;qhq+0mKZlbmQ%x5&zZSK~hA}3tCbk26{&VZ+*y3*fpUqm| ze<5Vv>EVTCd+3x?J~OY0R_?&L`J%GqbZukIq_AnkDT=t;UoVB{NbCT?fYr9Y)$IHf$T@@A{7R!0f_HIKx%P`WWpOBuMDrJ zH}dEd@PoP51TD7KT%Sqz6V`4CsF2CogdAsbyqC#6x<=%NHrmalknv8#4NI5_Te~aA z)U%H3Yw1O`UV4cEPTSeN%m)HNWh)QeV{gjUvL5O4lF<)}vVUF}b}0mYIIgy}_-vLJ zuKg8l+GB=Iv%cxJtIds%3-L{AnCx#&9)7w&ezlz5$oH$Q8&fpwD6)DGC&t%kisTi( z#?Td|@^1`77E0K>&snZaTA$^ib!qQ=fptrTi7SRf(rqeogv%zIMvn|zFZD}zj{0`3 z*a}a5??6cxWF>SS(z{T=s-|}d8eia zcd^~TVsG3lMseN}d?G&1jHJh}uyvz67Q1L? 
z->5D%G2$^^W__qj#JqOM6=1kVF#7DRk!!j_Tf)E)!zY~JP{hnBSN2F-?i+V|Wb&%m z!!QU0zV;_jR}PB1D{f-7G33w^uHDl zIr!?jyJ&CQB=)o1&tEsC&ib&PZ+xw&`q1`1eO=bFiifg*Blg0uknvTI=T&v2_{XJv2weKi>2<~($`>)x&q4D-tN599> zXvV*G>qEj9i9pMTmPd1oG8Y(lWq3Fk!*jXe51P#6H>ah6y#+r=CbyE4o z;L>cOfuYeSis=Z&BhN$ z7_MKkrOh@aew}!Y_?vU7unOBskkWp-0LPbgIkbr67%v&N?AQSxIF57k>#uZ$TgM}~ zLkcy83OW}P`)}Pq3yU&iT9e?u;;XBcX?7=|Xbb_Kp0&J-eE zqurIeC3YrH60;|1Du@NrrktyifcIy|;BzeadpiM;9RPmqivU0;`2ZNm{{@u+WEcR0 z?=PUd78xu600e!vM+|(a01#CFSH9Ru{mBsz=@ zKn*AuSU=l|0EmTdKdyrcFxVM@XaW0P92m?GASr+V7cc}i3>F5E0zh~Gy~l#VA^}nd zh!UWWCt$ElfGh#x2521<1}g%{6(C`N_Ap?uR)FpTqz@1iK;wYx1c1~50$ji<_mN0| zk8QkZlfFS=(XBTH@4@XBcb4M$*4~&2K zAUHhsLH{QokOM$2_5I)baRu-}0m#8Q`bR$`{?-k&kALb!zL4GC;n*zMS#Irfq4a*v3`%y3+R&r0`wIBI=wJT zlnur|Y_NcK(qC+#D3Ql8PW-;6dcZz~!7yRqd|ErYn7NAz|7|#a$8r8)cMoj`&cRPR zC>YST_lJ0IQFa)A#S;?zTfAj-JSM;gI3M7CJT5rEQ-T9N0zkn)JwK?pp#3j#U4eB4 z6fl1uXUGpEQWg+^^XXyd{QEFrmMA-nf7rc5+ktZm*+JpP{}5{xuwnWo)^9_E_5L9i zJ1{q(9Te-=p{X4G`?w4N8*mMvaS4e2Z8~96D7$}-D;{kJ#RBc1aaH~y7PxkJK&+Dh zfphWu#?M2?VglsvF=v{t&YjZ3nIi824Y66y}GH$qLAST~j<@mGEG= z0HF#7$icJi*vHQ)2K-W@bZ}9tN``*#!Ek;*>s|qIz&@C}xH$fO2ZMbBec%Amz?}j4 zz`y~YanJ@jBd~xPSO@!Y?B}n30Pd0Ez~KSL1q%_38t@3dk%>@VVJEO)pg&Xag@7sq z044(b9lx{b0msnVKpL`81>nG|7FE#m@e_kKzZM8X`e013K$r=Ij`sjq6-Q%;K^WX4 zV9`2eKkfC=m>JE3&>UhAhU9r@0TjOsjTO**56vM4VMs0u+#kRK*{h&& zaWf3UkUR)2htwj`6=-~9SZG5ugD@mdM#~}lOf-i4FaBl^VMxvo+yTJ?*^8qw@(C|ZFZxWj_wXaCR`(hovo`(riO zIvS(lPYmgc;~h(q(F(n24Eg_v#`5TW$Ar3YL*9V{;lDF0zFANsjF`tAVbENt4Mhgl&{p_QuLpZDxXm7}e2_B1KL4o1E zwwwDGb(Htt6q9eMsG-}RL_!M4K&e^JuWYFIj{n?oYYKYzYS^K*V6Y%$U=AC0r%*Ai zsMGuxt}A5Pj?&FXvjsh?XJ|`rtP?pLYrZc1dEf?tEj2adiF5jDZyX7eb4Ttsse)~% z2#B{#i@f<~)~`G=9#k{BD2f#iwtK?u2SL8s$x}XEP6$21LBOjsaKET?`nIa+VFi~i zqZHfL4ymK0w8xcI%p1#3oNL9Jd*sV1h&U6VSEJ}`-+;SKulGTLWgQ!954d8dEM@L zkY5Yg@di2RLZcqYiRr#Af_BNc_mx4cx%hz=M;3lag++Gmc@-W<)j{;yj zRDfXyhV>6Hc~-ykHuZdtQU{-pnjN4bE5VEJI*j4DB_E~j(XGF zbL>Q|_tX=fvtWCbuLH{7QSt%msG>^pM9IAa=1}`1?!G5#9$n*}Sb_RqpHC}-Ttnjo 
zN`Kii6?G=uUjE<-wtq(7R}9^mP~*BHSAyc846oCmmBguEp2I1r6~wl7Cs7=%|EVna zy0>P3;BC!t32xEff7v?K^Fc0gHcnpY*`7jh--ZImh)`=!%`^Eg5d$rK*`@L%JT9#t zSZfs6B61XF)2$VvnkwW$-$?c2_44_`E`T>4i{LcA-@L7siio;o`nmUYh-@G8*nU8> zxojpK-vsMdcLaw3UJ*T>7eWpYGjI6ImceZwtXzpi{N7K{Uh8@WqN@`qlL4~7%Lg>f z$v$!?{%mIf!q~@zA{Y!UFHHIP%LdxNa(`$Pj{v;;AsupMzXt#f(cy?I=4AR3{&!IA zGM_?^^{C?!(gedft|Fnbvrki7nU?O$p2E^2;rSFMJ9C?|L|Huwfr@W-7s2x)eT^e4 zN%j`#_}z7sUU5jgEE`VnBl*QpSs}2D1|hzi{e2&U=vM;YZ{QsO+&qN;Lg2XoUP(X# zH>jHCtXNBIqZ|)OWwXuJ9DEKQz?1D^hz__)O{*!u1;RfCtP` zF^3wB{o~P=Q~qYgkO0D%vxBS5zR0*_Jf)gA{>GC&qCKx{!*;K#|UB?Rb>nFZX^ z!QN89(y^t|)y^CcSc-UwIy<|1wN%N0_%9TkdoGq$<1)I1DJ4S&*c6_6 zyU95v0hX5(yz^-Y1Ct9<*%vwP?d@61Jf-R~J*xPgO?4O%r>yqzK(Z=GY0^V=+2r!G zqjiE;xG(9&s#|V3oATn|M1+i&y?Hp7@k#PJ62GNvY5}Rt&){UR)vkc&p5PWz5H)S5 znA+uBekb;sF=W3zGSbkoa`N5)&ftIT2dX}(hJ0L`Ii1Co_9-~ zxu=bJ3s1@;C3Y75-#Y3?e}|tUc@fK{D;^+nCf&!k_#Wd_&(~&+!x7x&^%L!9cL~m{ z^vXKfkR__rG@m!k{BErA8re*o5oB{+xs!+5&mxojAZj&T@TNiC_En9dH`~}z?DDvA zl7*KYqb2yZlU|n;-HIXQSyd3TtFJejR%#A>#9s3%VvVh*@Y-|!ijy(iZ$5d_+^4mO zc%{_uW`!%Zy37!Se?m>kDK1nxd_qU2yTt!aK9>SK+QUUrez|rshJ=<)igH<>#;!$z z(&k|8sYJ}96yXiK#AxT~%nnK2Xs&xE0Y&7G&1^|V_Qh}3e@;b)4Zz=Io;SZ7<6^^3 zOoF;*!DtST&kezpZDiQCQR=BYa$8QJ0Ch49tTeNIW(E2YWt!@17Ih!I3g@k=ag+qU zjq2TnIWd2g@e-^1LHbINDsPN@XN&s9oSuoBw*E}wd$^tylEjn0>j?`k<;YevVXD;4 zaw*w1fqg=rn{2$kR@uvpBlAZdBD79>cSSK%&zgHj=*mS!;EV)^Htb`2y)aGs)WV$f zjxcL?KS5ukYDXfsRc_Z&=k+_3zvWEuO2t1_abF6!@ z(tWxMKg?|yqIa>lQ@Fg?ZO4P3WwvPr-hItpF3CrU$vIzLX+o~MRa7@4Go9~qI+Vl9 zwvcLVrphZc;*Md)rBwn`T zjdNjcUQc=s{eC&^pq>1Ghv)TOr!Ivp4i&IBUZ8(U9c|_&#)M=a_kQ&@@$8~eBhn>h zM?WiEfs)$%dgG^13rB2hzMXdmYaIhAUF7+RJ$-^A23rHBsX3a>Vs0m7ZgMsezGAQ% z)uv+`5tjGb4Frh#pX4VDR zQ_KK*rS`9kA4*4AckdXrjNCNJSO54@n{Surl1JN2pmtD+mMgP-;@#8qJ#!z{J;x{5 zafVnDENEX7)ZCytBefRb z;;QzAJFD8??BgtC<~^uNX2~e7E5V+{8;-BsE7D*Yic)_@%}u#4|6TN@RLo{Gg~W`` z6?NHo1IOk8djZ;a3GFFH-$uRRwcpGsb9ZsU{K=vTqNT>M3Ex2EKrZ^t%9UL0fsYi6S6#MNoczfa^s;fFf z&r&jX71iCSKIt887U@ZM$Ve4ORZLViO_%hoO;LXqa(~n}{bdXzcJ*3md;BSOwnK^( 
zo@jyzu9r0HcWY$^7xTaHV>$)7zW2h)5qgew`r<{n@Ti5ydDG-;T%GFWQ=8i6X`F-9 zA4bhoM=->+_&3yWikCVMFRt>p^>W?pIIMl19O=lyM5|{J7}Un;-}o#a)5`B8Q8aeF z28)WaB&1@+_jtgh=EpW=cMd}2hq4MWYT*j7ej#C8f_dx*Q@N5l@ZP<4RQ@K z8Fxa4>u0Oae^tXwT961Y-jqr!6$v%n{v6s`tnbJx$hM1jws0$Oci!4(B~NVKylZAi zz_OAiXV-mCE$8S|;7R?27yPrVcSsf3yWfTlpIXg!Iw!36lDqhJ*OS9nW_8=VPQCmK z3pQ4gtdFugD$LvnH&!<#9wG12@ragHrJi&k|Iyju7}}IT&n{+led=tXkbH7??eNhp zegn?&$FR)9>N)$5(^2nKOz=yRs%F&uE$69sglh-Cw>uYna+9twKV0Il#mISdWKf_% zM?DsQIibK3qu!defVnMyDrS&5W0t=}`P}%q5xi~9-q+1YTMyTb|hR!nyg1TYQN`Wqy7gqdPAB)CWnC zJhJO+*AHVVsjnEhDs&*rG2Ae6mRF-y3Y6#&91AA5sJ!Ly73ebdc#i_c@bhRdSQ4|> z!dk=zhkb*jD{{=PG)|{Kwze=*FKn@L9vXNMsaF4B37__Q-H$yHo+;0Byj)Qd@>VXV zPdgOPl%DrLZ|GVqMf6xsH8Cwk)|NrCC-$3k&s>>;d`z`M0d;Cw02VXj9r-#hTH7_b zi#pN)(pdQwE*!=#vZp+j7O*c?PO#6xJYT)Em!`zJ=vy!07r zF^}`aaPs&|-wHSc&6opyc7s{5NRh0@p7S%dSp~VZz0JfXGXxX~m;zR|?zyxoIzD|Y z=e=cZjc+~5O|XkQ;$XSIpU8ks#1MBB%aM)GxNd{QmpUs`Z9dt9p%t9KPJeA&d zggER@h%KnxrQwykk`94R79kFK2G3;myQrMY*)n);eEX5-;NOezF{|^9Qbvf*#2l?76*?vfkBe z{0oF5t1iqrw?>hX_MFqf`+L#DE`%DL{=H&q@{}*C=H%+;)z3>eJn?+DRgeK6B6Ish z#7@ywKYAuFr+QgMJf5aH2d|=zBeQH~%V0&P!>d9ZZ)9YSb5C%qGg_`Z`a1q;OA%H+ zttBTl;bu6JueIR2kL{Jma#TZl6)>ZYM|(GY?69}2PlVLq&a-Ok$s7d{PHZ@S`=Ush zFF{X=HGG5+F}oxFECY@|cU4v^<-E3u;_c}zp4)x>rB0SEVr#;fMx2t*mYgoUps-0Z zhsd3Nvep}kNsN~jPiK!0;BVN~xp5?!v=1`g>A#pl?ElR6^Cb_a z6dwAE4z~B*KYlgxrr@*eFnD+3D}CGI{(_w|et6=)t$F8w+%R8(#kZ{d z(M1m|VxL;LC60e<*~hNe$TADk>L}+ivnaENy8^M*63blm!PC>ETr1l#oS!q?_Gfnp z&puSYC8nJ?2~TseFJYMOmAZJ+<`eO$&~nj+kmpp)UuP#Lq@Hu@o^<%q}?s9x_A`>;yjmsv2upl=wCbhEYvC~bD za0jFds*Kv+iE!_5Q@LBs$_z8pmc2t5*j;;yIdH33&?2kwh;)T1IX*HQwWTl9;8mliL{FbOL3FENr|&xjH2SxL#>rn@;LqZ+Fnfh%mnL9NXhP z-Kp-N{;vZAm0l}z>UB$gxsi;K1YBJ+Sm%6g=lMcg?JZwc^BKwZ_?eMj>>d<67dM|S z;>#^}(f^#gS`#1bf{H%3OdRfr$fT#;Ww*%ywQR-)p}Y9E6L$}~D6$*77^TY3*qbm- zJ~`B$eN&p2S1-n|DG|D=C~Clma}V)`W%aeEZ$r6%bevRtsivXUoOpPANrmvcH=zT3 z3)w^N1}B@cFBy%@St7VBNqun((pe}iGnBV(grtUyD>`}#C~xk|Nk1b@Zs*zV$}O8% zJ#tsld*R99Mt?=ts}s+A=kfxFfUf?>kwaSE@3%~)Mr=!{jnx&siuDzohb0T%O7*>B 
z&mF{wc+ja6wW+)*Beq$^Q0CwNqV!??3{e`x;=vpn_K(YUj>99g4Qa&vB@1U=2$+gF zeG^U*J#INUE>%R}O<|>c%{e&W2;*=lUd;6xuhPLv-Ib{ir4OUl#a%?LuK8cRp*3-G z!OOAag8S3QbBe5blyQ4;3S^n4s*hwXQ}{!%ucWcKuuma<*o`A5@WWI3c}^Lg+63Pc$K|lrkAN{|SpMhY6{;K%*{0vG+3~frxD!2*4sg)rH1^Uq%hLvmWQ z1j0;c48;-uX?a|MFeKMP%OQJxG(LysL1+#!2t)Ebv;eX%Lu1G<>YvAuG04&Xc7^0x zXbjndIiFz3MExG;gQBY_2Ihx?B@^`v{0|fJxB}T{qA@hSS~Nzh!ypXFdw;Y4h{n+P zekKM$4$y%h`8hO(;`5^`(D;HN0ooqIki7RddoVv1ESacZAfE&aUHJ_|a$4+T0ThM_ zjUoT8XbgFQFeFF(X8#_IA^*1EKjwca5Y9xm;h{fxL-EPbSO+b)MPnW`PDW!igh3e6 zhw>{>AA$iTS-fLS$iE62L;9I$ERS}$h{lkA2t)d^;710a6dW_yIvPX%_t4l6z0bVC z4-&r=2t)Ec@M8r~AbV7D7l2LCeoVm+8NU<=Lvm0Ae%qi#>kyzhgwgP)92(xA83nhA{ zfV_(ip~>4Yn34f5dhl4>Fjz+y${Z$#$_uIW0rNASFm87>i)l=`Vw~Tsk*e2>i6i{b zI7@&OpXH=>JmDJ~9{I*)qz_slSkSQt7_81FtEI&=>K=%bIPm-Ga&xa^PQ;_X-+Gy>EcaAgy!6XWhR9M8>gp3iW?EZW_#QJP(=5Xdty*70D z`NW}|L*Kbk0Qe#LD((%|@Ai-WZK-*w-P6H$Ii$ywvvm`!Pg&!2f$ckM&Zvb; z@U%wdhuBz*qxzXsJe&v0aSIPo8w&9a>pE!nopdh~Y%jQaybobW7$@+tb1ol{DZ+pd zY^OhgdNAZ#u!4w2-BIfBU=T%|O)&!{u)Bagk@0h%0Ej>#h)5d9SGa8fgAIEK#^EEk zEI}>(j6D|^EawAQMH z_OAmM=w2Xbc{OW`x8^I1GP?4z4`+3>xM*zb>B}nFJ|WTjW}qA@v8k$oW|>w7Yqfrt zvS>b^C&0r6Ki|T#lwS#h9Q1c(SP{kpZ{rpY6;nB*{TP>TAdWNkLB6}_>sdIhT@!|( z&!YbGRp(P~Hm7m;Sv`bu9K{J)hl^CrDO)h_p6pi>qgzr#w6O*3$Nzo*f&PCjXUz-@ zKup{ofy&eQhUm5ivGn{o)cW<^+5S{m1FBMsDKcQdX&uWdrJ`)J`_7?a- zetvt=8RUN|&|Ho;{6HO{q*~#qcoi;GHlR(9)e0)ZYbf6ewLe{XoaDjwOd?4=kPB|J zTmiXeuRQ8`o)lYC0x|bBerAvh<6R&F`8D46;Oh+bmR<|B16a)$P|sIX)CrXR``3P` z=W#;tD(ZPUbVBbx&MAbprutj#WkB^GNwU31Im>o=j9!ab3V5z+E%@~D;i-6yZFa-r8q&&|H z`__ghc&*F96bj3Vbqe-UopNNgVe$-y>aqg-%x^^C&8<^YQ3~fv2^7qkvRc1qAmpd( z{-4(yjQ?xFJb#CJq#*D(Z=)+hcGXWtrjBeOZ)(uP;AqhvI@>&KTAi(sOBB6%}RQ?Dn#WoCeZia0r7CU^u?v8+i1+Jo{jSP=xWY%w{?Io{QuqH+%k$4i;+*e}Bt_i=Y8e zz&K#9P6BiaAVz?g0OCXw=${uL@Ie#@NFE?XfWQX;+{7TMph*?rU~wS0R2h1pMl^P? 
zi@1@FyJK(3-R(_(lwEW{va2Q?KuBtGIah0TQ6BG3VVWC{x4GoEQ*b(px^@|p$GXZoCAn=d^4-PqiE(3HGAnDn`4~IJg1IqytTZDleoH9#cQF|b(#ZJhkr4mT!5eBBGxeREqu>m5$MeS{E zoUKKi;LbL}X2<R8&TN0TPEiH@0kCe(e0S2&CJG58mc2PPRlsC zX`)Nb{1We~pN@KgtAJR_<&dbtmC$qBCnA}0chaZiYq`&|Jw1o_+L16NELVNBY3kGY zDbvs0w^+g|=>2Q|rAI}pcwBT>N0wu85_Cr2x~|=j_*0_5_tyQ6s!qD2?TJS2VdcMr6wD5DL_^n6Qp8XN9H~PkhsAecHl~1zZD7{ENecG``(WSS;Ux6~9J#!&1k-=kcG6IP9Hx60I5zdydLFOeIPq&ycWlvKan;?{NFH!3#q%!_WK zZ3>mt_44sIA-D{;PSU;8&zJwAYB#qfUsSlEE~2;GDp`iDn@nV5P5*w%@?gc>E<1Wv zL{u-HIC6V4=gj8>YDsg!g9JK?2AA9Q7lTrlUoZICk*#e`lJc*q&nCCpO4;7NDsj}g z<+T;nZfnHW!xp2l{4ll7=v6e&9+j?>%kpc3NX#FJZV8msavUYG1Xw)5hEbgg#iqT* zG?VGoI^e5mWWC({gHS` z%InOcA;a0ye6w=H^QzmAAE{0Z%71t5kJ#~l{*DFz0{1p+9{=*CdIR$Uc1DG(rdCgy zYT!nNKF?22V=a=(+kB2Xvk>K7{9vO+PV{7`w$z;EeCF8-!SKmq*6+M7P9x^M>RpH) z0WP!{bEt<&rHMc9++`rYC5(IBLv zN*%uSN?cZ8ilW>&!dYe+A09T(pin>AS?1%^6*IL@bkh_AN0Q}=502}r<_dkSu{Y}& zh<2`?6*8t_3aVs@Qe3fNFZ-^!m{)oy_@k-x+zy7QoSe?h$!>Bb4L= z_ieKBPVT;t1o=I#<({{7t;J(PAs7)ORwH@uu-G3_bws;a$DUXCoDrcXX%bneT5k7A zm#~{{I$lQ&zo6HQhyecD!6#szgQpwTT$H(T_XD{X z>19_LYgx?{HPP{DxA>Bp8l6QWR@wD0=diSFgB8UHl?nT&BQz!w^lwXKFtt4&63|{q zO_I-k%pJ3XtmsxR^4H`Bm-;4TVUyADK;Y!iX5&IhHwbWx& zb$K4&Q7srIP1DeJ+CcJ+)XeM0?!n$g!B_e&eyYZ7G#6wnt|U*GR$d>$rsraIJ1I$y zu-c0y!FE#S|nOOQyG{<5Zn`82#0% z>ui<-+-+sO5iXGvbjdd)*Bc}JmETPHKeg4li5+%wFmuLZDA=p8Jm%?0%N*_x|5A+c zhD~#~*cXKZda9=J#<#B~*Z7c1nn`4rIS1D2tSy7e#Jx77-&HfG^?-&Xe< z*kv_+ecfpz|J{L)rZjk44r(&I`*FwPRt9JDl zyZz8-m*@)5W~-m7|FB@TrniXtAHjden7!`!*z-#{ec4{iCt5hdxV+`(pBAOyZJ$b; z|6w*ER#qq7K6FtU^QkDevZuF_JB{)V8Qp5^R2|{;mnXENoS3fRJQhWFNyeG48I)|2 z3$M@d?{9Jzl2J6Jm8fGA+w5Ayj1{Vrb8jSB1&{`%8x4#~K8rl8axXU84wtJMPl;JK znr`Z_?fUlBZJTyp_eB}mGnWTOQd;j5jpoTuw4Cu^5}EMm@{CCuM9OGcNwLZcO$n%^ zJ)GB&p**5)p0KEmBga$}`F>@63Quc}sIBXXaX+P_iox4prNJL1W#7n?DYr{-BAV>N zo;Tts;7SWOPJMhul2@u)BadI-*!<)3N3#42BSWpVN<8~z)Oi~U(i7qeA4_;WUU!f7 zsJBtPo&BV{B%2C9NDqwBSbUvyBl(GSQF`BqghRAq(QbmLKO3H17K}33yRq+0x%Diu zBFxS3jgV`pA46F$R?o@Bdo(i^7)$hPO#8k48=58Irn^gJq4h7}b`OLo*feZ|K7Rh_ 
zUSa~D9#|I*zgbFY*A|j1t!3=%YjR}s7^7qCjdl)AMT~-+37eBeescJ#FMXNEbX3dM z;DmX@#>)_S7th*_X4Q@mhe^+6Ho3EfI;1^meEN0)?upk9UeYCL@LY~4WIL@HDkapJ zx9iX$c~OCV5ErpG5;E>Y>Hsg+xl)mAG{~NPYA0+bsFu$_l3j5~$-=5M#1-%KwuJ80 zq6|lP*Bz^SXE3}LuqCL+d6qtL*=8*HcOfcE7rc>=AI56m8q&l|J2&Rry)>7i%YRzo zH0(%2%tV7qb);^%sWbL0H-dz#7vWNDBJ)cAvsBx2K~a?x&$?Rn_?jbzjg4L=kz zz%MC95H-U!9g@u(Uau%k6c)PeQTOwc78CM4@K~xU4P_0;w#2`>>b(B&jQ9lVv$D(@5*<}AVdJ7vVG9=O4vZ(8OeCtm zxrP_tX~;Lk!L?19U$IoMC3Z0Roafr=Ac8^ z+hM0KBZJ0@c>Dcacoa3)*hFY-;Kc#8GxCu`w-;tteM}a{gY_}=jmJ&Sa8roh?XGUO zG|ZC3N6w7!6fvtQXTz4$iSjjT#2m^;>-%vlwC@_N=!P*iz80i-+t^KszIp1e7_sfOx1@H;hU&v0MdxW%ZqtK6u4 zbk?Nn&1lL;*o8u*9dj&X-;LQYI7FDWX;@*X7*W#H)zS z3J0<=@M5fT&e|z12ko?Uy}U!#{O!j*;l!|mkn!@??Aed3G_~zdhsk>3!Efd2yvjSS z%f9J-+FCw?RX3lyB$Dw1XUQ%sP4Af~H_X^yqO$x!qQu5qdmE4Fkf#*^WFLEP(O+Xf zMbl;e`0DqQ>e*Vf1O3^LI7BN9Rs#^<2Z7Y$5~+l@yj~ezPjBSWD-s5CuL)Uft+_sv z=_9V)5>%y-w+TUJa(ew9bOdftZPkE)B!N~Wkts8SR>?pE&058VZXo~C=q2}OaNsr?u3qYwt{jc+TJaU$`kqe&Jfmq=Z#*`r zn7PD_$?~s^c>6_}=NEjxw1wH6reKT6k@8~FFd2{ZpSIu)rG)XbH(GahM^bAWm9*G{ZMcq$g;&P7)K!shQm=M2Lc_hA3e5Az z`f#RLHRp<1h{xMcty@u#J)to6`uc2U-28rxR&zJ8!)vh;_W-V^%xU&3*(y`tUo7=K zV>}mmuARBJu_v^`PUC*j1j12R>Gf`v=U5!uuIxLEj~A=LUqn5;{9fi|P%QQ1lUs%P zbxRl0+-5I76ctOV$_|fh>}2Bfy>Xs(imNhT?9qXcD9cVwH~wO^JvM)hLqN1@M6_oU z-|%zq`$-EArS$zbdBoqkSB&7jBl<*&%#36ptgv;XMv7msvv1Upo*4ERFS9<>C1F`R z(tyJhpLIKr@q1iyAOT~kbB>cQVR zflM$eQ-&*D@(X>p%qJ=|{VAN3?J?2C*UGfniL+@|__Ac}W;ui!y1N+f-bn6exu3gk zN}Kg@J>U3hQT3tieTKTMWmOLqK}Xz$Az|amppJva-7FfKH)4_+9`D&gf+7QBM%a6J zBBhO{jhK_B=6Y3Kbm{3guqK$bSjWoe%qWQiNuf0XkPp|&+0rlBH zGd&6xfv!Zl?#a7ZS8mX|HPybS_942{q2j-0^OnxrUjy?VXQP?G+O3ZXUnB!9A6g#G zEy|u}l|I&W*tlC#I%Zz9SN7b8KPSr`}7lKQ(N%{vzo>0DSA0|(D zq#C^#q3MyC_`Lrkwi7!e^Of@5kbC!qXYDZd#-8Z(Ydt2pqwg%UbWU&V-p(4VM_w7{ zeM&H$7iH0;-sdu7kCCpd3!=|MO18#GMmw}cC^Y)IrJIc(4l`cAXnQ)_l=OAtRnl+H zr6Q{AD?!Tp>4KbJ*5xrGl4HDN*|TH&ec*V`&9A@G7j7Yk@dp)a3KexOB=+69tMoXA zd%ldzo#BRjSoaYA(%EmTj#3*VZeq?hAFvEfYV8VqH=L~^hkl;P`e*uze3iJQ~ literal 0 HcmV?d00001 diff --git 
a/tests/resources/generated_primitive_large_offsets.arrow_file b/tests/resources/generated_primitive_large_offsets.arrow_file new file mode 100644 index 0000000000000000000000000000000000000000..6abe1a3de54c9d8065991a62893d5d4caf69e54a GIT binary patch literal 3578 zcmd^?eT)-D6u|f7xP#-4Q~`x>kcdP+MYio0RFDG{#Q4D-D4)jL?QXa2Znw*LM@2M* zA_mY9B1X_N0coNnU~IG{#3003OfVraL=;RoNFd=ZV1Qr{)I@xRklz8HfPn+y>+iNxKtBj{w#rPSy)7~MMJMYx?Y6CIwgLLr{c%M*83}!dR0rp_n}^XH+@Ci^B>qbSW4NLx`R|JzX}`-yi{4C7>vGK$hxl*xWHI_e+; zpok^yt1s$FCNq+I-=yCW2kBo5t-A*MY6GNr1xfSWj1F&{+zdcm@{-yTrUJtZSUDS4>&wI&|pY__o6%=N&B3NGVw8#pF++< zE_ zJe}Xo3t*8m#O}R(Iw@xr8G>P(rWuZdg&8&?yufTV5{=nWaA@;_<&`2#g|S{96qXVJ zzp<31CDIxw<51-ogLqkrp7lx^EW}{`N+B&@|1Wvcr38GFJV|jDPH&YbcVc{s$R^|^ zB!OhIr@Xin4X)uar+gz}-Xr^+D8GRAeR>Ql3X;jbi2eI%0YI^3+85?qIOhvq?!Mm?QlqJnzLM?}g?K{la1aoFQCF1Lodhpy~sT%R8C z{m)0IZCHBz$nLR&zjQ_%{~&#=|K*l(yXLnHX;UMcPqrSIqy74B=!-Y6eAM}T;Xvo8 z)kD#pL|KT!PRED#OKiXa6Om!$B;=z3PD7dQE9KYOsDA?aWPs10Oy}pDpB6V?YuKY! z%`5V{A0Ih)VN?6L$@Z}Q=g%(Cy5=-onXqwI*Wm~6FN|Ea>W7J$Wn)Gz&kyE}Gy3*D zm^qyNWaXYME#`VBwffAyYr{WVF>dc$!_F^hxb$Ij>yeHfs+el-%3D8oTuu-AVT(xT ziZ~qm#|B98qUSsV`zap>IUoD!UJxBG&&#|JkMVNCu$Y^Jl#zf`5^+pkWFSN#fk%bL z6^WaL6=rI#NYA{GOz4RmXcb;Ip-<7{s?MN~7rl}bQFJMX~gx6LCQ!vr}Tb# z8ubg2evqMUq1=Xi6}bkv5xE7q4Y>xC*=RkpJ)hNj5>8r~Ci;{si!pRSp0E literal 0 HcmV?d00001 diff --git a/tests/resources/generated_primitive_large_offsets.json b/tests/resources/generated_primitive_large_offsets.json new file mode 100644 index 0000000..eeb6a83 --- /dev/null +++ b/tests/resources/generated_primitive_large_offsets.json @@ -0,0 +1,582 @@ +{ + "schema": { + "fields": [ + { + "name": "largebinary_nullable", + "type": { + "name": "largebinary" + }, + "nullable": true, + "children": [] + }, + { + "name": "largebinary_nonnullable", + "type": { + "name": "largebinary" + }, + "nullable": false, + "children": [] + }, + { + "name": "largeutf8_nullable", + "type": { + "name": "largeutf8" + }, + "nullable": true, + "children": [] + }, + { + "name": 
"largeutf8_nonnullable", + "type": { + "name": "largeutf8" + }, + "nullable": false, + "children": [] + } + ] + }, + "batches": [ + { + "count": 17, + "columns": [ + { + "name": "largebinary_nullable", + "count": 17, + "VALIDITY": [ + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 0, + 1, + 0, + 1, + 0, + 0, + 0, + 0 + ], + "OFFSET": [ + "0", + "0", + "3", + "6", + "7", + "7", + "8", + "8", + "8", + "17", + "17", + "17", + "17", + "19", + "19", + "19", + "19", + "19" + ], + "DATA": [ + "", + "C12E1E", + "DFE731", + "55", + "", + "6D", + "", + "", + "5867B5C5A7786E1534", + "", + "", + "", + "2C09", + "", + "", + "", + "" + ] + }, + { + "name": "largebinary_nonnullable", + "count": 17, + "VALIDITY": [ + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1 + ], + "OFFSET": [ + "0", + "9", + "11", + "28", + "36", + "39", + "43", + "43", + "43", + "43", + "46", + "52", + "58", + "60", + "67", + "70", + "81", + "81" + ], + "DATA": [ + "E23907E804B3FE7A1C", + "E319", + "EE37836C76FABC7747EFD7F5DF75D35136", + "73853DDEC4E2E828", + "93CFC8", + "079AF075", + "", + "", + "", + "E8032D", + "753D5974DC08", + "BFDD468EAFDD", + "539B", + "FBA0315A8638DC", + "99BE43", + "64EFC8F7E671AC43D7EB85", + "" + ] + }, + { + "name": "largeutf8_nullable", + "count": 17, + "VALIDITY": [ + 0, + 0, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 1, + 1 + ], + "OFFSET": [ + "0", + "0", + "0", + "8", + "8", + "16", + "25", + "25", + "33", + "44", + "52", + "61", + "61", + "68", + "76", + "83", + "91", + "99" + ], + "DATA": [ + "", + "", + "1\u00b0ekpj5", + "", + "f\u00b5wneog", + "cjr\u00a3g2\u00b5", + "", + "54\u00a3gl51", + "\u77e2e2\u00f4h4\u00a3", + "wirebm\u00f4", + "w\u00b5\u00c26nnr", + "", + "6g6lerf", + "kha\u00c2fmh", + "hpif4c3", + "im\u00f41h2j", + "rpe\u00f4mp4" + ] + }, + { + "name": "largeutf8_nonnullable", + "count": 17, + "VALIDITY": [ + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1 + ], + "OFFSET": 
[ + "0", + "10", + "20", + "28", + "36", + "43", + "54", + "65", + "73", + "80", + "89", + "96", + "107", + "119", + "127", + "138", + "146", + "155" + ], + "DATA": [ + "\u00b0pm\u00b5cp\u00b5", + "ne2\u00b0h\u77e2i", + "\u00f4r1jfl1", + "rkrj\u00c2bp", + "kmelwbf", + "b5lc\u00b0\u20ac\u00a3", + "\u00f4rwe\u00b5\u00c2\u00c2", + "jh1o51\u00f4", + "wlggg2c", + "c\u00b0g2e3\u00a3", + "rla346l", + "4\u77e2l\u00a3k\u00f43", + "ra\u20ac\u00c2\u77e2k5", + "3\u00c2b6ikb", + "fjb\u20acc\u20ac1", + "6rm\u00f4k5d", + "o\u00f4b\u00b0her" + ] + } + ] + }, + { + "count": 20, + "columns": [ + { + "name": "largebinary_nullable", + "count": 20, + "VALIDITY": [ + 1, + 0, + 1, + 1, + 1, + 1, + 1, + 0, + 0, + 1, + 1, + 0, + 1, + 1, + 1, + 1, + 0, + 1, + 1, + 0 + ], + "OFFSET": [ + "0", + "0", + "0", + "5", + "9", + "12", + "19", + "35", + "35", + "35", + "40", + "43", + "43", + "53", + "58", + "58", + "60", + "60", + "60", + "64", + "64" + ], + "DATA": [ + "", + "", + "1D4542F260", + "10E81B7B", + "9B1C89", + "6A22D5F3D43F8A", + "5CD8D1AC2A18BD6E22D89C6AD70EF252", + "", + "", + "2BAB545219", + "746133", + "", + "93DE60BF4962F99610BA", + "8BF5A6DD56", + "", + "A3BF", + "", + "", + "DD27811B", + "" + ] + }, + { + "name": "largebinary_nonnullable", + "count": 20, + "VALIDITY": [ + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1 + ], + "OFFSET": [ + "0", + "1", + "3", + "4", + "5", + "5", + "13", + "48", + "50", + "55", + "62", + "62", + "63", + "65", + "70", + "71", + "72", + "74", + "74", + "78", + "92" + ], + "DATA": [ + "C9", + "EC5A", + "51", + "F8", + "", + "0DB6610153B0C20A", + "D93BEAEE9075EA386C1DBEEBE65762E7490DF52E8E46E7CC2F26A324717FD6306D7129", + "2476", + "8218C21C42", + "0909C46DCC6FAD", + "", + "7B", + "B697", + "5267896E69", + "81", + "E3", + "BCF8", + "", + "1EB2792B", + "B88C1DEB5B0DF0A15160D19BA261" + ] + }, + { + "name": "largeutf8_nullable", + "count": 20, + "VALIDITY": [ + 0, + 0, + 0, + 0, + 1, + 0, + 1, + 0, + 1, + 
1, + 0, + 1, + 1, + 1, + 1, + 0, + 0, + 1, + 1, + 0 + ], + "OFFSET": [ + "0", + "0", + "0", + "0", + "0", + "11", + "11", + "20", + "20", + "30", + "42", + "42", + "50", + "58", + "66", + "76", + "76", + "76", + "84", + "93", + "93" + ], + "DATA": [ + "", + "", + "", + "", + "n\u00c2\u00b5\u00a3c6\u00b5", + "", + "fekrpw\u20ac", + "", + "f\u77e23ng\u00b0r", + "\u20ac4\u00b0e4f\u77e2", + "", + "pr\u00a3gr25", + "rgbp\u00c26n", + "\u00a3hfdfwr", + "\u00a35\u00b5g\u00a3g5", + "", + "", + "dcadrp\u00c2", + "\u00b0\u00f4n35d5", + "" + ] + }, + { + "name": "largeutf8_nonnullable", + "count": 20, + "VALIDITY": [ + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1 + ], + "OFFSET": [ + "0", + "10", + "20", + "28", + "41", + "49", + "60", + "68", + "77", + "88", + "97", + "107", + "116", + "124", + "131", + "142", + "151", + "159", + "171", + "181", + "190" + ], + "DATA": [ + "wrp\u00a3ba\u77e2", + "\u00b0n\u77e254cc", + "2ddkai\u00a3", + "\u00c2r\u00c2\u00a3n\u20ac\u00c2", + "rd2\u00f4gda", + "hmm\u00b0\u00a3w\u77e2", + "p4\u00b5r23r", + "gh\u20acdlib", + "\u00b5\u00c25\u00c2\u00b51o", + "1\u00c2me6\u00c2d", + "\u00b5\u00f4\u00f4ndhc", + "2io2\u00a3k\u00b5", + "o\u00b5wr542", + "4nlo5p5", + "i\u00f4w\u00b0\u20achr", + "\u00b5\u00c216kaj", + "5\u00a3oprpo", + "4\u77e25\u00b5om\u77e2", + "\u20ac3dna\u00b51", + "b\u20acprdek" + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/tests/resources/generated_primitive_large_offsets.stream b/tests/resources/generated_primitive_large_offsets.stream new file mode 100644 index 0000000000000000000000000000000000000000..27109fa263b045d2bbd2eaeed474604952925aac GIT binary patch literal 3160 zcma)-4~P_H6u{@Y)AP@rT&bma3Yp}eGjnElRx8a*OEW4w&&q$%&Fsw1?#}G&_;u%I z7UG(b7Kdb{b%EwW30}0+u1J{DHwFemNR|ekf`WHws2G=5LD&0s-#cYRa{ryWSB&kjkW0*8T8Y~TxhQJ++imCy94Tx2!FW(CTp_8N&aJ>arBV0m$ z4}1bfje@^_+Rg#}XwW$tv&^>E*zA|wwClDxj;1>%=-()!m8e4b6?qxrN1AZz-j)66zHX<)Es!Bgd4(>~NkgtJ= zN@fyl*S{R3r{)N-r%?WeD}qr=lF 
z!*U=b4ad0Zk&}>vo<+v|2UXg zbme64*|r~=A}?>Ad*;Zwo+;bEKU)3a)wZ6>2ioSYXgxjTaM!~2&%2sC{@g9Ed~^2c z4-PFbt{lC7VNK_Plb1H5|H5@VFenERta}|Aj1BN!l&2#p&$%cQA4mCVWHWLVlFq5j z^QP@3)fg{kTr+J&lZ@vqg%{N@&{5^?U3*N$FT_F2B$%nE{6dBo{cOZ_S;UGsCX3l| z&CkYC@wk_Wg`=San8=rLV&qC9^t+sSia zkugQ~el{7G)2a-`a4gGGqF`YuW{BsQqlLl|Ckzf9p0oWzh-t9a&w|2ML*O@(u=Q9{ z2W1p4Il`b`wrZsPf)0V0tQ6`0?UNog;G6VGsyt+|BCjF|B$GY$#noug zhsT`yjf8!V?DwMlBH9lbFf1!bCi^_jAEE$&B4x^p`WeY&Mso1H(fsSb(MrUzel@2KCBocQCfC+2Nked>7U zv@u`16Hk4VJUQap(uRG@mX2-JLfg-@9A2#d{(jw8Z})!E^?dGd*W``k(49nGh{{gK zhw>%X;)0n-1vv}(Xn^xjruUWl>mt-YiF_)+XHll}^W85ioBC?^YZc9TKJ4d5E?(Z& zc5${d{?MfhFX-KiYkOz3FYG?{;QhHtYc~8eGqq;wq_x>Gyng=Bp+{23(x0v0zhkMj z#Z7EHf3R=D7wZ}hyfgmNs@kg`H?t}h9=b{l_jF~p`GSD(&&`LtC#q$h=Ff{P6(0HP8Geu>V?uquy zbMcrF%YatpMGNLsBdQq;=6K#OxFOY`Gby$Xz9;n<&s3z0q;pE&muFDF99b$d+}kL( zB40ypLbfA!Aa^16A&W?Q?=#HfIbDN1dESM8H5`p9hGA<7=*n3JN#$Iq2HYC(wXzH? zo=WjNe6vJl!o0|o5VPW7VK@n0bZ}J^R!+-)Hf2WqtO2?Ll*Ncgl|)*B&drNy_$D$n ztb|=Bt$HeaA2OxyK%5B&a>Uk>D$k`o=A}iuhcc&AK-eT?xEgpDJ>Fx63B?foT%`0j D{nQpD literal 0 HcmV?d00001 diff --git a/tests/resources/generated_primitive_no_batches.arrow_file b/tests/resources/generated_primitive_no_batches.arrow_file new file mode 100644 index 0000000000000000000000000000000000000000..610ae3cd470c1ee18ecb2569455aea5f8f6a91dc GIT binary patch literal 3914 zcmeHKJ#Q015S?|-XUj5)0T-4)a7BbtP~Z?nAh}2+3W^9&3P3cGE!)D9Wk(^F0&;~? 
zxQHmCloT!~C@3gWA&L}HMCp>kKLH`i@Oix1+dZ#uegEKAdiHsC-@MtG+Z*5d=H|mE zWg;JAkRd7%j~vaDOKHmD3PU)dJ&&J`vIp0^fan@f0)ONZ^??ci7j%*)+6V3dS>TI9 zGypb$BJg#NXb9ASo4}70(Fb4~SOLxj(J62MJOw;p4A)1%1^^dy{L}aGwNS!`8PnNs z-`|7nI?!zGHR>-~uNt*hr@HfUtGs+`tJCdtdhK?#)@~>t!t#;(6Zp6q=R-TdeC5~d zI3B>qWz2JqtT^sYg`s@daU3ciS7LnVFiwIP(nYK@bJ2#KYunpx-kDY;`Y1WV;tXNG z5OTBv(IWB+vlG=AVf`gcs1%#&rK02ri{rq)U~sg`AkdhL*7Py%Q=$}cKAY`sb$4}T zYCHWv$Kt(FJkCML(_*ll$LO6&o|x#}UY^qXru01c;C%4?7z`Ju#N{J^A>{dyy399p zw^3ItNJaPg`k(KwVS@A7YjvXAq`Kq@i}y+KIH#JY<;F2u-@)+=t;iN>6r*(mxnZ8x zGPidB@9cd&1BYv%_f=k3to?XmRbb(7Fh4E}}R7d!^-J-maY^ zYsdWg313Un5^@G}ik}PSIeQ-1#Tx9aJEL^u$u+{_zB9P2skou$+ZfHU;&~IzXbI^k zMze$*fhIhBC{44BR0!s+SRW}K*HY~>T1v{wHTFIa46f2N%Srn&nx~4#HDTQuC8fU7 zT*SOt(=4i#r3@g4;BS7m1wO(Q!*5Zp$;he}RNo^v`mFm59gd;a?Y5`yV2+J_xKQIe zL;DaepK<2sXM}?*zUDXrKa)7%p6epl=i!f^CVBi~kL+Pd&1qlK9Dd~Fx};AVy?&C~ rq)#V(Iy{5?CpGEQ_WzHhPvc!9re`L7TD~$ReH#6Ch literal 0 HcmV?d00001 diff --git a/tests/resources/generated_primitive_no_batches.json b/tests/resources/generated_primitive_no_batches.json new file mode 100644 index 0000000..e9eac55 --- /dev/null +++ b/tests/resources/generated_primitive_no_batches.json @@ -0,0 +1,287 @@ +{ + "schema": { + "fields": [ + { + "name": "bool_nullable", + "type": { + "name": "bool" + }, + "nullable": true, + "children": [] + }, + { + "name": "bool_nonnullable", + "type": { + "name": "bool" + }, + "nullable": false, + "children": [] + }, + { + "name": "int8_nullable", + "type": { + "name": "int", + "isSigned": true, + "bitWidth": 8 + }, + "nullable": true, + "children": [] + }, + { + "name": "int8_nonnullable", + "type": { + "name": "int", + "isSigned": true, + "bitWidth": 8 + }, + "nullable": false, + "children": [] + }, + { + "name": "int16_nullable", + "type": { + "name": "int", + "isSigned": true, + "bitWidth": 16 + }, + "nullable": true, + "children": [] + }, + { + "name": "int16_nonnullable", + "type": { + "name": "int", + "isSigned": true, + "bitWidth": 16 + }, + "nullable": false, + "children": [] 
+ }, + { + "name": "int32_nullable", + "type": { + "name": "int", + "isSigned": true, + "bitWidth": 32 + }, + "nullable": true, + "children": [] + }, + { + "name": "int32_nonnullable", + "type": { + "name": "int", + "isSigned": true, + "bitWidth": 32 + }, + "nullable": false, + "children": [] + }, + { + "name": "int64_nullable", + "type": { + "name": "int", + "isSigned": true, + "bitWidth": 64 + }, + "nullable": true, + "children": [] + }, + { + "name": "int64_nonnullable", + "type": { + "name": "int", + "isSigned": true, + "bitWidth": 64 + }, + "nullable": false, + "children": [] + }, + { + "name": "uint8_nullable", + "type": { + "name": "int", + "isSigned": false, + "bitWidth": 8 + }, + "nullable": true, + "children": [] + }, + { + "name": "uint8_nonnullable", + "type": { + "name": "int", + "isSigned": false, + "bitWidth": 8 + }, + "nullable": false, + "children": [] + }, + { + "name": "uint16_nullable", + "type": { + "name": "int", + "isSigned": false, + "bitWidth": 16 + }, + "nullable": true, + "children": [] + }, + { + "name": "uint16_nonnullable", + "type": { + "name": "int", + "isSigned": false, + "bitWidth": 16 + }, + "nullable": false, + "children": [] + }, + { + "name": "uint32_nullable", + "type": { + "name": "int", + "isSigned": false, + "bitWidth": 32 + }, + "nullable": true, + "children": [] + }, + { + "name": "uint32_nonnullable", + "type": { + "name": "int", + "isSigned": false, + "bitWidth": 32 + }, + "nullable": false, + "children": [] + }, + { + "name": "uint64_nullable", + "type": { + "name": "int", + "isSigned": false, + "bitWidth": 64 + }, + "nullable": true, + "children": [] + }, + { + "name": "uint64_nonnullable", + "type": { + "name": "int", + "isSigned": false, + "bitWidth": 64 + }, + "nullable": false, + "children": [] + }, + { + "name": "float32_nullable", + "type": { + "name": "floatingpoint", + "precision": "SINGLE" + }, + "nullable": true, + "children": [] + }, + { + "name": "float32_nonnullable", + "type": { + "name": 
"floatingpoint", + "precision": "SINGLE" + }, + "nullable": false, + "children": [] + }, + { + "name": "float64_nullable", + "type": { + "name": "floatingpoint", + "precision": "DOUBLE" + }, + "nullable": true, + "children": [] + }, + { + "name": "float64_nonnullable", + "type": { + "name": "floatingpoint", + "precision": "DOUBLE" + }, + "nullable": false, + "children": [] + }, + { + "name": "binary_nullable", + "type": { + "name": "binary" + }, + "nullable": true, + "children": [] + }, + { + "name": "binary_nonnullable", + "type": { + "name": "binary" + }, + "nullable": false, + "children": [] + }, + { + "name": "utf8_nullable", + "type": { + "name": "utf8" + }, + "nullable": true, + "children": [] + }, + { + "name": "utf8_nonnullable", + "type": { + "name": "utf8" + }, + "nullable": false, + "children": [] + }, + { + "name": "fixedsizebinary_19_nullable", + "type": { + "name": "fixedsizebinary", + "byteWidth": 19 + }, + "nullable": true, + "children": [] + }, + { + "name": "fixedsizebinary_19_nonnullable", + "type": { + "name": "fixedsizebinary", + "byteWidth": 19 + }, + "nullable": false, + "children": [] + }, + { + "name": "fixedsizebinary_120_nullable", + "type": { + "name": "fixedsizebinary", + "byteWidth": 120 + }, + "nullable": true, + "children": [] + }, + { + "name": "fixedsizebinary_120_nonnullable", + "type": { + "name": "fixedsizebinary", + "byteWidth": 120 + }, + "nullable": false, + "children": [] + } + ] + }, + "batches": [] +} \ No newline at end of file diff --git a/tests/resources/generated_primitive_no_batches.stream b/tests/resources/generated_primitive_no_batches.stream new file mode 100644 index 0000000000000000000000000000000000000000..ccbc451af9d8f920679290df6665d78acbb7e2dc GIT binary patch literal 1944 zcmZ`)zi-n(6n>XFj#WV$AVVY&c&HEq11(hrQicu;3>9KRh^0!DL`1Tj3L2?JnNWue z6+_jLp#uX014Ado(4ndrGcx2)K!~yYJid3nbI!il(w8{z?t9;R?>jqEN;#?!Rf(^8 zS|Fdw6yOe*@Y47N{Pc7@x#tZ;*MU0tW1naUGytTalQPjB@Bj#aFCNhd*Z^w4*Eymi 
zpbgvxew2tl09(K+a4v{Wfqmd95CRjVJ_I%Zq@d%UQsQf|jt{%0v)@wgAa(=j_I5h$ zm%Ue=RxfV8*xg)Sxw9Gf<9HB7%~sUWIfRpA{!fzQT2T&@0PBrkv+H<-99OW;Gji&< zpD9Dl(wL+}B-g(M9<}Xb7Mv`t}W(b$~ zy^FZ_EeOsRsFw5e*Rte89H1;tKW~ZCzxyF96r?eWSAVn=VzQb<{jbTj_i_@% literal 0 HcmV?d00001 diff --git a/tests/resources/generated_primitive_zerolength.arrow_file b/tests/resources/generated_primitive_zerolength.arrow_file new file mode 100644 index 0000000000000000000000000000000000000000..25a26d3bc3c22959b73e874550ea2e9a3784b7cd GIT binary patch literal 8858 zcmeHNzfTlF6n=Xw$Az45>J>zdZb6KN1t1}qP(WfqLDWzvu^|v3UUIp+C?sg83-r%o#13B~i{_9CjjZjYW>f#^D<5BS48(Hdk70u!{?Nwf~R z2Pr_l_(U6!DM%UewS#B}vH-aaInEJ%fHWY(kQ0w+AF>IVfdr5vnBIm=L12P*e}*A$ zy?wZtV>c7E-pN;yj+~G)aIVOm>nD%n61@owUughZoayxZFm_Q-G3q* z*K9V#0Q$#%wI0WP*tm>-_EBaWcN4?VHnQW`(l)MIY{)S7qA_%qp=bJ{igu3e%JR}^ zl9A}6!gv|X4%#oo7-1mlhhNd`r#TLxe+eDh#io0ytT0{%-Ju%Ok^y>8wRO8Qg1)V;{sgAw%1Vh3`P&ye8kBp#X$rv~b;oZx|mr1G zk$&;tYpwwO+1U9ic8u?7;)RlK!DndB@pr*oXHTPcZyfE++ah!nC><|@eP>{qQ)6S! zw-(MLjSHHb=@8Pkg|iPn0?Z3=(VS)&sTB2R#`;j>IF@>z=}=NZ>15}*X<#*{8BSWa zaPDgy$Ao!Xgp}4aXFvMQm=+MF0(C)D@%YZ)w#Y{8V%RM!oh}vClFoaSPk-w^gNMg3 zU$0k_Z!nLIZREkmZ-&_L^1Q|wBi{%gPyEc|i0m}S0rTMiJ|4`=rASv`ku!2pS)~r2 z_}vS*GcMh#Px5mK#@YUO-0vQo2hNuVWG(4+9Z$|y7kyrbX0g>ppVzKgY<1BOOl3Tu zb)AR9zdv1RZx6V<(%$iRvp<~&pBm^ zwP>Hz8*FL) zNL#MSeKm8xm)}u$?xkXsIXiRbUf92l&WGH&SK?Fw=ew4(D0l9aJWq1xUPSKPD>|d& U--w(C&-1z0H{|aL^*<^90f(@6X#fBK literal 0 HcmV?d00001 diff --git a/tests/resources/generated_primitive_zerolength.json b/tests/resources/generated_primitive_zerolength.json new file mode 100644 index 0000000..1e16259 --- /dev/null +++ b/tests/resources/generated_primitive_zerolength.json @@ -0,0 +1,879 @@ +{ + "schema": { + "fields": [ + { + "name": "bool_nullable", + "type": { + "name": "bool" + }, + "nullable": true, + "children": [] + }, + { + "name": "bool_nonnullable", + "type": { + "name": "bool" + }, + "nullable": false, + "children": [] + }, + { + "name": "int8_nullable", + "type": { + "name": "int", + "isSigned": 
true, + "bitWidth": 8 + }, + "nullable": true, + "children": [] + }, + { + "name": "int8_nonnullable", + "type": { + "name": "int", + "isSigned": true, + "bitWidth": 8 + }, + "nullable": false, + "children": [] + }, + { + "name": "int16_nullable", + "type": { + "name": "int", + "isSigned": true, + "bitWidth": 16 + }, + "nullable": true, + "children": [] + }, + { + "name": "int16_nonnullable", + "type": { + "name": "int", + "isSigned": true, + "bitWidth": 16 + }, + "nullable": false, + "children": [] + }, + { + "name": "int32_nullable", + "type": { + "name": "int", + "isSigned": true, + "bitWidth": 32 + }, + "nullable": true, + "children": [] + }, + { + "name": "int32_nonnullable", + "type": { + "name": "int", + "isSigned": true, + "bitWidth": 32 + }, + "nullable": false, + "children": [] + }, + { + "name": "int64_nullable", + "type": { + "name": "int", + "isSigned": true, + "bitWidth": 64 + }, + "nullable": true, + "children": [] + }, + { + "name": "int64_nonnullable", + "type": { + "name": "int", + "isSigned": true, + "bitWidth": 64 + }, + "nullable": false, + "children": [] + }, + { + "name": "uint8_nullable", + "type": { + "name": "int", + "isSigned": false, + "bitWidth": 8 + }, + "nullable": true, + "children": [] + }, + { + "name": "uint8_nonnullable", + "type": { + "name": "int", + "isSigned": false, + "bitWidth": 8 + }, + "nullable": false, + "children": [] + }, + { + "name": "uint16_nullable", + "type": { + "name": "int", + "isSigned": false, + "bitWidth": 16 + }, + "nullable": true, + "children": [] + }, + { + "name": "uint16_nonnullable", + "type": { + "name": "int", + "isSigned": false, + "bitWidth": 16 + }, + "nullable": false, + "children": [] + }, + { + "name": "uint32_nullable", + "type": { + "name": "int", + "isSigned": false, + "bitWidth": 32 + }, + "nullable": true, + "children": [] + }, + { + "name": "uint32_nonnullable", + "type": { + "name": "int", + "isSigned": false, + "bitWidth": 32 + }, + "nullable": false, + "children": [] + }, + { + 
"name": "uint64_nullable", + "type": { + "name": "int", + "isSigned": false, + "bitWidth": 64 + }, + "nullable": true, + "children": [] + }, + { + "name": "uint64_nonnullable", + "type": { + "name": "int", + "isSigned": false, + "bitWidth": 64 + }, + "nullable": false, + "children": [] + }, + { + "name": "float32_nullable", + "type": { + "name": "floatingpoint", + "precision": "SINGLE" + }, + "nullable": true, + "children": [] + }, + { + "name": "float32_nonnullable", + "type": { + "name": "floatingpoint", + "precision": "SINGLE" + }, + "nullable": false, + "children": [] + }, + { + "name": "float64_nullable", + "type": { + "name": "floatingpoint", + "precision": "DOUBLE" + }, + "nullable": true, + "children": [] + }, + { + "name": "float64_nonnullable", + "type": { + "name": "floatingpoint", + "precision": "DOUBLE" + }, + "nullable": false, + "children": [] + }, + { + "name": "binary_nullable", + "type": { + "name": "binary" + }, + "nullable": true, + "children": [] + }, + { + "name": "binary_nonnullable", + "type": { + "name": "binary" + }, + "nullable": false, + "children": [] + }, + { + "name": "utf8_nullable", + "type": { + "name": "utf8" + }, + "nullable": true, + "children": [] + }, + { + "name": "utf8_nonnullable", + "type": { + "name": "utf8" + }, + "nullable": false, + "children": [] + }, + { + "name": "fixedsizebinary_19_nullable", + "type": { + "name": "fixedsizebinary", + "byteWidth": 19 + }, + "nullable": true, + "children": [] + }, + { + "name": "fixedsizebinary_19_nonnullable", + "type": { + "name": "fixedsizebinary", + "byteWidth": 19 + }, + "nullable": false, + "children": [] + }, + { + "name": "fixedsizebinary_120_nullable", + "type": { + "name": "fixedsizebinary", + "byteWidth": 120 + }, + "nullable": true, + "children": [] + }, + { + "name": "fixedsizebinary_120_nonnullable", + "type": { + "name": "fixedsizebinary", + "byteWidth": 120 + }, + "nullable": false, + "children": [] + } + ] + }, + "batches": [ + { + "count": 0, + "columns": [ + { + 
"name": "bool_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "bool_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "int8_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "int8_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "int16_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "int16_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "int32_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "int32_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "int64_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "int64_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "uint8_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "uint8_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "uint16_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "uint16_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "uint32_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "uint32_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "uint64_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "uint64_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "float32_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "float32_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "float64_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "float64_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "binary_nullable", + "count": 0, + "VALIDITY": [], + "OFFSET": [ + 0 + ], + "DATA": [] + }, + { + "name": "binary_nonnullable", + "count": 0, + 
"VALIDITY": [], + "OFFSET": [ + 0 + ], + "DATA": [] + }, + { + "name": "utf8_nullable", + "count": 0, + "VALIDITY": [], + "OFFSET": [ + 0 + ], + "DATA": [] + }, + { + "name": "utf8_nonnullable", + "count": 0, + "VALIDITY": [], + "OFFSET": [ + 0 + ], + "DATA": [] + }, + { + "name": "fixedsizebinary_19_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "fixedsizebinary_19_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "fixedsizebinary_120_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "fixedsizebinary_120_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + } + ] + }, + { + "count": 0, + "columns": [ + { + "name": "bool_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "bool_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "int8_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "int8_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "int16_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "int16_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "int32_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "int32_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "int64_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "int64_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "uint8_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "uint8_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "uint16_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "uint16_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "uint32_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "uint32_nonnullable", + "count": 0, + 
"VALIDITY": [], + "DATA": [] + }, + { + "name": "uint64_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "uint64_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "float32_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "float32_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "float64_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "float64_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "binary_nullable", + "count": 0, + "VALIDITY": [], + "OFFSET": [ + 0 + ], + "DATA": [] + }, + { + "name": "binary_nonnullable", + "count": 0, + "VALIDITY": [], + "OFFSET": [ + 0 + ], + "DATA": [] + }, + { + "name": "utf8_nullable", + "count": 0, + "VALIDITY": [], + "OFFSET": [ + 0 + ], + "DATA": [] + }, + { + "name": "utf8_nonnullable", + "count": 0, + "VALIDITY": [], + "OFFSET": [ + 0 + ], + "DATA": [] + }, + { + "name": "fixedsizebinary_19_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "fixedsizebinary_19_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "fixedsizebinary_120_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "fixedsizebinary_120_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + } + ] + }, + { + "count": 0, + "columns": [ + { + "name": "bool_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "bool_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "int8_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "int8_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "int16_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "int16_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "int32_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": 
"int32_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "int64_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "int64_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "uint8_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "uint8_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "uint16_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "uint16_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "uint32_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "uint32_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "uint64_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "uint64_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "float32_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "float32_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "float64_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "float64_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "binary_nullable", + "count": 0, + "VALIDITY": [], + "OFFSET": [ + 0 + ], + "DATA": [] + }, + { + "name": "binary_nonnullable", + "count": 0, + "VALIDITY": [], + "OFFSET": [ + 0 + ], + "DATA": [] + }, + { + "name": "utf8_nullable", + "count": 0, + "VALIDITY": [], + "OFFSET": [ + 0 + ], + "DATA": [] + }, + { + "name": "utf8_nonnullable", + "count": 0, + "VALIDITY": [], + "OFFSET": [ + 0 + ], + "DATA": [] + }, + { + "name": "fixedsizebinary_19_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "fixedsizebinary_19_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": "fixedsizebinary_120_nullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + }, + { + "name": 
"fixedsizebinary_120_nonnullable", + "count": 0, + "VALIDITY": [], + "DATA": [] + } + ] + } + ] +} \ No newline at end of file diff --git a/tests/resources/generated_primitive_zerolength.stream b/tests/resources/generated_primitive_zerolength.stream new file mode 100644 index 0000000000000000000000000000000000000000..3e649c3193282055dffd80163de6433d91d760b3 GIT binary patch literal 6816 zcmeHJziSjh6n=Br>@DF;PQBt#(Jg{lSR^q7g%l|)EP_~x*o1^6cyQbW%|RlmB*hiM zBBZoPVPRomkxDF5gkX84Mg9a4%lLEs-t4@+x!oH`7w~rA-R{hL-}~N=otcZGC~6mp z%EV)YM#-mP3h;)ZykxzEsb|*XcQzop4ygj)@rkw|vkV@F8Iw#xV?6U}x8&j&<&BYkl=@FC)MJBb@}d0_3(W)d?9>rJ@|W!hmT$2^A^BRdjDu%##_8= z=qn5g={#TkjiNayxSp+ABb_EKYn;m9J{la?RB%Ff;^6uQOlWbXQlyT9>n3W$IN`EW zd+?`!UQdDH9!Or3Gd<;`S&hr)wQF#~#W>09z`@163&3>|!nH_$`0or?2z|D9p6(sv zN4t2Tq+6&NmJ56?*!%2RysqZ3&b)nEN1?V+8SGmN%bW(AaK3SHo)}!%U2>ZKQIn&&2x3;JBA2&vYv(&^FmTcPy;owA)GB4$dQk#XTkIn#G5l7vO;LBXZ0 +#include // Needed by doctest +#include +#include + +#include + +#include "sparrow/utils/metadata.hpp" +#include "sparrow/utils/repeat_container.hpp" + +#include "doctest/doctest.h" +#include "metadata_sample.hpp" +#include "sparrow_ipc/arrow_interface/arrow_schema.hpp" +#include "sparrow_ipc/deserialize_primitive_array.hpp" + +using namespace std::string_literals; + +void compare_arrow_schema(const ArrowSchema& schema, const ArrowSchema& schema_copy) +{ + CHECK_NE(&schema, &schema_copy); + CHECK_EQ(std::string_view(schema.format), std::string_view(schema_copy.format)); + CHECK_EQ(std::string_view(schema.name), std::string_view(schema_copy.name)); + CHECK_EQ(std::string_view(schema.metadata), std::string_view(schema_copy.metadata)); + CHECK_EQ(schema.flags, schema_copy.flags); + CHECK_EQ(schema.n_children, schema_copy.n_children); + if (schema.n_children > 0) + { + REQUIRE_NE(schema.children, nullptr); + REQUIRE_NE(schema_copy.children, nullptr); + for (int64_t i = 0; i < schema.n_children; ++i) + { + CHECK_NE(schema.children[i], nullptr); + compare_arrow_schema(*schema.children[i], 
*schema_copy.children[i]); + } + } + else + { + CHECK_EQ(schema.children, nullptr); + CHECK_EQ(schema_copy.children, nullptr); + } + + if (schema.dictionary != nullptr) + { + REQUIRE_NE(schema_copy.dictionary, nullptr); + compare_arrow_schema(*schema.dictionary, *schema_copy.dictionary); + } + else + { + CHECK_EQ(schema_copy.dictionary, nullptr); + } +} + +void check_empty(ArrowSchema& sch) +{ + CHECK_EQ(std::strcmp(sch.format, "n"), 0); + CHECK_EQ(std::strcmp(sch.name, ""), 0); + CHECK_EQ(std::strcmp(sch.metadata, ""), 0); + CHECK_EQ(sch.flags, 0); + CHECK_EQ(sch.n_children, 0); + CHECK_EQ(sch.children, nullptr); + CHECK_EQ(sch.dictionary, nullptr); +} + +TEST_SUITE("C Data Interface") +{ + TEST_CASE("ArrowSchema") + { + SUBCASE("make_schema_constructor") + { + ArrowSchema** children = new ArrowSchema*[2]; + children[0] = new ArrowSchema(); + children[1] = new ArrowSchema(); + + const auto children_1_ptr = children[0]; + const auto children_2_ptr = children[1]; + + auto dictionnary = new ArrowSchema(); + dictionnary->name = "dictionary"; + const std::string format = "format"; + const std::string name = "name"; + auto schema = sparrow_ipc::make_arrow_schema( + format.data(), + name.data(), + sparrow_ipc::metadata_sample_opt, + std::unordered_set{sparrow::ArrowFlag::DICTIONARY_ORDERED}, + 2, + children, + dictionnary + ); + + const auto schema_format = std::string_view(schema.format); + const bool format_eq = schema_format == format; + CHECK(format_eq); + const auto schema_name = std::string_view(schema.name); + const bool name_eq = schema_name == name; + CHECK(name_eq); + sparrow_ipc::test_metadata(sparrow_ipc::metadata_sample, schema.metadata); + CHECK_EQ(schema.flags, 1); + CHECK_EQ(schema.n_children, 2); + REQUIRE_NE(schema.children, nullptr); + CHECK_EQ(schema.children[0], children_1_ptr); + CHECK_EQ(schema.children[1], children_2_ptr); + CHECK_EQ(schema.dictionary, dictionnary); + const bool is_release_arrow_schema = schema.release == 
&sparrow_ipc::release_arrow_schema; + CHECK(is_release_arrow_schema); + CHECK_NE(schema.private_data, nullptr); + schema.release(&schema); + } + + SUBCASE("make_schema_constructor no children, no dictionary, no name and metadata") + { + auto schema = sparrow_ipc::make_arrow_schema( + "format", + nullptr, + std::optional>{}, + std::unordered_set{sparrow::ArrowFlag::DICTIONARY_ORDERED}, + 0, + nullptr, + nullptr + ); + + const auto schema_format = std::string_view(schema.format); + const bool format_eq = schema_format == "format"; + CHECK(format_eq); + CHECK_EQ(schema.name, nullptr); + CHECK_EQ(schema.metadata, nullptr); + CHECK_EQ(schema.flags, 1); + CHECK_EQ(schema.n_children, 0); + CHECK_EQ(schema.children, nullptr); + CHECK_EQ(schema.dictionary, nullptr); + const bool is_release_arrow_schema = schema.release == &sparrow_ipc::release_arrow_schema; + CHECK(is_release_arrow_schema); + CHECK_NE(schema.private_data, nullptr); + schema.release(&schema); + } + + SUBCASE("ArrowSchema release") + { + ArrowSchema** children = new ArrowSchema*[2]; + children[0] = new ArrowSchema(); + children[1] = new ArrowSchema(); + + auto schema = sparrow_ipc::make_arrow_schema( + "format", + "name", + sparrow_ipc::metadata_sample_opt, + std::unordered_set{sparrow::ArrowFlag::DICTIONARY_ORDERED}, + 2, + children, + new ArrowSchema() + ); + + schema.release(&schema); + + CHECK_EQ(schema.format, nullptr); + CHECK_EQ(schema.name, nullptr); + CHECK_EQ(schema.metadata, nullptr); + CHECK_EQ(schema.children, nullptr); + CHECK_EQ(schema.dictionary, nullptr); + const bool is_nullptr = schema.release == nullptr; + CHECK(is_nullptr); + CHECK_EQ(schema.private_data, nullptr); + } + + SUBCASE("ArrowSchema release no children, no dictionary, no name and metadata") + { + auto schema = sparrow_ipc::make_arrow_schema( + "format", + nullptr, + std::optional>{}, + std::unordered_set{sparrow::ArrowFlag::DICTIONARY_ORDERED}, + 0, + nullptr, + nullptr + ); + + schema.release(&schema); + + 
CHECK_EQ(schema.format, nullptr); + CHECK_EQ(schema.name, nullptr); + CHECK_EQ(schema.metadata, nullptr); + CHECK_EQ(schema.children, nullptr); + CHECK_EQ(schema.dictionary, nullptr); + const bool is_nullptr = schema.release == nullptr; + CHECK(is_nullptr); + CHECK_EQ(schema.private_data, nullptr); + } + + SUBCASE("deep_copy_schema") + { + auto children = new ArrowSchema*[2]; + children[0] = new ArrowSchema(); + *children[0] = sparrow_ipc::make_arrow_schema( + "format", + "child1", + sparrow_ipc::metadata_sample_opt, + std::unordered_set{sparrow::ArrowFlag::MAP_KEYS_SORTED}, + 0, + nullptr, + nullptr + ); + children[1] = new ArrowSchema(); + *children[1] = sparrow_ipc::make_arrow_schema( + "format", + "child2", + sparrow_ipc::metadata_sample_opt, + std::unordered_set{sparrow::ArrowFlag::NULLABLE}, + 0, + nullptr, + nullptr + ); + + auto dictionary = new ArrowSchema(); + *dictionary = sparrow_ipc::make_arrow_schema( + "format", + "dictionary", + sparrow_ipc::metadata_sample_opt, + std::unordered_set{sparrow::ArrowFlag::MAP_KEYS_SORTED}, + 0, + nullptr, + nullptr + ); + auto schema = sparrow_ipc::make_arrow_schema( + "format", + "name", + sparrow_ipc::metadata_sample_opt, + std::unordered_set{sparrow::ArrowFlag::DICTIONARY_ORDERED}, + 0, + children, + dictionary + ); + + auto schema_copy = sparrow::copy_schema(schema); + + compare_arrow_schema(schema, schema_copy); + + schema_copy.release(&schema_copy); + schema.release(&schema); + } + + // SUBCASE("swap_schema") + // { + // auto schema0 = test::make_arrow_schema(true); + // auto schema0_bkup = sparrow::copy_schema(schema0); + + // auto schema1 = test::make_arrow_schema(false); + // auto schema1_bkup = sparrow::copy_schema(schema1); + + // sparrow::swap(schema0, schema1); + // compare_arrow_schema(schema0, schema1_bkup); + // compare_arrow_schema(schema1, schema0_bkup); + + // schema0.release(&schema0); + // schema1.release(&schema1); + // schema0_bkup.release(&schema0_bkup); + // schema1_bkup.release(&schema1_bkup); 
+ // } + + // SUBCASE("move_schema") + // { + // auto src_schema = test::make_arrow_schema(true); + // auto control = sparrow::copy_schema(src_schema); + + // auto dst_schema = sparrow::move_schema(std::move(src_schema)); + // // check_empty(src_schema); + // compare_arrow_schema(dst_schema, control); + + // auto dst2_schema = sparrow::move_schema(dst_schema); + // // check_empty(dst_schema); + // compare_arrow_schema(dst2_schema, control); + // dst2_schema.release(&dst2_schema); + // control.release(&control); + // } + } +} diff --git a/tests/test_null_array_serialization.cpp b/tests/test_null_array_serialization.cpp index d3b06f0..0b3e12c 100644 --- a/tests/test_null_array_serialization.cpp +++ b/tests/test_null_array_serialization.cpp @@ -1,14 +1,13 @@ -#include "doctest/doctest.h" -#include "sparrow.hpp" +#include +#include -#include "serialize_null_array.hpp" +#include "sparrow_ipc/serialize_null_array.hpp" #include "sparrow_ipc_tests_helpers.hpp" namespace sparrow_ipc { namespace sp = sparrow; - TEST_CASE("Serialize and deserialize null_array") { const std::size_t size = 10; diff --git a/tests/test_primitive_array_serialization.cpp b/tests/test_primitive_array_serialization.cpp index b0086f0..450ab19 100644 --- a/tests/test_primitive_array_serialization.cpp +++ b/tests/test_primitive_array_serialization.cpp @@ -3,20 +3,17 @@ #include #include -#include "doctest/doctest.h" -#include "sparrow.hpp" +#include +#include -#include "serialize_primitive_array.hpp" +#include "sparrow_ipc/serialize_primitive_array.hpp" #include "sparrow_ipc_tests_helpers.hpp" namespace sparrow_ipc { namespace sp = sparrow; - using testing_types = std::tuple< - int, - float, - double>; + using testing_types = std::tuple; template void compare_bitmap(const sp::primitive_array& pa1, const sp::primitive_array& pa2) @@ -45,7 +42,8 @@ namespace sparrow_ipc TEST_CASE_TEMPLATE_DEFINE("Serialize and Deserialize primitive_array", T, primitive_array_types) { - auto create_primitive_array = []() 
-> sp::primitive_array { + auto create_primitive_array = []() -> sp::primitive_array + { if constexpr (std::is_same_v) { return {10, 20, 30, 40, 50}; @@ -83,9 +81,9 @@ namespace sparrow_ipc const sp::u8_buffer data_buffer = {100, 200, 300, 400, 500}; // Validity bitmap: 100 (valid), 200 (valid), 300 (null), 400 (valid), 500 (null) - sp::validity_bitmap validity(5, true); // All valid initially - validity.set(2, false); // Set index 2 to null - validity.set(4, false); // Set index 4 to null + sp::validity_bitmap validity(5, true); // All valid initially + validity.set(2, false); // Set index 2 to null + validity.set(4, false); // Set index 4 to null sp::primitive_array ar(std::move(data_buffer), std::move(validity)); @@ -107,15 +105,12 @@ namespace sparrow_ipc const sp::validity_bitmap validity(3, true); // Custom metadata - const std::vector metadata = { - {"key1", "value1"}, - {"key2", "value2"} - }; + const std::vector metadata = {{"key1", "value1"}, {"key2", "value2"}}; sp::primitive_array ar( std::move(data_buffer), std::move(validity), - "my_named_array", // name + "my_named_array", // name std::make_optional(std::vector{{"key1", "value1"}, {"key2", "value2"}}) ); diff --git a/tests/test_primitive_array_with_files.cpp b/tests/test_primitive_array_with_files.cpp new file mode 100644 index 0000000..7603543 --- /dev/null +++ b/tests/test_primitive_array_with_files.cpp @@ -0,0 +1,103 @@ +#include +#include +#include +#include +#include + +#include + +#include + +#include "sparrow/json_reader/json_parser.hpp" + +#include "doctest/doctest.h" +#include "sparrow.hpp" +#include "sparrow_ipc/deserialize.hpp" + + +const std::filesystem::path tests_resources_files_path = TESTS_RESOURCES_FILES_PATH; + +const std::vector files_paths_to_test = { + tests_resources_files_path / "generated_primitive", +}; + +size_t get_number_of_batches(const std::filesystem::path& json_path) +{ + std::ifstream json_file(json_path); + if (!json_file.is_open()) + { + throw 
std::runtime_error("Could not open file: " + json_path.string()); + } + const nlohmann::json data = nlohmann::json::parse(json_file); + return data["batches"].size(); +} + +nlohmann::json load_json_file(const std::filesystem::path& json_path) +{ + std::ifstream json_file(json_path); + if (!json_file.is_open()) + { + throw std::runtime_error("Could not open file: " + json_path.string()); + } + return nlohmann::json::parse(json_file); +} + +TEST_SUITE("integration tests") +{ + TEST_CASE("POUET") + { + for (const auto& file_path : files_paths_to_test) + { + std::filesystem::path json_path = file_path; + json_path.replace_extension(".json"); + const std::string test_name = "Testing " + json_path.filename().string(); + SUBCASE(test_name.c_str()) + { + // Load the JSON file + auto json_data = load_json_file(json_path); + CHECK(json_data != nullptr); + + const size_t num_batches = get_number_of_batches(json_path); + + std::vector record_batches_from_json; + + for (size_t batch_idx = 0; batch_idx < num_batches; ++batch_idx) + { + INFO("Processing batch " << batch_idx << " of " << num_batches); + record_batches_from_json.emplace_back( + sparrow::json_reader::build_record_batch_from_json(json_data, batch_idx) + ); + } + + // Load stream file + std::filesystem::path stream_file_path = file_path; + stream_file_path.replace_extension(".stream"); + std::ifstream stream_file(stream_file_path, std::ios::in | std::ios::binary); + REQUIRE(stream_file.is_open()); + const std::vector stream_data( + (std::istreambuf_iterator(stream_file)), + (std::istreambuf_iterator()) + ); + stream_file.close(); + + // Process the stream file + const auto record_batches_from_stream = sparrow_ipc::deserialize_stream(stream_data.data()); + + // Compare record batches + REQUIRE_EQ(record_batches_from_stream.size(), record_batches_from_json.size()); + for (size_t i = 0; i < record_batches_from_stream.size(); ++i) + { + for(size_t y = 0; y < record_batches_from_stream[i].nb_columns(); y++) + { + 
for(size_t z = 0 ; z < record_batches_from_stream[i].get_column(y).size(); z++) + { + INFO("Comparing batch " << i << ", column " << y << ", row " << z); + REQUIRE_EQ(record_batches_from_stream[i].get_column(y).size(), record_batches_from_json[i].get_column(y).size()); + CHECK_EQ(record_batches_from_stream[i].get_column(y).at(z), record_batches_from_json[i].get_column(y).at(z)); + } + } + } + } + } + } +} diff --git a/tests/test_utils.cpp b/tests/test_utils.cpp index f53ef0c..ab9f4a0 100644 --- a/tests/test_utils.cpp +++ b/tests/test_utils.cpp @@ -1,8 +1,8 @@ -#include "doctest/doctest.h" +#include +#include -#include "sparrow.hpp" - -#include "utils.hpp" +#include "sparrow_ipc/arrow_interface/arrow_array_schema_common_release.hpp" +#include "sparrow_ipc/utils.hpp" namespace sparrow_ipc { @@ -22,118 +22,328 @@ namespace sparrow_ipc flatbuffers::FlatBufferBuilder builder; SUBCASE("Null and Boolean types") { - CHECK_EQ(utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::NA)).first, org::apache::arrow::flatbuf::Type::Null); - CHECK_EQ(utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::BOOL)).first, org::apache::arrow::flatbuf::Type::Bool); + CHECK_EQ( + utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::NA)).first, + org::apache::arrow::flatbuf::Type::Null + ); + CHECK_EQ( + utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::BOOL)).first, + org::apache::arrow::flatbuf::Type::Bool + ); } SUBCASE("Integer types") { - CHECK_EQ(utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::INT8)).first, org::apache::arrow::flatbuf::Type::Int); // INT8 - CHECK_EQ(utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::UINT8)).first, org::apache::arrow::flatbuf::Type::Int); // UINT8 - CHECK_EQ(utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::INT16)).first, 
org::apache::arrow::flatbuf::Type::Int); // INT16 - CHECK_EQ(utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::UINT16)).first, org::apache::arrow::flatbuf::Type::Int); // UINT16 - CHECK_EQ(utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::INT32)).first, org::apache::arrow::flatbuf::Type::Int); // INT32 - CHECK_EQ(utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::UINT32)).first, org::apache::arrow::flatbuf::Type::Int); // UINT32 - CHECK_EQ(utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::INT64)).first, org::apache::arrow::flatbuf::Type::Int); // INT64 - CHECK_EQ(utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::UINT64)).first, org::apache::arrow::flatbuf::Type::Int); // UINT64 + CHECK_EQ( + utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::INT8)).first, + org::apache::arrow::flatbuf::Type::Int + ); // INT8 + CHECK_EQ( + utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::UINT8)).first, + org::apache::arrow::flatbuf::Type::Int + ); // UINT8 + CHECK_EQ( + utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::INT16)).first, + org::apache::arrow::flatbuf::Type::Int + ); // INT16 + CHECK_EQ( + utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::UINT16)).first, + org::apache::arrow::flatbuf::Type::Int + ); // UINT16 + CHECK_EQ( + utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::INT32)).first, + org::apache::arrow::flatbuf::Type::Int + ); // INT32 + CHECK_EQ( + utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::UINT32)).first, + org::apache::arrow::flatbuf::Type::Int + ); // UINT32 + CHECK_EQ( + utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::INT64)).first, + 
org::apache::arrow::flatbuf::Type::Int + ); // INT64 + CHECK_EQ( + utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::UINT64)).first, + org::apache::arrow::flatbuf::Type::Int + ); // UINT64 } SUBCASE("Floating Point types") { - CHECK_EQ(utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::HALF_FLOAT)).first, org::apache::arrow::flatbuf::Type::FloatingPoint); // HALF_FLOAT - CHECK_EQ(utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::FLOAT)).first, org::apache::arrow::flatbuf::Type::FloatingPoint); // FLOAT - CHECK_EQ(utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::DOUBLE)).first, org::apache::arrow::flatbuf::Type::FloatingPoint); // DOUBLE + CHECK_EQ( + utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::HALF_FLOAT)) + .first, + org::apache::arrow::flatbuf::Type::FloatingPoint + ); // HALF_FLOAT + CHECK_EQ( + utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::FLOAT)).first, + org::apache::arrow::flatbuf::Type::FloatingPoint + ); // FLOAT + CHECK_EQ( + utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::DOUBLE)).first, + org::apache::arrow::flatbuf::Type::FloatingPoint + ); // DOUBLE } SUBCASE("String and Binary types") { - CHECK_EQ(utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::STRING)).first, org::apache::arrow::flatbuf::Type::Utf8); // STRING - CHECK_EQ(utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::LARGE_STRING)).first, org::apache::arrow::flatbuf::Type::LargeUtf8); // LARGE_STRING - CHECK_EQ(utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::BINARY)).first, org::apache::arrow::flatbuf::Type::Binary); // BINARY - CHECK_EQ(utils::get_flatbuffer_type(builder, 
sparrow::data_type_to_format(sparrow::data_type::LARGE_BINARY)).first, org::apache::arrow::flatbuf::Type::LargeBinary); // LARGE_BINARY - CHECK_EQ(utils::get_flatbuffer_type(builder, "vu").first, org::apache::arrow::flatbuf::Type::Utf8View); // STRING_VIEW - CHECK_EQ(utils::get_flatbuffer_type(builder, "vz").first, org::apache::arrow::flatbuf::Type::BinaryView); // BINARY_VIEW + CHECK_EQ( + utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::STRING)).first, + org::apache::arrow::flatbuf::Type::Utf8 + ); // STRING + CHECK_EQ( + utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::LARGE_STRING)) + .first, + org::apache::arrow::flatbuf::Type::LargeUtf8 + ); // LARGE_STRING + CHECK_EQ( + utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::BINARY)).first, + org::apache::arrow::flatbuf::Type::Binary + ); // BINARY + CHECK_EQ( + utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::LARGE_BINARY)) + .first, + org::apache::arrow::flatbuf::Type::LargeBinary + ); // LARGE_BINARY + CHECK_EQ( + utils::get_flatbuffer_type(builder, "vu").first, + org::apache::arrow::flatbuf::Type::Utf8View + ); // STRING_VIEW + CHECK_EQ( + utils::get_flatbuffer_type(builder, "vz").first, + org::apache::arrow::flatbuf::Type::BinaryView + ); // BINARY_VIEW } SUBCASE("Date types") { - CHECK_EQ(utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::DATE_DAYS)).first, org::apache::arrow::flatbuf::Type::Date); // DATE_DAYS - CHECK_EQ(utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::DATE_MILLISECONDS)).first, org::apache::arrow::flatbuf::Type::Date); // DATE_MILLISECONDS + CHECK_EQ( + utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::DATE_DAYS)) + .first, + org::apache::arrow::flatbuf::Type::Date + ); // DATE_DAYS + CHECK_EQ( + utils::get_flatbuffer_type( + builder, + 
sparrow::data_type_to_format(sparrow::data_type::DATE_MILLISECONDS) + ) + .first, + org::apache::arrow::flatbuf::Type::Date + ); // DATE_MILLISECONDS } SUBCASE("Timestamp types") { - CHECK_EQ(utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::TIMESTAMP_SECONDS)).first, org::apache::arrow::flatbuf::Type::Timestamp); // TIMESTAMP_SECONDS - CHECK_EQ(utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::TIMESTAMP_MILLISECONDS)).first, org::apache::arrow::flatbuf::Type::Timestamp); // TIMESTAMP_MILLISECONDS - CHECK_EQ(utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::TIMESTAMP_MICROSECONDS)).first, org::apache::arrow::flatbuf::Type::Timestamp); // TIMESTAMP_MICROSECONDS - CHECK_EQ(utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::TIMESTAMP_NANOSECONDS)).first, org::apache::arrow::flatbuf::Type::Timestamp); // TIMESTAMP_NANOSECONDS + CHECK_EQ( + utils::get_flatbuffer_type( + builder, + sparrow::data_type_to_format(sparrow::data_type::TIMESTAMP_SECONDS) + ) + .first, + org::apache::arrow::flatbuf::Type::Timestamp + ); // TIMESTAMP_SECONDS + CHECK_EQ( + utils::get_flatbuffer_type( + builder, + sparrow::data_type_to_format(sparrow::data_type::TIMESTAMP_MILLISECONDS) + ) + .first, + org::apache::arrow::flatbuf::Type::Timestamp + ); // TIMESTAMP_MILLISECONDS + CHECK_EQ( + utils::get_flatbuffer_type( + builder, + sparrow::data_type_to_format(sparrow::data_type::TIMESTAMP_MICROSECONDS) + ) + .first, + org::apache::arrow::flatbuf::Type::Timestamp + ); // TIMESTAMP_MICROSECONDS + CHECK_EQ( + utils::get_flatbuffer_type( + builder, + sparrow::data_type_to_format(sparrow::data_type::TIMESTAMP_NANOSECONDS) + ) + .first, + org::apache::arrow::flatbuf::Type::Timestamp + ); // TIMESTAMP_NANOSECONDS } SUBCASE("Duration types") { - CHECK_EQ(utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::DURATION_SECONDS)).first, 
org::apache::arrow::flatbuf::Type::Duration); // DURATION_SECONDS - CHECK_EQ(utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::DURATION_MILLISECONDS)).first, org::apache::arrow::flatbuf::Type::Duration); // DURATION_MILLISECONDS - CHECK_EQ(utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::DURATION_MICROSECONDS)).first, org::apache::arrow::flatbuf::Type::Duration); // DURATION_MICROSECONDS - CHECK_EQ(utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::DURATION_NANOSECONDS)).first, org::apache::arrow::flatbuf::Type::Duration); // DURATION_NANOSECONDS + CHECK_EQ( + utils::get_flatbuffer_type( + builder, + sparrow::data_type_to_format(sparrow::data_type::DURATION_SECONDS) + ) + .first, + org::apache::arrow::flatbuf::Type::Duration + ); // DURATION_SECONDS + CHECK_EQ( + utils::get_flatbuffer_type( + builder, + sparrow::data_type_to_format(sparrow::data_type::DURATION_MILLISECONDS) + ) + .first, + org::apache::arrow::flatbuf::Type::Duration + ); // DURATION_MILLISECONDS + CHECK_EQ( + utils::get_flatbuffer_type( + builder, + sparrow::data_type_to_format(sparrow::data_type::DURATION_MICROSECONDS) + ) + .first, + org::apache::arrow::flatbuf::Type::Duration + ); // DURATION_MICROSECONDS + CHECK_EQ( + utils::get_flatbuffer_type( + builder, + sparrow::data_type_to_format(sparrow::data_type::DURATION_NANOSECONDS) + ) + .first, + org::apache::arrow::flatbuf::Type::Duration + ); // DURATION_NANOSECONDS } SUBCASE("Interval types") { - CHECK_EQ(utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::INTERVAL_MONTHS)).first, org::apache::arrow::flatbuf::Type::Interval); // INTERVAL_MONTHS - CHECK_EQ(utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::INTERVAL_DAYS_TIME)).first, org::apache::arrow::flatbuf::Type::Interval); // INTERVAL_DAYS_TIME - CHECK_EQ(utils::get_flatbuffer_type(builder, 
sparrow::data_type_to_format(sparrow::data_type::INTERVAL_MONTHS_DAYS_NANOSECONDS)).first, org::apache::arrow::flatbuf::Type::Interval); // INTERVAL_MONTHS_DAYS_NANOSECONDS + CHECK_EQ( + utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::INTERVAL_MONTHS)) + .first, + org::apache::arrow::flatbuf::Type::Interval + ); // INTERVAL_MONTHS + CHECK_EQ( + utils::get_flatbuffer_type( + builder, + sparrow::data_type_to_format(sparrow::data_type::INTERVAL_DAYS_TIME) + ) + .first, + org::apache::arrow::flatbuf::Type::Interval + ); // INTERVAL_DAYS_TIME + CHECK_EQ( + utils::get_flatbuffer_type( + builder, + sparrow::data_type_to_format(sparrow::data_type::INTERVAL_MONTHS_DAYS_NANOSECONDS) + ) + .first, + org::apache::arrow::flatbuf::Type::Interval + ); // INTERVAL_MONTHS_DAYS_NANOSECONDS } SUBCASE("Time types") { - CHECK_EQ(utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::TIME_SECONDS)).first, org::apache::arrow::flatbuf::Type::Time); // TIME_SECONDS - CHECK_EQ(utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::TIME_MILLISECONDS)).first, org::apache::arrow::flatbuf::Type::Time); // TIME_MILLISECONDS - CHECK_EQ(utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::TIME_MICROSECONDS)).first, org::apache::arrow::flatbuf::Type::Time); // TIME_MICROSECONDS - CHECK_EQ(utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::TIME_NANOSECONDS)).first, org::apache::arrow::flatbuf::Type::Time); // TIME_NANOSECONDS + CHECK_EQ( + utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::TIME_SECONDS)) + .first, + org::apache::arrow::flatbuf::Type::Time + ); // TIME_SECONDS + CHECK_EQ( + utils::get_flatbuffer_type( + builder, + sparrow::data_type_to_format(sparrow::data_type::TIME_MILLISECONDS) + ) + .first, + org::apache::arrow::flatbuf::Type::Time + ); // TIME_MILLISECONDS + CHECK_EQ( + 
utils::get_flatbuffer_type( + builder, + sparrow::data_type_to_format(sparrow::data_type::TIME_MICROSECONDS) + ) + .first, + org::apache::arrow::flatbuf::Type::Time + ); // TIME_MICROSECONDS + CHECK_EQ( + utils::get_flatbuffer_type( + builder, + sparrow::data_type_to_format(sparrow::data_type::TIME_NANOSECONDS) + ) + .first, + org::apache::arrow::flatbuf::Type::Time + ); // TIME_NANOSECONDS } SUBCASE("List types") { - CHECK_EQ(utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::LIST)).first, org::apache::arrow::flatbuf::Type::List); // LIST - CHECK_EQ(utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::LARGE_LIST)).first, org::apache::arrow::flatbuf::Type::LargeList); // LARGE_LIST - CHECK_EQ(utils::get_flatbuffer_type(builder, "+vl").first, org::apache::arrow::flatbuf::Type::ListView); // LIST_VIEW - CHECK_EQ(utils::get_flatbuffer_type(builder, "+vL").first, org::apache::arrow::flatbuf::Type::LargeListView); // LARGE_LIST_VIEW - CHECK_EQ(utils::get_flatbuffer_type(builder, "+w:16").first, org::apache::arrow::flatbuf::Type::FixedSizeList); // FIXED_SIZED_LIST - CHECK_THROWS(utils::get_flatbuffer_type(builder, "+w:")); // Invalid FixedSizeList format + CHECK_EQ( + utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::LIST)).first, + org::apache::arrow::flatbuf::Type::List + ); // LIST + CHECK_EQ( + utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::LARGE_LIST)) + .first, + org::apache::arrow::flatbuf::Type::LargeList + ); // LARGE_LIST + CHECK_EQ( + utils::get_flatbuffer_type(builder, "+vl").first, + org::apache::arrow::flatbuf::Type::ListView + ); // LIST_VIEW + CHECK_EQ( + utils::get_flatbuffer_type(builder, "+vL").first, + org::apache::arrow::flatbuf::Type::LargeListView + ); // LARGE_LIST_VIEW + CHECK_EQ( + utils::get_flatbuffer_type(builder, "+w:16").first, + org::apache::arrow::flatbuf::Type::FixedSizeList + ); // FIXED_SIZED_LIST 
+ CHECK_THROWS(utils::get_flatbuffer_type(builder, "+w:")); // Invalid FixedSizeList format } SUBCASE("Struct and Map types") { - CHECK_EQ(utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::STRUCT)).first, org::apache::arrow::flatbuf::Type::Struct_); // STRUCT - CHECK_EQ(utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::MAP)).first, org::apache::arrow::flatbuf::Type::Map); // MAP + CHECK_EQ( + utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::STRUCT)).first, + org::apache::arrow::flatbuf::Type::Struct_ + ); // STRUCT + CHECK_EQ( + utils::get_flatbuffer_type(builder, sparrow::data_type_to_format(sparrow::data_type::MAP)).first, + org::apache::arrow::flatbuf::Type::Map + ); // MAP } SUBCASE("Union types") { - CHECK_EQ(utils::get_flatbuffer_type(builder, "+ud:").first, org::apache::arrow::flatbuf::Type::Union); // DENSE_UNION - CHECK_EQ(utils::get_flatbuffer_type(builder, "+us:").first, org::apache::arrow::flatbuf::Type::Union); // SPARSE_UNION + CHECK_EQ( + utils::get_flatbuffer_type(builder, "+ud:").first, + org::apache::arrow::flatbuf::Type::Union + ); // DENSE_UNION + CHECK_EQ( + utils::get_flatbuffer_type(builder, "+us:").first, + org::apache::arrow::flatbuf::Type::Union + ); // SPARSE_UNION } SUBCASE("Run-End Encoded type") { - CHECK_EQ(utils::get_flatbuffer_type(builder, "+r").first, org::apache::arrow::flatbuf::Type::RunEndEncoded); // RUN_ENCODED + CHECK_EQ( + utils::get_flatbuffer_type(builder, "+r").first, + org::apache::arrow::flatbuf::Type::RunEndEncoded + ); // RUN_ENCODED } SUBCASE("Decimal types") { - CHECK_EQ(utils::get_flatbuffer_type(builder, "d:10,5").first, org::apache::arrow::flatbuf::Type::Decimal); // DECIMAL (general) - CHECK_THROWS(utils::get_flatbuffer_type(builder, "d:10")); // Invalid Decimal format + CHECK_EQ( + utils::get_flatbuffer_type(builder, "d:10,5").first, + org::apache::arrow::flatbuf::Type::Decimal + ); // DECIMAL (general) + 
CHECK_THROWS(utils::get_flatbuffer_type(builder, "d:10")); // Invalid Decimal format } SUBCASE("Fixed Width Binary type") { - CHECK_EQ(utils::get_flatbuffer_type(builder, "w:32").first, org::apache::arrow::flatbuf::Type::FixedSizeBinary); // FIXED_WIDTH_BINARY - CHECK_THROWS(utils::get_flatbuffer_type(builder, "w:")); // Invalid FixedSizeBinary format + CHECK_EQ( + utils::get_flatbuffer_type(builder, "w:32").first, + org::apache::arrow::flatbuf::Type::FixedSizeBinary + ); // FIXED_WIDTH_BINARY + CHECK_THROWS(utils::get_flatbuffer_type(builder, "w:")); // Invalid FixedSizeBinary format } SUBCASE("Unsupported type returns Null") { - CHECK_EQ(utils::get_flatbuffer_type(builder, "unsupported_format").first, org::apache::arrow::flatbuf::Type::Null); + CHECK_EQ( + utils::get_flatbuffer_type(builder, "unsupported_format").first, + org::apache::arrow::flatbuf::Type::Null + ); } } } From 190af27816a7f74f1e8c02f85cab61da563edbf2 Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Thu, 4 Sep 2025 11:24:48 +0200 Subject: [PATCH 02/22] wip --- .../deserialize_fixedsizebinary_array.hpp | 23 ++++++++++++------- .../deserialize_primitive_array.hpp | 13 ++++++++--- ...deserialize_variable_size_binary_array.hpp | 15 ++++++++---- src/encapsulated_message.cpp | 13 +++++++---- tests/test_primitive_array_with_files.cpp | 12 ++++++---- 5 files changed, 51 insertions(+), 25 deletions(-) diff --git a/include/sparrow_ipc/deserialize_fixedsizebinary_array.hpp b/include/sparrow_ipc/deserialize_fixedsizebinary_array.hpp index 81c6629..b6f02c7 100644 --- a/include/sparrow_ipc/deserialize_fixedsizebinary_array.hpp +++ b/include/sparrow_ipc/deserialize_fixedsizebinary_array.hpp @@ -23,17 +23,24 @@ namespace sparrow_ipc const std::string format = "w:" + std::to_string(byte_width); ArrowSchema schema = make_arrow_schema(format, name.data(), metadata, std::nullopt, 0, nullptr, nullptr); - const auto buffer_metadata = record_batch.buffers()->Get(buffer_index++); - auto buffer_ptr = 
const_cast(body.data() + buffer_metadata->offset()); - const size_t buffer_size = buffer_metadata->length(); - const auto bitmap_buffer_metadata = record_batch.buffers()->Get(buffer_index++); - auto bitmap_ptr = const_cast(body.data() + bitmap_buffer_metadata->offset()); - const sparrow::dynamic_bitset_view bitmap_view{bitmap_ptr, static_cast(record_batch.length())}; - std::vector buffers = {buffer_ptr, bitmap_ptr}; + uint8_t* bitmap_ptr = nullptr; + int64_t null_count = 0; + + // Check if validity buffer is present (length > 0 for nullable fields) + if (bitmap_buffer_metadata->length() > 0) { + bitmap_ptr = const_cast(body.data() + bitmap_buffer_metadata->offset()); + const sparrow::dynamic_bitset_view bitmap_view{bitmap_ptr, static_cast(record_batch.length())}; + null_count = bitmap_view.null_count(); + } + + const auto buffer_metadata = record_batch.buffers()->Get(buffer_index++); + auto buffer_ptr = const_cast(body.data() + buffer_metadata->offset()); + + std::vector buffers = {bitmap_ptr, buffer_ptr}; - ArrowArray array = make_arrow_array(record_batch.length(), bitmap_view.null_count(), 0, std::move(buffers), 0, nullptr, nullptr); + ArrowArray array = make_arrow_array(record_batch.length(), null_count, 0, std::move(buffers), 0, nullptr, nullptr); sparrow::arrow_proxy ap{std::move(array), std::move(schema)}; return sparrow::fixed_width_binary_array{std::move(ap)}; diff --git a/include/sparrow_ipc/deserialize_primitive_array.hpp b/include/sparrow_ipc/deserialize_primitive_array.hpp index 5bf7624..751feb0 100644 --- a/include/sparrow_ipc/deserialize_primitive_array.hpp +++ b/include/sparrow_ipc/deserialize_primitive_array.hpp @@ -27,14 +27,21 @@ namespace sparrow_ipc ArrowSchema schema = make_arrow_schema(format, name.data(), metadata, std::nullopt, 0, nullptr, nullptr); const auto bitmap_buffer_metadata = record_batch.buffers()->Get(buffer_index++); - auto bitmap_ptr = const_cast(body.data() + bitmap_buffer_metadata->offset()); + uint8_t* bitmap_ptr = nullptr; + 
int64_t null_count = 0; + + // Check if validity buffer is present (length > 0 for nullable fields) + if (bitmap_buffer_metadata->length() > 0) { + bitmap_ptr = const_cast(body.data() + bitmap_buffer_metadata->offset()); + const sparrow::dynamic_bitset_view bitmap_view{bitmap_ptr, static_cast(record_batch.length())}; + null_count = bitmap_view.null_count(); + } const auto primitive_buffer_metadata = record_batch.buffers()->Get(buffer_index++); auto primitives_ptr = const_cast(body.data() + primitive_buffer_metadata->offset()); - const sparrow::dynamic_bitset_view bitmap_view{bitmap_ptr, static_cast(record_batch.length())}; std::vector buffers = {bitmap_ptr, primitives_ptr}; - ArrowArray array = make_arrow_array(record_batch.length(), bitmap_view.null_count(), 0, std::move(buffers), 0, nullptr, nullptr); + ArrowArray array = make_arrow_array(record_batch.length(), null_count, 0, std::move(buffers), 0, nullptr, nullptr); sparrow::arrow_proxy ap{std::move(array), std::move(schema)}; return sparrow::primitive_array{std::move(ap)}; diff --git a/include/sparrow_ipc/deserialize_variable_size_binary_array.hpp b/include/sparrow_ipc/deserialize_variable_size_binary_array.hpp index 2309861..c111ec9 100644 --- a/include/sparrow_ipc/deserialize_variable_size_binary_array.hpp +++ b/include/sparrow_ipc/deserialize_variable_size_binary_array.hpp @@ -26,19 +26,24 @@ namespace sparrow_ipc ArrowSchema schema = make_arrow_schema(format, name.data(), metadata, std::nullopt, 0, nullptr, nullptr); const auto bitmap_buffer_metadata = record_batch.buffers()->Get(buffer_index++); - auto bitmap_ptr = const_cast(body.data() + bitmap_buffer_metadata->offset()); + uint8_t* bitmap_ptr = nullptr; + int64_t null_count = 0; + + // Check if validity buffer is present (length > 0 for nullable fields) + if (bitmap_buffer_metadata->length() > 0) { + bitmap_ptr = const_cast(body.data() + bitmap_buffer_metadata->offset()); + const sparrow::dynamic_bitset_view bitmap_view{bitmap_ptr, 
static_cast(record_batch.length())}; + null_count = bitmap_view.null_count(); + } const auto offset_metadata = record_batch.buffers()->Get(buffer_index++); auto offset_ptr = const_cast(body.data() + offset_metadata->offset()); - const size_t offset_size = offset_metadata->length(); const auto buffer_metadata = record_batch.buffers()->Get(buffer_index++); auto buffer_ptr = const_cast(body.data() + buffer_metadata->offset()); - const size_t buffer_size = buffer_metadata->length(); - const sparrow::dynamic_bitset_view bitmap_view{bitmap_ptr, static_cast(record_batch.length())}; std::vector buffers = {bitmap_ptr, offset_ptr, buffer_ptr}; - ArrowArray array = make_arrow_array(record_batch.length(), bitmap_view.null_count(), 0, std::move(buffers), 0, nullptr, nullptr); + ArrowArray array = make_arrow_array(record_batch.length(), null_count, 0, std::move(buffers), 0, nullptr, nullptr); sparrow::arrow_proxy ap{std::move(array), std::move(schema)}; return T{std::move(ap)}; diff --git a/src/encapsulated_message.cpp b/src/encapsulated_message.cpp index eb6f5a4..e76fac3 100644 --- a/src/encapsulated_message.cpp +++ b/src/encapsulated_message.cpp @@ -72,16 +72,19 @@ namespace sparrow_ipc std::span EncapsulatedMessage::body() const { - const uint8_t* body_ptr = m_buf_ptr + (sizeof(uint32_t) * 2) // 4 bytes continuation + 4 bytes - // metadata size - + metadata_length(); + const size_t offset = sizeof(uint32_t) * 2 // 4 bytes continuation + 4 bytes metadata size + + metadata_length(); + const size_t padded_offset = (offset + 7) & ~7; // Round up to 8-byte boundary + const uint8_t* body_ptr = m_buf_ptr + padded_offset; return {body_ptr, body_length()}; } size_t EncapsulatedMessage::total_length() const { - return sizeof(uint32_t) * 2 // 4 bytes continuation + 4 bytes metadata size - + metadata_length() + body_length(); + const size_t offset = sizeof(uint32_t) * 2 // 4 bytes continuation + 4 bytes metadata size + + metadata_length(); + const size_t padded_offset = (offset + 7) & 
~7; // Round up to 8-byte boundary + return padded_offset + body_length(); } std::span EncapsulatedMessage::as_span() const diff --git a/tests/test_primitive_array_with_files.cpp b/tests/test_primitive_array_with_files.cpp index 7603543..101cdfa 100644 --- a/tests/test_primitive_array_with_files.cpp +++ b/tests/test_primitive_array_with_files.cpp @@ -19,6 +19,9 @@ const std::filesystem::path tests_resources_files_path = TESTS_RESOURCES_FILES_P const std::vector files_paths_to_test = { tests_resources_files_path / "generated_primitive", + tests_resources_files_path / "generated_primitive_large_offsets", + tests_resources_files_path / "generated_primitive_zerolength", + tests_resources_files_path / "generated_primitive_no_batches" }; size_t get_number_of_batches(const std::filesystem::path& json_path) @@ -42,15 +45,15 @@ nlohmann::json load_json_file(const std::filesystem::path& json_path) return nlohmann::json::parse(json_file); } -TEST_SUITE("integration tests") +TEST_SUITE("Integration tests") { - TEST_CASE("POUET") + TEST_CASE("Compare stream deserialization with JSON deserialization") { for (const auto& file_path : files_paths_to_test) { std::filesystem::path json_path = file_path; json_path.replace_extension(".json"); - const std::string test_name = "Testing " + json_path.filename().string(); + const std::string test_name = "Testing " + file_path.filename().string(); SUBCASE(test_name.c_str()) { // Load the JSON file @@ -91,7 +94,8 @@ TEST_SUITE("integration tests") { for(size_t z = 0 ; z < record_batches_from_stream[i].get_column(y).size(); z++) { - INFO("Comparing batch " << i << ", column " << y << ", row " << z); + const auto col_name = record_batches_from_stream[i].get_column(y).name().value_or("NA"); + INFO("Comparing batch " << i << ", column " << y << " named :"<< col_name <<" , row " << z); REQUIRE_EQ(record_batches_from_stream[i].get_column(y).size(), record_batches_from_json[i].get_column(y).size()); 
CHECK_EQ(record_batches_from_stream[i].get_column(y).at(z), record_batches_from_json[i].get_column(y).at(z)); } From c6f0202c6738e7ff6626318a49d66a28d50802cd Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Thu, 4 Sep 2025 14:18:23 +0200 Subject: [PATCH 03/22] wip --- tests/test_primitive_array_with_files.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/test_primitive_array_with_files.cpp b/tests/test_primitive_array_with_files.cpp index 101cdfa..d0825ed 100644 --- a/tests/test_primitive_array_with_files.cpp +++ b/tests/test_primitive_array_with_files.cpp @@ -19,7 +19,7 @@ const std::filesystem::path tests_resources_files_path = TESTS_RESOURCES_FILES_P const std::vector files_paths_to_test = { tests_resources_files_path / "generated_primitive", - tests_resources_files_path / "generated_primitive_large_offsets", + // tests_resources_files_path / "generated_primitive_large_offsets", tests_resources_files_path / "generated_primitive_zerolength", tests_resources_files_path / "generated_primitive_no_batches" }; @@ -92,12 +92,16 @@ TEST_SUITE("Integration tests") { for(size_t y = 0; y < record_batches_from_stream[i].nb_columns(); y++) { - for(size_t z = 0 ; z < record_batches_from_stream[i].get_column(y).size(); z++) + const auto& column_stream = record_batches_from_stream[i].get_column(y); + const auto& column_json = record_batches_from_json[i].get_column(y); + REQUIRE_EQ(column_stream.size(), column_json.size()); + for(size_t z = 0 ; z < column_json.size(); z++) { - const auto col_name = record_batches_from_stream[i].get_column(y).name().value_or("NA"); + const auto col_name = column_stream.name().value_or("NA"); INFO("Comparing batch " << i << ", column " << y << " named :"<< col_name <<" , row " << z); - REQUIRE_EQ(record_batches_from_stream[i].get_column(y).size(), record_batches_from_json[i].get_column(y).size()); - CHECK_EQ(record_batches_from_stream[i].get_column(y).at(z), 
record_batches_from_json[i].get_column(y).at(z)); + const auto& column_stream_value = column_stream[z]; + const auto& column_json_value = column_json[z]; + CHECK_EQ(column_stream_value, column_json_value); } } } From 620ea810676613248511e716076a98b45ab9b32c Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Thu, 4 Sep 2025 15:35:01 +0200 Subject: [PATCH 04/22] wip --- CMakeLists.txt | 1 - .../arrow_interface/arrow_array.hpp | 6 +-- .../arrow_array/private_data.hpp | 4 +- .../arrow_array_schema_common_release.hpp | 8 ++-- .../arrow_interface/arrow_schema.hpp | 14 +++--- .../arrow_schema/private_data.hpp | 8 +++- .../deserialize_fixedsizebinary_array.hpp | 44 ++++++++++--------- .../deserialize_primitive_array.hpp | 40 ++++++++++------- include/sparrow_ipc/deserialize_utils.hpp | 9 +++- ...deserialize_variable_size_binary_array.hpp | 43 ++++++++++-------- include/sparrow_ipc/magic_values.hpp | 5 --- include/sparrow_ipc/utils.hpp | 1 - src/arrow_interface/arrow_array.cpp | 30 ++++++++----- .../arrow_array/private_data.cpp | 2 +- src/arrow_interface/arrow_schema.cpp | 8 ++-- .../arrow_schema/private_data.cpp | 8 ++-- src/deserialize_utils.cpp | 22 +++++++++- src/encapsulated_message.cpp | 9 ++-- src/magic_values.cpp | 19 -------- tests/test_arrow_schema.cpp | 28 ++++++------ 20 files changed, 172 insertions(+), 137 deletions(-) delete mode 100644 src/magic_values.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 6fd7294..0648f34 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -116,7 +116,6 @@ set(SPARROW_IPC_SRC ${SPARROW_IPC_SOURCE_DIR}/serialize_null_array.cpp ${SPARROW_IPC_SOURCE_DIR}/serialize.cpp ${SPARROW_IPC_SOURCE_DIR}/utils.cpp - ${SPARROW_IPC_SOURCE_DIR}/magic_values.cpp ${SPARROW_IPC_SOURCE_DIR}/metadata.cpp ${SPARROW_IPC_SOURCE_DIR}/deserialize_utils.cpp ) diff --git a/include/sparrow_ipc/arrow_interface/arrow_array.hpp b/include/sparrow_ipc/arrow_interface/arrow_array.hpp index 21044f6..2517e1d 100644 --- 
a/include/sparrow_ipc/arrow_interface/arrow_array.hpp +++ b/include/sparrow_ipc/arrow_interface/arrow_array.hpp @@ -7,7 +7,7 @@ namespace sparrow_ipc { - [[nodiscard]] ArrowArray make_arrow_array( + [[nodiscard]] ArrowArray make_non_owning_arrow_array( int64_t length, int64_t null_count, int64_t offset, @@ -17,9 +17,9 @@ namespace sparrow_ipc ArrowArray* dictionary ); - void release_arrow_array(ArrowArray* array); + void release_non_owning_arrow_array(ArrowArray* array); - void fill_arrow_array( + void fill_non_owning_arrow_array( ArrowArray& array, int64_t length, int64_t null_count, diff --git a/include/sparrow_ipc/arrow_interface/arrow_array/private_data.hpp b/include/sparrow_ipc/arrow_interface/arrow_array/private_data.hpp index 4472f13..536170f 100644 --- a/include/sparrow_ipc/arrow_interface/arrow_array/private_data.hpp +++ b/include/sparrow_ipc/arrow_interface/arrow_array/private_data.hpp @@ -4,11 +4,11 @@ namespace sparrow_ipc { - class arrow_array_private_data + class non_owning_arrow_array_private_data { public: - explicit constexpr arrow_array_private_data(std::vector&& buffers_pointers) + explicit constexpr non_owning_arrow_array_private_data(std::vector&& buffers_pointers) : m_buffers_pointers(std::move(buffers_pointers)) { } diff --git a/include/sparrow_ipc/arrow_interface/arrow_array_schema_common_release.hpp b/include/sparrow_ipc/arrow_interface/arrow_array_schema_common_release.hpp index 96ec8e7..3e73c50 100644 --- a/include/sparrow_ipc/arrow_interface/arrow_array_schema_common_release.hpp +++ b/include/sparrow_ipc/arrow_interface/arrow_array_schema_common_release.hpp @@ -16,10 +16,12 @@ namespace sparrow_ipc */ template requires std::same_as || std::same_as - void release_common_arrow(T& t) + void release_common_non_owning_arrow(T& t) { - using private_data_type = std:: - conditional_t, arrow_array_private_data, arrow_schema_private_data>; + using private_data_type = std::conditional_t< + std::same_as, + non_owning_arrow_array_private_data, + 
non_owning_arrow_schema_private_data>; if (t.release == nullptr) { return; diff --git a/include/sparrow_ipc/arrow_interface/arrow_schema.hpp b/include/sparrow_ipc/arrow_interface/arrow_schema.hpp index f41fc26..52b8f86 100644 --- a/include/sparrow_ipc/arrow_interface/arrow_schema.hpp +++ b/include/sparrow_ipc/arrow_interface/arrow_schema.hpp @@ -11,10 +11,10 @@ namespace sparrow_ipc { - void release_arrow_schema(ArrowSchema* schema); + void release_non_owning_arrow_schema(ArrowSchema* schema); template > - void fill_arrow_schema( + void fill_non_owning_arrow_schema( ArrowSchema& schema, std::string_view format, const char* name, @@ -41,19 +41,19 @@ namespace sparrow_ipc ) : std::nullopt; - schema.private_data = new arrow_schema_private_data(format, name, std::move(metadata_str)); + schema.private_data = new non_owning_arrow_schema_private_data(format, name, std::move(metadata_str)); - const auto private_data = static_cast(schema.private_data); + const auto private_data = static_cast(schema.private_data); schema.format = private_data->format_ptr(); schema.name = private_data->name_ptr(); schema.metadata = private_data->metadata_ptr(); schema.children = children; schema.dictionary = dictionary; - schema.release = release_arrow_schema; + schema.release = release_non_owning_arrow_schema; } template > - [[nodiscard]] ArrowSchema make_arrow_schema( + [[nodiscard]] ArrowSchema make_non_owning_arrow_schema( std::string_view format, const char* name, std::optional metadata, @@ -64,7 +64,7 @@ namespace sparrow_ipc ) { ArrowSchema schema{}; - fill_arrow_schema(schema, format, name, metadata, flags, children_count, children, dictionary); + fill_non_owning_arrow_schema(schema, format, name, metadata, flags, children_count, children, dictionary); return schema; } } \ No newline at end of file diff --git a/include/sparrow_ipc/arrow_interface/arrow_schema/private_data.hpp b/include/sparrow_ipc/arrow_interface/arrow_schema/private_data.hpp index edd0412..0622e07 100644 --- 
a/include/sparrow_ipc/arrow_interface/arrow_schema/private_data.hpp +++ b/include/sparrow_ipc/arrow_interface/arrow_schema/private_data.hpp @@ -6,11 +6,15 @@ namespace sparrow_ipc { - class arrow_schema_private_data + class non_owning_arrow_schema_private_data { public: - arrow_schema_private_data(std::string_view format, const char* name, std::optional metadata); + non_owning_arrow_schema_private_data( + std::string_view format, + const char* name, + std::optional metadata + ); [[nodiscard]] const char* format_ptr() const noexcept; [[nodiscard]] const char* name_ptr() const noexcept; diff --git a/include/sparrow_ipc/deserialize_fixedsizebinary_array.hpp b/include/sparrow_ipc/deserialize_fixedsizebinary_array.hpp index b6f02c7..00ad290 100644 --- a/include/sparrow_ipc/deserialize_fixedsizebinary_array.hpp +++ b/include/sparrow_ipc/deserialize_fixedsizebinary_array.hpp @@ -6,11 +6,10 @@ #include "Message_generated.h" #include "sparrow_ipc/arrow_interface/arrow_array.hpp" #include "sparrow_ipc/arrow_interface/arrow_schema.hpp" - +#include "sparrow_ipc/deserialize_utils.hpp" namespace sparrow_ipc { - [[nodiscard]] sparrow::fixed_width_binary_array deserialize_fixedwidthbinary( const org::apache::arrow::flatbuf::RecordBatch& record_batch, std::span body, @@ -21,27 +20,32 @@ namespace sparrow_ipc ) { const std::string format = "w:" + std::to_string(byte_width); - ArrowSchema schema = make_arrow_schema(format, name.data(), metadata, std::nullopt, 0, nullptr, nullptr); - - const auto bitmap_buffer_metadata = record_batch.buffers()->Get(buffer_index++); - - uint8_t* bitmap_ptr = nullptr; - int64_t null_count = 0; - - // Check if validity buffer is present (length > 0 for nullable fields) - if (bitmap_buffer_metadata->length() > 0) { - bitmap_ptr = const_cast(body.data() + bitmap_buffer_metadata->offset()); - const sparrow::dynamic_bitset_view bitmap_view{bitmap_ptr, static_cast(record_batch.length())}; - null_count = bitmap_view.null_count(); - } - + ArrowSchema schema = 
make_non_owning_arrow_schema( + format, + name.data(), + metadata, + std::nullopt, + 0, + nullptr, + nullptr + ); + const auto [bitmap_ptr, null_count] = utils::get_bitmap_pointer_and_null_count( + record_batch, + body, + buffer_index++ + ); const auto buffer_metadata = record_batch.buffers()->Get(buffer_index++); auto buffer_ptr = const_cast(body.data() + buffer_metadata->offset()); - std::vector buffers = {bitmap_ptr, buffer_ptr}; - - ArrowArray array = make_arrow_array(record_batch.length(), null_count, 0, std::move(buffers), 0, nullptr, nullptr); - + ArrowArray array = make_non_owning_arrow_array( + record_batch.length(), + null_count, + 0, + std::move(buffers), + 0, + nullptr, + nullptr + ); sparrow::arrow_proxy ap{std::move(array), std::move(schema)}; return sparrow::fixed_width_binary_array{std::move(ap)}; } diff --git a/include/sparrow_ipc/deserialize_primitive_array.hpp b/include/sparrow_ipc/deserialize_primitive_array.hpp index 751feb0..2ae0083 100644 --- a/include/sparrow_ipc/deserialize_primitive_array.hpp +++ b/include/sparrow_ipc/deserialize_primitive_array.hpp @@ -9,6 +9,7 @@ #include "Message_generated.h" #include "sparrow_ipc/arrow_interface/arrow_array.hpp" #include "sparrow_ipc/arrow_interface/arrow_schema.hpp" +#include "sparrow_ipc/deserialize_utils.hpp" namespace sparrow_ipc { @@ -24,25 +25,32 @@ namespace sparrow_ipc const std::string_view format = data_type_to_format( sparrow::detail::get_data_type_from_array>::get() ); - ArrowSchema schema = make_arrow_schema(format, name.data(), metadata, std::nullopt, 0, nullptr, nullptr); - - const auto bitmap_buffer_metadata = record_batch.buffers()->Get(buffer_index++); - uint8_t* bitmap_ptr = nullptr; - int64_t null_count = 0; - - // Check if validity buffer is present (length > 0 for nullable fields) - if (bitmap_buffer_metadata->length() > 0) { - bitmap_ptr = const_cast(body.data() + bitmap_buffer_metadata->offset()); - const sparrow::dynamic_bitset_view bitmap_view{bitmap_ptr, 
static_cast(record_batch.length())}; - null_count = bitmap_view.null_count(); - } - + ArrowSchema schema = make_non_owning_arrow_schema( + format, + name.data(), + metadata, + std::nullopt, + 0, + nullptr, + nullptr + ); + const auto [bitmap_ptr, null_count] = utils::get_bitmap_pointer_and_null_count( + record_batch, + body, + buffer_index++ + ); const auto primitive_buffer_metadata = record_batch.buffers()->Get(buffer_index++); auto primitives_ptr = const_cast(body.data() + primitive_buffer_metadata->offset()); - std::vector buffers = {bitmap_ptr, primitives_ptr}; - ArrowArray array = make_arrow_array(record_batch.length(), null_count, 0, std::move(buffers), 0, nullptr, nullptr); - + ArrowArray array = make_non_owning_arrow_array( + record_batch.length(), + null_count, + 0, + std::move(buffers), + 0, + nullptr, + nullptr + ); sparrow::arrow_proxy ap{std::move(array), std::move(schema)}; return sparrow::primitive_array{std::move(ap)}; } diff --git a/include/sparrow_ipc/deserialize_utils.hpp b/include/sparrow_ipc/deserialize_utils.hpp index b43f5d7..4a901ad 100644 --- a/include/sparrow_ipc/deserialize_utils.hpp +++ b/include/sparrow_ipc/deserialize_utils.hpp @@ -4,11 +4,12 @@ #include #include +#include #include "Message_generated.h" #include "Schema_generated.h" -namespace sparrow_ipc +namespace sparrow_ipc::utils { template [[nodiscard]] sparrow::u8_buffer message_buffer_to_u8buffer( @@ -29,4 +30,10 @@ namespace sparrow_ipc std::span body, size_t index ); + + [[nodiscard]] std::pair get_bitmap_pointer_and_null_count( + const org::apache::arrow::flatbuf::RecordBatch& record_batch, + std::span body, + size_t index + ); } \ No newline at end of file diff --git a/include/sparrow_ipc/deserialize_variable_size_binary_array.hpp b/include/sparrow_ipc/deserialize_variable_size_binary_array.hpp index c111ec9..cafdebe 100644 --- a/include/sparrow_ipc/deserialize_variable_size_binary_array.hpp +++ b/include/sparrow_ipc/deserialize_variable_size_binary_array.hpp @@ -8,6 +8,7 
@@ #include "Message_generated.h" #include "sparrow_ipc/arrow_interface/arrow_array.hpp" #include "sparrow_ipc/arrow_interface/arrow_schema.hpp" +#include "sparrow_ipc/deserialize_utils.hpp" namespace sparrow_ipc { @@ -20,31 +21,35 @@ namespace sparrow_ipc size_t& buffer_index ) { - const std::string_view format = data_type_to_format( - sparrow::detail::get_data_type_from_array::get() + const std::string_view format = data_type_to_format(sparrow::detail::get_data_type_from_array::get()); + ArrowSchema schema = make_non_owning_arrow_schema( + format, + name.data(), + metadata, + std::nullopt, + 0, + nullptr, + nullptr + ); + const auto [bitmap_ptr, null_count] = utils::get_bitmap_pointer_and_null_count( + record_batch, + body, + buffer_index++ ); - ArrowSchema schema = make_arrow_schema(format, name.data(), metadata, std::nullopt, 0, nullptr, nullptr); - - const auto bitmap_buffer_metadata = record_batch.buffers()->Get(buffer_index++); - uint8_t* bitmap_ptr = nullptr; - int64_t null_count = 0; - - // Check if validity buffer is present (length > 0 for nullable fields) - if (bitmap_buffer_metadata->length() > 0) { - bitmap_ptr = const_cast(body.data() + bitmap_buffer_metadata->offset()); - const sparrow::dynamic_bitset_view bitmap_view{bitmap_ptr, static_cast(record_batch.length())}; - null_count = bitmap_view.null_count(); - } - const auto offset_metadata = record_batch.buffers()->Get(buffer_index++); auto offset_ptr = const_cast(body.data() + offset_metadata->offset()); - const auto buffer_metadata = record_batch.buffers()->Get(buffer_index++); auto buffer_ptr = const_cast(body.data() + buffer_metadata->offset()); - std::vector buffers = {bitmap_ptr, offset_ptr, buffer_ptr}; - ArrowArray array = make_arrow_array(record_batch.length(), null_count, 0, std::move(buffers), 0, nullptr, nullptr); - + ArrowArray array = make_non_owning_arrow_array( + record_batch.length(), + null_count, + 0, + std::move(buffers), + 0, + nullptr, + nullptr + ); sparrow::arrow_proxy 
ap{std::move(array), std::move(schema)}; return T{std::move(ap)}; } diff --git a/include/sparrow_ipc/magic_values.hpp b/include/sparrow_ipc/magic_values.hpp index 94b4cc2..e7f8fc6 100644 --- a/include/sparrow_ipc/magic_values.hpp +++ b/include/sparrow_ipc/magic_values.hpp @@ -15,14 +15,9 @@ namespace sparrow_ipc { return std::ranges::equal(buf, continuation); } - - [[nodiscard]] bool is_continuation(std::istream& stream); - template [[nodiscard]] bool is_end_of_stream(const R& buf) { return std::ranges::equal(buf, end_of_stream); } - - [[nodiscard]] bool is_end_of_stream(std::istream& stream); } \ No newline at end of file diff --git a/include/sparrow_ipc/utils.hpp b/include/sparrow_ipc/utils.hpp index c83c60c..44900b2 100644 --- a/include/sparrow_ipc/utils.hpp +++ b/include/sparrow_ipc/utils.hpp @@ -17,5 +17,4 @@ namespace sparrow_ipc::utils // This function maps a sparrow data type to the corresponding Flatbuffers type SPARROW_IPC_API std::pair> get_flatbuffer_type(flatbuffers::FlatBufferBuilder& builder, std::string_view format_str); - } diff --git a/src/arrow_interface/arrow_array.cpp b/src/arrow_interface/arrow_array.cpp index fa6a599..74aadcc 100644 --- a/src/arrow_interface/arrow_array.cpp +++ b/src/arrow_interface/arrow_array.cpp @@ -10,22 +10,22 @@ namespace sparrow_ipc { - void release_arrow_array(ArrowArray* array) + void release_non_owning_arrow_array(ArrowArray* array) { SPARROW_ASSERT_FALSE(array == nullptr) - SPARROW_ASSERT_TRUE(array->release == std::addressof(release_arrow_array)) + SPARROW_ASSERT_TRUE(array->release == std::addressof(release_non_owning_arrow_array)) - release_common_arrow(*array); + release_common_non_owning_arrow(*array); if (array->private_data != nullptr) { - const auto private_data = static_cast(array->private_data); + const auto private_data = static_cast(array->private_data); delete private_data; array->private_data = nullptr; } array->buffers = nullptr; // The buffers were deleted with the private data } - void 
fill_arrow_array( + void fill_non_owning_arrow_array( ArrowArray& array, int64_t length, int64_t null_count, @@ -44,16 +44,16 @@ namespace sparrow_ipc array.null_count = null_count; array.offset = offset; array.n_buffers = static_cast(buffers.size()); - array.private_data = new arrow_array_private_data(std::move(buffers)); - const auto private_data = static_cast(array.private_data); + array.private_data = new non_owning_arrow_array_private_data(std::move(buffers)); + const auto private_data = static_cast(array.private_data); array.buffers = private_data->buffers_ptrs(); array.n_children = static_cast(children_count); array.children = children; array.dictionary = dictionary; - array.release = release_arrow_array; + array.release = release_non_owning_arrow_array; } - ArrowArray make_arrow_array( + ArrowArray make_non_owning_arrow_array( int64_t length, int64_t null_count, int64_t offset, @@ -64,8 +64,16 @@ namespace sparrow_ipc ) { ArrowArray array{}; - fill_arrow_array(array, length, null_count, offset, std::move(buffers), children_count, children, dictionary); + fill_non_owning_arrow_array( + array, + length, + null_count, + offset, + std::move(buffers), + children_count, + children, + dictionary + ); return array; } - } diff --git a/src/arrow_interface/arrow_array/private_data.cpp b/src/arrow_interface/arrow_array/private_data.cpp index fac1ced..b133c8e 100644 --- a/src/arrow_interface/arrow_array/private_data.cpp +++ b/src/arrow_interface/arrow_array/private_data.cpp @@ -2,7 +2,7 @@ namespace sparrow_ipc { - const void** arrow_array_private_data::buffers_ptrs() noexcept + const void** non_owning_arrow_array_private_data::buffers_ptrs() noexcept { return const_cast(reinterpret_cast(m_buffers_pointers.data())); } diff --git a/src/arrow_interface/arrow_schema.cpp b/src/arrow_interface/arrow_schema.cpp index 522dd9e..e3af5b8 100644 --- a/src/arrow_interface/arrow_schema.cpp +++ b/src/arrow_interface/arrow_schema.cpp @@ -4,14 +4,14 @@ namespace sparrow_ipc { - void 
release_arrow_schema(ArrowSchema* schema) + void release_non_owning_arrow_schema(ArrowSchema* schema) { SPARROW_ASSERT_FALSE(schema == nullptr); - SPARROW_ASSERT_TRUE(schema->release == std::addressof(release_arrow_schema)); - release_common_arrow(*schema); + SPARROW_ASSERT_TRUE(schema->release == std::addressof(release_non_owning_arrow_schema)); + release_common_non_owning_arrow(*schema); if (schema->private_data != nullptr) { - const auto private_data = static_cast(schema->private_data); + const auto private_data = static_cast(schema->private_data); delete private_data; schema->private_data = nullptr; } diff --git a/src/arrow_interface/arrow_schema/private_data.cpp b/src/arrow_interface/arrow_schema/private_data.cpp index 9534e6d..969742b 100644 --- a/src/arrow_interface/arrow_schema/private_data.cpp +++ b/src/arrow_interface/arrow_schema/private_data.cpp @@ -2,7 +2,7 @@ namespace sparrow_ipc { - arrow_schema_private_data::arrow_schema_private_data( + non_owning_arrow_schema_private_data::non_owning_arrow_schema_private_data( std::string_view format, const char* name, std::optional metadata @@ -13,17 +13,17 @@ namespace sparrow_ipc { } - const char* arrow_schema_private_data::format_ptr() const noexcept + const char* non_owning_arrow_schema_private_data::format_ptr() const noexcept { return m_format.data(); } - const char* arrow_schema_private_data::name_ptr() const noexcept + const char* non_owning_arrow_schema_private_data::name_ptr() const noexcept { return m_name; } - const char* arrow_schema_private_data::metadata_ptr() const noexcept + const char* non_owning_arrow_schema_private_data::metadata_ptr() const noexcept { return m_metadata.has_value() ? 
m_metadata->c_str() : nullptr; } diff --git a/src/deserialize_utils.cpp b/src/deserialize_utils.cpp index f1e5b3b..c9c1a03 100644 --- a/src/deserialize_utils.cpp +++ b/src/deserialize_utils.cpp @@ -1,6 +1,6 @@ #include "sparrow_ipc/deserialize_utils.hpp" -namespace sparrow_ipc +namespace sparrow_ipc::utils { const sparrow::dynamic_bitset_view message_buffer_to_validity_bitmap( const org::apache::arrow::flatbuf::RecordBatch* record_batch, @@ -14,4 +14,24 @@ namespace sparrow_ipc static_cast(buffer_metadata->length()) }; } + + std::pair get_bitmap_pointer_and_null_count( + const org::apache::arrow::flatbuf::RecordBatch& record_batch, + std::span body, + size_t index + ) + { + const auto bitmap_metadata = record_batch.buffers()->Get(index); + if (bitmap_metadata->length() == 0) + { + return {nullptr, 0}; + } + + auto ptr = const_cast(body.data() + bitmap_metadata->offset()); + const sparrow::dynamic_bitset_view bitmap_view{ + ptr, + static_cast(record_batch.length()) + }; + return {ptr, bitmap_view.null_count()}; + } } \ No newline at end of file diff --git a/src/encapsulated_message.cpp b/src/encapsulated_message.cpp index e76fac3..4e1bbbc 100644 --- a/src/encapsulated_message.cpp +++ b/src/encapsulated_message.cpp @@ -3,6 +3,7 @@ #include #include "sparrow_ipc/magic_values.hpp" +#include "sparrow_ipc/utils.hpp" namespace sparrow_ipc { @@ -73,8 +74,8 @@ namespace sparrow_ipc std::span EncapsulatedMessage::body() const { const size_t offset = sizeof(uint32_t) * 2 // 4 bytes continuation + 4 bytes metadata size - + metadata_length(); - const size_t padded_offset = (offset + 7) & ~7; // Round up to 8-byte boundary + + metadata_length(); + const size_t padded_offset = utils::align_to_8(offset); // Round up to 8-byte boundary const uint8_t* body_ptr = m_buf_ptr + padded_offset; return {body_ptr, body_length()}; } @@ -82,8 +83,8 @@ namespace sparrow_ipc size_t EncapsulatedMessage::total_length() const { const size_t offset = sizeof(uint32_t) * 2 // 4 bytes continuation + 4 
bytes metadata size - + metadata_length(); - const size_t padded_offset = (offset + 7) & ~7; // Round up to 8-byte boundary + + metadata_length(); + const size_t padded_offset = utils::align_to_8(offset); // Round up to 8-byte boundary return padded_offset + body_length(); } diff --git a/src/magic_values.cpp b/src/magic_values.cpp deleted file mode 100644 index 154021f..0000000 --- a/src/magic_values.cpp +++ /dev/null @@ -1,19 +0,0 @@ -#include "sparrow_ipc/magic_values.hpp" - -namespace sparrow_ipc -{ - bool is_continuation(std::istream& stream) - { - std::array buf; - stream.read(reinterpret_cast(buf.data()), 4); - if (stream.gcount() < 4) - { - if (stream.eof()) - { - return false; // End of file reached, not a continuation - } - throw std::runtime_error("Failed to read enough bytes from stream."); - } - return is_continuation(buf); - } -} \ No newline at end of file diff --git a/tests/test_arrow_schema.cpp b/tests/test_arrow_schema.cpp index 01ed08f..7ef09dc 100644 --- a/tests/test_arrow_schema.cpp +++ b/tests/test_arrow_schema.cpp @@ -66,7 +66,7 @@ TEST_SUITE("C Data Interface") { TEST_CASE("ArrowSchema") { - SUBCASE("make_schema_constructor") + SUBCASE("make_non_owning_arrow_schema") { ArrowSchema** children = new ArrowSchema*[2]; children[0] = new ArrowSchema(); @@ -79,7 +79,7 @@ TEST_SUITE("C Data Interface") dictionnary->name = "dictionary"; const std::string format = "format"; const std::string name = "name"; - auto schema = sparrow_ipc::make_arrow_schema( + auto schema = sparrow_ipc::make_non_owning_arrow_schema( format.data(), name.data(), sparrow_ipc::metadata_sample_opt, @@ -102,15 +102,16 @@ TEST_SUITE("C Data Interface") CHECK_EQ(schema.children[0], children_1_ptr); CHECK_EQ(schema.children[1], children_2_ptr); CHECK_EQ(schema.dictionary, dictionnary); - const bool is_release_arrow_schema = schema.release == &sparrow_ipc::release_arrow_schema; + const bool is_release_arrow_schema = schema.release + == &sparrow_ipc::release_non_owning_arrow_schema; 
CHECK(is_release_arrow_schema); CHECK_NE(schema.private_data, nullptr); schema.release(&schema); } - SUBCASE("make_schema_constructor no children, no dictionary, no name and metadata") + SUBCASE("make_non_owning_arrow_schema no children, no dictionary, no name and metadata") { - auto schema = sparrow_ipc::make_arrow_schema( + auto schema = sparrow_ipc::make_non_owning_arrow_schema( "format", nullptr, std::optional>{}, @@ -129,7 +130,8 @@ TEST_SUITE("C Data Interface") CHECK_EQ(schema.n_children, 0); CHECK_EQ(schema.children, nullptr); CHECK_EQ(schema.dictionary, nullptr); - const bool is_release_arrow_schema = schema.release == &sparrow_ipc::release_arrow_schema; + const bool is_release_arrow_schema = schema.release + == &sparrow_ipc::release_non_owning_arrow_schema; CHECK(is_release_arrow_schema); CHECK_NE(schema.private_data, nullptr); schema.release(&schema); @@ -141,7 +143,7 @@ TEST_SUITE("C Data Interface") children[0] = new ArrowSchema(); children[1] = new ArrowSchema(); - auto schema = sparrow_ipc::make_arrow_schema( + auto schema = sparrow_ipc::make_non_owning_arrow_schema( "format", "name", sparrow_ipc::metadata_sample_opt, @@ -165,7 +167,7 @@ TEST_SUITE("C Data Interface") SUBCASE("ArrowSchema release no children, no dictionary, no name and metadata") { - auto schema = sparrow_ipc::make_arrow_schema( + auto schema = sparrow_ipc::make_non_owning_arrow_schema( "format", nullptr, std::optional>{}, @@ -191,7 +193,7 @@ TEST_SUITE("C Data Interface") { auto children = new ArrowSchema*[2]; children[0] = new ArrowSchema(); - *children[0] = sparrow_ipc::make_arrow_schema( + *children[0] = sparrow_ipc::make_non_owning_arrow_schema( "format", "child1", sparrow_ipc::metadata_sample_opt, @@ -201,7 +203,7 @@ TEST_SUITE("C Data Interface") nullptr ); children[1] = new ArrowSchema(); - *children[1] = sparrow_ipc::make_arrow_schema( + *children[1] = sparrow_ipc::make_non_owning_arrow_schema( "format", "child2", sparrow_ipc::metadata_sample_opt, @@ -212,7 +214,7 @@ 
TEST_SUITE("C Data Interface") ); auto dictionary = new ArrowSchema(); - *dictionary = sparrow_ipc::make_arrow_schema( + *dictionary = sparrow_ipc::make_non_owning_arrow_schema( "format", "dictionary", sparrow_ipc::metadata_sample_opt, @@ -221,12 +223,12 @@ TEST_SUITE("C Data Interface") nullptr, nullptr ); - auto schema = sparrow_ipc::make_arrow_schema( + auto schema = sparrow_ipc::make_non_owning_arrow_schema( "format", "name", sparrow_ipc::metadata_sample_opt, std::unordered_set{sparrow::ArrowFlag::DICTIONARY_ORDERED}, - 0, + 2, children, dictionary ); From 9fd39b84e1e66560772f09f6708bacae3b2f5247 Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Thu, 4 Sep 2025 15:46:38 +0200 Subject: [PATCH 05/22] Fix osx build --- include/sparrow_ipc/deserialize.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/sparrow_ipc/deserialize.hpp b/include/sparrow_ipc/deserialize.hpp index 3784bb4..83a0c39 100644 --- a/include/sparrow_ipc/deserialize.hpp +++ b/include/sparrow_ipc/deserialize.hpp @@ -10,7 +10,6 @@ #include "sparrow_ipc/encapsulated_message.hpp" #include "SparseTensor_generated.h" - namespace sparrow_ipc { SPARROW_IPC_API void deserialize_schema_message( @@ -19,8 +18,8 @@ namespace sparrow_ipc std::optional& name, std::optional>& metadata ); - SPARROW_IPC_API [[nodiscard]] const org::apache::arrow::flatbuf::RecordBatch* + [[nodiscard]] SPARROW_IPC_API const org::apache::arrow::flatbuf::RecordBatch* deserialize_record_batch_message(const uint8_t* buf_ptr, size_t& current_offset); - SPARROW_IPC_API [[nodiscard]] std::vector deserialize_stream(const uint8_t* buf_ptr); + [[nodiscard]] SPARROW_IPC_API std::vector deserialize_stream(const uint8_t* buf_ptr); } \ No newline at end of file From 5b779ba3a35b20e0b706a4d2aa005a40ff366b43 Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Thu, 4 Sep 2025 15:57:20 +0200 Subject: [PATCH 06/22] fix compilation --- include/sparrow_ipc/arrow_interface/arrow_array/private_data.hpp | 1 + 
.../sparrow_ipc/arrow_interface/arrow_schema/private_data.hpp | 1 + 2 files changed, 2 insertions(+) diff --git a/include/sparrow_ipc/arrow_interface/arrow_array/private_data.hpp b/include/sparrow_ipc/arrow_interface/arrow_array/private_data.hpp index 536170f..7c866aa 100644 --- a/include/sparrow_ipc/arrow_interface/arrow_array/private_data.hpp +++ b/include/sparrow_ipc/arrow_interface/arrow_array/private_data.hpp @@ -1,5 +1,6 @@ #pragma once +#include #include namespace sparrow_ipc diff --git a/include/sparrow_ipc/arrow_interface/arrow_schema/private_data.hpp b/include/sparrow_ipc/arrow_interface/arrow_schema/private_data.hpp index 0622e07..29de88c 100644 --- a/include/sparrow_ipc/arrow_interface/arrow_schema/private_data.hpp +++ b/include/sparrow_ipc/arrow_interface/arrow_schema/private_data.hpp @@ -1,6 +1,7 @@ #pragma once +#include #include #include From 9c32fc04dc0d9ebe5a6ea2b030e100c5e7560178 Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Fri, 5 Sep 2025 10:28:43 +0200 Subject: [PATCH 07/22] wip --- .../arrow_interface/arrow_array.hpp | 8 +- .../arrow_interface/arrow_schema.hpp | 5 +- .../deserialize_primitive_array.hpp | 2 +- src/deserialize.cpp | 405 ++++++++---------- 4 files changed, 176 insertions(+), 244 deletions(-) diff --git a/include/sparrow_ipc/arrow_interface/arrow_array.hpp b/include/sparrow_ipc/arrow_interface/arrow_array.hpp index 2517e1d..2f1f72d 100644 --- a/include/sparrow_ipc/arrow_interface/arrow_array.hpp +++ b/include/sparrow_ipc/arrow_interface/arrow_array.hpp @@ -5,9 +5,11 @@ #include +#include "sparrow_ipc/config/config.hpp" + namespace sparrow_ipc { - [[nodiscard]] ArrowArray make_non_owning_arrow_array( + [[nodiscard]] SPARROW_IPC_API ArrowArray make_non_owning_arrow_array( int64_t length, int64_t null_count, int64_t offset, @@ -17,9 +19,9 @@ namespace sparrow_ipc ArrowArray* dictionary ); - void release_non_owning_arrow_array(ArrowArray* array); + SPARROW_IPC_API void release_non_owning_arrow_array(ArrowArray* array); - void 
fill_non_owning_arrow_array( + SPARROW_IPC_API void fill_non_owning_arrow_array( ArrowArray& array, int64_t length, int64_t null_count, diff --git a/include/sparrow_ipc/arrow_interface/arrow_schema.hpp b/include/sparrow_ipc/arrow_interface/arrow_schema.hpp index 52b8f86..099aa86 100644 --- a/include/sparrow_ipc/arrow_interface/arrow_schema.hpp +++ b/include/sparrow_ipc/arrow_interface/arrow_schema.hpp @@ -7,11 +7,12 @@ #include #include -#include "arrow_schema/private_data.hpp" +#include "sparrow_ipc/arrow_interface/arrow_schema/private_data.hpp" +#include "sparrow_ipc/config/config.hpp" namespace sparrow_ipc { - void release_non_owning_arrow_schema(ArrowSchema* schema); + SPARROW_IPC_API void release_non_owning_arrow_schema(ArrowSchema* schema); template > void fill_non_owning_arrow_schema( diff --git a/include/sparrow_ipc/deserialize_primitive_array.hpp b/include/sparrow_ipc/deserialize_primitive_array.hpp index 2ae0083..b70f6a8 100644 --- a/include/sparrow_ipc/deserialize_primitive_array.hpp +++ b/include/sparrow_ipc/deserialize_primitive_array.hpp @@ -14,7 +14,7 @@ namespace sparrow_ipc { template - [[nodiscard]] sparrow::primitive_array deserialize_primitive_array_bis( + [[nodiscard]] sparrow::primitive_array deserialize_non_owning_primitive_array( const org::apache::arrow::flatbuf::RecordBatch& record_batch, std::span body, std::string_view name, diff --git a/src/deserialize.cpp b/src/deserialize.cpp index 7b29f3b..f8e312f 100644 --- a/src/deserialize.cpp +++ b/src/deserialize.cpp @@ -1,8 +1,10 @@ -#include +#include "sparrow_ipc/deserialize.hpp" + +#include -#include "sparrow_ipc/deserialize_variable_size_binary_array.hpp" #include "sparrow_ipc/deserialize_fixedsizebinary_array.hpp" #include "sparrow_ipc/deserialize_primitive_array.hpp" +#include "sparrow_ipc/deserialize_variable_size_binary_array.hpp" #include "sparrow_ipc/magic_values.hpp" #include "sparrow_ipc/metadata.hpp" @@ -65,6 +67,164 @@ namespace sparrow_ipc return 
static_cast(batch_message->header()); } + std::vector get_arrays_from_record_batch( + const org::apache::arrow::flatbuf::RecordBatch& record_batch, + const org::apache::arrow::flatbuf::Schema& schema, + const EncapsulatedMessage& encapsulated_message + ) + { + const size_t length = static_cast(record_batch.length()); + size_t buffer_index = 0; + + std::vector arrays; + arrays.reserve(schema.fields()->size()); + + for (const auto field : *(schema.fields())) + { + const ::flatbuffers::Vector<::flatbuffers::Offset>* + fb_custom_metadata = field->custom_metadata(); + const std::optional> + metadata = fb_custom_metadata == nullptr + ? std::nullopt + : std::make_optional(to_sparrow_metadata(*fb_custom_metadata)); + const auto name = field->name()->string_view(); + const auto field_type = field->type_type(); + const auto deserialize_non_owning_primitive_array_lambda = [&]() + { + return deserialize_non_owning_primitive_array( + record_batch, + encapsulated_message.body(), + name, + metadata, + buffer_index + ); + }; + switch (field_type) + { + case org::apache::arrow::flatbuf::Type::Bool: + arrays.emplace_back( + deserialize_non_owning_primitive_array_lambda.template operator()() + ); + break; + case org::apache::arrow::flatbuf::Type::Int: + { + const auto int_type = field->type_as_Int(); + const auto bit_width = int_type->bitWidth(); + const bool is_signed = int_type->is_signed(); + + if (is_signed) + { + switch (bit_width) + { + // clang-format off + case 8: arrays.emplace_back(deserialize_non_owning_primitive_array_lambda.template operator()()); break; + case 16: arrays.emplace_back(deserialize_non_owning_primitive_array_lambda.template operator()()); break; + case 32: arrays.emplace_back(deserialize_non_owning_primitive_array_lambda.template operator()()); break; + case 64: arrays.emplace_back(deserialize_non_owning_primitive_array_lambda.template operator()()); break; + default: throw std::runtime_error("Unsupported integer bit width."); + // clang-format on + } + } 
+ else + { + switch (bit_width) + { + // clang-format off + case 8: arrays.emplace_back(deserialize_non_owning_primitive_array_lambda.template operator()()); break; + case 16: arrays.emplace_back(deserialize_non_owning_primitive_array_lambda.template operator()()); break; + case 32: arrays.emplace_back(deserialize_non_owning_primitive_array_lambda.template operator()()); break; + case 64: arrays.emplace_back(deserialize_non_owning_primitive_array_lambda.template operator()()); break; + default: throw std::runtime_error("Unsupported integer bit width."); + // clang-format on + } + } + } + break; + case org::apache::arrow::flatbuf::Type::FloatingPoint: + { + const auto float_type = field->type_as_FloatingPoint(); + switch (float_type->precision()) + { + // clang-format off + case org::apache::arrow::flatbuf::Precision::HALF: + arrays.emplace_back(deserialize_non_owning_primitive_array_lambda.template operator()()); + break; + case org::apache::arrow::flatbuf::Precision::SINGLE: + arrays.emplace_back(deserialize_non_owning_primitive_array_lambda.template operator()()); + break; + case org::apache::arrow::flatbuf::Precision::DOUBLE: + arrays.emplace_back(deserialize_non_owning_primitive_array_lambda.template operator()()); + break; + default: + throw std::runtime_error("Unsupported floating point precision."); + // clang-format on + } + break; + } + case org::apache::arrow::flatbuf::Type::FixedSizeBinary: + { + const auto fixed_size_binary_field = field->type_as_FixedSizeBinary(); + arrays.emplace_back(deserialize_fixedwidthbinary( + record_batch, + encapsulated_message.body(), + name, + metadata, + buffer_index, + fixed_size_binary_field->byteWidth() + )); + break; + } + case org::apache::arrow::flatbuf::Type::Binary: + arrays.emplace_back( + deserialize_variable_size_binary( + record_batch, + encapsulated_message.body(), + name, + metadata, + buffer_index + ) + ); + break; + case org::apache::arrow::flatbuf::Type::LargeBinary: + arrays.emplace_back( + 
deserialize_variable_size_binary( + record_batch, + encapsulated_message.body(), + name, + metadata, + buffer_index + ) + ); + break; + case org::apache::arrow::flatbuf::Type::Utf8: + arrays.emplace_back( + deserialize_variable_size_binary( + record_batch, + encapsulated_message.body(), + name, + metadata, + buffer_index + ) + ); + break; + case org::apache::arrow::flatbuf::Type::LargeUtf8: + arrays.emplace_back( + deserialize_variable_size_binary( + record_batch, + encapsulated_message.body(), + name, + metadata, + buffer_index + ) + ); + break; + default: + throw std::runtime_error("Unsupported type."); + } + } + return arrays; + } + std::vector deserialize_stream(const uint8_t* buf_ptr) { const org::apache::arrow::flatbuf::Schema* schema = nullptr; @@ -94,7 +254,6 @@ namespace sparrow_ipc break; case org::apache::arrow::flatbuf::MessageHeader::RecordBatch: { - const auto lol = message->header_type(); if (schema == nullptr) { throw std::runtime_error("Schema message is missing."); @@ -104,240 +263,11 @@ namespace sparrow_ipc { throw std::runtime_error("RecordBatch message is missing."); } - const size_t length = static_cast(record_batch->length()); - size_t buffer_index = 0; - - std::vector arrays; - arrays.reserve(schema->fields()->size()); - - for (const auto field : *(schema->fields())) - { - const ::flatbuffers::Vector<::flatbuffers::Offset>* - fb_custom_metadata = field->custom_metadata(); - const std::optional> - metadata = fb_custom_metadata == nullptr - ? 
std::nullopt - : std::make_optional(to_sparrow_metadata(*fb_custom_metadata)); - const auto name = field->name()->string_view(); - const auto field_type = field->type_type(); - switch (field_type) - { - case org::apache::arrow::flatbuf::Type::Bool: - arrays.emplace_back( - deserialize_primitive_array_bis( - *record_batch, - encapsulated_message.body(), - name, - metadata, - buffer_index - ) - ); - break; - case org::apache::arrow::flatbuf::Type::Int: - { - const auto int_type = field->type_as_Int(); - - if (int_type->is_signed()) - { - switch (int_type->bitWidth()) - { - case 8: - arrays.emplace_back( - deserialize_primitive_array_bis( - *record_batch, - encapsulated_message.body(), - name, - metadata, - buffer_index - ) - ); - break; - case 16: - arrays.emplace_back( - deserialize_primitive_array_bis( - *record_batch, - encapsulated_message.body(), - name, - metadata, - buffer_index - ) - ); - break; - case 32: - arrays.emplace_back( - deserialize_primitive_array_bis( - *record_batch, - encapsulated_message.body(), - name, - metadata, - buffer_index - ) - ); - break; - case 64: - arrays.emplace_back( - deserialize_primitive_array_bis( - *record_batch, - encapsulated_message.body(), - name, - metadata, - buffer_index - ) - ); - break; - default: - throw std::runtime_error("Unsupported integer bit width."); - } - } - else - { - switch (int_type->bitWidth()) - { - case 8: - arrays.emplace_back( - deserialize_primitive_array_bis( - *record_batch, - encapsulated_message.body(), - name, - metadata, - buffer_index - ) - ); - break; - case 16: - arrays.emplace_back( - deserialize_primitive_array_bis( - *record_batch, - encapsulated_message.body(), - name, - metadata, - buffer_index - ) - ); - break; - case 32: - arrays.emplace_back( - deserialize_primitive_array_bis( - *record_batch, - encapsulated_message.body(), - name, - metadata, - buffer_index - ) - ); - break; - case 64: - arrays.emplace_back( - deserialize_primitive_array_bis( - *record_batch, - 
encapsulated_message.body(), - name, - metadata, - buffer_index - ) - ); - break; - default: - throw std::runtime_error("Unsupported integer bit width."); - } - } - } - break; - case org::apache::arrow::flatbuf::Type::FloatingPoint: - { - const auto float_type = field->type_as_FloatingPoint(); - switch (float_type->precision()) - { - case org::apache::arrow::flatbuf::Precision::HALF: - arrays.emplace_back( - deserialize_primitive_array_bis( - *record_batch, - encapsulated_message.body(), - name, - metadata, - buffer_index - ) - ); - break; - case org::apache::arrow::flatbuf::Precision::SINGLE: - arrays.emplace_back( - deserialize_primitive_array_bis( - *record_batch, - encapsulated_message.body(), - name, - metadata, - buffer_index - ) - ); - break; - case org::apache::arrow::flatbuf::Precision::DOUBLE: - arrays.emplace_back( - deserialize_primitive_array_bis( - *record_batch, - encapsulated_message.body(), - name, - metadata, - buffer_index - ) - ); - break; - default: - throw std::runtime_error("Unsupported floating point precision."); - } - break; - } - case org::apache::arrow::flatbuf::Type::FixedSizeBinary: - { - const auto fixed_size_binary_field = field->type_as_FixedSizeBinary(); - arrays.emplace_back(deserialize_fixedwidthbinary(*record_batch, - encapsulated_message.body(), - name, - metadata, - buffer_index, - fixed_size_binary_field->byteWidth())); - break; - } - case org::apache::arrow::flatbuf::Type::Binary: - arrays.emplace_back(deserialize_variable_size_binary( - *record_batch, - encapsulated_message.body(), - name, - metadata, - buffer_index - )); - break; - case org::apache::arrow::flatbuf::Type::LargeBinary: - arrays.emplace_back(deserialize_variable_size_binary( - *record_batch, - encapsulated_message.body(), - name, - metadata, - buffer_index - )); - break; - case org::apache::arrow::flatbuf::Type::Utf8: - arrays.emplace_back(deserialize_variable_size_binary( - *record_batch, - encapsulated_message.body(), - name, - metadata, - buffer_index - 
)); - break; - case org::apache::arrow::flatbuf::Type::LargeUtf8: - arrays.emplace_back(deserialize_variable_size_binary( - *record_batch, - encapsulated_message.body(), - name, - metadata, - buffer_index - )); - break; - default: - throw std::runtime_error("Unsupported type."); - } - } - + std::vector arrays = get_arrays_from_record_batch( + *record_batch, + *schema, + encapsulated_message + ); std::vector field_names_str(field_names.cbegin(), field_names.cend()); record_batches.emplace_back(std::move(field_names_str), std::move(arrays), "test"); } @@ -358,5 +288,4 @@ namespace sparrow_ipc } while (true); return record_batches; } - } \ No newline at end of file From 1c1cd1835dfd1e36786da301bd0adb5cd236ba33 Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Fri, 5 Sep 2025 10:40:01 +0200 Subject: [PATCH 08/22] compilation fix --- .../arrow_interface/arrow_array/private_data.hpp | 4 +++- .../arrow_interface/arrow_schema/private_data.hpp | 10 ++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/include/sparrow_ipc/arrow_interface/arrow_array/private_data.hpp b/include/sparrow_ipc/arrow_interface/arrow_array/private_data.hpp index 7c866aa..90e633f 100644 --- a/include/sparrow_ipc/arrow_interface/arrow_array/private_data.hpp +++ b/include/sparrow_ipc/arrow_interface/arrow_array/private_data.hpp @@ -3,6 +3,8 @@ #include #include +#include "sparrow_ipc/config/config.hpp" + namespace sparrow_ipc { class non_owning_arrow_array_private_data @@ -14,7 +16,7 @@ namespace sparrow_ipc { } - [[nodiscard]] const void** buffers_ptrs() noexcept; + [[nodiscard]] SPARROW_IPC_API const void** buffers_ptrs() noexcept; private: diff --git a/include/sparrow_ipc/arrow_interface/arrow_schema/private_data.hpp b/include/sparrow_ipc/arrow_interface/arrow_schema/private_data.hpp index 29de88c..f7bc910 100644 --- a/include/sparrow_ipc/arrow_interface/arrow_schema/private_data.hpp +++ b/include/sparrow_ipc/arrow_interface/arrow_schema/private_data.hpp @@ -5,21 +5,23 @@ 
#include #include +#include "sparrow_ipc/config/config.hpp" + namespace sparrow_ipc { class non_owning_arrow_schema_private_data { public: - non_owning_arrow_schema_private_data( + SPARROW_IPC_API non_owning_arrow_schema_private_data( std::string_view format, const char* name, std::optional metadata ); - [[nodiscard]] const char* format_ptr() const noexcept; - [[nodiscard]] const char* name_ptr() const noexcept; - [[nodiscard]] const char* metadata_ptr() const noexcept; + [[nodiscard]] SPARROW_IPC_API const char* format_ptr() const noexcept; + [[nodiscard]] SPARROW_IPC_API const char* name_ptr() const noexcept; + [[nodiscard]] SPARROW_IPC_API const char* metadata_ptr() const noexcept; private: From 2db57957060eb58e1d459bcb8b52a875a63ba44e Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Fri, 5 Sep 2025 11:25:58 +0200 Subject: [PATCH 09/22] wip --- CMakeLists.txt | 5 ++- .../deserialize_fixedsizebinary_array.hpp | 35 +-------------- ...deserialize_variable_size_binary_array.hpp | 2 +- src/deserialize.cpp | 10 ++--- src/deserialize_fixedsizebinary_array.cpp | 44 +++++++++++++++++++ 5 files changed, 55 insertions(+), 41 deletions(-) create mode 100644 src/deserialize_fixedsizebinary_array.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 0648f34..b9e7d3d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -111,13 +111,14 @@ set(SPARROW_IPC_SRC ${SPARROW_IPC_SOURCE_DIR}/arrow_interface/arrow_array/private_data.cpp ${SPARROW_IPC_SOURCE_DIR}/arrow_interface/arrow_schema.cpp ${SPARROW_IPC_SOURCE_DIR}/arrow_interface/arrow_schema/private_data.cpp + ${SPARROW_IPC_SOURCE_DIR}/deserialize_fixedsizebinary_array.cpp + ${SPARROW_IPC_SOURCE_DIR}/deserialize_utils.cpp ${SPARROW_IPC_SOURCE_DIR}/deserialize.cpp ${SPARROW_IPC_SOURCE_DIR}/encapsulated_message.cpp + ${SPARROW_IPC_SOURCE_DIR}/metadata.cpp ${SPARROW_IPC_SOURCE_DIR}/serialize_null_array.cpp ${SPARROW_IPC_SOURCE_DIR}/serialize.cpp ${SPARROW_IPC_SOURCE_DIR}/utils.cpp - ${SPARROW_IPC_SOURCE_DIR}/metadata.cpp - 
${SPARROW_IPC_SOURCE_DIR}/deserialize_utils.cpp ) # Fetch schemas from apache arrow diff --git a/include/sparrow_ipc/deserialize_fixedsizebinary_array.hpp b/include/sparrow_ipc/deserialize_fixedsizebinary_array.hpp index 00ad290..29f113c 100644 --- a/include/sparrow_ipc/deserialize_fixedsizebinary_array.hpp +++ b/include/sparrow_ipc/deserialize_fixedsizebinary_array.hpp @@ -10,43 +10,12 @@ namespace sparrow_ipc { - [[nodiscard]] sparrow::fixed_width_binary_array deserialize_fixedwidthbinary( + [[nodiscard]] sparrow::fixed_width_binary_array deserialize_non_owning_fixedwidthbinary( const org::apache::arrow::flatbuf::RecordBatch& record_batch, std::span body, std::string_view name, const std::optional>& metadata, size_t& buffer_index, int32_t byte_width - ) - { - const std::string format = "w:" + std::to_string(byte_width); - ArrowSchema schema = make_non_owning_arrow_schema( - format, - name.data(), - metadata, - std::nullopt, - 0, - nullptr, - nullptr - ); - const auto [bitmap_ptr, null_count] = utils::get_bitmap_pointer_and_null_count( - record_batch, - body, - buffer_index++ - ); - const auto buffer_metadata = record_batch.buffers()->Get(buffer_index++); - auto buffer_ptr = const_cast(body.data() + buffer_metadata->offset()); - std::vector buffers = {bitmap_ptr, buffer_ptr}; - ArrowArray array = make_non_owning_arrow_array( - record_batch.length(), - null_count, - 0, - std::move(buffers), - 0, - nullptr, - nullptr - ); - sparrow::arrow_proxy ap{std::move(array), std::move(schema)}; - return sparrow::fixed_width_binary_array{std::move(ap)}; - } + ); } \ No newline at end of file diff --git a/include/sparrow_ipc/deserialize_variable_size_binary_array.hpp b/include/sparrow_ipc/deserialize_variable_size_binary_array.hpp index cafdebe..52e7bb4 100644 --- a/include/sparrow_ipc/deserialize_variable_size_binary_array.hpp +++ b/include/sparrow_ipc/deserialize_variable_size_binary_array.hpp @@ -13,7 +13,7 @@ namespace sparrow_ipc { template - [[nodiscard]] T 
deserialize_variable_size_binary( + [[nodiscard]] T deserialize_non_owning_variable_size_binary( const org::apache::arrow::flatbuf::RecordBatch& record_batch, std::span body, std::string_view name, diff --git a/src/deserialize.cpp b/src/deserialize.cpp index f8e312f..d418c4e 100644 --- a/src/deserialize.cpp +++ b/src/deserialize.cpp @@ -164,7 +164,7 @@ namespace sparrow_ipc case org::apache::arrow::flatbuf::Type::FixedSizeBinary: { const auto fixed_size_binary_field = field->type_as_FixedSizeBinary(); - arrays.emplace_back(deserialize_fixedwidthbinary( + arrays.emplace_back(deserialize_non_owning_fixedwidthbinary( record_batch, encapsulated_message.body(), name, @@ -176,7 +176,7 @@ namespace sparrow_ipc } case org::apache::arrow::flatbuf::Type::Binary: arrays.emplace_back( - deserialize_variable_size_binary( + deserialize_non_owning_variable_size_binary( record_batch, encapsulated_message.body(), name, @@ -187,7 +187,7 @@ namespace sparrow_ipc break; case org::apache::arrow::flatbuf::Type::LargeBinary: arrays.emplace_back( - deserialize_variable_size_binary( + deserialize_non_owning_variable_size_binary( record_batch, encapsulated_message.body(), name, @@ -198,7 +198,7 @@ namespace sparrow_ipc break; case org::apache::arrow::flatbuf::Type::Utf8: arrays.emplace_back( - deserialize_variable_size_binary( + deserialize_non_owning_variable_size_binary( record_batch, encapsulated_message.body(), name, @@ -209,7 +209,7 @@ namespace sparrow_ipc break; case org::apache::arrow::flatbuf::Type::LargeUtf8: arrays.emplace_back( - deserialize_variable_size_binary( + deserialize_non_owning_variable_size_binary( record_batch, encapsulated_message.body(), name, diff --git a/src/deserialize_fixedsizebinary_array.cpp b/src/deserialize_fixedsizebinary_array.cpp new file mode 100644 index 0000000..995dc61 --- /dev/null +++ b/src/deserialize_fixedsizebinary_array.cpp @@ -0,0 +1,44 @@ +#include "sparrow_ipc/deserialize_fixedsizebinary_array.hpp" + +namespace sparrow_ipc +{ + 
sparrow::fixed_width_binary_array deserialize_non_owning_fixedwidthbinary( + const org::apache::arrow::flatbuf::RecordBatch& record_batch, + std::span body, + std::string_view name, + const std::optional>& metadata, + size_t& buffer_index, + int32_t byte_width + ) + { + const std::string format = "w:" + std::to_string(byte_width); + ArrowSchema schema = make_non_owning_arrow_schema( + format, + name.data(), + metadata, + std::nullopt, + 0, + nullptr, + nullptr + ); + const auto [bitmap_ptr, null_count] = utils::get_bitmap_pointer_and_null_count( + record_batch, + body, + buffer_index++ + ); + const auto buffer_metadata = record_batch.buffers()->Get(buffer_index++); + auto buffer_ptr = const_cast(body.data() + buffer_metadata->offset()); + std::vector buffers = {bitmap_ptr, buffer_ptr}; + ArrowArray array = make_non_owning_arrow_array( + record_batch.length(), + null_count, + 0, + std::move(buffers), + 0, + nullptr, + nullptr + ); + sparrow::arrow_proxy ap{std::move(array), std::move(schema)}; + return sparrow::fixed_width_binary_array{std::move(ap)}; + } +} From e618b0b8951a95c42d6df1128666797addeeeeb0 Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Fri, 5 Sep 2025 11:51:10 +0200 Subject: [PATCH 10/22] Use std::span --- include/sparrow_ipc/deserialize.hpp | 7 +++--- include/sparrow_ipc/encapsulated_message.hpp | 6 ++--- .../sparrow_ipc/serialize_primitive_array.hpp | 8 +++--- src/deserialize.cpp | 18 ++++++------- src/encapsulated_message.cpp | 25 +++++++++---------- src/serialize_null_array.cpp | 7 ++++-- tests/test_primitive_array_with_files.cpp | 13 +++++++--- 7 files changed, 47 insertions(+), 37 deletions(-) diff --git a/include/sparrow_ipc/deserialize.hpp b/include/sparrow_ipc/deserialize.hpp index 83a0c39..b54084f 100644 --- a/include/sparrow_ipc/deserialize.hpp +++ b/include/sparrow_ipc/deserialize.hpp @@ -13,13 +13,14 @@ namespace sparrow_ipc { SPARROW_IPC_API void deserialize_schema_message( - const uint8_t* buf_ptr, + std::span data, size_t& 
current_offset, std::optional& name, std::optional>& metadata ); [[nodiscard]] SPARROW_IPC_API const org::apache::arrow::flatbuf::RecordBatch* - deserialize_record_batch_message(const uint8_t* buf_ptr, size_t& current_offset); + deserialize_record_batch_message(std::span data, size_t& current_offset); - [[nodiscard]] SPARROW_IPC_API std::vector deserialize_stream(const uint8_t* buf_ptr); + [[nodiscard]] SPARROW_IPC_API std::vector + deserialize_stream(std::span data); } \ No newline at end of file diff --git a/include/sparrow_ipc/encapsulated_message.hpp b/include/sparrow_ipc/encapsulated_message.hpp index ac9aab2..e738a85 100644 --- a/include/sparrow_ipc/encapsulated_message.hpp +++ b/include/sparrow_ipc/encapsulated_message.hpp @@ -9,7 +9,7 @@ namespace sparrow_ipc { public: - EncapsulatedMessage(const uint8_t* buf_ptr); + EncapsulatedMessage(std::span data); [[nodiscard]] const org::apache::arrow::flatbuf::Message* flat_buffer_message() const; @@ -36,8 +36,8 @@ namespace sparrow_ipc private: - const uint8_t* m_buf_ptr; + std::span m_data; }; - [[nodiscard]] EncapsulatedMessage create_encapsulated_message(const uint8_t* buf_ptr); + [[nodiscard]] EncapsulatedMessage create_encapsulated_message(std::span buf_ptr); } \ No newline at end of file diff --git a/include/sparrow_ipc/serialize_primitive_array.hpp b/include/sparrow_ipc/serialize_primitive_array.hpp index b06c64b..167b316 100644 --- a/include/sparrow_ipc/serialize_primitive_array.hpp +++ b/include/sparrow_ipc/serialize_primitive_array.hpp @@ -8,7 +8,6 @@ #include "serialize.hpp" #include "utils.hpp" - namespace sparrow_ipc { // TODO Use `arr` as const after fixing the issue upstream in sparrow::get_arrow_structures @@ -62,11 +61,14 @@ namespace sparrow_ipc // I - Deserialize the Schema message std::optional name; std::optional> metadata; - deserialize_schema_message(buf_ptr, current_offset, name, metadata); + deserialize_schema_message(std::span(buffer), current_offset, name, metadata); // II - Deserialize 
the RecordBatch message const uint32_t batch_meta_len = *(reinterpret_cast(buf_ptr + current_offset)); - const auto* record_batch = deserialize_record_batch_message(buf_ptr, current_offset); + const auto* record_batch = deserialize_record_batch_message( + std::span(buffer), + current_offset + ); current_offset += utils::align_to_8(batch_meta_len); const uint8_t* body_ptr = buf_ptr + current_offset; diff --git a/src/deserialize.cpp b/src/deserialize.cpp index d418c4e..0ca138a 100644 --- a/src/deserialize.cpp +++ b/src/deserialize.cpp @@ -11,15 +11,15 @@ namespace sparrow_ipc { void deserialize_schema_message( - const uint8_t* buf_ptr, + std::span data, size_t& current_offset, std::optional& name, std::optional>& metadata ) { - const uint32_t schema_meta_len = *(reinterpret_cast(buf_ptr + current_offset)); + const uint32_t schema_meta_len = *(reinterpret_cast(data.data() + current_offset)); current_offset += sizeof(uint32_t); - const auto schema_message = org::apache::arrow::flatbuf::GetMessage(buf_ptr + current_offset); + const auto schema_message = org::apache::arrow::flatbuf::GetMessage(data.data() + current_offset); if (schema_message->header_type() != org::apache::arrow::flatbuf::MessageHeader::Schema) { throw std::runtime_error("Expected Schema message at the start of the buffer."); @@ -56,10 +56,10 @@ namespace sparrow_ipc } const org::apache::arrow::flatbuf::RecordBatch* - deserialize_record_batch_message(const uint8_t* buf_ptr, size_t& current_offset) + deserialize_record_batch_message(std::span data, size_t& current_offset) { current_offset += sizeof(uint32_t); - const auto batch_message = org::apache::arrow::flatbuf::GetMessage(buf_ptr + current_offset); + const auto batch_message = org::apache::arrow::flatbuf::GetMessage(data.data() + current_offset); if (batch_message->header_type() != org::apache::arrow::flatbuf::MessageHeader::RecordBatch) { throw std::runtime_error("Expected RecordBatch message, but got a different type."); @@ -225,7 +225,7 @@ 
namespace sparrow_ipc return arrays; } - std::vector deserialize_stream(const uint8_t* buf_ptr) + std::vector deserialize_stream(std::span data) { const org::apache::arrow::flatbuf::Schema* schema = nullptr; std::vector record_batches; @@ -234,7 +234,7 @@ namespace sparrow_ipc std::vector field_types; do { - const EncapsulatedMessage encapsulated_message = create_encapsulated_message(buf_ptr); + const EncapsulatedMessage encapsulated_message = create_encapsulated_message(data); const org::apache::arrow::flatbuf::Message* message = encapsulated_message.flat_buffer_message(); switch (message->header_type()) { @@ -280,8 +280,8 @@ namespace sparrow_ipc throw std::runtime_error("Unknown message header type."); } const size_t encapsulated_message_total_length = encapsulated_message.total_length(); - buf_ptr += encapsulated_message_total_length; - if (is_end_of_stream(std::span{buf_ptr, 8})) + data = data.subspan(encapsulated_message_total_length); + if (is_end_of_stream(data.subspan(0, 8))) { break; } diff --git a/src/encapsulated_message.cpp b/src/encapsulated_message.cpp index 4e1bbbc..75a13b1 100644 --- a/src/encapsulated_message.cpp +++ b/src/encapsulated_message.cpp @@ -7,21 +7,21 @@ namespace sparrow_ipc { - EncapsulatedMessage::EncapsulatedMessage(const uint8_t* buf_ptr) - : m_buf_ptr(buf_ptr) + EncapsulatedMessage::EncapsulatedMessage(std::span data) + : m_data(data) { } const org::apache::arrow::flatbuf::Message* EncapsulatedMessage::flat_buffer_message() const { - const uint8_t* message_ptr = m_buf_ptr + (sizeof(uint32_t) * 2); // 4 bytes continuation + 4 bytes - // metadata size + const uint8_t* message_ptr = m_data.data() + (sizeof(uint32_t) * 2); // 4 bytes continuation + 4 + // bytes metadata size return org::apache::arrow::flatbuf::GetMessage(message_ptr); } size_t EncapsulatedMessage::metadata_length() const { - return *(reinterpret_cast(m_buf_ptr + sizeof(uint32_t))); + return *(reinterpret_cast(m_data.data() + sizeof(uint32_t))); } [[nodiscard]] 
std::variant< @@ -76,8 +76,7 @@ namespace sparrow_ipc const size_t offset = sizeof(uint32_t) * 2 // 4 bytes continuation + 4 bytes metadata size + metadata_length(); const size_t padded_offset = utils::align_to_8(offset); // Round up to 8-byte boundary - const uint8_t* body_ptr = m_buf_ptr + padded_offset; - return {body_ptr, body_length()}; + return m_data.subspan(padded_offset, body_length()); } size_t EncapsulatedMessage::total_length() const @@ -90,20 +89,20 @@ namespace sparrow_ipc std::span EncapsulatedMessage::as_span() const { - return {m_buf_ptr, total_length()}; + return m_data; } - EncapsulatedMessage create_encapsulated_message(const uint8_t* buf_ptr) + EncapsulatedMessage create_encapsulated_message(std::span data) { - if (!buf_ptr) + if (!data.size() || data.size() < 8) { - throw std::invalid_argument("Buffer pointer cannot be null."); + throw std::invalid_argument("Buffer is too small to contain a valid message."); } - const std::span continuation_span(buf_ptr, 4); + const std::span continuation_span = data.subspan(0, 4); if (!is_continuation(continuation_span)) { throw std::runtime_error("Buffer starts with continuation bytes, expected a valid message."); } - return {buf_ptr}; + return {data}; } } \ No newline at end of file diff --git a/src/serialize_null_array.cpp b/src/serialize_null_array.cpp index 96b68a3..1b5378e 100644 --- a/src/serialize_null_array.cpp +++ b/src/serialize_null_array.cpp @@ -32,10 +32,13 @@ namespace sparrow_ipc // I - Deserialize the Schema message std::optional name; std::optional> metadata; - deserialize_schema_message(buf_ptr, current_offset, name, metadata); + deserialize_schema_message(std::span(buffer), current_offset, name, metadata); // II - Deserialize the RecordBatch message - const auto* record_batch = deserialize_record_batch_message(buf_ptr, current_offset); + const auto* record_batch = deserialize_record_batch_message( + std::span(buffer), + current_offset + ); // The body is empty, so we don't need to read any 
further. // Construct the null_array from the deserialized metadata. diff --git a/tests/test_primitive_array_with_files.cpp b/tests/test_primitive_array_with_files.cpp index d0825ed..38fd1ec 100644 --- a/tests/test_primitive_array_with_files.cpp +++ b/tests/test_primitive_array_with_files.cpp @@ -84,21 +84,26 @@ TEST_SUITE("Integration tests") stream_file.close(); // Process the stream file - const auto record_batches_from_stream = sparrow_ipc::deserialize_stream(stream_data.data()); + const auto record_batches_from_stream = sparrow_ipc::deserialize_stream( + std::span(stream_data) + ); // Compare record batches REQUIRE_EQ(record_batches_from_stream.size(), record_batches_from_json.size()); for (size_t i = 0; i < record_batches_from_stream.size(); ++i) { - for(size_t y = 0; y < record_batches_from_stream[i].nb_columns(); y++) + for (size_t y = 0; y < record_batches_from_stream[i].nb_columns(); y++) { const auto& column_stream = record_batches_from_stream[i].get_column(y); const auto& column_json = record_batches_from_json[i].get_column(y); REQUIRE_EQ(column_stream.size(), column_json.size()); - for(size_t z = 0 ; z < column_json.size(); z++) + for (size_t z = 0; z < column_json.size(); z++) { const auto col_name = column_stream.name().value_or("NA"); - INFO("Comparing batch " << i << ", column " << y << " named :"<< col_name <<" , row " << z); + INFO( + "Comparing batch " << i << ", column " << y << " named :" << col_name + << " , row " << z + ); const auto& column_stream_value = column_stream[z]; const auto& column_json_value = column_json[z]; CHECK_EQ(column_stream_value, column_json_value); From 4b8cf6c81d4f3e12a33bac1aa87fc3ae6bd35417 Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Fri, 5 Sep 2025 14:59:16 +0200 Subject: [PATCH 11/22] wip --- cmake/external_dependencies.cmake | 44 ++++++++++++++++++++ include/sparrow_ipc/encapsulated_message.hpp | 3 +- src/deserialize.cpp | 5 +-- src/encapsulated_message.cpp | 7 +++- tests/CMakeLists.txt | 6 ++- 
tests/test_primitive_array_with_files.cpp | 5 ++- 6 files changed, 61 insertions(+), 9 deletions(-) diff --git a/cmake/external_dependencies.cmake b/cmake/external_dependencies.cmake index 6139f43..822ff6e 100644 --- a/cmake/external_dependencies.cmake +++ b/cmake/external_dependencies.cmake @@ -85,4 +85,48 @@ if(SPARROW_IPC_BUILD_TESTS) GIT_REPOSITORY https://github.com/doctest/doctest.git TAG v2.4.12 ) + + message(STATUS "📦 Fetching arrow-testing") + cmake_policy(PUSH) + cmake_policy(SET CMP0174 NEW) # Suppress warning about FetchContent_Declare GIT_REPOSITORY + # Fetch arrow-testing data (no CMake build needed) + FetchContent_Declare( + arrow-testing + GIT_REPOSITORY https://github.com/apache/arrow-testing.git + GIT_SHALLOW TRUE + # CONFIGURE_COMMAND "" + # BUILD_COMMAND "" + # INSTALL_COMMAND "" + ) + FetchContent_MakeAvailable(arrow-testing) + cmake_policy(POP) + + # Create interface library for easy access to test data + add_library(arrow-testing-data INTERFACE) + message(STATUS "Arrow testing data directory: ${arrow-testing_SOURCE_DIR}") + target_compile_definitions(arrow-testing-data INTERFACE + ARROW_TESTING_DATA_DIR="${arrow-testing_SOURCE_DIR}" + ) + message(STATUS "\t✅ Fetched arrow-testing") + + # Iterate over all the files in the arrow-testing-data source directiory. When it's a gz, extract in place. 
+ file(GLOB_RECURSE arrow_testing_data_targz_files CONFIGURE_DEPENDS + "${arrow-testing_SOURCE_DIR}/data/arrow-ipc-stream/integration/1.0.0-littleendian/*.json.gz" + ) + foreach(file_path IN LISTS arrow_testing_data_targz_files) + cmake_path(GET file_path PARENT_PATH parent_dir) + cmake_path(GET file_path STEM filename) + set(destination_file_path "${parent_dir}/${filename}.json") + if(EXISTS "${destination_file_path}") + message(VERBOSE "File already extracted: ${destination_file_path}") + else() + message(STATUS "Extracting ${file_path}") + if(WIN32) + execute_process(COMMAND powershell -Command "$i=\"${file_path}\"; $o=\"${destination_file_path}\"; [IO.Compression.GZipStream]::new([IO.File]::OpenRead($i),[IO.Compression.CompressionMode]::Decompress).CopyTo([IO.File]::Create($o))") + else() + execute_process(COMMAND gunzip -kf "${file_path}") + endif() + endif() + endforeach() + endif() diff --git a/include/sparrow_ipc/encapsulated_message.hpp b/include/sparrow_ipc/encapsulated_message.hpp index e738a85..e4ba113 100644 --- a/include/sparrow_ipc/encapsulated_message.hpp +++ b/include/sparrow_ipc/encapsulated_message.hpp @@ -39,5 +39,6 @@ namespace sparrow_ipc std::span m_data; }; - [[nodiscard]] EncapsulatedMessage create_encapsulated_message(std::span buf_ptr); + [[nodiscard]] std::pair> + extract_encapsulated_message(std::span buf_ptr); } \ No newline at end of file diff --git a/src/deserialize.cpp b/src/deserialize.cpp index 0ca138a..8457c84 100644 --- a/src/deserialize.cpp +++ b/src/deserialize.cpp @@ -234,7 +234,7 @@ namespace sparrow_ipc std::vector field_types; do { - const EncapsulatedMessage encapsulated_message = create_encapsulated_message(data); + const auto [encapsulated_message, rest] = extract_encapsulated_message(data); const org::apache::arrow::flatbuf::Message* message = encapsulated_message.flat_buffer_message(); switch (message->header_type()) { @@ -279,8 +279,7 @@ namespace sparrow_ipc default: throw std::runtime_error("Unknown message header 
type."); } - const size_t encapsulated_message_total_length = encapsulated_message.total_length(); - data = data.subspan(encapsulated_message_total_length); + data = rest; if (is_end_of_stream(data.subspan(0, 8))) { break; diff --git a/src/encapsulated_message.cpp b/src/encapsulated_message.cpp index 75a13b1..b0c5c38 100644 --- a/src/encapsulated_message.cpp +++ b/src/encapsulated_message.cpp @@ -92,7 +92,8 @@ namespace sparrow_ipc return m_data; } - EncapsulatedMessage create_encapsulated_message(std::span data) + std::pair> + extract_encapsulated_message(std::span data) { if (!data.size() || data.size() < 8) { @@ -103,6 +104,8 @@ namespace sparrow_ipc { throw std::runtime_error("Buffer starts with continuation bytes, expected a valid message."); } - return {data}; + EncapsulatedMessage message(data); + std::span rest = data.subspan(message.total_length()); + return {std::move(message), std::move(rest)}; } } \ No newline at end of file diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 2e5921f..dc77463 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -2,8 +2,7 @@ cmake_minimum_required(VERSION 3.28) set(test_target "test_sparrow_ipc_lib") -set( - SPARROW_IPC_TESTS_SRC +set(SPARROW_IPC_TESTS_SRC include/sparrow_ipc_tests_helpers.hpp # TODO move all the files below under src? 
main.cpp @@ -21,8 +20,11 @@ target_link_libraries(${test_target} sparrow-ipc doctest::doctest sparrow::json_reader + arrow-testing-data ) +set(APACHE_ARROW_TESTING_DIR $) + target_compile_definitions(${test_target} PRIVATE TESTS_RESOURCES_FILES_PATH="${CMAKE_CURRENT_SOURCE_DIR}/resources/") diff --git a/tests/test_primitive_array_with_files.cpp b/tests/test_primitive_array_with_files.cpp index 38fd1ec..7a0ef99 100644 --- a/tests/test_primitive_array_with_files.cpp +++ b/tests/test_primitive_array_with_files.cpp @@ -15,7 +15,10 @@ #include "sparrow_ipc/deserialize.hpp" -const std::filesystem::path tests_resources_files_path = TESTS_RESOURCES_FILES_PATH; +const std::filesystem::path arrow_testing_data_dir = ARROW_TESTING_DATA_DIR; + +const std::filesystem::path tests_resources_files_path = arrow_testing_data_dir / "data" / "arrow-ipc-stream" + / "integration" / "1.0.0-littleendian"; const std::vector files_paths_to_test = { tests_resources_files_path / "generated_primitive", From db5511328883fe6be7ba9af48e838617a94a0c23 Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Fri, 5 Sep 2025 15:01:00 +0200 Subject: [PATCH 12/22] wip --- tests/CMakeLists.txt | 6 - .../resources/generated_primitive.arrow_file | Bin 22298 -> 0 bytes tests/resources/generated_primitive.json | 3170 ----------------- tests/resources/generated_primitive.stream | Bin 20280 -> 0 bytes ...nerated_primitive_large_offsets.arrow_file | Bin 3578 -> 0 bytes .../generated_primitive_large_offsets.json | 582 --- .../generated_primitive_large_offsets.stream | Bin 3160 -> 0 bytes .../generated_primitive_no_batches.arrow_file | Bin 3914 -> 0 bytes .../generated_primitive_no_batches.json | 287 -- .../generated_primitive_no_batches.stream | Bin 1944 -> 0 bytes .../generated_primitive_zerolength.arrow_file | Bin 8858 -> 0 bytes .../generated_primitive_zerolength.json | 879 ----- .../generated_primitive_zerolength.stream | Bin 6816 -> 0 bytes 13 files changed, 4924 deletions(-) delete mode 100644 
tests/resources/generated_primitive.arrow_file delete mode 100644 tests/resources/generated_primitive.json delete mode 100644 tests/resources/generated_primitive.stream delete mode 100644 tests/resources/generated_primitive_large_offsets.arrow_file delete mode 100644 tests/resources/generated_primitive_large_offsets.json delete mode 100644 tests/resources/generated_primitive_large_offsets.stream delete mode 100644 tests/resources/generated_primitive_no_batches.arrow_file delete mode 100644 tests/resources/generated_primitive_no_batches.json delete mode 100644 tests/resources/generated_primitive_no_batches.stream delete mode 100644 tests/resources/generated_primitive_zerolength.arrow_file delete mode 100644 tests/resources/generated_primitive_zerolength.json delete mode 100644 tests/resources/generated_primitive_zerolength.stream diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index dc77463..7fb84c3 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -23,12 +23,6 @@ target_link_libraries(${test_target} arrow-testing-data ) -set(APACHE_ARROW_TESTING_DIR $) - -target_compile_definitions(${test_target} - PRIVATE - TESTS_RESOURCES_FILES_PATH="${CMAKE_CURRENT_SOURCE_DIR}/resources/") - if(WIN32) add_custom_command( TARGET ${test_target} POST_BUILD diff --git a/tests/resources/generated_primitive.arrow_file b/tests/resources/generated_primitive.arrow_file deleted file mode 100644 index 5854eb145badfd88847c2939fae33e914c9a5ba8..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 22298 zcmeIa1ymJn+cvyGx}_xqNkO_n5Tp?hq$HK@+>+8MA&Q8Egp^8$v?y#!rBe`TK|pCF z1>~FA?D6*Ye*B;J`~Po!?^^%+oNJwP&2gP|)m$^|Iho5^S{m2+VX$L>BC%mG5*SE$ zFd`Th3=@U}oB@P}@w1))fEcLylXFl320IHdHDKR|1%vqkED12+0*1hZ!NLHR2N(z7 zcNj2OB*1C_lLP$W6bzOLuqD7;0I#FNU_}7C0xSgZ9vTeR3h-Tk^#NuCcpPw@0I(Xs zfD2gV{_!zT5K{w(U)%KN;P}`ZsAB-w+Rod`@|N9gD|0(%Gq}%9egVFl&MwZ*9*&M? 
z=8jeV~gYHj0AP0b4^83GT$5p@w86XG8=pXG6|64OqKK|K`c$5#?Klp%} zfjDUcCOM$@>sUAf_2ArkxLb?;Jt7QNhO|NZhfO9>Px#XYiU5PL0OJZ&WBeYY56~wC z1gIhYb$DUqNE@_&*kAzl#J|`;Q6f)boceuE^?-E>gQ3H~@w9eyF>@CY`dfGWPU8H- z?jFhx9D|>BP%xlw?+@|bAnnlpiYF-Ww|L8_c#MD#a6G{EctTjfRe}Xx0sx^QuODPw z(EgXWuE4wk3h2N0Gvvqdu?!#p$J4{k`S)(ZERl9-|FC-$42HehT(J1Ew#T~j{#*ZndCY`{5y`o%Bux8a0IBJKX!uXvOl6brP2`c?Ud zSm4~@0I^O3435R`3qKDPixH6jbM7#Za|hZJ{bdh@gq;V*45&s2_k~}tGuxATG#Q{C zjQcO94#tSoLHmbo?k~1rOqA`o zNKWMcXsag54xCF=KYyP}7!FeBpZ&c1i!CbVuhR(&`$NoDlpQ!HVBCK(r7%BKOcp@? z>zv{MvxEb~1{g_bUbg~A( zQ!x~V3J8O11SHLq;-|en3NxX~AXEtz5QgM=C;=3|429)U>&)v`G99Y zkRW?86o&kR&x^kjgdw@|$qn_Vy(S7Pql}VKm=cA{P#E$FVMu=uB^N;{@Bz=TApL9~ z3PbuqC~SYC23tpA6#R)HeKDL9Nis^I4}~HBA5d5hweA>^58RM9;CUB?A-OI7i43xL zMd3YEJ8Z!RqhAQZkQ@|&&!NzIM1}u}AyY^WO$UU3m*b);{v?N%!=L2P@cl{t|26)< z^Z)PUsNwy8JwE@=|G(=Ww0!?GK7W$8)|^ZX2J1;V2|p>1%}*9uP$2EI52g;`ur8p! zAs;4iB8CM8hWpxX?qAYT+HWf+-BMOX)jx@Z6pjNWXFb2NBJVr?b0@VasMV`(hth(< zf{p`oSTVZXDN=D%=WCDC@2YCf7R=v_TaU3z1kz~Mymb?MIyHwdh$sUc0A(^qYC z#81v0z2Bq))}6s4+%hfl=9^i+`p9@l)##E4Mm$*W3AY~t%grwC^67GX=n@VBZk>Vq zMP1XkRZI^nICU8$S+{nG9VMhZu4ddZ_Z+la;(Wz_dP5ZF?uRJRd+YZxR+fe&V61YF zQx{3UvLjfZAboCv$G-^l`4{@@%%G0^3W!Q@q920yY_5PS1z_3nhUht1LYlVOh6%r7 zsQ#*?#spZOXrPx5mbk2&NV_zDMMp>nRf7Ej3E7TE5+4-7`q(A3uV7_=QL`k7DGE|( zkoAj$Mqp`Pw|fCBwL*5h!IF5PQ4cH$X}&FjcFEZHl|Zbq_?{XpH?AklgY|Qfcv2v~ zCvS~h0##&6;N#b^?*?kVj$r`0I|s~%i@Qd z_9_FfF8f;;p=RUoF{FP|4$KBvO3$3Ozj--LsXFVb+9-Cpm_V2oGbm-_8kGghfSbe; zaC-r0$-}T`KJbI}5CPo`3>z3^^sIjCZR+_Pt1i7*jC{r6qQE6rzRUThaufxZUHl@L zMU;Dbw4#h1x%D4MX!pLL(E9W`j8DKI5V;-%>VNGo^7oa0xqgPr=k>w3QVO!pQ2bxZ z+df&yTW8_QUB6%J>lK2KBXw{-&=Rb7oX~`Bc}O(^>qnoM&b z*8v;W2rr0vW3MCcH1{04kn=tDg!>#=U*+q7w0D$vfZVDml01=e?|?bv`iQ&li5y4Q zxF=Si{@3Tz3Sgs0T1*~Hm6ImKst^1=NZ^6aC6t-UqR zwo6$owuJR5vI2z?Jy3o*4*Z9}|k8H?+Js<>N0CX#dLnp;0^n@a}i)kSp^& z0H}x#M_e^0(HHl>gRGbS6ndgZ8IO>{8_97M4wadGn%c^^bYJESh8_|3r!bkB+vFum zYEcMee6za{&@Nw?M~luOsz}L*ivvv4S7TEr!Yng7h&6@!jn2>lj48GI)Oj zp8>$dga0oW+!w${5|F_KYO5h+^rZAW=NU4#Q`0N+9WPru1E{G00YKn04and<4P?+S 
z=#v0o&=yn!Hz{y~1D|2Qy$56-fWZq`1YnYPE|_YuE75N{qOw*`$GXRJ&IX?666ag24jH? zZYbade+^*pLN*839$@&VkRp3wr{*dP_`;KHyOzr4GD|CG5x9ULP-hN@3wjBdJK9_E zySNCqR9dx!07`HXGY7y*(7d?{K-QMcIhGc1N4TRIzX%-eWi4baCU$E^98c z^|EaVfm^^`e(qu?V+FOD4QK;6E}{Tm1{iFk4!|Y=gBPkJz_$PfHv{n29tUtTz!ok* zYynr`;N;a30yM|W0&eMGZ^>`z*iz|gXATG~g}p?aon5_JDs7xC?VEFKn{$L(DqAY8 zh0L0(Y#abHpvD^BTxKI|33kW|919C`Crf8netwIVN^=oGdvi-6LBJ1O#M)7$rP31c z>H-cCygA2CP}r)u%oPYG;srOeZwc|T^0KyZP5hU;C(vLREGyk&BjEAcy#h0a)g^4L zb7is>Iy79>TYbJYj&}DlveFu>+qa#Ns((Mc8H20G+cAImaw)G=`5o2n?N8tc{1*u8 zo{MGGxU_CzO3APRCYh(+ZgNgZfaPU*?|dr!z~q8dwng@PdwbT>Pbs=hk1D=rQyfOb zDXD%qkf;h$ob*sxHo5ZbXdUkr_Ded^>Xuv1raV|!5h3GcZ4c)%K1p0Z#%(E^S~ynX zqjxgc>X65APjCwaTcr*Yol>8`5n1;lX+FZB{Ua}3_QZrl=XXVExW!=mYWRE%dUSF%@FPVyEfYk@mT zHK`6~=>=lBd`tfm_uHkeTv}tE!qalc;ya7}Zyfcbzr)WEy@=)16$=nPo9^RVe2-zO z_iMBI;VAa<`l*g{yLe|;`edAJND`H6nlG4Uem7QsecVi#5oB{+sf(M^&mxobAZj&T z;HE*{_BHjQwr$KPHaYA#iNY(6(c-+@Nv}(aZp9FDugZ(s)z=$MD>er{Vyk%-vBuh4 zsP&w$;&cpG+b2(|`_vW@uM`K|tguB_m+6CWPpK+8#f3_RPv}VZl=$Dt=ah#>d$=gb zE!R%Q5K*&Ak}vC1*|mt1+Z?Pt6_0t8BD7(b80|cr*(sqL&3VrxposLbnJv-izSzzB z&#A{@gYdS@3+7j1Tx{3~iI9(3Fsj3oeM2B+`#5adDD_MpsV#?KfEo!JMw;0^lRVw1 z5>>U9McsR^!g;G|EJglrANB6SoS42!dx_TlAbur4kvB%Vvqkx0PS3?_XI7Hj1jz#S zBQXcxWDj-DQFutp=UDe*#rrfDf0*0QNAF^ArEq$&*^UQ4%WT&Sy!)E1T!NPzonyYb z(u7oZtEg^9dOF{SHk94Vwvb|NrphZc;*MdbBpAqaNFH0y&Sz~F`nmZ`wlyvTscl0QFs!8R?sN^G` zSO4$y@IO)hCyjvcfTIqd#Fr!i)vT?hpf}AdTvwWsCGreZs>?7;zARLJOy3~YN9U{{ zuqJF}7k?Ey{bZ!pm_Rq0ngaVtdZ;cF%$FpaZUFnr`|7uP_EKe%#NCN)^&Ql?TmGXN zj*&ad=Y$Wagz)(b(X%eXo}vfPDRz8icwhRFW%rIz%jiv`e6z#YQ?m0fehBeHbU_t#FujU5DS;;lwsR!`7`-X?y_rHC#n;+^TmY`6h zmnc1OPU#d2HlvJEAenVJ%XGf%+r=$^(b!bxOWA7}B9qY^{Q+!DI#i>_=#9o9BGw!@ zy0s67S2^+Rs$L3?j*}^WL0r?mcxP4nn|+*x^t=Z}$t(%kbw${-c*F6PdqwKZ!%=F_ zD7nb@<-Uu&l#JPICKI30xvC};Z{XNGXwOgmHlZWM&x*>Ww&!cW#}>!BnUF+beFB$P zDh>9Lr*GO@ zPk$Lhi(S}xomj5?`0iL^UD{q=Lz7rv^ZCoPDF7jH_Yl?sQNZhsDKE!KDB5n$cL zIajz9xI1s{vyvyeZr(jJ%x_sqm9y)+79;ZIOg#{Zc36@9MofT$o_#3O6;*XB+(r}BERi&PGApOzR=@{CS 
zK*uI(b$#kwp`cuHPwmLjEj|N|@yD>t!|FNv57SX^l}&IUlfWApQ8(nNM6&Og)45H11Xgr|O)uR9&9f^1{0L zic4&XNO^vK9jzxW{mgp_;XIP-Yu68BDk-lTxypAQm!r9%y!x?C~50jN#@{U$i7-tA(|Q4vqK*Nmb;SU2U9Be{5}Gq*mBs!rW7y=Xp4z#O179XlWgaXG$;lUodnnmLzyAtCE?Iwk04a?83Kw=`7nw62OADBnDks?HV4kmD+DnmRT=K0K zcZ|Ey#OOFt6o0nBhNjGt)hL+e~p081!nyakA1-& z{LN^U!e6I<4B}%JW6${++pL1z+P-E& zlNmg+1ay8YTlZXQWgVY><_liJt%n>7x5l>~4W}7B$*5u>A334*3giMCn2_=a+iun_C`VSpt_noY{p}IFCj}`gSN_*>kZArYa&%4QT zoUR|(KJt1*Yq00`ib{IdwD=bAM^{~#a&CP*jew^OX0{AgbUM8%#BfGO z=Q#ERwz{Ha%cHO3(pn0$@MU^|!)5i{T%lcGE z4fZ^Xwx0A+5dOr52DwtrlPA ztPh@^Cgxn(j^X&6;kG}!gMaR!+AUG-#7TIXi+u_Gbf4s<(>9+7&xDqXG=w~-VESS+ z)3WuROOBf&)rt8w<*cIjP8@G@>$e>nc^X(l&7AM4D1w|>i|hFXmrFT6Qr#`V6242t z%((AA-PFSJYWFTIh%mqSw6anoj4v%*#+|D;J;nNsT(x9Dx`E*x{Fk~K?z~NrFt}r0ov`F$ob+y! zBaZH|%$QR=#angB_Ver>@j?p6U)DN`g{n54(lT^twcQj{uf(E`04pw@t%&FBa`Q=73MB;IF&tRPQwVme;ZMC<2 zS?Qy6?y60^)C}HTGKWlHoIQisId$z4KEw5gbPeVL(RYAmn7waCPjd}I8r*A{Ke{`H= ze5r<^=A2k~d`X4S+qTd_-i7R8cZ1VS*_Vw*=PVJNmc+i;1?kM>mKjQ0H$qZF#uXer z`IR>JWu>0sCwFjfcjuN(tRA^5>b>w}ccZ&1n%OonKe~!{{M3&-YuVlB2dI zl*VcbUd8(I&La{9ZzTI)vE>e-MLg)ziP}`!los8rqA&9wcv1SWeuf~8e(_+A74ydx zJI9ex>V`DJfs%!DE_jT^9KH!>2p+eb9+xb_^Cq)W(sB+CI6^xdju&;+;!!+Usk=J$ zzVu<#x|oZIm6rdt8=4cR7rY!xF1kN`Jg2~-M;^BqCr^@Ts`5z2GKDV`^J*Hi3)|GO z51Vns1a5fB0QVWgGn;|4te;JVEIn^IBJe1{cOn0k{0syG^jF2d&d(r+#6T(Y%Q^Tv zC`i!zSqPJyRR26e7?M+?BoJmqVJME+Ps@`N2t#sBlpM0xN8$6RG6+>d1%x4a9!db& zm!UA^7y08cWDJ(5pIsrjCJICLV9qB)yze*r4=4=v?`L8FW)J4af|Q9o z0{J8usFU9?B&Wta5kO%WQ5f>?io%c=2t#tjZ}#s{81iom{xJUwK{yjthl6_I4aFxx zVI7p*7KOP{I2nae5C&mLAIh&lz6b`CWN=P2A^*xK4C!a0upG+aA__zPAq?rufG-&U zDmW=%>nIHQ-$P+P)H?G9Ur78y5QgM?;L8etAbVtT7l2JseoVm^8NU#OAvq`l-))ei zbns9mgi-LP9O~YmHOpP z{5$!d{PXpqdaiIX;>2m-n=%-TW9h^q2nI_+E(B5-C(MAPPuEooDGv$^{eF~L$9vvP zEo?*YDew2SPg9`4g&Z|fK;A{C;N)!>Owj-v)p-mq7_74!X%3S`=7rSyfcY6u7`D5c zMb)QV(JpM(NY-n_#NoehoW(m9o8_Q%Jmni39@*wH+7HbTEa=1o3|428)zab_bq~Zz z?701Pxw+TTC*o0W@=vs)c`h}9XAShzc4;Q z3C2U6iCuOf_>?`jMkz(fqx9@CKemp;^?i8Ubn&JFLSyd1GNb&0jnmU%g^h?{Gv50l 
zEW$HRqWv~Db69g{UK_gnyyB3~q4(TK02~Ouig|^fUHcV6dF`;Q8@!TPWLij_C4f@vGgszOGDWr8?Q6 z+Bgzr3fI|jCNTvK@TNUU4rc$_aDnCpf|l2^ws>m3!pNg5ulR6OM~jKX#?oF<&h`n3 z-Zul~5XGjd1e#@98LZX%UCyHVd;t#!8+?5W%TjtJ1eTz`t0M|9Zg@MFP^hT#S?$Nz zyn}HZu@CayMPARssqLE341E^$pRYQfakHVt;$!g;%yAUMXBjC{F(+?9zk7N>Rg`8) z717Qbupj^T^#}U@mCl(N7=W0tBLbPH^9|8s4PvSJ^T_$@zqJRx!v0Bdgleik!7nE= z-hP6=z@Miz^81V+*xOs+1Ir8Bi_T#ACxXUuyx|9O3nkVJN5-pgp|AmMdaYKF8D7Kr zR><|~%Izcv)@Kq(=z*ocHuF`m)aa8#UeA-FYlY(#PU#{{$pATmRXy8L3zF0EsWOI zHZikSR!U}hQ2dNGLN0O15qW<9-bCvR4LR~0l#N(qH~aV~%WHkWmqYJl1oVhneTPcr zjBm!sKU@U-W`xPh8y(AWzp!s@c!JZq984y^tWYO!FWDtaQX3{mZ>T22&&Sjz3~z3o znu?OYP>LsS&Y0EuJp&;(UHAVy-(dV-3H`zy%F%+rle~@Y2$@wsY2h0zzA|8X#*69) zh+lQVb7X$Wi)aLprt$oZP%y$5NZ**z5B;e(+Ow-Y>wXte`OdT9$^9UiA4A?>*ftn{ z%0V9>T_FG>-+C+qWOc+iR>2W%jAih}d&x5GfEmq*pNt!rFhUfLK<2d!JwSlzFGYe6 z5y<=z`gjB~AB4~y0c5)vTB1FWLB_jw7cr#~o8*k{kDvw8VX*Ln5MVltp;C$rkWC^j z93F)LvQVV(lALAKM1c$#zXSv_e@4;!B#&k$S>`>^Tzvskpzd;5^ z`&R~Y=AqmSuw(%k+=#&EN{E3|7}sTmPS@lZ(hal2-f;tu{+H(-Y~Tws9G2NEN8fXi zxa?-n*V)NzZQ<{4nQ#d-00@i&w(2y%X8>jZm=RzO6odYG00u8aF@WU&Rsa~h0Ki2I zvNDQQfD$AJyvvoL2dV^P2fK(H>DW8=mRvpFbVu1m2SmH7QUUnHCRcJbXBXvg?i8lE zaeJFfd^>aHJeA?P#ehi8r~c9^=V6C>!xlCsFy{m$x(L+${sbbA;IkHZ(b4=1J_po+ zH;CUE+;qf$VX!YC!JS$vg@lD&EbS~^ft(ZJ<}wkuT}w!FnVq1yu#>$2KaimT7iz8& zw6hixvN1O^cNT{8+c#H<*nrt7LKe+A<|mGTBh)Tx;G7ZQ^8kaJ6u5E70(=GFYXE~A zBxq$1FdSeo;VlB-`v9i_oC9zHz#v(=3OQN;X)ydimJD1#$i>PP-dx2m03_3xc{#Z{ znOh3j0eLpIR)PY;HjY9qA(lc8{I)huK%xnlsv;;LC~WU)EdmGh`GtjT1kGGUY=z8i zT0*SNfOHgBbHSDnC;sL#Yq*_DbCrviuss~^40J39NNf=Ta&XElg+%OutQI>#o0du- zp+^W9p5`*3!o~)O02i^hwQ;ruE z3Vd(f@2u*gIoh6hRKvp8%-F6D-`aNGeB4%nN7nmgR=I_bBgJn$vi97MfW41xJ_I$x zd8xb-1uxI|ye)Aj&QiY;e6O+jDuLQZQl2Q-q=f;@(`%>0m{YoGg0`imm@*psbx|wl zdOy*zl5k8QdDR~Fa}CXMK7-l5fOU6lCi4d4vZQJX^aqxWE6Zn!dm3yIsTSXv?v1AT z?SzP>PRV6TTUAxEG4Y$n95C z<|HxSn%Zn~tF5H%?Q7ykU0YsTQ608Mti7x;>dOyP>x^DSbMH~;I=L*rHi$(3k?58{ zPASV?5{rky9c&oYC0}gXS4=gTUabR8oL~UBbHiY?M0@!}pErYUx&+|l*p;Yg|MR!ZxEHy$ zS@QUnFV`EG7qBtNUo*9O(o_RCD)f0yJB_hOENAmM>g+<4ckzRb7Fm(gq1uvjmh+kC 
zDg?qOi&?(&xHyfP_o;OwdhtjGlcWHk3oS)W|dOpzMb`WD6Ni_E9`FQOg8UuHy0(;F8jKOisK zA6}=^@r$r0(e=B-;iHaEL6JIg>y?-c{}frdafGwoy8y%t=%KL4y@=or)pg8G0&gI@Wb*;r?f+1)TqgJDNZ!y>&QFKPT zS;tPbAY|rY;KOcovp* z$>uSczH=0Lw+8+G7*p7u!NHQ5;c2Jl)1w}-90$rhTgHXmui~o5>@@rrzYZ`e$`#5a z9WeE(a9?8My~{I8p0Ysw#~|BSzWvFXu5dZG#m=@s;og zrX$oR67+A2XE3%uALiFyNKKNGL6^rPW&m4i&t2%mJJ#<$lr9y2rRCQ$@*HJYXCPmfIPHQ01COPx^v3sz0QSjCNOP{LI8_fk6 ziYrMIrj^!5G3hv&+)hi7BCPgei7?$%1^W9C(u*`pbf4Oj0{eYTzHxpy;>^KHY_~F! ziOn*g&wP7BCw04)JL5bZ-mQbF3QAOS+yi@8H=K~l8YqwZqjqxyFZB!c5AGD{=AySDe^-Ei4-K8a}mOAuE9o4#7-uWdo{`*)KS#WJ!!-FDu+J-q&1LomN zy7e#Jw^a+OHfG^C-&XeJ^fvuWklxnl22KBKaeaTI5g4*GgJ`|b*-oFzrKN^eUU ziFLyq$~n6+5p_C;Y16%ra8J(Fv-L&xV!}bG_Pm2g?aS*`;S0+p;?Wcs>7fD@`U;$ z2fAxGw?)xiqH!iIgOW{Bq4hbw{Y{QS60)YW5;aUhn_X*|v3zxM?u{g?0OFu@qrs07 z&ms@2+>4F2!)2?+Q)1SQrkgr#yT5&P+oqn^eNjg8%;kZRq~^Osqj}O(EoVI#g(p0^ zJ!6uFj-@rNBw6GHr}&lA9?q*vlOItwPgvB(k)kUIf4{mug`+t~(BA#Tc!1nd+2BpE z;?R$hvTvlxr#`$Q$}3f=k;AQTZ2s~214;hH(c#uwMehAF z%DfGEsR=Rp4<$SvuX{fBsF!cl zX#GpL-2*`~R(0E;51&7{mzcn(2iHZyZ@2y zVRf>|PYz%8r7QE8j%wK&nlNwJco`z+;#s@VtkN0cFzLC>DtoR_hqyP5SKltcJyGl6 zB~6k#_mzl3R$7fvNx`nXU58GIOY&?(*oeK+kZ~ts2Y9j0)rw@JA-3!@J7GIPwY&xr zYzo7Q7FMMpt~j*Y;=0$0G92OEcdYK6Me|y~6sH{LUi!poo3Z5Iji@YL@IHS0Fjo84 zum(=r`7z&~rMVPcK3aKN*pa%Zi8_VKXx&ItSL``11QBN+!ll?m`jy;g$@b?0BFd+p z^|TzOQk1h9Nvj?jzAvPQUsjADXohPzB%3$9UQwJVEOgtW9N;4^#^-(Du~bzW$`X)m ziF<9;dHvy8|1Dgf59u-Qv*sr4LuH%BXEda;In~~!C#UvrIxsqU~e%#y%8o*CsXVp3JghApQP;60R99q$yGCAVGC-TLm;qhq+0mKZlbmQ%x5&zZSK~hA}3tCbk26{&VZ+*y3*fpUqm| ze<5Vv>EVTCd+3x?J~OY0R_?&L`J%GqbZukIq_AnkDT=t;UoVB{NbCT?fYr9Y)$IHf$T@@A{7R!0f_HIKx%P`WWpOBuMDrJ zH}dEd@PoP51TD7KT%Sqz6V`4CsF2CogdAsbyqC#6x<=%NHrmalknv8#4NI5_Te~aA z)U%H3Yw1O`UV4cEPTSeN%m)HNWh)QeV{gjUvL5O4lF<)}vVUF}b}0mYIIgy}_-vLJ zuKg8l+GB=Iv%cxJtIds%3-L{AnCx#&9)7w&ezlz5$oH$Q8&fpwD6)DGC&t%kisTi( z#?Td|@^1`77E0K>&snZaTA$^ib!qQ=fptrTi7SRf(rqeogv%zIMvn|zFZD}zj{0`3 z*a}a5??6cxWF>SS(z{T=s-|}d8eia zcd^~TVsG3lMseN}d?G&1jHJh}uyvz67Q1L? 
z->5D%G2$^^W__qj#JqOM6=1kVF#7DRk!!j_Tf)E)!zY~JP{hnBSN2F-?i+V|Wb&%m z!!QU0zV;_jR}PB1D{f-7G33w^uHDl zIr!?jyJ&CQB=)o1&tEsC&ib&PZ+xw&`q1`1eO=bFiifg*Blg0uknvTI=T&v2_{XJv2weKi>2<~($`>)x&q4D-tN599> zXvV*G>qEj9i9pMTmPd1oG8Y(lWq3Fk!*jXe51P#6H>ah6y#+r=CbyE4o z;L>cOfuYeSis=Z&BhN$ z7_MKkrOh@aew}!Y_?vU7unOBskkWp-0LPbgIkbr67%v&N?AQSxIF57k>#uZ$TgM}~ zLkcy83OW}P`)}Pq3yU&iT9e?u;;XBcX?7=|Xbb_Kp0&J-eE zqurIeC3YrH60;|1Du@NrrktyifcIy|;BzeadpiM;9RPmqivU0;`2ZNm{{@u+WEcR0 z?=PUd78xu600e!vM+|(a01#CFSH9Ru{mBsz=@ zKn*AuSU=l|0EmTdKdyrcFxVM@XaW0P92m?GASr+V7cc}i3>F5E0zh~Gy~l#VA^}nd zh!UWWCt$ElfGh#x2521<1}g%{6(C`N_Ap?uR)FpTqz@1iK;wYx1c1~50$ji<_mN0| zk8QkZlfFS=(XBTH@4@XBcb4M$*4~&2K zAUHhsLH{QokOM$2_5I)baRu-}0m#8Q`bR$`{?-k&kALb!zL4GC;n*zMS#Irfq4a*v3`%y3+R&r0`wIBI=wJT zlnur|Y_NcK(qC+#D3Ql8PW-;6dcZz~!7yRqd|ErYn7NAz|7|#a$8r8)cMoj`&cRPR zC>YST_lJ0IQFa)A#S;?zTfAj-JSM;gI3M7CJT5rEQ-T9N0zkn)JwK?pp#3j#U4eB4 z6fl1uXUGpEQWg+^^XXyd{QEFrmMA-nf7rc5+ktZm*+JpP{}5{xuwnWo)^9_E_5L9i zJ1{q(9Te-=p{X4G`?w4N8*mMvaS4e2Z8~96D7$}-D;{kJ#RBc1aaH~y7PxkJK&+Dh zfphWu#?M2?VglsvF=v{t&YjZ3nIi824Y66y}GH$qLAST~j<@mGEG= z0HF#7$icJi*vHQ)2K-W@bZ}9tN``*#!Ek;*>s|qIz&@C}xH$fO2ZMbBec%Amz?}j4 zz`y~YanJ@jBd~xPSO@!Y?B}n30Pd0Ez~KSL1q%_38t@3dk%>@VVJEO)pg&Xag@7sq z044(b9lx{b0msnVKpL`81>nG|7FE#m@e_kKzZM8X`e013K$r=Ij`sjq6-Q%;K^WX4 zV9`2eKkfC=m>JE3&>UhAhU9r@0TjOsjTO**56vM4VMs0u+#kRK*{h&& zaWf3UkUR)2htwj`6=-~9SZG5ugD@mdM#~}lOf-i4FaBl^VMxvo+yTJ?*^8qw@(C|ZFZxWj_wXaCR`(hovo`(riO zIvS(lPYmgc;~h(q(F(n24Eg_v#`5TW$Ar3YL*9V{;lDF0zFANsjF`tAVbENt4Mhgl&{p_QuLpZDxXm7}e2_B1KL4o1E zwwwDGb(Htt6q9eMsG-}RL_!M4K&e^JuWYFIj{n?oYYKYzYS^K*V6Y%$U=AC0r%*Ai zsMGuxt}A5Pj?&FXvjsh?XJ|`rtP?pLYrZc1dEf?tEj2adiF5jDZyX7eb4Ttsse)~% z2#B{#i@f<~)~`G=9#k{BD2f#iwtK?u2SL8s$x}XEP6$21LBOjsaKET?`nIa+VFi~i zqZHfL4ymK0w8xcI%p1#3oNL9Jd*sV1h&U6VSEJ}`-+;SKulGTLWgQ!954d8dEM@L zkY5Yg@di2RLZcqYiRr#Af_BNc_mx4cx%hz=M;3lag++Gmc@-W<)j{;yj zRDfXyhV>6Hc~-ykHuZdtQU{-pnjN4bE5VEJI*j4DB_E~j(XGF zbL>Q|_tX=fvtWCbuLH{7QSt%msG>^pM9IAa=1}`1?!G5#9$n*}Sb_RqpHC}-Ttnjo 
zN`Kii6?G=uUjE<-wtq(7R}9^mP~*BHSAyc846oCmmBguEp2I1r6~wl7Cs7=%|EVna zy0>P3;BC!t32xEff7v?K^Fc0gHcnpY*`7jh--ZImh)`=!%`^Eg5d$rK*`@L%JT9#t zSZfs6B61XF)2$VvnkwW$-$?c2_44_`E`T>4i{LcA-@L7siio;o`nmUYh-@G8*nU8> zxojpK-vsMdcLaw3UJ*T>7eWpYGjI6ImceZwtXzpi{N7K{Uh8@WqN@`qlL4~7%Lg>f z$v$!?{%mIf!q~@zA{Y!UFHHIP%LdxNa(`$Pj{v;;AsupMzXt#f(cy?I=4AR3{&!IA zGM_?^^{C?!(gedft|Fnbvrki7nU?O$p2E^2;rSFMJ9C?|L|Huwfr@W-7s2x)eT^e4 zN%j`#_}z7sUU5jgEE`VnBl*QpSs}2D1|hzi{e2&U=vM;YZ{QsO+&qN;Lg2XoUP(X# zH>jHCtXNBIqZ|)OWwXuJ9DEKQz?1D^hz__)O{*!u1;RfCtP` zF^3wB{o~P=Q~qYgkO0D%vxBS5zR0*_Jf)gA{>GC&qCKx{!*;K#|UB?Rb>nFZX^ z!QN89(y^t|)y^CcSc-UwIy<|1wN%N0_%9TkdoGq$<1)I1DJ4S&*c6_6 zyU95v0hX5(yz^-Y1Ct9<*%vwP?d@61Jf-R~J*xPgO?4O%r>yqzK(Z=GY0^V=+2r!G zqjiE;xG(9&s#|V3oATn|M1+i&y?Hp7@k#PJ62GNvY5}Rt&){UR)vkc&p5PWz5H)S5 znA+uBekb;sF=W3zGSbkoa`N5)&ftIT2dX}(hJ0L`Ii1Co_9-~ zxu=bJ3s1@;C3Y75-#Y3?e}|tUc@fK{D;^+nCf&!k_#Wd_&(~&+!x7x&^%L!9cL~m{ z^vXKfkR__rG@m!k{BErA8re*o5oB{+xs!+5&mxojAZj&T@TNiC_En9dH`~}z?DDvA zl7*KYqb2yZlU|n;-HIXQSyd3TtFJejR%#A>#9s3%VvVh*@Y-|!ijy(iZ$5d_+^4mO zc%{_uW`!%Zy37!Se?m>kDK1nxd_qU2yTt!aK9>SK+QUUrez|rshJ=<)igH<>#;!$z z(&k|8sYJ}96yXiK#AxT~%nnK2Xs&xE0Y&7G&1^|V_Qh}3e@;b)4Zz=Io;SZ7<6^^3 zOoF;*!DtST&kezpZDiQCQR=BYa$8QJ0Ch49tTeNIW(E2YWt!@17Ih!I3g@k=ag+qU zjq2TnIWd2g@e-^1LHbINDsPN@XN&s9oSuoBw*E}wd$^tylEjn0>j?`k<;YevVXD;4 zaw*w1fqg=rn{2$kR@uvpBlAZdBD79>cSSK%&zgHj=*mS!;EV)^Htb`2y)aGs)WV$f zjxcL?KS5ukYDXfsRc_Z&=k+_3zvWEuO2t1_abF6!@ z(tWxMKg?|yqIa>lQ@Fg?ZO4P3WwvPr-hItpF3CrU$vIzLX+o~MRa7@4Go9~qI+Vl9 zwvcLVrphZc;*Md)rBwn`T zjdNjcUQc=s{eC&^pq>1Ghv)TOr!Ivp4i&IBUZ8(U9c|_&#)M=a_kQ&@@$8~eBhn>h zM?WiEfs)$%dgG^13rB2hzMXdmYaIhAUF7+RJ$-^A23rHBsX3a>Vs0m7ZgMsezGAQ% z)uv+`5tjGb4Frh#pX4VDR zQ_KK*rS`9kA4*4AckdXrjNCNJSO54@n{Surl1JN2pmtD+mMgP-;@#8qJ#!z{J;x{5 zafVnDENEX7)ZCytBefRb z;;QzAJFD8??BgtC<~^uNX2~e7E5V+{8;-BsE7D*Yic)_@%}u#4|6TN@RLo{Gg~W`` z6?NHo1IOk8djZ;a3GFFH-$uRRwcpGsb9ZsU{K=vTqNT>M3Ex2EKrZ^t%9UL0fsYi6S6#MNoczfa^s;fFf z&r&jX71iCSKIt887U@ZM$Ve4ORZLViO_%hoO;LXqa(~n}{bdXzcJ*3md;BSOwnK^( 
zo@jyzu9r0HcWY$^7xTaHV>$)7zW2h)5qgew`r<{n@Ti5ydDG-;T%GFWQ=8i6X`F-9 zA4bhoM=->+_&3yWikCVMFRt>p^>W?pIIMl19O=lyM5|{J7}Un;-}o#a)5`B8Q8aeF z28)WaB&1@+_jtgh=EpW=cMd}2hq4MWYT*j7ej#C8f_dx*Q@N5l@ZP<4RQ@K z8Fxa4>u0Oae^tXwT961Y-jqr!6$v%n{v6s`tnbJx$hM1jws0$Oci!4(B~NVKylZAi zz_OAiXV-mCE$8S|;7R?27yPrVcSsf3yWfTlpIXg!Iw!36lDqhJ*OS9nW_8=VPQCmK z3pQ4gtdFugD$LvnH&!<#9wG12@ragHrJi&k|Iyju7}}IT&n{+led=tXkbH7??eNhp zegn?&$FR)9>N)$5(^2nKOz=yRs%F&uE$69sglh-Cw>uYna+9twKV0Il#mISdWKf_% zM?DsQIibK3qu!defVnMyDrS&5W0t=}`P}%q5xi~9-q+1YTMyTb|hR!nyg1TYQN`Wqy7gqdPAB)CWnC zJhJO+*AHVVsjnEhDs&*rG2Ae6mRF-y3Y6#&91AA5sJ!Ly73ebdc#i_c@bhRdSQ4|> z!dk=zhkb*jD{{=PG)|{Kwze=*FKn@L9vXNMsaF4B37__Q-H$yHo+;0Byj)Qd@>VXV zPdgOPl%DrLZ|GVqMf6xsH8Cwk)|NrCC-$3k&s>>;d`z`M0d;Cw02VXj9r-#hTH7_b zi#pN)(pdQwE*!=#vZp+j7O*c?PO#6xJYT)Em!`zJ=vy!07r zF^}`aaPs&|-wHSc&6opyc7s{5NRh0@p7S%dSp~VZz0JfXGXxX~m;zR|?zyxoIzD|Y z=e=cZjc+~5O|XkQ;$XSIpU8ks#1MBB%aM)GxNd{QmpUs`Z9dt9p%t9KPJeA&d zggER@h%KnxrQwykk`94R79kFK2G3;myQrMY*)n);eEX5-;NOezF{|^9Qbvf*#2l?76*?vfkBe z{0oF5t1iqrw?>hX_MFqf`+L#DE`%DL{=H&q@{}*C=H%+;)z3>eJn?+DRgeK6B6Ish z#7@ywKYAuFr+QgMJf5aH2d|=zBeQH~%V0&P!>d9ZZ)9YSb5C%qGg_`Z`a1q;OA%H+ zttBTl;bu6JueIR2kL{Jma#TZl6)>ZYM|(GY?69}2PlVLq&a-Ok$s7d{PHZ@S`=Ush zFF{X=HGG5+F}oxFECY@|cU4v^<-E3u;_c}zp4)x>rB0SEVr#;fMx2t*mYgoUps-0Z zhsd3Nvep}kNsN~jPiK!0;BVN~xp5?!v=1`g>A#pl?ElR6^Cb_a z6dwAE4z~B*KYlgxrr@*eFnD+3D}CGI{(_w|et6=)t$F8w+%R8(#kZ{d z(M1m|VxL;LC60e<*~hNe$TADk>L}+ivnaENy8^M*63blm!PC>ETr1l#oS!q?_Gfnp z&puSYC8nJ?2~TseFJYMOmAZJ+<`eO$&~nj+kmpp)UuP#Lq@Hu@o^<%q}?s9x_A`>;yjmsv2upl=wCbhEYvC~bD za0jFds*Kv+iE!_5Q@LBs$_z8pmc2t5*j;;yIdH33&?2kwh;)T1IX*HQwWTl9;8mliL{FbOL3FENr|&xjH2SxL#>rn@;LqZ+Fnfh%mnL9NXhP z-Kp-N{;vZAm0l}z>UB$gxsi;K1YBJ+Sm%6g=lMcg?JZwc^BKwZ_?eMj>>d<67dM|S z;>#^}(f^#gS`#1bf{H%3OdRfr$fT#;Ww*%ywQR-)p}Y9E6L$}~D6$*77^TY3*qbm- zJ~`B$eN&p2S1-n|DG|D=C~Clma}V)`W%aeEZ$r6%bevRtsivXUoOpPANrmvcH=zT3 z3)w^N1}B@cFBy%@St7VBNqun((pe}iGnBV(grtUyD>`}#C~xk|Nk1b@Zs*zV$}O8% zJ#tsld*R99Mt?=ts}s+A=kfxFfUf?>kwaSE@3%~)Mr=!{jnx&siuDzohb0T%O7*>B 
z&mF{wc+ja6wW+)*Beq$^Q0CwNqV!??3{e`x;=vpn_K(YUj>99g4Qa&vB@1U=2$+gF zeG^U*J#INUE>%R}O<|>c%{e&W2;*=lUd;6xuhPLv-Ib{ir4OUl#a%?LuK8cRp*3-G z!OOAag8S3QbBe5blyQ4;3S^n4s*hwXQ}{!%ucWcKuuma<*o`A5@WWI3c}^Lg+63Pc$K|lrkAN{|SpMhY6{;K%*{0vG+3~frxD!2*4sg)rH1^Uq%hLvmWQ z1j0;c48;-uX?a|MFeKMP%OQJxG(LysL1+#!2t)Ebv;eX%Lu1G<>YvAuG04&Xc7^0x zXbjndIiFz3MExG;gQBY_2Ihx?B@^`v{0|fJxB}T{qA@hSS~Nzh!ypXFdw;Y4h{n+P zekKM$4$y%h`8hO(;`5^`(D;HN0ooqIki7RddoVv1ESacZAfE&aUHJ_|a$4+T0ThM_ zjUoT8XbgFQFeFF(X8#_IA^*1EKjwca5Y9xm;h{fxL-EPbSO+b)MPnW`PDW!igh3e6 zhw>{>AA$iTS-fLS$iE62L;9I$ERS}$h{lkA2t)d^;710a6dW_yIvPX%_t4l6z0bVC z4-&r=2t)Ec@M8r~AbV7D7l2LCeoVm+8NU<=Lvm0Ae%qi#>kyzhgwgP)92(xA83nhA{ zfV_(ip~>4Yn34f5dhl4>Fjz+y${Z$#$_uIW0rNASFm87>i)l=`Vw~Tsk*e2>i6i{b zI7@&OpXH=>JmDJ~9{I*)qz_slSkSQt7_81FtEI&=>K=%bIPm-Ga&xa^PQ;_X-+Gy>EcaAgy!6XWhR9M8>gp3iW?EZW_#QJP(=5Xdty*70D z`NW}|L*Kbk0Qe#LD((%|@Ai-WZK-*w-P6H$Ii$ywvvm`!Pg&!2f$ckM&Zvb; z@U%wdhuBz*qxzXsJe&v0aSIPo8w&9a>pE!nopdh~Y%jQaybobW7$@+tb1ol{DZ+pd zY^OhgdNAZ#u!4w2-BIfBU=T%|O)&!{u)Bagk@0h%0Ej>#h)5d9SGa8fgAIEK#^EEk zEI}>(j6D|^EawAQMH z_OAmM=w2Xbc{OW`x8^I1GP?4z4`+3>xM*zb>B}nFJ|WTjW}qA@v8k$oW|>w7Yqfrt zvS>b^C&0r6Ki|T#lwS#h9Q1c(SP{kpZ{rpY6;nB*{TP>TAdWNkLB6}_>sdIhT@!|( z&!YbGRp(P~Hm7m;Sv`bu9K{J)hl^CrDO)h_p6pi>qgzr#w6O*3$Nzo*f&PCjXUz-@ zKup{ofy&eQhUm5ivGn{o)cW<^+5S{m1FBMsDKcQdX&uWdrJ`)J`_7?a- zetvt=8RUN|&|Ho;{6HO{q*~#qcoi;GHlR(9)e0)ZYbf6ewLe{XoaDjwOd?4=kPB|J zTmiXeuRQ8`o)lYC0x|bBerAvh<6R&F`8D46;Oh+bmR<|B16a)$P|sIX)CrXR``3P` z=W#;tD(ZPUbVBbx&MAbprutj#WkB^GNwU31Im>o=j9!ab3V5z+E%@~D;i-6yZFa-r8q&&|H z`__ghc&*F96bj3Vbqe-UopNNgVe$-y>aqg-%x^^C&8<^YQ3~fv2^7qkvRc1qAmpd( z{-4(yjQ?xFJb#CJq#*D(Z=)+hcGXWtrjBeOZ)(uP;AqhvI@>&KTAi(sOBB6%}RQ?Dn#WoCeZia0r7CU^u?v8+i1+Jo{jSP=xWY%w{?Io{QuqH+%k$4i;+*e}Bt_i=Y8e zz&K#9P6BiaAVz?g0OCXw=${uL@Ie#@NFE?XfWQX;+{7TMph*?rU~wS0R2h1pMl^P? 
zi@1@FyJK(3-R(_(lwEW{va2Q?KuBtGIah0TQ6BG3VVWC{x4GoEQ*b(px^@|p$GXZoCAn=d^4-PqiE(3HGAnDn`4~IJg1IqytTZDleoH9#cQF|b(#ZJhkr4mT!5eBBGxeREqu>m5$MeS{E zoUKKi;LbL}X2<R8&TN0TPEiH@0kCe(e0S2&CJG58mc2PPRlsC zX`)Nb{1We~pN@KgtAJR_<&dbtmC$qBCnA}0chaZiYq`&|Jw1o_+L16NELVNBY3kGY zDbvs0w^+g|=>2Q|rAI}pcwBT>N0wu85_Cr2x~|=j_*0_5_tyQ6s!qD2?TJS2VdcMr6wD5DL_^n6Qp8XN9H~PkhsAecHl~1zZD7{ENecG``(WSS;Ux6~9J#!&1k-=kcG6IP9Hx60I5zdydLFOeIPq&ycWlvKan;?{NFH!3#q%!_WK zZ3>mt_44sIA-D{;PSU;8&zJwAYB#qfUsSlEE~2;GDp`iDn@nV5P5*w%@?gc>E<1Wv zL{u-HIC6V4=gj8>YDsg!g9JK?2AA9Q7lTrlUoZICk*#e`lJc*q&nCCpO4;7NDsj}g z<+T;nZfnHW!xp2l{4ll7=v6e&9+j?>%kpc3NX#FJZV8msavUYG1Xw)5hEbgg#iqT* zG?VGoI^e5mWWC({gHS` z%InOcA;a0ye6w=H^QzmAAE{0Z%71t5kJ#~l{*DFz0{1p+9{=*CdIR$Uc1DG(rdCgy zYT!nNKF?22V=a=(+kB2Xvk>K7{9vO+PV{7`w$z;EeCF8-!SKmq*6+M7P9x^M>RpH) z0WP!{bEt<&rHMc9++`rYC5(IBLv zN*%uSN?cZ8ilW>&!dYe+A09T(pin>AS?1%^6*IL@bkh_AN0Q}=502}r<_dkSu{Y}& zh<2`?6*8t_3aVs@Qe3fNFZ-^!m{)oy_@k-x+zy7QoSe?h$!>Bb4L= z_ieKBPVT;t1o=I#<({{7t;J(PAs7)ORwH@uu-G3_bws;a$DUXCoDrcXX%bneT5k7A zm#~{{I$lQ&zo6HQhyecD!6#szgQpwTT$H(T_XD{X z>19_LYgx?{HPP{DxA>Bp8l6QWR@wD0=diSFgB8UHl?nT&BQz!w^lwXKFtt4&63|{q zO_I-k%pJ3XtmsxR^4H`Bm-;4TVUyADK;Y!iX5&IhHwbWx& zb$K4&Q7srIP1DeJ+CcJ+)XeM0?!n$g!B_e&eyYZ7G#6wnt|U*GR$d>$rsraIJ1I$y zu-c0y!FE#S|nOOQyG{<5Zn`82#0% z>ui<-+-+sO5iXGvbjdd)*Bc}JmETPHKeg4li5+%wFmuLZDA=p8Jm%?0%N*_x|5A+c zhD~#~*cXKZda9=J#<#B~*Z7c1nn`4rIS1D2tSy7e#Jx77-&HfG^?-&Xe< z*kv_+ecfpz|J{L)rZjk44r(&I`*FwPRt9JDl zyZz8-m*@)5W~-m7|FB@TrniXtAHjden7!`!*z-#{ec4{iCt5hdxV+`(pBAOyZJ$b; z|6w*ER#qq7K6FtU^QkDevZuF_JB{)V8Qp5^R2|{;mnXENoS3fRJQhWFNyeG48I)|2 z3$M@d?{9Jzl2J6Jm8fGA+w5Ayj1{Vrb8jSB1&{`%8x4#~K8rl8axXU84wtJMPl;JK znr`Z_?fUlBZJTyp_eB}mGnWTOQd;j5jpoTuw4Cu^5}EMm@{CCuM9OGcNwLZcO$n%^ zJ)GB&p**5)p0KEmBga$}`F>@63Quc}sIBXXaX+P_iox4prNJL1W#7n?DYr{-BAV>N zo;Tts;7SWOPJMhul2@u)BadI-*!<)3N3#42BSWpVN<8~z)Oi~U(i7qeA4_;WUU!f7 zsJBtPo&BV{B%2C9NDqwBSbUvyBl(GSQF`BqghRAq(QbmLKO3H17K}33yRq+0x%Diu zBFxS3jgV`pA46F$R?o@Bdo(i^7)$hPO#8k48=58Irn^gJq4h7}b`OLo*feZ|K7Rh_ 
zUSa~D9#|I*zgbFY*A|j1t!3=%YjR}s7^7qCjdl)AMT~-+37eBeescJ#FMXNEbX3dM z;DmX@#>)_S7th*_X4Q@mhe^+6Ho3EfI;1^meEN0)?upk9UeYCL@LY~4WIL@HDkapJ zx9iX$c~OCV5ErpG5;E>Y>Hsg+xl)mAG{~NPYA0+bsFu$_l3j5~$-=5M#1-%KwuJ80 zq6|lP*Bz^SXE3}LuqCL+d6qtL*=8*HcOfcE7rc>=AI56m8q&l|J2&Rry)>7i%YRzo zH0(%2%tV7qb);^%sWbL0H-dz#7vWNDBJ)cAvsBx2K~a?x&$?Rn_?jbzjg4L=kz zz%MC95H-U!9g@u(Uau%k6c)PeQTOwc78CM4@K~xU4P_0;w#2`>>b(B&jQ9lVv$D(@5*<}AVdJ7vVG9=O4vZ(8OeCtm zxrP_tX~;Lk!L?19U$IoMC3Z0Roafr=Ac8^ z+hM0KBZJ0@c>Dcacoa3)*hFY-;Kc#8GxCu`w-;tteM}a{gY_}=jmJ&Sa8roh?XGUO zG|ZC3N6w7!6fvtQXTz4$iSjjT#2m^;>-%vlwC@_N=!P*iz80i-+t^KszIp1e7_sfOx1@H;hU&v0MdxW%ZqtK6u4 zbk?Nn&1lL;*o8u*9dj&X-;LQYI7FDWX;@*X7*W#H)zS z3J0<=@M5fT&e|z12ko?Uy}U!#{O!j*;l!|mkn!@??Aed3G_~zdhsk>3!Efd2yvjSS z%f9J-+FCw?RX3lyB$Dw1XUQ%sP4Af~H_X^yqO$x!qQu5qdmE4Fkf#*^WFLEP(O+Xf zMbl;e`0DqQ>e*Vf1O3^LI7BN9Rs#^<2Z7Y$5~+l@yj~ezPjBSWD-s5CuL)Uft+_sv z=_9V)5>%y-w+TUJa(ew9bOdftZPkE)B!N~Wkts8SR>?pE&058VZXo~C=q2}OaNsr?u3qYwt{jc+TJaU$`kqe&Jfmq=Z#*`r zn7PD_$?~s^c>6_}=NEjxw1wH6reKT6k@8~FFd2{ZpSIu)rG)XbH(GahM^bAWm9*G{ZMcq$g;&P7)K!shQm=M2Lc_hA3e5Az z`f#RLHRp<1h{xMcty@u#J)to6`uc2U-28rxR&zJ8!)vh;_W-V^%xU&3*(y`tUo7=K zV>}mmuARBJu_v^`PUC*j1j12R>Gf`v=U5!uuIxLEj~A=LUqn5;{9fi|P%QQ1lUs%P zbxRl0+-5I76ctOV$_|fh>}2Bfy>Xs(imNhT?9qXcD9cVwH~wO^JvM)hLqN1@M6_oU z-|%zq`$-EArS$zbdBoqkSB&7jBl<*&%#36ptgv;XMv7msvv1Upo*4ERFS9<>C1F`R z(tyJhpLIKr@q1iyAOT~kbB>cQVR zflM$eQ-&*D@(X>p%qJ=|{VAN3?J?2C*UGfniL+@|__Ac}W;ui!y1N+f-bn6exu3gk zN}Kg@J>U3hQT3tieTKTMWmOLqK}Xz$Az|amppJva-7FfKH)4_+9`D&gf+7QBM%a6J zBBhO{jhK_B=6Y3Kbm{3guqK$bSjWoe%qWQiNuf0XkPp|&+0rlBH zGd&6xfv!Zl?#a7ZS8mX|HPybS_942{q2j-0^OnxrUjy?VXQP?G+O3ZXUnB!9A6g#G zEy|u}l|I&W*tlC#I%Zz9SN7b8KPSr`}7lKQ(N%{vzo>0DSA0|(D zq#C^#q3MyC_`Lrkwi7!e^Of@5kbC!qXYDZd#-8Z(Ydt2pqwg%UbWU&V-p(4VM_w7{ zeM&H$7iH0;-sdu7kCCpd3!=|MO18#GMmw}cC^Y)IrJIc(4l`cAXnQ)_l=OAtRnl+H zr6Q{AD?!Tp>4KbJ*5xrGl4HDN*|TH&ec*V`&9A@G7j7Yk@dp)a3KexOB=+69tMoXA zd%ldzo#BRjSoaYA(%EmTj#3*VZeq?hAFvEfYV8VqH=L~^hkl;P`e*uze3iJQ~ diff --git a/tests/resources/generated_primitive_large_offsets.arrow_file 
b/tests/resources/generated_primitive_large_offsets.arrow_file deleted file mode 100644 index 6abe1a3de54c9d8065991a62893d5d4caf69e54a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3578 zcmd^?eT)-D6u|f7xP#-4Q~`x>kcdP+MYio0RFDG{#Q4D-D4)jL?QXa2Znw*LM@2M* zA_mY9B1X_N0coNnU~IG{#3003OfVraL=;RoNFd=ZV1Qr{)I@xRklz8HfPn+y>+iNxKtBj{w#rPSy)7~MMJMYx?Y6CIwgLLr{c%M*83}!dR0rp_n}^XH+@Ci^B>qbSW4NLx`R|JzX}`-yi{4C7>vGK$hxl*xWHI_e+; zpok^yt1s$FCNq+I-=yCW2kBo5t-A*MY6GNr1xfSWj1F&{+zdcm@{-yTrUJtZSUDS4>&wI&|pY__o6%=N&B3NGVw8#pF++< zE_ zJe}Xo3t*8m#O}R(Iw@xr8G>P(rWuZdg&8&?yufTV5{=nWaA@;_<&`2#g|S{96qXVJ zzp<31CDIxw<51-ogLqkrp7lx^EW}{`N+B&@|1Wvcr38GFJV|jDPH&YbcVc{s$R^|^ zB!OhIr@Xin4X)uar+gz}-Xr^+D8GRAeR>Ql3X;jbi2eI%0YI^3+85?qIOhvq?!Mm?QlqJnzLM?}g?K{la1aoFQCF1Lodhpy~sT%R8C z{m)0IZCHBz$nLR&zjQ_%{~&#=|K*l(yXLnHX;UMcPqrSIqy74B=!-Y6eAM}T;Xvo8 z)kD#pL|KT!PRED#OKiXa6Om!$B;=z3PD7dQE9KYOsDA?aWPs10Oy}pDpB6V?YuKY! z%`5V{A0Ih)VN?6L$@Z}Q=g%(Cy5=-onXqwI*Wm~6FN|Ea>W7J$Wn)Gz&kyE}Gy3*D zm^qyNWaXYME#`VBwffAyYr{WVF>dc$!_F^hxb$Ij>yeHfs+el-%3D8oTuu-AVT(xT ziZ~qm#|B98qUSsV`zap>IUoD!UJxBG&&#|JkMVNCu$Y^Jl#zf`5^+pkWFSN#fk%bL z6^WaL6=rI#NYA{GOz4RmXcb;Ip-<7{s?MN~7rl}bQFJMX~gx6LCQ!vr}Tb# z8ubg2evqMUq1=Xi6}bkv5xE7q4Y>xC*=RkpJ)hNj5>8r~Ci;{si!pRSp0E diff --git a/tests/resources/generated_primitive_large_offsets.json b/tests/resources/generated_primitive_large_offsets.json deleted file mode 100644 index eeb6a83..0000000 --- a/tests/resources/generated_primitive_large_offsets.json +++ /dev/null @@ -1,582 +0,0 @@ -{ - "schema": { - "fields": [ - { - "name": "largebinary_nullable", - "type": { - "name": "largebinary" - }, - "nullable": true, - "children": [] - }, - { - "name": "largebinary_nonnullable", - "type": { - "name": "largebinary" - }, - "nullable": false, - "children": [] - }, - { - "name": "largeutf8_nullable", - "type": { - "name": "largeutf8" - }, - "nullable": true, - "children": [] - }, - { - "name": "largeutf8_nonnullable", - "type": { - "name": "largeutf8" - }, - 
"nullable": false, - "children": [] - } - ] - }, - "batches": [ - { - "count": 17, - "columns": [ - { - "name": "largebinary_nullable", - "count": 17, - "VALIDITY": [ - 0, - 1, - 1, - 1, - 0, - 1, - 1, - 0, - 1, - 0, - 1, - 0, - 1, - 0, - 0, - 0, - 0 - ], - "OFFSET": [ - "0", - "0", - "3", - "6", - "7", - "7", - "8", - "8", - "8", - "17", - "17", - "17", - "17", - "19", - "19", - "19", - "19", - "19" - ], - "DATA": [ - "", - "C12E1E", - "DFE731", - "55", - "", - "6D", - "", - "", - "5867B5C5A7786E1534", - "", - "", - "", - "2C09", - "", - "", - "", - "" - ] - }, - { - "name": "largebinary_nonnullable", - "count": 17, - "VALIDITY": [ - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1 - ], - "OFFSET": [ - "0", - "9", - "11", - "28", - "36", - "39", - "43", - "43", - "43", - "43", - "46", - "52", - "58", - "60", - "67", - "70", - "81", - "81" - ], - "DATA": [ - "E23907E804B3FE7A1C", - "E319", - "EE37836C76FABC7747EFD7F5DF75D35136", - "73853DDEC4E2E828", - "93CFC8", - "079AF075", - "", - "", - "", - "E8032D", - "753D5974DC08", - "BFDD468EAFDD", - "539B", - "FBA0315A8638DC", - "99BE43", - "64EFC8F7E671AC43D7EB85", - "" - ] - }, - { - "name": "largeutf8_nullable", - "count": 17, - "VALIDITY": [ - 0, - 0, - 1, - 0, - 1, - 1, - 0, - 1, - 1, - 1, - 1, - 0, - 1, - 1, - 1, - 1, - 1 - ], - "OFFSET": [ - "0", - "0", - "0", - "8", - "8", - "16", - "25", - "25", - "33", - "44", - "52", - "61", - "61", - "68", - "76", - "83", - "91", - "99" - ], - "DATA": [ - "", - "", - "1\u00b0ekpj5", - "", - "f\u00b5wneog", - "cjr\u00a3g2\u00b5", - "", - "54\u00a3gl51", - "\u77e2e2\u00f4h4\u00a3", - "wirebm\u00f4", - "w\u00b5\u00c26nnr", - "", - "6g6lerf", - "kha\u00c2fmh", - "hpif4c3", - "im\u00f41h2j", - "rpe\u00f4mp4" - ] - }, - { - "name": "largeutf8_nonnullable", - "count": 17, - "VALIDITY": [ - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1 - ], - "OFFSET": [ - "0", - "10", - "20", - "28", - "36", - "43", - "54", - "65", 
- "73", - "80", - "89", - "96", - "107", - "119", - "127", - "138", - "146", - "155" - ], - "DATA": [ - "\u00b0pm\u00b5cp\u00b5", - "ne2\u00b0h\u77e2i", - "\u00f4r1jfl1", - "rkrj\u00c2bp", - "kmelwbf", - "b5lc\u00b0\u20ac\u00a3", - "\u00f4rwe\u00b5\u00c2\u00c2", - "jh1o51\u00f4", - "wlggg2c", - "c\u00b0g2e3\u00a3", - "rla346l", - "4\u77e2l\u00a3k\u00f43", - "ra\u20ac\u00c2\u77e2k5", - "3\u00c2b6ikb", - "fjb\u20acc\u20ac1", - "6rm\u00f4k5d", - "o\u00f4b\u00b0her" - ] - } - ] - }, - { - "count": 20, - "columns": [ - { - "name": "largebinary_nullable", - "count": 20, - "VALIDITY": [ - 1, - 0, - 1, - 1, - 1, - 1, - 1, - 0, - 0, - 1, - 1, - 0, - 1, - 1, - 1, - 1, - 0, - 1, - 1, - 0 - ], - "OFFSET": [ - "0", - "0", - "0", - "5", - "9", - "12", - "19", - "35", - "35", - "35", - "40", - "43", - "43", - "53", - "58", - "58", - "60", - "60", - "60", - "64", - "64" - ], - "DATA": [ - "", - "", - "1D4542F260", - "10E81B7B", - "9B1C89", - "6A22D5F3D43F8A", - "5CD8D1AC2A18BD6E22D89C6AD70EF252", - "", - "", - "2BAB545219", - "746133", - "", - "93DE60BF4962F99610BA", - "8BF5A6DD56", - "", - "A3BF", - "", - "", - "DD27811B", - "" - ] - }, - { - "name": "largebinary_nonnullable", - "count": 20, - "VALIDITY": [ - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1 - ], - "OFFSET": [ - "0", - "1", - "3", - "4", - "5", - "5", - "13", - "48", - "50", - "55", - "62", - "62", - "63", - "65", - "70", - "71", - "72", - "74", - "74", - "78", - "92" - ], - "DATA": [ - "C9", - "EC5A", - "51", - "F8", - "", - "0DB6610153B0C20A", - "D93BEAEE9075EA386C1DBEEBE65762E7490DF52E8E46E7CC2F26A324717FD6306D7129", - "2476", - "8218C21C42", - "0909C46DCC6FAD", - "", - "7B", - "B697", - "5267896E69", - "81", - "E3", - "BCF8", - "", - "1EB2792B", - "B88C1DEB5B0DF0A15160D19BA261" - ] - }, - { - "name": "largeutf8_nullable", - "count": 20, - "VALIDITY": [ - 0, - 0, - 0, - 0, - 1, - 0, - 1, - 0, - 1, - 1, - 0, - 1, - 1, - 1, - 1, - 0, - 0, - 1, - 1, - 0 - ], - 
"OFFSET": [ - "0", - "0", - "0", - "0", - "0", - "11", - "11", - "20", - "20", - "30", - "42", - "42", - "50", - "58", - "66", - "76", - "76", - "76", - "84", - "93", - "93" - ], - "DATA": [ - "", - "", - "", - "", - "n\u00c2\u00b5\u00a3c6\u00b5", - "", - "fekrpw\u20ac", - "", - "f\u77e23ng\u00b0r", - "\u20ac4\u00b0e4f\u77e2", - "", - "pr\u00a3gr25", - "rgbp\u00c26n", - "\u00a3hfdfwr", - "\u00a35\u00b5g\u00a3g5", - "", - "", - "dcadrp\u00c2", - "\u00b0\u00f4n35d5", - "" - ] - }, - { - "name": "largeutf8_nonnullable", - "count": 20, - "VALIDITY": [ - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1 - ], - "OFFSET": [ - "0", - "10", - "20", - "28", - "41", - "49", - "60", - "68", - "77", - "88", - "97", - "107", - "116", - "124", - "131", - "142", - "151", - "159", - "171", - "181", - "190" - ], - "DATA": [ - "wrp\u00a3ba\u77e2", - "\u00b0n\u77e254cc", - "2ddkai\u00a3", - "\u00c2r\u00c2\u00a3n\u20ac\u00c2", - "rd2\u00f4gda", - "hmm\u00b0\u00a3w\u77e2", - "p4\u00b5r23r", - "gh\u20acdlib", - "\u00b5\u00c25\u00c2\u00b51o", - "1\u00c2me6\u00c2d", - "\u00b5\u00f4\u00f4ndhc", - "2io2\u00a3k\u00b5", - "o\u00b5wr542", - "4nlo5p5", - "i\u00f4w\u00b0\u20achr", - "\u00b5\u00c216kaj", - "5\u00a3oprpo", - "4\u77e25\u00b5om\u77e2", - "\u20ac3dna\u00b51", - "b\u20acprdek" - ] - } - ] - } - ] -} \ No newline at end of file diff --git a/tests/resources/generated_primitive_large_offsets.stream b/tests/resources/generated_primitive_large_offsets.stream deleted file mode 100644 index 27109fa263b045d2bbd2eaeed474604952925aac..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3160 zcma)-4~P_H6u{@Y)AP@rT&bma3Yp}eGjnElRx8a*OEW4w&&q$%&Fsw1?#}G&_;u%I z7UG(b7Kdb{b%EwW30}0+u1J{DHwFemNR|ekf`WHws2G=5LD&0s-#cYRa{ryWSB&kjkW0*8T8Y~TxhQJ++imCy94Tx2!FW(CTp_8N&aJ>arBV0m$ z4}1bfje@^_+Rg#}XwW$tv&^>E*zA|wwClDxj;1>%=-()!m8e4b6?qxrN1AZz-j)66zHX<)Es!Bgd4(>~NkgtJ= zN@fyl*S{R3r{)N-r%?WeD}qr=lF z!*U=b4ad0Zk&}>vo<+v|2UXg 
zbme64*|r~=A}?>Ad*;Zwo+;bEKU)3a)wZ6>2ioSYXgxjTaM!~2&%2sC{@g9Ed~^2c z4-PFbt{lC7VNK_Plb1H5|H5@VFenERta}|Aj1BN!l&2#p&$%cQA4mCVWHWLVlFq5j z^QP@3)fg{kTr+J&lZ@vqg%{N@&{5^?U3*N$FT_F2B$%nE{6dBo{cOZ_S;UGsCX3l| z&CkYC@wk_Wg`=San8=rLV&qC9^t+sSia zkugQ~el{7G)2a-`a4gGGqF`YuW{BsQqlLl|Ckzf9p0oWzh-t9a&w|2ML*O@(u=Q9{ z2W1p4Il`b`wrZsPf)0V0tQ6`0?UNog;G6VGsyt+|BCjF|B$GY$#noug zhsT`yjf8!V?DwMlBH9lbFf1!bCi^_jAEE$&B4x^p`WeY&Mso1H(fsSb(MrUzel@2KCBocQCfC+2Nked>7U zv@u`16Hk4VJUQap(uRG@mX2-JLfg-@9A2#d{(jw8Z})!E^?dGd*W``k(49nGh{{gK zhw>%X;)0n-1vv}(Xn^xjruUWl>mt-YiF_)+XHll}^W85ioBC?^YZc9TKJ4d5E?(Z& zc5${d{?MfhFX-KiYkOz3FYG?{;QhHtYc~8eGqq;wq_x>Gyng=Bp+{23(x0v0zhkMj z#Z7EHf3R=D7wZ}hyfgmNs@kg`H?t}h9=b{l_jF~p`GSD(&&`LtC#q$h=Ff{P6(0HP8Geu>V?uquy zbMcrF%YatpMGNLsBdQq;=6K#OxFOY`Gby$Xz9;n<&s3z0q;pE&muFDF99b$d+}kL( zB40ypLbfA!Aa^16A&W?Q?=#HfIbDN1dESM8H5`p9hGA<7=*n3JN#$Iq2HYC(wXzH? zo=WjNe6vJl!o0|o5VPW7VK@n0bZ}J^R!+-)Hf2WqtO2?Ll*Ncgl|)*B&drNy_$D$n ztb|=Bt$HeaA2OxyK%5B&a>Uk>D$k`o=A}iuhcc&AK-eT?xEgpDJ>Fx63B?foT%`0j D{nQpD diff --git a/tests/resources/generated_primitive_no_batches.arrow_file b/tests/resources/generated_primitive_no_batches.arrow_file deleted file mode 100644 index 610ae3cd470c1ee18ecb2569455aea5f8f6a91dc..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3914 zcmeHKJ#Q015S?|-XUj5)0T-4)a7BbtP~Z?nAh}2+3W^9&3P3cGE!)D9Wk(^F0&;~? 
zxQHmCloT!~C@3gWA&L}HMCp>kKLH`i@Oix1+dZ#uegEKAdiHsC-@MtG+Z*5d=H|mE zWg;JAkRd7%j~vaDOKHmD3PU)dJ&&J`vIp0^fan@f0)ONZ^??ci7j%*)+6V3dS>TI9 zGypb$BJg#NXb9ASo4}70(Fb4~SOLxj(J62MJOw;p4A)1%1^^dy{L}aGwNS!`8PnNs z-`|7nI?!zGHR>-~uNt*hr@HfUtGs+`tJCdtdhK?#)@~>t!t#;(6Zp6q=R-TdeC5~d zI3B>qWz2JqtT^sYg`s@daU3ciS7LnVFiwIP(nYK@bJ2#KYunpx-kDY;`Y1WV;tXNG z5OTBv(IWB+vlG=AVf`gcs1%#&rK02ri{rq)U~sg`AkdhL*7Py%Q=$}cKAY`sb$4}T zYCHWv$Kt(FJkCML(_*ll$LO6&o|x#}UY^qXru01c;C%4?7z`Ju#N{J^A>{dyy399p zw^3ItNJaPg`k(KwVS@A7YjvXAq`Kq@i}y+KIH#JY<;F2u-@)+=t;iN>6r*(mxnZ8x zGPidB@9cd&1BYv%_f=k3to?XmRbb(7Fh4E}}R7d!^-J-maY^ zYsdWg313Un5^@G}ik}PSIeQ-1#Tx9aJEL^u$u+{_zB9P2skou$+ZfHU;&~IzXbI^k zMze$*fhIhBC{44BR0!s+SRW}K*HY~>T1v{wHTFIa46f2N%Srn&nx~4#HDTQuC8fU7 zT*SOt(=4i#r3@g4;BS7m1wO(Q!*5Zp$;he}RNo^v`mFm59gd;a?Y5`yV2+J_xKQIe zL;DaepK<2sXM}?*zUDXrKa)7%p6epl=i!f^CVBi~kL+Pd&1qlK9Dd~Fx};AVy?&C~ rq)#V(Iy{5?CpGEQ_WzHhPvc!9re`L7TD~$ReH#6Ch diff --git a/tests/resources/generated_primitive_no_batches.json b/tests/resources/generated_primitive_no_batches.json deleted file mode 100644 index e9eac55..0000000 --- a/tests/resources/generated_primitive_no_batches.json +++ /dev/null @@ -1,287 +0,0 @@ -{ - "schema": { - "fields": [ - { - "name": "bool_nullable", - "type": { - "name": "bool" - }, - "nullable": true, - "children": [] - }, - { - "name": "bool_nonnullable", - "type": { - "name": "bool" - }, - "nullable": false, - "children": [] - }, - { - "name": "int8_nullable", - "type": { - "name": "int", - "isSigned": true, - "bitWidth": 8 - }, - "nullable": true, - "children": [] - }, - { - "name": "int8_nonnullable", - "type": { - "name": "int", - "isSigned": true, - "bitWidth": 8 - }, - "nullable": false, - "children": [] - }, - { - "name": "int16_nullable", - "type": { - "name": "int", - "isSigned": true, - "bitWidth": 16 - }, - "nullable": true, - "children": [] - }, - { - "name": "int16_nonnullable", - "type": { - "name": "int", - "isSigned": true, - "bitWidth": 16 - }, - "nullable": false, - "children": [] - }, - { - 
"name": "int32_nullable", - "type": { - "name": "int", - "isSigned": true, - "bitWidth": 32 - }, - "nullable": true, - "children": [] - }, - { - "name": "int32_nonnullable", - "type": { - "name": "int", - "isSigned": true, - "bitWidth": 32 - }, - "nullable": false, - "children": [] - }, - { - "name": "int64_nullable", - "type": { - "name": "int", - "isSigned": true, - "bitWidth": 64 - }, - "nullable": true, - "children": [] - }, - { - "name": "int64_nonnullable", - "type": { - "name": "int", - "isSigned": true, - "bitWidth": 64 - }, - "nullable": false, - "children": [] - }, - { - "name": "uint8_nullable", - "type": { - "name": "int", - "isSigned": false, - "bitWidth": 8 - }, - "nullable": true, - "children": [] - }, - { - "name": "uint8_nonnullable", - "type": { - "name": "int", - "isSigned": false, - "bitWidth": 8 - }, - "nullable": false, - "children": [] - }, - { - "name": "uint16_nullable", - "type": { - "name": "int", - "isSigned": false, - "bitWidth": 16 - }, - "nullable": true, - "children": [] - }, - { - "name": "uint16_nonnullable", - "type": { - "name": "int", - "isSigned": false, - "bitWidth": 16 - }, - "nullable": false, - "children": [] - }, - { - "name": "uint32_nullable", - "type": { - "name": "int", - "isSigned": false, - "bitWidth": 32 - }, - "nullable": true, - "children": [] - }, - { - "name": "uint32_nonnullable", - "type": { - "name": "int", - "isSigned": false, - "bitWidth": 32 - }, - "nullable": false, - "children": [] - }, - { - "name": "uint64_nullable", - "type": { - "name": "int", - "isSigned": false, - "bitWidth": 64 - }, - "nullable": true, - "children": [] - }, - { - "name": "uint64_nonnullable", - "type": { - "name": "int", - "isSigned": false, - "bitWidth": 64 - }, - "nullable": false, - "children": [] - }, - { - "name": "float32_nullable", - "type": { - "name": "floatingpoint", - "precision": "SINGLE" - }, - "nullable": true, - "children": [] - }, - { - "name": "float32_nonnullable", - "type": { - "name": "floatingpoint", - 
"precision": "SINGLE" - }, - "nullable": false, - "children": [] - }, - { - "name": "float64_nullable", - "type": { - "name": "floatingpoint", - "precision": "DOUBLE" - }, - "nullable": true, - "children": [] - }, - { - "name": "float64_nonnullable", - "type": { - "name": "floatingpoint", - "precision": "DOUBLE" - }, - "nullable": false, - "children": [] - }, - { - "name": "binary_nullable", - "type": { - "name": "binary" - }, - "nullable": true, - "children": [] - }, - { - "name": "binary_nonnullable", - "type": { - "name": "binary" - }, - "nullable": false, - "children": [] - }, - { - "name": "utf8_nullable", - "type": { - "name": "utf8" - }, - "nullable": true, - "children": [] - }, - { - "name": "utf8_nonnullable", - "type": { - "name": "utf8" - }, - "nullable": false, - "children": [] - }, - { - "name": "fixedsizebinary_19_nullable", - "type": { - "name": "fixedsizebinary", - "byteWidth": 19 - }, - "nullable": true, - "children": [] - }, - { - "name": "fixedsizebinary_19_nonnullable", - "type": { - "name": "fixedsizebinary", - "byteWidth": 19 - }, - "nullable": false, - "children": [] - }, - { - "name": "fixedsizebinary_120_nullable", - "type": { - "name": "fixedsizebinary", - "byteWidth": 120 - }, - "nullable": true, - "children": [] - }, - { - "name": "fixedsizebinary_120_nonnullable", - "type": { - "name": "fixedsizebinary", - "byteWidth": 120 - }, - "nullable": false, - "children": [] - } - ] - }, - "batches": [] -} \ No newline at end of file diff --git a/tests/resources/generated_primitive_no_batches.stream b/tests/resources/generated_primitive_no_batches.stream deleted file mode 100644 index ccbc451af9d8f920679290df6665d78acbb7e2dc..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1944 zcmZ`)zi-n(6n>XFj#WV$AVVY&c&HEq11(hrQicu;3>9KRh^0!DL`1Tj3L2?JnNWue z6+_jLp#uX014Ado(4ndrGcx2)K!~yYJid3nbI!il(w8{z?t9;R?>jqEN;#?!Rf(^8 zS|Fdw6yOe*@Y47N{Pc7@x#tZ;*MU0tW1naUGytTalQPjB@Bj#aFCNhd*Z^w4*Eymi 
zpbgvxew2tl09(K+a4v{Wfqmd95CRjVJ_I%Zq@d%UQsQf|jt{%0v)@wgAa(=j_I5h$ zm%Ue=RxfV8*xg)Sxw9Gf<9HB7%~sUWIfRpA{!fzQT2T&@0PBrkv+H<-99OW;Gji&< zpD9Dl(wL+}B-g(M9<}Xb7Mv`t}W(b$~ zy^FZ_EeOsRsFw5e*Rte89H1;tKW~ZCzxyF96r?eWSAVn=VzQb<{jbTj_i_@% diff --git a/tests/resources/generated_primitive_zerolength.arrow_file b/tests/resources/generated_primitive_zerolength.arrow_file deleted file mode 100644 index 25a26d3bc3c22959b73e874550ea2e9a3784b7cd..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8858 zcmeHNzfTlF6n=Xw$Az45>J>zdZb6KN1t1}qP(WfqLDWzvu^|v3UUIp+C?sg83-r%o#13B~i{_9CjjZjYW>f#^D<5BS48(Hdk70u!{?Nwf~R z2Pr_l_(U6!DM%UewS#B}vH-aaInEJ%fHWY(kQ0w+AF>IVfdr5vnBIm=L12P*e}*A$ zy?wZtV>c7E-pN;yj+~G)aIVOm>nD%n61@owUughZoayxZFm_Q-G3q* z*K9V#0Q$#%wI0WP*tm>-_EBaWcN4?VHnQW`(l)MIY{)S7qA_%qp=bJ{igu3e%JR}^ zl9A}6!gv|X4%#oo7-1mlhhNd`r#TLxe+eDh#io0ytT0{%-Ju%Ok^y>8wRO8Qg1)V;{sgAw%1Vh3`P&ye8kBp#X$rv~b;oZx|mr1G zk$&;tYpwwO+1U9ic8u?7;)RlK!DndB@pr*oXHTPcZyfE++ah!nC><|@eP>{qQ)6S! zw-(MLjSHHb=@8Pkg|iPn0?Z3=(VS)&sTB2R#`;j>IF@>z=}=NZ>15}*X<#*{8BSWa zaPDgy$Ao!Xgp}4aXFvMQm=+MF0(C)D@%YZ)w#Y{8V%RM!oh}vClFoaSPk-w^gNMg3 zU$0k_Z!nLIZREkmZ-&_L^1Q|wBi{%gPyEc|i0m}S0rTMiJ|4`=rASv`ku!2pS)~r2 z_}vS*GcMh#Px5mK#@YUO-0vQo2hNuVWG(4+9Z$|y7kyrbX0g>ppVzKgY<1BOOl3Tu zb)AR9zdv1RZx6V<(%$iRvp<~&pBm^ zwP>Hz8*FL) zNL#MSeKm8xm)}u$?xkXsIXiRbUf92l&WGH&SK?Fw=ew4(D0l9aJWq1xUPSKPD>|d& U--w(C&-1z0H{|aL^*<^90f(@6X#fBK diff --git a/tests/resources/generated_primitive_zerolength.json b/tests/resources/generated_primitive_zerolength.json deleted file mode 100644 index 1e16259..0000000 --- a/tests/resources/generated_primitive_zerolength.json +++ /dev/null @@ -1,879 +0,0 @@ -{ - "schema": { - "fields": [ - { - "name": "bool_nullable", - "type": { - "name": "bool" - }, - "nullable": true, - "children": [] - }, - { - "name": "bool_nonnullable", - "type": { - "name": "bool" - }, - "nullable": false, - "children": [] - }, - { - "name": "int8_nullable", - "type": { - "name": "int", - "isSigned": true, - 
"bitWidth": 8 - }, - "nullable": true, - "children": [] - }, - { - "name": "int8_nonnullable", - "type": { - "name": "int", - "isSigned": true, - "bitWidth": 8 - }, - "nullable": false, - "children": [] - }, - { - "name": "int16_nullable", - "type": { - "name": "int", - "isSigned": true, - "bitWidth": 16 - }, - "nullable": true, - "children": [] - }, - { - "name": "int16_nonnullable", - "type": { - "name": "int", - "isSigned": true, - "bitWidth": 16 - }, - "nullable": false, - "children": [] - }, - { - "name": "int32_nullable", - "type": { - "name": "int", - "isSigned": true, - "bitWidth": 32 - }, - "nullable": true, - "children": [] - }, - { - "name": "int32_nonnullable", - "type": { - "name": "int", - "isSigned": true, - "bitWidth": 32 - }, - "nullable": false, - "children": [] - }, - { - "name": "int64_nullable", - "type": { - "name": "int", - "isSigned": true, - "bitWidth": 64 - }, - "nullable": true, - "children": [] - }, - { - "name": "int64_nonnullable", - "type": { - "name": "int", - "isSigned": true, - "bitWidth": 64 - }, - "nullable": false, - "children": [] - }, - { - "name": "uint8_nullable", - "type": { - "name": "int", - "isSigned": false, - "bitWidth": 8 - }, - "nullable": true, - "children": [] - }, - { - "name": "uint8_nonnullable", - "type": { - "name": "int", - "isSigned": false, - "bitWidth": 8 - }, - "nullable": false, - "children": [] - }, - { - "name": "uint16_nullable", - "type": { - "name": "int", - "isSigned": false, - "bitWidth": 16 - }, - "nullable": true, - "children": [] - }, - { - "name": "uint16_nonnullable", - "type": { - "name": "int", - "isSigned": false, - "bitWidth": 16 - }, - "nullable": false, - "children": [] - }, - { - "name": "uint32_nullable", - "type": { - "name": "int", - "isSigned": false, - "bitWidth": 32 - }, - "nullable": true, - "children": [] - }, - { - "name": "uint32_nonnullable", - "type": { - "name": "int", - "isSigned": false, - "bitWidth": 32 - }, - "nullable": false, - "children": [] - }, - { - "name": 
"uint64_nullable", - "type": { - "name": "int", - "isSigned": false, - "bitWidth": 64 - }, - "nullable": true, - "children": [] - }, - { - "name": "uint64_nonnullable", - "type": { - "name": "int", - "isSigned": false, - "bitWidth": 64 - }, - "nullable": false, - "children": [] - }, - { - "name": "float32_nullable", - "type": { - "name": "floatingpoint", - "precision": "SINGLE" - }, - "nullable": true, - "children": [] - }, - { - "name": "float32_nonnullable", - "type": { - "name": "floatingpoint", - "precision": "SINGLE" - }, - "nullable": false, - "children": [] - }, - { - "name": "float64_nullable", - "type": { - "name": "floatingpoint", - "precision": "DOUBLE" - }, - "nullable": true, - "children": [] - }, - { - "name": "float64_nonnullable", - "type": { - "name": "floatingpoint", - "precision": "DOUBLE" - }, - "nullable": false, - "children": [] - }, - { - "name": "binary_nullable", - "type": { - "name": "binary" - }, - "nullable": true, - "children": [] - }, - { - "name": "binary_nonnullable", - "type": { - "name": "binary" - }, - "nullable": false, - "children": [] - }, - { - "name": "utf8_nullable", - "type": { - "name": "utf8" - }, - "nullable": true, - "children": [] - }, - { - "name": "utf8_nonnullable", - "type": { - "name": "utf8" - }, - "nullable": false, - "children": [] - }, - { - "name": "fixedsizebinary_19_nullable", - "type": { - "name": "fixedsizebinary", - "byteWidth": 19 - }, - "nullable": true, - "children": [] - }, - { - "name": "fixedsizebinary_19_nonnullable", - "type": { - "name": "fixedsizebinary", - "byteWidth": 19 - }, - "nullable": false, - "children": [] - }, - { - "name": "fixedsizebinary_120_nullable", - "type": { - "name": "fixedsizebinary", - "byteWidth": 120 - }, - "nullable": true, - "children": [] - }, - { - "name": "fixedsizebinary_120_nonnullable", - "type": { - "name": "fixedsizebinary", - "byteWidth": 120 - }, - "nullable": false, - "children": [] - } - ] - }, - "batches": [ - { - "count": 0, - "columns": [ - { - "name": 
"bool_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "bool_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "int8_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "int8_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "int16_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "int16_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "int32_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "int32_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "int64_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "int64_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "uint8_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "uint8_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "uint16_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "uint16_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "uint32_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "uint32_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "uint64_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "uint64_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "float32_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "float32_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "float64_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "float64_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "binary_nullable", - "count": 0, - "VALIDITY": [], - "OFFSET": [ - 0 - ], - "DATA": [] - }, - { - "name": "binary_nonnullable", - "count": 0, - 
"VALIDITY": [], - "OFFSET": [ - 0 - ], - "DATA": [] - }, - { - "name": "utf8_nullable", - "count": 0, - "VALIDITY": [], - "OFFSET": [ - 0 - ], - "DATA": [] - }, - { - "name": "utf8_nonnullable", - "count": 0, - "VALIDITY": [], - "OFFSET": [ - 0 - ], - "DATA": [] - }, - { - "name": "fixedsizebinary_19_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "fixedsizebinary_19_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "fixedsizebinary_120_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "fixedsizebinary_120_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - } - ] - }, - { - "count": 0, - "columns": [ - { - "name": "bool_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "bool_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "int8_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "int8_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "int16_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "int16_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "int32_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "int32_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "int64_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "int64_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "uint8_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "uint8_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "uint16_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "uint16_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "uint32_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "uint32_nonnullable", - "count": 0, - 
"VALIDITY": [], - "DATA": [] - }, - { - "name": "uint64_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "uint64_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "float32_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "float32_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "float64_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "float64_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "binary_nullable", - "count": 0, - "VALIDITY": [], - "OFFSET": [ - 0 - ], - "DATA": [] - }, - { - "name": "binary_nonnullable", - "count": 0, - "VALIDITY": [], - "OFFSET": [ - 0 - ], - "DATA": [] - }, - { - "name": "utf8_nullable", - "count": 0, - "VALIDITY": [], - "OFFSET": [ - 0 - ], - "DATA": [] - }, - { - "name": "utf8_nonnullable", - "count": 0, - "VALIDITY": [], - "OFFSET": [ - 0 - ], - "DATA": [] - }, - { - "name": "fixedsizebinary_19_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "fixedsizebinary_19_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "fixedsizebinary_120_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "fixedsizebinary_120_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - } - ] - }, - { - "count": 0, - "columns": [ - { - "name": "bool_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "bool_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "int8_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "int8_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "int16_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "int16_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "int32_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": 
"int32_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "int64_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "int64_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "uint8_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "uint8_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "uint16_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "uint16_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "uint32_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "uint32_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "uint64_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "uint64_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "float32_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "float32_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "float64_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "float64_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "binary_nullable", - "count": 0, - "VALIDITY": [], - "OFFSET": [ - 0 - ], - "DATA": [] - }, - { - "name": "binary_nonnullable", - "count": 0, - "VALIDITY": [], - "OFFSET": [ - 0 - ], - "DATA": [] - }, - { - "name": "utf8_nullable", - "count": 0, - "VALIDITY": [], - "OFFSET": [ - 0 - ], - "DATA": [] - }, - { - "name": "utf8_nonnullable", - "count": 0, - "VALIDITY": [], - "OFFSET": [ - 0 - ], - "DATA": [] - }, - { - "name": "fixedsizebinary_19_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "fixedsizebinary_19_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": "fixedsizebinary_120_nullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - }, - { - "name": 
"fixedsizebinary_120_nonnullable", - "count": 0, - "VALIDITY": [], - "DATA": [] - } - ] - } - ] -} \ No newline at end of file diff --git a/tests/resources/generated_primitive_zerolength.stream b/tests/resources/generated_primitive_zerolength.stream deleted file mode 100644 index 3e649c3193282055dffd80163de6433d91d760b3..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6816 zcmeHJziSjh6n=Br>@DF;PQBt#(Jg{lSR^q7g%l|)EP_~x*o1^6cyQbW%|RlmB*hiM zBBZoPVPRomkxDF5gkX84Mg9a4%lLEs-t4@+x!oH`7w~rA-R{hL-}~N=otcZGC~6mp z%EV)YM#-mP3h;)ZykxzEsb|*XcQzop4ygj)@rkw|vkV@F8Iw#xV?6U}x8&j&<&BYkl=@FC)MJBb@}d0_3(W)d?9>rJ@|W!hmT$2^A^BRdjDu%##_8= z=qn5g={#TkjiNayxSp+ABb_EKYn;m9J{la?RB%Ff;^6uQOlWbXQlyT9>n3W$IN`EW zd+?`!UQdDH9!Or3Gd<;`S&hr)wQF#~#W>09z`@163&3>|!nH_$`0or?2z|D9p6(sv zN4t2Tq+6&NmJ56?*!%2RysqZ3&b)nEN1?V+8SGmN%bW(AaK3SHo)}!%U2>ZKQIn&&2x3;JBA2&vYv(&^FmTcPy;owA)GB4$dQk#XTkIn#G5l7vO;LBXZ0 Date: Fri, 5 Sep 2025 15:56:35 +0200 Subject: [PATCH 13/22] fix --- include/sparrow_ipc/encapsulated_message.hpp | 2 ++ src/utils.cpp | 6 ++---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/sparrow_ipc/encapsulated_message.hpp b/include/sparrow_ipc/encapsulated_message.hpp index e4ba113..a88e4de 100644 --- a/include/sparrow_ipc/encapsulated_message.hpp +++ b/include/sparrow_ipc/encapsulated_message.hpp @@ -1,3 +1,5 @@ +#pragma once + #include #include diff --git a/src/utils.cpp b/src/utils.cpp index d22288a..3d7b5e7 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -6,7 +6,6 @@ #include "sparrow.hpp" - namespace sparrow_ipc { namespace @@ -377,9 +376,8 @@ namespace sparrow_ipc } case sparrow::data_type::MAP: { - const auto map_type = org::apache::arrow::flatbuf::CreateMap(builder, false); // not - // sorted - // keys + // not sorted keys + const auto map_type = org::apache::arrow::flatbuf::CreateMap(builder, false); return {org::apache::arrow::flatbuf::Type::Map, map_type.Union()}; } case sparrow::data_type::DENSE_UNION: From 
635913616d372c7816506154e2611ff2ef46ca36 Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Mon, 8 Sep 2025 15:20:16 +0200 Subject: [PATCH 14/22] wip --- .../arrow_array_schema_common_release.hpp | 2 + .../config/sparrow_ipc_version.hpp | 2 +- include/sparrow_ipc/deserialize.hpp | 41 ++++++++++++++++++- .../deserialize_primitive_array.hpp | 4 ++ include/sparrow_ipc/deserialize_utils.hpp | 39 ++++++++---------- ...deserialize_variable_size_binary_array.hpp | 9 ++++ include/sparrow_ipc/magic_values.hpp | 10 +++++ include/sparrow_ipc/metadata.hpp | 14 +++++++ src/arrow_interface/arrow_array.cpp | 6 --- src/arrow_interface/arrow_schema.cpp | 6 --- src/deserialize.cpp | 31 +++++++++++++- src/deserialize_fixedsizebinary_array.cpp | 4 ++ src/deserialize_utils.cpp | 18 ++------ src/encapsulated_message.cpp | 4 ++ src/metadata.cpp | 16 ++++---- 15 files changed, 147 insertions(+), 59 deletions(-) diff --git a/include/sparrow_ipc/arrow_interface/arrow_array_schema_common_release.hpp b/include/sparrow_ipc/arrow_interface/arrow_array_schema_common_release.hpp index 3e73c50..8ef5f68 100644 --- a/include/sparrow_ipc/arrow_interface/arrow_array_schema_common_release.hpp +++ b/include/sparrow_ipc/arrow_interface/arrow_array_schema_common_release.hpp @@ -28,6 +28,8 @@ namespace sparrow_ipc } SPARROW_ASSERT_TRUE(t.private_data != nullptr); const auto private_data = static_cast(t.private_data); + delete private_data; + t.private_data = nullptr; if (t.dictionary) { diff --git a/include/sparrow_ipc/config/sparrow_ipc_version.hpp b/include/sparrow_ipc/config/sparrow_ipc_version.hpp index 0c2b9bb..4f718d7 100644 --- a/include/sparrow_ipc/config/sparrow_ipc_version.hpp +++ b/include/sparrow_ipc/config/sparrow_ipc_version.hpp @@ -6,7 +6,7 @@ namespace sparrow_ipc constexpr int SPARROW_IPC_VERSION_MINOR = 1; constexpr int SPARROW_IPC_VERSION_PATCH = 0; - constexpr int SPARROW_IPC_BINARY_CURRENT = 9; + constexpr int SPARROW_IPC_BINARY_CURRENT = 1; constexpr int SPARROW_IPC_BINARY_REVISION = 
0; constexpr int SPARROW_IPC_BINARY_AGE = 0; } diff --git a/include/sparrow_ipc/deserialize.hpp b/include/sparrow_ipc/deserialize.hpp index b54084f..fa52902 100644 --- a/include/sparrow_ipc/deserialize.hpp +++ b/include/sparrow_ipc/deserialize.hpp @@ -5,13 +5,31 @@ #include -#include "config/config.hpp" #include "Message_generated.h" +#include "sparrow_ipc/config/config.hpp" #include "sparrow_ipc/encapsulated_message.hpp" -#include "SparseTensor_generated.h" namespace sparrow_ipc { + /** + * @brief Deserializes a schema message from Arrow IPC format data. + * + * This function parses an Arrow IPC schema message from a byte buffer, extracting + * the field name and custom metadata from the first (and expected only) field in the schema. + * + * @param data A span containing the raw byte data to deserialize from + * @param current_offset Reference to the current position in the data buffer, which will be + * updated to point past the processed schema message + * @param name Optional output parameter that will contain the field name if present + * @param metadata Optional output parameter that will contain the custom metadata + * key-value pairs if present + * + * @throws std::runtime_error If the message is not a Schema message type + * @throws std::runtime_error If the schema does not contain exactly one field + * + * @note This function expects the data to start with a 4-byte length prefix followed + * by the FlatBuffer schema message data + */ SPARROW_IPC_API void deserialize_schema_message( std::span data, size_t& current_offset, @@ -21,6 +39,25 @@ namespace sparrow_ipc [[nodiscard]] SPARROW_IPC_API const org::apache::arrow::flatbuf::RecordBatch* deserialize_record_batch_message(std::span data, size_t& current_offset); + /** + * @brief Deserializes an Arrow IPC stream from binary data into a vector of record batches. + * + * This function processes an Arrow IPC stream format, extracting schema information + * and record batch data. 
It handles encapsulated messages sequentially, first expecting + * a Schema message followed by one or more RecordBatch messages. + * + * @param data A span of bytes containing the serialized Arrow IPC stream data + * + * @return std::vector A vector containing all deserialized record batches + * + * @throws std::runtime_error If: + * - A RecordBatch message is encountered before a Schema message + * - A RecordBatch message header is missing or invalid + * - Unsupported message types are encountered (Tensor, DictionaryBatch, SparseTensor) + * - An unknown message header type is encountered + * + * @note The function processes messages until an end-of-stream marker is detected + */ [[nodiscard]] SPARROW_IPC_API std::vector deserialize_stream(std::span data); } \ No newline at end of file diff --git a/include/sparrow_ipc/deserialize_primitive_array.hpp b/include/sparrow_ipc/deserialize_primitive_array.hpp index b70f6a8..a1c5dad 100644 --- a/include/sparrow_ipc/deserialize_primitive_array.hpp +++ b/include/sparrow_ipc/deserialize_primitive_array.hpp @@ -40,6 +40,10 @@ namespace sparrow_ipc buffer_index++ ); const auto primitive_buffer_metadata = record_batch.buffers()->Get(buffer_index++); + if (body.size() < (primitive_buffer_metadata->offset() + primitive_buffer_metadata->length())) + { + throw std::runtime_error("Primitive buffer exceeds body size"); + } auto primitives_ptr = const_cast(body.data() + primitive_buffer_metadata->offset()); std::vector buffers = {bitmap_ptr, primitives_ptr}; ArrowArray array = make_non_owning_arrow_array( diff --git a/include/sparrow_ipc/deserialize_utils.hpp b/include/sparrow_ipc/deserialize_utils.hpp index 4a901ad..fc1ca05 100644 --- a/include/sparrow_ipc/deserialize_utils.hpp +++ b/include/sparrow_ipc/deserialize_utils.hpp @@ -1,36 +1,33 @@ #pragma once #include +#include #include #include -#include #include "Message_generated.h" #include "Schema_generated.h" namespace sparrow_ipc::utils { - template - [[nodiscard]] 
sparrow::u8_buffer message_buffer_to_u8buffer( - const org::apache::arrow::flatbuf::RecordBatch* record_batch, - std::span body, - size_t index - ) - { - const auto buffer_metadata = record_batch->buffers()->Get(index); - auto ptr = const_cast(body.data() + buffer_metadata->offset()); - auto casted_ptr = reinterpret_cast(ptr); - const std::size_t count = static_cast(buffer_metadata->length() / sizeof(T)); - return sparrow::u8_buffer{casted_ptr, count}; - } - - [[nodiscard]] const sparrow::dynamic_bitset_view message_buffer_to_validity_bitmap( - const org::apache::arrow::flatbuf::RecordBatch* record_batch, - std::span body, - size_t index - ); - + /** + * @brief Extracts bitmap pointer and null count from a RecordBatch buffer. + * + * This function retrieves a bitmap buffer from the specified index in the RecordBatch's + * buffer list and calculates the number of null values represented by the bitmap. + * + * @param record_batch The Arrow RecordBatch containing buffer metadata + * @param body The raw buffer data as a byte span + * @param index The index of the bitmap buffer in the RecordBatch's buffer list + * + * @return A pair containing: + * - First: Pointer to the bitmap data (nullptr if buffer is empty) + * - Second: Count of null values in the bitmap (0 if buffer is empty) + * + * @note If the bitmap buffer has zero length, returns {nullptr, 0} + * @note The returned pointer is a non-const cast of the original const data + */ [[nodiscard]] std::pair get_bitmap_pointer_and_null_count( const org::apache::arrow::flatbuf::RecordBatch& record_batch, std::span body, diff --git a/include/sparrow_ipc/deserialize_variable_size_binary_array.hpp b/include/sparrow_ipc/deserialize_variable_size_binary_array.hpp index 52e7bb4..f6a5729 100644 --- a/include/sparrow_ipc/deserialize_variable_size_binary_array.hpp +++ b/include/sparrow_ipc/deserialize_variable_size_binary_array.hpp @@ -36,9 +36,18 @@ namespace sparrow_ipc body, buffer_index++ ); + const auto offset_metadata = 
record_batch.buffers()->Get(buffer_index++); + if ((offset_metadata->offset() + offset_metadata->length()) > body.size()) + { + throw std::runtime_error("Offset buffer exceeds body size"); + } auto offset_ptr = const_cast(body.data() + offset_metadata->offset()); const auto buffer_metadata = record_batch.buffers()->Get(buffer_index++); + if ((buffer_metadata->offset() + buffer_metadata->length()) > body.size()) + { + throw std::runtime_error("Data buffer exceeds body size"); + } auto buffer_ptr = const_cast(body.data() + buffer_metadata->offset()); std::vector buffers = {bitmap_ptr, offset_ptr, buffer_ptr}; ArrowArray array = make_non_owning_arrow_array( diff --git a/include/sparrow_ipc/magic_values.hpp b/include/sparrow_ipc/magic_values.hpp index e7f8fc6..b08d505 100644 --- a/include/sparrow_ipc/magic_values.hpp +++ b/include/sparrow_ipc/magic_values.hpp @@ -6,8 +6,17 @@ namespace sparrow_ipc { + + /** + * Continuation value defined in the Arrow IPC specification: + * https://arrow.apache.org/docs/format/Columnar.html#encapsulated-message-format + */ constexpr std::array continuation = {0xFF, 0xFF, 0xFF, 0xFF}; + /** + * End-of-stream marker defined in the Arrow IPC specification: + * https://arrow.apache.org/docs/format/Columnar.html#ipc-streaming-format + */ constexpr std::array end_of_stream = {0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00}; template @@ -15,6 +24,7 @@ namespace sparrow_ipc { return std::ranges::equal(buf, continuation); } + template [[nodiscard]] bool is_end_of_stream(const R& buf) { diff --git a/include/sparrow_ipc/metadata.hpp b/include/sparrow_ipc/metadata.hpp index bab7290..83951ee 100644 --- a/include/sparrow_ipc/metadata.hpp +++ b/include/sparrow_ipc/metadata.hpp @@ -10,6 +10,20 @@ namespace sparrow_ipc { + /** + * @brief Converts FlatBuffers metadata to Sparrow metadata format. 
+ * + * This function takes a FlatBuffers vector containing key-value pairs from Apache Arrow + * format and converts them into a vector of Sparrow metadata pairs. Each key-value pair + * from the FlatBuffers structure is extracted and stored as a sparrow::metadata_pair. + * + * @param metadata A FlatBuffers vector containing KeyValue pairs from Apache Arrow format + * @return std::vector A vector of Sparrow metadata pairs containing + * the converted key-value data + * + * @note The function reserves space in the output vector to match the input size for + * optimal memory allocation performance. + */ std::vector to_sparrow_metadata( const ::flatbuffers::Vector<::flatbuffers::Offset>& metadata ); diff --git a/src/arrow_interface/arrow_array.cpp b/src/arrow_interface/arrow_array.cpp index 74aadcc..ed0a0f2 100644 --- a/src/arrow_interface/arrow_array.cpp +++ b/src/arrow_interface/arrow_array.cpp @@ -16,12 +16,6 @@ namespace sparrow_ipc SPARROW_ASSERT_TRUE(array->release == std::addressof(release_non_owning_arrow_array)) release_common_non_owning_arrow(*array); - if (array->private_data != nullptr) - { - const auto private_data = static_cast(array->private_data); - delete private_data; - array->private_data = nullptr; - } array->buffers = nullptr; // The buffers were deleted with the private data } diff --git a/src/arrow_interface/arrow_schema.cpp b/src/arrow_interface/arrow_schema.cpp index e3af5b8..6c3ed7d 100644 --- a/src/arrow_interface/arrow_schema.cpp +++ b/src/arrow_interface/arrow_schema.cpp @@ -9,12 +9,6 @@ namespace sparrow_ipc SPARROW_ASSERT_FALSE(schema == nullptr); SPARROW_ASSERT_TRUE(schema->release == std::addressof(release_non_owning_arrow_schema)); release_common_non_owning_arrow(*schema); - if (schema->private_data != nullptr) - { - const auto private_data = static_cast(schema->private_data); - delete private_data; - schema->private_data = nullptr; - } *schema = {}; } } \ No newline at end of file diff --git a/src/deserialize.cpp 
b/src/deserialize.cpp index 8457c84..f00a052 100644 --- a/src/deserialize.cpp +++ b/src/deserialize.cpp @@ -17,7 +17,15 @@ namespace sparrow_ipc std::optional>& metadata ) { + if (data.size() < (current_offset + sizeof(uint32_t))) + { + throw std::runtime_error("Data too short to contain schema length."); + } const uint32_t schema_meta_len = *(reinterpret_cast(data.data() + current_offset)); + if (schema_meta_len == 0 || (data.size() < (current_offset + sizeof(uint32_t) + schema_meta_len))) + { + throw std::runtime_error("Invalid schema length."); + } current_offset += sizeof(uint32_t); const auto schema_message = org::apache::arrow::flatbuf::GetMessage(data.data() + current_offset); if (schema_message->header_type() != org::apache::arrow::flatbuf::MessageHeader::Schema) @@ -66,7 +74,26 @@ namespace sparrow_ipc } return static_cast(batch_message->header()); } - + /** + * @brief Deserializes arrays from an Apache Arrow RecordBatch using the provided schema. + * + * This function processes each field in the schema and deserializes the corresponding + * data from the RecordBatch into sparrow::array objects. It handles various Arrow data + * types including primitive types (bool, integers, floating point), binary data, and + * string data with their respective size variants. + * + * @param record_batch The Apache Arrow FlatBuffer RecordBatch containing the serialized data + * @param schema The Apache Arrow FlatBuffer Schema defining the structure and types of the data + * @param encapsulated_message The message containing the binary data buffers + * + * @return std::vector A vector of deserialized arrays, one for each field in the schema + * + * @throws std::runtime_error If an unsupported data type, integer bit width, or floating point precision + * is encountered + * + * The function maintains a buffer index that is incremented as it processes each field + * to correctly map data buffers to their corresponding arrays. 
+ */ std::vector get_arrays_from_record_batch( const org::apache::arrow::flatbuf::RecordBatch& record_batch, const org::apache::arrow::flatbuf::Schema& schema, @@ -269,7 +296,7 @@ namespace sparrow_ipc encapsulated_message ); std::vector field_names_str(field_names.cbegin(), field_names.cend()); - record_batches.emplace_back(std::move(field_names_str), std::move(arrays), "test"); + record_batches.emplace_back(std::move(field_names_str), std::move(arrays)); } break; case org::apache::arrow::flatbuf::MessageHeader::Tensor: diff --git a/src/deserialize_fixedsizebinary_array.cpp b/src/deserialize_fixedsizebinary_array.cpp index 995dc61..63ea213 100644 --- a/src/deserialize_fixedsizebinary_array.cpp +++ b/src/deserialize_fixedsizebinary_array.cpp @@ -27,6 +27,10 @@ namespace sparrow_ipc buffer_index++ ); const auto buffer_metadata = record_batch.buffers()->Get(buffer_index++); + if ((body.size() < (buffer_metadata->offset() + buffer_metadata->length()))) + { + throw std::runtime_error("Data buffer exceeds body size"); + } auto buffer_ptr = const_cast(body.data() + buffer_metadata->offset()); std::vector buffers = {bitmap_ptr, buffer_ptr}; ArrowArray array = make_non_owning_arrow_array( diff --git a/src/deserialize_utils.cpp b/src/deserialize_utils.cpp index c9c1a03..d89be6c 100644 --- a/src/deserialize_utils.cpp +++ b/src/deserialize_utils.cpp @@ -2,19 +2,6 @@ namespace sparrow_ipc::utils { - const sparrow::dynamic_bitset_view message_buffer_to_validity_bitmap( - const org::apache::arrow::flatbuf::RecordBatch* record_batch, - std::span body, - size_t index - ) - { - const auto buffer_metadata = record_batch->buffers()->Get(index); - return sparrow::dynamic_bitset_view{ - body.data() + buffer_metadata->offset(), - static_cast(buffer_metadata->length()) - }; - } - std::pair get_bitmap_pointer_and_null_count( const org::apache::arrow::flatbuf::RecordBatch& record_batch, std::span body, @@ -26,7 +13,10 @@ namespace sparrow_ipc::utils { return {nullptr, 0}; } - + if 
(body.size() < (bitmap_metadata->offset() + bitmap_metadata->length())) + { + throw std::runtime_error("Bitmap buffer exceeds body size"); + } auto ptr = const_cast(body.data() + bitmap_metadata->offset()); const sparrow::dynamic_bitset_view bitmap_view{ ptr, diff --git a/src/encapsulated_message.cpp b/src/encapsulated_message.cpp index b0c5c38..71efab2 100644 --- a/src/encapsulated_message.cpp +++ b/src/encapsulated_message.cpp @@ -76,6 +76,10 @@ namespace sparrow_ipc const size_t offset = sizeof(uint32_t) * 2 // 4 bytes continuation + 4 bytes metadata size + metadata_length(); const size_t padded_offset = utils::align_to_8(offset); // Round up to 8-byte boundary + if (m_data.size() < padded_offset + body_length()) + { + throw std::runtime_error("Data size is smaller than expected from metadata."); + } return m_data.subspan(padded_offset, body_length()); } diff --git a/src/metadata.cpp b/src/metadata.cpp index a07f216..3638f76 100644 --- a/src/metadata.cpp +++ b/src/metadata.cpp @@ -1,6 +1,6 @@ #include "sparrow_ipc/metadata.hpp" -#include +#include namespace sparrow_ipc { @@ -10,12 +10,14 @@ namespace sparrow_ipc { std::vector sparrow_metadata; sparrow_metadata.reserve(metadata.size()); - - for (const auto& kv : metadata) - { - sparrow_metadata.emplace_back(kv->key()->str(), kv->value()->str()); - } - + std::ranges::transform( + metadata, + std::back_inserter(sparrow_metadata), + [](const auto& kv) + { + return sparrow::metadata_pair{kv->key()->str(), kv->value()->str()}; + } + ); return sparrow_metadata; } } \ No newline at end of file From b6734ada33122df04327e6780b8b9e3fa165be40 Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Mon, 8 Sep 2025 16:04:50 +0200 Subject: [PATCH 15/22] Remove serialization --- CMakeLists.txt | 5 - include/sparrow_ipc/deserialize.hpp | 28 --- include/sparrow_ipc/serialize.hpp | 25 --- include/sparrow_ipc/serialize_null_array.hpp | 11 -- .../sparrow_ipc/serialize_primitive_array.hpp | 98 ---------- src/serialize.cpp | 180 
------------------ src/serialize_null_array.cpp | 47 ----- tests/CMakeLists.txt | 5 +- ...pp => test_deserialization_with_files.cpp} | 0 tests/test_null_array_serialization.cpp | 51 ----- tests/test_primitive_array_serialization.cpp | 125 ------------ 11 files changed, 1 insertion(+), 574 deletions(-) delete mode 100644 include/sparrow_ipc/serialize.hpp delete mode 100644 include/sparrow_ipc/serialize_null_array.hpp delete mode 100644 include/sparrow_ipc/serialize_primitive_array.hpp delete mode 100644 src/serialize.cpp delete mode 100644 src/serialize_null_array.cpp rename tests/{test_primitive_array_with_files.cpp => test_deserialization_with_files.cpp} (100%) delete mode 100644 tests/test_null_array_serialization.cpp delete mode 100644 tests/test_primitive_array_serialization.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index b9e7d3d..0947f0b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -100,9 +100,6 @@ set(SPARROW_IPC_HEADERS ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/encapsulated_message.hpp ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/magic_values.hpp ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/metadata.hpp - ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/serialize_null_array.hpp - ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/serialize_primitive_array.hpp - ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/serialize.hpp ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/utils.hpp ) @@ -116,8 +113,6 @@ set(SPARROW_IPC_SRC ${SPARROW_IPC_SOURCE_DIR}/deserialize.cpp ${SPARROW_IPC_SOURCE_DIR}/encapsulated_message.cpp ${SPARROW_IPC_SOURCE_DIR}/metadata.cpp - ${SPARROW_IPC_SOURCE_DIR}/serialize_null_array.cpp - ${SPARROW_IPC_SOURCE_DIR}/serialize.cpp ${SPARROW_IPC_SOURCE_DIR}/utils.cpp ) diff --git a/include/sparrow_ipc/deserialize.hpp b/include/sparrow_ipc/deserialize.hpp index fa52902..074949f 100644 --- a/include/sparrow_ipc/deserialize.hpp +++ b/include/sparrow_ipc/deserialize.hpp @@ -11,34 +11,6 @@ namespace sparrow_ipc { - /** - * @brief Deserializes a schema message from Arrow IPC format data. 
- * - * This function parses an Arrow IPC schema message from a byte buffer, extracting - * the field name and custom metadata from the first (and expected only) field in the schema. - * - * @param data A span containing the raw byte data to deserialize from - * @param current_offset Reference to the current position in the data buffer, which will be - * updated to point past the processed schema message - * @param name Optional output parameter that will contain the field name if present - * @param metadata Optional output parameter that will contain the custom metadata - * key-value pairs if present - * - * @throws std::runtime_error If the message is not a Schema message type - * @throws std::runtime_error If the schema does not contain exactly one field - * - * @note This function expects the data to start with a 4-byte length prefix followed - * by the FlatBuffer schema message data - */ - SPARROW_IPC_API void deserialize_schema_message( - std::span data, - size_t& current_offset, - std::optional& name, - std::optional>& metadata - ); - [[nodiscard]] SPARROW_IPC_API const org::apache::arrow::flatbuf::RecordBatch* - deserialize_record_batch_message(std::span data, size_t& current_offset); - /** * @brief Deserializes an Arrow IPC stream from binary data into a vector of record batches. 
* diff --git a/include/sparrow_ipc/serialize.hpp b/include/sparrow_ipc/serialize.hpp deleted file mode 100644 index a6896f8..0000000 --- a/include/sparrow_ipc/serialize.hpp +++ /dev/null @@ -1,25 +0,0 @@ -#pragma once - - -#include -#include - -#include -#include -#include - -#include "config/config.hpp" - - -namespace sparrow_ipc -{ - namespace details - { - SPARROW_IPC_API std::vector serialize_schema_message(const ArrowSchema& arrow_schema); - SPARROW_IPC_API void serialize_record_batch_message( - const ArrowArray& arrow_arr, - const std::vector& buffers_sizes, - std::vector& final_buffer - ); - } -} diff --git a/include/sparrow_ipc/serialize_null_array.hpp b/include/sparrow_ipc/serialize_null_array.hpp deleted file mode 100644 index 269184a..0000000 --- a/include/sparrow_ipc/serialize_null_array.hpp +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#include "config/config.hpp" -#include "serialize.hpp" - -namespace sparrow_ipc -{ - // TODO Use `arr` as const after fixing the issue upstream in sparrow::get_arrow_structures - SPARROW_IPC_API std::vector serialize_null_array(sparrow::null_array& arr); - SPARROW_IPC_API sparrow::null_array deserialize_null_array(const std::vector& buffer); -} diff --git a/include/sparrow_ipc/serialize_primitive_array.hpp b/include/sparrow_ipc/serialize_primitive_array.hpp deleted file mode 100644 index 167b316..0000000 --- a/include/sparrow_ipc/serialize_primitive_array.hpp +++ /dev/null @@ -1,98 +0,0 @@ -#pragma once - -#include - -#include - -#include "deserialize.hpp" -#include "serialize.hpp" -#include "utils.hpp" - -namespace sparrow_ipc -{ - // TODO Use `arr` as const after fixing the issue upstream in sparrow::get_arrow_structures - template - std::vector serialize_primitive_array(sparrow::primitive_array& arr); - - template - sparrow::primitive_array deserialize_primitive_array(const std::vector& buffer); - - template - std::vector serialize_primitive_array(sparrow::primitive_array& arr) - { - // This function serializes a 
sparrow::primitive_array into a byte vector that is compliant - // with the Apache Arrow IPC Streaming Format. It constructs a stream containing two messages: - // 1. A Schema message: Describes the data's metadata (field name, type, nullability). - // 2. A RecordBatch message: Contains the actual array data (null count, length, and raw buffers). - // This two-part structure makes the data self-describing and readable by other Arrow-native tools. - // The implementation adheres to the specification by correctly handling: - // - Message order (Schema first, then RecordBatch). - // - The encapsulated message format (4-byte metadata length prefix). - // - 8-byte padding and alignment for the message body. - // - Correctly populating the Flatbuffer-defined metadata for both messages. - - // Get arrow structures - const auto [arrow_arr_ptr, arrow_schema_ptr] = sparrow::get_arrow_structures(arr); - const auto& arrow_arr = *arrow_arr_ptr; - const auto& arrow_schema = *arrow_schema_ptr; - - // I - Serialize the Schema message - auto final_buffer = details::serialize_schema_message(arrow_schema); - - // II - Serialize the RecordBatch message - // After the Schema, we send the RecordBatch containing the actual data - - // Calculate the size of the validity and data buffers - const int64_t validity_size = (arrow_arr.length + 7) / 8; - const int64_t data_size = arrow_arr.length * sizeof(T); - const std::vector buffers_sizes = {validity_size, data_size}; - details::serialize_record_batch_message(arrow_arr, buffers_sizes, final_buffer); - - // Return the final buffer containing the complete IPC stream - return final_buffer; - } - - template - sparrow::primitive_array deserialize_primitive_array(const std::vector& buffer) - { - const uint8_t* buf_ptr = buffer.data(); - size_t current_offset = 0; - - // I - Deserialize the Schema message - std::optional name; - std::optional> metadata; - deserialize_schema_message(std::span(buffer), current_offset, name, metadata); - - // II - 
Deserialize the RecordBatch message - const uint32_t batch_meta_len = *(reinterpret_cast(buf_ptr + current_offset)); - const auto* record_batch = deserialize_record_batch_message( - std::span(buffer), - current_offset - ); - - current_offset += utils::align_to_8(batch_meta_len); - const uint8_t* body_ptr = buf_ptr + current_offset; - - // Extract metadata from the RecordBatch - const auto buffers_meta = record_batch->buffers(); - const auto nodes_meta = record_batch->nodes(); - const auto node_meta = nodes_meta->Get(0); - - // The body contains the validity bitmap and the data buffer concatenated - // We need to copy this data into memory owned by the new ArrowArray - const int64_t validity_len = buffers_meta->Get(0)->length(); - const int64_t data_len = buffers_meta->Get(1)->length(); - - uint8_t* validity_buffer_copy = new uint8_t[validity_len]; - memcpy(validity_buffer_copy, body_ptr + buffers_meta->Get(0)->offset(), validity_len); - - uint8_t* data_buffer_copy = new uint8_t[data_len]; - memcpy(data_buffer_copy, body_ptr + buffers_meta->Get(1)->offset(), data_len); - - - auto data = sparrow::u8_buffer(reinterpret_cast(data_buffer_copy), node_meta->length()); - auto bitmap = sparrow::validity_bitmap(validity_buffer_copy, node_meta->length()); - - return sparrow::primitive_array(std::move(data), node_meta->length(), std::move(bitmap), name, metadata); - } -} diff --git a/src/serialize.cpp b/src/serialize.cpp deleted file mode 100644 index 723ef53..0000000 --- a/src/serialize.cpp +++ /dev/null @@ -1,180 +0,0 @@ -#include "sparrow_ipc/serialize.hpp" - -#include -#include - -#include "Message_generated.h" -#include "sparrow_ipc/utils.hpp" - -namespace sparrow_ipc -{ - namespace details - { - std::vector serialize_schema_message(const ArrowSchema& arrow_schema) - { - // Create a new builder for the Schema message's metadata - flatbuffers::FlatBufferBuilder schema_builder; - - flatbuffers::Offset fb_name_offset = 0; - if (arrow_schema.name) - { - fb_name_offset = 
schema_builder.CreateString(arrow_schema.name); - } - - // Determine the Flatbuffer type information from the C schema's format string - const auto [type_enum, type_offset] = utils::get_flatbuffer_type(schema_builder, arrow_schema.format); - - // Handle metadata - flatbuffers::Offset>> - fb_metadata_offset = 0; - - if (arrow_schema.metadata) - { - const auto metadata_view = sparrow::key_value_view(arrow_schema.metadata); - std::vector> kv_offsets; - kv_offsets.reserve(metadata_view.size()); - for (const auto& [key, value] : metadata_view) - { - const auto key_offset = schema_builder.CreateString(std::string(key)); - const auto value_offset = schema_builder.CreateString(std::string(value)); - kv_offsets.push_back( - org::apache::arrow::flatbuf::CreateKeyValue(schema_builder, key_offset, value_offset) - ); - } - fb_metadata_offset = schema_builder.CreateVector(kv_offsets); - } - - // Build the Field object - const auto fb_field = org::apache::arrow::flatbuf::CreateField( - schema_builder, - fb_name_offset, - (arrow_schema.flags & static_cast(sparrow::ArrowFlag::NULLABLE)) != 0, - type_enum, - type_offset, - 0, // dictionary - 0, // children - fb_metadata_offset - ); - - // A Schema contains a vector of fields - const std::vector> fields_vec = {fb_field}; - const auto fb_fields = schema_builder.CreateVector(fields_vec); - - // Build the Schema object from the vector of fields - const auto schema_offset = org::apache::arrow::flatbuf::CreateSchema( - schema_builder, - org::apache::arrow::flatbuf::Endianness::Little, - fb_fields - ); - - // Wrap the Schema in a top-level Message, which is the standard IPC envelope - const auto schema_message_offset = org::apache::arrow::flatbuf::CreateMessage( - schema_builder, - org::apache::arrow::flatbuf::MetadataVersion::V5, - org::apache::arrow::flatbuf::MessageHeader::Schema, - schema_offset.Union(), - 0 - ); - schema_builder.Finish(schema_message_offset); - - // Assemble the Schema message bytes - const uint32_t schema_len = 
schema_builder.GetSize(); // Get the size of the serialized metadata - // This will be the final buffer holding the complete IPC stream. - std::vector final_buffer; - final_buffer.resize(sizeof(uint32_t) + schema_len); // Resize the buffer to hold the message - // Copy the metadata into the buffer, after the 4-byte length prefix - memcpy(final_buffer.data() + sizeof(uint32_t), schema_builder.GetBufferPointer(), schema_len); - // Write the 4-byte metadata length at the beginning of the message - *(reinterpret_cast(final_buffer.data())) = schema_len; - return final_buffer; - } - - void serialize_record_batch_message( - const ArrowArray& arrow_arr, - const std::vector& buffers_sizes, - std::vector& final_buffer - ) - { - // Create a new builder for the RecordBatch message's metadata - flatbuffers::FlatBufferBuilder batch_builder; - - std::vector buffers_vec; - int64_t current_offset = 0; - int64_t body_len = 0; // The total size of the message body - for (const auto& size : buffers_sizes) - { - buffers_vec.emplace_back(current_offset, size); - current_offset += size; - } - body_len = current_offset; - - // Create the FieldNode, which describes the layout of the array data - const org::apache::arrow::flatbuf::FieldNode field_node_struct(arrow_arr.length, arrow_arr.null_count); - // A RecordBatch contains a vector of nodes and a vector of buffers - const auto fb_nodes_vector = batch_builder.CreateVectorOfStructs(&field_node_struct, 1); - const auto fb_buffers_vector = batch_builder.CreateVectorOfStructs(buffers_vec); - - // Build the RecordBatch metadata object - const auto record_batch_offset = org::apache::arrow::flatbuf::CreateRecordBatch( - batch_builder, - arrow_arr.length, - fb_nodes_vector, - fb_buffers_vector - ); - - // Wrap the RecordBatch in a top-level Message - const auto batch_message_offset = org::apache::arrow::flatbuf::CreateMessage( - batch_builder, - org::apache::arrow::flatbuf::MetadataVersion::V5, - 
org::apache::arrow::flatbuf::MessageHeader::RecordBatch, - record_batch_offset.Union(), - body_len - ); - batch_builder.Finish(batch_message_offset); - - // Append the RecordBatch message to the final buffer - const uint32_t batch_meta_len = batch_builder.GetSize(); // Get the size of the batch metadata - const int64_t aligned_batch_meta_len = utils::align_to_8(batch_meta_len); // Calculate the padded - // length - - const size_t current_size = final_buffer.size(); // Get the current size (which is the end of the - // Schema message) - // Resize the buffer to append the new message - final_buffer.resize(current_size + sizeof(uint32_t) + aligned_batch_meta_len + body_len); - uint8_t* dst = final_buffer.data() + current_size; // Get a pointer to where the new message will - // start - - // Write the 4-byte metadata length for the RecordBatch message - *(reinterpret_cast(dst)) = batch_meta_len; - dst += sizeof(uint32_t); - // Copy the RecordBatch metadata into the buffer - memcpy(dst, batch_builder.GetBufferPointer(), batch_meta_len); - // Add padding to align the body to an 8-byte boundary - memset(dst + batch_meta_len, 0, aligned_batch_meta_len - batch_meta_len); - - dst += aligned_batch_meta_len; - // Copy the actual data buffers (the message body) into the buffer - for (size_t i = 0; i < buffers_sizes.size(); ++i) - { - // arrow_arr.buffers[0] is the validity bitmap - // arrow_arr.buffers[1] is the actual data buffer - const uint8_t* data_buffer = reinterpret_cast(arrow_arr.buffers[i]); - if (data_buffer) - { - memcpy(dst, data_buffer, buffers_sizes[i]); - } - else - { - // If validity_bitmap is null, it means there are no nulls - if (i == 0) - { - memset(dst, 0xFF, buffers_sizes[i]); - } - } - dst += buffers_sizes[i]; - } - } - - - } // namespace details -} // namespace sparrow-ipc diff --git a/src/serialize_null_array.cpp b/src/serialize_null_array.cpp deleted file mode 100644 index 1b5378e..0000000 --- a/src/serialize_null_array.cpp +++ /dev/null @@ -1,47 +0,0 
@@ -#include "sparrow_ipc/serialize_null_array.hpp" - -#include "sparrow_ipc/deserialize.hpp" - -namespace sparrow_ipc -{ - // A null_array is represented by metadata only (Schema, RecordBatch) and has no data buffers, - // making its message body zero-length. - std::vector serialize_null_array(sparrow::null_array& arr) - { - const auto [arrow_arr_ptr, arrow_schema_ptr] = sparrow::get_arrow_structures(arr); - const auto& arrow_arr = *arrow_arr_ptr; - const auto& arrow_schema = *arrow_schema_ptr; - - // I - Serialize the Schema message - auto final_buffer = details::serialize_schema_message(arrow_schema); - - // II - Serialize the RecordBatch message - details::serialize_record_batch_message(arrow_arr, {}, final_buffer); - - // Return the final buffer containing the complete IPC stream - return final_buffer; - } - - // This reads the Schema and RecordBatch messages to extract the array's length, - // name, and metadata, then constructs a null_array. - sparrow::null_array deserialize_null_array(const std::vector& buffer) - { - const uint8_t* buf_ptr = buffer.data(); - size_t current_offset = 0; - - // I - Deserialize the Schema message - std::optional name; - std::optional> metadata; - deserialize_schema_message(std::span(buffer), current_offset, name, metadata); - - // II - Deserialize the RecordBatch message - const auto* record_batch = deserialize_record_batch_message( - std::span(buffer), - current_offset - ); - - // The body is empty, so we don't need to read any further. - // Construct the null_array from the deserialized metadata. - return sparrow::null_array(record_batch->length(), name, metadata); - } -} diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 7fb84c3..b84fc71 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -4,13 +4,10 @@ set(test_target "test_sparrow_ipc_lib") set(SPARROW_IPC_TESTS_SRC include/sparrow_ipc_tests_helpers.hpp - # TODO move all the files below under src? 
main.cpp test_arrow_array.cpp test_arrow_schema.cpp - test_null_array_serialization.cpp - test_primitive_array_serialization.cpp - test_primitive_array_with_files.cpp + test_deserialization_with_files.cpp test_utils.cpp ) diff --git a/tests/test_primitive_array_with_files.cpp b/tests/test_deserialization_with_files.cpp similarity index 100% rename from tests/test_primitive_array_with_files.cpp rename to tests/test_deserialization_with_files.cpp diff --git a/tests/test_null_array_serialization.cpp b/tests/test_null_array_serialization.cpp deleted file mode 100644 index 0b3e12c..0000000 --- a/tests/test_null_array_serialization.cpp +++ /dev/null @@ -1,51 +0,0 @@ -#include -#include - -#include "sparrow_ipc/serialize_null_array.hpp" -#include "sparrow_ipc_tests_helpers.hpp" - -namespace sparrow_ipc -{ - namespace sp = sparrow; - - TEST_CASE("Serialize and deserialize null_array") - { - const std::size_t size = 10; - const std::string_view name = "my_null_array"; - - const std::vector metadata_vec = {{"key1", "value1"}, {"key2", "value2"}}; - const std::optional> metadata = metadata_vec; - - sp::null_array arr(size, name, metadata); - - const auto buffer = serialize_null_array(arr); - const auto deserialized_arr = deserialize_null_array(buffer); - - CHECK_EQ(deserialized_arr.size(), arr.size()); - REQUIRE(deserialized_arr.name().has_value()); - CHECK_EQ(deserialized_arr.name().value(), arr.name().value()); - - REQUIRE(deserialized_arr.metadata().has_value()); - compare_metadata(arr, deserialized_arr); - - // Check the deserialized object is a null_array - const auto& arrow_proxy = sp::detail::array_access::get_arrow_proxy(deserialized_arr); - CHECK_EQ(arrow_proxy.format(), "n"); - CHECK_EQ(arrow_proxy.n_children(), 0); - CHECK_EQ(arrow_proxy.flags(), std::unordered_set{sp::ArrowFlag::NULLABLE}); - CHECK_EQ(arrow_proxy.name(), name); - CHECK_EQ(arrow_proxy.dictionary(), nullptr); - CHECK_EQ(arrow_proxy.buffers().size(), 0); - } - - TEST_CASE("Serialize and deserialize 
null_array with no name and no metadata") - { - const std::size_t size = 100; - sp::null_array arr(size); - const auto buffer = serialize_null_array(arr); - const auto deserialized_arr = deserialize_null_array(buffer); - CHECK_EQ(deserialized_arr.size(), arr.size()); - CHECK_FALSE(deserialized_arr.name().has_value()); - CHECK_FALSE(deserialized_arr.metadata().has_value()); - } -} diff --git a/tests/test_primitive_array_serialization.cpp b/tests/test_primitive_array_serialization.cpp deleted file mode 100644 index 450ab19..0000000 --- a/tests/test_primitive_array_serialization.cpp +++ /dev/null @@ -1,125 +0,0 @@ -#include -#include -#include -#include - -#include -#include - -#include "sparrow_ipc/serialize_primitive_array.hpp" -#include "sparrow_ipc_tests_helpers.hpp" - -namespace sparrow_ipc -{ - namespace sp = sparrow; - - using testing_types = std::tuple; - - template - void compare_bitmap(const sp::primitive_array& pa1, const sp::primitive_array& pa2) - { - const auto pa1_bitmap = pa1.bitmap(); - const auto pa2_bitmap = pa2.bitmap(); - - CHECK_EQ(pa1_bitmap.size(), pa2_bitmap.size()); - auto pa1_it = pa1_bitmap.begin(); - auto pa2_it = pa2_bitmap.begin(); - for (size_t i = 0; i < pa1_bitmap.size(); ++i) - { - CHECK_EQ(*pa1_it, *pa2_it); - ++pa1_it; - ++pa2_it; - } - } - - template - void compare_primitive_arrays(const sp::primitive_array& ar, const sp::primitive_array& deserialized_ar) - { - CHECK_EQ(ar, deserialized_ar); - compare_bitmap(ar, deserialized_ar); - compare_metadata(ar, deserialized_ar); - } - - TEST_CASE_TEMPLATE_DEFINE("Serialize and Deserialize primitive_array", T, primitive_array_types) - { - auto create_primitive_array = []() -> sp::primitive_array - { - if constexpr (std::is_same_v) - { - return {10, 20, 30, 40, 50}; - } - else if constexpr (std::is_same_v) - { - return {10.5f, 20.5f, 30.5f, 40.5f, 50.5f}; - } - else if constexpr (std::is_same_v) - { - return {10.1, 20.2, 30.3, 40.4, 50.5}; - } - else - { - FAIL("Unsupported type for 
templated test case"); - } - }; - - sp::primitive_array ar = create_primitive_array(); - - const std::vector serialized_data = serialize_primitive_array(ar); - - CHECK(serialized_data.size() > 0); - - sp::primitive_array deserialized_ar = deserialize_primitive_array(serialized_data); - - compare_primitive_arrays(ar, deserialized_ar); - } - - TEST_CASE_TEMPLATE_APPLY(primitive_array_types, testing_types); - - TEST_CASE("Serialize and Deserialize primitive_array - int with nulls") - { - // Data buffer - const sp::u8_buffer data_buffer = {100, 200, 300, 400, 500}; - - // Validity bitmap: 100 (valid), 200 (valid), 300 (null), 400 (valid), 500 (null) - sp::validity_bitmap validity(5, true); // All valid initially - validity.set(2, false); // Set index 2 to null - validity.set(4, false); // Set index 4 to null - - sp::primitive_array ar(std::move(data_buffer), std::move(validity)); - - const std::vector serialized_data = serialize_primitive_array(ar); - - CHECK(serialized_data.size() > 0); - - sp::primitive_array deserialized_ar = deserialize_primitive_array(serialized_data); - - compare_primitive_arrays(ar, deserialized_ar); - } - - TEST_CASE("Serialize and Deserialize primitive_array - with name and metadata") - { - // Data buffer - const sp::u8_buffer data_buffer = {1, 2, 3}; - - // Validity bitmap: All valid - const sp::validity_bitmap validity(3, true); - - // Custom metadata - const std::vector metadata = {{"key1", "value1"}, {"key2", "value2"}}; - - sp::primitive_array ar( - std::move(data_buffer), - std::move(validity), - "my_named_array", // name - std::make_optional(std::vector{{"key1", "value1"}, {"key2", "value2"}}) - ); - - const std::vector serialized_data = serialize_primitive_array(ar); - - CHECK(serialized_data.size() > 0); - - sp::primitive_array deserialized_ar = deserialize_primitive_array(serialized_data); - - compare_primitive_arrays(ar, deserialized_ar); - } -} From c36b7de34f1041dde683ff2b61a72f0e0c144339 Mon Sep 17 00:00:00 2001 From: Alexis 
Placet Date: Mon, 8 Sep 2025 16:17:47 +0200 Subject: [PATCH 16/22] wip --- src/deserialize.cpp | 53 --------------------------------------------- 1 file changed, 53 deletions(-) diff --git a/src/deserialize.cpp b/src/deserialize.cpp index f00a052..02d4e08 100644 --- a/src/deserialize.cpp +++ b/src/deserialize.cpp @@ -10,59 +10,6 @@ namespace sparrow_ipc { - void deserialize_schema_message( - std::span data, - size_t& current_offset, - std::optional& name, - std::optional>& metadata - ) - { - if (data.size() < (current_offset + sizeof(uint32_t))) - { - throw std::runtime_error("Data too short to contain schema length."); - } - const uint32_t schema_meta_len = *(reinterpret_cast(data.data() + current_offset)); - if (schema_meta_len == 0 || (data.size() < (current_offset + sizeof(uint32_t) + schema_meta_len))) - { - throw std::runtime_error("Invalid schema length."); - } - current_offset += sizeof(uint32_t); - const auto schema_message = org::apache::arrow::flatbuf::GetMessage(data.data() + current_offset); - if (schema_message->header_type() != org::apache::arrow::flatbuf::MessageHeader::Schema) - { - throw std::runtime_error("Expected Schema message at the start of the buffer."); - } - const auto flatbuffer_schema = static_cast( - schema_message->header() - ); - const auto fields = flatbuffer_schema->fields(); - if (fields->size() != 1) - { - throw std::runtime_error("Expected schema with exactly one field."); - } - - const auto field = fields->Get(0); - - // Get name - if (const auto fb_name = field->name()) - { - name = fb_name->str(); - } - - // Handle metadata - const auto fb_metadata = field->custom_metadata(); - if (fb_metadata && !fb_metadata->empty()) - { - metadata = std::vector(); - metadata->reserve(fb_metadata->size()); - for (const auto& kv : *fb_metadata) - { - metadata->emplace_back(kv->key()->str(), kv->value()->str()); - } - } - current_offset += schema_meta_len; - } - const org::apache::arrow::flatbuf::RecordBatch* 
deserialize_record_batch_message(std::span data, size_t& current_offset) { From 0ae315d07590da408db026256a6f6e2c655c4aa0 Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Mon, 8 Sep 2025 16:23:24 +0200 Subject: [PATCH 17/22] Avoid recreating metadata --- src/deserialize.cpp | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/src/deserialize.cpp b/src/deserialize.cpp index 02d4e08..cd50d16 100644 --- a/src/deserialize.cpp +++ b/src/deserialize.cpp @@ -21,6 +21,7 @@ namespace sparrow_ipc } return static_cast(batch_message->header()); } + /** * @brief Deserializes arrays from an Apache Arrow RecordBatch using the provided schema. * @@ -44,7 +45,8 @@ namespace sparrow_ipc std::vector get_arrays_from_record_batch( const org::apache::arrow::flatbuf::RecordBatch& record_batch, const org::apache::arrow::flatbuf::Schema& schema, - const EncapsulatedMessage& encapsulated_message + const EncapsulatedMessage& encapsulated_message, + const std::vector>>& field_metadata ) { const size_t length = static_cast(record_batch.length()); @@ -52,15 +54,12 @@ namespace sparrow_ipc std::vector arrays; arrays.reserve(schema.fields()->size()); - + size_t field_idx = 0; for (const auto field : *(schema.fields())) { const ::flatbuffers::Vector<::flatbuffers::Offset>* fb_custom_metadata = field->custom_metadata(); - const std::optional> - metadata = fb_custom_metadata == nullptr - ? 
std::nullopt - : std::make_optional(to_sparrow_metadata(*fb_custom_metadata)); + const std::optional>& metadata = field_metadata[field_idx++]; const auto name = field->name()->string_view(); const auto field_type = field->type_type(); const auto deserialize_non_owning_primitive_array_lambda = [&]() @@ -206,6 +205,7 @@ namespace sparrow_ipc std::vector field_names; std::vector fields_nullable; std::vector field_types; + std::vector>> fields_metadata; do { const auto [encapsulated_message, rest] = extract_encapsulated_message(data); @@ -218,11 +218,19 @@ namespace sparrow_ipc const size_t size = static_cast(schema->fields()->size()); field_names.reserve(size); fields_nullable.reserve(size); + fields_metadata.reserve(size); for (const auto field : *(schema->fields())) { field_names.emplace_back(field->name()->string_view()); fields_nullable.push_back(field->nullable()); + const ::flatbuffers::Vector<::flatbuffers::Offset>* + fb_custom_metadata = field->custom_metadata(); + std::optional> + metadata = fb_custom_metadata == nullptr + ? 
std::nullopt + : std::make_optional(to_sparrow_metadata(*fb_custom_metadata)); + fields_metadata.push_back(std::move(metadata)); } } break; @@ -240,7 +248,8 @@ namespace sparrow_ipc std::vector arrays = get_arrays_from_record_batch( *record_batch, *schema, - encapsulated_message + encapsulated_message, + fields_metadata ); std::vector field_names_str(field_names.cbegin(), field_names.cend()); record_batches.emplace_back(std::move(field_names_str), std::move(arrays)); From b94eea76a2839632700b632df75d18aad14a8d97 Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Mon, 8 Sep 2025 16:27:59 +0200 Subject: [PATCH 18/22] address comments --- include/sparrow_ipc/encapsulated_message.hpp | 6 +++--- include/sparrow_ipc/utils.hpp | 2 +- src/deserialize.cpp | 2 +- src/encapsulated_message.cpp | 22 ++++++++++---------- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/include/sparrow_ipc/encapsulated_message.hpp b/include/sparrow_ipc/encapsulated_message.hpp index a88e4de..7e95339 100644 --- a/include/sparrow_ipc/encapsulated_message.hpp +++ b/include/sparrow_ipc/encapsulated_message.hpp @@ -7,11 +7,11 @@ namespace sparrow_ipc { - class EncapsulatedMessage + class encapsulated_message { public: - EncapsulatedMessage(std::span data); + encapsulated_message(std::span data); [[nodiscard]] const org::apache::arrow::flatbuf::Message* flat_buffer_message() const; @@ -41,6 +41,6 @@ namespace sparrow_ipc std::span m_data; }; - [[nodiscard]] std::pair> + [[nodiscard]] std::pair> extract_encapsulated_message(std::span buf_ptr); } \ No newline at end of file diff --git a/include/sparrow_ipc/utils.hpp b/include/sparrow_ipc/utils.hpp index 44900b2..65563a0 100644 --- a/include/sparrow_ipc/utils.hpp +++ b/include/sparrow_ipc/utils.hpp @@ -5,8 +5,8 @@ #include #include -#include "config/config.hpp" #include "Schema_generated.h" +#include "sparrow_ipc/config/config.hpp" namespace sparrow_ipc::utils { diff --git a/src/deserialize.cpp b/src/deserialize.cpp index cd50d16..0d13072 
100644 --- a/src/deserialize.cpp +++ b/src/deserialize.cpp @@ -45,7 +45,7 @@ namespace sparrow_ipc std::vector get_arrays_from_record_batch( const org::apache::arrow::flatbuf::RecordBatch& record_batch, const org::apache::arrow::flatbuf::Schema& schema, - const EncapsulatedMessage& encapsulated_message, + const encapsulated_message& encapsulated_message, const std::vector>>& field_metadata ) { diff --git a/src/encapsulated_message.cpp b/src/encapsulated_message.cpp index 71efab2..128b7fc 100644 --- a/src/encapsulated_message.cpp +++ b/src/encapsulated_message.cpp @@ -7,19 +7,19 @@ namespace sparrow_ipc { - EncapsulatedMessage::EncapsulatedMessage(std::span data) + encapsulated_message::encapsulated_message(std::span data) : m_data(data) { } - const org::apache::arrow::flatbuf::Message* EncapsulatedMessage::flat_buffer_message() const + const org::apache::arrow::flatbuf::Message* encapsulated_message::flat_buffer_message() const { const uint8_t* message_ptr = m_data.data() + (sizeof(uint32_t) * 2); // 4 bytes continuation + 4 // bytes metadata size return org::apache::arrow::flatbuf::GetMessage(message_ptr); } - size_t EncapsulatedMessage::metadata_length() const + size_t encapsulated_message::metadata_length() const { return *(reinterpret_cast(m_data.data() + sizeof(uint32_t))); } @@ -30,7 +30,7 @@ namespace sparrow_ipc const org::apache::arrow::flatbuf::Tensor*, const org::apache::arrow::flatbuf::DictionaryBatch*, const org::apache::arrow::flatbuf::SparseTensor*> - EncapsulatedMessage::metadata() const + encapsulated_message::metadata() const { const auto schema_message = flat_buffer_message(); switch (schema_message->header_type()) @@ -61,17 +61,17 @@ namespace sparrow_ipc } const ::flatbuffers::Vector<::flatbuffers::Offset>* - EncapsulatedMessage::custom_metadata() const + encapsulated_message::custom_metadata() const { return flat_buffer_message()->custom_metadata(); } - size_t EncapsulatedMessage::body_length() const + size_t 
encapsulated_message::body_length() const { return static_cast(flat_buffer_message()->bodyLength()); } - std::span EncapsulatedMessage::body() const + std::span encapsulated_message::body() const { const size_t offset = sizeof(uint32_t) * 2 // 4 bytes continuation + 4 bytes metadata size + metadata_length(); @@ -83,7 +83,7 @@ namespace sparrow_ipc return m_data.subspan(padded_offset, body_length()); } - size_t EncapsulatedMessage::total_length() const + size_t encapsulated_message::total_length() const { const size_t offset = sizeof(uint32_t) * 2 // 4 bytes continuation + 4 bytes metadata size + metadata_length(); @@ -91,12 +91,12 @@ namespace sparrow_ipc return padded_offset + body_length(); } - std::span EncapsulatedMessage::as_span() const + std::span encapsulated_message::as_span() const { return m_data; } - std::pair> + std::pair> extract_encapsulated_message(std::span data) { if (!data.size() || data.size() < 8) @@ -108,7 +108,7 @@ namespace sparrow_ipc { throw std::runtime_error("Buffer starts with continuation bytes, expected a valid message."); } - EncapsulatedMessage message(data); + encapsulated_message message(data); std::span rest = data.subspan(message.total_length()); return {std::move(message), std::move(rest)}; } From 115d3a9159e3534748ab6f9e532dbf118393e78a Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Thu, 11 Sep 2025 11:47:41 +0200 Subject: [PATCH 19/22] Update conda env --- environment-dev.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment-dev.yml b/environment-dev.yml index 65e0f04..f1c0bc6 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -8,7 +8,7 @@ dependencies: - cxx-compiler # Libraries dependencies - flatbuffers - - sparrow >=1.1.0 + - sparrow-devel >=1.1.1 - doctest # Documentation dependencies - doxygen From e4bb2e9af1be4bc780b0e62c34b13c94dbd2da61 Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Thu, 11 Sep 2025 13:38:18 +0200 Subject: [PATCH 20/22] TRY FIX --- 
cmake/external_dependencies.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external_dependencies.cmake b/cmake/external_dependencies.cmake index 822ff6e..7eb7e6f 100644 --- a/cmake/external_dependencies.cmake +++ b/cmake/external_dependencies.cmake @@ -52,7 +52,7 @@ endif() find_package_or_fetch( PACKAGE_NAME sparrow GIT_REPOSITORY https://github.com/man-group/sparrow.git - TAG 1.1.0 + TAG 1.1.1 ) unset(CREATE_JSON_READER_TARGET) From dd01882b5f17c7973a6751a827d79ffeb98a0e54 Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Mon, 15 Sep 2025 14:27:41 +0200 Subject: [PATCH 21/22] Upgrade sparrow version --- environment-dev.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment-dev.yml b/environment-dev.yml index f1c0bc6..2e46112 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -8,7 +8,7 @@ dependencies: - cxx-compiler # Libraries dependencies - flatbuffers - - sparrow-devel >=1.1.1 + - sparrow-devel >=1.1.2 - doctest # Documentation dependencies - doxygen From 02d732294e20c0e94c155997361d7d3fa1cb4972 Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Tue, 16 Sep 2025 16:16:49 +0200 Subject: [PATCH 22/22] Fix windows run tests --- tests/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index b84fc71..b46f509 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -29,6 +29,9 @@ if(WIN32) COMMAND ${CMAKE_COMMAND} -E copy "$" "$" + COMMAND ${CMAKE_COMMAND} -E copy + "$" + "$" COMMENT "Copying sparrow and sparrow-ipc DLLs to executable directory" ) endif()