From 28a69a4ed660de51d4371ca7991944868bec09ad Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Tue, 5 Mar 2024 16:55:35 -0600 Subject: [PATCH 1/7] support non-vcpkg dependencies --- cpp/cmake/modules/FindBrotli.cmake | 8 +++--- cpp/cmake/modules/FindDepsBase.cmake | 2 +- cpp/cmake/modules/FindDepsKafkaAdapter.cmake | 2 +- cpp/cmake/modules/FindRdKafka.cmake | 19 +++++++++++++ cpp/cmake/modules/Findlz4.cmake | 12 +++++++++ setup.py | 28 +++++++++++--------- 6 files changed, 53 insertions(+), 18 deletions(-) create mode 100644 cpp/cmake/modules/FindRdKafka.cmake create mode 100644 cpp/cmake/modules/Findlz4.cmake diff --git a/cpp/cmake/modules/FindBrotli.cmake b/cpp/cmake/modules/FindBrotli.cmake index 312b468f6..6f068a74d 100644 --- a/cpp/cmake/modules/FindBrotli.cmake +++ b/cpp/cmake/modules/FindBrotli.cmake @@ -49,17 +49,17 @@ find_path( BROTLI_INCLUDE_DIR message("Include: ${BROTLI_INCLUDE_DIR}") find_library( BROTLI_STATIC_LIB_ENC - NAMES libbrotlienc.a libbrotlienc-static.a + NAMES libbrotlienc.a libbrotlienc-static.a brotlienc PATHS ${_brotli_roots} PATH_SUFFIXES "" "lib" ) find_library( BROTLI_STATIC_LIB_DEC - NAMES libbrotlidec.a libbrotlidec-static.a + NAMES libbrotlidec.a libbrotlidec-static.a brotlidec PATHS ${_brotli_roots} PATH_SUFFIXES "" "lib" ) find_library( BROTLI_STATIC_LIB_COMMON - NAMES libbrotlicommon.a libbrotlicommon-static.a + NAMES libbrotlicommon.a libbrotlicommon-static.a brotlicommon PATHS ${_brotli_roots} PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}" "lib" ) @@ -101,4 +101,4 @@ mark_as_advanced( BROTLI_LIBS BROTLI_LIBRARIES BROTLI_STATIC_LIB -) \ No newline at end of file +) diff --git a/cpp/cmake/modules/FindDepsBase.cmake b/cpp/cmake/modules/FindDepsBase.cmake index 1313e09f8..9f72c8f74 100644 --- a/cpp/cmake/modules/FindDepsBase.cmake +++ b/cpp/cmake/modules/FindDepsBase.cmake @@ -12,4 +12,4 @@ include_directories(${RapidJSON_INCLUDE_DIRS}) find_path(EXPRTK_INCLUDE_DIRS "exprtk.hpp") # For adapter utils -find_package(protobuf CONFIG REQUIRED) +find_package(Protobuf REQUIRED) diff --git a/cpp/cmake/modules/FindDepsKafkaAdapter.cmake b/cpp/cmake/modules/FindDepsKafkaAdapter.cmake index 0bef6f861..59e0404bd 100644 --- a/cpp/cmake/modules/FindDepsKafkaAdapter.cmake +++ b/cpp/cmake/modules/FindDepsKafkaAdapter.cmake @@ -1,3 +1,3 @@ cmake_minimum_required(VERSION 3.7.2) -find_package(RdKafka CONFIG REQUIRED) +find_package(RdKafka REQUIRED) diff --git a/cpp/cmake/modules/FindRdKafka.cmake b/cpp/cmake/modules/FindRdKafka.cmake new file mode 100644 index 000000000..432125886 --- /dev/null +++ b/cpp/cmake/modules/FindRdKafka.cmake @@ -0,0 +1,19 @@ +find_path(RdKafka_INCLUDE_DIR NAMES librdkafka/rdkafkacpp.h) +find_library(RdKafka_LIBRARY NAMES rdkafka++ librdkafkacpp) +find_library(RdKafka_C_LIBRARY NAMES rdkafka librdkafka) + +if (NOT TARGET RdKafka::rdkafka) + add_library(RdKafka::rdkafka STATIC IMPORTED) + set_property(TARGET RdKafka::rdkafka PROPERTY + IMPORTED_LOCATION "${RdKafka_C_LIBRARY}") + target_include_directories(RdKafka::rdkafka INTERFACE ${RdKafka_INCLUDE_DIR}) + + add_library(RdKafka::rdkafka++ STATIC IMPORTED) + set_property(TARGET RdKafka::rdkafka++ PROPERTY + IMPORTED_LOCATION "${RdKafka_LIBRARY}") + target_include_directories(RdKafka::rdkafka++ INTERFACE ${RdKafka_INCLUDE_DIR}) +endif() + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(RdKafka DEFAULT_MSG RdKafka_LIBRARY RdKafka_C_LIBRARY RdKafka_INCLUDE_DIR) +mark_as_advanced(RdKafka_INCLUDE_DIR RdKafka_LIBRARY RdKafka_C_LIBRARY RdKafka::rdkafka++ RdKafka::rdkafka) 
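For illustration only — a minimal sketch, not taken from the patch, of how a downstream CMakeLists.txt might consume the imported targets that the new FindRdKafka.cmake (and the companion Findlz4.cmake added below) define when the dependencies come from a system or conda prefix rather than vcpkg. The project/target names and the module-path line are assumptions, not part of this change:

# Illustrative consumer sketch (assumed project/target names).
# Assumes cpp/cmake/modules is added to CMAKE_MODULE_PATH so the custom
# Find modules introduced by this patch are picked up by find_package().
cmake_minimum_required(VERSION 3.7.2)
project(kafka_consumer_example CXX)

list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cpp/cmake/modules")

find_package(RdKafka REQUIRED)   # provides RdKafka::rdkafka and RdKafka::rdkafka++
find_package(lz4 REQUIRED)       # provides lz4::lz4

add_executable(my_kafka_consumer main.cpp)
target_link_libraries(my_kafka_consumer PRIVATE
    RdKafka::rdkafka++
    RdKafka::rdkafka
    lz4::lz4)
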
diff --git a/cpp/cmake/modules/Findlz4.cmake b/cpp/cmake/modules/Findlz4.cmake new file mode 100644 index 000000000..520961f7a --- /dev/null +++ b/cpp/cmake/modules/Findlz4.cmake @@ -0,0 +1,12 @@ +find_path(Lz4_INCLUDE_DIR lz4.h) +find_library(Lz4_LIBRARY NAMES lz4 liblz4) + +if (NOT TARGET lz4::lz4) + add_library(lz4::lz4 STATIC IMPORTED) + set_property(TARGET lz4::lz4 PROPERTY IMPORTED_LOCATION "${Lz4_LIBRARY}") + target_include_directories(lz4::lz4 INTERFACE ${Lz4_INCLUDE_DIR}) +endif() + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(lz4 DEFAULT_MSG Lz4_LIBRARY Lz4_INCLUDE_DIR) +mark_as_advanced(Lz4_INCLUDE_DIR Lz4_LIBRARY) diff --git a/setup.py b/setup.py index 6ba8b67ea..0abdff4fc 100644 --- a/setup.py +++ b/setup.py @@ -7,19 +7,21 @@ from shutil import which from skbuild import setup + # This will be used for e.g. the sdist -if not os.path.exists("vcpkg"): - subprocess.call(["git", "clone", "https://github.com/Microsoft/vcpkg.git"]) -if not os.path.exists("vcpkg/ports"): - subprocess.call(["git", "submodule", "update", "--init", "--recursive"]) -if not os.path.exists("vcpkg/buildtrees"): - subprocess.call(["git", "pull"], cwd="vcpkg") - if os.name == "nt": - subprocess.call(["bootstrap-vcpkg.bat"], cwd="vcpkg") - subprocess.call(["vcpkg", "install"], cwd="vcpkg") - else: - subprocess.call(["./bootstrap-vcpkg.sh"], cwd="vcpkg") - subprocess.call(["./vcpkg", "install"], cwd="vcpkg") +if os.environ.get("CSP_USE_VCPKG", "1") == "1": + if not os.path.exists("vcpkg"): + subprocess.call(["git", "clone", "https://github.com/Microsoft/vcpkg.git"]) + if not os.path.exists("vcpkg/ports"): + subprocess.call(["git", "submodule", "update", "--init", "--recursive"]) + if not os.path.exists("vcpkg/buildtrees"): + subprocess.call(["git", "pull"], cwd="vcpkg") + if os.name == "nt": + subprocess.call(["bootstrap-vcpkg.bat"], cwd="vcpkg") + subprocess.call(["vcpkg", "install"], cwd="vcpkg") + else: + subprocess.call(["./bootstrap-vcpkg.sh"], cwd="vcpkg") + subprocess.call(["./vcpkg", "install"], cwd="vcpkg") python_version = f"{sys.version_info.major}.{sys.version_info.minor}" @@ -40,6 +42,8 @@ "-DARROW_WITH_UTF8PROC=Off", ] ) +else: + cmake_args.append("-DCSP_BUILD_PARQUET_ADAPTER=OFF") # if "CONDA_PREFIX" in os.environ: # cmake_args.append(f"-DCMAKE_MODULE_PATH={os.environ['CONDA_PREFIX']}/lib/cmake/absl;{os.environ['CONDA_PREFIX']}/lib/cmake/arrow") From 118f57f3fe0f13bec994b2e8487125dc53c8d76a Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Tue, 5 Mar 2024 16:55:42 -0600 Subject: [PATCH 2/7] support arrow 15 Signed-off-by: Tim Paine --- cpp/cmake/modules/FindDepsKafkaAdapter.cmake | 9 + .../modules/FindDepsParquetAdapter.cmake | 1 + cpp/csp/adapters/kafka/CMakeLists.txt | 8 +- .../parquet/ArrowIPCFileWriterWrapper.cpp | 2 +- cpp/csp/adapters/parquet/CMakeLists.txt | 10 +- .../parquet/ParquetReaderColumnAdapter.cpp | 4 +- cpp/csp/adapters/utils/ProtobufHelper.h | 1 + cpp/csp/python/adapters/CMakeLists.txt | 44 +- .../python/adapters/parquetadapterimpl.cpp | 7 +- .../arrow/python/CMakeLists.txt} | 9 +- .../arrow/python/api.h | 0 .../arrow/python/arrow_to_pandas.cc | 464 +- .../arrow/python/arrow_to_pandas.h | 22 + .../arrow/python/arrow_to_python_internal.h | 0 .../pyarrow-15.0.0/arrow/python/async.h | 60 + .../arrow/python/benchmark.cc | 4 +- .../arrow/python/benchmark.h | 0 .../arrow/python/common.cc | 0 .../arrow/python/common.h | 118 +- .../arrow/python/csv.cc | 2 +- .../arrow/python/csv.h | 0 .../arrow/python/datetime.cc | 209 +- .../arrow/python/datetime.h | 32 
+- .../arrow/python/decimal.cc | 0 .../arrow/python/decimal.h | 0 .../arrow/python/deserialize.cc | 0 .../arrow/python/deserialize.h | 0 .../arrow/python/extension_type.cc | 0 .../arrow/python/extension_type.h | 0 .../arrow/python/filesystem.cc | 4 +- .../arrow/python/filesystem.h | 4 +- .../arrow/python/flight.cc | 98 +- .../arrow/python/flight.h | 19 +- .../arrow/python/gdb.cc | 221 +- .../arrow/python/gdb.h | 0 .../arrow/python/helpers.cc | 0 .../arrow/python/helpers.h | 0 .../arrow/python/inference.cc | 27 +- .../arrow/python/inference.h | 2 +- .../arrow/python/init.cc | 0 .../arrow/python/init.h | 0 .../arrow/python/io.cc | 12 +- .../arrow/python/io.h | 5 + .../arrow/python/ipc.cc | 2 +- .../arrow/python/ipc.h | 0 .../arrow/python/iterators.h | 0 .../pyarrow-15.0.0/arrow/python/lib.h | 83 + .../pyarrow-15.0.0/arrow/python/lib_api.h | 201 + .../arrow/python/numpy_convert.cc | 0 .../arrow/python/numpy_convert.h | 0 .../arrow/python/numpy_internal.h | 8 +- .../arrow/python/numpy_interop.h | 0 .../arrow/python/numpy_to_arrow.cc | 20 +- .../arrow/python/numpy_to_arrow.h | 0 .../arrow/python/parquet_encryption.cc | 98 + .../arrow/python/parquet_encryption.h | 132 + .../arrow/python/pch.h | 0 .../arrow/python/platform.h | 9 +- .../arrow/python/pyarrow.cc | 10 + .../arrow/python/pyarrow.h | 5 + .../pyarrow-15.0.0/arrow/python/pyarrow_api.h | 19 + .../pyarrow-15.0.0/arrow/python/pyarrow_lib.h | 19 + .../arrow/python/python_test.cc | 888 ++++ .../arrow/python/python_test.h} | 35 +- .../arrow/python/python_to_arrow.cc | 117 +- .../arrow/python/python_to_arrow.h | 0 .../arrow/python/serialize.cc | 0 .../arrow/python/serialize.h | 0 .../arrow/python/type_traits.h | 0 .../pyarrow-15.0.0/arrow/python/udf.cc | 745 +++ .../pyarrow-15.0.0/arrow/python/udf.h | 81 + .../arrow/python/visibility.h | 2 +- .../pyarrow-15.0.0/arrow/util/int_util.h | 137 + .../arrow/util/int_util_overflow.h} | 44 +- .../pyarrow-15.0.0/arrow/util/io_util.h | 420 ++ .../pyarrow-15.0.0/arrow/util/iterator.h | 568 ++ .../arrow/util/key_value_metadata.h | 98 + .../pyarrow-15.0.0/arrow/util/launder.h | 35 + .../pyarrow-15.0.0/arrow/util/list_util.h | 55 + .../pyarrow-15.0.0/arrow/util/logging.h | 259 + .../pyarrow-15.0.0/arrow/util/macros.h | 191 + .../vendored/pyarrow-15.0.0/arrow/util/map.h | 63 + .../arrow/util/math_constants.h | 32 + .../pyarrow-15.0.0/arrow/util/memory.h | 43 + .../pyarrow-15.0.0/arrow/util/mutex.h | 85 + .../pyarrow-15.0.0/arrow/util/parallel.h | 102 + .../pyarrow-15.0.0/arrow/util/pcg_random.h | 33 + .../pyarrow-15.0.0/arrow/util/print.h | 77 + .../pyarrow-15.0.0/arrow/util/queue.h | 29 + .../pyarrow-15.0.0/arrow/util/range.h | 258 + .../pyarrow-15.0.0/arrow/util/ree_util.h | 582 +++ .../pyarrow-15.0.0/arrow/util/regex.h | 51 + .../pyarrow-15.0.0/arrow/util/rle_encoding.h | 826 +++ .../arrow/util/rows_to_batches.h | 163 + .../vendored/pyarrow-15.0.0/arrow/util/simd.h | 44 + .../pyarrow-15.0.0/arrow/util/small_vector.h | 511 ++ .../vendored/pyarrow-15.0.0/arrow/util/sort.h | 78 + .../pyarrow-15.0.0/arrow/util/spaced.h | 98 + .../vendored/pyarrow-15.0.0/arrow/util/span.h | 132 + .../pyarrow-15.0.0/arrow/util/stopwatch.h | 48 + .../pyarrow-15.0.0/arrow/util/string.h | 173 + .../arrow/util/string_builder.h | 84 + .../pyarrow-15.0.0/arrow/util/task_group.h | 106 + .../pyarrow-15.0.0/arrow/util/tdigest.h | 104 + .../pyarrow-15.0.0/arrow/util/test_common.h | 90 + .../pyarrow-15.0.0/arrow/util/thread_pool.h | 620 +++ .../vendored/pyarrow-15.0.0/arrow/util/time.h | 83 + .../pyarrow-15.0.0/arrow/util/tracing.h | 45 + 
.../vendored/pyarrow-15.0.0/arrow/util/trie.h | 243 + .../pyarrow-15.0.0/arrow/util/type_fwd.h | 69 + .../pyarrow-15.0.0/arrow/util/type_traits.h | 46 + .../pyarrow-15.0.0/arrow/util/ubsan.h | 87 + .../pyarrow-15.0.0/arrow/util/union_util.h | 31 + .../pyarrow-15.0.0/arrow/util/unreachable.h | 30 + .../vendored/pyarrow-15.0.0/arrow/util/uri.h | 118 + .../vendored/pyarrow-15.0.0/arrow/util/utf8.h | 59 + .../pyarrow-15.0.0/arrow/util/value_parsing.h | 928 ++++ .../pyarrow-15.0.0/arrow/util/vector.h | 172 + .../pyarrow-15.0.0/arrow/util/visibility.h | 83 + .../arrow/util/windows_compatibility.h | 40 + .../pyarrow-15.0.0/arrow/util/windows_fixup.h | 52 + .../vendored/portable-snippets/debug-trap.h | 83 + .../vendored/portable-snippets/safe-math.h | 0 .../pyarrow-15.0.0/arrow/vendored/xxhash.h | 18 + .../arrow/vendored/xxhash/xxhash.h | 4562 ++++++++++++----- .../arrow/python/ArrowPythonConfig.cmake.in | 36 - .../python/ArrowPythonFlightConfig.cmake.in | 37 - .../pyarrow-7.0.0/arrow/python/CMakeLists.txt | 189 - .../arrow/python/arrow-python.pc.in | 26 - .../pyarrow-7.0.0/arrow/python/pyarrow_api.h | 239 - .../pyarrow-7.0.0/arrow/python/pyarrow_lib.h | 82 - .../pyarrow-7.0.0/arrow/python/python_test.cc | 599 --- .../arrow/python/util/CMakeLists.txt | 32 - vcpkg.json | 3 +- 134 files changed, 15117 insertions(+), 2986 deletions(-) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0/arrow/python/arrow-python-flight.pc.in => pyarrow-15.0.0/arrow/python/CMakeLists.txt} (73%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/api.h (100%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/arrow_to_pandas.cc (85%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/arrow_to_pandas.h (80%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/arrow_to_python_internal.h (100%) create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/python/async.h rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/benchmark.cc (94%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/benchmark.h (100%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/common.cc (100%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/common.h (74%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/csv.cc (98%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/csv.h (100%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/datetime.cc (72%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/datetime.h (87%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/decimal.cc (100%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/decimal.h (100%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/deserialize.cc (100%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/deserialize.h (100%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/extension_type.cc (100%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/extension_type.h (100%) rename 
cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/filesystem.cc (97%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/filesystem.h (96%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/flight.cc (81%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/flight.h (95%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/gdb.cc (65%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/gdb.h (100%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/helpers.cc (100%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/helpers.h (100%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/inference.cc (95%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/inference.h (98%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/init.cc (100%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/init.h (100%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/io.cc (95%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/io.h (92%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/ipc.cc (98%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/ipc.h (100%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/iterators.h (100%) create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/python/lib.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/python/lib_api.h rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/numpy_convert.cc (100%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/numpy_convert.h (100%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/numpy_internal.h (96%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/numpy_interop.h (100%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/numpy_to_arrow.cc (98%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/numpy_to_arrow.h (100%) create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/python/parquet_encryption.cc create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/python/parquet_encryption.h rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/pch.h (100%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/platform.h (86%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/pyarrow.cc (94%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/pyarrow.h (91%) create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/python/pyarrow_api.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/python/pyarrow_lib.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/python/python_test.cc rename 
cpp/csp/python/adapters/vendored/{pyarrow-7.0.0/arrow/python/util/test_main.cc => pyarrow-15.0.0/arrow/python/python_test.h} (65%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/python_to_arrow.cc (89%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/python_to_arrow.h (100%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/serialize.cc (100%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/serialize.h (100%) rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/type_traits.h (100%) create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/python/udf.cc create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/python/udf.h rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/python/visibility.h (97%) create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/int_util.h rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0/arrow/util/int_util_internal.h => pyarrow-15.0.0/arrow/util/int_util_overflow.h} (76%) create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/io_util.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/iterator.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/key_value_metadata.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/launder.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/list_util.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/logging.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/macros.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/map.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/math_constants.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/memory.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/mutex.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/parallel.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/pcg_random.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/print.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/queue.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/range.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/ree_util.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/regex.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/rle_encoding.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/rows_to_batches.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/simd.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/small_vector.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/sort.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/spaced.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/span.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/stopwatch.h create mode 100644 
cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/string.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/string_builder.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/task_group.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/tdigest.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/test_common.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/thread_pool.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/time.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/tracing.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/trie.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/type_fwd.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/type_traits.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/ubsan.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/union_util.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/unreachable.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/uri.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/utf8.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/value_parsing.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/vector.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/visibility.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/windows_compatibility.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/util/windows_fixup.h create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/vendored/portable-snippets/debug-trap.h rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/vendored/portable-snippets/safe-math.h (100%) create mode 100644 cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/vendored/xxhash.h rename cpp/csp/python/adapters/vendored/{pyarrow-7.0.0 => pyarrow-15.0.0}/arrow/vendored/xxhash/xxhash.h (50%) delete mode 100644 cpp/csp/python/adapters/vendored/pyarrow-7.0.0/arrow/python/ArrowPythonConfig.cmake.in delete mode 100644 cpp/csp/python/adapters/vendored/pyarrow-7.0.0/arrow/python/ArrowPythonFlightConfig.cmake.in delete mode 100644 cpp/csp/python/adapters/vendored/pyarrow-7.0.0/arrow/python/CMakeLists.txt delete mode 100644 cpp/csp/python/adapters/vendored/pyarrow-7.0.0/arrow/python/arrow-python.pc.in delete mode 100644 cpp/csp/python/adapters/vendored/pyarrow-7.0.0/arrow/python/pyarrow_api.h delete mode 100644 cpp/csp/python/adapters/vendored/pyarrow-7.0.0/arrow/python/pyarrow_lib.h delete mode 100644 cpp/csp/python/adapters/vendored/pyarrow-7.0.0/arrow/python/python_test.cc delete mode 100644 cpp/csp/python/adapters/vendored/pyarrow-7.0.0/arrow/python/util/CMakeLists.txt diff --git a/cpp/cmake/modules/FindDepsKafkaAdapter.cmake b/cpp/cmake/modules/FindDepsKafkaAdapter.cmake index 59e0404bd..6fab29400 100644 --- a/cpp/cmake/modules/FindDepsKafkaAdapter.cmake +++ b/cpp/cmake/modules/FindDepsKafkaAdapter.cmake @@ -1,3 +1,12 @@ cmake_minimum_required(VERSION 3.7.2) find_package(RdKafka REQUIRED) +find_package(OpenSSL REQUIRED) +find_package(lz4 REQUIRED) + +pkg_check_modules(SASL libsasl2) +if(SASL_FOUND) + 
set(CSP_LINK_SASL ON) +else() + set(CSP_LINK_SASL OFF) +endif() diff --git a/cpp/cmake/modules/FindDepsParquetAdapter.cmake b/cpp/cmake/modules/FindDepsParquetAdapter.cmake index f24a652ed..d932139b2 100644 --- a/cpp/cmake/modules/FindDepsParquetAdapter.cmake +++ b/cpp/cmake/modules/FindDepsParquetAdapter.cmake @@ -11,6 +11,7 @@ include_directories(${PARQUET_INCLUDE_DIR}) # Other deps find_package(Thrift REQUIRED) find_package(Brotli REQUIRED) +find_package(Snappy REQUIRED) # find_package(unofficial-brotli CONFIG REQUIRED) find_package(utf8proc REQUIRED) # find_package(unofficial-utf8proc CONFIG REQUIRED) diff --git a/cpp/csp/adapters/kafka/CMakeLists.txt b/cpp/csp/adapters/kafka/CMakeLists.txt index 60a1cdc52..217753b6b 100644 --- a/cpp/csp/adapters/kafka/CMakeLists.txt +++ b/cpp/csp/adapters/kafka/CMakeLists.txt @@ -21,7 +21,13 @@ add_library(csp_kafka_adapter STATIC ${KAFKA_SOURCE_FILES}) set_target_properties(csp_kafka_adapter PROPERTIES PUBLIC_HEADER "${KAFKA_HEADER_FILES}") find_package(RdKafka REQUIRED) -target_link_libraries(csp_kafka_adapter csp_adapter_utils RdKafka::rdkafka RdKafka::rdkafka++) +find_package(OpenSSL REQUIRED) +find_package(lz4 REQUIRED) +if(CSP_LINK_SASL) + set(SASL_LIBRARY sasl2) +endif() + +target_link_libraries(csp_kafka_adapter PRIVATE csp_adapter_utils ${RdKafka_LIBRARY} ${RdKafka_C_LIBRARY} OpenSSL::SSL OpenSSL::Crypto lz4::lz4 ${SASL_LIBRARY}) install(TARGETS csp_kafka_adapter PUBLIC_HEADER DESTINATION include/csp/adapters/kafka diff --git a/cpp/csp/adapters/parquet/ArrowIPCFileWriterWrapper.cpp b/cpp/csp/adapters/parquet/ArrowIPCFileWriterWrapper.cpp index 78ed7fe19..daf23337e 100644 --- a/cpp/csp/adapters/parquet/ArrowIPCFileWriterWrapper.cpp +++ b/cpp/csp/adapters/parquet/ArrowIPCFileWriterWrapper.cpp @@ -19,7 +19,7 @@ void ArrowIPCFileWriterWrapper::openImpl( const std::string &fileName, const std writeOptions.codec = resolveCompressionCodec( compression ); STATUS_OK_OR_THROW_RUNTIME( - ::arrow::ipc::NewStreamWriter( m_outputStream.get(), getSchema(), writeOptions ).Value(&m_fileWriter), + ::arrow::ipc::MakeStreamWriter( m_outputStream.get(), getSchema(), writeOptions ).Value(&m_fileWriter), "Failed to open arrow file writer" ); } diff --git a/cpp/csp/adapters/parquet/CMakeLists.txt b/cpp/csp/adapters/parquet/CMakeLists.txt index 65f2b2519..d976a5538 100644 --- a/cpp/csp/adapters/parquet/CMakeLists.txt +++ b/cpp/csp/adapters/parquet/CMakeLists.txt @@ -50,7 +50,15 @@ find_package(lz4 REQUIRED) find_package(utf8proc REQUIRED) find_package(Brotli REQUIRED) -target_link_libraries(csp_parquet_adapter PRIVATE csp_adapter_utils parquet_static arrow_static thrift::thrift lz4::lz4 utf8proc::utf8proc ${BROTLI_STATIC_LIB}) +if(CSP_USE_VCPKG) + # use staic variants + set(ARROW_PACKAGES_TO_LINK parquet_static arrow_static) +else() + # use dynamic variants + set(ARROW_PACKAGES_TO_LINK parquet arrow) +endif() + +target_link_libraries(csp_parquet_adapter PRIVATE csp_adapter_utils thrift::thrift lz4::lz4 utf8proc::utf8proc ${BROTLI_STATIC_LIB} ${ARROW_PACKAGES_TO_LINK}) install(TARGETS csp_parquet_adapter PUBLIC_HEADER DESTINATION include/csp/adapters/parquet diff --git a/cpp/csp/adapters/parquet/ParquetReaderColumnAdapter.cpp b/cpp/csp/adapters/parquet/ParquetReaderColumnAdapter.cpp index 8f36449f6..978b96b32 100644 --- a/cpp/csp/adapters/parquet/ParquetReaderColumnAdapter.cpp +++ b/cpp/csp/adapters/parquet/ParquetReaderColumnAdapter.cpp @@ -777,14 +777,14 @@ void ListColumnAdapter::readCurValue() for( int64_t i = 0; i < typedValues -> length(); ++i ) { - 
maxStringLength = std::max( ( uint32_t ) ArrayValidValueProvider::getValue(typedValues, i).length(), maxStringLength ); + maxStringLength = std::max( ( uint32_t ) ArrayValidValueProvider::getValue(typedValues, i).length(), maxStringLength ); } auto arrayValue = m_listReader -> create( typedValues -> length(), maxStringLength ); for( int64_t i = 0; i < typedValues -> length(); ++i ) { - m_listReader -> setValue( arrayValue, i, typedValues -> GetView( i ).to_string() ); + m_listReader -> setValue( arrayValue, i, std::string(typedValues -> GetView( i ))); } this -> m_curValue = std::move( arrayValue ); } diff --git a/cpp/csp/adapters/utils/ProtobufHelper.h b/cpp/csp/adapters/utils/ProtobufHelper.h index 62c5c1ccb..82d7dab7a 100644 --- a/cpp/csp/adapters/utils/ProtobufHelper.h +++ b/cpp/csp/adapters/utils/ProtobufHelper.h @@ -1,3 +1,4 @@ +#include #include #include #include diff --git a/cpp/csp/python/adapters/CMakeLists.txt b/cpp/csp/python/adapters/CMakeLists.txt index cc4bc50f3..e44630320 100644 --- a/cpp/csp/python/adapters/CMakeLists.txt +++ b/cpp/csp/python/adapters/CMakeLists.txt @@ -6,30 +6,30 @@ if(CSP_BUILD_KAFKA_ADAPTER) endif() if(CSP_BUILD_PARQUET_ADAPTER) - set(VENDORED_PYARROW_ROOW "${CMAKE_SOURCE_DIR}/cpp/csp/python/adapters/vendored/pyarrow-7.0.0/") + set(VENDORED_PYARROW_ROOT "${CMAKE_SOURCE_DIR}/cpp/csp/python/adapters/vendored/pyarrow-15.0.0/") set(ARROW_PYTHON_SRCS - ${VENDORED_PYARROW_ROOW}/arrow/python/arrow_to_pandas.cc - ${VENDORED_PYARROW_ROOW}/arrow/python/benchmark.cc - ${VENDORED_PYARROW_ROOW}/arrow/python/common.cc - ${VENDORED_PYARROW_ROOW}/arrow/python/datetime.cc - ${VENDORED_PYARROW_ROOW}/arrow/python/decimal.cc - ${VENDORED_PYARROW_ROOW}/arrow/python/deserialize.cc - ${VENDORED_PYARROW_ROOW}/arrow/python/extension_type.cc - ${VENDORED_PYARROW_ROOW}/arrow/python/gdb.cc - ${VENDORED_PYARROW_ROOW}/arrow/python/helpers.cc - ${VENDORED_PYARROW_ROOW}/arrow/python/inference.cc - ${VENDORED_PYARROW_ROOW}/arrow/python/init.cc - ${VENDORED_PYARROW_ROOW}/arrow/python/io.cc - ${VENDORED_PYARROW_ROOW}/arrow/python/ipc.cc - ${VENDORED_PYARROW_ROOW}/arrow/python/numpy_convert.cc - ${VENDORED_PYARROW_ROOW}/arrow/python/numpy_to_arrow.cc - ${VENDORED_PYARROW_ROOW}/arrow/python/python_to_arrow.cc - ${VENDORED_PYARROW_ROOW}/arrow/python/pyarrow.cc - ${VENDORED_PYARROW_ROOW}/arrow/python/serialize.cc - ${VENDORED_PYARROW_ROOW}/arrow/python/csv.cc - ${VENDORED_PYARROW_ROOW}/arrow/python/filesystem.cc) + ${VENDORED_PYARROW_ROOT}/arrow/python/arrow_to_pandas.cc + ${VENDORED_PYARROW_ROOT}/arrow/python/benchmark.cc + ${VENDORED_PYARROW_ROOT}/arrow/python/common.cc + ${VENDORED_PYARROW_ROOT}/arrow/python/datetime.cc + ${VENDORED_PYARROW_ROOT}/arrow/python/decimal.cc + ${VENDORED_PYARROW_ROOT}/arrow/python/deserialize.cc + ${VENDORED_PYARROW_ROOT}/arrow/python/extension_type.cc + ${VENDORED_PYARROW_ROOT}/arrow/python/gdb.cc + ${VENDORED_PYARROW_ROOT}/arrow/python/helpers.cc + ${VENDORED_PYARROW_ROOT}/arrow/python/inference.cc + ${VENDORED_PYARROW_ROOT}/arrow/python/init.cc + ${VENDORED_PYARROW_ROOT}/arrow/python/io.cc + ${VENDORED_PYARROW_ROOT}/arrow/python/ipc.cc + ${VENDORED_PYARROW_ROOT}/arrow/python/numpy_convert.cc + ${VENDORED_PYARROW_ROOT}/arrow/python/numpy_to_arrow.cc + ${VENDORED_PYARROW_ROOT}/arrow/python/python_to_arrow.cc + ${VENDORED_PYARROW_ROOT}/arrow/python/pyarrow.cc + ${VENDORED_PYARROW_ROOT}/arrow/python/serialize.cc + ${VENDORED_PYARROW_ROOT}/arrow/python/csv.cc + ${VENDORED_PYARROW_ROOT}/arrow/python/filesystem.cc) add_library(parquetadapterimpl SHARED 
parquetadapterimpl.cpp ${ARROW_PYTHON_SRCS}) target_link_libraries(parquetadapterimpl csp_core csp_engine cspimpl csp_parquet_adapter) - target_include_directories(parquetadapterimpl PUBLIC ${ARROW_INCLUDE_DIR} ${PARQUET_INCLUDE_DIR} "${VENDORED_PYARROW_ROOW}") + target_include_directories(parquetadapterimpl PUBLIC ${ARROW_INCLUDE_DIR} ${PARQUET_INCLUDE_DIR} "${VENDORED_PYARROW_ROOT}") install(TARGETS parquetadapterimpl RUNTIME DESTINATION bin/ LIBRARY DESTINATION lib/) endif() diff --git a/cpp/csp/python/adapters/parquetadapterimpl.cpp b/cpp/csp/python/adapters/parquetadapterimpl.cpp index 8c7ecf84b..67020d056 100644 --- a/cpp/csp/python/adapters/parquetadapterimpl.cpp +++ b/cpp/csp/python/adapters/parquetadapterimpl.cpp @@ -190,7 +190,10 @@ class ArrowTableGenerator : public csp::Generator, std::shared_ptr bufferReader = std::make_shared( reinterpret_cast(data), size ); std::shared_ptr reader = arrow::ipc::RecordBatchStreamReader::Open(bufferReader.get()).ValueOrDie(); - STATUS_OK_OR_THROW_RUNTIME(reader->ReadAll(&value), "Failed read arrow table from buffer"); + auto result = reader->ToTable(); + if (!(result.ok())) + CSP_THROW(csp::RuntimeException, "Failed read arrow table from buffer"); + value = std::move(result.ValueUnsafe()); return true; } private: @@ -634,7 +637,7 @@ csp::AdapterManager *create_parquet_output_adapter_manager( PyEngine *engine, co { PyObjectPtr pyFilenameVisitor = PyObjectPtr::own( toPython( pyFilenameVisitorDG ) ); fileVisitor = [pyFilenameVisitor]( const std::string & filename ) - { + { PyObjectPtr rv = PyObjectPtr::own( PyObject_CallFunction( pyFilenameVisitor.get(), "O", PyObjectPtr::own( toPython( filename ) ).get() ) ); if( !rv.get() ) CSP_THROW( PythonPassthrough, "" ); diff --git a/cpp/csp/python/adapters/vendored/pyarrow-7.0.0/arrow/python/arrow-python-flight.pc.in b/cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/python/CMakeLists.txt similarity index 73% rename from cpp/csp/python/adapters/vendored/pyarrow-7.0.0/arrow/python/arrow-python-flight.pc.in rename to cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/python/CMakeLists.txt index fabed1b2d..ff355e46a 100644 --- a/cpp/csp/python/adapters/vendored/pyarrow-7.0.0/arrow/python/arrow-python-flight.pc.in +++ b/cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/python/CMakeLists.txt @@ -15,11 +15,4 @@ # specific language governing permissions and limitations # under the License. 
-libdir=@CMAKE_INSTALL_FULL_LIBDIR@ -includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ - -Name: Apache Arrow Python Flight -Description: Python integration library for Apache Arrow Flight -Version: @ARROW_VERSION@ -Requires: arrow-python arrow-flight -Libs: -L${libdir} -larrow_python_flight +arrow_install_all_headers("arrow/python") diff --git a/cpp/csp/python/adapters/vendored/pyarrow-7.0.0/arrow/python/api.h b/cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/python/api.h similarity index 100% rename from cpp/csp/python/adapters/vendored/pyarrow-7.0.0/arrow/python/api.h rename to cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/python/api.h diff --git a/cpp/csp/python/adapters/vendored/pyarrow-7.0.0/arrow/python/arrow_to_pandas.cc b/cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/python/arrow_to_pandas.cc similarity index 85% rename from cpp/csp/python/adapters/vendored/pyarrow-7.0.0/arrow/python/arrow_to_pandas.cc rename to cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/python/arrow_to_pandas.cc index 41e537191..e979342b8 100644 --- a/cpp/csp/python/adapters/vendored/pyarrow-7.0.0/arrow/python/arrow_to_pandas.cc +++ b/cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/python/arrow_to_pandas.cc @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -43,7 +44,6 @@ #include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "arrow/util/parallel.h" -#include "arrow/util/string_view.h" #include "arrow/visit_type_inline.h" #include "arrow/compute/api.h" @@ -65,7 +65,6 @@ class MemoryPool; using internal::checked_cast; using internal::CheckIndexBounds; -using internal::GetByteWidth; using internal::OptionalParallelFor; namespace py { @@ -166,6 +165,7 @@ static inline bool ListTypeSupported(const DataType& type) { case Type::INT32: case Type::INT64: case Type::UINT64: + case Type::HALF_FLOAT: case Type::FLOAT: case Type::DOUBLE: case Type::DECIMAL128: @@ -177,11 +177,13 @@ static inline bool ListTypeSupported(const DataType& type) { case Type::DATE32: case Type::DATE64: case Type::STRUCT: + case Type::MAP: case Type::TIME32: case Type::TIME64: case Type::TIMESTAMP: case Type::DURATION: case Type::DICTIONARY: + case Type::INTERVAL_MONTH_DAY_NANO: case Type::NA: // empty list // The above types are all supported. 
return true; @@ -191,6 +193,10 @@ static inline bool ListTypeSupported(const DataType& type) { const auto& list_type = checked_cast(type); return ListTypeSupported(*list_type.value_type()); } + case Type::EXTENSION: { + const auto& ext = checked_cast(*type.GetSharedPtr()); + return ListTypeSupported(*(ext.storage_type())); + } default: break; } @@ -279,7 +285,7 @@ inline const T* GetPrimitiveValues(const Array& arr) { if (arr.length() == 0) { return nullptr; } - const int elsize = GetByteWidth(*arr.type()); + const int elsize = arr.type()->byte_width(); const auto& prim_arr = checked_cast(arr); return reinterpret_cast(prim_arr.values()->data() + arr.offset() * elsize); } @@ -337,6 +343,9 @@ class PandasWriter { DATETIME_MILLI, DATETIME_MICRO, DATETIME_NANO, + DATETIME_SECOND_TZ, + DATETIME_MILLI_TZ, + DATETIME_MICRO_TZ, DATETIME_NANO_TZ, TIMEDELTA_SECOND, TIMEDELTA_MILLI, @@ -347,7 +356,10 @@ class PandasWriter { }; PandasWriter(const PandasOptions& options, int64_t num_rows, int num_columns) - : options_(options), num_rows_(num_rows), num_columns_(num_columns) {} + : options_(options), num_rows_(num_rows), num_columns_(num_columns) { + PyAcquireGIL lock; + internal::InitPandasStaticData(); + } virtual ~PandasWriter() {} void SetBlockData(PyObject* arr) { @@ -370,7 +382,6 @@ class PandasWriter { return Status::OK(); } PyAcquireGIL lock; - npy_intp placement_dims[1] = {num_columns_}; PyObject* placement_arr = PyArray_SimpleNew(1, placement_dims, NPY_INT64); RETURN_IF_PYERROR(); @@ -486,7 +497,7 @@ class PandasWriter { Status AllocateNDArray(int npy_type, int ndim = 2) { PyAcquireGIL lock; - PyObject* block_arr; + PyObject* block_arr = nullptr; npy_intp block_dims[2] = {0, 0}; if (ndim == 2) { @@ -584,7 +595,7 @@ template struct MemoizationTraits> { // For binary, we memoize string_view as a scalar value to avoid having to // unnecessarily copy the memory into the memo table data structure - using Scalar = util::string_view; + using Scalar = std::string_view; }; // Generic Array -> PyObject** converter that handles object deduplication, if @@ -654,7 +665,12 @@ Status ConvertStruct(PandasOptions options, const ChunkedArray& data, auto arr = checked_cast(data.chunk(c).get()); // Convert the struct arrays first for (int32_t i = 0; i < num_fields; i++) { - const auto field = arr->field(static_cast(i)); + auto field = arr->field(static_cast(i)); + // In case the field is an extension array, use .storage() to convert to Pandas + if (field->type()->id() == Type::EXTENSION) { + const ExtensionArray& arr_ext = checked_cast(*field); + field = arr_ext.storage(); + } RETURN_NOT_OK(ConvertArrayToPandas(options, field, nullptr, fields_data[i + fields_data_offset].ref())); DCHECK(PyArray_Check(fields_data[i + fields_data_offset].obj())); @@ -727,11 +743,26 @@ Status ConvertListsLike(PandasOptions options, const ChunkedArray& data, ArrayVector value_arrays; for (int c = 0; c < data.num_chunks(); c++) { const auto& arr = checked_cast(*data.chunk(c)); - value_arrays.emplace_back(arr.values()); + // values() does not account for offsets, so we need to slice into it. + // We can't use Flatten(), because it removes the values behind a null list + // value, and that makes the offsets into original list values and our + // flattened_values array different. 
+ std::shared_ptr flattened_values = arr.values()->Slice( + arr.value_offset(0), arr.value_offset(arr.length()) - arr.value_offset(0)); + if (arr.value_type()->id() == Type::EXTENSION) { + const auto& arr_ext = checked_cast(*flattened_values); + value_arrays.emplace_back(arr_ext.storage()); + } else { + value_arrays.emplace_back(flattened_values); + } } + using ListArrayType = typename ListArrayT::TypeClass; const auto& list_type = checked_cast(*data.type()); auto value_type = list_type.value_type(); + if (value_type->id() == Type::EXTENSION) { + value_type = checked_cast(*value_type).storage_type(); + } auto flat_column = std::make_shared(value_arrays, value_type); @@ -740,22 +771,24 @@ Status ConvertListsLike(PandasOptions options, const ChunkedArray& data, OwnedRefNoGIL owned_numpy_array; RETURN_NOT_OK(ConvertChunkedArrayToPandas(options, flat_column, nullptr, owned_numpy_array.ref())); - PyObject* numpy_array = owned_numpy_array.obj(); DCHECK(PyArray_Check(numpy_array)); int64_t chunk_offset = 0; for (int c = 0; c < data.num_chunks(); c++) { const auto& arr = checked_cast(*data.chunk(c)); - const bool has_nulls = data.null_count() > 0; for (int64_t i = 0; i < arr.length(); ++i) { if (has_nulls && arr.IsNull(i)) { Py_INCREF(Py_None); *out_values = Py_None; } else { - OwnedRef start(PyLong_FromLongLong(arr.value_offset(i) + chunk_offset)); - OwnedRef end(PyLong_FromLongLong(arr.value_offset(i + 1) + chunk_offset)); + // Need to subtract value_offset(0) since the original chunk might be a slice + // into another array. + OwnedRef start(PyLong_FromLongLong(arr.value_offset(i) + chunk_offset - + arr.value_offset(0))); + OwnedRef end(PyLong_FromLongLong(arr.value_offset(i + 1) + chunk_offset - + arr.value_offset(0))); OwnedRef slice(PySlice_New(start.obj(), end.obj(), nullptr)); if (ARROW_PREDICT_FALSE(slice.obj() == nullptr)) { @@ -773,12 +806,122 @@ Status ConvertListsLike(PandasOptions options, const ChunkedArray& data, } RETURN_IF_PYERROR(); + chunk_offset += arr.value_offset(arr.length()) - arr.value_offset(0); + } + + return Status::OK(); +} + +template +Status ConvertMapHelper(F1 resetRow, F2 addPairToRow, F3 stealRow, + const ChunkedArray& data, PyArrayObject* py_keys, + PyArrayObject* py_items, + // needed for null checks in items + const std::vector> item_arrays, + PyObject** out_values) { + OwnedRef key_value; + OwnedRef item_value; + + int64_t chunk_offset = 0; + for (int c = 0; c < data.num_chunks(); ++c) { + const auto& arr = checked_cast(*data.chunk(c)); + const bool has_nulls = data.null_count() > 0; + + // Make a list of key/item pairs for each row in array + for (int64_t i = 0; i < arr.length(); ++i) { + if (has_nulls && arr.IsNull(i)) { + Py_INCREF(Py_None); + *out_values = Py_None; + } else { + int64_t entry_offset = arr.value_offset(i); + int64_t num_pairs = arr.value_offset(i + 1) - entry_offset; + + // Build the new list object for the row of Python pairs + RETURN_NOT_OK(resetRow(num_pairs)); + + // Add each key/item pair in the row + for (int64_t j = 0; j < num_pairs; ++j) { + // Get key value, key is non-nullable for a valid row + auto ptr_key = reinterpret_cast( + PyArray_GETPTR1(py_keys, chunk_offset + entry_offset + j)); + key_value.reset(PyArray_GETITEM(py_keys, ptr_key)); + RETURN_IF_PYERROR(); + + if (item_arrays[c]->IsNull(entry_offset + j)) { + // Translate the Null to a None + Py_INCREF(Py_None); + item_value.reset(Py_None); + } else { + // Get valid value from item array + auto ptr_item = reinterpret_cast( + PyArray_GETPTR1(py_items, chunk_offset + 
entry_offset + j)); + item_value.reset(PyArray_GETITEM(py_items, ptr_item)); + RETURN_IF_PYERROR(); + } + + // Add the key/item pair to the row + RETURN_NOT_OK(addPairToRow(j, key_value, item_value)); + } + + // Pass ownership to the resulting array + *out_values = stealRow(); + } + ++out_values; + } + RETURN_IF_PYERROR(); + chunk_offset += arr.values()->length(); } return Status::OK(); } +// A more helpful error message around TypeErrors that may stem from unhashable keys +Status CheckMapAsPydictsTypeError() { + if (ARROW_PREDICT_TRUE(!PyErr_Occurred())) { + return Status::OK(); + } + if (PyErr_ExceptionMatches(PyExc_TypeError)) { + // Modify the error string directly, so it is re-raised + // with our additional info. + // + // There are not many interesting things happening when this + // is hit. This is intended to only be called directly after + // PyDict_SetItem, where a finite set of errors could occur. + PyObject *type, *value, *traceback; + PyErr_Fetch(&type, &value, &traceback); + std::string message; + RETURN_NOT_OK(internal::PyObject_StdStringStr(value, &message)); + message += + ". If keys are not hashable, then you must use the option " + "[maps_as_pydicts=None (default)]"; + + // resets the error + PyErr_SetString(PyExc_TypeError, message.c_str()); + } + return ConvertPyError(); +} + +Status CheckForDuplicateKeys(bool error_on_duplicate_keys, Py_ssize_t total_dict_len, + Py_ssize_t total_raw_len) { + if (total_dict_len < total_raw_len) { + const char* message = + "[maps_as_pydicts] " + "After conversion of Arrow maps to pydicts, " + "detected data loss due to duplicate keys. " + "Original input length is [%lld], total converted pydict length is [%lld]."; + std::array buf; + std::snprintf(buf.data(), buf.size(), message, total_raw_len, total_dict_len); + + if (error_on_duplicate_keys) { + return Status::UnknownError(buf.data()); + } else { + ARROW_LOG(WARNING) << buf.data(); + } + } + return Status::OK(); +} + Status ConvertMap(PandasOptions options, const ChunkedArray& data, PyObject** out_values) { // Get columns of underlying key/item arrays @@ -814,9 +957,6 @@ Status ConvertMap(PandasOptions options, const ChunkedArray& data, auto flat_keys = std::make_shared(key_arrays, key_type); auto flat_items = std::make_shared(item_arrays, item_type); - OwnedRef list_item; - OwnedRef key_value; - OwnedRef item_value; OwnedRefNoGIL owned_numpy_keys; RETURN_NOT_OK( ConvertChunkedArrayToPandas(options, flat_keys, nullptr, owned_numpy_keys.ref())); @@ -826,61 +966,67 @@ Status ConvertMap(PandasOptions options, const ChunkedArray& data, PyArrayObject* py_keys = reinterpret_cast(owned_numpy_keys.obj()); PyArrayObject* py_items = reinterpret_cast(owned_numpy_items.obj()); - int64_t chunk_offset = 0; - for (int c = 0; c < data.num_chunks(); ++c) { - const auto& arr = checked_cast(*data.chunk(c)); - const bool has_nulls = data.null_count() > 0; - - // Make a list of key/item pairs for each row in array - for (int64_t i = 0; i < arr.length(); ++i) { - if (has_nulls && arr.IsNull(i)) { - Py_INCREF(Py_None); - *out_values = Py_None; - } else { - int64_t entry_offset = arr.value_offset(i); - int64_t num_maps = arr.value_offset(i + 1) - entry_offset; - - // Build the new list object for the row of maps - list_item.reset(PyList_New(num_maps)); - RETURN_IF_PYERROR(); - - // Add each key/item pair in the row - for (int64_t j = 0; j < num_maps; ++j) { - // Get key value, key is non-nullable for a valid row - auto ptr_key = reinterpret_cast( - PyArray_GETPTR1(py_keys, chunk_offset + entry_offset + j)); 
- key_value.reset(PyArray_GETITEM(py_keys, ptr_key)); - RETURN_IF_PYERROR(); - - if (item_arrays[c]->IsNull(entry_offset + j)) { - // Translate the Null to a None - Py_INCREF(Py_None); - item_value.reset(Py_None); - } else { - // Get valid value from item array - auto ptr_item = reinterpret_cast( - PyArray_GETPTR1(py_items, chunk_offset + entry_offset + j)); - item_value.reset(PyArray_GETITEM(py_items, ptr_item)); - RETURN_IF_PYERROR(); - } - - // Add the key/item pair to the list for the row - PyList_SET_ITEM(list_item.obj(), j, + if (options.maps_as_pydicts == MapConversionType::DEFAULT) { + // The default behavior to express an Arrow MAP as a list of [(key, value), ...] pairs + OwnedRef list_item; + return ConvertMapHelper( + [&list_item](int64_t num_pairs) { + list_item.reset(PyList_New(num_pairs)); + return CheckPyError(); + }, + [&list_item](int64_t idx, OwnedRef& key_value, OwnedRef& item_value) { + PyList_SET_ITEM(list_item.obj(), idx, PyTuple_Pack(2, key_value.obj(), item_value.obj())); - RETURN_IF_PYERROR(); - } - - // Pass ownership to the resulting array - *out_values = list_item.detach(); - } - ++out_values; + return CheckPyError(); + }, + [&list_item] { return list_item.detach(); }, data, py_keys, py_items, item_arrays, + out_values); + } else { + // Use a native pydict + OwnedRef dict_item; + Py_ssize_t total_dict_len{0}; + Py_ssize_t total_raw_len{0}; + + bool error_on_duplicate_keys; + if (options.maps_as_pydicts == MapConversionType::LOSSY) { + error_on_duplicate_keys = false; + } else if (options.maps_as_pydicts == MapConversionType::STRICT_) { + error_on_duplicate_keys = true; + } else { + auto val = std::underlying_type_t(options.maps_as_pydicts); + return Status::UnknownError("Received unknown option for maps_as_pydicts: " + + std::to_string(val)); } - RETURN_IF_PYERROR(); - chunk_offset += arr.values()->length(); + auto status = ConvertMapHelper( + [&dict_item, &total_raw_len](int64_t num_pairs) { + total_raw_len += num_pairs; + dict_item.reset(PyDict_New()); + return CheckPyError(); + }, + [&dict_item]([[maybe_unused]] int64_t idx, OwnedRef& key_value, + OwnedRef& item_value) { + auto setitem_result = + PyDict_SetItem(dict_item.obj(), key_value.obj(), item_value.obj()); + ARROW_RETURN_NOT_OK(CheckMapAsPydictsTypeError()); + // returns -1 if there are internal errors around hashing/resizing + return setitem_result == 0 ? Status::OK() + : Status::UnknownError( + "[maps_as_pydicts] " + "Unexpected failure inserting Arrow (key, " + "value) pair into Python dict"); + }, + [&dict_item, &total_dict_len] { + total_dict_len += PyDict_Size(dict_item.obj()); + return dict_item.detach(); + }, + data, py_keys, py_items, item_arrays, out_values); + + ARROW_RETURN_NOT_OK(status); + // If there were no errors generating the pydicts, + // then check if we detected any data loss from duplicate keys. 
+ return CheckForDuplicateKeys(error_on_duplicate_keys, total_dict_len, total_raw_len); } - - return Status::OK(); } template @@ -1011,7 +1157,7 @@ struct ObjectWriterVisitor { enable_if_t::value || is_fixed_size_binary_type::value, Status> Visit(const Type& type) { - auto WrapValue = [](const util::string_view& view, PyObject** out) { + auto WrapValue = [](const std::string_view& view, PyObject** out) { *out = WrapBytes::Wrap(view.data(), view.length()); if (*out == nullptr) { PyErr_Clear(); @@ -1056,11 +1202,24 @@ struct ObjectWriterVisitor { auto ConvertTimezoneAware = [&](typename Type::c_type value, PyObject** out) { PyObject* naive_datetime; RETURN_NOT_OK(ConvertTimezoneNaive(value, &naive_datetime)); + // convert the timezone naive datetime object to timezone aware - *out = PyObject_CallMethod(tzinfo.obj(), "fromutc", "O", naive_datetime); + // two step conversion of the datetime mimics Python's code: + // dt.replace(tzinfo=datetime.timezone.utc).astimezone(tzinfo) + // first step: replacing timezone with timezone.utc (replace method) + OwnedRef args(PyTuple_New(0)); + OwnedRef keywords(PyDict_New()); + PyDict_SetItemString(keywords.obj(), "tzinfo", PyDateTime_TimeZone_UTC); + OwnedRef naive_datetime_replace(PyObject_GetAttrString(naive_datetime, "replace")); + OwnedRef datetime_utc( + PyObject_Call(naive_datetime_replace.obj(), args.obj(), keywords.obj())); + // second step: adjust the datetime to tzinfo timezone (astimezone method) + *out = PyObject_CallMethod(datetime_utc.obj(), "astimezone", "O", tzinfo.obj()); + // the timezone naive object is no longer required Py_DECREF(naive_datetime); RETURN_IF_PYERROR(); + return Status::OK(); }; @@ -1190,10 +1349,14 @@ struct ObjectWriterVisitor { enable_if_t::value || std::is_same::value || std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value || std::is_same::value || (std::is_base_of::value && !std::is_same::value) || - std::is_base_of::value, + std::is_base_of::value || + std::is_base_of::value, Status> Visit(const Type& type) { return Status::NotImplemented("No implemented conversion to object dtype: ", @@ -1331,7 +1494,7 @@ class BoolWriter : public TypedPandasWriter { // Date / timestamp types template -inline void ConvertDatetimeLikeNanos(const ChunkedArray& data, int64_t* out_values) { +inline void ConvertDatetime(const ChunkedArray& data, int64_t* out_values) { for (int c = 0; c < data.num_chunks(); c++) { const auto& arr = *data.chunk(c); const T* in_values = GetPrimitiveValues(arr); @@ -1413,7 +1576,30 @@ class DatetimeWriter : public TypedPandasWriter { }; using DatetimeSecondWriter = DatetimeWriter; -using DatetimeMilliWriter = DatetimeWriter; + +class DatetimeMilliWriter : public DatetimeWriter { + public: + using DatetimeWriter::DatetimeWriter; + + Status CopyInto(std::shared_ptr data, int64_t rel_placement) override { + Type::type type = data->type()->id(); + int64_t* out_values = this->GetBlockColumnStart(rel_placement); + if (type == Type::DATE32) { + // Convert from days since epoch to datetime64[ms] + ConvertDatetime(*data, out_values); + } else if (type == Type::DATE64) { + ConvertNumericNullable(*data, kPandasTimestampNull, out_values); + } else { + const auto& ts_type = checked_cast(*data->type()); + DCHECK_EQ(TimeUnit::MILLI, ts_type.unit()) + << "Should only call instances of this writer " + << "with arrays of the correct unit"; + ConvertNumericNullable(*data, kPandasTimestampNull, out_values); + } + return Status::OK(); + } +}; + using DatetimeMicroWriter = 
DatetimeWriter; class DatetimeNanoWriter : public DatetimeWriter { @@ -1435,11 +1621,11 @@ class DatetimeNanoWriter : public DatetimeWriter { if (type == Type::DATE32) { // Convert from days since epoch to datetime64[ns] - ConvertDatetimeLikeNanos(*data, out_values); + ConvertDatetime(*data, out_values); } else if (type == Type::DATE64) { // Date64Type is millisecond timestamp stored as int64_t // TODO(wesm): Do we want to make sure to zero out the milliseconds? - ConvertDatetimeLikeNanos(*data, out_values); + ConvertDatetime(*data, out_values); } else if (type == Type::TIMESTAMP) { const auto& ts_type = checked_cast(*data->type()); @@ -1462,16 +1648,17 @@ class DatetimeNanoWriter : public DatetimeWriter { } }; -class DatetimeTZWriter : public DatetimeNanoWriter { +template +class DatetimeTZWriter : public BASE { public: DatetimeTZWriter(const PandasOptions& options, const std::string& timezone, int64_t num_rows) - : DatetimeNanoWriter(options, num_rows, 1), timezone_(timezone) {} + : BASE(options, num_rows, 1), timezone_(timezone) {} protected: Status GetResultBlock(PyObject** out) override { - RETURN_NOT_OK(MakeBlock1D()); - *out = block_arr_.obj(); + RETURN_NOT_OK(this->MakeBlock1D()); + *out = this->block_arr_.obj(); return Status::OK(); } @@ -1488,6 +1675,11 @@ class DatetimeTZWriter : public DatetimeNanoWriter { std::string timezone_; }; +using DatetimeSecondTZWriter = DatetimeTZWriter; +using DatetimeMilliTZWriter = DatetimeTZWriter; +using DatetimeMicroTZWriter = DatetimeTZWriter; +using DatetimeNanoTZWriter = DatetimeTZWriter; + template class TimedeltaWriter : public TypedPandasWriter { public: @@ -1533,11 +1725,11 @@ class TimedeltaNanoWriter : public TimedeltaWriter { if (ts_type.unit() == TimeUnit::NANO) { ConvertNumericNullable(*data, kPandasTimestampNull, out_values); } else if (ts_type.unit() == TimeUnit::MICRO) { - ConvertDatetimeLikeNanos(*data, out_values); + ConvertDatetime(*data, out_values); } else if (ts_type.unit() == TimeUnit::MILLI) { - ConvertDatetimeLikeNanos(*data, out_values); + ConvertDatetime(*data, out_values); } else if (ts_type.unit() == TimeUnit::SECOND) { - ConvertDatetimeLikeNanos(*data, out_values); + ConvertDatetime(*data, out_values); } else { return Status::NotImplemented("Unsupported time unit"); } @@ -1788,6 +1980,12 @@ Status MakeWriter(const PandasOptions& options, PandasWriter::type writer_type, *writer = std::make_shared>(options, num_rows); \ break; +#define TZ_CASE(NAME, TYPE) \ + case PandasWriter::NAME: { \ + const auto& ts_type = checked_cast(type); \ + *writer = std::make_shared(options, ts_type.timezone(), num_rows); \ + } break; + switch (writer_type) { case PandasWriter::CATEGORICAL: { const auto& index_type = *checked_cast(type).index_type(); @@ -1834,10 +2032,10 @@ Status MakeWriter(const PandasOptions& options, PandasWriter::type writer_type, BLOCK_CASE(TIMEDELTA_MILLI, TimedeltaMilliWriter); BLOCK_CASE(TIMEDELTA_MICRO, TimedeltaMicroWriter); BLOCK_CASE(TIMEDELTA_NANO, TimedeltaNanoWriter); - case PandasWriter::DATETIME_NANO_TZ: { - const auto& ts_type = checked_cast(type); - *writer = std::make_shared(options, ts_type.timezone(), num_rows); - } break; + TZ_CASE(DATETIME_SECOND_TZ, DatetimeSecondTZWriter); + TZ_CASE(DATETIME_MILLI_TZ, DatetimeMilliTZWriter); + TZ_CASE(DATETIME_MICRO_TZ, DatetimeMicroTZWriter); + TZ_CASE(DATETIME_NANO_TZ, DatetimeNanoTZWriter); default: return Status::NotImplemented("Unsupported block type"); } @@ -1900,13 +2098,25 @@ static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions& 
case Type::INTERVAL_MONTH_DAY_NANO: // fall through *output_type = PandasWriter::OBJECT; break; - case Type::DATE32: // fall through + case Type::DATE32: + if (options.date_as_object) { + *output_type = PandasWriter::OBJECT; + } else if (options.coerce_temporal_nanoseconds) { + *output_type = PandasWriter::DATETIME_NANO; + } else if (options.to_numpy) { + // Numpy supports Day, but Pandas does not + *output_type = PandasWriter::DATETIME_DAY; + } else { + *output_type = PandasWriter::DATETIME_MILLI; + } + break; case Type::DATE64: if (options.date_as_object) { *output_type = PandasWriter::OBJECT; + } else if (options.coerce_temporal_nanoseconds) { + *output_type = PandasWriter::DATETIME_NANO; } else { - *output_type = options.coerce_temporal_nanoseconds ? PandasWriter::DATETIME_NANO - : PandasWriter::DATETIME_DAY; + *output_type = PandasWriter::DATETIME_MILLI; } break; case Type::TIMESTAMP: { @@ -1915,24 +2125,43 @@ static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions& // Nanoseconds are never out of bounds for pandas, so in that case // we don't convert to object *output_type = PandasWriter::OBJECT; - } else if (!ts_type.timezone().empty()) { - *output_type = PandasWriter::DATETIME_NANO_TZ; } else if (options.coerce_temporal_nanoseconds) { - *output_type = PandasWriter::DATETIME_NANO; + if (!ts_type.timezone().empty()) { + *output_type = PandasWriter::DATETIME_NANO_TZ; + } else { + *output_type = PandasWriter::DATETIME_NANO; + } } else { - switch (ts_type.unit()) { - case TimeUnit::SECOND: - *output_type = PandasWriter::DATETIME_SECOND; - break; - case TimeUnit::MILLI: - *output_type = PandasWriter::DATETIME_MILLI; - break; - case TimeUnit::MICRO: - *output_type = PandasWriter::DATETIME_MICRO; - break; - case TimeUnit::NANO: - *output_type = PandasWriter::DATETIME_NANO; - break; + if (!ts_type.timezone().empty()) { + switch (ts_type.unit()) { + case TimeUnit::SECOND: + *output_type = PandasWriter::DATETIME_SECOND_TZ; + break; + case TimeUnit::MILLI: + *output_type = PandasWriter::DATETIME_MILLI_TZ; + break; + case TimeUnit::MICRO: + *output_type = PandasWriter::DATETIME_MICRO_TZ; + break; + case TimeUnit::NANO: + *output_type = PandasWriter::DATETIME_NANO_TZ; + break; + } + } else { + switch (ts_type.unit()) { + case TimeUnit::SECOND: + *output_type = PandasWriter::DATETIME_SECOND; + break; + case TimeUnit::MILLI: + *output_type = PandasWriter::DATETIME_MILLI; + break; + case TimeUnit::MICRO: + *output_type = PandasWriter::DATETIME_MICRO; + break; + case TimeUnit::NANO: + *output_type = PandasWriter::DATETIME_NANO; + break; + } } } } break; @@ -2032,6 +2261,18 @@ class PandasBlockCreator { std::vector column_block_placement_; }; +// Helper function for extension chunked arrays +// Constructing a storage chunked array of an extension chunked array +std::shared_ptr GetStorageChunkedArray(std::shared_ptr arr) { + auto value_type = checked_cast(*arr->type()).storage_type(); + ArrayVector storage_arrays; + for (int c = 0; c < arr->num_chunks(); c++) { + const auto& arr_ext = checked_cast(*arr->chunk(c)); + storage_arrays.emplace_back(arr_ext.storage()); + } + return std::make_shared(std::move(storage_arrays), value_type); +}; + class ConsolidatedBlockCreator : public PandasBlockCreator { public: using PandasBlockCreator::PandasBlockCreator; @@ -2057,6 +2298,10 @@ class ConsolidatedBlockCreator : public PandasBlockCreator { *out = PandasWriter::EXTENSION; return Status::OK(); } else { + // In case of an extension array default to the storage type + if 
+        arrays_[column_index] = GetStorageChunkedArray(arrays_[column_index]);
+      }
       return GetPandasWriterType(*arrays_[column_index], options_, out);
     }
   }
@@ -2070,6 +2315,9 @@ class ConsolidatedBlockCreator : public PandasBlockCreator {
     int block_placement = 0;
     std::shared_ptr writer;
     if (output_type == PandasWriter::CATEGORICAL ||
+        output_type == PandasWriter::DATETIME_SECOND_TZ ||
+        output_type == PandasWriter::DATETIME_MILLI_TZ ||
+        output_type == PandasWriter::DATETIME_MICRO_TZ ||
         output_type == PandasWriter::DATETIME_NANO_TZ ||
         output_type == PandasWriter::EXTENSION) {
       RETURN_NOT_OK(MakeWriter(options_, output_type, type, num_rows_,
@@ -2105,6 +2353,9 @@ class ConsolidatedBlockCreator : public PandasBlockCreator {
       PandasWriter::type output_type = this->column_types_[i];
       switch (output_type) {
         case PandasWriter::CATEGORICAL:
+        case PandasWriter::DATETIME_SECOND_TZ:
+        case PandasWriter::DATETIME_MILLI_TZ:
+        case PandasWriter::DATETIME_MICRO_TZ:
         case PandasWriter::DATETIME_NANO_TZ:
         case PandasWriter::EXTENSION: {
           auto it = this->singleton_blocks_.find(i);
@@ -2279,6 +2530,11 @@ Status ConvertChunkedArrayToPandas(const PandasOptions& options,
   // Table->DataFrame
   modified_options.allow_zero_copy_blocks = true;

+  // In case of an extension array default to the storage type
+  if (arr->type()->id() == Type::EXTENSION) {
+    arr = GetStorageChunkedArray(arr);
+  }
+
   PandasWriter::type output_type;
   RETURN_NOT_OK(GetPandasWriterType(*arr, modified_options, &output_type));
   if (options.decode_dictionaries) {
diff --git a/cpp/csp/python/adapters/vendored/pyarrow-7.0.0/arrow/python/arrow_to_pandas.h b/cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/python/arrow_to_pandas.h
similarity index 80%
rename from cpp/csp/python/adapters/vendored/pyarrow-7.0.0/arrow/python/arrow_to_pandas.h
rename to cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/python/arrow_to_pandas.h
index 6570364b8..82e0a6005 100644
--- a/cpp/csp/python/adapters/vendored/pyarrow-7.0.0/arrow/python/arrow_to_pandas.h
+++ b/cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/python/arrow_to_pandas.h
@@ -41,6 +41,13 @@ class Table;

 namespace py {

+enum class MapConversionType {
+  DEFAULT,  // convert arrow maps to assoc lists (list of key-value tuples) in Pandas
+  LOSSY,    // report warnings when lossiness is encountered due to duplicate keys
+  STRICT_,  // raise a Python exception when lossiness is encountered due to duplicate
+            // keys
+};
+
 struct PandasOptions {
   /// arrow::MemoryPool to use for memory allocations
   MemoryPool* pool = default_memory_pool();
@@ -90,6 +97,17 @@ struct PandasOptions {
   /// conversions
   bool self_destruct = false;

+  /// \brief The default behavior (DEFAULT), is to convert Arrow Map arrays to
+  /// Python association lists (list-of-tuples) in the same order as the Arrow
+  /// Map, as in [(key1, value1), (key2, value2), ...]
+  /// If LOSSY or STRICT, convert Arrow Map arrays to native Python dicts.
+  /// This can change the ordering of (key, value) pairs, and will deduplicate
+  /// multiple keys, resulting in a possible loss of data.
+  /// If 'lossy', this key deduplication results in a warning printed
+  /// when detected. If 'strict', this instead results in an exception
+  /// being raised when detected.
+  MapConversionType maps_as_pydicts = MapConversionType::DEFAULT;
+
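The maps_as_pydicts documentation above defines the behavioral contract for map conversion. As a rough illustration only (a hypothetical helper, not the vendored conversion code), key deduplication under the LOSSY and STRICT modes could look like this:

    // Hypothetical stand-ins for MapConversionType and the dedup step; shown
    // only to make the documented semantics concrete.
    #include <iostream>
    #include <map>
    #include <stdexcept>
    #include <string>
    #include <utility>
    #include <vector>

    enum class MapMode { kDefault, kLossy, kStrict };

    // Collapses (key, value) pairs the way the LOSSY/STRICT modes described
    // above deduplicate keys; kDefault would keep the assoc-list form instead.
    std::map<std::string, int> DeduplicateKeys(
        const std::vector<std::pair<std::string, int>>& pairs, MapMode mode) {
      std::map<std::string, int> out;
      for (const auto& kv : pairs) {
        bool inserted = out.insert(kv).second;
        if (!inserted) {
          if (mode == MapMode::kStrict) {
            throw std::runtime_error("duplicate key: " + kv.first);
          }
          if (mode == MapMode::kLossy) {
            std::cerr << "warning: dropping duplicate key '" << kv.first << "'\n";
          }
        }
      }
      return out;
    }

    int main() {
      std::vector<std::pair<std::string, int>> pairs = {{"a", 1}, {"a", 2}, {"b", 3}};
      auto deduped = DeduplicateKeys(pairs, MapMode::kLossy);  // warns about "a"
      std::cout << deduped.size() << " keys survive\n";        // prints "2 keys survive"
    }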
   // Used internally for nested arrays.
   bool decode_dictionaries = false;

@@ -99,6 +117,10 @@ struct PandasOptions {
   // Columns that should be passed through to be converted to
   // ExtensionArray/Block
   std::unordered_set<std::string> extension_columns;
+
+  // Used internally to decipher between to_numpy() and to_pandas() when
+  // the expected output differs
+  bool to_numpy = false;
 };

 ARROW_PYTHON_EXPORT
diff --git a/cpp/csp/python/adapters/vendored/pyarrow-7.0.0/arrow/python/arrow_to_python_internal.h b/cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/python/arrow_to_python_internal.h
similarity index 100%
rename from cpp/csp/python/adapters/vendored/pyarrow-7.0.0/arrow/python/arrow_to_python_internal.h
rename to cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/python/arrow_to_python_internal.h
diff --git a/cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/python/async.h b/cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/python/async.h
new file mode 100644
index 000000000..1568d2193
--- /dev/null
+++ b/cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/python/async.h
@@ -0,0 +1,60 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include 
+
+#include "arrow/python/common.h"
+#include "arrow/status.h"
+#include "arrow/util/future.h"
+
+namespace arrow::py {
+
+/// \brief Bind a Python callback to an arrow::Future.
+///
+/// If the Future finishes successfully, py_wrapper is called with its
+/// result value and should return a PyObject*. If py_wrapper is successful,
+/// py_cb is called with its return value.
+///
+/// If either the Future or py_wrapper fails, py_cb is called with the
+/// associated Python exception.
+///
+/// \param future The future to bind to.
+/// \param py_cb The Python callback function. Will be passed the result of
+///   py_wrapper, or a Python exception if the future failed or one was
+///   raised by py_wrapper.
+/// \param py_wrapper A function (likely defined in Cython) to convert the C++
+///   result of the future to a Python object.
+template <typename T, typename PyWrapper = PyObject* (*)(T)>
+void BindFuture(Future<T> future, PyObject* py_cb, PyWrapper py_wrapper) {
+  Py_INCREF(py_cb);
+  OwnedRefNoGIL cb_ref(py_cb);
+
+  auto future_cb = [cb_ref = std::move(cb_ref),
+                    py_wrapper = std::move(py_wrapper)](Result<T> result) {
+    SafeCallIntoPythonVoid([&]() {
+      OwnedRef py_value_or_exc{WrapResult(std::move(result), std::move(py_wrapper))};
+      Py_XDECREF(
+          PyObject_CallFunctionObjArgs(cb_ref.obj(), py_value_or_exc.obj(), NULLPTR));
+      ARROW_WARN_NOT_OK(CheckPyError(), "Internal error in async call");
+    });
+  };
+  future.AddCallback(std::move(future_cb));
+}
+
+}  // namespace arrow::py
diff --git a/cpp/csp/python/adapters/vendored/pyarrow-7.0.0/arrow/python/benchmark.cc b/cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/python/benchmark.cc
similarity index 94%
rename from cpp/csp/python/adapters/vendored/pyarrow-7.0.0/arrow/python/benchmark.cc
rename to cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/python/benchmark.cc
index 2d29f69d2..6dcc959ed 100644
--- a/cpp/csp/python/adapters/vendored/pyarrow-7.0.0/arrow/python/benchmark.cc
+++ b/cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/python/benchmark.cc
@@ -15,8 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.

-#include <arrow/python/benchmark.h>
-#include <arrow/python/helpers.h>
+#include "arrow/python/benchmark.h"
+#include "arrow/python/helpers.h"

 namespace arrow {
 namespace py {
diff --git a/cpp/csp/python/adapters/vendored/pyarrow-7.0.0/arrow/python/benchmark.h b/cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/python/benchmark.h
similarity index 100%
rename from cpp/csp/python/adapters/vendored/pyarrow-7.0.0/arrow/python/benchmark.h
rename to cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/python/benchmark.h
diff --git a/cpp/csp/python/adapters/vendored/pyarrow-7.0.0/arrow/python/common.cc b/cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/python/common.cc
similarity index 100%
rename from cpp/csp/python/adapters/vendored/pyarrow-7.0.0/arrow/python/common.cc
rename to cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/python/common.cc
diff --git a/cpp/csp/python/adapters/vendored/pyarrow-7.0.0/arrow/python/common.h b/cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/python/common.h
similarity index 74%
rename from cpp/csp/python/adapters/vendored/pyarrow-7.0.0/arrow/python/common.h
rename to cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/python/common.h
index 24dcb130a..4a7886695 100644
--- a/cpp/csp/python/adapters/vendored/pyarrow-7.0.0/arrow/python/common.h
+++ b/cpp/csp/python/adapters/vendored/pyarrow-15.0.0/arrow/python/common.h
@@ -17,7 +17,9 @@

 #pragma once

+#include 
 #include 
+#include 
 #include 

 #include "arrow/buffer.h"
@@ -71,6 +73,37 @@ T GetResultValue(Result<T> result) {
   }
 }

+/// \brief Wrap a Result and return the corresponding Python object.
+///
+/// If the Result is successful, py_wrapper is called with its result value
+/// and should return a PyObject*. If py_wrapper is successful (returns
+/// a non-NULL value), its return value is returned.
+///
+/// If either the Result or py_wrapper fails, the associated Python exception
+/// is raised and NULL is returned.
+//
+/// \param result The Result whose value to wrap in a Python object.
+/// \param py_wrapper A function (likely defined in Cython) to convert the C++
+///   value of the Result to a Python object.
+/// \return A new Python reference, or NULL if an exception occurred
+template <typename T, typename PyWrapper = PyObject* (*)(T)>
+PyObject* WrapResult(Result<T> result, PyWrapper&& py_wrapper) {
+  static_assert(std::is_same_v<PyObject*, decltype(py_wrapper(std::declval<T>()))>,
+                "PyWrapper argument to WrapResult should return a PyObject* "
+                "when called with a T*");
+  Status st = result.status();
+  if (st.ok()) {
+    PyObject* py_value = py_wrapper(result.MoveValueUnsafe());
+    st = CheckPyError();
+    if (st.ok()) {
+      return py_value;
+    }
+    Py_XDECREF(py_value);  // should be null, but who knows
+  }
+  // Status is an error, convert it to an exception.
+  return internal::convert_status(st);
+}
+
 // A RAII-style helper that ensures the GIL is acquired inside a lexical block.
 class ARROW_PYTHON_EXPORT PyAcquireGIL {
  public:
@@ -102,13 +135,15 @@ class ARROW_PYTHON_EXPORT PyAcquireGIL {
 // A RAII-style helper that releases the GIL until the end of a lexical block
 class ARROW_PYTHON_EXPORT PyReleaseGIL {
  public:
-  PyReleaseGIL() { saved_state_ = PyEval_SaveThread(); }
-
-  ~PyReleaseGIL() { PyEval_RestoreThread(saved_state_); }
+  PyReleaseGIL() : ptr_(PyEval_SaveThread(), &unique_ptr_deleter) {}

  private:
-  PyThreadState* saved_state_;
-  ARROW_DISALLOW_COPY_AND_ASSIGN(PyReleaseGIL);
+  static void unique_ptr_deleter(PyThreadState* state) {
+    if (state) {
+      PyEval_RestoreThread(state);
+    }
+  }
+  std::unique_ptr<PyThreadState, decltype(&unique_ptr_deleter)> ptr_;
 };

 // A helper to call safely into the Python interpreter from arbitrary C++ code.
@@ -130,6 +165,19 @@ auto SafeCallIntoPython(Function&& func) -> decltype(func()) {
   return maybe_status;
 }

+template <typename Function>
+auto SafeCallIntoPythonVoid(Function&& func) -> decltype(func()) {
+  PyAcquireGIL lock;
+  PyObject* exc_type;
+  PyObject* exc_value;
+  PyObject* exc_traceback;
+  PyErr_Fetch(&exc_type, &exc_value, &exc_traceback);
+  func();
+  if (exc_type != NULLPTR) {
+    PyErr_Restore(exc_type, exc_value, exc_traceback);
+  }
+}
+
 // A RAII primitive that DECREFs the underlying PyObject* when it
 // goes out of scope.
 class ARROW_PYTHON_EXPORT OwnedRef {
@@ -143,7 +191,12 @@ class ARROW_PYTHON_EXPORT OwnedRef {
     return *this;
   }

-  ~OwnedRef() { reset(); }
+  ~OwnedRef() {
+    // GH-38626: destructor may be called after the Python interpreter is finalized.
+    if (Py_IsInitialized()) {
+      reset();
+    }
+  }

   void reset(PyObject* obj) {
     Py_XDECREF(obj_);
@@ -180,11 +233,56 @@ class ARROW_PYTHON_EXPORT OwnedRefNoGIL : public OwnedRef {
   explicit OwnedRefNoGIL(PyObject* obj) : OwnedRef(obj) {}

   ~OwnedRefNoGIL() {
-    PyAcquireGIL lock;
-    reset();
+    // GH-38626: destructor may be called after the Python interpreter is finalized.
+    if (Py_IsInitialized() && obj() != NULLPTR) {
+      PyAcquireGIL lock;
+      reset();
+    }
   }
 };

+template