From 831e2f216fb0b8f6c1a2c13cd6d2b42712a8196d Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Fri, 18 Aug 2017 10:11:05 -0700 Subject: [PATCH] remove sequence.h --- cpp/src/arrow/python/CMakeLists.txt | 2 - cpp/src/arrow/python/arrow_to_python.cc | 2 +- .../arrow/python/python_to_arrow-internal.h | 300 ++++++++++++++++++ cpp/src/arrow/python/python_to_arrow.cc | 57 +--- cpp/src/arrow/python/python_to_arrow.h | 1 - cpp/src/arrow/python/sequence.cc | 169 ---------- cpp/src/arrow/python/sequence.h | 137 -------- 7 files changed, 302 insertions(+), 366 deletions(-) create mode 100644 cpp/src/arrow/python/python_to_arrow-internal.h delete mode 100644 cpp/src/arrow/python/sequence.cc delete mode 100644 cpp/src/arrow/python/sequence.h diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt index 3e1b091611200..f2807b930a33c 100644 --- a/cpp/src/arrow/python/CMakeLists.txt +++ b/cpp/src/arrow/python/CMakeLists.txt @@ -54,7 +54,6 @@ set(ARROW_PYTHON_SRCS pandas_to_arrow.cc python_to_arrow.cc pyarrow.cc - sequence.cc ) set(ARROW_PYTHON_SHARED_LINK_LIBS @@ -99,7 +98,6 @@ install(FILES python_to_arrow.h platform.h pyarrow.h - sequence.h type_traits.h DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/python") diff --git a/cpp/src/arrow/python/arrow_to_python.cc b/cpp/src/arrow/python/arrow_to_python.cc index b9d7d7cb46f9c..ab1f8ba7b57b7 100644 --- a/cpp/src/arrow/python/arrow_to_python.cc +++ b/cpp/src/arrow/python/arrow_to_python.cc @@ -137,7 +137,7 @@ Status GetValue(std::shared_ptr arr, int64_t index, int32_t type, PyObjec } } // We use an Int32Builder here to distinguish the tensor indices from - // the Type::INT64 above (see tensor_indices_ in sequence.h). + // the Type::INT64 above (see tensor_indices_ in SequenceBuilder). 
case Type::INT32: { return DeserializeArray(arr, index, base, tensors, result); } diff --git a/cpp/src/arrow/python/python_to_arrow-internal.h b/cpp/src/arrow/python/python_to_arrow-internal.h new file mode 100644 index 0000000000000..b4382c0ba42da --- /dev/null +++ b/cpp/src/arrow/python/python_to_arrow-internal.h @@ -0,0 +1,300 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+#ifndef ARROW_PYTHON_PYTHON_TO_ARROW_INTERNAL_H
+#define ARROW_PYTHON_PYTHON_TO_ARROW_INTERNAL_H
+
+#include <vector>
+
+#include "arrow/api.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+namespace py {
+// UPDATE: assign TAG on first use, then append the offset, type tag and a valid bit.
+#define UPDATE(OFFSET, TAG)               \
+  if (TAG == -1) {                        \
+    TAG = num_tags;                       \
+    num_tags += 1;                        \
+  }                                       \
+  RETURN_NOT_OK(offsets_.Append(OFFSET)); \
+  RETURN_NOT_OK(types_.Append(TAG));      \
+  RETURN_NOT_OK(nones_.AppendToBitmap(true));
+// ADD_ELEMENT: if TAG is in use, finish builder VARNAME into children[TAG] and record it.
+#define ADD_ELEMENT(VARNAME, TAG)                             \
+  if (TAG != -1) {                                            \
+    types[TAG] = std::make_shared<Field>("", VARNAME.type()); \
+    RETURN_NOT_OK(VARNAME.Finish(&children[TAG]));            \
+    RETURN_NOT_OK(nones_.AppendToBitmap(true));               \
+    type_ids.push_back(TAG);                                  \
+  }
+// ADD_SUBSEQUENCE: add DATA, split by OFFSETS, as a struct<NAME: list> child.
+#define ADD_SUBSEQUENCE(DATA, OFFSETS, BUILDER, TAG, NAME)                          \
+  if (DATA) {                                                                       \
+    DCHECK(DATA->length() == OFFSETS.back());                                       \
+    std::shared_ptr<Array> offset_array;                                            \
+    Int32Builder builder(pool_, std::make_shared<Int32Type>());                     \
+    RETURN_NOT_OK(builder.Append(OFFSETS.data(), OFFSETS.size()));                  \
+    RETURN_NOT_OK(builder.Finish(&offset_array));                                   \
+    std::shared_ptr<Array> list_array;                                              \
+    RETURN_NOT_OK(ListArray::FromArrays(*offset_array, *DATA, pool_, &list_array)); \
+    auto field = std::make_shared<Field>(NAME, list_array->type());                 \
+    auto type =                                                                     \
+        std::make_shared<StructType>(std::vector<std::shared_ptr<Field>>({field})); \
+    types[TAG] = std::make_shared<Field>("", type);                                 \
+    children[TAG] = std::shared_ptr<StructArray>(                                   \
+        new StructArray(type, list_array->length(), {list_array}));                 \
+    RETURN_NOT_OK(nones_.AppendToBitmap(true));                                     \
+    type_ids.push_back(TAG);                                                        \
+  } else {                                                                          \
+    DCHECK_EQ(OFFSETS.size(), 1);                                                   \
+  }
+
+/// A Sequence is a heterogeneous collection of elements. It can contain
+/// scalar Python types, lists, tuples, dictionaries and tensors.
+class SequenceBuilder {
+ public:
+  explicit SequenceBuilder(MemoryPool* pool = nullptr)
+      : pool_(pool),
+        types_(pool, ::arrow::int8()),
+        offsets_(pool, ::arrow::int32()),
+        nones_(pool),
+        bools_(pool, ::arrow::boolean()),
+        ints_(pool, ::arrow::int64()),
+        bytes_(pool, ::arrow::binary()),
+        strings_(pool),
+        floats_(pool, ::arrow::float32()),
+        doubles_(pool, ::arrow::float64()),
+        tensor_indices_(pool, ::arrow::int32()),
+        list_offsets_({0}),
+        tuple_offsets_({0}),
+        dict_offsets_({0}) {}
+
+  /// Appending a none to the sequence
+  Status AppendNone() {
+    RETURN_NOT_OK(offsets_.Append(0));
+    RETURN_NOT_OK(types_.Append(0));
+    return nones_.AppendToBitmap(false);
+  }
+
+  /// Appending a boolean to the sequence
+  Status AppendBool(bool data) {
+    UPDATE(bools_.length(), bool_tag);
+    return bools_.Append(data);
+  }
+
+  /// Appending an int64_t to the sequence
+  Status AppendInt64(int64_t data) {
+    UPDATE(ints_.length(), int_tag);
+    return ints_.Append(data);
+  }
+
+  /// Appending a uint64_t to the sequence (stored through the Int64Builder ints_)
+  Status AppendUInt64(uint64_t data) {
+    UPDATE(ints_.length(), int_tag);
+    return ints_.Append(data);
+  }
+
+  /// Append a list of bytes to the sequence
+  Status AppendBytes(const uint8_t* data, int32_t length) {
+    UPDATE(bytes_.length(), bytes_tag);
+    return bytes_.Append(data, length);
+  }
+
+  /// Appending a string to the sequence
+  Status AppendString(const char* data, int32_t length) {
+    UPDATE(strings_.length(), string_tag);
+    return strings_.Append(data, length);
+  }
+
+  /// Appending a float to the sequence
+  Status AppendFloat(float data) {
+    UPDATE(floats_.length(), float_tag);
+    return floats_.Append(data);
+  }
+
+  /// Appending a double to the sequence
+  Status AppendDouble(double data) {
+    UPDATE(doubles_.length(), double_tag);
+    return doubles_.Append(data);
+  }
+
+  /// Appending a tensor to the sequence
+  ///
+  /// \param tensor_index Index of the tensor in the object.
+  Status AppendTensor(int32_t tensor_index) {
+    UPDATE(tensor_indices_.length(), tensor_tag);
+    return tensor_indices_.Append(tensor_index);
+  }
+
+  /// Add a sublist to the sequence. The data contained in the sublist will be
+  /// specified in the "Finish" method.
+  ///
+  /// To construct l = [[11, 22], 33, [44, 55]] you would for example run
+  /// (pseudo-code) builder = SequenceBuilder();
+  /// builder.AppendList(2);
+  /// builder.AppendInt64(33);
+  /// builder.AppendList(2);
+  /// sublists = <sequence holding 11, 22, 44, 55>;
+  /// builder.Finish(sublists, nullptr, nullptr, &out);
+  ///
+  /// \param size
+  /// The size of the sublist
+  Status AppendList(int32_t size) {
+    UPDATE(list_offsets_.size() - 1, list_tag);
+    list_offsets_.push_back(list_offsets_.back() + size);
+    return Status::OK();
+  }
+
+  Status AppendTuple(int32_t size) {
+    UPDATE(tuple_offsets_.size() - 1, tuple_tag);
+    tuple_offsets_.push_back(tuple_offsets_.back() + size);
+    return Status::OK();
+  }
+
+  Status AppendDict(int32_t size) {
+    UPDATE(dict_offsets_.size() - 1, dict_tag);
+    dict_offsets_.push_back(dict_offsets_.back() + size);
+    return Status::OK();
+  }
+
+  /// Finish building the sequence; the resulting dense UnionArray is stored in *out.
+  Status Finish(std::shared_ptr<Array> list_data, std::shared_ptr<Array> tuple_data,
+                std::shared_ptr<Array> dict_data, std::shared_ptr<Array>* out) {
+    std::vector<std::shared_ptr<Field>> types(num_tags);
+    std::vector<std::shared_ptr<Array>> children(num_tags);
+    std::vector<uint8_t> type_ids;
+
+    ADD_ELEMENT(bools_, bool_tag);
+    ADD_ELEMENT(ints_, int_tag);
+    ADD_ELEMENT(strings_, string_tag);
+    ADD_ELEMENT(bytes_, bytes_tag);
+    ADD_ELEMENT(floats_, float_tag);
+    ADD_ELEMENT(doubles_, double_tag);
+
+    ADD_ELEMENT(tensor_indices_, tensor_tag);
+
+    ADD_SUBSEQUENCE(list_data, list_offsets_, list_builder, list_tag, "list");
+    ADD_SUBSEQUENCE(tuple_data, tuple_offsets_, tuple_builder, tuple_tag, "tuple");
+    ADD_SUBSEQUENCE(dict_data, dict_offsets_, dict_builder, dict_tag, "dict");
+
+    auto type = ::arrow::union_(types, type_ids, UnionMode::DENSE);
+    out->reset(new UnionArray(type, types_.length(), children, types_.data(),
+                              offsets_.data(), nones_.null_bitmap(), nones_.null_count()));
+    return Status::OK();
+  }
+
+ private:
+  MemoryPool* pool_;
+
+  Int8Builder types_;
+  Int32Builder offsets_;
+
+  /// Total number of bytes needed to represent this sequence (unused in this header).
+  int64_t total_num_bytes_ = 0;
+
+  NullBuilder nones_;
+  BooleanBuilder bools_;
+  Int64Builder ints_;
+  BinaryBuilder bytes_;
+  StringBuilder strings_;
+  FloatBuilder floats_;
+  DoubleBuilder doubles_;
+
+  // We use an Int32Builder here to distinguish the tensor indices from
+  // the ints_ above (see the case Type::INT32 in GetValue in arrow_to_python.cc).
+  // TODO(pcm): Replace this by using the union tags to distinguish between
+  // these two cases.
+  Int32Builder tensor_indices_;
+
+  std::vector<int32_t> list_offsets_;
+  std::vector<int32_t> tuple_offsets_;
+  std::vector<int32_t> dict_offsets_;
+
+  // Tags for members of the sequence. If they are set to -1 it means
+  // they are not used and will not be part of the metadata when we call
+  // SequenceBuilder::Finish. If a member with one of the tags is added,
+  // the associated variable gets a unique index starting from 0. This
+  // happens in the UPDATE macro defined at the top of this header.
+  int8_t bool_tag = -1;
+  int8_t int_tag = -1;
+  int8_t string_tag = -1;
+  int8_t bytes_tag = -1;
+  int8_t float_tag = -1;
+  int8_t double_tag = -1;
+
+  int8_t tensor_tag = -1;
+  int8_t list_tag = -1;
+  int8_t tuple_tag = -1;
+  int8_t dict_tag = -1;
+
+  int8_t num_tags = 0;
+};
+
+/// Constructing dictionaries of key/value pairs. Sequences of
+/// keys and values are built separately using a pair of
+/// SequenceBuilders. The resulting Arrow representation
+/// can be obtained via the Finish method.
+class DictBuilder {
+ public:
+  explicit DictBuilder(MemoryPool* pool = nullptr) : keys_(pool), vals_(pool) {}
+
+  /// Builder for the keys of the dictionary
+  SequenceBuilder& keys() { return keys_; }
+  /// Builder for the values of the dictionary
+  SequenceBuilder& vals() { return vals_; }
+
+  /// Construct an Arrow StructArray representing the dictionary.
+  /// Contains a field "keys" for the keys and "vals" for the values.
+  ///
+  /// \param val_list_data
+  /// List containing the data from nested lists in the value
+  /// list of the dictionary
+  ///
+  /// \param val_dict_data
+  /// List containing the data from nested dictionaries in the
+  /// value list of the dictionary
+  Status Finish(std::shared_ptr<Array> key_tuple_data,
+                std::shared_ptr<Array> key_dict_data,
+                std::shared_ptr<Array> val_list_data,
+                std::shared_ptr<Array> val_tuple_data,
+                std::shared_ptr<Array> val_dict_data,
+                std::shared_ptr<Array>* out) {
+    // lists and dicts can't be keys of dicts in Python, that is why for
+    // the keys we do not need to collect sublists
+    std::shared_ptr<Array> keys, vals;
+    RETURN_NOT_OK(keys_.Finish(nullptr, key_tuple_data, key_dict_data, &keys));
+    RETURN_NOT_OK(vals_.Finish(val_list_data, val_tuple_data, val_dict_data, &vals));
+    auto keys_field = std::make_shared<Field>("keys", keys->type());
+    auto vals_field = std::make_shared<Field>("vals", vals->type());
+    auto type = std::make_shared<StructType>(
+        std::vector<std::shared_ptr<Field>>({keys_field, vals_field}));
+    std::vector<std::shared_ptr<Array>> field_arrays({keys, vals});
+    DCHECK(keys->length() == vals->length());
+    out->reset(new StructArray(type, keys->length(), field_arrays));
+    return Status::OK();
+  }
+
+ private:
+  SequenceBuilder keys_;
+  SequenceBuilder vals_;
+};
+
+}  // namespace py
+}  // namespace arrow
+
+#endif  // ARROW_PYTHON_PYTHON_TO_ARROW_INTERNAL_H
diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc
index 295140f1aa5ab..6910d1a7af110 100644
--- a/cpp/src/arrow/python/python_to_arrow.cc
+++ b/cpp/src/arrow/python/python_to_arrow.cc
@@ -28,7 +28,7 @@
 #include "arrow/python/numpy_convert.h"
 #include "arrow/python/numpy_interop.h"
 #include "arrow/python/platform.h"
-#include "arrow/python/sequence.h"
+#include "arrow/python/python_to_arrow-internal.h"
 
 constexpr int32_t kMaxRecursionDepth = 100;
 
@@ -40,61 +40,6 @@ PyObject* pyarrow_deserialize_callback = NULL;
 namespace arrow {
 namespace py {
 
-/// Constructing dictionaries of key/value pairs. Sequences of
-/// keys and values are built separately using a pair of
-/// SequenceBuilders. The resulting Arrow representation
-/// can be obtained via the Finish method.
-class DictBuilder {
- public:
-  explicit DictBuilder(MemoryPool* pool = nullptr) : keys_(pool), vals_(pool) {}
-
-  /// Builder for the keys of the dictionary
-  SequenceBuilder& keys() { return keys_; }
-  /// Builder for the values of the dictionary
-  SequenceBuilder& vals() { return vals_; }
-
-  /// Construct an Arrow StructArray representing the dictionary.
-  /// Contains a field "keys" for the keys and "vals" for the values.
- - /// \param list_data - /// List containing the data from nested lists in the value - /// list of the dictionary - /// - /// \param dict_data - /// List containing the data from nested dictionaries in the - /// value list of the dictionary - arrow::Status Finish(std::shared_ptr key_tuple_data, - std::shared_ptr key_dict_data, - std::shared_ptr val_list_data, - std::shared_ptr val_tuple_data, - std::shared_ptr val_dict_data, std::shared_ptr* out); - - private: - SequenceBuilder keys_; - SequenceBuilder vals_; -}; - -Status DictBuilder::Finish(std::shared_ptr key_tuple_data, - std::shared_ptr key_dict_data, - std::shared_ptr val_list_data, - std::shared_ptr val_tuple_data, - std::shared_ptr val_dict_data, - std::shared_ptr* out) { - // lists and dicts can't be keys of dicts in Python, that is why for - // the keys we do not need to collect sublists - std::shared_ptr keys, vals; - RETURN_NOT_OK(keys_.Finish(nullptr, key_tuple_data, key_dict_data, &keys)); - RETURN_NOT_OK(vals_.Finish(val_list_data, val_tuple_data, val_dict_data, &vals)); - auto keys_field = std::make_shared("keys", keys->type()); - auto vals_field = std::make_shared("vals", vals->type()); - auto type = std::make_shared( - std::vector>({keys_field, vals_field})); - std::vector> field_arrays({keys, vals}); - DCHECK(keys->length() == vals->length()); - out->reset(new StructArray(type, keys->length(), field_arrays)); - return Status::OK(); -} - Status CallCustomCallback(PyObject* callback, PyObject* elem, PyObject** result) { *result = NULL; if (!callback) { diff --git a/cpp/src/arrow/python/python_to_arrow.h b/cpp/src/arrow/python/python_to_arrow.h index 082b6355c0e33..4d3761a963ac8 100644 --- a/cpp/src/arrow/python/python_to_arrow.h +++ b/cpp/src/arrow/python/python_to_arrow.h @@ -24,7 +24,6 @@ #include "arrow/io/interfaces.h" #include "arrow/python/numpy_interop.h" #include "arrow/python/platform.h" -#include "arrow/python/sequence.h" #include diff --git a/cpp/src/arrow/python/sequence.cc 
b/cpp/src/arrow/python/sequence.cc deleted file mode 100644 index c72e5cb60354f..0000000000000 --- a/cpp/src/arrow/python/sequence.cc +++ /dev/null @@ -1,169 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "arrow/python/sequence.h" - -namespace arrow { -namespace py { - -SequenceBuilder::SequenceBuilder(MemoryPool* pool) - : pool_(pool), - types_(pool, ::arrow::int8()), - offsets_(pool, ::arrow::int32()), - nones_(pool), - bools_(pool, ::arrow::boolean()), - ints_(pool, ::arrow::int64()), - bytes_(pool, ::arrow::binary()), - strings_(pool), - floats_(pool, ::arrow::float32()), - doubles_(pool, ::arrow::float64()), - tensor_indices_(pool, ::arrow::int32()), - list_offsets_({0}), - tuple_offsets_({0}), - dict_offsets_({0}) {} - -#define UPDATE(OFFSET, TAG) \ - if (TAG == -1) { \ - TAG = num_tags; \ - num_tags += 1; \ - } \ - RETURN_NOT_OK(offsets_.Append(OFFSET)); \ - RETURN_NOT_OK(types_.Append(TAG)); \ - RETURN_NOT_OK(nones_.AppendToBitmap(true)); - -Status SequenceBuilder::AppendNone() { - RETURN_NOT_OK(offsets_.Append(0)); - RETURN_NOT_OK(types_.Append(0)); - return nones_.AppendToBitmap(false); -} - -Status SequenceBuilder::AppendBool(bool data) { - UPDATE(bools_.length(), bool_tag); - return 
bools_.Append(data); -} - -Status SequenceBuilder::AppendInt64(int64_t data) { - UPDATE(ints_.length(), int_tag); - return ints_.Append(data); -} - -Status SequenceBuilder::AppendUInt64(uint64_t data) { - UPDATE(ints_.length(), int_tag); - return ints_.Append(data); -} - -Status SequenceBuilder::AppendBytes(const uint8_t* data, int32_t length) { - UPDATE(bytes_.length(), bytes_tag); - return bytes_.Append(data, length); -} - -Status SequenceBuilder::AppendString(const char* data, int32_t length) { - UPDATE(strings_.length(), string_tag); - return strings_.Append(data, length); -} - -Status SequenceBuilder::AppendFloat(float data) { - UPDATE(floats_.length(), float_tag); - return floats_.Append(data); -} - -Status SequenceBuilder::AppendDouble(double data) { - UPDATE(doubles_.length(), double_tag); - return doubles_.Append(data); -} - -Status SequenceBuilder::AppendTensor(int32_t tensor_index) { - UPDATE(tensor_indices_.length(), tensor_tag); - return tensor_indices_.Append(tensor_index); -} - -Status SequenceBuilder::AppendList(int32_t size) { - UPDATE(list_offsets_.size() - 1, list_tag); - list_offsets_.push_back(list_offsets_.back() + size); - return Status::OK(); -} - -Status SequenceBuilder::AppendTuple(int32_t size) { - UPDATE(tuple_offsets_.size() - 1, tuple_tag); - tuple_offsets_.push_back(tuple_offsets_.back() + size); - return Status::OK(); -} - -Status SequenceBuilder::AppendDict(int32_t size) { - UPDATE(dict_offsets_.size() - 1, dict_tag); - dict_offsets_.push_back(dict_offsets_.back() + size); - return Status::OK(); -} - -#define ADD_ELEMENT(VARNAME, TAG) \ - if (TAG != -1) { \ - types[TAG] = std::make_shared("", VARNAME.type()); \ - RETURN_NOT_OK(VARNAME.Finish(&children[TAG])); \ - RETURN_NOT_OK(nones_.AppendToBitmap(true)); \ - type_ids.push_back(TAG); \ - } - -#define ADD_SUBSEQUENCE(DATA, OFFSETS, BUILDER, TAG, NAME) \ - if (DATA) { \ - DCHECK(DATA->length() == OFFSETS.back()); \ - std::shared_ptr offset_array; \ - Int32Builder builder(pool_, 
std::make_shared()); \ - RETURN_NOT_OK(builder.Append(OFFSETS.data(), OFFSETS.size())); \ - RETURN_NOT_OK(builder.Finish(&offset_array)); \ - std::shared_ptr list_array; \ - ListArray::FromArrays(*offset_array, *DATA, pool_, &list_array); \ - auto field = std::make_shared(NAME, list_array->type()); \ - auto type = \ - std::make_shared(std::vector>({field})); \ - types[TAG] = std::make_shared("", type); \ - children[TAG] = std::shared_ptr( \ - new StructArray(type, list_array->length(), {list_array})); \ - RETURN_NOT_OK(nones_.AppendToBitmap(true)); \ - type_ids.push_back(TAG); \ - } else { \ - DCHECK_EQ(OFFSETS.size(), 1); \ - } - -Status SequenceBuilder::Finish(std::shared_ptr list_data, - std::shared_ptr tuple_data, - std::shared_ptr dict_data, - std::shared_ptr* out) { - std::vector> types(num_tags); - std::vector> children(num_tags); - std::vector type_ids; - - ADD_ELEMENT(bools_, bool_tag); - ADD_ELEMENT(ints_, int_tag); - ADD_ELEMENT(strings_, string_tag); - ADD_ELEMENT(bytes_, bytes_tag); - ADD_ELEMENT(floats_, float_tag); - ADD_ELEMENT(doubles_, double_tag); - - ADD_ELEMENT(tensor_indices_, tensor_tag); - - ADD_SUBSEQUENCE(list_data, list_offsets_, list_builder, list_tag, "list"); - ADD_SUBSEQUENCE(tuple_data, tuple_offsets_, tuple_builder, tuple_tag, "tuple"); - ADD_SUBSEQUENCE(dict_data, dict_offsets_, dict_builder, dict_tag, "dict"); - - auto type = ::arrow::union_(types, type_ids, UnionMode::DENSE); - out->reset(new UnionArray(type, types_.length(), children, types_.data(), - offsets_.data(), nones_.null_bitmap(), nones_.null_count())); - return Status::OK(); -} - -} // namespace py -} // namespace arrow diff --git a/cpp/src/arrow/python/sequence.h b/cpp/src/arrow/python/sequence.h deleted file mode 100644 index 8c3b765ce0b6c..0000000000000 --- a/cpp/src/arrow/python/sequence.h +++ /dev/null @@ -1,137 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef PYTHON_ARROW_SEQUENCE_H -#define PYTHON_ARROW_SEQUENCE_H - -#include - -#include "arrow/api.h" -#include "arrow/util/logging.h" - -namespace arrow { -namespace py { - -/// A Sequence is a heterogeneous collections of elements. It can contain -/// scalar Python types, lists, tuples, dictionaries and tensors. -class SequenceBuilder { - public: - explicit SequenceBuilder(MemoryPool* pool = nullptr); - - /// Appending a none to the sequence - Status AppendNone(); - - /// Appending a boolean to the sequence - Status AppendBool(bool data); - - /// Appending an int64_t to the sequence - Status AppendInt64(int64_t data); - - /// Appending an uint64_t to the sequence - Status AppendUInt64(uint64_t data); - - /// Append a list of bytes to the sequence - Status AppendBytes(const uint8_t* data, int32_t length); - - /// Appending a string to the sequence - Status AppendString(const char* data, int32_t length); - - /// Appending a float to the sequence - Status AppendFloat(float data); - - /// Appending a double to the sequence - Status AppendDouble(double data); - - /// Appending a tensor to the sequence - /// - /// \param tensor_index Index of the tensor in the object. - Status AppendTensor(int32_t tensor_index); - - /// Add a sublist to the sequence. 
The data contained in the sublist will be - /// specified in the "Finish" method. - /// - /// To construct l = [[11, 22], 33, [44, 55]] you would for example run - /// list = ListBuilder(); - /// list.AppendList(2); - /// list.Append(33); - /// list.AppendList(2); - /// list.Finish([11, 22, 44, 55]); - /// list.Finish(); - - /// \param size - /// The size of the sublist - Status AppendList(int32_t size); - - Status AppendTuple(int32_t size); - - Status AppendDict(int32_t size); - - /// Finish building the sequence and return the result. - Status Finish(std::shared_ptr list_data, std::shared_ptr tuple_data, - std::shared_ptr dict_data, std::shared_ptr* out); - - private: - MemoryPool* pool_; - - Int8Builder types_; - Int32Builder offsets_; - - /// Total number of bytes needed to represent this sequence. - int64_t total_num_bytes_; - - NullBuilder nones_; - BooleanBuilder bools_; - Int64Builder ints_; - BinaryBuilder bytes_; - StringBuilder strings_; - FloatBuilder floats_; - DoubleBuilder doubles_; - - // We use an Int32Builder here to distinguish the tensor indices from - // the ints_ above (see the case Type::INT32 in get_value in python.cc). - // TODO(pcm): Replace this by using the union tags to distinguish between - // these two cases. - Int32Builder tensor_indices_; - - std::vector list_offsets_; - std::vector tuple_offsets_; - std::vector dict_offsets_; - - // Tags for members of the sequence. If they are set to -1 it means - // they are not used and will not part be of the metadata when we call - // SequenceBuilder::Finish. If a member with one of the tags is added, - // the associated variable gets a unique index starting from 0. This - // happens in the UPDATE macro in sequence.cc. 
- int8_t bool_tag = -1; - int8_t int_tag = -1; - int8_t string_tag = -1; - int8_t bytes_tag = -1; - int8_t float_tag = -1; - int8_t double_tag = -1; - - int8_t tensor_tag = -1; - int8_t list_tag = -1; - int8_t tuple_tag = -1; - int8_t dict_tag = -1; - - int8_t num_tags = 0; -}; - -} // namespace py -} // namespace arrow - -#endif // PYTHON_ARROW_SEQUENCE_H