Skip to content

Commit

Permalink
apacheGH-36614: [MATLAB] Subclass arrow::Buffer to keep MATLAB data b…
Browse files Browse the repository at this point in the history
…acking arrow::Arrays alive (apache#36615)

### Rationale for this change

When building `arrow.array.<Numeric>Arrays` from native MATLAB arrays, we avoid copying by

1. Wrapping the MATLAB data inside a non-owning `arrow::Buffer`
2. Constructing an `arrow::ArrayData` object from the `arrow::Buffer`
3. Constructing the `arrow::Array` from the `arrow::Data` object.

Because the `Array`'s underlying `Buffer` does not have ownership of its backing data, we have been storing the original MATLAB array as a property called `MatlabArray` on the MATLAB `arrow.array.NumericArray` class.

This solution is not ideal because the backing MATLAB array is kept separate from the actual `std::shared_ptr<arrow::Array>`, which is stored within the C++ proxy objects (e.g. `arrow::matlab::array::proxy::NumericArray<float64>`).  A better solution would be to create a new subclass of `arrow::Buffer` called `MatlabBuffer`, which will keep the original MATLAB array alive by taking it in as an input parameter and storing it as a member variable. 

### What changes are included in this PR?

1. Removed the `MatlabArray` property from the MATLAB class `matlab.io.NumericArray` 
2. Added a private member variable called `mda_array` to `arrow::matlab::array::proxy::NumericArray<CType>` and `arrow::matlab::array::proxy::TimestampArray`. `mda_array` is a `matlab::data::Array` object. This member variable stores the MATLAB array that owns the memory the `arrow::Array` objects are backed by.

### Are these changes tested?

Existing tests used.

### Are there any user-facing changes?
No.

* Closes: apache#36614

Authored-by: Sarah Gilmore <sgilmore@mathworks.com>
Signed-off-by: Kevin Gurney <kgurney@mathworks.com>
  • Loading branch information
sgilmore10 authored and R-JunmingChen committed Aug 20, 2023
1 parent 28f1730 commit a9a1e53
Show file tree
Hide file tree
Showing 13 changed files with 104 additions and 75 deletions.
2 changes: 1 addition & 1 deletion matlab/src/cpp/arrow/matlab/array/proxy/array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

namespace arrow::matlab::array::proxy {

Array::Array() {
Array::Array(std::shared_ptr<arrow::Array> array) : array{std::move(array)} {

// Register Proxy methods.
REGISTER_METHOD(Array, toString);
Expand Down
2 changes: 1 addition & 1 deletion matlab/src/cpp/arrow/matlab/array/proxy/array.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ namespace arrow::matlab::array::proxy {

class Array : public libmexclass::proxy::Proxy {
public:
Array();
Array(std::shared_ptr<arrow::Array> array);

virtual ~Array() {}

Expand Down
6 changes: 5 additions & 1 deletion matlab/src/cpp/arrow/matlab/array/proxy/boolean_array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@

namespace arrow::matlab::array::proxy {

BooleanArray::BooleanArray(std::shared_ptr<arrow::BooleanArray> array)
: arrow::matlab::array::proxy::Array{std::move(array)} {}

libmexclass::proxy::MakeResult BooleanArray::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) {
::matlab::data::StructArray opts = constructor_arguments[0];

Expand All @@ -40,7 +43,8 @@ namespace arrow::matlab::array::proxy {
const auto array_length = logical_mda.getNumberOfElements();

auto array_data = arrow::ArrayData::Make(data_type, array_length, {validity_bitmap_buffer, data_buffer});
return std::make_shared<arrow::matlab::array::proxy::BooleanArray>(arrow::MakeArray(array_data));
auto arrow_array = std::static_pointer_cast<arrow::BooleanArray>(arrow::MakeArray(array_data));
return std::make_shared<arrow::matlab::array::proxy::BooleanArray>(std::move(arrow_array));
}

void BooleanArray::toMATLAB(libmexclass::proxy::method::Context& context) {
Expand Down
6 changes: 2 additions & 4 deletions matlab/src/cpp/arrow/matlab/array/proxy/boolean_array.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,13 @@
#include "arrow/matlab/array/proxy/array.h"

#include "libmexclass/proxy/Proxy.h"
#include "arrow/type_fwd.h"

namespace arrow::matlab::array::proxy {

class BooleanArray : public arrow::matlab::array::proxy::Array {
public:
BooleanArray(const std::shared_ptr<arrow::Array> logical_array)
: arrow::matlab::array::proxy::Array() {
array = logical_array;
}
BooleanArray(std::shared_ptr<arrow::BooleanArray> array);

static libmexclass::proxy::MakeResult make(const libmexclass::proxy::FunctionArguments& constructor_arguments);

Expand Down
26 changes: 12 additions & 14 deletions matlab/src/cpp/arrow/matlab/array/proxy/numeric_array.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include "arrow/matlab/error/error.h"
#include "arrow/matlab/bit/pack.h"
#include "arrow/matlab/bit/unpack.h"
#include "arrow/matlab/buffer/matlab_buffer.h"

#include "libmexclass/proxy/Proxy.h"

Expand All @@ -35,35 +36,32 @@ namespace arrow::matlab::array::proxy {
template<typename CType>
class NumericArray : public arrow::matlab::array::proxy::Array {
public:
NumericArray(const std::shared_ptr<arrow::Array> numeric_array)
: arrow::matlab::array::proxy::Array() {
array = numeric_array;
}
using ArrowType = typename arrow::CTypeTraits<CType>::ArrowType;

static libmexclass::proxy::MakeResult make(const libmexclass::proxy::FunctionArguments& constructor_arguments) {
using ArrowType = typename arrow::CTypeTraits<CType>::ArrowType;
using BuilderType = typename arrow::CTypeTraits<CType>::BuilderType;
NumericArray(const std::shared_ptr<arrow::NumericArray<ArrowType>> numeric_array)
: arrow::matlab::array::proxy::Array{std::move(numeric_array)} {}

static libmexclass::proxy::MakeResult make(const libmexclass::proxy::FunctionArguments& constructor_arguments) {
using MatlabBuffer = arrow::matlab::buffer::MatlabBuffer;
using NumericArray = arrow::NumericArray<ArrowType>;
using NumericArrayProxy = typename arrow::matlab::array::proxy::NumericArray<CType>;

::matlab::data::StructArray opts = constructor_arguments[0];

// Get the mxArray from constructor arguments
const ::matlab::data::TypedArray<CType> numeric_mda = opts[0]["MatlabArray"];
const ::matlab::data::TypedArray<bool> valid_mda = opts[0]["Valid"];

// Get raw pointer of mxArray
auto it(numeric_mda.cbegin());
auto dt = it.operator->();
auto data_buffer = std::make_shared<MatlabBuffer>(numeric_mda);

const auto data_type = arrow::CTypeTraits<CType>::type_singleton();
const auto length = static_cast<int64_t>(numeric_mda.getNumberOfElements()); // cast size_t to int64_t

// Do not make a copy when creating arrow::Buffer
auto data_buffer = std::make_shared<arrow::Buffer>(reinterpret_cast<const uint8_t*>(dt),
sizeof(CType) * numeric_mda.getNumberOfElements());
// Pack the validity bitmap values.
MATLAB_ASSIGN_OR_ERROR(auto packed_validity_bitmap, bit::packValid(valid_mda), error::BITPACK_VALIDITY_BITMAP_ERROR_ID);
auto array_data = arrow::ArrayData::Make(data_type, length, {packed_validity_bitmap, data_buffer});
return std::make_shared<arrow::matlab::array::proxy::NumericArray<CType>>(arrow::MakeArray(array_data));
auto numeric_array = std::static_pointer_cast<NumericArray>(arrow::MakeArray(array_data));
return std::make_shared<NumericArrayProxy>(std::move(numeric_array));
}

protected:
Expand Down
7 changes: 5 additions & 2 deletions matlab/src/cpp/arrow/matlab/array/proxy/string_array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@

namespace arrow::matlab::array::proxy {

StringArray::StringArray(const std::shared_ptr<arrow::StringArray> string_array)
: arrow::matlab::array::proxy::Array(std::move(string_array)) {}

libmexclass::proxy::MakeResult StringArray::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) {
namespace mda = ::matlab::data;

Expand Down Expand Up @@ -53,8 +56,8 @@ namespace arrow::matlab::array::proxy {
arrow::StringBuilder builder;
MATLAB_ERROR_IF_NOT_OK(builder.AppendValues(strings, unpacked_validity_bitmap_ptr), error::STRING_BUILDER_APPEND_FAILED);
MATLAB_ASSIGN_OR_ERROR(auto array, builder.Finish(), error::STRING_BUILDER_FINISH_FAILED);

return std::make_shared<arrow::matlab::array::proxy::StringArray>(array);
auto typed_array = std::static_pointer_cast<arrow::StringArray>(array);
return std::make_shared<arrow::matlab::array::proxy::StringArray>(std::move(typed_array));
}

void StringArray::toMATLAB(libmexclass::proxy::method::Context& context) {
Expand Down
9 changes: 4 additions & 5 deletions matlab/src/cpp/arrow/matlab/array/proxy/string_array.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,14 @@

#include "libmexclass/proxy/Proxy.h"

#include "arrow/type_fwd.h"

namespace arrow::matlab::array::proxy {

class StringArray : public arrow::matlab::array::proxy::Array {
public:
StringArray(const std::shared_ptr<arrow::Array> string_array)
: arrow::matlab::array::proxy::Array() {
array = string_array;
}

StringArray(const std::shared_ptr<arrow::StringArray> string_array);

static libmexclass::proxy::MakeResult make(const libmexclass::proxy::FunctionArguments& constructor_arguments);

protected:
Expand Down
45 changes: 21 additions & 24 deletions matlab/src/cpp/arrow/matlab/array/proxy/timestamp_array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,24 +20,22 @@
#include "arrow/matlab/error/error.h"
#include "arrow/matlab/bit/pack.h"
#include "arrow/matlab/bit/unpack.h"
#include "arrow/matlab/buffer/matlab_buffer.h"

#include "arrow/matlab/type/time_unit.h"
#include "arrow/util/utf8.h"
#include "arrow/type.h"
#include "arrow/builder.h"


namespace arrow::matlab::array::proxy {

namespace {
const uint8_t* getUnpackedValidityBitmap(const ::matlab::data::TypedArray<bool>& valid_elements) {
const auto valid_elements_iterator(valid_elements.cbegin());
return reinterpret_cast<const uint8_t*>(valid_elements_iterator.operator->());
}
} // anonymous namespace
TimestampArray::TimestampArray(std::shared_ptr<arrow::TimestampArray> array)
: arrow::matlab::array::proxy::Array{std::move(array)} {}

libmexclass::proxy::MakeResult TimestampArray::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) {
namespace mda = ::matlab::data;
using MatlabBuffer = arrow::matlab::buffer::MatlabBuffer;
using TimestampArray = arrow::TimestampArray;
using TimestampArrayProxy = arrow::matlab::array::proxy::TimestampArray;

mda::StructArray opts = constructor_arguments[0];

Expand All @@ -49,32 +47,31 @@ namespace arrow::matlab::array::proxy {
const mda::TypedArray<mda::MATLABString> units_mda = opts[0]["TimeUnit"];

// extract the time zone string
const std::u16string& utf16_timezone = timezone_mda[0];
MATLAB_ASSIGN_OR_ERROR(const auto timezone, arrow::util::UTF16StringToUTF8(utf16_timezone),
const std::u16string& u16_timezone = timezone_mda[0];
MATLAB_ASSIGN_OR_ERROR(const auto timezone,
arrow::util::UTF16StringToUTF8(u16_timezone),
error::UNICODE_CONVERSION_ERROR_ID);

// extract the time unit
const std::u16string& utf16_unit = units_mda[0];
MATLAB_ASSIGN_OR_ERROR(const auto time_unit, arrow::matlab::type::timeUnitFromString(utf16_unit),
error::UKNOWN_TIME_UNIT_ERROR_ID);
const std::u16string& u16_timeunit = units_mda[0];
MATLAB_ASSIGN_OR_ERROR(const auto time_unit,
arrow::matlab::type::timeUnitFromString(u16_timeunit),
error::UKNOWN_TIME_UNIT_ERROR_ID)

// create the timestamp_type
auto data_type = arrow::timestamp(time_unit, timezone);
arrow::TimestampBuilder builder(data_type, arrow::default_memory_pool());
auto array_length = static_cast<int64_t>(timestamp_mda.getNumberOfElements()); // cast size_t to int64_t

// Get raw pointer of mxArray
auto it(timestamp_mda.cbegin());
auto dt = it.operator->();
auto data_buffer = std::make_shared<MatlabBuffer>(timestamp_mda);

// Pack the validity bitmap values.
const uint8_t* valid_mask = getUnpackedValidityBitmap(validity_bitmap_mda);
const auto num_elements = timestamp_mda.getNumberOfElements();

// Append values
MATLAB_ERROR_IF_NOT_OK(builder.AppendValues(dt, num_elements, valid_mask), error::APPEND_VALUES_ERROR_ID);
MATLAB_ASSIGN_OR_ERROR(auto timestamp_array, builder.Finish(), error::BUILD_ARRAY_ERROR_ID);
MATLAB_ASSIGN_OR_ERROR(auto packed_validity_bitmap,
bit::packValid(validity_bitmap_mda),
error::BITPACK_VALIDITY_BITMAP_ERROR_ID);

return std::make_shared<arrow::matlab::array::proxy::TimestampArray>(timestamp_array);
auto array_data = arrow::ArrayData::Make(data_type, array_length, {packed_validity_bitmap, data_buffer});
auto timestamp_array = std::static_pointer_cast<TimestampArray>(arrow::MakeArray(array_data));
return std::make_shared<TimestampArrayProxy>(std::move(timestamp_array));
}

void TimestampArray::toMATLAB(libmexclass::proxy::method::Context& context) {
Expand Down
8 changes: 3 additions & 5 deletions matlab/src/cpp/arrow/matlab/array/proxy/timestamp_array.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,17 @@

#include "libmexclass/proxy/Proxy.h"

#include "arrow/type_fwd.h"

namespace arrow::matlab::array::proxy {

class TimestampArray : public arrow::matlab::array::proxy::Array {
public:
TimestampArray(const std::shared_ptr<arrow::Array> timestamp_array)
: arrow::matlab::array::proxy::Array() {
array = timestamp_array;
}
TimestampArray(std::shared_ptr<arrow::TimestampArray> array);

static libmexclass::proxy::MakeResult make(const libmexclass::proxy::FunctionArguments& constructor_arguments);

protected:

void toMATLAB(libmexclass::proxy::method::Context& context) override;
};

Expand Down
48 changes: 48 additions & 0 deletions matlab/src/cpp/arrow/matlab/buffer/matlab_buffer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include "arrow/buffer.h"

#include "MatlabDataArray.hpp"

namespace arrow::matlab::buffer {

namespace mda = ::matlab::data;

class MatlabBuffer : public arrow::Buffer {
public:

template<typename CType>
MatlabBuffer(const mda::TypedArray<CType> typed_array)
: arrow::Buffer{nullptr, 0}
, array{typed_array} {

// Get raw pointer of mxArray
auto it(typed_array.cbegin());
auto dt = it.operator->();

data_ = reinterpret_cast<const uint8_t*>(dt);
size_ = sizeof(CType) * static_cast<int64_t>(typed_array.getNumberOfElements());
capacity_ = size_;
is_mutable_ = false;
}
private:
const mda::Array array;
};
}
8 changes: 0 additions & 8 deletions matlab/src/matlab/+arrow/+array/NumericArray.m
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,6 @@

classdef NumericArray < arrow.array.Array
% arrow.array.NumericArray


properties (Hidden, SetAccess=protected)
MatlabArray = []
end

properties(Abstract, Access=protected)
NullSubstitutionValue;
Expand All @@ -38,9 +33,6 @@
validElements = arrow.args.parseValidElements(data, opts);
opts = struct(MatlabArray=data, Valid=validElements);
obj@arrow.array.Array("Name", proxyName, "ConstructorArguments", {opts});
obj.MatlabArray = cast(obj.MatlabArray, type);
% Store a reference to the array
obj.MatlabArray = data;
end

function matlabArray = toMATLAB(obj)
Expand Down
8 changes: 0 additions & 8 deletions matlab/test/arrow/array/hNumericArray.m
Original file line number Diff line number Diff line change
Expand Up @@ -43,14 +43,6 @@ function BasicTest(tc)
tc.verifyEqual(className, tc.ArrowArrayClassName);
end

function ShallowCopyTest(tc)
% NumericArrays stores a shallow copy of the array keep the
% memory alive.
A = tc.ArrowArrayConstructor(tc.MatlabArrayFcn([1, 2, 3]));
tc.verifyEqual(A.MatlabArray, tc.MatlabArrayFcn([1, 2, 3]));
tc.verifyEqual(toMATLAB(A), tc.MatlabArrayFcn([1 2 3]'));
end

function ToMATLAB(tc)
% Create array from a scalar
A1 = tc.ArrowArrayConstructor(tc.MatlabArrayFcn(100));
Expand Down
4 changes: 2 additions & 2 deletions matlab/tools/cmake/BuildMatlabArrowInterface.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/src/c
"${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/bit"
"${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/error"
"${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type"
"${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy")

"${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy"
"${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/buffer")

set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_SOURCES "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/array/proxy/array.cc"
"${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/array/proxy/boolean_array.cc"
Expand Down

0 comments on commit a9a1e53

Please sign in to comment.