Skip to content

Commit

Permalink
Account for chunked arrays when converting lists back to pandas form
Browse files Browse the repository at this point in the history
Change-Id: Ie84c0f1511c5714c7a8a0c0573e397cbe70809de
  • Loading branch information
wesm committed Aug 20, 2017
1 parent b50f235 commit 18acdd9
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 3 deletions.
7 changes: 5 additions & 2 deletions cpp/src/arrow/python/arrow_to_pandas.cc
Original file line number Diff line number Diff line change
Expand Up @@ -498,6 +498,7 @@ inline Status ConvertListsLike(PandasOptions options, const std::shared_ptr<Colu

PyAcquireGIL lock;

int64_t chunk_offset = 0;
for (int c = 0; c < data.num_chunks(); c++) {
auto arr = std::static_pointer_cast<ListArray>(data.chunk(c));

Expand All @@ -507,8 +508,8 @@ inline Status ConvertListsLike(PandasOptions options, const std::shared_ptr<Colu
Py_INCREF(Py_None);
*out_values = Py_None;
} else {
PyObject* start = PyLong_FromLong(arr->value_offset(i));
PyObject* end = PyLong_FromLong(arr->value_offset(i + 1));
PyObject* start = PyLong_FromLong(arr->value_offset(i) + chunk_offset);
PyObject* end = PyLong_FromLong(arr->value_offset(i + 1) + chunk_offset);
PyObject* slice = PySlice_New(start, end, NULL);
*out_values = PyObject_GetItem(numpy_array, slice);
Py_DECREF(start);
Expand All @@ -517,6 +518,8 @@ inline Status ConvertListsLike(PandasOptions options, const std::shared_ptr<Colu
}
++out_values;
}

chunk_offset += arr->length();
}

Py_XDECREF(numpy_array);
Expand Down
25 changes: 25 additions & 0 deletions python/pyarrow/tests/test_convert_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -534,6 +534,31 @@ def test_column_of_lists(self):
field = schema.field_by_name(column)
self._check_array_roundtrip(df[column], type=field.type)

def test_column_of_lists_chunked(self):
    """Round-trip a list column that is backed by more than one chunk.

    Two tables are built from consecutive slices of the same frame and
    concatenated; converting the combined table back to pandas must
    reproduce the original frame exactly.
    """
    # ARROW-1357
    # The lists deliberately have different lengths. With equal-length
    # lists and a single null, the first chunk would contain as many rows
    # as flattened values, so a per-chunk offset derived from the wrong
    # count would still slice the right elements and the test would pass
    # by coincidence. Here the first chunk has 2 rows but 3 values.
    df = pd.DataFrame({
        'lists': np.array([
            [1, 2, 3],
            None,
            [4, 5],
            [6],
            [7, 8, 9, 10],
            [11, 12]
        ], dtype=object)
    })

    schema = pa.schema([
        pa.field('lists', pa.list_(pa.int64()))
    ])

    # Convert each slice separately so the concatenated table has to stitch
    # the two pieces together.
    t1 = pa.Table.from_pandas(df[:2], schema=schema)
    t2 = pa.Table.from_pandas(df[2:], schema=schema)

    table = pa.concat_tables([t1, t2])
    result = table.to_pandas()

    tm.assert_frame_equal(result, df)

def test_column_of_lists_strided(self):
df, schema = dataframe_with_lists()
df = pd.concat([df] * 6, ignore_index=True)
Expand Down
1 change: 0 additions & 1 deletion python/pyarrow/tests/test_serialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
import pytest

from collections import namedtuple
import os
import string
import sys

Expand Down

0 comments on commit 18acdd9

Please sign in to comment.