Skip to content

Commit

Permalink
PERF: Slow performance of to_dict (pandas-dev#46470)
Browse files Browse the repository at this point in the history
  • Loading branch information
Roger Thomas committed Mar 23, 2022
1 parent c68c626 commit 31c11e1
Show file tree
Hide file tree
Showing 3 changed files with 207 additions and 62 deletions.
185 changes: 124 additions & 61 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1775,6 +1775,129 @@ def to_numpy(

return result

def _to_dict_helper(self, orient, into_c, into):
    """
    Build the ``to_dict`` result for an already-normalized ``orient``.

    As part of GH46470, ``maybe_box_native`` is applied only to values that
    come from object-dtype columns: calling it on every cell is slow, and it
    is treated as unnecessary for columns of any other dtype.

    Parameters
    ----------
    orient : str
        One of ``"dict"``, ``"list"``, ``"split"``, ``"series"``,
        ``"records"``, ``"index"`` or ``"tight"``.
    into_c : callable
        Mapping constructor; it is called with an iterable of
        ``(key, value)`` pairs.
    into : class or instance
        Forwarded unchanged to each column's ``to_dict`` when
        ``orient="dict"``; unused by the other orients.

    Returns
    -------
    mapping or list of mappings
        ``orient="records"`` returns a list with one ``into_c`` mapping per
        row; every other orient returns a single ``into_c`` mapping.

    Raises
    ------
    ValueError
        If ``orient`` is not one of the values above, or if
        ``orient="index"`` and the index is not unique.
    """
    # Whole-column orients: no per-value work, so skip the dtype scan entirely.
    if orient == "dict":
        return into_c((k, v.to_dict(into)) for k, v in self.items())
    if orient == "series":
        return into_c((k, v) for k, v in self.items())

    if orient not in ("list", "split", "records", "index", "tight"):
        raise ValueError(f"orient '{orient}' not understood")
    if orient == "index" and not self.index.is_unique:
        raise ValueError("DataFrame index must be unique for orient='index'.")

    # One flag per column *position*. Looking columns up by label instead
    # would conflate duplicated labels and can miss labels that do not
    # compare equal to themselves (e.g. NaN).
    box_flags = [is_object_dtype(dtype) for dtype in self.dtypes]
    needs_boxing = any(box_flags)

    def box_row(values):
        """Return ``values`` as a list with object-column entries boxed."""
        return [
            maybe_box_native(v) if box else v for v, box in zip(values, box_flags)
        ]

    if orient == "list":
        return into_c(
            (k, list(map(maybe_box_native, v.tolist())) if box else v.tolist())
            for (k, v), box in zip(self.items(), box_flags)
        )

    if orient == "records":
        columns = self.columns.tolist()
        rows = self.itertuples(index=False, name=None)
        if needs_boxing:
            return [into_c(zip(columns, box_row(t))) for t in rows]
        return [into_c(zip(columns, t)) for t in rows]

    if orient == "index":
        # t[0] is the index label, t[1:] are the row's values.
        rows = self.itertuples(name=None)
        if needs_boxing:
            return into_c(
                (t[0], dict(zip(self.columns, box_row(t[1:])))) for t in rows
            )
        return into_c((t[0], dict(zip(self.columns, t[1:]))) for t in rows)

    # Remaining orients are "split" and "tight"; "tight" is "split" plus the
    # index/column level names.
    rows = self.itertuples(index=False, name=None)
    if needs_boxing:
        data = [box_row(t) for t in rows]
    else:
        data = [list(t) for t in rows]
    pairs = [
        ("index", self.index.tolist()),
        ("columns", self.columns.tolist()),
        ("data", data),
    ]
    if orient == "tight":
        pairs.append(("index_names", list(self.index.names)))
        pairs.append(("column_names", list(self.columns.names)))
    return into_c(pairs)

def to_dict(self, orient: str = "dict", into=dict):
"""
Convert the DataFrame to a dictionary.
Expand Down Expand Up @@ -1913,67 +2036,7 @@ def to_dict(self, orient: str = "dict", into=dict):
elif orient.startswith("i"):
orient = "index"

if orient == "dict":
return into_c((k, v.to_dict(into)) for k, v in self.items())

elif orient == "list":
return into_c((k, v.tolist()) for k, v in self.items())

elif orient == "split":
return into_c(
(
("index", self.index.tolist()),
("columns", self.columns.tolist()),
(
"data",
[
list(map(maybe_box_native, t))
for t in self.itertuples(index=False, name=None)
],
),
)
)

elif orient == "tight":
return into_c(
(
("index", self.index.tolist()),
("columns", self.columns.tolist()),
(
"data",
[
list(map(maybe_box_native, t))
for t in self.itertuples(index=False, name=None)
],
),
("index_names", list(self.index.names)),
("column_names", list(self.columns.names)),
)
)

elif orient == "series":
return into_c((k, v) for k, v in self.items())

elif orient == "records":
columns = self.columns.tolist()
rows = (
dict(zip(columns, row))
for row in self.itertuples(index=False, name=None)
)
return [
into_c((k, maybe_box_native(v)) for k, v in row.items()) for row in rows
]

elif orient == "index":
if not self.index.is_unique:
raise ValueError("DataFrame index must be unique for orient='index'.")
return into_c(
(t[0], dict(zip(self.columns, t[1:])))
for t in self.itertuples(name=None)
)

else:
raise ValueError(f"orient '{orient}' not understood")
return self._to_dict_helper(orient, into_c, into)

def to_gbq(
self,
Expand Down
8 changes: 7 additions & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1770,7 +1770,13 @@ def to_dict(self, into=dict):
"""
# GH16122
into_c = com.standardize_mapping(into)
return into_c((k, maybe_box_native(v)) for k, v in self.items())

if is_object_dtype(self):
return into_c((k, maybe_box_native(v)) for k, v in self.items())
else:
# Not an object dtype => all types will be the same so let the default
# indexer return native python type
return into_c((k, v) for k, v in self.items())

def to_frame(self, name: Hashable = lib.no_default) -> DataFrame:
"""
Expand Down
76 changes: 76 additions & 0 deletions pandas/tests/frame/methods/test_to_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,3 +344,79 @@ def test_to_dict_orient_tight(self, index, columns):
roundtrip = DataFrame.from_dict(df.to_dict(orient="tight"), orient="tight")

tm.assert_frame_equal(df, roundtrip)

@pytest.mark.parametrize(
    "orient",
    ["dict", "list", "split", "records", "index", "tight"],
)
@pytest.mark.parametrize(
    "data,expected_types",
    (
        (
            {
                "a": [np.int64(1), 1, np.int64(3)],
                "b": [np.float64(1.0), 2.0, np.float64(3.0)],
                "c": [np.float64(1.0), 2, np.int64(3)],
                "d": [np.float64(1.0), "a", np.int64(3)],
                "e": [np.float64(1.0), ["a"], np.int64(3)],
                "f": [np.float64(1.0), ("a",), np.int64(3)],
            },
            {
                "a": [int, int, int],
                "b": [float, float, float],
                "c": [float, float, float],
                "d": [float, str, int],
                "e": [float, list, int],
                "f": [float, tuple, int],
            },
        ),
        (
            {
                "a": [1, 2, 3],
                "b": [1.1, 2.2, 3.3],
            },
            {
                "a": [int, int, int],
                "b": [float, float, float],
            },
        ),
    ),
)
def test_to_dict_return_types(self, orient, data, expected_types):
    """
    Check the value and exact Python type of every cell ``to_dict`` returns.

    Each orient's result is flattened into ``(row, column, value)`` triples,
    and every value is compared with the input cell and with the expected
    type for that cell.
    """
    # GH46470
    result = DataFrame(data).to_dict(orient)

    if orient in {"split", "tight"}:
        # Rows are addressed through the returned index labels, which line up
        # with positions in ``data`` for the frames built here.
        cells = [
            (row, col, result["data"][row][pos])
            for row in result["index"]
            for pos, col in enumerate(result["columns"])
        ]
    elif orient == "dict":
        cells = [
            (row, col, cell)
            for col, by_row in result.items()
            for row, cell in by_row.items()
        ]
    elif orient == "list":
        cells = [
            (row, col, cell)
            for col, column_values in result.items()
            for row, cell in enumerate(column_values)
        ]
    elif orient == "records":
        cells = [
            (row, col, cell)
            for row, record in enumerate(result)
            for col, cell in record.items()
        ]
    elif orient == "index":
        cells = [
            (row, col, cell)
            for row, record in result.items()
            for col, cell in record.items()
        ]

    for row, col, cell in cells:
        assert cell == data[col][row]
        assert type(cell) is expected_types[col][row]

0 comments on commit 31c11e1

Please sign in to comment.