Skip to content

Commit

Permalink
ENH: Add strings_as_fixed_length parameter for df.to_records() (pandas-dev#18146) (pandas-dev#22229)
Browse files Browse the repository at this point in the history

* ENH: Allow fixed-length strings in df.to_records()

Adds parameter to allow string-like columns to be
cast as fixed-length string-like dtypes for more
efficient storage.

Closes pandas-dev#18146.

Originally authored by @qinghao1 but cleaned up
by @gfyoung to fix merge conflicts.

* Add dtype parameters instead of fix-string-like

The original parameter was causing a lot of acrobatics
with regards to string dtypes between 2.x and 3.x.

The new parameters simplify the internal logic and
pass the responsibility and motivation of memory
efficiency back to the users.

* MAINT: Use is_dict_like in to_records

More generic than checking whether our
mappings are instances of dict.

Expands is_dict_like check to include
whether it has a __contains__ method.

* TST: Add test for is_dict_like expanded def

* MAINT: Address final comments
  • Loading branch information
qinghao1 authored and Pingviinituutti committed Feb 28, 2019
1 parent 162c6f3 commit 6f42e8e
Show file tree
Hide file tree
Showing 5 changed files with 272 additions and 4 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,7 @@ Other Enhancements
- :meth:`DataFrame.to_stata` and :class:`pandas.io.stata.StataWriter117` can write mixed string columns to Stata strl format (:issue:`23633`)
- :meth:`DataFrame.between_time` and :meth:`DataFrame.at_time` have gained the ``axis`` parameter (:issue:`8839`)
- The ``scatter_matrix``, ``andrews_curves``, ``parallel_coordinates``, ``lag_plot``, ``autocorrelation_plot``, ``bootstrap_plot``, and ``radviz`` plots from the ``pandas.plotting`` module are now accessible from calling :meth:`DataFrame.plot` (:issue:`11978`)
- :meth:`DataFrame.to_records` now accepts ``index_dtypes`` and ``column_dtypes`` parameters to allow different data types in stored column and index records (:issue:`18146`)
- :class:`IntervalIndex` has gained the :attr:`~IntervalIndex.is_overlapping` attribute to indicate if the ``IntervalIndex`` contains any overlapping intervals (:issue:`23309`)
- :func:`pandas.DataFrame.to_sql` has gained the ``method`` argument to control SQL insertion clause. See the :ref:`insertion method <io.sql.method>` section in the documentation. (:issue:`8953`)

Expand Down
5 changes: 4 additions & 1 deletion pandas/core/dtypes/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,8 +398,11 @@ def is_dict_like(obj):
>>> is_dict_like([1, 2, 3])
False
"""
for attr in ("__getitem__", "keys", "__contains__"):
if not hasattr(obj, attr):
return False

return hasattr(obj, '__getitem__') and hasattr(obj, 'keys')
return True


def is_named_tuple(obj):
Expand Down
92 changes: 89 additions & 3 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@
OrderedDict, PY36, raise_with_traceback,
string_and_binary_types)
from pandas.compat.numpy import function as nv

from pandas.core.dtypes.cast import (
maybe_upcast,
cast_scalar_to_array,
Expand All @@ -49,6 +48,7 @@
maybe_upcast_putmask,
find_common_type)
from pandas.core.dtypes.common import (
is_dict_like,
is_object_dtype,
is_extension_type,
is_extension_array_dtype,
Expand Down Expand Up @@ -1540,7 +1540,8 @@ def from_records(cls, data, index=None, exclude=None, columns=None,

return cls(mgr)

def to_records(self, index=True, convert_datetime64=None):
def to_records(self, index=True, convert_datetime64=None,
column_dtypes=None, index_dtypes=None):
"""
Convert DataFrame to a NumPy record array.
Expand All @@ -1557,6 +1558,20 @@ def to_records(self, index=True, convert_datetime64=None):
Whether to convert the index to datetime.datetime if it is a
DatetimeIndex.
column_dtypes : str, type, dict, default None
.. versionadded:: 0.24.0
If a string or type, the data type to store all columns. If
a dictionary, a mapping of column names and indices (zero-indexed)
to specific data types.
index_dtypes : str, type, dict, default None
.. versionadded:: 0.24.0
If a string or type, the data type to store all index levels. If
a dictionary, a mapping of index level names and indices
(zero-indexed) to specific data types.
This mapping is applied only if `index=True`.
Returns
-------
Expand Down Expand Up @@ -1598,6 +1613,23 @@ def to_records(self, index=True, convert_datetime64=None):
>>> df.to_records(index=False)
rec.array([(1, 0.5 ), (2, 0.75)],
dtype=[('A', '<i8'), ('B', '<f8')])
Data types can be specified for the columns:
>>> df.to_records(column_dtypes={"A": "int32"})
rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
dtype=[('I', 'O'), ('A', '<i4'), ('B', '<f8')])
As well as for the index:
>>> df.to_records(index_dtypes="<S2")
rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
dtype=[('I', 'S2'), ('A', '<i8'), ('B', '<f8')])
>>> index_dtypes = "<S{}".format(df.index.str.len().max())
>>> df.to_records(index_dtypes=index_dtypes)
rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
dtype=[('I', 'S1'), ('A', '<i8'), ('B', '<f8')])
"""

if convert_datetime64 is not None:
Expand All @@ -1620,20 +1652,74 @@ def to_records(self, index=True, convert_datetime64=None):

count = 0
index_names = list(self.index.names)

if isinstance(self.index, MultiIndex):
for i, n in enumerate(index_names):
if n is None:
index_names[i] = 'level_%d' % count
count += 1
elif index_names[0] is None:
index_names = ['index']

names = (lmap(compat.text_type, index_names) +
lmap(compat.text_type, self.columns))
else:
arrays = [self[c].get_values() for c in self.columns]
names = lmap(compat.text_type, self.columns)
index_names = []

index_len = len(index_names)
formats = []

for i, v in enumerate(arrays):
index = i

# When the names and arrays are collected, we
# first collect those in the DataFrame's index,
# followed by those in its columns.
#
# Thus, the total length of the array is:
# len(index_names) + len(DataFrame.columns).
#
# This check allows us to see whether we are
# handling a name / array in the index or column.
if index < index_len:
dtype_mapping = index_dtypes
name = index_names[index]
else:
index -= index_len
dtype_mapping = column_dtypes
name = self.columns[index]

# We have a dictionary, so we get the data type
# associated with the index or column (which can
# be denoted by its name in the DataFrame or its
# position in DataFrame's array of indices or
# columns, whichever is applicable.
if is_dict_like(dtype_mapping):
if name in dtype_mapping:
dtype_mapping = dtype_mapping[name]
elif index in dtype_mapping:
dtype_mapping = dtype_mapping[index]
else:
dtype_mapping = None

# If no mapping can be found, use the array's
# dtype attribute for formatting.
#
# A valid dtype must either be a type or
# string naming a type.
if dtype_mapping is None:
formats.append(v.dtype)
elif isinstance(dtype_mapping, (type, compat.string_types)):
formats.append(dtype_mapping)
else:
element = "row" if i < index_len else "column"
msg = ("Invalid dtype {dtype} specified for "
"{element} {name}").format(dtype=dtype_mapping,
element=element, name=name)
raise ValueError(msg)

formats = [v.dtype for v in arrays]
return np.rec.fromarrays(
arrays,
dtype={'names': names, 'formats': formats}
Expand Down
27 changes: 27 additions & 0 deletions pandas/tests/dtypes/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,33 @@ def test_is_dict_like_fails(ll):
assert not inference.is_dict_like(ll)


@pytest.mark.parametrize("has_keys", [True, False])
@pytest.mark.parametrize("has_getitem", [True, False])
@pytest.mark.parametrize("has_contains", [True, False])
def test_is_dict_like_duck_type(has_keys, has_getitem, has_contains):
    """Check is_dict_like against every subset of the mapping protocol.

    ``is_dict_like`` should return True only when the object exposes
    all three of ``keys``, ``__getitem__`` and ``__contains__``.
    """
    # Assemble the class dynamically so each protocol member is present
    # only when the corresponding flag is set.
    attrs = {"__init__": lambda self, d: setattr(self, "d", d)}

    if has_keys:
        attrs["keys"] = lambda self: self.d.keys()

    if has_getitem:
        attrs["__getitem__"] = lambda self, key: self.d.__getitem__(key)

    if has_contains:
        attrs["__contains__"] = lambda self, key: self.d.__contains__(key)

    DictLike = type("DictLike", (object,), attrs)

    obj = DictLike({1: 2})
    result = inference.is_dict_like(obj)
    expected = has_keys and has_getitem and has_contains

    assert result is expected


def test_is_file_like(mock):
class MockFile(object):
pass
Expand Down
151 changes: 151 additions & 0 deletions pandas/tests/frame/test_convert_to.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,157 @@ def test_to_records_with_categorical(self):
dtype=[('index', '=i8'), ('0', 'O')])
tm.assert_almost_equal(result, expected)

    @pytest.mark.parametrize("kwargs,expected", [
        # No dtypes --> default to array dtypes.
        (dict(),
         np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                      dtype=[("index", "<i8"), ("A", "<i8"),
                             ("B", "<f8"), ("C", "O")])),

        # Should have no effect in this case.
        (dict(index=True),
         np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                      dtype=[("index", "<i8"), ("A", "<i8"),
                             ("B", "<f8"), ("C", "O")])),

        # Column dtype applied across the board. Index unaffected.
        (dict(column_dtypes="<U4"),
         np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                      dtype=[("index", "<i8"), ("A", "<U4"),
                             ("B", "<U4"), ("C", "<U4")])),

        # Index dtype applied across the board. Columns unaffected.
        (dict(index_dtypes="<U1"),
         np.rec.array([("0", 1, 0.2, "a"), ("1", 2, 1.5, "bc")],
                      dtype=[("index", "<U1"), ("A", "<i8"),
                             ("B", "<f8"), ("C", "O")])),

        # Pass in a type instance.
        (dict(column_dtypes=np.unicode),
         np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                      dtype=[("index", "<i8"), ("A", "<U"),
                             ("B", "<U"), ("C", "<U")])),

        # Pass in a dictionary (name-only).
        (dict(column_dtypes={"A": np.int8, "B": np.float32, "C": "<U2"}),
         np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                      dtype=[("index", "<i8"), ("A", "i1"),
                             ("B", "<f4"), ("C", "<U2")])),

        # Pass in a dictionary (indices-only).
        (dict(index_dtypes={0: "int16"}),
         np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                      dtype=[("index", "i2"), ("A", "<i8"),
                             ("B", "<f8"), ("C", "O")])),

        # Ignore index mappings if index is not True.
        (dict(index=False, index_dtypes="<U2"),
         np.rec.array([(1, 0.2, "a"), (2, 1.5, "bc")],
                      dtype=[("A", "<i8"), ("B", "<f8"), ("C", "O")])),

        # Non-existent names / indices in mapping should not error.
        (dict(index_dtypes={0: "int16", "not-there": "float32"}),
         np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                      dtype=[("index", "i2"), ("A", "<i8"),
                             ("B", "<f8"), ("C", "O")])),

        # Names / indices not in mapping default to array dtype.
        (dict(column_dtypes={"A": np.int8, "B": np.float32}),
         np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                      dtype=[("index", "<i8"), ("A", "i1"),
                             ("B", "<f4"), ("C", "O")])),

        # Mixture of everything.
        (dict(column_dtypes={"A": np.int8, "B": np.float32},
              index_dtypes="<U2"),
         np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                      dtype=[("index", "<U2"), ("A", "i1"),
                             ("B", "<f4"), ("C", "O")])),

        # Invalid dtype values.
        (dict(index=False, column_dtypes=list()),
         "Invalid dtype \\[\\] specified for column A"),

        (dict(index=False, column_dtypes={"A": "int32", "B": 5}),
         "Invalid dtype 5 specified for column B"),
    ])
    def test_to_records_dtype(self, kwargs, expected):
        """Check the column_dtypes / index_dtypes arguments of to_records.

        When ``expected`` is a string, it is the error-message pattern
        that the invalid dtype mapping should raise; otherwise it is the
        exact record array the conversion should produce.
        """
        # see gh-18146
        df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})

        if isinstance(expected, str):
            with pytest.raises(ValueError, match=expected):
                df.to_records(**kwargs)
        else:
            result = df.to_records(**kwargs)
            tm.assert_almost_equal(result, expected)

    @pytest.mark.parametrize("df,kwargs,expected", [
        # MultiIndex in the index.
        (DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                   columns=list("abc")).set_index(["a", "b"]),
         dict(column_dtypes="float64", index_dtypes={0: "int32", 1: "int8"}),
         np.rec.array([(1, 2, 3.), (4, 5, 6.), (7, 8, 9.)],
                      dtype=[("a", "<i4"), ("b", "i1"), ("c", "<f8")])),

        # MultiIndex in the columns.
        (DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                   columns=MultiIndex.from_tuples([("a", "d"), ("b", "e"),
                                                   ("c", "f")])),
         dict(column_dtypes={0: "<U1", 2: "float32"}, index_dtypes="float32"),
         np.rec.array([(0., u"1", 2, 3.), (1., u"4", 5, 6.),
                       (2., u"7", 8, 9.)],
                      dtype=[("index", "<f4"),
                             ("('a', 'd')", "<U1"),
                             ("('b', 'e')", "<i8"),
                             ("('c', 'f')", "<f4")])),

        # MultiIndex in both the columns and index.
        (DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                   columns=MultiIndex.from_tuples([
                       ("a", "d"), ("b", "e"), ("c", "f")], names=list("ab")),
                   index=MultiIndex.from_tuples([
                       ("d", -4), ("d", -5), ("f", -6)], names=list("cd"))),
         dict(column_dtypes="float64", index_dtypes={0: "<U2", 1: "int8"}),
         np.rec.array([("d", -4, 1., 2., 3.), ("d", -5, 4., 5., 6.),
                       ("f", -6, 7, 8, 9.)],
                      dtype=[("c", "<U2"), ("d", "i1"),
                             ("('a', 'd')", "<f8"), ("('b', 'e')", "<f8"),
                             ("('c', 'f')", "<f8")]))
    ])
    def test_to_records_dtype_mi(self, df, kwargs, expected):
        """Check dtype mappings in to_records when a MultiIndex is present.

        Index levels can be targeted positionally via index_dtypes, and
        MultiIndex column labels are stringified tuples in the record
        array field names.
        """
        # see gh-18146
        result = df.to_records(**kwargs)
        tm.assert_almost_equal(result, expected)

def test_to_records_dict_like(self):
# see gh-18146
class DictLike(object):
def __init__(self, **kwargs):
self.d = kwargs.copy()

def __getitem__(self, key):
return self.d.__getitem__(key)

def __contains__(self, key):
return key in self.d

def keys(self):
return self.d.keys()

df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})

dtype_mappings = dict(column_dtypes=DictLike(**{"A": np.int8,
"B": np.float32}),
index_dtypes="<U2")

result = df.to_records(**dtype_mappings)
expected = np.rec.array([("0", "1", "0.2", "a"),
("1", "2", "1.5", "bc")],
dtype=[("index", "<U2"), ("A", "i1"),
("B", "<f4"), ("C", "O")])
tm.assert_almost_equal(result, expected)

@pytest.mark.parametrize('mapping', [
dict,
collections.defaultdict(list),
Expand Down

0 comments on commit 6f42e8e

Please sign in to comment.