From 6f42e8e4bc479ce2ea720f5a3cf102fcee23acfe Mon Sep 17 00:00:00 2001 From: Chu Qing Hao <6337103+qinghao1@users.noreply.github.com> Date: Sun, 30 Dec 2018 14:40:38 -0800 Subject: [PATCH] ENH: Add strings_as_fixed_length parameter for df.to_records() (#18146) (#22229) * ENH: Allow fixed-length strings in df.to_records() Adds parameter to allow string-like columns to be cast as fixed-length string-like dtypes for more efficient storage. Closes gh-18146. Originally authored by @qinghao1 but cleaned up by @gfyoung to fix merge conflicts. * Add dtype parameters instead of fix-string-like The original parameter was causing a lot of acrobatics with regards to string dtypes between 2.x and 3.x. The new parameters simplify the internal logic and pass the responsibility and motivation of memory efficiency back to the users. * MAINT: Use is_dict_like in to_records More generic than checking whether our mappings are instances of dict. Expands is_dict_like check to include whether it has a __contains__ method. 
* TST: Add test for is_dict_like expanded def * MAINT: Address final comments --- doc/source/whatsnew/v0.24.0.rst | 1 + pandas/core/dtypes/inference.py | 5 +- pandas/core/frame.py | 92 +++++++++++++++- pandas/tests/dtypes/test_inference.py | 27 +++++ pandas/tests/frame/test_convert_to.py | 151 ++++++++++++++++++++++++++ 5 files changed, 272 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index b4331aab3085f0..cca15f26cbf99e 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -411,6 +411,7 @@ Other Enhancements - :meth:`DataFrame.to_stata` and :class:`pandas.io.stata.StataWriter117` can write mixed string columns to Stata strl format (:issue:`23633`) - :meth:`DataFrame.between_time` and :meth:`DataFrame.at_time` have gained the ``axis`` parameter (:issue:`8839`) - The ``scatter_matrix``, ``andrews_curves``, ``parallel_coordinates``, ``lag_plot``, ``autocorrelation_plot``, ``bootstrap_plot``, and ``radviz`` plots from the ``pandas.plotting`` module are now accessible from calling :meth:`DataFrame.plot` (:issue:`11978`) +- :meth:`DataFrame.to_records` now accepts ``index_dtypes`` and ``column_dtypes`` parameters to allow different data types in stored column and index records (:issue:`18146`) - :class:`IntervalIndex` has gained the :attr:`~IntervalIndex.is_overlapping` attribute to indicate if the ``IntervalIndex`` contains any overlapping intervals (:issue:`23309`) - :func:`pandas.DataFrame.to_sql` has gained the ``method`` argument to control SQL insertion clause. See the :ref:`insertion method <io.sql.method>` section in the documentation.
(:issue:`8953`) diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index 241a1b471f677d..b11542622451ce 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -398,8 +398,11 @@ def is_dict_like(obj): >>> is_dict_like([1, 2, 3]) False """ + for attr in ("__getitem__", "keys", "__contains__"): + if not hasattr(obj, attr): + return False - return hasattr(obj, '__getitem__') and hasattr(obj, 'keys') + return True def is_named_tuple(obj): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 99ae551d3c55b9..99653248216f59 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -35,7 +35,6 @@ OrderedDict, PY36, raise_with_traceback, string_and_binary_types) from pandas.compat.numpy import function as nv - from pandas.core.dtypes.cast import ( maybe_upcast, cast_scalar_to_array, @@ -49,6 +48,7 @@ maybe_upcast_putmask, find_common_type) from pandas.core.dtypes.common import ( + is_dict_like, is_object_dtype, is_extension_type, is_extension_array_dtype, @@ -1540,7 +1540,8 @@ def from_records(cls, data, index=None, exclude=None, columns=None, return cls(mgr) - def to_records(self, index=True, convert_datetime64=None): + def to_records(self, index=True, convert_datetime64=None, + column_dtypes=None, index_dtypes=None): """ Convert DataFrame to a NumPy record array. @@ -1557,6 +1558,20 @@ def to_records(self, index=True, convert_datetime64=None): Whether to convert the index to datetime.datetime if it is a DatetimeIndex. + column_dtypes : str, type, dict, default None + .. versionadded:: 0.24.0 + + If a string or type, the data type to store all columns. If + a dictionary, a mapping of column names and indices (zero-indexed) + to specific data types. + index_dtypes : str, type, dict, default None + .. versionadded:: 0.24.0 + + If a string or type, the data type to store all index levels. If + a dictionary, a mapping of index level names and indices + (zero-indexed) to specific data types. 
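+ + This mapping is applied only if `index=True`. Returns ------- @@ -1598,6 +1613,23 @@ def to_records(self, index=True, convert_datetime64=None): >>> df.to_records(index=False) rec.array([(1, 0.5 ), (2, 0.75)], dtype=[('A', '<i8'), ('B', '<f8')]) + + Data types can be specified for the columns: + + >>> df.to_records(column_dtypes={"A": "int32"}) + rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)], + dtype=[('I', 'O'), ('A', '<i4'), ('B', '<f8')]) + + As well as for the index: + + >>> df.to_records(index_dtypes="<S2") + rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)], + dtype=[('I', 'S2'), ('A', '<i8'), ('B', '<f8')]) + + >>> index_dtypes = "<S{}".format(df.index.str.len().max()) + >>> df.to_records(index_dtypes=index_dtypes) + rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)], + dtype=[('I', 'S1'), ('A', '<i8'), ('B', '<f8')]) [...] + # No dtypes --> default to array dtypes. + (dict(), + np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")], + dtype=[("index", "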