Skip to content

Commit

Permalink
Squashed commit of the following:
Browse files Browse the repository at this point in the history
commit 8b136bf
Merge: 3005aed 01d3dc2
Author: Tom Augspurger <tom.w.augspurger@gmail.com>
Date:   Fri Mar 15 16:03:23 2019 -0500

    Merge remote-tracking branch 'upstream/master' into sparse-frame-accessor

commit 3005aed
Author: Tom Augspurger <tom.w.augspurger@gmail.com>
Date:   Thu Mar 14 06:26:32 2019 -0500

    isort?

commit 318c06f
Merge: 0922296 79205ea
Author: Tom Augspurger <tom.w.augspurger@gmail.com>
Date:   Thu Mar 14 06:25:45 2019 -0500

    Merge remote-tracking branch 'upstream/master' into sparse-frame-accessor

commit 0922296
Author: Tom Augspurger <tom.w.augspurger@gmail.com>
Date:   Wed Mar 13 21:35:51 2019 -0500

    updates

commit f433be8
Author: Tom Augspurger <tom.w.augspurger@gmail.com>
Date:   Wed Mar 13 20:54:07 2019 -0500

    lint

commit 6696f28
Merge: 534a379 1017382
Author: Tom Augspurger <tom.w.augspurger@gmail.com>
Date:   Wed Mar 13 20:53:13 2019 -0500

    Merge remote-tracking branch 'upstream/master' into sparse-frame-accessor

commit 534a379
Merge: 94a7baf 5c341dc
Author: Tom Augspurger <tom.w.augspurger@gmail.com>
Date:   Tue Mar 12 14:37:27 2019 -0500

    Merge remote-tracking branch 'upstream/master' into sparse-frame-accessor

commit 94a7baf
Author: Tom Augspurger <tom.w.augspurger@gmail.com>
Date:   Tue Mar 12 14:22:48 2019 -0500

    fixups

commit 6f619b5
Author: Tom Augspurger <tom.w.augspurger@gmail.com>
Date:   Tue Mar 12 13:38:48 2019 -0500

    32-bit compat

commit 24f48c3
Author: Tom Augspurger <tom.w.augspurger@gmail.com>
Date:   Mon Mar 11 22:05:46 2019 -0500

    API: DataFrame.sparse accessor

    Closes pandas-dev#25681
  • Loading branch information
TomAugspurger committed Mar 15, 2019
1 parent 01d3dc2 commit 02cb410
Show file tree
Hide file tree
Showing 7 changed files with 391 additions and 82 deletions.
23 changes: 23 additions & 0 deletions doc/source/reference/frame.rst
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,29 @@ specific plotting methods of the form ``DataFrame.plot.<kind>``.
DataFrame.boxplot
DataFrame.hist


.. _api.frame.sparse:

Sparse Accessor
~~~~~~~~~~~~~~~

Sparse-dtype specific methods and attributes are provided under the
``DataFrame.sparse`` accessor.

.. autosummary::
   :toctree: api/
   :template: autosummary/accessor_attribute.rst

   DataFrame.sparse.density

.. autosummary::
   :toctree: api/

   DataFrame.sparse.from_spmatrix
   DataFrame.sparse.to_coo
   DataFrame.sparse.to_dense


Serialization / IO / Conversion
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ Other Enhancements
- :meth:`DataFrame.set_index` now works for instances of ``abc.Iterator``, provided their output is of the same length as the calling frame (:issue:`22484`, :issue:`24984`)
- :meth:`DatetimeIndex.union` now supports the ``sort`` argument. The behaviour of the sort parameter matches that of :meth:`Index.union` (:issue:`24994`)
- :meth:`DataFrame.rename` now supports the ``errors`` argument to raise errors when attempting to rename nonexistent keys (:issue:`13473`)
- Added :ref:`api.frame.sparse` for working with a ``DataFrame`` whose values are sparse (:issue:`25681`)
- :class:`RangeIndex` has gained :attr:`~RangeIndex.start`, :attr:`~RangeIndex.stop`, and :attr:`~RangeIndex.step` attributes (:issue:`25710`)

.. _whatsnew_0250.api_breaking:
Expand Down
261 changes: 250 additions & 11 deletions pandas/core/arrays/sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -678,6 +678,55 @@ def _simple_new(cls, sparse_array, sparse_index, dtype):
new._dtype = dtype
return new

@classmethod
def from_spmatrix(cls, data):
    """
    Create a SparseArray from a scipy.sparse matrix.

    .. versionadded:: 0.25.0

    Parameters
    ----------
    data : scipy.sparse.spmatrix
        This should be a SciPy sparse matrix where the size
        of the second dimension is 1. In other words, a
        sparse matrix with a single column.

    Returns
    -------
    SparseArray

    Raises
    ------
    ValueError
        If `data` does not have exactly one column.

    Examples
    --------
    >>> import scipy.sparse
    >>> mat = scipy.sparse.coo_matrix((4, 1))
    >>> pd.SparseArray.from_spmatrix(mat)
    [0.0, 0.0, 0.0, 0.0]
    Fill: 0.0
    IntIndex
    Indices: array([], dtype=int32)
    """
    length, ncol = data.shape

    if ncol != 1:
        raise ValueError(
            "'data' must have a single column, not '{}'".format(ncol)
        )

    # Our sparse index classes require that the positions be strictly
    # increasing. Read the values and their row positions from the same
    # sorted CSC representation so they always stay aligned:
    # ``data.nonzero()`` drops explicitly stored zeros while ``data.data``
    # keeps them, so pairing those two misassigns values.
    # ``sorted_indices`` returns a copy, so the caller's matrix is neither
    # mutated nor shares memory with the result.
    csc = data.tocsc().sorted_indices()
    arr = csc.data
    idx = csc.indices

    # The fill value is a zero of the matrix's own dtype.
    zero = np.array(0, dtype=arr.dtype).item()
    dtype = SparseDtype(arr.dtype, zero)
    index = IntIndex(length, idx)

    return cls._simple_new(arr, index, dtype)

def __array__(self, dtype=None, copy=True):
fill_value = self.fill_value

Expand Down Expand Up @@ -1891,27 +1940,32 @@ def _make_index(length, indices, kind):
# ----------------------------------------------------------------------------
# Accessor


class BaseAccessor(object):
    """
    Shared base for the ``.sparse`` accessors.

    The constructor stores the wrapped object on ``_parent`` and then runs
    ``_validate`` on it. Subclasses must override ``_validate``; the base
    implementation always raises ``NotImplementedError``.
    """

    # Message subclasses can raise when the wrapped object is not sparse.
    _validation_msg = "Can only use the '.sparse' accessor with Sparse data."

    def __init__(self, data=None):
        # The parent is recorded first, then checked.
        self._parent = data
        self._validate(data)

    def _validate(self, data):
        """Check ``data``; must be implemented by subclasses."""
        raise NotImplementedError


@delegate_names(SparseArray, ['npoints', 'density', 'fill_value',
'sp_values'],
typ='property')
class SparseAccessor(PandasDelegate):
class SparseAccessor(BaseAccessor, PandasDelegate):
"""
Accessor for sparse data, including conversion from other sparse matrix data types.
"""

def __init__(self, data=None):
self._validate(data)
# Store the Series since we need that for to_coo
self._parent = data

@staticmethod
def _validate(data):
def _validate(self, data):
if not isinstance(data.dtype, SparseDtype):
msg = "Can only use the '.sparse' accessor with Sparse data."
raise AttributeError(msg)
raise AttributeError(self._validation_msg)

def _delegate_property_get(self, name, *args, **kwargs):
return getattr(self._parent.values, name)
return getattr(self._parent.array, name)

def _delegate_method(self, name, *args, **kwargs):
if name == 'from_coo':
Expand Down Expand Up @@ -2025,3 +2079,188 @@ def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False):
column_levels,
sort_labels=sort_labels)
return A, rows, columns

def to_dense(self):
    """
    Convert a Series from sparse values to dense.

    .. versionadded:: 0.25.0

    Returns
    -------
    Series
        A Series with the same values, index and name as the parent,
        stored as a dense array.

    Examples
    --------
    >>> series = pd.Series(pd.SparseArray([0, 1, 0]))
    >>> series
    0    0
    1    1
    2    0
    dtype: Sparse[int64, 0]

    >>> series.sparse.to_dense()
    0    0
    1    1
    2    0
    dtype: int64
    """
    from pandas import Series

    parent = self._parent
    dense_values = parent.array.to_dense()
    return Series(dense_values, index=parent.index, name=parent.name)


class SparseFrameAccessor(BaseAccessor, PandasDelegate):
    """
    DataFrame accessor for sparse data.

    .. versionadded:: 0.25.0
    """

    def _validate(self, data):
        # The accessor is only available when every column is sparse.
        dtypes = data.dtypes
        if not all(isinstance(t, SparseDtype) for t in dtypes):
            raise AttributeError(self._validation_msg)

    @classmethod
    def from_spmatrix(cls, data, index=None, columns=None):
        """
        Create a new DataFrame from a scipy sparse matrix.

        .. versionadded:: 0.25.0

        Parameters
        ----------
        data : scipy.sparse.spmatrix
            Must be convertible to csc format.
        index, columns : Index, optional
            Row and column labels to use for the resulting DataFrame.
            Defaults to a RangeIndex.

        Returns
        -------
        DataFrame
            Each column of the DataFrame is stored as a
            :class:`SparseArray`.

        Raises
        ------
        ValueError
            If the length of `index` or `columns` does not match the
            shape of `data`.

        Examples
        --------
        >>> import scipy.sparse
        >>> mat = scipy.sparse.eye(3)
        >>> pd.DataFrame.sparse.from_spmatrix(mat)
             0    1    2
        0  1.0  0.0  0.0
        1  0.0  1.0  0.0
        2  0.0  0.0  1.0
        """
        from pandas import DataFrame

        data = data.tocsc()
        index, columns = cls._prep_index(data, index, columns)
        sparrays = [
            SparseArray.from_spmatrix(data[:, i])
            for i in range(data.shape[1])
        ]
        # Build the frame keyed by column position and only then attach the
        # requested labels. Keying a dict by the labels themselves would
        # silently collapse duplicate labels and does not pin the column
        # order on interpreters whose dicts are unordered.
        positions = list(range(len(sparrays)))
        result = DataFrame(dict(zip(positions, sparrays)),
                           index=index, columns=positions)
        result.columns = columns
        return result

    def to_dense(self):
        """
        Convert a DataFrame with sparse values to dense.

        .. versionadded:: 0.25.0

        Returns
        -------
        DataFrame
            A DataFrame with the same values stored as dense arrays.

        Examples
        --------
        >>> df = pd.DataFrame({"A": pd.SparseArray([0, 1, 0])})
        >>> df.sparse.to_dense()
           A
        0  0
        1  1
        2  0
        """
        from pandas import DataFrame

        data = {k: v.array.to_dense()
                for k, v in compat.iteritems(self._parent)}
        return DataFrame(data,
                         index=self._parent.index,
                         columns=self._parent.columns)

    def to_coo(self):
        """
        Return the contents of the frame as a sparse SciPy COO matrix.

        .. versionadded:: 0.25.0

        Returns
        -------
        coo_matrix : scipy.sparse.spmatrix
            If the caller is heterogeneous and contains booleans or objects,
            the result will be of dtype=object. See Notes.

        Raises
        ------
        ImportError
            If SciPy is not installed.

        Notes
        -----
        The dtype will be the lowest-common-denominator type (implicit
        upcasting); that is to say if the dtypes (even of numeric types)
        are mixed, the one that accommodates all will be chosen.

        e.g. If the dtypes are float16 and float32, dtype will be upcast to
        float32. By numpy.find_common_type convention, mixing int64 and
        uint64 will result in a float64 dtype.
        """
        try:
            from scipy.sparse import coo_matrix
        except ImportError:
            raise ImportError('Scipy is not installed')

        # Use the common dtype of all columns; for a sparse dtype the
        # values are stored as its subtype.
        dtype = find_common_type(self._parent.dtypes)
        if isinstance(dtype, SparseDtype):
            dtype = dtype.subtype

        # Collect (row, col, value) triplets column by column.
        cols, rows, datas = [], [], []
        for col, name in enumerate(self._parent):
            s = self._parent[name]
            row = s.array.sp_index.to_int_index().indices
            cols.append(np.repeat(col, len(row)))
            rows.append(row)
            datas.append(s.array.sp_values.astype(dtype, copy=False))

        cols = np.concatenate(cols)
        rows = np.concatenate(rows)
        datas = np.concatenate(datas)
        return coo_matrix((datas, (rows, cols)), shape=self._parent.shape)

    @property
    def density(self):
        """
        Ratio of non-sparse points to total (dense) data points
        represented in the DataFrame.
        """
        # Computed as the mean of the per-column densities.
        return np.mean([column.array.density
                        for _, column in compat.iteritems(self._parent)])

    @staticmethod
    def _prep_index(data, index, columns):
        """
        Fill in default row/column labels and check their lengths.

        Labels that are None are replaced by a default index of the
        matching dimension of ``data.shape``; a ValueError is raised when
        a label length disagrees with that shape.
        """
        import pandas.core.indexes.base as ibase

        N, K = data.shape
        if index is None:
            index = ibase.default_index(N)
        if columns is None:
            columns = ibase.default_index(K)

        if len(columns) != K:
            raise ValueError('Column length mismatch: {columns} vs. {K}'
                             .format(columns=len(columns), K=K))
        if len(index) != N:
            raise ValueError('Index length mismatch: {index} vs. {N}'
                             .format(index=len(index), N=N))
        return index, columns
2 changes: 2 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
PY36, raise_with_traceback, Iterator,
string_and_binary_types)
from pandas.compat.numpy import function as nv
from pandas.core.arrays.sparse import SparseFrameAccessor
from pandas.core.dtypes.cast import (
maybe_upcast,
cast_scalar_to_array,
Expand Down Expand Up @@ -8009,6 +8010,7 @@ def isin(self, values):
plot = CachedAccessor("plot", gfx.FramePlotMethods)
hist = gfx.hist_frame
boxplot = gfx.boxplot_frame
sparse = CachedAccessor("sparse", SparseFrameAccessor)


DataFrame._setup_axes(['index', 'columns'], info_axis=1, stat_axis=0,
Expand Down
Loading

0 comments on commit 02cb410

Please sign in to comment.