Skip to content

Commit

Permalink
API: DataFrame.sparse accessor
Browse files Browse the repository at this point in the history
  • Loading branch information
TomAugspurger committed Mar 12, 2019
1 parent 21769e9 commit 24f48c3
Show file tree
Hide file tree
Showing 6 changed files with 265 additions and 28 deletions.
23 changes: 23 additions & 0 deletions doc/source/reference/frame.rst
Expand Up @@ -312,6 +312,29 @@ specific plotting methods of the form ``DataFrame.plot.<kind>``.
DataFrame.boxplot
DataFrame.hist


.. _api.frame.sparse:

Sparse Accessor
~~~~~~~~~~~~~~~

Sparse-dtype specific methods and attributes are provided under the
``DataFrame.sparse`` accessor.

.. autosummary::
:toctree: api/
:template: autosummary/accessor_attribute.rst

DataFrame.sparse.density

.. autosummary::
:toctree: api/

DataFrame.sparse.from_spmatrix
DataFrame.sparse.to_coo
DataFrame.sparse.to_dense


Serialization / IO / Conversion
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.25.0.rst
Expand Up @@ -26,6 +26,7 @@ Other Enhancements
- :meth:`DataFrame.set_index` now works for instances of ``abc.Iterator``, provided their output is of the same length as the calling frame (:issue:`22484`, :issue:`24984`)
- :meth:`DatetimeIndex.union` now supports the ``sort`` argument. The behaviour of the sort parameter matches that of :meth:`Index.union` (:issue:`24994`)
- :meth:`DataFrame.rename` now supports the ``errors`` argument to raise errors when attempting to rename nonexistent keys (:issue:`13473`)
- Added :ref:`api.frame.sparse` for working with a ``DataFrame`` whose values are sparse (:issue:`25681`)

.. _whatsnew_0250.api_breaking:

Expand Down
164 changes: 159 additions & 5 deletions pandas/core/arrays/sparse.py
Expand Up @@ -678,6 +678,36 @@ def _simple_new(cls, sparse_array, sparse_index, dtype):
new._dtype = dtype
return new

@classmethod
def from_spmatrix(cls, data):
    """
    Create a SparseArray from a scipy.sparse matrix.

    Parameters
    ----------
    data : scipy.sparse.spmatrix
        This should be a 2-D SciPy sparse matrix where the size
        of the second dimension is 1. In other words, a
        sparse matrix with a single column.

    Returns
    -------
    SparseArray
        Holds the stored entries of ``data``; the fill value is zero
        of the matrix's dtype.

    Raises
    ------
    ValueError
        If ``data`` is not 2-dimensional or does not have exactly
        one column.
    """
    # Validate with real exceptions; ``assert`` statements are stripped
    # when Python runs with ``-O``.
    if data.ndim != 2:
        raise ValueError(
            "'data' must be 2-dimensional, not {ndim}-dimensional"
            .format(ndim=data.ndim))

    length, ncol = data.shape

    if ncol != 1:
        raise ValueError(
            "'data' must have a single column, not '{ncol}'"
            .format(ncol=ncol))

    # Work on a CSC copy with sorted row indices. For a single column,
    # ``data.data`` and ``data.indices`` are then parallel arrays that
    # cover every stored entry (explicitly stored zeros included), so the
    # values and their positions cannot get out of step. ``nonzero()``
    # would silently drop stored zeros while ``data.data`` keeps them.
    # NOTE(review): IntIndex is assumed to need increasing positions.
    data = data.tocsc().sorted_indices()
    arr = data.data
    idx = data.indices

    # ``.item()`` turns the NumPy zero into a plain Python scalar.
    zero = np.array(0, dtype=arr.dtype).item()
    dtype = SparseDtype(arr.dtype, zero)
    index = IntIndex(length, idx)

    return cls._simple_new(arr, index, dtype)

def __array__(self, dtype=None, copy=True):
fill_value = self.fill_value

Expand Down Expand Up @@ -1891,6 +1921,9 @@ def _make_index(length, indices, kind):
# ----------------------------------------------------------------------------
# Accessor

# Error message raised (as AttributeError) by the ``.sparse`` accessors'
# ``_validate`` methods when the data is not sparse.
_validation_msg = "Can only use the '.sparse' accessor with Sparse data."


@delegate_names(SparseArray, ['npoints', 'density', 'fill_value',
'sp_values'],
typ='property')
Expand All @@ -1900,15 +1933,13 @@ class SparseAccessor(PandasDelegate):
"""

def __init__(self, data=None):
self._validate(data)
# Store the Series since we need that for to_coo
self._parent = data
self._validate(data)

@staticmethod
def _validate(data):
def _validate(self, data):
if not isinstance(data.dtype, SparseDtype):
msg = "Can only use the '.sparse' accessor with Sparse data."
raise AttributeError(msg)
raise AttributeError(_validation_msg)

def _delegate_property_get(self, name, *args, **kwargs):
return getattr(self._parent.values, name)
Expand Down Expand Up @@ -2025,3 +2056,126 @@ def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False):
column_levels,
sort_labels=sort_labels)
return A, rows, columns

def to_dense(self):
    """
    Convert the parent Series from sparse values to dense values.

    Returns
    -------
    Series
        Built from the densified array, keeping the parent's index
        and name.
    """
    from pandas import Series

    parent = self._parent
    dense_values = parent.array.to_dense()
    return Series(dense_values, index=parent.index, name=parent.name)


class SparseFrameAccessor(PandasDelegate):
    """
    DataFrame accessor for sparse data.

    Every column of the DataFrame must have a SparseDtype; otherwise
    constructing the accessor raises AttributeError.
    """

    def __init__(self, data=None):
        # Store the DataFrame; every method below reads its columns,
        # index or dtypes.
        self._parent = data
        self._validate(data)

    def _validate(self, data):
        """Raise AttributeError unless every column dtype is sparse."""
        dtypes = data.dtypes
        if not all(isinstance(t, SparseDtype) for t in dtypes):
            raise AttributeError(_validation_msg)

    @classmethod
    def from_spmatrix(cls, data, index=None, columns=None):
        """
        Create a new DataFrame from a scipy sparse matrix.

        Parameters
        ----------
        data : scipy.sparse.spmatrix
            Must be convertible to csc format.
        index, columns : Index, optional
            Row and column labels to use for the resulting DataFrame.
            Defaults to a RangeIndex.

        Returns
        -------
        DataFrame
            Each column of the DataFrame is stored as a SparseArray.

        Raises
        ------
        ValueError
            If the length of ``index`` or ``columns`` does not match the
            shape of ``data``.

        Examples
        --------
        >>> import scipy.sparse
        >>> mat = scipy.sparse.eye(3)
        >>> pd.DataFrame.sparse.from_spmatrix(mat)
             0    1    2
        0  1.0  0.0  0.0
        1  0.0  1.0  0.0
        2  0.0  0.0  1.0
        """
        from pandas import DataFrame

        data = data.tocsc()
        index, columns = cls._prep_index(data, index, columns)
        sparrays = [
            SparseArray.from_spmatrix(data[:, i])
            for i in range(data.shape[1])
        ]
        # Key the arrays by position and attach the labels afterwards.
        # Keying the dict by label would collapse duplicate labels and
        # make the column order depend on dict iteration order.
        by_position = dict(enumerate(sparrays))
        result = DataFrame(by_position, index=index)
        result.columns = columns
        return result

    def to_dense(self):
        """
        Convert to dense DataFrame

        Returns
        -------
        df : DataFrame
            Same index and columns as the parent, with each column's
            array densified.
        """
        from pandas import DataFrame

        data = {k: v.array.to_dense()
                for k, v in self._parent.items()}
        return DataFrame(data,
                         index=self._parent.index,
                         columns=self._parent.columns)

    def to_coo(self):
        """
        Return the contents of the frame as a scipy.sparse COO matrix.

        Returns
        -------
        scipy.sparse.coo_matrix
            Has the same shape as the DataFrame. Its dtype is the common
            type of the column dtypes (the subtype, when that common type
            is itself sparse).

        Raises
        ------
        ImportError
            If SciPy cannot be imported.

        Notes
        -----
        Only the explicitly stored values of each column are written to
        the matrix.
        NOTE(review): this presumably assumes a fill value of zero for
        every column -- confirm.
        """
        try:
            from scipy.sparse import coo_matrix
        except ImportError:
            raise ImportError('Scipy is not installed')

        # Pass a plain list: ``dtypes`` is a Series indexed by column
        # label, so positional access or truth-testing on it inside
        # ``find_common_type`` is ambiguous (e.g. integer column labels).
        dtype = find_common_type(list(self._parent.dtypes))
        if isinstance(dtype, SparseDtype):
            dtype = dtype.subtype

        cols, rows, datas = [], [], []
        for col, name in enumerate(self._parent):
            s = self._parent[name]
            row = s.array.sp_index.to_int_index().indices
            # One column position per stored value in this column.
            cols.append(np.repeat(col, len(row)))
            rows.append(row)
            datas.append(s.array.sp_values.astype(dtype, copy=False))

        cols = np.concatenate(cols)
        rows = np.concatenate(rows)
        datas = np.concatenate(datas)
        return coo_matrix((datas, (rows, cols)), shape=self._parent.shape)

    @property
    def density(self):
        """
        Ratio of non-sparse points to total (dense) data points
        represented in the DataFrame.
        """
        return np.mean([column.array.density
                        for _, column in self._parent.items()])

    @staticmethod
    def _prep_index(data, index, columns):
        """
        Fill in default row/column labels and check their lengths.

        ``data`` only needs a 2-tuple ``shape``. Missing labels default to
        ``default_index`` of the matching length; a length mismatch raises
        ValueError.
        """
        import pandas.core.indexes.base as ibase

        N, K = data.shape
        if index is None:
            index = ibase.default_index(N)
        if columns is None:
            columns = ibase.default_index(K)

        if len(columns) != K:
            raise ValueError('Column length mismatch: {columns} vs. {K}'
                             .format(columns=len(columns), K=K))
        if len(index) != N:
            raise ValueError('Index length mismatch: {index} vs. {N}'
                             .format(index=len(index), N=N))
        return index, columns
2 changes: 2 additions & 0 deletions pandas/core/frame.py
Expand Up @@ -36,6 +36,7 @@
PY36, raise_with_traceback, Iterator,
string_and_binary_types)
from pandas.compat.numpy import function as nv
from pandas.core.arrays.sparse import SparseFrameAccessor
from pandas.core.dtypes.cast import (
maybe_upcast,
cast_scalar_to_array,
Expand Down Expand Up @@ -8009,6 +8010,7 @@ def isin(self, values):
plot = CachedAccessor("plot", gfx.FramePlotMethods)
hist = gfx.hist_frame
boxplot = gfx.boxplot_frame
sparse = CachedAccessor("sparse", SparseFrameAccessor)


DataFrame._setup_axes(['index', 'columns'], info_axis=1, stat_axis=0,
Expand Down
27 changes: 4 additions & 23 deletions pandas/core/sparse/frame.py
Expand Up @@ -14,12 +14,12 @@
from pandas.compat.numpy import function as nv
from pandas.util._decorators import Appender

from pandas.core.dtypes.cast import find_common_type, maybe_upcast
from pandas.core.dtypes.cast import maybe_upcast
from pandas.core.dtypes.common import ensure_platform_int, is_scipy_sparse
from pandas.core.dtypes.missing import isna, notna

import pandas.core.algorithms as algos
from pandas.core.arrays.sparse import SparseArray, SparseDtype
from pandas.core.arrays.sparse import SparseArray
import pandas.core.common as com
from pandas.core.frame import DataFrame
import pandas.core.generic as generic
Expand Down Expand Up @@ -271,27 +271,8 @@ def to_coo(self):
float32. By numpy.find_common_type convention, mixing int64
and uint64 will result in a float64 dtype.
"""
try:
from scipy.sparse import coo_matrix
except ImportError:
raise ImportError('Scipy is not installed')

dtype = find_common_type(self.dtypes)
if isinstance(dtype, SparseDtype):
dtype = dtype.subtype

cols, rows, datas = [], [], []
for col, name in enumerate(self):
s = self[name]
row = s.sp_index.to_int_index().indices
cols.append(np.repeat(col, len(row)))
rows.append(row)
datas.append(s.sp_values.astype(dtype, copy=False))

cols = np.concatenate(cols)
rows = np.concatenate(rows)
datas = np.concatenate(datas)
return coo_matrix((datas, (rows, cols)), shape=self.shape)
from pandas.core.arrays.sparse import SparseFrameAccessor
return SparseFrameAccessor(self).to_coo()

def __array_wrap__(self, result):
return self._constructor(
Expand Down
76 changes: 76 additions & 0 deletions pandas/tests/arrays/sparse/test_accessor.py
@@ -0,0 +1,76 @@
import string

import numpy as np
import pytest

import pandas as pd
import pandas.util.testing as tm


class TestSeriesAccessor(object):
    # TODO: collect other accessor tests
    def test_to_dense(self):
        """Densifying a sparse Series gives the equivalent plain Series."""
        sparse_series = pd.Series([0, 1, 0, 10], dtype='Sparse[int]')
        dense_series = sparse_series.sparse.to_dense()
        tm.assert_series_equal(dense_series, pd.Series([0, 1, 0, 10]))


class TestFrameAccessor(object):
    """Tests for the ``DataFrame.sparse`` accessor."""

    @pytest.mark.parametrize('format', ['csc', 'csr', 'coo'])
    @pytest.mark.parametrize("labels", [
        None,
        list(string.ascii_letters[:10]),
    ])
    @pytest.mark.parametrize('dtype', ['float64', 'int64'])
    def test_from_spmatrix(self, format, labels, dtype):
        """An identity matrix in each format/dtype converts as expected."""
        pytest.importorskip("scipy")
        import scipy.sparse

        fill_value = np.array(0, dtype=dtype).item()
        sp_dtype = pd.SparseDtype(dtype, fill_value)

        identity = scipy.sparse.eye(10, format=format, dtype=dtype)
        result = pd.DataFrame.sparse.from_spmatrix(
            identity, index=labels, columns=labels
        )

        dense_frame = pd.DataFrame(
            np.eye(10, dtype=dtype),
            index=labels,
            columns=labels,
        )
        expected = dense_frame.astype(sp_dtype)
        tm.assert_frame_equal(result, expected)

    def test_to_coo(self):
        """``to_coo`` matches a COO matrix built from the dense values."""
        pytest.importorskip("scipy")
        import scipy.sparse

        columns = {
            "A": [0, 1, 0],
            "B": [1, 0, 0],
        }
        frame = pd.DataFrame(columns, dtype='Sparse[int64, 0]')
        result = frame.sparse.to_coo()
        expected = scipy.sparse.coo_matrix(np.asarray(frame))
        # No position may differ between the two matrices.
        difference = result != expected
        assert difference.nnz == 0

    def test_to_dense(self):
        """``to_dense`` keeps labels and densifies columns of mixed dtypes."""
        row_labels = ['b', 'a']
        int_fill_zero = pd.SparseDtype('int64', 0)
        int_fill_one = pd.SparseDtype('int64', 1)
        float_fill_zero = pd.SparseDtype('float64', 0.0)

        sparse_frame = pd.DataFrame({
            "A": pd.SparseArray([1, 0], dtype=int_fill_zero),
            "B": pd.SparseArray([1, 0], dtype=int_fill_one),
            "C": pd.SparseArray([1., 0.], dtype=float_fill_zero),
        }, index=row_labels)
        result = sparse_frame.sparse.to_dense()

        expected = pd.DataFrame({
            'A': [1, 0],
            'B': [1, 0],
            'C': [1.0, 0.0],
        }, index=row_labels)
        tm.assert_frame_equal(result, expected)

    def test_density(self):
        """Density is the fraction of stored points across the frame."""
        frame = pd.DataFrame({
            'A': pd.SparseArray([1, 0, 2, 1], fill_value=0),
            'B': pd.SparseArray([0, 1, 1, 1], fill_value=0),
        })
        assert frame.sparse.density == 0.75

0 comments on commit 24f48c3

Please sign in to comment.