In [1]:
import numpy as np

In [2]:
def sum_abundance_1(data, axis, cutoff=10, strict=False):
    '''Check if the sum abundance larger than cutoff.

    It can be used filter features with at least "cutoff" abundance
    total over all samples
    NOTE: this is a "fast" function working on the whole matrix

    Parameters
    ----------
    data : numpy 2d array or scipy.Sparse matrix
    axis : int
        0 to sum each feature, 1 to sum each sample
    cutoff : float
        keep features with sum>=cutoff
    strict : bool, optional
        False (default) to use sum >=cutoff
        True to use sum>cutoff (for removing 0 reads)

    Returns
    -------
    numpy.array
        Boolean array with one entry per feature (axis=0) or sample (axis=1), True if sum>=cutoff.

    Examples
    --------
    >>> np.sum(sum_abundance(np.array([[0, 1, 1]]), axis=1, cutoff=2)) == 1
    True
    >>> np.sum(sum_abundance(np.array([[0, 1, 1]]), axis=1, cutoff=2, strict=True)) == 0
    True
    >>> np.sum(sum_abundance(np.array([[0, 1, 1]]), axis=1, cutoff=2.01)) == 0
    True

    '''
    if strict:
        res = data.sum(axis=axis) > cutoff
    else:
        res = data.sum(axis=axis) >= cutoff
    if issparse(data):
        res = res.A1
    return res

In [3]:
def sum_abundance_2(data, axis, cutoff):
    predicate = lambda i: i.sum() >= cutoff
    return np.apply_along_axis(predicate, 1 - axis, data)

In [4]:
def prevalence_1(x, cutoff=1/10000, fraction=0.1):
    '''Check the prevalence of values above the cutoff.

    present (abundance >= cutoff) in at least "fraction" of samples

    Examples
    --------
    >>> prevalence(np.array([0, 1]))
    True
    >>> prevalence(np.array([0, 1, 2, 3]), 2, 0.5)
    True
    >>> prevalence(np.array([0, 1, 2]), 2, 0.51)
    False
    '''
    frac = np.sum(x >= cutoff, axis=0) / x.shape[0]
    return frac >= fraction


In [5]:
def prevalence_2(x, cutoff=1/10000, fraction=0.1):
    '''Check the prevalence of values above the cutoff.

    present (abundance >= cutoff) in at least "fraction" of samples

    Examples
    --------
    >>> prevalence(np.array([0, 1]))
    True
    >>> prevalence(np.array([0, 1, 2, 3]), 2, 0.5)
    True
    >>> prevalence(np.array([0, 1, 2]), 2, 0.51)
    False
    '''
    def predicate(i):
        frac = np.sum(i >= cutoff) / len(i)
        return frac >= fraction
    return np.apply_along_axis(predicate, 0, x)

In [6]:
from numpy.random import RandomState
from scipy.sparse import issparse, csr_matrix

In [7]:
rand = RandomState(9)

In [8]:
table = rand.normal(size=(100, 1000))

In [10]:
%timeit a = sum_abundance_1(table, axis=1, cutoff=1)

35.5 µs ± 381 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [11]:
%timeit b = sum_abundance_2(table, axis=1, cutoff=1)

5.11 ms ± 42.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [12]:
%timeit c = prevalence_1(table, cutoff=1)

128 µs ± 543 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [13]:
%timeit d = prevalence_2(table, cutoff=1)

9.13 ms ± 45.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [14]:
x = csr_matrix([[0, 0, 1], [1, 0, 0]])

In [19]:
np.apply_along_axis(sum, 0, x.todense().A1)

array(2, dtype=int64)

In [21]:
a = [slice(None)] * 2
a

[slice(None, None, None), slice(None, None, None)]

In [22]:
y=x[0,:]

In [23]:
x[:,np.array([True, False])]

<2x1 sparse matrix of type '<class 'numpy.int64'>'
	with 1 stored elements in Compressed Sparse Row format>