# Speed testing of frequency counts

In [2]:
import Orange
from scipy.stats import itemfreq
from pandas import crosstab
from numpy_indexed.funcs import count_table
import numpy as np
import timeit

def wrapper(func, *args, **kwargs):
    """Freeze a call to ``func`` into a callable that takes no arguments.

    The benchmarks below hand the result to ``timeit.timeit``, which calls
    its target with no arguments; the positional and keyword arguments given
    here are captured and forwarded on every invocation.

    Returns a zero-argument callable whose result is ``func(*args, **kwargs)``.
    """
    return lambda: func(*args, **kwargs)

In [3]:
# Benchmark size: how many values are drawn per column, and the bound passed
# to np.random.randint when generating them.
no_samples, no_unique_values = 10 ** 5, 10 ** 2

## Testing for an array with 1 column

In [6]:
# One column of random integer codes stored as float32. randint already
# returns a 1-D ndarray, so wrapping it in np.array() and transposing it
# (a no-op for 1-D data) is unnecessary.
x = np.random.randint(no_unique_values, size=no_samples).astype(np.float32)

# Every candidate is described as (label, function, positional args,
# keyword args) so the timing and reporting code is written only once.
candidates = (
    ("Orange bincount:", Orange.statistics.util.bincount, (x,), {}),
    ("Numpy unique:", np.unique, (x,), {"return_counts": True}),
    # The int32 cast is evaluated here, once, so it is excluded from the
    # timed np.bincount call.
    ("Numpy bincount:", np.bincount, (x.astype(np.int32, copy=False),), {}),
    # NOTE(review): scipy.stats.itemfreq appears to have been removed from
    # recent SciPy releases -- confirm against the installed version.
    ("Scipy itemfreq:", itemfreq, (x,), {}),
    ("Numpy_indexed count_table:", count_table, (x,), {}),
    ("Pandas crosstab:", crosstab, (x, 'freq'), {}),
)

for label, func, args, kwargs in candidates:
    wrapped = wrapper(func, *args, **kwargs)
    # Report the mean time per call, in seconds, over three runs.
    print(label, timeit.timeit(wrapped, number=3) / 3)

Orange binount: 0.00039311000000452623
Numpy unique: 0.0052230620000083645
Numpy bincount: 0.0002738416666640357
Scipy itemfreq: 0.009950036000001697
Numpy_indexed count_table: 0.025874018999995013
Pandas crosstab: 0.04655286700000261


## Testing for an array with 2 columns

In [9]:
# Two independently drawn columns, shape (no_samples, 2), stored as float32.
# The draws are stacked as rows and then transposed, as before, so the
# memory layout of the columns handed to the benchmarks stays the same.
x = np.vstack(
    [np.random.randint(no_unique_values, size=no_samples) for _ in range(2)]
).T.astype(np.float32)

def unique_count(*a):
    """Count the distinct rows obtained by placing the given arrays side by side.

    The arguments are stacked as columns, and every row of the resulting
    contiguous matrix is reinterpreted as a single ``np.void`` item spanning
    the whole row, which lets ``np.unique`` treat a row as one value.

    Returns the ``(unique_items, counts)`` pair produced by
    ``np.unique(..., return_counts=True)``; the unique items are the void
    representation of the rows, not the original numeric values.

    NOTE(review): rows are presumably compared byte-wise, so values that are
    numerically equal but stored differently (e.g. 0.0 and -0.0) may be
    counted separately -- confirm if that matters for the inputs used.
    """
    stacked = np.ascontiguousarray(np.column_stack(a))
    # One void item must cover a full row: item size times number of columns.
    row_nbytes = stacked.dtype.itemsize * len(a)
    row_items = stacked.view(np.dtype((np.void, row_nbytes)))
    return np.unique(row_items, return_counts=True)

# Time every two-column counting routine on the same pair of columns and
# print the mean seconds per call over three runs.
for label, func in (
    ("Orange contingency:", Orange.statistics.util.contingency),
    ("Numpy unique:", unique_count),
    ("Numpy_indexed count_table:", count_table),
    ("Pandas crosstab:", crosstab),
):
    wrapped = wrapper(func, x[:, 0], x[:, 1])
    print(label, timeit.timeit(wrapped, number=3) / 3)

Orange contingency: 0.011879872999998042
Numpy unique: 0.035038583999873175
Numpy_indexed count_table: 0.02883348799999415
Pandas crosstab: 0.3573879576665604


## Testing for an array with 3 or more columns

In [8]:
# Three independently drawn columns, shape (no_samples, 3), stored as float32.
x = np.vstack(
    [np.random.randint(no_unique_values, size=no_samples) for _ in range(3)]
).T.astype(np.float32)

# Each entry pairs a label with the function and the positional arguments it
# is timed with; crosstab takes the second and third columns as one list.
for label, func, args in (
    ("Numpy unique:", unique_count, (x[:, 0], x[:, 1], x[:, 2])),
    ("Numpy_indexed count_table:", count_table, (x[:, 0], x[:, 1], x[:, 2])),
    ("Pandas crosstab:", crosstab, (x[:, 0], [x[:, 1], x[:, 2]])),
):
    wrapped = wrapper(func, *args)
    # Mean seconds per call over three runs.
    print(label, timeit.timeit(wrapped, number=3) / 3)

Numpy unique: 0.04339484199999788
Numpy_indexed count_table: 0.0477379963332775
Pandas crosstab: 3.3469604909999666
