In [12]:
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cdist
import dask
import dask.array as da

In [2]:
# Benchmark data: 1000 points with 1000 features each, uniform in [0, 1).
# A fixed seed on a local RandomState makes every Restart & Run All use the
# same matrix without touching NumPy's global random state.
SEED = 0
X = np.random.RandomState(SEED).rand(1000, 1000)

In [3]:
%%time
# Create a scikit-learn NearestNeighbors estimator (5 neighbours) and fit it to X.
# %%time measures construction + fit only; the query itself is timed in a later cell.
neigh = NearestNeighbors(n_neighbors=5)
neigh.fit(X)

CPU times: user 19.5 ms, sys: 0 ns, total: 19.5 ms
Wall time: 27.2 ms


In [4]:
# K-nearest neighbours
%timeit neigh.kneighbors(X[0].reshape(1, -1))

1.62 ms ± 29.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [5]:
# Euclidean hand made Non-vectorised
%timeit distances = np.array([np.linalg.norm(vec - X[0]) for vec in X])

10.7 ms ± 462 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [6]:
# Euclidean, hand-made and vectorised: broadcasting subtracts row X[0] from every
# row of X, then one norm call reduces along axis 1 (one distance per row of X).
%timeit distances = np.linalg.norm(X - X[0], axis=1) # vectorised

3.74 ms ± 55.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [7]:
# Cosine scikit-learn
%timeit distances = cosine_similarity(X, X[0].reshape(1, -1))

14 ms ± 1.04 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [8]:
# Euclidean distance scikit learn
%timeit euclidean_distances(X[0].reshape(1, -1), X)

1.16 ms ± 38 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [9]:
%timeit np.asarray(cdist(X[0].reshape(1, -1), X)) #np.asarray([cdist(x,y) for x, y in zip(A, B)])

449 µs ± 6.04 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [22]:
# dask setup: wrap the in-memory matrix X as a dask array split into 500 x 500 chunks
X_dask = da.from_array(X, chunks=(500, 500))

# Materialise the dask array once and check that it round-trips to the original
# data, so the dask timing runs on the same values as the NumPy timings.
result = X_dask.compute()
assert result.shape == X.shape and (result == X).all()

In [21]:
%timeit distances = np.linalg.norm(X_dask - X_dask[0], axis=1) # vectorised and with dask

13.7 ms ± 133 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [10]:
# pure numpy with broadcasting
# NOTE(review): both drafts below are disabled and would fail if uncommented as-is:
# X is created as a 2-D array, so X[:,:,np.newaxis,:] indexes more axes than X has,
# and B is not defined in any cell visible here.
#%timeit dists2 = np.sqrt( np.sum( (X[:,:,np.newaxis,:] - B[:,np.newaxis,:,:])**2, axis=-1) )
#%timeit dists2 = np.sqrt( np.sum( (X[:,:,np.newaxis,:] - X[0])**2, axis=-1) )

In [11]:
# todo try:
# - https://github.com/droyed/eucl_dist
# - linear_kernel
# - manual cosine
# - safe_sparse_dot
# - general calculations above with speed metrics