In [1]:
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cdist

In [2]:
# Benchmark data: 1000 samples with 1000 features each, uniform in [0, 1).
# A fixed seed makes the matrix identical on every Restart & Run All, so the
# timings (and any distances inspected by hand) are comparable between runs.
# RandomState is used instead of np.random.seed so no global RNG state is touched.
SEED = 42
X = np.random.RandomState(SEED).rand(1000, 1000)

In [4]:
%%time
# NOTE: the %%time cell magic has to remain the first line of the cell, which
# is why these notes sit below it rather than above.
# Fit a NearestNeighbors model on every row of X. n_neighbors=5 is the number
# of neighbours that later kneighbors() calls on `neigh` return by default.
neigh = NearestNeighbors(n_neighbors=5)
neigh.fit(X)

Wall time: 49.8 ms


In [5]:
# K-nearest neighbours
# Query the fitted `neigh` model with the first row of X. reshape(1, -1) turns
# that row into a single-sample 2-D array before it is passed to kneighbors.
# NOTE(review): `neigh` was built with n_neighbors=5, so this call looks up only
# the nearest neighbours, whereas the cells below compute the distance to every
# row of X — keep that in mind when comparing the timings.
%timeit neigh.kneighbors(X[0].reshape(1, -1))

3.72 ms ± 117 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [6]:
# Euclidean hand made Non-vectorised
# Python-level loop: one np.linalg.norm call per row of X, each taking the
# norm of (row - X[0]); the per-row results are collected into one array.
%timeit distances = np.array([np.linalg.norm(vec - X[0]) for vec in X])

8.39 ms ± 394 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [7]:
# Euclidean hand made vectorised
# X - X[0] broadcasts the first row against every row of X, producing a
# temporary array with the same shape as X; the norm is then taken along
# axis=1, i.e. one value per row.
# NOTE(review): presumably that full-size temporary is why the recorded timing
# for this cell is slower than the loop version — confirm with a profiler.
%timeit distances = np.linalg.norm(X - X[0], axis=1) # vectorised

18.2 ms ± 1.58 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [8]:
# Cosine scikit-learn
%timeit distances = cosine_similarity(X, X[0].reshape(1, -1))

27.1 ms ± 2.45 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [9]:
# Euclidean distance scikit learn
# Distances between the first row of X (reshaped to a single-row 2-D array)
# and every row of X, computed by scikit-learn's pairwise helper.
%timeit euclidean_distances(X[0].reshape(1, -1), X)

6.63 ms ± 836 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [10]:
%timeit np.asarray(cdist(X[0].reshape(1, -1), X)) #np.asarray([cdist(x,y) for x, y in zip(A, B)])

2.48 ms ± 199 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [12]:
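# pure numpy with broadcasting
# Disabled drafts — neither line runs as written: `B` is never defined in this
# notebook, and X is 2-D, so indexing it as X[:,:,np.newaxis,:] raises IndexError.
# A working single-query form would be: np.sqrt(np.sum((X - X[0])**2, axis=-1))
#%timeit dists2 = np.sqrt( np.sum( (X[:,:,np.newaxis,:] - B[:,np.newaxis,:,:])**2, axis=-1) )
#%timeit dists2 = np.sqrt( np.sum( (X[:,:,np.newaxis,:] - X[0])**2, axis=-1) )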

In [14]:
# TODO: also benchmark
# - https://github.com/droyed/eucl_dist
# - linear_kernel (sklearn.metrics.pairwise)
# - a manual (hand-written) cosine similarity
# - safe_sparse_dot (sklearn.utils.extmath)
# - general calculations above with speed metrics