In [7]:
import numpy as np
import pandas as pd
import cudf
import os

from sklearn.neighbors import NearestNeighbors as skKNN
from cuml.neighbors.nearest_neighbors import NearestNeighbors as cumlKNN

# Helper Functions

In [8]:
import gzip
def load_data(nrows, ncols, cached = 'data/mortgage.npy.gz',source='mortgage'):
    """Return a pandas DataFrame holding ``nrows`` rows of feature data.

    If ``source`` is ``'mortgage'`` and the file ``cached`` exists, the
    gzip-compressed ``.npy`` array stored there is loaded, ``nrows`` rows are
    drawn from it uniformly at random (with replacement) and the first
    ``ncols`` columns are kept. In every other case a uniform random float32
    matrix of shape ``(nrows, ncols)`` is generated instead.

    Parameters
    ----------
    nrows : int
        Number of rows in the result.
    ncols : int
        Number of leading columns to keep (cached data) or to generate
        (random data).
    cached : str
        Path to the gzip-compressed ``.npy`` file; it is indexed as a 2-D
        array.
    source : str
        Only the value ``'mortgage'`` enables reading from ``cached``.

    Returns
    -------
    pandas.DataFrame
        Columns named ``fea0``, ``fea1``, ... with missing values replaced
        by 0.
    """
    if os.path.exists(cached) and source=='mortgage':
        print('use mortgage data')
        with gzip.open(cached) as f:
            X = np.load(f)
        # The upper bound of randint is exclusive, so the row count itself is
        # the right limit; "shape[0]-1" would never select the last row and
        # raises on a single-row file.
        X = X[np.random.randint(0,X.shape[0],nrows),:ncols]
    else:
        print('use random data')
        X = np.random.random((nrows,ncols)).astype('float32')
    df = pd.DataFrame({'fea%d'%i:X[:,i] for i in range(X.shape[1])}).fillna(0)
    return df

In [55]:
from sklearn.metrics import mean_squared_error
def array_equal(a,b,threshold=1e-4,with_sign=True,metric='mse'):
    """Report whether two array-likes agree to within ``threshold``.

    Both inputs are first passed through ``to_nparray``.

    Parameters
    ----------
    a, b : array-like
        Values to compare.
    threshold : float
        The error must be strictly below this value for the inputs to count
        as equal.
    with_sign : bool
        When false, absolute values are compared instead of signed values.
    metric : str
        ``'mse'`` uses ``mean_squared_error(a, b)``; any other value uses the
        fraction of element positions where ``a != b``.

    Returns
    -------
    bool
        ``True`` when the computed error is below ``threshold``.
    """
    a = to_nparray(a)
    b = to_nparray(b)
    if not with_sign:
        a,b = np.abs(a),np.abs(b)
    if metric=='mse':
        error = mean_squared_error(a,b)
    else:
        # a.size is the total element count for any rank; multiplying
        # shape[0]*shape[1] only works for exactly two dimensions.
        error = np.sum(a!=b)/a.size
    res = error<threshold
    return res

def accuracy(a,b, threshold=1e-3):
    """Report whether ``a`` and ``b`` mismatch in fewer than ``threshold`` of their entries.

    Both inputs are first passed through ``to_nparray``. An entry counts as
    mismatched when the two values differ by more than 1 in either direction.

    Parameters
    ----------
    a, b : array-like
        Values to compare element by element.
    threshold : float
        Upper bound (exclusive) on the mismatched fraction.

    Returns
    -------
    bool
        ``True`` when the mismatched fraction is below ``threshold``.
    """
    a = to_nparray(a)
    b = to_nparray(b)
    diff = a-b
    # abs() makes the check symmetric: a one-sided "diff > 1" never counts
    # entries where b is the larger value. diff.size works for any rank.
    mismatch_rate = len(diff[abs(diff)>1]) / diff.size
    return mismatch_rate<threshold

def to_nparray(x):
    """Convert the supported container types to a NumPy array.

    * ``numpy.ndarray`` / ``pandas.DataFrame`` -> ``np.array(x)``
    * ``numpy.float64`` scalar -> one-element array
    * ``cudf.DataFrame`` / ``cudf.Series`` -> values of the pandas conversion

    Any other object is handed back untouched.
    """
    if isinstance(x, (np.ndarray, pd.DataFrame)):
        return np.array(x)
    if isinstance(x, np.float64):
        return np.array([x])
    if isinstance(x, (cudf.DataFrame, cudf.Series)):
        host_copy = x.to_pandas()
        return host_copy.values
    return x

# Run tests

In [69]:
%%time
nrows = 2**15
ncols = 40

X = load_data(nrows,ncols)
print('data',X.shape)

use mortgage data
data (32768, 40)
CPU times: user 4.72 s, sys: 173 ms, total: 4.89 s
Wall time: 4.84 s


In [70]:
# Number of neighbours requested from both the sklearn and the cuML kneighbors calls.
n_neighbors = 10

In [71]:
%%time
knn_sk = skKNN(metric = 'sqeuclidean', )
knn_sk.fit(X)
D_sk,I_sk = knn_sk.kneighbors(X,n_neighbors)

CPU times: user 35.7 s, sys: 1.35 s, total: 37.1 s
Wall time: 37.1 s


In [72]:
%%time
X = cudf.DataFrame.from_pandas(X)

CPU times: user 42.2 ms, sys: 29.6 ms, total: 71.7 ms
Wall time: 82.9 ms


In [73]:
%%time
knn_cuml = cumlKNN()
knn_cuml.fit(X)
D_cuml,I_cuml = knn_cuml.kneighbors(X,n_neighbors)

CPU times: user 8.56 s, sys: 153 ms, total: 8.71 s
Wall time: 2.58 s


In [74]:
# Distances agree when array_equal's default metric (mean squared error)
# between the sklearn and cuML results is below 1e-12.
passed = array_equal(D_sk,D_cuml, threshold=1e-12)
message = 'compare knn: cuml vs sklearn distances %s'%('equal'if passed else 'NOT equal')
print(message)

compare knn: cuml vs sklearn distances equal


In [83]:
# Indexes agree when the fraction of entries that accuracy() counts as
# mismatched is below 0.1 (threshold=1e-1).
passed = accuracy(I_sk, I_cuml, threshold=1e-1)
message = 'compare knn: cuml vs sklearn indexes %s'%('equal'if passed else 'NOT equal')
print(message)

compare knn: cuml vs sklearn indexes equal
