In [1]:
!nvidia-smi

Sat Jan 25 06:21:49 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.44       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P8     9W /  70W |      0MiB / 15079MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [2]:
import pynvml

# GPU models accepted by this notebook's check.
SUPPORTED_GPUS = ('Tesla T4', 'Tesla P100-PCIE-16GB')

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # inspect GPU index 0
device_name = pynvml.nvmlDeviceGetName(handle)
# The name was originally compared against bytes literals; decode it so the
# check also works if the installed pynvml hands back a str instead.
gpu_name = device_name.decode() if isinstance(device_name, bytes) else device_name
if gpu_name not in SUPPORTED_GPUS:
  raise Exception("""
    Unfortunately this instance does not have a T4 or P100 GPU.
    
    Please make sure you've configured Colab to request a GPU instance type.
    
    Sometimes Colab allocates a Tesla K80 instead of a T4 or P100.
    If you get a K80 GPU, try Runtime -> Reset all runtimes...
  """)
else:
  # Report the GPU that was actually allocated rather than always claiming a T4.
  print('Yes, you got the right kind of GPU to work and it is a %s GPU.' % gpu_name)

Yes, you got the right kind of GPU to work and it is a Tesla T4 GPU.


In [3]:
!wget -nc https://github.com/rapidsai/notebooks-extended/raw/master/utils/rapids-colab.sh
!bash rapids-colab.sh

import sys, os

sys.path.append('/usr/local/lib/python3.6/site-packages/')
os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'
os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'

--2020-01-25 06:22:17--  https://github.com/rapidsai/notebooks-extended/raw/master/utils/rapids-colab.sh
Resolving github.com (github.com)... 52.74.223.119
Connecting to github.com (github.com)|52.74.223.119|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://github.com/rapidsai/notebooks-contrib/raw/master/utils/rapids-colab.sh [following]
--2020-01-25 06:22:17--  https://github.com/rapidsai/notebooks-contrib/raw/master/utils/rapids-colab.sh
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/rapidsai/notebooks-contrib/master/utils/rapids-colab.sh [following]
--2020-01-25 06:22:18--  https://raw.githubusercontent.com/rapidsai/notebooks-contrib/master/utils/rapids-colab.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.

# **KNN**
K Nearest Neighbors (KNN) search finds the “closest” datapoint(s) to new, unseen data: it calculates a suitable “distance” between the query point and every stored point, and returns the top K datapoints that have the smallest distance to it. The `NearestNeighbors` model used in this notebook needs no labels; it only indexes the data and answers neighbor queries.

cuML’s KNN expects a cuDF DataFrame or a NumPy array (automatic chunking into a NumPy array will be done in a future release). It first fits a special data structure that approximates the distance calculations, which brings query time down to $O(p \log n)$ instead of the brute-force $O(np)$, where $n$ is the number of samples and $p$ is the number of features.

The KNN function accepts the following parameters:
1. n_neighbors: int (default = 5).  The top K closest datapoints you want the algorithm to return. If this number is large, then expect the algorithm to run slower.
1. should_downcast: bool (default = False).  Currently only single precision is supported in the underlying index. Setting this to True allows double-precision input arrays to be automatically downcast to single precision.

The methods that can be used with KNN are:
1. fit: Fit GPU index for performing nearest neighbor queries.
1. kneighbors: Query the GPU index for the k nearest neighbors of row vectors in X.

The model accepts only numpy arrays or cudf dataframes as the input. In order to convert your dataset to cudf format please read the cudf documentation on https://rapidsai.github.io/projects/cudf/en/latest/. For additional information on the K NearestNeighbors model please refer to the documentation on https://rapidsai.github.io/projects/cuml/en/latest/api.html#nearest-neighbors

In [0]:
import os

import numpy as np
import pandas as pd

import cudf
from cuml.neighbors.nearest_neighbors import NearestNeighbors as cumlKNN

from sklearn.neighbors import NearestNeighbors as skKNN

In [0]:
# check if the mortgage dataset is present and then extract the data from it, else just create a random dataset for training 
import gzip
# change the path of the mortgage dataset if you have saved it in a different directory
def load_data(nrows, ncols, cached = 'mortgage.npy.gz', source='mortgage'):
    """Build a pandas DataFrame of features for the KNN comparison.

    When `cached` exists on disk and `source` is 'mortgage', the gzipped
    ``.npy`` array is loaded and `nrows` rows are drawn from it at random
    (with replacement), keeping the first `ncols` columns. Otherwise a random
    float32 matrix of shape (nrows, ncols) is generated.

    Parameters
    ----------
    nrows : int
        Number of samples (rows) to return.
    ncols : int
        Number of features (columns) to return. If the cached array has fewer
        columns, the slice silently yields only the columns that exist.
    cached : str
        Path to the gzip-compressed ``.npy`` file holding the mortgage data.
    source : str
        'mortgage' to use the cached file when present; any other value forces
        the random dataset.

    Returns
    -------
    pandas.DataFrame
        Columns named 'fea0', 'fea1', ...; missing values replaced by 0.
    """
    if os.path.exists(cached) and source=='mortgage':
        print('use mortgage data')
        with gzip.open(cached) as f:
            X = np.load(f)
        # randint's upper bound is exclusive, so pass the row count itself:
        # subtracting 1 would make the last row unreachable and would raise
        # ValueError for a single-row dataset.
        row_ids = np.random.randint(0, X.shape[0], nrows)
        X = X[row_ids, :ncols]
    else:
        # create a random dataset
        print('use random data')
        X = np.random.random((nrows,ncols)).astype('float32')
    df = pd.DataFrame({'fea%d'%i:X[:,i] for i in range(X.shape[1])}).fillna(0)
    return df

In [0]:
from sklearn.metrics import mean_squared_error
# this function checks if the results obtained from two different methods (sklearn and cuml) are the same
def array_equal(a,b,threshold=1e-3,with_sign=True, metric='mse'):
    """Decide whether two result arrays agree within `threshold`.

    Both inputs are first converted with `to_nparray`.

    Parameters
    ----------
    a, b : array-like
        Results to compare (anything `to_nparray` can convert).
    threshold : float
        Tolerance applied to the chosen metric.
    with_sign : bool
        When False, the absolute values of `a` and `b` are compared.
    metric : str
        'mse' - mean squared error must be below `threshold`;
        'abs' - no element may differ by more than `threshold` in magnitude;
        'acc' - the fraction of unequal elements must be below `threshold`.

    Returns
    -------
    bool
        True when the arrays are considered equal under `metric`.

    Raises
    ------
    ValueError
        If `metric` is not one of 'mse', 'abs' or 'acc'.
    """
    a = to_nparray(a)
    b = to_nparray(b)
    if with_sign == False:
        a,b = np.abs(a),np.abs(b)
    if metric=='mse':
        error = mean_squared_error(a,b)
        res = error<threshold
    elif metric=='abs':
        # Compare magnitudes: a signed difference would let every element
        # where a < b pass, no matter how large the gap is.
        error = np.abs(a-b)
        res = not np.any(error>threshold)
    elif metric == 'acc':
        # a.size equals shape[0]*shape[1] for 2-D input and also works for
        # other ranks.
        error = np.sum(a!=b)/a.size
        res = error<threshold
    else:
        # Previously an unknown metric fell through to an UnboundLocalError.
        raise ValueError("metric must be 'mse', 'abs' or 'acc', got %r" % (metric,))
    return res

# calculate the accuracy 
def accuracy(a,b, threshold=1e-4):
    """Check that two arrays differ by more than 1 in only a small fraction of entries.

    Both inputs are converted with `to_nparray`. An entry counts as a mismatch
    when the magnitude of its difference exceeds 1.

    Parameters
    ----------
    a, b : array-like
        Arrays of matching shape to compare.
    threshold : float
        Maximum tolerated fraction of mismatching entries (exclusive).

    Returns
    -------
    bool
        True when the mismatch fraction is below `threshold`.
    """
    a = to_nparray(a)
    b = to_nparray(b)
    # Use the magnitude so the result does not depend on argument order; the
    # signed test `a-b > 1` ignored every entry where b exceeded a.
    diff = np.abs(a-b)
    mismatch_rate = np.count_nonzero(diff>1) / diff.size
    return mismatch_rate<threshold

# the function converts a variable from ndarray or dataframe format to numpy array
def to_nparray(x):
    """Convert supported containers to a NumPy array.

    - numpy arrays and pandas DataFrames are passed through `np.array`
    - a numpy float64 scalar becomes a one-element array
    - cuDF DataFrames / Series go through pandas and return `.values`
    - any other object is returned unchanged
    """
    if isinstance(x, (np.ndarray, pd.DataFrame)):
        return np.array(x)
    if isinstance(x, np.float64):
        return np.array([x])
    if isinstance(x, (cudf.DataFrame, cudf.Series)):
        return x.to_pandas().values
    return x

## **Loading Data**

In [8]:
# Colab-only: upload mortgage.npy.gz from the local machine into the runtime's
# working directory, where load_data() looks for it by default. If the file is
# absent, load_data() falls back to a random dataset.
from google.colab import files
uploaded = files.upload()

Saving mortgage.npy.gz to mortgage.npy.gz


In [9]:
%%time
# nrows = number of samples
# ncols = number of features of each sample

nrows = 2**15
ncols = 40

X = load_data(nrows, ncols)
print('data', X.shape)

use mortgage data
data (32768, 40)
CPU times: user 5.08 s, sys: 1.09 s, total: 6.17 s
Wall time: 6.23 s


In [12]:
# Display the loaded feature table; the notebook abbreviates the middle rows.
X

Unnamed: 0,fea0,fea1,fea2,fea3,fea4,fea5,fea6,fea7,fea8,fea9,fea10,fea11,fea12,fea13,fea14,fea15,fea16,fea17,fea18,fea19,fea20,fea21,fea22,fea23,fea24,fea25,fea26,fea27,fea28,fea29,fea30,fea31,fea32,fea33,fea34,fea35,fea36,fea37,fea38,fea39
0,0.666667,0.218545,0.545045,0.529297,0.636111,0.359871,0.0,0.044486,0.059951,0.061698,0.716319,0.083032,0.231624,0.113612,0.212412,0.00678,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.555556,0.085640,0.063063,0.386719,0.463889,0.790109,0.0,0.044486,0.059951,0.061698,0.716319,0.083032,0.231624,0.113612,0.212412,0.00678,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.722222,0.173407,0.729730,0.449219,0.552778,0.630076,0.0,0.044486,0.059951,0.061698,0.716319,0.083032,0.231624,0.113612,0.212412,0.00678,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.527778,0.196900,0.045045,0.746094,0.975000,0.255328,0.0,0.044486,0.059951,0.061698,0.716319,0.083032,0.231624,0.113612,0.212412,0.00678,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.611111,0.272232,0.216216,0.671875,0.869444,0.383997,0.0,0.044486,0.059951,0.061698,0.716319,0.083032,0.231624,0.113612,0.212412,0.00678,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32763,0.513889,0.104216,0.509009,0.193359,0.188889,0.341375,0.0,0.044486,0.059951,0.061698,0.716319,0.083032,0.231624,0.113612,0.212412,0.00678,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32764,0.527778,0.301038,0.063063,0.738281,0.955556,0.599517,0.0,0.044486,0.059951,0.061698,0.716319,0.083032,0.231624,0.113612,0.212412,0.00678,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32765,0.569444,0.196900,0.027027,0.402344,0.483333,0.782067,0.0,0.044486,0.059951,0.061698,0.716319,0.083032,0.231624,0.113612,0.212412,0.00678,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32766,0.694444,0.173840,0.211712,0.673828,0.000000,0.383997,0.0,0.044486,0.059951,0.061698,0.716319,0.083032,0.231624,0.113612,0.212412,0.00678,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## **Model Parameter**

In [0]:
# number of nearest neighbors to retrieve for every query row
n_neighbors = 10

## **Scikit-Learn Implementation**

In [13]:
%%time
# use the sklearn KNN model to fit the dataset 
knn_sk = skKNN(metric = 'sqeuclidean', )
knn_sk.fit(X)
D_sk, I_sk = knn_sk.kneighbors(X, n_neighbors)

CPU times: user 34.4 s, sys: 388 ms, total: 34.8 s
Wall time: 34.8 s


## **cuML Implementation**

In [14]:
%%time
# Convert the pandas DataFrame to a cuDF DataFrame for the cuML model;
# %%time reports how long the conversion takes.
X_cudf = cudf.DataFrame.from_pandas(X)

CPU times: user 475 ms, sys: 159 ms, total: 634 ms
Wall time: 924 ms


In [15]:
%%time
# Fit cuML's NearestNeighbors (default settings) on the cuDF copy of the data.
knn_cuml = cumlKNN()
knn_cuml.fit(X_cudf)

# Query the fitted model with the same rows it was fitted on, asking for
# n_neighbors neighbors per row; the result unpacks into distances and indices.
D_cuml, I_cuml = knn_cuml.kneighbors(X_cudf, n_neighbors)

CPU times: user 3.58 s, sys: 383 ms, total: 3.96 s
Wall time: 8.45 s


In [16]:
# Compare the neighbor distances returned by sklearn and cuML using
# array_equal's 'abs' metric with its default threshold (1e-3).
passed = array_equal(D_sk, D_cuml, metric = 'abs') # metric used can be 'acc', 'mse', or 'abs'
message = 'compare knn: cuml vs sklearn distances %s'%('equal'if passed else 'NOT equal')
print(message)

compare knn: cuml vs sklearn distances equal


In [17]:
# Compare the neighbor indices returned by sklearn and cuML. The check passes
# while the fraction of entries that accuracy() counts as mismatched stays
# below threshold (here 10%).
passed = accuracy(I_sk, I_cuml, threshold = 1e-1)
message = 'compare knn: cuml vs sklearn indexes %s'%('equal'if passed else 'NOT equal')
print(message)

compare knn: cuml vs sklearn indexes equal
