Before we start, we need to get data. We will be using the Sift1M dataset. It can be downloaded and opened using this script:

In [1]:
import os
import shutil
import urllib.request as request
from contextlib import closing

# first we download the Sift1M dataset; the transfer is skipped when a previous
# run already fetched the archive, so re-running the notebook stays cheap
if not os.path.exists('sift.tar.gz'):
    # stream into a temporary name and rename only once the copy has finished,
    # so an interrupted download never leaves a truncated 'sift.tar.gz' behind
    with closing(request.urlopen('ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz')) as r:
        with open('sift.tar.gz.part', 'wb') as f:
            shutil.copyfileobj(r, f)
    os.replace('sift.tar.gz.part', 'sift.tar.gz')

In [2]:
import tarfile

# the download leaves us with a tar.gz file, we unpack it into the working directory;
# the with-block closes the archive handle once extraction is done
with tarfile.open('sift.tar.gz', "r:gz") as tar:
    if hasattr(tarfile, 'data_filter'):
        # extraction filters are available: 'data' refuses unsafe members
        # (absolute paths, links escaping the target directory) and avoids the
        # DeprecationWarning emitted when extractall() is called without a filter
        tar.extractall(filter='data')
    else:
        # older Python without extraction filters: plain extraction
        tar.extractall()

  tar.extractall()


In [3]:
import numpy as np

# now define a function to read the fvecs file format of Sift1M dataset
def read_fvecs(fp):
    """Load an .fvecs file into a 2-D float32 array, one vector per row.

    The file is read as a flat stream of int32 words. The first word is taken
    as the vector dimension ``d``; the stream is then cut into records of
    ``d + 1`` words, the leading word of every record is dropped, and the
    remaining words are reinterpreted bit-for-bit as float32.

    Parameters
    ----------
    fp : path or open file accepted by ``np.fromfile``.

    Returns
    -------
    np.ndarray of dtype float32 with shape (number_of_records, d).
    """
    words = np.fromfile(fp, dtype='int32')
    dim = words[0]
    records = words.reshape(-1, dim + 1)
    # copy so the sliced columns are contiguous before reinterpreting the bytes
    components = records[:, 1:].copy()
    return components.view('float32')

In [4]:
# base vectors that will be searched through (1M samples)
wb = read_fvecs('./sift/sift_base.fvecs')
# query vectors to search with, loaded from sift_query.fvecs
xq = read_fvecs('./sift/sift_query.fvecs')
# keep only the first query, as a single-row 2-D array
xq = xq[0][None, :]

In [5]:
# sanity check: display the shapes of the base array and the single-row query
wb.shape, xq.shape

((1000000, 128), (1, 128))

In [7]:
# peek at the first base vector to see what the raw values look like
wb[0]

array([  0.,  16.,  35.,   5.,  32.,  31.,  14.,  10.,  11.,  78.,  55.,
        10.,  45.,  83.,  11.,   6.,  14.,  57., 102.,  75.,  20.,   8.,
         3.,   5.,  67.,  17.,  19.,  26.,   5.,   0.,   1.,  22.,  60.,
        26.,   7.,   1.,  18.,  22.,  84.,  53.,  85., 119., 119.,   4.,
        24.,  18.,   7.,   7.,   1.,  81., 106., 102.,  72.,  30.,   6.,
         0.,   9.,   1.,   9., 119.,  72.,   1.,   4.,  33., 119.,  29.,
         6.,   1.,   0.,   1.,  14.,  52., 119.,  30.,   3.,   0.,   0.,
        55.,  92., 111.,   2.,   5.,   4.,   9.,  22.,  89.,  96.,  14.,
         1.,   0.,   1.,  82.,  59.,  16.,  20.,   5.,  25.,  14.,  11.,
         4.,   0.,   0.,   1.,  26.,  47.,  23.,   4.,   0.,   0.,   4.,
        38.,  83.,  30.,  14.,   9.,   4.,   9.,  17.,  23.,  41.,   0.,
         0.,   2.,   8.,  19.,  25.,  23.,   1.], dtype=float32)