In [3]:
%pip install faiss-gpu

Note: you may need to restart the kernel to use updated packages.


The filename, directory name, or volume label syntax is incorrect.


In [1]:
import faiss

ModuleNotFoundError: No module named 'faiss'

In [None]:
import numpy as np

In [None]:
import shutil
import urllib.request as request
from contextlib import closing

# first we download the Sift1M dataset
with closing(request.urlopen('ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz')) as r:
    with open('sift.tar.gz', 'wb') as f:
        shutil.copyfileobj(r, f)

In [None]:
import tarfile

# the download leaves us with a tar.gz file, we unzip it
tar = tarfile.open('sift.tar.gz', "r:gz")
tar.extractall()

In [None]:
def read_fvecs(fp):
    a = np.fromfile(fp, dtype='int32')
    d = a[0]
    return a.reshape(-1, d + 1)[:, 1:].copy().view('float32')

# 1M samples
xb = read_fvecs('./sift/sift_base.fvecs')
# queries
xq = read_fvecs('./sift/sift_query.fvecs')

In [None]:
d = 128  # vector size
M = 32
efSearch = 32  # number of entry points (neighbors) we use on each layer
efConstruction = 32  # number of entry points used on each layer
                     # during construction

index = faiss.IndexHNSWFlat(d, M)
print(index.hnsw)

<faiss.swigfaiss.HNSW; proxy of <Swig Object of type 'faiss::HNSW *' at 0x7ff4b61a67e0> >


In [None]:
levels = faiss.vector_to_array(index.hnsw.levels)
np.bincount(levels)

array([], dtype=int64)

In [None]:
index.hnsw.efConstruction = efConstruction
index.hnsw.efSearch = efSearch

In [None]:
index.add(xb)

In [None]:
# after adding our data we will find that the level
# has been set automatically
index.hnsw.max_level

In [None]:
# and levels (or layers) are now populated
levels = faiss.vector_to_array(index.hnsw.levels)
np.bincount(levels)

In [None]:
index.hnsw.entry_point

In [None]:
def set_default_probas(M: int, m_L: float):
    nn = 0  # set nearest neighbors count = 0
    cum_nneighbor_per_level = []
    level = 0  # we start at level 0
    assign_probas = []
    while True:
        # calculate probability for current level
        proba = np.exp(-level / m_L) * (1 - np.exp(-1 / m_L))
        # once we reach low prob threshold, we've created enough levels
        if proba < 1e-9: break
        assign_probas.append(proba)
        # neighbors is == M on every level except level 0 where == M*2
        nn += M*2 if level == 0 else M
        cum_nneighbor_per_level.append(nn)
        level += 1
    return assign_probas, cum_nneighbor_per_level

In [None]:
assign_probas, cum_nneighbor_per_level = set_default_probas(
    32, 1/np.log(32)
)
assign_probas, cum_nneighbor_per_level

In [None]:
# this is copy of HNSW::random_level function
def random_level(assign_probas: list, rng):
    # get random float from 'r'andom 'n'umber 'g'enerator
    f = rng.uniform() 
    for level in range(len(assign_probas)):
        # if the random float is less than level probability...
        if f < assign_probas[level]:
            # ... we assert at this level
            return level
        # otherwise subtract level probability and try again
        f -= assign_probas[level]
    # below happens with very low probability
    return len(assign_probas) - 1

In [None]:
chosen_levels = []
rng = np.random.default_rng(12345)
for _ in range(1_000_000):
    chosen_levels.append(random_level(assign_probas, rng))
np.bincount(chosen_levels)

In [None]:
set_default_probas(32, 0.09)

In [None]:
levels = faiss.vector_to_array(index.hnsw.levels)
np.bincount(levels)

In [None]:
del index
index = faiss.IndexHNSWFlat(d, 32)
index.hnsw.set_default_probas(32, 0.09)  # HNSW::set_default_probas(int M, float levelMult)
index.hnsw.efConstruction = efConstruction
index.add(xb)

In [None]:
levels = faiss.vector_to_array(index.hnsw.levels)
np.bincount(levels)

In [None]:
assign_probas, cum_nneighbor_per_level = set_default_probas(32, 0.0000001)
assign_probas, cum_nneighbor_per_level

In [None]:
chosen_levels = []
rng = np.random.default_rng(12345)
for _ in range(1_000_000):
    chosen_levels.append(random_level(assign_probas, rng))

In [None]:
np.bincount(chosen_levels)

In [None]:
del index
index = faiss.IndexHNSWFlat(d, 32)
index.hnsw.efConstruction = efConstruction
index.add(xb[:1_000])

In [None]:
levels = faiss.vector_to_array(index.hnsw.levels)
np.bincount(levels)

In [None]:
recall_idx = []

index = faiss.IndexFlatL2(d)
index.add(xb)
D, recall_idx = index.search(xq[:1000], k=1)

In [None]:
import os

def get_memory(index):
    faiss.write_index(index, './temp.index')
    file_size = os.path.getsize('./temp.index')
    os.remove('./temp.index')
    return file_size

In [None]:
import time
import pandas as pd
from tqdm.auto import trange
from datetime import datetime

results = pd.DataFrame({
    'M': [],
    'efConstruction': [],
    'efSearch': [],
    'recall@1': [],
    'build_time': [],
    'search_time': [],
    'memory_usage': []
})


for M_bit in trange(1, 6):
    M = 2 ** M_bit
    print(M)
    for ef_bit in trange(1, 6):
        efConstruction = 2 ** ef_bit
        index = faiss.IndexHNSWFlat(d, M)
        index.efConstruction = efConstruction
        start = time.perf_counter()
        index.add(xb)
        build_time = (time.perf_counter() - start)
        memory_usage = get_memory(index)
        for efSearch in [2, 4, 8, 16, 32]:
            index.efSearch = efSearch
            start = time.perf_counter()
            D, I = index.search(xq[:1000], k=1)
            search_time = (time.perf_counter() - start)
            recall = sum(I == recall_idx)[0]
            results = results.append({
                'M': M,
                'efConstruction': efConstruction,
                'efSearch': efSearch,
                'recall@1': recall,
                'build_time': build_time,
                'search_time': search_time,
                'memory_usage': memory_usage
            }, ignore_index=True)
        del index

  0%|          | 0/5 [00:00<?, ?it/s]

2


  0%|          | 0/5 [00:00<?, ?it/s]

  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({


4


  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({


  0%|          | 0/5 [00:00<?, ?it/s]

  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({


8


  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({


  0%|          | 0/5 [00:00<?, ?it/s]

  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({


16


  results = results.append({


  0%|          | 0/5 [00:00<?, ?it/s]

  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({


32


  results = results.append({
  results = results.append({


  0%|          | 0/5 [00:00<?, ?it/s]

  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({


In [None]:
results.to_csv('./results.csv', sep='|', index=False)

In [None]:
import pandas as pd

results = pd.read_csv('./results.csv', sep='|')
results.head()

In [None]:
index = faiss.IndexHNSWFlat(d, M)
index.efConstruction = 32
index.add(xb)
index.efSearch = 2
start = datetime.now()
D, I = index.search(xq_full[:1000], k=1)
search_time = (datetime.now() - start).microseconds
recall = sum(I == recall_idx)[0]

In [None]:
search_time

211926

In [None]:
recall

913

In [None]:
efConstruction,efSearch

(2, 32)

In [None]:
index.efSearch = 32
start = datetime.now()
D, I = index.search(xq_full[:1000], k=1)
search_time = (datetime.now() - start).microseconds
recall = sum(I == recall_idx)[0]
search_time,recall

(411107, 913)

In [None]:
import time

In [None]:
index = faiss.IndexHNSWFlat(d, M)
index.efConstruction = efConstruction
start = time.time()
index.add(xb)
build_time = time.time()
build_time - start

399.89372968673706

In [None]:
index.efSearch = efSearch
start = time.time()
D, I = index.search(xq_full[:1000], k=1)
search_time = time.time()
search_time - start

0.3042445182800293

In [None]:
t = datetime.now()

In [None]:
datetime.now()-t

datetime.timedelta(seconds=9, microseconds=449645)