In [32]:
from annoy import AnnoyIndex
import sys

modules = '/home/dbcloud/ym/CSPG/PuiANN/modules'

if modules not in sys.path:
  sys.path.append(modules)

from binary_io import *
import puiann as pui

from concurrent.futures import ThreadPoolExecutor

In [33]:
# Benchmark configuration, one row per dataset:
#   (name, reader for base/query files, base path, query path, ground-truth path)
_BENCHMARKS = (
  ('sift1m', 'fvecs_read',
   "/var/lib/docker/anns/dataset/sift1m/base.fvecs",
   "/var/lib/docker/anns/query/sift1m/query.fvecs",
   "/var/lib/docker/anns/query/sift1m/gt.ivecs"),
  ('gist1m', 'fvecs_read',
   "/var/lib/docker/anns/dataset/gist1m/base.fvecs",
   "/var/lib/docker/anns/query/gist1m/query.fvecs",
   "/var/lib/docker/anns/query/gist1m/gt.ivecs"),
  ('deep1m', 'fbin_read',
   "/var/lib/docker/anns/dataset/deep1b/base.1M.fbin",
   "/var/lib/docker/anns/dataset/deep1b/query.public.10K.fbin",
   "/var/lib/docker/anns/dataset/deep1b/gt_1M.ibin"),
  ('text2image1m', 'fbin_read',
   "/home/dbcloud/big-ann-benchmarks/data/text2image1B/base.1B.fbin.crop_nb_1000000",
   "/home/dbcloud/big-ann-benchmarks/data/text2image1B/query.heldout.30K.fbin",
   "/home/dbcloud/big-ann-benchmarks/data/text2image1B/gt100-heldout.30K.ivecs"),
  ('turing1m', 'fbin_read',
   "/home/dbcloud/big-ann-benchmarks/data/MSTuringANNS/base1b.fbin.crop_nb_1000000",
   "/home/dbcloud/big-ann-benchmarks/data/MSTuringANNS/testQuery10K.fbin",
   "/home/dbcloud/big-ann-benchmarks/data/MSTuringANNS/testQuery10K_gt.ivecs"),
  ('rcd', 'fvecs_read',
   "/home/dbcloud/ym/CSPG/experiment/distribution/data/base.fvecs",
   "/home/dbcloud/ym/CSPG/experiment/distribution/data/query.fvecs",
   "/home/dbcloud/ym/CSPG/experiment/distribution/data/gt.ivecs"),
)

# Parallel lists: position i in every list describes the same dataset.
dataset, read_ways, bases, queries, gts = (
  list(column) for column in zip(*_BENCHMARKS))

# Prefix for the per-dataset result files. File names are appended to it by
# plain string concatenation, so the trailing slash must stay.
output = '/home/dbcloud/ym/CSPG/experiment/output/distribution/'

In [34]:
def query_F(index, vector, k):
  """Look up the `k` nearest neighbours of `vector` in `index`.

  Forwards to `index.get_nns_by_vector` with `include_distances=False` and
  returns whatever that call returns, unchanged.
  """
  neighbours = index.get_nns_by_vector(vector, k, include_distances=False)
  return neighbours

In [35]:
def test_annoy(base, query, gt, rw, dataset):

  if rw == 'fbin_read':
    base = fbin_read(base)
    query = fbin_read(query)
  else:
    base = fvecs_read(base)
    query = fvecs_read(query)

  if dataset == 'deep1m':
    gt = ibin_read(gt)
  else:
    gt = ivecs_read(gt)

  nb, d = base.shape
  nq, ngt = gt.shape
  gt = gt.flatten().tolist()

  k = 10
  threads = 24
  # faiss.omp_set_num_threads(threads)

  print(ds, nb, d, nq, k)

  out = open(output + dataset + '_annoy.csv', 'w')

  tm = pui.STimer()

  print('num_queries,num_tree,query_time,recall', file=out)
  for nt in [100, 200, 300, 400, 500, 800, 1600]:
    print(nt) 
    index = AnnoyIndex(d, 'euclidean')
    for i in range(nb):
      index.add_item(i, base[i])

    index.build(nt, n_jobs = threads)

    tm.reset()
    tm.start()
    with ThreadPoolExecutor() as executor:
      futures = [executor.submit(query_F, index, query[i], k) for i in range(nq)]
    tm.stop()
    results = [future.result() for future in futures]

    indices_matrix = np.zeros((nq, k), dtype=int)
    for i, indices in enumerate(results):
      indices_matrix[i, :] = indices  

    recall = pui.get_recall(k, ngt, gt, indices_matrix)
    print(f'{nq},{nt},{tm.get_time()},{recall}', file=out)

  out.close()

In [36]:
# Run the Annoy benchmark once per configured dataset; the parallel config
# lists are walked in lockstep.
runs = zip(dataset, read_ways, bases, queries, gts)
for ds, rw, base, query, gt in runs:
  test_annoy(base=base, query=query, gt=gt, rw=rw, dataset=ds)

sift1m 1000000 128 10000 10
100
200
300
400
500
800
1600
gist1m 1000000 960 1000 10
100
200
300
400
500
800
1600
deep1m 1000000 96 10000 10
100
200
300
400
500
800
1600
text2image1m 1000000 200 30000 10
100
200
300
400
500
800
1600
turing1m 1000000 100 10000 10
100
200
300
400
500
800
1600
rcd 1000000 96 1000 10
100
200
300
400
500
800
1600
