In [1]:
import argparse
import h5py
import numpy as np
import os
from pathlib import Path
from urllib.request import urlretrieve
import logging


# Configure logging once for the whole notebook: INFO and above, rendered as
# "[timestamp][LEVEL][logger-name] message".
logging.basicConfig(
    level=logging.INFO,
    # "-5.5s" pads/truncates the level name to exactly 5 characters;
    # "-.20s" truncates the logger name to at most 20 characters.
    format='[%(asctime)s][%(levelname)-5.5s][%(name)-.20s] %(message)s'
)
# Module-level logger; the download helper reports its progress through it.
LOG = logging.getLogger(__name__)


def download(src, dst):
    """Fetch ``src`` into the local file ``dst`` unless ``dst`` already exists.

    The payload is first written to ``<dst>.part`` and only moved to ``dst``
    once the transfer has finished. An interrupted or failed download
    therefore never leaves a truncated file at ``dst`` that a later call
    would mistake for a complete one.

    Args:
        src: URL to retrieve.
        dst: Destination path (str or path-like). Missing parent directories
            are created.

    Raises:
        Whatever ``urlretrieve`` raises on failure; the partial file is
        removed before the exception propagates.
    """
    if os.path.exists(dst):
        return

    os.makedirs(Path(dst).parent, exist_ok=True)
    # Lazy %-style arguments: the message is only formatted if it is emitted.
    LOG.info('downloading %s -> %s...', src, dst)

    partial = f"{os.fspath(dst)}.part"
    try:
        urlretrieve(src, partial)
        os.replace(partial, dst)
    except BaseException:
        # Also covers KeyboardInterrupt (a common way to stop a notebook cell).
        if os.path.exists(partial):
            os.remove(partial)
        raise


def prepare(kind, size):
    """Make sure the query set and the dataset for one configuration are on disk.

    The two remote files are stored as ``data/<kind>/<size>/query.h5`` and
    ``data/<kind>/<size>/dataset.h5``; fetching is delegated to ``download``.

    Args:
        kind: Embedding variant used in the remote file names, e.g. "pca32v2".
        size: Dataset-size token used in the remote file name and as a path
            component, e.g. "100K".

    Raises:
        AssertionError: If a target file does not exist after the download
            attempt.
    """
    base = "https://sisap-23-challenge.s3.amazonaws.com/SISAP23-Challenge"
    remote_files = (
        ("query", f"{base}/public-queries-10k-{kind}.h5"),
        ("dataset", f"{base}/laion2B-en-{kind}-n={size}.h5"),
    )
    local_dir = os.path.join("data", kind, size)

    for stem, remote in remote_files:
        local_file = os.path.join(local_dir, f"{stem}.h5")
        download(remote, local_file)
        assert os.path.exists(local_file), f"Failed to download {remote}"


In [10]:
# Benchmark configuration: dataset-size token and embedding variant. Both are
# used in the remote file names and in the local data/<kind>/<size>/ layout.
size = '100K'
kind = "pca32v2"

# Name of the HDF5 dataset read from both the dataset file and the query file.
key = 'pca32'
prepare(kind, size)

# Copy both matrices into memory; the context managers close the HDF5 files
# again instead of leaving the handles open for the kernel's lifetime.
with h5py.File(os.path.join("data", kind, size, "dataset.h5"), "r") as h5_file:
    data = np.array(h5_file[key])
with h5py.File(os.path.join("data", kind, size, "query.h5"), "r") as h5_file:
    queries = np.array(h5_file[key])
# n = number of database vectors, d = their dimensionality.
n, d = data.shape

[2023-06-16 12:56:23,515][INFO ][__main__] downloading https://sisap-23-challenge.s3.amazonaws.com/SISAP23-Challenge/public-queries-10k-pca32v2.h5 -> data/pca32v2/100K/query.h5...
[2023-06-16 12:56:24,703][INFO ][__main__] downloading https://sisap-23-challenge.s3.amazonaws.com/SISAP23-Challenge/laion2B-en-pca32v2-n=100K.h5 -> data/pca32v2/100K/dataset.h5...


In [11]:
import faiss

In [12]:
# Number of inverted lists requested in the FAISS factory string below.
nlist = 128
# Factory description "IVF<nlist>,Flat"; it is also embedded in the result
# identifier built in the search cell.
index_identifier = f"IVF{nlist},Flat"
# d is the vector dimensionality taken from data.shape.
index = faiss.index_factory(d, index_identifier)

In [13]:
import time 

In [14]:
# Build the index and measure the build time. The measurement covers both
# training and adding the vectors, even though the message says "training".
print(f"Training index on {data.shape}")
# perf_counter() is monotonic and high-resolution, unlike wall-clock time.time().
start = time.perf_counter()
index.train(data)
# Check before add(): after a successful add() the assertion could never fail.
assert index.is_trained
index.add(data)
elapsed_build = time.perf_counter() - start
print(f"Done training in {elapsed_build}s.")

Training index on (100000, 32)
Done training in 0.24490928649902344s.


In [33]:
# Search parameters: number of inverted lists to probe and neighbors per query.
# NOTE(review): nprobe (500) is larger than nlist (128) used to build the
# index; FAISS presumably caps nprobe at nlist, which would make this an
# exhaustive scan while the result identifier still records nprobe=500 — confirm.
nprobe=500
k=50

In [34]:
# Run the k-NN search for all queries and measure the query time.
print(f"Starting search on {queries.shape} with nprobe={nprobe}")
# perf_counter() is monotonic and high-resolution, unlike wall-clock time.time().
start = time.perf_counter()
index.nprobe = nprobe
D, I = index.search(queries, k)
elapsed_search = time.perf_counter() - start
print(f"Done searching in {elapsed_search}s.")

I = I + 1 # FAISS is 0-indexed, groundtruth is 1-indexed

# Describes the index and query parameters; used to name the result file.
identifier = f"index=({index_identifier}),query=(nprobe={nprobe})"

Starting search on (10000, 32) with nprobe=500
Done searching in 25.20758891105652s.


In [22]:
# Result file path: result/<kind>/<size>/<identifier>.h5.
# NOTE(review): this cell's execution count (22) is lower than that of the
# search cell that defines `identifier` (34), so on a fresh kernel it only
# works when run in file order; `dst` is not read by any later visible cell.
dst=os.path.join("result/", kind, size, f"{identifier}.h5")

In [35]:
# D is the first array returned by index.search(queries, k): one row per query, k columns.
D.shape

(10000, 50)

In [36]:
# I holds the neighbor ids returned by the search; same shape as D.
I.shape

(10000, 50)

In [37]:
# Inspect the first query vector.
queries[0]

array([-0.06877425, -0.11037602,  0.05646035, -0.11508854, -0.03577028,
        0.09110859,  0.07477964,  0.10778218,  0.04116842,  0.00877987,
       -0.06257925, -0.17031246,  0.00780125, -0.099485  ,  0.04967897,
       -0.12064676,  0.0035742 , -0.07642894,  0.08400699, -0.03188556,
       -0.01020392,  0.07147753, -0.01378688, -0.10539152, -0.00040444,
       -0.00291365,  0.12945361,  0.05624911,  0.00785595,  0.01020052,
        0.06252866, -0.00992952], dtype=float32)

In [38]:
# Neighbor ids found for the first query, already shifted by +1 to be 1-indexed.
I[0]

array([92501, 41079, 22337, 79896, 92811, 82795, 11781, 83736, 11441,
         885, 74338, 15455, 24314, 20213, 97655, 19961, 24532, 52102,
       45784, 57876, 41370, 50294, 13032, 69015, 83588, 53971, 12401,
       45346, 44952, 13236, 92706, 41312, 38159, 74173, 24874, 79172,
       99941, 93336, 10221, 75694, 67030, 96585, 20970, 12161, 99104,
       76538, 42014, 60713, 79753, 99973])

In [29]:
def get_groundtruth(size="100K"):
    """Download (if not already present) and load the public gold-standard neighbors.

    The file is stored as ``data/groundtruth-<size>.h5``.

    Args:
        size: Dataset-size token used in the remote file name, e.g. "100K".

    Returns:
        A numpy array with the contents of the ``'knns'`` dataset of the
        ground-truth file.
    """
    #url = f"http://ingeotec.mx/~sadit/metric-datasets/LAION/SISAP23-Challenge/laion2B-en-public-gold-standard-v2-{size}.h5"
    url = f"https://sisap-23-challenge.s3.amazonaws.com/SISAP23-Challenge/laion2B-en-public-gold-standard-v2-{size}.h5"

    out_fn = os.path.join("data", f"groundtruth-{size}.h5")
    download(url, out_fn)
    # The context manager closes the file even if reading 'knns' raises.
    with h5py.File(out_fn, "r") as gt_f:
        return np.array(gt_f['knns'])

In [30]:
# Use the configured `size` so the ground truth always matches the dataset
# that was indexed, instead of a separately hard-coded "100K".
gt = get_groundtruth(size=size)

[2023-06-16 13:10:31,165][INFO ][__main__] downloading https://sisap-23-challenge.s3.amazonaws.com/SISAP23-Challenge/laion2B-en-public-gold-standard-v2-100K.h5 -> data/groundtruth-100K.h5...


In [31]:
# Ground-truth neighbor ids of the first query, for comparison with I[0].
gt[0]

array([79172, 15735, 22337,   231, 74173, 41079, 38159, 71849, 69015,
       92811, 99973, 79896, 13236, 64156, 86179, 55218, 60622, 64728,
       86341, 63839, 52857,   885, 83736, 87457, 36442, 24314, 73879,
        5985, 20970, 60559,  5414, 26294, 42586, 41370, 22973,  4415,
       23975, 82381, 15425, 79753, 19745, 74573, 40382,  8735, 70297,
       23884, 92501, 66709, 91232, 61308, 94073,  4327, 25525,  7180,
       12401,  5451, 59683, 17544, 53818, 52102, 85602,   898, 76553,
       18967, 37889, 50123, 80243, 35693, 73074, 31649, 55412, 13493,
       67226, 21472, 81544, 70033, 91179, 97387, 94426, 76538, 64253,
       29033, 87037, 38033, 80913, 70410, 83588, 44952,  6911, 65794,
       45784,  5492, 35205, 21690, 16711, 95028, 19961,  1708, 68450,
       20333], dtype=int32)

In [None]:
def store_results(dst, algo, kind, D, I, buildtime, querytime, params, size):
    """Write one search run (neighbors, distances and metadata) to an HDF5 file.

    NOTE(review): the original cell contained only the signature (a syntax
    error). The attribute and dataset names below are presumed to follow the
    SISAP23 result layout ('knns' matches the key of the ground-truth file) —
    confirm against the evaluation script before relying on them.

    Args:
        dst: Path of the result file; missing parent directories are created
            and an existing file is overwritten.
        algo: Name of the algorithm that produced the results.
        kind: Embedding variant the run was performed on.
        D: Array of distances, one row per query.
        I: Array of neighbor ids, one row per query.
        buildtime: Index build time in seconds.
        querytime: Total query time in seconds.
        params: String describing the index and query parameters.
        size: Dataset-size token of the run.
    """
    os.makedirs(Path(dst).parent, exist_ok=True)
    # The context manager flushes and closes the file even if a write fails.
    with h5py.File(dst, "w") as f:
        f.attrs["algo"] = algo
        f.attrs["data"] = kind
        f.attrs["buildtime"] = buildtime
        f.attrs["querytime"] = querytime
        f.attrs["size"] = size
        f.attrs["params"] = params
        f.create_dataset("knns", data=I)
        f.create_dataset("dists", data=D)

In [None]:
# NOTE(review): this expression only builds (and displays) a tuple. Its values
# are in the same order as the parameters of store_results(dst, algo, kind, D,
# I, buildtime, querytime, params, size), so it was presumably meant to be the
# argument list of a call to that function — confirm and wrap it in the call.
os.path.join("result/", kind, size, f"{identifier}.h5"), "faissIVF", kind, D, I, elapsed_build, elapsed_search, identifier, size