In [None]:
import faiss 
import numpy as np
import torch
import pandas as pd

In [None]:
# Mount Google Drive inside the Colab VM so the files stored there become
# reachable under /content/gdrive/ (this cell only works on Google Colab).
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [None]:
# Sanity check: list the current directory to confirm the `gdrive` mount point is there.
%ls

[0m[01;34mgdrive[0m/  [01;34msample_data[0m/


In [None]:
# Change into the Drive folder so the relative file names used by the loading cell resolve.
# NOTE(review): the path is relative to the current directory, so re-running this cell
# after it has succeeded will most likely fail. The backslash before "Notebooks" also
# looks misplaced (an escaped space would be `Colab\ Notebooks`), and the recorded
# output ends in "Colab Notebooks" rather than "Modal" — confirm the intended folder.
%cd gdrive/MyDrive/Colab \Notebooks/Modal/

/content/gdrive/MyDrive/Colab Notebooks


In [None]:
# Load the precomputed image embeddings and the matching `target` tensor
# (used by `accuracy` as the per-image label) from the current directory.
# map_location='cpu' keeps the load working on runtimes without a GPU and
# guarantees that the `.numpy()` conversions in later cells succeed even if
# the tensors were saved from a CUDA device.
# NOTE: torch.load unpickles the file — only load files from a trusted source.
images_emd = torch.load('images_emd.pt', map_location='cpu')
target = torch.load('target.pt', map_location='cpu')

## Accuracy method

In [None]:
def accuracy(imgs_emds, target, index, k=10):
    """Score each query by how many of its k nearest neighbours share its target.

    Args:
        imgs_emds: 2-D tensor of query embeddings, one row per query.
        target: tensor whose i-th entry is the target of query i and of the
            vector stored under id i in ``index``.
        index: object exposing ``search(queries, k) -> (distances, ids)``
            (e.g. a FAISS index).
        k: number of neighbours requested per query.

    Returns:
        A list with one float per query: the number of retrieved neighbours
        whose target equals the query's target, divided by ``k``. Neighbour
        slots the index could not fill (id ``-1``) count as misses.

    Note:
        When the queries are also stored in the index, a query can retrieve
        itself, and that self-match counts as a hit.
    """
    # FAISS expects a C-contiguous float32 matrix that lives on the CPU.
    queries = np.ascontiguousarray(imgs_emds.detach().cpu().numpy(), dtype=np.float32)
    if len(queries) == 0:
        return []

    # One batched search instead of one call per query.
    _, neighbour_ids = index.search(queries, k)
    # Keep the ids as int64; routing them through a float tensor is lossy for large ids.
    neighbour_ids = torch.from_numpy(np.asarray(neighbour_ids, dtype=np.int64))

    acc = []
    for i, row in enumerate(neighbour_ids):
        # FAISS pads with -1 when fewer than k neighbours are found; such ids
        # would make index_select raise, so drop them (they stay in the /k).
        found = row[row >= 0]
        neighbour_targets = torch.index_select(target, 0, found)
        acc.append(((neighbour_targets == target[i]).sum() / k).item())

    return acc

In [None]:
# Embedding dimension, read from the loaded tensor so the FAISS indexes built
# below always match the data (2048 for the embeddings used here).
d = images_emd.shape[1]
# Neighbourhood size.
# NOTE(review): the accuracy(...) calls in this notebook do not pass `k`, so they
# run with accuracy()'s own default (k=10) — confirm which value is intended.
k = 20

## FAISS FLAT

In [None]:
# Baseline: flat inner-product index built over every embedding.
# NOTE(review): inner product only matches cosine similarity when the vectors are
# L2-normalised — confirm the embeddings are normalised if cosine ranking is intended.
index = faiss.IndexFlatIP(d)
index.add(images_emd.detach().numpy())


In [None]:
# Mean of the per-query scores for the flat index (accuracy() runs with its default k).
np.mean(accuracy(images_emd, target, index))

0.13555555797906385

## Inverted File Index

In [None]:
nlist = 48  # number of cells/clusters to partition data into

# The quantizer only decides which cell a vector falls into.
quantizer = faiss.IndexFlatIP(d)
# Pass the metric explicitly: without it IndexIVFFlat ranks candidates by L2,
# which disagrees with the inner-product quantizer above and with the flat
# inner-product baseline.
index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)
index.train(images_emd.detach().numpy())  # we must train the index to cluster into cells
index.add(images_emd.detach().numpy())

index.nprobe = 15  # set how many of nearest cells to search


In [None]:
# Mean of the per-query scores for the IVF index (accuracy() runs with its default k).
np.mean(accuracy(images_emd, target, index))

0.1694444474350247

## HNSW Implementation

In [None]:
# set HNSW index parameters
M = 15  # number of neighbour links kept per vertex in the graph
ef_search = 32  # size of the candidate list explored at query time (higher = better recall, slower)
ef_construction = 64  # size of the candidate list explored while inserting vectors

# initialize index with the embedding dimension d
# NOTE(review): no metric is passed here, so FAISS's default applies (L2 per the FAISS
# docs), unlike the inner-product flat baseline — confirm this is intended.
index = faiss.IndexHNSWFlat(d, M)
# set efConstruction and efSearch parameters (efConstruction must be set before adding data)
index.hnsw.efConstruction = ef_construction
index.hnsw.efSearch = ef_search
# add data to index
index.add(images_emd.detach().numpy())

In [None]:
# Mean of the per-query scores for the HNSW index (accuracy() runs with its default k).
np.mean(accuracy(images_emd, target, index))

0.1686111141099698

## Locality Sensitive Hashing

In [None]:
nbits = d*4  # length of the binary code per vector (4 bits per embedding dimension)
# initialize the LSH index with that code length and add all embeddings
index = faiss.IndexLSH(d, nbits)
index.add(images_emd.detach().numpy())

In [None]:
# Mean of the per-query scores for the LSH index (accuracy() runs with its default k).
np.mean(accuracy(images_emd, target, index))

0.1741666696448293