In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2


In [2]:
%%capture --no-display
import sys

# from embedding import Embed,EmbedBERT

import numpy as np
import faiss
import pickle
import torch
import os
import json
from sentence_transformers import SentenceTransformer

In [3]:
# Display whether PyTorch reports an available CUDA device.
torch.cuda.is_available()

True

In [4]:

def create_index_compressed(embs, ids=None, pca_dim=64, nlist=16384, hnsw_m=32):
    """
    Create a compressed Faiss index using the provided embeddings and IDs.

    The index is built from the factory string
    ``"PCA{pca_dim},IVF{nlist}_HNSW{hnsw_m},Flat"`` (the defaults give
    ``"PCA64,IVF16384_HNSW32,Flat"``), gets a flat L2 clustering index that is
    spread over all GPUs, is copied to GPU 0, trained on ``embs`` and finally
    populated with ``embs``.

    :param embs: Embeddings to be indexed, one row per vector. The input is
        converted to a C-contiguous float32 array before use.
    :type embs: numpy array of shape (n, dim), or anything convertible to one
    :param ids: IDs corresponding to the embeddings, one per row. Converted to
        int64. When ``None`` the vectors are added without explicit IDs.
    :type ids: numpy array or None
    :param pca_dim: Output dimensionality of the PCA pre-transform; must not
        exceed the embedding dimensionality.
    :type pca_dim: int
    :param nlist: Number of inverted lists (the ``IVF`` part of the factory
        string).
    :type nlist: int
    :param hnsw_m: Value used in the ``HNSW`` part of the factory string.
    :type hnsw_m: int

    :return: The trained and populated index as returned by
        ``faiss.index_cpu_to_gpu``.
    :raises ValueError: If ``embs`` is not a non-empty 2-D array, if its
        dimensionality is smaller than ``pca_dim``, or if ``ids`` does not
        contain exactly one ID per embedding.
    """
    # Validate everything up front so that bad input fails with a clear
    # message instead of an opaque SWIG TypeError or a crash deep inside Faiss.
    embs = np.ascontiguousarray(embs, dtype=np.float32)
    if embs.ndim != 2 or embs.shape[0] == 0:
        raise ValueError(
            f"embs must be a non-empty 2-D array, got shape {embs.shape}"
        )
    emb_size = embs.shape[1]
    if emb_size < pca_dim:
        raise ValueError(
            f"embedding dimension ({emb_size}) is smaller than pca_dim ({pca_dim})"
        )
    if ids is not None:
        ids = np.ascontiguousarray(ids, dtype=np.int64)
        if ids.shape != (embs.shape[0],):
            raise ValueError(
                f"ids must be 1-D with {embs.shape[0]} entries, got shape {ids.shape}"
            )

    print("Loading embeddings..")

    ngpus = faiss.get_num_gpus()
    print("Number of GPUs:", ngpus)
    print("Creating index..")
    res = faiss.StandardGpuResources()  # Use standard GPU resources
    index2 = faiss.index_factory(
        emb_size, f"PCA{pca_dim},IVF{nlist}_HNSW{hnsw_m},Flat"
    )

    # Attach a multi-GPU flat index for clustering. Its dimensionality has to
    # equal the PCA output size, so it is derived from pca_dim rather than
    # being repeated as a second literal.
    index_ivf = faiss.extract_index_ivf(index2)
    clustering_index = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(pca_dim))
    index_ivf.clustering_index = clustering_index

    # Convert the index to GPU
    # NOTE(review): the clustering index above is attached to the CPU IVF
    # index, while train() below runs on the GPU copy -- confirm the GPU copy
    # actually uses it. Also confirm callers that persist the result convert
    # it back with faiss.index_gpu_to_cpu before calling faiss.write_index.
    index_gpu = faiss.index_cpu_to_gpu(res, 0, index2)

    print("Training..")
    index_gpu.train(embs)
    print("Adding embeddings to index..")
    if ids is not None:
        index_gpu.add_with_ids(embs, ids)
    else:
        index_gpu.add(embs)
    return index_gpu


In [5]:

# Example usage: index random vectors, then look up the neighbours of a random query.
num_embeddings = 30000
embedding_dim = 128

# Seeded generator so that a fresh "Restart & Run All" reproduces the same data.
rng = np.random.default_rng(0)

# NOTE(review): create_index_compressed builds an IVF16384 index; 30000 training
# vectors is fewer than two per inverted list, which is likely too few for
# meaningful clustering -- consider a larger sample.
embs = rng.random((num_embeddings, embedding_dim), dtype=np.float32)
ids = np.arange(num_embeddings, dtype=np.int64)  # Example IDs: 0 to num_embeddings - 1

# Create Faiss index
index = create_index_compressed(embs, ids)

# Test the index by searching for the nearest neighbors of a random query
query = rng.random((1, embedding_dim), dtype=np.float32)
k = 5  # number of nearest neighbors to retrieve
distances, indices = index.search(query, k)

print("Nearest neighbors for the query:")
print(indices)


Loading embeddings..
Number of GPUs: 4
Creating index..
Training..


: 

: 

For ncbitaxon we created a compressed Faiss index, because the ontology is very large.

In [20]:
from faiss_utils.faiss_indexes import (create_index_flat, create_index_HNSW,
                            create_index_compressed,create_index_IVF)


In [21]:
# Directory the embeddings JSON is read from and the Faiss index file is written to.
DATA_DIR = '../data/pubmed_small_graph/'   

In [22]:
# Parse the JSON file and keep only the 'embedding' entry of every record.
with open(DATA_DIR + 'sentence_bert_embs_graph.json') as ff:
    embeddings = [record['embedding'] for record in json.load(ff)]


In [23]:
# Convert the list of embeddings to a float32 array. The explicit dtype
# presumably addresses this error seen earlier:
#   TypeError: in method 'IndexPreTransform_train', argument 3 of type 'float const *'
embeddings = np.array(embeddings, dtype=np.float32)
d = embeddings.shape[1]  # size of the second axis, i.e. values per embedding


In [27]:

# Build the index with the create_index_flat helper imported from
# faiss_utils.faiss_indexes (its implementation is not shown in this notebook).
index = create_index_flat(embeddings, d)


Loading embeddings..


In [28]:
# Write the index to 'faiss_index_flat.bin' inside DATA_DIR.
faiss.write_index(index, os.path.join(DATA_DIR,'faiss_index_flat.bin'))
