In [3]:
import faiss
import torch
import os
import pickle
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Paths
# Directory holding the embeddings.N.pt / passages.N.pt shards.
wiki_path = '/home/tkolb/data/indices/atlas/wiki/base'
# Output file for the assembled FAISS index.
index_path = '/home/tkolb/data/faiss_index.index'
# Output file for the merged passages pickle (plain string: no placeholders, so no f-prefix).
passages_path = '/home/tkolb/data/wiki_passages.pkl'

In [5]:
# Load the embedding shards in groups of `chunk_size` files (tensors are loaded onto the CPU)
def load_embeddings_in_chunks(path, chunk_size=4):
    """Yield the embedding shards found in ``path``, concatenated group by group.

    Files whose name starts with ``embeddings`` are visited in ascending order
    of the integer between the first and second dot of the file name
    (``embeddings.<N>.pt``). Each file name is printed as it is loaded.

    Args:
        path: Directory containing the shard files.
        chunk_size: Number of shards concatenated into each yielded tensor.

    Yields:
        A tensor made of up to ``chunk_size`` shards joined along dim 1. The
        last tensor may contain fewer shards when the number of files is not
        a multiple of ``chunk_size``.

    Note:
        Assumes every shard is a 2-D tensor whose dim 1 indexes the vectors
        (the index-building caller swaps the axes afterwards) -- TODO confirm
        against the shard writer.
    """
    shard_names = sorted(
        (f for f in os.listdir(path) if f.startswith('embeddings')),
        key=lambda f: int(f.split('.')[1]),
    )
    pending = []
    for filename in shard_names:
        print(filename)
        data = torch.load(os.path.join(path, filename), map_location='cpu')
        pending.append(data)
        if len(pending) == chunk_size:
            yield torch.cat(pending, dim=1)
            pending = []
    if pending:
        # The leftover shards must be joined along the same axis as the full
        # groups; joining along dim 0 would stack them as extra rows instead.
        yield torch.cat(pending, dim=1)

In [4]:
# Sanity check: pull only the first chunk from the loader and print its layout.
first_chunk = next(iter(load_embeddings_in_chunks(wiki_path)), None)
if first_chunk is not None:
    embedding = first_chunk
    embeddings_np = embedding.numpy()
    chunk_shape = embeddings_np.shape
    print(chunk_shape)
    print(chunk_shape[1])
    print(embeddings_np[0].shape)

embeddings.0.pt


KeyboardInterrupt: 

In [6]:
# Build FAISS index
def build_faiss_index_incrementally(vectors_path, index_path, chunk_size=4, num_files=28):
    """Build a flat L2 FAISS index from the embedding shards and save it.

    Chunks are pulled from ``load_embeddings_in_chunks``; each one has its two
    axes swapped, is converted to a C-contiguous float32 array and is added to
    a ``faiss.IndexFlatL2`` whose dimension is taken from the second axis of
    the first converted chunk.

    Args:
        vectors_path: Directory containing the embedding shards.
        index_path: File the finished index is written to.
        chunk_size: Number of shard files loaded per chunk.
        num_files: Stop once at least this many shard files have been
            consumed. Shards are read ``chunk_size`` at a time, so when
            ``num_files`` is not a multiple of ``chunk_size`` the loop stops
            at the next multiple rather than running through every shard.

    Returns:
        The populated index, or ``None`` when no chunk was produced (in which
        case nothing is written to ``index_path``).
    """
    index = None

    for i, embeddings in enumerate(load_embeddings_in_chunks(vectors_path, chunk_size)):
        # Swap axes so the array's second axis is the vector dimension, then
        # convert dtype and layout in a single copy.
        vectors = np.ascontiguousarray(embeddings.swapaxes(0, 1).numpy(), dtype=np.float32)
        if index is None:
            index = faiss.IndexFlatL2(vectors.shape[1])
        index.add(vectors)

        # ">=" rather than "==": an exact match is never reached when
        # num_files is not a multiple of chunk_size.
        if (i + 1) * chunk_size >= num_files:
            break

    # Persist once at the end; rewriting and re-reading the growing index
    # after every chunk repeats the same I/O without changing the result.
    if index is not None:
        faiss.write_index(index, index_path)

    return index

In [7]:
# Build the index from the shard directory and persist it to index_path.
index = build_faiss_index_incrementally(
    vectors_path=wiki_path,
    index_path=index_path,
)

embeddings.0.pt
embeddings.1.pt
embeddings.2.pt
embeddings.3.pt
embeddings.4.pt
embeddings.5.pt
embeddings.6.pt
embeddings.7.pt
embeddings.8.pt
embeddings.9.pt
embeddings.10.pt
embeddings.11.pt
embeddings.12.pt
embeddings.13.pt
embeddings.14.pt
embeddings.15.pt
embeddings.16.pt
embeddings.17.pt
embeddings.18.pt
embeddings.19.pt
embeddings.20.pt
embeddings.21.pt
embeddings.22.pt
embeddings.23.pt
embeddings.24.pt
embeddings.25.pt
embeddings.26.pt
embeddings.27.pt


In [8]:
# Load index to test: read the index file at index_path back from disk,
# rebinding `index` to the object that was loaded from the file.
index = faiss.read_index(index_path)

In [9]:
# Show the index's `d` and `ntotal` attributes as a pair.
(index.d, index.ntotal)

(768, 7030352)

In [14]:
# Build complete wiki passages file from shards
def build_passages(wiki_path, passages_path, num_files=28):
    """Merge the pickled passage shards in ``wiki_path`` into one pickle file.

    Files whose name starts with ``passages`` are visited in ascending order
    of the integer between the first and second dot of the file name
    (``passages.<N>.pt``). Each file name is printed as it is loaded, and the
    unpickled contents of every selected shard are appended, in order, to a
    single list that is then pickled to ``passages_path``.

    Args:
        wiki_path: Directory containing the passage shards.
        passages_path: Destination file for the merged list.
        num_files: Maximum number of shards to merge, counted from the lowest
            shard number. ``None`` merges every shard; ``0`` merges none.

    Raises:
        ValueError: If ``num_files`` is negative.
    """
    if num_files is not None and num_files < 0:
        raise ValueError(f"num_files must be non-negative or None, got {num_files}")

    passages_files = sorted(
        (f for f in os.listdir(wiki_path) if f.startswith('passages')),
        key=lambda f: int(f.split('.')[1]),
    )
    # Select the shards up front so a limit of 0 really means "no shards".
    if num_files is not None:
        passages_files = passages_files[:num_files]

    passages_list = []
    for filename in passages_files:
        print(filename)
        # NOTE(security): pickle.load can execute arbitrary code while
        # deserialising; only run this on shards from a trusted source.
        with open(os.path.join(wiki_path, filename), "rb") as fobj:
            passages_list.extend(pickle.load(fobj))

    with open(passages_path, 'wb') as f:
        pickle.dump(passages_list, f)

In [15]:
# Merge the passage shards into the single pickle at passages_path.
build_passages(
    wiki_path=wiki_path,
    passages_path=passages_path,
)

passages.0.pt
passages.1.pt
passages.2.pt
passages.3.pt
passages.4.pt
passages.5.pt
passages.6.pt
passages.7.pt
passages.8.pt
passages.9.pt
passages.10.pt
passages.11.pt
passages.12.pt
passages.13.pt
passages.14.pt
passages.15.pt
passages.16.pt
passages.17.pt
passages.18.pt
passages.19.pt
passages.20.pt
passages.21.pt
passages.22.pt
passages.23.pt
passages.24.pt
passages.25.pt
passages.26.pt
passages.27.pt


In [16]:
# Load passages to test
with open(passages_path, 'rb') as f:
    passages = pickle.load(f)
# Display only the first few entries; rendering the whole list floods the
# notebook output.
passages[:5]

[{'id': '0',
  'title': 'Mydaeinae',
  'section': '',
  'text': ' The Mydaeinae are a subfamily of true flies, belonging to the family Muscidae.'},
 {'id': '32',
  'title': 'César Obando',
  'section': '',
  'text': ' César Augusto Obando Villeda, nicknamed El Nene, (born 26 October 1969) is a retired Honduran football player who is considered one of the best players ever in Honduras but a serious injury cut his career early.'},
 {'id': '64',
  'title': 'Maro Engel: Complete Formula Three Euro Series results',
  'section': 'Complete Formula Three Euro Series results',
  'text': ' (key) (Races in bold indicate pole position) (Races in italics indicate fastest lap) † Driver did not finish the race, but was classified as he completed over 90% of the race distance.'},
 {'id': '96',
  'title': 'History of the Oakland Athletics: The emergence of a powerhouse (1968&ndash;1970)',
  'section': 'The emergence of a powerhouse (1968&ndash;1970)',
  'text': ' The Athletics\' Oakland tenure opened w

In [17]:
# The merged passages are presumably looked up by FAISS row id, so their count
# must equal the number of vectors in the index -- TODO confirm the row/passage
# pairing against the retrieval code.
assert len(passages) == index.ntotal, "passages and FAISS index are out of sync"
len(passages)

7030352