In [3]:
import pickle
import faiss
import torch
import os
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Paths
# Index of the embeddings shard handled by this notebook.
shard_id = 0
# Single root for every artefact, so relocating the data needs one edit only.
data_dir = '/home/tkolb/data'
# Directory containing the embeddings.<n>.pt and passages.<n>.* shard files.
wiki_path = f'{data_dir}/indices/atlas/wiki/base'
# Embeddings file of the selected shard (input of build_faiss_index).
shard_path = f'{wiki_path}/embeddings.{shard_id}.pt'
# Where the faiss index of the selected shard is written to / read from.
index_path = f'{data_dir}/faiss_index_shard_{shard_id}.index'
# Pickle file holding the passages list.
passages_path = f'{data_dir}/wiki_passages.pkl'

In [43]:
# Build faiss index for one shard
def build_faiss_index(shard_path, index_path):
    """Build a flat L2 faiss index from one embeddings shard and write it to disk.

    Args:
        shard_path: Path of the embeddings file; it is read with ``torch.load``
            onto the CPU and must contain a 2-D tensor.
        index_path: Destination path passed to ``faiss.write_index``.

    Returns:
        None. The index is only persisted to ``index_path``.
    """
    # NOTE: torch.load unpickles the file -- only point it at trusted shards.
    embeddings = torch.load(shard_path, map_location='cpu')
    # Swap the two axes so that axis 1 provides the index dimensionality
    # (presumably the shard is stored as (dim, n_vectors) -- TODO confirm).
    embeddings = embeddings.swapaxes(0, 1)
    # A single conversion produces the C-contiguous float32 array that is
    # handed to faiss; chaining ascontiguousarray(...).astype(...) would copy
    # the whole matrix twice because astype copies by default.
    embeddings_np = np.ascontiguousarray(embeddings.numpy(), dtype=np.float32)

    index = faiss.IndexFlatL2(embeddings_np.shape[1])
    index.add(embeddings_np)

    faiss.write_index(index, index_path)

In [44]:
# Build the index for the configured shard and persist it at index_path.
build_faiss_index(shard_path=shard_path, index_path=index_path)

In [45]:
# Load the index stored at index_path back from disk.
index = faiss.read_index(index_path)

In [46]:
# Sanity check: show the index's `d` and `ntotal` attributes
# (vector dimensionality and number of stored vectors).
index.d, index.ntotal

(768, 251084)

In [24]:
# Scan the wiki passage shards and report the largest numeric passage id
def build_passages(wiki_path, passages_path, max_files=12, initial_max=251084):
    """Scan passage shards under ``wiki_path`` and print the largest numeric id.

    Despite its name, this function does not write anything yet.

    TODO: collect title/section/text per passage id and pickle the combined
    result to ``passages_path``. The earlier draft of the save step dumped
    only the passages of the last shard that had been read.

    Args:
        wiki_path: Directory containing pickled shard files whose names start
            with ``passages`` and carry the shard number as second
            dot-separated component (e.g. ``passages.3.pt``).
        passages_path: Intended output pickle path; currently unused.
        max_files: Number of shards to read, in ascending shard-number order.
            ``None`` reads every shard. The default of 12 keeps the
            previously hard-coded cut-off.
        initial_max: Value the running maximum starts from, so the printed
            result is never smaller than this. The default keeps the
            previously hard-coded 251084.

    Returns:
        None. Every processed shard filename and finally the maximum id are
        printed to stdout.

    Raises:
        ValueError: A ``passages*`` filename has a non-integer second
            dot-separated component.
        IndexError: A ``passages*`` filename contains no dot at all.
    """
    shard_files = sorted(
        (name for name in os.listdir(wiki_path) if name.startswith('passages')),
        key=lambda name: int(name.split('.')[1]),
    )
    if max_files is not None:
        shard_files = shard_files[:max_files]

    max_nr = initial_max
    for filename in shard_files:
        print(filename)
        # NOTE: pickle.load can execute arbitrary code -- trusted shards only.
        with open(os.path.join(wiki_path, filename), "rb") as fobj:
            passages = pickle.load(fobj)
        # Ids that are not purely numeric are ignored.
        numeric_ids = (int(p['id']) for p in passages if p['id'].isdigit())
        max_nr = max(max_nr, max(numeric_ids, default=max_nr))
    print(max_nr)

In [25]:
# Run the passage-shard scan over the wiki directory.
build_passages(wiki_path=wiki_path, passages_path=passages_path)

passages.0.pt
passages.1.pt
passages.2.pt
passages.3.pt
passages.4.pt
passages.5.pt
passages.6.pt
passages.7.pt
passages.8.pt
passages.9.pt
passages.10.pt
passages.11.pt
28446146


In [14]:
# Reload the pickled passages stored at passages_path and display them.
# NOTE: pickle.load can execute arbitrary code -- only open trusted files.
# NOTE(review): the last line renders the entire list; a slice such as
# psg_list[:5] would keep the saved notebook output small.
with open(passages_path, 'rb') as f:
    psg_list = pickle.load(f)
psg_list

[{'id': '0',
  'title': 'Mydaeinae',
  'section': '',
  'text': ' The Mydaeinae are a subfamily of true flies, belonging to the family Muscidae.'},
 {'id': '1',
  'title': 'Mydaeinae: Genera',
  'section': 'Genera',
  'text': 'Brontaea Kowarz, 1873 ; Graphomya Robineau-Desvoidy, 1830 ; Hebecnema Schnabl, 1889 ; Hemichlora Van der Wulp, 1896 ; Mydaea Robineau-Desvoidy, 1830 ; Myospila Róndani, 1856 ; Scenetes Malloch, 1936 ; Scutellomusca Townsend, 1931 '},
 {'id': '2',
  'title': 'The MegaMix',
  'section': '',
  'text': ' The MegaMix is Lil Suzy\'s first compilation album launched on 30 November 1999 by the Metropolitan Recording Corporation. Includes several mixes of his hits as well as re-recorded songs in a non-stop mix. The song "You\'re the Only One" was released as a single to promote the album. On 5 October 2000, the album was released in Germany under the name of "Best of ... (Non-stop Mixed) and was released the same year with an identical track listing.'},
 {'id': '32',
  't