In [None]:
# Install dependencies
!pip install -q faiss-cpu sentence-transformers pytrec_eval torch tqdm

import faiss
import numpy as np
import json
import torch
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

In [None]:
# Load preprocessed corpus
PROCESSED_CORPUS_PATH = "/kaggle/input/preprocessed-corpus/preprocessed_corpus.jsonl"

def load_corpus(file_path):
    """Load the preprocessed corpus from a JSONL file.

    Every non-blank line is parsed as one JSON object that must provide the
    keys "doc_id" and "text". Blank or whitespace-only lines (for example a
    trailing newline at the end of the file) are skipped instead of being
    handed to the JSON parser.

    Args:
        file_path: Path to the UTF-8 encoded JSONL file.

    Returns:
        A tuple ``(corpus, doc_ids, doc_texts)`` of three parallel lists in
        file order: the parsed records, their "doc_id" values and their
        "text" values.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks "doc_id" or "text".
    """
    corpus = []
    doc_ids = []
    doc_texts = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # json.loads("") raises, so ignore empty / whitespace-only lines.
            if not line.strip():
                continue
            doc = json.loads(line)
            # Read both keys before appending so the three lists always stay
            # the same length, even if a record turns out to be malformed.
            doc_id = doc["doc_id"]
            text = doc["text"]
            corpus.append(doc)
            doc_ids.append(doc_id)
            doc_texts.append(text)

    return corpus, doc_ids, doc_texts

In [None]:
# Load the corpus: `documents` receives the parsed JSON records, while `doc_ids`
# and `doc_texts` receive each record's "doc_id" and "text" values in file order.
documents, doc_ids, doc_texts = load_corpus(PROCESSED_CORPUS_PATH)


In [None]:
# Load Sentence Transformer model.
# Use the GPU when PyTorch reports CUDA as available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2",
    device=device,
)

In [None]:
# Generate embeddings: choose the batch size from CUDA availability
# (32 with a GPU, 16 without).
if torch.cuda.is_available():
    batch_size = 32
else:
    batch_size = 16

# Start with an empty container for the document embeddings.
doc_embeddings = []
print("Encoding documents...")

In [None]:
# Encode the corpus batch by batch.
#
# The per-batch results are collected in a cell-local list and `doc_embeddings`
# is assigned exactly once at the end. This keeps the cell re-runnable: it no
# longer calls `.append` on `doc_embeddings`, which is an ndarray (without
# `.append`) after the first run.
if not doc_texts:
    # np.vstack([]) would raise an opaque ValueError; fail with a clear message.
    raise ValueError("No documents to encode: doc_texts is empty.")

embedding_batches = []
for start in tqdm(range(0, len(doc_texts), batch_size)):
    batch = doc_texts[start:start + batch_size]
    # convert_to_tensor=False: the result is expected to be a NumPy array
    # (sentence-transformers default) rather than a torch tensor.
    embedding_batches.append(model.encode(batch, convert_to_tensor=False))

# Stack the per-batch arrays row-wise into one array covering the whole corpus.
doc_embeddings = np.vstack(embedding_batches)

In [None]:
# Build FAISS index.
# Set FAISS's OpenMP thread count to 4 before any index work is done.
faiss.omp_set_num_threads(4)

# The index dimensionality is the size of the second axis of the embedding matrix.
embedding_dim = doc_embeddings.shape[1]

# Flat L2 index holding every document embedding.
index = faiss.IndexFlatL2(embedding_dim)
index.add(doc_embeddings)

In [None]:
# Persist the FAISS index together with the document ids.
FAISS_INDEX_PATH = "/kaggle/working/faiss_index.bin"
DOC_IDS_PATH = "/kaggle/working/doc_ids.npy"

# Write the index first, then the ids as a NumPy array in the same order as
# the `doc_ids` list.
faiss.write_index(index, FAISS_INDEX_PATH)
np.save(DOC_IDS_PATH, np.asarray(doc_ids))

# Report both output locations only after both writes have succeeded.
print(f"FAISS index saved at: {FAISS_INDEX_PATH}")
print(f"Document IDs saved at: {DOC_IDS_PATH}")