### Script for storing text documents and retrieving them by semantic similarity — indexes a small dataset and demonstrates top-k results

#### Install dependencies:
!pip install sentence-transformers faiss-cpu numpy

In [None]:
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

In [None]:
# -------------------------
# 1. Load embedding model
# -------------------------
# One SentenceTransformer instance is shared by the whole script: it encodes
# the documents for indexing and each query at search time, so both end up
# in the same embedding space.
model = SentenceTransformer("all-MiniLM-L6-v2")


In [None]:
# -------------------------
# 2. Prepare documents
# -------------------------
# The corpus to index. Order matters: semantic_search maps each index hit
# back to its text with documents[i], so a document's position in this list
# must match the row it occupies in the embedding matrix.
documents = [
    "FastAPI is a modern web framework for building APIs with Python.",
    "Django is a high-level Python web framework.",
    "FAISS is a library for efficient similarity search.",
    "Transformers provide state-of-the-art natural language processing models.",
    "Neural networks are used in deep learning."
]


In [None]:
# -------------------------
# 3. Generate embeddings
# -------------------------
# Encode the whole corpus in a single call and coerce the result to float32,
# which is the dtype FAISS requires.
doc_embeddings = np.asarray(model.encode(documents), dtype="float32")

In [None]:
# -------------------------
# 4. Create FAISS index
# -------------------------
# Size the index to the embedding width (second axis of doc_embeddings).
dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)

# Embeddings are added in the same order as `documents`; semantic_search
# relies on this when it looks up documents[i] for each returned index.
index.add(doc_embeddings)

print(f"Indexed {index.ntotal} documents.")


In [None]:
# -------------------------
# 5. Semantic Search
# -------------------------
def semantic_search(query, top_k=2):
    """Return the indexed documents most similar to ``query``.

    The query is embedded with the module-level ``model``, searched against
    the module-level FAISS ``index``, and every hit is mapped back to its
    text through ``documents``.

    Args:
        query: Text to search for.
        top_k: Maximum number of documents to return.

    Returns:
        A list of at most ``top_k`` document strings, in the order the index
        search returned them. The list is shorter than ``top_k`` when the
        index holds fewer vectors than requested.
    """
    # FAISS expects a 2-D float32 array: one row per query vector.
    query_embedding = np.asarray(model.encode([query]), dtype="float32")

    _distances, indices = index.search(query_embedding, top_k)

    # When fewer than top_k vectors are indexed, FAISS pads the labels with
    # -1. documents[-1] would silently return the *last* document as a bogus
    # hit, so negative labels are dropped instead of being looked up.
    return [documents[i] for i in indices[0] if i >= 0]


In [None]:
# -------------------------
# 6. Test Query
# -------------------------
# Run one example search and print the three best-matching documents.
query = "How do I build a Python API?"
results = semantic_search(query, top_k=3)

print("\nQuery:", query)
print("\nTop Results:")
for match in results:
    print("-", match)
