### Store text documents and retrieve them by semantic similarity — index a small dataset and demonstrate top-k results

#### Install dependencies:
%pip install sentence-transformers faiss-cpu numpy

In [1]:
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# -------------------------
# 1. Load embedding model
# -------------------------
# Name of the sentence-transformers checkpoint used for both documents and queries.
MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)


In [8]:
# -------------------------
# 2. Prepare documents
# -------------------------
# Small demo corpus: one standalone sentence per entry.
# NOTE: every entry ends with a literal "[1.1]" marker. It is part of the
# string, so it is embedded together with the sentence and shows up verbatim
# in the search results.
scientific_facts = [
    "Stomach acid is strong enough to dissolve a single-edge razor blade [1.1].",
    "Babies are born with 300 bones, which fuse into 206 by adulthood [1.1].",
    "Bacterial cells outnumber human cells in the body by approximately 39 trillion to 30 trillion [1.1].",
    "A day on Venus is longer than its year, taking 243 Earth days to rotate once [1.1].",
    "Lightning bolts are five times hotter than the surface of the Sun [1.1].",
    "A single teaspoon of a neutron star would weigh roughly 6 billion tons on Earth [1.1].",
    "Light from the Sun takes about 8 minutes and 20 seconds to reach Earth [1.1].",
    "Saturn is less dense than water and would technically float in a large enough basin [1.1].",
    "Ocean-based organisms like plankton produce 50% to 80% of Earth's oxygen [1.1].",
    "Bananas contain potassium-40 and are slightly radioactive [1.1].",
    "Honey found in ancient Egyptian tombs remains edible after 3,000 years [1.1].",
    "Octopuses have three hearts and blue blood [1.1].",
    "It snows heavy metals like galena and bismuthinite on Venus [1.1].",
    "An average cumulus cloud weighs approximately 1.1 million pounds [1.1].",
    "Extreme atmospheric pressure on Jupiter and Saturn can cause diamond rain [1.1].",
    "A single teaspoon of healthy soil contains more microorganisms than people on Earth [1.1].",
    "The Eiffel Tower can grow up to 6 inches taller in summer due to thermal expansion [1.1]."
]

# `documents` is a second name for the same list object (not a copy); the
# later cells encode and look up text through this name.
documents = scientific_facts


In [9]:
# -------------------------
# 3. Generate embeddings
# -------------------------
# Encode every document and materialise the vectors as a float32 array in a
# single step (float32 is the dtype FAISS requires).
doc_embeddings = np.array(model.encode(documents), dtype="float32")

In [10]:
# -------------------------
# 4. Create FAISS index
# -------------------------
# Embedding width, taken from the second axis of the document matrix.
dimension = doc_embeddings.shape[1]
# Flat index compared by L2 distance. It is rebuilt from scratch on every run
# of this cell, so re-running does not add the documents twice.
index = faiss.IndexFlatL2(dimension)

# Vectors are added in `documents` order; the search cell relies on an index
# position i corresponding to documents[i].
index.add(doc_embeddings)

print(f"Indexed {index.ntotal} documents.")


Indexed 17 documents.


In [6]:
# -------------------------
# 5. Semantic Search
# -------------------------
def semantic_search(query, top_k=2):
    """Return the documents closest to ``query`` in embedding space.

    The query is encoded with the notebook-level ``model`` and looked up in
    the notebook-level FAISS ``index``; hits are mapped back to text through
    ``documents``, so those three globals must describe the same corpus.

    Args:
        query: Text to search for.
        top_k: Maximum number of documents to return.

    Returns:
        A list of at most ``top_k`` document strings, in the order the index
        returns them. Empty when ``top_k`` is not positive or the index holds
        no vectors.
    """
    # Never request more neighbours than the index holds: FAISS pads missing
    # results with the label -1, and documents[-1] would silently return the
    # last document as if it were a real hit.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []

    # FAISS expects a 2-D float32 batch, hence the single-element list.
    query_embedding = np.array(model.encode([query])).astype("float32")
    _, indices = index.search(query_embedding, k)

    # Second line of defence: drop any -1 "no result" labels that remain.
    return [documents[i] for i in indices[0] if i >= 0]


In [14]:
# -------------------------
# 6. Test Query
# -------------------------
# Run one example search and list the three best-matching documents.
query = "Tell me about planets"
results = semantic_search(query, top_k=3)

print(f"\nQuery: {query}")
print("\nTop Results:")
for doc in results:
    print(f"- {doc}")



Query: Tell me about planets

Top Results:
- A day on Venus is longer than its year, taking 243 Earth days to rotate once [1.1].
- It snows heavy metals like galena and bismuthinite on Venus [1.1].
- Extreme atmospheric pressure on Jupiter and Saturn can cause diamond rain [1.1].
