<a href="https://colab.research.google.com/github/Titilegend/Script-storing-text-docs-retrieving-by-semantic-similarity/blob/main/Script_storing_text_docs_%26_retrieving_by_semantic_similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install faiss-cpu openai numpy


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m48.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [7]:
from google.colab import userdata
from openai import OpenAI

# Fetch the API key through Colab's `userdata` helper instead of hardcoding it.
openai_key = userdata.get('OPENAI_API_KEY')

# Only build the shared client when a non-empty key came back; otherwise stop
# right here so later cells never run without credentials.
if openai_key:
    client = OpenAI(api_key=openai_key)
    print("OpenAI client ready ✅")
else:
    raise ValueError("OPENAI_API_KEY not found")

OpenAI client ready ✅


In [None]:
import numpy as np
import faiss

# Demo corpus: ten short sentences, several about vector search and several on
# unrelated topics. A sentence's position in this list is the id that `search`
# uses (`docs[i]`) to turn an index hit back into text, so the order must stay
# in sync with the vectors added to the index.
docs = [
    "Vector databases store embeddings for semantic search.",
    "FAISS enables fast similarity search on large datasets.",
    "Embeddings convert text into numerical meaning representations.",
    "Pinecone is a managed vector database service.",
    "Python pandas is useful for data analysis.",
    "Machine learning models learn patterns from data.",
    "Flutter is used for cross-platform mobile development.",
    "Semantic search finds results by meaning not keywords.",
    "Software engineering interviews test coding skills.",
    "Nigeria has a growing tech startup ecosystem."
]

def embed(texts, model="text-embedding-3-small"):
    """Embed text with the OpenAI embeddings endpoint.

    Args:
        texts: Value forwarded unchanged as ``input=`` to
            ``client.embeddings.create`` (a list of strings in this notebook).
        model: Name of the embedding model to request. Defaults to
            ``"text-embedding-3-small"``, the model this notebook was written
            against. Use the same model for the indexed corpus and for
            queries.

    Returns:
        A ``float32`` NumPy array holding the ``embedding`` of every item in
        the response, one row per item.
    """
    res = client.embeddings.create(model=model, input=texts)
    # float32 is the dtype the FAISS calls downstream are given.
    return np.array([item.embedding for item in res.data], dtype="float32")

# One embedding row per entry of `docs`, in the same order.
vectors = embed(docs)

# Inner-product index whose width equals the embedding size.
dim = vectors.shape[1]
index = faiss.IndexFlatIP(dim)

# L2-normalise the rows in place before storing them, so the inner product
# computed by the index is the cosine similarity.
faiss.normalize_L2(vectors)
index.add(vectors)

def search(query, k=3):
    """Return the stored documents most similar to ``query``.

    Args:
        query: Text to look up; it is embedded with ``embed`` and
            L2-normalised the same way as the indexed vectors.
        k: Maximum number of hits wanted. Values larger than the number of
            indexed vectors are clamped; values <= 0 yield an empty list.

    Returns:
        A list of ``(document_text, score)`` tuples in the order FAISS
        returned them, with ``score`` converted to a Python ``float``.
    """
    # Asking FAISS for more neighbours than it holds pads the result with
    # id -1, which would silently index `docs[-1]`. Never ask for more than
    # exist, and skip the round-trip entirely when nothing can be returned.
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    q_vec = embed([query])
    faiss.normalize_L2(q_vec)
    scores, ids = index.search(q_vec, k)

    # Drop any remaining "no neighbour" slots (negative ids) defensively.
    return [
        (docs[doc_id], float(score))
        for doc_id, score in zip(ids[0], scores[0])
        if doc_id >= 0
    ]

query = "How do vector databases help semantic search?"
results = search(query)

# Echo the query, then list each hit with a 1-based rank, in the order
# `search` returned them.
print("Query:", query)
print("\nTop results:")
for i, hit in enumerate(results, start=1):
    text, score = hit
    print(f"{i}. score={score:.4f} | {text}")