In [None]:
# ✅ --- Configuration & Imports ---
from langchain_community.retrievers import PineconeHybridSearchRetriever
from langchain_huggingface import HuggingFaceEmbeddings
from pinecone import Pinecone, ServerlessSpec
from pinecone_text.sparse import BM25Encoder
from dotenv import load_dotenv
import os
import time

# --- Load environment variables ---
# Runs python-dotenv's load_dotenv() with its default arguments so that the
# os.getenv() lookups in the cells below (PINECONE_API_KEY, HF_TOKEN) can be
# satisfied from a local .env file.
# NOTE(review): presumably the .env file sits next to the notebook — confirm.
load_dotenv()

In [None]:
# --- Read the Pinecone API key from the environment ---
# The key must come from the PINECONE_API_KEY environment variable (for example
# through the .env file picked up by load_dotenv()). There is deliberately no
# literal fallback: a key written into a notebook is exposed to everyone who can
# read the file or its history.
# NOTE: the key that used to be embedded in this cell should be treated as leaked
# and revoked in the Pinecone console.
api_key = os.getenv("PINECONE_API_KEY")
if not api_key:
    # Flag the misconfiguration in this cell's output instead of letting it show
    # up later as a less obvious client error.
    print("⚠️ PINECONE_API_KEY is not set. Add it to your .env file or environment and re-run this cell.")


In [None]:
# --- Initialize Pinecone client ---
# `Pinecone` is already imported in the notebook's import cell, so it is not
# imported a second time here.
# `pc` is the client handle the following cells use to list, create and open
# indexes; `index_name` is the name of the index this notebook works against.
pc = Pinecone(api_key=api_key)
index_name = "pinecone-hybrid-search"

In [None]:
# --- Create index if not exists ---
# Creates the serverless index on first run only; on later runs the existing
# index is reused, so the cell is safe to re-execute.
if index_name not in [idx.name for idx in pc.list_indexes()]:
    pc.create_index(
        name=index_name,
        dimension=384,  # must match embedding dimension
        metric="dotproduct",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    print("Creating index... waiting for readiness.")
    # Poll the reported status instead of sleeping for a fixed time: a fixed
    # delay is either too short (the first upsert fails) or needlessly long.
    # The deadline keeps "Run All" from hanging forever if provisioning stalls.
    ready_deadline = time.monotonic() + 120
    while not pc.describe_index(index_name).status["ready"]:
        if time.monotonic() > ready_deadline:
            raise TimeoutError(
                f"Pinecone index '{index_name}' was not ready after 120 seconds."
            )
        time.sleep(1)

# Handle used by the retriever for upserts and queries.
index = pc.Index(index_name)

In [None]:
# ✅ --- Embeddings & Sparse Encoder ---
# Re-export HF_TOKEN only when it actually has a value. os.getenv() returns None
# for an unset variable, and assigning None into os.environ raises TypeError,
# which would stop the notebook for anyone who has not configured a token.
# NOTE(review): presumably the embedding model loaded below can be fetched
# without a token — confirm.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    os.environ["HF_TOKEN"] = hf_token

In [None]:
# Load sentence-transformer model
# Wraps the "all-MiniLM-L6-v2" model as a LangChain embeddings object; it is
# passed to the hybrid retriever further down as its `embeddings` argument.
# NOTE(review): the index in this notebook is created with dimension=384, so
# this model's output size has to stay 384 — confirm before swapping models.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [None]:
# Initialize and train BM25 sparse encoder
# Start from an empty encoder: fit() derives the corpus statistics from
# `sentences`, so loading the pre-trained default parameters first
# (BM25Encoder().default()) only added a download whose values were replaced
# straight away.
bm25_encoder = BM25Encoder()

# Demo corpus; the same list is indexed by the retriever later in the notebook.
sentences = [
    "In 2023, I visited Paris",
    "In 2022, I visited New York",
    "In 2021, I visited New Orleans",
]
bm25_encoder.fit(sentences)

# Persist the fitted parameters so they can be reloaded without re-fitting.
bm25_encoder.dump("bm25_values.json")

In [None]:
# Reload encoder (optional step for demo)
# Rebinds `bm25_encoder` to the result of loading "bm25_values.json", the file
# written by the dump() call in the previous cell. No fit() call is made here;
# the parameters come from the file.
bm25_encoder = BM25Encoder().load("bm25_values.json")

In [None]:
# ✅ --- Create Hybrid Retriever ---
# Tie the three pieces built above into one LangChain retriever: the Pinecone
# index handle, the BM25 sparse encoder and the dense embedding model.
retriever = PineconeHybridSearchRetriever(
    index=index,
    sparse_encoder=bm25_encoder,
    embeddings=embeddings,
)


In [None]:
# Add text corpus
# Sends the demo `sentences` list (the same one the BM25 encoder was fitted on)
# through the retriever so the texts are stored in the Pinecone index.
# NOTE(review): re-running this cell presumably upserts the same records rather
# than duplicating them — confirm against the retriever's default id scheme.
retriever.add_texts(sentences)


In [None]:
# ✅ --- Query the retriever ---
# Run one natural-language question through the hybrid retriever.
query = "What city did I visit first?"
results = retriever.invoke(query)

# --- Print results ---
# Echo the question, then list each returned document's text, numbered from 1
# in the order the retriever returned them.
print("\n🔍 Query:", query)
for rank, hit in enumerate(results, start=1):
    print(f"{rank}. {hit.page_content}")