In [18]:
from langchain_pinecone import PineconeVectorStore
from langchain.embeddings import HuggingFaceEmbeddings
from pinecone import Pinecone, ServerlessSpec
import os
import re
from langchain_groq import ChatGroq
from dotenv import load_dotenv
load_dotenv()

True

In [4]:
# Initialize embedding model
embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

# Initialize Pinecone client (v4 style)
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create index if it doesn't exist
index_name = "coffee-products"
if index_name not in pc.list_indexes().names():
    pc.create_index(
        index_name,
        dimension=768,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
index = pc.Index(index_name)

vectorstore = PineconeVectorStore(
    index=index,  # Pinecone v4 index
    embedding=embedding_model,
    text_key="text",  # your metadata should have "text"
)
retriever = vectorstore.as_retriever(search_type="similarity", k=3)

In [5]:
def get_retriever(category_filter=None, top_k=10):
    search_kwargs = {"k": top_k}
    if category_filter:
        search_kwargs["filter"] = {"category": category_filter}

    retriever = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs=search_kwargs
    )
    return retriever

In [6]:
retriever = get_retriever(category_filter="coffee", top_k=10)

In [8]:
from sentence_transformers import CrossEncoder

# Load reranker model
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

In [11]:
def retrieve_and_rerank(query, category_filter=None, top_k_initial=10, top_k_final=3):
    # Step 1: Retrieve candidates
    retriever = get_retriever(category_filter=category_filter, top_k=top_k_initial)
    docs = retriever.invoke(query)   # <- new correct call

    if not docs:
        return []

    # Step 2: Rerank candidates
    pairs = [[query, doc.page_content] for doc in docs]
    scores = reranker.predict(pairs)

    # Step 3: Sort and select top K
    ranked_docs = [doc for _, doc in sorted(zip(scores, docs), reverse=True)]
    final_docs = ranked_docs[:top_k_final]

    return final_docs

In [14]:
query = "Show me the best coffees under 2 dollars"
category_filter = "Coffee"  # optional

# Retrieve + rerank
final_docs = retrieve_and_rerank(query, category_filter)

# Print results
for i, doc in enumerate(final_docs):
    print(f"\nResult #{i+1}")
    print(f"Score: N/A (sorted by reranker)")
    print(f"Name: {doc.metadata.get('name')}")
    print(f"Category: {doc.metadata.get('category')}")
    print(f"Price: {doc.metadata.get('price')}")
    print(f"Text: {doc.page_content}")


Result #1
Score: N/A (sorted by reranker)
Name: Espresso shot
Category: Coffee
Price: 2.0
Text: Espresso shot:
A bold shot of rich espresso, our espresso is crafted from the finest beans to deliver a robust flavor in every sip. Perfect for a quick pick-me-up, it can also serve as a base for your favorite coffee drinks.
Ingredients: ['Espresso']
Price: $2.0
Rating: 4.9

Result #2
Score: N/A (sorted by reranker)
Name: Cappuccino
Category: Coffee
Price: 4.5
Text: Cappuccino:
A rich and creamy cappuccino made with freshly brewed espresso, steamed milk, and a frothy milk cap. This delightful drink offers a perfect balance of bold coffee flavor and smooth milk, making it an ideal companion for relaxing mornings or lively conversations.
Ingredients: ['Espresso', 'Steamed Milk', 'Milk Foam']
Price: $4.5
Rating: 4.7

Result #3
Score: N/A (sorted by reranker)
Name: Latte
Category: Coffee
Price: 4.75
Text: Latte:
Smooth and creamy, our latte combines rich espresso with velvety steamed milk, crea

In [17]:
def build_filter(category=None, max_price=None, min_rating=None):
    filter_dict = {}

    if category:
        filter_dict["category"] = {"$eq": category}

    if max_price is not None:
        filter_dict["price"] = {"$lte": max_price}

    if min_rating is not None:
        filter_dict["rating"] = {"$gte": min_rating}

    return filter_dict if filter_dict else None

# Create retriever with optional filters
def get_retriever(category=None, max_price=None, min_rating=None, top_k=10):
    filter_dict = build_filter(category, max_price, min_rating)

    retriever = vectorstore.as_retriever(
        search_type="similarity",
        k=top_k,
        filter=filter_dict
    )
    return retriever

# Extract filters from natural language query using regex
def extract_filters_from_query(query):
    category = None
    max_price = None
    min_rating = None

    price_match = re.search(r"(under|below|less than)\s*\$?(\d+(\.\d+)?)", query, re.IGNORECASE)
    if price_match:
        max_price = float(price_match.group(2))

    rating_match = re.search(r"(rating|rated|score)\s*(above|over|at least|>=|greater than)\s*(\d+(\.\d+)?)", query, re.IGNORECASE)
    if rating_match:
        min_rating = float(rating_match.group(3))

    if "coffee" in query.lower():
        category = "Coffee"
    elif "tea" in query.lower():
        category = "Tea"

    return category, max_price, min_rating

# Example usage:
if __name__ == "__main__":
    user_query = "Show me coffees under $3 with rating above 4.5"

    category, max_price, min_rating = extract_filters_from_query(user_query)
    retriever = get_retriever(category, max_price, min_rating, top_k=5)
    
    # Use retriever.invoke or get_relevant_documents (or .invoke per new Langchain)
    docs = retriever.invoke(user_query)
    
    for i, doc in enumerate(docs):
        print(f"Result #{i+1}:")
        print(f"Name: {doc.metadata.get('name')}")
        print(f"Category: {doc.metadata.get('category')}")
        print(f"Price: ${doc.metadata.get('price')}")
        print(f"Rating: {doc.metadata.get('rating')}")
        print(f"Text: {doc.page_content}\n")

Result #1:
Name: Espresso shot
Category: Coffee
Price: $2.0
Rating: 4.9
Text: Espresso shot:
A bold shot of rich espresso, our espresso is crafted from the finest beans to deliver a robust flavor in every sip. Perfect for a quick pick-me-up, it can also serve as a base for your favorite coffee drinks.
Ingredients: ['Espresso']
Price: $2.0
Rating: 4.9

Result #2:
Name: Latte
Category: Coffee
Price: $4.75
Rating: 4.8
Text: Latte:
Smooth and creamy, our latte combines rich espresso with velvety steamed milk, creating a perfect balance of flavor and texture. Enjoy it as a comforting treat any time of day, whether you're starting your morning or taking a midday break.
Ingredients: ['Espresso', 'Steamed Milk', 'Milk Foam']
Price: $4.75
Rating: 4.8

Result #3:
Name: Cappuccino
Category: Coffee
Price: $4.5
Rating: 4.7
Text: Cappuccino:
A rich and creamy cappuccino made with freshly brewed espresso, steamed milk, and a frothy milk cap. This delightful drink offers a perfect balance of bold coff

In [20]:

# 2. LLM Reranker setup (replace ChatOpenAI with your Groq API Llama-3 wrapper)
llm = ChatGroq(model_name=os.getenv("GROQ_MODEL_NAME"), temperature=0)  # Dummy placeholder, replace with your actual LLM

# 3. Reranker function: rerank retrieved docs using LLM prompt
def rerank_documents(query, docs):
    """
    Takes a user query and a list of Document objects,
    returns the documents reranked by relevance using the LLM.
    """
    # Format documents for prompt
    docs_text = "\n\n".join([f"Document {i+1}:\n{doc.page_content}" for i, doc in enumerate(docs)])
    
    prompt = f"""
You are a helpful assistant ranking documents based on their relevance to the query.
Query: {query}
Documents:
{docs_text}

Please rank the documents from most relevant to least relevant by returning a list of document numbers in order.
"""
    response = llm.call_as_llm(prompt)  # replace with your actual call method
    
    # Example response: "1, 3, 2, 4, 5"
    ranked_indices = [int(i.strip())-1 for i in response.split(",") if i.strip().isdigit()]
    
    # Return docs sorted by LLM ranking
    return [docs[i] for i in ranked_indices if 0 <= i < len(docs)]

# 4. RAG pipeline function
def retrieve_and_rerank(query):
    retrieved_docs = retriever.invoke(query)  # or retriever.invoke(query) if Langchain version >=0.1.46
    
    reranked_docs = rerank_documents(query, retrieved_docs)
    
    return reranked_docs

# 5. Example usage
if __name__ == "__main__":
    user_query = "What are the best coffee drinks for a quick energy boost?"
    results = retrieve_and_rerank(user_query)
    
    for i, doc in enumerate(results):
        print(f"\nResult #{i+1}")
        print(f"Name: {doc.metadata.get('name')}")
        print(f"Category: {doc.metadata.get('category')}")
        print(f"Price: {doc.metadata.get('price')}")
        print(f"Rating: {doc.metadata.get('rating')}")
        print(f"Text: {doc.page_content}")


Result #1
Name: Latte
Category: Coffee
Price: 4.75
Rating: 4.8
Text: Latte:
Smooth and creamy, our latte combines rich espresso with velvety steamed milk, creating a perfect balance of flavor and texture. Enjoy it as a comforting treat any time of day, whether you're starting your morning or taking a midday break.
Ingredients: ['Espresso', 'Steamed Milk', 'Milk Foam']
Price: $4.75
Rating: 4.8

Result #2
Name: Cappuccino
Category: Coffee
Price: 4.5
Rating: 4.7
Text: Cappuccino:
A rich and creamy cappuccino made with freshly brewed espresso, steamed milk, and a frothy milk cap. This delightful drink offers a perfect balance of bold coffee flavor and smooth milk, making it an ideal companion for relaxing mornings or lively conversations.
Ingredients: ['Espresso', 'Steamed Milk', 'Milk Foam']
Price: $4.5
Rating: 4.7
