## Part 1 ) Install Libraries

In [30]:
!pip install -q chromadb==1.0.12 sentence-transformers==4.1.0 google-genai numpy scipy

## Part 2 ) Download the Food Dataset

In [31]:
!wget -q https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/sN1PIR8qp1SJ6K7syv72qQ/FoodDataSet.json
print('FoodDataSet.json downloaded')

FoodDataSet.json downloaded


## Part 3 ) Configure Google Gemini API Key




In [34]:
from google.colab import userdata
from google import genai
from google.genai import types

GOOGLE_API_KEY = userdata.get('gemini_api')
GEMINI_MODEL_NAME = 'gemini-2.5-flash'

gemini_client = genai.Client(api_key=GOOGLE_API_KEY)
GENERATION_CONFIG = types.GenerateContentConfig(
    max_output_tokens=1024,
    temperature=0.5,
)

def gemini_generate(prompt: str) -> str:
    response = gemini_client.models.generate_content(
        model=GEMINI_MODEL_NAME,
        contents=prompt,
        config=GENERATION_CONFIG,
    )
    return response.text.strip()

# Quick connection test
print(' Gemini:', gemini_generate('Say hello in one word'))

 Gemini: Hello


## Part 4 ) Shared Functions





In [40]:
import chromadb
from chromadb.utils import embedding_functions
from chromadb.config import Settings
import json
from typing import List, Dict, Optional

# Initialize Chroma Client
chroma_client = chromadb.Client(
    Settings(
        anonymized_telemetry=False,
        allow_reset=True
    )
)

print("ChromaDB client initialized.")


# Load & Normalize Food Data

def load_food_data(file_path: str) -> List[Dict]:
    """Load and normalize food dataset."""

    with open(file_path, "r", encoding="utf-8") as f:
        food_data = json.load(f)

    for i, item in enumerate(food_data):
        item["food_id"] = str(item.get("food_id", i))
        item.setdefault("food_name", "")
        item.setdefault("food_ingredients", [])
        item.setdefault("food_description", "")
        item.setdefault("cuisine_type", "Unknown")
        item.setdefault("food_calories_per_serving", 0)

        if isinstance(item.get("food_features"), dict):
            item["taste_profile"] = ", ".join(
                str(v) for v in item["food_features"].values() if v
            )
        else:
            item["taste_profile"] = ""

    print(f"Loaded {len(food_data)} items.")
    return food_data


# Create Collection

def create_similarity_search_collection(
    collection_name: str,
    collection_metadata: Optional[Dict] = None,
):
    """Create Chroma collection using cosine similarity."""

    try:
        chroma_client.delete_collection(collection_name)
    except Exception:
        pass

    embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="all-mpnet-base-v2"
    )

    metadata = {"hnsw:space": "cosine"}
    if collection_metadata:
        metadata.update(collection_metadata)

    collection = chroma_client.create_collection(
        name=collection_name,
        metadata=metadata,
        embedding_function=embedding_fn,
    )

    print(f"Collection '{collection_name}' created with cosine similarity.")
    return collection


# Populate Collection

def populate_similarity_collection(collection, food_items: List[Dict], batch_size: int = 100):
    """Embed and store food items in Chroma using batched inserts."""

    documents, metadatas, ids = [], [], []
    used_ids = set()

    for i, food in enumerate(food_items):

        document_text = f"""Name: {food.get('food_name')}.
Description: {food.get('food_description')}.
Ingredients: {', '.join(food.get('food_ingredients', []))}.
Cuisine: {food.get('cuisine_type')}.
Cooking method: {food.get('cooking_method', '')}.
Taste: {food.get('taste_profile', '')}.
Health benefits: {food.get('food_health_benefits', '')}."""

        base_id = str(food.get("food_id", i))
        uid = base_id
        counter = 1
        while uid in used_ids:
            uid = f"{base_id}_{counter}"
            counter += 1
        used_ids.add(uid)

        documents.append(document_text.strip())
        ids.append(uid)
        metadatas.append({
            "name": food.get("food_name", ""),
            "cuisine_type": food.get("cuisine_type", "Unknown"),
            "calories": food.get("food_calories_per_serving", 0),
            "description": food.get("food_description", ""),
        })

    # batch inserts to avoid timeouts on large datasets
    for start in range(0, len(documents), batch_size):
        collection.add(
            documents=documents[start:start + batch_size],
            metadatas=metadatas[start:start + batch_size],
            ids=ids[start:start + batch_size],
        )

    print(f"Added {len(ids)} items to collection '{collection.name}'.")


# Format Results

def _format_results(results) -> List[Dict]:
    """Convert Chroma results into clean structured output."""

    if not results or not results.get("ids") or not results["ids"][0]:
        return []

    formatted = []

    for i in range(len(results["ids"][0])):

        distance = results["distances"][0][i]
        similarity = max(0.0, min(1.0, 1 - distance))

        formatted.append({
            "food_id": results["ids"][0][i],
            "food_name": results["metadatas"][0][i]["name"],
            "food_description": results["metadatas"][0][i]["description"],
            "cuisine_type": results["metadatas"][0][i]["cuisine_type"],
            "food_calories_per_serving": results["metadatas"][0][i]["calories"],
            "similarity_score": round(similarity * 100, 2),
        })

    return formatted


# Standard Semantic Search

def perform_similarity_search(
    collection,
    query: str,
    n_results: int = 5,
) -> List[Dict]:

    results = collection.query(
        query_texts=[query],
        n_results=n_results,
    )
    return _format_results(results)


# Hybrid Filtered Search

def perform_filtered_similarity_search(
    collection,
    query: str,
    cuisine_filter: Optional[str] = None,
    max_calories: Optional[int] = None,
    min_calories: Optional[int] = None,
    n_results: int = 5,
) -> List[Dict]:

    filters = []

    if cuisine_filter:
        filters.append({"cuisine_type": cuisine_filter})

    if max_calories is not None:
        filters.append({"calories": {"$lte": max_calories}})

    #  min_calories
    if min_calories is not None:
        filters.append({"calories": {"$gte": min_calories}})

    where = None
    if len(filters) == 1:
        where = filters[0]
    elif len(filters) > 1:
        where = {"$and": filters}

    results = collection.query(
        query_texts=[query],
        n_results=n_results,
        where=where,
    )

    return _format_results(results)


# Confidence-Threshold

SIMILARITY_THRESHOLD = 20

def apply_confidence_threshold(results: List[Dict], threshold: float = SIMILARITY_THRESHOLD) -> List[Dict]:
    """Drop results below the similarity threshold."""
    filtered = [r for r in results if r["similarity_score"] >= threshold]
    if not filtered:
        print(f" All results below {threshold:.0f}% confidence threshold.")
    return filtered


print("Chroma similarity search system ready.")

ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given


ChromaDB client initialized.
Chroma similarity search system ready.


## Part 5) Load Data & Build Vector Database




In [41]:
food_items = load_food_data("FoodDataSet.json")

collection = create_similarity_search_collection(
    "food_search_main",
    {"description": "Main food search collection"},
)

populate_similarity_collection(collection, food_items)

print("Vector database is ready!")

ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


Loaded 185 items.
Collection 'food_search_main' created with cosine similarity.


ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event CollectionAddEvent: capture() takes 1 positional argument but 3 were given


Added 185 items to collection 'food_search_main'.
Vector database is ready!





## Part 6 ) System 1: Interactive Similarity Search

Type any food description and get the top 5 matches ranked by similarity score.

In [42]:
def display_results(results: List[Dict], title: str = 'Search Results', show_details: bool = True):
    """Pretty-print search results."""
    print(f'\n {title}')
    print('=' * 60)
    if not results:
        print(' No results found. Try different keywords.')
        return
    for i, r in enumerate(results, 1):
        score = r['similarity_score']
        if show_details:
            print(f"\n{i}.  {r['food_name']}")
            print(f"    Match   : {score:.1f}%")
            print(f"    Cuisine : {r['cuisine_type']}")
            print(f"    Calories: {r['food_calories_per_serving']} per serving")
            print(f"    {r['food_description']}")
        else:
            print(f"  {i}. {r['food_name']} ({score:.1f}% match)")
    print('=' * 60)

print('ready.')

ready.


In [43]:
# Change the query below to anything you like!
query = 'chocolate dessert'

results = perform_similarity_search(collection, query, n_results=5)
results = apply_confidence_threshold(results)
display_results(results, f"Results for: '{query}'")

ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given



 Results for: 'chocolate dessert'

1.  Chocolate Cake
    Match   : 57.0%
    Cuisine : American
    Calories: 450 per serving
    A rich, moist cake made with high-quality cocoa powder and frosted with creamy chocolate icing.

2.  Chocolate Lava Cake
    Match   : 56.0%
    Cuisine : International
    Calories: 400 per serving
    A decadent dessert with a gooey chocolate center, served warm with a scoop of vanilla ice cream.

3.  Brownies
    Match   : 53.8%
    Cuisine : American
    Calories: 240 per serving
    A dense, fudgy dessert bar made with chocolate and baked until chewy and moist.

4.  Chocolate Lava Cake
    Match   : 50.9%
    Cuisine : French
    Calories: 520 per serving
    A decadent chocolate cake with a gooey molten chocolate center.

5.  Chocolate Syrup
    Match   : 50.6%
    Cuisine : Universal
    Calories: 150 per serving
    A rich and thick chocolate syrup perfect for drizzling over desserts.


In [44]:
# more queries
for q in ['Italian food', 'sweet treats', 'baked goods', 'low calorie healthy meal']:
    results = perform_similarity_search(collection, q, n_results=3)
    results = apply_confidence_threshold(results)
    display_results(results, f"'{q}'", show_details=False)


 'Italian food'
  1. Spaghetti Carbonara (57.9% match)
  2. Margherita Pizza (55.5% match)
  3. Margherita Pizza (54.2% match)

 'sweet treats'
  1. Brownies (36.6% match)
  2. Pavlova (32.2% match)
  3. Sour Cranberry Candy (32.0% match)

 'baked goods'
  1. Profiteroles (46.0% match)
  2. Brownies (45.5% match)
  3. Bienenstich (40.5% match)

 'low calorie healthy meal'
  1. Salted Roasted Chickpea Snack Mix (33.0% match)
  2. Salted Roasted Quinoa Snack (32.7% match)
  3. Salted Roasted Mixed Nuts and Dried Fruits (29.6% match)


---
## Part 7 )System 2: Advanced Filtered Search

Combine similarity search with **cuisine type**, **max calories**, and/or **min calories** filters.

In [45]:
# Cuisine-filtered search
query = 'creamy pasta'
cuisine_filter = 'Italian'   # e.g. Italian, Thai, Mexican, Indian, Japanese, American

results = perform_filtered_similarity_search(collection, query, cuisine_filter=cuisine_filter)
results = apply_confidence_threshold(results)
display_results(results, f"'{query}' in {cuisine_filter} cuisine")


 'creamy pasta' in Italian cuisine

1.  Spaghetti Carbonara
    Match   : 53.0%
    Cuisine : Italian
    Calories: 420 per serving
    A classic Italian pasta dish made with eggs, cheese, pancetta, and pepper.

2.  Panna Cotta
    Match   : 46.6%
    Cuisine : Italian
    Calories: 250 per serving
    An Italian dessert made with sweetened cream that is thickened with gelatin and molded.

3.  Tiramisu
    Match   : 39.6%
    Cuisine : Italian
    Calories: 350 per serving
    An Italian dessert made with layers of coffee-soaked ladyfingers, mascarpone cheese, and cocoa powder.

4.  Cannoli
    Match   : 39.5%
    Cuisine : Italian
    Calories: 220 per serving
    An Italian dessert consisting of tube-shaped shells of fried pastry dough, filled with a sweet, creamy ricotta filling.

5.  Tiramisu
    Match   : 39.3%
    Cuisine : Italian
    Calories: 480 per serving
    A classic Italian dessert made of layers of coffee-soaked ladyfingers and a rich, creamy mascarpone mixture, dusted

In [46]:
# Calorie-filtered search
query   = 'healthy meal'
max_cal = 300

results = perform_filtered_similarity_search(collection, query, max_calories=max_cal)
results = apply_confidence_threshold(results)
display_results(results, f"'{query}' under {max_cal} calories")


 'healthy meal' under 300 calories

1.  Vegetable Stir Fry
    Match   : 34.1%
    Cuisine : Chinese
    Calories: 180 per serving
    A quick and easy stir fry dish with a variety of fresh vegetables, soy sauce, and garlic.

2.  Tom Yum Soup
    Match   : 34.0%
    Cuisine : Thai
    Calories: 120 per serving
    A hot and sour Thai soup usually cooked with shrimp, mushrooms, tomatoes, lemongrass, galangal, and kaffir lime leaves.

3.  Beef Tacos
    Match   : 33.9%
    Cuisine : Mexican
    Calories: 300 per serving
    Soft corn tortillas filled with seasoned beef, lettuce, cheese, and salsa.

4.  Salted Roasted Chickpea Snack Mix
    Match   : 32.5%
    Cuisine : International
    Calories: 160 per serving
    A mix of roasted chickpeas, nuts, and seeds seasoned with salt, offering a flavorful and crunchy snack.

5.  Salted Roasted Seaweed Snacks
    Match   : 32.2%
    Cuisine : International
    Calories: 25 per serving
    Thin sheets of seaweed roasted and seasoned with salt, 

In [47]:
# Combined filters including min_calories
query          = 'light fresh meal'
cuisine_filter = 'Japanese'
max_cal        = 250

results = perform_filtered_similarity_search(
    collection, query, cuisine_filter=cuisine_filter, max_calories=max_cal
)
results = apply_confidence_threshold(results)
display_results(results, f"'{query}' | {cuisine_filter} | ≤{max_cal} cal")


 'light fresh meal' | Japanese | ≤250 cal

1.  Miso Soup
    Match   : 26.5%
    Cuisine : Japanese
    Calories: 84 per serving
    A traditional Japanese soup consisting of a stock called 'dashi' into which softened miso paste is mixed.


In [48]:
# Demo: calorie range filter using min + max
query   = 'balanced meal'
min_cal = 200
max_cal = 400

results = perform_filtered_similarity_search(
    collection, query, min_calories=min_cal, max_calories=max_cal
)
results = apply_confidence_threshold(results)
display_results(results, f"'{query}' | {min_cal}–{max_cal} cal")


 'balanced meal' | 200–400 cal

1.  Beef Stew
    Match   : 24.3%
    Cuisine : American
    Calories: 350 per serving
    A hearty stew made with chunks of beef, potatoes, carrots, and onions simmered in a rich broth.

2.  Lentil Dhal
    Match   : 23.6%
    Cuisine : Indian
    Calories: 300 per serving
    A nutritious and flavorful curry made with red lentils simmered in a spiced tomato and coconut milk sauce.

3.  Beef Tacos
    Match   : 23.4%
    Cuisine : Mexican
    Calories: 300 per serving
    Soft corn tortillas filled with seasoned beef, lettuce, cheese, and salsa.

4.  Margherita Pizza
    Match   : 23.3%
    Cuisine : Italian
    Calories: 250 per serving
    Classic pizza topped with fresh tomatoes, mozzarella cheese, and basil.

5.  Mille-feuille
    Match   : 23.3%
    Cuisine : French
    Calories: 300 per serving
    A classic French pastry with layers of puff pastry and pastry cream, topped with icing.


---
## Part 8 )System 3: Enhanced RAG Chatbot with Google Gemini

Pipeline steps:
1. **Parse** — extract cuisine, calorie range, dietary tags, allergens, mood from natural language
2. **Retrieve** — filtered or broad vector search
3. **Re-rank** — composite score: 60% vector similarity + 40% metadata signal; allergen penalty
4. **Augment** — top results injected into a grounded few-shot prompt
5. **Generate** — Gemini writes a natural, constrained recommendation
6. **Cache** — identical queries skip the LLM (1-hour TTL);  cache key includes filter state


In [49]:
from __future__ import annotations

import hashlib
import re
import time
from dataclasses import dataclass, field
from typing import Any



@dataclass
class QueryFilters:
    """Everything we can infer from the user's natural-language query."""
    cuisine: str | None = None
    max_calories: int | None = None
    min_calories: int | None = None
    dietary_tags: list[str] = field(default_factory=list)
    allergens_to_avoid: list[str] = field(default_factory=list)
    mood_keywords: list[str] = field(default_factory=list)
    servings: int | None = None


@dataclass
class FoodResult:
    """Normalised wrapper around a raw retrieval dict."""
    food_name: str
    cuisine_type: str
    calories: int
    description: str
    ingredients: str
    health_benefits: str
    taste_profile: str
    raw_similarity: float
    rerank_score: float = 0.0


# ─────────────────────────────────────────────────────────────

_CUISINE_KEYWORDS: dict[str, str] = {
    "italian": "Italian", "mexican": "Mexican", "indian": "Indian",
    "chinese": "Chinese", "american": "American", "japanese": "Japanese",
    "thai": "Thai", "mediterranean": "Mediterranean", "french": "French",
    "korean": "Korean", "greek": "Greek", "spanish": "Spanish",
}

_DIETARY_KEYWORDS: dict[str, str] = {
    "vegan": "vegan", "vegetarian": "vegetarian", "keto": "keto",
    "paleo": "paleo", "gluten-free": "gluten_free", "gluten free": "gluten_free",
    "dairy-free": "dairy_free", "dairy free": "dairy_free",
    "low carb": "low_carb", "low-carb": "low_carb",
    "high protein": "high_protein", "high-protein": "high_protein",
}

_ALLERGEN_KEYWORDS: list[str] = [
    "nut", "nuts", "peanut", "peanuts", "shellfish", "dairy", "gluten",
    "egg", "eggs", "soy", "sesame", "wheat", "fish",
]

_MOOD_KEYWORDS: list[str] = [
    "spicy", "mild", "light", "heavy", "comfort", "refreshing",
    "hearty", "crispy", "creamy", "sweet", "savoury", "savory",
]

print(" RAG — dataclasses and keyword maps ready.")

 RAG — dataclasses and keyword maps ready.


In [50]:
# 1. Query Parser

def parse_query(query: str) -> QueryFilters:
    """Extract structured filters from a free-text query."""
    q = query.lower()
    filters = QueryFilters()

    for kw, canonical in _CUISINE_KEYWORDS.items():
        if kw in q:
            filters.cuisine = canonical
            break

    under_match   = re.search(r"(?:under|below|less than)\s+(\d+)\s*(?:cal)?", q)
    above_match   = re.search(r"(?:over|above|more than|at least)\s+(\d+)\s*(?:cal)?", q)
    between_match = re.search(r"between\s+(\d+)\s+and\s+(\d+)\s*(?:cal)?", q)
    around_match  = re.search(r"around\s+(\d+)\s*(?:cal)?", q)

    if between_match:
        filters.min_calories = int(between_match.group(1))
        filters.max_calories = int(between_match.group(2))
    elif around_match:
        mid = int(around_match.group(1))
        filters.min_calories = max(0, mid - 100)
        filters.max_calories = mid + 100
    else:
        if under_match:  filters.max_calories = int(under_match.group(1))
        if above_match:  filters.min_calories = int(above_match.group(1))

    for phrase, tag in _DIETARY_KEYWORDS.items():
        if phrase in q:
            filters.dietary_tags.append(tag)

    avoid_ctx = re.findall(r"(?:no|without|avoid|free from)\s+([\w\s]+?)(?:\s|$|,|\.|and)", q)
    flat_ctx  = " ".join(avoid_ctx)
    for allergen in _ALLERGEN_KEYWORDS:
        if allergen in flat_ctx or f"no {allergen}" in q or f"without {allergen}" in q:
            if allergen not in filters.allergens_to_avoid:
                filters.allergens_to_avoid.append(allergen)

    for mood in _MOOD_KEYWORDS:
        if mood in q:
            filters.mood_keywords.append(mood)

    serving_match = re.search(r"(?:for|serves?)\s+(\d+)\s*(?:people|persons?)?", q)
    if serving_match:
        filters.servings = int(serving_match.group(1))

    return filters


# 2. Result Normaliser

def _normalise(raw: dict[str, Any]) -> FoodResult:
    sim = max(0.0, min(1.0, raw.get("similarity_score", 0.0) / 100.0))
    return FoodResult(
        food_name       = raw.get("food_name", "Unknown"),
        cuisine_type    = raw.get("cuisine_type", "Unknown"),
        calories        = int(raw.get("food_calories_per_serving", 0)),
        description     = raw.get("food_description", "N/A"),
        ingredients     = ", ".join(raw.get("food_ingredients", [])) if isinstance(raw.get("food_ingredients"), list) else str(raw.get("food_ingredients", "N/A")),
        health_benefits = raw.get("food_health_benefits", "N/A"),
        taste_profile   = raw.get("taste_profile", "N/A"),
        raw_similarity  = sim,
    )


# ─────────────────────────────────────────────────────────────
# 3. Re-Ranker
#    Composite = 0.6 × vector_sim + 0.4 × metadata_signal
# ─────────────────────────────────────────────────────────────

def rerank(results: list[FoodResult], filters: QueryFilters) -> list[FoodResult]:
    for item in results:
        meta = 0.0
        if filters.max_calories and item.calories <= filters.max_calories: meta += 0.15
        if filters.min_calories and item.calories >= filters.min_calories: meta += 0.10
        if filters.cuisine and item.cuisine_type.lower() == filters.cuisine.lower(): meta += 0.15

        combined = f"{item.description} {item.ingredients} {item.health_benefits} {item.taste_profile}".lower()

        diet_hits = sum(1 for t in filters.dietary_tags if t.replace("_", " ") in combined)
        meta += min(diet_hits * 0.05, 0.20)

        mood_hits = sum(1 for m in filters.mood_keywords if m in combined)
        meta += min(mood_hits * 0.04, 0.16)

        penalty = sum(0.40 for a in filters.allergens_to_avoid if a in combined)
        raw = max(0.0, min(1.0, item.raw_similarity))
        item.rerank_score = max(0.0, (0.6 * raw) + (0.4 * meta) - penalty)

    return sorted(results, key=lambda x: x.rerank_score, reverse=True)


# ─────────────────────────────────────────────────────────────
# 4. Context Builder  (backward-compatible with raw dict results)
# ─────────────────────────────────────────────────────────────

def prepare_context(results, top_k: int = 3) -> str:
    """
    Accepts either list[FoodResult] (enhanced pipeline) or
    list[dict] (legacy compare_queries path) for backward-compat.
    """
    if not results:
        return "No relevant food items were found."

    sections = []
    for rank, item in enumerate(results[:top_k], 1):
        if isinstance(item, FoodResult):
            name  = item.food_name
            cuisine = item.cuisine_type
            cal   = item.calories
            score = item.rerank_score * 100
            desc  = item.description
            ingr  = item.ingredients
            health = item.health_benefits
            taste = item.taste_profile
        else:
            name  = item.get("food_name", "Unknown")
            cuisine = item.get("cuisine_type", "Unknown")
            cal   = item.get("food_calories_per_serving", 0)
            score = item.get("similarity_score", 0)
            desc  = item.get("food_description", "N/A")
            ingr  = item.get("food_ingredients", "N/A")
            health = item.get("food_health_benefits", "N/A")
            taste = item.get("taste_profile", "N/A")

        block = (
            f"[OPTION {rank}]\n"
            f"Name        : {name}\n"
            f"Cuisine     : {cuisine}\n"
            f"Calories    : {cal} per serving\n"
            f"Match Score : {score:.1f}%\n\n"
            f"Description     : {desc}\n"
            f"Ingredients     : {ingr}\n"
            f"Health Benefits : {health}\n"
            f"Taste Profile   : {taste}"
        )
        sections.append(block)

    return "\n\n".join(sections)


# ─────────────────────────────────────────────────────────────
# 5. Prompt Builder  (few-shot + strict grounding rules)

_FEW_SHOT = """EXAMPLE REQUEST: "light Italian pasta under 500 cal"
EXAMPLE RESPONSE:
- **Spaghetti Aglio e Olio** (Italian, 420 cal) — A simple, garlicky pasta that fits your
  calorie budget. Olive oil and garlic keep it light with bold Mediterranean flavour.
- **Pasta Primavera** (Italian, 390 cal) — Packed with seasonal vegetables; refreshing and
  well within your calorie limit.""".strip()


def build_prompt(query: str, context: str, filters: QueryFilters) -> str:
    parts = []
    if filters.cuisine:              parts.append(f"Cuisine: {filters.cuisine}")
    if filters.max_calories:         parts.append(f"Max calories: {filters.max_calories}")
    if filters.min_calories:         parts.append(f"Min calories: {filters.min_calories}")
    if filters.dietary_tags:         parts.append(f"Dietary: {', '.join(filters.dietary_tags)}")
    if filters.allergens_to_avoid:   parts.append(f"Avoid allergens: {', '.join(filters.allergens_to_avoid)}")
    if filters.mood_keywords:        parts.append(f"Mood/texture: {', '.join(filters.mood_keywords)}")
    if filters.servings:             parts.append(f"Servings needed: {filters.servings}")
    filter_block = "\n".join(f"  - {p}" for p in parts) or "  - None detected"

    return f"""You are an expert food recommendation assistant. Be concise, friendly, and accurate.

STRICT RULES:
- Recommend ONLY foods from RETRIEVED OPTIONS. Never invent dishes.
- If an option contains a flagged allergen, do NOT recommend it.
- Respect all calorie and cuisine constraints.

{_FEW_SHOT}

USER REQUEST: "{query}"

DETECTED FILTERS:
{filter_block}

RETRIEVED OPTIONS:
{context}

TASK: Write a recommendation using bullet points:
  - Acknowledge the request in one sentence.
  - Recommend the best 2-3 matches with name, cuisine, and calories in bold.
  - Explain briefly why each fits.
  - If no option fully matches, say so honestly.

Response:"""


print("RAG — parser, re-ranker, context & prompt builders ready.")

RAG — parser, re-ranker, context & prompt builders ready.


In [51]:
# 6. Response Cache
# ─────────────────────────────────────────────────────────────

_response_cache: dict[str, tuple[str, float]] = {}
_CACHE_TTL_SECONDS = 3600  # 1 hour


def _cache_key(query: str, n_results: int, filters: QueryFilters) -> str:
    import dataclasses
    filter_str = json.dumps(dataclasses.asdict(filters), sort_keys=True)
    payload = f"{query.strip().lower()}|{n_results}|{filter_str}"
    return hashlib.sha256(payload.encode()).hexdigest()


def _get_cached(key: str) -> str | None:
    if key in _response_cache:
        response, ts = _response_cache[key]
        if time.time() - ts < _CACHE_TTL_SECONDS:
            return response
        del _response_cache[key]
    return None


def _set_cached(key: str, response: str) -> None:
    _response_cache[key] = (response, time.time())


def clear_cache() -> None:
    """Manually clear the response cache (e.g. after DB updates)."""
    _response_cache.clear()
    print("Response cache cleared.")


# ─────────────────────────────────────────────────────────────
# 7. Enhanced RAG Pipeline
# ─────────────────────────────────────────────────────────────

def rag_recommend(
    query: str,
    n_results: int = 3,
    verbose: bool = True,
    use_cache: bool = True,
) -> str:
    """
    Enhanced RAG pipeline:
      1. Parse query  -> structured QueryFilters
      2. Check response cache (skip LLM on hit)
      3. Retrieve candidates (filtered or broad search)
      4. Apply confidence threshold
      5. Normalise    -> FoodResult objects
      6. Re-rank with composite score (vector sim + metadata + allergen penalty)
      7. Build context + grounded prompt
      8. Call Gemini; cache & return response
    """
    if verbose:
        print(f"\n Searching for: '{query}'")

    # Step 1 — Parse
    filters = parse_query(query)
    if verbose and any([filters.cuisine, filters.max_calories, filters.min_calories,
                        filters.dietary_tags, filters.allergens_to_avoid, filters.mood_keywords]):
        parts = []
        if filters.cuisine:             parts.append(f"cuisine={filters.cuisine}")
        if filters.max_calories:        parts.append(f"max_cal={filters.max_calories}")
        if filters.min_calories:        parts.append(f"min_cal={filters.min_calories}")
        if filters.dietary_tags:        parts.append(f"diet={filters.dietary_tags}")
        if filters.allergens_to_avoid:  parts.append(f"avoid={filters.allergens_to_avoid}")
        if filters.mood_keywords:       parts.append(f"mood={filters.mood_keywords}")
        print(f"Detected filters: {' | '.join(parts)}")

    # Step 2 — Cache
    if use_cache:
        ck = _cache_key(query, n_results, filters)
        cached = _get_cached(ck)
        if cached:
            if verbose: print("Cache hit — returning cached response.")
            return cached

    # Step 3 — Retrieve
    has_filters = any([filters.cuisine, filters.max_calories, filters.min_calories])
    try:
        if has_filters:
            raw_results = perform_filtered_similarity_search(
                collection, query,
                cuisine_filter=filters.cuisine,
                max_calories=filters.max_calories,
                min_calories=filters.min_calories,
                n_results=n_results * 2,
            )
        else:
            raw_results = perform_similarity_search(collection, query, n_results * 2)
    except Exception as exc:
        return f"Retrieval error: {exc}"

    # Step 4 — Confidence threshold
    raw_results = apply_confidence_threshold(raw_results)

    if not raw_results:
        return "I couldn't find food options matching your request. Try broadening your search."

    # Step 5 — Normalise
    results: list[FoodResult] = [_normalise(r) for r in raw_results]

    # Step 6 — Re-rank
    results = rerank(results, filters)

    # Step 7 — Prompt
    context = prepare_context(results, top_k=n_results)
    prompt  = build_prompt(query, context, filters)

    # Step 8 — Generate
    try:
        response_text = gemini_generate(prompt)
    except Exception as exc:
        return f"Generation error: {exc}"

    if use_cache:
        _set_cached(_cache_key(query, n_results, filters), response_text)

    if verbose:
        print("\n   Re-ranked results:")
        print("   " + "-" * 62)
        for i, item in enumerate(results[:n_results], 1):
            print(
                f"   {i}. {item.food_name:<30} | {item.cuisine_type:<14} | "
                f"{item.calories:>4} cal | "
                f"sim={item.raw_similarity*100:.1f}%  →  score={item.rerank_score*100:.1f}%"
            )
        print("\n   Gemini recommendation:\n")
        print(response_text)
        print()

    return response_text


print("RAG recommendation system ready.")
print("Utilities: rag_recommend() | batch_recommend() | explain_filters() | clear_cache()")

RAG recommendation system ready.
Utilities: rag_recommend() | batch_recommend() | explain_filters() | clear_cache()


In [52]:
# ─────────────────────────────────────────────────────────────
# 8. Batch Recommend & Debug Helpers
# ─────────────────────────────────────────────────────────────

def batch_recommend(queries: list[str], n_results: int = 3) -> dict[str, str]:
    """Run rag_recommend for multiple queries. Returns a query → response map."""
    return {q: rag_recommend(q, n_results=n_results, verbose=False) for q in queries}


def explain_filters(query: str) -> None:
    """Print the structured filters detected from a query (useful for debugging)."""
    import dataclasses as _dc, json as _json
    f = parse_query(query)
    print(_json.dumps(_dc.asdict(f), indent=2))


# Quick smoke-test
explain_filters("I want vegan Indian food under 400 calories, no nuts")

{
  "cuisine": "Indian",
  "max_calories": 400,
  "min_calories": null,
  "dietary_tags": [
    "vegan"
  ],
  "allergens_to_avoid": [
    "nut",
    "nuts"
  ],
  "mood_keywords": [],
  "servings": null
}


---
## Part 9 — Test the RAG Chatbot

Run any of the cells below to test different query types.

In [53]:
_ = rag_recommend('I want something spicy and healthy for dinner')


 Searching for: 'I want something spicy and healthy for dinner'
Detected filters: mood=['spicy']

   Re-ranked results:
   --------------------------------------------------------------
   1. Beef Tacos                     | Mexican        |  300 cal | sim=49.7%  →  score=29.8%
   2. Chickpea Curry                 | Indian         |  380 cal | sim=48.1%  →  score=28.9%
   3. Vegetable Stir Fry             | Chinese        |  180 cal | sim=46.6%  →  score=28.0%

   Gemini recommendation:

Here are some spicy and healthy dinner options for you:

*   **Chickpea Curry** (Indian, 380 cal) — This hearty and flavorful curry, simmered in a spiced



In [54]:
_ = rag_recommend('What Italian dishes do you recommend under 400 calories?')


 Searching for: 'What Italian dishes do you recommend under 400 calories?'
Detected filters: cuisine=Italian | max_cal=400

   Re-ranked results:
   --------------------------------------------------------------
   1. Cannoli                        | Italian        |  220 cal | sim=40.9%  →  score=36.5%
   2. Margherita Pizza               | Italian        |  270 cal | sim=38.4%  →  score=35.0%
   3. Margherita Pizza               | Italian        |  250 cal | sim=38.3%  →  score=35.0%

   Gemini recommendation:

Here are some Italian dishes under 400 calories that you might enjoy:

*   **Cannoli** (Italian, 220 cal) — A delightful Italian dessert, perfectly within your calorie limit for a sweet treat.
*   **Margherita Pizza** (Italian, 250 cal) — A classic and light pizza option, topped with fresh ingredients and well under your calorie budget.



In [55]:
_ = rag_recommend("I'm craving comfort food for a cold evening")


 Searching for: 'I'm craving comfort food for a cold evening'
Detected filters: mood=['comfort']

   Re-ranked results:
   --------------------------------------------------------------
   1. Salted Roasted Chickpea Snack Mix | International  |  160 cal | sim=41.1%  →  score=24.7%
   2. Salted Roasted Barley Snack    | International  |  140 cal | sim=39.7%  →  score=23.8%
   3. Salted Roasted Quinoa Snack    | International  |  130 cal | sim=38.9%  →  score=23.3%

   Gemini recommendation:

I understand you're looking for comfort food for a cold evening. While the retrieved options are primarily savory snacks, they could offer a satisfying crunch.

*   **Salted Roasted Chickpea



In [56]:
_ = rag_recommend('Suggest some protein-rich breakfast options under 300 calories')


 Searching for: 'Suggest some protein-rich breakfast options under 300 calories'
Detected filters: max_cal=300

   Re-ranked results:
   --------------------------------------------------------------
   1. Salted Roasted Mixed Nuts and Dried Fruits | International  |  180 cal | sim=32.5%  →  score=25.5%
   2. Salted Roasted Chickpea Snack Mix | International  |  160 cal | sim=30.2%  →  score=24.1%
   3. Salted Roasted Quinoa Snack    | International  |  130 cal | sim=29.1%  →  score=23.5%

   Gemini recommendation:

Here are some protein-rich breakfast options under 300 calories for you:

*   **Salted Roasted Chickpea Snack Mix** (International, 160 cal) — This mix of roasted chickpeas, nuts, and seeds is packed with protein and well within your calorie limit, making it a great energizing breakfast snack.
*   **Salted Roasted Mixed Nuts and Dried Fruits** (International, 180 cal) — A delicious blend of protein-rich nuts that provides a satisfying and healthy start to your day, perfect

In [57]:
# Allergen avoidance
_ = rag_recommend('I want vegan Indian food under 400 calories, no nuts')


 Searching for: 'I want vegan Indian food under 400 calories, no nuts'
Detected filters: cuisine=Indian | max_cal=400 | diet=['vegan'] | avoid=['nut', 'nuts']

   Re-ranked results:
   --------------------------------------------------------------
   1. Vegetable Korma                | Indian         |  320 cal | sim=38.7%  →  score=35.2%
   2. Chickpea Curry                 | Indian         |  380 cal | sim=39.0%  →  score=0.0%
   3. Lentil Dhal                    | Indian         |  300 cal | sim=37.1%  →  score=0.0%

   Gemini recommendation:

Here are some vegan Indian options under 400 calories, free of nuts:

*   **Chickpea Curry** (Indian, 380 cal) — This hearty and flavorful dish is entirely plant-based and well within your calorie limit, with no nuts.
*   **Lentil Dhal** (Indian, 300 cal) — A nutritious and flavorful vegan option, this lentil curry is low in calories and completely nut-free.



In [58]:
# Calorie range query
_ = rag_recommend('Give me a balanced meal between 300 and 500 calories')


 Searching for: 'Give me a balanced meal between 300 and 500 calories'
Detected filters: max_cal=500 | min_cal=300

   Re-ranked results:
   --------------------------------------------------------------
   1. Baklava                        | Middle Eastern |  320 cal | sim=24.1%  →  score=24.5%
   2. Beef Tacos                     | Mexican        |  300 cal | sim=23.0%  →  score=23.8%
   3. Apple Pie                      | American       |  320 cal | sim=22.7%  →  score=23.6%

   Gemini recommendation:

Here is an option for a balanced meal between 300 and 500 calories:

*   **Beef Tacos** (Mexican, 300 cal) —



---
## Part 10 — AI-Powered Query Comparison

In [59]:
def compare_queries(query1: str, query2: str):
    """Use Gemini to compare food recommendations for two different queries."""
    print(f"\n Comparing: '{query1}'  vs  '{query2}'")
    print('=' * 60)

    results1 = perform_similarity_search(collection, query1, 3)
    results2 = perform_similarity_search(collection, query2, 3)

    context1 = prepare_context(results1)
    context2 = prepare_context(results2)

    prompt = f"""You are comparing two food preference queries.

Query 1: "{query1}"
Top results:
{context1}

Query 2: "{query2}"
Top results:
{context2}

Write a concise, friendly comparison that highlights:
- Key differences between the two preferences
- Any overlaps or similarities
- The best pick from each query

Comparison:"""

    try:
        ai_comparison = gemini_generate(prompt)
    except Exception as e:
        ai_comparison = f'(Gemini error: {e})'

    print('\n Gemini Analysis:')
    print(ai_comparison)

    print('\n Side-by-Side Results:')
    print('-' * 60)
    print(f"{'Query 1: ' + query1[:22]:<30} | {'Query 2: ' + query2[:22]:<30}")
    print('-' * 60)
    for i in range(3):
        left  = f"{results1[i]['food_name']} ({results1[i]['similarity_score']:.0f}%)" if i < len(results1) else '---'
        right = f"{results2[i]['food_name']} ({results2[i]['similarity_score']:.0f}%)" if i < len(results2) else '---'
        print(f'{left[:30]:<30} | {right[:30]}')


compare_queries('chocolate dessert', 'healthy breakfast')


 Comparing: 'chocolate dessert'  vs  'healthy breakfast'

 Gemini Analysis:
(Gemini error: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/rate-limit. \n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 5, model: gemini-2.5-flash\nPlease retry in 31.058318852s.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerMinutePerProjectPerModel-F