# Query Generation Debug Notebook

Interactive notebook for debugging food delivery query generation using DSPy + OpenAI.

**Purpose**: High-level abstraction for data processing, detailed visibility for DSPy LLM interactions.

## 1. Setup & Imports

In [1]:
# Standard imports
import os
import sys
import json
import pandas as pd
from datetime import datetime
from pathlib import Path
from typing import Optional, List, Dict, Any
from dotenv import load_dotenv
import dspy
load_dotenv()

# Setup project paths
project_root = os.environ.get("root_folder")
if project_root:
    sys.path.insert(0, str(project_root))

print(f"Project root: {project_root}")

Project root: /Users/luvsuneja/Documents/repos/masala-embed/esci-dataset


In [2]:
print(os.environ.get("OPENAI_API_KEY"))  # Debug: Print the API key to verify it's loaded correctly

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


In [3]:
# Import existing modules (high-level abstractions)
from database.utils.db_utils import get_table
from src.evals.dietary_evals import apply_complete_dietary_evaluation
from src.data_generation.prompt_template import (
    prepare_prompt,
    get_esci_label_description,
    load_query_examples,
    generate_markdown_table
)

# Import DSPy components (for detailed interaction)
from src.data_generation.dspy_schemas import (
    QueryGenerator,
    parse_generated_output,
    setup_dspy_model
)

print("‚úÖ All imports successful")

‚úÖ All imports successful


## 2. Configuration

**Easy parameter tweaking - change these values and re-run cells below.**

In [4]:
# === MAIN CONFIGURATION ===
CONFIG = {
    # Core parameters
    'ESCI_LABEL': 'E',  # E, S, C, or I
    'LIMIT': 1000,        # Number of records from DB (None = all)
    'BATCH_SIZE': 5,    # Records per API call
    
    # Model settings
    'MODEL': 'gpt-5-mini',  # or 'gpt-5-mini', 'gpt-5'
    'TEMPERATURE': 1.2,
    'QUERIES_PER_ITEM': 2,
    
    # Optional features
    'DIETARY_FLAG': False,
    'QUERY_EXAMPLES_PATH': 'prompts/query_generation/examples.txt',  # None = no examples
    'TEMPLATE_PATH': 'prompts/query_generation/v1.txt',
    
    # Output
    'OUTPUT_FORMAT': 'json',  # 'json' or 'csv'
    'MAX_RETRIES': 3
}

# API Key
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    print("‚ö†Ô∏è OPENAI_API_KEY not found in environment")
else:
    print(f"‚úÖ API Key found: {OPENAI_API_KEY[:10]}...")

print(f"\nüìã Configuration:")
for key, value in CONFIG.items():
    print(f"  {key}: {value}")

‚úÖ API Key found: sk-proj-SY...

üìã Configuration:
  ESCI_LABEL: E
  LIMIT: 1000
  BATCH_SIZE: 5
  MODEL: gpt-5-mini
  TEMPERATURE: 1.2
  QUERIES_PER_ITEM: 2
  DIETARY_FLAG: False
  QUERY_EXAMPLES_PATH: prompts/query_generation/examples.txt
  TEMPLATE_PATH: prompts/query_generation/v1.txt
  OUTPUT_FORMAT: json
  MAX_RETRIES: 3


## 3. Data Loading (High Abstraction)

**Wrapper around existing robust data loading functions.**

In [5]:
def load_data(limit: Optional[int] = None, dietary_flag: bool = False) -> pd.DataFrame:
    """Load consumable data with optional dietary evaluation."""
    print(f"üìä Loading consumable data (limit={limit}, dietary={dietary_flag})...")
    
    # Load from database
    df = get_table("consumable", limit=limit)
    print(f"   Loaded {len(df)} records from database")
    
    if len(df) == 0:
        raise ValueError("No data found in consumable table")
    
    # Apply dietary evaluation if requested
    if dietary_flag:
        print("   Applying dietary evaluation...")
        df, dietary_columns = apply_complete_dietary_evaluation(df)
        print(f"   Added dietary columns: {dietary_columns}")
    
    # Shuffle for reproducibility
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    print(f"   Shuffled data with seed=42")
    
    return df

# Load data
df = load_data(CONFIG['LIMIT'], CONFIG['DIETARY_FLAG'])
print(f"\n‚úÖ Data loaded successfully: {len(df)} records")
print(f"Columns: {list(df.columns)}")

# Display sample
df.head(3)

üìä Loading consumable data (limit=1000, dietary=False)...


  df = pd.read_sql_query(query, conn)


   Loaded 1000 records from database
   Shuffled data with seed=42

‚úÖ Data loaded successfully: 1000 records
Columns: ['id', 'image_url', 'consumable_name', 'consumable_type', 'consumable_ingredients', 'consumable_portion_size', 'consumable_nutritional_profile', 'consumable_cooking_method', 'created_at']


Unnamed: 0,id,image_url,consumable_name,consumable_type,consumable_ingredients,consumable_portion_size,consumable_nutritional_profile,consumable_cooking_method,created_at
0,520,https://file.b18a.io/7833502043800103484_41766...,Pizza,Restaurant food,"[""cheese"",""ham"",""olives"",""peas"",""cookie crumbs""]","[""pizza:300g""]","{'fat_g': 12.0, 'protein_g': 15.0, 'calories_k...",Baked,2025-09-21 09:11:39.103523+00:00
1,737,https://file.b18a.io/7832708849300107641_49572...,Lentil Stew,Homemade food,"[""lentils"",""potatoes"",""onions"",""spices""]","[""lentils:200g"",""potatoes:150g"",""vegetables:10...","{'fat_g': 10.0, 'protein_g': 15.0, 'calories_k...",stewing,2025-09-21 09:11:39.103523+00:00
2,740,https://file.b18a.io/7837417034700107396_57128...,Spicy Chicken Wings,Homemade food,"[""chicken wings"",""spices"",""oil""]","[""chicken_wings:300g""]","{'fat_g': 15.0, 'protein_g': 20.0, 'calories_k...",Roasting,2025-09-21 09:11:39.103523+00:00


In [6]:
df = df.sample(frac=0.1, random_state=42).reset_index(drop=True)

In [7]:
df = df.head(40)

## 4. DSPy Model Setup

**Using existing robust functions with clean interface.**

In [8]:
# Setup DSPy model
try:
    setup_dspy_model(OPENAI_API_KEY, CONFIG['MODEL'], CONFIG['TEMPERATURE'])
    generator = QueryGenerator()
    print(f"‚úÖ DSPy configured: {CONFIG['MODEL']}")
    
except Exception as e:
    print(f"‚ùå DSPy setup failed: {e}")
    raise

‚úÖ DSPy configured: gpt-5-mini


## 5. Prompt Generation (Detailed)

**Step-by-step prompt preparation with full visibility.**

In [9]:
# Prepare batch (first batch for demo)
batch_size = CONFIG['BATCH_SIZE']
batch_df = df.head(batch_size).copy()

print(f"üìù Batch: {len(batch_df)} records, ESCI: {CONFIG['ESCI_LABEL']}")
print(f"üçΩÔ∏è Items: {list(batch_df['consumable_name'])}")
# print query examples path if available
print(f"üìÑ Query examples path: {CONFIG['QUERY_EXAMPLES_PATH']}")
# Load examples if provided
if CONFIG['QUERY_EXAMPLES_PATH']:
    query_examples_path = os.path.join(project_root, CONFIG['QUERY_EXAMPLES_PATH']) if project_root else CONFIG['QUERY_EXAMPLES_PATH']
    try:
        examples_content = load_query_examples(query_examples_path)
        print(f"‚úÖ Examples loaded ({len(examples_content)} chars)")
    except FileNotFoundError:
        print(f"‚ö†Ô∏è Examples file not found: {query_examples_path}")
else:
    print("üì≠ No examples")

üìù Batch: 5 records, ESCI: E
üçΩÔ∏è Items: ['Stir-fried meat with vegetables', 'Breakfast Plate', 'Vegetable Stir-Fry', 'Noodle Bowl with Meat and Vegetables', 'Mixed Chinese Dishes']
üìÑ Query examples path: prompts/query_generation/examples.txt
‚úÖ Examples loaded (205 chars)


In [10]:
# Generate the complete prompt
templates_path = os.path.join(project_root, CONFIG['TEMPLATE_PATH']) if project_root else CONFIG['TEMPLATE_PATH']
print(query_examples_path)
prompt = prepare_prompt(
    template_path=templates_path,
    df=batch_df,
    esci_label=CONFIG['ESCI_LABEL'],
    batch_size=len(batch_df),
    include_dietary=CONFIG['DIETARY_FLAG'],
    queries_per_item=CONFIG['QUERIES_PER_ITEM'],
    query_examples_path=query_examples_path
)

print(f"üìè Prompt: {len(prompt)} characters")

/Users/luvsuneja/Documents/repos/masala-embed/esci-dataset/prompts/query_generation/examples.txt
üìè Prompt: 8138 characters


In [11]:
# Display the full prompt for inspection
print("üîç FULL PROMPT (for debugging):")
print("=" * 80)
print(prompt)
print("=" * 80)

üîç FULL PROMPT (for debugging):
**Role:** You are a senior world-class expert analyst specializing in food and grocery delivery apps (Zomato, Swiggy, DoorDash, Instacart, BigBasket). Your task is to generate **realistic user queries** that customers would actually type when searching for food on food and grocery delivery apps. Make sure your queries are grounded in the available food candidates displayed in markdown format i.e., the generated queries should have one of the candidate food items as the target or search result.

**Context:**
We are building a dataset similar to Amazon ESCI, but for food delivery. Generate queries that mirror natural search behavior - think like a hungry person typing quickly on their phone or a housewife who forgot to order a key ingredient.

**What are ESCI labels?**

**Exact (E)**

Indicates an item that directly and precisely matches all constraints of the search query.

- Query: "Vegetarian Paneer Tikka pizza delivery"
- Exact match: Product listed 

In [12]:
# === PREPARE GRANULAR INPUTS FOR DSPY OPTIMIZATION ===
# Extract components from the full prompt for granular DSPy inputs

# 1. Food candidates table (already generated in prepare_prompt)
food_candidates_table = generate_markdown_table(batch_df, CONFIG['DIETARY_FLAG'])

# 2. Examples (already loaded)
examples_text = examples_content if CONFIG['QUERY_EXAMPLES_PATH'] else ""

# 3. Queries per item
queries_per_item = CONFIG['QUERIES_PER_ITEM']

# 4. ESCI label
esci_label = CONFIG['ESCI_LABEL']

print("üîß Granular inputs prepared:")
print(f"   ESCI Label: {esci_label}")
print(f"   Candidates table: {len(food_candidates_table)} chars")
print(f"   Queries per item: {queries_per_item}")
print(f"   Examples: {len(examples_text)} chars")

üîß Granular inputs prepared:
   ESCI Label: E
   Candidates table: 1147 chars
   Queries per item: 2
   Examples: 205 chars


In [13]:
# === DSPY GENERATOR FUNCTION (TINKER HERE) ===
from pydantic import BaseModel, Field
from typing import List, Dict, Any

class QueryWithDimensions(BaseModel):
    """Schema for a single generated query with dimensions."""
    query: str = Field(description="The natural language query")
    dimensions: Dict[str, str] = Field(default_factory=dict, description="Query dimensions/attributes")

class CandidateOutput(BaseModel):
    """Schema for a candidate with its generated queries."""
    id: int = Field(description="Unique identifier for the consumable item")
    name: str = Field(description="Name of the consumable item") 
    queries: List[QueryWithDimensions] = Field(description="List of generated queries with dimensions")

class OutputFormat(BaseModel):
    """Complete output format for query generation."""
    candidates: List[CandidateOutput] = Field(description="List of candidates with their queries")

# DSPy signature and module
class QueryGeneratorSignatureTinker(dspy.Signature):
    """DSPy signature for query generation task - TINKER VERSION."""
    prompt_with_candidates = dspy.InputField(
        desc="Complete prompt with food candidates and instructions"
    )
    esci_label = dspy.InputField(
        desc="ESCI label (E/S/C/I) to generate queries for"
    )
    generated_queries: OutputFormat = dspy.OutputField(
        desc="Structured output containing candidates and their generated queries with dimensions"
    )

class QueryGenerator(dspy.Module):
    """DSPy module for generating food delivery queries - TINKER VERSION."""

    def __init__(self):
        super().__init__()
        # EXPERIMENT HERE: Try different DSPy modules
        # self.generate = dspy.Predict(QueryGeneratorSignatureTinker)
        # self.generate = dspy.ReAct(QueryGeneratorSignatureTinker)
        self.generate = dspy.ChainOfThought(QueryGeneratorSignatureTinker)

    def forward(self, prompt_with_candidates: str, esci_label: str) -> OutputFormat:
        """Generate queries using DSPy with detailed logging."""
        print(f"üöÄ API call starting...")
        print(f"   Model: {CONFIG['MODEL']}")
        print(f"   ESCI: {esci_label}")
        print(f"   Prompt: {len(prompt_with_candidates)} chars")
        
        try:
            result = self.generate(
                prompt_with_candidates=prompt_with_candidates,
                esci_label=esci_label
            )
            print(f"   ‚úÖ Success: Generated structured output")
            return result.generated_queries
            
        except Exception as e:
            print(f"   ‚ùå Failed: {e}")
            raise

# Initialize tinker generator
generator = QueryGenerator()
print("‚úÖ QueryGeneratorTinker ready")
print("‚úÖ Updated to use structured OutputFormat instead of JSON string")

‚úÖ QueryGeneratorTinker ready
‚úÖ Updated to use structured OutputFormat instead of JSON string


In [None]:
# === API CALL 2: USING TINKER GENERATOR (DETAILED DEBUGGING) ===
print("üéØ Method 2: Tinker generator with detailed debugging")
print("-" * 50)

try:
    # Using the tinker generator with same prompt
    result_json = generator(prompt, CONFIG['ESCI_LABEL'])
    
    print(f"üìû Tinker API response received")
    print(f"üìè Response length: {len(result_json)} characters")
    
    # Display full response for debugging
    print(f"\nüîç FULL TINKER RESPONSE:")
    print("=" * 60)
    print(result_json)
    print("=" * 60)
    
except Exception as e:
    print(f"‚ùå Tinker API call failed: {e}")
    tinker_result_json = None

üéØ Method 2: Tinker generator with detailed debugging
--------------------------------------------------
üöÄ API call starting...
   Model: gpt-5-mini
   ESCI: E
   Prompt: 8138 chars
   ‚úÖ Success: Generated structured output
üìû Tinker API response received
‚ùå Tinker API call failed: object of type 'NoneType' has no len()
   ‚úÖ Success: Generated structured output
üìû Tinker API response received
‚ùå Tinker API call failed: object of type 'NoneType' has no len()


In [18]:
result_json

OutputFormat(candidates=[CandidateOutput(id=666, name='Stir-fried meat with vegetables', queries=[QueryWithDimensions(query='stir-fried meat with vegetables', dimensions={}), QueryWithDimensions(query='stir fry meat w veg delivery under 30 minutes near me', dimensions={'cuisine': 'Chinese', 'urgency': 'under 30 minutes', 'location': 'near me'})]), CandidateOutput(id=128, name='Breakfast Plate', queries=[QueryWithDimensions(query='Breakfast Plate', dimensions={}), QueryWithDimensions(query='healthy Breakfast Plate with salmon under $15 near me', dimensions={'healthiness': 'Healthy', 'price': 'under $15', 'meal_type': 'Breakfast', 'location': 'near me'})]), CandidateOutput(id=785, name='Vegetable Stir-Fry', queries=[QueryWithDimensions(query='Vegetable Stir-Fry', dimensions={}), QueryWithDimensions(query='vegetable stir-fry vegan low-calorie', dimensions={'dietary_restrictions': 'Vegan', 'healthiness': 'Low-calorie'})]), CandidateOutput(id=151, name='Noodle Bowl with Meat and Vegetables'

## üìä Convert to Pandas DataFrame

**Convert structured output to flat DataFrame - one query per row.**

In [23]:
def convert_output_to_dataframe(output: OutputFormat) -> pd.DataFrame:
    """
    Convert OutputFormat to pandas DataFrame with one query per row.
    
    Args:
        output: OutputFormat object with candidates and queries
        
    Returns:
        pandas.DataFrame with columns:
        - candidate_id: int
        - candidate_name: str  
        - query: str
        - dimensions_json: str (JSON string of dimensions)
        - Individual dimension columns (cuisine, price, location, etc.)
    """
    rows = []
    
    # Collect all unique dimension keys first
    all_dimension_keys = set()
    for candidate in output.candidates:
        for query_obj in candidate.queries:
            all_dimension_keys.update(query_obj.dimensions.keys())
    
    # Convert to sorted list for consistent column ordering
    dimension_columns = sorted(list(all_dimension_keys))
    
    # Process each candidate and query
    for candidate in output.candidates:
        for query_obj in candidate.queries:
            row = {
                'candidate_id': candidate.id,
                'candidate_name': candidate.name,
                'query': query_obj.query,
                'dimensions_json': json.dumps(query_obj.dimensions) if query_obj.dimensions else "{}"
            }
            
            # Add individual dimension columns (None if not present)
            for dim_key in dimension_columns:
                row[f'dim_{dim_key}'] = query_obj.dimensions.get(dim_key, None)
            
            rows.append(row)
    
    df = pd.DataFrame(rows)
    
    # Reorder columns for better readability
    base_columns = ['candidate_id', 'candidate_name', 'query', 'dimensions_json']
    dim_columns = [col for col in df.columns if col.startswith('dim_')]
    df = df[base_columns + dim_columns]
    
    return df

print("‚úÖ DataFrame conversion function ready")

‚úÖ DataFrame conversion function ready


In [24]:
# Convert the structured output to DataFrame
if 'result_json' in locals() and result_json:
    queries_df = convert_output_to_dataframe(result_json)
    
    print(f"üìä DataFrame created: {len(queries_df)} rows, {len(queries_df.columns)} columns")
    print(f"üîç Columns: {list(queries_df.columns)}")
    
    # Display the DataFrame
    print(f"\nüìã Query DataFrame (one query per row):")
    print("=" * 80)
    display(queries_df)
    
    # Quick statistics
    print(f"\nüìà Quick Stats:")
    print(f"   Total queries: {len(queries_df)}")
    print(f"   Unique candidates: {queries_df['candidate_id'].nunique()}")
    print(f"   Queries per candidate: {len(queries_df) / queries_df['candidate_id'].nunique():.1f}")
    
    # Show dimension usage
    dim_cols = [col for col in queries_df.columns if col.startswith('dim_')]
    print(f"   Dimension columns: {len(dim_cols)}")
    for col in dim_cols:
        non_null_count = queries_df[col].notna().sum()
        print(f"     {col}: {non_null_count} queries ({non_null_count/len(queries_df)*100:.1f}%)")
        
else:
    print("‚ùå No result_json available to convert")

üìä DataFrame created: 10 rows, 12 columns
üîç Columns: ['candidate_id', 'candidate_name', 'query', 'dimensions_json', 'dim_cuisine', 'dim_dietary_restrictions', 'dim_healthiness', 'dim_location', 'dim_meal_type', 'dim_nutritional_profile', 'dim_price', 'dim_urgency']

üìã Query DataFrame (one query per row):


Unnamed: 0,candidate_id,candidate_name,query,dimensions_json,dim_cuisine,dim_dietary_restrictions,dim_healthiness,dim_location,dim_meal_type,dim_nutritional_profile,dim_price,dim_urgency
0,666,Stir-fried meat with vegetables,stir-fried meat with vegetables,{},,,,,,,,
1,666,Stir-fried meat with vegetables,stir fry meat w veg delivery under 30 minutes ...,"{""cuisine"": ""Chinese"", ""urgency"": ""under 30 mi...",Chinese,,,near me,,,,under 30 minutes
2,128,Breakfast Plate,Breakfast Plate,{},,,,,,,,
3,128,Breakfast Plate,healthy Breakfast Plate with salmon under $15 ...,"{""healthiness"": ""Healthy"", ""price"": ""under $15...",,,Healthy,near me,Breakfast,,under $15,
4,785,Vegetable Stir-Fry,Vegetable Stir-Fry,{},,,,,,,,
5,785,Vegetable Stir-Fry,vegetable stir-fry vegan low-calorie,"{""dietary_restrictions"": ""Vegan"", ""healthiness...",,Vegan,Low-calorie,,,,,
6,151,Noodle Bowl with Meat and Vegetables,Noodle Bowl with Meat and Vegetables,{},,,,,,,,
7,151,Noodle Bowl with Meat and Vegetables,noodle bowl w meat & veg high protein dinner u...,"{""nutritional_profile"": ""High Protein"", ""meal_...",,,,,Dinner,High Protein,under $20,fast delivery
8,919,Mixed Chinese Dishes,Mixed Chinese Dishes,{},,,,,,,,
9,919,Mixed Chinese Dishes,mixed chinese dishes family pack cheap near me,"{""price"": ""family pack"", ""cuisine"": ""Chinese"",...",Chinese,,,near me,,,family pack,



üìà Quick Stats:
   Total queries: 10
   Unique candidates: 5
   Queries per candidate: 2.0
   Dimension columns: 8
     dim_cuisine: 2 queries (20.0%)
     dim_dietary_restrictions: 1 queries (10.0%)
     dim_healthiness: 2 queries (20.0%)
     dim_location: 3 queries (30.0%)
     dim_meal_type: 2 queries (20.0%)
     dim_nutritional_profile: 1 queries (10.0%)
     dim_price: 3 queries (30.0%)
     dim_urgency: 2 queries (20.0%)


In [None]:
# Optional: Save DataFrame to CSV
save_csv = True  # Set to False to skip saving

if save_csv and 'queries_df' in locals():
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_path = f"output/queries_dataframe_{CONFIG['ESCI_LABEL']}_batch{CONFIG['BATCH_SIZE']}_{timestamp}.csv"
    
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    queries_df.to_csv(csv_path, index=False)
    
    print(f"üíæ DataFrame saved to CSV: {csv_path}")
elif save_csv:
    print("‚ö†Ô∏è No DataFrame to save")
else:
    print("üì≠ CSV saving disabled")