### Initialize environment, imports, and configuration

In [1]:
import subprocess
import sys
import os
import ast
import getpass
import random

import nest_asyncio
import pandas as pd

from phoenix.client import AsyncClient
from phoenix.client.types import PromptVersion
from phoenix.evals import LiteLLMModel, PromptTemplate, llm_generate

from google import genai
from pathlib import Path
from tqdm import tqdm
from dotenv import load_dotenv
# Load environment variables
load_dotenv()

# Enable nested async for Jupyter
nest_asyncio.apply()

print("‚úÖ All imports successful!")

  from .autonotebook import tqdm as notebook_tqdm


‚úÖ All imports successful!


In [3]:
if "GEMINI_API_KEY" not in os.environ or not os.environ["GEMINI_API_KEY"]:
    os.environ["GEMINI_API_KEY"] = getpass.getpass("Enter your GEMINI_API_KEY: ")

In [25]:
# Configuration
MODEL_NAME = "gemini/gemini-2.5-flash"
OUTPUT_DIR = Path("./data")
OUTPUT_DIR.mkdir(exist_ok=True)

# Set up Phoenix Gemini model
phoenix_model = LiteLLMModel(model=MODEL_NAME, temperature=0.9)

In [7]:
! phoenix serve

üèÉ‚Äç‚ôÄÔ∏è‚Äç‚û°Ô∏è Running migrations on the database.
---------------------------
2025-11-29 14:50:36,931 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-11-29 14:50:36,931 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("alembic_version")
2025-11-29 14:50:36,931 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-11-29 14:50:36,935 INFO sqlalchemy.engine.Engine SELECT alembic_version.version_num 
FROM alembic_version
2025-11-29 14:50:36,935 INFO sqlalchemy.engine.Engine [generated in 0.00009s] ()
2025-11-29 14:50:36,943 INFO sqlalchemy.engine.Engine COMMIT
---------------------------
‚úÖ Migrations completed in 0.014 seconds.
[32mINFO[0m:     Started server process [[36m13780[0m]
[32mINFO[0m:     Waiting for application startup.


‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ïó ‚ñà‚ñà‚ïó  ‚ñà‚ñà‚ïó ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ïó ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ïó‚ñà‚ñà‚ñà‚ïó   ‚ñà‚ñà‚ïó‚ñà‚ñà‚ïó‚ñà‚ñà‚ïó  ‚ñà‚ñà‚ïó
‚ñà‚ñà‚ïî‚ïê‚ïê‚ñà‚ñà‚ïó‚ñà‚ñà‚ïë  ‚ñà‚ñà‚ïë‚ñà‚ñà‚ïî‚ïê‚ïê‚ïê‚ñà‚ñà‚ïó‚ñà‚ñà‚ïî‚ïê‚ïê‚ïê‚ïê‚ïù‚ñà‚

In [8]:
# Create recipe assistant prompts
recipe_prompt_v1 = """
You are a helpful, accurate, and creative recipe assistant. Your job is to generate easy-to-follow, reliable recipes and cooking advice tailored to the user query below.

Core Responsibilities:
- Always include an ingredient list with precise measurements in standard US or metric units.
- Always include clear, numbered, step-by-step instructions that are logically ordered and easy to follow.
- Always structure your response in Markdown.

Ingredient Guidelines:
- Never suggest rare, expensive, or difficult-to-obtain ingredients without clearly providing readily available substitutions.
- Be specific with ingredients (e.g., "1 cup unsweetened almond milk" instead of "milk").

Instructional Guidelines:
- Do not skip steps or assume prior knowledge.
- Use direct, instructional language.
- Include preparation and cook time only if reliably known.

Behavior & Ethics:
- Never include unsafe, unethical, or harmful suggestions. Politely decline and explain briefly if a request cannot be fulfilled - without being preachy or moralizing.
- Never use offensive or derogatory language.
- Use creative combinations when a direct recipe doesn't exist, but clearly state when you're improvising.

Style & Formatting:
Structure all responses using the following Markdown format:

Begin with:
## Recipe Name
A 1-3 sentence, enticing description of the dish and why or when it's great.

### Ingredients
List all ingredients using bullet points, each with precise amounts and clear names.

### Instructions
1. Provide step-by-step instructions in logical cooking order.

Optionally include, if relevant:
### Notes
Additional context or background information.

### Tips
Suggestions for technique, improvements, or best results.

### Variations
Common substitutions or flavor variations.

User Query: {query}

"""

prompt_name = "recipe-assistant-v1"
px_client = AsyncClient()

prompt = await px_client.prompts.create(
    name=prompt_name,
    prompt_description="Basic recipe assistant prompt",
    version=PromptVersion(
        [
            {"role": "system", "content": recipe_prompt_v1},
            {"role": "user", "content": "{query}"}
        ],
        model_name="gemini-2.5-flash",
        model_provider="GOOGLE",
    ),
)

print("\nüéØ Prompt created! You can now view it in the Phoenix UI under the 'Prompts' section.")



üéØ Prompt created! You can now view it in the Phoenix UI under the 'Prompts' section.


In [9]:
# Define 4 key dimensions for Recipe Bot testing with specific values

DIMENSIONS = {
    "dietary_restriction": ["vegan", "vegetarian", "gluten-free", "keto", "no restrictions"],
    "cuisine_type": ["Italian", "Asian", "Mexican", "Mediterranean", "American", "any cuisine"],
    "meal_type": ["breakfast", "lunch", "dinner", "snack", "dessert"],
    "skill_level": ["beginner", "intermediate", "advanced"],
}

print("üéØ Defined key dimensions for Recipe Bot testing:")
for dim, values in DIMENSIONS.items():
    print(f"   {dim}: {', '.join(values)}")

print(
    f"\nTotal possible combinations: {len(DIMENSIONS['dietary_restriction']) * len(DIMENSIONS['cuisine_type']) * len(DIMENSIONS['meal_type']) * len(DIMENSIONS['skill_level'])}"
)

üéØ Defined key dimensions for Recipe Bot testing:
   dietary_restriction: vegan, vegetarian, gluten-free, keto, no restrictions
   cuisine_type: Italian, Asian, Mexican, Mediterranean, American, any cuisine
   meal_type: breakfast, lunch, dinner, snack, dessert
   skill_level: beginner, intermediate, advanced

Total possible combinations: 450


In [10]:
# Step 1: Generate diverse dimension tuples programmatically to ensure variety
print("üéØ Generating 8 diverse dimension tuples programmatically...")

# Create diverse combinations by sampling systematically
dimension_tuples = []
random.seed(42)  # For reproducible results

# Generate 8 diverse tuples
for i in range(8):
    tuple_data = {
        "dietary_restriction": random.choice(DIMENSIONS["dietary_restriction"]),
        "cuisine_type": random.choice(DIMENSIONS["cuisine_type"]),
        "meal_type": random.choice(DIMENSIONS["meal_type"]),
        "skill_level": random.choice(DIMENSIONS["skill_level"]),
        "tuple_id": i + 1,
    }
    dimension_tuples.append(tuple_data)

print(f"‚úÖ Generated {len(dimension_tuples)} diverse dimension tuples")

# Step 2: Show some examples to verify diversity
print("\nüìã Sample dimension tuples:")
for i in range(min(5, len(dimension_tuples))):
    tuple_data = dimension_tuples[i]
    print(
        f"\nTuple {i + 1}: {tuple_data['dietary_restriction']}, {tuple_data['cuisine_type']}, {tuple_data['meal_type']}, {tuple_data['skill_level']}"
    )

print(f"\n‚úÖ Successfully created {len(dimension_tuples)} diverse dimension tuples")

üéØ Generating 8 diverse dimension tuples programmatically...
‚úÖ Generated 8 diverse dimension tuples

üìã Sample dimension tuples:

Tuple 1: vegan, Italian, dinner, beginner

Tuple 2: vegetarian, Asian, breakfast, advanced

Tuple 3: no restrictions, Italian, dessert, intermediate

Tuple 4: vegan, Italian, breakfast, beginner

Tuple 5: vegetarian, American, dessert, beginner

‚úÖ Successfully created 8 diverse dimension tuples


In [14]:
client = genai.Client()

final_data = []
idx        = 0
selected_tuples = random.sample(dimension_tuples, 8)

for tuple_data in tqdm(selected_tuples, desc="Building synthetic recipe dataset"):
    
    tuple_str = f"dietary_restriction: {tuple_data['dietary_restriction']}, cuisine_type: {tuple_data['cuisine_type']}, meal_type: {tuple_data['meal_type']}, skill_level: {tuple_data['skill_level']}"
    
    query_template = f"""
    Convert this dimension tuple into a realistic user query for a Recipe Bot:
    
    Dimension tuple: {tuple_str}
    
    Create a natural language query that a real user with these characteristics might ask. Be creative and vary your style significantly.
    
    Vary your vocabulary, sentence structure, and level of detail. Generate 1 unique, realistic query:
    """   

    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=query_template,
    )
    final_data.append(
        {
            "id": f"SYN{idx + 1:03d}",
            "query": response.text,
            "dietary_restriction": tuple_data["dietary_restriction"],
            "cuisine_type": tuple_data["cuisine_type"],
            "meal_type": tuple_data["meal_type"],
            "skill_level": tuple_data["skill_level"],
            "tuple_description": tuple_str,
        }
    )
    idx+=1

all_queries_df = pd.DataFrame(final_data)
print(f"\nüéØ Created dataset with {len(all_queries_df)} queries ready for testing!")

Building synthetic recipe dataset: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8/8 [00:39<00:00,  4.93s/it]


üéØ Created dataset with 8 queries ready for testing!





In [15]:
# Display all rows and columns, and show full text in each cell for all_queries_df
with pd.option_context(
    "display.max_rows", None, "display.max_columns", None, "display.max_colwidth", None
):
    display(all_queries_df)

Unnamed: 0,id,query,dietary_restriction,cuisine_type,meal_type,skill_level,tuple_description
0,SYN001,"""Hey Recipe Bot, I'm trying to find something special for dessert tonight. I'm vegetarian, and I'm really in the mood for some Mediterranean vibes ‚Äì think fresh, not too heavy. I'm a pretty confident cook, so I'm up for an intermediate challenge, but nothing that'll take me all day. Got any delicious, slightly ambitious ideas for a sweet treat?""",vegetarian,Mediterranean,dessert,intermediate,"dietary_restriction: vegetarian, cuisine_type: Mediterranean, meal_type: dessert, skill_level: intermediate"
1,SYN002,"""Hey, I'm really in the mood for some Italian tonight, but I need something completely plant-based. My cooking skills are pretty basic, so do you have any super simple vegan dinner recipes for a beginner like me?""",vegan,Italian,dinner,beginner,"dietary_restriction: vegan, cuisine_type: Italian, meal_type: dinner, skill_level: beginner"
2,SYN003,"""Hey Recipe Bot, I'm really craving something sweet, but I'm just learning to bake! Can you find me a super easy, classic American dessert recipe that's totally vegetarian and pretty much impossible for a beginner to mess up?""",vegetarian,American,dessert,beginner,"dietary_restriction: vegetarian, cuisine_type: American, meal_type: dessert, skill_level: beginner"
3,SYN004,"Okay, recipe bot, I'm looking to whip up an Italian dessert tonight. I'd say my cooking skills are somewhere in the middle ‚Äì I can handle a bit of a project, but nothing that requires three days of prep or super niche ingredients. Surprise me with a delicious, authentic-feeling Italian sweet treat!",no restrictions,Italian,dessert,intermediate,"dietary_restriction: no restrictions, cuisine_type: Italian, meal_type: dessert, skill_level: intermediate"
4,SYN005,"""Alright, Recipe Bot, I'm feeling a bit adventurous tonight! Could you hit me with an *intermediate-level* **Asian dessert** recipe? Something that's a nice challenge without requiring a culinary degree, and I'm totally open to anything ‚Äì no dietary hangups at all.""",no restrictions,Asian,dessert,intermediate,"dietary_restriction: no restrictions, cuisine_type: Asian, meal_type: dessert, skill_level: intermediate"
5,SYN006,"""I'm really craving something Italian for breakfast, but I'm completely new to cooking and eating plant-based. Can you suggest some super easy, vegan-friendly morning meals with an Italian flair that even a total beginner could whip up?""",vegan,Italian,breakfast,beginner,"dietary_restriction: vegan, cuisine_type: Italian, meal_type: breakfast, skill_level: beginner"
6,SYN007,"I'm hoping to make a delightful, plant-based Asian snack that's a bit more involved than basic, but still very achievable for someone who enjoys a moderate culinary challenge. Any intriguing ideas come to mind?",vegan,Asian,snack,intermediate,"dietary_restriction: vegan, cuisine_type: Asian, meal_type: snack, skill_level: intermediate"
7,SYN008,"""I'm feeling ambitious this morning! Can you suggest an elaborate, authentic Asian breakfast that's completely meat-free? I'm looking for a real culinary project, something intricate that will challenge my advanced cooking skills.""",vegetarian,Asian,breakfast,advanced,"dietary_restriction: vegetarian, cuisine_type: Asian, meal_type: breakfast, skill_level: advanced"


In [16]:
# Save the dataset to CSV for easy use
output_path = OUTPUT_DIR / "generated_synthetic_queries.csv"
all_queries_df.to_csv(output_path, index=False)

print(f"üíæ Saved dataset to: {output_path}")
print(f"üìä Ready for testing with {len(all_queries_df)} queries!")

üíæ Saved dataset to: data/generated_synthetic_queries.csv
üìä Ready for testing with 8 queries!


In [18]:
dataset = await px_client.datasets.create_dataset(
    dataframe=all_queries_df,
    name="recipe-bot-synthetic-queries-3",
    input_keys=["query"],
)

### Download Gold Standard Human Labelled Dataset from Phoenix

In [21]:
labelled_data = pd.read_csv("data/labeled_synthetic_data.csv")
labelled_data = labelled_data[["input","output","notes","trace_id"]]
labelled_data.head()

Unnamed: 0,input,output,notes,trace_id
0,"{""query"": ""\""Hey Recipe Bot, I'm feeling a bit...","{""messages"": [{""role"": ""assistant"", ""content"":...",Perfect,82bb1a9baf0978cb26238d9de6991a0a
1,"{""query"": ""\""Hey Recipe Bot, I'm hoping you ca...","{""messages"": [{""role"": ""assistant"", ""content"":...",Perfect,99c1fa91cd1983dbed8ad6de3a1e2adb
2,"{""query"": ""\""Hey there! I'm trying to whip up ...","{""messages"": [{""role"": ""assistant"", ""content"":...",egg is not vegetarian,4f87b41e7813973892257a8207a8f0b7
3,"{""query"": ""\""Hey there! I'm feeling ambitious ...","{""messages"": [{""role"": ""assistant"", ""content"":...",requires specialized equipment and long prep time,2308fc59ed1e6c1e235672e312dc5d1
4,"{""query"": ""\""Alright, Recipe Bot, I'm feeling ...","{""messages"": [{""role"": ""assistant"", ""content"":...",The recipe requires 3.5 hours approximately wh...,4e25ddaf26c0f8605ce4c083f5fe43b5


In [22]:
prompt = f"""
You are analyzing Recipe Bot failures. Look at these examples where a user queried the bot, the bot responded, and an analyst (me) described what went wrong.

EXAMPLES:
{labelled_data.to_json(orient="records", lines=True)}

Based on the patterns you see in the analyst's descriptions of what went wrong, create 4-6 systematic failure mode labels that would be useful for categorizing these types of issues.

Each label should:
- Be short and clear (2 words max)
- Capture a distinct type of failure pattern
- Be applicable to multiple traces

Respond STRICTLY with a list of failure mode labels: ["label1", "label2", "label3", "label4", "label5", "label6"]

Here is one example : ["Dietary Ignored", "Formatting Error", "Complexity Mismatch", "Meal Type Mismatch", "Ingredient Omission", "Skill Level Misalignment"]
"""  # noqa: E501

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=prompt,
)

response_content = response.text

print(response_content)

["Dietary Mismatch", "Skill Mismatch", "Time Constraint", "Cuisine Mismatch", "Instruction Detail"]


In [None]:
failure_mode_labels = ast.literal_eval(response_content)
print(failure_mode_labels)

['Dietary Mismatch', 'Skill Mismatch', 'Time Constraint', 'Cuisine Mismatch', 'Instruction Detail']


In [27]:
# Create template for applying labels
classification_template = PromptTemplate(f"""
Look at this Recipe Bot interaction and the analyst's description of what went wrong.
Apply the most appropriate failure mode label(s) from the provided options.

USER QUERY: {{input}}
BOT RESPONSE: {{output}}
ANALYST'S ISSUE DESCRIPTION: {{notes}}

AVAILABLE FAILURE MODE LABELS:
{failure_mode_labels}

Based on the analyst's description of the issue, pick the failure mode that best apply to this case.

Respond with just the label name
""")

# Run llm_generate for classification

results = llm_generate(dataframe=labelled_data, template=classification_template, model=phoenix_model)

results.head()

llm_generate |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8/8 (100.0%) | ‚è≥ 00:38<00:00 |  4.86s/it


Unnamed: 0,output
0,
1,
2,Dietary Mismatch
3,
4,Time Constraint


In [28]:
# Count the occurrences of each failure mode label in the results
label_counts = results["output"].value_counts()
label_counts

output
                      3
Dietary Mismatch      2
Time Constraint       1
Cuisine Mismatch      1
Instruction Detail    1
Name: count, dtype: int64

In [29]:
# Join results to combined_df on the index (axis=1), then rename 'output' to 'failure model'
final_data = labelled_data.join(results.rename(columns={"output": "failure model"}))
final_data.head()

final_data.to_csv("data/labeled_synthetic_data.csv", index=False)