# Evaluation of Voice-to-Voice Multi-Agent Product Assistant

This notebook evaluates our LangGraph-based, multi-agent product recommender.

We measure:

- **ROUGE-L**: similarity of answers to reference responses
- **BLEU**: n-gram precision vs reference
- **Precision@K**: relevance of top-K retrieved products (optional, needs labels)
- **Latency (cold vs warm cache)**: end-to-end response time
- **Cost (optional)**: approximate $-cost per query based on tokens

> **Important**: Before running this notebook, make sure the MCP server is running.


In [2]:
!pip install rouge-score nltk requests

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting nltk
  Downloading nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Downloading nltk-3.9.2-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.5 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.5 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.5 MB ? eta -:--:--
   ------------- -------------------------- 0.5/1.5 MB 372.9 kB/s eta 0:00:03
   ------------- -------------------------- 0.5/1.5 MB 372.9 kB/s eta 0:0

In [3]:
import sys
import time
import statistics
from typing import List, Dict, Any, Optional

sys.path.append(".")

import requests
from src.agents.graph import invoke_with_logging
from src.mcp_server.config import config

# Optional metrics
try:
    from rouge_score import rouge_scorer
except ImportError:
    rouge_scorer = None
    print("‚ö†Ô∏è rouge-score not installed, ROUGE-L will be skipped. Install with: pip install rouge-score")

try:
    from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
except ImportError:
    sentence_bleu = None
    SmoothingFunction = None
    print("‚ö†Ô∏è nltk not installed, BLEU will be skipped. Install with: pip install nltk")


In [9]:
"""
set OPENAI_API_KEY and SERPAPI_KEY
"""

import os
from getpass import getpass
from dotenv import load_dotenv, find_dotenv

# Optional: Load .env file automatically if it exists
# This saves you from typing keys if you have a .env file locally
# _ = load_dotenv(find_dotenv())

def check_and_set_key(var_name, display_name):
    """
    Ensure an environment variable that holds a secret is set for this session.

    If ``var_name`` is already present (and non-empty) in the environment, only a
    confirmation is printed. Otherwise the user is prompted through ``getpass``
    (keystrokes are not echoed) and the stripped input is stored in
    ``os.environ[var_name]``.

    No part of the secret is ever printed, so the value cannot leak into the
    notebook's saved output.

    Args:
        var_name: Name of the environment variable to check / set.
        display_name: Human-readable label used in the prompt and in messages.

    Returns:
        None on every path, so ending a cell with a call to this function never
        renders a secret as the cell's output.
    """
    # 1. Try to get the variable from the environment
    key = os.getenv(var_name)

    if key:
        # Already configured: confirm presence only, never echo the value itself.
        print(f"‚úÖ {var_name} is already set (value hidden).")
        return None

    # 2. Not found (or empty): prompt the user
    print(f"‚ö†Ô∏è {var_name} not found in environment.")
    # getpass hides the keystrokes for security
    key_input = getpass(f"üëâ Please enter your {display_name}: ").strip()

    if not key_input:
        print(f"‚ùå Error: You did not enter {display_name}. The program may not function correctly.")
        return None

    # 3. Set the variable in the current session
    os.environ[var_name] = key_input
    print(f"‚úÖ {var_name} has been set manually!")
    return None

# --- Execute Checks ---
# Each call either confirms the variable is already set or prompts for it and
# stores the entered value in os.environ for the rest of this kernel session.

# 1. Check OpenAI API Key
check_and_set_key("OPENAI_API_KEY", "OpenAI API Key")

# Visual separator between the two checks in the cell output
print("-" * 30)

# 2. Check SerpApi Key
# Note: Ensure this variable name matches what your code expects (e.g. SERPAPI_KEY or SERPAPI_API_KEY)
check_and_set_key("SERPAPI_KEY", "SerpApi Key")

‚ö†Ô∏è OPENAI_API_KEY not found in environment.
‚úÖ OPENAI_API_KEY has been set manually!
------------------------------
‚ö†Ô∏è SERPAPI_KEY not found in environment.
‚úÖ SERPAPI_KEY has been set manually!


In [22]:
import chromadb
import pandas as pd

# Location of the persisted Chroma store, relative to the notebook's working directory
CHROMA_PATH = "vectordb/chroma"

client = chromadb.PersistentClient(path=CHROMA_PATH)
collection = client.get_collection(name="products")

# Fetch records with collection.get(): no ids / where filter / limit is passed, so this
# is a plain fetch rather than a similarity query (there is no n_results involved).
# The recorded output below shows 8661 records came back.
raw = collection.get(include=["metadatas", "documents"])

# Sanity check: ids, metadatas and documents should all have the same length
len(raw["ids"]), len(raw["metadatas"]), len(raw["documents"])


(8661, 8661, 8661)

In [23]:
def chroma_to_dataframe(raw):
    """
    Flatten a Chroma ``get()`` payload into one DataFrame row per record.

    ``raw["ids"]`` becomes the ``doc_id`` column, ``raw["documents"]`` becomes the
    ``document`` column, and every key found in the ``raw["metadatas"]`` dicts is
    expanded into its own column (joined on the positional index).
    """
    base_columns = pd.DataFrame({
        "doc_id": raw["ids"],
        "document": raw["documents"],
    })
    # One column per metadata key; rows line up with base_columns by position
    metadata_columns = pd.json_normalize(raw["metadatas"])
    return base_columns.join(metadata_columns)

# Product catalog as a flat table; the rule-based relevant_ids_* functions filter on it
df_products = chroma_to_dataframe(raw)
# Preview the first rows
df_products.head()


Unnamed: 0,doc_id,document,price_bucket,price,sub_category,product_url,eco_friendly,title,brand,image_url,product_id,has_material,main_category
0,doc_00000,"DB Longboards CoreFlex Crossbow 41"" Bamboo Fib...",premium,237.68,&,https://www.amazon.com/DB-Longboards-CoreFlex-...,False,"DB Longboards CoreFlex Crossbow 41"" Bamboo Fib...",DB,https://images-na.ssl-images-amazon.com/images...,4c69b61db1fc16e7013b43fc926e502d,True,Sports
1,doc_00001,"Electronic Snap Circuits Mini Kits Classpack, ...",premium,99.95,&,https://www.amazon.com/Electronic-Circuits-Cla...,False,"Electronic Snap Circuits Mini Kits Classpack, ...",Electronic,https://images-na.ssl-images-amazon.com/images...,66d49bbed043f5be260fa9f7fbff5957,False,Toys
2,doc_00002,3Doodler Create Flexy 3D Printing Filament Ref...,mid,34.99,&,https://www.amazon.com/3Doodler-Plastic-Innova...,True,3Doodler Create Flexy 3D Printing Filament Ref...,3Doodler,https://images-na.ssl-images-amazon.com/images...,2c55cae269aebf53838484b0d7dd931a,True,Toys
3,doc_00003,Guillow Airplane Design Studio with Travel Cas...,mid,28.91,&,https://www.amazon.com/Guillow-Airplane-Design...,False,Guillow Airplane Design Studio with Travel Cas...,Guillow,https://images-na.ssl-images-amazon.com/images...,18018b6bc416dab347b1b7db79994afa,True,Toys
4,doc_00004,Woodstock- Collage 500 pc Puzzle | Puzzle has ...,mid,17.49,&,https://www.amazon.com/Woodstock-Collage-500-p...,False,Woodstock- Collage 500 pc Puzzle,Woodstock-,https://images-na.ssl-images-amazon.com/images...,e04b990e95bf73bbe6a3fa09785d7cd0,True,Toys


In [24]:
# List the available columns (the label rules rely on title, price, brand, main_category, eco_friendly)
df_products.columns


Index(['doc_id', 'document', 'price_bucket', 'price', 'sub_category',
       'product_url', 'eco_friendly', 'title', 'brand', 'image_url',
       'product_id', 'has_material', 'main_category'],
      dtype='object')

In [48]:
# Evaluation dataset: 10 realistic voice-style queries
# Each item has:
#   - id: short key ("q1".."q10") used to look up the matching relevant_ids_* rule
#   - query: user question
#   - reference: ideal answer written by us (for ROUGE/BLEU)
#   - relevant_ids: (optional) list of correct product doc_ids for Precision@K;
#     left empty here and filled in later by populate_relevant_ids()

EVAL_SET: List[Dict[str, Any]] = [
    {
        "id": "q1",
        "query": "Find me a wooden puzzle for kids under $20",
        "reference": (
            "Here is a wooden puzzle for kids under $20 from our catalog, "
            "with good reviews and a safe, child-friendly design."
        ),
        "relevant_ids": [],
    },
    {
        "id": "q2",
        "query": "I need a stainless steel cleaner that‚Äôs eco-friendly and under $15",
        "reference": (
            "Here is an eco-friendly stainless steel cleaner under $15, "
            "made with non-toxic ingredients and good customer ratings."
        ),
        "relevant_ids": [],
    },
    {
        "id": "q3",
        "query": "Show me eco-friendly toys under $15 for a 4 year old",
        "reference": (
            "Here are eco-friendly toys under $15 suitable for a 4 year old, "
            "all made from sustainable materials and within your budget."
        ),
        "relevant_ids": [],
    },
    {
        "id": "q4",
        "query": "Recommend a reusable stainless steel water bottle 32 ounce under $25",
        "reference": (
            "Here is a reusable 32 ounce stainless steel water bottle under $25 "
            "with good insulation and strong customer reviews."
        ),
        "relevant_ids": [],
    },
    {
        "id": "q5",
        "query": "Lunch box under $30",
        "reference": (
            "Here is a durable and insulated lunch box under $30 with a leak-proof design, "
            "featuring multiple compartments and excellent ratings for keeping food fresh."
        ),
        "relevant_ids": [],
    },
    {
        "id": "q6",
        "query": "I want a LEGO building set for kids around $30",
        "reference": (
            "Here is a LEGO building set for kids around $30 from our catalog, "
            "with age-appropriate difficulty and good ratings."
        ),
        "relevant_ids": [],
    },
    {
        "id": "q7",
        "query": "Show me a non-stick frying pan under $40 that works on induction",
        "reference": (
            "Here is a non-stick frying pan under $40 that works on induction cooktops, "
            "with durable coating and positive reviews."
        ),
        "relevant_ids": [],
    },
    {
        "id": "q8",
        "query": "Skateboard under $60",
        "reference": (
            "Here is a durable skateboard under $60 suitable for beginners, "
            "featuring a sturdy maple deck and smooth customer reviews."
        ),
        "relevant_ids": [],
    },
    {
        "id": "q9",
        "query": "Party decorations under $100",
        "reference": (
            "Here is a complete party decoration set under $100 that includes "
            "balloons, banners, and tableware, perfect for various events and well reviewed."
        ),
        "relevant_ids": [],
    },
    {
        "id": "q10",
        "query": "I need an ergonomic office chair under $250",
        "reference": (
            "Here is an ergonomic office chair under $250 with adjustable lumbar support "
            "and good long-session comfort reviews."
        ),
        "relevant_ids": [],
    },
]

K = 5  # For Precision@K
# Sanity check: number of evaluation queries
len(EVAL_SET)

10

In [49]:
"""
Rule-based "relevant_ids" labelling functions
"""

def get_doc_ids(mask, top_n=5):
    """Return the ``doc_id`` values (as strings) of the first ``top_n`` rows of
    ``df_products`` selected by the boolean ``mask``."""
    selected_ids = df_products.loc[mask, "doc_id"]
    return selected_ids.head(top_n).astype(str).tolist()


def relevant_ids_q1():
    """Label rule for q1 ("Find me a wooden puzzle for kids under $20").

    Selects rows whose title contains "puzzle" (case-insensitive) and whose price
    is at most 20; when a ``main_category`` column exists, the category must also
    contain "toy".
    """
    is_puzzle = df_products["title"].str.contains("puzzle", case=False, na=False)
    within_budget = df_products["price"] <= 20
    selected = is_puzzle & within_budget
    # Category restriction is applied only if the catalog exposes the column
    if "main_category" in df_products.columns:
        is_toy = df_products["main_category"].str.contains("toy", case=False, na=False)
        selected = selected & is_toy
    return get_doc_ids(selected)


def relevant_ids_q2():
    """Label rule for q2 ("stainless steel cleaner, eco-friendly, under $15").

    Only two conditions are applied: the title contains "clean" (case-insensitive)
    and the price is at most 15. The "stainless" and ``eco_friendly`` constraints
    from the query are not enforced by this rule.
    """
    title_has_clean = df_products["title"].str.contains("clean", case=False, na=False)
    within_budget = df_products["price"] <= 15
    return get_doc_ids(title_has_clean & within_budget)


def relevant_ids_q3():
    """Label rule for q3 ("eco-friendly toys under $15 for a 4 year old").

    Starts from price at most 15, then narrows by ``main_category`` containing
    "toy" and by ``eco_friendly`` being True, each only if that column exists.
    The age constraint from the query is not part of the rule.
    """
    available_columns = df_products.columns
    selected = df_products["price"] <= 15
    if "main_category" in available_columns:
        selected = selected & df_products["main_category"].str.contains("toy", case=False, na=False)
    if "eco_friendly" in available_columns:
        selected = selected & df_products["eco_friendly"].eq(True)
    return get_doc_ids(selected)


def relevant_ids_q4():
    """Label rule for q4 ("reusable stainless steel water bottle 32 ounce under $25").

    Matches titles containing "bottle" (case-insensitive) with price at most 25.
    Neither the "stainless" nor the 32-ounce constraint is enforced by this rule.
    """
    title_has_bottle = df_products["title"].str.contains("bottle", case=False, na=False)
    within_budget = df_products["price"] <= 25
    return get_doc_ids(title_has_bottle & within_budget)


def relevant_ids_q5():
    """Label rule for q5 ("lunch box under $30").

    Matches titles containing "lunch box" (case-insensitive) with price at most 30.
    No ``eco_friendly`` filter is applied.
    """
    title_has_lunch_box = df_products["title"].str.contains("lunch box", case=False, na=False)
    within_budget = df_products["price"] <= 30
    return get_doc_ids(title_has_lunch_box & within_budget)


def relevant_ids_q6():
    """Label rule for q6 ("LEGO building set around $30").

    "Around $30" is interpreted as a price between 20 and 40 (inclusive bounds of
    ``Series.between``). "lego" is searched case-insensitively in the ``brand``
    column when it exists, otherwise in ``title``.
    """
    lego_column = "brand" if "brand" in df_products.columns else "title"
    is_lego = df_products[lego_column].str.contains("lego", case=False, na=False)
    near_thirty = df_products["price"].between(20, 40)  # "around 30"
    return get_doc_ids(near_thirty & is_lego)


def relevant_ids_q7():
    """Label rule for q7 ("non-stick frying pan under $40 that works on induction").

    Matches titles containing the substring "pan" (case-insensitive) with price at
    most 40. The non-stick and induction constraints are not enforced, and because
    this is a substring match it also accepts titles where "pan" is part of a
    longer word.
    """
    title_has_pan = df_products["title"].str.contains("pan", case=False, na=False)
    within_budget = df_products["price"] <= 40
    return get_doc_ids(title_has_pan & within_budget)


def relevant_ids_q8():
    """Label rule for q8 ("skateboard under $60"): title contains "skateboard"
    (case-insensitive) and price is at most 60."""
    title_has_skateboard = df_products["title"].str.contains("skateboard", case=False, na=False)
    within_budget = df_products["price"] <= 60
    return get_doc_ids(title_has_skateboard & within_budget)


def relevant_ids_q9():
    """Label rule for q9 ("party decorations under $100"): title contains either
    "party" or "decorations" (case-insensitive) and price is at most 100."""
    # Regex alternation: either keyword anywhere in the title
    title_matches = df_products["title"].str.contains("party|decorations", case=False, na=False)
    within_budget = df_products["price"] <= 100
    return get_doc_ids(title_matches & within_budget)


def relevant_ids_q10():
    """Label rule for q10 ("ergonomic office chair under $250").

    Matches titles containing "chair" (case-insensitive) with price at most 250.
    The "office" / "ergonomic" constraints are not enforced by this rule.
    """
    title_has_chair = df_products["title"].str.contains("chair", case=False, na=False)
    within_budget = df_products["price"] <= 250
    return get_doc_ids(title_has_chair & within_budget)


In [50]:
def populate_relevant_ids(eval_set):
    """
    Attach rule-based labels to the evaluation items.

    For every item whose ``"id"`` has a matching ``relevant_ids_*`` rule, the rule is
    called and its result is written to ``item["relevant_ids"]``. Items are modified
    in place; the same list object is also returned for convenience. Items without
    a rule are left untouched.
    """
    rule_by_query_id = {
        "q1": relevant_ids_q1,
        "q2": relevant_ids_q2,
        "q3": relevant_ids_q3,
        "q4": relevant_ids_q4,
        "q5": relevant_ids_q5,
        "q6": relevant_ids_q6,
        "q7": relevant_ids_q7,
        "q8": relevant_ids_q8,
        "q9": relevant_ids_q9,
        "q10": relevant_ids_q10,
    }

    for entry in eval_set:
        rule = rule_by_query_id.get(entry["id"])
        if rule is None:
            continue
        entry["relevant_ids"] = rule()
    return eval_set

# Fill in relevant_ids for every query (the items of EVAL_SET are updated in place)
EVAL_SET = populate_relevant_ids(EVAL_SET)

# Quick check: how many labelled doc_ids each query received
[(e["id"], len(e["relevant_ids"])) for e in EVAL_SET]


[('q1', 5),
 ('q2', 5),
 ('q3', 5),
 ('q4', 5),
 ('q5', 5),
 ('q6', 5),
 ('q7', 5),
 ('q8', 5),
 ('q9', 5),
 ('q10', 5)]

In [37]:
def compute_rouge_l(reference: str, hypothesis: str) -> Optional[float]:
    """Return the ROUGE-L F-measure of ``hypothesis`` against ``reference``
    (stemming enabled), or None when the ``rouge_score`` package is unavailable."""
    if rouge_scorer is not None:
        rouge_l = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
        result = rouge_l.score(reference, hypothesis)["rougeL"]
        return result.fmeasure
    return None


def compute_bleu(reference: str, hypothesis: str) -> Optional[float]:
    """Return the sentence-level BLEU of ``hypothesis`` against a single
    ``reference`` (both split on whitespace), or None when nltk is unavailable.

    ``SmoothingFunction().method4`` is used for smoothing whenever
    ``SmoothingFunction`` was imported.
    """
    if sentence_bleu is None:
        return None
    smoothing = None
    if SmoothingFunction:
        smoothing = SmoothingFunction().method4
    return sentence_bleu(
        [reference.split()],
        hypothesis.split(),
        smoothing_function=smoothing,
    )


def precision_at_k(pred_ids: List[str], true_ids: List[str], k: int) -> Optional[float]:
    """
    Precision over the first ``k`` predictions.

    The score is ``hits / n`` where ``n`` is the number of predictions actually
    considered, i.e. ``min(k, len(pred_ids))``. A system that returns fewer than
    ``k`` items is therefore not penalised for the missing slots.

    Args:
        pred_ids: Predicted ids in ranked order (best first).
        true_ids: Ids labelled as relevant for the query.
        k: Cut-off rank; must be >= 0.

    Returns:
        None if no labels are provided; 0.0 if there are no predictions within the
        cut-off (including ``k == 0``); otherwise the fraction of the considered
        predictions that are relevant.

    Raises:
        ValueError: if ``k`` is negative (a negative slice bound would silently
            drop predictions from the end and yield a meaningless score).
    """
    if not true_ids:
        return None
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")
    top_k = pred_ids[:k]
    if not top_k:
        return 0.0
    # Set membership keeps the hit count linear in the number of predictions
    relevant = set(true_ids)
    hits = sum(1 for pid in top_k if pid in relevant)
    return hits / len(top_k)


In [38]:
# def clear_mcp_cache() -> None:
#     """Call MCP /clear_cache to reset RAG + web caches."""
#     url = f"http://{config.MCP_HOST}:{config.MCP_PORT}/clear_cache"
#     try:
#         resp = requests.post(url, timeout=5)
#         if resp.status_code == 200:
#             print("[CACHE] Cleared MCP cache")
#         else:
#             print(f"[CACHE] Failed to clear cache: HTTP {resp.status_code}")
#     except Exception as e:
#         print(f"[CACHE] Error calling clear_cache: {e}")

def clear_mcp_cache():
    """
    Empty the RAG and web caches exposed by ``src.mcp_server.utils.cache``.

    Calls ``.clear()`` on ``rag_cache`` and then on ``web_cache`` and prints a
    confirmation. Any exception (including a failed import) is caught and reported
    with a printed message instead of being raised, so the call is best-effort and
    can be repeated.

    NOTE(review): the cache objects are imported into this kernel's process; if the
    MCP server runs as a separate process, its own caches are presumably not
    affected by this call — confirm before trusting cold-cache latencies.
    """
    try:
        from src.mcp_server.utils.cache import rag_cache, web_cache
        for cache in (rag_cache, web_cache):
            cache.clear()
        print("[CACHE] Cleared rag_cache and web_cache")
    except Exception as err:
        print("[CACHE] Could not clear caches:", err)



def extract_retrieved_ids(result: Dict[str, Any]) -> List[str]:
    """
    Extract product ids from the graph result for Precision@K.

    The result fields are tried in priority order and the first field that yields
    at least one id wins:

        1. ``retrieved_products`` (generic name)
        2. ``rag_results``        (the field the graph result actually exposes)
        3. ``citations``          (fallback)

    Each entry is expected to be a dict; the id is the first truthy value among
    ``doc_id``, ``product_id`` and ``id``. Non-dict entries and missing / ``None``
    fields are skipped instead of raising.

    NOTE(review): two shapes are additionally tolerated on the assumption that they
    may occur — a field that is a dict wrapping the list under ``"results"``, and
    entries that carry their ids inside a nested ``"metadata"`` dict. Confirm
    against one real ``result`` and trim if unnecessary.

    Args:
        result: Final state dict returned by the agent graph.

    Returns:
        List of ids as strings, in the order they appear; empty if none found.
    """
    id_fields = ("doc_id", "product_id", "id")
    source_keys = ("retrieved_products", "rag_results", "citations")

    def _id_of(entry: Any) -> Optional[str]:
        """Return the entry's id as a string, or None if it has no usable id."""
        if not isinstance(entry, dict):
            return None
        holders = [entry]
        nested = entry.get("metadata")
        if isinstance(nested, dict):
            holders.append(nested)
        for holder in holders:
            for field in id_fields:
                value = holder.get(field)
                if value:
                    return str(value)
        return None

    for key in source_keys:
        container = result.get(key)
        if isinstance(container, dict):
            container = container.get("results")
        if not isinstance(container, (list, tuple)):
            continue
        ids = [pid for pid in map(_id_of, container) if pid is not None]
        if ids:
            return ids

    return []


def estimate_cost(execution_stats: Dict[str, Any], price_per_1k: float = 0.002) -> Optional[float]:
    """
    Rough cost estimation based on token usage, if logged by your logger.

    Input and output tokens are summed and charged at one flat rate.

    Args:
        execution_stats: Stats dict; the keys read are ``"total_input_tokens"``
            and ``"total_output_tokens"``.
        price_per_1k: Price in dollars per 1,000 tokens. The default (0.002) is
            only an example value; pass the real price of the model in use.

    Returns:
        Estimated cost in dollars, or None when either token count is missing.
    """
    input_tokens = execution_stats.get("total_input_tokens")
    output_tokens = execution_stats.get("total_output_tokens")

    # Without both counts there is nothing meaningful to estimate
    if input_tokens is None or output_tokens is None:
        return None

    total_tokens = input_tokens + output_tokens
    return (total_tokens / 1000.0) * price_per_1k


In [51]:
# Smoke test: run the first evaluation query once, starting from cleared caches
sample_query = EVAL_SET[0]["query"]
print("Sample query:", sample_query)

clear_mcp_cache()
result_debug = invoke_with_logging(sample_query)

# Inspect the top-level fields of the result; extract_retrieved_ids() must read
# ids from fields that actually appear in this list
result_debug.keys()


Sample query: Find me a wooden puzzle for kids under $20
[CACHE] Cleared rag_cache and web_cache

[ROUTER] Extracted intent: {'product_type': 'puzzle', 'budget': None, 'price_min': None, 'price_max': 20.0, 'category': 'Toys', 'eco_friendly': None, 'materials': ['wood'], 'age_range': 'kids', 'brand': None}

[SAFETY] Is safe: True

[PLANNER] Plan: Search 2020 catalog, then verify current web prices for comparison
[PLANNER] Tools: ['rag.search', 'web.search']
[PLANNER] RAG params: {'query': 'wooden kids puzzle', 'price_min': None, 'price_max': 20.0, 'category': 'Toys', 'eco_friendly': None, 'top_k': 5}

[EXECUTOR] Calling rag.search...
[EXECUTOR] RAG found: 5 products

[EXECUTOR] Calling web.search (Google Shopping via MCP)...
[EXECUTOR] Price filter: under $20.0
[EXECUTOR] Running 5 price checks in PARALLEL...
[EXECUTOR] ‚úì Dragon Boat... ‚Üí 40 results
[EXECUTOR] ‚úì Melissa & Doug Barnyard Wooden Jigsaw Pu... ‚Üí 40 results
[EXECUTOR] ‚úì Melissa & Doug Personalized Vehicles Woo... ‚Ü

dict_keys(['user_query', 'intent', 'is_safe', 'safety_reason', 'plan', 'tools_to_call', 'rag_params', 'rag_results', 'web_results', 'matched_products', 'conflicts', 'comparison_table', 'final_answer', 'citations', 'tts_summary', '_logging'])

In [13]:
def run_eval(cold_cache: bool = True) -> Dict[str, Any]:
    """
    Run evaluation over EVAL_SET and aggregate the metrics.

    Every query is sent through ``invoke_with_logging``. The final answer is scored
    against the reference (ROUGE-L, BLEU), the extracted ids are scored against the
    item's ``relevant_ids`` (Precision@K using the module-level ``K``), and latency
    and estimated cost are recorded. Per-query metrics are printed as they are
    computed.

    Args:
        cold_cache: If True, ``clear_mcp_cache()`` is called before each query
            (simulates worst-case latency). If False, caches are left untouched
            between queries.

    Returns:
        Dict with keys ``run_type``, ``avg_rouge_l``, ``avg_bleu``,
        ``avg_precision_at_k``, ``avg_latency_ms``, ``avg_cost`` (each average is
        None when no query produced that metric) and ``per_query`` (list of
        per-query result dicts).
    """
    run_type = "COLD" if cold_cache else "WARM"
    print(f"\n===== EVALUATION RUN ({run_type} CACHE) =====\n")

    per_query_results = []

    for item in EVAL_SET:
        qid = item["id"]
        query = item["query"]
        reference = item["reference"]
        relevant_ids = item.get("relevant_ids") or []

        print(f"\n--- [{run_type}] {qid}: {query} ---")

        if cold_cache:
            clear_mcp_cache()

        # perf_counter is monotonic, so the interval cannot be skewed by clock adjustments
        t0 = time.perf_counter()
        result = invoke_with_logging(query)
        t1 = time.perf_counter()

        # `or` guards against keys that are present but explicitly None
        final_answer = result.get("final_answer") or ""
        logging_meta = result.get("_logging") or {}
        exec_stats = logging_meta.get("execution_stats") or {}

        # Latency: prefer the duration logged by the graph, fall back to wall time
        latency_ms_logged = exec_stats.get("total_duration_ms")
        latency_ms_wall = (t1 - t0) * 1000.0
        latency_ms = latency_ms_logged or latency_ms_wall

        # ROUGE & BLEU
        rouge = compute_rouge_l(reference, final_answer)
        bleu = compute_bleu(reference, final_answer)

        # Precision@K
        pred_ids = extract_retrieved_ids(result)
        p_at_k = precision_at_k(pred_ids, relevant_ids, K)

        # Cost
        cost = estimate_cost(exec_stats)

        per_query_results.append(
            {
                "id": qid,
                "query": query,
                "reference": reference,
                "answer": final_answer,
                "rouge_l": rouge,
                "bleu": bleu,
                "precision_at_k": p_at_k,
                "latency_ms": latency_ms,
                "cost": cost,
            }
        )

        print(f"  ROUGE-L:       {rouge:.4f}" if rouge is not None else "  ROUGE-L:       (skipped)")
        print(f"  BLEU:          {bleu:.4f}" if bleu is not None else "  BLEU:          (skipped)")
        print(f"  Precision@{K}: {p_at_k:.4f}" if p_at_k is not None else f"  Precision@{K}: (no labels)")
        print(f"  Latency:       {latency_ms:.1f} ms")
        if cost is not None:
            print(f"  Cost:          ${cost:.5f}")
        else:
            print("  Cost:          (tokens not logged)")

    # Aggregate metrics
    def avg(values: List[Optional[float]]) -> Optional[float]:
        """Mean of the non-None values, or None if there are none."""
        vals = [v for v in values if v is not None]
        return statistics.mean(vals) if vals else None

    # The per-query dicts store the metric under "precision_at_k"; the summary key
    # follows the same naming so reporting code can read "avg_precision_at_k".
    summary = {
        "run_type": run_type,
        "avg_rouge_l": avg([r["rouge_l"] for r in per_query_results]),
        "avg_bleu": avg([r["bleu"] for r in per_query_results]),
        "avg_precision_at_k": avg([r["precision_at_k"] for r in per_query_results]),
        "avg_latency_ms": avg([r["latency_ms"] for r in per_query_results]),
        "avg_cost": avg([r["cost"] for r in per_query_results]),
        "per_query": per_query_results,
    }

    return summary

In [52]:
# Cold pass first (caches cleared before every query), then a warm pass that
# reuses whatever the cold pass left in the caches.
cold_summary = run_eval(cold_cache=True)
warm_summary = run_eval(cold_cache=False)



===== EVALUATION RUN (COLD CACHE) =====


--- [COLD] q1: Find me a wooden puzzle for kids under $20 ---
[CACHE] Cleared rag_cache and web_cache

[ROUTER] Extracted intent: {'product_type': 'puzzle', 'budget': None, 'price_min': None, 'price_max': 20.0, 'category': 'Toys', 'eco_friendly': None, 'materials': ['wood'], 'age_range': 'kids', 'brand': None}

[SAFETY] Is safe: True

[PLANNER] Plan: Search 2020 catalog, then verify current web prices for comparison
[PLANNER] Tools: ['rag.search', 'web.search']
[PLANNER] RAG params: {'query': 'wooden kids puzzle', 'price_min': None, 'price_max': 20.0, 'category': 'Toys', 'eco_friendly': None, 'top_k': 5}

[EXECUTOR] Calling rag.search...
[EXECUTOR] RAG found: 5 products

[EXECUTOR] Calling web.search (Google Shopping via MCP)...
[EXECUTOR] Price filter: under $20.0
[EXECUTOR] Running 5 price checks in PARALLEL...
[EXECUTOR] ‚úì Melissa & Doug Barnyard Wooden Jigsaw Pu... ‚Üí 40 results
[EXECUTOR] ‚úì Mudpuppy 4-Layer Transportation Friends ...

### First evaluation run

In [15]:
import pandas as pd

# Report for whatever cold_summary / warm_summary currently hold in the kernel;
# re-run the evaluation cell first to refresh them.
cold_df = pd.DataFrame(cold_summary["per_query"])
warm_df = pd.DataFrame(warm_summary["per_query"])

# Per-query tables: one row per query, metric columns only (query/answer text omitted)
print("Cold cache ‚Äì per query metrics")
display(cold_df[["id", "rouge_l", "bleu", "precision_at_k", "latency_ms", "cost"]])

print("Warm cache ‚Äì per query metrics")
display(warm_df[["id", "rouge_l", "bleu", "precision_at_k", "latency_ms", "cost"]])

# Aggregate table: one row per run type, built from the avg_* keys of each summary
print("Aggregate metrics")
agg_df = pd.DataFrame([
    {
        "run_type": "COLD",
        "avg_rouge_l": cold_summary["avg_rouge_l"],
        "avg_bleu": cold_summary["avg_bleu"],
        "avg_precision_at_k": cold_summary["avg_precision_at_k"],
        "avg_latency_ms": cold_summary["avg_latency_ms"],
        "avg_cost": cold_summary["avg_cost"],
    },
    {
        "run_type": "WARM",
        "avg_rouge_l": warm_summary["avg_rouge_l"],
        "avg_bleu": warm_summary["avg_bleu"],
        "avg_precision_at_k": warm_summary["avg_precision_at_k"],
        "avg_latency_ms": warm_summary["avg_latency_ms"],
        "avg_cost": warm_summary["avg_cost"],
    }
])

display(agg_df)


Cold cache ‚Äì per query metrics


Unnamed: 0,id,rouge_l,bleu,precision_at_k,latency_ms,cost
0,q1,0.162963,0.01519,,16188.04,
1,q2,0.111111,0.015165,,20275.41,
2,q3,0.149254,0.015016,,61400.68,
3,q4,0.139241,0.024793,,19671.0,
4,q5,0.146667,0.04138,,23723.43,
5,q6,0.088106,0.008722,,54645.01,
6,q7,0.129496,0.027822,,16871.4,
7,q8,0.093023,0.003318,,17175.13,
8,q9,0.095238,0.014567,,2487.28,
9,q10,0.108527,0.008012,,51415.75,


Warm cache ‚Äì per query metrics


Unnamed: 0,id,rouge_l,bleu,precision_at_k,latency_ms,cost
0,q1,0.135593,0.012438,,23674.49,
1,q2,0.101449,0.017041,,12097.28,
2,q3,0.138889,0.014307,,18843.57,
3,q4,0.131579,0.02596,,59040.11,
4,q5,0.140625,0.047407,,12395.82,
5,q6,0.090909,0.008561,,21209.33,
6,q7,0.192593,0.085209,,12616.69,
7,q8,0.076923,0.003264,,14170.88,
8,q9,0.095238,0.014567,,2304.78,
9,q10,0.116505,0.012198,,16978.94,


Aggregate metrics


Unnamed: 0,run_type,avg_rouge_l,avg_bleu,avg_precision_at_k,avg_latency_ms,avg_cost
0,COLD,0.122363,0.017398,,28385.313,
1,WARM,0.12203,0.024095,,19333.189,


### Second evaluation run

In [33]:
import pandas as pd

# Report for whatever cold_summary / warm_summary currently hold in the kernel;
# re-run the evaluation cell first to refresh them.
cold_df = pd.DataFrame(cold_summary["per_query"])
warm_df = pd.DataFrame(warm_summary["per_query"])

# Per-query tables: one row per query, metric columns only (query/answer text omitted)
print("Cold cache ‚Äì per query metrics")
display(cold_df[["id", "rouge_l", "bleu", "precision_at_k", "latency_ms", "cost"]])

print("Warm cache ‚Äì per query metrics")
display(warm_df[["id", "rouge_l", "bleu", "precision_at_k", "latency_ms", "cost"]])

# Aggregate table: one row per run type, built from the avg_* keys of each summary
print("Aggregate metrics")
agg_df = pd.DataFrame([
    {
        "run_type": "COLD",
        "avg_rouge_l": cold_summary["avg_rouge_l"],
        "avg_bleu": cold_summary["avg_bleu"],
        "avg_precision_at_k": cold_summary["avg_precision_at_k"],
        "avg_latency_ms": cold_summary["avg_latency_ms"],
        "avg_cost": cold_summary["avg_cost"],
    },
    {
        "run_type": "WARM",
        "avg_rouge_l": warm_summary["avg_rouge_l"],
        "avg_bleu": warm_summary["avg_bleu"],
        "avg_precision_at_k": warm_summary["avg_precision_at_k"],
        "avg_latency_ms": warm_summary["avg_latency_ms"],
        "avg_cost": warm_summary["avg_cost"],
    }
])

display(agg_df)


Cold cache ‚Äì per query metrics


Unnamed: 0,id,rouge_l,bleu,precision_at_k,latency_ms,cost
0,q1,0.135802,0.019546,0.0,159676.65,
1,q2,0.137931,0.023563,0.0,126972.67,
2,q3,0.215686,0.050967,0.0,196098.1,
3,q4,0.136986,0.027331,0.0,39603.51,
4,q5,0.147541,0.049479,0.0,22224.23,
5,q6,0.081633,0.007654,0.0,59489.07,
6,q7,0.094488,0.008502,0.0,29423.92,
7,q8,0.081081,0.003999,0.0,21316.63,
8,q9,0.095238,0.014567,,2346.22,
9,q10,0.081301,0.007625,0.0,21859.51,


Warm cache ‚Äì per query metrics


Unnamed: 0,id,rouge_l,bleu,precision_at_k,latency_ms,cost
0,q1,0.134021,0.010875,0.0,33838.47,
1,q2,0.116667,0.015785,0.0,14334.66,
2,q3,0.150943,0.014097,0.0,64595.53,
3,q4,0.136054,0.028366,0.0,20620.19,
4,q5,0.147541,0.049479,0.0,13060.31,
5,q6,0.083682,0.008128,0.0,23745.73,
6,q7,0.094488,0.008502,0.0,14642.35,
7,q8,0.075188,0.00316,0.0,15264.9,
8,q9,0.095238,0.014567,,3416.14,
9,q10,0.086957,0.007632,0.0,13121.72,


Aggregate metrics


Unnamed: 0,run_type,avg_rouge_l,avg_bleu,avg_precision_at_k,avg_latency_ms,avg_cost
0,COLD,0.120769,0.021323,0.0,67901.051,
1,WARM,0.112078,0.016059,0.0,21664.0,


### Third evaluation run

In [53]:
import pandas as pd

# Report for whatever cold_summary / warm_summary currently hold in the kernel;
# re-run the evaluation cell first to refresh them.
cold_df = pd.DataFrame(cold_summary["per_query"])
warm_df = pd.DataFrame(warm_summary["per_query"])

# Per-query tables: one row per query, metric columns only (query/answer text omitted)
print("Cold cache ‚Äì per query metrics")
display(cold_df[["id", "rouge_l", "bleu", "precision_at_k", "latency_ms", "cost"]])

print("Warm cache ‚Äì per query metrics")
display(warm_df[["id", "rouge_l", "bleu", "precision_at_k", "latency_ms", "cost"]])

# Aggregate table: one row per run type, built from the avg_* keys of each summary
print("Aggregate metrics")
agg_df = pd.DataFrame([
    {
        "run_type": "COLD",
        "avg_rouge_l": cold_summary["avg_rouge_l"],
        "avg_bleu": cold_summary["avg_bleu"],
        "avg_precision_at_k": cold_summary["avg_precision_at_k"],
        "avg_latency_ms": cold_summary["avg_latency_ms"],
        "avg_cost": cold_summary["avg_cost"],
    },
    {
        "run_type": "WARM",
        "avg_rouge_l": warm_summary["avg_rouge_l"],
        "avg_bleu": warm_summary["avg_bleu"],
        "avg_precision_at_k": warm_summary["avg_precision_at_k"],
        "avg_latency_ms": warm_summary["avg_latency_ms"],
        "avg_cost": warm_summary["avg_cost"],
    }
])

display(agg_df)


Cold cache ‚Äì per query metrics


Unnamed: 0,id,rouge_l,bleu,precision_at_k,latency_ms,cost
0,q1,0.157895,0.013975,0.0,16411.29,
1,q2,0.115942,0.017041,0.0,20388.39,
2,q3,0.15544,0.014974,0.0,58690.34,
3,q4,0.139241,0.026674,0.0,24332.43,
4,q5,0.052045,0.00407,0.0,55529.62,
5,q6,0.148649,0.013196,0.0,57549.75,
6,q7,0.123077,0.011182,0.0,15779.5,
7,q8,0.052402,0.002779,0.0,50256.91,
8,q9,0.070312,0.004676,0.0,50038.29,
9,q10,0.094595,0.00973,0.0,40554.23,


Warm cache ‚Äì per query metrics


Unnamed: 0,id,rouge_l,bleu,precision_at_k,latency_ms,cost
0,q1,0.12973,0.011578,0.0,24249.62,
1,q2,0.137931,0.023563,0.0,10441.25,
2,q3,0.180791,0.015565,0.0,22186.45,
3,q4,0.128205,0.025363,0.0,15344.45,
4,q5,0.059041,0.004573,0.0,26957.99,
5,q6,0.11,0.009955,0.0,20933.84,
6,q7,0.119403,0.010716,0.0,10372.55,
7,q8,0.066667,0.003129,0.0,17507.04,
8,q9,0.126126,0.010078,0.0,15691.01,
9,q10,0.126582,0.006642,0.0,12741.05,


Aggregate metrics


Unnamed: 0,run_type,avg_rouge_l,avg_bleu,avg_precision_at_k,avg_latency_ms,avg_cost
0,COLD,0.11096,0.01183,0.0,38953.075,
1,WARM,0.118448,0.012116,0.0,17642.525,
