# Evaluation of the agent (VisualAgent)

In [1]:
# Install LiteLLM into the kernel's environment; `litellm` is imported in a later cell.
# NOTE(review): the version is not pinned, so re-runs may pick up a different release.
%pip  install litellm

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Silence every warning category for the rest of the session so that
# library deprecation noise does not clutter the cell outputs.
import warnings

warnings.filterwarnings(action="ignore")

In [3]:
# === Base ===
import os
import json
import pandas as pd
from tqdm import tqdm

# === Phoenix core ===
import phoenix as px
from phoenix.trace import SpanEvaluations
from phoenix.trace.dsl import SpanQuery

# === Automatic evaluations ===
from phoenix.evals import (
    TOOL_CALLING_PROMPT_TEMPLATE,
    llm_classify,
    PromptTemplate,
)


# === Local LLM (e.g. llama.cpp or llamafile) ===
#from phoenix.evals.models import 

# === Extra ===
from openinference.instrumentation import suppress_tracing
import nest_asyncio
# NOTE(review): presumably applied so async code can run inside the
# notebook's already-running event loop — confirm it is still required.
nest_asyncio.apply()
import pprint

In [4]:
# Point Ollama-backed clients at the local server (this overwrites any
# value already present in the process environment).
os.environ.update(OLLAMA_API_BASE="http://localhost:11434")

In [5]:
# Phoenix project name; it matches the project reported by the tracer
# registration output of the next cell ("Phoenix Project: evaluating-agent").
# NOTE(review): not referenced by any other visible cell — confirm it is used downstream.
PROJECT_NAME = "evaluating-agent"

In [None]:
from utils_0 import run_graph_with_tracing, start_main_span


<module 'langgraph.version' from '/Users/simop/miniconda3/lib/python3.11/site-packages/langgraph/version.py'>
🔭 OpenTelemetry Tracing Details 🔭
|  Phoenix Project: evaluating-agent
|  Span Processor: SimpleSpanProcessor
|  Collector Endpoint: https://app.phoenix.arize.com/v1/traces
|  Transport: HTTP + protobuf
|  Transport Headers: {'api_key': '****'}
|  
|  Using a default SpanProcessor. `add_span_processor` will overwrite this default.
|  
|  
|  `register` has set this TracerProvider as the global OpenTelemetry default.
|  To disable this behavior, call `register` with `set_global_tracer_provider=False`.



In [7]:
# Phoenix evaluation entry points plus LiteLLM's `completion`.
# NOTE(review): `llm_classify`, `TOOL_CALLING_PROMPT_TEMPLATE` and
# `PromptTemplate` are already imported by the top import cell; only
# `LiteLLMModel` and `completion` are new names here.
from phoenix.evals import llm_classify, TOOL_CALLING_PROMPT_TEMPLATE, PromptTemplate, LiteLLMModel
from litellm import completion


In [8]:
# === Phoenix span queries, one per agent tool ===
from phoenix.trace import SpanEvaluations

# Each query filters spans by tool name and span kind, then renames the
# span's input/output values to the column names listed in `select`.

# SQL generation tool: input value -> `question`, output value -> `query_gen`.
sql_query = SpanQuery().where(
    "name == 'sql_query_exec' and span_kind == 'TOOL'"
).select(question="input.value", query_gen="output.value")

# Data analysis tool: input value -> `query`, output value -> `response`.
analysis_query = SpanQuery().where(
    "name == 'data_analysis' and span_kind == 'TOOL'"
).select(query="input.value", response="output.value")

# Visualization tool: input value -> `input`, output value -> `generated_code`.
viz_query = SpanQuery().where(
    "name == 'gen_visualization' and span_kind == 'TOOL'"
).select(input="input.value", generated_code="output.value")

# Tool-routing step: input value -> `question`, output value -> `tool_call`.
decide_query = SpanQuery().where(
    "span_kind == 'TOOL' and name == 'decide_tool'"
).select(question="input.value", tool_call="output.value")

Create a loop that runs the agent multiple times to see how the "re-runs" differ:
1. Use multiple types of prompt styles for testing
2. Human in the loop?

1. complete
2. How to save a CSV file with the output of the traces.
3. Try to make a decision based on different branches and parallelization
4. Cook up a Docker container
5. Try accessing it through the server

In [12]:
# Smoke test: push one question through the traced graph and keep the
# returned state in `ret` for inspection.
input_state = dict(prompt="What was the most popular product SKU?")
ret = run_graph_with_tracing(input_state)

[LangGraph] Starting LangGraph execution with tracing

Selected tool: lookup_sales_data
Generated SQL Query:
 SELECT SKU_Coded FROM sales GROUP BY SKU_Coded ORDER BY SUM(Total_Sale_Value) DESC LIMIT 1

Selected tool: analyzing_data
Data to analyze:
    SKU_Coded
0    6200700

Selected tool: lookup_sales_data
Generated SQL Query:
 SELECT SKU_Coded FROM sales GROUP BY SKU_Coded ORDER BY SUM(Total_Sale_Value) DESC LIMIT 1

Selected tool: analyzing_data
Data to analyze:
    SKU_Coded
0    6200700

Selected tool: end
[LangGraph] LangGraph execution completed


In [10]:
# Questions sent to the agent, one traced graph run per question.
agent_questions = [
    "What was the most popular product SKU?",
    "What was the total revenue across all stores?",
    "Which store had the highest sales volume?",
    "Create a bar chart showing total sales by store",
    "What was the average transaction value?"
]

# Keep every run's outcome, keyed by question, so earlier results are not
# lost when `ret` is overwritten and failed questions can be told apart
# from successful ones after the loop.
run_results = {}
run_failures = {}

for question in tqdm(agent_questions, desc="Processing questions"):
    input_state = {
        "prompt": question,
    }
    try:
        # `ret` still ends up holding the state of the last successful run,
        # which later cells display.
        ret = run_graph_with_tracing(input_state)
    except Exception as e:
        # Best-effort batch: report the failure, remember it, and move on
        # to the next question instead of aborting the whole evaluation.
        run_failures[question] = e
        print(f"Error processing question: {question}")
        print(e)
    else:
        run_results[question] = ret
        print()

Processing questions:   0%|          | 0/5 [00:00<?, ?it/s]

[LangGraph] Starting LangGraph execution with tracing

Selected tool: lookup_sales_data
Generated SQL Query:
 SELECT SKU_Coded FROM sales GROUP BY SKU_Coded ORDER BY SUM(Total_Sale_Value) DESC LIMIT 1

Selected tool: analyzing_data
Data to analyze:
    SKU_Coded
0    6200700

Selected tool: lookup_sales_data
Generated SQL Query:
 SELECT SKU_Coded FROM sales GROUP BY SKU_Coded ORDER BY SUM(Total_Sale_Value) DESC LIMIT 1

Selected tool: analyzing_data
Data to analyze:
    SKU_Coded
0    6200700

Selected tool: end


Processing questions:  20%|██        | 1/5 [00:33<02:12, 33.04s/it]

[LangGraph] LangGraph execution completed
[LangGraph] Starting LangGraph execution with tracing

Selected tool: lookup_sales_data
Generated SQL Query:
 SELECT SUM(Total_Sale_Value) FROM sales

Selected tool: analyzing_data
Data to analyze:
    sum(Total_Sale_Value)
0           1.327264e+07

Selected tool: analyzing_data
Data to analyze:
    sum(Total_Sale_Value)
0           1.327264e+07

Selected tool: end


Processing questions:  40%|████      | 2/5 [00:53<01:16, 25.51s/it]

[LangGraph] LangGraph execution completed
[LangGraph] Starting LangGraph execution with tracing

Selected tool: lookup_sales_data
Generated SQL Query:
 SELECT Store_Number FROM sales GROUP BY Store_Number ORDER BY SUM(Total_Sale_Value) DESC LIMIT 1

Selected tool: analyzing_data
Data to analyze:
    Store_Number
0          2970

Selected tool: lookup_sales_data
Generated SQL Query:
 SELECT Store_Number FROM sales GROUP BY Store_Number ORDER BY SUM(Total_Sale_Value) DESC LIMIT 1

Selected tool: analyzing_data
Data to analyze:
    Store_Number
0          2970

Selected tool: end
[LangGraph] LangGraph execution completed


Processing questions:  60%|██████    | 3/5 [01:17<00:49, 24.86s/it]

[LangGraph] Starting LangGraph execution with tracing

Selected tool: lookup_sales_data
Generated SQL Query:
 SELECT Store_Number, SUM(Total_Sale_Value) FROM sales GROUP BY Store_Number

Selected tool: create_visualization

Selected tool: analyzing_data
Data to analyze:
     Store_Number  sum(Total_Sale_Value)
0           3410          410567.848126
1            990          378433.018639
2           1650          580443.007953
3            880          420302.088397
4            550          229727.498752
5           4180          272208.118542
6            330          370503.687331
7           1210          508393.767785
8           1760          350747.617798
9           2750          453664.808068
10          1980          242290.828499
11          2310          412579.388504
12          2200          361173.288199
13          3630          405034.547846
14          1870          401070.997685
15          3190          335035.018792
16          1100          497509.528013
17      

Processing questions:  80%|████████  | 4/5 [02:08<00:35, 35.25s/it]

[LangGraph] Starting LangGraph execution with tracing

Selected tool: lookup_sales_data
Generated SQL Query:
 SELECT AVG(Total_Sale_Value) FROM sales

Selected tool: analyzing_data
Data to analyze:
    avg(Total_Sale_Value)
0              19.018132

Selected tool: analyzing_data
Data to analyze:
    avg(Total_Sale_Value)
0              19.018132

Selected tool: end
[LangGraph] LangGraph execution completed


Processing questions: 100%|██████████| 5/5 [02:22<00:00, 28.44s/it]


In [11]:
# Show the state stored in `ret` by the most recent successful
# `run_graph_with_tracing` call.
ret

{'prompt': 'What was the average transaction value?',
 'data': '   avg(Total_Sale_Value)\n0              19.018132',
 'analyze_data': 'The average transaction value is $19.02.',
 'answer': ['The average transaction value is $19.02.',
  'The average transaction value is $19.02.'],
 'visualization_goal': None,
 'chart_config': None,
 'tool_choice': 'end'}