In [2]:
import time
import pandas as pd
from langsmith import Client
from langsmith.evaluation import evaluate
from openevals.llm import create_llm_as_judge
from agent_router import handle_routed_query

# Load evaluation data (expects "question" and "expected_answer" columns)
df = pd.read_csv("test_data.csv")

client = Client()
dataset_name = "Real Estate Q&A Evaluation 2"

# Create or retrieve dataset.
# The existence check is explicit instead of catching every exception raised by
# create_dataset, so genuine failures (bad credentials, network errors) surface
# immediately rather than being mistaken for "dataset already exists".
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
else:
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="Evaluation dataset uploaded from CSV."
    )

# Build one example per CSV row
examples = [
    {"inputs": {"question": question}, "outputs": {"answer": expected_answer}}
    for question, expected_answer in zip(df["question"], df["expected_answer"])
]

# Add examples only when the dataset is still empty, so re-running this cell
# does not append a duplicate copy of every row to the existing dataset.
dataset_is_empty = next(iter(client.list_examples(dataset_id=dataset.id)), None) is None
if dataset_is_empty:
    client.create_examples(dataset_id=dataset.id, examples=examples)

# Target function
def target(inputs: dict) -> dict:
    """Answer one evaluation example by sending its question to the router.

    Args:
        inputs: Example inputs; the question is read from ``inputs["question"]``.

    Returns:
        ``{"answer": <value returned by handle_routed_query>}`` on success.
        If the lookup or the call raises, the exception is not propagated;
        the answer is instead the text ``"Error: <message>"``.
    """
    try:
        answer = handle_routed_query(inputs["question"])
    except Exception as exc:
        # Failures are turned into an answer string so one bad example
        # does not abort the whole run.
        answer = f"Error: {str(exc)}"
    return {"answer": answer}

# Relevance prompt
# NOTE(review): per the openevals documentation, string prompts are filled from
# the `inputs`, `outputs` and `reference_outputs` values the evaluator is called
# with. The previous `{response[answer]}` placeholder referenced a variable the
# LangSmith `evaluate` runner does not pass, so the plain `{inputs}` / `{outputs}`
# placeholders are used here. Confirm against the installed openevals version.
RELEVANCE_PROMPT = """
You are an expert evaluator.

Evaluate how relevant the assistant's answer is to the user's question on a scale of 1 to 10:
- 10 = Perfectly relevant
- 1 = Completely irrelevant

User Question:
{inputs}

Assistant's Answer:
{outputs}

Score (1 to 10):
Give only the score as output, without any additional text.
"""

# Create evaluator
# `choices` restricts the judge to the 1-10 scale the prompt asks for.
# NOTE(review): without `choices`/`continuous`, openevals judges are documented
# to return a boolean score — verify for the installed version.
relevance_evaluator = create_llm_as_judge(
    prompt=RELEVANCE_PROMPT,
    model="openai:gpt-4",
    feedback_key="relevance",
    choices=[float(score) for score in range(1, 11)],
)

# Evaluation with retry
def evaluate_with_retry(max_retries=5):
    """Run the LangSmith evaluation, retrying with backoff on rate limits.

    An exception whose message contains "rate limit" (case-insensitive) causes
    a wait of ``2 ** attempt`` seconds followed by another attempt. Any other
    exception is re-raised immediately.

    Args:
        max_retries: Total number of attempts to make; must be at least 1.

    Returns:
        Whatever ``evaluate`` returns for the first successful attempt.

    Raises:
        ValueError: If ``max_retries`` is less than 1.
        Exception: The last rate-limit error once all attempts are used up
            (previously the function fell out of the loop and returned
            ``None``), or any non-rate-limit error from ``evaluate``.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    for attempt in range(max_retries):
        try:
            return evaluate(
                target,
                data=dataset_name,
                evaluators=[relevance_evaluator],
                experiment_prefix="real-estate-eval-relevance",
                max_concurrency=2,
                blocking=True
            )
        except Exception as e:
            is_rate_limit = "rate limit" in str(e).lower()
            is_last_attempt = attempt == max_retries - 1
            # Give up on unrelated errors, and on the final attempt surface the
            # failure instead of sleeping and silently returning None.
            if not is_rate_limit or is_last_attempt:
                raise
            wait_time = 2 ** attempt
            print(f"Rate limit hit. Retrying in {wait_time}s...")
            time.sleep(wait_time)

# Run evaluation with the default retry budget and keep whatever
# evaluate_with_retry() returns for later inspection.
experiment_results = evaluate_with_retry()



INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:faiss.loader:Loading faiss.
INFO:faiss.loader:Successfully loaded faiss.
INFO:faiss:Failed to load GPU Faiss: name 'GpuIndexIVFFlat' is not defined. Will not load constructor refs for GPU indexes. This is only an error if you're trying to use GPU Faiss.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embed

View the evaluation results for experiment: 'real-estate-eval-relevance-f342fccd' at:
https://smith.langchain.com/o/fe350ab2-22e0-5fde-ae9b-17f69d9aeb79/datasets/92391b92-ac74-4c08-b1d7-664e82db0ea8/compare?selectedSessions=db7d858a-adce-4068-922a-b63575da93eb




0it [00:00, ?it/s]INFO:agent_router:Handling new question: Which agent is really communicative?
INFO:agent_router:Handling new question: Give me properties whose price is less than 400000?
INFO:agent_router:Chat memory provided: None
INFO:agent_router:Chat memory provided: None
INFO:agent_router:Invoking routing logic...
INFO:agent_router:Invoking routing logic...
INFO:agent_router:Routing the query...
INFO:agent_router:Routing the query...
INFO:agent_router:User Question: Which agent is really communicative?
INFO:agent_router:User Question: Give me properties whose price is less than 400000?
INFO:agent_router:Chat Memory: None
INFO:agent_router:Chat Memory: None
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:agent_router:Routing Decision: {'destination': 'firestore'}
INFO:agent_router:Route: firestore
INFO:agent_router:Property Mention: None
INFO:agent_router:Agent Mention: None
INFO:agent_router:Routing to Firestore backend...
INFO:tex

In [9]:
import ast
import json
import os
import re

import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm

# Initialize OpenAI client
load_dotenv()  # load_dotenv() runs first, before the key is read from the environment
openai_api_key = os.environ.get("OPENAI_API_KEY")  # None when the variable is unset
client = OpenAI(api_key=openai_api_key)

# Load dataset (CSV of a previous evaluation run)
df = pd.read_csv("real-estate-eval-relevance-2201abec.csv")

# Extract question and answers from JSON-like columns
def _safe_extract_field(json_str, key):
    """Parse a serialized dict and return ``parsed[key]``, or ``""`` on failure.

    The text is parsed as a Python literal first; if that fails it is retried
    as JSON, so cells containing ``true`` / ``false`` / ``null`` can be read.
    Only parsing and lookup errors are swallowed; ``KeyboardInterrupt`` and
    ``SystemExit`` propagate, which a bare ``except:`` would have hidden.
    """
    for parse in (ast.literal_eval, json.loads):
        try:
            parsed = parse(json_str)
            break
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            continue
    else:
        # Neither parser accepted the cell (e.g. NaN, None, or free text).
        return ""

    try:
        return parsed[key]
    except (KeyError, IndexError, TypeError):
        # Parsed fine, but it is not a mapping holding the requested key.
        return ""


def safe_extract_answer(json_str):
    """Return the ``"answer"`` value of a serialized dict, or ``""`` if unavailable."""
    return _safe_extract_field(json_str, "answer")


def safe_extract_question(json_str):
    """Return the ``"question"`` value of a serialized dict, or ``""`` if unavailable."""
    return _safe_extract_field(json_str, "question")

# Derive plain-value columns from the serialized dict columns:
# (new column, source column, extractor applied to every cell).
for _parsed_col, _raw_col, _extractor in (
    ("question", "inputs", safe_extract_question),
    ("reference_answer", "reference_outputs", safe_extract_answer),
    ("generated_answer", "outputs", safe_extract_answer),
):
    df[_parsed_col] = df[_raw_col].apply(_extractor)

# Build the GPT prompt
def build_prompt(question, reference, generated):
    """Render the grading prompt for one question/answer pair.

    Args:
        question: The original user question.
        reference: The expected answer, shown to the grader for context.
        generated: The AI-generated answer to be scored.

    Returns:
        The full prompt text, asking for a bare 1-10 score.
    """
    template = """
You are a strict evaluator for an AI assistant.

Here is the original user question:
{question}

Expected Answer (for reference):
{reference}

AI-Generated Answer:
{generated}

Evaluate how well the AI-generated answer answers the user’s question.
You may use the expected answer for context, but your score should focus on whether the AI directly and accurately addressed the question.

Score the answer from 1 to 10:
- 10 = Fully and accurately answers the question
- 5 = Partially answers or vague
- 1 = Irrelevant, incorrect, or non-answer

Only return the number. No explanation.
"""
    # Only the template is parsed for placeholders; braces inside the
    # substituted values are inserted verbatim.
    return template.format(question=question, reference=reference, generated=generated)

# Run scoring using OpenAI
def _score_row(row_index, row):
    """Return the 1-10 relevance score for one row of the results frame.

    Rows with an empty question, reference or generated answer get 1 without
    calling the API. A reply with no standalone number from 1 to 10, or any
    error raised while calling the API, is reported as 1 as well.
    """
    question = row["question"]
    reference = row["reference_answer"]
    generated = row["generated_answer"]

    # Nothing to grade when any of the three texts is missing.
    if not (question and reference and generated):
        return 1

    prompt = build_prompt(question, reference, generated)

    try:
        completion = client.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
        )
        reply_text = completion.choices[0].message.content.strip()
        # First standalone integer from 1 to 10 in the reply, if any.
        found = re.search(r'\b([1-9]|10)\b', reply_text)
        if found:
            return int(found.group())
        return 1
    except Exception as e:
        print(f"Error on row {row_index}: {e}")
        return 1


scores = [_score_row(i, row) for i, row in tqdm(df.iterrows(), total=len(df))]

# Save results
df["relevance_score"] = scores
df.to_csv("real-estate-eval-with-gpt-scores.csv", index=False)

print("Saved updated file as: real-estate-eval-with-gpt-scores.csv")


  0%|          | 0/21 [00:00<?, ?it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
  5%|▍         | 1/21 [00:00<00:19,  1.05it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
 10%|▉         | 2/21 [00:01<00:13,  1.45it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
 14%|█▍        | 3/21 [00:01<00:10,  1.76it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
 19%|█▉        | 4/21 [00:02<00:09,  1.85it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
 24%|██▍       | 5/21 [00:02<00:07,  2.06it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
 29%|██▊       | 6/21 [00:03<00:07,  1.90it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
 33%|███▎      | 7/21 [00:03<00:07,  1.87it/s]INF

Saved updated file as: real-estate-eval-with-gpt-scores.csv



