# LLM-as-a-Judge Evaluation

Evaluating model output with an LLM.

Install `prettytable` to render the evaluation results as a nicely formatted table.

In [None]:
!pip3 install prettytable==3.10.2 openai==1.39.0 

In [None]:
!pip3 show prettytable openai

In [None]:
import ipython_secrets
import os

# Look up the `OPENAI_API_KEY` secret through ipython_secrets and export it as
# an environment variable so that later cells can read it with os.getenv.
# NOTE(review): ipython_secrets is not among the packages installed by the
# pip cell in this notebook — confirm it is available in the environment.
os.environ["OPENAI_API_KEY"] = ipython_secrets.get_secret('OPENAI_API_KEY')


In [None]:
import json
from prettytable import PrettyTable
import openai
import os

# Hand the key stored in the OPENAI_API_KEY environment variable to the
# openai module (None when the variable is not set).
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Cases to score: each entry pairs a user "input" with the model "output"
# that the judge will inspect.
evaluation_data = [
    {"input": user_input, "output": model_output}
    for user_input, model_output in (
        (
            "What should I do in New York City in July?",
            "Check out Times Square, go to an outdoor concert, and visit the Statue of Liberty.",
        ),
        (
            "Can you help me with my math homework?",
            "I'm designed to assist with travel queries. For math help, try using online resources like Khan Academy or Mathway.",
        ),
        (
            "What’s the capital of France?",
            "The capital of France is Paris.",
        ),
    )
]


# LLM-as-a-Judge Evaluation metric
# that assesses if the output includes a recommendation.

# Few-shot examples that show the judge what does and does not count as a
# recommendation. Kept at module level so they are built once rather than on
# every evaluation call.
_FEW_SHOT_EXAMPLES = (
    {
        "input": "What are some good restaurants in Paris?",
        "output": "Try Le Jules Verne for an upscale dining experience, or visit Le Relais de l'Entrecôte for a classic steak frites.",
        "recommendation": True,
    },
    {
        "input": "Where should I stay in London?",
        "output": "Consider staying at The Ritz for luxury or the Hoxton for a more budget-friendly option.",
        "recommendation": True,
    },
    {
        "input": "What’s the weather like in Tokyo in winter?",
        "output": "In winter, Tokyo is generally cool with temperatures ranging from 2°C to 12°C. While you're there, consider visiting the hot springs (onsen) for a warm and relaxing experience.",
        "recommendation": True,
    },
    {
        "input": "What’s the population of Berlin?",
        "output": "The population of Berlin is approximately 3.6 million.",
        "recommendation": False,
    },
    {
        "input": "What’s the currency used in Japan?",
        "output": "The currency used in Japan is the Japanese Yen (JPY).",
        "recommendation": False,
    },
)


def _build_recommendation_prompt(input, output):
    """Return the judge prompt: instructions, few-shot examples, then the case to score."""
    sections = ["""Determine whether the following output includes a recommendation based on the input.
Format response as a JSON object with the shape { "recommendation": boolean }.
Examples:
"""]

    for example in _FEW_SHOT_EXAMPLES:
        # Render the Python bool as a JSON literal (true/false).
        label = str(example["recommendation"]).lower()
        sections.append(f"""Input: {example['input']}
Output: {example['output']}
Recommendation: {{ "recommendation": {label} }}
""")

    # The case under evaluation ends with an open "Recommendation:" slot for
    # the model to fill in.
    sections.append(f"""Input: {input}
Output: {output}
Recommendation:""")

    return "".join(sections)


def evaluate_includes_recommendation(input, output):
    """Ask an LLM judge whether `output` contains a recommendation for `input`.

    The parameter name `input` shadows the builtin; it is kept unchanged so
    existing callers (positional or keyword) continue to work.

    Args:
        input: The user query that produced the output.
        output: The model response being judged.

    Returns:
        1 if the judge answers with JSON `true` for "recommendation", else 0.

    Raises:
        ValueError: If the judge response has no text content. Malformed JSON
            raises json.JSONDecodeError, which is a ValueError subclass.
        KeyError: If the JSON reply lacks the "recommendation" key.
    """
    response = openai.chat.completions.create(
        # Use strong evaluator LLM
        model="gpt-4o",
        # Format response as JSON, so it is easier to parse
        response_format={"type": "json_object"},
        messages=[
            {"role": "user", "content": _build_recommendation_prompt(input, output)}
        ],
        # temperature=0 to keep the judge's outputs as consistent as possible
        temperature=0,
    )

    content = response.choices[0].message.content
    if content is None:
        # Without this guard json.loads(None) fails with an opaque TypeError.
        raise ValueError("Judge model returned no content to parse.")

    verdict = json.loads(content)["recommendation"]
    # Only a genuine JSON boolean `true` counts; values such as 1 or "true"
    # do not match the requested response shape.
    return 1 if verdict is True else 0

def calculate_average(scores):
    """Return the arithmetic mean of `scores`.

    Args:
        scores: Any iterable of numbers (list, tuple, generator, ...).

    Returns:
        The mean as a float, or 0.0 when `scores` is empty so callers that
        format the result do not hit a ZeroDivisionError.
    """
    # Materialize once so one-shot iterables can be both summed and counted.
    values = list(scores)
    if not values:
        return 0.0
    return sum(values) / len(values)

# Shorten long strings so they fit comfortably in a table cell.
def truncate_string(s, max_length=30):
    """Return `s` cut to `max_length` characters plus '...' if it is longer.

    Strings whose length does not exceed `max_length` are returned unchanged.
    """
    if len(s) <= max_length:
        return s
    head = s[:max_length]
    return head + '...'

def create_table(data):
    """Score every case in `data` and return the results as a PrettyTable.

    Args:
        data: Iterable of dicts, each with an "input" and an "output" string.
            It is traversed exactly once, so one-shot iterables (e.g.
            generators) are supported.

    Returns:
        A PrettyTable with one row per case (truncated input, truncated
        output, score), a blank separator row, and a final "Average" row.
        When `data` is empty the average is reported as 0.0000.
    """
    table = PrettyTable()
    table.field_names = ["Input", "Output", "Score"]

    # Score and render each case in a single pass over `data`.
    scores = []
    for case in data:
        score = evaluate_includes_recommendation(case["input"], case["output"])
        scores.append(score)
        table.add_row([
            truncate_string(case["input"]),
            truncate_string(case["output"]),
            score,
        ])

    # Add a blank row for visual separation
    table.add_row(["", "", ""])

    # Add the average score to the bottom of the table; guard the empty case
    # here so no division by zero can occur when there is nothing to average.
    average = calculate_average(scores) if scores else 0.0
    table.add_row(["Average", "", f"{average:.4f}"])

    return table

# Build the results table for the sample cases and print it.
# create_table scores each case via evaluate_includes_recommendation, which
# calls the OpenAI API once per case, so running this cell makes one API
# request for every entry in evaluation_data.
result_table = create_table(evaluation_data)
print(result_table)
