In [1]:
# Load env variables and create client
from dotenv import load_dotenv
from anthropic import Anthropic

# Load environment variables (see python-dotenv) before the client is created.
load_dotenv()

# Shared API client used by chat(). No credentials are passed explicitly here.
# NOTE(review): presumably the SDK reads its API key from the environment
# populated by load_dotenv() above - confirm.
client = Anthropic()
# Model used by generate_dataset() to produce the test cases.
low_model = "claude-3-5-haiku-latest"
# Model used by run_prompt() to answer tasks and by grade_by_model() to grade them.
high_model = "claude-sonnet-4-0"

In [2]:
# Helper functions
def add_user_message(messages, text):
    """Append a user-role turn holding `text` to `messages` (mutated in place)."""
    messages.append({"role": "user", "content": text})


def add_assistant_message(messages, text):
    """Append an assistant-role turn holding `text` to `messages` (mutated in place)."""
    messages.append({"role": "assistant", "content": text})


def chat(model, messages, system=None, temperature=1.0, stop_sequences=None, max_tokens=1000):
    """Send a conversation to the model and return the text of its reply.

    Args:
        model: Model identifier to call.
        messages: List of {"role", "content"} dicts, e.g. as built by
            add_user_message / add_assistant_message.
        system: Optional system prompt; left out of the request when falsy.
        temperature: Sampling temperature forwarded with the request.
        stop_sequences: Optional list of strings forwarded as the request's
            stop sequences. Defaults to an empty list.
        max_tokens: Token limit forwarded with the request (defaults to the
            previously hard-coded 1000).

    Returns:
        The `text` attribute of the first content block of the response.
    """
    params = {
        "model": model,
        "max_tokens": max_tokens,
        "messages": messages,
        "temperature": temperature,
        # A fresh list per call: a mutable `[]` default would be a single
        # object shared by every call that omits the argument.
        "stop_sequences": stop_sequences if stop_sequences is not None else [],
    }

    # Only include `system` when one was supplied.
    if system:
        params["system"] = system

    message = client.messages.create(**params)
    # Only the first content block is read; any further blocks are ignored.
    return message.content[0].text

In [3]:
# Function to generate a new dataset
import json


def generate_dataset(num_cases=6):
    """Ask the low-tier model for a list of evaluation test cases.

    Args:
        num_cases: Number of test-case objects requested in the prompt
            (defaults to the previously hard-coded 6).

    Returns:
        The parsed JSON reply. The prompt asks for an array of objects with
        "task" and "format" keys, but the reply is not validated here.

    Raises:
        json.JSONDecodeError: If the model's reply is not valid JSON.
    """
    # Literal braces in the JSON example are doubled because this is an f-string.
    prompt = f"""
Generate a evaluation dataset for a prompt evaluation. The dataset will be used to evaluate prompts
that generate Python, JSON, or Regex specifically for AWS-related tasks. Generate an array of JSON objects,
each representing task that requires Python, JSON, or a Regex to complete.

Example output:
```json
[
    {{
        "task": "Description of task",
        "format": "json" or "python" or "regex"
    }},
    ...additional
]
```

* Focus on tasks that can be solved by writing a single Python function, a single JSON object, or a regular expression.
* Focus on tasks that do not require writing much code

Please generate {num_cases} objects.
"""

    messages = []
    add_user_message(messages, prompt)
    # Prefill the opening fence and stop at the closing one so that only the
    # fenced JSON body comes back.
    add_assistant_message(messages, "```json")
    text = chat(low_model, messages, stop_sequences=["```"])
    return json.loads(text)

In [None]:
# Build a new dataset with the model and persist it as 'dataset.json'
dataset = generate_dataset()
with open("dataset.json", "w") as f:
    f.write(json.dumps(dataset, indent=4))

In [5]:
dataset

[{'task': 'Create a regex to validate an AWS S3 bucket name (lowercase, alphanumeric, 3-63 characters)',
  'format': 'regex'},
 {'task': 'Generate a JSON schema for an AWS Lambda function configuration',
  'format': 'json'},
 {'task': 'Write a Python function to extract region from an AWS ARN',
  'format': 'python'},
 {'task': "Create a regex to match AWS EC2 instance IDs (starts with 'i-')",
  'format': 'regex'},
 {'task': 'Create a JSON object representing minimal IAM policy for S3 read access',
  'format': 'json'},
 {'task': 'Write a Python function to convert CloudWatch metric names to valid Python variable names',
  'format': 'python'}]

In [16]:
# Function to grade a test case + output using a model
def grade_by_model(test_case, output):
    """Have the high-tier model review `output` as a solution to `test_case`.

    The prompt asks for a JSON object with "strengths", "weaknesses",
    "reasoning" and "score" fields; the parsed reply is returned as-is,
    without checking that those fields are present.
    """
    task = test_case["task"]
    grading_prompt = f"""
You are an expert AWS code reviewer. Your task is to evaluate the following AI-generated solution.

Original Task:
<task>
{task}
</task>

Solution to Evaluate:
<solution>
{output}
</solution>

Output Format
Provide your evaluation as a structured JSON object with the following fields, in this specific order:
- "strengths": An array of 1-3 key strengths
- "weaknesses": An array of 1-3 key areas for improvement
- "reasoning": A concise explanation of your overall assessment
- "score": A number between 1-10

Respond with JSON. Keep your response concise and direct.
Example response shape:
{{
    "strengths": string[],
    "weaknesses": string[],
    "reasoning": string,
    "score": number
}}
    """

    conversation = []
    add_user_message(conversation, grading_prompt)
    # Prefilled opening fence + closing-fence stop sequence: the reply is
    # just the fenced JSON body.
    add_assistant_message(conversation, "```json")
    raw_verdict = chat(high_model, conversation, stop_sequences=["```"])
    return json.loads(raw_verdict)

In [17]:
# Functions to validate the output structure
import re
import ast


def validate_json(text):
    """Return 10 when the whitespace-trimmed `text` parses as JSON, else 0."""
    candidate = text.strip()
    try:
        json.loads(candidate)
    except json.JSONDecodeError:
        return 0
    return 10


def validate_python(text):
    """Return 10 when `text` is non-empty, syntactically valid Python, else 0.

    Args:
        text: Candidate source code; surrounding whitespace is ignored.

    Returns:
        10 if the trimmed text parses with ast.parse, 0 if it is empty or
        cannot be parsed.
    """
    source = text.strip()
    # An empty answer would parse as an empty module and earn a perfect
    # score; treat it as invalid, matching validate_json's result for "".
    if not source:
        return 0
    try:
        ast.parse(source)
    except (SyntaxError, ValueError):
        # ValueError covers sources containing null bytes, which some Python
        # versions reject with ValueError rather than SyntaxError.
        return 0
    return 10


def validate_regex(text):
    """Return 10 when `text` is a non-empty pattern that compiles, else 0.

    Args:
        text: Candidate regular expression; surrounding whitespace is ignored.

    Returns:
        10 if the trimmed text compiles with re.compile, 0 if it is empty or
        rejected by the regex compiler.
    """
    pattern = text.strip()
    # An empty answer compiles as a valid (match-everything) pattern and would
    # earn a perfect score; treat it as invalid instead.
    if not pattern:
        return 0
    try:
        re.compile(pattern)
    except (re.error, OverflowError):
        # OverflowError is raised for absurdly large repetition counts.
        return 0
    return 10


def grade_syntax(response, test_case):
    """Score the syntax of `response` with the validator for the test case's format.

    Args:
        response: Raw model output to validate.
        test_case: Dict whose "format" entry selects the validator.

    Returns:
        Whatever the selected validator returns (10 for valid, 0 for invalid).
    """
    # Normalize so that e.g. "JSON" or " Python " selects the intended
    # validator instead of silently falling through to the regex check.
    # (Also avoids shadowing the `format` builtin.)
    output_format = str(test_case["format"]).strip().lower()
    if output_format == "json":
        return validate_json(response)
    if output_format == "python":
        return validate_python(response)
    # "regex" and, as before, any unrecognized format.
    return validate_regex(response)

In [18]:
# Passes a test case into Claude
def run_prompt(test_case):
    """Ask the high-tier model to solve the task in `test_case`; return its raw reply."""
    task = test_case["task"]
    solve_prompt = f"""
Please solve the following task:

{task}

* Respond only with Python, JSON, or a plain Regex
* Do not add any comments or commentary or explanation
"""

    conversation = []
    add_user_message(conversation, solve_prompt)
    # The prefilled turn opens a code fence and generation stops at the
    # closing fence, so the reply is only what sits between the two.
    add_assistant_message(conversation, "```code")
    return chat(high_model, conversation, stop_sequences=["```"])

In [19]:
# Function to execute a single test case and grade the output
def run_test_case(test_case):
    """Run one test case through the model and grade the answer.

    The final score is the plain average of the model-assigned score and the
    syntax score. Returns a dict with the raw output, the test case, that
    averaged score and the grader's reasoning.
    """
    output = run_prompt(test_case)

    verdict = grade_by_model(test_case, output)
    model_score, reasoning = verdict["score"], verdict["reasoning"]

    syntax_score = grade_syntax(output, test_case)

    return {
        "output": output,
        "test_case": test_case,
        "score": (model_score + syntax_score) / 2,
        "reasoning": reasoning,
    }

In [20]:
from statistics import mean


def run_eval(dataset):
    """Run every test case in `dataset` and print the average score.

    Args:
        dataset: Iterable of test-case dicts; each one is passed unchanged to
            run_test_case. The dataset is not loaded here - the caller
            supplies it.

    Returns:
        The list of result dicts from run_test_case, in dataset order. An
        empty dataset yields an empty list.
    """
    results = [run_test_case(test_case) for test_case in dataset]

    if results:
        average_score = mean([result["score"] for result in results])
        print(f"Average score: {average_score}")
    else:
        # statistics.mean raises StatisticsError on empty input, so report
        # the empty run explicitly instead of crashing.
        print("Average score: n/a (no test cases)")

    return results

In [21]:
# Reload the saved dataset from disk, then evaluate every test case in it
with open("dataset.json", "r") as f:
    dataset = json.loads(f.read())

results = run_eval(dataset)

Average score: 7.333333333333333


In [22]:
results 

[{'output': '\n^[a-z0-9][a-z0-9.-]{1,61}[a-z0-9]$\n',
  'test_case': {'task': 'Create a regex to validate an AWS S3 bucket name (lowercase, alphanumeric, 3-63 characters)',
   'format': 'regex'},
  'score': 8.0,
  'reasoning': "The regex implements standard AWS S3 bucket naming rules but contradicts the simplified task requirements. The task specifically requested 'alphanumeric' characters only, while the solution includes periods and hyphens from the full AWS specification."},
 {'output': '\n{\n  "$schema": "http://json-schema.org/draft-07/schema#",\n  "type": "object",\n  "title": "AWS Lambda Function Configuration",\n  "properties": {\n    "FunctionName": {\n      "type": "string",\n      "pattern": "^[a-zA-Z0-9-_]{1,64}$"\n    },\n    "Runtime": {\n      "type": "string",\n      "enum": [\n        "nodejs18.x",\n        "nodejs16.x",\n        "nodejs14.x",\n        "python3.11",\n        "python3.10",\n        "python3.9",\n        "python3.8",\n        "java17",\n        "java11",