In [16]:
# Load env variables and create client
from dotenv import load_dotenv
from anthropic import Anthropic

# Must run before the client is created: Anthropic() is constructed with no
# explicit credentials, so it presumably reads its API key from the environment
# populated here (e.g. from a local .env file) -- TODO confirm.
load_dotenv()

# Shared API client used by chat() for every request in this notebook.
client = Anthropic()
# Model used to generate the evaluation dataset.
low_model = "claude-3-5-haiku-latest"
# Model used both to solve the tasks and to grade the solutions.
high_model = "claude-sonnet-4-0"

In [17]:
# Helper functions
def add_user_message(messages, text):
    """Append a user turn carrying `text` to the `messages` list, in place."""
    messages.append({"role": "user", "content": text})


def add_assistant_message(messages, text):
    """Append an assistant turn carrying `text` to the `messages` list, in place."""
    messages.append({"role": "assistant", "content": text})


def chat(model, messages, system=None, temperature=1.0, stop_sequences=None):
    """Send a conversation to the API and return the text of the first content block.

    Args:
        model: Model identifier to send the request to.
        messages: List of {"role": ..., "content": ...} dicts forming the conversation.
        system: Optional system prompt; only sent when truthy.
        temperature: Sampling temperature forwarded to the API.
        stop_sequences: Optional list of strings at which generation should stop.
            Defaults to no stop sequences.

    Returns:
        The `.text` of the first content block in the response.
    """
    params = {
        "model": model,
        "max_tokens": 1000,  # fixed response budget for every call
        "messages": messages,
        "temperature": temperature,
        # None sentinel instead of a shared mutable `[]` default; the request
        # still receives an empty list when the caller passes nothing.
        "stop_sequences": stop_sequences if stop_sequences is not None else [],
    }

    # Omit the key entirely rather than sending system=None.
    if system:
        params["system"] = system

    message = client.messages.create(**params)
    return message.content[0].text

In [18]:
# Function to generate a new dataset
import json


def generate_dataset():
    """Ask the low-cost model for a fresh evaluation dataset and parse it.

    The prompt requests a JSON array of six test cases, each with the keys
    `task`, `format` and `judging_criteria`. The assistant turn is prefilled
    with an opening ```json fence and generation stops at the closing fence.

    Returns:
        The parsed JSON value produced by the model (expected: a list of dicts).

    Raises:
        json.JSONDecodeError: If the model's reply is not valid JSON.
    """
    # The object count is stated as six everywhere in the prompt; mixed
    # "three"/"six" wording gives the model contradictory instructions.
    prompt = """
You are an expert AI assistant tasked with creating an evaluation dataset. Your goal is to generate a dataset for a prompt-engineering pipeline that tests a model's ability to produce code for specific AWS tasks.

The output must be a single, valid JSON array containing exactly **six** JSON objects.

**Example Output Structure:**

Your final output should follow this structure precisely (the example is shortened to three objects). Notice how each object contains a `task`, its required `format`, and the `judging_criteria`.

```json
[
  {
    "task": "A human-like description of a simple AWS task...",
    "format": "python",
    "judging_criteria": "Clear, specific rules for how to grade the Python code..."
  },
  {
    "task": "Another description, this time for a JSON-based AWS task...",
    "format": "json",
    "judging_criteria": "Unambiguous criteria for validating the JSON structure and content..."
  },
  {
      "task": "A request for a regular expression to parse some AWS log data...",
      "format": "regex",
      "judging_criteria": "Specific requirements for what the regex must capture..."
  }
]
```

Now, generate a new and unique array of six objects based on the following detailed instructions for each key:

1.  **`task`**: A string describing a common AWS-related problem. Write this as if you were a developer asking a colleague for help—it should sound natural and can be slightly informal or contain some situational context.
2.  **`format`**: A string specifying the required output format. This value is deterministic. Across the six objects you create, each of the formats `"python"`, `"json"`, and `"regex"` **must** be used exactly twice.
3.  **`judging_criteria`**: A string containing precise, objective rules for an evaluator to check if the generated code is correct. This should be a clear checklist of what to look for in the solution.

**Constraints to Follow:**

  * **Domain:** All tasks must relate to common AWS services (e.g., S3, IAM, CloudWatch, Lambda).
  * **Simplicity:** Tasks must be solvable with a single Python function, a single JSON object, or one regular expression.
  * **Final Output:** Your response should only contain the raw JSON array and nothing else.

Generate the array of six objects now.
"""

    messages = []
    add_user_message(messages, prompt)
    # Prefill the opening fence so the reply starts directly with the JSON array.
    add_assistant_message(messages, "```json")
    text = chat(low_model, messages, stop_sequences=["```"])
    return json.loads(text)

In [34]:
# Generate the dataset and write it to 'final_dataset.json'
# Note: every run calls the model again and overwrites the existing file.
dataset = generate_dataset()
with open("final_dataset.json", "w") as f:
    json.dump(dataset, f, indent=4)

In [27]:
# Function to grade a test case + output using a model
def grade_by_model(test_case, output):
    """Have the grading model score `output` against the test case's criteria.

    Args:
        test_case: Dict providing at least the keys "task" and "judging_criteria".
        output: The generated solution text to evaluate.

    Returns:
        The parsed JSON evaluation from the model; the prompt asks for the keys
        "strengths", "weaknesses", "reasoning" and "score".

    Raises:
        json.JSONDecodeError: If the model's reply is not valid JSON.
    """
    # Literal braces in the example response are doubled because this is an f-string.
    eval_prompt = f"""
You are an expert AWS code reviewer. Your task is to evaluate the following AI-generated solution based on a specific set of criteria.

Original Task:
<task>
{test_case["task"]}
</task>

Judging Criteria:
<criteria>
{test_case["judging_criteria"]}
</criteria>

Solution to Evaluate:
<solution>
{output}
</solution>

Your Instructions:
Carefully compare the provided <solution> against the <criteria>. Your entire evaluation must be based on whether the solution meets the requirements outlined in the criteria.

Output Format:
Provide your evaluation as a structured JSON object with the following fields, in this specific order:
- "strengths": An array of 1-3 key strengths, specifically referencing the judging criteria.
- "weaknesses": An array of 1-3 key weaknesses or deviations from the judging criteria.
- "reasoning": A concise explanation of your overall assessment, summarizing how well the solution adhered to the criteria.
- "score": A number between 1-10, where 10 means the solution perfectly meets all judging criteria.

Respond with only the raw JSON object. Keep your response concise and direct.
Example response shape:
{{
    "strengths": ["Adheres to the specified IAM action `s3:GetObject`.", "Correctly targets the resource ARN."],
    "weaknesses": ["Missing the `s3:ListBucket` action required by the criteria."],
    "reasoning": "The solution is mostly correct but incomplete, as it fails to meet one of the key requirements from the judging criteria.",
    "score": 7
}}
"""

    messages = []
    add_user_message(messages, eval_prompt)
    # Prefill the opening fence and stop at the closing one so the reply is bare JSON.
    add_assistant_message(messages, "```json")
    eval_text = chat(high_model, messages, stop_sequences=["```"])
    return json.loads(eval_text)

In [28]:
# Passes a test case into Claude
def run_prompt(task):
    """
    Ask the model to solve one coding task and return its raw answer.

    The prompt does not state the target format; the model is told to work out
    from the task whether Python, JSON, or a regular expression is required.

    Args:
        task: A string describing the task to be solved.

    Returns:
        The text returned by chat() for this conversation.
    """
    prompt = f"""
You are an expert code generation engine. Your sole purpose is to generate a single, clean code snippet to solve the user's task.
Analyze the task below and respond with only the required code.

**Task:**
{task}

**Your Instructions:**
1.  Read the task and determine if the solution requires Python, JSON, or a regular expression.
2.  Generate only the raw code or text that solves the task.
3.  Your entire response must be only the solution itself. Do not add any comments, explanations, or surrounding markdown like ```.
"""

    # The assistant turn is prefilled with a fixed, generic "```code" fence
    # opener, and generation is stopped at the next "```".
    conversation = [
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": "```code"},
    ]
    return chat(high_model, conversation, stop_sequences=["```"])

In [29]:
# Functions to validate the output structure
import re
import ast


def validate_json(text):
    """Return 10 if `text` (surrounding whitespace ignored) parses as JSON, else 0."""
    candidate = text.strip()
    try:
        json.loads(candidate)
    except json.JSONDecodeError:
        return 0
    return 10


def validate_python(text):
    """Return 10 if `text` (surrounding whitespace ignored) parses as Python, else 0.

    Only syntax is checked; the code is never executed.
    """
    try:
        ast.parse(text.strip())
        return 10
    # ValueError covers source containing null bytes, which some Python versions
    # report as ValueError rather than SyntaxError; both mean "not valid Python".
    except (SyntaxError, ValueError):
        return 0


def validate_regex(text):
    """Return 10 if `text` (surrounding whitespace ignored) compiles as a regex, else 0."""
    pattern = text.strip()
    try:
        re.compile(pattern)
    except re.error:
        return 0
    return 10


def grade_syntax(response, test_case):
    """Score `response` for syntactic validity in the test case's declared format.

    Dispatches on test_case["format"]: "json" and "python" use their dedicated
    validators; any other value is checked as a regular expression.
    """
    expected_format = test_case["format"]
    if expected_format == "python":
        return validate_python(response)
    if expected_format == "json":
        return validate_json(response)
    return validate_regex(response)

In [30]:
# Function to execute a single test case and grade the output
def run_test_case(test_case):
    """Run one test case through the model and grade the result.

    Args:
        test_case: Dict with the keys "task", "format" and "judging_criteria".

    Returns:
        Dict with "output" (the generated solution), "test_case" (the input),
        "score" (average of the model-assigned score and the syntax score) and
        "reasoning" (the grader's explanation).
    """
    # Pass only the task text. Handing over the whole dict would interpolate
    # its repr into the prompt and reveal the expected format and the judging
    # criteria to the model being evaluated.
    output = run_prompt(test_case["task"])

    model_grade = grade_by_model(test_case, output)
    model_score = model_grade["score"]
    reasoning = model_grade["reasoning"]

    syntax_score = grade_syntax(output, test_case)

    # Equal weighting of the model's judgement and the syntax check.
    score = (model_score + syntax_score) / 2

    return {
        "output": output,
        "test_case": test_case,
        "score": score,
        "reasoning": reasoning,
    }

In [31]:
from statistics import mean


def run_eval(dataset):
    """Run every test case in `dataset`, print the average score, return the results.

    Args:
        dataset: Iterable of test-case dicts accepted by run_test_case.

    Returns:
        List of result dicts, one per test case, in dataset order.
    """
    results = [run_test_case(test_case) for test_case in dataset]

    # statistics.mean raises StatisticsError on empty input; report it instead.
    if not results:
        print("Average score: n/a (no test cases)")
        return results

    average_score = mean([result["score"] for result in results])
    print(f"Average score: {average_score}")

    return results

In [35]:
# Reload the saved dataset from 'final_dataset.json' so the evaluation runs on
# the persisted test cases rather than on whatever `dataset` holds in memory.
with open("final_dataset.json", "r") as f:
    dataset = json.load(f)

# Run every test case; run_eval prints the average score and returns the
# per-case result dicts.
results = run_eval(dataset)

Average score: 8.666666666666666


In [25]:
# Display the per-test-case results as this cell's output.
results 

[{'output': '\nimport time\nimport random\nimport string\n\ndef generate_unique_s3_bucket_name(prefix="bucket"):\n    # Ensure prefix is lowercase and valid\n    prefix = prefix.lower().strip(\'-\')\n    if not prefix:\n        prefix = "bucket"\n    \n    # Generate timestamp\n    timestamp = str(int(time.time()))\n    \n    # Generate random suffix for extra uniqueness\n    random_suffix = \'\'.join(random.choices(string.ascii_lowercase + string.digits, k=6))\n    \n    # Combine parts with hyphens\n    bucket_name = f"{prefix}-{timestamp}-{random_suffix}"\n    \n    # Ensure it meets length requirements (3-63 characters)\n    if len(bucket_name) > 63:\n        # Truncate prefix if necessary\n        max_prefix_length = 63 - len(timestamp) - len(random_suffix) - 2\n        prefix = prefix[:max_prefix_length]\n        bucket_name = f"{prefix}-{timestamp}-{random_suffix}"\n    \n    return bucket_name\n',
  'test_case': {'task': 'I need a way to generate a unique S3 bucket name that fo

In [12]:
# Persist the evaluation results to 'final_results.json' (overwrites any existing file).
with open("final_results.json", "w") as f:
    json.dump(results, f, indent=4)