In [29]:
import boto3
import json
from statistics import mean
import re
import ast

In [2]:
client = boto3.client('bedrock-runtime', region_name='eu-central-1')
model_id = "eu.anthropic.claude-3-haiku-20240307-v1:0"

def add_user_message(messages, text):
    user_message =  {
        "role": "user",
        "content": [
            {"text": text}
        ],
    }   
    messages.append(user_message)

def add_assitant_message(messages, text):
    assitant_message =  {
        "role": "assistant",
        "content": [
            {"text": text}
        ],
    }   
    messages.append(assitant_message)

def chat(messages, system=None, temperature=1.0, stop_sequences=[]):
    params = {
        "modelId": model_id,
        "messages": messages,
        "inferenceConfig": {
            "temperature": temperature,
            "stopSequences": stop_sequences,
        },
    }

    if system:
        params["system"] = [{"text": system}]

    response = client.converse(**params)

    return response["output"]["message"]["content"][0]["text"]

In [38]:
def generate_dataset():
    prompt = """
   Generate a evaluation dataset for a prompt evaluation. The dataset will be used to evaluate prompts
    that generate Python, JSON, or Regex specifically for AWS-related tasks. Generate an array of JSON objects,
    each representing task that requires Python, JSON, or a Regex to complete.

    Example output:
    ```json
    [
        {
            "task": "Description of task",
            "format": "python|json|regex",
            "solution_criteria": "Key criteria for a evalating solution",
        },
        ...additional
    ]
    ```

    * Focus on tasks that can be solved by writing a single Python function, a single JSON object, or a regular expression.
    * Focus on tasks that do not require writing much code

    Please generate 3 objects.
    """

    messages = []
    add_user_message(messages, prompt)
    add_assitant_message(messages, "```json")
    text = chat(messages, temperature=0.0, stop_sequences=["```"])
    return json.loads(text)


In [39]:
dataset = generate_dataset()
with open("dataset.json", "w") as f:
    json.dump(dataset, f, indent=4)

In [None]:
def run_prompt(test_case):
    """Merges the prompt and test case, then runs the result."""
    prompt = f"""
    Please solve the following task:
    {test_case['task']}

    * Respond only with Python, JSON, or Regex
    * Do not include any explanations or other text or comments
    """
    messages = []
    add_user_message(messages, prompt)
    add_assitant_message(messages, "```code")
    response = chat(messages, stop_sequences=["```"])
    return response

In [40]:
def grade_by_model(test_case, output):
    eval_prompt = f"""
You are an expert AWS code reviewer. Your task is to evaluate the following AI-generated solution.

Original Task:
<task>
{test_case["task"]}
</task>

Solution to Evaluate:
<solution>
{output}
</solution>

Criteria you should use to evaluate the solution:
<criteria>
{test_case["solution_criteria"]}
</criteria>

Output Format
Provide your evaluation as a structured JSON object with the following fields, in this specific order:
- "strengths": An array of 1-3 key strengths
- "weaknesses": An array of 1-3 key areas for improvement
- "reasoning": A concise explanation of your overall assessment
- "score": A number between 1-10

Respond with JSON. Keep your response concise and direct.
Example response shape:
{{
    "strengths": string[],
    "weaknesses": string[],
    "reasoning": string,
    "score": number
}}
    """
    messages = []
    add_user_message(messages, eval_prompt)
    add_assitant_message(messages, "```json")
    eval_text = chat(messages, stop_sequences=["```"])
    return json.loads(eval_text)

In [30]:
def validate_json(text):
    try:
        json.loads(text.strip())
        return 10
    except json.JSONDecodeError:
        return 0


def validate_python(text):
    try:
        ast.parse(text.strip())
        return 10
    except SyntaxError:
        return 0


def validate_regex(text):
    try:
        re.compile(text.strip())
        return 10
    except re.error:
        return 0


def grade_syntax(response, test_case):
    format = test_case["format"]
    if format == "json":
        return validate_json(response)
    elif format == "python":
        return validate_python(response)
    else:
        return validate_regex(response)

In [None]:
def run_test_case(test_case):
    """Calls the run prompt, then grades the result."""
    output = run_prompt(test_case)
    model_grade = grade_by_model(test_case, output)
    model_score = model_grade["score"]
    reasoning = model_grade["reasoning"]

    syntax_score = grade_syntax(output, test_case)
    score= (model_score + syntax_score) / 2

    return {
        "test_case": test_case,
        "output": output,
        "score": model_score,
        "reasoning": reasoning,
    }

In [41]:
def run_eval(dataset):
    """Loads the dataset, and calls run_test_case for each entry."""
    results = []
    for test_case in dataset:
        result = run_test_case(test_case)
        results.append(result)

    average_score = mean([result["score"] for result in results])
    print(f"Average Score: {average_score}")
    return results

In [42]:
with open("dataset.json", "r") as f:
    dataset = json.load(f)

results = run_eval(dataset)

Average Score: 8.333333333333334


In [44]:
print(json.dumps(results, indent=4))

[
    {
        "test_case": {
            "task": "Write a Python function to list all the AWS services available in a specific AWS region.",
            "format": "python",
            "solution_criteria": "The function should use the AWS SDK for Python (Boto3) to retrieve the list of available services in the specified region and return it as a list."
        },
        "output": "Here's a Python function that uses the AWS SDK for Python (Boto3) to list all the AWS services available in a specific AWS region:\n\n```python\nimport boto3\n\ndef list_aws_services(region_name):\n    \"\"\"\n    Lists all the AWS services available in the specified region.\n    \n    Args:\n        region_name (str): The AWS region to list the services for.\n        \n    Returns:\n        list: A list of the available AWS services in the specified region.\n    \"\"\"\n    # Create a Boto3 client for the AWS Service Catalog\n    service_catalog = boto3.client('servicecatalog', region_name=region_name)\n 