In [None]:
# Install dependencies:
%pip install anthropic python-dotenv

In [None]:
# Load environment variables from a local .env file
# NOTE(review): presumably this is where the Anthropic API key comes from — confirm
from dotenv import load_dotenv

load_dotenv()

In [None]:
# Create an API client
from anthropic import Anthropic

client = Anthropic()

# Model alias sent with every request (spelled "claude", not "clause")
model = "claude-sonnet-4-0"

In [None]:
def add_user_message(messages, text):
    """Append a user-role message holding ``text`` to ``messages`` in place."""
    messages.append({"role": "user", "content": text})
    

In [None]:
def add_assistant_message(messages, text):
    """Append an assistant-role message holding ``text`` to ``messages`` in place.

    The message uses the same ``{"role": ..., "content": ...}`` shape as
    ``add_user_message`` so both kinds of turn can be sent back to the API
    in a single ``messages`` list.

    Args:
        messages: The conversation list to extend (mutated in place).
        text: The assistant's reply, or a prefill string for the next reply.
    """
    # The key must be "content" (not "text") to match the user-message shape
    assistant_message = {"role": "assistant", "content": text}
    messages.append(assistant_message)
    

In [None]:
# Make a chat request
def chat(messages, system=None, stop_sequences=None):
    """Send the conversation to the model and return the text of its reply.

    Args:
        messages: List of message dicts to send as the conversation.
        system: Optional system prompt; left out of the request when falsy.
        stop_sequences: Optional list of strings passed to the API as
            ``stop_sequences``; left out of the request when falsy.

    Returns:
        The ``text`` attribute of the first content block in the response.
    """
    params = {
        "model": model,
        "max_tokens": 1000,
        "messages": messages,
        # A "temperature" entry (e.g. 0.7) can be added here to make the
        # responses more or less random
    }

    # Include system message if provided
    if system:
        params["system"] = system

    # Include stop sequences if provided
    if stop_sequences:
        params["stop_sequences"] = stop_sequences

    message = client.messages.create(**params)

    return message.content[0].text

In [None]:

# Make a starting list of messages
messages = []

# System prompt that steers the assistant's behaviour
system = "You are a helpful assistant."

# Add a user message
add_user_message(messages, "Hello, who are you?")

# Get a response from the assistant, sending the system prompt along with it
answer = chat(messages, system=system)

# Add the assistant's response to the messages
add_assistant_message(messages, answer)

# Print the assistant's response
print("Assistant:", answer)


LET'S BUILD A CHATBOT

In [None]:
# Make an initial list of messages
messages = []

# Keep the conversation going until the user asks to stop
while True:
    # Get user input; interrupting the kernel or closing stdin ends the chat
    try:
        user_input = input("You: ")
    except (EOFError, KeyboardInterrupt):
        break

    cleaned_input = user_input.strip()

    # Typing "exit" or "quit" leaves the loop so later cells can run
    if cleaned_input.lower() in {"exit", "quit"}:
        break

    # Skip blank input rather than sending an empty message
    if not cleaned_input:
        continue

    # Add the user message to the messages
    add_user_message(messages, user_input)

    # Get a response from the assistant
    answer = chat(messages)

    # Add the assistant's response to the messages
    add_assistant_message(messages, answer)

    # Print the assistant's response
    print("---")
    print("Assistant:", answer)
    print("---")


STRUCTURED DATA EXERCISE

Idea: use stop sequences and assistant-message prefilling to get structured output.

In [None]:
# Start a fresh conversation for this exercise
messages = []

prompt = """
Generate three different sample AWS CLI commands. Each should be very short
"""

add_user_message(messages, prompt)

# Prefill the assistant turn with an opening ```bash fence to guide Claude to
# respond inside a single code block
add_assistant_message(messages, "Here are all three commands in a single block without any comments:\n```bash")

# Pass the closing fence as a stop sequence
# NOTE(review): this call requires chat() to accept a stop_sequences keyword
text = chat(messages, stop_sequences=["```"])
# Last expression of the cell: the reply with surrounding whitespace removed
text.strip()


PROMPT EVALUATION

In [None]:
import json

def generate_dataset():
    """Ask the model for an evaluation dataset and return it as parsed JSON.

    The assistant turn is prefilled with an opening ```json fence and the
    request is made with the closing fence as a stop sequence; the returned
    text is then handed to ``json.loads``.
    """
    prompt = """
Generate a evaluation dataset for a prompt evaluation. The dataset will be used to evaluate prompts
that generate Python, JSON, or Regex specifically for AWS-related tasks. Generate an array of JSON objects,
each representing task that requires Python, JSON, or a Regex to complete.

Example output:
```json
[
    {
        "task": "Description of task",
        "format": "python" or "json" or "regex",
        "solution_criteria": "Key criteria for evaluating the solution"
    },
    ...additional
]
```

* Focus on tasks that can be solved by writing a single Python function, a single JSON object, or a regular expression.
* Focus on tasks that do not require writing much code

Please generate 3 objects.
"""
    # Build the two-turn conversation: the request, then the prefilled fence
    conversation = []
    add_user_message(conversation, prompt)
    add_assistant_message(conversation, "```json")

    raw_json = chat(conversation, stop_sequences=["```"])
    return json.loads(raw_json)

In [None]:
# Ask the model for a fresh dataset
dataset = generate_dataset()

# Write it to dataset.json in the current directory as indented JSON
with open("dataset.json", "w") as dataset_file:
    dataset_file.write(json.dumps(dataset, indent=2))

In [None]:
def run_prompt(test_case):
    """Insert the test case's task into the prompt and return the model's reply."""
    task = test_case['task']
    prompt = f"""
Please solve the following task:

{task}

* Respond only with Python, JSON, or a plain Regex
* Do not include any explanations or additional text
"""

    conversation = []
    add_user_message(conversation, prompt)
    # Prefill a generic fence label so the same prompt works for all formats
    add_assistant_message(conversation, "```code")
    return chat(conversation, stop_sequences=["```"])

In [None]:
def grade_my_model(test_case, output):
    """Have the model grade ``output`` against the test case and return its verdict.

    The prompt embeds the test case's "task" and "solution_criteria" together
    with the solution text, and asks for a JSON object with "strengths",
    "weaknesses", "reasoning" and "score". The reply is parsed with
    ``json.loads`` and returned.
    """
    task = test_case["task"]
    criteria = test_case["solution_criteria"]
    eval_prompt = f"""
You are an expert AWS code reviewer. Your task is to evaluate the following AI-generated solution.

Original Task:
<task>
{task}
</task>

Solution to Evaluate:
<solution>
{output}
</solution>

Criteria you should use to evaluate the solution:
<criteria>
{criteria}
</criteria>

Output Format
Provide your evaluation as a structured JSON object with the following fields, in this specific order:
- "strengths": An array of 1-3 key strengths
- "weaknesses": An array of 1-3 key areas for improvement
- "reasoning": A concise explanation of your overall assessment
- "score": A number between 1-10

Respond with JSON. Keep your response concise and direct.
Example response shape:
{{
    "strengths": string[],
    "weaknesses": string[],
    "reasoning": string,
    "score": number
}}
    """

    # Prefill the opening ```json fence and stop at the closing fence
    conversation = []
    add_user_message(conversation, eval_prompt)
    add_assistant_message(conversation, "```json")
    verdict_json = chat(conversation, stop_sequences=["```"])
    return json.loads(verdict_json)

In [None]:
# Functions to validate the output structure
import re
import ast


def validate_json(text):
    """Return 10 if the stripped ``text`` parses as JSON, otherwise 0."""
    candidate = text.strip()
    try:
        json.loads(candidate)
    except json.JSONDecodeError:
        return 0
    return 10


def validate_python(text):
    """Return 10 if the stripped ``text`` parses as Python source, otherwise 0.

    Args:
        text: Candidate Python source code.

    Returns:
        10 when ``ast.parse`` accepts the source, 0 when it rejects it.
    """
    try:
        ast.parse(text.strip())
        return 10
    except (SyntaxError, ValueError):
        # Some Python versions report source containing null bytes as
        # ValueError rather than SyntaxError; either way the text is not valid
        return 0


def validate_regex(text):
    """Return 10 if the stripped ``text`` compiles as a regular expression, otherwise 0."""
    pattern = text.strip()
    try:
        re.compile(pattern)
    except re.error:
        return 0
    return 10


def grade_syntax(response, test_case):
    """Score the syntax of ``response`` with the validator for the test case's format.

    Args:
        response: The text to validate.
        test_case: Dict whose "format" entry selects the validator.

    Returns:
        The selected validator's result (10 for valid, 0 for invalid).
    """
    # Normalise case and whitespace so "JSON" or " Python " still pick the
    # intended validator; the name avoids shadowing the builtin ``format``
    output_format = str(test_case["format"]).strip().lower()
    if output_format == "json":
        return validate_json(response)
    if output_format == "python":
        return validate_python(response)
    # Every other value, including "regex", is checked as a regular expression
    return validate_regex(response)

In [None]:
def run_test_case(test_case):
    """Run the prompt for one test case and grade the output.

    The final score is the average of the model grader's "score" and the
    syntax validator's score.
    """
    output = run_prompt(test_case)

    # Model-based grade: numeric score plus the grader's explanation
    verdict = grade_my_model(test_case, output)
    model_score, reasoning = verdict["score"], verdict["reasoning"]

    # Code-based grade: does the output parse in the expected format?
    syntax_score = grade_syntax(output, test_case)

    return {
        "output": output,
        "test_case": test_case,
        "score": (model_score + syntax_score) / 2,
        "reasoning": reasoning,
    }

In [None]:
from statistics import mean

def run_eval(dataset):
    """Call run_test_case on every case in ``dataset`` and print the average score.

    Args:
        dataset: Iterable of test-case dicts (already loaded by the caller).

    Returns:
        List of the per-case result dicts, in dataset order. Empty when the
        dataset is empty.
    """
    results = [run_test_case(test_case) for test_case in dataset]

    if not results:
        # mean() raises StatisticsError on an empty sequence, so report the
        # empty dataset explicitly instead of crashing
        print("Average Score: n/a (no test cases)")
        return results

    average_score = mean([result["score"] for result in results])
    print(f"Average Score: {average_score}")

    return results


In [None]:
# Read the saved dataset back from disk
with open("dataset.json", "r") as dataset_file:
    dataset = json.loads(dataset_file.read())

# Run every test case through the prompt and the graders
results = run_eval(dataset)

In [None]:
# Print the full evaluation results as indented JSON
print(json.dumps(results, indent=2))