In [11]:
import asyncio
import os
from anthropic import Anthropic
from anthropic.types.beta import BetaMessageParam

async def run_computer_control():
    # Initialize Anthropic client
    api_key = os.getenv("ANTHROPIC_API_KEY")
    if not api_key:
        raise ValueError("API key not found. Please set the ANTHROPIC_API_KEY environment variable.")
    
    client = Anthropic(api_key=api_key)
    
    # Define initial message
    messages: list[BetaMessageParam] = [
        {
            "role": "user",
            "content": "Save a picture of a cat to my desktop."
        }
    ]
    
    # Create message with computer use beta enabled
    response = await client.beta.messages.create(
        model="claude-3-5-sonnet-20241022",
        messages=messages,
        max_tokens=4096,
        system="You have access to computer control capabilities.",
        betas=["computer-use-2024-10-22"]
    )
    
    # Process the response
    print("Response content:", response.content)
    
    # Handle any tool calls in the response
    if hasattr(response, 'tool_calls'):
        for tool_call in response.tool_calls:
            # Process tool calls here
            print(f"Tool call: {tool_call}")
            
    return response

# Run the async function
async def main():
    try:
        result = await run_computer_control()
        print("Execution completed successfully")
    except Exception as e:
        print(f"An error occurred: {str(e)}")

# Execute the main function

asyncio.run(main())


RuntimeError: asyncio.run() cannot be called from a running event loop

In [12]:
"""
Agentic sampling loop that calls the Anthropic API and local implementation of anthropic-defined computer use tools.
"""

import platform
import os
import asyncio
import base64
import shlex
import shutil
from collections.abc import Callable
from datetime import datetime
from enum import StrEnum
from pathlib import Path
from typing import Any, Callable, Literal, List, Optional, TypedDict, cast
from uuid import uuid4

import pyautogui
from anthropic import Anthropic, AnthropicBedrock, AnthropicVertex, APIResponse
from anthropic.types import ToolResultBlockParam
from anthropic.types.beta import (
    BetaContentBlock,
    BetaContentBlockParam,
    BetaImageBlockParam,
    BetaMessage,
    BetaMessageParam,
    BetaTextBlockParam,
    BetaToolResultBlockParam,
)

from tools import BashTool, ComputerTool, EditTool, ToolCollection, ToolResult
# from base import BaseAnthropicTool, ToolError
# from run import run

# Constants
BETA_FLAG = "computer-use-2024-10-22"
OUTPUT_DIR = "/tmp/outputs"
TYPING_DELAY_MS = 12
TYPING_GROUP_SIZE = 50

# Scaling Targets
class Resolution(TypedDict):
    width: int
    height: int

MAX_SCALING_TARGETS: dict[str, Resolution] = {
    "XGA": Resolution(width=1024, height=768),       # 4:3
    "WXGA": Resolution(width=1280, height=800),      # 16:10
    "FWXGA": Resolution(width=1366, height=768),     # ~16:9
}

# API Providers
class APIProvider(StrEnum):
    ANTHROPIC = "anthropic"
    BEDROCK = "bedrock"
    VERTEX = "vertex"

PROVIDER_TO_DEFAULT_MODEL_NAME: dict[APIProvider, str] = {
    APIProvider.ANTHROPIC: "claude-3-5-sonnet-20241022",
    APIProvider.BEDROCK: "anthropic.claude-3-5-sonnet-20241022-v2:0",
    APIProvider.VERTEX: "claude-3-5-sonnet-v2@20241022",
}

# System Prompt
SYSTEM_PROMPT = f"""<SYSTEM_CAPABILITY>
* You are utilizing a Windows virtual machine using {platform.machine()} architecture with internet access.
* You can install Windows applications using PowerShell. Use Invoke-WebRequest for downloading files.
* To open Chrome, please just click on the Chrome icon.
* Using PowerShell, you can start GUI applications. Windows GUI apps run with PowerShell will appear within your desktop environment, but they may take some time to appear. Take a screenshot to confirm it did.
* When using PowerShell with commands that are expected to output very large quantities of text, redirect into a tmp file and use str_replace_editor or `Select-String -Pattern "<query>" -Path <filename> -Context 0,5` to confirm output.
* When viewing a page, it can be helpful to zoom out so that you can see everything on the page. Alternatively, make sure you scroll down to see everything before deciding something isn't available.
* When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request.
* The current date is {datetime.today()}.
</SYSTEM_CAPABILITY>

<IMPORTANT>
* When using Chrome, if a startup wizard appears, IGNORE IT. Do not click on any setup options. Instead, click on the address bar where it says "Search Google or type a URL", and enter the appropriate search term or URL there.
* If the item you are looking at is a pdf, if after taking a single screenshot of the pdf it seems that you want to read the entire document instead of trying to continue to read the pdf from your screenshots + navigation, determine the URL, use Invoke-WebRequest to download the pdf, install and use a PDF to text converter (like pdftotext from Xpdf Tools for Windows) to convert it to a text file, and then read that text file directly with your StrReplaceEditTool.
</IMPORTANT>"""

# Helper Functions
def chunks(s: str, chunk_size: int) -> List[str]:
    return [s[i : i + chunk_size] for i in range(0, len(s), chunk_size)]

def _maybe_filter_to_n_most_recent_images(
    messages: List[BetaMessageParam],
    images_to_keep: int,
    min_removal_threshold: int = 10,
):
    """
    Filters out older images from the messages to keep memory usage in check.
    """
    if images_to_keep is None:
        return

    tool_result_blocks = cast(
        List[ToolResultBlockParam],
        [
            item
            for message in messages
            for item in (
                message["content"] if isinstance(message["content"], list) else []
            )
            if isinstance(item, dict) and item.get("type") == "tool_result"
        ],
    )

    total_images = sum(
        1
        for tool_result in tool_result_blocks
        for content in tool_result.get("content", [])
        if isinstance(content, dict) and content.get("type") == "image"
    )

    images_to_remove = total_images - images_to_keep
    if images_to_remove <= 0:
        return

    # Remove images in chunks
    images_to_remove -= images_to_remove % min_removal_threshold

    for tool_result in tool_result_blocks:
        if isinstance(tool_result.get("content"), list):
            new_content = []
            for content in tool_result.get("content", []):
                if isinstance(content, dict) and content.get("type") == "image":
                    if images_to_remove > 0:
                        images_to_remove -= 1
                        continue
                new_content.append(content)
            tool_result["content"] = new_content
            if images_to_remove <= 0:
                break

def _make_api_tool_result(
    result: ToolResult, tool_use_id: str
) -> BetaToolResultBlockParam:
    """Convert an agent ToolResult to an API ToolResultBlockParam."""
    tool_result_content: List[
        BetaTextBlockParam | BetaImageBlockParam
    ] = []
    is_error = False
    if result.error:
        is_error = True
        tool_result_content = _maybe_prepend_system_tool_result(result, result.error)
    else:
        if result.output:
            tool_result_content.append(
                {
                    "type": "text",
                    "text": _maybe_prepend_system_tool_result(result, result.output),
                }
            )
        if result.base64_image:
            tool_result_content.append(
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/png",
                        "data": result.base64_image,
                    },
                }
            )
    return {
        "type": "tool_result",
        "content": tool_result_content,
        "tool_use_id": tool_use_id,
        "is_error": is_error,
    }

def _maybe_prepend_system_tool_result(result: ToolResult, result_text: str) -> str:
    if result.system:
        result_text = f"<system>{result.system}</system>\n{result_text}"
    return result_text

# Sampling Loop
async def sampling_loop(
    *,
    model: str,
    provider: APIProvider,
    system_prompt_suffix: str,
    messages: List[BetaMessageParam],
    output_callback: Callable[[BetaContentBlock], None],
    tool_output_callback: Callable[[ToolResult, str], None],
    api_response_callback: Callable[[APIResponse[BetaMessage]], None],
    api_key: str,
    only_n_most_recent_images: Optional[int] = None,
    max_tokens: int = 4096,
) -> List[BetaMessageParam]:
    """
    Agentic sampling loop for the assistant/tool interaction of computer use.
    """
    tool_collection = ToolCollection(
        ComputerTool(),
        BashTool(),
        EditTool(),
    )
    system = (
        f"{SYSTEM_PROMPT}{' ' + system_prompt_suffix if system_prompt_suffix else ''}"
    )

    while True:
        if only_n_most_recent_images:
            _maybe_filter_to_n_most_recent_images(messages, only_n_most_recent_images)

        if provider == APIProvider.ANTHROPIC:
            client = Anthropic(api_key=api_key)
        elif provider == APIProvider.VERTEX:
            client = AnthropicVertex()
        elif provider == APIProvider.BEDROCK:
            client = AnthropicBedrock()
        else:
            raise ValueError(f"Unsupported API provider: {provider}")

        # Call the API
        raw_response = client.beta.messages.with_raw_response.create(
            max_tokens=max_tokens,
            messages=messages,
            model=model,
            system=system,
            tools=tool_collection.to_params(),
            betas=["computer-use-2024-10-22"],
        )

        api_response_callback(cast(APIResponse[BetaMessage], raw_response))

        response = raw_response.parse()

        messages.append(
            {
                "role": "assistant",
                "content": cast(List[BetaContentBlockParam], response.content),
            }
        )

        tool_result_content: List[BetaToolResultBlockParam] = []
        for content_block in cast(List[BetaContentBlock], response.content):
            output_callback(content_block)
            if content_block.type == "tool_use":
                result = await tool_collection.run(
                    name=content_block.name,
                    tool_input=cast(dict[str, Any], content_block.input),
                )
                tool_result_content.append(
                    _make_api_tool_result(result, content_block.id)
                )
                tool_output_callback(result, content_block.id)

        if not tool_result_content:
            return messages

        # **Change role from "user" to "assistant" when appending tool results**
        messages.append({"content": tool_result_content, "role": "assistant"})

# Main Function
async def run_sampling_loop() -> List[BetaMessageParam]:
    # Get API key from the environment variables
    api_key = os.getenv("ANTHROPIC_API_KEY")

    # Ensure the API key is available
    if not api_key:
        raise ValueError("API key not found. Please set the ANTHROPIC_API_KEY environment variable.")

    # Run the sampling loop
    messages = await sampling_loop(
        model="claude-3-5-sonnet-20241022",
        provider=APIProvider.ANTHROPIC,
        system_prompt_suffix="",
        messages=[{"role": "user", "content": "Save a picture of a cat to my desktop."}],
        output_callback=lambda x: print(x),
        tool_output_callback=lambda x, y: print(f"Tool Output: {x}, ID: {y}"),
        api_response_callback=lambda x: print(f"API Response: {x}"),
        api_key=api_key,
    )
    return messages

# Execute the Main Function and Print Messages
def main():
    messages = asyncio.run(run_sampling_loop())
    print("Final Messages:")
    for msg in messages:
        print(msg)

if __name__ == "__main__":
    main()


RuntimeError: asyncio.run() cannot be called from a running event loop

In [13]:
import anthropic
import os
import asyncio  # Import asyncio for running the event loop
api_key = os.getenv(key="ANTHROPIC_API_KEY")

# Define the main function to run the sampling loop
async def run_sampling_loop():
    client = anthropic.Anthropic()
    # Get API key from the environment variables
    api_key = os.getenv(key="ANTHROPIC_API_KEY")

# Ensure the API key is available
if not api_key:
    raise ValueError("API key not found. Please set the ANTHROPIC_API_KEY environment variable.")

# Run the sampling loop
messages = await sampling_loop(
    model="claude-3-5-sonnet-20241022",
    provider=APIProvider.ANTHROPIC,
    system_prompt_suffix="",
    messages=[{"role": "user", "content": "Save a picture of a cat to my desktop."}],
    output_callback=lambda x: print(x),
    tool_output_callback=lambda x, y: print(x, y),
    api_response_callback=lambda x: print(x),
    api_key=api_key,
)

# Run the main function using asyncio
asyncio.run(run_sampling_loop())

print(messages)


<APIResponse [200 OK] type=<class 'anthropic.types.beta.beta_message.BetaMessage'>>
BetaTextBlock(text="I'll help you save a picture of a cat to the desktop. I'll use Chrome to find a cat image and then save it.\n\nFirst, let me take a screenshot to see the current desktop and locate Chrome:", type='text')
BetaToolUseBlock(id='toolu_01CubyWSAY9c3781ULw2HV8e', input={'action': 'screenshot'}, name='computer', type='tool_use')
ToolResult(output='Screenshot taken', error=None, base64_image='iVBORw0KGgoAAAANSUhEUgAABVYAAAMACAIAAABAXKuVAAEAAElEQVR4nOzdd1xTV/8H8A8zYYNsicywBBRRRHBVKg+iqLgqDpQObWvrqD9rXa21dVTtYx2ttrVDRStuVJTyoLhBQBFFZW9QCHuPQPj9cSGGMGTj+L5fvl4mN+ee870nNyHn3HPOlVBS0wYhhBBCCCGEEELedJJ9HQAhhBBCCCGEEEJ6A3UBEEIIIYQQQgghbwXqAiCEEEIIIYQQQt4K1AVACCGEEEIIIYS8FaT7OgBCCOl+CoqK8vLyLBZbSkqqr2MhhBBCelBdXV11dVVFRUV5WVlfx0IIeQ1I0B0BCCFvEmlpaXV1DUlJyYqKckFdXT3q+zoiQgghpAdJQEJSSkpeQUFQJ8jPz6utre3riAghrzSaCEAIeaNoaGoC9aWlxXV1tdT+J4QQ8sarR31dXW1pSXE96jU0Nfs6HELIq466AAghbw5FRUVJScnychoJSQgh5K1TUV4m

BadRequestError: Error code: 400 - {'type': 'error', 'error': {'type': 'invalid_request_error', 'message': 'messages.2: `tool_result` blocks can only be in `user` messages'}}

In [4]:
import asyncio
import os
from anthropic import Anthropic
from anthropic.types.beta import BetaMessageParam

async def run_computer_control():
    # Initialize Anthropic client
    api_key = os.getenv("ANTHROPIC_API_KEY")
    if not api_key:
        raise ValueError("API key not found. Please set the ANTHROPIC_API_KEY environment variable.")
    
    client = Anthropic(api_key=api_key)
    
    # Define initial message
    messages: list[BetaMessageParam] = [
        {
            "role": "user",
            "content": "Save a picture of a cat to my desktop."
        }
    ]
    
    # Create message with computer use beta enabled
    response = await client.beta.messages.create(
        model="claude-3-5-sonnet-20241022",
        messages=messages,
        max_tokens=4096,
        system="You have access to computer control capabilities.",
        betas=["computer-use-2024-10-22"]
    )
    
    # Process the response
    print("Response content:", response.content)
    
    # Handle any tool calls in the response
    if hasattr(response, 'tool_calls'):
        for tool_call in response.tool_calls:
            # Process tool calls here
            print(f"Tool call: {tool_call}")
            
    return response

# Run the async function
async def main():
    try:
        result = await run_computer_control()
        print("Execution completed successfully")
    except Exception as e:
        print(f"An error occurred: {str(e)}")

# Execute the main function
if __name__ == "__main__":
    asyncio.run(main())


RuntimeError: asyncio.run() cannot be called from a running event loop

In [9]:
import anthropic
import os
client = anthropic.Anthropic()
# make a request to the API using the loop
# get api key from the environment variables
api_key = os.getenv("ANTHROPIC_API_KEY")
messages = sampling_loop(
    model="claude-3-5-sonnet-20241022",
    provider=APIProvider.ANTHROPIC,
    system_prompt_suffix="",
    messages=[{"role": "user", "content": "Save a picture of a cat to my desktop."}],
    output_callback=lambda x: print(x),
    tool_output_callback=lambda x, y: print(x, y),
    api_response_callback=lambda x: print(x),
    api_key=api_key,
)


In [10]:
print(messages)



<coroutine object sampling_loop at 0x0000028664888040>


In [None]:

# response = client.beta.messages.create(
#     model="claude-3-5-sonnet-20241022",
#     max_tokens=1024,
#     tools=[
#         {
#           "type": "computer_20241022",
#           "name": "computer",
#           "display_width_px": 1024,
#           "display_height_px": 768,
#           "display_number": 1,
#         },
#         {
#           "type": "text_editor_20241022",
#           "name": "str_replace_editor"
#         },
#         {
#           "type": "bash_20241022",
#           "name": "bash"
#         }
#     ],
#     messages=[{"role": "user", "content": "Save a picture of a cat to my desktop."}],
#     betas=["computer-use-2024-10-22"],
# )
# print(response)


In [None]:
#show the output of the loop
print(messages)


In [None]:
import asyncio

# Assuming sampling_loop is an async function
async def main():
    messages = await sampling_loop()
    print(messages)

# Run the main function
asyncio.run(main())


In [None]:
import asyncio

# Assuming sampling_loop is an async function
async def main():
    messages = await sampling_loop()
    print(messages)

# Get the current event loop
loop = asyncio.get_event_loop()

# Run the main function
loop.run_until_complete(main())
