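"""chat.py: assemble the chat context for one agent cycle within the token budget and query the LLM."""
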
from __future__ import annotations

import time
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from autogpt.agent.agent import Agent

from autogpt.config import Config
from autogpt.llm.api_manager import ApiManager
from autogpt.llm.base import ChatSequence, Message
from autogpt.llm.utils import count_message_tokens, create_chat_completion
from autogpt.log_cycle.log_cycle import CURRENT_CONTEXT_FILE_NAME
from autogpt.logs import logger


# TODO: Change debug from hardcode to argument
def chat_with_ai(
    config: Config,
    agent: Agent,
    system_prompt: str,
    user_input: str,
    token_limit: int,
    model: str | None = None,
):
    """
    Interact with the OpenAI API, sending the prompt, user input,
    message history, and permanent memory.

    Args:
        config (Config): The config to use.
        agent (Agent): The agent to use.
        system_prompt (str): The prompt explaining the rules to the AI.
        user_input (str): The input from the user.
        token_limit (int): The maximum number of tokens allowed in the API call.
        model (str, optional): The model to use. If None, config.fast_llm_model
            is used. Defaults to None.

    Returns:
        str: The AI's response.
    """
    if model is None:
        model = config.fast_llm_model

    # Reserve 1000 tokens for the response
    logger.debug(f"Token limit: {token_limit}")
    send_token_limit = token_limit - 1000
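    # History, summary, and plugin messages below are fitted within
    # send_token_limit; the reserved tokens come back to the model as
    # response headroom via tokens_remaining at the end.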

    # if len(agent.history) == 0:
    #     relevant_memory = ""
    # else:
    #     recent_history = agent.history[-5:]
    #     shuffle(recent_history)
    #     relevant_memories = agent.memory.get_relevant(
    #         str(recent_history), 5
    #     )
    #     if relevant_memories:
    #         shuffle(relevant_memories)
    #     relevant_memory = str(relevant_memories)

    # logger.debug(f"Memory Stats: {agent.memory.get_stats()}")
    relevant_memory = []

    message_sequence = ChatSequence.for_model(
        model,
        [
            Message("system", system_prompt),
            Message("system", f"The current time and date is {time.strftime('%c')}"),
            # Message(
            #     "system",
            #     f"This reminds you of these events from your past:\n{relevant_memory}\n\n",
            # ),
        ],
    )

    # Add messages from the full message history until we reach the token limit
    next_message_to_add_index = len(agent.history) - 1
    insertion_index = len(message_sequence)
    # Count the currently used tokens
    current_tokens_used = message_sequence.token_length

    # while current_tokens_used > 2500:
    #     # remove memories until we are under 2500 tokens
    #     relevant_memory = relevant_memory[:-1]
    #     (
    #         next_message_to_add_index,
    #         current_tokens_used,
    #         insertion_index,
    #         current_context,
    #     ) = generate_context(
    #         prompt, relevant_memory, agent.history, model
    #     )

    # Account for user input (appended later)
    user_input_msg = Message("user", user_input)
    current_tokens_used += count_message_tokens([user_input_msg], model)

    current_tokens_used += 500  # Reserve space for new_summary_message
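    # (this reservation is provisional: the summary's real token count replaces
    # the flat 500 once the summary message is built below)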

    # Add Messages until the token limit is reached or there are no more messages to add.
    for cycle in reversed(list(agent.history.per_cycle())):
        messages_to_add = [msg for msg in cycle if msg is not None]
        tokens_to_add = count_message_tokens(messages_to_add, model)
        if current_tokens_used + tokens_to_add > send_token_limit:
            break

        # Add the most recent message to the start of the chain,
        # after the system prompts.
        message_sequence.insert(insertion_index, *messages_to_add)
        current_tokens_used += tokens_to_add
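
    # Note: cycles are visited newest-first but always inserted at the same
    # insertion_index, so each older cycle lands above the newer ones already
    # added and the surviving history reads chronologically after the prompts.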

    # Update & add summary of trimmed messages
    if len(agent.history) > 0:
        new_summary_message, trimmed_messages = agent.history.trim_messages(
            current_message_chain=list(message_sequence),
        )
        tokens_to_add = count_message_tokens([new_summary_message], model)
        message_sequence.insert(insertion_index, new_summary_message)
        current_tokens_used += tokens_to_add - 500
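        # (subtracting 500 releases the provisional reservation made earlier,
        # leaving only the summary's actual size counted)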

        # FIXME: uncomment when memory is back in use
        # memory_store = get_memory(cfg)
        # for _, ai_msg, result_msg in agent.history.per_cycle(trimmed_messages):
        #     memory_to_add = MemoryItem.from_ai_action(ai_msg, result_msg)
        #     logger.debug(f"Storing the following memory:\n{memory_to_add.dump()}")
        #     memory_store.add(memory_to_add)

    api_manager = ApiManager()
    # inform the AI about its remaining budget (if it has one)
    if api_manager.get_total_budget() > 0.0:
        remaining_budget = api_manager.get_total_budget() - api_manager.get_total_cost()
        if remaining_budget < 0:
            remaining_budget = 0

        budget_message = f"Your remaining API budget is ${remaining_budget:.3f}" + (
            " BUDGET EXCEEDED! SHUT DOWN!\n\n"
            if remaining_budget == 0
            else " Budget very nearly exceeded! Shut down gracefully!\n\n"
            if remaining_budget < 0.005
            else " Budget nearly exceeded. Finish up.\n\n"
            if remaining_budget < 0.01
            else "\n\n"
        )

        logger.debug(budget_message)
        message_sequence.add("system", budget_message)
        current_tokens_used += count_message_tokens([message_sequence[-1]], model)

    # Append user input; its token count was accounted for above
    message_sequence.append(user_input_msg)

    plugin_count = len(config.plugins)
    for i, plugin in enumerate(config.plugins):
        if not plugin.can_handle_on_planning():
            continue
        plugin_response = plugin.on_planning(
            agent.config.prompt_generator, message_sequence.raw()
        )
        if not plugin_response:  # covers both None and ""
            continue
        tokens_to_add = count_message_tokens(
            [Message("system", plugin_response)], model
        )
        if current_tokens_used + tokens_to_add > send_token_limit:
            logger.debug(f"Plugin response too long, skipping: {plugin_response}")
            logger.debug(f"Plugins remaining at stop: {plugin_count - i}")
            break
        message_sequence.add("system", plugin_response)
        current_tokens_used += tokens_to_add  # count the accepted plugin message

    # Calculate remaining tokens
    tokens_remaining = token_limit - current_tokens_used
    # assert tokens_remaining >= 0, (
    #     "Tokens remaining is negative. This should never happen, please submit "
    #     "a bug report at https://www.github.com/Torantulino/Auto-GPT"
    # )

    # Debug print the current context
    logger.debug(f"Token limit: {token_limit}")
    logger.debug(f"Send Token Count: {current_tokens_used}")
    logger.debug(f"Tokens remaining for response: {tokens_remaining}")
    logger.debug("------------ CONTEXT SENT TO AI ---------------")
    for message in message_sequence:
        # Skip printing the prompt
        if message.role == "system" and message.content == system_prompt:
            continue
        logger.debug(f"{message.role.capitalize()}: {message.content}")
        logger.debug("")
    logger.debug("----------- END OF CONTEXT ----------------")

    agent.log_cycle_handler.log_cycle(
        agent.config.ai_name,
        agent.created_at,
        agent.cycle_count,
        message_sequence.raw(),
        CURRENT_CONTEXT_FILE_NAME,
    )

    # TODO: use a model defined elsewhere, so that model can contain
    # temperature and other settings we care about
    assistant_reply = create_chat_completion(
        prompt=message_sequence,
        max_tokens=tokens_remaining,
    )
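    # max_tokens=tokens_remaining gives the completion whatever is left of
    # token_limit after the context was assembled.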

    # Update full message history
    agent.history.append(user_input_msg)
    agent.history.add("assistant", assistant_reply, "ai_response")

    return assistant_reply
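

# A minimal usage sketch (hypothetical wiring; the config and agent objects come
# from Auto-GPT's startup code, and the exact values shown are assumptions):
#
#     reply = chat_with_ai(
#         config=config,
#         agent=agent,
#         system_prompt=agent.system_prompt,
#         user_input="Determine exactly one command to use, and respond "
#                    "using the format specified above:",
#         token_limit=config.fast_token_limit,  # e.g. 4000 for a 4k-context model
#     )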