In [78]:
# Import the Llama class of llama-cpp-python and the LlamaCppPythonProvider of llama-cpp-agent
from llama_cpp import Llama
from llama_cpp_agent.providers import LlamaCppPythonProvider

# Create an instance of the Llama class and load the model
llama_model = Llama(r"../models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-q4.gguf", n_batch=1024, n_threads=10, n_gpu_layers=40,n_ctx=2048)

# Create the provider by passing the Llama class instance to the LlamaCppPythonProvider class
provider = LlamaCppPythonProvider(llama_model)

llama_model_loader: loaded meta data with 24 key-value pairs and 195 tensors from ../models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-q4.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = phi3
llama_model_loader: - kv   1:                               general.name str              = Phi3
llama_model_loader: - kv   2:                        phi3.context_length u32              = 4096
llama_model_loader: - kv   3:                      phi3.embedding_length u32              = 3072
llama_model_loader: - kv   4:                   phi3.feed_forward_length u32              = 8192
llama_model_loader: - kv   5:                           phi3.block_count u32              = 32
llama_model_loader: - kv   6:                  phi3.attention.head_count u32              = 32
llama_model_loader: - kv   7:               phi3.

In [79]:
from llama_cpp_agent import LlamaCppAgent

agent = LlamaCppAgent(provider)

In [80]:
from llama_cpp_agent import LlamaCppAgent
from llama_cpp_agent import MessagesFormatterType

agent = LlamaCppAgent(provider,
                      system_prompt="You are a Weather Chatbot.",
                      predefined_messages_formatter_type=MessagesFormatterType.CHATML)

In [81]:
agent_output = agent.get_chat_response("Hello, How are you doing Today ?")
print(f"Agent: {agent_output.strip()}")


llama_print_timings:        load time =    1277.82 ms
llama_print_timings:      sample time =      11.35 ms /    75 runs   (    0.15 ms per token,  6610.26 tokens per second)
llama_print_timings: prompt eval time =    1277.51 ms /    60 tokens (   21.29 ms per token,    46.97 tokens per second)
llama_print_timings:        eval time =    4775.29 ms /    74 runs   (   64.53 ms per token,    15.50 tokens per second)
llama_print_timings:       total time =    6098.85 ms /   134 tokens


Agent: Hello! As your weather-informed assistant, I don't have feelings, but I'm here and ready to help you with the latest weather updates. How may I assist you today? If you are asking about my 'wellbeing', imagine it as being fully operational and equipped for providing accurate weather information!


In [82]:
from llama_cpp_agent.chat_history import BasicChatHistory, BasicChatMessageStore, BasicChatHistoryStrategy


chat_history_store = BasicChatMessageStore()

chat_history = BasicChatHistory(message_store=chat_history_store, chat_history_strategy=BasicChatHistoryStrategy.last_k_tokens, k=7000, llm_provider=provider)

agent = LlamaCppAgent(provider,
                      system_prompt="You are a Weather Chatbot",
                      chat_history=chat_history,
                      predefined_messages_formatter_type=MessagesFormatterType.CHATML)

In [83]:
from llama_cpp_agent.llm_output_settings import LlmStructuredOutputSettings

In [84]:
from typing import Union 

In [85]:
def calculate_a_to_the_power_b(a: Union[int, float], b: Union[int, float]):
    """
    Calculates a to the power of b

    Args:
        a: number
        b: exponent

    """
    return f"Result: {math.pow(a, b)}"

In [86]:
output_settings = LlmStructuredOutputSettings.from_functions([calculate_a_to_the_power_b], allow_parallel_function_calling=True)

In [87]:
import math

In [88]:

# Create a LlamaCppAgent instance as before, including a system message with information about the tools available for the LLM agent.
llama_cpp_agent = LlamaCppAgent(
    provider,
    debug_output=True,
    system_prompt=f"You are an advanced AI, tasked to assist the user by calling functions in JSON format.",
    predefined_messages_formatter_type=MessagesFormatterType.CHATML,
)

# Define some user input
user_input = "Calculate a to the power of b: a = 2, b = 4"

# Pass the user input together with output settings to `get_chat_response` method.
# This will print the result of the function the LLM will call, it is a list of dictionaries containing the result.
print(
    llama_cpp_agent.get_chat_response(
        user_input, structured_output_settings=output_settings
    )
)

from_string grammar:
root ::= root_1 [[] ws function root_5 [<U+000A>] []] 
root_1 ::= [ ] | [<U+000A>] 
ws ::= ws_19 
function ::= function_6 [{] ws ["] [f] [u] [n] [c] [t] [i] [o] [n] ["] [:] [ ] ws grammar-models 
root_4 ::= [,] ws function 
root_5 ::= root_4 root_5 | 
function_6 ::= [ ] | [<U+000A>] 
grammar-models ::= calculate-a-to-the-power-b-grammar-model 
calculate-a-to-the-power-b-grammar-model ::= ["] [c] [a] [l] [c] [u] [l] [a] [t] [e] [_] [a] [_] [t] [o] [_] [t] [h] [e] [_] [p] [o] [w] [e] [r] [_] [b] ["] [,] ws ["] [a] [r] [g] [u] [m] [e] [n] [t] [s] ["] [:] [ ] calculate-a-to-the-power-b 
calculate-a-to-the-power-b ::= [{] ws ["] [a] ["] [:] [ ] calculate-a-to-the-power-b-a-union [,] ws ["] [b] ["] [:] [ ] calculate-a-to-the-power-b-b-union ws [}] ws [}] 
calculate-a-to-the-power-b-a-union ::= number 
calculate-a-to-the-power-b-b-union ::= number 
number ::= number_21 number_22 number_29 
boolean ::= [t] [r] [u] [e] | [f] [a] [l] [s] [e] 
null ::= [n] [u] [l] [l] 
string

<|im_start|>system
Read and follow the instructions below:

<system_instructions>
You are an advanced AI, tasked to assist the user by calling functions in JSON format.
</system_instructions>


You can call functions to help you with your tasks and user queries. The available functions are:

<function_list>
Function: calculate_a_to_the_power_b
  Description: Calculates a to the power of b
  Parameters:
    a (int or float): number
    b (int or float): exponent
</function_list>

To call a function, respond with a JSON object (to call one function) or a list of JSON objects (to call multiple functions), with each object containing these fields:

- "function": Put the name of the function to call here. 
- "arguments": Put the arguments to pass to the function here.

The result of each function call will be returned to you before you need to respond again.<|im_end|>
<|im_start|>user
Calculate a to the power of b: a = 2, b = 4<|im_end|>
<|im_start|>assistant


llama_print_timings:        load time =    1277.82 ms
llama_print_timings:      sample time =     127.26 ms /    56 runs   (    2.27 ms per token,   440.06 tokens per second)
llama_print_timings: prompt eval time =    5772.81 ms /   269 tokens (   21.46 ms per token,    46.60 tokens per second)
llama_print_timings:        eval time =    3743.05 ms /    55 runs   (   68.06 ms per token,    14.69 tokens per second)
llama_print_timings:       total time =    9707.40 ms /   324 tokens



[
    {
        "function":  "calculate_a_to_the_power_b",
        "arguments": {
            "a": 2,
            "b": 4
        }
    }
]
[{'function': 'calculate_a_to_the_power_b', 'arguments': {'a': 2, 'b': 4}, 'return_value': 'Result: 16.0'}]


In [89]:
# Define some user input
user_input = "Can you tell me a joke ?"

print(
    llama_cpp_agent.get_chat_response(
        user_input, structured_output_settings=output_settings
    )
)

Llama.generate: prefix-match hit


<|im_start|>system
Read and follow the instructions below:

<system_instructions>
You are an advanced AI, tasked to assist the user by calling functions in JSON format.
</system_instructions>


You can call functions to help you with your tasks and user queries. The available functions are:

<function_list>
Function: calculate_a_to_the_power_b
  Description: Calculates a to the power of b
  Parameters:
    a (int or float): number
    b (int or float): exponent
</function_list>

To call a function, respond with a JSON object (to call one function) or a list of JSON objects (to call multiple functions), with each object containing these fields:

- "function": Put the name of the function to call here. 
- "arguments": Put the arguments to pass to the function here.

The result of each function call will be returned to you before you need to respond again.<|im_end|>
<|im_start|>user
Calculate a to the power of b: a = 2, b = 4<|im_end|>
<|im_start|>assistant
[
    {
        "function":  "c


llama_print_timings:        load time =    1277.82 ms
llama_print_timings:      sample time =     163.62 ms /    55 runs   (    2.97 ms per token,   336.14 tokens per second)
llama_print_timings: prompt eval time =    1406.95 ms /    66 tokens (   21.32 ms per token,    46.91 tokens per second)
llama_print_timings:        eval time =    4013.30 ms /    54 runs   (   74.32 ms per token,    13.46 tokens per second)
llama_print_timings:       total time =    5655.78 ms /   120 tokens



[
    {
        "function":  "calculate_a_to_the_power_b",
        "arguments": {
            "a": 2,
            "b": 4
        }
    }
]
[{'function': 'calculate_a_to_the_power_b', 'arguments': {'a': 2, 'b': 4}, 'return_value': 'Result: 16.0'}]


In [90]:
from enum import Enum
from typing import Union

from pydantic import BaseModel, Field

from llama_cpp_agent import FunctionCallingAgent
from llama_cpp_agent import MessagesFormatterType
from llama_cpp_agent import LlamaCppFunctionTool
from llama_cpp_agent.providers import TGIServerProvider


In [91]:
# Simple calculator tool for the agent that can add, subtract, multiply, and divide.
class MathOperation(Enum):
    ADD = "add"
    SUBTRACT = "subtract"
    MULTIPLY = "multiply"
    DIVIDE = "divide"

In [92]:
class Calculator(BaseModel):
    """
    Perform a math operation on two numbers.
    """

    number_one: Union[int, float] = Field(
        ...,
        description="First number.")
    number_two: Union[int, float] = Field(
        ...,
        description="Second number.")
    operation: MathOperation = Field(..., description="Math operation to perform.")

    def run(self):
        if self.operation == MathOperation.ADD:
            return self.number_one + self.number_two
        elif self.operation == MathOperation.SUBTRACT:
            return self.number_one - self.number_two
        elif self.operation == MathOperation.MULTIPLY:
            return self.number_one * self.number_two
        elif self.operation == MathOperation.DIVIDE:
            return self.number_one / self.number_two
        else:
            raise ValueError("Unknown operation.")

In [93]:
# Callback for receiving messages for the user.
def send_message_to_user_callback(message: str):
    print(message)


# Create a list of function call tools.
function_tools = [LlamaCppFunctionTool(Calculator)]

# Create the function calling agent. We are passing the provider, the tool list, send message to user callback and the chat message formatter. Also, we allow parallel function calling.
function_call_agent = FunctionCallingAgent(
    provider,
    llama_cpp_function_tools=function_tools,
    allow_parallel_function_calling=True,
    send_message_to_user_callback=send_message_to_user_callback,
    messages_formatter_type=MessagesFormatterType.CHATML)

# Define the user input.
user_input = "Solve the following calculations: 42 * 42, 24 * 24, 5 * 5, 422 * 420, 753 * 321, 789 * 654?"
function_call_agent.generate_response(user_input)

from_string grammar:
root ::= root_1 [[] ws function root_5 [<U+000A>] []] 
root_1 ::= [ ] | [<U+000A>] 
ws ::= ws_22 
function ::= function_6 [{] ws ["] [t] [h] [o] [u] [g] [h] [t] [s] [_] [a] [n] [d] [_] [r] [e] [a] [s] [o] [n] [i] [n] [g] ["] [:] ws string [,] ws ["] [f] [u] [n] [c] [t] [i] [o] [n] ["] [:] ws grammar-models 
root_4 ::= [,] ws function 
root_5 ::= root_4 root_5 | 
function_6 ::= [ ] | [<U+000A>] 
string ::= ["] string_21 ["] 
grammar-models ::= calculator-grammar-model | send-message-grammar-model 
calculator-grammar-model ::= ["] [C] [a] [l] [c] [u] [l] [a] [t] [o] [r] ["] [,] ws ["] [a] [r] [g] [u] [m] [e] [n] [t] [s] ["] [:] [ ] calculator 
send-message-grammar-model ::= ["] [s] [e] [n] [d] [_] [m] [e] [s] [s] [a] [g] [e] ["] [,] ws ["] [a] [r] [g] [u] [m] [e] [n] [t] [s] ["] [:] [ ] send-message 
calculator ::= [{] ws ["] [n] [u] [m] [b] [e] [r] [_] [o] [n] [e] ["] [:] [ ] calculator-number-one-union [,] ws ["] [n] [u] [m] [b] [e] [r] [_] [t] [w] [o] ["] [:] [ ] 

Error parsing JSON: Unterminated string starting at: line 6 column 24 (char 276)


'Error: Invalid function call response.'

In [94]:
import datetime
from enum import Enum
from typing import Union, Optional

from pydantic import BaseModel, Field

from llama_cpp_agent import LlamaCppFunctionTool
from llama_cpp_agent import FunctionCallingAgent
from llama_cpp_agent import MessagesFormatterType
from llama_cpp_agent.providers import TGIServerProvider

In [95]:
# Simple tool for the agent, to get the current date and time in a specific format.
def get_current_datetime(output_format: Optional[str] = None):
    """
    Get the current date and time in the given format.

    Args:
         output_format: formatting string for the date and time, defaults to '%Y-%m-%d %H:%M:%S'
    """
    if output_format is None:
        output_format = '%Y-%m-%d %H:%M:%S'
    return datetime.datetime.now().strftime(output_format)


In [96]:
class MathOperation(Enum):
    ADD = "add"
    SUBTRACT = "subtract"
    MULTIPLY = "multiply"
    DIVIDE = "divide"

In [97]:
# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt.
class calculator(BaseModel):
    """
    Perform a math operation on two numbers.
    """
    number_one: Union[int, float] = Field(..., description="First number.")
    operation: MathOperation = Field(..., description="Math operation to perform.")
    number_two: Union[int, float] = Field(..., description="Second number.")

    def run(self):
        if self.operation == MathOperation.ADD:
            return self.number_one + self.number_two
        elif self.operation == MathOperation.SUBTRACT:
            return self.number_one - self.number_two
        elif self.operation == MathOperation.MULTIPLY:
            return self.number_one * self.number_two
        elif self.operation == MathOperation.DIVIDE:
            return self.number_one / self.number_two
        else:
            raise ValueError("Unknown operation.")

In [98]:
def send_message_to_user_callback(message: str):
    print(message)

In [99]:
# First we create the calculator tool.
calculator_function_tool = LlamaCppFunctionTool(calculator)


In [100]:
# Next we create the current datetime tool.
current_datetime_function_tool = LlamaCppFunctionTool(get_current_datetime)

In [101]:
# The from_openai_tool function of the LlamaCppFunctionTool class converts an OpenAI tool schema and a callable function into a LlamaCppFunctionTool
get_weather_function_tool = LlamaCppFunctionTool.from_openai_tool(open_ai_tool_spec, get_current_weather)

In [102]:
function_call_agent = FunctionCallingAgent(
    provider,
    llama_cpp_function_tools=[calculator_function_tool, current_datetime_function_tool, get_weather_function_tool],
    send_message_to_user_callback=send_message_to_user_callback,
    allow_parallel_function_calling=True,
    messages_formatter_type=MessagesFormatterType.CHATML)


In [105]:
# user_input = '''Get the date and time in '%d-%m-%Y %H:%M' format. Get the current weather in celsius in London. Solve the following calculations: 42 * 42, 7 * 26, 4 + 6  and 96/8.'''
user_input = 'Get the current weather in celsius in London?'
function_call_agent.generate_response(user_input)

ValueError: Requested tokens (2093) exceed context window of 2048