## Tool calling with vLLM models through the Llama Stack agent API

In [None]:
from mcp_client import MCPClient
import json, os
from openai import AsyncOpenAI
from dotenv import load_dotenv
load_dotenv()

# Base URL of the Llama Stack server, taken from the environment (or the .env
# file loaded above). Trailing slashes are stripped so the derived URL below
# never contains "//v1/...".
LLS_ENDPOINT = os.getenv("REMOTE_BASE_URL")
if LLS_ENDPOINT is not None:
    LLS_ENDPOINT = LLS_ENDPOINT.rstrip("/")
# OpenAI-compatible route of the same server.
# NOTE(review): when REMOTE_BASE_URL is unset this is the literal string
# "None/v1/openai/v1"; set the variable before running the cells below.
LLS_OPENAI_ENDPOINT = f"{LLS_ENDPOINT}/v1/openai/v1"
# API key for the OpenAI client; falls back to the placeholder "EMPTY".
OPENAI_APIKEY = os.getenv("OPENAI_API_KEY", "EMPTY")

In [None]:
from llama_stack_client.lib.agents.agent import Agent
from llama_stack_client.lib.agents.event_logger import EventLogger
from llama_stack_client import LlamaStackClient
from termcolor import cprint
import logging

# Notebook-wide logger: INFO and above, message text only.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Re-running this cell must not stack another handler on the same logger,
# otherwise every message is printed once per attached handler. Reuse the
# handler that is already attached when there is one.
if logger.handlers:
    stream_handler = logger.handlers[0]
else:
    stream_handler = logging.StreamHandler()
    logger.addHandler(stream_handler)
stream_handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(message)s')
stream_handler.setFormatter(formatter)

In [27]:
from llama_stack_client.lib.agents.agent import Agent
from llama_stack_client.lib.agents.event_logger import EventLogger
from llama_stack_client import LlamaStackClient
from termcolor import cprint
import logging

# This cell repeats the logger setup of the preceding cell. It is written so
# that running it after that cell (or running it twice) does not attach a
# second handler, which would print every log message more than once.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

if logger.handlers:
    # A handler is already attached: reconfigure it instead of adding another.
    stream_handler = logger.handlers[0]
else:
    stream_handler = logging.StreamHandler()
    logger.addHandler(stream_handler)
stream_handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(message)s')
stream_handler.setFormatter(formatter)

In [94]:
def torchtune(query: str = "torchtune"):
    """
    Answer information about torchtune.

    :param query: The query to use for querying the internet
    :returns: Information about torchtune
    """
    # The docstring above is kept word-for-word: this function is handed to the
    # Agent as a tool in test_lls, and the docstring is presumably what describes
    # the tool to the model -- confirm before rewording it.
    # The answer is canned: `query` is accepted but never consulted.
    return """
    torchtune is a PyTorch library for easily authoring, finetuning and experimenting with LLMs.

    torchtune provides:

    PyTorch implementations of popular LLMs from Llama, Gemma, Mistral, Phi, and Qwen model families
    Hackable training recipes for full finetuning, LoRA, QLoRA, DPO, PPO, QAT, knowledge distillation, and more
    Out-of-the-box memory efficiency, performance improvements, and scaling with the latest PyTorch APIs
    YAML configs for easily configuring training, evaluation, quantization or inference recipes
    Built-in support for many popular dataset formats and prompt templates
    """

def test_lls(mcp_endpoint, mcp_toolgroup, model, instructions, prompts):
    client = LlamaStackClient(base_url=LLS_ENDPOINT)
    logger.info(f"Connected to Llama Stack server @ {LLS_ENDPOINT[:15]}... \n")

    # Get tool info and register tools
    registered_tools = client.tools.list()
    registered_tools_identifiers = [t.identifier for t in registered_tools]
    registered_toolgroups = [t.toolgroup_id for t in registered_tools]

    if mcp_toolgroup not in registered_toolgroups:
        # Register MCP tools
        client.toolgroups.register(
            toolgroup_id=mcp_toolgroup,
            provider_id="model-context-protocol",
            mcp_endpoint={"uri":mcp_endpoint},
            )
    mcp_tools = [t.identifier for t in client.tools.list(toolgroup_id=mcp_toolgroup)]

    logger.info(f"""Your Server has access the the following toolgroups:
    {set(registered_toolgroups)}
    """)
    # Create simple agent with tools
    agent = Agent(
        client,
        model=model,
        instructions = instructions,
        tools=[mcp_toolgroup, torchtune],
        tool_config={"tool_choice":"auto"},
        sampling_params={"max_tokens": 4096}
    )


    user_prompts = prompts
    session_id = agent.create_session(session_name="Auto_demo")
    for prompt in user_prompts:
        turn_response = agent.create_turn(
            messages=[
                {
                    "role":"user",
                    "content": prompt
                }
            ],
            session_id=session_id,
            stream=True,
        )
        for log in EventLogger().log(turn_response):
            if "Tool:pods_list_in_namespace Response:" in log.content:
                continue
            log.print()

    logger.handlers.clear()

In [None]:
# Scenario: random-number prompt against the "mcp::custom_tool" toolgroup.
LLM_MODEL_ID = "granite32-8b"
MCP_TOOLGROUP = "mcp::custom_tool"
# Blank here; test_lls only uses the endpoint when the toolgroup still has to
# be registered.
MCP_ENDPOINT = ""
INSTRUCTIONS = (
    "You are a helpful assistant. You have access to a number of tools.\n"
    "Whenever a tool is called, be sure return the Response in a friendly and helpful tone."
)
PROMPT = "Use tools to generate a number between 5 and 50"
test_lls(
    mcp_endpoint=MCP_ENDPOINT,
    mcp_toolgroup=MCP_TOOLGROUP,
    model=LLM_MODEL_ID,
    instructions=INSTRUCTIONS,
    prompts=[PROMPT],
)

[33minference> [0m[33m<[0m[33mtool[0m[33m_[0m[33mcall[0m[33m>[0m[97m[0m
[30m[0m

In [95]:
# Scenario: pod listing through the "mcp::openshift" toolgroup.
LLM_MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"
MCP_TOOLGROUP = "mcp::openshift"
# Blank here; test_lls only uses the endpoint when the toolgroup still has to
# be registered.
MCP_ENDPOINT = ""
INSTRUCTIONS = (
    "You are a helpful assistant. You have access to a number of tools.\n"
    "Whenever a tool is called, be sure return the Response in a friendly and helpful tone."
)
PROMPT = "List the pods in llama serve namespace"
test_lls(
    mcp_endpoint=MCP_ENDPOINT,
    mcp_toolgroup=MCP_TOOLGROUP,
    model=LLM_MODEL_ID,
    instructions=INSTRUCTIONS,
    prompts=[PROMPT],
)

[33minference> [0m[97m[0m
[32mtool_execution> Tool:pods_list_in_namespace Args:{'namespace': 'llama-serve'}[0m
[33minference> [0m[97m[0m
[30m[0m

The output above (an empty final `inference>` response) is the infamous out-of-tokens error.

In [46]:
# Scenario: GitHub repository search through the "mcp::github" toolgroup.
LLM_MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"
MCP_TOOLGROUP = "mcp::github"
MCP_ENDPOINT = os.getenv("GITHUB_MCP_SERVER_URL")
# The last sentence nudges the model to emit numeric tool arguments as numbers.
INSTRUCTIONS = (
    "You are a helpful assistant. You have access to a number of tools.\n"
    "Whenever a tool is called, be sure return the Response in a friendly and helpful tone. \n"
    "For parameters like 'page', 'perPage', 'limit', etc., please ensure you provide numeric "
    "values without quotes (example: \"page\": 1 not \"page\": \"1\")."
)
PROMPT = "Search for top 5 Python repositories related to 'llama', sorted by stars."
test_lls(
    mcp_endpoint=MCP_ENDPOINT,
    mcp_toolgroup=MCP_TOOLGROUP,
    model=LLM_MODEL_ID,
    instructions=INSTRUCTIONS,
    prompts=[PROMPT],
)

[33minference> [0m[97m[0m
[31m500: Internal server error: An unexpected error occurred.[0m


## Tool calling with vLLM models without the Llama Stack agent (OpenAI-compatible chat completions endpoint)

In [66]:
async def test_openai_api(mcp_endpoint, model, instruction, prompt):
    client = AsyncOpenAI(api_key=OPENAI_APIKEY, base_url=LLS_OPENAI_ENDPOINT)
    mcp = MCPClient(mcp_endpoint)
    tools = await mcp.list_tools()                 
    openai_tools = [
        {
            "type": "function",
            "function": {
                "name": t.name,
                "description": t.description,
                "parameters": t.inputSchema, 
            },
        }
        for t in tools
    ]
    messages = [
        {
            "role": "system",
            "content": (instruction)
        },
        {
            "role": "user",
            "content": (prompt),
        }
    ]
    resp = await client.chat.completions.create(
        model = model,
        messages = messages,
        tools = openai_tools,
        tool_choice = "auto",
        stream = False,
        )
    assistant = resp.choices[0].message
    if assistant.tool_calls:
        for call in assistant.tool_calls:
            args = json.loads(call.function.arguments)
            print("TOOL")
            print(call.function.name)
            print(args)
            result = await mcp.invoke_tool(call.function.name, args)
            #print(f"Results: {result.content}")

            messages.append(
                {
                    "role": "assistant",
                    "name": call.function.name,
                    "content": result.content,  
                }
            )
        final = await client.chat.completions.create(
            model    = model,
            messages = messages,
            stream=False
        )
        print("\n🔹 Assistant:", final.choices[0].message.content)
    else:
        print("\n🔹 Assistant:", assistant.content)


In [None]:
MCP_ENDPOINT = os.getenv("MCP_ENDPOINT") 
LLM_MODEL_ID = "granite32-8b"
INSTRUCTIONS = """You are a helpful assistant. You have access to a number of tools.
Whenever a tool is called, be sure return the Response in a friendly and helpful tone."""      
PROMPT = "Use tools to generate a number between 5 and 50"
await test_openai_api(MCP_ENDPOINT, LLM_MODEL_ID, INSTRUCTIONS, PROMPT)

TOOL
generate_random_number
{'min': '5', 'max': '50'}
Results: {"type":"text","text":"23","annotations":null}

🔹 Assistant: I've generated a number for you! It's 23.


In [67]:
MCP_ENDPOINT = os.getenv("MCP_ENDPOINT_OCP") 
LLM_MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"
INSTRUCTIONS = """You are a helpful assistant. You have access to a number of tools.
Whenever a tool is called, be sure return the Response in a friendly and helpful tone. """
PROMPT = "List the pods in llama serve namespace"
await test_openai_api(MCP_ENDPOINT, LLM_MODEL_ID, INSTRUCTIONS, PROMPT)

TOOL
pods_list_in_namespace
{'namespace': 'llama-serve'}

🔹 Assistant: Here is the list of pods in the `llama-serve` namespace:

1. `ansible-mcp-server-6d8d74d699-9bb6l`
2. `auto-quote-mcp-5d579cbbfb-4vstt`
3. `custom-mcp-server-59cdf5cfd7-6 bwSchool]
4. `graveeteurne-rich25- enthusiastic-santonia`
5. `github-mcp-server-with-rh-nodejs-7b4dd84f68-qj7qk`
6. `granite-8b`
7. `granite32-8b-predictor-00001-deployment-7dc6885d57-rbd4t`
8. `granite33-8b`
9. `llama32-3b`
10. `llamastack-deployment-956c577f4-cs9ls`
11. `mcp-llamastack-server-667fc898c-n57mp`
12. `ocp-mcp-server-7cbd674668-sjrmj`
13. `simple-mcp-server-677b86fb84-skfk6`
14. `slack-mcp-server-76cdf9bc7b-h5456`
15. `slack-test`
16. `streamlit-79649d549d-h5456`


In [None]:
MCP_ENDPOINT = os.getenv("MCP_ENDPOINT_GITHUB") 
LLM_MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"
INSTRUCTIONS = """You are a helpful assistant. You have access to a number of tools.
Whenever a tool is called, be sure return the Response in a friendly and helpful tone. 
For parameters like 'page', 'perPage', 'limit', etc., please ensure you provide numeric values without quotes (example: "page": 1 not "page": "1")."""         
PROMPT = "Search for top 5 Python repositories related to 'llama', sorted by stars."
await test_openai_api(MCP_ENDPOINT, LLM_MODEL_ID, INSTRUCTIONS, PROMPT)

TOOL
search_repositories
{'query': 'python llama', 'page': '1', 'perPage': '5'}


  + Exception Group Traceback (most recent call last):
  |   File "/Users/shrey/llama-stack-on-ocp/tests/mcp_client.py", line 124, in invoke_tool
  |     async with ClientSession(*streams) as session:
  |   File "/Users/shrey/miniforge3/envs/stack/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 767, in __aexit__
  |     raise BaseExceptionGroup(
  | exceptiongroup.ExceptionGroup: unhandled errors in a TaskGroup (1 sub-exception)
  +-+---------------- 1 ----------------
    | Traceback (most recent call last):
    |   File "/Users/shrey/llama-stack-on-ocp/tests/mcp_client.py", line 126, in invoke_tool
    |     result = await session.call_tool(tool_name, kwargs)
    |   File "/Users/shrey/miniforge3/envs/stack/lib/python3.10/site-packages/mcp/client/session.py", line 225, in call_tool
    |     return await self.send_request(
    |   File "/Users/shrey/miniforge3/envs/stack/lib/python3.10/site-packages/mcp/shared/session.py", line 250, in send_request
    |     raise Mcp

In [None]:
## Debug: OpenAI-compatible API (commented-out scratch copy of test_openai_api)
# client = AsyncOpenAI(api_key=OPENAI_APIKEY, base_url=LLS_OPENAI_ENDPOINT)
# mcp = MCPClient(MCP_ENDPOINT)
# tools = await mcp.list_tools()                 
# openai_tools = [
#     {
#         "type": "function",
#         "function": {
#             "name": t.name,
#             "description": t.description,
#             "parameters": t.inputSchema, 
#         },
#     }
#     for t in tools
# ]

# messages = [
#     {
#         "role": "user",
#         "content": (
#             "Use tools to generate a number between 5 and 50"
#         ),
#     }
# ]
# resp = await client.chat.completions.create(
#     model = LLM_MODEL_ID,
#     messages = messages,
#     tools = openai_tools,
#     tool_choice = "auto",
#     stream = False,
#     )
# assistant = resp.choices[0].message
# if assistant.tool_calls:
#     for call in assistant.tool_calls:
#         args = json.loads(call.function.arguments)
#         print("TOOL")
#         print(call.function.name)
#         print(args)
#         result = await mcp.invoke_tool(call.function.name, args)
#         print(f"Results: {result.content}")

#         messages.append(
#             {
#                 "role": "assistant",
#                 "name": call.function.name,
#                 "content": result.content,  
#             }
#         )
#     final = await client.chat.completions.create(
#         model    = LLM_MODEL_ID,
#         messages = messages,
#         stream=False
#     )
#     print("\n🔹 Assistant:", final.choices[0].message.content)
# else:
#     print("\n🔹 Assistant:", assistant.content)

In [None]:
# # Debug: Llama Stack agent (commented-out scratch copy of test_lls)
# client = LlamaStackClient(base_url=LLS_ENDPOINT)
# logger.info(f"Connected to Llama Stack server @ {LLS_ENDPOINT[:15]}... \n")

# # Get tool info and register tools
# registered_tools = client.tools.list()
# registered_tools_identifiers = [t.identifier for t in registered_tools]
# registered_toolgroups = [t.toolgroup_id for t in registered_tools]

# if mcp_toolgroup not in registered_toolgroups:
#     # Register MCP tools
#     client.toolgroups.register(
#         toolgroup_id=mcp_toolgroup,
#         provider_id="model-context-protocol",
#         mcp_endpoint={"uri":mcp_endpoint},
#         )
# mcp_tools = [t.identifier for t in client.tools.list(toolgroup_id=mcp_toolgroup)]

# logger.info(f"""Your Server has access the the following toolgroups:
# {set(registered_toolgroups)}
# """)
# # Create simple agent with tools
# agent = Agent(
#     client,
#     model=model,
#     instructions = """""" ,
#     tools=["mcp::custom_tool"],
#     tool_config={"tool_choice":"auto"},
#     sampling_params={"max_tokens": 4096}
# )


# user_prompts = ["""Use tools to generate a number between 5 and 50"""]
# session_id = agent.create_session(session_name="Auto_demo")
# for prompt in user_prompts:
#     turn_response = agent.create_turn(
#         messages=[
#             {
#                 "role":"user",
#                 "content": prompt
#             }
#         ],
#         session_id=session_id,
#         stream=True,
#     )
#     for log in EventLogger().log(turn_response):
#         log.print()

# logger.handlers.clear()