# 2.2 Parallel Tool Calls

When a question requires multiple independent pieces of information, the model can request several tool calls in a single turn. Instead of calling one tool, waiting for the result, calling the next, and so on, it batches all requests together.

This is a latency optimization: when the application executes the batched calls concurrently, four 300ms API calls complete in ~300ms (parallel) instead of ~1200ms (sequential). But it's also a capability -- the model has to recognize which parts of the question are independent and can be answered simultaneously.

In [1]:
import os
import json
import time
import openai
from concurrent.futures import ThreadPoolExecutor
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the environment
# before the API key is read below.
load_dotenv()

# Model identifier sent with every chat-completion request in this notebook.
MODEL = 'google/gemini-2.5-flash-lite'

# OpenAI SDK client pointed at the OpenRouter endpoint.
client = openai.OpenAI(
    base_url='https://openrouter.ai/api/v1',
    api_key=os.environ.get('OPENROUTER_API_KEY'),
)

## Define multiple tools

Three independent tools that the model can call in parallel. Each one returns a different type of information.

In [2]:
# Three independent tools, each taking exactly one required string argument
def _string_arg_tool(name: str, description: str, arg: str, arg_description: str) -> dict:
    """Build a function-tool schema with a single required string parameter."""
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": {
                    arg: {"type": "string", "description": arg_description},
                },
                "required": [arg],
            },
        },
    }


tools = [
    _string_arg_tool(
        "get_weather",
        "Get the current weather for a city. Returns temperature and conditions.",
        "city",
        "City name",
    ),
    _string_arg_tool(
        "get_stock_price",
        "Get the current stock price for a ticker symbol. Returns price in USD.",
        "ticker",
        "Stock ticker, e.g. AAPL",
    ),
    _string_arg_tool(
        "get_news_headlines",
        "Get the latest news headlines for a topic. Returns a list of headline strings.",
        "topic",
        "News topic to search for",
    ),
]

# Simulated implementations
def get_weather(city: str) -> dict:
    """Return simulated weather for ``city`` after a 300ms artificial delay.

    Cities in the built-in table map to fixed readings; any other name gets a
    placeholder reading whose conditions are "Unknown".
    """
    time.sleep(0.3)  # Simulate API latency
    readings = {
        "San Francisco": {"temp_f": 62, "conditions": "Foggy"},
        "New York": {"temp_f": 45, "conditions": "Partly cloudy"},
        "Tokyo": {"temp_f": 71, "conditions": "Clear"},
    }
    if city in readings:
        return readings[city]
    return {"temp_f": 70, "conditions": "Unknown"}

def get_stock_price(ticker: str) -> dict:
    """Return a simulated quote for ``ticker`` after a 300ms artificial delay.

    Tickers in the built-in table map to a fixed price and change; any other
    symbol gets a zero price with change "N/A".
    """
    time.sleep(0.3)  # Simulate API latency
    quotes = {
        "AAPL": {"price": 198.50, "change": "+1.2%"},
        "GOOGL": {"price": 175.30, "change": "-0.5%"},
        "MSFT": {"price": 425.80, "change": "+0.8%"},
    }
    try:
        return quotes[ticker]
    except KeyError:
        return {"price": 0, "change": "N/A"}

def get_news_headlines(topic: str) -> dict:
    """Return three templated headlines about ``topic`` after a 300ms artificial delay."""
    time.sleep(0.3)  # Simulate API latency
    headlines = [
        f"Breaking: Major developments in {topic}",
        f"{topic} sector sees unprecedented growth",
        f"Experts weigh in on the future of {topic}",
    ]
    return {"headlines": headlines}

# Dispatch table: tool name as it appears in a tool call -> local implementation
available_functions = {
    fn.__name__: fn
    for fn in (get_weather, get_stock_price, get_news_headlines)
}

tool_names = [t["function"]["name"] for t in tools]
print(f'Defined {len(tools)} tools: {tool_names}')

Defined 3 tools: ['get_weather', 'get_stock_price', 'get_news_headlines']


## Handle parallel tool calls

When the model returns multiple `tool_calls` in one response, we execute them concurrently using a thread pool. Each result is linked back to the call that produced it via `tool_call_id`.

In [3]:
def execute_tool_calls_parallel(tool_calls) -> list[dict]:
    """Execute multiple tool calls concurrently and return results.

    Args:
        tool_calls: Tool-call objects from an assistant message. Each must
            expose ``id``, ``function.name`` and ``function.arguments``
            (a JSON-encoded object of keyword arguments).

    Returns:
        One ``{"role": "tool", "tool_call_id": ..., "content": ...}`` message
        per call, in the same order as ``tool_calls``. An empty input yields
        an empty list.

    Raises:
        KeyError: If a call names a tool missing from ``available_functions``.
        json.JSONDecodeError: If a call's arguments are not valid JSON.
    """
    # ThreadPoolExecutor rejects max_workers=0 with ValueError, so an empty
    # batch has to short-circuit before the pool is created.
    if not tool_calls:
        return []

    def execute_one(tc):
        fn = available_functions[tc.function.name]
        args = json.loads(tc.function.arguments)
        result = fn(**args)
        return {
            "role": "tool",
            # Ties this result back to the request that produced it.
            "tool_call_id": tc.id,
            "content": json.dumps(result),
        }

    # One worker per call so every tool starts immediately; executor.map
    # returns results in input order regardless of completion order.
    with ThreadPoolExecutor(max_workers=len(tool_calls)) as executor:
        return list(executor.map(execute_one, tool_calls))


def execute_tool_calls_sequential(tool_calls) -> list[dict]:
    """Run the tool calls strictly in order, one after another (comparison baseline).

    Returns one ``role="tool"`` message per call, keyed by ``tool_call_id``.
    """
    def as_tool_message(call):
        handler = available_functions[call.function.name]
        kwargs = json.loads(call.function.arguments)
        outcome = handler(**kwargs)
        return {
            "role": "tool",
            "tool_call_id": call.id,
            "content": json.dumps(outcome),
        }

    return [as_tool_message(call) for call in tool_calls]


def run_with_tools(user_message: str, parallel: bool = True) -> str:
    """Complete tool-calling lifecycle with support for parallel calls.

    Sends ``user_message`` to the model with the tool schemas attached. If
    the model answers without requesting tools, that answer is returned as-is.
    Otherwise every requested tool call is executed, the results are sent
    back, and the model's follow-up message is printed and returned. Only one
    round of tool calls is handled.

    Args:
        user_message: The user's question.
        parallel: Execute the requested tools concurrently when true, or one
            at a time when false.

    Returns:
        The ``content`` of the model's last message.
        NOTE(review): presumably ``None`` if that message carries no text
        (e.g. it requests further tool calls) -- confirm against the SDK.
    """
    messages = [{"role": "user", "content": user_message}]

    response = client.chat.completions.create(
        model=MODEL, messages=messages, tools=tools,
    )

    assistant_msg = response.choices[0].message

    # No tools requested: the first reply is already the answer.
    if not assistant_msg.tool_calls:
        return assistant_msg.content

    # Show what the model requested
    print(f'Model requested {len(assistant_msg.tool_calls)} tool call(s):')
    for tc in assistant_msg.tool_calls:
        print(f'  - {tc.function.name}({tc.function.arguments})')
    print()

    # Execute all tool calls. perf_counter() is monotonic and high-resolution,
    # so the measured interval is not distorted by system clock adjustments.
    start = time.perf_counter()
    if parallel:
        tool_results = execute_tool_calls_parallel(assistant_msg.tool_calls)
    else:
        tool_results = execute_tool_calls_sequential(assistant_msg.tool_calls)
    elapsed = time.perf_counter() - start

    mode = 'parallel' if parallel else 'sequential'
    print(f'Executed {len(tool_results)} tools ({mode}): {elapsed:.3f}s')
    for tr in tool_results:
        print(f'  Result: {tr["content"]}')
    print()

    # Send all results back: the assistant message that holds the tool calls
    # goes first, followed by one tool message per call.
    messages.append(assistant_msg)
    messages.extend(tool_results)

    response = client.chat.completions.create(
        model=MODEL, messages=messages, tools=tools,
    )

    final = response.choices[0].message.content
    print(f'Final answer: {final}')
    return final

## Run it: parallel vs sequential

In [4]:
# A single question that needs all three tools
query = "What's the weather in Tokyo, what's AAPL's stock price, and what's the latest news about AI?"

# Same query twice: first with concurrent tool execution, then one at a time.
print('=== PARALLEL EXECUTION ===', '', sep='\n')
answer_par = run_with_tools(query, parallel=True)

print('', '=== SEQUENTIAL EXECUTION ===', '', sep='\n')
answer_seq = run_with_tools(query, parallel=False)

print('', 'Both produce the same answer, but parallel is faster when tools have latency.', sep='\n')

=== PARALLEL EXECUTION ===



Model requested 3 tool call(s):
  - get_weather({"city":"Tokyo"})
  - get_stock_price({"ticker":"AAPL"})
  - get_news_headlines({"topic":"AI"})



Executed 3 tools (parallel): 0.311s
  Result: {"temp_f": 71, "conditions": "Clear"}
  Result: {"price": 198.5, "change": "+1.2%"}
  Result: {"headlines": ["Breaking: Major developments in AI", "AI sector sees unprecedented growth", "Experts weigh in on the future of AI"]}



Final answer: The weather in Tokyo is clear and 71°F. Apple's stock price is $198.5, up 1.2%. Here are the latest news headlines about AI: Breaking: Major developments in AI, AI sector sees unprecedented growth, Experts weigh in on the future of AI.

=== SEQUENTIAL EXECUTION ===



Model requested 3 tool call(s):
  - get_weather({"city":"Tokyo"})
  - get_stock_price({"ticker":"AAPL"})
  - get_news_headlines({"topic":"AI"})



Executed 3 tools (sequential): 0.909s
  Result: {"temp_f": 71, "conditions": "Clear"}
  Result: {"price": 198.5, "change": "+1.2%"}
  Result: {"headlines": ["Breaking: Major developments in AI", "AI sector sees unprecedented growth", "Experts weigh in on the future of AI"]}



Final answer: The weather in Tokyo is clear and 71°F. AAPL's stock price is $198.5, up 1.2%. The latest news headlines about AI are: "Breaking: Major developments in AI", "AI sector sees unprecedented growth", and "Experts weigh in on the future of AI".

Both produce the same answer, but parallel is faster when tools have latency.


## Timing comparison

Each simulated tool takes 300ms. With 3 parallel calls, execution should take ~300ms total instead of ~900ms.

In [5]:
# Precise timing comparison
from unittest.mock import MagicMock

# Minimal stand-in for a tool call, used only to time the executors
class FakeTC:
    """Fake tool call exposing ``id``, ``function.name`` and ``function.arguments``."""

    def __init__(self, name, args):
        fn = MagicMock()
        # Set attributes after construction: MagicMock(name=...) would label
        # the mock itself instead of creating a ``name`` attribute.
        fn.name = name
        fn.arguments = json.dumps(args)
        self.function = fn
        self.id = f'call_{name}'

fake_calls = [
    FakeTC('get_weather', {'city': 'Tokyo'}),
    FakeTC('get_stock_price', {'ticker': 'AAPL'}),
    FakeTC('get_news_headlines', {'topic': 'AI'}),
]

# time.perf_counter() is the monotonic, high-resolution clock intended for
# measuring short intervals; time.time() can jump if the system clock is
# adjusted mid-measurement.

# Time parallel
start = time.perf_counter()
execute_tool_calls_parallel(fake_calls)
parallel_time = time.perf_counter() - start

# Time sequential
start = time.perf_counter()
execute_tool_calls_sequential(fake_calls)
sequential_time = time.perf_counter() - start

# The header count is derived from the list so it stays right if calls are added.
print(f'{len(fake_calls)} tools (each 300ms):')
print(f'  Parallel:   {parallel_time:.3f}s')
print(f'  Sequential: {sequential_time:.3f}s')
print(f'  Speedup:    {sequential_time / parallel_time:.1f}x')

3 tools (each 300ms):
  Parallel:   0.305s
  Sequential: 0.916s
  Speedup:    3.0x
