### Middleware

Middleware provides a way to more tightly control what happens inside the agent. Middleware is useful for the following:

- Tracking agent behaviour with logging, analytics, or debugging.
- Transforming prompts, tool selection, and output formatting.
- Adding retries, fallbacks, and early-termination logic.
- Applying rate limits, guardrails, and PII detection.

In [None]:
import os
from dotenv import load_dotenv
load_dotenv()

# Re-export the key only when it is actually present. os.environ accepts
# strings only, so the unconditional assignment raised TypeError whenever the
# key was missing from both .env and the process environment.
_google_api_key = os.getenv('GOOGLE_API_KEY')
if _google_api_key:
    os.environ['GOOGLE_API_KEY'] = _google_api_key
else:
    print("GOOGLE_API_KEY is not set; add it to .env or the environment before calling Gemini.")

### Summarization Middleware

In [2]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.agents.middleware import SummarizationMiddleware
from langgraph.checkpoint.memory import InMemorySaver
from langchain.agents import create_agent
from langchain.messages import HumanMessage

# Summarizer model: a dedicated instance used only by the middleware, kept
# apart from the model that answers the user.
summary_llm = ChatGoogleGenerativeAI(
    temperature=0,
    model="gemini-2.5-flash",
)

# Agent model: produces the actual replies.
llm = ChatGoogleGenerativeAI(
    temperature=0,
    model="gemini-2.5-flash",
)

# Message-count policy: once the history reaches the trigger size, older
# turns are condensed into a summary and only the most recent ones are kept.
agent = create_agent(
    model=llm,
    middleware=[
        SummarizationMiddleware(
            model=summary_llm,
            keep=("messages", 4),
            trigger=("messages", 10),
        ),
    ],
    # The in-memory checkpointer carries the history across invoke() calls.
    checkpointer=InMemorySaver(),
)


In [9]:
# Run configuration passed to agent.invoke(); the thread_id names the
# conversation whose checkpointed history is reused between calls.
config = {
    "configurable": {
        "thread_id": "test-1",
    },
}

In [11]:
# Six turns on one thread: long enough for the history to pass the
# 10-message summarization trigger configured on the agent.
questions = [
    "what is 2+2?",
    "what is 5*5?",
    "what is 10/2?",
    "what is 10-2?",
    "what is 5*2?",
    "what is 4*4"
]

for q in questions:
    response = agent.invoke({"messages": [HumanMessage(content=q)]}, config)
    print(f"Messages: {response['messages']}")
    # Distinct label: the count line previously reused "Messages:" and read
    # like a second (truncated) dump of the history.
    print(f"Message count: {len(response['messages'])}")

Messages: [HumanMessage(content='Here is a summary of the conversation to date:\n\nThe user has been asking basic arithmetic questions (addition, multiplication, division) and the AI has been providing correct answers. The current pending question is "what is 10-2?".', additional_kwargs={}, response_metadata={}, id='5c9a3be7-9dd1-44ab-99dd-1415ce6e884e'), AIMessage(content='10 - 2 = 8', additional_kwargs={}, response_metadata={'finish_reason': 'STOP', 'model_name': 'gemini-2.5-flash', 'safety_ratings': [], 'model_provider': 'google_genai'}, id='lc_run--019b5652-e16c-7e12-9393-290c09e92c64-0', usage_metadata={'input_tokens': 60, 'output_tokens': 39, 'total_tokens': 99, 'input_token_details': {'cache_read': 0}, 'output_token_details': {'reasoning': 31}}), HumanMessage(content='what is 5*2?', additional_kwargs={}, response_metadata={}, id='5ce19e14-abee-4a8a-8170-fe56e4dbde98'), AIMessage(content='5 * 2 = 10', additional_kwargs={}, response_metadata={'finish_reason': 'STOP', 'model_name':

### Token Size

In [9]:
from langgraph.types import Checkpointer
from langchain.agents import create_agent
from langchain.agents.middleware import SummarizationMiddleware
from langchain_core.tools import tool
from langchain_core.messages import HumanMessage
from langgraph.checkpoint.memory import InMemorySaver

@tool
def search_hotels(city: str) -> str:
    """search hotels - return long response to use more tokens"""
    # The docstring above is left verbatim because @tool exposes it to the
    # model as the tool description.
    # Fixed mock listing; only the city name varies. The four leading spaces
    # on the numbered lines are part of the returned text.
    listing = (
        f"Found hotels in {city}\n"
        "    1. Grand Hotel - 5 star, $350/night, spa, pool, gym\n"
        "    2. City Inn - 4 star, $250/night, business center\n"
        "    3. Budget Stay - 3 star, $150/night, free wifi"
    )
    return listing

from langchain_core.rate_limiters import InMemoryRateLimiter

# The recorded run of this notebook ended in 429 RESOURCE_EXHAUSTED: the free
# tier allows 5 generate_content requests per minute for gemini-2.5-flash,
# while every "Find hotels" turn makes at least two model calls (tool call +
# final answer) and summarization adds more. One limiter is shared by both
# models because they draw on the same per-project, per-model quota.
# NOTE: this only paces calls made through these objects; quota already used
# by cells run within the previous minute can still produce a 429.
gemini_rate_limiter = InMemoryRateLimiter(
    requests_per_second=4 / 60,  # stay just below the 5 requests/minute cap
    check_every_n_seconds=0.1,   # polling interval while waiting for a slot
    max_bucket_size=1,           # no bursts: one request per refill
)

# Main LLM
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0,
    rate_limiter=gemini_rate_limiter,
)

# Middleware LLM (separate instance is best)
summary_llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0,
    rate_limiter=gemini_rate_limiter,
)

# Token-based policy: summarize once the history is estimated at 550 tokens,
# keeping roughly the most recent 200 tokens of messages.
agent = create_agent(
    model=llm,
    tools=[search_hotels],
    checkpointer=InMemorySaver(),
    middleware=[
        SummarizationMiddleware(
            model=summary_llm,
            trigger=("tokens", 550),
            keep=("tokens", 200),
        )
    ],
)
config = {"configurable": {"thread_id": "test-1"}}

# Rough token estimate for a list of messages
def count_tokens(messages):
    """Estimate the token count of *messages* at 4 characters per token.

    Each message's ``content`` is stringified, the lengths are added up, and
    the total is floor-divided by 4.
    """
    char_count = 0
    for message in messages:
        char_count += len(str(message.content))
    return char_count // 4

In [10]:
# Ask about one city per turn on the same thread so the history keeps growing.
cities = ["Paris", "London", "New York", "Tokyo", "Sydney"]

for city in cities:
    request = {"messages": [HumanMessage(content=f"Find hotels in {city}")]}
    response = agent.invoke(request, config=config)
    history = response["messages"]
    tokens = count_tokens(history)
    # Summary line first, then the full message list for inspection.
    print(f"{city}: {tokens} tokens, {len(history)} messages")
    print(f"{history}")

Paris: 104 tokens, 4 messages
[HumanMessage(content='Find hotels in Paris', additional_kwargs={}, response_metadata={}, id='83d9bc4c-7efa-40f2-b678-33880f6cf837'), AIMessage(content='', additional_kwargs={'function_call': {'name': 'search_hotels', 'arguments': '{"city": "Paris"}'}, '__gemini_function_call_thought_signatures__': {'184af65e-029e-4961-91b4-f09d2c2647e9': 'CvcBAXLI2ny3ubjxhTJ3tLTcfvlagNkLQ3e0ocoPLc2LuNFPLRC/OJY/p1TqN09gLCEeQjXd7LODHpXF+N7SAcRl7dZWNZzEQ2QK1Ya634mUgYONvrXg7pMhL1wLujhkwBuh78Vy3OBwjEDjmwt8EeVVBvtxnbRdQkE1rSQnVyHD19qDcPB6UqWd7wBzFbtke4UnNrpnxsZ3ukadTcjkN87Nxg1FbhQAYnw5h6sdxWJcZwptVHBmx4Vdl4+JXMKcRyoOv9tw1HYY0Siym150xem9e4i3GGGP+YJHMAVdOOBqcI5tEx3CXbRoSEWRmiJab+4LeL2e4iCzrw=='}}, response_metadata={'finish_reason': 'STOP', 'model_name': 'gemini-2.5-flash', 'safety_ratings': [], 'model_provider': 'google_genai'}, id='lc_run--019b5965-fdae-7d90-8949-c84077f10bc7-0', tool_calls=[{'name': 'search_hotels', 'args': {'city': 'Paris'}, 'id': '184af65e-029e-4961-91b4-f09

ChatGoogleGenerativeAIError: Error calling model 'gemini-2.5-flash' (RESOURCE_EXHAUSTED): 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. \n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 5, model: gemini-2.5-flash\nPlease retry in 40.097216435s.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerMinutePerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.5-flash'}, 'quotaValue': '5'}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '40s'}]}}

### Fraction

In [2]:
from langgraph.types import Checkpointer
from langchain.agents import create_agent
from langchain.agents.middleware import SummarizationMiddleware
from langchain_core.tools import tool
from langchain_core.messages import HumanMessage
from langgraph.checkpoint.memory import InMemorySaver
from langchain_google_genai import ChatGoogleGenerativeAI


@tool
def search_hotels(city: str) -> str:
    """search hotels - return long response to use more tokens"""
    # The docstring above is left verbatim because @tool exposes it to the
    # model as the tool description.
    # Two-line mock result; the second line keeps its four leading spaces.
    header = f"Found hotels in {city}"
    first_hit = "    1. Grand Hotel - 5 star, $350/night, spa, pool, gym"
    return "\n".join((header, first_hit))

from langchain_core.rate_limiters import InMemoryRateLimiter

# The recorded run of this cell ended in 429 RESOURCE_EXHAUSTED: the free tier
# allows 5 generate_content requests per minute for gemini-2.5-flash, while
# every "Find hotels" turn makes at least two model calls (tool call + final
# answer). One limiter is shared by both models because they draw on the same
# per-project, per-model quota.
# NOTE: this only paces calls made through these objects; quota already used
# by cells run within the previous minute can still produce a 429.
gemini_rate_limiter = InMemoryRateLimiter(
    requests_per_second=4 / 60,  # stay just below the 5 requests/minute cap
    check_every_n_seconds=0.1,   # polling interval while waiting for a slot
    max_bucket_size=1,           # no bursts: one request per refill
)

llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0,
    rate_limiter=gemini_rate_limiter,
)

# Middleware LLM (separate instance is best)
summary_llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0,
    rate_limiter=gemini_rate_limiter,
)

# Low fraction for testing
# NOTE(review): the earlier comments read "0.5% == 640 tokens" and
# "0.2% == 256 tokens" (percentages of a 128000-token window), which did not
# match the values passed. The values are left unchanged; if 0.5% / 0.2% was
# intended, use 0.005 and 0.002. The fraction is presumably measured against
# the model's own max input size rather than 128000 - confirm for
# gemini-2.5-flash.
agent = create_agent(
    model=llm,
    tools=[search_hotels],
    checkpointer=InMemorySaver(),
    middleware=[
        SummarizationMiddleware(
            model=summary_llm,
            trigger=("fraction", 0.002),  # 0.2% of the context window (256 tokens if it is 128000)
            keep=("fraction", 0.001),     # 0.1% of the context window (128 tokens if it is 128000)
        )
    ],
)
config = {"configurable": {"thread_id": "test-1"}}

# Character-based token estimate
def count_tokens(messages):
    """Return an approximate token count for *messages* (4 chars per token)."""
    content_lengths = (len(str(msg.content)) for msg in messages)
    estimated_tokens, _ = divmod(sum(content_lengths), 4)
    return estimated_tokens

# Test: ask about one city per turn on the same thread so the history grows.

cities = ['Paris', "London", "New York", "Tokyo", "Sydney"]

for city in cities:
    response = agent.invoke(
        {"messages": [HumanMessage(content=f"Find hotels in {city}")]},
        config=config
    )
    tokens = count_tokens(response['messages'])
    # `fraction` was computed but never shown; report it so the run can be
    # compared with the fractional trigger/keep settings.
    # NOTE(review): 128000 is this notebook's assumed context window - confirm
    # it matches the window the middleware uses for gemini-2.5-flash.
    fraction = tokens / 128000
    print(f"{city}: {tokens} tokens ({fraction:.4%} of 128000), {len(response['messages'])} messages")
    print(f"{(response['messages'])}")


Paris: 57 tokens, 4 messages
[HumanMessage(content='Find hotels in Paris', additional_kwargs={}, response_metadata={}, id='7962e06b-725f-4611-a208-857ed7034153'), AIMessage(content='', additional_kwargs={'function_call': {'name': 'search_hotels', 'arguments': '{"city": "Paris"}'}, '__gemini_function_call_thought_signatures__': {'dd4395a1-4541-4cb7-ba0b-66c42fd03da5': 'Co4DAXLI2nzAhFreyd51c0yciyYma5K64tUsPJ/H4ajCp3bOCcj10jrZAFedf0zJS43mPBg0hvT9vH6G81MVxMvMXKNx3pBHkxDILR+c7m/YT0S1F7X/E+K9IUagG0e2aD2H7c04ICFQcgJy6ypTfKZNhGqfotAe4Aiy/Ibz6qICbvmKplXPT1e9SjP81260SE7s/txA2O37IYSrdo4m6UyKVJRd21hr0JDuJwf5mU5HzKMB2HkAKpc4tiNzIWW4hKDq1qnP9wHvtl1TCG3HkCeBSi7pKNQNWhcbmcE6FgSkfOMGG+Yl2tF8a6l1W8SJObwhvj3rzzLtrq2637h29yymzsdzcsNnWWpLQfT9S1TZtdx16eEukRbyoAdD7h3/xhfIdjzL/Q0yKy8NqB7yJalhcXns9T001oo8cwZ7luuehUGxE3vFo1bOL53QZcdYo1gxfy2WBwcmhFcZkJ8jb3lKMQb5YmFb+Mml/3lP525vpJX57Wo0RlTe79Wczuy4w5XVyAvXXs62XWmoSB2whKo='}}, response_metadata={'finish_reason': 'STOP', 'model_name': 'gemini-2.5-flash', 'safety_ra

ChatGoogleGenerativeAIError: Error calling model 'gemini-2.5-flash' (RESOURCE_EXHAUSTED): 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. \n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 5, model: gemini-2.5-flash\nPlease retry in 9.614098124s.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerMinutePerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.5-flash'}, 'quotaValue': '5'}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '9s'}]}}

### Human-in-the-Loop Middleware

Pause agent execution for human approval, editing or rejection of tool calls before they execute. Human-in-the-loop (HITL) is useful for the following:

- High-stakes operations requiring human approval (e.g. database writes, financial transactions).
- Compliance workflows where human oversight is mandatory.
- Long running conversations where human feedback guides the agent.

In [3]:
from langchain.agents import create_agent
from langchain.agents.middleware import HumanInTheLoopMiddleware
from langgraph.checkpoint.memory import InMemorySaver

def read_email_tool(email_id: str) -> str:
    """Mock function to read an email by its ID"""
    # No mailbox is touched: the reply is a fixed prefix followed by the id.
    prefix = "Email content for ID: "
    return f"{prefix}{email_id}"

def send_email_tool(recipient: str, subject: str, body: str) -> str:
    """Mock function to send an email"""
    # Nothing is sent: the function only returns a confirmation string.
    # `body` is accepted so the signature resembles a real mailer, but the
    # confirmation mentions just the recipient and the subject.
    # A space after "to" was missing, producing e.g. "Email sent tojohn@test.com".
    return f"Email sent to {recipient} with subject: '{subject}'"


In [None]:
# Gate the side-effecting tool behind human review; reading stays automatic.
agent = create_agent(
    model=llm,
    tools=[read_email_tool, send_email_tool],
    # A checkpointer is needed so a paused run can be resumed later.
    checkpointer=InMemorySaver(),
    middleware=[
        HumanInTheLoopMiddleware(
            interrupt_on={
                # The config key is "allowed_decisions" (plural). With the
                # misspelled singular key the entry carried no decisions, so
                # send_email_tool was not configured to pause for approval.
                "send_email_tool": {
                    "allowed_decisions": ["approve", "edit", "reject"],
                },
                # False: this tool runs without asking.
                "read_email_tool": False,
            }
        )
    ],
)

In [None]:
# Dedicated thread for the approval walkthrough.
config = {"configurable": {"thread_id": "test-approve"}}

# A request that should make the agent reach for send_email_tool.
email_request = HumanMessage(
    content="send email to john@test.com with subject 'Hello' and body 'How are you?'"
)
result = agent.invoke({"messages": [email_request]}, config=config)

In [None]:
# Display the raw return value; the following cells check it for an
# "__interrupt__" key before resuming.
result

In [None]:
from langgraph.types import Command

# Approve path: only runs while the previous invoke is paused on an interrupt.
if "__interrupt__" in result:
    print("Paused Approval!!...")

    # Resume the paused thread (same config / thread_id) with the decision.
    # Fixes: the class is `Command` (lowercase `command` was an undefined
    # name), and the payload is a "decisions" list with one entry per pending
    # tool call - the former set-of-dict literal was unhashable.
    result = agent.invoke(
        Command(
            resume={
                "decisions": [
                    {"type": "approve"}
                ]
            }
        ),
        config=config
    )

    print(f"Result: {result['messages'][-1].content}")

In [None]:
from langgraph.types import Command

# Reject path: only runs while `result` still holds a pending interrupt, i.e.
# when the email request was invoked and has not been resumed yet.
if "__interrupt__" in result:
    print("Paused Approval!!...")

    # Resume the paused thread (same config / thread_id) with the decision.
    # Fixes: the class is `Command` (lowercase `command` was an undefined
    # name), and the payload is a "decisions" list with one entry per pending
    # tool call - the former set-of-dict literal was unhashable.
    result = agent.invoke(
        Command(
            resume={
                "decisions": [
                    {"type": "reject"}
                ]
            }
        ),
        config=config
    )

    print(f"Result: {result['messages'][-1].content}")