In [1]:
import torch
from langchain_community.llms import Ollama
from langchain.agents import Tool, initialize_agent
from langchain.schema import SystemMessage, HumanMessage

In [2]:
def clear_cuda_memory():
    """
    Release PyTorch's cached CUDA memory to free up GPU resources between queries.

    Python garbage collection is run first so that tensors which are no longer
    referenced are actually destroyed; only then is PyTorch asked to give its
    unoccupied cached blocks back. Does nothing if CUDA is not available.

    Returns:
        None. Prints a confirmation line when the cache was cleared.
    """
    # NOTE(review): this only touches memory held by PyTorch inside this Python
    # process. The LLM in this notebook is served through Ollama, presumably in
    # a separate process, so its GPU memory is likely unaffected -- confirm.
    import gc

    if not torch.cuda.is_available():
        return

    # Order matters: empty_cache() can only release blocks that no live tensor
    # occupies, so unreachable objects must be collected before it is called.
    gc.collect()
    torch.cuda.empty_cache()

    print("CUDA memory cache cleared")

In [3]:
# Release any cached CUDA memory once up front, before the model and agent are set up.
clear_cuda_memory()

CUDA memory cache cleared


In [4]:
# Ollama model tag handed to the LLM wrapper.
# Works: qwen3:1.7b, qwen3:latest, hir0rameel/qwen-claude:latest, mistral-small3.2:latest
# Not working (consistently): llama3.2, qwq, qwen3:0.6b, qwen3:14b, qwen3:30b-a3b,
# deepseek-r1:latest, atombuild/deepseek-r1-claude3.7:14b
llm_model = 'mistral-small3.2:latest'

In [5]:
def get_current_weather(location=None, format=None, **kwargs):
    """
    Return a canned weather report; no real weather service is queried.

    Args:
        location: Place to report on. Any falsy value is shown as "Unknown".
        format: Temperature unit label. Any falsy value is shown as "celsius".
        **kwargs: Accepted and ignored.

    Returns:
        str: "Sample weather for <location> in <format> is 20° and sunny!"
    """
    # Substitute defaults for missing/empty arguments.
    place = location if location else "Unknown"
    unit = format if format else "celsius"

    report = f"Sample weather for {place} in {unit} is 20° and sunny!"
    return report

# Expose the sample weather function to the agent as a tool; the description is
# what the model reads when deciding whether to call it.
weather_tool = Tool(
    name="get_current_weather",
    func=get_current_weather,
    description="Get the current weather for a specific location. ONLY use this tool when the user explicitly asks about weather conditions, temperature, or weather forecasts for a particular place."
)

# LLM wrapper for the Ollama model selected in `llm_model`.
llm = Ollama(model=llm_model)

# Instructions telling the model when (not) to use tools.
system_prompt = """You are a smart assistant. Analyze the user query carefully and answer it appropriately.

ONLY call a tool if:
1. The user query specifically requires information that the tool can provide
2. The tool is directly relevant to answering the user's question

For general conversation, greetings, or questions that don't require tool functionality, respond directly without calling any tools.

Available tools:
- get_current_weather: Only use this when the user specifically asks about weather information for a location."""

agent = initialize_agent(
    tools=[weather_tool],
    llm=llm,
    # The agent type is selected with `agent=`; `agent_type=` is not a parameter
    # of initialize_agent and would only be forwarded as an extra executor kwarg.
    agent="zero-shot-react-description",  # Better reasoning about tool usage
    # The zero-shot ReAct agent builds one text prompt (prefix + tool list +
    # format instructions + suffix). Supplying the instructions as the prefix
    # makes them part of that prompt; a `system_message=` keyword is not
    # consumed by this agent type.
    agent_kwargs={"prefix": system_prompt},
)


  llm = Ollama(model=llm_model)
  agent = initialize_agent(


In [6]:
# Ask the same weather question 100 times to see how consistently the agent answers it.
weather_query = "What's the weather in Aachen, in celsius?"
for i in range(100):
    print(i)
    response = agent.invoke(weather_query)
    print(response)
    # Free cached CUDA memory between runs.
    clear_cuda_memory()

0
{'input': "What's the weather in Aachen, in celsius?", 'output': 'The current weather in Aachen is 20° Celsius and sunny.'}
CUDA memory cache cleared
1
{'input': "What's the weather in Aachen, in celsius?", 'output': 'The current weather in Aachen is 20° Celsius and sunny.'}
CUDA memory cache cleared
2
{'input': "What's the weather in Aachen, in celsius?", 'output': 'The current weather in Aachen is 20° Celsius and sunny.'}
CUDA memory cache cleared
3
{'input': "What's the weather in Aachen, in celsius?", 'output': 'The current weather in Aachen is 20° Celsius and sunny!'}
CUDA memory cache cleared
4
{'input': "What's the weather in Aachen, in celsius?", 'output': 'The current weather in Aachen is 20° Celsius and sunny.'}
CUDA memory cache cleared
5
{'input': "What's the weather in Aachen, in celsius?", 'output': 'The current weather in Aachen is 20° Celsius and sunny.'}
CUDA memory cache cleared
6
{'input': "What's the weather in Aachen, in celsius?", 'output': 'The current weather 

In [7]:
# Send a plain greeting 100 times to see how consistently the agent answers it.
greeting_query = "Hi how are u?"
for i in range(100):
    print(i)
    response2 = agent.invoke(greeting_query)
    print(response2)
    # Free cached CUDA memory between runs.
    clear_cuda_memory()

0
{'input': 'Hi how are u?', 'output': "Hi! I'm just a computer program, so I don't have feelings, but I'm here and ready to help you with anything you need! How can I assist you today?"}
CUDA memory cache cleared
1
{'input': 'Hi how are u?', 'output': "Hi! I'm just a computer program, so I don't have feelings, but I'm here and ready to help you with any questions or tasks you have! How can I assist you today?"}
CUDA memory cache cleared
2
{'input': 'Hi how are u?', 'output': "Hi! I'm just a computer program, so I don't have feelings, but I'm here and ready to help you with any questions or tasks you have! How can I assist you today?"}
CUDA memory cache cleared
3
{'input': 'Hi how are u?', 'output': "Hi! I'm just a computer program, so I don't have feelings, but I'm here and ready to help you with anything you need! How can I assist you today?"}
CUDA memory cache cleared
4
{'input': 'Hi how are u?', 'output': "Hi! I'm just a computer program, so I don't have feelings, but I'm here and 