# LiteLLM API
* Integrate locally running Ollama LLMs through LiteLLM
* https://docs.litellm.ai/docs/
* https://docs.litellm.ai/docs/providers/ollama

## Imports

In [2]:
import os
from dotenv import load_dotenv

#from litellm import completion
import litellm

## Load config

In [3]:
# Load variables from a local .env file into the process environment.
load_dotenv()

# Read the raw values first so that a missing variable is detected *before* it
# is interpolated into a string: f"ollama/{None}" would produce the truthy
# string "ollama/None" and hide the missing setting.
OLLAMA_NEW_BASE_URL = os.getenv("OLLAMA_NEW_BASE_URL")
_llama3_3b_name = os.getenv("LLAMA3_3B")

# Model identifier with the "ollama/" prefix; None when LLAMA3_3B is unset or empty.
LLAMA3_3B_MODEL = f"ollama/{_llama3_3b_name}" if _llama3_3b_name else None

# Both settings are mandatory. The parentheses matter: `not a and b` parses as
# `(not a) and b`, which does not check that both values are present.
if not (OLLAMA_NEW_BASE_URL and LLAMA3_3B_MODEL):
    print("One or more mandatory config missing.")
else:
    print(f"Config loaded successfully for Ollama. \nBase URL: {OLLAMA_NEW_BASE_URL} \nModel: {LLAMA3_3B_MODEL}")

Config loaded successfully for Ollama. 
Base URL: http://localhost:11434 
Model: ollama/llama3.2:3b


## Connect to LLM

In [9]:
# Optional: uncomment the call below to turn on LiteLLM debugging while
# troubleshooting. It is left disabled so the notebook output stays short.
#litellm._turn_on_debug()


In [10]:
# Chat messages for the request: a system message setting the assistant's
# persona, followed by the user's prompt.
payload = [
    dict(role='system', content='You are a funny assistant.'),
    dict(role='user', content='tell me a joke!'),
]

# Single chat-completion request through LiteLLM, using the base URL and
# model name loaded in the config cell.
response = litellm.completion(
    model=LLAMA3_3B_MODEL,
    messages=payload,
    base_url=OLLAMA_NEW_BASE_URL,
)

# Print the whole response object first, for inspection.
print(f"**Raw Response:** \n{response}\n\n")

# Then print only the message text of the first choice.
first_choice = response.choices[0]
print(f"**Joke:** \n{first_choice.message.content}")

**Raw Response:** 
ModelResponse(id='chatcmpl-941401da-7361-4816-a96e-797783b87a50', created=1762389242, model='ollama/llama3.2:3b', object='chat.completion', system_fingerprint=None, choices=[Choices(finish_reason='stop', index=0, message=Message(content='Here\'s one that\'s "punderful":\n\nWhy couldn\'t the bicycle stand up by itself?\n\n(Wait for it...)\n\nBecause it was two-tired! Get it?', role='assistant', tool_calls=None, function_call=None, provider_specific_fields=None, reasoning_content=None))], usage=Usage(completion_tokens=36, prompt_tokens=42, total_tokens=78, completion_tokens_details=None, prompt_tokens_details=None))


**Joke:** 
Here's one that's "punderful":

Why couldn't the bicycle stand up by itself?

(Wait for it...)

Because it was two-tired! Get it?


In [14]:
# Token usage reported on the response from the completion cell.
usage = response.usage
print(f"Input tokens: {usage.prompt_tokens}")
print(f"Output tokens: {usage.completion_tokens}")
print(f"Total tokens: {usage.total_tokens}")

# Extra metadata that LiteLLM attaches to the response object; the
# 'response_cost' entry is printed on its own line after the full dict.
hidden_params = response._hidden_params
print(f"Hidden Params in response: {hidden_params}")
print(f"Response Cost: {hidden_params['response_cost']}")

Input tokens: 42
Output tokens: 36
Total tokens: 78
Hidden Params in response: {'custom_llm_provider': 'ollama', 'region_name': None, 'optional_params': {}, 'litellm_call_id': '2271f8d9-0dc1-4c43-844f-add355824d84', 'api_base': None, 'model_id': None, 'response_cost': 0.0, 'additional_headers': {}, 'litellm_model_name': 'ollama/llama3.2:3b'}
Response Cost: 0.0
