In [None]:
from dotenv import load_dotenv, find_dotenv
from smolagents import (
    CodeAgent,
    DuckDuckGoSearchTool,
    InferenceClientModel,
    VisitWebpageTool,
    WikipediaSearchTool,
    FinalAnswerTool,
    PythonInterpreterTool,
)
# import nest_asyncio

# nest_asyncio.apply()

# Load variables from the .env file located by find_dotenv(); assigning the
# return value to `_` keeps it out of the cell output.
_ = load_dotenv(find_dotenv())

In [19]:
from huggingface_hub import model_info, InferenceClient


In [None]:
# Raw Hugging Face InferenceClient, configured with provider="hyperbolic".
# bill_to presumably names the account that is charged for requests — confirm.
client = InferenceClient(
    provider="hyperbolic",
    bill_to="VitalNest",
)


In [2]:
# IPython shell capture: fetch the Hugging Face models API listing with curl and
# print every entry's `id` with jq; the output lines are stored in the variable.
# NOTE(review): jq runs without -r, so the ids presumably keep their JSON double
# quotes, and the endpoint is queried without any filter despite the variable
# name — confirm both are intended.
all_inference_models = !curl -s https://huggingface.co/api/models | jq ".[].id"

In [29]:

# Look up the model's Hub metadata, requesting the "inference" field via
# `expand`, then show that field as the cell output.
info = model_info("google/gemma-3-27b-it", expand="inference")
info.inference


'warm'

In [22]:
# Vision-language model wrapper used by the document question-answering cells.
vlm_model = InferenceClientModel(
    model_id="Qwen/Qwen2.5-VL-72B-Instruct",
    bill_to="VitalNest",
    timeout=300,
    temperature=0.1,
    max_tokens=10_000,
)

In [1]:
from PIL import Image
from io import BytesIO
import base64
from pathlib import Path

In [2]:
def encode_image_to_base64(image_path):
    """
    Encodes an image file to a base64 string.

    The file is opened with PIL, re-encoded as PNG into an in-memory buffer,
    and the PNG bytes are base64-encoded. The result is therefore always PNG
    data, whatever the format of the file on disk.

    Args:
        image_path (str | pathlib.Path): Path to the image file.

    Returns:
        str: Base64 encoded PNG image string (no ``data:`` URL prefix).
    """
    buffered = BytesIO()
    # The context manager releases the underlying file handle as soon as the
    # image has been re-encoded, instead of leaving that to garbage collection.
    with Image.open(image_path) as image:
        image.save(buffered, format="PNG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")


In [36]:
# MAXIMUM of 4 images can be sent
MAX_IMAGES = 4

# Directory that holds the rendered page images of the PDF.
PAGES_DIR = Path("../src/pages/optima-restore-revision.pdf")

# Page numbers to send, in the order they are attached to the prompt.
# Pages 43 and 44 are currently left out; add them here if needed.
PAGE_NUMBERS = (42, 12, 1)

img_paths = [
    PAGES_DIR / f"optima-restore-revision.pdf_page_{page_number}.png"
    for page_number in PAGE_NUMBERS
]

# Fail early here rather than at request time if the selection grows too large.
assert len(img_paths) <= MAX_IMAGES, (
    f"at most {MAX_IMAGES} images can be sent, got {len(img_paths)}"
)

In [37]:
# Base64-encode every page listed in img_paths. Despite the name, the list holds
# base64 strings, not PIL images; the PIL-object variant is kept below for reference.
# grouped_images = [Image.open(fp=pth) for pth in img_paths]
grouped_images = [encode_image_to_base64(pth) for pth in img_paths]

In [38]:
# Question for the model; the closing sentence asks it to admit when the
# supplied pages do not contain the answer.
text_query = """what critical illnesses are covered under optima restore?
If the answer isn't in these documents, say you don't know."""

# Two-message conversation: a system instruction restricting answers to the
# supplied documents, then a user turn holding one image part per encoded page
# followed by the question text.
chat_template = [
    {
        "role": "system",
        "content": """You find answers from the relevant documents. Answer only 
from these documents. If answer isn't available return 'Question cannot be answered based
on the documents provided.' """,
    },
    {
        "role": "user",
        "content": [
            {
                "type": "image_url",
                # encode_image_to_base64 always re-encodes to PNG, so the data
                # URL must declare image/png to match the payload.
                "image_url": {"url": f"data:image/png;base64,{image}"},
            }
            for image in grouped_images
        ]
        + [{"type": "text", "text": text_query}],
    },
]


In [39]:
# Send the system + user messages to vlm_model and print the reply's `content`.
print(vlm_model.generate(messages=chat_template).content)

The critical illnesses covered under Optima Restore include Cancer, Open Chest CABG, First Heart Attack, Kidney Failure, Major Organ/Bone Marrow Transplant, Multiple Sclerosis, Permanent Paralysis of Limbs, and Stroke.


In [41]:
# Same conversation as the previous cell, this time sent through the raw
# InferenceClient chat-completions API instead of the smolagents wrapper.
completion = client.chat.completions.create(
    messages=chat_template,
    model="Qwen/Qwen2.5-VL-72B-Instruct",
    max_tokens=10000,
    temperature=0.1,
)

# Only the first choice is shown.
print(completion.choices[0].message.content)

# completion = client.chat.completions.create(
#     model="Qwen/Qwen2.5-VL-72B-Instruct",
#     messages=[
#         {
#             "role": "user",
#             "content": [
#                 {"type": "text", "text": "Describe the images in one sentence."},
#                 {
#                     "type": "image_url",
#                     "image_url": {
#                         "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
#                     },
#                 },
#                 {
#                     "type": "image_url",
#                     "image_url": {
#                         "url": "https://upload.wikimedia.org/wikipedia/commons/3/3d/Nicu%C8%99or_Dan_%288_mai_2025%29.jpg"
#                     },
#                 },
#             ],
#         }
#     ],
# )

# print(completion.choices[0].message)


The critical illnesses covered under Optima Restore include Cancer, Open Chest CABG, First Heart Attack, Kidney Failure, Major Organ/Bone Marrow Transplant, Multiple Sclerosis, Permanent Paralysis of Limbs, and Stroke.


In [43]:
from smolagents import Tool

class SaveContent(Tool):
    """Agent tool that writes a piece of text to a file.

    The class attributes describe the tool (name, description, input schema
    and output type); ``forward`` performs the write.
    """

    name = "SaveContent"
    description = "Saves content to a file."
    inputs = {
        "content": {
            "type": "string",
            "description": "The content to save."
        },
        "filename": {
            "type": "string",
            "description": "The filename."
        }
    }
    output_type = "string"

    def forward(self, content: str, filename: str) -> str:
        """Write ``content`` to ``filename`` and return a confirmation message.

        Args:
            content: Text to write.
            filename: Path of the file to write; opened in "w" mode, so an
                existing file is overwritten.

        Returns:
            The string ``"Content saved to <filename>"``.

        Raises:
            OSError: If the file cannot be opened or written.
        """
        # Pin the encoding: the platform default may be unable to represent
        # non-ASCII characters that model-generated text often contains.
        with open(filename, "w", encoding="utf-8") as f:
            f.write(content)
        return f"Content saved to {filename}"



In [None]:
# General-purpose code agent with web search, page fetching, Python execution
# and Wikipedia lookup tools. No model_id is passed to InferenceClientModel here.
# NOTE(review): "os" is on the authorized-import list, which lets generated
# code touch the local system — confirm this is intended.
agent = CodeAgent(
    tools=[
        DuckDuckGoSearchTool(),
        VisitWebpageTool(),
        PythonInterpreterTool(),
        WikipediaSearchTool(),
        FinalAnswerTool(),
    ],
    model=InferenceClientModel(bill_to="VitalNest"),
    additional_authorized_imports=["os", "requests", "bs4"],
    max_steps=10,
    planning_interval=2,
)

In [None]:
# The prompt has no placeholders, so it is a plain triple-quoted string.
# The call is the last expression, so its result is shown as the cell output.
agent.run("""Search for multi latent attention in wikipedia.
If you cannot find a wikipedia page for it, search the entire wikipedia site for it.
          Retrieve the top search results and the relevant content.""")

In [2]:
from smolagents import ToolCallingAgent

In [9]:
# Tool-calling agent dedicated to web lookups: it can run DuckDuckGo searches
# (5 results each) and fetch pages (output capped at 20,000 characters).
websearch_agent = ToolCallingAgent(
    name="web_search_agent",
    description="Searches the web with a particular query.",
    model=InferenceClientModel(
        model_id="Qwen/Qwen3-30B-A3B",
        bill_to="VitalNest",
        temperature=0.1,
    ),
    tools=[
        VisitWebpageTool(max_output_length=20_000),
        DuckDuckGoSearchTool(max_results=5),
    ],
    max_steps=4,
    planning_interval=2,
    verbosity_level=0,
)

In [None]:
# websearch_agent.run("what's multi head latent attention?")

In [11]:
# Tool-calling agent restricted to a single Wikipedia search tool.
wikipedia_agent = ToolCallingAgent(
    name="wikipedia_agent",
    description="Searches Wikipedia for a topic.",
    model=InferenceClientModel(
        model_id="Qwen/Qwen3-30B-A3B",
        bill_to="VitalNest",
        temperature=0.1,
    ),
    tools=[
        WikipediaSearchTool(user_agent="WikiAssistant (merlin@example.com)"),
    ],
    max_steps=3,
    verbosity_level=0,
)

In [16]:
# Orchestrating code agent: it has its own Python and final-answer tools and is
# given the web-search and Wikipedia agents as managed agents.
# NOTE(review): "os" is on the authorized-import list — confirm this is intended.
manager_agent = CodeAgent(
    model=InferenceClientModel(
        model_id="Qwen/Qwen3-235B-A22B",
        bill_to="VitalNest",
        temperature=0.1,
    ),
    tools=[PythonInterpreterTool(), FinalAnswerTool()],
    managed_agents=[websearch_agent, wikipedia_agent],
    additional_authorized_imports=["os", "requests", "bs4"],
    max_steps=10,
    planning_interval=3,
    verbosity_level=0,
)

In [17]:
# Run the manager on the research task: Wikipedia first, the web as fallback.
# The call is the last expression, so its return value is the cell output.
manager_agent.run("""Search for `topic`:multi head latent attention in wikipedia.
                  If there's no information about the same in Wikipedia, search the web 
                  for it and return a summary of the findings along with the search result links.
"""
)

'  \n**Multi-Head Latent Attention (MLA)** is an efficient attention mechanism used in large language models (LLMs). It replaces traditional Grouped Query Attention (GQA) by employing low-rank matrices and up-projection matrices to compress key-value (KV) caches, reducing memory overhead by 93% and enabling faster inference. Key features include:  \n- **Efficiency**: Achieves a 10.6x speedup at 8K context length while maintaining output quality.  \n- **Integration**: Central to DeepSeek models (V2/V3/R1), paired with FP8 quantization and Multi-Token Prediction.  \n- **Technical Innovations**: Uses low-rank matrices for KV cache compression and decoupled Rotary Position Embeddings (RoPE).  \n- **Applications**: Enables deployment of large models on consumer-grade hardware by reducing memory bottlenecks.  \n\n**Sources**:  \n1. [arXiv paper (2502.07864)](https://arxiv.org/abs/2502.07864)  \n2. [Medium article on DeepSeek-V3](https://medium.com/data-science/deepseek-v3-explained-1-multi-h

In [1]:
import torch
from PIL import Image
from transformers.utils.import_utils import is_flash_attn_2_available

from colpali_engine.models import ColQwen2_5, ColQwen2_5_Processor


In [None]:

# Choose the device at run time instead of assuming a CUDA GPU, so the cell
# also works on Apple Silicon ("mps") and falls back to the CPU otherwise.
if torch.cuda.is_available():
    device = "cuda:0"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Load the ColQwen2.5 retrieval model in bfloat16 and switch it to eval mode.
# FlashAttention 2 is requested only when the helper reports it as available.
model = ColQwen2_5.from_pretrained(
    pretrained_model_name_or_path="vidore/colqwen2.5-v0.2",
    torch_dtype=torch.bfloat16,
    device_map=device,
    attn_implementation="flash_attention_2" if is_flash_attn_2_available() else None,
).eval()

# Matching processor, loaded from the same checkpoint name as the model.
processor = ColQwen2_5_Processor.from_pretrained("vidore/colqwen2.5-v0.2", use_fast=True)


Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:

# Toy inputs: two blank 128x128 RGB images and two text queries.
images = [Image.new("RGB", (128, 128), color=shade) for shade in ("white", "black")]
queries = [
    "Is attention really all you need?",
    "What is the amount of bananas farmed in Salvador?",
]

# Preprocess each modality and move the resulting batch to the model's device.
batch_images = processor.process_images(images).to(model.device)
batch_queries = processor.process_queries(queries).to(model.device)

# Embed both batches with gradient tracking disabled.
with torch.no_grad():
    image_embeddings = model(**batch_images)
    query_embeddings = model(**batch_queries)

# Score the query embeddings against the image embeddings.
scores = processor.score_multi_vector(query_embeddings, image_embeddings)
