# Update the required libraries

In [None]:
!pip install -U transformers bitsandbytes
!pip install -U huggingface_hub



# Log in with a Hugging Face token

In [None]:
# Authenticate this session with the Hugging Face Hub.
# NOTE(review): presumably required because the Llama checkpoint loaded below is
# only downloadable by authorized accounts — confirm on the model page.
from huggingface_hub import login
# Called without arguments; in this notebook it rendered an interactive widget
# (see the VBox in the cell output) rather than taking the token from code.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Download and load the 11B model

In [None]:
import io

import requests
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor

model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

# Load the un-quantized checkpoint with bfloat16 weights; device_map="auto"
# leaves device placement to the loader.
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
# Download the sample image defensively: bound the wait with a timeout and fail
# with a clear HTTP error instead of an obscure PIL "cannot identify image"
# error when the server answers with an error page. Reading `response.content`
# (rather than the raw socket stream) also applies any transfer decoding.
response = requests.get(url, timeout=30)
response.raise_for_status()
image = Image.open(io.BytesIO(response.content))

# A single user turn containing one image placeholder followed by the text prompt.
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": "If I had to write a haiku for this one, it would be: "}
    ]}
]
# Render the chat into the model's prompt format, ending with the assistant
# header so that generation continues as the assistant.
input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(
    image,
    input_text,
    # The templated prompt already carries its special tokens, so the
    # processor must not add them a second time.
    add_special_tokens=False,
    return_tensors="pt"
).to(model.device)

# Generate at most 30 new tokens and print the full decoded sequence
# (prompt included, special tokens kept).
output = model.generate(**inputs, max_new_tokens=30)
print(processor.decode(output[0]))


## Load quantized model

In [None]:
# Imports are repeated here so this cell can run without the full-precision
# cell above having been executed.
import requests
import torch
from PIL import Image
from transformers import AutoProcessor, BitsAndBytesConfig, MllamaForConditionalGeneration

model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

# 4-bit weight quantization; matrix multiplications are computed in bfloat16.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    load_in_4bit=True,
)

# Same checkpoint as the full-precision variant, loaded through bitsandbytes.
# NOTE(review): if the bf16 model from the earlier cell is still in memory, both
# copies coexist until `model` is rebound — consider restarting the runtime first.
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=quantization_config,
)

# Processor that prepares the image and chat-formatted text for the model.
processor = AutoProcessor.from_pretrained(model_id)

## Using the quantized model

In [None]:
# Relies on `requests`, `Image`, `model` and `processor` from the previous cell.
import io

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
# Download the sample image defensively: bound the wait with a timeout and fail
# with a clear HTTP error instead of an obscure PIL "cannot identify image"
# error when the server answers with an error page. Reading `response.content`
# (rather than the raw socket stream) also applies any transfer decoding.
response = requests.get(url, timeout=30)
response.raise_for_status()
image = Image.open(io.BytesIO(response.content))

# A single user turn containing one image placeholder followed by the text prompt.
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": "If I had to write a haiku for this one, it would be: "}
    ]}
]

# Render the chat into the model's prompt format, ending with the assistant
# header so that generation continues as the assistant.
input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(
    image,
    input_text,
    # The templated prompt already carries its special tokens, so the
    # processor must not add them a second time.
    add_special_tokens=False,
    return_tensors="pt"
).to(model.device)

# Generate at most 30 new tokens and print the full decoded sequence
# (prompt included, special tokens kept).
output = model.generate(**inputs, max_new_tokens=30)
print(processor.decode(output[0]))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

<|begin_of_text|><|start_header_id|>user<|end_header_id|>

<|image|>If I had to write a haiku for this one, it would be: <|eot_id|><|start_header_id|>assistant<|end_header_id|>

Here's a haiku for the rabbit:

Whiskers twitch with glee
Flopsy ears so bright and keen
Springtime's sweet delight


# Using Groq Inference API

In [None]:
!pip install groq

Collecting groq
  Downloading groq-0.11.0-py3-none-any.whl.metadata (13 kB)
Collecting httpx<1,>=0.23.0 (from groq)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->groq)
  Downloading httpcore-1.0.6-py3-none-any.whl.metadata (21 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->groq)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading groq-0.11.0-py3-none-any.whl (106 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.5/106.5 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx-0.27.2-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpcore-1.0.6-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.0/78.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading h11-0.14.0-py3-none-any.whl (58 kB

In [None]:
import getpass
import os

from groq import Groq

# Never hardcode the API key in the notebook (an empty string only guarantees an
# authentication failure). Read it from the GROQ_API_KEY environment variable
# and fall back to a hidden interactive prompt when the variable is not set.
client = Groq(
    api_key=os.environ.get("GROQ_API_KEY") or getpass.getpass("Groq API key: "),
)

In [None]:
# Llama 3.2 via Groq — text-only request.
# Uses the `client` created in the previous cell.
text_prompt = {
    "role": "user",
    "content": "Explain the importance of fast language models",
}

chat_completion = client.chat.completions.create(
    model="llama-3.2-11b-vision-preview",
    messages=[text_prompt],
)

# The reply text is read from the first returned choice.
first_choice = chat_completion.choices[0]
print(first_choice.message.content)

Fast language models have gained significant attention in recent years, driven by advancements in artificial intelligence, machine learning, and natural language processing. These models are designed to process and generate human-like language quickly and efficiently, making them crucial in various applications and industries. Here are some reasons why fast language models are so important:

1. **Improved conversational AI**: Fast language models enable the creation of conversational AI systems that can engage in fluid, natural-sounding conversations. This opens up possibilities for chatbots, voice assistants, and other interactive applications.
2. **Real-time language processing**: With fast language models, applications can process and respond to user input in real-time, enhancing user experience and interactivity.
3. **Information retrieval**: Fast language models can quickly retrieve relevant information from large databases, enabling applications like question answering, search en

In [None]:
# Llama 3.2 via Groq — request combining a text question with an image URL.
# Uses the `client` created earlier in the notebook.
image_url = "https://img.buzzfeed.com/buzzfeed-static/static/2018-07/16/16/tmp/buzzfeed-prod-web-01/tmp-name-2-27495-1531773764-5_dblbig.jpg?resize=1200:*"

# User turn: the question first, then the image passed by reference (URL).
user_turn = {
    "role": "user",
    "content": [
        {"type": "text", "text": "What's in this image?"},
        {"type": "image_url", "image_url": {"url": image_url}},
    ],
}
# The trailing empty assistant turn is sent exactly as in the original request.
assistant_turn = {"role": "assistant", "content": ""}

completion = client.chat.completions.create(
    model="llama-3.2-11b-vision-preview",
    messages=[user_turn, assistant_turn],
    # Sampling / output controls, passed explicitly.
    temperature=1,
    top_p=1,
    max_tokens=1024,
    stop=None,
    stream=False,  # return the whole answer in one response object
)

# The reply text is read from the first returned choice.
first_choice = completion.choices[0]
print(first_choice.message.content)


The image showcases a delightful scene of eight adorable panda bear cubs lounging on a wooden floor.

The cubs, with their distinctive black and white fur, are peacefully sleeping in various positions, some lying on their stomachs, backs, or sides. Their eyes are closed as they rest comfortably, soaking up the warmth in the room.

The wooden floor, partially illuminated by natural light, provides a cozy and natural setting for the cubs to catch some Z's. A few wooden logs and a basket are scattered nearby, adding to the rustic ambiance. The top-left corner of a plain white wall is barely visible, completing the serene atmosphere of the scene. Overall, the image exudes tranquility and playfulness, capturing the charm of these adorable animals in their in a peaceful and intimate setting.
