In [1]:
#%pip install transformers accelerate datasets bitsandbytes pillow

In [11]:
import os

llama_models_path = os.getenv("LLAMA_MODELS_PATH")
if not llama_models_path:
    raise ValueError("Where are the models?")

In [12]:
import sys
from PIL import Image as PIL_Image
import torch
from transformers import MllamaForConditionalGeneration, MllamaProcessor
from accelerate import Accelerator
   
model_path = os.path.join(llama_models_path, "Llama3.2-11B-Vision-Instruct")

accelerator = Accelerator()
device = accelerator.device

In [13]:
def load_model_and_processor(model_path: str):
    """
    Load the model and processor based on the 11B Vision model.
    """
    model = MllamaForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.bfloat16, use_safetensors=False, device_map="auto",
                                                              local_files_only=True)
    processor = MllamaProcessor.from_pretrained(model_path, use_safetensors=False,     local_files_only=True)

    model, processor = accelerator.prepare(model, processor)
    return model, processor


def process_image(image_path: str) -> PIL_Image.Image:
    """
    Open and convert an image from the specified path.
    """
    if not os.path.exists(image_path):
        print(f"The image file '{image_path}' does not exist.")
        sys.exit(1)
    with open(image_path, "rb") as f:
        return PIL_Image.open(f).convert("RGB")


def generate_text_from_image(model, processor, image, prompt_text: str, temperature: float, top_p: float):
    """
    Generate text from an image using the model and processor.
    """
    conversation = [
        {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
    ]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
    inputs = processor(image, prompt, return_tensors="pt").to(device)
    output = model.generate(**inputs, temperature=temperature, top_p=top_p, max_new_tokens=512)
    return processor.decode(output[0])[len(prompt):]

image_path = "/opt/Data/sample-images/car.bmp"

image = process_image(image_path)

model, processor = load_model_and_processor(model_path)

prompt_text = "Describe this image in detail."

generated_text = generate_text_from_image(model, processor, image, prompt_text, temperature=0.7, top_p=0.9)

print("Generated Description:", generated_text)

OSError: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory /home/prateek/.llama/checkpoints/Llama3.2-11B-Vision-Instruct.

In [10]:
ls /home/prateek/.llama/checkpoints/Llama-3.2-11B-Vision-Instruct

ls: cannot access '/home/prateek/.llama/checkpoints/Llama-3.2-11B-Vision-Instruct': No such file or directory


In [9]:
ls /home/prateek/.llama/checkpoints/Llama3.2-11B-Vision-Instruct

checklist.chk  consolidated.00.pth  params.json  tokenizer.model
