In [1]:
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
import torch
from PIL import Image
import requests
from datasets import load_dataset
import os

In [2]:
output_path = os.path.join(os.curdir, "model_responses.json")

In [3]:
device = torch.device('cuda' if torch.cuda.is_available else 'cpu')

In [4]:
print(device)

cuda


In [5]:
processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")

In [6]:
model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map=device) 
model.to(device)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

LlavaNextForConditionalGeneration(
  (vision_tower): CLIPVisionModel(
    (vision_model): CLIPVisionTransformer(
      (embeddings): CLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
        (position_embedding): Embedding(577, 1024)
      )
      (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (encoder): CLIPEncoder(
        (layers): ModuleList(
          (0-23): 24 x CLIPEncoderLayer(
            (self_attn): CLIPSdpaAttention(
              (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
            )
            (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (mlp): CLIPMLP(
              (activation_fn

In [7]:
#check where the tensors are allocated
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

NVIDIA GeForce RTX 4090
Memory Usage:
Allocated: 14.1 GB
Cached:    14.2 GB


In [8]:
dataset = load_dataset("lmms-lab/POPE", "default")

In [9]:
dataset = dataset['test'].filter(lambda x: x['category'] == 'adversarial')

In [10]:
def generate_response(question, image):
    """ Prompt model with question regarding image and generate response.

    Args:
        question (str): question regarding the image content
        image_path (str): PIL image object
    
    Returns:
        response (str): model's response to the question
    """
    inputs = processor(images=image, text=question, return_tensors="pt").to(device)
    
    with torch.no_grad():
        outputs = model.generate(**inputs)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return response

In [11]:
def generate_response_new(question,image):
    conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": question},
        ],
    },
]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    inputs = processor(image, prompt, return_tensors="pt").to("cuda:0")
    _, length_inputs = inputs["input_ids"].shape
    # print("length of inputs: ", length_inputs)
    # autoregressively complete prompt
    output = model.generate(**inputs, max_new_tokens=100)
    output = output[:, length_inputs:]
    
    return processor.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

In [None]:
responses = []
for idx in range(len(dataset)):
    question = dataset['question'][idx]
    image = dataset['image'][idx]
    response = generate_response_new(question, image)

    responses.append({
        'question': question,
        'response': response
    })


In [None]:
# Write responses to file
with open(output_path, 'w') as f:
    json.dump(responses, f, indent=4)

print(f"LLaVa's responses have been saved to {output_path}")