In [1]:
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
import torch
import json
from PIL import Image
import requests
from datasets import load_dataset
import os

In [2]:
output_path = os.path.join(os.curdir, "model_responses.json")

In [3]:
device = torch.device('cuda' if torch.cuda.is_available else 'cpu')

In [4]:
print(device)

cuda


In [5]:
processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")

In [6]:
model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map=device) 
model.to(device)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

LlavaNextForConditionalGeneration(
  (vision_tower): CLIPVisionModel(
    (vision_model): CLIPVisionTransformer(
      (embeddings): CLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
        (position_embedding): Embedding(577, 1024)
      )
      (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (encoder): CLIPEncoder(
        (layers): ModuleList(
          (0-23): 24 x CLIPEncoderLayer(
            (self_attn): CLIPSdpaAttention(
              (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
            )
            (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (mlp): CLIPMLP(
              (activation_fn

In [7]:
#check where the tensors are allocated
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

NVIDIA GeForce RTX 4090
Memory Usage:
Allocated: 14.1 GB
Cached:    14.2 GB


In [8]:
#this is to load pope
# dataset = load_dataset("lmms-lab/POPE", "default")
# dataset = dataset['test'].filter(lambda x: x['category'] == 'adversarial')

In [9]:
#this is to load hallusion bench
dataset = load_dataset("lmms-lab/HallusionBench", "default")
dataset = dataset['image']

In [10]:
def generate_response(question, image):
    """ Prompt model with question regarding image and generate response.

    Args:
        question (str): question regarding the image content
        image_path (str): PIL image object
    
    Returns:
        response (str): model's response to the question
    """
    inputs = processor(images=image, text=question, return_tensors="pt").to(device)
    
    with torch.no_grad():
        outputs = model.generate(**inputs)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return response

In [11]:
def generate_response_new(question,image):
    conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": question},
        ],
    },
]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    inputs = processor(image, prompt, return_tensors="pt").to("cuda:0")
    _, length_inputs = inputs["input_ids"].shape
    # print("length of inputs: ", length_inputs)
    # autoregressively complete prompt
    output = model.generate(**inputs, max_new_tokens=100)
    output = output[:, length_inputs:]
    
    return processor.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

In [12]:
#access the RAM tags
def tags_to_dict(filepath):
    filepath = filepath
    ram_data = {}
    with open(filepath, 'r') as file:
        for line in file:
            line = line.strip().rstrip(",")
    
            data_entry = json.loads(line)
    
            ram_data.update(data_entry)
    return ram_data

In [13]:
pope_tag_path =  "../../models/recognize-anything/pope_tags.json"
hallusion_tag_path = "../../models/recognize-anything/hallusionBench_tags.json"

In [18]:
ram_data = tags_to_dict(hallusion_tag_path)

In [15]:
def obtain_attributes(img_src):
    """
    Returns the attributes identified by RAM.
    """
    injection = "This image has these attributes: "
    image_attrs = ram_data[img_src]
    image_attrs = image_attrs.replace('|',' ').split()
    injection = "This image has these attributes: "
    for i in range(len(image_attrs)):
        if i == len(image_attrs) - 2:
            injection = injection + image_attrs[i]+", and "
        elif i == len(image_attrs) - 1:
            injection = injection + image_attrs[i] +". "
        else:
            injection = injection + image_attrs[i] + ", "
    return injection

In [16]:
def inject_info(img_src, question):
    """
    Injects prompt with any needed information. So given question, it will tell the lvlm also what it contains.
    Should fine-tune prompt later.
    """
    image_attrs = ram_data[img_src]
    image_attrs = image_attrs.replace('|',' ').split()
    injection = obtain_attributes(img_src)
    injection = injection + f"Using this information answer the following question: {question}"
    return injection

In [19]:
responses = []
for idx in range(len(dataset)):
    question = dataset['question'][idx]
    image = dataset['image'][idx]
    img_source = dataset['filename'][idx] #this is for hallusion bench
    # img_source = dataset['image_source'][idx]#this is for pope
    prompt = inject_info(img_source, question)
    response = generate_response_new(question, image)
    responses.append({
        'question': question,
        'response': response
    })


Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.


This image has these attributes: graph, number, and individual. Using this information answer the following question: Is China, Hongkong SAR, the leading importing country of gold, silverware, and jewelry with the highest import value in 2018? : The image you've provided shows a bar chart comparing the import value of gold, silverware, and jewelry among several countries in 2018. According to the chart, China, Hong Kong SAR, is indeed the leading importer of these goods, with an import value of $10,000. This is significantly higher than the import values of other countries listed on the chart. 


In [None]:
llava_pope_output_path = os.path.join(os.curdir, "internvl_pope_responses.json")
llava_hallusion_output_path =  os.path.join(os.curdir, "internvl_hallusion_responses.json")

In [None]:
# Write responses to file
output_path = llava_pope_output_path
with open(output_path, 'w') as f:
    json.dump(responses, f, indent=4)

print(f"LLaVa's responses have been saved to {output_path}")