In [43]:
import os
import json
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
import torchvision.transforms as T
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
import re

In [2]:
# `is_available` has to be *called*: the bare function object is always truthy,
# which would select 'cuda' even on a machine without a usable GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
# Show which torch device was selected above.
print(device)

cuda


In [4]:
# Per-channel (3-value) mean and standard deviation handed to T.Normalize
# in build_transform.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Build the preprocessing pipeline applied to each image tile.

    Args:
        input_size (int): side length; images are resized to
            (input_size, input_size).

    Returns:
        A ``T.Compose`` that converts the image to RGB when it is in another
        mode, resizes it with bicubic interpolation, turns it into a tensor
        and normalizes it with IMAGENET_MEAN / IMAGENET_STD.
    """
    def _ensure_rgb(img):
        # Leave RGB images untouched; convert every other mode.
        return img if img.mode == 'RGB' else img.convert('RGB')

    steps = [
        T.Lambda(_ensure_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the candidate grid whose aspect ratio is nearest to the image's.

    Args:
        aspect_ratio (float): width / height of the source image.
        target_ratios: iterable of (columns, rows) candidates, visited in order.
        width, height: source image dimensions, used only for tie-breaking.
        image_size: side length of one tile.

    Returns:
        The chosen element of ``target_ratios``, or ``(1, 1)`` when there are
        no candidates.
    """
    image_area = width * height
    chosen = (1, 1)
    smallest_gap = float('inf')
    for candidate in target_ratios:
        gap = abs(aspect_ratio - candidate[0] / candidate[1])
        if gap < smallest_gap:
            smallest_gap, chosen = gap, candidate
            continue
        # On an exact tie, switch to the later candidate only when the source
        # image covers more than half of that candidate's total tile area.
        if gap == smallest_gap and image_area > 0.5 * image_size * image_size * candidate[0] * candidate[1]:
            chosen = candidate
    return chosen

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    """Resize an image onto a tile grid and cut it into square tiles.

    Args:
        image: object exposing ``size``, ``resize`` and ``crop`` (a PIL image
            in this notebook).
        min_num (int): minimum number of tiles in a candidate grid.
        max_num (int): maximum number of tiles in a candidate grid.
        image_size (int): side length of each tile.
        use_thumbnail (bool): when True and more than one tile was produced,
            append the whole image resized to one tile.

    Returns:
        list: the tiles in row-major order, optionally followed by the
        thumbnail.
    """
    src_w, src_h = image.size
    src_aspect = src_w / src_h

    # Enumerate every (columns, rows) grid whose tile count lies within
    # [min_num, max_num], ordered by tile count.
    grid_options = set(
        (cols, rows)
        for n in range(min_num, max_num + 1)
        for cols in range(1, n + 1)
        for rows in range(1, n + 1)
        if min_num <= cols * rows <= max_num)
    grid_options = sorted(grid_options, key=lambda grid: grid[0] * grid[1])

    # Choose the grid whose aspect ratio is closest to the source image's.
    best_grid = find_closest_aspect_ratio(
        src_aspect, grid_options, src_w, src_h, image_size)

    # Pixel dimensions of the full grid and the number of tiles it holds.
    target_width = image_size * best_grid[0]
    target_height = image_size * best_grid[1]
    blocks = best_grid[0] * best_grid[1]

    # Stretch the image over the grid, then cut it tile by tile.
    canvas = image.resize((target_width, target_height))
    tiles_per_row = target_width // image_size
    tiles = []
    for index in range(blocks):
        col = index % tiles_per_row
        row = index // tiles_per_row
        tile_box = (
            col * image_size,
            row * image_size,
            (col + 1) * image_size,
            (row + 1) * image_size,
        )
        tiles.append(canvas.crop(tile_box))
    assert len(tiles) == blocks

    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))
    return tiles

def load_image(image_file, input_size=448, max_num=12):
    """Turn an image into a stacked tensor of preprocessed tiles.

    Args:
        image_file: the image object itself (forwarded unchanged to
            ``dynamic_preprocess``), not a path.
        input_size (int): tile side length.
        max_num (int): maximum number of grid tiles.

    Returns:
        torch.Tensor: the transformed tiles (thumbnail enabled) stacked along
        a new first dimension.
    """
    to_tensor = build_transform(input_size=input_size)
    tiles = dynamic_preprocess(image_file, image_size=input_size, use_thumbnail=True, max_num=max_num)
    return torch.stack([to_tensor(tile) for tile in tiles])

In [5]:
# Checkpoint identifier; also used to load the tokenizer.
path = "OpenGVLab/InternVL2-4B"

# Load the weights in bfloat16 using the checkpoint's own model code.
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True,
)
# Switch to evaluation mode and move the model to the GPU.
model = model.eval().cuda()

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attenton` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Phi3ForCausalLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


FlashAttention2 is not installed.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# Load the tokenizer from the same checkpoint as the model (slow tokenizer,
# using the checkpoint's own code).
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)

In [7]:
# Report the GPU name and how much GPU memory PyTorch currently holds.
# `torch.cuda.is_available()` is checked as well so this cell is skipped,
# instead of raising, if `device` says 'cuda' on a machine without CUDA.
if device.type == 'cuda' and torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

NVIDIA GeForce RTX 4090
Memory Usage:
Allocated: 7.7 GB
Cached:    7.8 GB


In [8]:
#loading either hallusionbench or POPE
def generate_response(question, image):
    """ Prompt model with question regarding image and generate response.
    Args:
        question (str): question (or full prompt) regarding the image content
        image: PIL image object the question refers to

    Returns:
        response (str): model's response to the question
    """
    pixel_values = load_image(image, max_num=12).to(torch.bfloat16).cuda()
    # Sampling is enabled, so responses can differ between runs.
    generation_config = dict(max_new_tokens=1024, do_sample=True)
    # The image tiles must be handed to `chat`; passing None instead makes the
    # model answer from the text alone, without ever seeing the image.
    response, _ = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)

    return response

In [9]:
# Output files for the model's responses, placed in the current directory.
internvl_pope_output_path = os.path.join(os.curdir, "internvl_pope_responses.json")
internvl_hallusion_output_path =  os.path.join(os.curdir, "internvl_hallusion_responses.json")

In [10]:
# Load POPE benchmark dataset
dataset = load_dataset("lmms-lab/POPE", "default")
# Keep only the examples of the test split whose category is 'adversarial'.
dataset = dataset['test'].filter(lambda x: x['category'] == 'adversarial')

In [33]:
# Load the RAM tags into `ram_data`.
# The file is read line by line: each non-empty line is parsed as one JSON
# object (a trailing comma is tolerated) and merged into the dictionary.
filepath = "../../models/recognize-anything/pope_tags.json"
ram_data = {}
with open(filepath, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip().rstrip(",")
        if not line:
            # Blank lines carry no entry and would make json.loads raise.
            continue

        data_entry = json.loads(line)

        ram_data.update(data_entry)

In [65]:
def obtain_attributes(img_src):
    """
    Returns a sentence listing the attributes identified by RAM.

    Args:
        img_src (str): key of the image in ``ram_data``; the stored value is a
            '|'-separated string of tags.

    Returns:
        str: "This image has these attributes: a, b, and c. ", or only the
        leading phrase when the image has no tags.

    Raises:
        KeyError: if ``img_src`` is not present in ``ram_data``.
    """
    # Split on the '|' separator rather than on whitespace so that a tag made
    # of several words (e.g. "ski slope") stays one attribute.
    image_attrs = [" ".join(tag.split()) for tag in ram_data[img_src].split('|')]
    image_attrs = [tag for tag in image_attrs if tag]

    injection = "This image has these attributes: "
    if not image_attrs:
        return injection
    if len(image_attrs) == 1:
        return injection + image_attrs[0] + ". "
    return injection + ", ".join(image_attrs[:-1]) + ", and " + image_attrs[-1] + ". "
    
    

In [66]:
def inject_info(img_src, question):
    """
    Builds the prompt: the image's attribute sentence followed by the question.

    Args:
        img_src (str): image key passed on to ``obtain_attributes``.
        question (str): question to ask about the image.

    Returns:
        str: the attribute sentence followed by
        "Using this information answer the following question: <question>".

    Should fine-tune prompt later.
    """
    injection = obtain_attributes(img_src)
    return injection + f"Using this information answer the following question: {question}"

In [69]:
# Ask the model every question in the dataset and collect its answers.
responses = []
for idx in range(len(dataset)):
    # Index by row first: dataset[idx] fetches a single example, whereas
    # dataset['image'][idx] can load (and decode) the whole column on every
    # iteration.
    example = dataset[idx]
    question = example['question']
    image = example['image']
    img_source = example['image_source']
    prompt = inject_info(img_source, question)
    response = generate_response(prompt, image)
    responses.append({
        'question': question,
        'response': response
    })

prompt: This image has these attributes: blanket, hill, person, jacket, man, ski, slope, ride, ski, skier, slope, snow, snowboarder, and snowy. Using this information answer the following question: Is there a snowboard in the image?
question: Is there a snowboard in the image? response Yes


In [50]:
# Write responses to file (POPE)
# The list of {'question', 'response'} records is dumped as indented JSON,
# overwriting any existing file at that path.
with open(internvl_pope_output_path, 'w') as f:
    json.dump(responses, f, indent=4)
print(f"InternVL's responses have been saved to {internvl_pope_output_path}")

InternVL's responses have been saved to .\internvl_pope_responses.json
