In [2]:
import os
import json
import numpy as np
from PIL import Image

def load_annotation(file_path):
    """Parse the JSON annotation file at `file_path` and return its content.

    The file is opened as UTF-8 text. The return value is whatever
    `json.load` produces for the document (e.g. a dict or a list).
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        return json.load(handle)

def load_image(image_path, size):
    """Load an image file as an RGB array resized to `size`.

    Args:
        image_path: Path of the image file to open.
        size: Target size forwarded to `Image.resize`; Pillow documents this
            as a (width, height) pair.

    Returns:
        A NumPy array built from the resized RGB image.
    """
    # Open via a context manager so the underlying file is closed as soon as
    # the pixel data has been copied out, rather than whenever the Image
    # object happens to be garbage-collected.
    with Image.open(image_path) as source:
        # convert() and resize() each return a new in-memory image, so the
        # result stays valid after the source file is closed.
        image = source.convert("RGB").resize(size=size)
    return np.array(image)

In [None]:
import matplotlib.pyplot as plt
from transformers import AutoProcessor, Blip2ForConditionalGeneration
import torch

# Caption every annotated image with BLIP-2 and preview each result.
annotation_path = "data/vimmsd-warmup.json"
image_folder_path = "data/warmup-images"

data = load_annotation(annotation_path)

# Decide the device first so the dtype can follow it: half precision is only
# used on GPU, while the CPU fallback keeps the model and inputs in float32.
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=dtype)
model.to(device)

# Filled with [sample id, generated caption] pairs, one per annotation entry.
generated_captions = []

# `data` is indexed by the keys it yields on iteration, so walk it as a mapping
# of sample id -> annotation record.
for sample_id, sample in data.items():
    image_path = f"{image_folder_path}/{sample['image']}"
    image = load_image(image_path, size=(1024, 1024))
    label = sample["label"]

    inputs = processor(images=image, return_tensors="pt").to(device, dtype)
    # Decoding is deterministic here. The earlier `temperature=1.5` argument
    # was dropped because temperature only takes effect when sampling is
    # enabled; pass `do_sample=True, temperature=...` if sampled captions are
    # actually wanted.
    generated_ids = model.generate(**inputs, max_length=128)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
    generated_captions.append([sample_id, generated_text])

    # Lay out, show and close each figure inside the loop so that every
    # preview gets tight_layout applied and figures do not pile up in memory.
    fig, ax = plt.subplots()
    ax.set_title(f"{label} - {generated_text[:256]}")
    ax.imshow(image)
    ax.axis("off")
    fig.tight_layout()
    plt.show()
    plt.close(fig)

In [None]:
# Persist the collected [id, generated caption] pairs as pretty-printed JSON.
# ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
# NOTE(review): the annotation and images are read from "data/..." while this
# writes under "../data/" — confirm which working directory is intended.
json_str = json.dumps(
    generated_captions,
    ensure_ascii=False,
    indent=4,
)
with open("../data/generated_captions.json", mode="w", encoding="utf-8") as outfile:
    outfile.write(json_str)