In [1]:
import torch
from PIL import Image
from torchvision import transforms
from transformers import OFATokenizer, OFAModel

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Caption a single image with a locally stored OFA-base checkpoint.
# The project root is defined once so the checkpoint directory and the input
# image are both derived from it; edit PROJECT_DIR to run on another machine.
PROJECT_DIR = "/Users/junwu/Documents/ECS270/Project/Image-Model-with-Grounding-DINO"

# === Load tokenizer and model ===
ckpt_dir = f"{PROJECT_DIR}/OFA-base"  # path to local OFA-base directory
tokenizer = OFATokenizer.from_pretrained(ckpt_dir)
model = OFAModel.from_pretrained(ckpt_dir)
model.eval()  # evaluation mode: this cell only runs inference

# === Image preprocessing ===
resolution = 384
mean, std = [0.5] * 3, [0.5] * 3
transform = transforms.Compose([
    # Force three channels so Normalize always receives a 3-channel tensor.
    lambda image: image.convert("RGB"),
    transforms.Resize(
        (resolution, resolution),
        # Use torchvision's own enum rather than the PIL integer constant;
        # it selects the same bicubic resampling.
        interpolation=transforms.InterpolationMode.BICUBIC,
    ),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std),
])

# === Load and process image ===
image = Image.open(f"{PROJECT_DIR}/input-data/banana.jpg")
patch_img = transform(image).unsqueeze(0)  # shape: (1, 3, H, W)

# === Prepare text prompt ===
# NOTE(review): the leading space in the prompt looks deliberate (tokenizer
# convention) - confirm before trimming it.
question = " what does the image describe?"
inputs = tokenizer(question, return_tensors="pt").input_ids

# === Generate caption ===
# Inference only: make sure no autograd graph is built during beam search.
with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs,
        patch_images=patch_img,
        patch_masks=torch.tensor([True]),  # one mask entry for the single image
        num_beams=5,
        max_length=16,
        no_repeat_ngram_size=3,
    )

# === Decode and print result ===
caption = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print("Generated caption:", caption[0])

/Users/junwu/Documents/ECS270/Project/Image-Model-with-Grounding-DINO/OFA-base
<super: <class 'OFATokenizer'>, <OFATokenizer object>>
Generated caption:  a single banana on a white background
