In [1]:
!pip install Pillow==10.1.0 torch==2.1.2 torchvision==0.16.2 transformers==4.40.0 sentencepiece==0.1.99

Looking in indexes: https://nexus.iisys.de/repository/ki-awz-pypi-group/simple, https://pypi.org/simple


In [2]:
import os
import json
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer
from pycocoevalcap.eval import COCOEvalCap
from pycocotools.coco import COCO

# Load the model and tokenizer
model = AutoModel.from_pretrained('openbmb/MiniCPM-Llama3-V-2_5', trust_remote_code=True, torch_dtype=torch.float16)
model = model.to(device='cuda')

tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-Llama3-V-2_5', trust_remote_code=True)
model.eval()



Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


MiniCPMV(
  (llm): LlamaForCausalLM(
    (model): LlamaModel(
      (embed_tokens): Embedding(128256, 4096)
      (layers): ModuleList(
        (0-31): 32 x LlamaDecoderLayer(
          (self_attn): LlamaSdpaAttention(
            (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
            (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
            (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
            (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
            (rotary_emb): LlamaRotaryEmbedding()
          )
          (mlp): LlamaMLP(
            (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
            (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
            (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): LlamaRMSNorm()
          (post_attention_layernorm): LlamaRMSNorm()

In [3]:
# Function to generate a caption for an image
def generate_caption(image_path):
    image = Image.open(image_path).convert('RGB')
    question = 'What is in the image?'
    msgs = [{'role': 'user', 'content': question}]
    
    res = model.chat(
        image=image,
        msgs=msgs,
        tokenizer=tokenizer,
        sampling=True, # if sampling=False, beam_search will be used by default
        temperature=0.7,
    )
    return res

In [6]:
# Load COCO validation annotations
with open('annotations_trainval2014/annotations/captions_val2014.json', 'r') as f:
    coco_annotations = json.load(f)

In [8]:
# Select a small subset of the data (e.g., first 100 images)
subset_size = 100
subset_annotations = coco_annotations['images'][:subset_size]

# Directory where the COCO validation images are stored
image_dir = 'val2014/val2014'

# Generate captions for the subset
generated_captions = []
for annotation in subset_annotations:
    image_id = annotation['id']
    image_filename = f"COCO_val2014_{image_id:012d}.jpg"
    image_path = os.path.join(image_dir, image_filename)
    try:
        caption = generate_caption(image_path)
        generated_captions.append({'image_id': image_id, 'caption': caption})
    except Exception as e:
        print(f"Error processing image {image_id}: {e}")

In [9]:
# Save the results to a JSON file
with open('results.json', 'w') as f:
    json.dump(generated_captions, f)

In [12]:
# Load the ground truth captions
coco = COCO('annotations_trainval2014/annotations/captions_val2014.json')
coco_results = coco.loadRes('results.json')

loading annotations into memory...
Done (t=0.22s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.01s)
creating index...
index created!


In [14]:
# Evaluate
coco_eval = COCOEvalCap(coco, coco_results)
coco_eval.evaluate()

tokenization...


FileNotFoundError: [Errno 2] No such file or directory: 'java'

In [None]:
# Print the results
for metric, score in coco_eval.eval.items():
    print(f"{metric}: {score:.3f}")