In [38]:
import os
os.environ['COCO_DIR'] = 'coco2017/'
os.environ['AOKVQA_DIR'] = '/usr1/data/mingqia2/aokvqa/'
os.environ['HF_HOME'] = '/usr1/data/models_cache'
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
import re
import random

from PIL import Image
import requests
import torch

from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig, LlavaForConditionalGeneration 
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from transformers import AddedToken


from load_aokvqa import load_aokvqa, get_coco_path


In [26]:
coco_dir = os.getenv('COCO_DIR')
aokvqa_dir = os.getenv('AOKVQA_DIR')
val_dataset = load_aokvqa(aokvqa_dir, 'val')

In [19]:
def save_results_to_file(results, output_file):
    with open(output_file, 'w') as f:
        json.dump(results, f)

In [86]:
def extract_answer(answer_text):
    assistant_marker = "ASSISTANT:"
    start_idx = answer_text.find(assistant_marker)
    if start_idx != -1:
        # Extract the portion after "ASSISTANT:" and strip any leading/trailing whitespace
        return answer_text[start_idx + len(assistant_marker):].strip()
    else:
        # Return None if "ASSISTANT:" is not found
        return None
    
def process_results(output_file, dataset, model='llava', mc=False):
    with open(output_file, 'r') as f:
        results = json.load(f)
    for result, item in zip(results, dataset):
        if model == 'llava':
            result['answer'] = extract_answer(result['answer'])
        elif model == 'blip':
            result['answer'] = result['answer'].strip()
        else:
            print("Model not recognized!")
        
        if mc:
            result['correct_choice_idx'] = item['correct_choice_idx']
        else:
            result['direct_answers'] = item['direct_answers']
    return results

In [80]:
def eval_vqa(val_result, val_dataset, seed=42, mc=False):
    random.seed(seed)
    acc = []
    for result, item in zip(val_result, val_dataset):
        image_id = result['image_id']
        question_id = result['question_id']
        pred = result['answer'].lower()
        if item['image_id'] == image_id and item['question_id'] == question_id:
            direct_answers = item['direct_answers']
            if mc:
                pred = re.search(r'\d+', result['answer'])
                if pred in ['1', '2', '3', '4']:
                    pred = int(pred)
                else:
                    pred = random.randint(1, 4)
                acc.append(1) if pred == (item['correct_choice_idx']+1) else acc.append(0) 
            else:
                num_match = sum([pred == da for da in direct_answers])
                vqa_acc = min(1.0, num_match / 3.0)
                acc.append(vqa_acc)
        else:
            print(f"Image ID or question ID mismatch: {image_id}, {question_id}")
    acc = sum(acc) / len(acc) * 100
    return acc

# Q&A Prompts
Git Repo: https://github.com/WHB139426/QA-Prompts

# BLIP-2

In [90]:
def blip_pipeline(processor, model, val_dataset, coco_dir, output_file, batch_size=100, split='val', mc=False):
    results = []
    total_items = len(val_dataset)
    
    for idx, item in tqdm(enumerate(val_dataset), total=len(val_dataset)):
        image_id = item['image_id']
        question_id = item['question_id']
        if mc:
            choices = item['choices']
            choices_formatted = " ".join([f"({i+1}) {choice}" for i, choice in enumerate(choices)])
            question = item['question'] + " " + choices_formatted + " Please return only the chosen choice number (1, 2, 3, or 4) as the answer."
        else:
            question = item['question'] + " Please return only the key word or phrase as the answer, not a full sentence."
        
        image_path = get_coco_path(split, image_id, coco_dir)

        # Load the image from the local path
        raw_image = Image.open(image_path)
        prompt = f"Question: {question} Answer:"
        inputs = processor(images=raw_image, text=prompt, return_tensors='pt').to(device="cuda",dtype=torch.float16)

        # Run the model and generate output
        output = model.generate(**inputs, max_new_tokens=10, do_sample=False)
        decoded_output = processor.decode(output[0][2:], skip_special_tokens=True)

        # Store the results
        result = {
            "image_id": image_id,
            "question_id": question_id,
            "answer": decoded_output
        }
        results.append(result)

        if (idx + 1) % batch_size == 0 or (idx + 1) == total_items:
            save_results_to_file(results, output_file)
    # return results

In [40]:
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b",device_map={"": 0}, torch_dtype=torch.float16)

processor.num_query_tokens = model.config.num_query_tokens
image_token = AddedToken("<image>", normalized=False, special=True)
processor.tokenizer.add_tokens([image_token], special_tokens=True)

model.resize_token_embeddings(len(processor.tokenizer), pad_to_multiple_of=64) # pad for efficient computation
model.config.image_token_index = len(processor.tokenizer) - 1

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.09it/s]


In [46]:
output_file = 'blip2-opt-2.7b_val_results_da.json'
blip_pipeline(processor, model, val_dataset, coco_dir, output_file, batch_size=100, split='val')

100%|██████████| 1145/1145 [01:25<00:00, 13.43it/s]


In [91]:
output_file = 'blip2-opt-2.7b_val_results_mc.json'
blip_pipeline(processor, model, val_dataset, coco_dir, output_file, batch_size=100, split='val', mc=True)

100%|██████████| 1145/1145 [02:27<00:00,  7.74it/s]


In [92]:
blip_mc_result = process_results('blip2-opt-2.7b_val_results_mc.json', val_dataset, model='blip', mc=True)
blip_da_result = process_results('blip2-opt-2.7b_val_results_da.json', val_dataset, model='blip')

In [96]:
blip_da_result

[{'image_id': 461751,
  'question_id': '22jbM6gDxdaMaunuzgrsBB',
  'answer': 'cigarette',
  'direct_answers': ['cigarette',
   'cigarette',
   'cigarette',
   'cigarette',
   'cigarette',
   'cigarette',
   'cigarette',
   'cigarette',
   'cigarette',
   'cigarette']},
 {'image_id': 377368,
  'question_id': '2Aq5RiEn7eyfWjEbpuYT2o',
  'answer': '',
  'direct_answers': ['thirty',
   '30th',
   'thirty',
   'thirty',
   'thirty',
   '30th',
   'thirty',
   'thirty',
   'thirty',
   'thirty']},
 {'image_id': 563603,
  'question_id': '2Br4bJfKY7SQM9DECrqqeG',
  'answer': '',
  'direct_answers': ['muddy',
   'dirty',
   'murky water',
   'muddy',
   'pond',
   'pond',
   'watering hole',
   'pond',
   'pond',
   'dirty']},
 {'image_id': 329542,
  'question_id': '2C8riXpRLX3CyM5jDz23m7',
  'answer': 'ing',
  'direct_answers': ['icing',
   'whipped cream',
   'icing',
   'frosting',
   'icing',
   'frosting',
   'cream',
   'icing',
   'icing',
   'frosting']},
 {'image_id': 182202,
  'questi

In [98]:
save_results_to_file(blip_mc_result, '../Jan-ML/blip2-opt-2.7b_val_results_mc.json')
save_results_to_file(blip_da_result, '../Jan-ML/blip2-opt-2.7b_val_results_da.json')

In [99]:
eval_vqa(blip_da_result, val_dataset)

4.192139737991267

In [97]:
eval_vqa(blip_mc_result, val_dataset, mc=True)

25.065502183406114

# Molmo

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"


In [4]:
processor = AutoProcessor.from_pretrained(
    'allenai/MolmoE-1B-0924',
    trust_remote_code=True,
    torch_dtype='auto',
    device_map='auto'
)


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPTNeoXTokenizer'. 
The class this function is called from is 'GPT2TokenizerFast'.


In [8]:
# load the model
model = AutoModelForCausalLM.from_pretrained(
    'allenai/MolmoE-1B-0924',
    trust_remote_code=True,
    torch_dtype='auto',
    device_map='auto'
)

OSError: Unable to load weights from pytorch checkpoint file for '/usr1/data/models_cache/hub/models--allenai--MolmoE-1B-0924/snapshots/03d86d0fc2cc9a8840f92c0df54d178d08b86be5/pytorch_model.bin' at '/usr1/data/models_cache/hub/models--allenai--MolmoE-1B-0924/snapshots/03d86d0fc2cc9a8840f92c0df54d178d08b86be5/pytorch_model.bin'. If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True.

In [None]:
conversation = [
    {
      "role": "user",
      "content": [
          {"type": "text", "text": "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud"},
          {"type": "image"},
        ],
    },
]
processor = AutoProcessor.from_pretrained(model_id)

prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
print(outputs)

# LLaVA

In [2]:
model_id = "llava-hf/llava-1.5-7b-hf"
model = LlavaForConditionalGeneration.from_pretrained(
    model_id, 
    torch_dtype=torch.float16, 
    low_cpu_mem_usage=True, 
    cache_dir="/usr1/data/models_cache"
).to(0)

processor = AutoProcessor.from_pretrained(model_id)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
print(dir(processor))

['__annotations__', '__call__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_auto_class', '_create_repo', '_get_arguments_from_pretrained', '_get_files_timestamps', '_merge_kwargs', '_upload_modified_files', 'apply_chat_template', 'attributes', 'batch_decode', 'chat_template', 'decode', 'feature_extractor_class', 'from_args_and_dict', 'from_pretrained', 'get_processor_dict', 'image_processor', 'image_processor_class', 'image_token', 'model_input_names', 'optional_attributes', 'optional_call_args', 'patch_size', 'prepare_and_validate_optional_call_args', 'push_to_hub', 'register_for_auto_class', 'save_pretrained', 'to_dict', 'to_json_file', 'to_json_string', 'tokenizer', 'tokenizer_class', '

In [5]:
conversation = [
    {

      "role": "user",
      "content": [
          {"type": "text", "text": "What are these?"},
          {"type": "image"},
        ],
    },
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
raw_image = Image.open(requests.get(image_file, stream=True).raw)
inputs = processor(images=raw_image, text=prompt, return_tensors='pt').to(0, torch.float16)

output = model.generate(**inputs, max_new_tokens=200, do_sample=False)
decoded_output = processor.decode(output[0][2:], skip_special_tokens=True)

Expanding inputs for image tokens in LLaVa should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
Expanding inputs for image tokens in LLaVa should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


In [6]:
output

tensor([[    1,  3148,  1001, 29901, 29871, 32000, 29871,    13,  5618,   526,
          1438, 29973,   319,  1799,  9047, 13566, 29901,  4525,   526,  1023,
           274,  1446, 19214,   373,   263,   282,   682,   274,  3222, 29889,
             2]], device='cuda:0')

In [14]:
aokvqa_dir = os.getenv('AOKVQA_DIR')
val_dataset = load_aokvqa(aokvqa_dir, 'val')

In [16]:
dataset_example = val_dataset[0]
image_path = get_coco_path('val', dataset_example['image_id'], coco_dir)

In [17]:
dataset_example

{'split': 'val',
 'image_id': 461751,
 'question_id': '22jbM6gDxdaMaunuzgrsBB',
 'question': "What is in the motorcyclist's mouth?",
 'choices': ['toothpick', 'food', 'popsicle stick', 'cigarette'],
 'correct_choice_idx': 3,
 'direct_answers': ['cigarette',
  'cigarette',
  'cigarette',
  'cigarette',
  'cigarette',
  'cigarette',
  'cigarette',
  'cigarette',
  'cigarette',
  'cigarette'],
 'difficult_direct_answer': False,
 'rationales': ["He's smoking while riding.",
  'The motorcyclist has a lit cigarette in his mouth while he rides on the street.',
  'The man is smoking.']}

In [18]:
image_path

'/usr1/data/mingqia2/coco/val2017/000000461751.jpg'

In [88]:
def llava_pipeline(val_dataset, coco_dir, output_file, batch_size=100, split='val', mc=False):
    results = []
    total_items = len(val_dataset)
    
    for idx, item in tqdm(enumerate(val_dataset), total=len(val_dataset)):
        image_id = item['image_id']
        question_id = item['question_id']
        if mc:
            choices = item['choices']
            choices_formatted = " ".join([f"({i+1}) {choice}" for i, choice in enumerate(choices)])
            question = item['question'] + " " + choices_formatted + " Please return only the chosen choice number as the answer."
        else:
            question = item['question'] + " Please return only the key word or phrase as the answer, not a full sentence."
        
        image_path = get_coco_path(split, image_id, coco_dir)

        # Load the image from the local path
        raw_image = Image.open(image_path)
        # Create conversation template with the question
        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {"type": "image"},
                ],
            },
        ]
        prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
        inputs = processor(images=raw_image, text=prompt, return_tensors='pt').to(0, torch.float16)

        # Run the model and generate output
        output = model.generate(**inputs, max_new_tokens=10, do_sample=False)
        decoded_output = processor.decode(output[0][2:], skip_special_tokens=True)

        # Store the results
        result = {
            "image_id": image_id,
            "question_id": question_id,
            "answer": decoded_output
        }
        results.append(result)

        if (idx + 1) % batch_size == 0 or (idx + 1) == total_items:
            save_results_to_file(results, output_file)
    # return results

In [15]:
train_dataset = load_aokvqa(aokvqa_dir, 'train')
len(train_dataset)

17056

In [77]:
output_file = 'llava1.5-7b_val_results_da.json'
llava_pipeline(val_dataset, coco_dir, output_file, batch_size=100, split='val')

100%|██████████| 1145/1145 [03:40<00:00,  5.19it/s]


In [89]:
output_file = 'llava1.5-7b_val_results_mc.json'
llava_pipeline(val_dataset, coco_dir, output_file, batch_size=100, split='val', mc=True)

100%|██████████| 1145/1145 [03:51<00:00,  4.96it/s]


In [50]:
llava_mc_result = process_results('llava1.5-7b_val_results_mc.json', val_dataset, mc=True)

In [51]:
llava_da_result = process_results('llava1.5-7b_val_results_da.json', val_dataset)

In [57]:
save_results_to_file(llava_mc_result, 'llava1.5-7b_val_results_mc.json')
save_results_to_file(llava_da_result, 'llava1.5-7b_val_results_da.json')

In [69]:
llava_mc_path = "llava1.5-7b_val_results_mc.json"
llava_da_path = "llava1.5-7b_val_results_da.json"
with open(llava_mc_path, 'r') as f:
    llava_mc_result = json.load(f)
    
with open(llava_da_path, 'r') as f:
    llava_da_result = json.load(f)

In [78]:
eval_vqa(llava_da_result, val_dataset)

58.16593886462881

In [79]:
eval_vqa(llava_mc_result, val_dataset, mc=True)

25.065502183406114