In [1]:
!pip install -q transformers datasets accelerate torch torchvision torchaudio evaluate scikit-learn pandas bert_score python-Levenshtein inflect Pillow nltk

import os
import gc
import pandas as pd
import numpy as np
from PIL import Image
from pathlib import Path
from tqdm.auto import tqdm
import copy
from functools import partial
import json
import Levenshtein
import re

import nltk
from nltk.corpus import wordnet

import torch
from torch.utils.data import Dataset, DataLoader
from datasets import Dataset as HFDataset

from transformers import (
    AutoProcessor, AutoTokenizer,
    AutoModelForVisualQuestionAnswering,
    BlipForQuestionAnswering, BlipProcessor,
    Blip2ForConditionalGeneration, Blip2Processor,
    ViTImageProcessor, BertTokenizerFast, GPT2TokenizerFast,
    GitForCausalLM, GitProcessor,
    # OFAModel, OFAProcessor
)
from torchvision import transforms
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score as sklearn_f1_score
import evaluate

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

def clear_gpu_memory():
    print("Attempting to clear GPU memory...")
    gc.collect()
    if torch.cuda.is_available(): torch.cuda.empty_cache()
    print("GPU memory clear attempt complete.")

try:
    nltk.data.find('corpora/wordnet.zip')
    print("WordNet resource found.")
except:
    print("Downloading WordNet resource...")
    nltk.download('wordnet', quiet=True)
try:
    nltk.data.find('corpora/omw-1.4.zip')
    print("OMW-1.4 resource found.")
except:
    print("Downloading OMW-1.4 resource...")
    nltk.download('omw-1.4', quiet=True)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.5/207.5 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.1/21.1 MB[0m [31m70.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━

2025-05-14 06:09:53.224438: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747202993.426688      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747202993.482536      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Using device: cuda
WordNet resource found.
Downloading OMW-1.4 resource...


In [2]:
CURATED_DATASET_CSV = "/kaggle/input/vqa-fine-tuning/vqa_dataset.csv"
IMAGE_ROOT_DIR = Path("/kaggle/input/vqa-fine-tuning/images/small/")
TEST_SET_SIZE = 0.15; VAL_SET_SIZE = 0.10; RANDOM_STATE = 42
MODELS_TO_EVALUATE = [
    ("dandelin/vilt-b32-finetuned-vqa", "dandelin/vilt-b32-finetuned-vqa", "vilt", 40, (384, 384)),
    ("Salesforce/blip-vqa-base", "Salesforce/blip-vqa-base", "blip", None, (384, 384)),
    ("Salesforce/blip-vqa-capfilt-large", "Salesforce/blip-vqa-capfilt-large", "blip", None, (384, 384)),
    ("microsoft/git-base-vqav2", "microsoft/git-base-vqav2", "git", None, (480, 480)),
    ("microsoft/git-large-vqav2", "microsoft/git-large-vqav2", "git", None, (420, 420)),
    # ("OFA-Sys/OFA-base", "OFA-Sys/OFA-base", "ofa", None, (256,256)),
]
BLIP2_MODEL_TO_EVALUATE = {
    "name": "Salesforce/blip2-opt-2.7b", "processor": "Salesforce/blip2-opt-2.7b",
    "type": "blip2", "text_max_len": None, "image_size": (224, 224)
}
# BLIP2_MODEL_TO_EVALUATE = None
BATCH_SIZE = 8; DEFAULT_TEXT_MAX_LEN = 128
DEFAULT_IMAGE_TARGET_SIZE_CUSTOM_COLLATE = (384, 384)
WUPS_SIMILARITY_THRESHOLD = 0.9
LEVENSHTEIN_SIMILARITY_THRESHOLD = 0.8

In [3]:
try:
    full_df = pd.read_csv(CURATED_DATASET_CSV)
    full_df.dropna(subset=['image_path', 'question', 'answer'], inplace=True)
    full_df['answer'] = full_df['answer'].astype(str)
    print(f"Loaded dataset with {len(full_df)} samples.")
    _, test_df = train_test_split(full_df, test_size=TEST_SET_SIZE, random_state=RANDOM_STATE)
    test_hf_dataset = HFDataset.from_pandas(test_df.reset_index(drop=True))
    print(f"Test set size: {len(test_hf_dataset)}")
    if len(test_hf_dataset)>0: print("\nSample from test_hf_dataset:", test_hf_dataset[0])
    else: print("Warning: Test dataset is empty after split/load.")
except Exception as e:
    print(f"ERROR loading/splitting data: {e}"); test_hf_dataset = None

Loaded dataset with 63182 samples.
Test set size: 9478

Sample from test_hf_dataset: {'image_path': '06/063d633d.jpg', 'question': 'What color is the filament?', 'answer': 'Red'}


In [4]:
model_type_being_processed = ""

class VQABaselineDataset_OriginalProcessor(Dataset):
    def __init__(self, hf_dataset, processor, image_root_dir, text_max_length=128, image_size_tuple=(224, 224)):
        self.hf_dataset = hf_dataset; self.processor = processor; self.image_root_dir = Path(image_root_dir)
        self.text_max_length = text_max_length; self.image_size_tuple = image_size_tuple
    def __len__(self): return len(self.hf_dataset)
    def __getitem__(self, idx):
        item = self.hf_dataset[idx]; image_path_str = item['image_path']; question = item['question']; answer = str(item['answer'])
        try: image = Image.open(self.image_root_dir / image_path_str).convert("RGB")
        except Exception: image = Image.new('RGB', self.image_size_tuple, color='grey')
        try:
            processor_inputs = {"text": question, "images": image}
            if model_type_being_processed == "ofa":
                 pass

            encoding = self.processor(**processor_inputs, padding="max_length", truncation=True, max_length=self.text_max_length, return_tensors="pt")
            encoding = {k: v.squeeze(0) for k, v in encoding.items()}
        except Exception as e: print(f"Error in OriginalProcessor item {idx} (type: {model_type_being_processed}): {e}"); raise e
        return {"encoding": encoding, "raw_question": question, "raw_answer": answer, "image_path": image_path_str}

class VQABaselineDataset_CustomCollate(Dataset):
    def __init__(self, hf_dataset, tokenizer, image_root_dir, image_processor_for_norm=None, text_max_length=40):
        self.hf_dataset = hf_dataset; self.tokenizer = tokenizer; self.image_processor_for_norm = image_processor_for_norm
        self.image_root_dir = Path(image_root_dir); self.text_max_length = text_max_length
    def __len__(self): return len(self.hf_dataset)
    def __getitem__(self, idx):
        item = self.hf_dataset[idx]; image_path_str = item['image_path']; question = item['question']; answer = str(item['answer'])
        try: image = Image.open(self.image_root_dir / image_path_str).convert("RGB")
        except Exception: image = Image.new('RGB', DEFAULT_IMAGE_TARGET_SIZE_CUSTOM_COLLATE, color='grey')
        try:
            text_encoding = self.tokenizer(question, padding="max_length", truncation=True, return_tensors="pt", max_length=self.text_max_length)
            s_text_enc = {k: v.squeeze(0) for k, v in text_encoding.items()}
        except Exception as e: print(f"Error in CustomCollate item {idx} tokenizing: {e}"); raise e
        return {"pixel_values_pil": image, "text_encoding": s_text_enc, "raw_question": question, "raw_answer": answer, "image_path": image_path_str}

def custom_collate_fn(batch, image_processor_hf, target_size=(384, 384)):
    try:
        actual_img_proc = image_processor_hf
        if hasattr(image_processor_hf, 'image_processor') and image_processor_hf.image_processor is not None:
            actual_img_proc = image_processor_hf.image_processor

        mean = actual_img_proc.image_mean
        std = actual_img_proc.image_std
    except AttributeError:
        mean = [0.485, 0.456, 0.406]
        std = [0.229, 0.224, 0.225]
        print(f"Warn: Custom collate for {type(image_processor_hf)} (actual: {type(actual_img_proc) if 'actual_img_proc' in locals() else 'N/A'}) using default ImageNet norm.")

    image_transforms = transforms.Compose([
        transforms.Resize(target_size, interpolation=transforms.InterpolationMode.BICUBIC),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std)
    ])
    images_pil = [item['pixel_values_pil'] for item in batch]; processed_images = torch.stack([image_transforms(img) for img in images_pil])
    text_input_ids = torch.stack([item['text_encoding']['input_ids'] for item in batch])
    text_attention_mask = torch.stack([item['text_encoding']['attention_mask'] for item in batch])
    text_encoding_batched = {"input_ids": text_input_ids, "attention_mask": text_attention_mask}
    if 'token_type_ids' in batch[0]['text_encoding']: text_encoding_batched['token_type_ids'] = torch.stack([item['text_encoding']['token_type_ids'] for item in batch])
    raw_questions = [item['raw_question'] for item in batch]; raw_answers = [item['raw_answer'] for item in batch]; image_paths = [item['image_path'] for item in batch]
    model_inputs = {**text_encoding_batched, "pixel_values": processed_images}
    return {"encoding": model_inputs, "raw_question": raw_questions, "raw_answer": raw_answers, "image_path": image_paths}

In [5]:
def categorize_question(question):
    q_lower = question.lower()
    if "what color" in q_lower or "color is the" in q_lower: return "Color"
    if "how many" in q_lower: return "Count"
    if "brand" in q_lower: return "Brand"
    if "material" in q_lower: return "Material"
    if q_lower.startswith(("is ", "are ", "does ", "do ")): return "Yes/No"
    if "type" in q_lower and ("product" in q_lower or "object" in q_lower or "item" in q_lower): return "Product Type"
    if "style" in q_lower: return "Style"
    if "weight" in q_lower or "weigh" in q_lower or "dimension" in q_lower or "height" in q_lower or "width" in q_lower or "length" in q_lower : return "Weight/Dimensions"
    return "Other"

def get_wup_similarity(word1, word2, similarity_threshold=0.9):
    if not word1 or not word2: return 0.0
    syns1 = wordnet.synsets(word1)
    syns2 = wordnet.synsets(word2)

    if not syns1 or not syns2: return 0.0

    max_sim = 0.0
    for s1 in syns1:
        for s2 in syns2:
            sim = s1.wup_similarity(s2)
            if sim is not None and sim > max_sim:
                max_sim = sim
    return 1.0 if max_sim >= similarity_threshold else (max_sim if max_sim is not None else 0.0) # Soft or hard thresholding

# For cleaning numbers and common variations before comparison
contractions = { "isn't": "is not", "aren't": "are not", # ... add more
                 "'s": " is", "'re": " are", "'ve": " have" }
def preprocess_answer_for_metric(answer):
    answer = str(answer).lower().strip()
    for k, v in contractions.items(): answer = answer.replace(k, v)
    answer = re.sub(r"[^\w\s]", "", answer)
    answer = re.sub(r"\s+", " ", answer).strip()
    try:
      if p_inflect.singular_noun(answer) is False:
        num_val = p_inflect.word_to_num(answer)
        answer = str(num_val)
    except: pass
    return answer

In [6]:
def evaluate_model(model_name_or_path, processor_name_or_path, test_hf_dataset, image_root_dir,
                   model_type="vilt", text_max_len_override=None, image_target_size_config=None, batch_size_override=None):
    global model_type_being_processed
    model_type_being_processed = model_type

    print(f"\n--- Evaluating: {model_name_or_path} (Type: {model_type}) ---")
    current_batch_size = batch_size_override if batch_size_override is not None else BATCH_SIZE
    print(f"Using Batch Size: {current_batch_size}")

    accuracy = 0.0; bertscore_f1 = 0.0; avg_wups_score = 0.0; avg_token_f1 = 0.0
    category_summary = pd.DataFrame()
    model, full_processor, tokenizer, image_processor_hf = None, None, None, None
    eval_dataset, eval_dataloader = None, None

    try:
        print(f"Loading processor: {processor_name_or_path}")
        if model_type == "git": full_processor = GitProcessor.from_pretrained(processor_name_or_path)
        elif model_type == "ofa": full_processor = OFAProcessor.from_pretrained(processor_name_or_path)
        elif model_type.startswith("blip2"): full_processor = Blip2Processor.from_pretrained(processor_name_or_path)
        else: full_processor = AutoProcessor.from_pretrained(processor_name_or_path)

        if hasattr(full_processor, 'tokenizer') and full_processor.tokenizer is not None:
            tokenizer = full_processor.tokenizer
        else:
            tokenizer = full_processor

        if model_type in ["git", "blip2", "ofa", "blip"]:
            if hasattr(tokenizer, 'padding_side') and tokenizer.padding_side == 'right':
                print(f"INFO: Setting padding_side='left' for {model_type} tokenizer.")
                tokenizer.padding_side = "left"
                if tokenizer.pad_token is None and hasattr(tokenizer, 'eos_token') and tokenizer.eos_token is not None:
                    print(f"INFO: Setting pad_token to eos_token for {model_type} tokenizer for left-padding effect.")
                    tokenizer.pad_token = tokenizer.eos_token


        if model_type in ["vilt", "blip"] and not model_type.startswith("blip2"):
            image_processor_hf = full_processor.image_processor if hasattr(full_processor, 'image_processor') else None
        elif model_type == "git":
            image_processor_hf = full_processor
        else:
            image_processor_hf = None

        print(f"Loading model: {model_name_or_path}")
        load_kwargs = {}
        if model_type.startswith("blip2") or model_type == "ofa": load_kwargs['torch_dtype'] = torch.float16
        if model_type.startswith("blip2"): model = Blip2ForConditionalGeneration.from_pretrained(model_name_or_path, **load_kwargs)
        elif model_type == "blip": model = BlipForQuestionAnswering.from_pretrained(model_name_or_path, **load_kwargs)
        elif model_type == "git": model = GitForCausalLM.from_pretrained(model_name_or_path, **load_kwargs)
        elif model_type == "vilt": model = AutoModelForVisualQuestionAnswering.from_pretrained(model_name_or_path, **load_kwargs)
        # elif model_type == "ofa": model = OFAModel.from_pretrained(model_name_or_path, **load_kwargs)
        else: raise ValueError(f"Unsupported model_type: {model_type}")
        model.to(device); model.eval();
        if torch.cuda.is_available(): torch.cuda.empty_cache(); print("Model, processor loaded. GPU cache cleared.")

        text_max_length=DEFAULT_TEXT_MAX_LEN
        if text_max_len_override: text_max_length=text_max_len_override
        elif hasattr(tokenizer,'model_max_length') and isinstance(tokenizer.model_max_length,int) and 0<tokenizer.model_max_length<=4096: text_max_length=tokenizer.model_max_length
        print(f"Final text_max_length for {model_type}: {text_max_length}")

        if model_type.startswith("blip2") or model_type=="ofa":
            print(f"Using OriginalProcessor Dataset for {model_type}.")
            img_sz_tuple=image_target_size_config if image_target_size_config else (BLIP2_MODEL_TO_EVALUATE.get("image_size",(224,224)) if model_type.startswith("blip2") and BLIP2_MODEL_TO_EVALUATE else (256,256))
            eval_dataset=VQABaselineDataset_OriginalProcessor(test_hf_dataset,full_processor,image_root_dir,text_max_length=text_max_length,image_size_tuple=img_sz_tuple)
            eval_dataloader=DataLoader(eval_dataset,batch_size=current_batch_size,shuffle=False)
        else:
            print(f"Using CustomCollate Dataset for {model_type}.")
            img_proc_collate=image_processor_hf if image_processor_hf is not None else full_processor
            eval_dataset=VQABaselineDataset_CustomCollate(test_hf_dataset,tokenizer,image_root_dir,img_proc_collate,text_max_length)
            target_img_sz_collate=image_target_size_config if image_target_size_config else DEFAULT_IMAGE_TARGET_SIZE_CUSTOM_COLLATE
            print(f"Custom collate target image size: {target_img_sz_collate}")
            collate_fn_use=partial(custom_collate_fn,image_processor_hf=img_proc_collate,target_size=target_img_sz_collate)
            eval_dataloader=DataLoader(eval_dataset,batch_size=current_batch_size,shuffle=False,collate_fn=collate_fn_use)
    except Exception as e:
        print(f"ERROR: Setup FAILED for {model_name_or_path}: {e}"); clear_gpu_memory()
        return accuracy, bertscore_f1, avg_wups_score, avg_token_f1, category_summary

    predictions=[]; ground_truths=[]; image_paths_eval=[]; questions_eval=[]
    print("Starting inference loop...")
    with torch.no_grad():
        for batch_idx,batch in enumerate(tqdm(eval_dataloader,desc=f"Inferring with {model_type}")):
            inputs=batch['encoding']; raw_ans=batch['raw_answer']; raw_qs=batch['raw_question']; img_paths=batch['image_path']
            try: inputs={k:v.to(device) for k,v in inputs.items() if isinstance(v,torch.Tensor)}
            except Exception as e: print(f"Error moving batch {batch_idx} to device: {e}"); continue
            try:
                pred_ans_texts = []
                if model_type.startswith("blip2"):
                    gen_ids=model.generate(pixel_values=inputs.get('pixel_values'),input_ids=inputs.get('input_ids'),attention_mask=inputs.get('attention_mask'),max_new_tokens=20)
                    pred_ans_texts=full_processor.batch_decode(gen_ids,skip_special_tokens=True)
                elif model_type=="blip" or model_type=="git" or model_type=="ofa":
                    gen_inputs={};
                    if 'input_ids' in inputs: gen_inputs['input_ids']=inputs['input_ids']
                    if 'attention_mask' in inputs: gen_inputs['attention_mask']=inputs['attention_mask']
                    if 'pixel_values' in inputs: gen_inputs['pixel_values']=inputs['pixel_values']
                    if 'patch_images' in inputs: gen_inputs['patch_images']=inputs['patch_images']
                    if 'patch_masks' in inputs: gen_inputs['patch_masks']=inputs['patch_masks']
                    if not gen_inputs or ('input_ids' not in gen_inputs and model_type != "ofa"): raise ValueError(f"Insufficient inputs for generate in {model_type}. Got: {inputs.keys()}")
                    
                    gen_ids=model.generate(**gen_inputs,max_new_tokens=20)
                    decoder=full_processor if model_type in ["git","ofa"] else tokenizer
                    pred_ans_texts=decoder.batch_decode(gen_ids,skip_special_tokens=True)
                elif model_type=="vilt":
                    outputs=model(**inputs); pred_ids=outputs.logits.argmax(-1)
                    pred_ans_texts=tokenizer.batch_decode(pred_ids,skip_special_tokens=True)
                else: print(f"Warn: Unknown inference for {model_type}"); pred_ans_texts=["<NO_LOGIC>"]*len(raw_ans)
                
                predicted_answers=[text.strip() for text in pred_ans_texts]

            except Exception as e:
                print(f"Error inference batch {batch_idx} ({model_type}): {e}")
                predicted_answers=["<ERROR>"]*len(raw_ans)
                if "CUDA out of memory" in str(e): print("OOM. Stopping."); break
            predictions.extend(predicted_answers); ground_truths.extend(raw_ans); image_paths_eval.extend(img_paths); questions_eval.extend(raw_qs)

    print("Calculating metrics...")
    valid_indices=[i for i,p in enumerate(predictions) if p not in ['<ERROR>','<NO_LOGIC>']]
    if not valid_indices: print("No valid predictions."); clear_gpu_memory(); return accuracy,bertscore_f1,avg_wups_score,avg_token_f1,category_summary

    final_preds=[preprocess_answer_for_metric(predictions[i]) for i in valid_indices]
    final_trues=[preprocess_answer_for_metric(ground_truths[i]) for i in valid_indices]
    final_qs=[questions_eval[i] for i in valid_indices]

    correct=sum(1 for p,t in zip(final_preds,final_trues) if p==t); accuracy=(correct/len(valid_indices))*100
    print(f"Exact Match Accuracy: {accuracy:.2f}% (on {len(valid_indices)} valid predictions)")

    try:
        cats=[categorize_question(q) for q in final_qs]
        df=pd.DataFrame({'cat':cats,'correct':[p==t for p,t in zip(final_preds,final_trues)]})
        cat_acc=df.groupby('cat')['correct'].mean()*100; cat_counts=df['cat'].value_counts()
        category_summary=pd.concat([cat_acc,cat_counts],axis=1).fillna(0); category_summary.columns=['Acc (%)','Count']
        print("\n--- Accuracy Per Category ---"); print(category_summary.round(2))
    except Exception as e: print(f"Cat acc error: {e}")

    try:
        bs_metric=evaluate.load("bertscore");
        if final_preds and final_trues:
            bs=bs_metric.compute(predictions=final_preds,references=final_trues,lang="en",model_type="distilbert-base-uncased",device=device,verbose=False)
            bertscore_f1=np.mean(bs['f1'])*100; print(f"\nAverage BERTScore F1: {bertscore_f1:.2f}%")
        else: print("Skipping BERTScore.")
    except Exception as e: print(f"BERTScore error: {e}")

    wups_scores = []
    for pred, true_ans in zip(final_preds, final_trues):
        if pred == true_ans: wups_scores.append(1.0)
        else:
            dist = Levenshtein.distance(pred, true_ans)
            max_l = max(len(pred), len(true_ans), 1)
            lev_sim = 1 - (dist / max_l)
            if lev_sim >= LEVENSHTEIN_SIMILARITY_THRESHOLD: wups_scores.append(0.9)
            else:
                 wups_sim = get_wup_similarity(pred.split(" ")[0], true_ans.split(" ")[0], WUPS_SIMILARITY_THRESHOLD)
                 wups_scores.append(wups_sim if wups_sim >= WUPS_SIMILARITY_THRESHOLD else 0.0)

    if wups_scores: avg_wups_score = np.mean(wups_scores) * 100; print(f"Average Simplified VQA (WUPS-style) Score: {avg_wups_score:.2f}%")

    token_f1_scores = []
    for pred, true_ans in zip(final_preds, final_trues):
        pred_tokens = set(pred.split())
        true_tokens = set(true_ans.split())
        if not pred_tokens and not true_tokens: token_f1_scores.append(1.0); continue
        if not pred_tokens or not true_tokens: token_f1_scores.append(0.0); continue
        
        common_tokens = len(pred_tokens.intersection(true_tokens))
        precision = common_tokens / len(pred_tokens) if len(pred_tokens) > 0 else 0
        recall = common_tokens / len(true_tokens) if len(true_tokens) > 0 else 0
        f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        token_f1_scores.append(f1)
    if token_f1_scores: avg_token_f1 = np.mean(token_f1_scores) * 100; print(f"Average Token-Overlap F1 Score: {avg_token_f1:.2f}%")
    print("---------------------------")

    print("\nExample Preds:");
    for i in range(min(5,len(valid_indices))):
         idx=valid_indices[i]; print(f"  Img: {image_paths_eval[idx]}, Q: {questions_eval[idx]}\n  Pred: '{predictions[idx]}' | True: '{ground_truths[idx]}' | Correct (Exact): {final_preds[i]==final_trues[i]}")

    print(f"--- Eval Complete: {model_name_or_path} ---")
    del model,full_processor,tokenizer,image_processor_hf,eval_dataset,eval_dataloader,inputs
    if 'outputs' in locals(): del outputs;
    if 'generated_ids' in locals(): del generated_ids
    clear_gpu_memory()
    return accuracy,bertscore_f1,avg_wups_score,avg_token_f1,category_summary

In [7]:
baseline_results_summary = {}
if test_hf_dataset is not None and len(test_hf_dataset) > 0:
    for model_path, proc_path, model_tag, txt_max_len, img_size_cfg in MODELS_TO_EVALUATE:
        print(f"\n{'='*20} Starting: {model_path} {'='*20}")
        acc, bs_f1, wups, tok_f1, cat_sum = evaluate_model(
            model_path, proc_path, test_hf_dataset, IMAGE_ROOT_DIR, model_type=model_tag,
            text_max_len_override=txt_max_len, image_target_size_config=img_size_cfg
        )
        baseline_results_summary[model_path] = {'Accuracy': acc, 'BERTScore_F1': bs_f1, 'WUPS_Score': wups, 'Token_F1': tok_f1, 'CategoryAcc_DF': cat_sum}
        print(f"{'='*20} Finished: {model_path} {'='*20}")

    if BLIP2_MODEL_TO_EVALUATE and BLIP2_MODEL_TO_EVALUATE.get("name"):
        print(f"\n{'='*20} Starting BLIP-2: {BLIP2_MODEL_TO_EVALUATE['name']} {'='*20}")
        acc, bs_f1, wups, tok_f1, cat_sum = evaluate_model(
            BLIP2_MODEL_TO_EVALUATE["name"], BLIP2_MODEL_TO_EVALUATE["processor"], test_hf_dataset, IMAGE_ROOT_DIR,
            model_type=BLIP2_MODEL_TO_EVALUATE["type"], text_max_len_override=BLIP2_MODEL_TO_EVALUATE["text_max_len"],
            image_target_size_config=BLIP2_MODEL_TO_EVALUATE.get("image_size"), batch_size_override=1
        )
        baseline_results_summary[BLIP2_MODEL_TO_EVALUATE["name"]] = {'Accuracy': acc, 'BERTScore_F1': bs_f1, 'WUPS_Score': wups, 'Token_F1': tok_f1, 'CategoryAcc_DF': cat_sum}
        print(f"{'='*20} Finished BLIP-2: {BLIP2_MODEL_TO_EVALUATE['name']} {'='*20}")
else: print("Test dataset not loaded/empty.")



--- Evaluating: dandelin/vilt-b32-finetuned-vqa (Type: vilt) ---
Using Batch Size: 8
Loading processor: dandelin/vilt-b32-finetuned-vqa


preprocessor_config.json:   0%|          | 0.00/251 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/320 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/136k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Loading model: dandelin/vilt-b32-finetuned-vqa


pytorch_model.bin:   0%|          | 0.00/470M [00:00<?, ?B/s]

Model, processor loaded. GPU cache cleared.
Final text_max_length for vilt: 40
Using CustomCollate Dataset for vilt.
Custom collate target image size: (384, 384)
Starting inference loop...


Inferring with vilt:   0%|          | 0/1185 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/470M [00:00<?, ?B/s]

Calculating metrics...
Exact Match Accuracy: 0.00% (on 9478 valid predictions)

--- Accuracy Per Category ---
                   Acc (%)  Count
cat                              
Brand                  0.0    806
Color                  0.0    848
Count                  0.0    474
Material               0.0   1307
Other                  0.0   3398
Product Type           0.0    781
Style                  0.0    751
Weight/Dimensions      0.0    128
Yes/No                 0.0    985


Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]




Average BERTScore F1: 64.83%
Average Simplified VQA (WUPS-style) Score: 0.00%
Average Token-Overlap F1 Score: 0.00%
---------------------------

Example Preds:
  Img: 06/063d633d.jpg, Q: What color is the filament?
  Pred: '[unused5]' | True: 'Red' | Correct (Exact): False
  Img: 03/03d10684.jpg, Q: What is the product type?
  Pred: '[unused291]' | True: 'Boot' | Correct (Exact): False
  Img: 06/06b09ad8.jpg, Q: What is the material?
  Pred: '~' | True: 'Synthetic' | Correct (Exact): False
  Img: 1a/1a377ff8.jpg, Q: What color are the frames?
  Pred: '[unused58]' | True: 'Brown' | Correct (Exact): False
  Img: 9f/9f010c07.jpg, Q: What color is the case?
  Pred: '[unused41]' | True: 'Pink' | Correct (Exact): False
--- Eval Complete: dandelin/vilt-b32-finetuned-vqa ---
Attempting to clear GPU memory...
GPU memory clear attempt complete.


--- Evaluating: Salesforce/blip-vqa-base (Type: blip) ---
Using Batch Size: 8
Loading processor: Salesforce/blip-vqa-base


preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

INFO: Setting padding_side='left' for blip tokenizer.
Loading model: Salesforce/blip-vqa-base


config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

Model, processor loaded. GPU cache cleared.
Final text_max_length for blip: 512
Using CustomCollate Dataset for blip.
Custom collate target image size: (384, 384)
Starting inference loop...


Inferring with blip:   0%|          | 0/1185 [00:00<?, ?it/s]

Calculating metrics...
Exact Match Accuracy: 15.51% (on 9478 valid predictions)

--- Accuracy Per Category ---
                   Acc (%)  Count
cat                              
Brand                 0.12    806
Color                31.49    848
Count                 3.16    474
Material             11.55   1307
Other                 8.00   3398
Product Type          0.26    781
Style                 0.67    751
Weight/Dimensions     0.78    128
Yes/No               76.75    985

Average BERTScore F1: 75.60%
Average Simplified VQA (WUPS-style) Score: 18.75%
Average Token-Overlap F1 Score: 16.10%
---------------------------

Example Preds:
  Img: 06/063d633d.jpg, Q: What color is the filament?
  Pred: 'red' | True: 'Red' | Correct (Exact): True
  Img: 03/03d10684.jpg, Q: What is the product type?
  Pred: 'no' | True: 'Boot' | Correct (Exact): False
  Img: 06/06b09ad8.jpg, Q: What is the material?
  Pred: 'man ' s boots are leather' | True: 'Synthetic' | Correct (Exact): False
  Img: 1a

preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/524 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

INFO: Setting padding_side='left' for blip tokenizer.
Loading model: Salesforce/blip-vqa-capfilt-large


config.json:   0%|          | 0.00/4.59k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

Model, processor loaded. GPU cache cleared.
Final text_max_length for blip: 512
Using CustomCollate Dataset for blip.
Custom collate target image size: (384, 384)
Starting inference loop...


Inferring with blip:   0%|          | 0/1185 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

Calculating metrics...
Exact Match Accuracy: 15.51% (on 9478 valid predictions)

--- Accuracy Per Category ---
                   Acc (%)  Count
cat                              
Brand                 0.12    806
Color                31.49    848
Count                 3.16    474
Material             11.55   1307
Other                 8.00   3398
Product Type          0.26    781
Style                 0.67    751
Weight/Dimensions     0.78    128
Yes/No               76.75    985

Average BERTScore F1: 75.60%
Average Simplified VQA (WUPS-style) Score: 18.75%
Average Token-Overlap F1 Score: 16.10%
---------------------------

Example Preds:
  Img: 06/063d633d.jpg, Q: What color is the filament?
  Pred: 'red' | True: 'Red' | Correct (Exact): True
  Img: 03/03d10684.jpg, Q: What is the product type?
  Pred: 'no' | True: 'Boot' | Correct (Exact): False
  Img: 06/06b09ad8.jpg, Q: What is the material?
  Pred: 'man ' s boots are leather' | True: 'Synthetic' | Correct (Exact): False
  Img: 1a

preprocessor_config.json:   0%|          | 0.00/503 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/453 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

INFO: Setting padding_side='left' for git tokenizer.
Loading model: microsoft/git-base-vqav2


config.json:   0%|          | 0.00/2.82k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/709M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

Model, processor loaded. GPU cache cleared.
Final text_max_length for git: 512
Using CustomCollate Dataset for git.
Custom collate target image size: (480, 480)
Starting inference loop...


Inferring with git:   0%|          | 0/1185 [00:00<?, ?it/s]

Calculating metrics...
Exact Match Accuracy: 0.00% (on 9478 valid predictions)

--- Accuracy Per Category ---
                   Acc (%)  Count
cat                              
Brand                  0.0    806
Color                  0.0    848
Count                  0.0    474
Material               0.0   1307
Other                  0.0   3398
Product Type           0.0    781
Style                  0.0    751
Weight/Dimensions      0.0    128
Yes/No                 0.0    985

Average BERTScore F1: 67.09%
Average Simplified VQA (WUPS-style) Score: 0.00%
Average Token-Overlap F1 Score: 3.36%
---------------------------

Example Preds:
  Img: 06/063d633d.jpg, Q: What color is the filament?
  Pred: 'what color is the filament?' | True: 'Red' | Correct (Exact): False
  Img: 03/03d10684.jpg, Q: What is the product type?
  Pred: 'what is the product type?' | True: 'Boot' | Correct (Exact): False
  Img: 06/06b09ad8.jpg, Q: What is the material?
  Pred: 'what is the material?' | True: 'Synt

preprocessor_config.json:   0%|          | 0.00/503 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/453 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

INFO: Setting padding_side='left' for git tokenizer.
Loading model: microsoft/git-large-vqav2


config.json:   0%|          | 0.00/2.82k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

Model, processor loaded. GPU cache cleared.
Final text_max_length for git: 512
Using CustomCollate Dataset for git.
Custom collate target image size: (420, 420)
Starting inference loop...


Inferring with git:   0%|          | 0/1185 [00:00<?, ?it/s]

Calculating metrics...
Exact Match Accuracy: 0.00% (on 9478 valid predictions)

--- Accuracy Per Category ---
                   Acc (%)  Count
cat                              
Brand                  0.0    806
Color                  0.0    848
Count                  0.0    474
Material               0.0   1307
Other                  0.0   3398
Product Type           0.0    781
Style                  0.0    751
Weight/Dimensions      0.0    128
Yes/No                 0.0    985

Average BERTScore F1: 66.17%
Average Simplified VQA (WUPS-style) Score: 0.00%
Average Token-Overlap F1 Score: 1.51%
---------------------------

Example Preds:
  Img: 06/063d633d.jpg, Q: What color is the filament?
  Pred: 'what color is the filament? red' | True: 'Red' | Correct (Exact): False
  Img: 03/03d10684.jpg, Q: What is the product type?
  Pred: 'what is the product type?' | True: 'Boot' | Correct (Exact): False
  Img: 06/06b09ad8.jpg, Q: What is the material?
  Pred: 'what is the material?' | True: '

preprocessor_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/882 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

INFO: Setting padding_side='left' for blip2 tokenizer.
Loading model: Salesforce/blip2-opt-2.7b


config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/122k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

Model, processor loaded. GPU cache cleared.
Final text_max_length for blip2: 128
Using OriginalProcessor Dataset for blip2.
Starting inference loop...


Inferring with blip2:   0%|          | 0/9478 [00:00<?, ?it/s]

Calculating metrics...
Exact Match Accuracy: 0.00% (on 9478 valid predictions)

--- Accuracy Per Category ---
                   Acc (%)  Count
cat                              
Brand                  0.0    806
Color                  0.0    848
Count                  0.0    474
Material               0.0   1307
Other                  0.0   3398
Product Type           0.0    781
Style                  0.0    751
Weight/Dimensions      0.0    128
Yes/No                 0.0    985

Average BERTScore F1: 65.51%
Average Simplified VQA (WUPS-style) Score: 0.00%
Average Token-Overlap F1 Score: 0.33%
---------------------------

Example Preds:
  Img: 06/063d633d.jpg, Q: What color is the filament?
  Pred: 'What color is the filament?' | True: 'Red' | Correct (Exact): False
  Img: 03/03d10684.jpg, Q: What is the product type?
  Pred: 'What is the product type?' | True: 'Boot' | Correct (Exact): False
  Img: 06/06b09ad8.jpg, Q: What is the material?
  Pred: 'What is the material?' | True: 'Synt

In [8]:
print("\n\n--- Overall Baseline Evaluation Summary ---")
if baseline_results_summary:
    summary_list = []
    for model_name, metrics in baseline_results_summary.items():
        summary_list.append({
            'Model': model_name,
            'Accuracy': metrics.get('Accuracy', 0.0),
            'BERTScore_F1': metrics.get('BERTScore_F1', 0.0),
            'WUPS_Score': metrics.get('WUPS_Score', 0.0),
            'Token_F1': metrics.get('Token_F1', 0.0)
        })
    summary_df_display = pd.DataFrame(summary_list)
    if not summary_df_display.empty:
        print(summary_df_display.set_index('Model').round(2))
    else: print("No summary data.")

    for model_name, results in baseline_results_summary.items():
        print(f"\n--- Category Accuracy for: {model_name} ---")
        cat_df = results.get('CategoryAcc_DF')
        if isinstance(cat_df, pd.DataFrame) and not cat_df.empty: print(cat_df.round(2))
        else: print("  (No category accuracy calculated/available)")
else: print("No evaluation results.")



--- Overall Baseline Evaluation Summary ---
                                   Accuracy  BERTScore_F1  WUPS_Score  \
Model                                                                   
dandelin/vilt-b32-finetuned-vqa        0.00         64.83        0.00   
Salesforce/blip-vqa-base              15.51         75.60       18.75   
Salesforce/blip-vqa-capfilt-large     15.51         75.60       18.75   
microsoft/git-base-vqav2               0.00         67.09        0.00   
microsoft/git-large-vqav2              0.00         66.17        0.00   
Salesforce/blip2-opt-2.7b              0.00         65.51        0.00   

                                   Token_F1  
Model                                        
dandelin/vilt-b32-finetuned-vqa        0.00  
Salesforce/blip-vqa-base              16.10  
Salesforce/blip-vqa-capfilt-large     16.10  
microsoft/git-base-vqav2               3.36  
microsoft/git-large-vqav2              1.51  
Salesforce/blip2-opt-2.7b              0.33  

