In [3]:
!pip install -q --no-deps bert-score

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h

**Imports and Configurations**

In [4]:
import os
import pandas as pd
from PIL import Image
from tqdm.auto import tqdm

import torch
from transformers import BlipProcessor, BlipForQuestionAnswering
from accelerate import init_empty_weights, infer_auto_device_map

from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

from sklearn.metrics import accuracy_score,f1_score

from bert_score import score as bert_score
import numpy as np

import re
import nltk
from nltk.corpus import wordnet as wn
from torch.utils.data import IterableDataset

# Download required NLTK resources
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

MODEL_NAME="Salesforce/blip-vqa-base"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(DEVICE)

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


cuda


**Load Dataset**

In [5]:
SUBSET_SIZE = 5000
DATA_CSV = "/kaggle/input/vrproject2dataset/vqa_dataset.csv"  # expected columns: ["path", "question", "answer"]
BATCH_SIZE = 8

df = pd.read_csv(DATA_CSV)

test_df = df.sample(n=SUBSET_SIZE, random_state=42).reset_index(drop=True)

**Initialize Processor and Model**

In [6]:
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained(MODEL_NAME).to("cuda")

model.eval()


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

BlipForQuestionAnswering(
  (vision_model): BlipVisionModel(
    (embeddings): BlipVisionEmbeddings(
      (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (encoder): BlipEncoder(
      (layers): ModuleList(
        (0-11): 12 x BlipEncoderLayer(
          (self_attn): BlipAttention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=768, out_features=2304, bias=True)
            (projection): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): BlipMLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((768,), eps=1e-05, e

**Helper: Number Word to Digit Map**

In [7]:
number_map = {
    'zero': '0', 'one': '1', 'two': '2', 'three': '3', 'four': '4',
    'five': '5', 'six': '6', 'seven': '7', 'eight': '8', 'nine': '9', 'ten': '10'
}

**WUPS Calculation Function**

In [8]:
# Map NLTK POS tags to WordNet POS categories for WUP similarity
def get_wordnet_pos(word):
    # Get POS tag for the word using NLTK
    tag = nltk.pos_tag([word])[0][1][0].upper()
    # Define mapping from NLTK tags to WordNet POS categories
    tag_dict = {"J": wn.ADJ, "N": wn.NOUN, "V": wn.VERB, "R": wn.ADV}
    # Return WordNet POS or default to NOUN if tag is unmapped
    return tag_dict.get(tag, wn.NOUN)

# Calculate Wu-Palmer (WUP) similarity for lexical similarity between predictions and references
def calculate_wup_score(preds, refs):
    # Define helper function to compute WUP similarity for a single pred-ref pair
    def wup_sim(pred, ref):
        # Clean and tokenize prediction and reference text
        pred_tokens = re.sub(r'[^\w\s]', '', pred.lower()).split()
        ref_tokens = re.sub(r'[^\w\s]', '', ref.lower()).split()
        # Return 0 if either token list is empty
        if not pred_tokens or not ref_tokens:
            return 0.0
        # Initialize list to store maximum similarities for prediction tokens
        max_similarities = []
        # Iterate over each prediction token
        for p_token in pred_tokens:
            token_max_sim = 0.0
            # Get synsets for prediction token with POS or fallback to all synsets
            p_synsets = wn.synsets(p_token, pos=get_wordnet_pos(p_token)) or wn.synsets(p_token)
            if not p_synsets:
                continue
            # Iterate over each reference token
            for r_token in ref_tokens:
                # Get synsets for reference token with POS or fallback
                r_synsets = wn.synsets(r_token, pos=get_wordnet_pos(r_token)) or wn.synsets(r_token)
                if not r_synsets:
                    continue
                # Compute WUP similarity for all synset pairs
                token_sims = [wn.wup_similarity(p_syn, r_syn) or 0.0 for p_syn in p_synsets for r_syn in r_synsets]
                if token_sims:
                    # Store maximum similarity for this token pair
                    token_max_sim = max(token_sims)
            if token_max_sim > 0:
                # Store non-zero maximum similarity
                max_similarities.append(token_max_sim)
        # Return average similarity or 0 if no valid similarities
        return sum(max_similarities) / len(max_similarities) if max_similarities else 0.0
    # Compute WUP similarity for all pred-ref pairs
    wup_scores = [wup_sim(p, r) for p, r in zip(preds, refs)]
    # Return average WUP score across all pairs
    return sum(wup_scores) / len(wup_scores) if wup_scores else 0.0

**Inference**

In [9]:
preds = [] # Store model predictions
refs  = [] # Store ground-truth answers

BASE_DIR = "/kaggle/input/vrproject2/abo-images-small/images/small"

for i in tqdm(range(0, len(test_df), BATCH_SIZE), desc="Evaluating"):
    batch = test_df.iloc[i : i + BATCH_SIZE]
    images = [
        Image.open(os.path.join(BASE_DIR, fname)).convert("RGB")
        for fname in batch["path"]
    ]
    questions = batch["question"].tolist()

    # Tokenize / preprocess
    inputs = processor(
        images=images,
        text=questions,
        return_tensors="pt",
        padding="max_length",
        truncation=True
    )

    # Move everything to GPU
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

    # 3) Generate exactly one token
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=1,
            num_beams=5,
            early_stopping=True
        )

    # Decode full answers
    raw_answers = processor.batch_decode(generated_ids, skip_special_tokens=True)
    cleaned_preds = [ans.strip().lower() for ans in raw_answers]

    # Coerce answers to str before strip/lower
    cleaned_refs = [str(r).strip().lower() for r in batch["answer"]]

    cleaned_refs = [number_map.get(r, r) for r in cleaned_refs]

    print(i, cleaned_preds, cleaned_refs)
    
    res = bert_score(cleaned_preds, cleaned_refs, model_type="distilbert-base-uncased")
    
    print(res[0])

    preds.extend(cleaned_preds)
    refs.extend(cleaned_refs)


Evaluating:   0%|          | 0/625 [00:00<?, ?it/s]

0 ['it', 'yes', 'oval', 'yes', 'oval', 'no', 'yes', 'leather'] ['brown', 'camera', 'round', 'yes', 'round', 'face', 'lion', 'leather']


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tensor([0.7537, 0.6218, 0.7800, 1.0000, 0.7800, 0.7487, 0.6485, 1.0000])
8 ['slightly', 'no', 'no', 'no', 'no', 'metal', 'no', 'yes'] ['horizontal', 'triangles', 'landscape', 'yellow', 'suitcase', 'gold', 'green', 'yes']
tensor([0.7880, 0.7532, 0.7263, 0.7730, 0.7723, 0.8080, 0.7623, 1.0000])
16 ['black', 'yes', 'yes', 'yes', 'white', 'woman', 'no', 'yes'] ['black', 'yes', 'car', 'yes', 'white', 'leggings', 'vertical', 'waves']
tensor([1.0000, 1.0000, 0.6421, 1.0000, 1.0000, 0.6343, 0.7384, 0.6652])
24 ['black', 'black', 'yes', 'yes', 'yes', 'yes', '10', 'no'] ['black', 'white', 'yes', 'screen', 'yes', 'no', '10', 'yes']
tensor([1.0000, 0.8870, 1.0000, 0.6516, 1.0000, 0.8813, 1.0000, 0.8813])
32 ['brown', 'no', 'yes', 'leather', 'yes', 'yes', 'rec', 'yes'] ['taupe', 'lbs', 'headphones', 'suede', 'yes', 'yes', 'rectangular', 'necklace']
tensor([0.6507, 0.7407, 0.6277, 0.7538, 1.0000, 1.0000, 0.7819, 0.6663])
40 ['no', 'oval', 'glass', 'oval', 'yes', 'yes', 'yes', 'phone'] ['leaves', 'he

**Evaluation Metrics**

**String Match and F1**

In [10]:
acc = accuracy_score(refs, preds) #String Match
f1  = f1_score(refs, preds, average="macro")

print(f"Baseline Evaluation Results on {len(test_df)} examples:")
print(f"  • Accuracy: {acc * 100:.2f}%")
print(f"  • Macro F1  : {f1  * 100:.2f}%")

Baseline Evaluation Results on 5000 examples:
  • Accuracy: 33.64%
  • Macro F1  : 3.05%


**BERT Score and WUPS**

In [11]:
P, R, F1 = bert_score(preds, refs, lang='en', rescale_with_baseline=True, device=DEVICE) #89
print(P.mean(), R.mean(), F1.mean())
# Take the mean over all examples
avg_precision = P.mean().item()
avg_recall    = R.mean().item()
avg_f1        = F1.mean().item()

print(f"  • BERTScore Precision: {avg_precision*100:.2f}%")
print(f"  • BERTScore Recall   : {avg_recall*100:.2f}%")
print(f"  • BERTScore F1       : {avg_f1*100:.2f}%")

wup_score = calculate_wup_score(preds, refs)
print(f"WUP Score: {wup_score*100:.2f}%")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor(0.9171) tensor(0.8845) tensor(0.8992)
  • BERTScore Precision: 91.71%
  • BERTScore Recall   : 88.45%
  • BERTScore F1       : 89.92%
WUP Score: 59.67%


**Save detailed results**

In [12]:
OUTPUT_CSV = "results_subset.csv"

results_df = pd.DataFrame({
    "path"      : test_df["path"],
    "question"  : test_df["question"],
    "answer"    : refs,
    "prediction": preds
})

results_df.to_csv(OUTPUT_CSV, index=False)
print(f"\nSaved detailed results to {OUTPUT_CSV}")


Saved detailed results to results_subset.csv
