In [10]:

# !pip install transformers torch pandas pillow tqdm scikit-learn bert-score

IMPORTS

In [24]:
import os
import pandas as pd
from PIL import Image
from tqdm.auto import tqdm
import torch
from transformers import ViltProcessor, ViltForQuestionAnswering
from sklearn.metrics import accuracy_score, f1_score
from bert_score import score as bert_score
import re
from nltk.corpus import wordnet as wn
import nltk

# Download required NLTK resources
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

**Initialize Processor and Model**

In [12]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_NAME = "dandelin/vilt-b32-finetuned-vqa"
processor = ViltProcessor.from_pretrained(MODEL_NAME)
model = ViltForQuestionAnswering.from_pretrained(MODEL_NAME).to(DEVICE)
model.eval()

ViltForQuestionAnswering(
  (vilt): ViltModel(
    (embeddings): ViltEmbeddings(
      (text_embeddings): TextEmbeddings(
        (word_embeddings): Embedding(30522, 768)
        (position_embeddings): Embedding(40, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (patch_embeddings): ViltPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32))
      )
      (token_type_embeddings): Embedding(2, 768)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViltEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViltLayer(
          (attention): ViltAttention(
            (attention): ViltSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=76

**Load Dataset**

In [13]:
SUBSET_SIZE = 5000
DATA_CSV = "/kaggle/input/vrproject2dataset/vqa_dataset.csv"
BATCH_SIZE = 8
BASE_DIR = "/kaggle/input/vrproject2/abo-images-small/images/small" 

df = pd.read_csv(DATA_CSV)
test_df = df.sample(n=SUBSET_SIZE, random_state=42).reset_index(drop=True)

**Helper: Number Word to Digit Map**

In [14]:
number_map = {
    'zero': '0', 'one': '1', 'two': '2', 'three': '3', 'four': '4',
    'five': '5', 'six': '6', 'seven': '7', 'eight': '8', 'nine': '9', 'ten': '10'
}

preds = []
refs = []

**WUPS Calculation Function**

In [15]:
# Map NLTK POS tags to WordNet POS categories for WUP similarity
def get_wordnet_pos(word):
    # Get POS tag for the word using NLTK
    tag = nltk.pos_tag([word])[0][1][0].upper()
    # Define mapping from NLTK tags to WordNet POS categories
    tag_dict = {"J": wn.ADJ, "N": wn.NOUN, "V": wn.VERB, "R": wn.ADV}
    # Return WordNet POS or default to NOUN if tag is unmapped
    return tag_dict.get(tag, wn.NOUN)

# Calculate Wu-Palmer (WUP) similarity for lexical similarity between predictions and references
def calculate_wup_score(preds, refs):
    # Define helper function to compute WUP similarity for a single pred-ref pair
    def wup_sim(pred, ref):
        # Clean and tokenize prediction and reference text
        pred_tokens = re.sub(r'[^\w\s]', '', pred.lower()).split()
        ref_tokens = re.sub(r'[^\w\s]', '', ref.lower()).split()
        # Return 0 if either token list is empty
        if not pred_tokens or not ref_tokens:
            return 0.0
        # Initialize list to store maximum similarities for prediction tokens
        max_similarities = []
        # Iterate over each prediction token
        for p_token in pred_tokens:
            token_max_sim = 0.0
            # Get synsets for prediction token with POS or fallback to all synsets
            p_synsets = wn.synsets(p_token, pos=get_wordnet_pos(p_token)) or wn.synsets(p_token)
            if not p_synsets:
                continue
            # Iterate over each reference token
            for r_token in ref_tokens:
                # Get synsets for reference token with POS or fallback
                r_synsets = wn.synsets(r_token, pos=get_wordnet_pos(r_token)) or wn.synsets(r_token)
                if not r_synsets:
                    continue
                # Compute WUP similarity for all synset pairs
                token_sims = [wn.wup_similarity(p_syn, r_syn) or 0.0 for p_syn in p_synsets for r_syn in r_synsets]
                if token_sims:
                    # Store maximum similarity for this token pair
                    token_max_sim = max(token_sims)
            if token_max_sim > 0:
                # Store non-zero maximum similarity
                max_similarities.append(token_max_sim)
        # Return average similarity or 0 if no valid similarities
        return sum(max_similarities) / len(max_similarities) if max_similarities else 0.0
    # Compute WUP similarity for all pred-ref pairs
    wup_scores = [wup_sim(p, r) for p, r in zip(preds, refs)]
    # Return average WUP score across all pairs
    return sum(wup_scores) / len(wup_scores) if wup_scores else 0.0

**Inference**

In [16]:
for i in tqdm(range(0, len(test_df), BATCH_SIZE), desc="Evaluating"):
    batch = test_df.iloc[i:i + BATCH_SIZE]
    images = []
    for fname in batch["path"]:
        img_path = os.path.join(BASE_DIR, fname)
        if os.path.exists(img_path):
            images.append(Image.open(img_path).convert("RGB"))
        else:
            print(f"Warning: Image not found at {img_path}")
            images.append(Image.new('RGB', (224, 224)))  # Placeholder for missing images

    # Use raw questions
    questions = batch["question"].tolist()

    # Preprocess inputs
    encoding = processor(images, questions, padding=True, truncation=True, return_tensors="pt").to(DEVICE)

    # Generate answers
    with torch.no_grad():
        outputs = model(**encoding)
        logits = outputs.logits
        idx = logits.argmax(-1).tolist()
        answers = [model.config.id2label[id] for id in idx]

    # Clean predictions: strip and lowercase
    cleaned_preds = [ans.strip().lower() for ans in answers]
    # Convert number words to digits
    cleaned_preds = [number_map.get(p, p) for p in cleaned_preds]

    # Prepare references
    cleaned_refs = [str(r).strip().lower() for r in batch["answer"]]
    cleaned_refs = [number_map.get(r, r) for r in cleaned_refs]

    # Debug first batch
    if i == 0:
        print("Questions:", questions)
        print("Raw answers:", answers)
        print("Cleaned predictions:", cleaned_preds)
        print("References:", cleaned_refs)

    preds.extend(cleaned_preds)
    refs.extend(cleaned_refs)

Evaluating:   0%|          | 0/625 [00:00<?, ?it/s]

Questions: ["What is the dresser's color?", 'What is the girl holding?', 'What is the shape?', 'Is this phonecase colorful?', 'What shape are the gems?', 'What pattern is displayed?', 'What animal is printed on the phone case?', 'What type of material is it?']
Raw answers: ['brown', 'camera', 'triangle', 'yes', 'heart', 'flowers', 'dog', 'leather']
Cleaned predictions: ['brown', 'camera', 'triangle', 'yes', 'heart', 'flowers', 'dog', 'leather']
References: ['brown', 'camera', 'round', 'yes', 'round', 'face', 'lion', 'leather']


**String Match and F1**

In [17]:
acc = accuracy_score(refs, preds) #String Match
f1  = f1_score(refs, preds, average="macro")

print(f"Baseline Evaluation Results on {len(test_df)} examples:")
print(f"  • Accuracy: {acc * 100:.2f}%")
print(f"  • Macro F1  : {f1  * 100:.2f}%")

Baseline Evaluation Results on 5000 examples:
  • Accuracy: 39.08%
  • Macro F1  : 7.79%


**BERT Score and WUPS**

In [25]:
P, R, F1 = bert_score(preds, refs, lang='en', rescale_with_baseline=True, device=DEVICE) #89
print(P.mean(), R.mean(), F1.mean())
# Take the mean over all examples
avg_precision = P.mean().item()
avg_recall    = R.mean().item()
avg_f1        = F1.mean().item()

print(f"  • BERTScore Precision: {avg_precision*100:.2f}%")
print(f"  • BERTScore Recall   : {avg_recall*100:.2f}%")
print(f"  • BERTScore F1       : {avg_f1*100:.2f}%")

wup_score = calculate_wup_score(preds, refs)
print(f"WUP Score: {wup_score*100:.2f}%")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor(0.8758) tensor(0.8664) tensor(0.8689)
  • BERTScore Precision: 87.58%
  • BERTScore Recall   : 86.64%
  • BERTScore F1       : 86.89%
WUP Score: 69.48%


**Save detailed results**

In [None]:
OUTPUT_CSV = "results_subset.csv"

results_df = pd.DataFrame({
    "path"      : test_df["path"],
    "question"  : test_df["question"],
    "answer"    : refs,
    "prediction": preds
})

results_df.to_csv(OUTPUT_CSV, index=False)
print(f"\nSaved detailed results to {OUTPUT_CSV}")