In [2]:
!pip install transformers bert-score torch torchvision --quiet


import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from transformers import ViltProcessor, ViltForQuestionAnswering
from PIL import Image
import torch
from bert_score import score as bert_score
from tqdm import tqdm
import re


# === Helper function for text normalization ===
def normalize_text(text):
    """Canonicalise a value so two answers can be compared by equality.

    Non-string input is first converted with ``str()``. The text is then
    lower-cased, every character that is neither a word character nor
    whitespace is removed, whitespace runs are collapsed to one space and
    leading/trailing whitespace is dropped.
    """
    raw = text if isinstance(text, str) else str(text)
    # Drop punctuation after lower-casing; underscores count as word chars.
    no_punct = re.sub(r'[^\w\s]', '', raw.lower())
    return re.sub(r'\s+', ' ', no_punct).strip()

# === Load CSV ===
# Source table; later steps read its "full_path", "question" and
# "answer" columns.
df = pd.read_csv('/kaggle/input/vrmini2/12049.csv')

# === 80/20 Split ===
# Fixed random_state so the same rows are held out on every run.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Write both partitions to disk without the index column.
for split_frame, csv_name in ((train_df, "train_split.csv"), (test_df, "test_split.csv")):
    split_frame.to_csv(csv_name, index=False)


# === Load ViLT model ===
# Processor and model come from the same checkpoint; the model is moved
# to the GPU when one is available, otherwise it stays on the CPU.
vilt_checkpoint = "dandelin/vilt-b32-finetuned-vqa"
target_device = "cuda" if torch.cuda.is_available() else "cpu"
processor = ViltProcessor.from_pretrained(vilt_checkpoint)
model = ViltForQuestionAnswering.from_pretrained(vilt_checkpoint).to(target_device)

# === Prediction function ===
def predict_answer(image_path, question):
    """Return the model's answer label for one image/question pair.

    Args:
        image_path: Path of the image file to open.
        question: Question text passed to the processor with the image.

    Returns:
        The ``model.config.id2label`` entry for the highest-scoring logit,
        or the string ``"unknown"`` if any step raises. Errors are printed
        and swallowed so a single bad row does not abort the whole run.
    """
    try:
        # Close the file handle deterministically instead of leaving it to
        # the garbage collector; convert() returns a separate, fully
        # loaded image, so it stays usable after the file is closed.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
        encoding = processor(image, question, return_tensors="pt").to(model.device)
        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = model(**encoding)
        predicted_idx = outputs.logits.argmax(-1).item()
        return model.config.id2label[predicted_idx]
    except Exception as e:
        # Deliberate best-effort fallback; the sentinel is scored as a
        # wrong answer downstream.
        print(f"Error with {image_path}: {e}")
        return "unknown"

# === Run predictions ===
def _answer_for_row(row):
    """Predict the answer for one row from its image path and question."""
    return predict_answer(row["full_path"], row["question"])


# tqdm.pandas() registers progress_apply on pandas objects.
tqdm.pandas()
test_df["prediction"] = test_df.progress_apply(_answer_for_row, axis=1)

# === Normalize predictions and answers ===
# Same normalisation on both sides so the comparison ignores case,
# punctuation and spacing.
for source_col, normalized_col in (("answer", "norm_answer"), ("prediction", "norm_prediction")):
    test_df[normalized_col] = test_df[source_col].apply(normalize_text)

# === Compute Matches ===
# Row-wise exact match between normalised reference and prediction.
test_df["match"] = test_df["norm_answer"].eq(test_df["norm_prediction"])

# === Metrics ===
references = test_df["norm_answer"]
candidates = test_df["norm_prediction"]

# Exact-match accuracy and macro-averaged F1 over the normalised strings.
accuracy = accuracy_score(references, candidates)
f1 = f1_score(references, candidates, average="macro")

# BERTScore takes the predictions first and the references second; only
# the mean of the F1 component is reported.
P, R, F1_bert = bert_score(candidates.tolist(), references.tolist(), lang="en", rescale_with_baseline=True)
bert_f1_mean = F1_bert.mean().item()

# === Print Results ===
# One print call; sep="\n" puts each metric on its own line.
print(
    "\n=== Evaluation Metrics ===",
    f"Accuracy     : {accuracy:.4f}",
    f"F1 Score     : {f1:.4f}",
    f"BERTScore F1 : {bert_f1_mean:.4f}",
    sep="\n",
)

# === Sample predictions ===
print("\n=== Sample Predictions vs References ===")
# Cap the sample size: DataFrame.sample raises ValueError when asked for
# more rows than exist (sampling without replacement), which would happen
# on a test split with fewer than 5 rows.
sample = test_df.sample(min(5, len(test_df)), random_state=42)
# Show raw and normalised text side by side so mismatches caused by
# normalisation are visible.
for _, row in sample.iterrows():
    print(f"Question : {row['question']}")
    print(f"Reference: {row['answer']} (Normalized: {row['norm_answer']})")
    print(f"Prediction: {row['prediction']} (Normalized: {row['norm_prediction']})")
    print(f"Match    : {row['match']}")
    print("---")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 2410/2410 [00:57<00:00, 42.08it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



=== Evaluation Metrics ===
Accuracy     : 0.2490
F1 Score     : 0.0356
BERTScore F1 : 0.6896

=== Sample Predictions vs References ===
Question : Considering the image and metadata, where would this blanket be most appropriately used?
Reference: Outdoors (Normalized: outdoors)
Prediction: yes (Normalized: yes)
Match    : False
---
Question : Based on the image and metadata, where would this product most likely be found in a physical store?
Reference: Kitchen (Normalized: kitchen)
Prediction: store (Normalized: store)
Match    : False
---
Question : What is the color of the shoe?
Reference: Grey (Normalized: grey)
Prediction: brown (Normalized: brown)
Match    : False
---
Question : Considering the image and metadata, what is the primary design feature of this phone case?
Reference: Birds (Normalized: birds)
Prediction: face (Normalized: face)
Match    : False
---
Question : Is this binder designed for heavy-duty use or everyday use?
Reference: Everyday (Normalized: everyday)
Predictio