In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os


In [None]:
!pip install --upgrade --force-reinstall transformers --no-cache-dir


In [3]:
# Sanity check: confirm which transformers release the kernel actually imports
# after the reinstall above.
import transformers
print(transformers.__version__)  # Expected: 4.51.3 (value printed by the recorded run; the earlier "4.5.3" note looks like a typo — confirm)


4.51.3


In [None]:
!pip uninstall -y torch torchvision torchaudio
!pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2
!pip install nltk tqdm Pillow

In [None]:
!pip uninstall -y nltk scikit-learn
!pip install nltk==3.8.1
!pip install scikit-learn==1.2.2
!pip install numpy==1.23.5
!pip install scipy==1.10.1

In [2]:
import transformers
from transformers import BlipProcessor, BlipForQuestionAnswering
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import transformers
import warnings
import logging
import pandas as pd
import numpy as np
import torch
from transformers import BlipProcessor, BlipForQuestionAnswering
from PIL import Image
import os
from tqdm import tqdm
import evaluate

2025-05-16 19:09:11.959310: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747422551.983029     116 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747422551.990357     116 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Zero-shot evaluation of Salesforce/blip-vqa-base on a random sample of the
# question/answer table. For every row whose image can be opened, the model
# answers the question and the answer is scored against the ground truth with
# BLEU-1, BERTScore (precision / recall / F1) and case-insensitive exact match.
# Per-row results are written to a CSV and the aggregate metrics are printed.

# ✅ Suppress transformers and evaluate warnings
# Only these known-noisy messages are silenced; anything else still surfaces.
logging.getLogger("transformers").setLevel(logging.ERROR)
warnings.filterwarnings("ignore", message=".*Empty candidate sentence detected.*")
warnings.filterwarnings("ignore", message=".*Empty reference sentence detected.*")
warnings.filterwarnings("ignore", message="Some weights of.*were not initialized.*")

# ✅ Load CSV and define image path
df = pd.read_csv('/kaggle/input/finaldata/final.csv')
dataset_dir = '/kaggle/input/images/abo-images-small/images/small'
# Cap the sample size at the table size: sampling more rows than exist
# (without replacement) raises ValueError.
df = df.sample(n=min(100000, len(df)), random_state=42).reset_index(drop=True)

# ✅ Load models
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)
bertscore = evaluate.load("bertscore")

# ✅ Evaluation
results = []
smoothie = SmoothingFunction().method1
bleu_scores = []
bertscore_precisions = []
bertscore_recalls = []
bertscore_f1s = []

# Failure counters, reported after the loop so skipped rows are not invisible.
skipped_images = 0
bertscore_failures = 0

for i in tqdm(range(len(df))):
    row = df.iloc[i]
    img_path = os.path.join(dataset_dir, row['path'])

    try:
        # The context manager closes the underlying file promptly; convert()
        # hands back a separate in-memory image that outlives the `with` block.
        with Image.open(img_path) as img:
            image = img.convert("RGB")
    except Exception:
        # Best-effort: a missing or unreadable image skips the row. Catching
        # Exception (not a bare except) keeps Ctrl-C / kernel interrupts working.
        skipped_images += 1
        continue

    question = str(row['Question']).strip() if pd.notnull(row['Question']) else ""
    true_answer = str(row['Answer']).strip() if pd.notnull(row['Answer']) else ""

    # Prompt suffix nudging the model toward a single-word answer; the stored
    # 'Question' value below includes this suffix.
    question += " (answer in one word)"

    inputs = processor(image, question, return_tensors="pt").to(device)
    output = model.generate(**inputs)
    pred = processor.decode(output[0], skip_special_tokens=True).strip()

    # ✅ BLEU
    # Unigram-only BLEU (weights=(1, 0, 0, 0)) on lower-cased whitespace tokens.
    reference = [true_answer.lower().split()]
    candidate = pred.lower().split()
    bleu = sentence_bleu(reference, candidate, weights=(1, 0, 0, 0), smoothing_function=smoothie)
    bleu_scores.append(bleu)

    # ✅ BERTScore
    # NOTE(review): the recorded run scored clearly wrong answers very high
    # (e.g. 'Brown' vs 'rust' ≈ 0.99); consider rescale_with_baseline=True — confirm.
    try:
        bert_out = bertscore.compute(predictions=[pred], references=[true_answer], lang="en")
        bert_p = bert_out['precision'][0]
        bert_r = bert_out['recall'][0]
        bert_f1 = bert_out['f1'][0]
    except Exception as exc:
        # Keep the 0.0 fallback, but count failures and show the first one so a
        # systematic problem (e.g. a missing dependency) is not silently
        # averaged into the final score.
        bertscore_failures += 1
        if bertscore_failures == 1:
            tqdm.write(f"BERTScore failed at row {i}: {exc!r} (scores recorded as 0.0)")
        bert_p = bert_r = bert_f1 = 0.0  # fallback if any issue

    bertscore_precisions.append(bert_p)
    bertscore_recalls.append(bert_r)
    bertscore_f1s.append(bert_f1)

    results.append({
        'Index': i,
        'Question': question,
        'GroundTruth': true_answer,
        'Prediction': pred,
        'BLEU-1': bleu,
        'BERTScore_Precision': bert_p,
        'BERTScore_Recall': bert_r,
        'BERTScore_F1': bert_f1
    })

# ✅ Summary and Save
if not results:
    # Without this guard the averages below divide by zero.
    raise RuntimeError(
        f"No rows were evaluated: all {skipped_images} sampled images failed to load "
        f"from {dataset_dir!r}"
    )

res_df = pd.DataFrame(results)
# Case-insensitive exact match, vectorized instead of a row-wise apply.
res_df['Match'] = (
    res_df['GroundTruth'].str.strip().str.lower()
    == res_df['Prediction'].str.strip().str.lower()
)

accuracy = res_df['Match'].mean()
average_bleu = sum(bleu_scores) / len(bleu_scores)
average_bert_f1 = sum(bertscore_f1s) / len(bertscore_f1s)

# NOTE(review): the file name says "30k" although up to 100000 rows are sampled
# above; the name is kept unchanged so existing downstream paths keep working.
res_df.to_csv('/kaggle/working/blip_vqa_base_30k_sample_with_bertscore.csv', index=False)
print(f"✅ Exact Match Accuracy: {accuracy:.4f}")
print(f"📊 Average BLEU-1 Score: {average_bleu:.4f}")
print(f"🧠 Average BERTScore F1: {average_bert_f1:.4f}")
print(f"Evaluated rows: {len(res_df)} | skipped images: {skipped_images} | BERTScore failures: {bertscore_failures}")

res_df.head(10)

  0%|          | 0/100000 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

100%|██████████| 100000/100000 [3:19:03<00:00,  8.37it/s]


✅ Exact Match Accuracy: 0.2514
📊 Average BLEU-1 Score: 0.2572
🧠 Average BERTScore F1: 0.9225


Unnamed: 0,Index,Question,GroundTruth,Prediction,BLEU-1,BERTScore_Precision,BERTScore_Recall,BERTScore_F1,Match
0,0,What is the pattern of the shoe sole? (answer ...,Ridged,stripes,0.0,0.921868,0.817347,0.866467,False
1,1,"Judging by the image, what is the rust-resista...",Brown,rust,0.0,0.990879,0.990879,0.990879,False
2,2,What color is the lid of the 365 Everyday Valu...,Green,green,1.0,0.964729,0.964729,0.964729,True
3,3,"Judging from the image, what color is the ""Who...",White,brown,0.0,0.998189,0.998189,0.998189,False
4,4,Considering the Rivet Theresa chair's upholste...,Dotted,tweed,0.0,0.813693,0.835803,0.824599,False
5,5,"Given the ""Boy and Girl"" design, what shape ar...",Hearts,heart,0.0,0.999315,0.999315,0.999315,False
6,6,Based on the AmazonBasics mat's double-dot tex...,Non-slip,high,0.0,0.931948,0.840843,0.884055,False
7,7,What color is the handle of the garden tool? (...,Blue,blue,1.0,0.964938,0.964938,0.964938,True
8,8,What design is printed on the cups? (answer in...,Leaves,leaf,0.0,0.996152,0.996152,0.996152,False
9,9,What color is the plumbing fixture shown in th...,Black,silver,0.0,0.974037,0.974037,0.974037,False


In [10]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, evaluate
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cesium 0.12.4 requires numpy<3.0,>=2.

In [11]:
!pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.0.0->bert_score)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch>=1.0.0->bert_score)
  