# BLIP-2 Baseline Evaluation

In [34]:
!pip install bert_score
!pip install rouge_score

import nltk
nltk.download('punkt')
nltk.download('wordnet')

import os
import pandas as pd
from PIL import Image
from tqdm import tqdm

import torch
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from bert_score import score

from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
# =====================
# CONFIGURATION
# =====================
# Kaggle input locations: the VQA annotation table, and the directory that
# relative image paths are joined onto.
CSV_PATH = "/kaggle/input/complete-vqa/complete_vqa.csv"
IMAGE_ROOT_DIR = "/kaggle/input/images/images/images/small"

# =====================
# LOAD VQA CSV
# =====================
print("Loading VQA dataset...")
# Draw a seeded 25% random sample of the rows, then keep only the first 100
# of that sample as a quick smoke-test subset.
df = (
    pd.read_csv(CSV_PATH)
    .sample(frac=0.25, random_state=42)
    .head(100)  # for checking if code is fine
)

# =====================
# HELPER FUNCTIONS
# =====================
def get_full_image_path(rel_path):
    """Resolve a dataset-relative image path against IMAGE_ROOT_DIR.

    Args:
        rel_path: Relative path of the image as stored in the CSV.

    Returns:
        The joined absolute path if it points at an existing regular file,
        otherwise None. Missing values (None, NaN), other non-path types and
        empty/blank strings also yield None instead of raising.
    """
    # A missing CSV cell arrives as float NaN; os.path.join would raise
    # TypeError on it, so treat anything that is not path-like as "not found".
    if not isinstance(rel_path, (str, os.PathLike)) or not str(rel_path).strip():
        return None
    full_path = os.path.join(IMAGE_ROOT_DIR, rel_path)
    # isfile (not exists): a directory is not a loadable image.
    return full_path if os.path.isfile(full_path) else None

def load_image(image_path):
    """Open the image at `image_path` and return it converted to RGB.

    Args:
        image_path: Path of the image file to open.

    Returns:
        A new image object in "RGB" mode.
    """
    # The context manager closes the underlying file deterministically, also
    # when convert() raises; the converted image returned here is a separate
    # object that stays usable after the source image is closed.
    with Image.open(image_path) as source_image:
        return source_image.convert("RGB")

# =====================
# LOAD BLIP-2 MODEL
# =====================
print("Loading BLIP-2 model...")
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
# Weights are loaded in half precision on GPU and full precision on CPU.
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    torch_dtype={"cuda": torch.float16, "cpu": torch.float32}[device],
)
model.to(device)

# =====================
# RUN INFERENCE
# =====================
print("Running inference...")
predictions = []

# Floating-point dtype for the processor outputs; it does not change between
# samples, so it is picked once (fp16 on GPU, fp32 on CPU).
input_dtype = torch.float16 if device == "cuda" else torch.float32

for _, sample in tqdm(df.iterrows(), total=len(df)):
    image_path = get_full_image_path(sample["image_path"])
    if image_path is None:
        # Keep one prediction per row; these placeholder rows are filtered
        # out later by their "Error" prefix.
        predictions.append("Error: Image not found")
        continue

    question = sample["question"]
    prompt = f"Question: {question} Answer in one word:"

    raw_image = load_image(image_path)
    inputs = processor(images=raw_image, text=prompt, return_tensors="pt")
    inputs = inputs.to(device, input_dtype)

    generated_ids = model.generate(**inputs, max_new_tokens=10)
    decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)

    # Drop the prompt in case the decoded text echoes it, then trim.
    predictions.append(decoded[0].strip().replace(prompt, "").strip())

# =====================
# EVALUATION
# =====================
print("\nEvaluating predictions...")

df["prediction"] = predictions

# 1. Exact Match Accuracy (case-insensitive, surrounding whitespace ignored).
# Computed over ALL rows, including the "Error: ..." placeholders.
print("Calculating Exact Match Accuracy...")
df["correct"] = df["answer"].str.lower().str.strip() == df["prediction"].str.lower().str.strip()
accuracy = df["correct"].mean()

# 2. Filter valid entries: drop error placeholders and any row whose answer
# or prediction is not a string. .copy() makes valid_df an independent frame,
# so the metric columns added below do not write into a slice of df.
valid_df = df[
    ~df["prediction"].astype(str).str.startswith("Error")
    & df["answer"].apply(lambda x: isinstance(x, str))
    & df["prediction"].apply(lambda x: isinstance(x, str))
].copy()

references = valid_df["answer"].tolist()
candidates = valid_df["prediction"].tolist()
has_valid_rows = bool(candidates)  # every average below is NaN when False

# 3. BERTScore
print("Calculating BERTScore...")
if has_valid_rows:
    P, R, F1 = score(candidates, references, lang="en", verbose=True)
    # Store plain numbers in the frame rather than a torch tensor.
    valid_df["bertscore_f1"] = F1.cpu().numpy()
    avg_bertscore = F1.mean().item()
else:
    valid_df["bertscore_f1"] = []
    avg_bertscore = float("nan")

# 4. ROUGE-L F1
print("Calculating ROUGE-L F1...")
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
rouge_l_f1s = [
    scorer.score(ref, cand)["rougeL"].fmeasure
    for ref, cand in zip(references, candidates)
]
valid_df["rougeL_f1"] = rouge_l_f1s
avg_rouge_l_f1 = sum(rouge_l_f1s) / len(rouge_l_f1s) if rouge_l_f1s else float("nan")

# 5. METEOR Score (whitespace-tokenised reference and candidate)
print("Calculating METEOR...")
meteor_scores = [
    meteor_score([ref.split()], cand.split())
    for ref, cand in zip(references, candidates)
]
valid_df["meteor"] = meteor_scores
avg_meteor = sum(meteor_scores) / len(meteor_scores) if meteor_scores else float("nan")

# Assign back to full df; rows that were filtered out keep NaN.
for col in ["bertscore_f1", "rougeL_f1", "meteor"]:
    if has_valid_rows:
        df.loc[valid_df.index, col] = valid_df[col].values
    else:
        df[col] = float("nan")

# Visual separator: one all-empty row between the samples and the summary.
blank_row = {col: "" for col in df.columns}
df = pd.concat([df, pd.DataFrame([blank_row])], ignore_index=True)

# Append summary row
summary_row = {
    "image_path": "AVERAGES",
    "question": "",
    "answer": "",
    "prediction": "",
    "correct": accuracy,
    "bertscore_f1": avg_bertscore,
    "rougeL_f1": avg_rouge_l_f1,
    "meteor": avg_meteor
}
df = pd.concat([df, pd.DataFrame([summary_row])], ignore_index=True)

# Print average scores
print("\nAverage Scores:")
print(f"  Exact Match Accuracy: {accuracy:.4f}")
print(f"  BERTScore F1:         {avg_bertscore:.4f}")
print(f"  ROUGE-L F1:           {avg_rouge_l_f1:.4f}")
print(f"  METEOR:               {avg_meteor:.4f}")

# Save results
df.to_csv("vqa_blip2_baseline_results.csv", index=False)
print("Results saved to vqa_blip2_baseline_results.csv")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Loading VQA dataset...
Loading BLIP-2 model...


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Running inference...


100%|██████████| 100/100 [00:13<00:00,  7.22it/s]



Evaluating predictions...
Calculating Exact Match Accuracy...
Calculating BERTScore...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 0.15 seconds, 685.70 sentences/sec
Calculating ROUGE-L F1...
Calculating METEOR...

Average Scores:
  Exact Match Accuracy: 0.4800
  BERTScore F1:         0.8175
  ROUGE-L F1:           0.4867
  METEOR:               0.2695
Results saved to vqa_blip2_baseline_results.csv




# Calculating more scores for analysis

In [59]:
# Compute additional metrics (token-level F1, BERTScore precision/recall/F1, BARTScore, ROUGE-L) from the results CSV written by the cell above

import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from bert_score import score 
from rouge_score import rouge_scorer
import torch
from transformers import BartTokenizer, BartForConditionalGeneration

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load BART model and tokenizer from the same checkpoint, then move the model
# to the selected device and switch it to evaluation mode.
bart_checkpoint = "facebook/bart-large-cnn"
bart_tokenizer = BartTokenizer.from_pretrained(bart_checkpoint)
bart_model = BartForConditionalGeneration.from_pretrained(bart_checkpoint)
bart_model.to(device)
bart_model.eval()

# Load your CSV
df = pd.read_csv("/kaggle/input/baseline-blip2/blip2_baseline_results_50.csv")  # replace accordingly with output of previous cell
# These two columns are recomputed below under different names / with more detail.
df = df.drop(columns=["rougeL_f1", "bertscore_f1"], errors="ignore")

# Keep only real sample rows. The results file produced by the evaluation cell
# ends with an all-empty separator row (read back as NaN) followed by an
# 'AVERAGES' row; this cell's own output uses 'AVERAGE'. Leaving the separator
# in would score it as an empty prediction and add an all-zero row to every mean.
is_summary_row = df['image_path'].isin(['AVERAGES', 'AVERAGE'])
df = df[df['image_path'].notna() & ~is_summary_row].copy()  # copy: columns are added to df below

# Fill missing predictions with empty strings for uniformity
df['prediction'] = df['prediction'].fillna("").astype(str)

# Per-row metric accumulators, filled in DataFrame row order
f1s, accuracies = [], []
bertscore_precisions, bertscore_recalls, bertscore_f1s = [], [], []
bartscores, rouge_l_f1s = [], []

# BERTScore is computed in one batch, and only for rows whose prediction is
# non-empty after stripping whitespace.
valid_mask = df['prediction'].str.strip() != ""
rows_with_prediction = df.loc[valid_mask]
valid_preds = rows_with_prediction['prediction'].tolist()
valid_refs = rows_with_prediction['answer'].astype(str).tolist()

if not valid_preds:
    # Nothing to score
    P, R, F1 = [], [], []
else:
    print("Calculating BERTScore...")
    P, R, F1 = score(valid_preds, valid_refs, lang="en", verbose=True)

# ROUGE scorer
rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)


def _is_correct(flag):
    """Return 1 if the stored `correct` flag means true, else 0.

    Accepts a real boolean as well as its text form (e.g. "True" read back
    from a CSV); anything else, including a missing value, counts as 0.
    """
    return 1 if str(flag).strip().lower() == "true" else 0


def _token_f1(ref, pred):
    """Return the token-level F1 between two whitespace-tokenised strings.

    Tokens are lower-cased and overlap is counted as a multiset, so repeated
    tokens are matched one-for-one and an exact match always scores 1.0.
    Returns 0.0 when either side has no tokens or nothing overlaps.
    """
    from collections import Counter

    ref_tokens = ref.lower().split()
    pred_tokens = pred.lower().split()
    if not ref_tokens or not pred_tokens:
        return 0.0
    overlap = sum((Counter(ref_tokens) & Counter(pred_tokens)).values())
    if overlap == 0:
        return 0.0
    precision = overlap / len(pred_tokens)
    recall = overlap / len(ref_tokens)
    return 2 * precision * recall / (precision + recall)


# Row-wise processing
bert_idx = 0  # position in the batched BERTScore tensors; advances only for non-empty predictions
for _, row in df.iterrows():
    ref = str(row['answer']).strip()
    pred = str(row['prediction']).strip()

    # Accuracy is taken from the precomputed `correct` column for every row.
    accuracies.append(_is_correct(row.get('correct', False)))

    if pred == "":
        # Empty prediction — assign zeros
        f1s.append(0.0)
        bertscore_precisions.append(0.0)
        bertscore_recalls.append(0.0)
        bertscore_f1s.append(0.0)
        # NOTE(review): 0.0 is the highest value the BARTScore below can take,
        # so empty predictions raise the BARTScore mean; consider recording
        # NaN and averaging with np.nanmean instead.
        bartscores.append(0.0)
        rouge_l_f1s.append(0.0)
        continue

    # F1 (token-based)
    f1s.append(_token_f1(ref, pred))

    # BERTScore (batched results, consumed in the same row order)
    bertscore_precisions.append(P[bert_idx].item())
    bertscore_recalls.append(R[bert_idx].item())
    bertscore_f1s.append(F1[bert_idx].item())
    bert_idx += 1

    # BARTScore: log-likelihood of the reference given the prediction,
    # i.e. the negated cross-entropy loss returned by the model (higher is better).
    inputs = bart_tokenizer(pred, return_tensors="pt", truncation=True, max_length=512).to(device)
    # text_target replaces the deprecated as_target_tokenizer() context manager.
    labels = bart_tokenizer(
        text_target=ref, return_tensors="pt", truncation=True, max_length=512
    ).input_ids.to(device)

    with torch.no_grad():
        outputs = bart_model(**inputs, labels=labels)
    bartscores.append(-outputs.loss.item())

    # ROUGE-L F1
    rouge_l_f1s.append(rouge.score(ref, pred)['rougeL'].fmeasure)

# Attach the per-row metrics to the DataFrame (insertion order = column order)
metric_columns = {
    'f1': f1s,
    'accuracy': accuracies,
    'bertscore_precision': bertscore_precisions,
    'bertscore_recall': bertscore_recalls,
    'bertscore_f1': bertscore_f1s,
    'bartscore': bartscores,
    'rouge_l_f1': rouge_l_f1s,
}
for column_name, column_values in metric_columns.items():
    df[column_name] = column_values

# Summary row: text columns left empty, METEOR averaged from the input file
# when that column exists, every freshly computed metric averaged over all rows.
avg_row = {
    'image_path': 'AVERAGE',
    'question': '',
    'answer': '',
    'prediction': '',
    'correct': '',
    'meteor': df['meteor'].mean() if 'meteor' in df else '',
}
avg_row.update(
    {column_name: np.mean(column_values) for column_name, column_values in metric_columns.items()}
)

# Separator row: same keys as the summary row, all values empty
blank_row = pd.DataFrame([dict.fromkeys(avg_row, '')])

# Neatly print average results
print("\n=== Average Scores for BLIP2 using 50% of dataset ===")
print(f"F1 Score:              {avg_row['f1']:.4f}")
print(f"Accuracy:              {avg_row['accuracy']:.4f}")
print(f"BERTScore Precision:   {avg_row['bertscore_precision']:.4f}")
print(f"BERTScore Recall:      {avg_row['bertscore_recall']:.4f}")
print(f"BERTScore F1:          {avg_row['bertscore_f1']:.4f}")
print(f"BARTScore:             {avg_row['bartscore']:.4f}")
print(f"ROUGE-L F1:            {avg_row['rouge_l_f1']:.4f}")
if 'meteor' in avg_row and avg_row['meteor'] != '':
    print(f"METEOR:                {avg_row['meteor']:.4f}")

# Append separator + summary rows and save
summary_frame = pd.DataFrame([avg_row])
df_with_avg = pd.concat([df, blank_row, summary_frame], ignore_index=True)
df_with_avg.to_csv("scored_output.csv", index=False)


Calculating BERTScore...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/34 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/290 [00:00<?, ?it/s]

done in 5.24 seconds, 3541.32 sentences/sec





=== Average Scores for BLIP2 using 50% of dataset ===
F1 Score:              0.4109
Accuracy:              0.4043
BERTScore Precision:   0.7904
BERTScore Recall:      0.7869
BERTScore F1:          0.7882
BARTScore:             -3.8912
ROUGE-L F1:            0.4258
METEOR:                0.2261
