In [1]:
!pip install pandas numpy scikit-learn
!pip install rouge-score
!pip install bert-score
!pip install python-Levenshtein
!pip install sentence-transformers

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=e6fd0c13c75f61b782265da6479a6916bfa0926a9f1ce0351d1575826a640a66
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cuda_runtime_cu12

In [5]:
import zipfile
import os

# Path to the zip file
zip_path = "/content/results.zip"  # change path accordingly

# Unzip to a folder
extract_to = "/content/results"
os.makedirs(extract_to, exist_ok=True)

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to)

print(f"Files extracted to: {extract_to}")

# Optional: list files
import glob
csv_files = glob.glob(os.path.join(extract_to, "*.csv"))
print("CSV files found:")
for file in csv_files:
    print(file)

Files extracted to: /content/results
CSV files found:


In [7]:
import os
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from rouge_score import rouge_scorer
from bert_score import BERTScorer
import Levenshtein
from sentence_transformers import SentenceTransformer, util
import torch

# Set device
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Load all batch result files
result_dir = "/content/results/results"  # folder with result CSVs
result_files = [
    os.path.join(result_dir, f)
    for f in os.listdir(result_dir)
    if f.startswith("blip_2_vqa_results_batch_") and f.endswith(".csv")
]

all_batches = pd.concat([pd.read_csv(f) for f in result_files], ignore_index=True)

# Normalize text to lowercase strings
all_batches['answer'] = all_batches['answer'].astype(str).str.lower()
all_batches['blip_answer'] = all_batches['blip_answer'].astype(str).str.lower()

# Extract predictions and references lists
predictions = all_batches['blip_answer'].tolist()
references = all_batches['answer'].tolist()

# Compute exact-match binary metrics
y_pred_bin = [int(p == r) for p, r in zip(predictions, references)]
y_true_bin = [1] * len(references)

acc = accuracy_score(y_true_bin, y_pred_bin)
prec, rec, f1, _ = precision_recall_fscore_support(
    y_true_bin, y_pred_bin, average="binary", zero_division=0
)

# Save prediction vs reference with correctness
df_pred_ref = pd.DataFrame({
    "Prediction": predictions,
    "Ground_Truth": references,
    "Exact_Match_Correct": y_pred_bin
})
df_pred_ref.to_csv('predictions_vs_references.csv', index=False)
print("Saved: predictions_vs_references.csv")

# Initialize final metrics dictionary with exact match scores
final_metrics = {
    "exact_match_accuracy": acc,
    "exact_match_precision": prec,
    "exact_match_recall": rec,
    "exact_match_f1": f1
}

# --- ROUGE Scores ---
print("\n--- ROUGE Scores ---")
rouge_eval_scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
rouge1_scores, rougeL_scores = [], []

for pred, ref in zip(predictions, references):
    if not pred or not ref:
        rouge1_scores.append(0.0)
        rougeL_scores.append(0.0)
        continue
    scores = rouge_eval_scorer.score(ref, pred)
    rouge1_scores.append(scores['rouge1'].fmeasure)
    rougeL_scores.append(scores['rougeL'].fmeasure)

final_metrics["rouge1_f1"] = np.mean(rouge1_scores) if rouge1_scores else 0.0
final_metrics["rougeL_f1"] = np.mean(rougeL_scores) if rougeL_scores else 0.0

print(f"Average ROUGE-1 F1: {final_metrics['rouge1_f1']:.3f}")
print(f"Average ROUGE-L F1: {final_metrics['rougeL_f1']:.3f}")

# --- BERTScore ---
print("\n--- BERTScore ---")
try:
    bert_eval_scorer = BERTScorer(lang="en", rescale_with_baseline=True, device=device)
    filtered_pairs = [(p, r) for p, r in zip(predictions, references) if p and r]
    if filtered_pairs:
        filtered_preds, filtered_refs = zip(*filtered_pairs)
        P, R, F1 = bert_eval_scorer.score(list(filtered_preds), list(filtered_refs))
        final_metrics["bertscore_precision"] = P.mean().item()
        final_metrics["bertscore_recall"] = R.mean().item()
        final_metrics["bertscore_f1"] = F1.mean().item()

        print(f"Average BERTScore Precision: {final_metrics['bertscore_precision']:.3f}")
        print(f"Average BERTScore Recall:    {final_metrics['bertscore_recall']:.3f}")
        print(f"Average BERTScore F1:        {final_metrics['bertscore_f1']:.3f}")
    else:
        print("Not enough valid pairs for BERTScore.")
except Exception as e:
    print(f"Could not compute BERTScore: {e}")

# --- Levenshtein Normalized Similarity ---
print("\n--- Levenshtein Normalized Similarity ---")
lev_similarities = []
for pred, ref in zip(predictions, references):
    if not pred and not ref:
        similarity = 1.0
    elif not pred or not ref:
        similarity = 0.0
    else:
        similarity = Levenshtein.ratio(pred, ref)
    lev_similarities.append(similarity)

final_metrics["levenshtein_sim"] = np.mean(lev_similarities) if lev_similarities else 0.0
print(f"Average Levenshtein Normalized Similarity: {final_metrics['levenshtein_sim']:.3f}")

# --- Sentence-BERT Cosine Similarity ---
print("\n--- Sentence-BERT Cosine Similarity ---")
try:
    sbert_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
    valid_pairs = [(p, r) for p, r in zip(predictions, references) if p and r]
    if valid_pairs:
        sbert_preds, sbert_refs = zip(*valid_pairs)
        embeddings_preds = sbert_model.encode(list(sbert_preds), convert_to_tensor=True)
        embeddings_refs = sbert_model.encode(list(sbert_refs), convert_to_tensor=True)
        cosine_scores = util.cos_sim(embeddings_preds, embeddings_refs)
        pairwise_sim = [cosine_scores[i, i].item() for i in range(len(valid_pairs))]
        final_metrics["sbert_cosine_sim"] = np.mean(pairwise_sim)
        print(f"Average Sentence-BERT Cosine Similarity: {final_metrics['sbert_cosine_sim']:.3f}")
    else:
        print("Not enough valid pairs for Sentence-BERT similarity.")
except Exception as e:
    print(f"Could not compute Sentence-BERT similarity: {e}")

# Save final metrics summary
df_metrics = pd.DataFrame([final_metrics])
df_metrics.to_csv("evaluation_metrics_summary.csv", index=False)
print("\n--- Final Metrics ---")
for metric_name, metric_value in final_metrics.items():
    print(f"{metric_name}: {metric_value:.3f}")

print("\nFinal metrics saved to 'evaluation_metrics_summary.csv'")


Saved: predictions_vs_references.csv

--- ROUGE Scores ---
Average ROUGE-1 F1: 0.391
Average ROUGE-L F1: 0.391

--- BERTScore ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average BERTScore Precision: 0.559
Average BERTScore Recall:    0.664
Average BERTScore F1:        0.607

--- Levenshtein Normalized Similarity ---
Average Levenshtein Normalized Similarity: 0.539

--- Sentence-BERT Cosine Similarity ---


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Could not compute Sentence-BERT similarity: CUDA out of memory. Tried to allocate 13.34 GiB. GPU 0 has a total capacity of 14.74 GiB of which 13.15 GiB is free. Process 5192 has 1.58 GiB memory in use. Of the allocated memory 1.43 GiB is allocated by PyTorch, and 29.98 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

--- Final Metrics ---
exact_match_accuracy: 0.331
exact_match_precision: 1.000
exact_match_recall: 0.331
exact_match_f1: 0.497
rouge1_f1: 0.391
rougeL_f1: 0.391
bertscore_precision: 0.559
bertscore_recall: 0.664
bertscore_f1: 0.607
levenshtein_sim: 0.539

Final metrics saved to 'evaluation_metrics_summary.csv'
