In [1]:
import torch

# Prefer the GPU when CUDA is usable; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [2]:
import random
import numpy as np
import torch
import os

# Seed every random number generator the notebook uses so that repeated runs
# draw the same numbers.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
# NOTE(review): PYTHONHASHSEED is read when an interpreter starts, so setting it
# here is expected to affect only child processes, not this kernel — confirm.
os.environ['PYTHONHASHSEED'] = str(seed)
# Ask cuDNN for deterministic kernels and turn off its auto-tuner, which may
# otherwise pick a different algorithm from one run to the next.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Report the value actually used rather than a hard-coded number.
print(f"Random seed set to {seed}.")


Random seed set to 42.


In [3]:
import json
import os
from glob import glob

# Dataset locations on the Kaggle filesystem
json_path = "/kaggle/input/zero-shot-dataset/zero_shot_dataset"
image_dir = "/kaggle/input/berkley-dataset/"

# Gather every annotation file and put the list in lexicographic order
json_files = glob(os.path.join(json_path, "*.json"))
json_files.sort()
print(f"Found {len(json_files)} JSON files.")
print("Sample JSON file paths:", json_files[:2])

Found 25000 JSON files.
Sample JSON file paths: ['/kaggle/input/zero-shot-dataset/zero_shot_dataset/010-mllS7JL.json', '/kaggle/input/zero-shot-dataset/zero_shot_dataset/21-GDjbU0yL.json']


In [4]:
# Install necessary libraries
!pip install tqdm transformers datasets bert-score tqdm Pillow sentence-transformers

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.0.0->bert-score)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.0.0->bert-score)
  Do

In [5]:
import os
import json
from tqdm import tqdm

def _resolve_image_path(raw_path, base_dir):
    """Join a relative image path from the annotations onto ``base_dir``.

    Absolute paths, and every path when ``base_dir`` is empty, are returned
    unchanged. For relative paths, leading ``../``, ``./`` and ``/`` segments
    are removed one whole segment at a time. ``str.lstrip("../")`` would strip
    a *set of characters* instead and also eat the leading dots of names such
    as ``.hidden.jpg`` or ``..foo``.
    """
    if not base_dir or os.path.isabs(raw_path):
        return raw_path

    relative = raw_path
    while True:
        if relative.startswith("../"):
            relative = relative[3:]
        elif relative.startswith("./"):
            relative = relative[2:]
        elif relative.startswith("/"):
            relative = relative[1:]
        else:
            break
    if relative in (".", ".."):
        # Nothing but a directory reference is left: point at base_dir itself.
        relative = ""
    return os.path.join(base_dir, relative)


# Flat list of {"image_path", "question", "answer"} records from all files
qa_data_all = []

# Number of QA entries that lacked a "question" or an "answer" key
missing_count = 0

# Process each JSON file separately
for selected_json in tqdm(json_files, desc="Processing JSON files", unit="file"):
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(selected_json, 'r', encoding='utf-8') as f:
        dataset = json.load(f)

    # The file is read as consecutive pairs: an image entry (read via "path")
    # followed by a questions entry (read via "questions"). A trailing image
    # entry without a partner contributes no questions.
    for i in range(0, len(dataset), 2):
        image_entry = dataset[i]
        questions_entry = dataset[i + 1] if i + 1 < len(dataset) else {}

        image_path = _resolve_image_path(image_entry.get("path", ""), image_dir)

        for qa_pair in questions_entry.get("questions", []):
            # Keys may carry stray whitespace; normalise before looking them up.
            qa_pair = {k.strip(): v for k, v in qa_pair.items()}

            if "question" in qa_pair and "answer" in qa_pair:
                qa_data_all.append({
                    "image_path": image_path,
                    "question": qa_pair["question"],
                    "answer": qa_pair["answer"]
                })
            else:
                missing_count += 1
                print(f"\nMalformed QA pair in file: {selected_json}")
                print(f"  Image path: {image_path}")
                print(f"  Raw entry: {qa_pair}")

# Summary
total_qa_pairs = len(qa_data_all)
print(f"\nTotal question-answer pairs across all files: {total_qa_pairs}")
print(f"Total malformed QA pairs skipped: {missing_count}")


Processing JSON files: 100%|██████████| 25000/25000 [01:47<00:00, 232.39file/s]


Total question-answer pairs across all files: 374828
Total malformed QA pairs skipped: 0





In [6]:
import torch
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration, logging
from sentence_transformers import SentenceTransformer

# Restrict transformers logging to errors only
logging.set_verbosity_error()

# Run on the GPU when one is visible, otherwise on the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Sentence embedding model used later for the semantic-similarity metrics
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
sentence_model = sentence_model.to(device)

# BLIP-2 processor and half-precision model, placed automatically on the
# available device(s) and switched to inference mode
model_name = "Salesforce/blip2-opt-2.7b"
processor = Blip2Processor.from_pretrained(model_name)
model = Blip2ForConditionalGeneration.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
)
model = model.eval()

print("✅ BLIP-2 model and SentenceTransformer loaded successfully.")

2025-05-17 08:59:58.670889: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747472398.857132      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747472398.910914      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/882 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/122k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

✅ BLIP-2 model and SentenceTransformer loaded successfully.


In [7]:
from torch.utils.data import Dataset, DataLoader
import torch
import os
from PIL import Image
import pickle
import time

class VQADataset(Dataset):
    """Map-style dataset yielding ``(image, question, answer, image_path)``.

    Args:
        qa_list: Sequence of records with the keys ``"image_path"``,
            ``"question"`` and ``"answer"``.
        processor: Stored on the instance; not used by the dataset itself.
        cache_dir: Optional directory in which decoded RGB images are pickled
            so later requests for the same image skip decoding. ``None`` (or
            an empty string) disables caching.
    """

    def __init__(self, qa_list, processor, cache_dir=None):
        self.qa = qa_list
        self.processor = processor
        self.cache_dir = cache_dir

        if self.cache_dir:
            # exist_ok keeps this safe when the directory already exists or is
            # created concurrently by another process.
            os.makedirs(self.cache_dir, exist_ok=True)

    def __len__(self):
        return len(self.qa)

    def _cache_path(self, img_path):
        """Return the cache file for ``img_path``.

        The name combines a digest of the full path with the basename, so two
        images that share a file name in different folders do not overwrite
        each other's cache entry.
        """
        import hashlib

        digest = hashlib.sha1(img_path.encode("utf-8")).hexdigest()[:16]
        return os.path.join(
            self.cache_dir, f"{digest}_{os.path.basename(img_path)}.pkl"
        )

    def __getitem__(self, idx):
        rec = self.qa[idx]
        img_path = rec["image_path"]
        cache_path = self._cache_path(img_path) if self.cache_dir else None

        if cache_path and os.path.exists(cache_path):
            try:
                with open(cache_path, 'rb') as f:
                    # NOTE(review): pickle.load can execute arbitrary code;
                    # only point cache_dir at a directory this notebook wrote.
                    img = pickle.load(f)
                return img, rec["question"], rec["answer"], rec["image_path"]
            except Exception:
                # Best effort: an unreadable (e.g. half-written) cache entry is
                # ignored and the image is decoded from the source file again.
                pass

        # Close the source file as soon as the RGB copy has been made.
        with Image.open(img_path) as source:
            img = source.convert("RGB")

        if cache_path:
            try:
                with open(cache_path, 'wb') as f:
                    pickle.dump(img, f)
            except Exception:
                # Best effort: failing to write the cache must not fail the sample.
                pass

        return img, rec["question"], rec["answer"], rec["image_path"]

def collate_fn(batch):
    """Turn a list of dataset samples into one processor encoding.

    Args:
        batch: Iterable of ``(image, question, answer, image_path)`` tuples.

    Returns:
        ``(encoding, answers, paths)``. ``answers`` and ``paths`` are always
        lists, on both the normal and the fallback path, so callers can
        concatenate or index them without caring which path was taken.

    If the module-level ``processor`` rejects the whole batch, the batch is
    reduced to a single sample (the first whose image is not ``None``) and the
    number of dropped samples is reported instead of being lost silently.
    """
    imgs, questions, gts, paths = (list(column) for column in zip(*batch))

    try:
        encoding = processor(
            images=imgs,
            text=questions,
            return_tensors="pt",
            padding=True
        )
    except Exception as e:
        print(f"Processor error: {e}")
        keep = next((i for i, img in enumerate(imgs) if img is not None), 0)
        print(
            f"Falling back to sample {keep} only; "
            f"dropping {len(imgs) - 1} of {len(imgs)} samples from this batch"
        )
        encoding = processor(
            images=[imgs[keep]],
            text=[questions[keep]],
            return_tensors="pt"
        )
        gts = [gts[keep]]
        paths = [paths[keep]]

    return encoding, gts, paths

def create_dataloader(qa_data, processor, batch_size=16, num_workers=4, 
                     cache_dir="./image_cache", resume_from=0):
    """Build the evaluation ``DataLoader`` over ``qa_data``.

    Args:
        qa_data: Sequence of QA records handed to ``VQADataset``.
        processor: Forwarded to ``VQADataset``.
        batch_size: Number of samples per batch.
        num_workers: Number of loader worker processes; ``0`` loads in the
            calling process.
        cache_dir: Image cache directory forwarded to ``VQADataset``.
        resume_from: When positive, the first ``resume_from`` records are
            skipped.

    Returns:
        A ``DataLoader`` using ``collate_fn`` that keeps the last partial batch.
    """
    if resume_from > 0:
        qa_data = qa_data[resume_from:]

    dataset = VQADataset(qa_data, processor, cache_dir=cache_dir)

    loader_kwargs = dict(
        batch_size=batch_size,
        num_workers=num_workers,
        collate_fn=collate_fn,
        # Pinned host memory only helps when batches are copied to a CUDA device.
        pin_memory=torch.cuda.is_available(),
        drop_last=False,
    )
    if num_workers > 0:
        # DataLoader rejects these two options when there are no worker
        # processes, so they are only passed for multi-process loading.
        loader_kwargs.update(persistent_workers=True, prefetch_factor=2)

    return DataLoader(dataset, **loader_kwargs)

BATCH_SIZE = 16
NUM_WORKERS = 4
CACHE_DIR = "./image_cache"

# Start from the beginning unless a previous run left a resume state behind.
resume_idx = 0
if os.path.exists("resume_state.pkl"):
    # NOTE(review): pickle.load can execute arbitrary code; only load a state
    # file that this notebook produced.
    with open("resume_state.pkl", "rb") as f:
        resume_state = pickle.load(f)
    resume_idx = resume_state.get("last_processed_idx", 0)
    print(f"Resuming from index {resume_idx}")

# Arguments follow create_dataloader's order:
# qa_data, processor, batch_size, num_workers, cache_dir, resume_from
loader = create_dataloader(
    qa_data_all,
    processor,
    BATCH_SIZE,
    NUM_WORKERS,
    CACHE_DIR,
    resume_idx,
)

print(f"DataLoader ready with {len(loader)} batches!")

DataLoader ready with 23427 batches!


In [8]:
import torch
import numpy as np
import pandas as pd
import os
import time
import json
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from bert_score import score as bert_score
from sentence_transformers import util
from scipy.spatial.distance import cdist

# ─── ENHANCED CONFIG ────────────────────────────────────────────────────────────
PRINT_LIMIT = 20     # number of sample predictions echoed to the output
BATCH_SIZE = 16      # keep in sync with the batch size of the DataLoader

# Aliases for the objects built in earlier cells
DEVICE = device
PROCESSOR = processor
MODEL = model.eval()
SENT_MODEL = sentence_model

# Output files
RESULTS_CSV = "vqa_evaluation_results.csv"
RESULTS_JSON = "vqa_evaluation_results.json"  # complete data, including text
METRICS_NPZ = "vqa_metrics.npz"               # numerical metrics as NumPy arrays

# `loader` comes from the earlier create_dataloader(...) call.

# Per-sample metric accumulators (seven independent lists)
all_em, all_f1_tok, all_bert_p, all_bert_r, all_bert_f1, all_sent, all_cos = (
    [] for _ in range(7)
)
printed = 0        # how many samples have been echoed so far
sample_data = []   # one record per sample, kept for later analysis

# Checkpointing was removed, so no previous results are loaded here.

# Helper function to save results in multiple formats
def save_results():
    """Write the accumulated evaluation results to disk.

    Reads the module-level accumulators (``sample_data``, ``all_em``,
    ``all_f1_tok``, ``all_bert_p``, ``all_bert_r``, ``all_bert_f1``,
    ``all_sent``, ``all_cos``) and writes:

    1. ``RESULTS_CSV``: one row per sample.
    2. ``METRICS_NPZ``: one NumPy array per metric.
    3. ``RESULTS_JSON``: all samples plus a summary of metric means.
    4. ``vqa_summary_metrics.csv``: the metric means as a two-column table.

    NumPy scalars inside ``sample_data`` are converted to built-in Python
    values while writing JSON, because the ``json`` module cannot serialise
    them. For an empty accumulator the mean is written as ``null`` in the JSON
    summary and as NaN in the summary CSV.
    """

    def _to_builtin(obj):
        # json.dump calls this only for objects it cannot serialise itself.
        if isinstance(obj, np.generic):
            return obj.item()
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

    # (key used in the NPZ/JSON output, label used in the summary CSV, values)
    metrics = [
        ("exact_match", "Exact Match", all_em),
        ("f1_token", "Token F1", all_f1_tok),
        ("bert_p", "BERT-P", all_bert_p),
        ("bert_r", "BERT-R", all_bert_r),
        ("bert_f1", "BERT-F1", all_bert_f1),
        ("sent_sim", "Sent Sim", all_sent),
        ("cos_sim", "Cos Sim", all_cos),
    ]
    # Compute every mean once; None marks "no samples" so np.mean is never
    # called on an empty list.
    means = {
        key: (float(np.mean(values)) if len(values) else None)
        for key, _, values in metrics
    }

    # 1. Save detailed CSV with per-sample metrics
    pd.DataFrame(sample_data).to_csv(RESULTS_CSV, index=False)

    # 2. Save numerical metrics in NumPy format
    np.savez(METRICS_NPZ, **{key: np.array(values) for key, _, values in metrics})

    # 3. Save complete data in JSON format (includes the text fields)
    summary = dict(means)
    summary["total_samples"] = len(all_em)
    all_data = {"samples": sample_data, "summary": summary}

    with open(RESULTS_JSON, 'w') as f:
        json.dump(all_data, f, default=_to_builtin)

    # 4. Also save summary stats
    summary_df = pd.DataFrame({
        "metric": [label for _, label, _ in metrics],
        "value": [
            np.nan if means[key] is None else means[key]
            for key, _, _ in metrics
        ],
    })
    summary_df.to_csv("vqa_summary_metrics.csv", index=False)

    print(f"Results saved to: {RESULTS_CSV}, {RESULTS_JSON}, and {METRICS_NPZ}")

# ─── OPTIMIZED EVALUATION FUNCTION ────────────────────────────────────────────
def _token_f1(prediction, reference):
    """Token-overlap F1 between two strings.

    Both strings are lower-cased and split on whitespace. Precision and recall
    are computed from the multiset overlap of the tokens. When either side has
    no tokens the score is 1.0 if both are empty and 0.0 otherwise.
    """
    from collections import Counter

    pred_tokens = prediction.lower().split()
    ref_tokens = reference.lower().split()
    if not pred_tokens or not ref_tokens:
        return float(pred_tokens == ref_tokens)

    overlap = sum((Counter(pred_tokens) & Counter(ref_tokens)).values())
    if overlap == 0:
        return 0.0
    precision = overlap / len(pred_tokens)
    recall = overlap / len(ref_tokens)
    return 2 * precision * recall / (precision + recall)


@torch.no_grad()  # inference only: no gradients are needed
def process_batch(batch_data):
    """Generate answers for one batch and accumulate its metrics.

    Args:
        batch_data: ``(encoding, answers, paths)`` as produced by the loader's
            collate function.

    Returns:
        ``True`` when the batch was scored, ``False`` when moving the inputs to
        the device or generation failed (nothing is accumulated in that case).

    Side effects:
        Extends the module-level accumulators (``all_em``, ``all_f1_tok``,
        ``all_bert_*``, ``all_sent``, ``all_cos``, ``sample_data``) and prints
        up to ``PRINT_LIMIT`` samples in total across calls.
    """
    global printed, all_em, all_f1_tok, all_bert_p, all_bert_r, all_bert_f1, all_sent, all_cos, sample_data

    encoding, gts, paths = batch_data
    # Work with plain lists of strings: the collate function may hand over
    # tuples, and a library that concatenates references and candidates needs
    # both to be lists.
    gts = [str(g) for g in gts]
    paths = list(paths)

    # Move input data to the device with error handling
    try:
        encoding = {k: v.to(DEVICE, non_blocking=True) for k, v in encoding.items()}
    except Exception as e:
        print(f"Error moving input tensors to device: {e}")
        return False

    # Forward pass
    try:
        outs = MODEL.generate(**encoding)
    except Exception as e:
        print(f"Error in model generation: {e}")
        return False

    # Decode predictions
    preds = [PROCESSOR.decode(o, skip_special_tokens=True).strip() for o in outs]

    # Case-insensitive exact match
    em_arr = np.array([int(p.lower() == g.lower()) for p, g in zip(preds, gts)])

    # Token-level F1. (Scoring each answer as a single classification label
    # would only reproduce exact match, so token overlap is computed directly.)
    f1_arr = np.array([_token_f1(p, g) for p, g in zip(preds, gts)], dtype=float)

    # BERT scores with error handling
    try:
        P, R, F1 = bert_score(preds, gts, lang='en', verbose=False, batch_size=len(preds))
        P, R, F1 = P.cpu().numpy(), R.cpu().numpy(), F1.cpu().numpy()
    except Exception as e:
        print(f"BERT score error: {e}, using zeros")
        P, R, F1 = (np.zeros(len(preds)) for _ in range(3))

    # Sentence-embedding similarity with error handling
    try:
        emb_gt = SENT_MODEL.encode(gts, convert_to_tensor=True, show_progress_bar=False)
        emb_pr = SENT_MODEL.encode(preds, convert_to_tensor=True, show_progress_bar=False)
        # Only the diagonal pairs ground truth i with prediction i.
        sent_sim = util.cos_sim(emb_gt, emb_pr).diag().cpu().numpy()
        cos_sim = 1 - cdist(emb_gt.cpu().numpy(), emb_pr.cpu().numpy(), metric='cosine').diagonal()
    except Exception as e:
        print(f"Embedding error: {e}, using zeros")
        sent_sim = np.zeros(len(preds))
        cos_sim = np.zeros(len(preds))

    # Store all metrics
    all_em.extend(em_arr.tolist())
    all_f1_tok.extend(f1_arr.tolist())
    all_bert_p.extend(P.tolist())
    all_bert_r.extend(R.tolist())
    all_bert_f1.extend(F1.tolist())
    all_sent.extend(sent_sim.tolist())
    all_cos.extend(cos_sim.tolist())

    # Store full sample data for each example. Every value is converted to a
    # built-in Python type so the records can be written as JSON.
    for i, path in enumerate(paths):
        sample_data.append({
            "path": path,
            "ground_truth": gts[i],
            "prediction": preds[i],
            "exact_match": int(em_arr[i]),
            "f1_token": float(f1_arr[i]),
            "bert_p": float(P[i]),
            "bert_r": float(R[i]),
            "bert_f1": float(F1[i]),
            "sent_sim": float(sent_sim[i]),
            "cos_sim": float(cos_sim[i])
        })

    # Print samples until the global limit is reached
    for i in range(len(preds)):
        if printed >= PRINT_LIMIT:
            break
        print(f"\nSample {printed+1}")
        print(f" GT: {gts[i]}")
        print(f" P:  {preds[i]}")
        printed += 1

    return True

# ─── STREAMLINED EVALUATION LOOP WITH TIME TRACKING ──────────────────────────────────────────────────
import gc  # imported once here instead of on every tenth loop iteration

start_time = time.time()
total_batches = len(loader)

print(f"Starting evaluation of {total_batches} batches...")
print(f"Results will be saved to {RESULTS_CSV}, {RESULTS_JSON}, and {METRICS_NPZ}")

# Process all batches. The try/finally writes whatever has been accumulated
# even when the run is interrupted or a batch raises, so a long evaluation
# does not lose all of its results to one late failure.
try:
    for batch_idx, batch_data in tqdm(enumerate(loader), 
                                    desc="Evaluating", 
                                    unit="batch", 
                                    dynamic_ncols=True,
                                    total=total_batches,
                                    bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]"):

        # Process the current batch data
        success = process_batch(batch_data)
        if not success:
            print(f"Batch {batch_idx} processing failed, skipping to next batch")
            continue

        # Every 10 batches: release unused memory and report progress
        if (batch_idx + 1) % 10 == 0:
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            elapsed = time.time() - start_time
            # progress is always positive here because batch_idx + 1 >= 10
            progress = (batch_idx + 1) / total_batches
            estimated_total = elapsed / progress
            remaining = estimated_total - elapsed
            remaining_min = remaining / 60
            elapsed_min = elapsed / 60
            print(f"Progress: {progress:.1%} | Elapsed: {elapsed_min:.1f} min | Remaining: {remaining_min:.1f} min")
finally:
    # Save all results, complete or partial
    save_results()

# ─── FINAL SUMMARY ────────────────────────────────────────────────────────────
print("\n=== OVERALL METRICS ===")
print(f"Exact Match:           {np.mean(all_em):.4f}")
print(f"Token-level F1:        {np.mean(all_f1_tok):.4f}")
print(f"BERT-Score P/R/F1:     {np.mean(all_bert_p):.4f}/{np.mean(all_bert_r):.4f}/{np.mean(all_bert_f1):.4f}")
print(f"Sentence Similarity:   {np.mean(all_sent):.4f}")
print(f"Cosine Similarity:     {np.mean(all_cos):.4f}")
print(f"Total samples:         {len(all_em)}")
print(f"Total execution time:  {(time.time() - start_time) / 60:.2f} minutes")

# Visualize the distribution of metrics
try:
    import matplotlib.pyplot as plt

    # (panel title, per-sample values) for each histogram in the 2x2 grid
    metrics = [
        ("Exact Match", all_em),
        ("Token F1", all_f1_tok),
        ("BERT F1", all_bert_f1),
        ("Sentence Sim", all_sent),
    ]

    fig = plt.figure(figsize=(12, 8))
    for panel, (name, values) in enumerate(metrics, start=1):
        ax = fig.add_subplot(2, 2, panel)
        ax.hist(values, bins=20)
        ax.set_title(f"{name} (mean: {np.mean(values):.4f})")

    fig.tight_layout()
    fig.savefig("vqa_metrics_distribution.png")
    print("Visualization saved to: vqa_metrics_distribution.png")
except Exception as e:
    # Plotting is optional: report the problem and carry on.
    print(f"Could not create visualization: {e}")

Starting evaluation of 23427 batches...
Results will be saved to vqa_evaluation_results.csv, vqa_evaluation_results.json, and vqa_metrics.npz


Evaluating:   0%|          | 0/23427 [00:00<?, ?batch/s]