In [None]:
import os
import pandas as pd
from PIL import Image
from tqdm.auto import tqdm

import torch
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from accelerate import init_empty_weights, infer_auto_device_map

from sklearn.metrics import accuracy_score, f1_score

In [None]:
# Hugging Face Hub checkpoint to evaluate; any other BLIP-2 variant can be substituted.
MODEL_NAME = "Salesforce/blip2-flan-t5-xl"

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

In [None]:
# Number of rows randomly drawn from the dataset for this evaluation run.
SUBSET_SIZE = 100

# Number of dataset rows processed per step of the inference loop.
BATCH_SIZE = 8

# Annotation file; expected columns: ["path", "question", "answer"].
DATA_CSV = "/kaggle/input/vrproject2dataset/vqa_dataset.csv"

In [None]:
# Load the annotation table and draw a fixed, reproducible evaluation subset.
df = pd.read_csv(DATA_CSV)

# Fail fast (before the expensive model load further down) if the CSV lacks a
# column that the later cells read.
missing_cols = {"path", "question", "answer"} - set(df.columns)
if missing_cols:
    raise KeyError(f"{DATA_CSV} is missing required columns: {sorted(missing_cols)}")

# Optional: split into train/test if not already split
# from sklearn.model_selection import train_test_split
# train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
# test_df = df  # assuming this is your evaluation split

# DataFrame.sample raises ValueError when n exceeds the number of rows (sampling
# without replacement), so cap the request at len(df). random_state is fixed so
# the same rows are selected on every run; the index is reset to 0..n-1.
test_df = df.sample(n=min(SUBSET_SIZE, len(df)), random_state=42).reset_index(drop=True)

In [None]:
# # 3. Infer a full device_map so that `language_model` (and all sub‑modules)
# #    are assigned to specific GPUs and the warning goes away.
# #    Adjust max_memory per‑GPU as needed.
# with init_empty_weights():
#     dummy = Blip2ForConditionalGeneration.from_pretrained(MODEL_NAME)
# device_map = infer_auto_device_map(
#     dummy,
#     max_memory={i: "30GB" for i in range(torch.cuda.device_count())},
#     no_split_module_classes=["Blip2VisionModel"]  # prevents splitting vision patch embed
# )


In [None]:
# 3. Load model & processor
processor = Blip2Processor.from_pretrained(MODEL_NAME, use_fast=True)

# Half precision is only selected when a GPU is available. DEVICE falls back
# to "cpu" on machines without CUDA, and there float32 is kept because fp16
# kernels on CPU are slow or unsupported depending on the PyTorch build.
model_dtype = torch.float16 if DEVICE == "cuda" else torch.float32

model = Blip2ForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    torch_dtype=model_dtype,
    device_map="auto",            # let accelerate place layers on the available devices
    offload_folder="offload",     # disk folder for weights that do not fit in memory
)
# Switch to inference mode (disables dropout). The trailing semicolon keeps the
# notebook from printing the full module repr as the cell output.
model.eval();


In [None]:
# 4. Inference loop
# Runs the model over test_df in batches of BATCH_SIZE and collects, for every
# row, the lower-cased/stripped prediction (preds) and reference answer (refs).
preds = []
refs = []

BASE_DIR = "/kaggle/input/vrproject2/abo-images-small/images/small"

for start in tqdm(range(0, len(test_df), BATCH_SIZE), desc="Evaluating"):
    batch = test_df.iloc[start : start + BATCH_SIZE]

    # Image.open is lazy and keeps the file open; convert() forces the pixel
    # data to be read and returns an independent copy, so the handle can be
    # closed right away instead of leaking one descriptor per image.
    images = []
    for fname in batch["path"]:
        with Image.open(os.path.join(BASE_DIR, fname)) as img:
            images.append(img.convert("RGB"))

    # str() guards against cells pandas parsed as non-strings (numbers, NaN).
    questions = [str(q) for q in batch["question"]]

    # Preprocess. padding=True pads only to the longest question in the batch
    # rather than to the tokenizer's maximum length. The result is then moved
    # to the model's device; floating-point tensors (pixel_values) are also
    # cast to the model's dtype so they match the weights.
    inputs = processor(
        images=images,
        text=questions,
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to(model.device, model.dtype)

    # Generate answers (single-word expected, hence the small token budget)
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=5,
            num_beams=5,
        )
    answers = processor.batch_decode(generated_ids, skip_special_tokens=True)

    # Release cached GPU blocks between batches to keep peak memory down.
    torch.cuda.empty_cache()

    # Normalize both sides identically so the later exact-match metrics are fair.
    preds.extend(a.strip().lower() for a in answers)
    refs.extend(str(r).strip().lower() for r in batch["answer"])



In [None]:
# Exact-match metrics computed over the reference and predicted answer strings.
acc = accuracy_score(y_true=refs, y_pred=preds)
# average="macro" is used here; switch to "weighted" if the answers are class-imbalanced.
f1 = f1_score(y_true=refs, y_pred=preds, average="macro")

report_lines = [
    f"Baseline Evaluation Results on {len(test_df)} examples:",
    f"  • Accuracy: {acc * 100:.2f}%",
    f"  • Macro F1  : {f1  * 100:.2f}%",
]
print("\n".join(report_lines))

In [None]:
# 6. Save detailed results
# One row per evaluated example: the input columns from test_df plus the
# model's prediction, written without the index column.
OUTPUT_CSV = "results_subset.csv"

results_df = test_df[["path", "question", "answer"]].assign(prediction=preds)
results_df.to_csv(OUTPUT_CSV, index=False)

print(f"\nSaved detailed results to {OUTPUT_CSV}")