In [1]:
!pip install accelerate datasets bitsandbytes scikit-learn pandas pillow tqdm timeout-decorator transformers==4.40.1 peft==0.10.0 --quiet

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.0/138.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m74.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m93.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta

In [2]:
import os
import pandas as pd
from PIL import Image
from transformers import BlipProcessor, BlipForQuestionAnswering, Trainer, TrainingArguments
import torch
from accelerate import Accelerator
from peft import LoraConfig, get_peft_model
from transformers.data.data_collator import default_data_collator
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

2025-05-16 12:07:06.661343: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747397226.851062      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747397226.904094      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
# === CONFIGURATION ===
BASE_PATH = "/kaggle/input/vr-dataset/dataset_curated"
SUBFOLDERS = [f"S{i}" for i in range(1, 7)]
TRAIN_FRAC = 0.6
VAL_FRAC = 0.2
TEST_FRAC = 0.2
R = 16
EPOCHS = 3
DEBUG = False  # Set to True to run on a subset of data
DEBUG_SAMPLES = 2000  # Number of QA pairs to use in debug mode

In [4]:
# === LOAD DATA ===
all_dfs = []
for folder in SUBFOLDERS:
    csv_path = os.path.join(BASE_PATH, folder, f"{folder}_qa_data.csv")
    if not os.path.exists(csv_path):
        print(f"CSV not found: {csv_path}")
        continue
    df = pd.read_csv(csv_path, header=None, names=["image_path", "question", "answer"])
    df["image_path"] = df["image_path"].apply(
        lambda p: os.path.join(BASE_PATH, os.path.normpath(p).split("dataset_curated/")[-1])
    )
    df = df[df["image_path"].apply(os.path.exists)].reset_index(drop=True)
    print(f"Loaded {len(df)} examples from {csv_path}")
    all_dfs.append(df)

df_all = pd.concat(all_dfs).reset_index(drop=True)
# Clean dataset
df_all = df_all[df_all["question"].notna() & df_all["answer"].notna()]
df_all["question"] = df_all["question"].astype(str)
df_all["answer"] = df_all["answer"].astype(str)
df_all = df_all[df_all["question"].str.strip() != ""]
df_all = df_all[df_all["answer"].str.strip() != ""]
print(f"After cleaning, total QA pairs: {len(df_all)}")

# === DEBUG MODE ===
if DEBUG:
    df_all = df_all.sample(n=DEBUG_SAMPLES, random_state=42).reset_index(drop=True)
    print(f"DEBUG mode: Sampled {len(df_all)} QA pairs")

# === DATA SPLITTING ===
train_val, test_df = train_test_split(df_all, test_size=TEST_FRAC, random_state=42, shuffle=True)
rel_val = VAL_FRAC / (TRAIN_FRAC + VAL_FRAC)
train_df, val_df = train_test_split(train_val, test_size=rel_val, random_state=42, shuffle=True)
print(f"Train size: {len(train_df)}, Val size: {len(val_df)}, Test size: {len(test_df)}")

Loaded 14366 examples from /kaggle/input/vr-dataset/dataset_curated/S1/S1_qa_data.csv
Loaded 14358 examples from /kaggle/input/vr-dataset/dataset_curated/S2/S2_qa_data.csv
Loaded 14367 examples from /kaggle/input/vr-dataset/dataset_curated/S3/S3_qa_data.csv
Loaded 14366 examples from /kaggle/input/vr-dataset/dataset_curated/S4/S4_qa_data.csv
Loaded 14387 examples from /kaggle/input/vr-dataset/dataset_curated/S5/S5_qa_data.csv
Loaded 14376 examples from /kaggle/input/vr-dataset/dataset_curated/S6/S6_qa_data.csv
After cleaning, total QA pairs: 86208
Train size: 51724, Val size: 17242, Test size: 17242


In [5]:
# === ACCELERATOR INIT ===
accelerator = Accelerator()

# === LOAD BLIP MODEL & PROCESSOR ===
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base", use_fast=True)
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")

# === DEFINE CUSTOM DATASET ===
class VQADataset(torch.utils.data.Dataset):
    def __init__(self, df, processor, image_base_path):
        self.df = df
        self.processor = processor
        self.image_base_path = image_base_path

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        full_image_path = row['image_path']
        try:
            image = Image.open(full_image_path).convert("RGB")
        except Exception as e:
            print(f"Failed to load {full_image_path}: {e}")
            image = Image.new("RGB", (224, 224), (0, 0, 0))
        encoding = self.processor(
            images=image,
            text=row['question'],
            padding="max_length",
            max_length=128,
            truncation=True,
            return_tensors="pt",
            return_attention_mask=True
        )
        labels = self.processor.tokenizer(
            row['answer'],
            padding="max_length",
            truncation=True,
            max_length=32,
            return_tensors="pt"
        )["input_ids"]
        encoding = {k: v.squeeze(0) for k, v in encoding.items()}
        encoding["labels"] = labels.squeeze(0)
        return encoding



preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

In [None]:
# === CREATE DATASET INSTANCES ===
train_dataset = VQADataset(train_df, processor, BASE_PATH)
val_dataset = VQADataset(val_df, processor, BASE_PATH)

# === APPLY LoRA TO MODEL ===
lora_config = LoraConfig(
    r=R,
    lora_alpha=32,
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none"
)
model = get_peft_model(model, lora_config)
print("LoRA applied.")

# === PREPARE MODEL FOR ACCELERATION ===
model = accelerator.prepare(model)

# === DEFINE TRAINING ARGUMENTS ===
training_args = TrainingArguments(
    output_dir="./results",
    run_name="blip_vqa_lora_finetune_curated",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy="epoch",
    fp16=torch.cuda.is_available(),
    remove_unused_columns=False,
    report_to="none"
)

# === TRAINER SETUP ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=default_data_collator,
    tokenizer=processor
)

LoRA applied.


In [7]:
# === GPU INFO ===
if torch.cuda.is_available():
    print("GPU Memory Usage Before Training:")
    print(torch.cuda.memory_summary())

# === START TRAINING ===
trainer.train()

# === SAVE MODEL ===
trainer.save_model("./blip_vqa_16")
print("Model saved to './blip_vqa_16'")

GPU Memory Usage Before Training:
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   1479 MiB |   1479 MiB |   1479 MiB |      0 B   |
|       from large pool |   1468 MiB |   1468 MiB |   1468 MiB |      0 B   |
|       from small pool |     10 MiB |     10 MiB |     10 MiB |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         |   1479 MiB |   1479 MiB |   1479 MiB |      0 B   |
|       from large pool |   1468 MiB |   1468 MiB |   1468 MiB |      0 B   |
|       from small pool |     10 MiB |     10 MiB |     10 MiB |      0 B   |
|-----------------------------

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Step,Training Loss
10,10.3056
20,10.0721
30,9.8703
40,9.6988
50,9.5528
60,9.4366
70,9.3182
80,9.2063
90,9.0957
100,8.982


Model saved to './blip_vqa_16'


In [8]:
# === TEST-TIME EVALUATION ===
model.eval()
y_true, y_pred = [], []

def predict_one(example):
    try:
        img = Image.open(example["image_path"]).convert("RGB")
        question = example["question"]
        inputs = processor(images=img, text=question, return_tensors="pt")
        inputs = {k: v.to(accelerator.device) for k, v in inputs.items()}
        with torch.no_grad():
            out = model.generate(**inputs, max_new_tokens=32)
        return processor.decode(out[0], skip_special_tokens=True).strip().lower()
    except Exception as e:
        print(f"Error predicting: {e}")
        return ""

for i in tqdm(range(len(test_df))):
    row = test_df.iloc[i]
    gt = str(row.answer).lower()
    pr = predict_one(row.to_dict())
    y_true.append(gt)
    y_pred.append(pr)
    # Clear memory after each prediction
    torch.cuda.empty_cache()

# Compute metrics
acc = accuracy_score(y_true, y_pred)
prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="macro", zero_division=0)

print(f"\nTest Accuracy: {acc:.2f}")
print(f"Macro Precision: {prec:.2f}")
print(f"Macro Recall: {rec:.2f}")
print(f"Macro F1 Score: {f1:.2f}")

100%|██████████| 17242/17242 [30:34<00:00,  9.40it/s]



Test Accuracy: 0.65
Macro Precision: 0.20
Macro Recall: 0.20
Macro F1 Score: 0.19
