In [None]:
!pip install --upgrade google-genai
!pip install google-generativeai pillow

**Imports**

In [None]:
import os
import json
from pathlib import Path
import pandas as pd
from PIL import Image
import google.generativeai as genai
import re
import time
import csv

**API Configuration**

In [None]:
# Publish the Gemini API key through the process environment, then hand the
# same value to the client library.
# NOTE(review): the key is assigned inline in this cell; confirm it is never
# committed with a real value (a secrets store would be safer).
_api_key_var = "GEMINI_API_KEY"
os.environ[_api_key_var] = "<APIKEY>"
genai.configure(api_key=os.environ[_api_key_var])

# Handle to the Gemini model that later cells call to generate content.
model = genai.GenerativeModel(model_name="gemini-2.0-flash")

**Filter Metadata**

In [None]:
# Metadata fields retained for the VQA prompt; every other CSV column is dropped.
fields_to_keep = [
    "color",
    "item_name",
    "product_type",
    "image_id",
    "item_keywords"
]

# Read the balanced dataset CSV and project each row onto `fields_to_keep`.
# `newline=""` is what the csv module requires so that quoted fields containing
# line breaks are parsed correctly; the encoding is made explicit so the result
# does not depend on the platform default.
# A column missing from the file yields None for that key (DictReader row.get).
with open("/kaggle/input/balanced_dataset.csv", "r", newline="", encoding="utf-8") as f:
    reader = csv.DictReader(f)
    meta_list = [
        {key: row.get(key) for key in fields_to_keep}
        for row in reader
    ]


**Load Data**

In [None]:
# Image index: one row per image, read from the dataset's images.csv.
images_df = pd.read_csv("/kaggle/input/abo-images-small/images/metadata/images.csv")

# Lookup table from image_id to the image's path as listed in the CSV.
# If an image_id appears more than once, the last occurrence wins.
path_by_id = {
    image_id: image_path
    for image_id, image_path in zip(images_df["image_id"], images_df["path"])
}

**Generate VQA Pairs via Gemini**

In [None]:
# Accumulates one dict per generated question/answer pair.
records = []
# Number of leading entries of `meta_list` to skip before generating.
skip_count=19000

# Loop over meta_list, skipping the first `skip_count` entries.
# For each remaining product: build a prompt from its metadata, send the prompt
# plus the product image to Gemini, parse the JSON reply, and append one record
# per question/answer pair.
# The loop stops at the first error. Whatever has been collected is always
# written to the checkpoint CSV in the `finally` clause, so a kernel interrupt
# (KeyboardInterrupt is not an `Exception`) or a normal finish also saves it.
try:
    for item in meta_list[skip_count:]:
        img_id = item.get("image_id")
        if not img_id:
            # Identify the row by `item_name`: `item_id` is not among the
            # retained metadata fields, so it would always print None.
            print("No main image found for item:", item.get("item_name"))
            continue

        rel_path = path_by_id.get(img_id)
        if not rel_path:
            # No known file path for this image id; nothing to send.
            continue

        meta_str = json.dumps(item)
        prompt   = (
            "You are creating a Visual Question Answering (VQA) dataset.\n"
            f"Product metadata:\n{meta_str}\n\n"
            "Given this metadata + image, generate 5 distinct question-answer pairs, ensure you know the answer to the question.\n"
            "- Each answer must be exactly one word.\n"
            "- The 5 questions generated MUST be different from each other, and MUST be answerable just by looking at the image.\n"
            "- Return a JSON array of {question,answer} objects."
        )

        # NOTE(review): this image root differs from the dataset root used to
        # read images.csv ("/kaggle/input/abo-images-small/...") - confirm both
        # mounts exist.
        # The context manager closes the image file once the request returns,
        # instead of leaking one open handle per iteration.
        with Image.open(f"/kaggle/input/vrproject2/abo-images-small/images/small/{rel_path}") as img:
            response = model.generate_content([prompt, img])
        # Trim surrounding whitespace first so the anchored fence patterns below
        # still match when the reply is padded with blank lines or spaces.
        raw = response.text.strip()

        # Strip markdown fences
        raw = re.sub(r"^```json\s*\n?", "", raw)
        raw = re.sub(r"\n?```$", "", raw)

        # Parse JSON; a malformed reply raises and ends the run (handled below).
        vqa_items = json.loads(raw)

        print("Response for", img_id, ":", vqa_items)
        for qa in vqa_items:
            records.append({
                "image_id": img_id,
                "question": qa["question"],
                "answer":   qa["answer"],
                "path": rel_path
            })

        # Only report a last record when this image contributed one; avoids an
        # IndexError on an empty `records` and a stale record from a prior image.
        if vqa_items:
            print("Last record:", records[-1])
        # Fixed pause between consecutive requests.
        time.sleep(5)
except Exception as e:
    print("Error: ",e)
finally:
    # Checkpoint everything gathered so far, however the loop ended.
    vqa_df = pd.DataFrame(records)
    vqa_df.to_csv("/kaggle/working/vqa_dataset0.csv", index=False)
    print("Wrote", len(vqa_df), "rows (skipped first", skip_count, "entries).")

**Save Generated VQA Dataset**

In [None]:
# Turn the accumulated Q&A records into a table and export it as CSV
# (relative path, so it lands in the current working directory).
vqa_df = pd.DataFrame(records)
vqa_df.to_csv("vqa_dataset.csv", index=False)

# Report how many question/answer rows were exported.
pair_count = len(vqa_df)
print("Generated", pair_count, "Q&A pairs across all images")