In [5]:
import os
import json
import zipfile
import json
import zipfile
import pandas as pd
from PIL import Image
import time
import csv
import random
import numpy as np
from tqdm import tqdm
from google.colab import drive

drive.mount('/content/drive/')

zip_path = '/content/drive/MyDrive/metadata.zip'
extract_dir = 'meta_data'

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

Mounted at /content/drive/


In [1]:
! pip install tqdm



In [None]:
metadata_zip_path = '/content/drive/MyDrive/metadata.zip'
images_base_path = '/content/drive/MyDrive/ABO/images/small/'
extract_dir = 'meta_data'

In [None]:
data = pd.read_csv('final_output.csv')
data

Unnamed: 0,image_id,image_path,description
0,81duX-6eMDL,cd/cdefa595.jpg,
1,717RrHbUDWL,59/59188588.jpg,Outer Material: PU | Closure Type: Lace-Up | H...
2,81IZqnVjLNL,d6/d6a50cdd.jpg,"Snug fit for Samsung Galaxy M21, with perfect ..."
3,81gyvslce+L,21/21a9ec2f.jpg,"Snug fit for Samsung Galaxy M21, with perfect ..."
4,71WiahtDXeL,8e/8e2af54f.jpg,Kosher | One 0.75 Oz. Packet Of Allegro Tea Or...
...,...,...,...
147122,71loipDQ6aL,44/44f4e0ad.jpg,3D Printed Hard Back Case Mobile Cover for Mic...
147123,51Ajy3EZhwL,5b/5bb7f6f6.jpg,1-gallon container (4-pack) of liquid professi...
147124,61OLxxM-vYL,e6/e6b3eb59.jpg,
147125,71cd4GIAv0L,41/412b4182.jpg,


In [1]:
# Extract metadata | Load product metadata
with zipfile.ZipFile(metadata_zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

products_df = pd.read_csv(os.path.join(extract_dir, 'products.csv'))
listings_df = pd.read_csv(os.path.join(extract_dir, 'listings.csv'))
images_df = pd.read_csv(os.path.join(extract_dir, 'images.csv'))

# Merge datasets
merged_df = pd.merge(images_df, products_df, on='product_id')
merged_df = pd.merge(merged_df, listings_df, on='product_id')

print(f"Loaded dataset with {len(merged_df)} items")

Loaded dataset with 147127 items


# Gemini API Setup and Prompt Engineering

In [None]:
import google.generativeai as genai

# Configure API (replace with API key)
genai.configure(api_key="AIzaSyDVL6tiPn7klefV3ppq12aRv3h2hFGStDc")

# Initialize model
model = genai.GenerativeModel(model_name="gemini-2.0-flash")

# Define refined prompt based on project report
PROMPT_TEMPLATE = """
Analyze the given image and description. Generate exactly 5 diverse factual questions per image.
Each question should be followed by a one-word answer and then by a difficulty in the format: <Question> # <Answer> # <Difficulty>
Answers must not be 'Yes' or 'No'; instead, provide a specific noun, adjective, or number as a one-word factual answer.
Separate each question-answer pair with a % symbol. Do not include any numbering, colons, or extra text.
Output only one line in the format: <Question1> # <Answer1> % <Question2> # <Answer2> % ... for exactly 5 question-answer pairs.

For each question, assign a difficulty level (0-5):
- Level 0: Answer directly visible in the image or explicitly stated in text
- Level 1: Answer requires basic inference from visible elements
- Level 2: Answer requires combining information from image and text
- Level 3: Answer requires product knowledge beyond what's explicitly shown
- Level 5: ONLY for questions requiring specialized domain expertise
"""


# Data Generation Function

In [None]:
# Cell 4:
def generate_qa_pairs(image_path, description, max_retries=3):
    """Generate QA pairs with difficulty levels for a given image and description"""

    def validate_answer(answer):
        return len(answer.split()) == 1 and answer.lower() not in ['yes', 'no']

    attempts = 0
    while attempts < max_retries:
        try:
            # Load image
            full_image_path = os.path.join(images_base_path, image_path)
            img = Image.open(full_image_path)

            prompt = PROMPT_TEMPLATE.format(description=description)
            response = model.generate_content([prompt, img])

            qa_lines = [line.strip() for line in response.text.split('%') if line.strip()]
            qa_data = []

            for line in qa_lines:
                if ',' in line:  # Basic CSV validation
                    question, answer,  difficulty, = line.split('#', 2)
                    question = question.strip()
                    difficulty = difficulty.strip()
                    answer = answer.strip()

                    # Validate using the nested function we defined above
                    if (question and difficulty.isdigit() and
                        0 <= int(difficulty) <= 5 and
                        validate_answer(answer)):
                        qa_data.append({
                            'question': question,
                            'answer': answer,
                            'difficulty': int(difficulty),
                        })

                return qa_data

        except Exception as e:
            print(f"Attempt {attempts+1} failed: {str(e)}")
            time.sleep(2)  # Backoff

        attempts += 1

    return None

# Batch Processing with Progress Tracking

In [4]:
output_file = 15

fieldnames = [
    'image_id',
    'path',
    'question',
    'answer',
    'difficulty',
]


# Write header
with open(output_file, mode='w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()

# Process each item with progress tracking
for idx, row in tqdm(merged_df.iterrows(), total=len(merged_df)):
    try:
        qa_pairs = generate_qa_pairs(row['image_path'], row['description'])

        if qa_pairs:
            for pair in qa_pairs:

                # Write to CSV
                with open(output_file, mode='a', newline='', encoding='utf-8') as f:
                    writer = csv.DictWriter(f, fieldnames=fieldnames)
                    writer.writerow({
                        'image_id': row['image_id'],
                        'path': row['image_path'],
                        'question': pair['question'],
                        'answer': pair['answer'],
                        'difficulty': pair['difficulty'],
                    })

        # Rate limiting
        time.sleep(3 if idx % 10 == 0 else 1)  # Slower every 10 items

    except Exception as e:
        print(f"Error processing row {idx}: {str(e)}")
        continue

print(f"Dataset curation complete. Results saved to {output_file}.csv")

Processing items: 100%|██████████| 8326/8326 [00:17<00:00, 462.75it/s, 8326/8326]


Completed processing all 8326 items
Dataset curation complete. Results saved to 15.csv 



