In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


In [None]:
import pandas as pd

metadata_path = "/kaggle/input/abo-small/metadata/images.csv"
df = pd.read_csv(metadata_path)
df.head()
print("Total images:", len(df))

In [None]:
import json

# Loading the listings
with open("/kaggle/input/abo-product-metadata/combined_listings.json", "r") as f:
    data = json.load(f)

print(f"Total entries: {len(data)}")

# # Printing a sample listing
# sample_id = list(data.keys())[0]
# print("Sample image_id:", sample_id)
# print("Associated product data:", data[sample_id])

In [None]:
# Printing a sample listing
sample_id = list(data.keys())[5]
print("Sample image_id:", sample_id)
print("Associated product data:", data[sample_id])

In [None]:
def extract_metadata(data):
    metadata_lines = []

    def get_en_value(field):
        """Helper to extract English value from a language-tagged list."""
        return next((entry.get('value') for entry in data.get(field, [])
                     if entry.get('language_tag', '').startswith('en') and entry.get('value')), None)


    item_name = get_en_value('item_name')
    if item_name:
        metadata_lines.append(f"Item Name: {item_name}")

    model_name = get_en_value('model_name')
    if model_name:
        metadata_lines.append(f"Model Name: {model_name}")
        
    brand = get_en_value('brand')
    if brand:
        metadata_lines.append(f"Brand: {brand}")


    product_type = data.get('product_type', [{}])[0].get('value')
    if product_type:
        metadata_lines.append(f"Product Type: {product_type}")

    style = get_en_value('style')
    if style:
        metadata_lines.append(f"Style: {style}")

    color = get_en_value('color')
    if color:
        metadata_lines.append(f"Color: {color}")

    # Weight
    weight_data = data.get('item_weight', [{}])[0].get('normalized_value', {})
    if weight_data:
        weight = weight_data.get('value')
        unit = weight_data.get('unit')
        if weight is not None and unit:
            metadata_lines.append(f"Item Weight: {weight} {unit}")

    # Dimensions
    dimensions = data.get('item_dimensions')
    if dimensions:
        height = dimensions.get('height', {}).get('value')
        length = dimensions.get('length', {}).get('value')
        width = dimensions.get('width', {}).get('value')
        unit = dimensions.get('height', {}).get('unit')
        if None not in (height, length, width, unit):
            metadata_lines.append(f"Item Dimensions (H x L x W): {height} x {length} x {width} {unit}")

    # Bullet Points (only English)
    bullet_points = [
        bp['value'] for bp in data.get('bullet_point', [])
        if bp.get('language_tag', '').startswith('en') and bp.get('value')
    ]
    if bullet_points:
        metadata_lines.append("Bullet Points: " + "; ".join(bullet_points))


    item_keywords = get_en_value('item_keywords')
    if item_keywords:
        metadata_lines.append(f"Item Keywords: {item_keywords}")

    return "\n".join(metadata_lines)

In [None]:
print(extract_metadata(data[sample_id]))

In [None]:
import json
import random
from collections import defaultdict


print(f"Total entries in original data: {len(data)}")

# Helper to check if an entry is in English
def is_english(entry):
    for field in ['brand', 'bullet_point', 'color', 'item_name', 'style', 'item_keywords']:
        if field in entry:
            values = entry[field]
            if not all(v.get('language_tag', '').startswith('en') for v in values):
                return False
    return True

# Filter to English-only entries
english_entries = {
    img_id: info for img_id, info in data.items() if is_english(info)
}

print(f"Entries after English filtering: {len(english_entries)}")

# Group by product type
product_type_groups = defaultdict(list)
for img_id, info in english_entries.items():
    product_type = info.get('product_type', [{}])[0].get('value')
    if product_type:
        product_type_groups[product_type].append((img_id, info))

# Determine equal sample size per product type
num_types = len(product_type_groups)
target_total = 30000
per_type_quota = target_total // num_types
sampled_data = {}

print(f"Sampling approx {per_type_quota} entries per product type...")

# First round of balanced sampling
for ptype, items in product_type_groups.items():
    sample_count = min(per_type_quota, len(items))
    sampled = random.sample(items, sample_count)
    for img_id, info in sampled:
        sampled_data[img_id] = info

# Fill remaining with random from types that had more than quota
remaining = target_total - len(sampled_data)
overflow_pool = []

for ptype, items in product_type_groups.items():
    available = [item for item in items if item[0] not in sampled_data]
    overflow_pool.extend(available)

if remaining > 0 and overflow_pool:
    additional_samples = random.sample(overflow_pool, min(remaining, len(overflow_pool)))
    for img_id, info in additional_samples:
        sampled_data[img_id] = info

print(f"Final sampled size: {len(sampled_data)}")

# Save the sampled data
with open("sampled_english_equal_product_types.json", "w") as f_out:
    json.dump(sampled_data, f_out)

print("Saved to 'sampled_english_equal_product_types.json'")


In [None]:
image_base_path = "/kaggle/input/abo-small/small"
metadata_path = "/kaggle/input/abo-small/metadata/images.csv"
metadata_df = pd.read_csv(metadata_path)

In [None]:
!pip install google-generativeai tqdm --quiet

In [None]:
sample_key, val = None, None
for key, value in sampled_data.items():
    sample_key = key
    val = value
    break

In [None]:
metadata_df = metadata_df.set_index("image_id")

# Then retrieve path directly
path = metadata_df.at[val.get("main_image_id"), "path"]
# print("Path:", path)

In [None]:
def extract_json(output):
    try:
        start = output.find('[')
        end = output.rfind(']')
        json_str = output[start:end+1]

        data = json.loads(json_str)

        # Check format
        if not isinstance(data, list):
            raise ValueError("The response should be a list of Q&A pairs.")
        for item in data:
            if not isinstance(item, dict):
                raise ValueError("Each item in the list should be a dictionary.")
            if "question" not in item or "answer" not in item:
                raise ValueError("Each dictionary should contain 'question' and 'answer' keys.")
            if not isinstance(item["question"], str) or not isinstance(item["answer"], str):
                raise ValueError("Both 'question' and 'answer' should be strings.")

        return data

    except:
        return None

In [None]:
extract_metadata(val)

In [None]:
import google.generativeai as genai
from PIL import Image
import os

genai.configure(api_key="AIzaSyAyQ9vilRYQmaDVhAvtCUyU--HhaDjt9eo")

model = genai.GenerativeModel("gemini-1.5-flash")

image_path = os.path.join(image_base_path, path)
image = Image.open(image_path)



prompt = '''You are shown a product image along with its metadata. Use the metadata only for *contextual understanding*, not for answering. Your task is to generate a set of **4–6 diverse, image-answerable, and challenging questions**, where **each answer is a single word** (e.g., "blue", "five", "yes", "sneakers").

These questions must require **careful visual inspection**, **fine-grained detail recognition**, or **higher-level visual reasoning**, but must be **fully answerable using only the image itself**. The metadata is provided to help disambiguate what the image might contain — do not base your questions on metadata-only information.

Question Requirements:
- Must be **clearly grounded in visible evidence** in the image
- Must be **unambiguous**, **non-subjective**, and **non-trivial**
- Each answer must be a **single word** (e.g., noun, color, number, "yes"/"no")

Coverage:
Include **at least one** question that requires:
1. Small detail recognition 
2. Fine-grained categorization 
3. Challenging reasoning or fine-grained counting

Also consider:
- Material/texture recognition
- Color attribute specificity
- Brand/logo identification
- Shape or structure understanding
- Functional yes/no questions

Difficulty:
- Mix easy, moderate, and hard — with emphasis on **moderate to hard**
- Avoid obvious or metadata-derived questions
- Avoid subjective or vague terms like "modern" or "comfortable"

Output Format:
Return only a list of dictionaries in this exact format:

[
    {
        "question": "What specific type of product is shown?",
        "answer": "Backpack"
    },
    {
        "question": "What material is most visible on the product?",
        "answer": "Leather"
    },
    {
        "question": "What color is the zipper?",
        "answer": "Silver"
    },
    {
        "question": "Is the logo present on the front side?",
        "answer": "Yes"
    },
    {
        "question": "How many pockets are visible?",
        "answer": "Three"
    }
]
''' + f"Image Metadata: {extract_metadata(val)}"

print("Generating response")
response = model.generate_content([image, prompt])
text = response.text

print(text)
print(extract_json(text))

In [None]:
import os
import json
import time
import pandas as pd
from PIL import Image
import google.generativeai as genai
from tqdm import tqdm

# === CONFIGURATION ===
API_KEY = "AIzaSyDW7iUb6rnEZqrzA8hOa1qBT4wSrJsl9GA"
LISTINGS_JSON_PATH = "/kaggle/input/product-listings/product_listings.json"
OUTPUT_JSON = "vqa_results_24400_to_25800.json"

START_INDEX = 24400
END_INDEX = 25800


SLEEP_TIME = 3.5  # in seconds

# === Load listings JSON ===
with open(LISTINGS_JSON_PATH, "r") as f:
    data = json.load(f)


keys = list(data.keys())
data_subset = {k: data[k] for k in keys[START_INDEX:END_INDEX]}

# === Setup Gemini ===
genai.configure(api_key=API_KEY)
model = genai.GenerativeModel("gemini-2.0-flash")



all_results = {}

In [None]:

# === Iterate through listings ===
count = 0
for key, val in tqdm(data_subset.items()):

    try:
        image_id = val.get("main_image_id")
        if not image_id:
            continue

        # Retrieve image path
        path = metadata_df.at[image_id, "path"]
        image_path = os.path.join(image_base_path, path)
        if not os.path.exists(image_path):
            continue

        # Open image
        image = Image.open(image_path)

        # Compose prompt
        prompt = '''You are shown a product image along with its metadata. Use the metadata only for *contextual understanding*, not for answering. Your task is to generate a set of **4–6 diverse, image-answerable, and challenging questions**, where **each answer is a single word** (e.g., "blue", "five", "yes", "sneakers").

These questions must require **careful visual inspection**, **fine-grained detail recognition**, or **higher-level visual reasoning**, but must be **fully answerable using only the image itself**. The metadata is provided to help disambiguate what the image might contain — do not base your questions on metadata-only information.

Question Requirements:
- Must be **clearly grounded in visible evidence** in the image
- Must be **unambiguous**, **non-subjective**, and **non-trivial**
- Each answer must be a **single word**

Coverage:
Include **at least one** question that requires:
1. Small detail recognition 
2. Fine-grained categorization 
3. Challenging reasoning or fine-grained counting

Also consider:
- Material/texture recognition
- Color attribute specificity
- Brand/logo identification
- Shape or structure understanding
- Functional yes/no questions
- Any other visible aspects of the image

Difficulty:
- Mix easy, moderate, and hard — with emphasis on **moderate to hard**
- Avoid obvious or metadata-derived questions
- Avoid subjective or vague terms like "modern" or "comfortable"

Output Format:
Return only a list of dictionaries in this exact format:

[
    {
        "question": "What specific type of product is shown?",
        "answer": "Backpack"
    },
    {
        "question": "What material is most visible on the product?",
        "answer": "Leather"
    },
    {
        "question": "What color is the zipper?",
        "answer": "Silver"
    },
    {
        "question": "Is the logo present on the front side?",
        "answer": "Yes"
    },
    {
        "question": "How many pockets are visible?",
        "answer": "Three"
    }
]
''' + f"\nImage Metadata: {extract_metadata(val)}"

        print(f"Generating response for {key}...")
        response = model.generate_content([image, prompt])
        qa_data = response.text.strip()
        final_qa = extract_json(qa_data)

        all_results[key] = {
            "image_id": image_id,
            "qa_data": final_qa
        }

        # Save intermediate result
        with open(OUTPUT_JSON, "w") as f:
            json.dump(all_results, f, indent=2)

        count += 1
        time.sleep(SLEEP_TIME)

    except Exception as e:
        print(f"[ERROR] Failed for {key}: {e}")
        time.sleep(SLEEP_TIME * 2)  # longer pause after error
        continue

print(f"\n Completed {count} listings. Results saved to {OUTPUT_JSON}")


In [None]:
from transformers import BlipProcessor, BlipForQuestionAnswering, BartTokenizer, BartForConditionalGeneration
from PIL import Image
import torch
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
# import evaluate
# from bert_score import score as bert_score  # Uncomment if installed
import csv
from pathlib import Path

# Prepare output CSV
output_path = Path("vqa_test_predictions.csv")

# Config
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
total_images = 1000  # total available
eval_fraction = 0.3   # 20% of data for baseline evaluation

# Load BLIP
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)

# Load dataset and sample 20%
df = pd.read_csv("/kaggle/input/combined-vqa/flattened_vqa_with_metadata.csv")
df_sampled = df.sample(frac=eval_fraction, random_state=42).copy()

# Train-test split (e.g., 80-20)
train_df, test_df = train_test_split(df_sampled, test_size=0.2, random_state=42)

# Only evaluate on test set
predictions = []
references = test_df['answer'].tolist()

for idx, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Running VQA on Test Set"):
    try:
        image_path = row['image_path']
        question = row['question']
        ground_truth = row['answer']
        image = Image.open(image_path).convert("RGB")
        inputs = processor(image, question, return_tensors="pt").to(device)
        output = model.generate(**inputs)
        answer = processor.decode(output[0], skip_special_tokens=True)
        predictions.append(answer)
    except Exception as e:
        print(f"Failed on {row['image_path']}: {e}")
        predictions.append("")
        
    with open(output_path, mode="a", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow([image_path, question, ground_truth, answer])
    print("predictions saved to csv")
# Save and evaluate
# test_df = test_df.copy()
# test_df['predicted_answer'] = predictions
# test_df['correct'] = test_df['predicted_answer'].str.strip().str.lower() == test_df['answer'].str.strip().str.lower()
# test_df.to_csv("vqa_test_predictions.csv", index=False)


In [None]:
df_sampled

In [1]:
!pip install bert-score rouge-score nltk

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.0.0->bert-score)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-non

In [6]:
from transformers import BlipProcessor, BlipForQuestionAnswering
from peft import PeftModel, PeftConfig
import torch

# Load processor
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")

# Load base model
base_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")

# Load LoRA adapter
model = PeftModel.from_pretrained(base_model, "/kaggle/input/16-16-full-no-dense/transformers/default/1").to("cuda" if torch.cuda.is_available() else "cpu")
model.eval()

PeftModel(
  (base_model): LoraModel(
    (model): BlipForQuestionAnswering(
      (vision_model): BlipVisionModel(
        (embeddings): BlipVisionEmbeddings(
          (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
        )
        (encoder): BlipEncoder(
          (layers): ModuleList(
            (0-11): 12 x BlipEncoderLayer(
              (self_attn): BlipAttention(
                (dropout): Dropout(p=0.0, inplace=False)
                (qkv): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=2304, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=768, out_features=16, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=16, out_features=2304, bias=False)
                  )
   

In [9]:
from PIL import Image
import pandas as pd
import torch
from PIL import Image
from tqdm import tqdm
from transformers import BlipProcessor, BlipForQuestionAnswering
from peft import PeftModel
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load your test CSV (replace with actual path if needed)
test_df = pd.read_csv("/kaggle/input/combined-vqa/flattened_vqa_with_metadata.csv")  # or whatever test set you want
test_df = test_df.sample(n=30000, random_state=42).copy()  # Adjust sample size if needed

# Prepare predictions
predictions = []

for idx, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Running Inference"):
    try:
        image_path = row['image_path']
        question = str(row['question'])

        image = Image.open(image_path).convert("RGB")

        # Preprocess
        inputs = processor(image, question, return_tensors="pt").to(device)

        # Generate answer
        with torch.no_grad():
            output = model.generate(**inputs)

        answer = processor.decode(output[0], skip_special_tokens=True)
        predictions.append(answer)
    except Exception as e:
        print(f"Error with {image_path}: {e}")
        predictions.append("")

# Save predictions to CSV
test_df['predicted_answer'] = predictions
test_df.to_csv("blip_16-16-full-no-dense-epoch2-finetuned_predictions.csv", index=False)
print("Predictions saved to blip_8-16-qkv-finetuned_predictions.csv")


Running Inference:   2%|▏         | 513/30000 [12:57<12:24:48,  1.52s/it]


KeyboardInterrupt: 

In [3]:
!pip install bert-score rouge-score nltk

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.0.0->bert-score)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-non

In [4]:
import pandas as pd
from sklearn.metrics import f1_score
from bert_score import score as bert_score
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
# from bart_score import BARTScorer
from tqdm import tqdm

tqdm.pandas()


In [8]:
# Load CSV
df = pd.read_csv("/kaggle/working/blip_16-16-full-no-dense-epoch2-finetuned_predictions.csv")  # Replace with your CSV file path

# Clean text (optional but recommended)
def normalize_text(s):
    return str(s).strip().lower()

df['answer'] = df['answer'].astype(str).apply(normalize_text)
df['predicted_answer'] = df['predicted_answer'].astype(str).apply(normalize_text)

# Accuracy
accuracy = (df['answer'] == df['predicted_answer']).mean()

# BERTScore (Precision/Recall/F1)
P, R, F1 = bert_score(df['predicted_answer'].tolist(), df['answer'].tolist(), lang='en', verbose=True)
bert_f1 = F1.mean().item()

# ROUGE Score
rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
df['rougeL'] = df.progress_apply(lambda row: rouge.score(row['answer'], row['predicted_answer'])['rougeL'].fmeasure, axis=1)
avg_rougeL = df['rougeL'].mean()

# F1 Score (Token-level macro F1)
def compute_token_f1(pred, gt):
    pred_tokens = pred.split()
    gt_tokens = gt.split()
    common = set(pred_tokens) & set(gt_tokens)
    if not pred_tokens or not gt_tokens:
        return 0.0
    precision = len(common) / len(pred_tokens)
    recall = len(common) / len(gt_tokens)
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)

df['token_f1'] = df.progress_apply(lambda row: compute_token_f1(row['predicted_answer'], row['answer']), axis=1)
avg_token_f1 = df['token_f1'].mean()

# BLEU Score
smooth_fn = SmoothingFunction().method1
df['bleu'] = df.progress_apply(lambda row: sentence_bleu([row['answer'].split()], row['predicted_answer'].split(), smoothing_function=smooth_fn), axis=1)
avg_bleu = df['bleu'].mean()

# # BARTScore (optional; needs large model and GPU)
# bart_scorer = BARTScorer(device='cuda' if torch.cuda.is_available() else 'cpu', checkpoint='facebook/bart-large-cnn')
# df['bartscore'] = df.progress_apply(lambda row: bart_scorer.score([row['answer']], [row['ground_truth']])[0], axis=1)
# avg_bartscore = df['bartscore'].mean()

print(f"Accuracy      : {accuracy:.4f}")
print(f"BERTScore F1  : {bert_f1:.4f}")
print(f"ROUGE-L       : {avg_rougeL:.4f}")
print(f"Token F1      : {avg_token_f1:.4f}")
print(f"BLEU          : {avg_bleu:.4f}")
# print(f"BARTScore     : {avg_bartscore:.4f}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 2.00 seconds, 49.98 sentences/sec


100%|██████████| 100/100 [00:00<00:00, 14258.10it/s]
100%|██████████| 100/100 [00:00<00:00, 60990.32it/s]
100%|██████████| 100/100 [00:00<00:00, 14868.15it/s]

Accuracy      : 0.6100
BERTScore F1  : 0.9836
ROUGE-L       : 0.6200
Token F1      : 0.6100
BLEU          : 0.1085



