<a href="https://colab.research.google.com/github/SEKAR147/IJCNN_Work_SK/blob/main/Copy_of_Blip2_pipeline3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Standard "Stable" environment for your GQA project
!pip install -q huggingface_hub>=0.25.0 transformers>=4.45.0
!pip install -q accelerate bitsandbytes timm
import torch
from PIL import Image
import json
import os
import shutil
from google.colab import files
from transformers import (
    AutoProcessor,
    AutoModelForZeroShotObjectDetection,
    Blip2Processor,
    Blip2ForConditionalGeneration,
    AutoModelForCausalLM,
    AutoTokenizer
)

In [None]:
from transformers import BitsAndBytesConfig

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# 4-bit NF4 quantization with double quantization; matmuls are computed in fp16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Grounding DINO: zero-shot object detector, loaded unquantized onto `device`.
dino_id = "IDEA-Research/grounding-dino-tiny"
dino_processor = AutoProcessor.from_pretrained(dino_id)
dino_model = AutoModelForZeroShotObjectDetection.from_pretrained(dino_id)
dino_model = dino_model.to(device)

# BLIP-2 (OPT-2.7B): loaded with the 4-bit config above; device placement is
# delegated to `device_map="auto"`.
blip_id = "Salesforce/blip2-opt-2.7b"
blip_processor = Blip2Processor.from_pretrained(blip_id)
blip_model = Blip2ForConditionalGeneration.from_pretrained(
    blip_id,
    device_map="auto",
    quantization_config=quantization_config,
)


In [None]:
!pip install -q datasets
from datasets import load_dataset
from itertools import islice

In [None]:
import spacy
from tqdm import tqdm
import torch

# Small English spaCy pipeline; the NER and lemmatizer components are switched
# off because the pipeline only reads token text, POS and fine-grained tags.
nlp = spacy.load(
    "en_core_web_sm",
    disable=["ner", "lemmatizer"],
)

In [None]:
# Closed vocabularies used to bucket caption tokens into attribute types.
# Built once when the cell runs instead of on every pipeline call.
_ATTRIBUTE_VOCAB = {
    "color": frozenset({
        "red", "blue", "green", "yellow", "black", "white", "silver", "gold", "orange",
        "pink", "purple", "brown", "gray", "grey", "tan", "turquoise", "beige", "maroon",
        "navy", "teal", "azure", "bronze", "copper", "lavender", "violet", "blonde",
        "light", "dark", "bright", "pale", "colorful", "multicolored",
    }),
    "material": frozenset({
        "wood", "metal", "plastic", "glass", "brick", "denim", "leather", "cloth", "stone",
        "concrete", "paper", "rubber", "wool", "silk", "velvet", "fabric", "cotton", "nylon",
        "steel", "aluminum", "iron", "chrome", "porcelain", "ceramic", "tile", "marble",
        "asphalt", "cardboard", "wicker", "vinyl", "suede", "fleece",
    }),
    "texture": frozenset({
        "smooth", "rough", "shiny", "metallic", "soft", "hard", "fuzzy", "clear", "dull",
        "matte", "wet", "dry", "painted", "polished", "glossy", "checkered", "striped",
        "dotted", "patterned", "cracked", "dirty", "clean", "worn", "new", "old",
        "hairy", "scratched", "wrinkled", "rusted", "transparent", "opaque",
    }),
    "shape": frozenset({
        "round", "square", "rectangular", "triangular", "oval", "flat", "curved", "pointed",
        "thick", "thin", "wide", "narrow", "large", "small", "tiny", "huge", "long", "short",
        "tall", "curvy", "straight", "bent", "circular", "spherical", "conical", "cylindrical",
    }),
    "clothing": frozenset({
        "shirt", "pants", "hat", "glasses", "jacket", "jeans", "dress", "shorts", "shoes",
        "t-shirt", "sweater", "suit", "tie", "skirt", "boots", "sneakers", "socks", "gloves",
        "scarf", "belt", "cap", "helmet", "uniform", "vest", "coat", "hoodie",
    }),
}

# Tokens that are never recorded as an action even when tagged VERB/VBG.
_NON_ACTION_TOKENS = frozenset({"is", "being", "having", "pose", "wearing"})

# Crops narrower or shorter than this many pixels are not captioned.
_MIN_CROP_SIDE = 40


def _categorize_tokens(doc):
    """Bucket the tokens of a parsed description into attribute categories.

    A token tagged VERB/VBG (present participle) is recorded under "action"
    unless it is in ``_NON_ACTION_TOKENS``; independently, a token is recorded
    under every vocabulary category whose word set contains it.

    Returns a dict with keys color, material, texture, shape, action, clothing.
    Each value is de-duplicated and keeps first-occurrence order, so the saved
    JSON is identical from run to run.
    """
    buckets = {
        "color": [],
        "material": [],
        "texture": [],
        "shape": [],
        "action": [],
        "clothing": [],
    }
    for token in doc:
        word = token.text
        if token.pos_ == "VERB" and token.tag_ == "VBG" and word not in _NON_ACTION_TOKENS:
            buckets["action"].append(word)
        for category, vocab in _ATTRIBUTE_VOCAB.items():
            if word in vocab:
                buckets[category].append(word)
    # dict.fromkeys drops duplicates while preserving insertion order.
    return {category: list(dict.fromkeys(words)) for category, words in buckets.items()}


def run_pipeline_streaming(image, text_prompt, question_id, image_id, base_drive_path):
    """Detect objects in one image, describe each crop and save the attributes.

    Pipeline:
      1. Grounding DINO proposes boxes for the phrases in ``text_prompt``.
      2. Each box of at least 40x40 px is cropped, saved, and passed to BLIP-2
         with a description prompt.
      3. The generated description is parsed with spaCy and its tokens are
         bucketed by ``_categorize_tokens``.

    Files written under ``<base_drive_path>/<question_id>/``:
    ``original.jpg``, ``crops/obj_<i>_<label>.jpg`` and ``data.json``.

    Uses the notebook globals ``dino_processor``, ``dino_model``,
    ``blip_processor``, ``blip_model``, ``nlp`` and ``device``.

    Args:
        image: image object supporting ``convert``, ``save``, ``crop``, ``size``.
        text_prompt: detection prompt passed to Grounding DINO.
        question_id: identifier used as the output folder name and in obj ids.
        image_id: identifier stored in each record as ``img_id``.
        base_drive_path: directory under which the question folder is created.

    Returns:
        List of per-object dicts with keys question_id, img_id, obj_id, box,
        object_type, attributes, crop_path, raw (same content as data.json).
    """
    q_dir = os.path.join(base_drive_path, str(question_id))
    crops_dir = os.path.join(q_dir, "crops")
    os.makedirs(crops_dir, exist_ok=True)

    image = image.convert("RGB")
    image.save(os.path.join(q_dir, "original.jpg"))

    # --- PHASE 1: GROUNDING DINO ---
    inputs = dino_processor(images=image, text=text_prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = dino_model(**inputs)

    results = dino_processor.post_process_grounded_object_detection(
        outputs, inputs.input_ids,
        threshold=0.35, text_threshold=0.25,
        target_sizes=[image.size[::-1]]  # PIL size is (w, h); the processor is given (h, w)
    )[0]

    # NOTE(review): recent transformers releases appear to expose the phrase
    # strings under "text_labels" and deprecate string values under "labels";
    # prefer the new key when it exists, otherwise keep the original lookup.
    labels = results["text_labels"] if "text_labels" in results else results["labels"]

    final_data = []

    # --- PHASE 2 & 3: OBJECT LOOP ---
    for i, (label, box) in enumerate(zip(labels, results["boxes"])):
        box = [round(b, 2) for b in box.tolist()]
        crop = image.crop((box[0], box[1], box[2], box[3]))

        # Skip crops too small to describe.
        if crop.size[0] < _MIN_CROP_SIDE or crop.size[1] < _MIN_CROP_SIDE:
            continue

        crop_name = f"obj_{i}_{label}.jpg"
        crop.save(os.path.join(crops_dir, crop_name))

        obj_id = f"gqa_2026_{question_id}_obj_{i}"

        # --- PHASE 2: BLIP-2 ---
        if label.lower() == "person":
            vqa_prompt = "Question: Describe the clothing and current activity of this person. Answer: This person is wearing"
        else:
            vqa_prompt = f"Question: What are the specific visual properties of this {label}? Describe its color and material. Answer: This {label} is"
        blip_inputs = blip_processor(images=crop, text=vqa_prompt, return_tensors="pt").to(device, torch.float16)

        with torch.no_grad():
            gen_ids = blip_model.generate(**blip_inputs, max_new_tokens=40, min_new_tokens=10, repetition_penalty=1.2)
        desc = blip_processor.batch_decode(gen_ids, skip_special_tokens=True)[0].strip()
        # Keep only the text after the last "Answer:" in case the prompt is echoed back.
        desc = desc.split("Answer:")[-1].strip()

        # --- PHASE 3: SPACY ---
        categorized_attr = _categorize_tokens(nlp(desc.lower()))

        final_data.append({
            "question_id": question_id,
            "img_id": image_id,
            "obj_id": obj_id,
            "box": box,
            "object_type": label,
            "attributes": categorized_attr,  # Labeled output
            "crop_path": os.path.join("crops", crop_name),
            "raw": desc
        })

    # Save the JSON for this specific image inside its folder
    with open(os.path.join(q_dir, "data.json"), "w") as f:
        json.dump(final_data, f, indent=4)

    return final_data

In [None]:
import os
from google.colab import drive

import shutil

MOUNT_POINT = '/content/drive'

# Only clear the mount point when it is a plain leftover directory. Deleting it
# unconditionally is dangerous: if Drive is still attached (e.g. a preceding
# unmount failed), a recursive delete would walk into the live mount and remove
# the user's Drive files. `force_remount=True` below already takes care of
# detaching and re-attaching an existing mount.
if os.path.isdir(MOUNT_POINT) and not os.path.ismount(MOUNT_POINT):
    shutil.rmtree(MOUNT_POINT)

drive.mount(MOUNT_POINT, force_remount=True)

# Output root for this run; every question gets its own sub-folder here.
drive_path = "/content/drive/MyDrive/GQA_Research_2026_7"
os.makedirs(drive_path, exist_ok=True)

In [None]:
import torch
import json
import os
from itertools import islice
from datasets import load_dataset

# Questions and images live in separate GQA configs; both are streamed lazily.
dataset = load_dataset("lmms-lab/GQA", "val_balanced_instructions", streaming=True, split="val")

image_dataset = load_dataset("lmms-lab/GQA", "val_balanced_images", streaming=True, split="val")

# Cache the first 5000 streamed images by id. Questions whose image is not in
# this cache are skipped in the loop below.
print("Building image lookup table...")
image_lookup = {img_ex['id']: img_ex['image'] for img_ex in islice(image_dataset, 5000)}

all_final_results = []
num_questions = 1000
prompt = "person . clothing . bag . chair . table . house . boat . door . animal . car . sign . bottle . cup . food . plate ."
checkpoint_file = os.path.join(drive_path, "results_checkpoint.json")

print(f"Starting stream for {num_questions} images...")

for i, example in enumerate(islice(dataset, num_questions)):
    q_id = example['id']
    img_id = example['imageId']
    img = image_lookup.get(img_id)

    if img is None:
        print(f"Skipping Q {q_id}: Image {img_id} not found in cache.")
    else:
        # Best-effort: one failing image must not abort the whole run.
        try:
            image_results = run_pipeline_streaming(img, prompt, question_id=q_id, image_id=img_id, base_drive_path=drive_path)
            all_final_results.extend(image_results)
        except Exception as e:
            print(f"Skipping image {i} due to error: {e}")

    # Checkpoint every 50 questions. This runs for skipped questions too, so a
    # cache miss on a multiple of 50 no longer drops that checkpoint.
    if i % 50 == 0 and i > 0:
        with open(checkpoint_file, "w") as f:
            json.dump(all_final_results, f)
        print(f"Checkpoint saved at question {i}")

        torch.cuda.empty_cache()

final_file = os.path.join(drive_path, "final_gqa_results.json")
with open(final_file, "w") as f:
    json.dump(all_final_results, f)

print(f"Done! All results are saved in {drive_path}")

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch
import os
import json
import numpy as np

# Sentence encoder used by run_semantic_evaluation to embed attribute words
# for the cosine-similarity comparison.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

def calculate_iou(boxA, boxB):
    """Intersection-over-union of two boxes given as [x1, y1, x2, y2].

    A small epsilon (1e-6) is added to the union so that two zero-area boxes
    do not cause a division by zero.
    """
    # Corners of the overlap rectangle.
    left = max(boxA[0], boxB[0])
    top = max(boxA[1], boxB[1])
    right = min(boxA[2], boxB[2])
    bottom = min(boxA[3], boxB[3])

    # Negative extents mean the boxes do not overlap along that axis.
    overlap_w = max(0, right - left)
    overlap_h = max(0, bottom - top)
    intersection = overlap_w * overlap_h

    area_a = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
    area_b = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
    union = float(area_a + area_b - intersection + 1e-6)

    return intersection / union

def _best_gt_match(pred_box, gt_objs):
    """Return (iou, lower-cased attributes) of the GT object overlapping pred_box most."""
    best_iou = 0
    best_attrs = []
    for gt_obj in gt_objs.values():
        # GQA [x, y, w, h] -> [x1, y1, x2, y2]
        gt_box = [gt_obj['x'], gt_obj['y'], gt_obj['x'] + gt_obj['w'], gt_obj['y'] + gt_obj['h']]
        iou = calculate_iou(pred_box, gt_box)
        if iou > best_iou:
            best_iou = iou
            best_attrs = [a.lower() for a in gt_obj.get('attributes', [])]
    return best_iou, best_attrs


def run_semantic_evaluation(root_dir, gt_path, threshold=0.50, iou_threshold=0.5):
    """Score predicted attributes against ground-truth scene-graph attributes.

    Walks ``root_dir`` for ``data.json`` prediction files. Each predicted
    object is matched to the ground-truth object of the same image with the
    highest IoU; when that IoU reaches ``iou_threshold`` and the matched object
    has attributes, every predicted attribute is scored:

    * exact hit    - the lower-cased prediction equals a GT attribute;
    * semantic hit - an exact hit, or the best cosine similarity between the
      prediction's and any GT attribute's sentence embedding is at least
      ``threshold``.

    Uses the notebook globals ``sbert_model``, ``util`` and ``calculate_iou``.

    Args:
        root_dir: directory tree containing per-question ``data.json`` files.
        gt_path: JSON file mapping image id to ``{"objects": {...}}`` where each
            object has ``x``, ``y``, ``w``, ``h`` and optional ``attributes``.
        threshold: minimum cosine similarity for a semantic hit.
        iou_threshold: minimum IoU for a prediction to count as matched
            (default 0.5, the value that used to be hard-coded).

    Returns:
        Dict with the raw counts ``total_items``, ``semantic_hits`` and
        ``exact_hits``. The formatted report is still printed.
    """
    print(f"Loading Ground Truth...")
    with open(gt_path, 'r') as f:
        gt_data = json.load(f)

    results = {"total_items": 0, "semantic_hits": 0, "exact_hits": 0}

    print("Starting Semantic Analysis...")
    for root, _dirs, filenames in os.walk(root_dir):
        if "data.json" not in filenames:
            continue

        with open(os.path.join(root, "data.json"), 'r') as f:
            predictions = json.load(f)
        if not predictions:
            continue

        # All records of one data.json share an image; read the id from the first.
        img_id = str(predictions[0].get('img_id'))
        if img_id not in gt_data:
            continue
        gt_objs = gt_data[img_id]['objects']

        for p_obj in predictions:
            p_bbox = p_obj.get('box')
            if not p_bbox:
                continue

            # --- SPATIAL MATCHING ---
            best_iou, best_gt_attrs = _best_gt_match(p_bbox, gt_objs)

            # Only score predictions that overlap an attributed GT object enough.
            if best_iou < iou_threshold or not best_gt_attrs:
                continue

            pred_list = [a for sublist in p_obj['attributes'].values() for a in sublist]
            if not pred_list:
                continue

            # --- SEMANTIC MATCHING ---
            # 1. Exact matches first (fast set lookup).
            gt_attr_set = set(best_gt_attrs)
            exact_count = sum(1 for p in pred_list if p.lower() in gt_attr_set)
            remaining_preds = [p for p in pred_list if p.lower() not in gt_attr_set]

            results["exact_hits"] += exact_count
            results["total_items"] += len(pred_list)
            # Every exact hit is also a semantic hit.
            results["semantic_hits"] += exact_count

            # 2. Embedding similarity for whatever did not match exactly.
            if remaining_preds:
                pred_embeddings = sbert_model.encode(remaining_preds, convert_to_tensor=True)
                gt_embeddings = sbert_model.encode(best_gt_attrs, convert_to_tensor=True)
                cosine_scores = util.cos_sim(pred_embeddings, gt_embeddings)
                # One hit per prediction whose closest GT attribute clears the threshold.
                best_per_pred = cosine_scores.max(dim=1).values
                results["semantic_hits"] += int((best_per_pred >= threshold).sum())

    # Final Report
    exact_acc = (results['exact_hits']/results['total_items'])*100 if results['total_items'] > 0 else 0
    sem_acc = (results['semantic_hits']/results['total_items'])*100 if results['total_items'] > 0 else 0

    print("\n" + "="*40)
    print("      SEMANTIC EVALUATION REPORT")
    print("="*40)
    print(f"Exact Match Accuracy:   {exact_acc:.2f}%")
    print(f"Semantic Match Accuracy: {sem_acc:.2f}% (Threshold: {threshold})")
    print(f"Total Attributes Scored: {results['total_items']}")
    print("="*40)

    return results


In [None]:
# Score the predictions saved under `drive_path` against the ground-truth file,
# counting a semantic hit at cosine similarity >= 0.6.
run_semantic_evaluation(
    root_dir=drive_path,
    gt_path="/content/drive/MyDrive/GQA_Research_2026_3/val_sceneGraphs.json",
    threshold=0.6,
)