<a href="https://colab.research.google.com/github/RArunn/Intent-Identification-Detection/blob/main/hicodet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Cell 1: Mount Google Drive
# Nothing is downloaded here; this only makes Drive visible under
# /content/drive so later cells can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Cell 2: Imports and GPU Configuration
import os
import json
import torch
import gc
import tarfile
import shutil
from PIL import Image
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from tqdm import tqdm
import re
from torch.utils.data import Dataset, DataLoader
import warnings
import time
from datetime import datetime
import numpy as np
warnings.filterwarnings("ignore")

# GPU Optimizations
# Request expandable segments from the CUDA caching allocator.
# NOTE(review): this is assigned after `import torch`; presumably the allocator
# reads it lazily on first CUDA use — confirm it takes effect in this runtime.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# Permit TF32 kernels for CUDA matmuls and cuDNN ops.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
# Enable cuDNN benchmark mode and do not force deterministic algorithms.
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = False

In [None]:
# Cell 3: Dataset Setup
def setup_hicodet_test2015(base_path="/content/drive/MyDrive/hicodet",
                           test_dir="images/test2015", min_images=9600):
    """Extract the HICO-DET test2015 JPEGs into a flat local directory.

    Members of ``hico_20160224_det.tar.gz`` whose path contains
    ``images/test2015/`` and ends in ``.jpg`` are written to ``test_dir``
    under their base name only. Extraction is skipped entirely when
    ``test_dir`` already holds at least ``min_images`` JPEGs, and individual
    files that already exist are not rewritten, so an interrupted run can be
    resumed cheaply.

    Args:
        base_path: Directory that contains ``hico_20160224_det.tar.gz``.
        test_dir: Destination directory for the extracted images.
        min_images: Number of JPEGs in ``test_dir`` at which the dataset is
            considered already extracted.

    Returns:
        ``test_dir``.

    Raises:
        FileNotFoundError: If the archive is missing from ``base_path``.
    """
    tar_filename = os.path.join(base_path, "hico_20160224_det.tar.gz")

    if not os.path.exists(tar_filename):
        raise FileNotFoundError(f"Dataset not found at {tar_filename}")

    os.makedirs(test_dir, exist_ok=True)

    # Count only images so stray files (e.g. partial downloads) do not make
    # an incomplete extraction look complete.
    existing = sum(1 for name in os.listdir(test_dir) if name.endswith('.jpg'))
    if existing >= min_images:
        return test_dir

    with tarfile.open(tar_filename, 'r:gz') as tar_ref:
        for member in tqdm(tar_ref.getmembers(), desc="Extracting test2015"):
            wanted = (
                member.isfile()
                and "images/test2015/" in member.name
                and member.name.endswith('.jpg')
            )
            if not wanted:
                continue

            # Only the base name is used for the destination, so a member
            # path can never place a file outside test_dir.
            target = os.path.join(test_dir, os.path.basename(member.name))
            if os.path.exists(target):
                continue

            source = tar_ref.extractfile(member)
            if source is None:
                continue

            # Write to a side file and rename at the end so an interrupted
            # copy never leaves a truncated .jpg behind.
            partial = target + ".part"
            with source, open(partial, 'wb') as sink:
                shutil.copyfileobj(source, sink)
            os.replace(partial, target)

    return test_dir

In [None]:
# Cell 4: Model Initialization
def init_model():
    """Load Qwen2.5-VL-3B-Instruct and its processor for batched generation.

    Frees cached memory first, loads the model in bfloat16 with
    ``device_map="auto"``, wraps it with ``torch.compile`` and configures the
    tokenizer for padded batches.

    Returns:
        A ``(model, processor)`` tuple.
    """
    gc.collect()
    torch.cuda.empty_cache()

    model_path = "Qwen/Qwen2.5-VL-3B-Instruct"

    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True
    )

    # NOTE(review): fullgraph=True makes torch.compile raise on any graph
    # break, and compilation is deferred until the first forward call, so a
    # failure would only surface during inference — confirm this compiles
    # cleanly for generate() on the target runtime.
    model = torch.compile(model, mode="reduce-overhead", fullgraph=True)

    processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
    processor.tokenizer.pad_token = processor.tokenizer.eos_token
    # Batched generation strips the prompt by slicing at the padded input
    # length, which is only correct when prompts are right-aligned, i.e.
    # padded on the left. Set it explicitly instead of relying on the
    # checkpoint's default.
    processor.tokenizer.padding_side = "left"

    return model, processor

In [None]:
# Cell 5: Dataset Class
class HicoDetDataset(Dataset):
    """Dataset over the ``.jpg`` files of a directory, downscaled for inference.

    Each item is a dict with the (possibly resized) RGB image, its image ID,
    the per-axis factor that maps resized coordinates back to the original
    image, the original ``(width, height)`` and the file name. Items that
    cannot be loaded are returned as ``None`` so callers can filter them out.
    """

    # Longest allowed image side in pixels; larger images are shrunk to fit.
    MAX_SIDE = 448

    def __init__(self, image_dir, max_images=None):
        """Index the images in ``image_dir``.

        Args:
            image_dir: Directory containing the ``.jpg`` files.
            max_images: If truthy, keep only the first ``max_images`` files
                (in sorted order); ``None`` or ``0`` keeps all of them.
        """
        self.image_dir = image_dir
        self.image_files = sorted(f for f in os.listdir(image_dir) if f.endswith('.jpg'))

        if max_images:
            self.image_files = self.image_files[:max_images]

        # Pre-compute image IDs
        self.image_ids = [self._image_id_for(f) for f in self.image_files]

    @staticmethod
    def _image_id_for(img_file):
        """Return the HICO-style ID for a file name."""
        # splitext removes only the trailing extension; str.replace would
        # also strip ".jpg" occurring in the middle of a name.
        base_id = os.path.splitext(img_file)[0]
        if base_id.startswith('HICO_'):
            return base_id
        return f"HICO_test2015_{base_id.zfill(8)}"

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        """Load image ``idx``; return its item dict, or ``None`` on failure."""
        img_file = self.image_files[idx]
        image_id = self.image_ids[idx]
        image_path = os.path.join(self.image_dir, img_file)

        try:
            # convert() returns an independent image, so the file handle can
            # be released as soon as the pixels are decoded.
            with Image.open(image_path) as raw:
                image = raw.convert('RGB')
            original_size = image.size

            longest = max(original_size)
            if longest > self.MAX_SIDE:
                ratio = self.MAX_SIDE / longest
                # Clamp to 1 px: truncation can yield 0 for extreme aspect
                # ratios, which resize() rejects.
                new_size = (
                    max(1, int(original_size[0] * ratio)),
                    max(1, int(original_size[1] * ratio)),
                )
                image = image.resize(new_size, Image.Resampling.LANCZOS)

            return {
                'image': image,
                'image_id': image_id,
                'scale': np.array(
                    [original_size[0] / image.size[0], original_size[1] / image.size[1]],
                    dtype=np.float32,
                ),
                'original_size': original_size,
                'filename': img_file
            }
        except Exception as exc:
            # Best effort: one unreadable image must not stop the run, but
            # say which file was skipped instead of hiding it.
            print(f"[HicoDetDataset] Skipping {img_file}: {type(exc).__name__}: {exc}")
            return None

In [None]:
# Cell 6: Coordinate Extraction
class CoordinateExtractor:
    """Parse "person at [...] ACTION OBJECT at [...]" lines from model output.

    Boxes in the text are read as ``[x, y, width, height]`` in the
    coordinates of the resized image and are returned as
    ``[x1, y1, x2, y2]`` in original-image pixels.
    """

    PATTERN_WITH_CONF = re.compile(
        r'person\s+at\s+\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]\s+(?:doing\s+)?(\w+)\s+(?:with\s+)?(\w+)\s+at\s+\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]\s+confidence\s+([0-9]*\.?[0-9]+)',
        re.IGNORECASE
    )

    PATTERN_NO_CONF = re.compile(
        r'person\s+at\s+\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]\s+(?:doing\s+)?(\w+)\s+(?:with\s+)?(\w+)\s+at\s+\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]',
        re.IGNORECASE
    )

    # Same as PATTERN_NO_CONF with an optional trailing "confidence <number>".
    # Matching every line with one pattern keeps lines without a confidence
    # even when other lines in the same response have one.
    PATTERN_OPTIONAL_CONF = re.compile(
        PATTERN_NO_CONF.pattern + r'(?:\s+confidence\s+([0-9]*\.?[0-9]+))?',
        re.IGNORECASE
    )

    BLACKLIST = {'object', 'thing', 'item'}

    # Confidence assigned to detections whose line states none.
    DEFAULT_CONFIDENCE = 0.8

    @staticmethod
    def _scaled_box(xywh, scale, width, height):
        """Scale an [x, y, w, h] box and return it as a clamped [x1, y1, x2, y2]."""
        box = np.array(xywh, dtype=np.float32)
        box[[0, 2]] *= scale[0]
        box[[1, 3]] *= scale[1]
        return [
            max(0, min(width, int(box[0]))),
            max(0, min(height, int(box[1]))),
            max(0, min(width, int(box[0] + box[2]))),
            max(0, min(height, int(box[1] + box[3]))),
        ]

    @staticmethod
    def extract_detections_vectorized(text, image_data):
        """Extract interaction detections from one model response.

        Args:
            text: Raw model response; matching is case-insensitive.
            image_data: Dict providing ``'image_id'``, ``'scale'`` (x and y
                factors from resized to original pixels) and
                ``'original_size'`` as ``(width, height)``.

        Returns:
            A list of dicts with keys ``image_id``, ``person_box``,
            ``object_box``, ``action``, ``object`` and ``confidence``.
            Lines with a blacklisted or one-character object, a non-positive
            width or height, unusable numbers, or a box that is empty after
            clamping are skipped.
        """
        matches = CoordinateExtractor.PATTERN_OPTIONAL_CONF.findall(text.lower())
        if not matches:
            return []

        scale = image_data['scale']
        width, height = image_data['original_size']

        results = []
        for match in matches:
            try:
                coords = [int(x) for x in match[:4]]
                action = match[4].strip()
                obj = match[5].strip()
                obj_coords = [int(x) for x in match[6:10]]

                if (len(obj) < 2 or obj in CoordinateExtractor.BLACKLIST or
                        any(c <= 0 for c in coords[2:4]) or
                        any(c <= 0 for c in obj_coords[2:4])):
                    continue

                person_box = CoordinateExtractor._scaled_box(coords, scale, width, height)
                object_box = CoordinateExtractor._scaled_box(obj_coords, scale, width, height)

                # The optional group is '' when the line has no confidence.
                confidence = (float(match[10]) if match[10]
                              else CoordinateExtractor.DEFAULT_CONFIDENCE)
            except (ValueError, IndexError, OverflowError):
                # OverflowError: degenerate output with huge digit runs must
                # drop this line only, not abort the caller's whole batch.
                continue

            if not (person_box[2] > person_box[0] and person_box[3] > person_box[1] and
                    object_box[2] > object_box[0] and object_box[3] > object_box[1]):
                continue

            results.append({
                'image_id': image_data['image_id'],
                'person_box': person_box,
                'object_box': object_box,
                'action': action,
                'object': obj,
                'confidence': max(0.1, min(1.0, confidence))
            })

        return results

In [None]:
# Cell 7: Batch Inference
def batch_inference(batch_data, model, processor):
    """Run the interaction prompt on a batch of images and parse the replies.

    Args:
        batch_data: Items as produced by the dataset (dicts with at least
            ``'image'``, plus the fields the coordinate extractor reads).
            ``None`` entries are ignored.
        model: Generation model exposing ``device`` and ``generate``.
        processor: Matching processor (chat template, tokenizer, decoding).

    Returns:
        A flat list of detection dicts for the whole batch. Returns ``[]``
        when the batch has no valid item or when inference fails; failures
        are reported on stdout rather than raised so a long run can continue.
    """
    valid_batch = [item for item in batch_data if item is not None]
    if not valid_batch:
        return []

    prompt = """Analyze this image for human-object interactions. For each interaction you find, provide:

1. What the person is doing
2. What object they're interacting with
3. Location of the person as [x, y, width, height]
4. Location of the object as [x, y, width, height]

Format your response as:
Person at [x,y,w,h] doing ACTION with OBJECT at [x,y,w,h]

Only include clear, visible interactions. Maximum 10 interactions per image."""

    images = [item['image'] for item in valid_batch]

    # One single-turn conversation per image.
    messages_list = [
        [{
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt}
            ]
        }]
        for image in images
    ]

    try:
        # The prompt is removed below by slicing at the padded input length,
        # which requires every prompt to end at the same column (left padding).
        processor.tokenizer.padding_side = "left"

        texts = [processor.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
                 for msgs in messages_list]

        inputs = processor(
            text=texts,
            images=images,
            return_tensors="pt",
            padding=True,
            max_length=1024,
            truncation=True
        ).to(model.device, non_blocking=True)

        with torch.inference_mode():
            with torch.amp.autocast('cuda', dtype=torch.bfloat16):
                output_ids = model.generate(
                    **inputs,
                    max_new_tokens=200,
                    do_sample=False,
                    use_cache=True,
                    pad_token_id=processor.tokenizer.eos_token_id,
                    num_beams=1
                )

        # Keep only the newly generated tokens.
        generated_ids = output_ids[:, inputs.input_ids.shape[1]:]
        responses = processor.batch_decode(generated_ids, skip_special_tokens=True)

        all_results = []
        for data, response in zip(valid_batch, responses):
            all_results.extend(
                CoordinateExtractor.extract_detections_vectorized(response, data)
            )

        return all_results

    except Exception as e:
        # Best effort by design, but a silent [] is indistinguishable from
        # "no interactions found", so report what went wrong.
        print(f"[batch_inference] Batch of {len(valid_batch)} image(s) failed: "
              f"{type(e).__name__}: {e}")
        return []

In [None]:
# Cell 8: Processing Pipeline
def process_hicodet(model, processor, image_dir,
                   output_file="hicodet_output.json",
                   batch_size=32, max_images=None, save_interval=2000,
                   num_workers=12):
    """Run batched inference over an image directory and save the detections.

    Args:
        model: Generation model passed through to ``batch_inference``.
        processor: Processor passed through to ``batch_inference``.
        image_dir: Directory of ``.jpg`` images.
        output_file: Path of the final JSON file; checkpoints are written
            next to it as ``<output_file>.checkpoint_<count>.json``.
        batch_size: Images per inference batch.
        max_images: Optional cap on the number of images processed.
        save_interval: Write a checkpoint each time roughly this many more
            images have been processed; a falsy value disables checkpoints.
        num_workers: Upper bound on DataLoader worker processes; capped at
            the number of available CPUs.

    Returns:
        The list of all detection dicts.
    """
    dataset = HicoDetDataset(image_dir, max_images)

    # More workers than CPUs only adds contention.
    worker_count = max(0, min(num_workers, os.cpu_count() or 1))
    loader_kwargs = {
        'batch_size': batch_size,
        'shuffle': False,
        'num_workers': worker_count,
        'pin_memory': True,
        'collate_fn': lambda x: [item for item in x if item is not None],
    }
    if worker_count > 0:
        # These two options are only accepted with worker processes.
        loader_kwargs['persistent_workers'] = True
        loader_kwargs['prefetch_factor'] = 3

    dataloader = DataLoader(dataset, **loader_kwargs)

    all_detections = []
    processed_count = 0
    images_with_detections = 0
    # processed_count advances in batch-sized steps, so it rarely lands on an
    # exact multiple of save_interval; checkpoint on crossing a threshold.
    next_checkpoint = save_interval if save_interval and save_interval > 0 else None

    start_time = time.time()

    for batch_idx, batch_data in enumerate(tqdm(dataloader, desc="Processing")):
        if not batch_data:
            continue

        try:
            batch_results = batch_inference(batch_data, model, processor)

            if batch_results:
                images_with_detections += len({r['image_id'] for r in batch_results})

            all_detections.extend(batch_results)
            processed_count += len(batch_data)

            if next_checkpoint is not None and processed_count >= next_checkpoint:
                elapsed = time.time() - start_time
                rate = processed_count / elapsed if elapsed > 0 else 0.0

                checkpoint_data = {
                    'detections': all_detections,
                    'metadata': {
                        'processed_count': processed_count,
                        'detection_count': len(all_detections),
                        'images_with_detections': images_with_detections,
                        'elapsed_time': elapsed,
                        'images_per_second': rate,
                        # Remaining time in hours, extrapolated from the
                        # average pace so far.
                        'estimated_completion': elapsed * (len(dataset) / processed_count - 1) / 3600
                    }
                }

                checkpoint_file = f"{output_file}.checkpoint_{processed_count}.json"
                with open(checkpoint_file, 'w') as f:
                    json.dump(checkpoint_data, f, indent=2)

                next_checkpoint = (processed_count // save_interval + 1) * save_interval

            if batch_idx % 100 == 0:
                torch.cuda.empty_cache()

        except Exception as e:
            # Keep going, but leave a trace of the batch that was lost.
            print(f"[process_hicodet] Batch {batch_idx} failed: {type(e).__name__}: {e}")
            continue

    elapsed_time = time.time() - start_time
    images_per_second = processed_count / elapsed_time if elapsed_time > 0 else 0.0

    final_output = {
        'detections': all_detections,
        'metadata': {
            'total_images_processed': processed_count,
            'total_detections': len(all_detections),
            'images_with_detections': images_with_detections,
            'detection_rate': f"{images_with_detections/processed_count*100:.1f}%" if processed_count > 0 else "0%",
            'processing_time_hours': elapsed_time / 3600,
            'images_per_second': images_per_second,
            'batch_size': batch_size,
            'vocab_restrictions': 'None - Full Qwen2.5-VL vocabulary',
            'timestamp': datetime.now().isoformat()
        }
    }

    with open(output_file, 'w') as f:
        json.dump(final_output, f, indent=2)

    return all_detections

In [None]:
# Cell 9: Test Processing
def run_test():
    """Smoke-test the pipeline on at most 100 images.

    Returns:
        The detections returned by ``process_hicodet``.
    """
    image_dir = setup_hicodet_test2015()
    vlm, vlm_processor = init_model()

    # Small batch and frequent checkpoints keep the trial run short.
    settings = {
        "output_file": "hicodet_output_test.json",
        "batch_size": 24,
        "max_images": 100,
        "save_interval": 50,
    }
    return process_hicodet(vlm, vlm_processor, image_dir, **settings)

In [None]:
# Cell 10: Full Processing
def run_full_processing():
    """Process the full extracted image set and print a short summary.

    Returns:
        The detections returned by ``process_hicodet``.
    """
    from collections import Counter

    test_dir = setup_hicodet_test2015()
    model, processor = init_model()

    results = process_hicodet(
        model, processor, test_dir,
        output_file="hicodet_output_full.json",
        batch_size=32,
        max_images=None,
        save_interval=2000
    )

    if results:
        # Tally how often each action and object was predicted and show the
        # most frequent ones as a quick sanity check of the run.
        action_counts = Counter(result['action'] for result in results)
        object_counts = Counter(result['object'] for result in results)
        print(f"Total detections: {len(results)}")
        print(f"Top actions: {action_counts.most_common(10)}")
        print(f"Top objects: {object_counts.most_common(10)}")

    return results

In [None]:
# Cell 11: Execute
if __name__ == "__main__":
    # Optional smoke test on a small subset; uncomment to try the pipeline
    # before committing to the full run.
    # test_results = run_test()

    # Run full processing and keep the detections in the notebook namespace.
    full_results = run_full_processing()