In [1]:
# Import required libraries
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import numpy as np
import torchvision.transforms as transforms
import os
from pycocotools.coco import COCO
import requests
from tqdm import tqdm
import json
import cv2
import warnings
warnings.filterwarnings('ignore')

# Report the runtime environment so later failures are easier to diagnose.
cuda_ready = torch.cuda.is_available()

print("‚úì All imports successful!")
print(f"‚úì PyTorch version: {torch.__version__}")
print(f"‚úì CUDA available: {cuda_ready}")
if cuda_ready:
    # Headline device first, then every visible device by index.
    print(f"‚úì GPU: {torch.cuda.get_device_name(0)}")
    for gpu_index in range(torch.cuda.device_count()):
        print(f"  GPU {gpu_index}: {torch.cuda.get_device_name(gpu_index)}")


‚úì All imports successful!
‚úì PyTorch version: 2.6.0+cu124
‚úì CUDA available: True
‚úì GPU: Tesla T4
  GPU 0: Tesla T4
  GPU 1: Tesla T4


In [2]:
# Memory optimization utilities for Kaggle
import gc
import torch

def clear_memory():
    """Run the garbage collector and, when CUDA is available, empty PyTorch's GPU cache."""
    gc.collect()
    cuda_present = torch.cuda.is_available()
    if cuda_present:
        # Empty the allocator cache, then wait for outstanding GPU work.
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    print("‚úì Memory cleared")

def print_memory_usage():
    """Print the GPU memory PyTorch has allocated and reserved; prints nothing without CUDA."""
    if not torch.cuda.is_available():
        return
    bytes_per_gib = 1024**3
    allocated = torch.cuda.memory_allocated() / bytes_per_gib
    reserved = torch.cuda.memory_reserved() / bytes_per_gib
    print(f"GPU Memory: {allocated:.2f}GB allocated, {reserved:.2f}GB reserved")

print("‚úì Memory utilities loaded")

‚úì Memory utilities loaded


## Configuration

Set your dataset generation parameters here:

### ⏱️ **Time Estimation for Kaggle (12hr limit):**
- **Qwen2-VL-2B inference**: ~3-5 seconds per image (with short captions)
- **5,000 train images**: ~4-7 hours
- **500 val images**: ~25-42 minutes
- **Total estimated time**: ~4.5-8 hours (fits within 12hr limit!)

**Optimizations to speed up:**
- Shorter captions (70 tokens max) = faster generation
- Batch processing where possible
- Skip failed downloads quickly
- GPU acceleration (T4/P100)

In [3]:
# Dataset Generation Configuration
class Config:
    """Static settings for the dataset-generation run: paths, sizes, filters, and model."""

    # Root directory for every path below. A notebook kernel exposes no usable
    # __file__, and the previous `'__file__' in dir()` probe was never true inside
    # a class body, so the working directory was always the value used here.
    NOTEBOOK_DIR = os.getcwd()

    # Paths (absolute, so later directory changes do not move the data)
    COCO_ROOT = os.path.join(NOTEBOOK_DIR, 'coco_data')
    OUTPUT_DIR = os.path.join(NOTEBOOK_DIR, 'dataset_output')

    # Dataset sizes
    TARGET_TRAIN = 5000  # Training images
    TARGET_VAL = 500     # Validation images

    # Quality filtering thresholds
    MIN_KEYPOINTS = 10   # Minimum visible keypoints (out of 17)
    MIN_PERSON_AREA = 5000  # Minimum person area in pixels¬≤

    # Caption generation
    MAX_CAPTION_TOKENS = 70  # CLIP limit is 77, we use 70 for safety
    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Model - Using Qwen2-VL-2B (proven to work, no OOM)
    MODEL_NAME = "Qwen/Qwen2-VL-2B-Instruct"
    MODEL_TYPE = "qwen2-vl"

# Settings object used by the rest of the notebook
config = Config()

# Echo the active configuration, one entry per printed line
rule = "=" * 80
for summary_line in (
    rule,
    "CONFIGURATION",
    rule,
    f"Working Directory: {config.NOTEBOOK_DIR}",
    f"COCO Root: {config.COCO_ROOT}",
    f"Output Directory: {config.OUTPUT_DIR}",
    f"Target Train Images: {config.TARGET_TRAIN}",
    f"Target Val Images: {config.TARGET_VAL}",
    "Quality Thresholds:",
    f"  - Min Keypoints: {config.MIN_KEYPOINTS}/17",
    f"  - Min Person Area: {config.MIN_PERSON_AREA}px¬≤",
    "Caption Settings:",
    f"  - Max Tokens: {config.MAX_CAPTION_TOKENS}",
    f"  - Device: {config.DEVICE}",
    f"  - Model: {config.MODEL_NAME}",
    f"  - Model Type: {config.MODEL_TYPE}",
    "\n‚úì Using Qwen2-VL-2B (proven working, no OOM issues)",
    "  - Smaller than Qwen-VL-Chat",
    "  - Disk cache system for captions",
    "  - Can unload model after caption generation",
    rule,
):
    print(summary_line)

# Make sure the data and output folders exist (no error if already present)
for required_dir in (
    config.COCO_ROOT,
    config.OUTPUT_DIR,
    os.path.join(config.COCO_ROOT, 'annotations'),
    os.path.join(config.COCO_ROOT, 'train2017'),
    os.path.join(config.COCO_ROOT, 'val2017'),
):
    os.makedirs(required_dir, exist_ok=True)

print("\n‚úì Directories created/verified:")
for reported_dir in (config.COCO_ROOT, config.OUTPUT_DIR):
    print(f"  - {reported_dir}")


CONFIGURATION
Working Directory: /kaggle/working
COCO Root: /kaggle/working/coco_data
Output Directory: /kaggle/working/dataset_output
Target Train Images: 5000
Target Val Images: 500
Quality Thresholds:
  - Min Keypoints: 10/17
  - Min Person Area: 5000px¬≤
Caption Settings:
  - Max Tokens: 70
  - Device: cuda
  - Model: Qwen/Qwen2-VL-2B-Instruct
  - Model Type: qwen2-vl

‚úì Using Qwen2-VL-2B (proven working, no OOM issues)
  - Smaller than Qwen-VL-Chat
  - Disk cache system for captions
  - Can unload model after caption generation

‚úì Directories created/verified:
  - /kaggle/working/coco_data
  - /kaggle/working/dataset_output


## Step 1: Load COCO Annotations

Download and load COCO 2017 annotations for person keypoints.

In [4]:
# Expected locations of the person-keypoint annotation files
ann_dir = os.path.join(config.COCO_ROOT, 'annotations')
train_ann_file = os.path.join(ann_dir, 'person_keypoints_train2017.json')
val_ann_file = os.path.join(ann_dir, 'person_keypoints_val2017.json')

# Fetch the annotation archive only when one of the two files is missing
annotations_present = os.path.exists(train_ann_file) and os.path.exists(val_ann_file)
if not annotations_present:
    print("‚ö†Ô∏è  COCO annotations not found! Downloading...")

    import zipfile
    import urllib.request

    ann_url = "http://images.cocodataset.org/annotations/annotations_trainval2017.zip"
    zip_path = os.path.join(config.COCO_ROOT, 'annotations_trainval2017.zip')

    print(f"Downloading from: {ann_url}")
    print("This may take a few minutes (~252 MB)...")

    # Blocking download of the whole archive to zip_path
    urllib.request.urlretrieve(ann_url, zip_path)
    print("‚úì Download complete! Extracting...")

    # Unpack into COCO_ROOT, then drop the archive
    with zipfile.ZipFile(zip_path, 'r') as archive:
        archive.extractall(config.COCO_ROOT)
    os.remove(zip_path)
    print("‚úì Extraction complete!")

# Build the in-memory COCO indexes for both splits
print("\nLoading COCO annotations...")
coco_train = COCO(train_ann_file)
coco_val = COCO(val_ann_file)
print("‚úì COCO annotations loaded successfully!")
train_image_total = len(coco_train.getImgIds())
print(f"  - Train images: {train_image_total}")
val_image_total = len(coco_val.getImgIds())
print(f"  - Val images: {val_image_total}")

‚ö†Ô∏è  COCO annotations not found! Downloading...
Downloading from: http://images.cocodataset.org/annotations/annotations_trainval2017.zip
This may take a few minutes (~252 MB)...
‚úì Download complete! Extracting...
‚úì Extraction complete!

Loading COCO annotations...
loading annotations into memory...
Done (t=7.75s)
creating index...
index created!
loading annotations into memory...
Done (t=0.92s)
creating index...
index created!
‚úì COCO annotations loaded successfully!
  - Train images: 118287
  - Val images: 5000


## Step 2: Filter High-Quality Pose Images

Filter COCO dataset for images with high-quality pose annotations.

In [5]:
def filter_high_quality_images(coco, split='train', min_keypoints=10, min_area=5000, max_images=None):
    """
    Filter a COCO keypoint dataset for images containing a high-quality person pose.

    An image is kept when at least one of its non-crowd person annotations has
    `num_keypoints >= min_keypoints` and `area >= min_area`. Each kept image is
    scored by its best annotation; results are sorted best-first.

    Args:
        coco: COCO API instance (must provide getCatIds, getImgIds, getAnnIds, loadAnns)
        split: 'train' or 'val' (used only for log messages)
        min_keypoints: Minimum labelled keypoints (out of 17)
        min_area: Minimum person area in pixels
        max_images: Maximum number of images to return (None or 0 means no limit)

    Returns:
        list: Dicts with 'image_id', 'quality_score' and 'keypoints', sorted by
        descending quality score. Empty when nothing matches the criteria.
    """
    print(f"\n{'='*80}")
    print(f"FILTERING {split.upper()} IMAGES FOR HIGH-QUALITY POSES")
    print(f"{'='*80}")
    print(f"Criteria:")
    print(f"  - Minimum keypoints: {min_keypoints}/17")
    print(f"  - Minimum person area: {min_area}px¬≤")
    print(f"  - No crowd annotations")
    print(f"{'='*80}\n")

    cat_ids = coco.getCatIds(catNms=['person'])
    all_img_ids = coco.getImgIds(catIds=cat_ids)

    quality_images = []

    for img_id in tqdm(all_img_ids, desc=f"Filtering {split} images"):
        ann_ids = coco.getAnnIds(imgIds=img_id, catIds=cat_ids, iscrowd=False)
        anns = coco.loadAnns(ann_ids)

        # Track the best-scoring qualifying annotation of this image
        best_quality_score = 0
        best_keypoint_count = 0

        for ann in anns:
            if 'keypoints' not in ann:
                continue

            num_keypoints = ann.get('num_keypoints', 0)
            person_area = ann.get('area', 0)

            # Quality filtering
            if (num_keypoints >= min_keypoints and
                person_area >= min_area and
                ann.get('iscrowd', 0) == 0):

                # Quality score: combine keypoints and area
                quality_score = num_keypoints * 1.0 + (person_area / 10000) * 0.5

                if quality_score > best_quality_score:
                    best_quality_score = quality_score
                    best_keypoint_count = num_keypoints

        if best_quality_score > 0:
            quality_images.append({
                'image_id': img_id,
                'quality_score': best_quality_score,
                'keypoints': best_keypoint_count
            })

    # Sort by quality score (best first)
    quality_images.sort(key=lambda x: x['quality_score'], reverse=True)

    # Limit to max_images if specified
    if max_images and len(quality_images) > max_images:
        quality_images = quality_images[:max_images]

    print(f"\n‚úì Filtered {len(quality_images)} high-quality images from {len(all_img_ids)} total")
    if quality_images:
        print(f"  - Average keypoints: {np.mean([img['keypoints'] for img in quality_images]):.1f}/17")
        print(f"  - Quality range: {quality_images[-1]['quality_score']:.2f} to {quality_images[0]['quality_score']:.2f}")
    else:
        # Indexing an empty list for the summary used to raise IndexError here
        print("  - No images matched the quality criteria")

    return quality_images

# Both annotation indexes must already exist in the notebook namespace
if not {'coco_train', 'coco_val'} <= set(globals()):
    print("‚ö†Ô∏è  ERROR: COCO annotations not loaded!")
    print("Please run Cell 5 (Step 1: Load COCO Annotations) first.")
    raise NameError("coco_train and coco_val are not defined. Run the previous cells in order.")

# Thresholds shared by the train and val selections
quality_thresholds = {
    'min_keypoints': config.MIN_KEYPOINTS,
    'min_area': config.MIN_PERSON_AREA,
}

# Select the best-scoring images of each split, capped at the configured targets
train_images = filter_high_quality_images(
    coco_train, split='train', max_images=config.TARGET_TRAIN, **quality_thresholds
)
val_images = filter_high_quality_images(
    coco_val, split='val', max_images=config.TARGET_VAL, **quality_thresholds
)

rule = '=' * 80
print(f"\n{rule}")
print("FILTERING COMPLETE")
print(rule)
print(f"‚úì Train: {len(train_images)} images selected")
print(f"‚úì Val: {len(val_images)} images selected")
print(f"{rule}\n")


FILTERING TRAIN IMAGES FOR HIGH-QUALITY POSES
Criteria:
  - Minimum keypoints: 10/17
  - Minimum person area: 5000px¬≤
  - No crowd annotations



Filtering train images: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 64115/64115 [00:00<00:00, 127652.59it/s]



‚úì Filtered 5000 high-quality images from 64115 total
  - Average keypoints: 15.7/17
  - Quality range: 17.95 to 25.67

FILTERING VAL IMAGES FOR HIGH-QUALITY POSES
Criteria:
  - Minimum keypoints: 10/17
  - Minimum person area: 5000px¬≤
  - No crowd annotations



Filtering val images: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2693/2693 [00:00<00:00, 143309.95it/s]


‚úì Filtered 500 high-quality images from 2693 total
  - Average keypoints: 15.8/17
  - Quality range: 16.68 to 22.91

FILTERING COMPLETE
‚úì Train: 5000 images selected
‚úì Val: 500 images selected






## Step 3: Initialize Caption Generator

Load the Qwen2-VL-2B-Instruct model for generating CLIP-compatible captions.

In [6]:
# Install required dependencies for Qwen2-VL-2B
import subprocess
import sys
import gc

print("Installing dependencies for Qwen2-VL-2B...")

# Release cached GPU memory before anything new gets loaded
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("‚úì GPU memory cleared")

# Requirement specifiers, installed one at a time in this order
packages = [
    'transformers>=4.45.0',
    'accelerate',
    'qwen-vl-utils',
    'pillow',
    'torchvision'
]

# Use the kernel's own interpreter so packages land in the active environment
pip_install = [sys.executable, "-m", "pip", "install", "-q"]
for package in packages:
    print(f"Installing {package}...")
    subprocess.check_call(pip_install + [package])

print("\n‚úì All dependencies installed!")


Installing dependencies for Qwen2-VL-2B...
‚úì GPU memory cleared
Installing transformers>=4.45.0...
Installing accelerate...
   ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ 363.4/363.4 MB 5.2 MB/s eta 0:00:00
   ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ 13.8/13.8 MB 109.4 MB/s eta 0:00:00
   ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ 24.6/24.6 MB 89.8 MB/s eta 0:00:00
   ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ 883.7/883.7 kB 36.8 MB/s eta 0:00:00
   ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ 664.8/664.8 MB 1.1 MB/s eta 0:00:00
   ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚î

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
libcugraph-cu12 25.6.0 requires libraft-cu12==25.6.*, but you have libraft-cu12 25.2.0 which is incompatible.
pylibcugraph-cu12 25.6.0 requires pylibraft-cu12==25.6.*, but you have pylibraft-cu12 25.2.0 which is incompatible.
pylibcugraph-cu12 25.6.0 requires rmm-cu12==25.6.*, but you have rmm-cu12 25.2.0 which is incompatible.


Installing qwen-vl-utils...
   ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ 40.2/40.2 MB 48.7 MB/s eta 0:00:00
Installing pillow...
Installing torchvision...

‚úì All dependencies installed!


In [7]:
# GPU Memory Status Check
import gc

rule = "=" * 80
print(rule)
print("GPU MEMORY STATUS")
print(rule)

# Collect garbage first so the numbers below reflect live objects only
gc.collect()
if not torch.cuda.is_available():
    print("‚ö†Ô∏è  CUDA not available!")
else:
    torch.cuda.empty_cache()

    # Static properties of device 0
    props = torch.cuda.get_device_properties(0)
    total = props.total_memory / 1e9
    print(f"GPU: {props.name}")
    print(f"Total GPU Memory: {total:.2f} GB")

    # Current usage as seen by PyTorch; "free" is total minus reserved
    current_allocated = torch.cuda.memory_allocated() / 1e9
    reserved = torch.cuda.memory_reserved() / 1e9
    free = total - (reserved)

    print(f"\nCurrent Memory Usage:")
    print(f"  - Allocated: {current_allocated:.2f} GB")
    print(f"  - Reserved: {reserved:.2f} GB")
    print(f"  - Free: {free:.2f} GB")

    if current_allocated > 10:
        print(f"\n‚ö†Ô∏è  WARNING: High GPU memory usage!")
        print("Clearing unneeded variables...")
        # Only reports which objects are kept; nothing is deleted here
        defined_names = globals()
        if any(name in defined_names for name in ('train_images', 'val_images')):
            print("  - Keeping image lists (needed for caption generation)")
        if any(name in defined_names for name in ('coco_train', 'coco_val')):
            print("  - Keeping COCO objects (needed for image info)")

print(rule + "\n")


GPU MEMORY STATUS
GPU: Tesla T4
Total GPU Memory: 15.83 GB

Current Memory Usage:
  - Allocated: 0.00 GB
  - Reserved: 0.00 GB
  - Free: 15.83 GB



In [8]:
class CaptionGenerator:
    """Caption generator built on Qwen2-VL with aggressive memory housekeeping.

    The model is loaded once in float16 with `device_map="auto"`. Each call to
    `generate_caption` handles a single image and frees cached GPU memory before
    and after generation. Generation uses sampling (`do_sample=True`), so captions
    are not deterministic between runs.

    NOTE(review): `generate_caption` returns FALLBACK_CAPTION on any error, so
    callers cannot tell a failed image from a real caption by the return value.
    """

    # Caption returned when generation fails or produces almost no text.
    FALLBACK_CAPTION = "A person in a scene."

    def __init__(self, model_name="Qwen/Qwen2-VL-2B-Instruct", max_new_tokens=70):
        """Load the processor and model.

        Args:
            model_name: Hugging Face model id or local path to load.
            max_new_tokens: Generation budget per caption; also used as the
                maximum number of words kept in the final caption.
        """
        from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

        print(f"Loading {model_name}...")
        self.max_new_tokens = max_new_tokens

        # Pre-load cleanup
        self._free_gpu_memory(synchronize=True)

        # Load processor
        # NOTE(review): trust_remote_code=True allows code from the model repo to
        # run locally; confirm it is still required for this model.
        self.processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)

        # Load model in fp16 for memory efficiency
        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            trust_remote_code=True,
            device_map="auto",
        ).eval()

        print(f"‚úì Model loaded successfully")

        # Post-load cleanup
        self._free_gpu_memory(synchronize=True)

    @staticmethod
    def _free_gpu_memory(synchronize=False):
        """Collect garbage and empty the CUDA cache; safe to call on CPU-only hosts."""
        import gc

        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            if synchronize:
                torch.cuda.synchronize()

    def generate_caption(self, image_path):
        """Generate a caption for the image stored at `image_path`.

        Args:
            image_path: Path of an image file readable by Pillow.

        Returns:
            str: The cleaned caption, or FALLBACK_CAPTION when generation fails
            or the caption is 5 characters or shorter. Exceptions derived from
            `Exception` are caught and reported, never raised to the caller.
        """
        from PIL import Image as PILImage

        try:
            # Pre-process cleanup
            self._free_gpu_memory()

            # Load the pixels and close the file handle right away
            with PILImage.open(image_path) as source:
                image = source.convert('RGB')

            # Aggressive downsizing - longest side at most 768 pixels
            if max(image.size) > 768:
                ratio = 768 / max(image.size)
                new_size = (int(image.width * ratio), int(image.height * ratio))
                image = image.resize(new_size, PILImage.Resampling.LANCZOS)

            # Create prompt
            conversation = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": image},
                        {"type": "text", "text": "Describe this image for image generation. Include: main subjects, their positions, colors, lighting, mood, background, and style. Be specific, vivid, and under 70 words."}
                    ]
                }
            ]

            text_prompt = self.processor.apply_chat_template(
                conversation, tokenize=False, add_generation_prompt=True
            )

            inputs = self.processor(
                text=text_prompt,
                images=[image],
                return_tensors="pt",
                padding=True
            )

            # Move tensors to the model's device; leave other values untouched
            inputs = {k: v.to(self.model.device) if isinstance(v, torch.Tensor) else v
                     for k, v in inputs.items()}

            with torch.no_grad():
                output_ids = self.model.generate(
                    **inputs,
                    max_new_tokens=self.max_new_tokens,
                    do_sample=True
                )

            # Isolate generated tokens (drop the prompt part of the sequence)
            gen_ids = output_ids[0][inputs["input_ids"].shape[1]:]

            # Decode only the assistant's answer
            caption = self.processor.decode(gen_ids, skip_special_tokens=True).strip()

            # Remove common prefixes
            for prefix in ['assistant:', '<|assistant|>', 'sure ', 'here ', 'the image']:
                if caption.lower().startswith(prefix):
                    caption = caption[len(prefix):].strip()

            # Hard cutoff, counted in whitespace-separated words (not model tokens)
            words = caption.split()
            if len(words) > self.max_new_tokens:
                caption = ' '.join(words[:self.max_new_tokens])

            # Ensure proper ending
            if caption and not caption.endswith(('.', '!', '?')):
                caption += '.'

            # Drop references before emptying the cache so the memory can be released
            del image, inputs, output_ids, text_prompt, conversation
            self._free_gpu_memory()

            return caption if len(caption) > 5 else self.FALLBACK_CAPTION

        except Exception as e:
            print(f"‚ö†Ô∏è  Error: {str(e)[:40]}")
            self._free_gpu_memory()
            return self.FALLBACK_CAPTION

    def cleanup(self):
        """Drop the model and processor and free GPU memory; safe to call repeatedly."""
        # pop() instead of `del` so a second call does not raise AttributeError
        self.__dict__.pop('model', None)
        self.__dict__.pop('processor', None)
        self._free_gpu_memory(synchronize=True)
        print("‚úì Model cleaned up")

In [9]:
# Initialize Caption Generator
rule = "=" * 80
print("\n".join([
    rule,
    "INITIALIZING CAPTION GENERATOR",
    rule,
    "Loading Qwen2-VL-2B model (this will take 2-3 minutes)...",
    "",
]))

# Loads the processor and model configured in Config.MODEL_NAME
caption_gen = CaptionGenerator(model_name=config.MODEL_NAME)

print("\n".join([
    "",
    rule,
    "‚úì CAPTION GENERATOR READY!",
    rule,
    "",
]))
print_memory_usage()

INITIALIZING CAPTION GENERATOR
Loading Qwen2-VL-2B model (this will take 2-3 minutes)...



2025-12-10 09:49:09.113621: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765360149.563568      20 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765360149.671071      20 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Loading Qwen/Qwen2-VL-2B-Instruct...


preprocessor_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


chat_template.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/429M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/272 [00:00<?, ?B/s]

‚úì Model loaded successfully

‚úì CAPTION GENERATOR READY!

GPU Memory: 2.02GB allocated, 2.46GB reserved


## Step 4: Generate Training Captions

Download training images and generate detailed captions.

In [10]:
# TEST CAPTION GENERATION ON 5 RANDOM TRAIN IMAGES
import os
import gc
import random
import requests
from tqdm import tqdm

# Must have caption_gen from your previous initialization cell
if 'caption_gen' not in globals():
    raise NameError("Run Cell 13 (Initialize Caption Generator) first!")

# Pick up to 5 random images from the filtered train set. random.sample raises
# ValueError when asked for more items than exist, so the count is capped.
sample_images = random.sample(train_images, min(5, len(train_images)))

test_captions = {}
img_dir_train = os.path.join(config.COCO_ROOT, 'train2017')

print(f"\n{'='*60}")
print("TESTING CAPTION GENERATOR ON 5 IMAGES")
print(f"{'='*60}\n")

success = 0
failed = 0

for img_data in tqdm(sample_images, desc="Test"):
    img_id = img_data['image_id']
    img_info = coco_train.loadImgs(img_id)[0]
    filename = img_info['file_name']
    img_path = os.path.join(img_dir_train, filename)

    # Download if missing (COCO images sometimes are not pre-downloaded)
    if not os.path.exists(img_path):
        try:
            r = requests.get(img_info['coco_url'], timeout=10)
            r.raise_for_status()
            os.makedirs(os.path.dirname(img_path), exist_ok=True)
            with open(img_path, 'wb') as f:
                f.write(r.content)
        except Exception as e:
            print(f"Download error ({filename}): {e}")
            failed += 1
            continue

    # Generate caption. Catch Exception only: a bare `except:` would also
    # swallow KeyboardInterrupt and make the cell impossible to interrupt.
    try:
        caption = caption_gen.generate_caption(img_path)
        if caption and len(caption) > 3:
            test_captions[filename] = caption
            print(f"\n[{filename}]")
            print("Caption:", caption)
            print("-" * 80)
            success += 1
        else:
            failed += 1
    except Exception as e:
        print(f"Caption error ({filename}): {e}")
        failed += 1

    # cleanup
    gc.collect()
    torch.cuda.empty_cache()

print(f"\n‚úì Success: {success}")
print(f"‚úó Failed : {failed}")
print(f"{'='*60}\n")


TESTING CAPTION GENERATOR ON 5 IMAGES



Test:   0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
import gc
import json
import os
import requests

CHECKPOINT_PATH = "train_captions_checkpoint.json"
SAVE_INTERVAL = 200   # save every N images


def _save_train_checkpoint(captions, path=CHECKPOINT_PATH):
    """Write `captions` as JSON to `path` via a temp file and os.replace.

    A kernel kill in the middle of a write then leaves the previous checkpoint
    intact instead of a truncated, unreadable file.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w") as f:
        json.dump(captions, f)
    os.replace(tmp_path, path)


# Fail early: inside the loop a missing caption_gen would only show up as
# 5000 silently "failed" images after every download had already happened.
if 'caption_gen' not in globals():
    raise NameError("Run the Initialize Caption Generator cell first!")

# Load previous progress if exists
train_captions = {}
if os.path.exists(CHECKPOINT_PATH):
    try:
        with open(CHECKPOINT_PATH, "r") as f:
            train_captions = json.load(f)
        print(f"Loaded {len(train_captions)} previously saved captions.")
    except json.JSONDecodeError as e:
        # Keep the damaged file for inspection and restart from an empty set
        os.replace(CHECKPOINT_PATH, CHECKPOINT_PATH + ".corrupt")
        print(f"Checkpoint unreadable ({e}); moved to {CHECKPOINT_PATH}.corrupt, starting fresh.")
        train_captions = {}

img_dir_train = os.path.join(config.COCO_ROOT, 'train2017')

print(f"\n{'='*60}")
print(f"GENERATING {len(train_images)} TRAINING CAPTIONS")
print(f"{'='*60}\n")

success = len(train_captions)
failed = 0
processed = 0  # images handled in this run (success or failure)

for img_data in tqdm(train_images, desc="Train"):
    img_id = img_data['image_id']
    img_info = coco_train.loadImgs(img_id)[0]
    filename = img_info['file_name']

    # Skip if already processed
    if filename in train_captions:
        continue

    img_path = os.path.join(img_dir_train, filename)

    # Download if missing
    image_ready = os.path.exists(img_path)
    if not image_ready:
        try:
            r = requests.get(img_info['coco_url'], timeout=10)
            r.raise_for_status()
            os.makedirs(os.path.dirname(img_path), exist_ok=True)
            with open(img_path, 'wb') as f:
                f.write(r.content)
            image_ready = True
        except Exception as e:
            # Exception only: KeyboardInterrupt must still stop the loop
            print(f"Download error ({filename}): {e}")

    # Generate caption
    caption = None
    if image_ready:
        try:
            caption = caption_gen.generate_caption(img_path)
        except Exception as e:
            print(f"Caption error ({filename}): {e}")

    if caption and len(caption) > 3:
        train_captions[filename] = caption
        success += 1
    else:
        failed += 1
    processed += 1

    # Periodic cleanup + save. Counting every handled image (download failures
    # included) guarantees that no save interval is skipped.
    if processed % SAVE_INTERVAL == 0:
        _save_train_checkpoint(train_captions)
        gc.collect()
        torch.cuda.empty_cache()
        print(f"\nüíæ Saved checkpoint at {success} captions.\n")

# Final save
_save_train_checkpoint(train_captions)

print(f"\n‚úì Generated: {success}")
print(f"‚úó Failed: {failed}")
print(f"üíæ Final save completed.")
print(f"{'='*60}\n")


## Step 5: Generate Validation Captions

Download validation images and generate detailed captions.

In [None]:
# Generate Validation Captions - Ultra-optimized
import gc
import requests

if 'caption_gen' not in globals():
    raise NameError("Run Cell 13 (Initialize Caption Generator) first!")

# Captions keyed by image file name; rebuilt from scratch on every run of this cell
val_captions = {}
img_dir_val = os.path.join(config.COCO_ROOT, 'val2017')

print(f"\n{'='*60}")
print(f"GENERATING {len(val_images)} VALIDATION CAPTIONS")
print(f"{'='*60}\n")

success = 0
failed = 0

for img_data in tqdm(val_images, desc="Val"):
    img_id = img_data['image_id']
    img_info = coco_val.loadImgs(img_id)[0]
    filename = img_info['file_name']
    img_path = os.path.join(img_dir_val, filename)

    # Download if missing
    if not os.path.exists(img_path):
        try:
            r = requests.get(img_info['coco_url'], timeout=10)
            r.raise_for_status()
            os.makedirs(os.path.dirname(img_path), exist_ok=True)
            with open(img_path, 'wb') as f:
                f.write(r.content)
        except Exception as e:
            # Exception only: a bare `except:` would also swallow
            # KeyboardInterrupt and keep the loop running after an interrupt
            print(f"Download error ({filename}): {e}")
            failed += 1
            continue

    # Generate caption
    try:
        caption = caption_gen.generate_caption(img_path)
        if caption and len(caption) > 3:
            val_captions[filename] = caption
            success += 1
        else:
            failed += 1
    except Exception as e:
        print(f"Caption error ({filename}): {e}")
        failed += 1

    # Periodic cleanup
    if (success + failed) % 50 == 0:
        gc.collect()
        torch.cuda.empty_cache()

print(f"\n‚úì Generated: {success}")
print(f"‚úó Failed: {failed}")
print(f"{'='*60}\n")

## Step 6: Save Caption Files

Save the generated captions to JSON files.

In [None]:
# Save Dataset & Cleanup
import json
import gc


def _mean_word_count(captions):
    """Average number of whitespace-separated words per caption (captions must be non-empty)."""
    return sum(len(text.split()) for text in captions.values()) / len(captions)


# Release the captioning model before writing results
if 'caption_gen' in globals():
    caption_gen.cleanup()

# Destination files inside the configured output directory
train_out = os.path.join(config.OUTPUT_DIR, 'train_captions.json')
val_out = os.path.join(config.OUTPUT_DIR, 'val_captions.json')

with open(train_out, 'w') as train_file:
    json.dump(train_captions, train_file, indent=2)

with open(val_out, 'w') as val_file:
    json.dump(val_captions, val_file, indent=2)

rule = '=' * 60
print(f"\n{rule}")
print("DATASET COMPLETE!")
print(rule)
print(f"Train: {len(train_captions)} captions ‚Üí {train_out}")
print(f"Val: {len(val_captions)} captions ‚Üí {val_out}")
print(f"\nStats:")
print(f"  Total: {len(train_captions) + len(val_captions)} images")
# The "tokens" reported below are whitespace-separated words
if train_captions:
    avg_train = _mean_word_count(train_captions)
    print(f"  Avg caption (train): {avg_train:.1f} tokens")
if val_captions:
    avg_val = _mean_word_count(val_captions)
    print(f"  Avg caption (val): {avg_val:.1f} tokens")
print(f"{rule}\n")

## Step 7: Verify Dataset Quality

Check a few samples to verify caption quality and token counts.

In [None]:
# Verify Dataset (Quick Check Only)
def _show_samples(label, captions, limit=2):
    """Print the first `limit` captions with a 100-character preview and a word count."""
    if not captions:
        return
    for position, (fname, cap) in enumerate(list(captions.items())[:limit]):
        print(f"\n{label}[{position}]: {fname}")
        print(f"  Caption: {cap[:100]}...")
        print(f"  Tokens: {len(cap.split())}")


rule = "=" * 60
print("\n" + rule)
print("SAMPLE CAPTIONS")
print(rule)

_show_samples("Train", train_captions)
_show_samples("Val", val_captions)

print("\n‚úÖ Ready for ControlNet training!")
print(rule)