In [None]:
# Import required libraries
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import numpy as np
import torchvision.transforms as transforms
import os
from pycocotools.coco import COCO
import requests
from tqdm import tqdm
import json
import cv2
import warnings
warnings.filterwarnings('ignore')

# Report the runtime environment so the notebook output records what it ran on.
print("All imports successful.")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # List every visible CUDA device exactly once (device 0 included).
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")


In [None]:
# Memory optimization utilities for Kaggle
import gc
import torch

def clear_memory():
    """Clear GPU and system memory.

    Runs the Python garbage collector and, when CUDA is available, releases
    PyTorch's cached GPU blocks and waits for pending CUDA work to finish.
    Prints a confirmation line when done. Returns None.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    # The original literal was a mis-decoded UTF-8 check mark.
    print("✓ Memory cleared")

def print_memory_usage():
    """Print how much GPU memory PyTorch has allocated and reserved.

    Values are divided by 1024**3 and labelled "GB". Nothing is printed when
    CUDA is not available.
    """
    if not torch.cuda.is_available():
        return

    bytes_per_unit = 1024**3
    allocated = torch.cuda.memory_allocated() / bytes_per_unit
    reserved = torch.cuda.memory_reserved() / bytes_per_unit
    print(f"GPU Memory: {allocated:.2f}GB allocated, {reserved:.2f}GB reserved")


print("Memory utilities loaded")

In [None]:
# Dataset Generation Configuration
class Config:
    """Static settings for the caption-dataset generation notebook.

    All values are class attributes; the notebook instantiates the class once
    and reads them through that instance.
    """

    # Base directory for all relative paths. When the code runs as a script the
    # module-level __file__ exists and its directory is used; in a notebook
    # kernel __file__ is not defined, so the current working directory is used.
    NOTEBOOK_DIR = (
        os.path.dirname(os.path.abspath(__file__))
        if '__file__' in globals()
        else os.getcwd()
    )

    # Paths
    COCO_ROOT = os.path.join(NOTEBOOK_DIR, 'coco_data')
    OUTPUT_DIR = os.path.join(NOTEBOOK_DIR, 'dataset_output')

    # Dataset sizes
    TARGET_TRAIN = 5000  # Training images
    TARGET_VAL = 500     # Validation images

    # Quality filtering thresholds
    MIN_KEYPOINTS = 10   # Minimum visible keypoints (out of 17)
    MIN_PERSON_AREA = 5000  # Minimum person area

    # Caption generation
    MAX_CAPTION_TOKENS = 70  # CLIP limit is 77, we use 70 for safety
    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Model - Using Qwen2-VL-2B
    MODEL_NAME = "Qwen/Qwen2-VL-2B-Instruct"
    MODEL_TYPE = "qwen2-vl"

config = Config()

# Make sure the full directory layout exists before anything is downloaded.
for directory in (
    config.COCO_ROOT,
    config.OUTPUT_DIR,
    os.path.join(config.COCO_ROOT, 'annotations'),
    os.path.join(config.COCO_ROOT, 'train2017'),
    os.path.join(config.COCO_ROOT, 'val2017'),
):
    os.makedirs(directory, exist_ok=True)

# Only the two root directories are reported.
print(f"\nDirectories created/verified:")
for directory in (config.COCO_ROOT, config.OUTPUT_DIR):
    print(f"- {directory}")


## Step 1: Load COCO Annotations

Download and load COCO 2017 annotations for person keypoints.

In [None]:
# Load COCO annotations
ann_dir = os.path.join(config.COCO_ROOT, 'annotations')
train_ann_file = os.path.join(ann_dir, 'person_keypoints_train2017.json')
val_ann_file = os.path.join(ann_dir, 'person_keypoints_val2017.json')

# Download the annotation archive if either keypoint file is missing
if not (os.path.exists(train_ann_file) and os.path.exists(val_ann_file)):
    print("COCO annotations not found. Downloading...")

    import zipfile
    import urllib.request

    ann_url = "http://images.cocodataset.org/annotations/annotations_trainval2017.zip"
    zip_path = os.path.join(config.COCO_ROOT, 'annotations_trainval2017.zip')

    print(f"Downloading from: {ann_url}")

    try:
        # urlretrieve blocks until the whole archive is on disk; no progress is shown.
        urllib.request.urlretrieve(ann_url, zip_path)
        print("Download complete. Extracting...")

        # Extract into COCO_ROOT; the JSON files are then expected under ann_dir.
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(config.COCO_ROOT)
    finally:
        # Remove the archive even when the download or extraction failed, so a
        # partial file is never left behind.
        if os.path.exists(zip_path):
            os.remove(zip_path)

    print("Extraction complete.")

# Now load COCO annotations
print("\nLoading COCO annotations...")
coco_train = COCO(train_ann_file)
coco_val = COCO(val_ann_file)
print("COCO annotations loaded successfully.")
print(f"- Train images: {len(coco_train.getImgIds())}")
print(f"- Val images: {len(coco_val.getImgIds())}")

## Step 2: Filter High-Quality Pose Images

Filter COCO dataset for images with high-quality pose annotations.

In [None]:
def filter_high_quality_images(coco, split='train', min_keypoints=10, min_area=5000, max_images=None):
    """
    Filter COCO dataset for high-quality pose images.

    An image is kept when at least one of its non-crowd 'person' annotations
    has a 'keypoints' field, at least `min_keypoints` labelled keypoints
    ('num_keypoints') and an 'area' of at least `min_area`. Each kept image is
    scored by its best annotation: num_keypoints + 0.5 * (area / 10000).

    Args:
        coco: COCO-style API object providing getCatIds, getImgIds, getAnnIds
            and loadAnns.
        split: Label used only in the progress-bar description.
        min_keypoints: Minimum 'num_keypoints' for an annotation to qualify.
        min_area: Minimum 'area' for an annotation to qualify.
        max_images: If truthy, keep only this many top-scoring images.

    Returns:
        List of dicts with keys 'image_id', 'quality_score' and 'keypoints',
        sorted by 'quality_score' in descending order. The list is empty when
        no image passes the thresholds.
    """
    cat_ids = coco.getCatIds(catNms=['person'])
    all_img_ids = coco.getImgIds(catIds=cat_ids)

    quality_images = []

    for img_id in tqdm(all_img_ids, desc=f"Filtering {split} images"):
        ann_ids = coco.getAnnIds(imgIds=img_id, catIds=cat_ids, iscrowd=False)
        anns = coco.loadAnns(ann_ids)

        # Track the best-scoring qualifying annotation for this image.
        best_quality_score = 0
        best_keypoint_count = 0

        for ann in anns:
            if 'keypoints' not in ann:
                continue

            num_keypoints = ann.get('num_keypoints', 0)
            person_area = ann.get('area', 0)

            # Quality filtering
            if (num_keypoints >= min_keypoints and
                    person_area >= min_area and
                    ann.get('iscrowd', 0) == 0):

                # Quality score: combine keypoints and area
                quality_score = num_keypoints * 1.0 + (person_area / 10000) * 0.5

                if quality_score > best_quality_score:
                    best_quality_score = quality_score
                    best_keypoint_count = num_keypoints

        if best_quality_score > 0:
            quality_images.append({
                'image_id': img_id,
                'quality_score': best_quality_score,
                'keypoints': best_keypoint_count
            })

    # Sort by quality score (best first)
    quality_images.sort(key=lambda x: x['quality_score'], reverse=True)

    # Limit to max_images if specified
    if max_images and len(quality_images) > max_images:
        quality_images = quality_images[:max_images]

    print(f"\nFiltered {len(quality_images)} high-quality images from {len(all_img_ids)} total")
    if quality_images:
        print(f"- Average keypoints: {np.mean([img['keypoints'] for img in quality_images]):.1f}/17")
        print(f"- Quality range: {quality_images[-1]['quality_score']:.2f} to {quality_images[0]['quality_score']:.2f}")
    else:
        # Without this guard the summary statistics would index an empty list.
        print("- No images passed the quality thresholds")

    return quality_images

# Refuse to continue unless both COCO handles exist in the notebook namespace.
if any(name not in globals() for name in ('coco_train', 'coco_val')):
    print("ERROR: COCO annotations not loaded.")
    raise NameError("coco_train and coco_val are not defined.")

# Both splits use the same thresholds; only the source and image cap differ.
_filter_thresholds = dict(
    min_keypoints=config.MIN_KEYPOINTS,
    min_area=config.MIN_PERSON_AREA,
)

train_images = filter_high_quality_images(
    coco_train, split='train', max_images=config.TARGET_TRAIN, **_filter_thresholds
)

val_images = filter_high_quality_images(
    coco_val, split='val', max_images=config.TARGET_VAL, **_filter_thresholds
)

print(f"Train: {len(train_images)} images selected")
print(f"Val: {len(val_images)} images selected")

## Step 3: Initialize Caption Generator

Load the Qwen2-VL-2B-Instruct model for generating CLIP-compatible captions.

In [None]:
# Install required dependencies for Qwen2-VL-2B
import subprocess
import sys
import gc

print("Installing dependencies for Qwen2-VL-2B...")

# Release whatever memory can be released before pip starts working.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("GPU memory cleared")

# Install compatible versions, one pip invocation per package, using the
# interpreter that runs this kernel.
packages = [
    'transformers>=4.45.0',
    'accelerate',
    'qwen-vl-utils',
    'pillow',
    'torchvision',
]
_pip_install = [sys.executable, "-m", "pip", "install", "-q"]

for package in packages:
    print(f"Installing {package}...")
    # check_call raises CalledProcessError when pip exits with a non-zero status.
    subprocess.check_call([*_pip_install, package])

print("\nAll dependencies installed")


In [None]:
# GPU Memory Status Check
import gc

# Collect garbage first so the numbers below reflect live objects only.
gc.collect()

if not torch.cuda.is_available():
    print("CUDA not available.")
else:
    torch.cuda.empty_cache()

    # Static properties of device 0
    props = torch.cuda.get_device_properties(0)
    total = props.total_memory / 1e9
    print(f"GPU: {props.name}")
    print(f"Total GPU Memory: {total:.2f} GB")

    # Current usage as tracked by PyTorch (bytes / 1e9)
    current_allocated = torch.cuda.memory_allocated() / 1e9
    reserved = torch.cuda.memory_reserved() / 1e9
    # "Free" here means total device memory minus what PyTorch has reserved.
    free = total - reserved

    print(f"\nCurrent Memory Usage:")
    print(f"- Allocated: {current_allocated:.2f} GB")
    print(f"- Reserved: {reserved:.2f} GB")
    print(f"- Free: {free:.2f} GB")


In [None]:
class CaptionGenerator:
    """
    Caption generator backed by a Qwen2-VL vision-language model.

    The model and its processor are loaded once in __init__ (fp16 weights,
    device_map="auto"). generate_caption() produces one short caption per
    image and never raises for ordinary errors: it returns FALLBACK_CAPTION
    instead. Call cleanup() to drop the model and free GPU memory.
    """

    # Returned when captioning fails or the model output is too short to use.
    FALLBACK_CAPTION = "A person in a scene."

    # Lower-cased boilerplate stripped from the start of the model's answer.
    _STRIP_PREFIXES = ('assistant:', '<|assistant|>', 'sure ', 'here ', 'the image')

    def __init__(self, model_name="Qwen/Qwen2-VL-2B-Instruct"):
        """Load the processor and model identified by `model_name`."""
        from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

        print(f"Loading {model_name}...")

        # Pre-load cleanup
        self._free_memory(synchronize=True)

        # Load processor
        self.processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)

        # Load model in fp16 for memory efficiency
        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            trust_remote_code=True,
            device_map="auto",
        ).eval()

        print(f"Model loaded successfully")

        # Post-load cleanup
        self._free_memory(synchronize=True)

    @staticmethod
    def _free_memory(synchronize=False):
        """Run the garbage collector and, when CUDA is available, empty the CUDA cache.

        CUDA calls are guarded so the class also works on machines without a GPU.
        """
        import gc

        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            if synchronize:
                torch.cuda.synchronize()

    @classmethod
    def _clean_caption(cls, caption, max_words=70):
        """Normalise raw model text into a caption.

        Strips known chat prefixes, truncates to `max_words` whitespace-separated
        words and makes sure a non-empty caption ends with '.', '!' or '?'.
        """
        caption = caption.strip()

        # Remove common prefixes (checked in order, each at most once)
        for prefix in cls._STRIP_PREFIXES:
            if caption.lower().startswith(prefix):
                caption = caption[len(prefix):].strip()

        # Enforce the length limit. This counts words, not tokenizer tokens.
        words = caption.split()
        if len(words) > max_words:
            caption = ' '.join(words[:max_words])

        # Ensure proper ending
        if caption and not caption.endswith(('.', '!', '?')):
            caption += '.'

        return caption

    def generate_caption(self, image_path, max_words=70, max_new_tokens=70):
        """Generate a caption for the image stored at `image_path`.

        Args:
            image_path: Path of the image file to caption.
            max_words: Word limit stated in the prompt and enforced on the result.
            max_new_tokens: Upper bound on tokens generated by the model.

        Returns:
            The cleaned caption, or FALLBACK_CAPTION when the result has five
            characters or fewer or when any Exception occurs.
        """
        from PIL import Image as PILImage

        try:
            # Pre-process cleanup
            self._free_memory()

            # Load the image; the context manager closes the file handle once
            # the pixel data has been copied by convert().
            with PILImage.open(image_path) as source:
                image = source.convert('RGB')

            # Downscale so the longest side is at most 768 pixels
            if max(image.size) > 768:
                ratio = 768 / max(image.size)
                new_size = (int(image.width * ratio), int(image.height * ratio))
                image = image.resize(new_size, PILImage.Resampling.LANCZOS)

            # Create prompt
            conversation = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": image},
                        {"type": "text", "text": f"Describe this image for image generation. Include: main subjects, their positions, colors, lighting, mood, background, and style. Be specific, vivid, and under {max_words} words."}
                    ]
                }
            ]

            text_prompt = self.processor.apply_chat_template(
                conversation, tokenize=False, add_generation_prompt=True
            )

            inputs = self.processor(
                text=text_prompt,
                images=[image],
                return_tensors="pt",
                padding=True
            )

            # Move tensors to the model's device
            inputs = {k: v.to(self.model.device) if isinstance(v, torch.Tensor) else v
                      for k, v in inputs.items()}

            # Sampling is enabled, so repeated calls can give different captions.
            with torch.no_grad():
                output_ids = self.model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    do_sample=True
                )

            # Isolate generated tokens (drop the prompt part of the sequence)
            gen_ids = output_ids[0][inputs["input_ids"].shape[1]:]

            # Decode only the assistant's answer, then normalise it
            raw_text = self.processor.decode(gen_ids, skip_special_tokens=True)
            caption = self._clean_caption(raw_text, max_words=max_words)

            # Immediate cleanup
            del image, inputs, output_ids, gen_ids, text_prompt, conversation
            self._free_memory()

            return caption if len(caption) > 5 else self.FALLBACK_CAPTION

        except Exception as e:
            # Best-effort: report which image failed and fall back to a generic caption.
            print(f"Error captioning {image_path}: {str(e)[:200]}")
            self._free_memory()
            return self.FALLBACK_CAPTION

    def cleanup(self):
        """Drop the model and processor and free memory. Safe to call more than once."""
        for attr in ('model', 'processor'):
            if hasattr(self, attr):
                delattr(self, attr)
        self._free_memory(synchronize=True)
        print("Model cleaned up")

In [None]:
# Initialize Caption Generator
# Loads the model named in config.MODEL_NAME; the instance is reused by the
# captioning cells below.
caption_gen = CaptionGenerator(model_name=config.MODEL_NAME)
# Show GPU memory held by PyTorch after loading (prints nothing without CUDA).
print_memory_usage()

## Step 4: Generate Training Captions

Download training images and generate detailed captions.

In [None]:
# Test caption generation on 5 random images
import os
import gc
import random
import requests
from tqdm import tqdm

# Pick up to 5 random images from the filtered COCO train set.
# random.sample raises ValueError when asked for more items than are available.
sample_images = random.sample(train_images, min(5, len(train_images)))

test_captions = {}
img_dir_train = os.path.join(config.COCO_ROOT, 'train2017')

success = 0
failed = 0

for img_data in tqdm(sample_images, desc="Test"):
    img_id = img_data['image_id']
    img_info = coco_train.loadImgs(img_id)[0]
    filename = img_info['file_name']
    img_path = os.path.join(img_dir_train, filename)

    # Download if missing
    if not os.path.exists(img_path):
        try:
            r = requests.get(img_info['coco_url'], timeout=10)
            r.raise_for_status()
            os.makedirs(os.path.dirname(img_path), exist_ok=True)
            with open(img_path, 'wb') as f:
                f.write(r.content)
        except Exception as e:
            print(f"Download error ({filename}): {e}")
            failed += 1
            continue

    # Generate caption. Catch Exception only, so interrupting the kernel still
    # stops the loop instead of being counted as a failure.
    try:
        caption = caption_gen.generate_caption(img_path)
    except Exception as e:
        print(f"Caption error ({filename}): {e}")
        failed += 1
    else:
        if caption and len(caption) > 3:
            test_captions[filename] = caption
            print(f"\n[{filename}]")
            print("Caption:", caption)
            print("-" * 80)
            success += 1
        else:
            failed += 1

    # cleanup
    gc.collect()
    torch.cuda.empty_cache()

print(f"\nSuccess: {success}")
print(f"Failed : {failed}")

In [None]:
import gc
import json
import os
import requests

CHECKPOINT_PATH = "train_captions_checkpoint.json"
SAVE_INTERVAL = 200   # save every N images


def _save_train_checkpoint(captions, path=CHECKPOINT_PATH):
    """Write `captions` to `path` as JSON without ever leaving a half-written file.

    The data goes to a temporary sibling file first and is then moved over the
    real checkpoint with os.replace, so an interrupted save cannot corrupt the
    previous checkpoint.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w") as f:
        json.dump(captions, f)
    os.replace(tmp_path, path)


# Load previous progress if exists
if os.path.exists(CHECKPOINT_PATH):
    with open(CHECKPOINT_PATH, "r") as f:
        train_captions = json.load(f)
    print(f"Loaded {len(train_captions)} previously saved captions.")
else:
    train_captions = {}

img_dir_train = os.path.join(config.COCO_ROOT, 'train2017')

# Captions restored from the checkpoint count as already successful.
success = len(train_captions)
failed = 0

for img_data in tqdm(train_images, desc="Train"):
    img_id = img_data['image_id']
    img_info = coco_train.loadImgs(img_id)[0]
    filename = img_info['file_name']

    # Skip if already processed
    if filename in train_captions:
        continue

    img_path = os.path.join(img_dir_train, filename)
    caption = None
    image_ready = True

    # Download if missing
    if not os.path.exists(img_path):
        try:
            r = requests.get(img_info['coco_url'], timeout=10)
            r.raise_for_status()
            os.makedirs(os.path.dirname(img_path), exist_ok=True)
            with open(img_path, 'wb') as f:
                f.write(r.content)
        except Exception as e:
            # Exception (not a bare except) so interrupting the kernel still stops the run.
            tqdm.write(f"Download error ({filename}): {e}")
            image_ready = False

    # Generate caption
    if image_ready:
        try:
            caption = caption_gen.generate_caption(img_path)
        except Exception as e:
            tqdm.write(f"Caption error ({filename}): {e}")

    if caption and len(caption) > 3:
        train_captions[filename] = caption
        success += 1
    else:
        failed += 1

    # Periodic cleanup + save. Every processed image reaches this check,
    # including download failures, so no save interval is skipped.
    if (success + failed) % SAVE_INTERVAL == 0:
        _save_train_checkpoint(train_captions)
        gc.collect()
        torch.cuda.empty_cache()
        print(f"\nSaved checkpoint at {success} captions.\n")

# Final save
_save_train_checkpoint(train_captions)

print(f"\nGenerated: {success}")
print(f"Failed: {failed}")
print(f"Final save completed.")


## Step 5: Generate Validation Captions

Download validation images and generate detailed captions.

In [None]:
# Generate Validation Captions - Ultra-optimized
import gc
import requests

val_captions = {}
img_dir_val = os.path.join(config.COCO_ROOT, 'val2017')

success = 0
failed = 0

for img_data in tqdm(val_images, desc="Val"):
    img_id = img_data['image_id']
    img_info = coco_val.loadImgs(img_id)[0]
    filename = img_info['file_name']
    img_path = os.path.join(img_dir_val, filename)
    caption = None
    image_ready = True

    # Download if missing
    if not os.path.exists(img_path):
        try:
            r = requests.get(img_info['coco_url'], timeout=10)
            r.raise_for_status()
            os.makedirs(os.path.dirname(img_path), exist_ok=True)
            with open(img_path, 'wb') as f:
                f.write(r.content)
        except Exception as e:
            # Exception (not a bare except) so interrupting the kernel still stops the run.
            tqdm.write(f"Download error ({filename}): {e}")
            image_ready = False

    # Generate caption
    if image_ready:
        try:
            caption = caption_gen.generate_caption(img_path)
        except Exception as e:
            tqdm.write(f"Caption error ({filename}): {e}")

    if caption and len(caption) > 3:
        val_captions[filename] = caption
        success += 1
    else:
        failed += 1

    # Periodic cleanup; every processed image reaches this check.
    if (success + failed) % 50 == 0:
        gc.collect()
        torch.cuda.empty_cache()

print(f"\nGenerated: {success}")
print(f"Failed: {failed}")

## Step 6: Save Caption Files

Save the generated captions to JSON files.

In [None]:
# Save Dataset & Cleanup
import json
import gc

# Save captions first, so an error while releasing the model cannot lose them.
train_out = os.path.join(config.OUTPUT_DIR, 'train_captions.json')
val_out = os.path.join(config.OUTPUT_DIR, 'val_captions.json')

with open(train_out, 'w') as f:
    json.dump(train_captions, f, indent=2)

with open(val_out, 'w') as f:
    json.dump(val_captions, f, indent=2)

# Cleanup model. Skipped when the model was already released, which makes this
# cell safe to run again.
if 'caption_gen' in globals() and hasattr(caption_gen, 'model'):
    caption_gen.cleanup()

# Average caption length, counted in whitespace-separated words.
if train_captions:
    avg_train = sum(len(c.split()) for c in train_captions.values()) / len(train_captions)
    print(f"  Avg caption (train): {avg_train:.1f} tokens")
if val_captions:
    avg_val = sum(len(c.split()) for c in val_captions.values()) / len(val_captions)
    print(f"  Avg caption (val): {avg_val:.1f} tokens")

## Step 7: Verify Dataset Quality

Check a few samples to verify caption quality and token counts.

In [None]:
# Verify Dataset
def _preview_captions(label, captions, limit=2):
    """Print the first `limit` entries of `captions`: file name, caption
    (first 100 characters) and whitespace-separated word count."""
    if not captions:
        return
    for i, (fname, cap) in enumerate(list(captions.items())[:limit]):
        print(f"\n{label}[{i}]: {fname}")
        print(f"  Caption: {cap[:100]}...")
        print(f"  Tokens: {len(cap.split())}")


_preview_captions("Train", train_captions)
_preview_captions("Val", val_captions)