# Dataset Structure Exploration (Corrected Version)

This notebook helps you investigate and understand the structure of layout detection datasets (currently DocLayNet), with samples selected by index rather than at random.

**Key Improvements:**
- Browse samples by index (1st, 5th, etc.) instead of random selection
- Only shows samples that actually exist on disk
- Interactive sample browsing
- Better error handling
- **FIXED:** Image paths are now resolved against `PNG/` rather than `PNG/train/`

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import numpy as np
from PIL import Image, ImageDraw
from collections import Counter
import warnings
# Ignore every warning for the rest of the session (presumably to keep cell
# output readable). NOTE(review): this also hides warnings that could matter
# while debugging.
warnings.filterwarnings('ignore')

# Set up plotting: matplotlib's default style with seaborn's "husl" palette
plt.style.use('default')
sns.set_palette("husl")
# IPython magic: render figures inline in the notebook output
%matplotlib inline

print("📊 Dataset Explorer initialized!")

## 1. Load Dataset and Find Available Images

In [None]:
def load_dataset_with_available_images(data_dir="../data"):
    """Load the DocLayNet train annotations and list the images present on disk.

    Args:
        data_dir: Directory that contains the ``doclaynet`` folder (str or
            Path). Defaults to ``../data``, the location used by the rest of
            this notebook.

    Returns:
        A ``(doclaynet_data, available_images)`` tuple. ``doclaynet_data`` is
        the parsed content of ``doclaynet/COCO/train.json``, or ``None`` when
        that file does not exist. ``available_images`` is the list of entries
        from ``doclaynet_data['images']`` whose ``file_name`` exists under
        ``doclaynet/PNG``; it is empty when the file or directory is missing.
    """
    # Set up paths
    doclaynet_dir = Path(data_dir) / "doclaynet"
    train_annotations_file = doclaynet_dir / "COCO" / "train.json"

    if not train_annotations_file.exists():
        print("DocLayNet data not found. Run: python scripts/download_training_data.py --dataset doclaynet")
        return None, []

    # Load annotations. The encoding is explicit so the result does not depend
    # on the platform's default text encoding.
    print("📖 Loading DocLayNet annotations...")
    with open(train_annotations_file, 'r', encoding='utf-8') as f:
        doclaynet_data = json.load(f)

    # Images live directly under PNG/ (not PNG/train/)
    image_dir = doclaynet_dir / "PNG"
    available_images = []

    if image_dir.exists():
        print("🔍 Checking which images are available on disk...")
        available_images = [
            img for img in doclaynet_data['images']
            if (image_dir / img['file_name']).exists()
        ]
    else:
        # Say why the available count below will be zero instead of failing silently
        print(f"⚠️  Image directory not found: {image_dir}")

    print(f"✅ Loaded {len(doclaynet_data['images'])} total images from annotations")
    print(f"📁 Found {len(available_images)} available images on disk")

    return doclaynet_data, available_images

# Load data once at notebook scope; the cells and helper functions below read
# these two names as globals. When the annotation file is missing,
# doclaynet_data is None and available_images is an empty list.
doclaynet_data, available_images = load_dataset_with_available_images()

## 2. Dataset Overview

In [None]:
# Report headline counts from the annotation file, then list every category.
if not doclaynet_data:
    print("❌ No data loaded")
else:
    print(" DocLayNet Dataset Overview:")
    print(f"   • Total images in annotations: {len(doclaynet_data['images'])}")
    print(f"   • Available images on disk: {len(available_images)}")
    print(f"   • Total annotations: {len(doclaynet_data['annotations'])}")
    print(f"   • Categories: {len(doclaynet_data['categories'])}")

    # One bullet per category, in the order the annotation file lists them
    print("\n🏷️  Available Categories:")
    for cat in doclaynet_data['categories']:
        print(f"   • {cat['name']} (ID: {cat['id']})")

## 3. Category Distribution Analysis

In [None]:
# Tabulate and plot how many annotations each category has.
if doclaynet_data and doclaynet_data['annotations']:
    # Count annotations per category
    category_counts = Counter(ann['category_id'] for ann in doclaynet_data['annotations'])
    category_names = {cat['id']: cat['name'] for cat in doclaynet_data['categories']}
    # Loop-invariant denominator for the percentage column
    total_annotations = len(doclaynet_data['annotations'])

    # Create DataFrame for better display. An id missing from the category
    # list is labelled instead of raising KeyError.
    stats_data = [
        {
            'Category': category_names.get(cat_id, f"Unknown ({cat_id})"),
            'Count': count,
            'Percentage': f"{count/total_annotations*100:.1f}%",
        }
        for cat_id, count in category_counts.items()
    ]

    df_stats = pd.DataFrame(stats_data).sort_values('Count', ascending=False)
    print("📊 Category Distribution:")
    display(df_stats)

    # Plot distribution
    plt.figure(figsize=(12, 6))
    plt.bar(df_stats['Category'], df_stats['Count'])
    plt.title('DocLayNet Category Distribution')
    plt.xlabel('Category')
    plt.ylabel('Number of Annotations')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()
elif doclaynet_data:
    # With no annotations the DataFrame would have no 'Count' column and
    # sort_values would raise KeyError, so report it instead.
    print("❌ No annotations to summarise")

## 4. Browse Samples by Index

In [None]:
def show_sample_info(sample_index):
    """Print a summary of one available sample and return its data.

    Reads the notebook-level ``available_images`` and ``doclaynet_data``.

    Args:
        sample_index: 0-based position in ``available_images``.

    Returns:
        ``(image_info, annotations)`` where ``image_info`` is the image entry
        and ``annotations`` is the list of annotations whose ``image_id``
        matches it, or ``None`` when no images are available or the index is
        outside ``0 .. len(available_images) - 1``.
    """
    if not available_images:
        print("❌ No images available for visualization")
        return None

    # Negative values are rejected as well: Python would otherwise wrap them
    # around (-1 would be reported as "Sample #0"), which contradicts the
    # range shown in the message.
    if not 0 <= sample_index < len(available_images):
        print(f"❌ Index {sample_index} out of range. Available: 0-{len(available_images)-1}")
        return None

    # Get image info
    image_info = available_images[sample_index]
    image_id = image_info['id']

    # Get annotations for this image
    annotations = [ann for ann in doclaynet_data['annotations'] if ann['image_id'] == image_id]

    print(f"🖼️  Sample #{sample_index+1}: {image_info['file_name']}")
    print(f"📏 Dimensions: {image_info['width']}x{image_info['height']}")
    print(f"📊 Annotations: {len(annotations)}")

    # Show categories in this image; ids missing from the category list are
    # reported as 'Unknown', matching the fallback used when drawing labels.
    category_names = {cat['id']: cat['name'] for cat in doclaynet_data['categories']}
    category_counts = Counter(
        category_names.get(ann['category_id'], 'Unknown') for ann in annotations
    )

    print("📋 Categories in this image:")
    for cat, count in category_counts.items():
        print(f"   • {cat}: {count} annotations")

    return image_info, annotations

# Show first few samples: one line each with file name, annotation count and
# up to three category names.
print("🔍 First 5 available samples:")
if available_images:
    # Built once instead of on every iteration, and only when images exist
    # (doclaynet_data is None when nothing was loaded).
    category_names = {cat['id']: cat['name'] for cat in doclaynet_data['categories']}

    for i, image_info in enumerate(available_images[:5]):
        image_id = image_info['id']
        annotations = [ann for ann in doclaynet_data['annotations'] if ann['image_id'] == image_id]

        # Sorted so the same three names are shown on every run; plain set
        # iteration order for strings can change between interpreter sessions.
        image_categories = sorted({category_names[ann['category_id']] for ann in annotations})

        print(f"  {i+1:2d}. {image_info['file_name'][:50]:<50} | {len(annotations):2d} annotations | {', '.join(image_categories[:3])}")

## 5. Visualize Specific Sample

In [None]:
def visualize_sample(sample_index):
    """Draw one sample's bounding boxes and labels on its image and show it.

    Prints the sample summary via ``show_sample_info`` first; if that returns
    ``None`` nothing is drawn.

    Args:
        sample_index: 0-based index forwarded to ``show_sample_info``.

    Returns:
        None. The annotated image is displayed with matplotlib.
    """
    # Get sample info
    result = show_sample_info(sample_index)
    if result is None:
        return

    image_info, annotations = result

    # Images live directly under PNG/ (not PNG/train/)
    image_path = Path("../data") / "doclaynet" / "PNG" / image_info['file_name']

    # The context manager releases the file handle even if decoding fails;
    # convert() returns an independent in-memory copy that is safe to draw on.
    with Image.open(image_path) as source:
        image = source.convert('RGB')
    draw = ImageDraw.Draw(image)

    # One colour per category, sampled evenly from the Set3 colormap and
    # scaled from 0-1 floats to 0-255 ints.
    categories = doclaynet_data['categories']
    colors = plt.cm.Set3(np.linspace(0, 1, len(categories)))
    color_map = {cat['id']: tuple(int(c*255) for c in colors[i][:3]) for i, cat in enumerate(categories)}
    # id -> name lookup built once rather than scanning categories per annotation
    name_map = {cat['id']: cat['name'] for cat in categories}

    # Draw bounding boxes
    for ann in annotations:
        # bbox is unpacked as x, y, width, height
        x, y, w, h = ann['bbox']
        category_id = ann['category_id']
        color = color_map.get(category_id, (255, 0, 0))

        # Draw rectangle
        draw.rectangle([x, y, x+w, y+h], outline=color, width=3)

        # Draw the label 20px above the box, clamped so it stays inside the image
        category_name = name_map.get(category_id, 'Unknown')
        draw.text((x, max(0, y-20)), category_name, fill=color)

    # Display
    plt.figure(figsize=(15, 10))
    plt.imshow(image)
    plt.axis('off')
    plt.title(f"Sample #{sample_index+1}: {image_info['file_name']} ({image.size[0]}x{image.size[1]})")
    plt.tight_layout()
    plt.show()

# Render the sample at index 0, or explain why nothing can be shown.
if not available_images:
    print("❌ No images available for visualization")
else:
    print("🎨 Visualizing first available sample:")
    visualize_sample(0)

## 6. Interactive Sample Browser

In [None]:
# Edit SAMPLE_INDEX and re-run this cell to look at another sample.
SAMPLE_INDEX = 1  # 0-based position in available_images

if not available_images:
    print("❌ No images available")
else:
    print(f"🔍 Browsing sample at index {SAMPLE_INDEX}:")
    visualize_sample(SAMPLE_INDEX)

## 7. Find Samples with Specific Categories

In [None]:
def find_samples_with_category(category_name, limit=10):
    """List available images that contain at least one annotation of a category.

    Reads the notebook-level ``doclaynet_data`` and ``available_images``.

    Args:
        category_name: Category name to look for; matched case-insensitively
            against the names in ``doclaynet_data['categories']``.
        limit: Maximum number of matches to print. All matches are returned
            regardless of this value.

    Returns:
        A list of ``(image_info, annotation_count)`` tuples in
        ``available_images`` order, or ``None`` when no data is loaded or the
        category name is unknown.
    """
    if not doclaynet_data:
        return None

    # Find category ID (first category whose name matches, ignoring case)
    wanted = category_name.lower()
    category_id = next(
        (cat['id'] for cat in doclaynet_data['categories'] if cat['name'].lower() == wanted),
        None,
    )

    if category_id is None:
        print(f"❌ Category '{category_name}' not found")
        print("Available categories:", [cat['name'] for cat in doclaynet_data['categories']])
        return None

    # Count matching annotations per image in a single pass, instead of
    # rescanning every annotation for every image.
    per_image = Counter(
        ann['image_id'] for ann in doclaynet_data['annotations']
        if ann['category_id'] == category_id
    )

    # Keep each match's position so it need not be looked up again with
    # list.index(); a Counter returns 0 for ids it has not seen.
    matches = [
        (position, img, per_image[img['id']])
        for position, img in enumerate(available_images)
        if per_image[img['id']]
    ]

    print(f"🔍 Found {len(matches)} images with '{category_name}' category:")

    # The printed number is 1-based (position + 1)
    for position, img, count in matches[:limit]:
        print(f"  {position+1:3d}. {img['file_name'][:50]:<50} | {count} {category_name} annotations")

    return [(img, count) for _, img, count in matches]

# Example: Find samples with tables. Only the first 5 matches are printed, but
# the returned list holds every match. The result is None when no data is
# loaded or the category is unknown.
table_samples = find_samples_with_category("Table", limit=5)

# Visualize first table sample if found (None and an empty list are both falsy)
if table_samples:
    first_table_img, _ = table_samples[0]
    # Recover the 0-based position expected by visualize_sample; note the
    # listing printed above numbers samples from 1.
    table_index = available_images.index(first_table_img)
    print(f"\n🎨 Visualizing first table sample (index {table_index}):")
    visualize_sample(table_index)

## 8. Browse Multiple Samples

In [None]:
def browse_multiple_samples(start_index=0, count=3):
    """Visualize a run of consecutive samples.

    Args:
        start_index: 0-based index of the first sample to show.
        count: Number of samples to show; the run is cut short at the end of
            ``available_images``.
    """
    total = len(available_images)
    if total == 0:
        print("❌ No images available")
        return

    # Exclusive upper bound, clamped to the number of available images
    stop = min(start_index + count, total)
    print(f"🔍 Browsing samples {start_index+1} to {stop}:")

    position = start_index
    while position < stop:
        print(f"\n--- Sample {position+1} ---")
        visualize_sample(position)
        position += 1

# Browse samples 2-4: start_index is 0-based, so 1 is the second sample, and
# each sample is rendered through visualize_sample.
browse_multiple_samples(start_index=1, count=3)