# Traffic Object Detection - Data Exploration

This notebook provides tools for exploring and analyzing traffic object detection datasets.

In [None]:
import os
import sys
import json
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
from PIL import Image

# Add src to path
sys.path.append('../src')

from datasets import TrafficDataset, get_transforms
from utils.data_utils import verify_dataset, create_class_mapping
from utils.visualization import plot_class_distribution

## Dataset Configuration

In [None]:
# Everything below is resolved relative to this dataset root directory.
DATASET_ROOT = "../data/traffic"

# Directory that the annotation filenames are joined onto.
IMAGES_DIR = os.path.join(DATASET_ROOT, "images")

# One JSON annotation file per split.
TEST_ANNOTATIONS = os.path.join(DATASET_ROOT, "test_annotations.json")
VAL_ANNOTATIONS = os.path.join(DATASET_ROOT, "val_annotations.json")
TRAIN_ANNOTATIONS = os.path.join(DATASET_ROOT, "train_annotations.json")

## Load and Analyze Dataset

In [None]:
# Load annotations
def load_annotations(ann_file):
    """Read an annotation file and return its parsed JSON content.

    Args:
        ann_file: Path to the JSON annotation file.

    Returns:
        The object produced by ``json.load`` for the file's content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, which would corrupt or reject non-ASCII text on some systems.
    with open(ann_file, 'r', encoding='utf-8') as f:
        return json.load(f)

# Load every split whose annotation file is present on disk; report the rest
splits = {}
annotation_files = {
    "train": TRAIN_ANNOTATIONS,
    "val": VAL_ANNOTATIONS,
    "test": TEST_ANNOTATIONS,
}
for split_name, ann_file in annotation_files.items():
    if not os.path.exists(ann_file):
        print(f"{split_name.capitalize()} annotations not found: {ann_file}")
        continue
    splits[split_name] = load_annotations(ann_file)
    print(f"{split_name.capitalize()} split: {len(splits[split_name])} images")

## Dataset Statistics

In [None]:
def analyze_split(data, split_name):
    """Print summary statistics for one split and return its class counts.

    Args:
        data: Mapping of image id to an image record. Each record is read
            through its ``'objects'`` list, whose entries carry a ``'class'``
            key.
        split_name: Label used in the printed header.

    Returns:
        A ``Counter`` mapping each class label to its number of objects.
    """
    total_images = len(data)
    empty_images = 0

    # Class distribution, gathered in the same pass as the empty-image count
    class_counts = Counter()
    for img_data in data.values():
        objects = img_data['objects']
        if not objects:
            empty_images += 1
        class_counts.update(obj['class'] for obj in objects)

    # Every object contributes exactly one count, so this equals the sum of
    # the per-image object list lengths.
    total_objects = sum(class_counts.values())

    # A split without images would otherwise raise ZeroDivisionError below.
    if total_images:
        empty_pct = empty_images / total_images * 100
        avg_objects = total_objects / total_images
    else:
        empty_pct = 0.0
        avg_objects = 0.0

    print(f"\n{split_name.upper()} SPLIT STATISTICS:")
    print(f"Total images: {total_images}")
    print(f"Total objects: {total_objects}")
    print(f"Empty images: {empty_images} ({empty_pct:.1f}%)")
    print(f"Avg objects per image: {avg_objects:.2f}")
    print(f"Classes found: {len(class_counts)}")

    return class_counts

# Per-split class counts, keyed by split name (statistics are printed as a
# side effect of each analyze_split call)
split_stats = {
    split_name: analyze_split(data, split_name)
    for split_name, data in splits.items()
}

## Class Distribution Visualization

In [None]:
# Draw one class-distribution chart per split; splits with no counted
# objects are skipped
for split_name, class_counts in split_stats.items():
    if not class_counts:
        continue
    fig = plot_class_distribution(dict(class_counts), top_n=15)
    plt.title(f'Class Distribution - {split_name.upper()} Split')
    plt.show()

## Sample Images Visualization

In [None]:
def visualize_sample_images(data, images_dir, num_samples=6):
    """
    Visualize randomly chosen images with their annotated boxes drawn on top.

    Args:
        data: Mapping of image id to an image record with ``'filename'`` and
            ``'objects'`` entries. Each object has a ``'class'`` label and a
            ``'bbox'`` that is unpacked as ``[x, y, w, h]``.
        images_dir: Directory that the record filenames are joined onto.
        num_samples: Maximum number of images to draw; fewer are shown when
            ``data`` holds fewer records.
    """
    import math
    import random
    import zlib

    import matplotlib.patches as patches

    # Get random samples
    sample_ids = random.sample(list(data), min(num_samples, len(data)))

    # Size the grid from the actual sample count so that num_samples > 6 no
    # longer indexes past a fixed 2x3 grid. Three columns at 5 inches per row
    # reproduces the previous 15x10 figure for six samples.
    n_cols = 3
    n_rows = max(1, math.ceil(len(sample_ids) / n_cols))
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False
    )
    axes = axes.flatten()

    colors = plt.cm.Set3(np.linspace(0, 1, 10))

    def class_color(label):
        # The built-in hash() of a str is salted per interpreter run, so it
        # would give a class different colours in different sessions; CRC32
        # of the label text is stable.
        digest = zlib.crc32(str(label).encode('utf-8'))
        return colors[digest % len(colors)]

    for ax, img_id in zip(axes, sample_ids):
        img_data = data[img_id]
        img_path = os.path.join(images_dir, img_data['filename'])

        if not os.path.exists(img_path):
            ax.text(0.5, 0.5, f"Image not found:\n{img_path}",
                    ha='center', va='center', transform=ax.transAxes)
            ax.axis('off')
            continue

        # Load and display image; the context manager releases the file
        # handle once the pixels have been handed to matplotlib.
        with Image.open(img_path) as img:
            ax.imshow(img)
        ax.set_title(f"Image: {img_data['filename']}\nObjects: {len(img_data['objects'])}")
        ax.axis('off')

        # Draw bounding boxes
        for obj in img_data['objects']:
            x, y, w, h = obj['bbox']  # [x, y, w, h]

            # Choose color based on class
            color = class_color(obj['class'])

            # Draw rectangle
            rect = patches.Rectangle(
                (x, y), w, h,
                linewidth=2, edgecolor=color, facecolor='none'
            )
            ax.add_patch(rect)

            # Add label just above the box's top-left corner
            ax.text(
                x, y-5, obj['class'],
                fontsize=8, color='white',
                bbox=dict(boxstyle="round,pad=0.3", facecolor=color, alpha=0.7)
            )

    # Hide grid cells that received no sample instead of showing empty axes
    for ax in axes[len(sample_ids):]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

# Render a sample grid for every loaded split
for split_name in splits:
    print(f"\nSample images from {split_name} split:")
    visualize_sample_images(splits[split_name], IMAGES_DIR)

## Bounding Box Analysis

In [None]:
def analyze_bounding_boxes(data):
    """
    Gather size measurements for every annotated bounding box.

    Each object's ``'bbox'`` is unpacked as four values; only the last two
    (width and height) are used. Returns a dict with the lists ``'areas'``
    (width * height), ``'widths'``, ``'heights'`` and ``'aspect_ratios'``
    (width / height, computed only for boxes whose height is positive).
    """
    # Collect (width, height) once, in annotation order
    sizes = []
    for record in data.values():
        for annotation in record['objects']:
            _, _, width, height = annotation['bbox']
            sizes.append((width, height))

    return {
        'areas': [width * height for width, height in sizes],
        'widths': [width for width, _ in sizes],
        'heights': [height for _, height in sizes],
        # Boxes with non-positive height are left out to avoid dividing by zero
        'aspect_ratios': [width / height for width, height in sizes if height > 0],
    }

# Histogram and summarize the bounding boxes of the train split, if loaded
if 'train' in splits:
    bbox_stats = analyze_bounding_boxes(splits['train'])

    fig, axes = plt.subplots(2, 2, figsize=(12, 8))

    # (stats key, panel title, x-axis label), listed in row-major panel order
    panels = [
        ('areas', 'Bounding Box Areas', 'Area (pixels²)'),
        ('widths', 'Bounding Box Widths', 'Width (pixels)'),
        ('heights', 'Bounding Box Heights', 'Height (pixels)'),
        ('aspect_ratios', 'Aspect Ratios (W/H)', 'Aspect Ratio'),
    ]
    for ax, (key, title, xlabel) in zip(axes.flat, panels):
        ax.hist(bbox_stats[key], bins=50, alpha=0.7)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Count')

    plt.tight_layout()
    plt.show()

    # Mean and standard deviation of each measurement
    print(f"Bounding Box Statistics:")
    print(f"Area - Mean: {np.mean(bbox_stats['areas']):.1f}, Std: {np.std(bbox_stats['areas']):.1f}")
    print(f"Width - Mean: {np.mean(bbox_stats['widths']):.1f}, Std: {np.std(bbox_stats['widths']):.1f}")
    print(f"Height - Mean: {np.mean(bbox_stats['heights']):.1f}, Std: {np.std(bbox_stats['heights']):.1f}")
    print(f"Aspect Ratio - Mean: {np.mean(bbox_stats['aspect_ratios']):.2f}, Std: {np.std(bbox_stats['aspect_ratios']):.2f}")

## Data Quality Check

In [None]:
# Check for missing images and other data quality issues
def check_data_quality(data, images_dir, known_classes=None):
    """Collect common annotation problems for one split.

    Args:
        data: Mapping of image id to an image record with ``'filename'`` and
            ``'objects'`` entries. Each object has a ``'class'`` label and a
            ``'bbox'`` expected to hold ``[x, y, w, h]``.
        images_dir: Directory that the record filenames are joined onto.
        known_classes: Optional iterable of accepted class labels. When
            omitted, the built-in traffic label set below is used.

    Returns:
        A dict with four lists:
        ``'missing_images'`` (filenames not found under ``images_dir``),
        ``'empty_annotations'`` (filenames with no objects),
        ``'invalid_bboxes'`` (``(filename, bbox)`` pairs) and
        ``'unknown_classes'`` (``(filename, class)`` pairs).
    """
    issues = {
        'missing_images': [],
        'empty_annotations': [],
        'invalid_bboxes': [],
        'unknown_classes': []
    }

    if known_classes is None:
        known_classes = {
            'car', 'truck', 'bus', 'motorcycle', 'bicycle',
            'person', 'traffic_light', 'traffic_sign', 'stop_sign'
        }
    else:
        # Copy into a set so any iterable works and membership tests stay O(1)
        known_classes = set(known_classes)

    for img_data in data.values():
        filename = img_data['filename']
        img_path = os.path.join(images_dir, filename)

        # Check if image exists
        if not os.path.exists(img_path):
            issues['missing_images'].append(filename)

        # Check for empty annotations
        if not img_data['objects']:
            issues['empty_annotations'].append(filename)

        # Check bounding boxes and classes
        for obj in img_data['objects']:
            bbox = obj['bbox']

            # A box with the wrong number of values or non-numeric entries is
            # itself a quality problem: report it rather than crash the check.
            try:
                x, y, w, h = bbox
                is_invalid = w <= 0 or h <= 0 or x < 0 or y < 0
            except (TypeError, ValueError):
                is_invalid = True

            if is_invalid:
                issues['invalid_bboxes'].append((filename, bbox))

            # Check for unknown classes
            if obj['class'] not in known_classes:
                issues['unknown_classes'].append((filename, obj['class']))

    return issues

# Run the quality check on every loaded split and print a short report
for split_name, data in splits.items():
    print(f"\nData Quality Check - {split_name.upper()} Split:")
    issues = check_data_quality(data, IMAGES_DIR)

    missing = issues['missing_images']
    unknown = issues['unknown_classes']

    print(f"Missing images: {len(missing)}")
    print(f"Empty annotations: {len(issues['empty_annotations'])}")
    print(f"Invalid bounding boxes: {len(issues['invalid_bboxes'])}")
    print(f"Unknown classes: {len(unknown)}")

    # Show at most five of the missing filenames
    if missing:
        print(f"  Sample missing images: {missing[:5]}")

    # Each unknown label is listed once, however many objects use it
    if unknown:
        unknown_class_names = list({cls for _, cls in unknown})
        print(f"  Unknown classes found: {unknown_class_names}")

## Dataset Summary

In [None]:
# Create comprehensive summary across every loaded split
print("\n" + "="*50)
print("DATASET SUMMARY")
print("="*50)

total_images = sum(len(data) for data in splits.values())
total_objects = sum(sum(len(img_data['objects']) for img_data in data.values()) for data in splits.values())

print(f"Total images across all splits: {total_images}")
print(f"Total objects across all splits: {total_objects}")

# Combined class distribution
all_classes = Counter()
for split_stats_dict in split_stats.values():
    all_classes.update(split_stats_dict)

print(f"\nClass distribution across all splits:")
# The loop body only runs when at least one object was counted, so
# total_objects is non-zero whenever the division below executes.
for class_name, count in all_classes.most_common():
    percentage = count / total_objects * 100
    print(f"  {class_name}: {count} ({percentage:.1f}%)")

# Only declare readiness when at least one image was actually loaded;
# previously this was printed even if no annotation file had been found.
if total_images:
    print(f"\nDataset is ready for training!")
else:
    print("\nNo images were loaded - check the annotation paths before training.")