# Data Exploration for Object Detection

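This notebook walks through an exploration workflow for an object detection dataset ahead of model training. **Note:** the statistics and figures below are computed on randomly generated (simulated) annotations and image properties; the COCO loading call is left commented out, so load real annotations with `load_coco_annotations` to analyze an actual dataset.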

In [None]:
# Import libraries
import json
import warnings
from collections import Counter
from pathlib import Path

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
# Silence every warning category for the rest of the session (this also hides
# deprecation/future warnings raised by the plotting and ML libraries).
warnings.simplefilter('ignore')

# Plot appearance: the matplotlib style sheet is applied first, then the
# seaborn colour palette is set on top of it.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

print('Libraries imported successfully')

In [None]:
# Configuration: every path is derived from DATA_DIR so only one value needs
# changing when the data lives elsewhere.
DATA_DIR = Path('../data')
ANNOTATIONS_DIR = DATA_DIR.joinpath('annotations')
IMAGES_DIR = DATA_DIR.joinpath('raw')

# Report which of the expected directories are present
for _label, _directory in (
    ('Data', DATA_DIR),
    ('Annotations', ANNOTATIONS_DIR),
    ('Images', IMAGES_DIR),
):
    print(f"{_label} directory exists: {_directory.exists()}")

## Load Annotations

In [None]:
# Load COCO-style annotations
def load_coco_annotations(annotation_file):
    """Load a COCO-style annotation file and return its parsed JSON content.

    Args:
        annotation_file: Path (``str`` or ``pathlib.Path``) of the JSON file.

    Returns:
        The object decoded by ``json.load``. For COCO files this is expected
        to be a dict with ``images``, ``annotations`` and ``categories``
        keys; the structure is not validated here.

    Raises:
        FileNotFoundError: If ``annotation_file`` does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON is UTF-8 by specification. Without an explicit encoding, open()
    # uses the platform default, which can mis-decode non-ASCII category
    # names (for example on Windows).
    with open(annotation_file, 'r', encoding='utf-8') as f:
        return json.load(f)

# Example loading
# annotations = load_coco_annotations(ANNOTATIONS_DIR / 'train.json')
# print(f"Loaded {len(annotations['images'])} images")
# print(f"Loaded {len(annotations['annotations'])} annotations")
# print(f"Number of categories: {len(annotations['categories'])}")

## Dataset Statistics

In [None]:
# Generate sample dataset statistics (seeded so the simulated data is reproducible)
np.random.seed(42)

# Simulate dataset
num_images = 1000
num_classes = 20
class_names = [f'class_{idx}' for idx in range(num_classes)]


def _simulate_annotation(img_id):
    """Draw one random annotation record for ``img_id``.

    The draws happen in a fixed order (category, then bbox x/y/w/h, then
    area) so the seeded random stream always yields the same records.
    """
    category_id = np.random.randint(0, num_classes)
    bbox = [np.random.rand() * scale for scale in (100, 100, 200, 200)]
    area = np.random.rand() * 10000
    return {
        'image_id': img_id,
        'category_id': category_id,
        'bbox': bbox,
        'area': area,
    }


# Each image receives between 1 and 9 objects (randint's upper bound is exclusive)
annotations_data = [
    _simulate_annotation(img_id)
    for img_id in range(num_images)
    for _ in range(np.random.randint(1, 10))
]

df_annotations = pd.DataFrame(annotations_data)
print(f"Total annotations: {len(df_annotations)}")
print(f"Average objects per image: {len(df_annotations) / num_images:.2f}")

In [None]:
# Class distribution: instances per category id, ordered by id
class_counts = df_annotations['category_id'].value_counts().sort_index()

fig, (ax_bar, ax_pie) = plt.subplots(1, 2, figsize=(15, 6))

# Place each bar at its real category id. Positional x values would shift
# every later bar to the wrong id if some class had no annotations.
ax_bar.bar(class_counts.index, class_counts.values)
ax_bar.set_xlabel('Class ID')
ax_bar.set_ylabel('Number of Instances')
ax_bar.set_title('Class Distribution')
ax_bar.set_xticks(range(0, num_classes, 2))

# class_counts is sorted by id, so slicing [:10] would show the first ten ids
# rather than the ten most frequent classes. Pick the largest counts and label
# each wedge with the id it belongs to.
top_classes = class_counts.nlargest(10)
ax_pie.pie(
    top_classes.values,
    labels=[f'Class {class_id}' for class_id in top_classes.index],
    autopct='%1.1f%%',
)
ax_pie.set_title('Top 10 Classes Distribution')

plt.tight_layout()
plt.show()

In [None]:
# Bounding box size analysis
# bbox entries are [x, y, width, height]; pull out the last two components.
df_annotations['width'] = [box[2] for box in df_annotations['bbox']]
df_annotations['height'] = [box[3] for box in df_annotations['bbox']]
# The small epsilon guards against division by zero for degenerate boxes.
df_annotations['aspect_ratio'] = df_annotations['width'] / (df_annotations['height'] + 1e-6)

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# One histogram per derived column: (column, x-axis label, panel title)
histogram_specs = (
    ('width', 'Width (pixels)', 'Bounding Box Width Distribution'),
    ('height', 'Height (pixels)', 'Bounding Box Height Distribution'),
    ('aspect_ratio', 'Aspect Ratio (W/H)', 'Aspect Ratio Distribution'),
)
for panel, (column, x_label, panel_title) in zip(axes, histogram_specs):
    panel.hist(df_annotations[column], bins=50, edgecolor='black')
    panel.set_xlabel(x_label)
    panel.set_ylabel('Frequency')
    panel.set_title(panel_title)

plt.tight_layout()
plt.show()

## Image Analysis

In [None]:
# Analyze image properties
def analyze_images(image_dir, num_samples=100):
    """Return a DataFrame of simulated per-image properties.

    NOTE: ``image_dir`` is accepted but never read. Widths and heights are
    drawn independently at random from fixed option lists, so the result is
    placeholder data rather than measurements of real images.

    Args:
        image_dir: Unused by the current implementation.
        num_samples: Number of simulated images (rows) to generate.

    Returns:
        DataFrame with columns ``width``, ``height``, ``channels``,
        ``aspect_ratio`` (width / height) and ``total_pixels``
        (width * height).
    """
    width_options = [640, 1280, 1920]
    height_options = [480, 720, 1080]

    def _simulate_one():
        # Width is drawn before height, one pair per image, so a seeded
        # global RNG produces a stable sequence of records.
        w = np.random.choice(width_options)
        h = np.random.choice(height_options)
        return {
            'width': w,
            'height': h,
            'channels': 3,
            'aspect_ratio': w / h,
            'total_pixels': w * h,
        }

    return pd.DataFrame([_simulate_one() for _ in range(num_samples)])

# Build the (simulated) image table and print its summary statistics
df_images = analyze_images(image_dir=IMAGES_DIR)
image_summary = df_images.describe()
print("Image Statistics:")
print(image_summary)

In [None]:
# Image resolution distribution: count how many images share each (width, height)
resolution_counts = (
    df_images
    .groupby(['width', 'height'])
    .size()
    .reset_index(name='count')
)

plt.figure(figsize=(10, 6))
# One marker per distinct resolution, with marker area proportional to its count
for res_w, res_h, n_images in zip(
    resolution_counts['width'],
    resolution_counts['height'],
    resolution_counts['count'],
):
    plt.scatter(res_w, res_h, s=n_images * 50, alpha=0.6)
    plt.text(res_w, res_h, f"{res_w}x{res_h}", fontsize=9, ha='center')

plt.xlabel('Width (pixels)')
plt.ylabel('Height (pixels)')
plt.title('Image Resolution Distribution')
plt.grid(True, alpha=0.3)
plt.show()

## Data Quality Checks

In [None]:
# Check for data quality issues
def check_data_quality(df_annotations):
    """Run basic sanity checks on an annotation table.

    Args:
        df_annotations: DataFrame with a ``bbox`` column holding
            ``[x, y, width, height]`` sequences and a numeric ``area``
            column. An ``aspect_ratio`` column is used when present;
            otherwise the ratio is derived from ``bbox``. The input frame
            is never modified.

    Returns:
        list[str]: One human-readable message per check that found
        offending annotations. The list is empty when every check passes.
    """
    issues = []

    # Check for negative coordinates (only x and y, the first two bbox entries)
    has_negative_xy = df_annotations['bbox'].apply(
        lambda box: any(coord < 0 for coord in box[:2])
    )
    n_negative = int(has_negative_xy.sum())
    if n_negative > 0:
        issues.append(f"Found {n_negative} annotations with negative coordinates")

    # Check for zero area boxes
    n_zero_area = int((df_annotations['area'] <= 0).sum())
    if n_zero_area > 0:
        issues.append(f"Found {n_zero_area} annotations with zero or negative area")

    # Check for very small boxes (zero-area boxes are counted here as well)
    n_small = int((df_annotations['area'] < 100).sum())
    if n_small > 0:
        issues.append(f"Found {n_small} very small boxes (area < 100 pixels)")

    # Check for extreme aspect ratios. The column is computed locally when it
    # is missing, so the function does not depend on another cell having
    # added it first. The 1e-6 epsilon matches the notebook's own
    # aspect-ratio definition.
    if 'aspect_ratio' in df_annotations.columns:
        aspect_ratio = df_annotations['aspect_ratio']
    else:
        aspect_ratio = df_annotations['bbox'].apply(
            lambda box: box[2] / (box[3] + 1e-6)
        )
    n_extreme = int(((aspect_ratio < 0.1) | (aspect_ratio > 10)).sum())
    if n_extreme > 0:
        issues.append(f"Found {n_extreme} boxes with extreme aspect ratios")

    return issues

# Run the checks and print either the list of findings or an all-clear
quality_issues = check_data_quality(df_annotations)
if not quality_issues:
    print("✅ No data quality issues found")
else:
    print("⚠️ Data Quality Issues Found:")
    print("\n".join(f"  - {issue}" for issue in quality_issues))

## Visualization Samples

In [None]:
# Create sample visualizations
def create_sample_image_with_boxes(img_size=(640, 480), num_boxes=5):
    """Create a random-noise image with labelled bounding boxes drawn on it.

    Args:
        img_size: ``(width, height)`` of the image in pixels.
        num_boxes: Number of random boxes to draw.

    Returns:
        ``numpy.ndarray`` of dtype ``uint8`` and shape ``(height, width, 3)``.
    """
    width, height = img_size

    # Image arrays are indexed (row, column, channel), i.e. (height, width, 3),
    # whereas the drawing calls below take points as (x, y). Building the
    # array as (*img_size, 3) would swap the axes and let boxes fall outside
    # the image. randint's upper bound is exclusive, so 256 is needed to cover
    # the full 0..255 range.
    img = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8)

    # endpoint=False: the hsv colormap is cyclic, so sampling both 0 and 1
    # would give the first and last box the same colour.
    colors = plt.cm.hsv(np.linspace(0, 1, num_boxes, endpoint=False))

    for i in range(num_boxes):
        # max(1, ...) keeps randint valid for images narrower/shorter than 100 px
        x1 = int(np.random.randint(0, max(1, width - 100)))
        y1 = int(np.random.randint(0, max(1, height - 100)))
        # Clamp the far corner so the whole box stays inside the image
        x2 = min(x1 + int(np.random.randint(50, 150)), width - 1)
        y2 = min(y1 + int(np.random.randint(50, 150)), height - 1)

        color = tuple(int(c * 255) for c in colors[i][:3])
        cv2.rectangle(img, (x1, y1), (x2, y2), color, 2)
        # Keep the label baseline inside the image when the box touches the top edge
        cv2.putText(img, f'Object {i}', (x1, max(y1 - 5, 12)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

    return img

# Display sample images
# Show a 2x3 grid of freshly generated sample images
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

for sample_number, panel in enumerate(axes, start=1):
    panel.imshow(create_sample_image_with_boxes())
    panel.set_title(f'Sample {sample_number}')
    panel.axis('off')

plt.suptitle('Sample Images with Annotations', fontsize=16)
plt.tight_layout()
plt.show()

## Dataset Split Analysis

In [None]:
# Analyze train/val/test split
split_ratios = {'train': 0.7, 'val': 0.15, 'test': 0.15}
# Reuse the dataset size defined with the simulated data instead of repeating
# the literal, so the split always describes the same dataset.
total_images = num_images

# Truncating every share with int() can leave images unassigned (the counts
# would not add up to total_images). Round all but the last split and give the
# remainder to the last one so the counts always sum to the total.
*leading_splits, last_split = split_ratios
split_counts = {split: round(total_images * split_ratios[split])
                for split in leading_splits}
split_counts[last_split] = total_images - sum(split_counts.values())

# Materialise names and counts as lists: dict views are not sequences, and
# NumPy does not expand them into element arrays.
split_names = list(split_counts.keys())
split_sizes = list(split_counts.values())

# Visualize split
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Pie chart
ax1.pie(split_sizes, labels=split_names,
        autopct='%1.1f%%', startangle=90)
ax1.set_title('Dataset Split Distribution')

# Bar chart
ax2.bar(split_names, split_sizes,
        color=['blue', 'orange', 'green'])
ax2.set_ylabel('Number of Images')
ax2.set_title('Dataset Split Counts')
# Write each count just above its bar
for i, v in enumerate(split_sizes):
    ax2.text(i, v + 10, str(v), ha='center', fontweight='bold')

plt.tight_layout()
plt.show()

print("Dataset Split:")
for split, count in split_counts.items():
    print(f"  {split}: {count} images ({split_ratios[split]*100:.0f}%)")

## Anchor Analysis for Object Detection

In [None]:
# Analyze optimal anchor boxes

# Get all box dimensions as an (n_boxes, 2) array of [width, height]
box_dims = df_annotations[['width', 'height']].values

# Perform K-means clustering
n_anchors = 9
# n_init is pinned explicitly: scikit-learn changed its default (10 -> 'auto'),
# so leaving it implicit makes the resulting anchors depend on the installed version.
kmeans = KMeans(n_clusters=n_anchors, n_init=10, random_state=42)
kmeans.fit(box_dims)

# Get anchor boxes (cluster centres are [width, height] pairs)
anchors = kmeans.cluster_centers_
anchors = anchors[anchors[:, 0].argsort()]  # Sort by width

# Visualize anchors
fig, (ax_scatter, ax_shapes) = plt.subplots(1, 2, figsize=(12, 6))

ax_scatter.scatter(box_dims[:, 0], box_dims[:, 1], alpha=0.3, s=1)
ax_scatter.scatter(anchors[:, 0], anchors[:, 1], c='red', s=100, marker='x', linewidths=3)
ax_scatter.set_xlabel('Width')
ax_scatter.set_ylabel('Height')
ax_scatter.set_title('Anchor Boxes from K-means Clustering')
ax_scatter.grid(True, alpha=0.3)

# Draw every anchor at true scale from a shared origin with an equal aspect
# ratio. Stacking them one axis unit apart made boxes many units tall overlap
# their neighbours and get clipped by the axis limits.
anchor_colors = plt.cm.viridis(np.linspace(0, 1, n_anchors))
for i, ((w, h), color) in enumerate(zip(anchors, anchor_colors)):
    ax_shapes.add_patch(
        plt.Rectangle((0, 0), w, h, linewidth=2, edgecolor=color,
                      facecolor='none', label=f'{i+1}: {w:.0f}x{h:.0f}')
    )

ax_shapes.set_xlim(0, anchors[:, 0].max() * 1.1)
ax_shapes.set_ylim(0, anchors[:, 1].max() * 1.1)
ax_shapes.set_aspect('equal')
ax_shapes.set_xlabel('Width (pixels)')
ax_shapes.set_ylabel('Height (pixels)')
ax_shapes.set_title('Anchor Box Shapes')
ax_shapes.grid(True, alpha=0.3)
ax_shapes.legend(title='Anchor', fontsize=8, loc='center left', bbox_to_anchor=(1.02, 0.5))

plt.tight_layout()
plt.show()

print("Optimal Anchor Boxes:")
for i, (w, h) in enumerate(anchors):
    print(f"  Anchor {i+1}: {w:.1f} x {h:.1f} (aspect ratio: {w/h:.2f})")

## Summary and Recommendations

In [None]:
# Generate summary report
print("="*60)
print("DATASET EXPLORATION SUMMARY")
print("="*60)

print("\n📊 Dataset Statistics:")
print(f"  • Total images: {num_images}")
print(f"  • Total annotations: {len(df_annotations)}")
print(f"  • Number of classes: {num_classes}")
print(f"  • Avg objects per image: {len(df_annotations)/num_images:.2f}")

print("\n📐 Bounding Box Statistics:")
print(f"  • Mean width: {df_annotations['width'].mean():.1f} pixels")
print(f"  • Mean height: {df_annotations['height'].mean():.1f} pixels")
print(f"  • Mean aspect ratio: {df_annotations['aspect_ratio'].mean():.2f}")

print("\n🎯 Recommendations:")
print("  1. Consider data augmentation for underrepresented classes")
print("  2. Use the calculated anchor boxes for better detection")
print("  3. Apply image resizing to standard dimensions (640x640 or 1280x1280)")
print("  4. Implement class balancing strategies during training")
print("  5. Consider removing or fixing annotations with quality issues")

# Only declare the dataset ready when the quality checks found nothing.
# Printing the success line unconditionally would contradict the warnings
# reported in the data quality section.
if quality_issues:
    print(f"\n⚠️ {len(quality_issues)} data quality check(s) reported issues - review them before training.")
else:
    print("\n✅ Dataset is ready for model training!")