# AutoVI Dataset Exploration

This notebook explores the AutoVI visual inspection dataset for federated learning experiments.

## Contents
1. Dataset Loading
2. Image Dimensions Analysis
3. Class Distribution
4. Defect Type Analysis
5. FL Partitioning Preview

In [None]:
import sys
sys.path.insert(0, '..')

import json
from pathlib import Path
from collections import Counter

import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

from src.data import (
    AutoVIDataset,
    CATEGORIES,
    SMALL_OBJECTS,
    LARGE_OBJECTS,
)
from src.data.partitioner import (
    IIDPartitioner,
    CategoryPartitioner,
    compute_partition_stats,
)

In [None]:
# Configuration
# The dataset directory is located relative to the notebook's parent directory.
project_root = Path('..').resolve()
DATA_ROOT = project_root / "dataset"

# Echo the settings once so the rendered notebook records what was used.
print(
    f"Dataset root: {DATA_ROOT}",
    f"Categories: {CATEGORIES}",
    f"Small objects (400x400): {SMALL_OBJECTS}",
    f"Large objects (1000x750): {LARGE_OBJECTS}",
    sep="\n",
)

## 1. Dataset Loading

In [None]:
# Load train and test datasets
# Both splits are read from the same DATA_ROOT; only the split name differs.
train_dataset, test_dataset = (
    AutoVIDataset(root_dir=DATA_ROOT, split=split_name)
    for split_name in ('train', 'test')
)

# Report how many samples each split contains.
print(
    f"Train samples: {len(train_dataset)}",
    f"Test samples: {len(test_dataset)}",
    sep="\n",
)

In [None]:
# Get statistics
# train_stats / test_stats are reused by the class-distribution cells further down.
train_stats = train_dataset.get_statistics()
test_stats = test_dataset.get_statistics()

# Pretty-print each statistics structure as indented JSON under its own heading.
for heading, stats in (
    ("\nTrain Statistics:", train_stats),
    ("\nTest Statistics:", test_stats),
):
    print(heading)
    print(json.dumps(stats, indent=2))

## 2. Image Dimensions Analysis

In [None]:
# Sample images to check dimensions
# Records the size of the first training image of each category, keyed by category name.
dimensions = {}
for category in CATEGORIES:
    cat_dataset = AutoVIDataset(root_dir=DATA_ROOT, categories=[category], split='train')
    if len(cat_dataset) == 0:
        # Nothing to measure for this category.
        continue
    first_image = cat_dataset[0]['image']
    dimensions[category] = first_image.size
    print(f"{category}: {first_image.size} (WxH)")

In [None]:
# Display sample images from each category
# The grid is sized from len(CATEGORIES) rather than a fixed 2x3 layout, so a
# category list longer than six entries no longer indexes past the end of `axes`.
n_cols = 3
n_rows = max(1, -(-len(CATEGORIES) // n_cols))  # ceiling division, at least one row

# squeeze=False always returns a 2-D array of axes, so flatten() works for any grid shape.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
axes = axes.flatten()

for idx, category in enumerate(CATEGORIES):
    cat_dataset = AutoVIDataset(root_dir=DATA_ROOT, categories=[category], split='train')
    if len(cat_dataset) > 0:
        # Show the first training sample as the representative image.
        sample = cat_dataset[0]
        axes[idx].imshow(sample['image'])
        axes[idx].set_title(f"{category}\n{sample['image'].size}")

# Hide frames and ticks on every panel, including panels left without an image
# (empty categories or unused grid slots), so no blank framed boxes are drawn.
for ax in axes:
    ax.axis('off')

plt.suptitle('Sample Images from Each Category', fontsize=14)
plt.tight_layout()
plt.show()

## 3. Class Distribution

In [None]:
# Create distribution summary table
# Fixed-width table: one row per category followed by a totals row.
rule = "=" * 80
print("\n" + rule)
print(f"{'Category':<20} {'Train':>10} {'Test Good':>12} {'Test Defect':>12} {'Total':>10}")
print(rule)

# Running column totals for the final row.
total_train = total_test_good = total_test_defect = 0

for category in CATEGORIES:
    # A category or label missing from the statistics is counted as zero.
    train_entry = train_stats['by_category'].get(category, {})
    test_entry = test_stats['by_category'].get(category, {})
    n_train = train_entry.get('good', 0)
    n_test_good = test_entry.get('good', 0)
    n_test_defect = test_entry.get('defective', 0)
    row_total = n_train + n_test_good + n_test_defect

    total_train += n_train
    total_test_good += n_test_good
    total_test_defect += n_test_defect

    print(f"{category:<20} {n_train:>10} {n_test_good:>12} {n_test_defect:>12} {row_total:>10}")

print(rule)
grand_total = total_train + total_test_good + total_test_defect
print(f"{'TOTAL':<20} {total_train:>10} {total_test_good:>12} {total_test_defect:>12} {grand_total:>10}")

In [None]:
# Visualize class distribution
# Left panel: training samples per category. Right panel: good vs. defective
# test samples per category, drawn as side-by-side bars.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

train_by_category = train_stats['by_category']
test_by_category = test_stats['by_category']

# Train distribution by category
categories = list(train_by_category.keys())
# Missing labels count as zero, matching the summary table, instead of raising KeyError.
train_counts = [train_by_category[cat].get('good', 0) for cat in categories]

ax1.bar(range(len(categories)), train_counts, color='steelblue')
ax1.set_xticks(range(len(categories)))
ax1.set_xticklabels(categories, rotation=45, ha='right')
ax1.set_ylabel('Number of Samples')
ax1.set_title('Training Set Distribution by Category')

# Test distribution (good vs defective)
# A category present in the train statistics but absent from the test
# statistics is plotted as zero rather than aborting the cell.
test_good_counts = [test_by_category.get(cat, {}).get('good', 0) for cat in categories]
test_defect_counts = [test_by_category.get(cat, {}).get('defective', 0) for cat in categories]

x = np.arange(len(categories))
width = 0.35  # bar width; the two series are offset by half a width each way

ax2.bar(x - width/2, test_good_counts, width, label='Good', color='green', alpha=0.7)
ax2.bar(x + width/2, test_defect_counts, width, label='Defective', color='red', alpha=0.7)
ax2.set_xticks(x)
ax2.set_xticklabels(categories, rotation=45, ha='right')
ax2.set_ylabel('Number of Samples')
ax2.set_title('Test Set Distribution by Category')
ax2.legend()

plt.tight_layout()
plt.show()

## 4. Defect Type Analysis

In [None]:
# Analyze defect types
# For every category, tally the defect labels found in its test split and
# list them from most to least frequent.
print("\nDefect Types by Category:")
print("=" * 60)

for category in CATEGORIES:
    cat_dataset = AutoVIDataset(root_dir=DATA_ROOT, categories=[category], split='test')
    # Index 3 of each sample entry is tallied as the defect label; None entries are skipped.
    # NOTE(review): presumably index 3 is the defect-type field — confirm against AutoVIDataset.samples.
    defect_counts = Counter(
        entry[3] for entry in cat_dataset.samples if entry[3] is not None
    )

    print(f"\n{category}:")
    if not defect_counts:
        print("  No defects found")
        continue
    for defect, count in defect_counts.most_common():
        print(f"  - {defect}: {count}")

In [None]:
# Display defect examples for engine_wiring
# Shows one example image for up to four defect types of a single category.
category = 'engine_wiring'
cat_dataset = AutoVIDataset(root_dir=DATA_ROOT, categories=[category], split='test')

# Get unique defect types
# Sorted so the panel order, and which four types are shown, is the same on
# every run; iterating a bare set can change order between kernel sessions.
# NOTE(review): index 3 of a sample entry is presumably the defect-type field — confirm.
defect_types = sorted(
    {s[3] for s in cat_dataset.samples if s[3] is not None},
    key=str,
)
shown_types = defect_types[:4]  # cap the figure at four panels

if shown_types:
    # squeeze=False always yields an array of axes, even for a single panel.
    fig, axes = plt.subplots(1, len(shown_types), figsize=(16, 4), squeeze=False)
    axes = axes.flatten()

    for ax, defect_type in zip(axes, shown_types):
        defect_indices = cat_dataset.get_defect_indices(defect_type)
        if defect_indices:
            # Use the first matching sample as the example for this defect type.
            sample = cat_dataset[defect_indices[0]]
            ax.imshow(sample['image'])
            ax.set_title(f"{defect_type}")
        # Hide frame and ticks whether or not an example was found, so a panel
        # without an image does not appear as an empty framed box.
        ax.axis('off')

    plt.suptitle(f'Defect Examples: {category}', fontsize=14)
    plt.tight_layout()
    plt.show()

## 5. FL Partitioning Preview

In [None]:
# IID Partitioning
# Split the training set across 5 clients with a fixed seed, then summarise
# what each client received.
iid_partitioner = IIDPartitioner(num_clients=5, seed=42)
iid_partition = iid_partitioner.partition(train_dataset)
iid_stats = compute_partition_stats(train_dataset, iid_partition)

print("IID Partition Statistics:")
print("=" * 60)
for client_id, client_data in iid_stats['clients'].items():
    # One header line per client followed by one indented line per category.
    report = [f"\nClient {client_id}: {client_data['num_samples']} samples"]
    report.extend(
        f"  {cat}: {count}" for cat, count in client_data['by_category'].items()
    )
    print("\n".join(report))

In [None]:
# Category-based (Non-IID) Partitioning
# Partition the training set with CategoryPartitioner and summarise each client.
category_partitioner = CategoryPartitioner(seed=42)
category_partition = category_partitioner.partition(train_dataset)
category_stats = compute_partition_stats(train_dataset, category_partition)

print("Category-based (Non-IID) Partition Statistics:")
print("=" * 60)

# Display label for each client, keyed by integer client id (0-4).
client_roles = dict(enumerate((
    "Engine Assembly",
    "Underbody Line",
    "Fastener Station",
    "Clip Inspection",
    "Quality Control",
)))

for client_id, client_data in category_stats['clients'].items():
    # Ids are converted with int() before the lookup; unlisted ids are labelled "Unknown".
    role = client_roles.get(int(client_id), "Unknown")
    report = [f"\nClient {client_id} ({role}): {client_data['num_samples']} samples"]
    report.extend(
        f"  {cat}: {count}" for cat, count in client_data['by_category'].items()
    )
    print("\n".join(report))

In [None]:
# Visualize partition distributions
# Left: total samples per client under the IID split.
# Right: per-client samples under the category split, stacked by category.
fig, (iid_ax, cat_ax) = plt.subplots(1, 2, figsize=(14, 5))

# IID partition
iid_clients = list(iid_stats['clients'])
iid_totals = [iid_stats['clients'][client]['num_samples'] for client in iid_clients]
iid_ax.bar(iid_clients, iid_totals, color='steelblue')
iid_ax.set(
    xlabel='Client ID',
    ylabel='Number of Samples',
    title='IID Partition: Samples per Client',
)

# Category partition with stacked bars
cat_clients = list(category_stats['clients'])
palette = plt.cm.Set3(np.linspace(0, 1, len(CATEGORIES)))  # one colour per category
stack_base = np.zeros(len(cat_clients))  # running top of each client's stack

for cat, colour in zip(CATEGORIES, palette):
    # A category a client does not hold contributes a zero-height segment.
    heights = np.array([
        category_stats['clients'][client]['by_category'].get(cat, 0)
        for client in cat_clients
    ])
    cat_ax.bar(cat_clients, heights, bottom=stack_base, label=cat, color=colour)
    stack_base = stack_base + heights

cat_ax.set(
    xlabel='Client ID',
    ylabel='Number of Samples',
    title='Category Partition: Samples per Client',
)
# Legend is anchored outside the axes, to the right.
cat_ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

## Summary

Key findings from the AutoVI dataset exploration:

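1. **Dataset Size**: Approximately 1,500 training samples and 2,400 test samples in total (the exact counts are printed in Section 1).
2. **Categories**: 6 object types with varying defect types (listed per category in Section 4).
3. **Image Sizes**: Small objects are 400x400 and large objects are 1000x750.
4. **Class Imbalance**: Sample counts differ across categories; see the summary table and bar charts in Section 3.
5. **FL Partitioning**: Both an IID strategy and a category-based (non-IID) strategy are previewed in Section 5.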