# Conveyor Object Detection - Data Analysis & Error Analysis

This notebook provides a comprehensive analysis of the dataset and of the model's performance for the conveyor object detection system.


In [None]:
# Import required libraries
import sys
from pathlib import Path
sys.path.append(str(Path.cwd().parent))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
import yaml
import json
from collections import Counter
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

from src.data.dataset import DatasetManager
from src.utils.visualization import DataAnalysisVisualizer, MetricsVisualizer
from src.models.yolo_model import YOLOModelManager
from ultralytics import YOLO

# Plot styling: start from matplotlib's default style and use seaborn's "husl" palette.
plt.style.use('default')
sns.set_palette("husl")
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Do not truncate wide DataFrames: show every column when a frame is displayed.
pd.set_option('display.max_columns', None)

# Simple confirmation that the import/configuration cell ran to completion.
print("Libraries imported successfully!")

## 1. Dataset Analysis

In [None]:
# Load the YOLO dataset configuration, print it, and summarise every split.
#
# Produces `analysis` for the following cells: the result of
# `DatasetManager.analyze_dataset`, or None when the dataset file is missing.
dataset_manager = DatasetManager()
dataset_yaml_path = "../data/processed/yolo_dataset/dataset.yaml"

# Bind `analysis` before anything can fail so that later cells, which test
# `if analysis:`, never hit a NameError when this cell stops part-way through.
analysis = None

try:
    # Explicit encoding: the platform default may not be UTF-8 (e.g. on Windows),
    # which breaks on non-ASCII class names.
    with open(dataset_yaml_path, 'r', encoding='utf-8') as f:
        dataset_config = yaml.safe_load(f)

    class_names = dataset_config['names']
    # Fall back to the number of names when the file has no explicit `nc` entry.
    num_classes = dataset_config.get('nc', len(class_names))

    print("Dataset Configuration:")
    print(f"  Path: {dataset_config['path']}")
    print(f"  Classes: {class_names}")
    print(f"  Number of classes: {num_classes}")

    analysis = dataset_manager.analyze_dataset(dataset_yaml_path)

    print("\nDataset Analysis Summary:")
    for split, stats in analysis['splits'].items():
        num_images = stats['num_images']
        print(f"  {split.upper()} Split:")
        print(f"    Images: {num_images}")
        print(f"    Labels: {stats['num_labels']}")
        print(f"    Total Objects: {stats['total_objects']}")
        # An empty split would otherwise raise ZeroDivisionError here.
        if num_images:
            print(f"    Objects per Image: {stats['total_objects'] / num_images:.2f}")
        else:
            print("    Objects per Image: n/a (no images in this split)")

except FileNotFoundError:
    # `analysis` stays None, which the downstream cells treat as "skip".
    print(f"Dataset file not found: {dataset_yaml_path}")
    print("Please run the data preparation script first.")

## 2. Class Distribution Analysis

In [None]:
# Plot the per-class instance counts (bar + pie) and report the class imbalance.
# Skipped entirely when the dataset analysis is unavailable.
if analysis:
    class_dist = analysis['class_distribution']

    classes = list(class_dist.keys())
    counts = list(class_dist.values())
    total_objects = sum(counts)

    if total_objects == 0:
        # With no labelled objects there is nothing to plot: min()/max() raise
        # ValueError on an empty list, and the pie chart has no non-zero total
        # to normalise the wedges by.
        print("\nClass Imbalance Analysis:")
        print(f"  Total objects: {total_objects}")
        print("  No labelled objects found - skipping class distribution plots.")
    else:
        fig, axes = plt.subplots(1, 2, figsize=(15, 6))

        bars = axes[0].bar(classes, counts, color=['skyblue', 'lightcoral'])
        axes[0].set_title('Overall Class Distribution', fontsize=14, fontweight='bold')
        axes[0].set_xlabel('Classes')
        axes[0].set_ylabel('Number of Instances')

        # Place each count slightly above its bar; the offset is loop-invariant.
        label_offset = max(counts) * 0.01
        for bar, count in zip(bars, counts):
            axes[0].text(bar.get_x() + bar.get_width() / 2, bar.get_height() + label_offset,
                         str(count), ha='center', va='bottom', fontweight='bold')

        axes[1].pie(counts, labels=classes, autopct='%1.1f%%', startangle=90,
                    colors=['skyblue', 'lightcoral'])
        axes[1].set_title('Class Distribution (Percentage)', fontsize=14, fontweight='bold')

        plt.tight_layout()
        plt.show()

        # Ratio of the most to the least frequent class; infinite when some
        # class has no instances at all.
        rarest = min(counts)
        imbalance_ratio = max(counts) / rarest if rarest > 0 else float('inf')

        print("\nClass Imbalance Analysis:")
        print(f"  Total objects: {total_objects}")
        print(f"  Imbalance ratio: {imbalance_ratio:.2f}")

        if imbalance_ratio > 2:
            print("  ⚠️  Significant class imbalance detected!")
            print("  Consider using class weights or data augmentation.")
        else:
            print("  ✅ Classes are relatively balanced.")

## 3. Model Performance Analysis

In [None]:
# Locate the most recently modified results.csv under ../runs/train, print the
# final-epoch metrics and draw the training curves in a 2x2 grid.
results_dir = Path("../runs/train")
if results_dir.exists():
    metrics_files = list(results_dir.glob("**/results.csv"))
else:
    metrics_files = []

if not metrics_files:
    print("No training results found. Train a model first.")
else:
    latest_results = max(metrics_files, key=lambda csv_path: csv_path.stat().st_mtime)
    print(f"Loading training results from: {latest_results}")

    try:
        metrics_df = pd.read_csv(latest_results)
        # Column headers are whitespace-stripped before any lookup below.
        metrics_df.columns = metrics_df.columns.str.strip()

        print(f"\nTraining completed after {len(metrics_df)} epochs")

        final_metrics = metrics_df.iloc[-1]
        print("\n📈 Final Training Metrics:")

        metric_names = {
            'metrics/precision(B)': 'Precision',
            'metrics/recall(B)': 'Recall',
            'metrics/mAP50(B)': 'mAP@0.5',
            'metrics/mAP50-95(B)': 'mAP@0.5:0.95'
        }

        # Only report the metrics that are actually present in the file.
        for column, display_name in metric_names.items():
            if column in final_metrics.index:
                print(f"  {display_name}: {final_metrics[column]:.4f}")

        # Plot training curves
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        fig.suptitle('Training Metrics Over Time', fontsize=16, fontweight='bold')

        # One entry per panel: grid position, title, whether to draw a legend,
        # and the curves as (column, line format, legend label or None).
        panel_specs = [
            ((0, 0), 'Box Loss', True, [
                ('train/box_loss', 'b-', 'Train'),
                ('val/box_loss', 'r-', 'Validation'),
            ]),
            ((0, 1), 'Precision & Recall', True, [
                ('metrics/precision(B)', 'g-', 'Precision'),
                ('metrics/recall(B)', 'orange', 'Recall'),
            ]),
            ((1, 0), 'mAP@0.5', True, [
                ('metrics/mAP50(B)', 'purple', 'mAP@0.5'),
            ]),
            ((1, 1), 'Learning Rate', False, [
                ('lr/pg0', 'red', None),
            ]),
        ]

        for position, title, show_legend, curves in panel_specs:
            # A panel is drawn only when every column it needs is available;
            # otherwise its axes are left empty.
            if not all(needed in metrics_df.columns for needed, _fmt, _label in curves):
                continue

            panel = axes[position]
            for column, line_format, legend_label in curves:
                label_kwargs = {} if legend_label is None else {'label': legend_label}
                panel.plot(metrics_df['epoch'], metrics_df[column], line_format, **label_kwargs)
            panel.set_title(title)
            if show_legend:
                panel.legend()
            panel.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

    except Exception as e:
        print(f"Error loading metrics: {e}")

## 4. Error Analysis and Recommendations

In [None]:
# Print a data-quality summary derived from `analysis` (when available),
# followed by a fixed checklist of recommendations.

# Dataset-size thresholds (in images) used for the data-quality verdict.
SMALL_DATASET_MAX_IMAGES = 1000
MODERATE_DATASET_MAX_IMAGES = 5000

print("🔍 Error Analysis and Recommendations:")
print("\n1. Data Quality:")
if analysis:
    total_images = sum(stats['num_images'] for stats in analysis['splits'].values())
    total_objects = sum(analysis['class_distribution'].values())
    print(f"   • Dataset size: {total_images} images, {total_objects} objects")
    # Guard the average: a dataset with no images would raise ZeroDivisionError.
    if total_images > 0:
        print(f"   • Average objects per image: {total_objects/total_images:.2f}")
    else:
        print("   • Average objects per image: n/a (no images found)")

    if total_images < SMALL_DATASET_MAX_IMAGES:
        print("   ⚠️  Small dataset - consider collecting more data")
    elif total_images < MODERATE_DATASET_MAX_IMAGES:
        print("   ✅ Moderate dataset size - good for initial training")
    else:
        print("   ✅ Large dataset - excellent for robust training")

# Static guidance, printed as "<heading>" followed by one bullet per item.
recommendations = [
    ("2. Model Performance:", [
        "Monitor validation loss to detect overfitting",
        "Target mAP@0.5 > 0.8 for production deployment",
        "Precision > 0.9 important for conveyor sorting accuracy",
    ]),
    ("3. Conveyor-Specific Considerations:", [
        "Test with different lighting conditions",
        "Validate performance at different belt speeds",
        "Consider object occlusion scenarios",
        "Test with worn/damaged objects",
    ]),
    ("4. Deployment Recommendations:", [
        "Export to ONNX for faster inference",
        "Implement confidence thresholding (>0.7 recommended)",
        "Add object tracking for temporal consistency",
        "Monitor inference FPS (target >30 FPS)",
    ]),
    ("5. Next Steps:", [
        "Collect edge case data (poor lighting, damaged objects)",
        "Implement A/B testing for model versions",
        "Set up continuous monitoring in production",
        "Create feedback loop for model improvement",
    ]),
]

for heading, bullets in recommendations:
    print(f"\n{heading}")
    for bullet in bullets:
        print(f"   • {bullet}")