# Defect Detection - Exploratory Data Analysis
## MSc Thesis - Arden University Berlin

This notebook provides exploratory data analysis for the defect detection dataset.

In [None]:
# Import required libraries
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from tqdm import tqdm

# Add the parent directory to the import path so the `src` package below
# can be resolved.
# NOTE(review): '../' is resolved against the current working directory, so
# this presumably expects the notebook to be run from its own folder — confirm.
sys.path.append('../')

# Project-local helpers; these imports must stay after the sys.path tweak above.
from src.data.preprocessing import DataPreprocessor
from src.visualization.visualizer import DefectVisualizer

# Set plotting style: seaborn 'whitegrid' theme and a (12, 6) default figure size
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

## 1. Dataset Overview

In [None]:
# Build the preprocessor from the project configuration file
preprocessor = DataPreprocessor(config_path='../configs/config.yaml')

# Locations of the training images and their label files
train_images = '../data/processed/train/images'
train_labels = '../data/processed/train/labels'

# Without the image directory there is nothing to validate, so only point the
# user at the preparation script; otherwise collect the statistics (kept in
# `stats` for the cells below) and print them.
if not Path(train_images).exists():
    print("Please prepare your dataset first using examples/prepare_data.py")
else:
    stats = preprocessor.validate_dataset(train_images, train_labels)
    preprocessor.print_dataset_stats(stats)

## 2. Class Distribution Analysis

In [None]:
# Bar chart of the per-class counts stored in stats['class_distribution']
if Path(train_images).exists() and 'class_distribution' in stats:
    distribution = stats['class_distribution']
    classes = list(distribution.keys())
    counts = list(distribution.values())

    plt.figure(figsize=(10, 6))
    bars = plt.bar(classes, counts, color='steelblue', alpha=0.8)

    # Write each bar's integer height centred just above the top of the bar
    for rect in bars:
        bar_top = rect.get_height()
        bar_mid_x = rect.get_x() + rect.get_width()/2.
        plt.text(bar_mid_x, bar_top, f'{int(bar_top)}', ha='center', va='bottom')

    # Axis labelling; class names are rotated so long labels do not overlap
    plt.title('Defect Class Distribution', fontsize=14, fontweight='bold')
    plt.xlabel('Defect Class', fontsize=12)
    plt.ylabel('Count', fontsize=12)
    plt.xticks(rotation=45, ha='right')

    plt.tight_layout()
    plt.show()

## 3. Image Size Analysis

In [None]:
# Histograms of the image dimensions recorded in stats['image_sizes']
if Path(train_images).exists() and 'image_sizes' in stats:
    # NOTE(review): assumes each entry is ordered (height, width) — confirm
    # against DataPreprocessor.validate_dataset.
    heights = [s[0] for s in stats['image_sizes']]
    widths = [s[1] for s in stats['image_sizes']]

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    # One row per panel: axis, data, bar colour, x-axis label, panel title
    panels = (
        (ax1, heights, 'skyblue', 'Height (pixels)', 'Image Height Distribution'),
        (ax2, widths, 'lightcoral', 'Width (pixels)', 'Image Width Distribution'),
    )
    for ax, values, colour, x_label, title in panels:
        mean_value = np.mean(values)
        ax.hist(values, bins=30, color=colour, edgecolor='black', alpha=0.7)
        # Dashed red line marks the mean of the plotted dimension
        ax.axvline(mean_value, color='red', linestyle='--', label=f'Mean: {mean_value:.0f}')
        ax.set_xlabel(x_label, fontsize=12)
        ax.set_ylabel('Frequency', fontsize=12)
        ax.set_title(title, fontsize=13, fontweight='bold')
        ax.legend()

    plt.tight_layout()
    plt.show()

## 4. Sample Image Visualization

In [None]:
# Visualize up to six sample images (first '*.jpg' files in sorted order) with
# their YOLO-format bounding boxes drawn on top.
if Path(train_images).exists():
    image_files = sorted(Path(train_images).glob('*.jpg'))[:6]

    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    axes = axes.flatten()

    for idx, img_path in enumerate(image_files):
        ax = axes[idx]

        # cv2.imread signals an unreadable/corrupt file by returning None
        # instead of raising, so check before converting colours.
        img = cv2.imread(str(img_path))
        if img is None:
            ax.set_title(f'Sample {idx+1} (unreadable)', fontsize=11)
            ax.axis('off')
            continue
        # OpenCV loads BGR; matplotlib expects RGB
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # Read label if exists
        label_path = Path(train_labels) / (img_path.stem + '.txt')
        if label_path.exists():
            with open(label_path, 'r') as f:
                lines = f.readlines()

            # Draw bounding boxes
            h, w = img.shape[:2]
            for line in lines:
                fields = line.split()
                if not fields:
                    # Blank line (e.g. trailing newline at end of file)
                    continue
                if len(fields) != 5:
                    # Not a "class x_center y_center width height" box row;
                    # report it instead of crashing on the tuple unpack.
                    print(f"Skipping malformed label line in {label_path.name}: {line.strip()!r}")
                    continue
                _, x_center, y_center, width, height = map(float, fields)

                # Convert YOLO format (normalized centre/size) to pixel corners
                x1 = int((x_center - width/2) * w)
                y1 = int((y_center - height/2) * h)
                x2 = int((x_center + width/2) * w)
                y2 = int((y_center + height/2) * h)

                # Draw rectangle (red in RGB, 2 px thick)
                cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 0), 2)

        ax.imshow(img)
        ax.set_title(f'Sample {idx+1}', fontsize=11)
        ax.axis('off')

    # Hide subplots left over when fewer than six images were found, so the
    # figure does not show empty framed axes.
    for ax in axes[len(image_files):]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

## 5. Dataset Statistics Summary

In [None]:
# Print two summary tables built from `stats`: overall counts and the
# per-class breakdown.
if Path(train_images).exists():
    total_images = stats['total_images']
    total_objects = stats['total_objects']

    summary_data = {
        'Metric': ['Total Images', 'Images with Labels', 'Total Defects', 'Avg. Defects per Image'],
        'Value': [
            total_images,
            stats['images_with_labels'],
            total_objects,
            # Guard against an empty dataset to avoid dividing by zero
            f"{total_objects / total_images:.2f}" if total_images > 0 else 0
        ]
    }

    df_summary = pd.DataFrame(summary_data)
    print("\nDataset Summary:")
    print(df_summary.to_string(index=False))

    # Class distribution dataframe
    if 'class_distribution' in stats:
        distribution = stats['class_distribution']
        class_data = {
            'Class': list(distribution.keys()),
            'Count': list(distribution.values()),
            # Same zero guard as the average above: classes may be listed even
            # when no objects were counted, which would otherwise raise
            # ZeroDivisionError here.
            'Percentage': [
                f"{(c/total_objects*100):.2f}%" if total_objects > 0 else "0.00%"
                for c in distribution.values()
            ]
        }
        df_classes = pd.DataFrame(class_data)
        print("\nClass Distribution:")
        print(df_classes.to_string(index=False))

## Conclusion

This notebook provided an overview of the defect detection dataset. Key findings:

1. The dataset contains images with various defect types
2. Class distribution shows the balance (or imbalance) across defect types
3. Image sizes vary and will be normalized during training
4. Annotations are in YOLO format for training

Next steps:
- Train the model using `train.py`
- Evaluate performance on the test set
- Fine-tune hyperparameters if needed