# SwellSight Real-to-Synthetic Pipeline - Enhanced Data Import and Preprocessing

This enhanced notebook handles the import and preprocessing of real beach camera images with comprehensive data validation, quality checks, memory optimization, and robust error handling.

## Overview
Enhanced features include:
- **Comprehensive Data Validation**: Image quality validation with resolution, format, and corruption detection
- **Memory-Aware Batch Processing**: Dynamic batch sizing based on available memory
- **Robust Error Handling**: Retry logic for file operations with exponential backoff
- **Progress Tracking**: Progress bars with memory usage display
- **Quality Summary Reporting**: Detailed quality statistics and recommendations
- **Standardized Data Format**: Pipeline integration with shared utility functions

## Pipeline Integration
This notebook implements enhanced data preparation:
1. **Configuration Loading**: Load shared configuration with validation
2. **Image Discovery**: Find all supported image formats with validation
3. **Quality Validation**: Comprehensive image quality assessment
4. **Memory Optimization**: Dynamic batch sizing and memory monitoring
5. **Error Recovery**: Robust error handling with retry mechanisms
6. **Data Standardization**: Prepare data in standardized format for next stages

## Prerequisites
- Complete execution of `01_Setup_and_Installation_Enhanced.ipynb`
- Real beach images available in configured data directory
- Shared utility functions properly installed

---

## 1. Load Configuration and Initialize Enhanced Components

In [None]:
import sys
import os
import json
import logging
import time
from pathlib import Path
from datetime import datetime
from typing import Dict, Any, List, Optional, Tuple
import warnings
warnings.filterwarnings('ignore')

# Import core libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image, ImageStat
import cv2
from tqdm.auto import tqdm
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output
from collections import Counter

# Import enhanced utility functions
try:
    from utils.config_manager import ConfigManager
    from utils.data_validator import DataValidator
    from utils.memory_optimizer import MemoryOptimizer
    from utils.error_handler import ErrorHandler
    from utils.progress_tracker import ProgressTracker
    from utils.data_flow_manager import DataFlowManager
    print("‚úì Enhanced utility functions loaded successfully")
except ImportError as e:
    print(f"‚ùå Error importing utility functions: {e}")
    print("Please ensure utils/ directory is in your Python path")
    raise

# Detect the runtime: True when the 'google.colab' module has already been loaded
IN_COLAB = 'google.colab' in sys.modules

# When running in Colab, mount Google Drive (one plain attempt, then one forced remount)
if IN_COLAB:
    from google.colab import drive
    print("Mounting Google Drive...")
    try:
        drive.mount('/content/drive')
    except Exception as e:
        # First attempt failed: report it and retry once with a forced remount
        print(f"Drive mount failed: {e}")
        try:
            drive.mount('/content/drive', force_remount=True, timeout_ms=300000)
        except Exception as e2:
            # Both attempts failed; nothing sensible can run without the drive
            print(f"‚ùå Critical failure mounting drive: {e2}")
            raise
        else:
            print("‚úì Force remount successful")
    else:
        print("‚úì Google Drive mounted successfully")

# Initialize enhanced components
print("\nüîß Initializing enhanced pipeline components...")

try:
    # Load the shared configuration through the configuration manager
    config_manager = ConfigManager()
    CONFIG = config_manager.load_config()

    # Build the helper objects used by every later cell
    data_validator = DataValidator(quality_threshold=CONFIG.get('processing', {}).get('quality_threshold', 0.7))
    memory_optimizer = MemoryOptimizer(safety_margin=0.1)
    error_handler = ErrorHandler(max_retries=3, backoff_factor=2.0)
    progress_tracker = ProgressTracker()
    data_flow_manager = DataFlowManager()

    print("‚úì Enhanced components initialized successfully")

    # Validate dependencies (a missing dependency is reported but does not abort)
    dependency_status = data_flow_manager.check_dependencies('data_preprocessing')
    if not dependency_status['all_satisfied']:
        print(f"‚ö†Ô∏è  Missing dependencies: {dependency_status['missing_dependencies']}")
        print("Please complete the setup notebook first")
    else:
        print("‚úì All dependencies satisfied")

    # Set up paths from configuration
    REAL_IMAGES_PATH = Path(CONFIG['paths']['real_images_path'])
    OUTPUT_PATH = Path(CONFIG['paths']['output_path'])

    print(f"\nüìÅ Configuration loaded:")
    print(f"   Session ID: {CONFIG['session']['session_id']}")
    print(f"   Real images path: {REAL_IMAGES_PATH}")
    print(f"   Output path: {OUTPUT_PATH}")
    print(f"   Quality threshold: {data_validator.quality_threshold}")

except Exception as e:
    print(f"‚ùå Failed to initialize enhanced components: {e}")
    print("Falling back to basic configuration loading...")

    # Fallback: read config.json directly and build components with defaults
    try:
        if IN_COLAB:
            config_file = Path('/content/drive/MyDrive/SwellSight/config.json')
        else:
            config_file = Path('config.json')

        # Explicit encoding so the result does not depend on the platform locale
        with open(config_file, 'r', encoding='utf-8') as f:
            CONFIG = json.load(f)

        # Initialize basic components
        data_validator = DataValidator()
        memory_optimizer = MemoryOptimizer()

        # The later cells call these three unconditionally, so the fallback
        # must define them as well; otherwise the run dies with a NameError
        # at the first use instead of here, where the cause is obvious.
        error_handler = ErrorHandler(max_retries=3, backoff_factor=2.0)
        progress_tracker = ProgressTracker()
        data_flow_manager = DataFlowManager()

        REAL_IMAGES_PATH = Path(CONFIG['paths']['real_images_path'])
        OUTPUT_PATH = Path(CONFIG['paths']['output_path'])

        print("‚úì Basic configuration loaded successfully")

    except Exception as e2:
        print(f"‚ùå Critical error loading configuration: {e2}")
        raise

# Configure logging
# FileHandler opens its file as soon as it is constructed and raises
# FileNotFoundError if the parent directory is missing, so create it first.
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

logging.basicConfig(
    level=logging.INFO, 
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler(OUTPUT_PATH / 'data_preprocessing.log')
    ]
)
logger = logging.getLogger(__name__)

print("\nüöÄ Enhanced data preprocessing notebook ready!")

## 2. Enhanced Image Discovery with Validation

In [None]:
# Enhanced image discovery with comprehensive validation
print("üîç Enhanced image discovery and validation...")

# Abort early, with setup instructions, when the input directory is missing
if not REAL_IMAGES_PATH.exists():
    print("\n".join((
        f"‚ùå Input directory does not exist: {REAL_IMAGES_PATH}",
        "\nüìù To proceed, please:",
        f"   1. Create the directory: {REAL_IMAGES_PATH}",
        "   2. Add your real beach camera images",
        "   3. Re-run this notebook",
    )))
    raise FileNotFoundError(f"Input directory not found: {REAL_IMAGES_PATH}")

# The validator declares which formats it accepts; snapshot them as a list
supported_extensions = [*data_validator.SUPPORTED_FORMATS]
print(f"\nüîé Searching for images with extensions: {supported_extensions}")

# Discover image files with error handling
def discover_images_with_retry():
    """Discover image files directly inside ``REAL_IMAGES_PATH`` with retry logic.

    Files are matched on their suffix case-insensitively, so ``.jpg``,
    ``.JPG`` and ``.Jpg`` are all found. Entries of ``supported_extensions``
    are accepted with or without a leading dot. Only regular files are
    returned; sub-directories are not searched.

    Returns:
        Whatever ``error_handler.retry_with_backoff`` returns for the
        discovery callable; the callable itself returns a sorted list of
        ``Path`` objects.
    """
    # Normalise the configured extensions to lowercase with a leading dot so
    # they can be compared against ``Path.suffix``
    wanted_suffixes = {
        ext.lower() if ext.startswith('.') else f'.{ext.lower()}'
        for ext in supported_extensions
    }

    def _discover():
        # Check the suffix first so is_file() (a stat call) only runs for
        # entries that already look like images
        return sorted(
            entry for entry in REAL_IMAGES_PATH.iterdir()
            if entry.suffix.lower() in wanted_suffixes and entry.is_file()
        )

    return error_handler.retry_with_backoff(_discover)

try:
    image_paths = discover_images_with_retry()

    # Count discovered files by lowercased suffix
    extension_counts = dict(Counter(path.suffix.lower() for path in image_paths))

    # Report every supported extension, including those with no files.
    # Iterating only over the counts (as before) could never reach the
    # "0 files" line because every counted extension has at least one file.
    reported_extensions = sorted(
        {
            ext.lower() if ext.startswith('.') else f'.{ext.lower()}'
            for ext in supported_extensions
        }
        | set(extension_counts)
    )

    print(f"\nüìä Discovery Results:")
    for ext in reported_extensions:
        count = extension_counts.get(ext, 0)
        if count > 0:
            print(f"   ‚úÖ {ext}: {count} files")
        else:
            print(f"   ‚ö™ {ext}: 0 files")

    print(f"\n   Total images found: {len(image_paths)}")
    print(f"   Input directory: {REAL_IMAGES_PATH}")

except Exception as e:
    logger.error(f"Error during image discovery: {e}")
    print(f"‚ùå Image discovery failed: {e}")
    raise

# Nothing to process: explain what is expected and stop
if not image_paths:
    print(f"\n‚ùå No image files found in {REAL_IMAGES_PATH}")
    print(f"\nüìù Supported formats: {', '.join(supported_extensions)}")
    print("\nüí° Please add real beach camera images to the input directory")
    raise ValueError("No images found for processing")

# Apply memory-aware batch sizing for processing limit
# Use .get for the 'processing' section, as the component setup does, so a
# config without that section falls back to the default instead of raising.
max_images = CONFIG.get('processing', {}).get('max_images_per_session', 500)
memory_info = memory_optimizer.monitor_memory_usage()

# Estimate memory requirements and derive how many images fit in memory
estimated_memory_per_image = memory_optimizer.estimate_image_memory_usage()
# One fallback value (4 GB) is used for both the computation and the report
available_memory_gb = memory_info.get('system_available_gb', 4)
available_memory = available_memory_gb * 1024**3  # Convert to bytes

if estimated_memory_per_image > 0:
    # Use 50% of available memory, but always allow at least one image so a
    # tight memory budget cannot silently empty the work list
    memory_based_limit = max(1, int(available_memory * 0.5 / estimated_memory_per_image))
else:
    # No usable estimate: do not constrain by memory (avoids division by zero)
    memory_based_limit = len(image_paths)

effective_limit = min(max_images, memory_based_limit, len(image_paths))

if len(image_paths) > effective_limit:
    print(f"\n‚ö†Ô∏è  Limiting processing to {effective_limit} images")
    print(f"   Found: {len(image_paths)} images")
    print(f"   Config limit: {max_images}")
    print(f"   Memory-based limit: {memory_based_limit}")
    print(f"   Available memory: {available_memory_gb:.1f}GB")
    image_paths = image_paths[:effective_limit]

print(f"\n‚úÖ Final image count for processing: {len(image_paths)}")

# Display sample filenames
print(f"\nüìã Sample filenames:")
for i, path in enumerate(image_paths[:5]):
    print(f"   {i+1}. {path.name}")
if len(image_paths) > 5:
    print(f"   ... and {len(image_paths) - 5} more")

# Memory optimization suggestions
suggestions = memory_optimizer.suggest_memory_optimizations(memory_info)
if suggestions:
    print(f"\nüí° Memory optimization suggestions:")
    for suggestion in suggestions[:3]:  # Show top 3
        print(f"   ‚Ä¢ {suggestion}")

## 3. Comprehensive Image Quality Assessment with Memory Optimization

In [None]:
# Enhanced image quality assessment with memory-aware batch processing
print("üîç Comprehensive image quality assessment with memory optimization...")

# Calculate optimal batch size for processing
optimal_batch_size = memory_optimizer.get_optimal_batch_size(
    item_size=estimated_memory_per_image,
    max_batch_size=32
)
# The batch size is used as the step of range(); a step of 0 raises
# ValueError, so clamp to at least one image per batch.
optimal_batch_size = max(1, optimal_batch_size)

print(f"\nüìä Processing Configuration:")
print(f"   Total images: {len(image_paths)}")
print(f"   Optimal batch size: {optimal_batch_size}")
print(f"   Quality threshold: {data_validator.quality_threshold}")
print(f"   Estimated memory per image: {estimated_memory_per_image / (1024*1024):.1f}MB")

# Initialize tracking variables
image_metadata = []       # one metadata dict per successfully validated image
valid_images = []         # paths whose validation result was 'valid'
invalid_images = []       # paths whose validation result was not 'valid'
processing_errors = []    # dicts describing images whose validation raised
quality_stats = {
    'total_size_mb': 0,
    'resolutions': [],
    'aspect_ratios': [],
    'formats': {},
    'color_modes': {},
    'quality_scores': [],
    'brightness_scores': [],
    'contrast_scores': [],
    'sharpness_scores': []
}

# Process images in memory-optimized batches with progress tracking
start_time = time.time()
processed_count = 0
batch_count = 0

with memory_optimizer.memory_monitor(log_usage=True) as monitor:
    # Create progress tracker
    progress = progress_tracker.create_progress_bar(
        total=len(image_paths),
        description="Analyzing images",
        show_memory=True
    )

    # range() rejects a step of 0, so never step by less than one image
    batch_step = max(1, optimal_batch_size)

    try:
        # Process in batches
        for batch_start in range(0, len(image_paths), batch_step):
            batch_end = min(batch_start + batch_step, len(image_paths))
            batch_paths = image_paths[batch_start:batch_end]
            batch_count += 1

            print(f"\nüîÑ Processing batch {batch_count} ({len(batch_paths)} images)...")

            # Metadata collected for this batch only; handed to cleanup below
            batch_results = []

            for image_path in batch_paths:
                try:
                    # Validate image quality with comprehensive checks
                    def validate_image():
                        return data_validator.validate_image_quality(str(image_path))

                    # Use error handler for robust validation
                    validation_result = error_handler.retry_with_backoff(validate_image)

                    # Flatten the validation result into one metadata record
                    metadata = {
                        'filename': image_path.name,
                        'path': str(image_path),
                        'valid': validation_result['valid'],
                        'quality_score': validation_result['score'],
                        'issues': validation_result['issues'],
                        **validation_result['metrics']
                    }

                    image_metadata.append(metadata)
                    batch_results.append(metadata)

                    if validation_result['valid']:
                        valid_images.append(image_path)

                        # Update quality statistics
                        metrics = validation_result['metrics']
                        quality_stats['total_size_mb'] += metrics.get('file_size', 0) / (1024*1024)
                        quality_stats['resolutions'].append(f"{metrics.get('width', 0)}x{metrics.get('height', 0)}")
                        quality_stats['aspect_ratios'].append(metrics.get('width', 1) / max(metrics.get('height', 1), 1))
                        quality_stats['quality_scores'].append(validation_result['score'])
                        quality_stats['brightness_scores'].append(metrics.get('brightness_mean', 0))
                        quality_stats['contrast_scores'].append(metrics.get('contrast', 0))
                        quality_stats['sharpness_scores'].append(metrics.get('sharpness', 0))

                        # Count formats and modes
                        format_name = metrics.get('format', 'Unknown')
                        mode = metrics.get('mode', 'Unknown')
                        quality_stats['formats'][format_name] = quality_stats['formats'].get(format_name, 0) + 1
                        quality_stats['color_modes'][mode] = quality_stats['color_modes'].get(mode, 0) + 1

                    else:
                        invalid_images.append(image_path)
                        logger.warning(f"Invalid image {image_path.name}: {validation_result['issues']}")

                except Exception as e:
                    # Record the failure and continue with the next image
                    error_info = {
                        'filename': image_path.name,
                        'error': str(e),
                        'batch': batch_count
                    }
                    processing_errors.append(error_info)
                    logger.error(f"Error processing {image_path.name}: {e}")

                # Count and report progress exactly once per image, outside the
                # try block: a failure in progress reporting must not be logged
                # as an image error, nor make the image count twice.
                processed_count += 1
                current_memory = memory_optimizer.monitor_memory_usage()
                progress_tracker.update_progress(
                    progress,
                    processed_count,
                    additional_info=f"Memory: {current_memory.get('system_percent', 0):.1f}%"
                )

            # Memory cleanup after each batch
            memory_optimizer.cleanup_variables([batch_results])

            # Check memory usage and surface suggestions when it is high
            current_memory = memory_optimizer.monitor_memory_usage()
            if current_memory.get('system_percent', 0) > 85:
                print(f"\n‚ö†Ô∏è  High memory usage detected: {current_memory.get('system_percent', 0):.1f}%")
                suggestions = memory_optimizer.suggest_memory_optimizations(current_memory)
                for suggestion in suggestions[:2]:
                    print(f"   üí° {suggestion}")
    finally:
        # Close the bar even if the loop is interrupted or fails
        progress_tracker.close_progress_bar(progress)

processing_time = time.time() - start_time

# Guard the percentage against an empty work list
attempted_count = len(image_paths)
success_rate_percent = len(valid_images) / attempted_count * 100 if attempted_count else 0.0

print(f"\n‚úÖ Quality assessment completed!")
print(f"   Processing time: {processing_time:.1f} seconds")
print(f"   Valid images: {len(valid_images)}")
print(f"   Invalid images: {len(invalid_images)}")
print(f"   Processing errors: {len(processing_errors)}")
print(f"   Success rate: {success_rate_percent:.1f}%")

# Report processing errors if any
if processing_errors:
    print(f"\n‚ö†Ô∏è  Processing errors encountered:")
    for error in processing_errors[:3]:  # Show first 3
        print(f"   - {error['filename']}: {error['error']}")
    if len(processing_errors) > 3:
        print(f"   ... and {len(processing_errors) - 3} more errors")

# Report invalid images if any
if invalid_images:
    print(f"\n‚ö†Ô∏è  Invalid images found:")
    for img_path in invalid_images[:3]:  # Show first 3
        print(f"   - {img_path.name}")
    if len(invalid_images) > 3:
        print(f"   ... and {len(invalid_images) - 3} more")

# Final memory cleanup
# batch_results is only bound when at least one batch ran, i.e. when there
# was at least one image; skipping the call otherwise avoids a NameError.
if attempted_count:
    memory_optimizer.cleanup_variables([batch_results])
final_memory = memory_optimizer.monitor_memory_usage()
print(f"\nüíæ Final memory usage: {final_memory.get('system_percent', 0):.1f}%")

## 4. Enhanced Data Statistics and Quality Analysis

In [None]:
# Enhanced data statistics with comprehensive quality analysis
# Every figure below is a ratio over the valid images, so stop if there are none
if not valid_images:
    print("‚ùå No valid images found. Cannot proceed with analysis.")
    raise ValueError("No valid images available for processing")

print("üìä Generating comprehensive data statistics and quality analysis...")


def _print_distribution(header, pairs):
    """Print a header followed by one 'label: count (share%)' line per pair."""
    print(header)
    for label, count in pairs:
        percentage = count / len(valid_images) * 100
        print(f"   {label}: {count} images ({percentage:.1f}%)")


# Headline numbers for the dataset
dataset_size_mb = quality_stats['total_size_mb']
print(f"\nüìà Dataset Statistics:")
print(f"   Total valid images: {len(valid_images)}")
print(f"   Total size: {dataset_size_mb:.1f} MB")
print(f"   Average size per image: {dataset_size_mb/len(valid_images):.2f} MB")
print(f"   Processing time: {processing_time:.1f} seconds")
print(f"   Images per second: {len(image_paths)/processing_time:.1f}")

# Resolution analysis: the ten most frequent "WxH" strings
resolution_counts = Counter(quality_stats['resolutions'])
_print_distribution(f"\nüìê Resolution Distribution:", resolution_counts.most_common(10))

# Format and color mode distribution, in the order they were first seen
_print_distribution(f"\nüñºÔ∏è  Format Distribution:", quality_stats['formats'].items())
_print_distribution(f"\nüé® Color Mode Distribution:", quality_stats['color_modes'].items())

# Enhanced quality metrics with statistical analysis
# Pull the per-image series out of quality_stats under short names
quality_scores, brightness_scores, contrast_scores, sharpness_scores, aspect_ratios = (
    quality_stats[key]
    for key in ('quality_scores', 'brightness_scores', 'contrast_scores',
                'sharpness_scores', 'aspect_ratios')
)

print(f"\nüí° Enhanced Quality Metrics:")
# (label, series, number format) - brightness is shown with one decimal, the rest with three
for metric_label, series, spec in (
    ('Quality Score', quality_scores, '.3f'),
    ('Brightness', brightness_scores, '.1f'),
    ('Contrast', contrast_scores, '.3f'),
    ('Sharpness', sharpness_scores, '.3f'),
    ('Aspect Ratio', aspect_ratios, '.3f'),
):
    print(
        f"   {metric_label} - Mean: {np.mean(series):{spec}}, Std: {np.std(series):{spec}}, "
        f"Range: [{np.min(series):{spec}}, {np.max(series):{spec}}]"
    )

# Quality distribution analysis: bucket the scores into three tiers
high_quality_count = len([score for score in quality_scores if score >= 0.8])
medium_quality_count = len([score for score in quality_scores if 0.5 <= score < 0.8])
low_quality_count = len([score for score in quality_scores if score < 0.5])

print(f"\nüéØ Quality Distribution:")
for tier_label, tier_count in (
    ("High quality (‚â•0.8)", high_quality_count),
    ("Medium quality (0.5-0.8)", medium_quality_count),
    ("Low quality (<0.5)", low_quality_count),
):
    print(f"   {tier_label}: {tier_count} images ({tier_count/len(valid_images)*100:.1f}%)")

# Enhanced quality assessment with specific recommendations
print(f"\nüéØ Enhanced Quality Assessment:")

# Brightness: flag datasets that are dark or bright on average
avg_brightness = np.mean(brightness_scores)
brightness_std = np.std(brightness_scores)
if avg_brightness < 50:
    brightness_report = (
        f"   ‚ö†Ô∏è  Images appear dark (avg: {avg_brightness:.1f}) - may affect depth estimation quality",
        f"      üí° Consider brightness adjustment or gamma correction",
    )
elif avg_brightness > 200:
    brightness_report = (
        f"   ‚ö†Ô∏è  Images appear bright (avg: {avg_brightness:.1f}) - may have overexposure issues",
        f"      üí° Consider exposure adjustment or histogram equalization",
    )
else:
    brightness_report = (
        f"   ‚úÖ Brightness levels good for depth estimation (avg: {avg_brightness:.1f})",
    )
print("\n".join(brightness_report))

# A large spread in brightness is reported separately from the average
if brightness_std > 50:
    print("\n".join((
        f"   ‚ö†Ô∏è  High brightness variation (std: {brightness_std:.1f}) - inconsistent lighting conditions",
        f"      üí° Consider normalization or adaptive processing",
    )))

# Contrast
avg_contrast = np.mean(contrast_scores)
if avg_contrast < 0.1:
    contrast_report = (
        f"   ‚ö†Ô∏è  Low contrast detected (avg: {avg_contrast:.3f}) - may reduce depth map quality",
        f"      üí° Consider contrast enhancement or CLAHE",
    )
else:
    contrast_report = (
        f"   ‚úÖ Contrast levels adequate for depth estimation (avg: {avg_contrast:.3f})",
    )
print("\n".join(contrast_report))

# Sharpness
avg_sharpness = np.mean(sharpness_scores)
if avg_sharpness < 0.1:
    sharpness_report = (
        f"   ‚ö†Ô∏è  Low sharpness detected (avg: {avg_sharpness:.3f}) - images may be blurred",
        f"      üí° Consider sharpening filters or deblurring",
    )
else:
    sharpness_report = (
        f"   ‚úÖ Sharpness levels good (avg: {avg_sharpness:.3f})",
    )
print("\n".join(sharpness_report))

# Aspect ratio consistency
aspect_std = np.std(aspect_ratios)
if aspect_std > 0.5:
    aspect_report = (
        f"   ‚ö†Ô∏è  High aspect ratio variation (std: {aspect_std:.3f}) - consider consistent cropping",
        f"      üí° Standardize aspect ratios for better model performance",
    )
else:
    aspect_report = (
        f"   ‚úÖ Consistent aspect ratios across dataset (std: {aspect_std:.3f})",
    )
print("\n".join(aspect_report))

# Overall verdict: mean quality score compared with the validator's threshold
avg_quality = np.mean(quality_scores)
quality_threshold = data_validator.quality_threshold

print(f"\nüèÜ Overall Dataset Assessment:")
if avg_quality >= quality_threshold:
    overall_report = (
        f"   ‚úÖ Dataset quality is good (avg: {avg_quality:.3f} ‚â• {quality_threshold})",
        f"   üöÄ Ready for depth estimation processing",
    )
else:
    overall_report = (
        f"   ‚ö†Ô∏è  Dataset quality below threshold (avg: {avg_quality:.3f} < {quality_threshold})",
        f"   üîß Consider quality improvements before proceeding",
    )
print("\n".join(overall_report))

# Memory usage summary: take one snapshot and report from it
current_memory = memory_optimizer.monitor_memory_usage()
system_used_percent = current_memory.get('system_percent', 0)
system_free_gb = current_memory.get('system_available_gb', 0)

print(f"\nüíæ Memory Usage Summary:")
print(f"   System memory: {system_used_percent:.1f}% used")
print(f"   Available memory: {system_free_gb:.1f}GB")

# The GPU line is only shown when the snapshot reports a non-zero GPU total
gpu_reported = current_memory.get('gpu_total_gb', 0) > 0
if gpu_reported:
    print(f"   GPU memory: {current_memory.get('gpu_percent', 0):.1f}% used")

## 5. Enhanced Visual Quality Analysis with Error Handling

In [None]:
# Enhanced visual quality analysis with robust error handling
print("üìä Creating enhanced visual quality analysis...")

try:
    # 2x3 grid: four histograms, a resolution bar chart and a scatter plot
    plt.style.use('default')
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    fig.suptitle('Enhanced SwellSight Dataset Quality Analysis', fontsize=16, fontweight='bold')

    # Panels 1-4 share one layout; each row is
    # (axis, values, fill colour, title, x label, mean format, draw threshold line?)
    histogram_panels = (
        (axes[0, 0], quality_scores, 'skyblue', 'Quality Score Distribution', 'Quality Score', '.3f', True),
        (axes[0, 1], brightness_scores, 'gold', 'Brightness Distribution', 'Brightness (0-255)', '.1f', False),
        (axes[0, 2], contrast_scores, 'lightgreen', 'Contrast Distribution', 'Contrast (0-1)', '.3f', False),
        (axes[1, 0], sharpness_scores, 'lightcoral', 'Sharpness Distribution', 'Sharpness', '.3f', False),
    )
    for panel, values, fill, title, x_label, mean_spec, with_threshold in histogram_panels:
        panel.hist(values, bins=20, alpha=0.7, color=fill, edgecolor='black')
        panel.axvline(np.mean(values), color='red', linestyle='--',
                      label=f'Mean: {np.mean(values):{mean_spec}}')
        if with_threshold:
            # Only the quality-score panel marks the validator threshold
            panel.axvline(quality_threshold, color='orange', linestyle='--',
                          label=f'Threshold: {quality_threshold}')
        panel.set_title(title)
        panel.set_xlabel(x_label)
        panel.set_ylabel('Frequency')
        panel.legend()
        panel.grid(True, alpha=0.3)

    # Panel 5: the ten most common resolutions as a bar chart
    resolution_axis = axes[1, 1]
    top_resolutions = resolution_counts.most_common(10)
    if top_resolutions:
        res_names, res_counts = (list(column) for column in zip(*top_resolutions))
        bar_positions = range(len(res_names))

        resolution_axis.bar(bar_positions, res_counts, alpha=0.7, color='mediumpurple')
        resolution_axis.set_title('Top 10 Resolutions')
        resolution_axis.set_xlabel('Resolution')
        resolution_axis.set_ylabel('Count')
        resolution_axis.set_xticks(bar_positions)
        resolution_axis.set_xticklabels(res_names, rotation=45, ha='right')
        resolution_axis.grid(True, alpha=0.3)

    # Panel 6: quality against brightness, coloured by contrast
    scatter_axis = axes[1, 2]
    scatter = scatter_axis.scatter(brightness_scores, quality_scores, alpha=0.6,
                                   c=contrast_scores, cmap='viridis', s=20)
    scatter_axis.set_title('Quality vs Brightness (colored by Contrast)')
    scatter_axis.set_xlabel('Brightness')
    scatter_axis.set_ylabel('Quality Score')
    scatter_axis.axhline(quality_threshold, color='red', linestyle='--', alpha=0.7,
                         label=f'Quality Threshold: {quality_threshold}')
    scatter_axis.legend()
    scatter_axis.grid(True, alpha=0.3)

    # Colour bar explaining the scatter colouring
    cbar = plt.colorbar(scatter, ax=scatter_axis)
    cbar.set_label('Contrast')

    plt.tight_layout()

    # Saving is best-effort: a failure is logged and the figure is still shown
    try:
        plot_path = OUTPUT_PATH / 'quality_analysis.png'
        plt.savefig(plot_path, dpi=300, bbox_inches='tight')
        print(f"‚úÖ Quality analysis plot saved: {plot_path}")
    except Exception as e:
        logger.warning(f"Could not save plot: {e}")

    plt.show()

except Exception as e:
    # Plotting problems must not stop the notebook
    logger.error(f"Error creating visual analysis: {e}")
    print(f"‚ö†Ô∏è  Could not create visual analysis: {e}")
    print("Continuing with text-based analysis...")

# Create summary statistics table
try:
    # (row label, series) pairs; the table rows follow this order
    metric_series = [
        ('Quality Score', quality_scores),
        ('Brightness', brightness_scores),
        ('Contrast', contrast_scores),
        ('Sharpness', sharpness_scores),
        ('Aspect Ratio', aspect_ratios),
    ]

    # One column per statistic, computed for every series
    summary_stats = {'Metric': [label for label, _ in metric_series]}
    for column, reducer in (('Mean', np.mean), ('Std', np.std), ('Min', np.min), ('Max', np.max)):
        summary_stats[column] = [reducer(values) for _, values in metric_series]

    summary_df = pd.DataFrame(summary_stats).round(3)

    print("\nüìã Quality Metrics Summary Table:")
    print(summary_df.to_string(index=False))

    # Writing the CSV is best-effort: a failure is only logged
    try:
        summary_path = OUTPUT_PATH / 'quality_summary.csv'
        summary_df.to_csv(summary_path, index=False)
        print(f"\n‚úÖ Quality summary saved: {summary_path}")
    except Exception as e:
        logger.warning(f"Could not save summary table: {e}")

except Exception as e:
    logger.error(f"Error creating summary statistics: {e}")
    print(f"‚ö†Ô∏è  Could not create summary statistics: {e}")

## 6. Standardized Data Format and Pipeline Integration

In [None]:
# Prepare standardized data format for pipeline integration
print("üì¶ Preparing standardized data format for pipeline integration...")

try:
    # Ratios shared by several fields; guarded so an empty run or a
    # zero-length timing interval cannot raise ZeroDivisionError
    processed_total = len(image_paths)
    success_rate = len(valid_images) / processed_total if processed_total else 0.0
    images_per_second = processed_total / processing_time if processing_time > 0 else 0.0

    # Take one memory snapshot and pass it to the suggestion helper, as the
    # other call sites in this notebook do
    memory_snapshot = memory_optimizer.monitor_memory_usage()

    # avg_quality is a NumPy scalar, so the comparison yields numpy.bool_,
    # which json cannot serialise; convert to a plain bool
    dataset_ready = bool(avg_quality >= quality_threshold and len(valid_images) > 0)

    # Create standardized data structure
    processed_data = {
        'valid_image_paths': [str(path) for path in valid_images],
        'invalid_image_paths': [str(path) for path in invalid_images],
        'image_metadata': image_metadata,
        'quality_statistics': {
            'total_images_processed': processed_total,
            'valid_images_count': len(valid_images),
            'invalid_images_count': len(invalid_images),
            'success_rate': success_rate,
            'average_quality_score': float(np.mean(quality_scores)) if quality_scores else 0.0,
            'quality_score_std': float(np.std(quality_scores)) if quality_scores else 0.0,
            'average_brightness': float(np.mean(brightness_scores)) if brightness_scores else 0.0,
            'average_contrast': float(np.mean(contrast_scores)) if contrast_scores else 0.0,
            'average_sharpness': float(np.mean(sharpness_scores)) if sharpness_scores else 0.0,
            'total_size_mb': quality_stats['total_size_mb'],
            'resolution_distribution': dict(resolution_counts.most_common(10)),
            'format_distribution': quality_stats['formats'],
            'color_mode_distribution': quality_stats['color_modes']
        },
        'processing_info': {
            'processing_time_seconds': processing_time,
            'images_per_second': images_per_second,
            'batch_size_used': optimal_batch_size,
            'memory_usage': memory_snapshot,
            'quality_threshold': data_validator.quality_threshold,
            'processing_errors': processing_errors
        },
        'recommendations': {
            'dataset_ready_for_next_stage': dataset_ready,
            'suggested_improvements': [],
            'memory_optimizations': memory_optimizer.suggest_memory_optimizations(memory_snapshot)
        }
    }

    # Add specific recommendations based on analysis
    # (same thresholds as the quality assessment printed earlier)
    recommendations = processed_data['recommendations']['suggested_improvements']

    if avg_brightness < 50:
        recommendations.append("Consider brightness adjustment - images appear dark")
    elif avg_brightness > 200:
        recommendations.append("Consider exposure adjustment - images appear overexposed")

    if avg_contrast < 0.1:
        recommendations.append("Consider contrast enhancement - low contrast detected")

    if avg_sharpness < 0.1:
        recommendations.append("Consider sharpening filters - low sharpness detected")

    if aspect_std > 0.5:
        recommendations.append("Consider standardizing aspect ratios for consistency")

    # Invalid images exceed 10% of the number of valid images
    if len(invalid_images) > len(valid_images) * 0.1:
        recommendations.append("High number of invalid images - review input data quality")

    # Create metadata for data flow manager
    stage_metadata = {
        'processing_time_seconds': processing_time,
        'input_count': processed_total,
        'output_count': len(valid_images),
        'success_rate': success_rate,
        'quality_metrics': {
            'mean_quality_score': float(np.mean(quality_scores)) if quality_scores else 0.0,
            'min_quality_score': float(np.min(quality_scores)) if quality_scores else 0.0,
            'max_quality_score': float(np.max(quality_scores)) if quality_scores else 0.0
        },
        # Both error categories are listed whenever either one occurred
        'errors': [
            {
                'type': 'processing_error',
                'count': len(processing_errors),
                'examples': [error['filename'] for error in processing_errors[:3]]
            },
            {
                'type': 'invalid_image',
                'count': len(invalid_images),
                'examples': [path.name for path in invalid_images[:3]]
            }
        ] if (processing_errors or invalid_images) else [],
        'outputs': {
            'processed_images': 'data_preprocessing_results.json',
            'quality_report': 'quality_summary.csv',
            'quality_plot': 'quality_analysis.png'
        }
    }

    # Save results using data flow manager
    success = data_flow_manager.save_stage_results(
        data=processed_data,
        stage_name='data_preprocessing',
        metadata=stage_metadata
    )

    if success:
        print("‚úÖ Data preprocessing results saved successfully")
        print(f"   Stage: data_preprocessing")
        print(f"   Valid images: {len(valid_images)}")
        print(f"   Quality score: {avg_quality:.3f}")
        print(f"   Ready for next stage: {dataset_ready}")
    else:
        print("‚ö†Ô∏è  Warning: Could not save results to data flow manager")
        print("Results are still available in memory for this session")

    # Display final summary
    print(f"\nüéØ Final Processing Summary:")
    print(f"   Total images processed: {processed_total}")
    print(f"   Valid images: {len(valid_images)} ({success_rate*100:.1f}%)")
    print(f"   Average quality score: {avg_quality:.3f}")
    print(f"   Processing time: {processing_time:.1f} seconds")
    print(f"   Memory usage: {final_memory.get('system_percent', 0):.1f}%")

    if dataset_ready:
        print(f"\nüöÄ Dataset is ready for depth extraction stage!")
        print(f"   You can now proceed to notebook 03: Depth-Anything-V2 Extraction")
    else:
        print(f"\n‚ö†Ô∏è  Dataset may need improvements before proceeding:")
        for rec in recommendations[:3]:
            print(f"   ‚Ä¢ {rec}")

except Exception as e:
    logger.error(f"Error preparing standardized data format: {e}")
    print(f"‚ùå Error preparing data for pipeline: {e}")
    print("Results are available in memory but may not be properly formatted for next stage")
    raise

print(f"\n‚úÖ Enhanced data preprocessing completed successfully!")
# Only claim the results were saved when the data flow manager confirmed it
if success:
    print(f"üìä All results saved and ready for pipeline integration")