# Face Verification - Image Preprocessing Pipeline

This notebook demonstrates the complete image preprocessing pipeline for the face verification project. We'll cover:

1. **Data Exploration** - Understanding our dataset structure
2. **Individual Image Preprocessing** - Loading, resizing, and normalizing a single image
3. **Batch Processing** - Processing multiple images efficiently
4. **Visualization** - Visualizing preprocessing steps
5. **Quality Control** - Ensuring processed images meet requirements

## Project Overview

This preprocessing pipeline prepares facial images for the FaceNet model by:
- Loading images from various formats
- Detecting and extracting faces
- Resizing to standard dimensions (160x160)
- Normalizing pixel values
- Saving processed images for model training/inference

In [None]:
# Import required libraries
import os
import sys
import numpy as np
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import pandas as pd
from pathlib import Path

# Add project source directory to path.
# NOTE: project_root is simply the parent of the current working directory, so
# this assumes the notebook is launched from a folder directly under the
# project root — TODO confirm for other launch locations.
project_root = Path.cwd().parent
src_dir = str(project_root / "src")
# Guard the append so re-running this cell does not keep adding duplicate
# entries to sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

# Import custom modules
from preprocessing import (
    load_image, resize_image, normalize_image, save_processed,
    preprocess_image, batch_preprocess_images, visualize_preprocessing_steps
)
from utils import setup_project_environment, DatasetUtils, VisualizationUtils

# Confirm the imports worked and show where the notebook is running from
for status_line in (
    "‚úÖ Libraries imported successfully!",
    f"Project root: {project_root}",
    f"Current working directory: {Path.cwd()}",
):
    print(status_line)

# Build the shared project objects (paths, logger, config) used by later cells
paths, logger, config = setup_project_environment()
print("‚úÖ Project environment setup complete!")

## 1. Data Exploration

Let's first explore the structure of our dataset and understand what we're working with.

In [None]:
# Section banner for the directory tree printed below: title line followed by a 40-character rule
print("üìÅ Project Directory Structure:", "=" * 40, sep="\n")

def show_directory_tree(path, prefix="", max_depth=3, current_depth=0):
    """Print an indented tree of the contents of ``path``, one entry per line.

    Args:
        path: Directory to list (str or Path).
        prefix: Text printed before each entry's branch marker; recursive
            calls extend it to draw the indentation of nested levels.
        max_depth: Deepest level to descend into. Entries at levels
            0..max_depth are printed; directories found at ``max_depth`` are
            listed but not expanded.
        current_depth: Level of ``path`` relative to the initial call.

    A directory that cannot be listed (missing, not a directory, permission
    denied) is reported with a single line instead of raising, so one
    unreadable folder does not abort the whole listing.
    """
    if current_depth > max_depth:
        return

    try:
        # Sort once and reuse the same list for iteration and for the
        # "is this the last entry" check.
        entries = sorted(Path(path).iterdir())
    except OSError as exc:
        print(f"{prefix}[unreadable: {exc.strerror or exc}]")
        return

    last_index = len(entries) - 1
    for index, entry in enumerate(entries):
        is_last = index == last_index
        branch = "‚îî‚îÄ‚îÄ " if is_last else "‚îú‚îÄ‚îÄ "
        print(f"{prefix}{branch}{entry.name}")

        # Only expand sub-directories while there is depth budget left.
        if entry.is_dir() and current_depth < max_depth:
            child_prefix = prefix + ("    " if is_last else "‚îÇ   ")
            show_directory_tree(entry, child_prefix, max_depth, current_depth + 1)

# Render the tree starting at the project root
show_directory_tree(paths.project_root)

print("\nüìä Directory Information:")
print("=" * 40)
print(f"Raw data directory: {paths.data_raw}")
print(f"Processed data directory: {paths.data_processed}")
print(f"Pairs directory: {paths.data_pairs}")

# Report, for each data directory, either its top-level item count or that it is missing
data_directories = (
    ("Raw", paths.data_raw),
    ("Processed", paths.data_processed),
    ("Pairs", paths.data_pairs),
)
for name, path in data_directories:
    if not path.exists():
        print(f"  {name}: Directory not found")
        continue
    files = list(path.glob("*"))
    print(f"  {name}: {len(files)} items")

In [None]:
# Get image files from raw directory (if any exist)
image_files = DatasetUtils.get_image_files(str(paths.data_raw))

print(f"üì∑ Found {len(image_files)} image files in raw directory")

if len(image_files) > 0:
    print("\nSample files:")
    for i, file_path in enumerate(image_files[:5]):  # Show first 5 files
        file_size = os.path.getsize(file_path) / (1024 * 1024)  # Size in MB
        print(f"  {i+1}. {Path(file_path).name} ({file_size:.2f} MB)")

    if len(image_files) > 5:
        print(f"  ... and {len(image_files) - 5} more files")
else:
    print("\n‚ö†Ô∏è  No images found in raw directory.")
    print("   Place your images in:", paths.data_raw)
    print("   Supported formats: .jpg, .jpeg, .png, .bmp, .tiff, .tif")

    # Create sample directory structure for demonstration
    print("\nüìù Creating sample image placeholders...")
    sample_names = [
        "person_a_001.jpg", "person_a_002.jpg", "person_a_003.jpg",
        "person_b_001.jpg", "person_b_002.jpg", "person_c_001.jpg"
    ]

    # The raw directory may not exist yet (e.g. on a fresh checkout); without
    # it touch() below would raise FileNotFoundError.
    paths.data_raw.mkdir(parents=True, exist_ok=True)

    for name in sample_names:
        placeholder_path = paths.data_raw / name
        # Create empty placeholder files for demonstration.
        # NOTE: these are zero-byte files, not decodable images — they only
        # illustrate the expected naming scheme.
        placeholder_path.touch()
        print(f"  üìÑ Created placeholder: {name}")

    print("\nüí° Replace these placeholders with actual image files to test the pipeline!")

## 2. Individual Image Preprocessing

Let's test our preprocessing functions on individual images to understand each step of the pipeline.

In [None]:
# Test preprocessing functions with a sample image
# For demonstration, we'll create a synthetic image if no real images are available

def create_sample_image(size=(224, 224), filename="sample_face.jpg", seed=None):
    """Create (once) a synthetic RGB test image in the raw-data directory.

    If the target file already exists it is left untouched and its path is
    returned, so calling this repeatedly is safe.

    Args:
        size: (height, width) of the image in pixels.
        filename: File name to create under ``paths.data_raw``.
        seed: Optional seed for the noise generator. Pass an int to get the
            same image on every fresh run; ``None`` keeps the image random.

    Returns:
        str: Path of the sample image.

    Raises:
        OSError: If OpenCV reports that the image could not be written.
    """
    sample_path = paths.data_raw / filename

    if not sample_path.exists():
        rng = np.random.default_rng(seed)

        # Random noise background. integers() excludes the upper bound, so
        # 256 is required to cover the full uint8 range 0..255.
        img = rng.integers(0, 256, size=tuple(size) + (3,), dtype=np.uint8)

        # Add some structure: a circular, skin-toned region in the centre
        center_y, center_x = size[0] // 2, size[1] // 2
        y, x = np.ogrid[:size[0], :size[1]]
        mask = (x - center_x) ** 2 + (y - center_y) ** 2 <= (min(size) // 3) ** 2

        # The blend is computed in float (max 255*0.8 + 220*0.2 = 248, so it
        # cannot overflow); cast back to uint8 explicitly rather than relying
        # on the implicit cast during assignment.
        tint = np.array([220, 180, 150])
        img[mask] = (img[mask] * 0.8 + tint * 0.2).astype(np.uint8)

        # Make sure the destination exists, then save. cv2.imwrite reports
        # failure through its return value, so check it instead of claiming
        # success and returning a path that does not exist.
        sample_path.parent.mkdir(parents=True, exist_ok=True)
        written = cv2.imwrite(str(sample_path), cv2.cvtColor(img, cv2.COLOR_RGB2BGR))
        if not written:
            raise OSError(f"Could not write sample image to {sample_path}")
        print(f"‚úÖ Created synthetic sample image: {filename}")

    return str(sample_path)

# Get or create a sample image for testing.
# Candidates exclude synthetic "sample_*" files and zero-byte files: the empty
# placeholders created with touch() in the data-exploration step carry image
# extensions but cannot be decoded, so they must not be treated as real images.
actual_image_files = [
    f for f in DatasetUtils.get_image_files(str(paths.data_raw))
    if not Path(f).name.startswith("sample_") and Path(f).stat().st_size > 0
]

if actual_image_files:
    sample_image_path = actual_image_files[0]
    print(f"üñºÔ∏è  Using actual image: {Path(sample_image_path).name}")
else:
    # No usable real image: fall back to a generated one
    sample_image_path = create_sample_image()
    print(f"üé® Using synthetic sample image for demonstration")

print(f"Sample image path: {sample_image_path}")

In [None]:
# Walk through the preprocessing helpers one at a time on the sample image,
# then run the combined preprocess_image() entry point on the same file.
if not sample_image_path or not os.path.exists(sample_image_path):
    print("‚ö†Ô∏è  Sample image not found or path is invalid")
else:
    try:
        print("üîç Testing individual preprocessing functions...")
        print("=" * 50)

        # --- Step 1: load ---
        print("1. Loading image...")
        original_image = load_image(sample_image_path)
        print("   ‚úÖ Image loaded successfully")
        print(f"   üìä Shape: {original_image.shape}")
        print(f"   üìä Data type: {original_image.dtype}")
        print(f"   üìä Value range: [{original_image.min()}, {original_image.max()}]")

        # --- Step 2: resize ---
        print("\n2. Resizing image...")
        target_size = (160, 160)  # FaceNet input size
        resized_image = resize_image(original_image, target_size)
        print(f"   ‚úÖ Image resized to {target_size}")
        print(f"   üìä New shape: {resized_image.shape}")

        # --- Step 3: normalize ---
        print("\n3. Normalizing image...")
        normalized_image = normalize_image(resized_image, method='facenet')
        print("   ‚úÖ Image normalized using 'facenet' method")
        print(f"   üìä Value range: [{normalized_image.min():.3f}, {normalized_image.max():.3f}]")

        # --- Step 4: full pipeline from the file path ---
        print("\n4. Testing complete pipeline...")
        processed_image = preprocess_image(
            sample_image_path,
            size=target_size,
            normalize_method='facenet',
        )
        print("   ‚úÖ Complete preprocessing pipeline successful")
        print(f"   üìä Final shape: {processed_image.shape}")
        print(f"   üìä Final range: [{processed_image.min():.3f}, {processed_image.max():.3f}]")

    except Exception as e:
        # Report the failure with a full traceback instead of stopping the
        # notebook, so the remaining cells can still be inspected.
        print(f"‚ùå Error in preprocessing: {str(e)}")
        import traceback
        traceback.print_exc()