# 03 Data Preprocessing

Create train/val/test splits and data loaders for model training.

**Outputs:**
- Split indices saved to JSON for reproducibility
- DataLoader factory for training notebook

## Configuration

In [3]:
import json
from pathlib import Path
from datetime import datetime

# All configuration in one place.
# Every tunable value used by this notebook lives in this dict so a reader
# only has to look here to change paths, split sizes or loader settings.
config = {
    # Paths (relative to the notebook's working directory)
    'data_dir': Path('../data/CleanPetImages'),
    'output_dir': Path('../outputs'),
    'split_file': Path('../outputs/splits/train_val_test_split.json'),

    # Split ratios (val and test are fractions of the total, train is the remainder)
    'val_ratio': 0.10,
    'test_ratio': 0.10,

    # Iteration subset - use smaller training set for fast experiments
    'use_subset': True,
    'subset_size': 4000,

    # DataLoader settings
    'batch_size': 32,
    'num_workers': 4,
    'pin_memory': True,

    # Image settings
    'image_size': 224,

    # Reproducibility
    'random_seed': 42,
}

# Fail fast on ratios that would leave nothing for training: train is the
# remainder after val and test are taken, so the two must sum to less than 1.
if (min(config['val_ratio'], config['test_ratio']) < 0
        or config['val_ratio'] + config['test_ratio'] >= 1):
    raise ValueError(
        "val_ratio and test_ratio must be non-negative and sum to less than 1, "
        f"got val_ratio={config['val_ratio']} and test_ratio={config['test_ratio']}"
    )

# Create output directories.
# The split directory is derived from 'split_file' itself so that editing
# that path can never leave its parent folder missing.
config['output_dir'].mkdir(parents=True, exist_ok=True)
config['split_file'].parent.mkdir(parents=True, exist_ok=True)

print("Configuration loaded")
print(f"  Data directory: {config['data_dir']}")
if config['use_subset']:
    print(f"  Subset mode: {config['use_subset']} ({config['subset_size']} images)")
else:
    # subset_size is ignored when subset mode is off, so do not report it
    print(f"  Subset mode: {config['use_subset']} (full training set)")

Configuration loaded
  Data directory: ..\data\CleanPetImages
  Subset mode: True (4000 images)


## Load File Paths

Load all image paths from the clean dataset.

In [4]:
def load_file_paths(data_dir, class_names=('Cat', 'Dog')):
    """
    Load all image paths from the class folders (Cat and Dog by default).

    Each class folder is scanned for ``*.jpg`` files. The result lists every
    file of the first class, then every file of the second class, and so on;
    within a class the files are ordered by filename.

    Args:
        data_dir: dataset root that contains one sub-folder per class.
        class_names: sub-folder names to scan. The position of a name in this
            sequence is the integer label assigned to its files.

    Returns:
        files: list of paths relative to data_dir (e.g., 'Cat/123.jpg')
        labels: list of int labels (0=cat, 1=dog with the default classes)
    """
    data_dir = Path(data_dir)

    # Store as relative paths (portable across machines)
    files = []
    labels = []

    for label, class_name in enumerate(class_names):
        # Sort on the filename string rather than on Path objects: Path
        # ordering is case-folded on Windows but case-sensitive on POSIX, so
        # sorting Paths could give a different order on different machines
        # and silently change which file an index refers to.
        names = sorted(p.name for p in data_dir.glob(f'{class_name}/*.jpg'))
        files.extend(f'{class_name}/{name}' for name in names)
        labels.extend([label] * len(names))

    return files, labels


# Scan the dataset directory for image paths and their labels
all_files, all_labels = load_file_paths(config['data_dir'])

# Per-class totals (label 0 = cat, label 1 = dog)
n_cats = all_labels.count(0)
n_dogs = all_labels.count(1)

# Report the overall size followed by the class breakdown
summary_lines = (
    f"Loaded {len(all_files)} images",
    f"  Cats: {n_cats}",
    f"  Dogs: {n_dogs}",
)
for summary_line in summary_lines:
    print(summary_line)

Loaded 24924 images
  Cats: 12456
  Dogs: 12468
