# 03 Data Preprocessing

Create train/val/test splits and data loaders for model training.

**Outputs:**
- Split indices saved to JSON for reproducibility
- DataLoader factory for training notebook

## Configuration

In [3]:
import json
from pathlib import Path
from datetime import datetime

# All configuration in one place
config = {
    # Paths
    'data_dir': Path('../data/CleanPetImages'),
    'output_dir': Path('../outputs'),
    'split_file': Path('../outputs/splits/train_val_test_split.json'),
    
    # Split ratios (val and test are from total, train is remainder)
    'val_ratio': 0.10,
    'test_ratio': 0.10,
    
    # Iteration subset - use smaller training set for fast experiments
    'use_subset': True,
    'subset_size': 4000,
    
    # DataLoader settings
    'batch_size': 32,
    'num_workers': 4,
    'pin_memory': True,
    
    # Image settings
    'image_size': 224,
    
    # Reproducibility
    'random_seed': 42,
}

# Create output directories
config['output_dir'].mkdir(parents=True, exist_ok=True)
(config['output_dir'] / 'splits').mkdir(exist_ok=True)

print("Configuration loaded")
print(f"  Data directory: {config['data_dir']}")
print(f"  Subset mode: {config['use_subset']} ({config['subset_size']} images)")

Configuration loaded
  Data directory: ..\data\CleanPetImages
  Subset mode: True (4000 images)


## Load File Paths

Load all image paths from the clean dataset.

In [4]:
def load_file_paths(data_dir):
    """
    Load all image paths from Cat and Dog folders.
    
    Returns:
        files: list of paths relative to data_dir (e.g., 'Cat/123.jpg')
        labels: list of int labels (0=cat, 1=dog)
    """
    data_dir = Path(data_dir)
    
    cat_files = sorted(data_dir.glob('Cat/*.jpg'))
    dog_files = sorted(data_dir.glob('Dog/*.jpg'))
    
    # Store as relative paths (portable across machines)
    files = []
    labels = []
    
    for f in cat_files:
        files.append(f'Cat/{f.name}')
        labels.append(0)
    
    for f in dog_files:
        files.append(f'Dog/{f.name}')
        labels.append(1)
    
    return files, labels


# Load the dataset
all_files, all_labels = load_file_paths(config['data_dir'])

# Quick summary
n_cats = sum(1 for label in all_labels if label == 0)
n_dogs = sum(1 for label in all_labels if label == 1)

print(f"Loaded {len(all_files)} images")
print(f"  Cats: {n_cats}")
print(f"  Dogs: {n_dogs}")

Loaded 24924 images
  Cats: 12456
  Dogs: 12468


## Train/Val/Test Split

Split the dataset with stratification to maintain class balance.

- Train: 80% (used for model training)
- Val: 10% (used for hyperparameter tuning)  
- Test: 10% (held out for final evaluation)

Val and test sets are fixed across all experiments. Only train set size changes when using subset mode.

In [5]:
from sklearn.model_selection import train_test_split
import numpy as np

def create_splits(files, labels, val_ratio, test_ratio, random_seed):
    """
    Create stratified train/val/test splits.
    
    Args:
        files: list of file paths
        labels: list of labels (0 or 1)
        val_ratio: fraction for validation (e.g., 0.10)
        test_ratio: fraction for test (e.g., 0.10)
        random_seed: for reproducibility
    
    Returns:
        dict with train_files, val_files, test_files, and corresponding labels
    """
    # First split: separate test set
    train_val_files, test_files, train_val_labels, test_labels = train_test_split(
        files, 
        labels,
        test_size=test_ratio,
        stratify=labels,
        random_state=random_seed
    )
    
    # Second split: separate val from train
    # Adjust val_ratio since we're splitting from (1 - test_ratio)
    adjusted_val_ratio = val_ratio / (1 - test_ratio)
    
    train_files, val_files, train_labels, val_labels = train_test_split(
        train_val_files,
        train_val_labels,
        test_size=adjusted_val_ratio,
        stratify=train_val_labels,
        random_state=random_seed
    )
    
    return {
        'train_files': train_files,
        'train_labels': train_labels,
        'val_files': val_files,
        'val_labels': val_labels,
        'test_files': test_files,
        'test_labels': test_labels,
    }


# Create the splits
splits = create_splits(
    all_files, 
    all_labels,
    val_ratio=config['val_ratio'],
    test_ratio=config['test_ratio'],
    random_seed=config['random_seed']
)

print("Split sizes:")
print(f"  Train: {len(splits['train_files'])}")
print(f"  Val:   {len(splits['val_files'])}")
print(f"  Test:  {len(splits['test_files'])}")

Split sizes:
  Train: 19938
  Val:   2493
  Test:  2493


In [6]:
def print_class_distribution(labels, name):
    """Print class balance for a split."""
    labels = np.array(labels)
    n_cats = np.sum(labels == 0)
    n_dogs = np.sum(labels == 1)
    total = len(labels)
    
    print(f"{name}:")
    print(f"  Cats: {n_cats} ({100*n_cats/total:.1f}%)")
    print(f"  Dogs: {n_dogs} ({100*n_dogs/total:.1f}%)")


# Verify stratification maintained class balance
print("Class distribution (should be ~50/50 in each):\n")
print_class_distribution(splits['train_labels'], "Train")
print()
print_class_distribution(splits['val_labels'], "Val")
print()
print_class_distribution(splits['test_labels'], "Test")

Class distribution (should be ~50/50 in each):

Train:
  Cats: 9964 (50.0%)
  Dogs: 9974 (50.0%)

Val:
  Cats: 1246 (50.0%)
  Dogs: 1247 (50.0%)

Test:
  Cats: 1246 (50.0%)
  Dogs: 1247 (50.0%)


## Create Iteration Subset

Select a fixed subset of training data for fast hyperparameter iteration.
The subset is stratified to maintain class balance.

In [7]:
def create_subset_indices(train_labels, subset_size, random_seed):
    """
    Select stratified subset indices from training set.
    
    Args:
        train_labels: labels for training set
        subset_size: number of samples for subset
        random_seed: for reproducibility
    
    Returns:
        list of indices into the training set
    """
    n_train = len(train_labels)
    
    if subset_size >= n_train:
        print(f"Subset size ({subset_size}) >= train size ({n_train}), using all training data")
        return list(range(n_train))
    
    # Use train_test_split to get stratified subset
    all_indices = list(range(n_train))
    subset_ratio = subset_size / n_train
    
    subset_indices, _ = train_test_split(
        all_indices,
        train_size=subset_ratio,
        stratify=train_labels,
        random_state=random_seed
    )
    
    return sorted(subset_indices)


# Create subset indices
subset_indices = create_subset_indices(
    splits['train_labels'],
    config['subset_size'],
    config['random_seed']
)

# Verify subset
subset_labels = [splits['train_labels'][i] for i in subset_indices]
print(f"Subset size: {len(subset_indices)}")
print()
print_class_distribution(subset_labels, "Subset")

Subset size: 3999

Subset:
  Cats: 1998 (50.0%)
  Dogs: 2001 (50.0%)


## Save Splits to JSON

Save all split information for reproducibility across notebooks.

In [8]:
def save_splits(splits, subset_indices, config):
    """Save splits to JSON file."""
    
    split_data = {
        'train_files': splits['train_files'],
        'train_labels': splits['train_labels'],
        'val_files': splits['val_files'],
        'val_labels': splits['val_labels'],
        'test_files': splits['test_files'],
        'test_labels': splits['test_labels'],
        'subset_indices': subset_indices,
        'config': {
            'val_ratio': config['val_ratio'],
            'test_ratio': config['test_ratio'],
            'subset_size': config['subset_size'],
            'random_seed': config['random_seed'],
        },
        'created': datetime.now().isoformat(),
    }
    
    with open(config['split_file'], 'w') as f:
        json.dump(split_data, f, indent=2)
    
    print(f"Splits saved to {config['split_file']}")


# Save
save_splits(splits, subset_indices, config)

Splits saved to ..\outputs\splits\train_val_test_split.json


In [9]:
# Quick verification - reload and check
with open(config['split_file'], 'r') as f:
    loaded = json.load(f)

print("Verification - reloaded from JSON:")
print(f"  Train files: {len(loaded['train_files'])}")
print(f"  Val files:   {len(loaded['val_files'])}")
print(f"  Test files:  {len(loaded['test_files'])}")
print(f"  Subset indices: {len(loaded['subset_indices'])}")
print(f"  Random seed: {loaded['config']['random_seed']}")

Verification - reloaded from JSON:
  Train files: 19938
  Val files:   2493
  Test files:  2493
  Subset indices: 3999
  Random seed: 42
