# 3. Preprocessing & Augmentation Pipeline

This notebook builds the data preprocessing pipeline: Albumentations transforms with conservative augmentations suited to medical images, a custom PyTorch `Dataset` with `DataLoader`s for the train/val/test splits, and class weights to compensate for class imbalance.

In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import albumentations as A
from albumentations.pytorch import ToTensorV2
import cv2
import os
import numpy as np
from config import DATASET_PATH, CLASS_NAMES, IMAGE_SIZE, BATCH_SIZE, NUM_WORKERS, RANDOM_SEED

# Set random seeds for reproducibility
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

print("="*60)
print("AUGMENTATION PIPELINE SETUP")
print("="*60)

# Gaussian-noise strength.
# The intended noise variance is 10-30 on the 0-255 pixel scale. `std_range`
# expects a standard deviation expressed as a fraction of the maximum pixel
# value, hence sqrt(variance) / 255 (roughly 0.012-0.021). The noise is applied
# before A.Normalize, i.e. while the image is still on the 0-255 scale.
# The legacy `var_limit` argument (flagged by the warning recorded in this
# cell's output) is not accepted by current Albumentations releases, which
# would silently swap the intended mild noise for the library's much stronger
# default. Requires an Albumentations version that provides `std_range`.
GAUSS_NOISE_STD_RANGE = (10.0 ** 0.5 / 255.0, 30.0 ** 0.5 / 255.0)

# Training augmentations - medical imaging appropriate
# NOTE(review): A.Rotate and A.ShiftScaleRotate both rotate by up to 10 degrees,
# so when both fire the total rotation can exceed the +/-10 degrees reported
# below - confirm this is intended.
train_transform = A.Compose([
    A.Resize(IMAGE_SIZE, IMAGE_SIZE),
    A.HorizontalFlip(p=0.5),
    A.Rotate(limit=10, p=0.5),
    A.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.05, rotate_limit=10, p=0.5),
    A.RandomBrightnessContrast(brightness_limit=0.1, contrast_limit=0.1, p=0.5),
    A.GaussNoise(std_range=GAUSS_NOISE_STD_RANGE, p=0.3),
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ToTensorV2()
])

# Validation/Test augmentations - no data augmentation
val_transform = A.Compose([
    A.Resize(IMAGE_SIZE, IMAGE_SIZE),
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ToTensorV2()
])

# The reported resize dimensions come from IMAGE_SIZE so the summary cannot
# drift from the transforms actually configured above.
print("\n✓ Training Transforms:")
print(f"  • Resize to {IMAGE_SIZE}x{IMAGE_SIZE}")
print("  • Horizontal Flip (p=0.5)")
print("  • Rotation ±10° (p=0.5)")
print("  • Shift/Scale/Rotate (p=0.5)")
print("  • Brightness/Contrast adjustment (p=0.5)")
print("  • Gaussian Noise (p=0.3)")
print("  • ImageNet Normalization")

print("\n✓ Validation/Test Transforms:")
print(f"  • Resize to {IMAGE_SIZE}x{IMAGE_SIZE}")
print("  • ImageNet Normalization only")

print("\n✓ Augmentation pipeline created")

AUGMENTATION PIPELINE SETUP

✓ Training Transforms:
  • Resize to 224x224
  • Horizontal Flip (p=0.5)
  • Rotation ±10° (p=0.5)
  • Shift/Scale/Rotate (p=0.5)
  • Brightness/Contrast adjustment (p=0.5)
  • Gaussian Noise (p=0.3)
  • ImageNet Normalization

✓ Validation/Test Transforms:
  • Resize to 224x224
  • ImageNet Normalization only

✓ Augmentation pipeline created


  original_init(self, **validated_kwargs)
  A.GaussNoise(var_limit=(10.0, 30.0), p=0.3),


## PyTorch Dataset Class

Create a custom PyTorch Dataset class for loading and transforming chest X-ray images.

In [2]:
# Custom Dataset class
class ChestXrayDataset(Dataset):
    """Image dataset laid out as ``root_dir/<split>/<class_name>/<image file>``.

    Every directory named in ``CLASS_NAMES`` is scanned for JPEG/PNG files; an
    image's label is the index of its class within ``CLASS_NAMES``.

    Args:
        root_dir: Directory that contains the split folders.
        split: Name of the split sub-directory to load (default ``'train'``).
        transform: Optional callable invoked as ``transform(image=array)`` whose
            result exposes the transformed image under the ``'image'`` key.
    """

    # Lower-case file extensions that are treated as images.
    IMAGE_EXTENSIONS = ('.jpeg', '.jpg', '.png')

    def __init__(self, root_dir, split='train', transform=None):
        self.root_dir = root_dir
        self.split = split
        self.transform = transform
        self.images = []
        self.labels = []

        # Load image paths and labels
        for class_idx, class_name in enumerate(CLASS_NAMES):
            class_path = os.path.join(root_dir, split, class_name)
            if not os.path.isdir(class_path):
                # A split may legitimately lack a class folder; skip it.
                continue
            # Sorted so that the index -> sample mapping does not depend on the
            # arbitrary order in which the filesystem lists directory entries.
            for img_name in sorted(os.listdir(class_path)):
                # Case-insensitive match so files such as "scan.JPG" are kept.
                if img_name.lower().endswith(self.IMAGE_EXTENSIONS):
                    self.images.append(os.path.join(class_path, img_name))
                    self.labels.append(class_idx)

        print(f"{split.upper()} - Loaded {len(self.images)} images")

    def __len__(self):
        """Return the number of images found for this split."""
        return len(self.images)

    def __getitem__(self, idx):
        """Load one sample.

        Returns:
            ``(image, label)`` where ``image`` is the RGB image (or whatever the
            transform produces from it) and ``label`` is the integer class index.

        Raises:
            OSError: If the image file cannot be read or decoded.
        """
        img_path = self.images[idx]
        image = cv2.imread(img_path)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising;
            # surface the offending path rather than a cryptic cvtColor error.
            raise OSError(f"Failed to read image (missing or corrupt file): {img_path}")
        # OpenCV decodes to BGR channel order; convert to RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        label = self.labels[idx]

        if self.transform:
            augmented = self.transform(image=image)
            image = augmented['image']

        return image, label

print("="*60)
print("DATASET CREATION")
print("="*60)
print()

# Create datasets: augmentation is applied to the training split only, while
# validation and test use the deterministic resize + normalize transform.
train_dataset = ChestXrayDataset(DATASET_PATH, split='train', transform=train_transform)
val_dataset = ChestXrayDataset(DATASET_PATH, split='val', transform=val_transform)
test_dataset = ChestXrayDataset(DATASET_PATH, split='test', transform=val_transform)

# Fail fast instead of reporting success for a split that found no images
# (typically a wrong DATASET_PATH or class folder names that do not match
# CLASS_NAMES); otherwise the problem only surfaces later as an obscure error.
empty_splits = [
    split_name
    for split_name, split_dataset in (
        ('train', train_dataset),
        ('val', val_dataset),
        ('test', test_dataset),
    )
    if len(split_dataset) == 0
]
if empty_splits:
    raise RuntimeError(
        f"No images found for split(s) {empty_splits} under {DATASET_PATH!r}; "
        "check DATASET_PATH and CLASS_NAMES in config."
    )

print("\n✓ All datasets created successfully")

DATASET CREATION

TRAIN - Loaded 5216 images
VAL - Loaded 47 images
TEST - Loaded 624 images

✓ All datasets created successfully


## Class Weights Calculation

Calculate class weights to handle the imbalanced dataset during training.

In [3]:
print("="*60)
print("CLASS WEIGHTS CALCULATION")
print("="*60)

# Calculate class weights for imbalance
train_labels = train_dataset.labels
# minlength guarantees one count per configured class even when the
# highest-indexed class has no training images (bincount would otherwise
# return a shorter array and the weight tensor would have the wrong length).
class_counts = np.bincount(train_labels, minlength=len(CLASS_NAMES))

# A class with zero samples would produce an infinite weight; stop with a clear
# message instead of saving an unusable tensor.
if (class_counts == 0).any():
    missing_classes = [CLASS_NAMES[i] for i in np.flatnonzero(class_counts == 0)]
    raise ValueError(
        f"No training images found for class(es) {missing_classes}; "
        "cannot compute class weights."
    )

# Calculate weights: inverse of frequency
# weight_c = total_samples / (num_classes * count_c), so rarer classes get
# proportionally larger weights.
total_samples = len(train_labels)
class_weights = total_samples / (len(class_counts) * class_counts)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32)

# NOTE(review): the labels printed below assume CLASS_NAMES[0] is the Normal
# class and CLASS_NAMES[1] is Pneumonia - confirm against config.
print(f"\nClass counts in training set:")
print(f"  • Normal (class 0): {class_counts[0]} ({class_counts[0]/total_samples*100:.1f}%)")
print(f"  • Pneumonia (class 1): {class_counts[1]} ({class_counts[1]/total_samples*100:.1f}%)")

print(f"\nClass weights (for loss function):")
print(f"  • Normal (minority): {class_weights[0]:.4f} (HIGHER weight)")
print(f"  • Pneumonia (majority): {class_weights[1]:.4f} (LOWER weight)")

weight_ratio = class_weights[0] / class_weights[1]
print(f"\nImbalance ratio: {class_counts[1]/class_counts[0]:.2f}:1 (Pneumonia:Normal)")
print(f"Weight ratio: {weight_ratio:.2f}:1 (Normal:Pneumonia)")

# Save class weights for model training
torch.save(class_weights_tensor, 'class_weights.pt')
print("\n✓ Class weights saved to 'class_weights.pt'")
# Report the ratio actually computed rather than a hard-coded figure, so the
# message stays correct if the dataset changes.
print(f"✓ Minority class (Normal) receives ~{weight_ratio:.2f}x higher weight")

CLASS WEIGHTS CALCULATION

Class counts in training set:
  • Normal (class 0): 1341 (25.7%)
  • Pneumonia (class 1): 3875 (74.3%)

Class weights (for loss function):
  • Normal (minority): 1.9448 (HIGHER weight)
  • Pneumonia (majority): 0.6730 (LOWER weight)

Imbalance ratio: 2.89:1 (Pneumonia:Normal)
Weight ratio: 2.89:1 (Normal:Pneumonia)

✓ Class weights saved to 'class_weights.pt'
✓ Minority class (Normal) receives ~2.89x higher weight


## DataLoader Creation

Create PyTorch DataLoaders for training, validation, and test sets with appropriate batching and shuffling.

In [4]:
print("=" * 60)
print("DATALOADER CREATION")
print("=" * 60)


def _make_loader(dataset, shuffle):
    """Wrap `dataset` in a DataLoader using the shared batch/worker settings."""
    return DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=shuffle,
        num_workers=NUM_WORKERS,
        pin_memory=False,
    )


def _report_loader(title, loader, shuffle):
    """Print the batch size, batch count and shuffle flag of one loader."""
    print(f"\n✓ {title} DataLoader:")
    print(f"  • Batch size: {BATCH_SIZE}")
    print(f"  • Number of batches: {len(loader)}")
    print(f"  • Shuffle: {shuffle}")


# Only the training loader shuffles; validation and test keep dataset order.
train_loader = _make_loader(train_dataset, shuffle=True)
val_loader = _make_loader(val_dataset, shuffle=False)
test_loader = _make_loader(test_dataset, shuffle=False)

_report_loader("Train", train_loader, True)
_report_loader("Validation", val_loader, False)
_report_loader("Test", test_loader, False)

print("\n✓ All DataLoaders created successfully")

DATALOADER CREATION

✓ Train DataLoader:
  • Batch size: 32
  • Number of batches: 163
  • Shuffle: True

✓ Validation DataLoader:
  • Batch size: 32
  • Number of batches: 2
  • Shuffle: False

✓ Test DataLoader:
  • Batch size: 32
  • Number of batches: 20
  • Shuffle: False

✓ All DataLoaders created successfully


## Test Data Pipeline

Verify the data pipeline by loading a sample batch and checking tensor shapes and properties.

In [7]:
print("=" * 60)
print("TESTING DATA PIPELINE")
print("=" * 60)

# Single-process loader (num_workers=0) used only for this smoke test, which
# sidesteps multiprocessing problems on Windows.
test_train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=0,
    pin_memory=False,
)

print("\n⏳ Loading sample batch...")

# Pull exactly one batch through the full transform pipeline.
sample_batch = next(iter(test_train_loader))
images, labels = sample_batch

print("\n✓ Sample batch loaded successfully")
print("\nBatch properties:")
print(
    f"  • Images shape: {images.shape}\n"
    f"  • Labels shape: {labels.shape}\n"
    f"  • Image dtype: {images.dtype}\n"
    f"  • Label dtype: {labels.dtype}\n"
    f"  • Image value range: [{images.min():.3f}, {images.max():.3f}]\n"
    f"  • Unique labels in batch: {labels.unique().tolist()}"
)

# Verify dimensions: only channels/height/width are compared, the batch
# dimension is shown for information.
expected_shape = (BATCH_SIZE, 3, IMAGE_SIZE, IMAGE_SIZE)
print(f"\n✓ Expected shape: {expected_shape}")
print(f"✓ Actual shape: {images.shape}")

print(
    "✓ Image dimensions correct!"
    if images.shape[1:] == expected_shape[1:]
    else "✗ Image dimensions mismatch!"
)

print("\n" + "=" * 60)
print("NOTEBOOK 3 COMPLETE - preprocessing.ipynb")
print("=" * 60)
print("Next: Create 'model_baseline_cnn.ipynb' for baseline model")

TESTING DATA PIPELINE

⏳ Loading sample batch...

✓ Sample batch loaded successfully

Batch properties:
  • Images shape: torch.Size([32, 3, 224, 224])
  • Labels shape: torch.Size([32])
  • Image dtype: torch.float32
  • Label dtype: torch.int64
  • Image value range: [-2.118, 2.640]
  • Unique labels in batch: [0, 1]

✓ Expected shape: (32, 3, 224, 224)
✓ Actual shape: torch.Size([32, 3, 224, 224])
✓ Image dimensions correct!

NOTEBOOK 3 COMPLETE - preprocessing.ipynb
Next: Create 'model_baseline_cnn.ipynb' for baseline model
