<a href="https://colab.research.google.com/github/ShovalBenjer/deep_learning_neural_networks/blob/main/ex_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **MNIST Dataset Preparation**

This notebook section preprocesses the MNIST dataset for training, validation, and testing. The workflow includes:
1. **Transformations**: Convert images to tensors and normalize pixel values to [-1, 1] using a predefined transformation pipeline.
2. **Dataset Loading**: Load the MNIST training and testing datasets.
3. **Data Splitting**: Split the training dataset into 90% training and 10% validation subsets.
4. **DataLoader Creation**: Set up batch processing for each subset.

Below are reusable functions with proper docstrings to accomplish each step.


In [None]:
import torch
import torchvision
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split

def get_transforms():
    """
    Build the preprocessing pipeline applied to every MNIST image.

    The pipeline converts an image to a tensor and then normalizes its single
    channel with mean 0.5 and standard deviation 0.5.

    Returns:
        transforms.Compose: The composed transformation pipeline.
    """
    to_tensor = transforms.ToTensor()
    # Single-channel normalization: (x - 0.5) / 0.5.
    normalize = transforms.Normalize((0.5,), (0.5,))
    steps = [to_tensor, normalize]
    return transforms.Compose(steps)

def load_mnist_dataset(transform, root="./data", download=True):
    """
    Load the MNIST dataset for training and testing.

    Args:
        transform (transforms.Compose): Transformation pipeline to apply to the dataset.
        root (str): Directory where the dataset files are stored (and downloaded
            to when ``download`` is True). Defaults to "./data".
        download (bool): Whether to download the dataset if it is not already
            present under ``root``. Defaults to True.

    Returns:
        train_dataset (Dataset): MNIST training dataset.
        test_dataset (Dataset): MNIST testing dataset.
    """
    # Both splits share the same storage location and preprocessing; only the
    # `train` flag differs.
    train_dataset = datasets.MNIST(root=root, train=True, transform=transform, download=download)
    test_dataset = datasets.MNIST(root=root, train=False, transform=transform, download=download)
    return train_dataset, test_dataset

def split_train_validation(dataset, train_ratio=0.9, seed=None):
    """
    Split the training dataset into training and validation subsets.

    Args:
        dataset (Dataset): The full training dataset.
        train_ratio (float): Proportion of the dataset to allocate for training.
            Must lie in the closed interval [0, 1].
        seed (int, optional): Seed for a dedicated random generator so the split
            is reproducible. When None (default), the global torch RNG is used.

    Returns:
        train_subset (Subset): Training subset of the dataset.
        val_subset (Subset): Validation subset of the dataset.

    Raises:
        ValueError: If ``train_ratio`` is outside [0, 1].
    """
    # A ratio outside [0, 1] would yield a negative length, which random_split
    # does not reject and which silently produces a wrong split.
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be between 0 and 1, got {train_ratio!r}")

    total = len(dataset)
    train_size = int(train_ratio * total)
    # The validation subset takes the remainder so the sizes always sum to total.
    val_size = total - train_size
    lengths = [train_size, val_size]

    if seed is None:
        return random_split(dataset, lengths)

    # A local generator keeps the split reproducible without touching global RNG state.
    generator = torch.Generator().manual_seed(seed)
    return random_split(dataset, lengths, generator=generator)

def create_dataloaders(train_subset, val_subset, test_dataset, batch_size=50):
    """
    Create DataLoader objects for training, validation, and testing.

    Only the training loader shuffles its samples. The validation and test
    loaders iterate in a fixed order so evaluation runs are repeatable and do
    not consume random-number-generator state.

    Args:
        train_subset (Subset): Training subset of the dataset.
        val_subset (Subset): Validation subset of the dataset.
        test_dataset (Dataset): Test dataset.
        batch_size (int): Number of samples per batch.

    Returns:
        train_loader (DataLoader): DataLoader for the training subset.
        val_loader (DataLoader): DataLoader for the validation subset.
        test_loader (DataLoader): DataLoader for the test dataset.
    """
    train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
    # Evaluation loaders are not shuffled: sample order does not affect metrics,
    # and a fixed order keeps per-batch results comparable across runs.
    val_loader = DataLoader(val_subset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    return train_loader, val_loader, test_loader

def main():
    """
    Main function to load and preprocess the MNIST dataset, split it into subsets, and create DataLoaders.

    Prints the number of samples in the training, validation, and test sets.

    Returns:
        tuple: ``(train_loader, val_loader, test_loader)`` so the loaders built
        here can be used by the caller instead of being discarded.
    """
    transform = get_transforms()
    train_dataset, test_dataset = load_mnist_dataset(transform)
    train_subset, val_subset = split_train_validation(train_dataset)
    loaders = create_dataloaders(train_subset, val_subset, test_dataset)

    print(f"Training set size: {len(train_subset)}")
    print(f"Validation set size: {len(val_subset)}")
    print(f"Test set size: {len(test_dataset)}")

    return loaders

# Run the pipeline only when this file is executed as the main module, not when it is imported.
if __name__ == "__main__":
    main()
