# 02. DataLoader


Official documentation: https://docs.pytorch.org/docs/stable/data.html

DataLoader class:
* https://docs.pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader


In [36]:
import time

from multiprocessing import freeze_support

import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import v2

In [None]:
class MyDataset(Dataset):
    """Toy map-style dataset: the integers 1..5, each labelled "odd" or "even"."""

    def __init__(self):
        self.data = list(range(1, 6))
        # One label is derived per sample, so data and labels always have the same length
        self.labels = ["odd" if value % 2 else "even" for value in self.data]

    def __len__(self) -> int:
        """Return the number of samples (DataLoader uses this to know the dataset size)."""
        return len(self.data)

    def __getitem__(self, i: int):
        """Return the (sample, label) pair stored at index i."""
        sample = self.data[i]
        label = self.labels[i]
        return sample, label

In [None]:
# The batch size controls the number of samples per batch.
# The dataset is instantiated here so the cell runs on a fresh kernel
# without relying on a variable left over from an earlier session.
dataset = MyDataset()

# 2 and 3 do not divide the dataset size (5), so their last batch is smaller;
# 5 yields the whole dataset as a single batch.
for batch_size in [2, 3, 5]:
    print("Batch size", batch_size)
    dataloader = DataLoader(dataset, batch_size=batch_size)
    # Iterating the same DataLoader again starts a new pass (epoch) over the data
    for epoch in range(2):
        print("\tEpoch", epoch)
        for data, label in dataloader:
            print("\t\t", epoch, "Batch", data, label)
    print()

# Observe that the last batch is smaller if the dataset size is not a multiple of the batch size

Batch size 2
	Epoch 0
		 0 Batch tensor([1, 2]) ('odd', 'even')
		 0 Batch tensor([3, 4]) ('odd', 'even')
		 0 Batch tensor([5]) ('odd',)
	Epoch 1
		 1 Batch tensor([1, 2]) ('odd', 'even')
		 1 Batch tensor([3, 4]) ('odd', 'even')
		 1 Batch tensor([5]) ('odd',)

Batch size 3
	Epoch 0
		 0 Batch tensor([1, 2, 3]) ('odd', 'even', 'odd')
		 0 Batch tensor([4, 5]) ('even', 'odd')
	Epoch 1
		 1 Batch tensor([1, 2, 3]) ('odd', 'even', 'odd')
		 1 Batch tensor([4, 5]) ('even', 'odd')

Batch size 5
	Epoch 0
		 0 Batch tensor([1, 2, 3, 4, 5]) ('odd', 'even', 'odd', 'even', 'odd')
	Epoch 1
		 1 Batch tensor([1, 2, 3, 4, 5]) ('odd', 'even', 'odd', 'even', 'odd')



In [31]:
# drop_last=True discards the final batch when it holds fewer than batch_size samples,
# so every batch has the same size (here the leftover sample 5 is never yielded)
dataloader = DataLoader(dataset, batch_size=2, drop_last=True)
for epoch in range(2):
    print("Epoch", epoch)
    for data, label in dataloader:
        print("\t", epoch, "Batch", data, label)

Epoch 0
	 0 Batch tensor([1, 2]) ('odd', 'even')
	 0 Batch tensor([3, 4]) ('odd', 'even')
Epoch 1
	 1 Batch tensor([1, 2]) ('odd', 'even')
	 1 Batch tensor([3, 4]) ('odd', 'even')


In [30]:
# shuffle=True draws the samples in a new random order at every epoch;
# each label still travels with its sample, and the last batch may still be smaller
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)
for epoch in range(2):
    print("Epoch", epoch)
    for data, label in dataloader:
        print("\t", epoch, "Batch", data, label)

Epoch 0
	 0 Batch tensor([3, 1]) ('odd', 'odd')
	 0 Batch tensor([4, 2]) ('even', 'even')
	 0 Batch tensor([5]) ('odd',)
Epoch 1
	 1 Batch tensor([5, 3]) ('odd', 'odd')
	 1 Batch tensor([4, 2]) ('even', 'even')
	 1 Batch tensor([1]) ('odd',)


---

## Parallel data loading

* See https://docs.pytorch.org/docs/stable/data.html#multi-process-data-loading
* See [02_parallel_dataloader.py](./02_parallel_dataloader.py)

Possible output:
```
Loading item 0 in main process
Loading item 1 in main process
Loading item 2 in main process
Loading item 3 in main process
load_data(0) -> total time: 8.007728800s
Worker 0/2 is loading item 0
Worker 1/2 is loading item 1
Worker 0/2 is loading item 2
Worker 1/2 is loading item 3
load_data(2) -> total time: 7.586626900s
Worker 2/4 is loading item 2
Worker 3/4 is loading item 3
Worker 0/4 is loading item 0
Worker 1/4 is loading item 1
load_data(4) -> total time: 7.876157100s
Worker 0/8 is loading item 0
Worker 1/8 is loading item 1
Worker 2/8 is loading item 2
Worker 3/8 is loading item 3
load_data(8) -> total time: 8.691724500s

num_workers: 0, time: 8.0077288 seconds
num_workers: 2, time: 7.5866269 seconds
num_workers: 4, time: 7.8761571 seconds
num_workers: 8, time: 8.6917245 seconds
Speedup: 1.0555058137892612, 1.0167050629297376, 0.9213049493227726
```