# Custom Data Sets

Preprocessing your data for pytorch training

## Pytorch data environments:

PyTorch is an expansive library with domain-specific packages for different problem spaces. Each problem space has its own preferred dataset module.

e.g.:

- Vision: torchvision.datasets 
- Text: torchtext.datasets
- Audio: torchaudio.datasets
...

In [1]:
import torch
from torch import nn

torch.__version__

'2.5.1+cu124'

In [3]:
# Device-agnostic setup: use the GPU when CUDA is usable, otherwise stay on the CPU.

if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cpu'

### Our dataset is a subset of Food101 dataset. 

It consists of 101 classes of food with 1000 images each: (750 training, 250 testing)

We will use 3 classes and 75 training images and 25 testing images.

In [8]:
import requests
import zipfile
from pathlib import Path

# Setup path to data folder
data_path = Path("data/")
image_path = data_path / "pizza_steak_sushi"
zip_path = data_path / "pizza_steak_sushi.zip"

# If the image folder doesn't exist, download it and prepare it...
if image_path.is_dir():
    print(f"{image_path} directory exists.")
else:
    print(f"Did not find {image_path} directory, creating one...")
    # Only the parent folder is created up front. The image folder itself is
    # created right before extraction, so a failed download cannot leave an
    # empty folder behind that would make the next run skip the download.
    data_path.mkdir(parents=True, exist_ok=True)

    # Download pizza, steak, sushi data
    print("Downloading pizza, steak, sushi data...")
    response = requests.get(
        "https://github.com/mrdbourke/pytorch-deep-learning/raw/main/data/pizza_steak_sushi.zip",
        timeout=60,  # fail instead of hanging forever on a stalled connection
    )
    # Stop here on an HTTP error rather than saving an error page as the archive.
    response.raise_for_status()
    zip_path.write_bytes(response.content)

    # Unzip pizza, steak, sushi data
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        print("Unzipping pizza, steak, sushi data...")
        # Reached only once the archive opened successfully.
        image_path.mkdir(parents=True, exist_ok=True)
        zip_ref.extractall(image_path)

Did not find data/pizza_steak_sushi directory, creating one...
Downloading pizza, steak, sushi data...
Unzipping pizza, steak, sushi data...


In [9]:
# Locations of the training and testing splits inside the image folder

train_dir = image_path.joinpath('train')
test_dir = image_path.joinpath('test')

I will not write out all of the code, as this is not exactly relevant to my research, but I will write down observations:

**Observations**
- [path].glob('*/*.txt') for choosing certain files
- random.choice([list]) will give you a random choice in this list
- it's often useful to transform our data:
```python
import torch
from torchvision import datasets, transforms

data_transform = transforms.Compose([
    transforms.transform1(),
    transforms.transform2(),
    ...,
    transforms.ToTensor(),
])
```

- How to change the order of dimensions:

```python
[tensor].permute(i,j,k) # put the ith dimension first, then the jth and then the kth.
```

- Usual workflow is as follows:
    - Obtain data
    - Normalize data and randomly assign it (unnecessary in my case)
    - Split into train and test
    - Put into dataloader for batch sizes: should have (data=[batch size, color channels, height, width], label)
    ```python
    # Run with:
    img, label = next(iter(train_dataloader))
    ```

In [10]:
import os
# Ask the OS how many logical CPUs it reports (os.cpu_count() may return None
# when the count cannot be determined).
os.cpu_count()

40

### Creating our own dataloader class

Note: This is going to be for a classifier :/ 

In [12]:
# Base level of datasets is torch.utils.data.Dataset
# All other dataset classes subclass this class.

import os
import pathlib
import torch

from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms
from typing import Tuple, Dict, List


In [None]:
# Runnable skeleton of a custom dataset. The loading step in __init__ is still
# a placeholder and has to be filled in before this yields real samples.

class myDataLoader(Dataset):
    """Map-style dataset skeleton for a classifier.

    Every entry of ``self.data`` is expected to be a dict with the keys
    ``"data"`` (the sample) and ``"label"`` (its target).

    Args:
        where_data: Location of the stored data. It is kept on the instance;
            the actual loading is not implemented yet.
        transform: Optional callable applied to each sample's data in
            ``__getitem__``.
    """

    def __init__(self, where_data, transform=None):
        self.where_data = where_data
        self.transform = transform
        # TODO: load the records from where_data (e.g. a pickle file).
        self.data = []

    def load_data(self, index: int) -> Dict:
        """Return the raw record stored at position ``index``."""
        return self.data[index]

    # Must overwrite __len__()
    def __len__(self) -> int:
        """Return the number of records currently held."""
        return len(self.data)

    # Must overwrite __getitem__()
    def __getitem__(self, index):
        """Return one ``(data, label)`` pair, transformed if requested.

        This yields a single sample; the batch dimension is added later by
        the DataLoader that wraps the dataset.
        """
        record = self.load_data(index)
        data, label = record["data"], record["label"]

        # Transform if necessary:
        if self.transform is not None:
            data = self.transform(data)
        return data, label
        
        
# Build one dataset object per split.
# NOTE(review): `parameters` is a placeholder that is not defined anywhere in
# the visible notes; pass the real constructor arguments for each split.
train_data_custom = myDataLoader(parameters)
test_data_custom = myDataLoader(parameters)



### Data Augmentation?

- Data augmentation seems to refer to randomly modifying your training data so that the model is exposed to new variations of the same patterns.
    - It lets the model look at the same image in different ways, which may be useful for pattern identification.
    - It helps the model generalize more easily.

In [14]:
# Train_step function:

def train_step(model: torch.nn.Module,
               dataloader: torch.utils.data.DataLoader,
               loss_fn: torch.nn.Module,
               optimizer: torch.optim.Optimizer) -> Tuple[float, float]:
    """Train ``model`` for one full pass over ``dataloader``.

    Each batch is moved to the module-level ``device`` before the forward
    pass, so the model is expected to live on that device already.

    Args:
        model: Network to optimise; it is switched to train mode.
        dataloader: Iterable of ``(X, y)`` batches that also supports ``len()``.
            ``y`` holds class indices, since predictions are compared with it
            after an argmax over dimension 1.
        loss_fn: Callable producing a scalar loss from ``(model(X), y)``.
        optimizer: Optimizer that updates the model's parameters.

    Returns:
        ``(train_loss, train_acc)``: the loss and the accuracy, each averaged
        over the batches.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    num_batches = len(dataloader)
    if num_batches == 0:
        # Without this guard the averaging below fails with a bare ZeroDivisionError.
        raise ValueError("train_step received an empty dataloader")

    # Put model in train mode
    model.train()

    # Running sums over all batches
    train_loss, train_acc = 0.0, 0.0

    for X, y in dataloader:
        # Send data to target device
        X, y = X.to(device), y.to(device)

        # 1. Forward pass
        y_pred = model(X)

        # 2. Calculate and accumulate loss
        loss = loss_fn(y_pred, y)
        train_loss += loss.item()

        # 3. Optimizer zero grad
        optimizer.zero_grad()

        # 4. Loss backward
        loss.backward()

        # 5. Optimizer step
        optimizer.step()

        # Accumulate accuracy. Softmax is monotonic, so the argmax of the raw
        # logits already gives the predicted class (same approach as test_step).
        y_pred_class = y_pred.argmax(dim=1)
        train_acc += (y_pred_class == y).sum().item() / len(y_pred)

    # Average the running sums per batch
    return train_loss / num_batches, train_acc / num_batches

In [None]:
# Test step function:

def test_step(model: torch.nn.Module, 
              dataloader: torch.utils.data.DataLoader, 
              loss_fn: torch.nn.Module):
    # Put model in eval mode
    model.eval() 
    
    # Setup test loss and test accuracy values
    test_loss, test_acc = 0, 0
    
    # Turn on inference context manager
    with torch.inference_mode():
        # Loop through DataLoader batches
        for batch, (X, y) in enumerate(dataloader):
            # Send data to target device
            X, y = X.to(device), y.to(device)
    
            # 1. Forward pass
            test_pred_logits = model(X)

            # 2. Calculate and accumulate loss
            loss = loss_fn(test_pred_logits, y)
            test_loss += loss.item()
            
            # Calculate and accumulate accuracy
            test_pred_labels = test_pred_logits.argmax(dim=1)
            test_acc += ((test_pred_labels == y).sum().item()/len(test_pred_labels))
            
    # Adjust metrics to get average loss and accuracy per batch 
    test_loss = test_loss / len(dataloader)
    test_acc = test_acc / len(dataloader)
    return test_loss, test_acc

In [None]:
# Train function (Combines both steps)

from typing import Optional

from tqdm.auto import tqdm

# 1. Take in various parameters required for training and test steps
def train(model: torch.nn.Module,
          train_dataloader: torch.utils.data.DataLoader,
          test_dataloader: torch.utils.data.DataLoader,
          optimizer: torch.optim.Optimizer,
          loss_fn: Optional[torch.nn.Module] = None,
          epochs: int = 5):
    """Run ``train_step`` and ``test_step`` once per epoch and collect the metrics.

    Args:
        model: Network to train and evaluate.
        train_dataloader: Batches passed to ``train_step``.
        test_dataloader: Batches passed to ``test_step``.
        optimizer: Optimizer passed to ``train_step``.
        loss_fn: Loss passed to both steps. When omitted, a new
            ``nn.CrossEntropyLoss()`` is created for this call.
        epochs: Number of train/test rounds to run.

    Returns:
        Dict with the keys ``"train_loss"``, ``"train_acc"``, ``"test_loss"``
        and ``"test_acc"``, each mapping to a list with one value per epoch.
    """
    # Build the default loss per call instead of once at definition time, so
    # separate calls never share a single module instance.
    if loss_fn is None:
        loss_fn = nn.CrossEntropyLoss()

    # 2. Create empty results dictionary
    results = {"train_loss": [],
               "train_acc": [],
               "test_loss": [],
               "test_acc": []}

    # 3. Loop through training and testing steps for a number of epochs
    for epoch in tqdm(range(epochs)):
        train_loss, train_acc = train_step(model=model,
                                           dataloader=train_dataloader,
                                           loss_fn=loss_fn,
                                           optimizer=optimizer)
        test_loss, test_acc = test_step(model=model,
                                        dataloader=test_dataloader,
                                        loss_fn=loss_fn)

        # 4. Print out what's happening
        print(
            f"Epoch: {epoch+1} | "
            f"train_loss: {train_loss:.4f} | "
            f"train_acc: {train_acc:.4f} | "
            f"test_loss: {test_loss:.4f} | "
            f"test_acc: {test_acc:.4f}"
        )

        # 5. Update results dictionary
        # Tensors are converted to plain Python numbers before being stored.
        epoch_metrics = (("train_loss", train_loss),
                         ("train_acc", train_acc),
                         ("test_loss", test_loss),
                         ("test_acc", test_acc))
        for key, value in epoch_metrics:
            if isinstance(value, torch.Tensor):
                value = value.item()
            results[key].append(value)

    # 6. Return the filled results at the end of the epochs
    return results

In [None]:
# Example usage:

# Seed the CPU and CUDA random number generators
torch.manual_seed(42)
torch.cuda.manual_seed(42)

# How many epochs to train for
NUM_EPOCHS = 5

# Build a fresh TinyVGG, then move it to the target device
model_0 = TinyVGG(
    input_shape=3,  # number of color channels (3 for RGB)
    hidden_units=10,
    output_shape=len(train_data.classes),
)
model_0 = model_0.to(device)

# Loss function and optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_0.parameters(), lr=0.001)

# Take a timestamp right before training starts
from timeit import default_timer as timer
start_time = timer()

# Train model_0
model_0_results = train(
    model=model_0,
    train_dataloader=train_dataloader_simple,
    test_dataloader=test_dataloader_simple,
    optimizer=optimizer,
    loss_fn=loss_fn,
    epochs=NUM_EPOCHS,
)

# Take a second timestamp and report the elapsed time
end_time = timer()
print(f"Total training time: {end_time-start_time:.3f} seconds")