# 5 PyTorch Custom Datasets

In [None]:
import pandas as pd
import numpy as np
import torch as tc
from torch import nn
# import torchvision
from torchvision import datasets, transforms
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt

## 5.1 Getting our dataset and becoming one with the data

In [None]:
import os
def walk_through_dir(dir_path):
  """Print a per-directory summary of everything below `dir_path`.

  For each directory visited by `os.walk`, one line is printed with the number
  of immediate sub-directories and the number of files it contains.

  Args:
    dir_path: Root directory to traverse.
  """
  for current_dir, subdirs, files in os.walk(dir_path):
    n_dirs, n_files = len(subdirs), len(files)
    print(f"There are {n_dirs} directories and {n_files} images in {current_dir}")

In [None]:
walk_through_dir(dir_path="DATA")

In [None]:
from pathlib import Path
data_path = Path("DATA/")

In [None]:
# setting up our train and test path
train_dir = data_path / "train"
test_dir = data_path / "test"
train_dir,test_dir

### 5.1.1 Visualising our image

In [None]:
import random
from PIL import Image

# Set seeds. `random.choice` below draws from Python's `random` module, so that
# is the generator that must be seeded for a reproducible pick; the torch seed
# is kept so torch-based randomness in later cells starts from the same state.
random.seed(42)
tc.manual_seed(42)

# 1. Get all the image paths (DATA/<split>/<class>/<image>.jpg).
#    Sorted because glob order depends on the filesystem; a stable order is
#    needed for the seeded choice to select the same image everywhere.
image_path_list = sorted(data_path.glob("*/*/*.jpg"))
# image_path_list

# 2. Pick a random image path
random_image_path = random.choice(image_path_list)
print(random_image_path)

# 3. Get the image class (the name of the directory holding the image)
image_class = random_image_path.parent.stem
print(image_class)

# 4. Open the image
img = Image.open(random_image_path)

# 5. Print metadata
print(f"Random image path: {random_image_path}")
print(f"Image class: {image_class}")
print(f"Image Height: {img.height}")
print(f"Image width: {img.width}")
img

In [None]:
plt.imshow(img)
plt.axis(False)

In [None]:
img_as_array = np.asarray(img)
img_as_array

## 5.2 Transforming our data

### 5.2.1 Transform data with `torchvision.transforms`

In [None]:
data_transform = transforms.Compose([
  # resize the image
  transforms.Resize(size=(64,64)),
  # flip the image
  transforms.RandomHorizontalFlip(p=0.5),
  # Turn image into a torch tensor
  ToTensor()
])

data_transform(img).shape

In [None]:
def plot_transformed_images(images_paths,transform,n=3,seed=None):
  """Plot randomly chosen images next to their transformed versions.

  Samples up to `n` paths from `images_paths`, opens each one with PIL, applies
  `transform`, and draws the original and the transformed image side by side
  in a one-row, two-column figure titled with the image's class.

  Args:
    images_paths: Sequence of image paths; each path's `.parent.stem` is used
      as the class name in the figure title.
    transform: Callable applied to the opened PIL image. Its result must
      support `.permute(1,2,0)` (e.g. a channels-first tensor).
    n: Number of images to sample; clamped to `len(images_paths)`.
    seed: Optional seed for Python's `random` module. `None` leaves the
      generator untouched.
  """
  # Only the seeding is conditional: plotting must happen with or without a
  # seed. `is not None` also lets 0 be used as a seed.
  if seed is not None:
    random.seed(seed)

  # `random.sample` raises if asked for more items than are available.
  sample_size = min(n, len(images_paths))
  random_image_paths = random.sample(images_paths,k=sample_size)

  for image_path in random_image_paths:
    with Image.open(image_path) as f:
      fig,ax = plt.subplots(nrows=1,ncols=2)
      ax[0].imshow(f)
      ax[0].set_title(f"Origin\nSize: {f.size}")
      ax[0].axis(False)

      # Transform and plot the target image. matplotlib expects the channel
      # axis last, so the transformed result is permuted before plotting.
      transformed_image = transform(f).permute(1,2,0)
      ax[1].imshow(transformed_image)
      ax[1].set_title(f"Transformed\nShape: {transformed_image.shape}")
      ax[1].axis(False)

      fig.suptitle(f"Class: {image_path.parent.stem}", fontsize=16)

plot_transformed_images(image_path_list,transform=data_transform,seed=42)

## 5.3 Option 1: Loading image data using `ImageFolder`

In [None]:
# use image folder to create datasets
train_data = datasets.ImageFolder(root=train_dir,transform=data_transform,target_transform=None)
test_data = datasets.ImageFolder(root=test_dir,transform=data_transform,target_transform=None)

train_data,test_data

In [None]:
# getting classnames
train_classnames = train_data.classes
train_classnames

In [None]:
# Index into the training dataset: each sample is an (image, label) pair.
# Fetch it once so the random transform is applied a single time.
img,label = train_data[1]
# img,label
print(f"Image Tensor:\n{img}")
print(f"Image Shape:\n{img.shape}")
print(f"Image Datatype:\n{img.dtype}")
print(f"Image Label:\n{train_classnames[label]}")

In [None]:
# Rearrange the order of dimension
img_permute = img.permute(1,2,0)

# print out different shapes
print(f"Original shape: {img.shape} -> [color_channels, height, width]")
print(f"Image Permute shape: {img_permute.shape} -> [height, width, color_channels]")

# plot the image
plt.figure(figsize=(10,7))
plt.imshow(img_permute)
plt.axis(False)
plt.title(train_classnames[label],fontsize=14)

### 5.3.1 Turn loaded images into `DataLoader`s

In [None]:
import os
os.cpu_count()

In [None]:
from torch.utils.data import DataLoader

BATCH_SIZE = 1
# `os.cpu_count()` may return None; fall back to loading in the main process.
train_dataloader = DataLoader(
    dataset=train_data, batch_size=BATCH_SIZE, num_workers=os.cpu_count() or 0, shuffle=True
)
# The test set is not shuffled: evaluation gains nothing from a random order,
# and a fixed order keeps batches comparable between runs (this matches the
# other test loaders in this notebook).
test_dataloader = DataLoader(
    dataset=test_data, batch_size=BATCH_SIZE, num_workers=os.cpu_count() or 0, shuffle=False
)

len(train_dataloader),len(test_dataloader)

## 5.4 Option 2: Loading image data with a custom `Dataset`

In [None]:
from typing import Tuple,Dict,List
train_data.classes, train_data.class_to_idx

### 5.4.1 Creating a helper function to get classnames

In [None]:
# Set up the target directory
target_directory = train_dir
print(f"Target dir: {target_directory}")

# Get the class names from the target directory: one class per sub-directory.
# Filtering on `is_dir()` keeps stray files (e.g. `.DS_Store`) out of the list,
# the same rule `find_classes` applies.
class_names_found = sorted(entry.name for entry in os.scandir(target_directory) if entry.is_dir())
class_names_found

In [None]:
def find_classes(directory:str) -> Tuple[List[str],Dict[str,int]]:
  """Discover class names from the sub-directories of `directory`.

  Args:
    directory: Path whose immediate sub-directories each represent one class.

  Returns:
    A tuple `(classes, class_to_idx)`: the alphabetically sorted sub-directory
    names, and a dict mapping each name to its position in that list.

  Raises:
    FileNotFoundError: If `directory` contains no sub-directories.
  """
  # Only directories count as classes; plain files are ignored.
  class_names = [item.name for item in os.scandir(directory) if item.is_dir()]
  class_names.sort()

  if len(class_names) == 0:
    raise FileNotFoundError(f"Couldn't find any classes in {directory}...please check the file structure")

  # Labels are assigned in sorted order: first name -> 0, second -> 1, ...
  index_by_class = dict(zip(class_names, range(len(class_names))))
  return class_names, index_by_class

In [None]:
find_classes(target_directory)

### 5.4.2 Create a custom `Dataset` to replicate `ImageFolder`

In [None]:
from torch.utils.data import Dataset
# 1. subclass
class ImageFolderCustom(Dataset):
  """Minimal stand-in for `torchvision.datasets.ImageFolder` limited to `.jpg` files.

  Expects the layout `targ_dir/<class_name>/<image>.jpg`. The class of a sample
  is the name of the directory that contains it.

  Attributes set in `__init__`:
    paths: Sorted list of all `*/*.jpg` paths below `targ_dir`.
    transform: Optional callable applied to each loaded image.
    classes: Sorted class names, as returned by `find_classes`.
    class_to_idx: Mapping from class name to integer label.
  """

  # 2. Initialise the custom dataset
  def __init__(self,targ_dir:str,transform=None):
    # 3. Create class attributes
    # Get all the input image paths. Sorted because glob order depends on the
    # filesystem; a stable order keeps index -> sample reproducible.
    self.paths = sorted(Path(targ_dir).glob("*/*.jpg"))
    # Set up transforms
    self.transform = transform
    # Create classes and class_to_idx
    self.classes,self.class_to_idx = find_classes(targ_dir)

  # 4. Create a function to load images
  def load_image(self,index:int) -> Image.Image:
    """Open the image file at position `index` and return it as a PIL image."""
    return Image.open(self.paths[index])

  # 5. Overwrite __len__()
  def __len__(self) -> int:
    """Return the total number of samples."""
    return len(self.paths)

  # 6. Overwrite __getitem__()
  def __getitem__(self,index:int) -> Tuple[tc.Tensor,int]:
    """Return one sample as `(image, class_index)`.

    The image is passed through `self.transform` when one was given; otherwise
    the PIL image is returned unchanged.
    """
    img = self.load_image(index)
    class_name = self.paths[index].parent.name
    class_idx = self.class_to_idx[class_name]

    # Compare against None explicitly: a callable container of transforms may
    # define `__len__` and be falsy while still being a valid transform.
    if self.transform is not None:
      return self.transform(img),class_idx
    return img,class_idx

In [None]:
# 
train_transforms = transforms.Compose([
  transforms.Resize(size=(64,64)),
  transforms.RandomHorizontalFlip(p=0.5),
  transforms.ToTensor()
])

test_transforms = transforms.Compose([
  transforms.Resize(size=(64,64)),
  transforms.ToTensor()
])

In [None]:
# Test out ImageFolderCustom
train_data_custom = ImageFolderCustom(targ_dir=train_dir,transform=train_transforms)
# The test split must be read from `test_dir`; reading `train_dir` here would
# make every "test" result a measurement on the training images.
test_data_custom = ImageFolderCustom(targ_dir=test_dir,transform=test_transforms)
train_data_custom,test_data_custom

In [None]:
# check for equality between original ImageFolder Dataset and ImageFolderCustomDataset
print(train_data_custom.classes == train_data.classes)
print(test_data_custom.classes == test_data.classes)

### 5.4.3 Create a custom function to display random images

In [None]:
# 1. create a function to take in a dataset
def display_random_images(dataset:tc.utils.data.Dataset,classes:List[str] = None,n:int = 10,display_shape:bool = True,seed:int = None):
  """Plot up to `n` randomly chosen samples from `dataset` in a single row.

  Args:
    dataset: Indexable dataset whose items are `(image, label)` pairs, where
      `image` supports `.permute(1,2,0)` (e.g. a channels-first tensor).
    classes: Optional list of class names indexed by label. When given, the
      class name is shown in each subplot title.
    n: Number of images to show. Values above 10 are reduced to 10 (and the
      shape display is switched off); `n` is also clamped to `len(dataset)`.
    display_shape: Whether to include the plotted image's shape in the title.
    seed: Optional seed for Python's `random` module. `None` leaves the
      generator untouched.
  """
  # 2. Adjust display
  if n > 10:
    n = 10
    display_shape = False
    print(f"For display purposes, n shouldn't be larger than 10, setting to 10 and removing shape display.")

  # `random.sample` raises if asked for more items than the dataset holds.
  n = min(n,len(dataset))

  # 3. Set the random seed (`is not None` so that 0 is a usable seed)
  if seed is not None:
    random.seed(seed)

  # 4. Get random indexes
  random_samples_idx = random.sample(range(len(dataset)),k=n)

  # 5. Set up the plot
  plt.figure(figsize=(16,10))

  # 6. Loop through the random sample images
  for i,targ_sample in enumerate(random_samples_idx):
    # Fetch the sample once: indexing twice would load and transform it twice.
    targ_image,targ_label = dataset[targ_sample]

    # 7. Adjust tensor dimensions for plotting (channel axis moved last)
    targ_image_adjust = targ_image.permute(1,2,0)

    # Plot adjusted samples
    plt.subplot(1,n,i+1)
    plt.imshow(targ_image_adjust)
    plt.axis(False)

    # Build the title from whichever parts are available, so that calling the
    # function without `classes` no longer hits an unbound `title`.
    title_lines = []
    if classes:
      title_lines.append(f"classes: {classes[targ_label]}")
    if display_shape:
      title_lines.append(f"Shape: {targ_image_adjust.shape}")
    if title_lines:
      plt.title("\n".join(title_lines))
    

In [None]:
# 
display_random_images(train_data,n=4,classes=train_classnames,seed=42)

In [None]:
display_random_images(train_data_custom, n=4, classes=train_classnames, seed=42)

## 5.5 Turn custom loaded images into `DataLoader`s

In [None]:
BATCH_SIZE = 32
train_dataloader_custom = DataLoader(
    dataset=train_data_custom, batch_size=BATCH_SIZE, num_workers=0, shuffle=True
)
test_dataloader_custom = DataLoader(
    dataset=test_data_custom,
    batch_size=BATCH_SIZE,
    num_workers=0,
    shuffle=False,
)

train_dataloader_custom, test_dataloader_custom

In [None]:
# Get one batch of images and labels from the custom training DataLoader
img_custom, lable_custom = next(iter(train_dataloader_custom))

# Print out the shapes
img_custom.shape, lable_custom.shape

## 5.6 Other forms of transforms (data augmentation)

In [None]:
train_transform = transforms.Compose([
  transforms.Resize(size=(224,224)),
  transforms.TrivialAugmentWide(num_magnitude_bins=5),
  transforms.ToTensor()
])

test_transform = transforms.Compose([
  transforms.Resize(size=(224,224)),
  transforms.ToTensor()
])

In [None]:
image_path_list[:10]

In [None]:
# plot random transformed images
plot_transformed_images(images_paths=image_path_list,transform=train_transform,n=3,seed=42)

## 5.7 Model 0: TinyVGG without data augmentation

### 5.7.1 Creating transforms and loading data for Model 0

In [None]:
# create simple transform
simple_transform = transforms.Compose([
  transforms.Resize(size=(64,64)),
  transforms.ToTensor()
])

In [None]:
# 1. load and transform data
train_data_simple = datasets.ImageFolder(root=train_dir, transform=simple_transform)
test_data_simple = datasets.ImageFolder(root=test_dir, transform=simple_transform)

# 2. Train dataset into data loader
train_dataloader_simple = DataLoader(
    dataset=train_data_simple,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=os.cpu_count(),
)
test_dataloader_simple = DataLoader(
    dataset=test_data_simple, batch_size=BATCH_SIZE, num_workers=os.cpu_count()
)

### 5.7.2 Create TinyVGG class

In [None]:
class TinyVGG(nn.Module):
    """Model architecture copying TinyVGG from CNN Explainer.

    Two convolutional blocks (conv -> ReLU -> conv -> ReLU -> 2x2 max-pool)
    followed by a flatten + linear classifier that outputs raw logits.

    Args:
        input_shape: Number of channels of the input images.
        hidden_units: Number of channels produced by every convolution.
        output_shape: Number of output classes (logits per sample).

    Note:
        The classifier's `in_features` is `hidden_units * 16 * 16`, so inputs
        must be 64x64: the padded 3x3 convolutions keep height and width,
        and each of the two max-pools halves them (64 -> 32 -> 16).
    """

    def __init__(self, input_shape: int, hidden_units: int, output_shape: int) -> None:
        super().__init__()
        # Blocks are created in the same order and with the same layer layout
        # as before, so parameter names and seeded initialisation are unchanged.
        self.conv_block_1 = self._conv_block(input_shape, hidden_units)
        self.conv_block_2 = self._conv_block(hidden_units, hidden_units)
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features=hidden_units * 16 * 16, out_features=output_shape),
        )

    @staticmethod
    def _conv_block(in_channels: int, out_channels: int) -> nn.Sequential:
        """Build one conv -> ReLU -> conv -> ReLU -> max-pool block."""
        return nn.Sequential(
            nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=3,
                stride=1,
                padding=1,
            ),
            nn.ReLU(),
            nn.Conv2d(
                in_channels=out_channels,
                out_channels=out_channels,
                kernel_size=3,
                stride=1,
                padding=1,
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

    def forward(self, x):
        """Return class logits of shape (batch, output_shape) for a batch `x`."""
        # No shape printing here: forward runs on every batch, so debug output
        # would flood stdout during training and evaluation.
        return self.classifier(self.conv_block_2(self.conv_block_1(x)))

In [None]:
# 
tc.manual_seed(42)
model0 = TinyVGG(input_shape=3,hidden_units=10,output_shape=len(train_classnames))
model0

### 5.7.3 Trying a forward pass on a single image

In [None]:
# get a single image batch
image_batch, label_batch = next(iter(train_dataloader_simple))
image_batch.shape,label_batch.shape

In [None]:
# Try a forward pass
model0(image_batch)

In [None]:
# from torch 