In [2]:
%pip install torchinfo

Note: you may need to restart the kernel to use updated packages.


In [3]:
import matplotlib.pyplot as plt 
import torch 
import torchvision 

from torch import nn
from torchvision import transforms
# from helper_function import set_seeds

In [4]:
import numpy as np
import random

def set_seeds(seed: int = 42):
    """Seed every random number generator this notebook relies on.

    The same value is handed, in order, to PyTorch's CPU seeding function,
    PyTorch's CUDA seeding function, NumPy's global generator and Python's
    built-in ``random`` module.

    Args:
        seed (int, optional): Value passed to each seeding function. Defaults to 42.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        np.random.seed,
        random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

In [5]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cpu'

In [6]:
# 1. Get pretrained weights for ViT-Base
pretrained_vit_weights = torchvision.models.ViT_B_16_Weights.DEFAULT

# 2. Setup a ViT model with pretrained weights
pretrained_vit = torchvision.models.vit_b_16(weights=pretrained_vit_weights).to(device)

# 3. Freeze the base parameters so that only the replacement head created
#    below (whose parameters are new and trainable by default) gets trained
for parameter in pretrained_vit.parameters():
    parameter.requires_grad = False

# 4. Change the classifier head to match the number of classes in your dataset
class_names = ['dandelion', 'sunflower', 'tulip']

# Seed right before building the head so its random initial weights are reproducible
set_seeds()
# Read the embedding width from the model instead of hard-coding it.
# For ViT-B/16 this is 768 = 16 x 16 x 3 (one flattened 16x16 RGB patch).
pretrained_vit.heads = nn.Linear(in_features=pretrained_vit.hidden_dim,
                                 out_features=len(class_names)).to(device)

In [7]:
from torchinfo import summary

# Print the model summary: per-layer input/output shapes, parameter counts
# and whether each layer is trainable.
summary_columns = ["input_size", "output_size", "num_params", "trainable"]
summary(
    pretrained_vit,
    input_size=(32, 3, 224, 224),  # batch size of 32, 3 channels (RGB), and image size of 224x224
    col_names=summary_columns,
    col_width=20,
    row_settings=["var_names"],
)

Layer (type (var_name))                                      Input Shape          Output Shape         Param #              Trainable
VisionTransformer (VisionTransformer)                        [32, 3, 224, 224]    [32, 3]              768                  Partial
├─Conv2d (conv_proj)                                         [32, 3, 224, 224]    [32, 768, 14, 14]    (590,592)            False
├─Encoder (encoder)                                          [32, 197, 768]       [32, 197, 768]       151,296              False
│    └─Dropout (dropout)                                     [32, 197, 768]       [32, 197, 768]       --                   --
│    └─Sequential (layers)                                   [32, 197, 768]       [32, 197, 768]       --                   False
│    │    └─EncoderBlock (encoder_layer_0)                   [32, 197, 768]       [32, 197, 768]       (7,087,872)          False
│    │    └─EncoderBlock (encoder_layer_1)                   [32, 197, 768]       [32, 

Notice how only the output layer is trainable, whereas all of the other layers are frozen (untrainable).

In [8]:
# Set up the directory paths to the test and train data
test_dir = "flower_dataset/test"
train_dir = "flower_dataset/train"

Remember, if you're going to use a pretrained model, it's generally important to ensure your own custom data is transformed/formatted in the same way as the data the original model was trained on.

In [9]:
# Get automatic transforms from pretrained ViT weights.
# Reusing the preset that ships with the weights keeps our images formatted
# the same way as the data the pretrained model was trained on.
pretrained_vit_transforms = pretrained_vit_weights.transforms()
# Print the preset to inspect its crop/resize sizes, mean/std and interpolation mode
print(pretrained_vit_transforms)

ImageClassification(
    crop_size=[224]
    resize_size=[256]
    mean=[0.485, 0.456, 0.406]
    std=[0.229, 0.224, 0.225]
    interpolation=InterpolationMode.BILINEAR
)



Now that we've got the transforms ready, we can turn our images into DataLoaders using the `create_dataloaders()` function defined in the next cell.

In [10]:
import os

from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# os.cpu_count() may return None when the CPU count cannot be determined;
# fall back to 0 so the DataLoaders load data in the main process instead of
# receiving an invalid worker count.
NUM_WORKERS = os.cpu_count() or 0

def create_dataloaders(
    train_dir: str,
    test_dir: str,
    transform: transforms.Compose,
    batch_size: int,
    num_workers: int = NUM_WORKERS
):
    """Creates training and testing DataLoaders from two image folders.

    Each directory is loaded with ``datasets.ImageFolder`` and wrapped in a
    ``DataLoader``. The training loader shuffles its samples; the testing
    loader does not.

    Args:
        train_dir (str): Path to the training directory.
        test_dir (str): Path to the testing directory.
        transform: Transform applied to every image of both datasets. This
            notebook passes the preset returned by the pretrained weights'
            ``transforms()`` call.
        batch_size (int): Number of samples per batch in each DataLoader.
        num_workers (int, optional): Number of worker processes per
            DataLoader. Defaults to ``NUM_WORKERS``.

    Returns:
        A tuple ``(train_dataloader, test_dataloader, class_names)`` where
        ``class_names`` is the ``classes`` list of the training dataset.
    """
    import warnings

    # Use ImageFolder to create dataset(s)
    train_data = datasets.ImageFolder(train_dir, transform=transform)
    test_data = datasets.ImageFolder(test_dir, transform=transform)

    # Get class names
    class_names = train_data.classes

    # Each ImageFolder derives its label indices from its own sub-folders, so
    # differing class lists would make test labels disagree with train labels.
    # Warn (rather than raise) to keep existing callers working.
    if test_data.classes != class_names:
        warnings.warn(
            f"Train classes {class_names} differ from test classes "
            f"{test_data.classes}; test labels may not line up with training labels."
        )

    # Pinned host memory only speeds up host-to-GPU copies; requesting it on a
    # CPU-only machine has no benefit, so enable it only when CUDA is available.
    pin_memory = torch.cuda.is_available()

    # Turn images into data loaders
    train_dataloader = DataLoader(
        train_data,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        pin_memory=pin_memory,
    )
    test_dataloader = DataLoader(
        test_data,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=pin_memory,
    )

    return train_dataloader, test_dataloader, class_names

In [11]:
# Setup dataloaders
# The batch size could increase if we had more samples, such as here:
# https://arxiv.org/abs/2205.01580 (there are other improvements there too...)
train_dataloader_pretrained, test_dataloader_pretrained, class_names = create_dataloaders(
    train_dir=train_dir,
    test_dir=test_dir,
    transform=pretrained_vit_transforms,
    batch_size=32,
)

To use the `going_modular` module, you need to clone the GitHub repository that contains this code.

In [12]:
# !git clone https://github.com/mrdbourke/pytorch-deep-learning

In [13]:
import sys

# Directory holding the cloned pytorch-deep-learning repository
# (NOTE(review): '/content' looks like a Colab path -- adjust when running elsewhere).
GOING_MODULAR_REPO_DIR = '/content/pytorch-deep-learning'

# Add the path only once so re-running this cell does not keep growing sys.path
if GOING_MODULAR_REPO_DIR not in sys.path:
    sys.path.append(GOING_MODULAR_REPO_DIR)

from going_modular.going_modular import engine

In [14]:
# Training hyperparameters
LEARNING_RATE = 1e-3
NUM_EPOCHS = 10

# Create optimizer and loss function.
# Only parameters that still require gradients are handed to the optimizer;
# the backbone was frozen earlier, so this is just the replaced classifier head.
trainable_parameters = [
    parameter for parameter in pretrained_vit.parameters() if parameter.requires_grad
]
optimizer = torch.optim.Adam(params=trainable_parameters,
                             lr=LEARNING_RATE)
loss_fn = torch.nn.CrossEntropyLoss()

# Train the classifier head of the pretrained ViT feature extractor model
set_seeds()
pretrained_vit_results = engine.train(model=pretrained_vit,
                                      train_dataloader=train_dataloader_pretrained,
                                      test_dataloader=test_dataloader_pretrained,
                                      optimizer=optimizer,
                                      loss_fn=loss_fn,
                                      epochs=NUM_EPOCHS,
                                      device=device)

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch: 1 | train_loss: 0.9643 | train_acc: 0.5804 | test_loss: 1.3734 | test_acc: 0.2760
Epoch: 2 | train_loss: 0.5933 | train_acc: 0.8652 | test_loss: 1.5545 | test_acc: 0.2682
Epoch: 3 | train_loss: 0.4142 | train_acc: 0.8786 | test_loss: 1.6512 | test_acc: 0.2604
Epoch: 4 | train_loss: 0.3145 | train_acc: 0.9054 | test_loss: 1.6668 | test_acc: 0.3000
Epoch: 5 | train_loss: 0.2544 | train_acc: 0.9429 | test_loss: 1.6906 | test_acc: 0.3401
Epoch: 6 | train_loss: 0.2157 | train_acc: 0.9295 | test_loss: 1.7180 | test_acc: 0.3552
Epoch: 7 | train_loss: 0.1867 | train_acc: 0.9491 | test_loss: 1.7482 | test_acc: 0.3786
Epoch: 8 | train_loss: 0.1629 | train_acc: 0.9500 | test_loss: 1.8038 | test_acc: 0.3786
Epoch: 9 | train_loss: 0.1467 | train_acc: 0.9607 | test_loss: 1.8857 | test_acc: 0.3786
Epoch: 10 | train_loss: 0.1299 | train_acc: 0.9812 | test_loss: 1.9240 | test_acc: 0.3630


In [None]:
# # Save the model
# from going_modular.going_modular import save_model
# save_model(model=pretrained_vit,
#            target_dir="models",
#            model_name="pretrained_vit_classifier_head.pth")

: 

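The pretrained ViT performed far better than our custom ViT model trained from scratch (in the same amount of time). Note, however, that in the run above the test accuracy stays near chance level for three classes (about 0.36) while the train accuracy reaches 0.98 and the test loss keeps rising, so the test data and its labels should be checked before relying on this conclusion.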

In [None]:
# Plot the loss curves
# plot_loss_curves is imported from the external helper_functions module,
# which is not defined in this notebook.
from helper_functions import plot_loss_curves

# pretrained_vit_results is the value returned by engine.train() in the training cell
plot_loss_curves(pretrained_vit_results) 

# Prediction:

In [None]:
import requests

# Import function to make predictions on images and plot them 
from going_modular.going_modular.predictions import pred_and_plot_image

# Path of the custom image to classify
custom_image_path = "test_img.jpg"

# Make a prediction on the custom image with the trained model and plot it
pred_and_plot_image(
    model=pretrained_vit,
    class_names=class_names,
    image_path=custom_image_path,
)