# Vision Transformer Training

In [3]:
import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split

from utils import EarlyStopper
from models import SimplifiedVisionTransformer

torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters
batch_size = 512
learning_rate = 1e-3
hidden_channels = 64
num_transformer_layers = 2
num_heads = 8
mlp_ratio = 4

In [4]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:False"

In [5]:
transform = transforms.Compose([
  transforms.ToTensor(),
  transforms.Normalize((0.5,), (0.5,))
])

full_train_dataset = datasets.FashionMNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.FashionMNIST(root='./data', train=False, download=True, transform=transform)

train_size = int(0.8 * len(full_train_dataset))
val_size = len(full_train_dataset) - train_size
train_dataset, val_dataset = random_split(full_train_dataset, [train_size, val_size])

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

In [6]:
print(next(iter(train_dataloader))[0].shape)
print(next(iter(val_dataloader))[0].shape)
print(next(iter(test_dataloader))[0].shape)

torch.Size([512, 1, 28, 28])


torch.Size([512, 1, 28, 28])
torch.Size([512, 1, 28, 28])


### Load Vision Transformer, Loss & Optimizer

In [7]:
model = SimplifiedVisionTransformer(
  in_channels=1,
  hidden_channels=hidden_channels,
  out_channels=10,
  num_transformer_layers=num_transformer_layers,
  num_heads=num_heads,
  mlp_ratio=mlp_ratio,
  dropout=0.1,
  image_height=28,
  image_width=28
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [8]:
output = model(next(iter(train_dataloader))[0].to(device))
print(output.shape)
print(output)

OutOfMemoryError: CUDA out of memory. Tried to allocate 9.38 GiB. GPU 0 has a total capacity of 14.75 GiB of which 4.74 GiB is free. Process 1935553 has 10.00 GiB memory in use. Of the allocated memory 9.87 GiB is allocated by PyTorch, and 11.75 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [6]:
num_epochs = 1000
training_losses = []
training_accuracies = []
validation_losses = []
validation_accuracies = []
early_stopper = EarlyStopper(patience=10)

for epoch in tqdm.tqdm(range(num_epochs), desc="Training..."):
  model.train()

  epoch_loss = 0.0
  epoch_correct = 0
  epoch_total = 0

  # Training loop
  for index, (x, y) in enumerate(train_dataloader):
    x, y = x.to(device), y.to(device)
    optimizer.zero_grad()
    output = model(x.float()) # (batch_size, num_channels, height, width) -> (batch_size, num_channels)

    loss = criterion(output, y)
    loss.backward()
    optimizer.step()

    epoch_loss += loss.item()
    predictions = output.argmax(dim=1, keepdim=True)
    epoch_correct += predictions.eq(y.view_as(predictions)).sum().item()
    epoch_total += y.size(0)

    if index % 100 == 0:
      print(f"Epoch {epoch} | Batch {index}/{len(train_dataloader)} | Loss: {loss.item():.6f}")

  training_losses.append(epoch_loss / len(train_dataloader))
  training_accuracies.append(epoch_correct / epoch_total)

  # Validation
  model.eval()
  val_loss = 0.0
  val_correct = 0
  val_total = 0

  with torch.no_grad():
    for x, y in val_dataloader:
      output = model(x.float())
      loss = criterion(output, y)

      val_loss += loss.item()
      predictions = output.argmax(dim=1, keepdim=True)
      val_correct += predictions.eq(y.view_as(predictions)).sum().item()
      val_total += y.size(0)

  validation_losses.append(val_loss / len(val_dataloader))
  validation_accuracies.append(val_correct / val_total)

  if early_stopper.early_stop(validation_losses[-1]):
    print("Stopping Early")
    break


Training...:   0%|          | 0/1000 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 9.38 GiB. GPU 0 has a total capacity of 14.75 GiB of which 4.74 GiB is free. Process 903950 has 10.01 GiB memory in use. Of the allocated memory 9.87 GiB is allocated by PyTorch, and 17.75 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Test the model using the test_dataloader
model.eval()
output = model(next(iter(test_dataloader)).to(device))
print(f"Predictions: {output}")
print(f"Test data: {next(iter(test_dataloader))}")