<a href="https://colab.research.google.com/github/Pavani3005/ViT_assignment/blob/main/q1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Shell escape: recursively delete the /content/sample_data directory
!rm -r '/content/sample_data'

In [None]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from tqdm import tqdm # For a nice progress bar

# Seed PyTorch's global random number generator so runs are repeatable
torch.manual_seed(42)

# Pick the compute device: CUDA when PyTorch reports one, otherwise the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cuda


In [None]:
# --- CONFIGURATION ---
# Input geometry and label space
IMG_SIZE = 32        # side length of the square input images
PATCH_SIZE = 4       # side length of each square patch
NUM_CLASSES = 10     # CIFAR-10 has 10 classes

# Transformer dimensions
EMB_DIM = 512        # width of every token embedding
NUM_HEADS = 8        # attention heads per encoder block
NUM_LAYERS = 6       # number of stacked encoder blocks
HIDDEN_DIM = 2048    # width of the hidden layer inside each MLP
DROPOUT = 0.2        # dropout probability used in the encoder blocks

# --- TRAINING PARAMS ---
BATCH_SIZE = 256     # samples per mini-batch
LEARNING_RATE = 1e-4 # initial optimizer learning rate
NUM_EPOCHS = 100     # full passes over the training set

In [None]:
# --- DATA PREPARATION ---

# Per-channel normalisation statistics shared by the train and test pipelines
norm_mean = [0.4914, 0.4822, 0.4465]
norm_std = [0.2470, 0.2435, 0.2616]

# Training pipeline: random crop/resize, TrivialAugmentWide and horizontal flip,
# followed by tensor conversion and normalisation
train_steps = [
    transforms.RandomResizedCrop(IMG_SIZE, scale=(0.8, 1.0)),
    transforms.TrivialAugmentWide(),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ToTensor(),
    transforms.Normalize(mean=norm_mean, std=norm_std),
]
transform_train = transforms.Compose(train_steps)

# Test pipeline: no augmentation, only tensor conversion and normalisation
test_steps = [
    transforms.ToTensor(),
    transforms.Normalize(mean=norm_mean, std=norm_std),
]
transform_test = transforms.Compose(test_steps)

# CIFAR-10 splits, downloaded into ./data when not already present
dataset_kwargs = dict(root='./data', download=True)
train_dataset = torchvision.datasets.CIFAR10(train=True, transform=transform_train, **dataset_kwargs)
test_dataset = torchvision.datasets.CIFAR10(train=False, transform=transform_test, **dataset_kwargs)

# Batch iterators; only the training split is shuffled
loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=2, pin_memory=True)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

100%|██████████| 170M/170M [00:13<00:00, 12.3MB/s]


In [None]:
# --- MODEL ARCHITECTURE ---

class PatchEmbedding(nn.Module):
    """Converts an image into a sequence of patch embeddings.

    A single strided convolution (kernel size == stride == patch size) cuts the
    image into non-overlapping patches and linearly projects each one.

    Args:
        in_channels: Number of channels of the input images (default 3).
        emb_dim: Size of each patch embedding. ``None`` (default) uses the
            module-level ``EMB_DIM``.
        patch_size: Side length of the square patches. ``None`` (default) uses
            the module-level ``PATCH_SIZE``.
    """
    def __init__(self, in_channels=3, emb_dim=None, patch_size=None):
        super().__init__()
        # Resolve the fallbacks at call time so a bare PatchEmbedding() keeps
        # following the notebook-level configuration.
        if emb_dim is None:
            emb_dim = EMB_DIM
        if patch_size is None:
            patch_size = PATCH_SIZE

        self.patcher = nn.Conv2d(
            in_channels=in_channels,
            out_channels=emb_dim,
            kernel_size=patch_size,
            stride=patch_size
        )

    def forward(self, x):
        """Map images ``[B, C, H, W]`` to patch tokens ``[B, N, emb_dim]``.

        ``N`` is the number of patches, ``(H // P) * (W // P)`` for patch size ``P``.
        """
        # [B, C, H, W] -> [B, emb_dim, H/P, W/P]
        x = self.patcher(x)

        # Collapse the patch grid into one axis, then put it before the channels:
        # [B, emb_dim, H/P, W/P] -> [B, emb_dim, N] -> [B, N, emb_dim]
        return x.flatten(2).transpose(1, 2)

In [None]:
class TransformerEncoder(nn.Module):
    """Pre-norm Transformer encoder block.

    Applies LayerNorm -> multi-head self-attention -> dropout with a residual
    connection, then LayerNorm -> MLP (Linear, GELU, Dropout, Linear, Dropout)
    with a second residual connection.

    Args:
        emb_dim: Token embedding size. ``None`` (default) uses ``EMB_DIM``.
        num_heads: Number of attention heads. ``None`` uses ``NUM_HEADS``.
        hidden_dim: Width of the MLP hidden layer. ``None`` uses ``HIDDEN_DIM``.
        dropout: Dropout probability for attention and MLP. ``None`` uses
            ``DROPOUT``.
    """
    def __init__(self, emb_dim=None, num_heads=None, hidden_dim=None, dropout=None):
        super().__init__()
        # Resolve the fallbacks at call time so a bare TransformerEncoder()
        # keeps following the notebook-level configuration.
        emb_dim = EMB_DIM if emb_dim is None else emb_dim
        num_heads = NUM_HEADS if num_heads is None else num_heads
        hidden_dim = HIDDEN_DIM if hidden_dim is None else hidden_dim
        dropout = DROPOUT if dropout is None else dropout

        # Attention block
        self.norm1 = nn.LayerNorm(emb_dim)
        self.attn = nn.MultiheadAttention(emb_dim, num_heads, dropout=dropout, batch_first=True)
        self.dropout1 = nn.Dropout(dropout)

        # MLP block
        self.norm2 = nn.LayerNorm(emb_dim)
        self.mlp = nn.Sequential(
            nn.Linear(emb_dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, emb_dim),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        """Transform a token sequence ``[B, N, emb_dim]`` into one of the same shape."""
        # Attention block with residual connection.
        # The attention weights are never used, so skip computing them;
        # need_weights=False also lets PyTorch use its fused attention kernel.
        x_norm = self.norm1(x)
        attn_out, _ = self.attn(x_norm, x_norm, x_norm, need_weights=False)
        x = x + self.dropout1(attn_out)

        # MLP block with residual connection
        x = x + self.mlp(self.norm2(x))

        return x

In [None]:
class VisionTransformer(nn.Module):
    """Vision Transformer classifier.

    Pipeline: patch embedding -> prepend a learnable CLS token -> add learnable
    positional embeddings -> stack of encoder blocks -> LayerNorm and a linear
    head applied to the CLS token only.
    """
    def __init__(self):
        super().__init__()
        patches_per_side = IMG_SIZE // PATCH_SIZE
        # One position per patch plus one for the CLS token
        seq_len = patches_per_side ** 2 + 1

        self.patch_embedding = PatchEmbedding()

        # Learnable CLS token, shared by every sample in a batch
        self.cls_token = nn.Parameter(torch.randn(1, 1, EMB_DIM))

        # Learnable positional embeddings, one per sequence position
        self.pos_embedding = nn.Parameter(torch.randn(1, seq_len, EMB_DIM))

        # Encoder stack
        blocks = [TransformerEncoder() for _ in range(NUM_LAYERS)]
        self.encoders = nn.Sequential(*blocks)

        # Classification head
        self.norm = nn.LayerNorm(EMB_DIM)
        self.classifier = nn.Linear(EMB_DIM, NUM_CLASSES)

    def forward(self, x):
        """Return class logits ``[B, NUM_CLASSES]`` for a batch of images."""
        patches = self.patch_embedding(x)

        # Broadcast the single CLS token across the batch and put it first
        batch = patches.shape[0]
        cls = self.cls_token.expand(batch, -1, -1)
        tokens = torch.cat((cls, patches), dim=1)

        # Inject position information, then run the encoder stack
        tokens = tokens + self.pos_embedding
        encoded = self.encoders(tokens)

        # Only the first token (CLS) feeds the classifier
        return self.classifier(self.norm(encoded[:, 0]))

# Build the network, place it on the selected device, and show its layer layout
model = VisionTransformer()
model = model.to(device)
print(model)

VisionTransformer(
  (patch_embedding): PatchEmbedding(
    (patcher): Conv2d(3, 512, kernel_size=(4, 4), stride=(4, 4))
  )
  (encoders): Sequential(
    (0): TransformerEncoder(
      (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
      )
      (dropout1): Dropout(p=0.2, inplace=False)
      (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (mlp): Sequential(
        (0): Linear(in_features=512, out_features=2048, bias=True)
        (1): GELU(approximate='none')
        (2): Dropout(p=0.2, inplace=False)
        (3): Linear(in_features=2048, out_features=512, bias=True)
        (4): Dropout(p=0.2, inplace=False)
      )
    )
    (1): TransformerEncoder(
      (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=5

In [None]:
# --- TRAINING SETUP ---

# Cross-entropy loss over the model's class logits
criterion = nn.CrossEntropyLoss()

# AdamW over every model parameter, with weight decay 0.1
optimizer = optim.AdamW(
    params=model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=0.1,
)

# Cosine annealing of the learning rate with a period of NUM_EPOCHS scheduler steps
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer=optimizer, T_max=NUM_EPOCHS)

# --- TRAINING & EVALUATION FUNCTIONS ---

def train_one_epoch(model, loader, criterion, optimizer, device):
    """Train ``model`` for one pass over ``loader`` and report epoch metrics.

    Args:
        model: Network to optimise; it is switched to training mode here.
        loader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer whose gradients are reset and stepped once per batch.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(epoch_loss, epoch_acc)``: the sample-weighted mean loss and the
        fraction of correctly classified samples over the whole pass.

    Raises:
        ZeroDivisionError: If ``loader`` yields no samples.
    """
    model.train()
    running_loss = 0.0
    correct_preds = 0
    total_samples = 0

    progress_bar = tqdm(loader, desc="Training")
    for inputs, labels in progress_bar:
        # non_blocking lets the host-to-device copy run asynchronously when the
        # batch sits in pinned memory; it is a no-op otherwise.
        inputs = inputs.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        # Statistics: read each scalar from the device exactly once per batch,
        # since every .item() call forces a device-to-host synchronisation.
        batch_loss = loss.item()
        batch_total = labels.size(0)
        batch_correct = (outputs.argmax(dim=1) == labels).sum().item()

        running_loss += batch_loss * inputs.size(0)
        total_samples += batch_total
        correct_preds += batch_correct

        progress_bar.set_postfix(loss=batch_loss, acc=f"{batch_correct/batch_total:.2f}")

    epoch_loss = running_loss / total_samples
    epoch_acc = correct_preds / total_samples
    return epoch_loss, epoch_acc


def evaluate(model, loader, criterion, device):
    """Evaluate ``model`` over ``loader`` without tracking gradients.

    Args:
        model: Network to evaluate; it is switched to evaluation mode here.
        loader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(epoch_loss, epoch_acc)``: the sample-weighted mean loss and the
        fraction of correctly classified samples over the whole pass.

    Raises:
        ZeroDivisionError: If ``loader`` yields no samples.
    """
    model.eval()
    running_loss = 0.0
    correct_preds = 0
    total_samples = 0

    with torch.no_grad():
        progress_bar = tqdm(loader, desc="Evaluating")
        for inputs, labels in progress_bar:
            # non_blocking lets the host-to-device copy run asynchronously when
            # the batch sits in pinned memory; it is a no-op otherwise.
            inputs = inputs.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Read the loss from the device once and reuse it below
            batch_loss = loss.item()
            predicted = outputs.argmax(dim=1)

            running_loss += batch_loss * inputs.size(0)
            total_samples += labels.size(0)
            correct_preds += (predicted == labels).sum().item()

            progress_bar.set_postfix(loss=batch_loss)

    epoch_loss = running_loss / total_samples
    epoch_acc = correct_preds / total_samples
    return epoch_loss, epoch_acc

In [None]:
# --- MAIN TRAINING LOOP ---

# Highest test accuracy observed so far; a checkpoint is written whenever it improves
best_test_acc = 0.0

print("Starting training...")
for epoch in range(NUM_EPOCHS):
    # One optimisation pass over the training set, then a pass over the test set
    train_loss, train_acc = train_one_epoch(model, train_loader, criterion, optimizer, device)
    test_loss, test_acc = evaluate(model, test_loader, criterion, device)

    # Advance the learning-rate schedule once per epoch
    scheduler.step()

    print(
        f"Epoch {epoch+1}/{NUM_EPOCHS} | "
        f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f} | "
        f"Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}"
    )

    # Nothing to checkpoint unless this epoch strictly beat the previous best
    if not test_acc > best_test_acc:
        continue

    best_test_acc = test_acc
    torch.save(model.state_dict(), 'best_vit_cifar10.pth')
    print(f"New best model saved with accuracy: {best_test_acc:.4f}")

print("Training finished!")
print(f"Best Test Accuracy: {best_test_acc:.4f}")

Starting training...


Training: 100%|██████████| 196/196 [02:03<00:00,  1.59it/s, acc=0.25, loss=1.93]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.65it/s, loss=1.56]


Epoch 1/100 | Train Loss: 2.0402, Train Acc: 0.2517 | Test Loss: 1.6046, Test Acc: 0.4280
New best model saved with accuracy: 0.4280


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.40, loss=1.66]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.85it/s, loss=1.23]


Epoch 2/100 | Train Loss: 1.8060, Train Acc: 0.3501 | Test Loss: 1.4088, Test Acc: 0.4993
New best model saved with accuracy: 0.4993


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.36, loss=1.61]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.85it/s, loss=1.26]


Epoch 3/100 | Train Loss: 1.6805, Train Acc: 0.3969 | Test Loss: 1.3382, Test Acc: 0.5233
New best model saved with accuracy: 0.5233


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.46, loss=1.49]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.86it/s, loss=1.06]


Epoch 4/100 | Train Loss: 1.6136, Train Acc: 0.4204 | Test Loss: 1.2938, Test Acc: 0.5372
New best model saved with accuracy: 0.5372


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.49, loss=1.6]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.85it/s, loss=1.11]


Epoch 5/100 | Train Loss: 1.5618, Train Acc: 0.4402 | Test Loss: 1.2215, Test Acc: 0.5566
New best model saved with accuracy: 0.5566


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.46, loss=1.57]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.78it/s, loss=1.14]


Epoch 6/100 | Train Loss: 1.5291, Train Acc: 0.4511 | Test Loss: 1.1845, Test Acc: 0.5736
New best model saved with accuracy: 0.5736


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.54, loss=1.41]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.86it/s, loss=1.08]


Epoch 7/100 | Train Loss: 1.4871, Train Acc: 0.4694 | Test Loss: 1.1879, Test Acc: 0.5781
New best model saved with accuracy: 0.5781


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.44, loss=1.51]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.83it/s, loss=1.05]


Epoch 8/100 | Train Loss: 1.4567, Train Acc: 0.4806 | Test Loss: 1.0937, Test Acc: 0.6059
New best model saved with accuracy: 0.6059


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.46, loss=1.41]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.83it/s, loss=0.96]


Epoch 9/100 | Train Loss: 1.4261, Train Acc: 0.4871 | Test Loss: 1.0919, Test Acc: 0.6126
New best model saved with accuracy: 0.6126


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.47, loss=1.36]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.86it/s, loss=1.19]


Epoch 10/100 | Train Loss: 1.4086, Train Acc: 0.4952 | Test Loss: 1.0648, Test Acc: 0.6204
New best model saved with accuracy: 0.6204


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.50, loss=1.45]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.84it/s, loss=1.03]


Epoch 11/100 | Train Loss: 1.3936, Train Acc: 0.5010 | Test Loss: 1.0349, Test Acc: 0.6257
New best model saved with accuracy: 0.6257


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.45, loss=1.43]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.79it/s, loss=0.959]


Epoch 12/100 | Train Loss: 1.3598, Train Acc: 0.5149 | Test Loss: 1.0098, Test Acc: 0.6395
New best model saved with accuracy: 0.6395


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.45, loss=1.41]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.84it/s, loss=0.946]


Epoch 13/100 | Train Loss: 1.3455, Train Acc: 0.5204 | Test Loss: 0.9988, Test Acc: 0.6432
New best model saved with accuracy: 0.6432


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.60, loss=1.13]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.83it/s, loss=1.02]


Epoch 14/100 | Train Loss: 1.3191, Train Acc: 0.5293 | Test Loss: 1.0115, Test Acc: 0.6431


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.55, loss=1.37]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.83it/s, loss=0.815]


Epoch 15/100 | Train Loss: 1.2956, Train Acc: 0.5360 | Test Loss: 0.9891, Test Acc: 0.6487
New best model saved with accuracy: 0.6487


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.47, loss=1.36]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.84it/s, loss=0.783]


Epoch 16/100 | Train Loss: 1.2826, Train Acc: 0.5422 | Test Loss: 0.9300, Test Acc: 0.6735
New best model saved with accuracy: 0.6735


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.57, loss=1.17]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.84it/s, loss=0.871]


Epoch 17/100 | Train Loss: 1.2645, Train Acc: 0.5497 | Test Loss: 0.9484, Test Acc: 0.6665


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.53, loss=1.52]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.86it/s, loss=0.946]


Epoch 18/100 | Train Loss: 1.2458, Train Acc: 0.5549 | Test Loss: 0.9305, Test Acc: 0.6701


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.50, loss=1.38]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.86it/s, loss=0.774]


Epoch 19/100 | Train Loss: 1.2351, Train Acc: 0.5593 | Test Loss: 0.9279, Test Acc: 0.6795
New best model saved with accuracy: 0.6795


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.54, loss=1.35]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.84it/s, loss=0.728]


Epoch 20/100 | Train Loss: 1.2203, Train Acc: 0.5643 | Test Loss: 0.9431, Test Acc: 0.6708


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.55, loss=1.4]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.86it/s, loss=0.911]


Epoch 21/100 | Train Loss: 1.2039, Train Acc: 0.5718 | Test Loss: 0.9086, Test Acc: 0.6791


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.49, loss=1.32]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.79it/s, loss=0.711]


Epoch 22/100 | Train Loss: 1.1928, Train Acc: 0.5759 | Test Loss: 0.8661, Test Acc: 0.6928
New best model saved with accuracy: 0.6928


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.66, loss=1.02]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.85it/s, loss=0.805]


Epoch 23/100 | Train Loss: 1.1774, Train Acc: 0.5806 | Test Loss: 0.8461, Test Acc: 0.7003
New best model saved with accuracy: 0.7003


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.66, loss=0.963]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.85it/s, loss=0.846]


Epoch 24/100 | Train Loss: 1.1547, Train Acc: 0.5868 | Test Loss: 0.8290, Test Acc: 0.7009
New best model saved with accuracy: 0.7009


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.65, loss=0.94]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.85it/s, loss=0.783]


Epoch 25/100 | Train Loss: 1.1465, Train Acc: 0.5933 | Test Loss: 0.8433, Test Acc: 0.7017
New best model saved with accuracy: 0.7017


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.49, loss=1.36]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.83it/s, loss=0.68]


Epoch 26/100 | Train Loss: 1.1318, Train Acc: 0.5960 | Test Loss: 0.8362, Test Acc: 0.7036
New best model saved with accuracy: 0.7036


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.65, loss=0.968]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.84it/s, loss=0.866]


Epoch 27/100 | Train Loss: 1.1292, Train Acc: 0.5979 | Test Loss: 0.8207, Test Acc: 0.7088
New best model saved with accuracy: 0.7088


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.68, loss=0.941]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.84it/s, loss=0.827]


Epoch 28/100 | Train Loss: 1.1184, Train Acc: 0.6013 | Test Loss: 0.8496, Test Acc: 0.7009


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.61, loss=1.18]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.82it/s, loss=0.759]


Epoch 29/100 | Train Loss: 1.1012, Train Acc: 0.6069 | Test Loss: 0.7991, Test Acc: 0.7187
New best model saved with accuracy: 0.7187


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.60, loss=1.19]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.84it/s, loss=0.568]


Epoch 30/100 | Train Loss: 1.0870, Train Acc: 0.6115 | Test Loss: 0.7903, Test Acc: 0.7223
New best model saved with accuracy: 0.7223


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.68, loss=0.898]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.81it/s, loss=0.724]


Epoch 31/100 | Train Loss: 1.0736, Train Acc: 0.6203 | Test Loss: 0.7958, Test Acc: 0.7208


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.49, loss=1.12]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.77it/s, loss=0.576]


Epoch 32/100 | Train Loss: 1.0732, Train Acc: 0.6188 | Test Loss: 0.7861, Test Acc: 0.7269
New best model saved with accuracy: 0.7269


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.62, loss=0.922]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.85it/s, loss=0.479]


Epoch 33/100 | Train Loss: 1.0539, Train Acc: 0.6255 | Test Loss: 0.7753, Test Acc: 0.7278
New best model saved with accuracy: 0.7278


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.71, loss=0.865]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.83it/s, loss=0.745]


Epoch 34/100 | Train Loss: 1.0486, Train Acc: 0.6278 | Test Loss: 0.7768, Test Acc: 0.7269


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.65, loss=0.969]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.80it/s, loss=0.538]


Epoch 35/100 | Train Loss: 1.0372, Train Acc: 0.6329 | Test Loss: 0.7400, Test Acc: 0.7412
New best model saved with accuracy: 0.7412


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.66, loss=1.1]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.81it/s, loss=0.559]


Epoch 36/100 | Train Loss: 1.0280, Train Acc: 0.6336 | Test Loss: 0.7458, Test Acc: 0.7403


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.66, loss=0.978]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.85it/s, loss=0.514]


Epoch 37/100 | Train Loss: 1.0179, Train Acc: 0.6379 | Test Loss: 0.7501, Test Acc: 0.7432
New best model saved with accuracy: 0.7432


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.64, loss=0.968]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.85it/s, loss=0.561]


Epoch 38/100 | Train Loss: 1.0159, Train Acc: 0.6391 | Test Loss: 0.7432, Test Acc: 0.7445
New best model saved with accuracy: 0.7445


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.64, loss=0.951]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.85it/s, loss=0.516]


Epoch 39/100 | Train Loss: 1.0008, Train Acc: 0.6462 | Test Loss: 0.7226, Test Acc: 0.7500
New best model saved with accuracy: 0.7500


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.59, loss=1.07]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.82it/s, loss=0.452]


Epoch 40/100 | Train Loss: 0.9906, Train Acc: 0.6485 | Test Loss: 0.7191, Test Acc: 0.7474


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.65, loss=1.07]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.85it/s, loss=0.602]


Epoch 41/100 | Train Loss: 0.9818, Train Acc: 0.6498 | Test Loss: 0.7083, Test Acc: 0.7530
New best model saved with accuracy: 0.7530


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.65, loss=0.951]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.82it/s, loss=0.645]


Epoch 42/100 | Train Loss: 0.9740, Train Acc: 0.6539 | Test Loss: 0.7249, Test Acc: 0.7470


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.68, loss=0.929]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.82it/s, loss=0.519]


Epoch 43/100 | Train Loss: 0.9726, Train Acc: 0.6538 | Test Loss: 0.6889, Test Acc: 0.7582
New best model saved with accuracy: 0.7582


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.68, loss=1.05]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.84it/s, loss=0.444]


Epoch 44/100 | Train Loss: 0.9586, Train Acc: 0.6594 | Test Loss: 0.6910, Test Acc: 0.7632
New best model saved with accuracy: 0.7632


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.55, loss=1.08]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.82it/s, loss=0.549]


Epoch 45/100 | Train Loss: 0.9558, Train Acc: 0.6618 | Test Loss: 0.6847, Test Acc: 0.7624


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.69, loss=0.993]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.86it/s, loss=0.437]


Epoch 46/100 | Train Loss: 0.9414, Train Acc: 0.6650 | Test Loss: 0.6809, Test Acc: 0.7657
New best model saved with accuracy: 0.7657


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.64, loss=0.984]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.84it/s, loss=0.396]


Epoch 47/100 | Train Loss: 0.9359, Train Acc: 0.6673 | Test Loss: 0.6785, Test Acc: 0.7620


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.72, loss=0.714]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.84it/s, loss=0.477]


Epoch 48/100 | Train Loss: 0.9305, Train Acc: 0.6704 | Test Loss: 0.6858, Test Acc: 0.7632


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.62, loss=1.09]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.85it/s, loss=0.519]


Epoch 49/100 | Train Loss: 0.9185, Train Acc: 0.6757 | Test Loss: 0.6889, Test Acc: 0.7615


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.72, loss=0.848]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.76it/s, loss=0.48]


Epoch 50/100 | Train Loss: 0.9161, Train Acc: 0.6734 | Test Loss: 0.6698, Test Acc: 0.7680
New best model saved with accuracy: 0.7680


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.59, loss=1.06]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.82it/s, loss=0.559]


Epoch 51/100 | Train Loss: 0.9104, Train Acc: 0.6774 | Test Loss: 0.6885, Test Acc: 0.7643


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.65, loss=1.02]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.83it/s, loss=0.439]


Epoch 52/100 | Train Loss: 0.9005, Train Acc: 0.6805 | Test Loss: 0.6515, Test Acc: 0.7754
New best model saved with accuracy: 0.7754


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.69, loss=0.922]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.83it/s, loss=0.579]


Epoch 53/100 | Train Loss: 0.8948, Train Acc: 0.6831 | Test Loss: 0.6491, Test Acc: 0.7720


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.68, loss=1.01]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.85it/s, loss=0.516]


Epoch 54/100 | Train Loss: 0.8918, Train Acc: 0.6830 | Test Loss: 0.6587, Test Acc: 0.7739


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.68, loss=0.875]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.83it/s, loss=0.616]


Epoch 55/100 | Train Loss: 0.8812, Train Acc: 0.6881 | Test Loss: 0.6483, Test Acc: 0.7768
New best model saved with accuracy: 0.7768


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.75, loss=0.78]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.84it/s, loss=0.534]


Epoch 56/100 | Train Loss: 0.8803, Train Acc: 0.6872 | Test Loss: 0.6558, Test Acc: 0.7748


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.59, loss=0.951]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.77it/s, loss=0.525]


Epoch 57/100 | Train Loss: 0.8685, Train Acc: 0.6913 | Test Loss: 0.6511, Test Acc: 0.7759


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.57, loss=1.3]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.83it/s, loss=0.466]


Epoch 58/100 | Train Loss: 0.8675, Train Acc: 0.6912 | Test Loss: 0.6532, Test Acc: 0.7750


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.61, loss=1.02]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.85it/s, loss=0.529]


Epoch 59/100 | Train Loss: 0.8569, Train Acc: 0.6945 | Test Loss: 0.6158, Test Acc: 0.7872
New best model saved with accuracy: 0.7872


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.72, loss=0.804]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.84it/s, loss=0.531]


Epoch 60/100 | Train Loss: 0.8552, Train Acc: 0.6953 | Test Loss: 0.6376, Test Acc: 0.7810


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.69, loss=0.887]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.81it/s, loss=0.534]


Epoch 61/100 | Train Loss: 0.8515, Train Acc: 0.6983 | Test Loss: 0.6260, Test Acc: 0.7839


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.70, loss=0.846]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.84it/s, loss=0.594]


Epoch 62/100 | Train Loss: 0.8493, Train Acc: 0.6979 | Test Loss: 0.6426, Test Acc: 0.7807


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.72, loss=0.815]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.80it/s, loss=0.431]


Epoch 63/100 | Train Loss: 0.8429, Train Acc: 0.7010 | Test Loss: 0.6356, Test Acc: 0.7843


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.76, loss=0.771]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.85it/s, loss=0.449]


Epoch 64/100 | Train Loss: 0.8353, Train Acc: 0.7028 | Test Loss: 0.6292, Test Acc: 0.7856


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.75, loss=0.736]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.86it/s, loss=0.483]


Epoch 65/100 | Train Loss: 0.8256, Train Acc: 0.7067 | Test Loss: 0.6070, Test Acc: 0.7954
New best model saved with accuracy: 0.7954


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.78, loss=0.565]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.82it/s, loss=0.487]


Epoch 66/100 | Train Loss: 0.8216, Train Acc: 0.7080 | Test Loss: 0.5983, Test Acc: 0.7972
New best model saved with accuracy: 0.7972


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.78, loss=0.792]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.84it/s, loss=0.457]


Epoch 67/100 | Train Loss: 0.8236, Train Acc: 0.7083 | Test Loss: 0.6120, Test Acc: 0.7895


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.71, loss=0.744]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.85it/s, loss=0.439]


Epoch 68/100 | Train Loss: 0.8152, Train Acc: 0.7122 | Test Loss: 0.6015, Test Acc: 0.7948


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.68, loss=0.81]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.84it/s, loss=0.41]


Epoch 69/100 | Train Loss: 0.8117, Train Acc: 0.7114 | Test Loss: 0.6245, Test Acc: 0.7885


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.76, loss=0.683]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.82it/s, loss=0.439]


Epoch 70/100 | Train Loss: 0.8096, Train Acc: 0.7136 | Test Loss: 0.6076, Test Acc: 0.7935


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.79, loss=0.778]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.83it/s, loss=0.533]


Epoch 71/100 | Train Loss: 0.8089, Train Acc: 0.7137 | Test Loss: 0.6021, Test Acc: 0.7956


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.68, loss=0.84]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.84it/s, loss=0.413]


Epoch 72/100 | Train Loss: 0.8027, Train Acc: 0.7166 | Test Loss: 0.6157, Test Acc: 0.7928


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.75, loss=0.773]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.86it/s, loss=0.461]


Epoch 73/100 | Train Loss: 0.7971, Train Acc: 0.7198 | Test Loss: 0.5940, Test Acc: 0.8004
New best model saved with accuracy: 0.8004


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.68, loss=0.929]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.85it/s, loss=0.411]


Epoch 74/100 | Train Loss: 0.8001, Train Acc: 0.7152 | Test Loss: 0.5992, Test Acc: 0.7950


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.64, loss=1.01]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.86it/s, loss=0.34]


Epoch 75/100 | Train Loss: 0.7918, Train Acc: 0.7211 | Test Loss: 0.6029, Test Acc: 0.7998


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.70, loss=0.732]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.86it/s, loss=0.333]


Epoch 76/100 | Train Loss: 0.7857, Train Acc: 0.7231 | Test Loss: 0.6062, Test Acc: 0.7994


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.76, loss=0.707]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.86it/s, loss=0.392]


Epoch 77/100 | Train Loss: 0.7900, Train Acc: 0.7186 | Test Loss: 0.6052, Test Acc: 0.7979


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.68, loss=1.06]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.82it/s, loss=0.4]


Epoch 78/100 | Train Loss: 0.7918, Train Acc: 0.7164 | Test Loss: 0.5902, Test Acc: 0.8000


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.69, loss=0.758]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.86it/s, loss=0.353]


Epoch 79/100 | Train Loss: 0.7784, Train Acc: 0.7235 | Test Loss: 0.5945, Test Acc: 0.8002


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.72, loss=0.73]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.79it/s, loss=0.41]


Epoch 80/100 | Train Loss: 0.7822, Train Acc: 0.7234 | Test Loss: 0.5933, Test Acc: 0.8023
New best model saved with accuracy: 0.8023


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.70, loss=0.87]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.85it/s, loss=0.383]


Epoch 81/100 | Train Loss: 0.7795, Train Acc: 0.7251 | Test Loss: 0.6058, Test Acc: 0.7938


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.69, loss=0.763]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.77it/s, loss=0.379]


Epoch 82/100 | Train Loss: 0.7734, Train Acc: 0.7252 | Test Loss: 0.5953, Test Acc: 0.7989


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.75, loss=0.756]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.86it/s, loss=0.369]


Epoch 83/100 | Train Loss: 0.7768, Train Acc: 0.7226 | Test Loss: 0.5892, Test Acc: 0.7997


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.74, loss=0.741]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.85it/s, loss=0.36]


Epoch 84/100 | Train Loss: 0.7721, Train Acc: 0.7270 | Test Loss: 0.5907, Test Acc: 0.8000


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.68, loss=0.831]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.85it/s, loss=0.385]


Epoch 85/100 | Train Loss: 0.7753, Train Acc: 0.7241 | Test Loss: 0.5902, Test Acc: 0.8011


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.72, loss=0.847]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.84it/s, loss=0.363]


Epoch 86/100 | Train Loss: 0.7637, Train Acc: 0.7271 | Test Loss: 0.5914, Test Acc: 0.8017


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.79, loss=0.675]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.85it/s, loss=0.358]


Epoch 87/100 | Train Loss: 0.7654, Train Acc: 0.7305 | Test Loss: 0.5887, Test Acc: 0.8034
New best model saved with accuracy: 0.8034


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.78, loss=0.691]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.85it/s, loss=0.354]


Epoch 88/100 | Train Loss: 0.7599, Train Acc: 0.7305 | Test Loss: 0.5923, Test Acc: 0.8034


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.71, loss=0.818]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.82it/s, loss=0.362]


Epoch 89/100 | Train Loss: 0.7671, Train Acc: 0.7295 | Test Loss: 0.5843, Test Acc: 0.8064
New best model saved with accuracy: 0.8064


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.74, loss=0.75]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.85it/s, loss=0.353]


Epoch 90/100 | Train Loss: 0.7589, Train Acc: 0.7308 | Test Loss: 0.5874, Test Acc: 0.8051


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.70, loss=0.769]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.86it/s, loss=0.361]


Epoch 91/100 | Train Loss: 0.7641, Train Acc: 0.7306 | Test Loss: 0.5867, Test Acc: 0.8055


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.66, loss=0.852]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.85it/s, loss=0.35]


Epoch 92/100 | Train Loss: 0.7628, Train Acc: 0.7296 | Test Loss: 0.5860, Test Acc: 0.8057


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.74, loss=0.696]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.81it/s, loss=0.354]


Epoch 93/100 | Train Loss: 0.7577, Train Acc: 0.7317 | Test Loss: 0.5869, Test Acc: 0.8055


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.71, loss=0.808]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.82it/s, loss=0.361]


Epoch 94/100 | Train Loss: 0.7637, Train Acc: 0.7289 | Test Loss: 0.5896, Test Acc: 0.8049


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.80, loss=0.588]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.83it/s, loss=0.359]


Epoch 95/100 | Train Loss: 0.7548, Train Acc: 0.7316 | Test Loss: 0.5865, Test Acc: 0.8047


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.72, loss=0.766]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.82it/s, loss=0.359]


Epoch 96/100 | Train Loss: 0.7603, Train Acc: 0.7296 | Test Loss: 0.5863, Test Acc: 0.8041


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.74, loss=0.65]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.84it/s, loss=0.356]


Epoch 97/100 | Train Loss: 0.7607, Train Acc: 0.7307 | Test Loss: 0.5871, Test Acc: 0.8049


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.64, loss=0.854]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.84it/s, loss=0.359]


Epoch 98/100 | Train Loss: 0.7588, Train Acc: 0.7297 | Test Loss: 0.5871, Test Acc: 0.8045


Training: 100%|██████████| 196/196 [02:04<00:00,  1.57it/s, acc=0.68, loss=1]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.85it/s, loss=0.36]


Epoch 99/100 | Train Loss: 0.7562, Train Acc: 0.7338 | Test Loss: 0.5865, Test Acc: 0.8041


Training: 100%|██████████| 196/196 [02:04<00:00,  1.58it/s, acc=0.75, loss=0.677]
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.86it/s, loss=0.36]

Epoch 100/100 | Train Loss: 0.7532, Train Acc: 0.7326 | Test Loss: 0.5866, Test Acc: 0.8041
Training finished!
Best Test Accuracy: 0.8064





# Vision Transformer (ViT) for CIFAR-10 Classification

This notebook implements a Vision Transformer model from scratch in PyTorch to classify images from the CIFAR-10 dataset.

## Overview

The notebook covers the following steps:

1.  **Setup**: Imports the necessary libraries, sets a random seed for reproducibility, and selects the device (GPU if available, otherwise CPU).
2.  **Configuration**: Defines hyperparameters for the model and training process.
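3.  **Data Preparation**: Downloads, transforms, and loads the CIFAR-10 dataset using PyTorch DataLoaders. Data augmentation (random resized crop, `TrivialAugmentWide`, and random horizontal flip) is applied to the training set only; the test set is only converted to tensors and normalized.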
4.  **Model Architecture**: Defines the Vision Transformer architecture, including:
    *   `PatchEmbedding`: Converts input images into a sequence of flattened patches and projects them into a higher-dimensional space.
    *   `TransformerEncoder`: Implements a standard Transformer encoder layer with multi-head attention and an MLP block.
    *   `VisionTransformer`: Combines the patch embedding, learnable CLS token, positional embeddings, and a stack of Transformer encoders with a final classification head.
5.  **Training Setup**: Defines the loss function (Cross-Entropy), optimizer (AdamW), and learning rate scheduler (Cosine Annealing).
6.  **Training & Evaluation Functions**: Implements functions for training one epoch and evaluating the model on the test set.
7.  **Main Training Loop**: Runs the training process for a specified number of epochs, saves the best model based on test accuracy, and prints training and evaluation metrics per epoch.

## Requirements

*   PyTorch
*   Torchvision
*   Tqdm

These libraries are typically pre-installed in Google Colab environments.

## Usage

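1.  **Run all cells**: Execute the cells sequentially from top to bottom. The first cell only removes Colab's `/content/sample_data` directory and can be skipped when running outside Google Colab.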
2.  **Monitor Training**: Observe the per-epoch training and test metrics printed in the output of the training loop cell. The best test accuracy achieved is reported at the end (0.8064 in the run recorded in this notebook, reached at epoch 89 of 100).
3.  **Best Model**: The best-performing model weights are saved to a file named `best_vit_cifar10.pth`.

## Configuration

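You can adjust the model and training hyperparameters in the "Configuration" cell (e.g., `IMG_SIZE`, `PATCH_SIZE`, `NUM_EPOCHS`, `LEARNING_RATE`, etc.) to experiment with different settings. Note that the test transform does not resize images, so if you change `IMG_SIZE` from 32 (CIFAR-10's native resolution) you must also add a matching resize to the test transform; otherwise training and test images will differ in size.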

## Dataset

The notebook uses the CIFAR-10 dataset, which consists of 60,000 32x32 color images in 10 classes, with 6,000 images per class. There are 50,000 training images and 10,000 test images.