# Question 1

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import copy
import json
import os

In [2]:
class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention: ``softmax(q @ k^T / sqrt(d_k)) @ v``.

    The softmax is taken over the last axis of the score matrix, i.e. over
    the key positions.
    """

    def __init__(self):
        super().__init__()
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` over ``k`` / ``v``.

        Args:
            q, k, v: Tensors whose last axis holds features and whose
                second-to-last axis holds sequence positions (``k`` is
                transposed on those two axes before the matmul).
            mask: Optional tensor broadcastable to the score matrix.
                Positions where ``mask == 0`` receive no attention weight.

        Returns:
            Tuple ``(output, attention)``: the attention-weighted sum of
            ``v`` and the softmax weights themselves.
        """
        d_k = k.size(-1)
        scores = torch.matmul(q, k.transpose(-2, -1)) / np.sqrt(d_k)
        if mask is not None:
            # Fill with the most negative finite value of the score dtype.
            # A hard-coded -1e9 cannot be represented in float16, so
            # masked_fill would raise when running in half precision.
            fill_value = torch.finfo(scores.dtype).min
            scores = scores.masked_fill(mask == 0, fill_value)
        attention = self.softmax(scores)
        output = torch.matmul(attention, v)
        return output, attention

class MultiHeadAttention(nn.Module):
    """Multi-head attention built on ``ScaledDotProductAttention``.

    Queries, keys and values are each projected with their own linear layer,
    split into ``n_heads`` chunks of ``head_dim = embed_dim // n_heads``
    features, attended per head, re-joined and passed through a final linear
    projection.
    """

    def __init__(self, embed_dim, n_heads):
        """
        Args:
            embed_dim: Feature width of the inputs and outputs.
            n_heads: Number of attention heads; must be positive and divide
                ``embed_dim`` exactly.

        Raises:
            ValueError: If ``n_heads`` is not positive or does not divide
                ``embed_dim``.
        """
        super().__init__()
        # Fail fast: with a non-divisible pair the floor division below loses
        # features, and the view(..., -1, n_heads, head_dim) in forward() can
        # then either error obscurely or infer a wrong sequence length.
        if n_heads <= 0 or embed_dim % n_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by a positive "
                f"n_heads ({n_heads})"
            )

        self.embed_dim = embed_dim
        self.n_heads = n_heads
        self.head_dim = embed_dim // n_heads

        self.q_linear = nn.Linear(embed_dim, embed_dim)
        self.k_linear = nn.Linear(embed_dim, embed_dim)
        self.v_linear = nn.Linear(embed_dim, embed_dim)

        self.attention = ScaledDotProductAttention()
        self.out_linear = nn.Linear(embed_dim, embed_dim)

    def forward(self, q, k, v, mask=None):
        """Run multi-head attention.

        Args:
            q, k, v: Tensors whose first axis is the batch and whose last
                axis has ``embed_dim`` features.
            mask: Optional mask forwarded unchanged to the per-head
                attention, where scores have shape
                ``(batch, n_heads, q_len, k_len)``.

        Returns:
            Tuple ``(output, attention)``: ``output`` has shape
            ``(batch, q_len, embed_dim)``; ``attention`` holds the per-head
            softmax weights.
        """
        batch_size = q.size(0)

        # (batch, seq, embed) -> (batch, n_heads, seq, head_dim)
        q = self.q_linear(q).view(batch_size, -1, self.n_heads, self.head_dim).transpose(1, 2)
        k = self.k_linear(k).view(batch_size, -1, self.n_heads, self.head_dim).transpose(1, 2)
        v = self.v_linear(v).view(batch_size, -1, self.n_heads, self.head_dim).transpose(1, 2)

        output, attention = self.attention(q, k, v, mask)

        # Merge the heads back: (batch, n_heads, seq, head_dim) -> (batch, seq, embed)
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.embed_dim)
        output = self.out_linear(output)

        return output, attention

class MLPBlock(nn.Module):
    """Two-layer feed-forward block: Linear -> GELU -> Dropout -> Linear -> Dropout.

    The first layer maps ``embed_dim`` features to ``mlp_dim``; the second
    maps back to ``embed_dim``. A single ``nn.Dropout`` module is applied at
    both dropout points.
    """

    def __init__(self, embed_dim, mlp_dim, dropout=0.1):
        super().__init__()
        self.fc1 = nn.Linear(embed_dim, mlp_dim)
        self.fc2 = nn.Linear(mlp_dim, embed_dim)
        self.act = nn.GELU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the feed-forward block to the last axis of ``x``."""
        hidden = self.dropout(self.act(self.fc1(x)))
        return self.dropout(self.fc2(hidden))

class TransformerEncoderLayer(nn.Module):
    """One encoder block: self-attention and an MLP, each in a residual branch.

    LayerNorm is applied *after* each residual addition (post-norm), and the
    same ``nn.Dropout`` module is applied to both branch outputs before they
    are added back.
    """

    def __init__(self, embed_dim, n_heads, mlp_dim, dropout=0.1):
        super().__init__()
        self.attention = MultiHeadAttention(embed_dim, n_heads)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.mlp = MLPBlock(embed_dim, mlp_dim, dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return ``x`` after the attention and MLP sub-layers.

        The attention weights returned by the attention module are discarded.
        """
        # Self-attention: the same tensor serves as query, key and value.
        attended = self.attention(x, x, x)[0]
        x = self.norm1(x + self.dropout(attended))

        fed_forward = self.mlp(x)
        return self.norm2(x + self.dropout(fed_forward))


In [3]:
class VisionTransformer(nn.Module):
    """Vision Transformer classifier.

    Images are cut into non-overlapping ``patch_size`` x ``patch_size``
    patches by a strided convolution, a learnable class token is prepended,
    learnable position embeddings are added, and the sequence is passed
    through ``depth`` encoder layers. The final LayerNorm output at the
    class-token position feeds a linear classification head.

    Args:
        img_size: Side length of the (square) input images.
        patch_size: Side length of each patch; also the conv kernel and stride.
        in_channels: Number of input image channels.
        num_classes: Number of output logits.
        embed_dim: Token feature width.
        depth: Number of encoder layers.
        n_heads: Attention heads per encoder layer.
        mlp_dim: Hidden width of each encoder layer's MLP.
        dropout: Dropout probability for the embeddings and encoder layers.
    """

    def __init__(
        self,
        img_size=32,
        patch_size=4,
        in_channels=3,
        num_classes=10,
        embed_dim=256,
        depth=6,
        n_heads=8,
        mlp_dim=512,
        dropout=0.1,
    ):
        super().__init__()

        self.patch_size = patch_size
        patches_per_side = img_size // patch_size
        self.num_patches = patches_per_side ** 2

        # Kernel == stride, so each output position sees exactly one patch.
        self.patch_embed = nn.Conv2d(
            in_channels,
            embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
        )

        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        # One position per patch plus one for the class token.
        self.pos_embed = nn.Parameter(torch.zeros(1, self.num_patches + 1, embed_dim))

        self.dropout = nn.Dropout(dropout)

        encoder_layers = []
        for _ in range(depth):
            encoder_layers.append(
                TransformerEncoderLayer(embed_dim, n_heads, mlp_dim, dropout)
            )
        self.transformer_encoder = nn.ModuleList(encoder_layers)

        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, num_classes)

        self._init_weights()

    def _init_weights(self):
        """Initialise the token/position parameters, then every submodule."""
        nn.init.normal_(self.cls_token, std=0.02)
        nn.init.normal_(self.pos_embed, std=0.02)
        self.apply(self._init_linear_weights)

    def _init_linear_weights(self, m):
        """Per-module initialiser used with ``nn.Module.apply``.

        Linear layers get N(0, 0.02) weights and zero bias; LayerNorm gets
        unit weight and zero bias. Other module types are left untouched.
        """
        if isinstance(m, nn.LayerNorm):
            nn.init.zeros_(m.bias)
            nn.init.ones_(m.weight)
            return
        if not isinstance(m, nn.Linear):
            return
        nn.init.normal_(m.weight, std=0.02)
        if m.bias is not None:
            nn.init.zeros_(m.bias)

    def forward(self, x):
        """Map a batch of images to class logits of shape ``(batch, num_classes)``."""
        n_images = x.shape[0]

        # Conv output (batch, embed, H', W') -> token sequence (batch, H'*W', embed).
        tokens = self.patch_embed(x).flatten(2).transpose(1, 2)

        cls_tokens = self.cls_token.expand(n_images, -1, -1)
        tokens = torch.cat((cls_tokens, tokens), dim=1) + self.pos_embed
        tokens = self.dropout(tokens)

        for encoder_layer in self.transformer_encoder:
            tokens = encoder_layer(tokens)

        # Classify from the class token (sequence position 0) only.
        cls_features = self.norm(tokens)[:, 0]
        return self.head(cls_features)

In [4]:
def train_epoch(model, train_loader, criterion, optimizer, device):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        model: Module to train; switched to training mode.
        train_loader: Iterable of ``(images, labels)`` batches that also
            supports ``len()``.
        criterion: Loss callable taking ``(outputs, labels)``.
        optimizer: Optimizer stepped once per batch.
        device: Device that every batch is moved to.

    Returns:
        Tuple ``(train_loss, train_acc)``: the mean of the per-batch loss
        values over ``len(train_loader)`` batches, and the accuracy in
        percent over all samples seen.
    """
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    train_pbar = tqdm(train_loader, desc="Training")

    # Count batches explicitly. tqdm only refreshes its ``n`` attribute when
    # it redraws the bar, so dividing by ``train_pbar.n + 1`` under-counts the
    # batches processed and inflates the running loss shown in the bar.
    for batches_seen, (images, labels) in enumerate(train_pbar, start=1):
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

        train_pbar.set_postfix({
            'loss': running_loss / batches_seen,
            'acc': 100. * correct / total
        })

    train_loss = running_loss / len(train_loader)
    train_acc = 100. * correct / total

    return train_loss, train_acc

def evaluate(model, test_loader, criterion, device):
    """Measure loss and accuracy of ``model`` on ``test_loader`` without gradients.

    Args:
        model: Module to evaluate; switched to eval mode.
        test_loader: Iterable of ``(images, labels)`` batches that also
            supports ``len()``.
        criterion: Loss callable taking ``(outputs, labels)``.
        device: Device that every batch is moved to.

    Returns:
        Tuple ``(test_loss, test_acc)``: the mean of the per-batch loss
        values over ``len(test_loader)`` batches, and the accuracy in
        percent over all samples seen.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for batch_images, batch_labels in test_loader:
            batch_images = batch_images.to(device)
            batch_labels = batch_labels.to(device)

            logits = model(batch_images)
            loss_sum += criterion(logits, batch_labels).item()

            # Index of the largest logit per sample is the predicted class.
            predicted = logits.max(1)[1]
            n_seen += batch_labels.size(0)
            n_correct += (predicted == batch_labels).sum().item()

    test_loss = loss_sum / len(test_loader)
    test_acc = 100. * n_correct / n_seen

    return test_loss, test_acc

In [5]:
def get_data_loaders(batch_size=128, augmentation='default'):
    """Build CIFAR-10 train and test ``DataLoader`` objects.

    Every training pipeline starts with a padded random crop and a random
    horizontal flip, and includes ``ToTensor`` followed by per-channel
    normalisation. The ``augmentation`` name selects what is added:

    * ``'default'``: nothing extra.
    * ``'strong'``: random rotation and colour jitter before ``ToTensor``,
      random erasing after normalisation.
    * ``'autoaugment'``: the CIFAR-10 AutoAugment policy before ``ToTensor``.
    * ``'cutmix'``: NOTE - this builds exactly the same pipeline as
      ``'default'``; no CutMix operation is applied by this function.

    The test pipeline is ``ToTensor`` plus the same normalisation only.

    Args:
        batch_size: Batch size for both loaders.
        augmentation: One of the names listed above.

    Returns:
        Tuple ``(train_loader, test_loader)``; only the train loader shuffles.

    Raises:
        ValueError: If ``augmentation`` is not a recognised name.
    """
    channel_mean = (0.4914, 0.4822, 0.4465)
    channel_std = (0.2470, 0.2435, 0.2616)

    # Extras are split by where they sit relative to ToTensor/Normalize.
    if augmentation in ('default', 'cutmix'):
        pre_tensor_extras = []
        post_normalize_extras = []
    elif augmentation == 'strong':
        pre_tensor_extras = [
            transforms.RandomRotation(15),
            transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
        ]
        post_normalize_extras = [
            transforms.RandomErasing(p=0.5, scale=(0.02, 0.33), ratio=(0.3, 3.3)),
        ]
    elif augmentation == 'autoaugment':
        pre_tensor_extras = [
            transforms.AutoAugment(transforms.AutoAugmentPolicy.CIFAR10),
        ]
        post_normalize_extras = []
    else:
        raise ValueError(f"Unknown augmentation: {augmentation}")

    transform_train = transforms.Compose(
        [
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
        ]
        + pre_tensor_extras
        + [
            transforms.ToTensor(),
            transforms.Normalize(channel_mean, channel_std),
        ]
        + post_normalize_extras
    )

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(channel_mean, channel_std),
    ])

    train_dataset = torchvision.datasets.CIFAR10(
        root='./data', train=True, download=True, transform=transform_train)
    test_dataset = torchvision.datasets.CIFAR10(
        root='./data', train=False, download=True, transform=transform_test)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

    return train_loader, test_loader

In [6]:
def train_model(config, device, num_epochs=20):
    """Train a ``VisionTransformer`` described by ``config`` and save the outcome.

    The model is trained with AdamW and a cosine-annealed learning rate
    (stepped once per epoch, ``T_max=num_epochs``). After every epoch it is
    evaluated on the test loader; the weights from the epoch with the highest
    test accuracy are restored before saving and returning.

    Args:
        config: Dict with required keys ``patch_size``, ``embed_dim``,
            ``depth``, ``n_heads``, ``mlp_dim``, ``batch_size``, ``lr`` and
            ``weight_decay``, plus optional ``augmentation``
            (defaults to ``'default'``).
        device: Device the model and batches are placed on.
        num_epochs: Number of training epochs.

    Returns:
        Tuple ``(model, results)``. ``results`` holds ``config``, the
        per-epoch lists ``train_losses``, ``train_accs``, ``test_losses``,
        ``test_accs``, and ``best_test_acc``.

    Raises:
        KeyError: If a required key is missing from ``config``.

    Side effects:
        Writes ``results/{exp_name}_model.pth`` and
        ``results/{exp_name}_results.json`` (the JSON omits ``config``).

    NOTE(review): ``exp_name`` is built only from patch size, embed dim,
    depth, heads, MLP dim and a non-default augmentation name. Runs that
    differ only in ``lr``, ``weight_decay`` or ``batch_size`` therefore
    overwrite each other's files.
    """
    patch_size = config['patch_size']
    embed_dim = config['embed_dim']
    depth = config['depth']
    n_heads = config['n_heads']
    mlp_dim = config['mlp_dim']
    batch_size = config['batch_size']
    lr = config['lr']
    weight_decay = config['weight_decay']
    augmentation = config.get('augmentation', 'default')

    train_loader, test_loader = get_data_loaders(batch_size, augmentation)

    model = VisionTransformer(
        img_size=32,
        patch_size=patch_size,
        in_channels=3,
        num_classes=10,
        embed_dim=embed_dim,
        depth=depth,
        n_heads=n_heads,
        mlp_dim=mlp_dim,
        dropout=0.1
    ).to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)

    train_losses, train_accs = [], []
    test_losses, test_accs = [], []
    best_acc = 0.0
    best_model_state = None

    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch+1}/{num_epochs}")

        train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
        test_loss, test_acc = evaluate(model, test_loader, criterion, device)

        scheduler.step()

        train_losses.append(train_loss)
        train_accs.append(train_acc)
        test_losses.append(test_loss)
        test_accs.append(test_acc)

        print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%')
        print(f'Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.2f}%')

        if test_acc > best_acc:
            best_acc = test_acc
            # Deep copy: state_dict() returns references to the live tensors,
            # which later optimizer steps would otherwise keep modifying.
            best_model_state = copy.deepcopy(model.state_dict())

    # Roll back to the best-scoring epoch (None only if accuracy never rose above 0).
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    results = {
        'config': config,
        'train_losses': train_losses,
        'train_accs': train_accs,
        'test_losses': test_losses,
        'test_accs': test_accs,
        'best_test_acc': best_acc
    }

    exp_name = f"vit_ps{patch_size}_e{embed_dim}_d{depth}_h{n_heads}_mlp{mlp_dim}"
    if augmentation != 'default':
        exp_name += f"_aug{augmentation}"

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs('results', exist_ok=True)

    torch.save(model.state_dict(), f'results/{exp_name}_model.pth')
    with open(f'results/{exp_name}_results.json', 'w') as f:
        # Lists are coerced to plain floats so they are JSON-serialisable.
        json.dump({k: v if not isinstance(v, list) else [float(i) for i in v]
                  for k, v in results.items() if k != 'config'}, f)

    return model, results


In [7]:
def plot_comparison_results(all_results, comparison_name, filename):
    """Plot loss and accuracy curves for several runs on a 2x2 grid.

    Args:
        all_results: Mapping from a run key to a results dict containing the
            per-epoch lists ``train_losses``, ``test_losses``, ``train_accs``
            and ``test_accs``.
        comparison_name: Name of the varied quantity; used in legend labels
            as ``"<comparison_name> = <key>"``.
        filename: Base name of the image saved as ``results/<filename>.png``.
    """
    # (metric key, y-axis label, panel title), in subplot order.
    panels = (
        ('train_losses', 'Training Loss', 'Training Loss vs. Epoch'),
        ('test_losses', 'Testing Loss', 'Testing Loss vs. Epoch'),
        ('train_accs', 'Training Accuracy (%)', 'Training Accuracy vs. Epoch'),
        ('test_accs', 'Testing Accuracy (%)', 'Testing Accuracy vs. Epoch'),
    )

    plt.figure(figsize=(15, 10))
    run_keys = list(all_results.keys())

    for position, (metric, y_label, panel_title) in enumerate(panels, start=1):
        plt.subplot(2, 2, position)
        for run_key in run_keys:
            plt.plot(all_results[run_key][metric], label=f'{comparison_name} = {run_key}')
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.legend()
        plt.title(panel_title)

    plt.tight_layout()
    plt.savefig(f'results/{filename}.png')
    plt.close()

def plot_augmentation_results(all_results, comparison_name, filename):
    """Plot test-accuracy curves and a best-accuracy bar chart per augmentation.

    Args:
        all_results: Mapping from augmentation name to a results dict with
            ``test_accs`` (per-epoch list) and ``best_test_acc``.
        comparison_name: Accepted for a signature parallel to
            ``plot_comparison_results``; not used in the figure.
        filename: Base name of the image saved as ``results/<filename>.png``.
    """
    aug_names = list(all_results.keys())

    plt.figure(figsize=(15, 5))

    # Left panel: test accuracy over epochs, one line per augmentation.
    plt.subplot(1, 2, 1)
    for aug_name in aug_names:
        plt.plot(all_results[aug_name]['test_accs'], label=f'{aug_name}')
    plt.xlabel('Epoch')
    plt.ylabel('Testing Accuracy (%)')
    plt.legend()
    plt.title('Testing Accuracy vs. Epoch')

    # Right panel: best test accuracy reached by each augmentation.
    plt.subplot(1, 2, 2)
    best_accs = []
    for aug_name in aug_names:
        best_accs.append(all_results[aug_name]['best_test_acc'])
    plt.bar(aug_names, best_accs)
    plt.xlabel('Augmentation Type')
    plt.ylabel('Best Test Accuracy (%)')
    plt.title('Best Test Accuracy by Augmentation')

    plt.tight_layout()
    plt.savefig(f'results/{filename}.png')
    plt.close()


In [8]:
def run_patch_size_experiment(device, num_epochs=15):
    """Train one model per patch size (2, 4, 8) and compare them.

    Model width grows with patch size: each patch size is paired with its own
    ``embed_dim`` / ``mlp_dim``. Depth (6), heads (8), batch size, learning
    rate, weight decay and augmentation are shared by all runs.

    Args:
        device: Device passed through to ``train_model``.
        num_epochs: Epochs per run.

    Returns:
        Dict mapping each patch size to the results dict from ``train_model``.

    Side effects:
        Saves a comparison figure as ``vit_patch_size_comparison`` and prints
        the best test accuracy of every run.
    """
    # patch size -> (embed_dim, mlp_dim)
    widths_by_patch = {
        2: (192, 384),
        4: (256, 512),
        8: (384, 768),
    }

    all_results = {}

    for patch_size, (embed_dim, mlp_dim) in widths_by_patch.items():
        print(f"\n=== Running experiment with patch_size={patch_size} ===")

        config = {
            'batch_size': 128,
            'lr': 1e-3,
            'weight_decay': 0.05,
            'augmentation': 'default',
            'patch_size': patch_size,
            'embed_dim': embed_dim,
            'depth': 6,
            'n_heads': 8,
            'mlp_dim': mlp_dim,
        }

        run_results = train_model(config, device, num_epochs)[1]
        all_results[patch_size] = run_results

    plot_comparison_results(all_results, "Patch Size", "vit_patch_size_comparison")

    print("\nFinal Best Test Accuracies:")
    for patch_size in widths_by_patch:
        print(f"Patch Size = {patch_size}: {all_results[patch_size]['best_test_acc']:.2f}%")

    return all_results

def run_hyperparameter_exploration(device, num_epochs=50):
    """Train a fixed set of five configurations and pick the best one.

    Each configuration is the base setup with a few keys overridden. All
    configurations are validated before any training starts, so an invalid
    one cannot waste the time spent training the ones before it.

    Args:
        device: Device passed through to ``train_model``.
        num_epochs: Epochs per configuration.

    Returns:
        Tuple ``(best_config, all_results)``: the configuration whose run
        reached the highest ``best_test_acc`` (the first such run on ties),
        and the list of results dicts in configuration order.

    Raises:
        ValueError: If a configuration's ``embed_dim`` is not divisible by
            its ``n_heads``.
    """
    base_config = {
        'patch_size': 4,
        'embed_dim': 256,
        'depth': 6,
        'n_heads': 8,
        'mlp_dim': 512,
        'batch_size': 128,
        'lr': 1e-3,
        'weight_decay': 0.05,
        'augmentation': 'default',
    }

    configs = [
        # Base configuration
        dict(base_config),
        # Variation 1: Deeper model
        {**base_config, 'depth': 8},
        # Variation 2: Wider model (384 = 12 heads x 32)
        {**base_config, 'embed_dim': 384, 'n_heads': 12, 'mlp_dim': 768},
        # Variation 3: More attention heads at a smaller width (288 = 12 heads x 24)
        {**base_config, 'embed_dim': 288, 'n_heads': 12, 'mlp_dim': 576},
        # Variation 4: Different learning rate
        {**base_config, 'lr': 5e-4},
    ]

    # Validate everything up front with a real exception: ``assert`` is
    # stripped under ``python -O`` and would let a bad config reach the model.
    for number, config in enumerate(configs, start=1):
        if config['embed_dim'] % config['n_heads'] != 0:
            raise ValueError(
                f"Configuration {number}: embed_dim ({config['embed_dim']}) "
                f"must be divisible by n_heads ({config['n_heads']})"
            )

    all_results = []

    for i, config in enumerate(configs):
        print(f"\n=== Running hyperparameter configuration {i+1}/{len(configs)} ===")

        _, results = train_model(config, device, num_epochs)
        all_results.append(results)

    best_idx = max(range(len(all_results)), key=lambda i: all_results[i]['best_test_acc'])
    best_config = configs[best_idx]

    return best_config, all_results

In [9]:
def run_augmentation_experiment(device, best_config, num_epochs=20):
    """Retrain ``best_config`` once per augmentation setting and compare.

    Args:
        device: Device passed through to ``train_model``.
        best_config: Configuration dict to reuse; only its ``augmentation``
            entry is replaced for each run (the input dict is not modified).
        num_epochs: Epochs per run.

    Returns:
        Dict mapping each augmentation name to the results dict from
        ``train_model``.

    Side effects:
        Saves a figure as ``vit_augmentation_comparison`` and prints the best
        test accuracy of every run plus the overall winner.
    """
    augmentations = ['default', 'strong', 'autoaugment', 'cutmix']
    all_results = {}

    for aug in augmentations:
        print(f"\n=== Running experiment with augmentation={aug} ===")

        # Work on a copy so the caller's config keeps its own augmentation.
        config = dict(best_config)
        config['augmentation'] = aug

        all_results[aug] = train_model(config, device, num_epochs)[1]

    plot_augmentation_results(all_results, "Data Augmentation", "vit_augmentation_comparison")

    print("\nFinal Best Test Accuracies with Different Augmentations:")
    for aug in augmentations:
        print(f"{aug}: {all_results[aug]['best_test_acc']:.2f}%")

    def best_acc_of(aug_name):
        return all_results[aug_name]['best_test_acc']

    best_aug = max(augmentations, key=best_acc_of)
    best_aug_acc = best_acc_of(best_aug)

    print(f"\nBest Augmentation: {best_aug} (Test Acc: {best_aug_acc:.2f}%)")

    return all_results


In [23]:
# Driver cell: seed the RNGs, choose the device, and run the patch-size experiment.

# Seed the torch and NumPy random number generators once, before any experiment.
torch.manual_seed(42)
np.random.seed(42)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# The plotting helpers and train_model write into ./results.
if not os.path.exists('results'):
    os.makedirs('results')

# Names of the experiments enabled for this session; each is gated by a
# membership check on this list.
experiments_to_run = ['patch_size', 'hyperparameters', 'augmentation']

# Collected outputs, keyed by experiment name.
results = {}

if 'patch_size' in experiments_to_run:
    print("\n=== Running Patch Size Variation Experiment ===")
    results['patch_size'] = run_patch_size_experiment(device)

Using device: cuda

=== Running Patch Size Variation Experiment ===

=== Running experiment with patch_size=2 ===

Epoch 1/15


Training: 100%|██████████| 391/391 [00:54<00:00,  7.16it/s, loss=1.73, acc=35.3]


Train Loss: 1.7289, Train Acc: 35.34%
Test Loss: 1.5368, Test Acc: 46.10%

Epoch 2/15


Training: 100%|██████████| 391/391 [00:55<00:00,  7.10it/s, loss=1.32, acc=52.1]


Train Loss: 1.3186, Train Acc: 52.07%
Test Loss: 1.2039, Test Acc: 56.28%

Epoch 3/15


Training: 100%|██████████| 391/391 [00:55<00:00,  7.10it/s, loss=1.17, acc=57.7]


Train Loss: 1.1710, Train Acc: 57.65%
Test Loss: 1.0859, Test Acc: 61.04%

Epoch 4/15


Training: 100%|██████████| 391/391 [00:55<00:00,  7.09it/s, loss=1.06, acc=61.7]


Train Loss: 1.0636, Train Acc: 61.73%
Test Loss: 0.9910, Test Acc: 64.75%

Epoch 5/15


Training: 100%|██████████| 391/391 [00:55<00:00,  7.09it/s, loss=0.998, acc=64.2]


Train Loss: 0.9983, Train Acc: 64.23%
Test Loss: 0.9401, Test Acc: 66.45%

Epoch 6/15


Training: 100%|██████████| 391/391 [00:55<00:00,  7.09it/s, loss=0.923, acc=67.1]


Train Loss: 0.9232, Train Acc: 67.14%
Test Loss: 0.9188, Test Acc: 67.71%

Epoch 7/15


Training: 100%|██████████| 391/391 [00:55<00:00,  7.09it/s, loss=0.859, acc=69.3]


Train Loss: 0.8590, Train Acc: 69.32%
Test Loss: 0.8166, Test Acc: 71.70%

Epoch 8/15


Training: 100%|██████████| 391/391 [00:55<00:00,  7.10it/s, loss=0.801, acc=71.7]


Train Loss: 0.8012, Train Acc: 71.69%
Test Loss: 0.8124, Test Acc: 71.74%

Epoch 9/15


Training: 100%|██████████| 391/391 [00:55<00:00,  7.08it/s, loss=0.74, acc=73.7] 


Train Loss: 0.7398, Train Acc: 73.71%
Test Loss: 0.7692, Test Acc: 73.42%

Epoch 10/15


Training: 100%|██████████| 391/391 [00:55<00:00,  7.09it/s, loss=0.69, acc=75.5] 


Train Loss: 0.6897, Train Acc: 75.55%
Test Loss: 0.7087, Test Acc: 75.64%

Epoch 11/15


Training: 100%|██████████| 391/391 [00:55<00:00,  7.07it/s, loss=0.638, acc=77.6]


Train Loss: 0.6382, Train Acc: 77.61%
Test Loss: 0.7202, Test Acc: 75.39%

Epoch 12/15


Training: 100%|██████████| 391/391 [00:55<00:00,  7.07it/s, loss=0.59, acc=79]   


Train Loss: 0.5902, Train Acc: 79.03%
Test Loss: 0.6674, Test Acc: 77.25%

Epoch 13/15


Training: 100%|██████████| 391/391 [00:55<00:00,  7.08it/s, loss=0.548, acc=80.5]


Train Loss: 0.5483, Train Acc: 80.54%
Test Loss: 0.6508, Test Acc: 78.12%

Epoch 14/15


Training: 100%|██████████| 391/391 [00:55<00:00,  7.08it/s, loss=0.522, acc=81.4]


Train Loss: 0.5224, Train Acc: 81.45%
Test Loss: 0.6345, Test Acc: 78.40%

Epoch 15/15


Training: 100%|██████████| 391/391 [00:55<00:00,  7.08it/s, loss=0.507, acc=82.1]


Train Loss: 0.5070, Train Acc: 82.10%
Test Loss: 0.6321, Test Acc: 78.89%

=== Running experiment with patch_size=4 ===

Epoch 1/15


Training: 100%|██████████| 391/391 [00:14<00:00, 26.68it/s, loss=1.76, acc=34.4]


Train Loss: 1.7471, Train Acc: 34.42%
Test Loss: 1.5229, Test Acc: 45.32%

Epoch 2/15


Training: 100%|██████████| 391/391 [00:14<00:00, 26.49it/s, loss=1.42, acc=48.3]


Train Loss: 1.4127, Train Acc: 48.34%
Test Loss: 1.3161, Test Acc: 52.86%

Epoch 3/15


Training: 100%|██████████| 391/391 [00:15<00:00, 26.04it/s, loss=1.27, acc=54]  


Train Loss: 1.2696, Train Acc: 54.02%
Test Loss: 1.1720, Test Acc: 57.21%

Epoch 4/15


Training: 100%|██████████| 391/391 [00:14<00:00, 26.60it/s, loss=1.18, acc=57.7]


Train Loss: 1.1728, Train Acc: 57.67%
Test Loss: 1.1756, Test Acc: 59.42%

Epoch 5/15


Training: 100%|██████████| 391/391 [00:14<00:00, 26.86it/s, loss=1.1, acc=61]   


Train Loss: 1.0909, Train Acc: 60.95%
Test Loss: 1.0665, Test Acc: 61.97%

Epoch 6/15


Training: 100%|██████████| 391/391 [00:14<00:00, 26.70it/s, loss=1.02, acc=63.7]


Train Loss: 1.0163, Train Acc: 63.73%
Test Loss: 1.0324, Test Acc: 63.55%

Epoch 7/15


Training: 100%|██████████| 391/391 [00:14<00:00, 26.83it/s, loss=0.953, acc=66.2]


Train Loss: 0.9485, Train Acc: 66.20%
Test Loss: 0.8761, Test Acc: 69.01%

Epoch 8/15


Training: 100%|██████████| 391/391 [00:14<00:00, 26.59it/s, loss=0.889, acc=68.9]


Train Loss: 0.8847, Train Acc: 68.85%
Test Loss: 0.8389, Test Acc: 69.77%

Epoch 9/15


Training: 100%|██████████| 391/391 [00:14<00:00, 26.60it/s, loss=0.816, acc=71.3]


Train Loss: 0.8115, Train Acc: 71.34%
Test Loss: 0.7872, Test Acc: 72.55%

Epoch 10/15


Training: 100%|██████████| 391/391 [00:14<00:00, 26.29it/s, loss=0.757, acc=73.3]


Train Loss: 0.7533, Train Acc: 73.33%
Test Loss: 0.7371, Test Acc: 74.42%

Epoch 11/15


Training: 100%|██████████| 391/391 [00:14<00:00, 26.53it/s, loss=0.702, acc=75.1]


Train Loss: 0.6988, Train Acc: 75.13%
Test Loss: 0.6860, Test Acc: 76.03%

Epoch 12/15


Training: 100%|██████████| 391/391 [00:14<00:00, 26.34it/s, loss=0.65, acc=77.2] 


Train Loss: 0.6467, Train Acc: 77.17%
Test Loss: 0.6714, Test Acc: 76.90%

Epoch 13/15


Training: 100%|██████████| 391/391 [00:14<00:00, 26.72it/s, loss=0.607, acc=78.5]


Train Loss: 0.6040, Train Acc: 78.55%
Test Loss: 0.6574, Test Acc: 77.40%

Epoch 14/15


Training: 100%|██████████| 391/391 [00:14<00:00, 26.21it/s, loss=0.579, acc=79.7]


Train Loss: 0.5761, Train Acc: 79.74%
Test Loss: 0.6404, Test Acc: 78.02%

Epoch 15/15


Training: 100%|██████████| 391/391 [00:14<00:00, 26.35it/s, loss=0.554, acc=80.4]


Train Loss: 0.5530, Train Acc: 80.36%
Test Loss: 0.6273, Test Acc: 78.71%

=== Running experiment with patch_size=8 ===

Epoch 1/15


Training: 100%|██████████| 391/391 [00:13<00:00, 29.81it/s, loss=2.18, acc=17.6]


Train Loss: 2.1759, Train Acc: 17.65%
Test Loss: 2.2885, Test Acc: 12.76%

Epoch 2/15


Training: 100%|██████████| 391/391 [00:13<00:00, 28.11it/s, loss=2.3, acc=10.3] 


Train Loss: 2.3035, Train Acc: 10.34%
Test Loss: 2.3054, Test Acc: 10.00%

Epoch 3/15


Training: 100%|██████████| 391/391 [00:12<00:00, 30.74it/s, loss=2.32, acc=10.1]


Train Loss: 2.3060, Train Acc: 10.12%
Test Loss: 2.3036, Test Acc: 10.00%

Epoch 4/15


Training: 100%|██████████| 391/391 [00:12<00:00, 30.75it/s, loss=2.32, acc=9.97]


Train Loss: 2.3043, Train Acc: 9.97%
Test Loss: 2.3046, Test Acc: 10.00%

Epoch 5/15


Training: 100%|██████████| 391/391 [00:12<00:00, 30.40it/s, loss=2.3, acc=10.2] 


Train Loss: 2.3041, Train Acc: 10.17%
Test Loss: 2.3031, Test Acc: 10.00%

Epoch 6/15


Training: 100%|██████████| 391/391 [00:12<00:00, 31.41it/s, loss=2.31, acc=10.1]


Train Loss: 2.3038, Train Acc: 10.05%
Test Loss: 2.3033, Test Acc: 10.00%

Epoch 7/15


Training: 100%|██████████| 391/391 [00:12<00:00, 30.96it/s, loss=2.32, acc=9.88]


Train Loss: 2.3035, Train Acc: 9.88%
Test Loss: 2.3036, Test Acc: 10.00%

Epoch 8/15


Training: 100%|██████████| 391/391 [00:13<00:00, 29.62it/s, loss=2.32, acc=10.3]


Train Loss: 2.3030, Train Acc: 10.28%
Test Loss: 2.3027, Test Acc: 10.00%

Epoch 9/15


Training: 100%|██████████| 391/391 [00:13<00:00, 29.66it/s, loss=2.32, acc=9.48]


Train Loss: 2.3033, Train Acc: 9.48%
Test Loss: 2.3027, Test Acc: 10.00%

Epoch 10/15


Training: 100%|██████████| 391/391 [00:13<00:00, 29.56it/s, loss=2.31, acc=10]  


Train Loss: 2.3030, Train Acc: 10.01%
Test Loss: 2.3027, Test Acc: 10.00%

Epoch 11/15


Training: 100%|██████████| 391/391 [00:12<00:00, 31.79it/s, loss=2.31, acc=9.94]


Train Loss: 2.3029, Train Acc: 9.94%
Test Loss: 2.3028, Test Acc: 10.00%

Epoch 12/15


Training: 100%|██████████| 391/391 [00:12<00:00, 30.64it/s, loss=2.31, acc=9.7] 


Train Loss: 2.3030, Train Acc: 9.70%
Test Loss: 2.3026, Test Acc: 10.00%

Epoch 13/15


Training: 100%|██████████| 391/391 [00:11<00:00, 34.07it/s, loss=2.31, acc=9.84]


Train Loss: 2.3027, Train Acc: 9.84%
Test Loss: 2.3026, Test Acc: 10.00%

Epoch 14/15


Training: 100%|██████████| 391/391 [00:13<00:00, 29.24it/s, loss=2.31, acc=9.79]


Train Loss: 2.3027, Train Acc: 9.79%
Test Loss: 2.3026, Test Acc: 10.00%

Epoch 15/15


Training: 100%|██████████| 391/391 [00:12<00:00, 30.38it/s, loss=2.32, acc=10]  


Train Loss: 2.3027, Train Acc: 10.03%
Test Loss: 2.3026, Test Acc: 10.00%

Final Best Test Accuracies:
Patch Size = 2: 78.89%
Patch Size = 4: 78.71%
Patch Size = 8: 12.76%


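Patch size 8 performs terribly: its best test accuracy is 12.76%, and from epoch 2 onward it sits at exactly 10.00% with a loss of about 2.303 (= ln 10), which is chance level for 10 classes. With patch size 8, each 32×32 CIFAR-10 image is divided into only 4×4 = 16 patches, so every token summarises a coarse 8×8 region and fine details needed for classification are lost before attention is applied. Note, however, that the training accuracy is also at chance level, which indicates that optimisation collapsed rather than that the model merely underfit; this run also used the widest model (embed_dim = 384) at the same learning rate of 1e-3, so training instability may be contributing in addition to the coarse patches. In contrast, patch sizes 2 and 4 work well (78.89% and 78.71% best test accuracy) because they preserve sufficient spatial resolution while providing enough tokens for the attention mechanism to learn meaningful relationships. The results suggest that patch size must be chosen appropriately relative to the input image dimensions.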

In [35]:
# Driver cell: run the hyperparameter sweep when it is enabled.
# best_config is only assigned inside this branch.
if 'hyperparameters' in experiments_to_run:
    print("\n=== Running Hyperparameter Exploration ===")
    best_config, hyperparameter_results = run_hyperparameter_exploration(device)
    results['hyperparameters'] = hyperparameter_results


=== Running Hyperparameter Exploration ===

=== Running hyperparameter configuration 1/5 ===

Epoch 1/50


Training: 100%|██████████| 391/391 [00:14<00:00, 27.03it/s, loss=1.77, acc=34.2]


Train Loss: 1.7611, Train Acc: 34.20%
Test Loss: 1.4996, Test Acc: 44.66%

Epoch 2/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.83it/s, loss=1.43, acc=47.9]


Train Loss: 1.4223, Train Acc: 47.87%
Test Loss: 1.2701, Test Acc: 53.64%

Epoch 3/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.73it/s, loss=1.31, acc=52.8]


Train Loss: 1.3048, Train Acc: 52.81%
Test Loss: 1.2282, Test Acc: 54.70%

Epoch 4/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.64it/s, loss=1.22, acc=56.2]


Train Loss: 1.2170, Train Acc: 56.21%
Test Loss: 1.1393, Test Acc: 58.79%

Epoch 5/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.61it/s, loss=1.15, acc=58.9]


Train Loss: 1.1441, Train Acc: 58.93%
Test Loss: 1.1098, Test Acc: 59.61%

Epoch 6/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.38it/s, loss=1.09, acc=61.2]


Train Loss: 1.0839, Train Acc: 61.19%
Test Loss: 1.0645, Test Acc: 61.99%

Epoch 7/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.82it/s, loss=1.04, acc=63]  


Train Loss: 1.0352, Train Acc: 62.99%
Test Loss: 0.9746, Test Acc: 64.27%

Epoch 8/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.59it/s, loss=0.992, acc=64.7]


Train Loss: 0.9874, Train Acc: 64.69%
Test Loss: 0.9497, Test Acc: 66.23%

Epoch 9/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.61it/s, loss=0.953, acc=66.2]


Train Loss: 0.9481, Train Acc: 66.20%
Test Loss: 0.9447, Test Acc: 67.31%

Epoch 10/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.24it/s, loss=0.922, acc=67.6]


Train Loss: 0.9170, Train Acc: 67.55%
Test Loss: 0.8267, Test Acc: 70.85%

Epoch 11/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.43it/s, loss=0.888, acc=68.6]


Train Loss: 0.8837, Train Acc: 68.63%
Test Loss: 0.8922, Test Acc: 67.76%

Epoch 12/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.21it/s, loss=0.855, acc=69.8]


Train Loss: 0.8505, Train Acc: 69.81%
Test Loss: 0.8066, Test Acc: 71.58%

Epoch 13/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.50it/s, loss=0.834, acc=70.6]


Train Loss: 0.8297, Train Acc: 70.64%
Test Loss: 0.8006, Test Acc: 71.53%

Epoch 14/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.13it/s, loss=0.795, acc=72]  


Train Loss: 0.7927, Train Acc: 71.95%
Test Loss: 0.7604, Test Acc: 73.03%

Epoch 15/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.51it/s, loss=0.773, acc=72.8]


Train Loss: 0.7691, Train Acc: 72.79%
Test Loss: 0.7092, Test Acc: 75.00%

Epoch 16/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.22it/s, loss=0.752, acc=73.8]


Train Loss: 0.7486, Train Acc: 73.76%
Test Loss: 0.7197, Test Acc: 75.00%

Epoch 17/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.09it/s, loss=0.724, acc=74.5]


Train Loss: 0.7203, Train Acc: 74.49%
Test Loss: 0.6904, Test Acc: 75.72%

Epoch 18/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.51it/s, loss=0.706, acc=75.3]


Train Loss: 0.7025, Train Acc: 75.31%
Test Loss: 0.6731, Test Acc: 76.40%

Epoch 19/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.59it/s, loss=0.687, acc=75.9]


Train Loss: 0.6830, Train Acc: 75.92%
Test Loss: 0.7143, Test Acc: 75.31%

Epoch 20/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.21it/s, loss=0.665, acc=76.6]


Train Loss: 0.6615, Train Acc: 76.62%
Test Loss: 0.6803, Test Acc: 76.55%

Epoch 21/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.45it/s, loss=0.642, acc=77.4]


Train Loss: 0.6387, Train Acc: 77.38%
Test Loss: 0.6733, Test Acc: 77.01%

Epoch 22/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.18it/s, loss=0.618, acc=78.1]


Train Loss: 0.6166, Train Acc: 78.11%
Test Loss: 0.6653, Test Acc: 77.37%

Epoch 23/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.29it/s, loss=0.598, acc=78.9]


Train Loss: 0.5947, Train Acc: 78.94%
Test Loss: 0.5997, Test Acc: 79.30%

Epoch 24/50


Training: 100%|██████████| 391/391 [00:15<00:00, 26.05it/s, loss=0.576, acc=79.7]


Train Loss: 0.5727, Train Acc: 79.70%
Test Loss: 0.6115, Test Acc: 79.39%

Epoch 25/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.46it/s, loss=0.558, acc=80.3]


Train Loss: 0.5550, Train Acc: 80.28%
Test Loss: 0.6264, Test Acc: 78.66%

Epoch 26/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.45it/s, loss=0.533, acc=81.1]


Train Loss: 0.5334, Train Acc: 81.15%
Test Loss: 0.5796, Test Acc: 80.02%

Epoch 27/50


Training: 100%|██████████| 391/391 [00:31<00:00, 12.52it/s, loss=0.52, acc=81.7] 


Train Loss: 0.5189, Train Acc: 81.68%
Test Loss: 0.5919, Test Acc: 80.02%

Epoch 28/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.55it/s, loss=0.494, acc=82.7]


Train Loss: 0.4917, Train Acc: 82.72%
Test Loss: 0.5620, Test Acc: 81.47%

Epoch 29/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.49it/s, loss=0.474, acc=83.3]


Train Loss: 0.4717, Train Acc: 83.27%
Test Loss: 0.5731, Test Acc: 81.18%

Epoch 30/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.19it/s, loss=0.456, acc=83.8]


Train Loss: 0.4540, Train Acc: 83.76%
Test Loss: 0.5660, Test Acc: 81.27%

Epoch 31/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.57it/s, loss=0.431, acc=84.8]


Train Loss: 0.4284, Train Acc: 84.81%
Test Loss: 0.5415, Test Acc: 82.17%

Epoch 32/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.45it/s, loss=0.413, acc=85.6]


Train Loss: 0.4112, Train Acc: 85.56%
Test Loss: 0.5531, Test Acc: 81.53%

Epoch 33/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.46it/s, loss=0.39, acc=85.9] 


Train Loss: 0.3884, Train Acc: 85.91%
Test Loss: 0.5651, Test Acc: 81.97%

Epoch 34/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.50it/s, loss=0.374, acc=86.6]


Train Loss: 0.3722, Train Acc: 86.57%
Test Loss: 0.5479, Test Acc: 82.16%

Epoch 35/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.78it/s, loss=0.35, acc=87.5] 


Train Loss: 0.3482, Train Acc: 87.53%
Test Loss: 0.5254, Test Acc: 82.72%

Epoch 36/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.68it/s, loss=0.336, acc=88]  


Train Loss: 0.3347, Train Acc: 87.95%
Test Loss: 0.5322, Test Acc: 83.11%

Epoch 37/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.63it/s, loss=0.319, acc=88.7]


Train Loss: 0.3171, Train Acc: 88.68%
Test Loss: 0.5585, Test Acc: 82.43%

Epoch 38/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.47it/s, loss=0.296, acc=89.5]


Train Loss: 0.2948, Train Acc: 89.46%
Test Loss: 0.5360, Test Acc: 83.35%

Epoch 39/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.53it/s, loss=0.279, acc=90.2]


Train Loss: 0.2780, Train Acc: 90.20%
Test Loss: 0.5544, Test Acc: 83.27%

Epoch 40/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.47it/s, loss=0.267, acc=90.4]


Train Loss: 0.2659, Train Acc: 90.40%
Test Loss: 0.5503, Test Acc: 83.15%

Epoch 41/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.60it/s, loss=0.251, acc=91]  


Train Loss: 0.2502, Train Acc: 91.00%
Test Loss: 0.5601, Test Acc: 83.19%

Epoch 42/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.44it/s, loss=0.246, acc=91.2]


Train Loss: 0.2443, Train Acc: 91.20%
Test Loss: 0.5491, Test Acc: 83.61%

Epoch 43/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.44it/s, loss=0.231, acc=91.7]


Train Loss: 0.2297, Train Acc: 91.71%
Test Loss: 0.5681, Test Acc: 83.50%

Epoch 44/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.48it/s, loss=0.222, acc=92.1]


Train Loss: 0.2212, Train Acc: 92.06%
Test Loss: 0.5682, Test Acc: 83.81%

Epoch 45/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.51it/s, loss=0.215, acc=92.4]


Train Loss: 0.2143, Train Acc: 92.40%
Test Loss: 0.5721, Test Acc: 83.44%

Epoch 46/50


Training: 100%|██████████| 391/391 [00:15<00:00, 26.01it/s, loss=0.208, acc=92.5]


Train Loss: 0.2072, Train Acc: 92.55%
Test Loss: 0.5689, Test Acc: 83.52%

Epoch 47/50


Training: 100%|██████████| 391/391 [00:15<00:00, 26.05it/s, loss=0.201, acc=92.8]


Train Loss: 0.2002, Train Acc: 92.82%
Test Loss: 0.5723, Test Acc: 83.35%

Epoch 48/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.22it/s, loss=0.199, acc=93]  


Train Loss: 0.1976, Train Acc: 92.96%
Test Loss: 0.5690, Test Acc: 83.63%

Epoch 49/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.43it/s, loss=0.196, acc=93]  


Train Loss: 0.1945, Train Acc: 93.01%
Test Loss: 0.5708, Test Acc: 83.59%

Epoch 50/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.47it/s, loss=0.195, acc=93.1]


Train Loss: 0.1943, Train Acc: 93.07%
Test Loss: 0.5706, Test Acc: 83.62%

=== Running hyperparameter configuration 2/5 ===

Epoch 1/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.61it/s, loss=1.81, acc=31.5]


Train Loss: 1.8141, Train Acc: 31.48%
Test Loss: 1.5792, Test Acc: 41.59%

Epoch 2/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.19it/s, loss=1.5, acc=45.3] 


Train Loss: 1.4947, Train Acc: 45.28%
Test Loss: 1.3666, Test Acc: 50.03%

Epoch 3/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.52it/s, loss=1.37, acc=50.8]


Train Loss: 1.3585, Train Acc: 50.76%
Test Loss: 1.2304, Test Acc: 54.98%

Epoch 4/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.51it/s, loss=1.26, acc=54.7]


Train Loss: 1.2532, Train Acc: 54.74%
Test Loss: 1.2281, Test Acc: 56.43%

Epoch 5/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.52it/s, loss=1.19, acc=57.3]


Train Loss: 1.1858, Train Acc: 57.29%
Test Loss: 1.1050, Test Acc: 60.00%

Epoch 6/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.59it/s, loss=1.12, acc=59.5]


Train Loss: 1.1188, Train Acc: 59.54%
Test Loss: 1.0541, Test Acc: 62.40%

Epoch 7/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.60it/s, loss=1.07, acc=61.7]


Train Loss: 1.0663, Train Acc: 61.69%
Test Loss: 1.0270, Test Acc: 63.50%

Epoch 8/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.59it/s, loss=1.02, acc=63.8]


Train Loss: 1.0101, Train Acc: 63.83%
Test Loss: 0.9426, Test Acc: 66.46%

Epoch 9/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.54it/s, loss=0.97, acc=65.7] 


Train Loss: 0.9649, Train Acc: 65.67%
Test Loss: 0.8755, Test Acc: 69.31%

Epoch 10/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.57it/s, loss=0.932, acc=67]  


Train Loss: 0.9275, Train Acc: 67.01%
Test Loss: 0.8770, Test Acc: 69.05%

Epoch 11/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.60it/s, loss=0.898, acc=68.2]


Train Loss: 0.8936, Train Acc: 68.17%
Test Loss: 0.8362, Test Acc: 70.33%

Epoch 12/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.65it/s, loss=0.859, acc=69.7]


Train Loss: 0.8546, Train Acc: 69.65%
Test Loss: 0.8131, Test Acc: 71.09%

Epoch 13/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.21it/s, loss=0.829, acc=70.8]


Train Loss: 0.8265, Train Acc: 70.77%
Test Loss: 0.7934, Test Acc: 72.31%

Epoch 14/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.64it/s, loss=0.803, acc=71.7]


Train Loss: 0.7993, Train Acc: 71.68%
Test Loss: 0.7653, Test Acc: 73.17%

Epoch 15/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.38it/s, loss=0.775, acc=72.6]


Train Loss: 0.7755, Train Acc: 72.61%
Test Loss: 0.7250, Test Acc: 74.39%

Epoch 16/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.56it/s, loss=0.748, acc=73.6]


Train Loss: 0.7441, Train Acc: 73.61%
Test Loss: 0.7136, Test Acc: 75.02%

Epoch 17/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.53it/s, loss=0.726, acc=74.6]


Train Loss: 0.7219, Train Acc: 74.61%
Test Loss: 0.7138, Test Acc: 74.90%

Epoch 18/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.15it/s, loss=0.705, acc=75.2]


Train Loss: 0.7029, Train Acc: 75.15%
Test Loss: 0.7035, Test Acc: 75.27%

Epoch 19/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.65it/s, loss=0.673, acc=76.3]


Train Loss: 0.6734, Train Acc: 76.27%
Test Loss: 0.6615, Test Acc: 76.95%

Epoch 20/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.14it/s, loss=0.659, acc=76.7]


Train Loss: 0.6568, Train Acc: 76.66%
Test Loss: 0.6647, Test Acc: 76.83%

Epoch 21/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.54it/s, loss=0.635, acc=77.7]


Train Loss: 0.6313, Train Acc: 77.65%
Test Loss: 0.6329, Test Acc: 78.36%

Epoch 22/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.46it/s, loss=0.611, acc=78.6]


Train Loss: 0.6076, Train Acc: 78.62%
Test Loss: 0.6200, Test Acc: 78.68%

Epoch 23/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.53it/s, loss=0.586, acc=79.4]


Train Loss: 0.5833, Train Acc: 79.40%
Test Loss: 0.6008, Test Acc: 79.22%

Epoch 24/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.53it/s, loss=0.57, acc=79.9] 


Train Loss: 0.5666, Train Acc: 79.94%
Test Loss: 0.5869, Test Acc: 79.88%

Epoch 25/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.52it/s, loss=0.539, acc=80.7]


Train Loss: 0.5394, Train Acc: 80.74%
Test Loss: 0.5864, Test Acc: 80.18%

Epoch 26/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.53it/s, loss=0.525, acc=81.4]


Train Loss: 0.5222, Train Acc: 81.42%
Test Loss: 0.5853, Test Acc: 80.16%

Epoch 27/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.43it/s, loss=0.503, acc=82.2]


Train Loss: 0.5003, Train Acc: 82.22%
Test Loss: 0.5576, Test Acc: 81.18%

Epoch 28/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.52it/s, loss=0.477, acc=83.3]


Train Loss: 0.4742, Train Acc: 83.33%
Test Loss: 0.5595, Test Acc: 80.96%

Epoch 29/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.49it/s, loss=0.46, acc=83.8] 


Train Loss: 0.4572, Train Acc: 83.79%
Test Loss: 0.5733, Test Acc: 81.08%

Epoch 30/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.68it/s, loss=0.43, acc=84.9] 


Train Loss: 0.4280, Train Acc: 84.87%
Test Loss: 0.5461, Test Acc: 81.27%

Epoch 31/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.59it/s, loss=0.405, acc=85.6]


Train Loss: 0.4025, Train Acc: 85.57%
Test Loss: 0.5708, Test Acc: 81.32%

Epoch 32/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.53it/s, loss=0.39, acc=86.1] 


Train Loss: 0.3885, Train Acc: 86.09%
Test Loss: 0.5349, Test Acc: 82.08%

Epoch 33/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.54it/s, loss=0.369, acc=86.9]


Train Loss: 0.3668, Train Acc: 86.89%
Test Loss: 0.5430, Test Acc: 82.61%

Epoch 34/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.45it/s, loss=0.344, acc=87.8]


Train Loss: 0.3428, Train Acc: 87.84%
Test Loss: 0.5264, Test Acc: 82.93%

Epoch 35/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.56it/s, loss=0.323, acc=88.5]


Train Loss: 0.3210, Train Acc: 88.54%
Test Loss: 0.5560, Test Acc: 82.47%

Epoch 36/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.51it/s, loss=0.303, acc=89.3]


Train Loss: 0.3016, Train Acc: 89.27%
Test Loss: 0.5369, Test Acc: 82.99%

Epoch 37/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.47it/s, loss=0.285, acc=89.9]


Train Loss: 0.2832, Train Acc: 89.86%
Test Loss: 0.5344, Test Acc: 83.34%

Epoch 38/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.59it/s, loss=0.265, acc=90.6]


Train Loss: 0.2636, Train Acc: 90.58%
Test Loss: 0.5380, Test Acc: 83.46%

Epoch 39/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.47it/s, loss=0.248, acc=91.1]


Train Loss: 0.2465, Train Acc: 91.13%
Test Loss: 0.5470, Test Acc: 83.23%

Epoch 40/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.59it/s, loss=0.23, acc=91.8] 


Train Loss: 0.2291, Train Acc: 91.79%
Test Loss: 0.5552, Test Acc: 83.59%

Epoch 41/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.52it/s, loss=0.213, acc=92.5]


Train Loss: 0.2119, Train Acc: 92.47%
Test Loss: 0.5656, Test Acc: 83.36%

Epoch 42/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.49it/s, loss=0.211, acc=92.4]


Train Loss: 0.2103, Train Acc: 92.36%
Test Loss: 0.5575, Test Acc: 83.45%

Epoch 43/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.00it/s, loss=0.194, acc=93.1]


Train Loss: 0.1938, Train Acc: 93.12%
Test Loss: 0.5582, Test Acc: 83.90%

Epoch 44/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.47it/s, loss=0.183, acc=93.5]


Train Loss: 0.1824, Train Acc: 93.49%
Test Loss: 0.5596, Test Acc: 83.99%

Epoch 45/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.51it/s, loss=0.175, acc=93.8]


Train Loss: 0.1741, Train Acc: 93.85%
Test Loss: 0.5729, Test Acc: 83.95%

Epoch 46/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.51it/s, loss=0.17, acc=94]   


Train Loss: 0.1694, Train Acc: 94.00%
Test Loss: 0.5726, Test Acc: 83.90%

Epoch 47/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.55it/s, loss=0.162, acc=94.3]


Train Loss: 0.1610, Train Acc: 94.29%
Test Loss: 0.5727, Test Acc: 83.84%

Epoch 48/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.56it/s, loss=0.161, acc=94.3]


Train Loss: 0.1601, Train Acc: 94.30%
Test Loss: 0.5767, Test Acc: 83.87%

Epoch 49/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.46it/s, loss=0.154, acc=94.6]


Train Loss: 0.1544, Train Acc: 94.57%
Test Loss: 0.5779, Test Acc: 83.81%

Epoch 50/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.41it/s, loss=0.155, acc=94.6]


Train Loss: 0.1537, Train Acc: 94.64%
Test Loss: 0.5773, Test Acc: 83.81%

=== Running hyperparameter configuration 3/5 ===

Epoch 1/50


Training: 100%|██████████| 391/391 [00:23<00:00, 16.32it/s, loss=2.13, acc=19.1]


Train Loss: 2.1237, Train Acc: 19.07%
Test Loss: 2.3176, Test Acc: 10.01%

Epoch 2/50


Training: 100%|██████████| 391/391 [00:24<00:00, 16.24it/s, loss=2.32, acc=9.92]


Train Loss: 2.3103, Train Acc: 9.92%
Test Loss: 2.3129, Test Acc: 10.01%

Epoch 3/50


Training: 100%|██████████| 391/391 [00:24<00:00, 16.28it/s, loss=2.31, acc=9.96]


Train Loss: 2.3066, Train Acc: 9.96%
Test Loss: 2.3067, Test Acc: 10.01%

Epoch 4/50


Training: 100%|██████████| 391/391 [00:24<00:00, 16.28it/s, loss=2.31, acc=9.82]


Train Loss: 2.3053, Train Acc: 9.82%
Test Loss: 2.3055, Test Acc: 10.00%

Epoch 5/50


Training: 100%|██████████| 391/391 [00:23<00:00, 16.34it/s, loss=2.31, acc=10]  


Train Loss: 2.3044, Train Acc: 10.00%
Test Loss: 2.3040, Test Acc: 10.01%

Epoch 6/50


Training: 100%|██████████| 391/391 [00:24<00:00, 16.24it/s, loss=2.31, acc=9.88]


Train Loss: 2.3040, Train Acc: 9.88%
Test Loss: 2.3037, Test Acc: 10.02%

Epoch 7/50


Training: 100%|██████████| 391/391 [00:24<00:00, 16.26it/s, loss=2.31, acc=9.92]


Train Loss: 2.3035, Train Acc: 9.92%
Test Loss: 2.3035, Test Acc: 10.04%

Epoch 8/50


Training: 100%|██████████| 391/391 [00:23<00:00, 16.30it/s, loss=2.31, acc=10]  


Train Loss: 2.3035, Train Acc: 10.02%
Test Loss: 2.3029, Test Acc: 10.00%

Epoch 9/50


Training: 100%|██████████| 391/391 [00:23<00:00, 16.33it/s, loss=2.31, acc=9.92]


Train Loss: 2.3033, Train Acc: 9.92%
Test Loss: 2.3029, Test Acc: 10.00%

Epoch 10/50


Training: 100%|██████████| 391/391 [00:23<00:00, 16.36it/s, loss=2.31, acc=9.93]


Train Loss: 2.3032, Train Acc: 9.93%
Test Loss: 2.3028, Test Acc: 10.00%

Epoch 11/50


Training: 100%|██████████| 391/391 [00:23<00:00, 16.32it/s, loss=2.31, acc=9.75]


Train Loss: 2.3034, Train Acc: 9.75%
Test Loss: 2.3027, Test Acc: 10.00%

Epoch 12/50


Training: 100%|██████████| 391/391 [00:24<00:00, 16.28it/s, loss=2.31, acc=10.1]


Train Loss: 2.3030, Train Acc: 10.13%
Test Loss: 2.3027, Test Acc: 10.00%

Epoch 13/50


Training: 100%|██████████| 391/391 [00:23<00:00, 16.43it/s, loss=2.31, acc=9.94]


Train Loss: 2.3030, Train Acc: 9.94%
Test Loss: 2.3028, Test Acc: 10.00%

Epoch 14/50


Training: 100%|██████████| 391/391 [00:23<00:00, 16.32it/s, loss=2.31, acc=9.74]


Train Loss: 2.3031, Train Acc: 9.74%
Test Loss: 2.3026, Test Acc: 10.00%

Epoch 15/50


Training: 100%|██████████| 391/391 [00:24<00:00, 16.28it/s, loss=2.31, acc=10.1]


Train Loss: 2.3029, Train Acc: 10.10%
Test Loss: 2.3036, Test Acc: 10.00%

Epoch 16/50


Training: 100%|██████████| 391/391 [00:24<00:00, 16.17it/s, loss=2.3, acc=9.69] 


Train Loss: 2.3032, Train Acc: 9.69%
Test Loss: 2.3027, Test Acc: 10.00%

Epoch 17/50


Training: 100%|██████████| 391/391 [00:23<00:00, 16.42it/s, loss=2.31, acc=9.89]


Train Loss: 2.3032, Train Acc: 9.89%
Test Loss: 2.3028, Test Acc: 10.01%

Epoch 18/50


Training: 100%|██████████| 391/391 [00:23<00:00, 16.32it/s, loss=2.31, acc=9.89]


Train Loss: 2.3029, Train Acc: 9.89%
Test Loss: 2.3028, Test Acc: 9.99%

Epoch 19/50


Training: 100%|██████████| 391/391 [00:24<00:00, 16.27it/s, loss=2.31, acc=9.82]


Train Loss: 2.3030, Train Acc: 9.82%
Test Loss: 2.3026, Test Acc: 10.01%

Epoch 20/50


Training: 100%|██████████| 391/391 [00:23<00:00, 16.37it/s, loss=2.31, acc=9.91]


Train Loss: 2.3030, Train Acc: 9.91%
Test Loss: 2.3027, Test Acc: 10.00%

Epoch 21/50


Training: 100%|██████████| 391/391 [00:23<00:00, 16.30it/s, loss=2.31, acc=9.77]


Train Loss: 2.3030, Train Acc: 9.77%
Test Loss: 2.3028, Test Acc: 10.00%

Epoch 22/50


Training: 100%|██████████| 391/391 [00:23<00:00, 16.37it/s, loss=2.31, acc=9.84]


Train Loss: 2.3030, Train Acc: 9.84%
Test Loss: 2.3026, Test Acc: 10.00%

Epoch 23/50


Training: 100%|██████████| 391/391 [00:23<00:00, 16.34it/s, loss=2.31, acc=9.71]


Train Loss: 2.3029, Train Acc: 9.71%
Test Loss: 2.3028, Test Acc: 10.00%

Epoch 24/50


Training: 100%|██████████| 391/391 [00:23<00:00, 16.35it/s, loss=2.31, acc=9.94]


Train Loss: 2.3042, Train Acc: 9.94%
Test Loss: 2.3028, Test Acc: 9.99%

Epoch 25/50


Training: 100%|██████████| 391/391 [00:24<00:00, 16.26it/s, loss=2.31, acc=9.89]


Train Loss: 2.3029, Train Acc: 9.89%
Test Loss: 2.3031, Test Acc: 10.00%

Epoch 26/50


Training: 100%|██████████| 391/391 [00:23<00:00, 16.31it/s, loss=2.31, acc=9.95]


Train Loss: 2.3030, Train Acc: 9.95%
Test Loss: 2.3027, Test Acc: 10.00%

Epoch 27/50


Training: 100%|██████████| 391/391 [00:23<00:00, 16.34it/s, loss=2.31, acc=9.9] 


Train Loss: 2.3029, Train Acc: 9.90%
Test Loss: 2.3027, Test Acc: 10.00%

Epoch 28/50


Training: 100%|██████████| 391/391 [00:23<00:00, 16.36it/s, loss=2.31, acc=9.76]


Train Loss: 2.3028, Train Acc: 9.76%
Test Loss: 2.3026, Test Acc: 10.00%

Epoch 29/50


Training: 100%|██████████| 391/391 [00:24<00:00, 16.23it/s, loss=2.31, acc=10.1]


Train Loss: 2.3027, Train Acc: 10.07%
Test Loss: 2.3030, Test Acc: 10.00%

Epoch 30/50


Training: 100%|██████████| 391/391 [00:23<00:00, 16.35it/s, loss=2.31, acc=9.91]


Train Loss: 2.3031, Train Acc: 9.91%
Test Loss: 2.3027, Test Acc: 10.00%

Epoch 31/50


Training: 100%|██████████| 391/391 [00:24<00:00, 16.27it/s, loss=2.31, acc=9.75]


Train Loss: 2.3028, Train Acc: 9.75%
Test Loss: 2.3027, Test Acc: 10.00%

Epoch 32/50


Training: 100%|██████████| 391/391 [00:23<00:00, 16.31it/s, loss=2.31, acc=9.98]


Train Loss: 2.3027, Train Acc: 9.98%
Test Loss: 2.3027, Test Acc: 10.00%

Epoch 33/50


Training: 100%|██████████| 391/391 [00:23<00:00, 16.36it/s, loss=2.31, acc=9.98]


Train Loss: 2.3027, Train Acc: 9.98%
Test Loss: 2.3027, Test Acc: 10.00%

Epoch 34/50


Training: 100%|██████████| 391/391 [00:23<00:00, 16.29it/s, loss=2.31, acc=9.65]


Train Loss: 2.3027, Train Acc: 9.65%
Test Loss: 2.3026, Test Acc: 10.00%

Epoch 35/50


Training: 100%|██████████| 391/391 [00:23<00:00, 16.37it/s, loss=2.31, acc=9.93]


Train Loss: 2.3027, Train Acc: 9.93%
Test Loss: 2.3026, Test Acc: 10.00%

Epoch 36/50


Training: 100%|██████████| 391/391 [00:24<00:00, 16.27it/s, loss=2.31, acc=9.78]


Train Loss: 2.3027, Train Acc: 9.78%
Test Loss: 2.3026, Test Acc: 10.00%

Epoch 37/50


Training: 100%|██████████| 391/391 [00:24<00:00, 16.28it/s, loss=2.31, acc=9.83]


Train Loss: 2.3027, Train Acc: 9.83%
Test Loss: 2.3026, Test Acc: 10.00%

Epoch 38/50


Training: 100%|██████████| 391/391 [00:24<00:00, 16.28it/s, loss=2.31, acc=9.95]


Train Loss: 2.3027, Train Acc: 9.95%
Test Loss: 2.3026, Test Acc: 10.00%

Epoch 39/50


Training: 100%|██████████| 391/391 [00:24<00:00, 16.22it/s, loss=2.3, acc=9.93] 


Train Loss: 2.3027, Train Acc: 9.93%
Test Loss: 2.3026, Test Acc: 10.00%

Epoch 40/50


Training: 100%|██████████| 391/391 [00:23<00:00, 16.29it/s, loss=2.31, acc=9.94]


Train Loss: 2.3026, Train Acc: 9.94%
Test Loss: 2.3026, Test Acc: 10.00%

Epoch 41/50


Training: 100%|██████████| 391/391 [00:24<00:00, 16.29it/s, loss=2.31, acc=9.97]


Train Loss: 2.3027, Train Acc: 9.97%
Test Loss: 2.3026, Test Acc: 10.00%

Epoch 42/50


Training: 100%|██████████| 391/391 [00:23<00:00, 16.34it/s, loss=2.31, acc=9.75]


Train Loss: 2.3026, Train Acc: 9.75%
Test Loss: 2.3026, Test Acc: 10.00%

Epoch 43/50


Training: 100%|██████████| 391/391 [00:23<00:00, 16.29it/s, loss=2.31, acc=9.91]


Train Loss: 2.3026, Train Acc: 9.91%
Test Loss: 2.3026, Test Acc: 10.00%

Epoch 44/50


Training: 100%|██████████| 391/391 [00:23<00:00, 16.30it/s, loss=2.31, acc=9.8] 


Train Loss: 2.3026, Train Acc: 9.80%
Test Loss: 2.3026, Test Acc: 10.00%

Epoch 45/50


Training: 100%|██████████| 391/391 [00:23<00:00, 16.29it/s, loss=2.31, acc=10]  


Train Loss: 2.3026, Train Acc: 10.02%
Test Loss: 2.3026, Test Acc: 10.00%

Epoch 46/50


Training: 100%|██████████| 391/391 [00:24<00:00, 16.29it/s, loss=2.31, acc=9.97]


Train Loss: 2.3026, Train Acc: 9.97%
Test Loss: 2.3026, Test Acc: 10.00%

Epoch 47/50


Training: 100%|██████████| 391/391 [00:24<00:00, 16.28it/s, loss=2.31, acc=9.88]


Train Loss: 2.3026, Train Acc: 9.88%
Test Loss: 2.3026, Test Acc: 10.00%

Epoch 48/50


Training: 100%|██████████| 391/391 [00:23<00:00, 16.33it/s, loss=2.31, acc=10]  


Train Loss: 2.3026, Train Acc: 10.00%
Test Loss: 2.3026, Test Acc: 10.00%

Epoch 49/50


Training: 100%|██████████| 391/391 [00:24<00:00, 16.23it/s, loss=2.3, acc=9.98] 


Train Loss: 2.3026, Train Acc: 9.98%
Test Loss: 2.3026, Test Acc: 10.00%

Epoch 50/50


Training: 100%|██████████| 391/391 [00:23<00:00, 16.38it/s, loss=2.31, acc=9.92]


Train Loss: 2.3026, Train Acc: 9.92%
Test Loss: 2.3026, Test Acc: 10.00%

=== Running hyperparameter configuration 4/5 ===

Epoch 1/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.73it/s, loss=1.78, acc=33.4]


Train Loss: 1.7848, Train Acc: 33.42%
Test Loss: 1.6317, Test Acc: 40.03%

Epoch 2/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.86it/s, loss=1.48, acc=46.4]


Train Loss: 1.4697, Train Acc: 46.43%
Test Loss: 1.3539, Test Acc: 50.28%

Epoch 3/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.82it/s, loss=1.35, acc=51.1]


Train Loss: 1.3478, Train Acc: 51.06%
Test Loss: 1.2610, Test Acc: 54.81%

Epoch 4/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.70it/s, loss=1.26, acc=54.5]


Train Loss: 1.2570, Train Acc: 54.54%
Test Loss: 1.2086, Test Acc: 55.74%

Epoch 5/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.75it/s, loss=1.21, acc=56.7]


Train Loss: 1.2033, Train Acc: 56.70%
Test Loss: 1.1200, Test Acc: 59.51%

Epoch 6/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.80it/s, loss=1.15, acc=58.8]


Train Loss: 1.1408, Train Acc: 58.83%
Test Loss: 1.1062, Test Acc: 60.48%

Epoch 7/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.71it/s, loss=1.1, acc=60.9] 


Train Loss: 1.0925, Train Acc: 60.91%
Test Loss: 1.0496, Test Acc: 62.94%

Epoch 8/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.83it/s, loss=1.04, acc=62.8]


Train Loss: 1.0400, Train Acc: 62.76%
Test Loss: 0.9563, Test Acc: 65.63%

Epoch 9/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.81it/s, loss=0.994, acc=64.8]


Train Loss: 0.9888, Train Acc: 64.78%
Test Loss: 0.9996, Test Acc: 64.39%

Epoch 10/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.70it/s, loss=0.956, acc=66]  


Train Loss: 0.9507, Train Acc: 65.96%
Test Loss: 0.9567, Test Acc: 66.29%

Epoch 11/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.24it/s, loss=0.916, acc=67.5]


Train Loss: 0.9139, Train Acc: 67.55%
Test Loss: 0.8645, Test Acc: 69.37%

Epoch 12/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.83it/s, loss=0.887, acc=68.6]


Train Loss: 0.8821, Train Acc: 68.64%
Test Loss: 0.8495, Test Acc: 69.70%

Epoch 13/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.83it/s, loss=0.857, acc=69.7]


Train Loss: 0.8521, Train Acc: 69.74%
Test Loss: 0.7840, Test Acc: 72.41%

Epoch 14/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.91it/s, loss=0.825, acc=71]  


Train Loss: 0.8211, Train Acc: 71.01%
Test Loss: 0.8084, Test Acc: 71.89%

Epoch 15/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.72it/s, loss=0.803, acc=71.8]


Train Loss: 0.7985, Train Acc: 71.80%
Test Loss: 0.7320, Test Acc: 74.09%

Epoch 16/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.83it/s, loss=0.771, acc=72.8]


Train Loss: 0.7670, Train Acc: 72.75%
Test Loss: 0.7582, Test Acc: 72.72%

Epoch 17/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.78it/s, loss=0.749, acc=73.6]


Train Loss: 0.7447, Train Acc: 73.61%
Test Loss: 0.7355, Test Acc: 74.42%

Epoch 18/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.75it/s, loss=0.716, acc=74.8]


Train Loss: 0.7122, Train Acc: 74.81%
Test Loss: 0.6805, Test Acc: 76.26%

Epoch 19/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.87it/s, loss=0.697, acc=75.4]


Train Loss: 0.6936, Train Acc: 75.37%
Test Loss: 0.7120, Test Acc: 75.42%

Epoch 20/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.84it/s, loss=0.666, acc=76.7]


Train Loss: 0.6623, Train Acc: 76.73%
Test Loss: 0.6788, Test Acc: 76.67%

Epoch 21/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.69it/s, loss=0.65, acc=77.3] 


Train Loss: 0.6465, Train Acc: 77.26%
Test Loss: 0.6685, Test Acc: 77.08%

Epoch 22/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.66it/s, loss=0.629, acc=77.8]


Train Loss: 0.6258, Train Acc: 77.85%
Test Loss: 0.6312, Test Acc: 77.99%

Epoch 23/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.71it/s, loss=0.603, acc=78.8]


Train Loss: 0.6000, Train Acc: 78.82%
Test Loss: 0.6478, Test Acc: 77.64%

Epoch 24/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.72it/s, loss=0.586, acc=79.2]


Train Loss: 0.5830, Train Acc: 79.20%
Test Loss: 0.6162, Test Acc: 79.31%

Epoch 25/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.71it/s, loss=0.559, acc=80.3]


Train Loss: 0.5577, Train Acc: 80.32%
Test Loss: 0.5871, Test Acc: 80.26%

Epoch 26/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.64it/s, loss=0.534, acc=81.2]


Train Loss: 0.5309, Train Acc: 81.15%
Test Loss: 0.6034, Test Acc: 79.44%

Epoch 27/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.76it/s, loss=0.521, acc=81.8]


Train Loss: 0.5183, Train Acc: 81.77%
Test Loss: 0.5807, Test Acc: 80.55%

Epoch 28/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.50it/s, loss=0.496, acc=82.4]


Train Loss: 0.4943, Train Acc: 82.39%
Test Loss: 0.5670, Test Acc: 80.69%

Epoch 29/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.83it/s, loss=0.471, acc=83.4]


Train Loss: 0.4690, Train Acc: 83.38%
Test Loss: 0.5667, Test Acc: 80.82%

Epoch 30/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.74it/s, loss=0.451, acc=84]  


Train Loss: 0.4491, Train Acc: 83.97%
Test Loss: 0.5483, Test Acc: 81.46%

Epoch 31/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.68it/s, loss=0.424, acc=85]  


Train Loss: 0.4245, Train Acc: 84.99%
Test Loss: 0.5645, Test Acc: 81.03%

Epoch 32/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.65it/s, loss=0.408, acc=85.5]


Train Loss: 0.4057, Train Acc: 85.49%
Test Loss: 0.5507, Test Acc: 81.59%

Epoch 33/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.71it/s, loss=0.385, acc=86.3]


Train Loss: 0.3826, Train Acc: 86.26%
Test Loss: 0.5660, Test Acc: 81.90%

Epoch 34/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.57it/s, loss=0.366, acc=87]  


Train Loss: 0.3647, Train Acc: 87.01%
Test Loss: 0.5494, Test Acc: 82.07%

Epoch 35/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.67it/s, loss=0.347, acc=87.7]


Train Loss: 0.3451, Train Acc: 87.66%
Test Loss: 0.5419, Test Acc: 82.73%

Epoch 36/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.83it/s, loss=0.324, acc=88.4]


Train Loss: 0.3223, Train Acc: 88.41%
Test Loss: 0.5558, Test Acc: 82.49%

Epoch 37/50


Training: 100%|██████████| 391/391 [00:19<00:00, 20.42it/s, loss=0.306, acc=89.2]


Train Loss: 0.3054, Train Acc: 89.16%
Test Loss: 0.5618, Test Acc: 82.32%

Epoch 38/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.87it/s, loss=0.288, acc=89.9]


Train Loss: 0.2862, Train Acc: 89.89%
Test Loss: 0.5483, Test Acc: 83.14%

Epoch 39/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.70it/s, loss=0.269, acc=90.4]


Train Loss: 0.2678, Train Acc: 90.42%
Test Loss: 0.5487, Test Acc: 82.97%

Epoch 40/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.74it/s, loss=0.251, acc=91.2]


Train Loss: 0.2500, Train Acc: 91.17%
Test Loss: 0.5566, Test Acc: 83.31%

Epoch 41/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.69it/s, loss=0.237, acc=91.5]


Train Loss: 0.2360, Train Acc: 91.52%
Test Loss: 0.5717, Test Acc: 82.94%

Epoch 42/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.69it/s, loss=0.225, acc=92]  


Train Loss: 0.2243, Train Acc: 92.02%
Test Loss: 0.5720, Test Acc: 83.11%

Epoch 43/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.76it/s, loss=0.217, acc=92.3]


Train Loss: 0.2154, Train Acc: 92.31%
Test Loss: 0.5798, Test Acc: 83.08%

Epoch 44/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.72it/s, loss=0.202, acc=92.8]


Train Loss: 0.2015, Train Acc: 92.75%
Test Loss: 0.5794, Test Acc: 83.01%

Epoch 45/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.74it/s, loss=0.196, acc=93.2]


Train Loss: 0.1951, Train Acc: 93.18%
Test Loss: 0.5844, Test Acc: 83.25%

Epoch 46/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.71it/s, loss=0.19, acc=93.3] 


Train Loss: 0.1889, Train Acc: 93.29%
Test Loss: 0.5935, Test Acc: 83.13%

Epoch 47/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.81it/s, loss=0.182, acc=93.5]


Train Loss: 0.1810, Train Acc: 93.54%
Test Loss: 0.5861, Test Acc: 83.38%

Epoch 48/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.77it/s, loss=0.178, acc=93.8]


Train Loss: 0.1771, Train Acc: 93.76%
Test Loss: 0.5880, Test Acc: 83.33%

Epoch 49/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.67it/s, loss=0.176, acc=93.7]


Train Loss: 0.1751, Train Acc: 93.70%
Test Loss: 0.5894, Test Acc: 83.34%

Epoch 50/50


Training: 100%|██████████| 391/391 [00:18<00:00, 20.88it/s, loss=0.174, acc=93.9]


Train Loss: 0.1736, Train Acc: 93.93%
Test Loss: 0.5886, Test Acc: 83.33%

=== Running hyperparameter configuration 5/5 ===

Epoch 1/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.62it/s, loss=1.71, acc=36.9]


Train Loss: 1.7017, Train Acc: 36.92%
Test Loss: 1.4025, Test Acc: 49.47%

Epoch 2/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.39it/s, loss=1.33, acc=51.8]


Train Loss: 1.3203, Train Acc: 51.75%
Test Loss: 1.1871, Test Acc: 56.74%

Epoch 3/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.47it/s, loss=1.19, acc=57.3]


Train Loss: 1.1824, Train Acc: 57.30%
Test Loss: 1.1197, Test Acc: 59.77%

Epoch 4/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.23it/s, loss=1.1, acc=60.7] 


Train Loss: 1.0946, Train Acc: 60.68%
Test Loss: 1.0026, Test Acc: 64.11%

Epoch 5/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.61it/s, loss=1.04, acc=62.8]


Train Loss: 1.0384, Train Acc: 62.84%
Test Loss: 0.9592, Test Acc: 65.63%

Epoch 6/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.48it/s, loss=0.988, acc=64.8]


Train Loss: 0.9827, Train Acc: 64.75%
Test Loss: 0.8981, Test Acc: 67.99%

Epoch 7/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.43it/s, loss=0.938, acc=66.9]


Train Loss: 0.9329, Train Acc: 66.91%
Test Loss: 0.9170, Test Acc: 67.55%

Epoch 8/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.39it/s, loss=0.9, acc=68.2]  


Train Loss: 0.8954, Train Acc: 68.24%
Test Loss: 0.8538, Test Acc: 69.19%

Epoch 9/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.33it/s, loss=0.847, acc=70.1]


Train Loss: 0.8428, Train Acc: 70.06%
Test Loss: 0.7953, Test Acc: 71.55%

Epoch 10/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.32it/s, loss=0.813, acc=71.4]


Train Loss: 0.8093, Train Acc: 71.40%
Test Loss: 0.7836, Test Acc: 72.36%

Epoch 11/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.31it/s, loss=0.776, acc=72.9]


Train Loss: 0.7724, Train Acc: 72.94%
Test Loss: 0.7698, Test Acc: 73.18%

Epoch 12/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.21it/s, loss=0.742, acc=73.7]


Train Loss: 0.7399, Train Acc: 73.70%
Test Loss: 0.7340, Test Acc: 73.94%

Epoch 13/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.35it/s, loss=0.715, acc=74.7]


Train Loss: 0.7116, Train Acc: 74.68%
Test Loss: 0.7057, Test Acc: 75.28%

Epoch 14/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.48it/s, loss=0.687, acc=75.8]


Train Loss: 0.6834, Train Acc: 75.77%
Test Loss: 0.7032, Test Acc: 75.00%

Epoch 15/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.16it/s, loss=0.658, acc=76.8]


Train Loss: 0.6546, Train Acc: 76.81%
Test Loss: 0.6634, Test Acc: 77.00%

Epoch 16/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.41it/s, loss=0.638, acc=77.7]


Train Loss: 0.6350, Train Acc: 77.71%
Test Loss: 0.6315, Test Acc: 77.75%

Epoch 17/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.48it/s, loss=0.61, acc=78.3] 


Train Loss: 0.6071, Train Acc: 78.29%
Test Loss: 0.6521, Test Acc: 77.46%

Epoch 18/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.11it/s, loss=0.593, acc=79]  


Train Loss: 0.5902, Train Acc: 79.04%
Test Loss: 0.6345, Test Acc: 77.96%

Epoch 19/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.56it/s, loss=0.57, acc=79.8] 


Train Loss: 0.5684, Train Acc: 79.81%
Test Loss: 0.6272, Test Acc: 78.54%

Epoch 20/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.45it/s, loss=0.547, acc=80.9]


Train Loss: 0.5441, Train Acc: 80.86%
Test Loss: 0.6054, Test Acc: 79.07%

Epoch 21/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.37it/s, loss=0.528, acc=81.6]


Train Loss: 0.5251, Train Acc: 81.59%
Test Loss: 0.5953, Test Acc: 79.09%

Epoch 22/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.57it/s, loss=0.508, acc=82.1]


Train Loss: 0.5051, Train Acc: 82.11%
Test Loss: 0.6000, Test Acc: 80.04%

Epoch 23/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.29it/s, loss=0.486, acc=82.9]


Train Loss: 0.4833, Train Acc: 82.94%
Test Loss: 0.5853, Test Acc: 79.98%

Epoch 24/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.09it/s, loss=0.467, acc=83.5]


Train Loss: 0.4649, Train Acc: 83.46%
Test Loss: 0.5629, Test Acc: 80.69%

Epoch 25/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.39it/s, loss=0.444, acc=84.4]


Train Loss: 0.4414, Train Acc: 84.37%
Test Loss: 0.5541, Test Acc: 81.58%

Epoch 26/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.49it/s, loss=0.428, acc=85]  


Train Loss: 0.4262, Train Acc: 84.98%
Test Loss: 0.5613, Test Acc: 81.34%

Epoch 27/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.41it/s, loss=0.412, acc=85.5]


Train Loss: 0.4096, Train Acc: 85.46%
Test Loss: 0.5465, Test Acc: 82.12%

Epoch 28/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.21it/s, loss=0.393, acc=86]  


Train Loss: 0.3915, Train Acc: 86.04%
Test Loss: 0.5358, Test Acc: 82.33%

Epoch 29/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.41it/s, loss=0.377, acc=86.6]


Train Loss: 0.3754, Train Acc: 86.56%
Test Loss: 0.5628, Test Acc: 81.68%

Epoch 30/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.08it/s, loss=0.356, acc=87.3]


Train Loss: 0.3539, Train Acc: 87.32%
Test Loss: 0.5372, Test Acc: 82.91%

Epoch 31/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.28it/s, loss=0.339, acc=87.8]


Train Loss: 0.3372, Train Acc: 87.84%
Test Loss: 0.5389, Test Acc: 82.29%

Epoch 32/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.35it/s, loss=0.322, acc=88.5]


Train Loss: 0.3207, Train Acc: 88.52%
Test Loss: 0.5289, Test Acc: 83.10%

Epoch 33/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.30it/s, loss=0.306, acc=89.1]


Train Loss: 0.3049, Train Acc: 89.08%
Test Loss: 0.5469, Test Acc: 82.83%

Epoch 34/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.36it/s, loss=0.295, acc=89.6]


Train Loss: 0.2932, Train Acc: 89.62%
Test Loss: 0.5305, Test Acc: 83.21%

Epoch 35/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.52it/s, loss=0.28, acc=90.1] 


Train Loss: 0.2789, Train Acc: 90.12%
Test Loss: 0.5229, Test Acc: 83.59%

Epoch 36/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.50it/s, loss=0.268, acc=90.6]


Train Loss: 0.2666, Train Acc: 90.58%
Test Loss: 0.5177, Test Acc: 83.75%

Epoch 37/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.34it/s, loss=0.253, acc=91]  


Train Loss: 0.2517, Train Acc: 91.00%
Test Loss: 0.5402, Test Acc: 83.64%

Epoch 38/50


Training: 100%|██████████| 391/391 [00:15<00:00, 25.96it/s, loss=0.237, acc=91.6]


Train Loss: 0.2365, Train Acc: 91.61%
Test Loss: 0.5238, Test Acc: 84.21%

Epoch 39/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.70it/s, loss=0.226, acc=91.9]


Train Loss: 0.2251, Train Acc: 91.87%
Test Loss: 0.5403, Test Acc: 83.88%

Epoch 40/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.41it/s, loss=0.222, acc=92.1]


Train Loss: 0.2205, Train Acc: 92.11%
Test Loss: 0.5401, Test Acc: 84.15%

Epoch 41/50


Training: 100%|██████████| 391/391 [00:15<00:00, 25.79it/s, loss=0.208, acc=92.5]


Train Loss: 0.2074, Train Acc: 92.54%
Test Loss: 0.5394, Test Acc: 84.15%

Epoch 42/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.68it/s, loss=0.204, acc=92.7]


Train Loss: 0.2033, Train Acc: 92.72%
Test Loss: 0.5358, Test Acc: 84.37%

Epoch 43/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.53it/s, loss=0.195, acc=93.2]


Train Loss: 0.1937, Train Acc: 93.15%
Test Loss: 0.5500, Test Acc: 84.06%

Epoch 44/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.30it/s, loss=0.188, acc=93.4]


Train Loss: 0.1871, Train Acc: 93.37%
Test Loss: 0.5461, Test Acc: 84.23%

Epoch 45/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.35it/s, loss=0.182, acc=93.5]


Train Loss: 0.1815, Train Acc: 93.51%
Test Loss: 0.5466, Test Acc: 84.34%

Epoch 46/50


Training: 100%|██████████| 391/391 [00:15<00:00, 26.04it/s, loss=0.176, acc=93.8]


Train Loss: 0.1751, Train Acc: 93.77%
Test Loss: 0.5477, Test Acc: 84.39%

Epoch 47/50


Training: 100%|██████████| 391/391 [00:15<00:00, 25.79it/s, loss=0.174, acc=93.9]


Train Loss: 0.1737, Train Acc: 93.86%
Test Loss: 0.5425, Test Acc: 84.39%

Epoch 48/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.31it/s, loss=0.171, acc=93.9]


Train Loss: 0.1704, Train Acc: 93.92%
Test Loss: 0.5465, Test Acc: 84.56%

Epoch 49/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.14it/s, loss=0.174, acc=93.8]


Train Loss: 0.1738, Train Acc: 93.80%
Test Loss: 0.5442, Test Acc: 84.61%

Epoch 50/50


Training: 100%|██████████| 391/391 [00:14<00:00, 26.10it/s, loss=0.17, acc=94.1] 


Train Loss: 0.1693, Train Acc: 94.06%
Test Loss: 0.5449, Test Acc: 84.56%

Hyperparameter Exploration Results:
Config 1:
  Patch Size: 4
  Embed Dim: 256
  Depth: 6
  Heads: 8
  MLP Dim: 512
  Learning Rate: 0.001
  Best Test Accuracy: 83.81%
Config 2:
  Patch Size: 4
  Embed Dim: 256
  Depth: 8
  Heads: 8
  MLP Dim: 512
  Learning Rate: 0.001
  Best Test Accuracy: 83.99%
Config 3:
  Patch Size: 4
  Embed Dim: 384
  Depth: 6
  Heads: 12
  MLP Dim: 768
  Learning Rate: 0.001
  Best Test Accuracy: 10.04%
Config 4:
  Patch Size: 4
  Embed Dim: 288
  Depth: 6
  Heads: 12
  MLP Dim: 576
  Learning Rate: 0.001
  Best Test Accuracy: 83.38%
Config 5:
  Patch Size: 4
  Embed Dim: 256
  Depth: 6
  Heads: 8
  MLP Dim: 512
  Learning Rate: 0.0005
  Best Test Accuracy: 84.61%

Best Configuration (Test Acc: 84.61%):
  Patch Size: 4
  Embed Dim: 256
  Depth: 6
  Heads: 8
  MLP Dim: 512
  Learning Rate: 0.0005


In [None]:
# Pull the hyperparameter-sweep entries out of the experiment results dict.
# The reporting cell iterates this and reads each entry's "config" and
# "best_test_acc" keys.
all_results = results['hyperparameters']

In [40]:
# Print one summary block per explored configuration.
#
# The loop variable is named `result`, not `results`: `results` is the
# notebook-level dict that holds every experiment's output (it is indexed as
# results['hyperparameters'] and assigned to as results['augmentation'] in
# other cells). Reusing that name here would rebind it to the last sweep entry
# and silently break those cells on the next run.
print("\nHyperparameter Exploration Results:")
print("=" * 50)
for i, result in enumerate(all_results):
    config = result["config"]
    print(f"Config {i+1}:")
    print(f"  Patch Size: {config['patch_size']}")
    print(f"  Embed Dim: {config['embed_dim']}")
    print(f"  Depth: {config['depth']}")
    print(f"  Heads: {config['n_heads']}")
    print(f"  MLP Dim: {config['mlp_dim']}")
    print(f"  Learning Rate: {config['lr']}")
    print(f"  Best Test Accuracy: {result['best_test_acc']:.2f}%")
    print("=" * 50)

# Find best configuration: index of the entry with the highest best test
# accuracy (the first such index wins on ties, as max() keeps the first maximum).
best_idx = max(range(len(all_results)), key=lambda i: all_results[i]['best_test_acc'])
best_config = all_results[best_idx]['config']
best_acc = all_results[best_idx]['best_test_acc']

print(f"\nBest Configuration (Test Acc: {best_acc:.2f}%):")
print(f"  Patch Size: {best_config['patch_size']}")
print(f"  Embed Dim: {best_config['embed_dim']}")
print(f"  Depth: {best_config['depth']}")
print(f"  Heads: {best_config['n_heads']}")
print(f"  MLP Dim: {best_config['mlp_dim']}")
print(f"  Learning Rate: {best_config['lr']}")


Hyperparameter Exploration Results:
Config 1:
  Patch Size: 4
  Embed Dim: 256
  Depth: 6
  Heads: 8
  MLP Dim: 512
  Learning Rate: 0.001
  Best Test Accuracy: 83.81%
Config 2:
  Patch Size: 4
  Embed Dim: 256
  Depth: 8
  Heads: 8
  MLP Dim: 512
  Learning Rate: 0.001
  Best Test Accuracy: 83.99%
Config 3:
  Patch Size: 4
  Embed Dim: 384
  Depth: 6
  Heads: 12
  MLP Dim: 768
  Learning Rate: 0.001
  Best Test Accuracy: 10.04%
Config 4:
  Patch Size: 4
  Embed Dim: 288
  Depth: 6
  Heads: 12
  MLP Dim: 576
  Learning Rate: 0.001
  Best Test Accuracy: 83.38%
Config 5:
  Patch Size: 4
  Embed Dim: 256
  Depth: 6
  Heads: 8
  MLP Dim: 512
  Learning Rate: 0.0005
  Best Test Accuracy: 84.61%

Best Configuration (Test Acc: 84.61%):
  Patch Size: 4
  Embed Dim: 256
  Depth: 6
  Heads: 8
  MLP Dim: 512
  Learning Rate: 0.0005


The hyperparameter exploration results reveal that configuration 5 performs best (84.61% accuracy) with a lower learning rate (0.0005) than the other setups. Interestingly, configuration 3 fails completely (10.04% accuracy, which is chance level for the 10 CIFAR-10 classes) despite having the largest embedding dimension (384) and MLP size (768), likely because the model becomes too complex and struggles to converge properly. The optimal configuration balances model capacity (256 embedding dimension, 6 transformer layers, 8 attention heads) with training stability. The results show that bigger isn't always better: carefully tuned smaller models often outperform larger ones, and the learning rate has a significant impact on final performance.

In [36]:
# Run the data-augmentation comparison only when it has been requested.
# NOTE(review): experiments_to_run, device, best_config, results and
# run_augmentation_experiment are all defined in other cells — run those first.
if 'augmentation' in experiments_to_run:
    print("\n=== Running Data Augmentation Experiment ===")
    # Store the experiment's return value under the 'augmentation' key of the
    # shared results dict.
    results['augmentation'] = run_augmentation_experiment(device, best_config)


=== Running Data Augmentation Experiment ===

=== Running experiment with augmentation=default ===

Epoch 1/20


Training: 100%|██████████| 391/391 [00:14<00:00, 26.64it/s, loss=1.7, acc=37.3] 


Train Loss: 1.6909, Train Acc: 37.30%
Test Loss: 1.3674, Test Acc: 50.27%

Epoch 2/20


Training: 100%|██████████| 391/391 [00:14<00:00, 26.44it/s, loss=1.32, acc=52.4]


Train Loss: 1.3126, Train Acc: 52.41%
Test Loss: 1.2181, Test Acc: 56.46%

Epoch 3/20


Training: 100%|██████████| 391/391 [00:14<00:00, 26.98it/s, loss=1.18, acc=57.6]


Train Loss: 1.1768, Train Acc: 57.57%
Test Loss: 1.1108, Test Acc: 60.88%

Epoch 4/20


Training: 100%|██████████| 391/391 [00:14<00:00, 26.70it/s, loss=1.1, acc=60.7] 


Train Loss: 1.0928, Train Acc: 60.71%
Test Loss: 1.0688, Test Acc: 61.51%

Epoch 5/20


Training: 100%|██████████| 391/391 [00:14<00:00, 26.73it/s, loss=1.03, acc=63.1]


Train Loss: 1.0253, Train Acc: 63.06%
Test Loss: 1.0531, Test Acc: 62.08%

Epoch 6/20


Training: 100%|██████████| 391/391 [00:14<00:00, 27.05it/s, loss=0.972, acc=65.5]


Train Loss: 0.9668, Train Acc: 65.45%
Test Loss: 0.9577, Test Acc: 65.76%

Epoch 7/20


Training: 100%|██████████| 391/391 [00:14<00:00, 26.65it/s, loss=0.916, acc=67.5]


Train Loss: 0.9112, Train Acc: 67.48%
Test Loss: 0.8638, Test Acc: 69.79%

Epoch 8/20


Training: 100%|██████████| 391/391 [00:14<00:00, 26.63it/s, loss=0.865, acc=69.3]


Train Loss: 0.8608, Train Acc: 69.32%
Test Loss: 0.8729, Test Acc: 68.58%

Epoch 9/20


Training: 100%|██████████| 391/391 [00:14<00:00, 26.84it/s, loss=0.821, acc=71.1]


Train Loss: 0.8168, Train Acc: 71.08%
Test Loss: 0.7746, Test Acc: 72.83%

Epoch 10/20


Training: 100%|██████████| 391/391 [00:14<00:00, 26.76it/s, loss=0.771, acc=72.9]


Train Loss: 0.7670, Train Acc: 72.92%
Test Loss: 0.7510, Test Acc: 73.32%

Epoch 11/20


Training: 100%|██████████| 391/391 [00:14<00:00, 26.64it/s, loss=0.733, acc=74.2]


Train Loss: 0.7293, Train Acc: 74.17%
Test Loss: 0.7191, Test Acc: 74.33%

Epoch 12/20


Training: 100%|██████████| 391/391 [00:14<00:00, 26.72it/s, loss=0.689, acc=75.9]


Train Loss: 0.6853, Train Acc: 75.88%
Test Loss: 0.6779, Test Acc: 75.90%

Epoch 13/20


Training: 100%|██████████| 391/391 [00:14<00:00, 26.66it/s, loss=0.647, acc=77.2]


Train Loss: 0.6435, Train Acc: 77.23%
Test Loss: 0.6637, Test Acc: 76.76%

Epoch 14/20


Training: 100%|██████████| 391/391 [00:15<00:00, 25.80it/s, loss=0.617, acc=78.4]


Train Loss: 0.6155, Train Acc: 78.36%
Test Loss: 0.6594, Test Acc: 76.74%

Epoch 15/20


Training: 100%|██████████| 391/391 [00:14<00:00, 26.09it/s, loss=0.584, acc=79.4]


Train Loss: 0.5810, Train Acc: 79.44%
Test Loss: 0.6416, Test Acc: 77.66%

Epoch 16/20


Training: 100%|██████████| 391/391 [00:14<00:00, 26.45it/s, loss=0.562, acc=80.2]


Train Loss: 0.5593, Train Acc: 80.18%
Test Loss: 0.6377, Test Acc: 77.95%

Epoch 17/20


Training: 100%|██████████| 391/391 [00:15<00:00, 25.65it/s, loss=0.538, acc=81.1]


Train Loss: 0.5363, Train Acc: 81.08%
Test Loss: 0.6127, Test Acc: 78.93%

Epoch 18/20


Training: 100%|██████████| 391/391 [00:14<00:00, 26.21it/s, loss=0.518, acc=81.9]


Train Loss: 0.5156, Train Acc: 81.89%
Test Loss: 0.6033, Test Acc: 79.31%

Epoch 19/20


Training: 100%|██████████| 391/391 [00:14<00:00, 26.46it/s, loss=0.51, acc=82]   


Train Loss: 0.5076, Train Acc: 82.00%
Test Loss: 0.5966, Test Acc: 79.56%

Epoch 20/20


Training: 100%|██████████| 391/391 [00:15<00:00, 26.04it/s, loss=0.499, acc=82.4]


Train Loss: 0.4981, Train Acc: 82.36%
Test Loss: 0.5967, Test Acc: 79.54%

=== Running experiment with augmentation=strong ===

Epoch 1/20


Training: 100%|██████████| 391/391 [00:21<00:00, 18.30it/s, loss=1.81, acc=32]  


Train Loss: 1.8133, Train Acc: 31.97%
Test Loss: 1.5731, Test Acc: 41.61%

Epoch 2/20


Training: 100%|██████████| 391/391 [00:23<00:00, 16.50it/s, loss=1.5, acc=45.3] 


Train Loss: 1.4995, Train Acc: 45.34%
Test Loss: 1.3252, Test Acc: 52.34%

Epoch 3/20


Training: 100%|██████████| 391/391 [00:21<00:00, 18.17it/s, loss=1.37, acc=50.5]


Train Loss: 1.3711, Train Acc: 50.46%
Test Loss: 1.1955, Test Acc: 56.58%

Epoch 4/20


Training: 100%|██████████| 391/391 [00:23<00:00, 16.61it/s, loss=1.3, acc=53.3] 


Train Loss: 1.3007, Train Acc: 53.29%
Test Loss: 1.1356, Test Acc: 58.99%

Epoch 5/20


Training: 100%|██████████| 391/391 [00:21<00:00, 18.14it/s, loss=1.24, acc=55.5]


Train Loss: 1.2356, Train Acc: 55.49%
Test Loss: 1.0722, Test Acc: 61.14%

Epoch 6/20


Training: 100%|██████████| 391/391 [00:21<00:00, 18.20it/s, loss=1.18, acc=57.7]


Train Loss: 1.1808, Train Acc: 57.66%
Test Loss: 1.0303, Test Acc: 62.78%

Epoch 7/20


Training: 100%|██████████| 391/391 [00:21<00:00, 18.22it/s, loss=1.14, acc=59.3]


Train Loss: 1.1390, Train Acc: 59.35%
Test Loss: 0.9903, Test Acc: 64.01%

Epoch 8/20


Training: 100%|██████████| 391/391 [00:21<00:00, 18.18it/s, loss=1.08, acc=61.5]


Train Loss: 1.0819, Train Acc: 61.53%
Test Loss: 0.9570, Test Acc: 66.43%

Epoch 9/20


Training: 100%|██████████| 391/391 [00:23<00:00, 16.46it/s, loss=1.05, acc=62.5]


Train Loss: 1.0493, Train Acc: 62.49%
Test Loss: 0.9028, Test Acc: 67.56%

Epoch 10/20


Training: 100%|██████████| 391/391 [00:22<00:00, 17.19it/s, loss=1.01, acc=64]  


Train Loss: 1.0051, Train Acc: 64.04%
Test Loss: 0.8543, Test Acc: 69.44%

Epoch 11/20


Training: 100%|██████████| 391/391 [00:21<00:00, 18.30it/s, loss=0.966, acc=65.5]


Train Loss: 0.9664, Train Acc: 65.54%
Test Loss: 0.8277, Test Acc: 70.35%

Epoch 12/20


Training: 100%|██████████| 391/391 [00:21<00:00, 18.21it/s, loss=0.929, acc=67.3]


Train Loss: 0.9294, Train Acc: 67.25%
Test Loss: 0.8018, Test Acc: 71.76%

Epoch 13/20


Training: 100%|██████████| 391/391 [00:21<00:00, 18.31it/s, loss=0.9, acc=67.9]  


Train Loss: 0.8977, Train Acc: 67.91%
Test Loss: 0.7747, Test Acc: 72.58%

Epoch 14/20


Training: 100%|██████████| 391/391 [00:21<00:00, 18.18it/s, loss=0.87, acc=69.2] 


Train Loss: 0.8679, Train Acc: 69.20%
Test Loss: 0.7672, Test Acc: 72.79%

Epoch 15/20


Training: 100%|██████████| 391/391 [00:21<00:00, 18.28it/s, loss=0.838, acc=70.3]


Train Loss: 0.8354, Train Acc: 70.26%
Test Loss: 0.7413, Test Acc: 73.39%

Epoch 16/20


Training: 100%|██████████| 391/391 [00:24<00:00, 16.02it/s, loss=0.81, acc=71.2] 


Train Loss: 0.8077, Train Acc: 71.20%
Test Loss: 0.7100, Test Acc: 75.35%

Epoch 17/20


Training: 100%|██████████| 391/391 [00:21<00:00, 18.20it/s, loss=0.798, acc=72]  


Train Loss: 0.7956, Train Acc: 71.97%
Test Loss: 0.7103, Test Acc: 75.06%

Epoch 18/20


Training: 100%|██████████| 391/391 [00:21<00:00, 18.18it/s, loss=0.781, acc=72.4]


Train Loss: 0.7787, Train Acc: 72.42%
Test Loss: 0.6864, Test Acc: 76.01%

Epoch 19/20


Training: 100%|██████████| 391/391 [00:21<00:00, 18.19it/s, loss=0.771, acc=72.6]


Train Loss: 0.7694, Train Acc: 72.55%
Test Loss: 0.6819, Test Acc: 76.19%

Epoch 20/20


Training: 100%|██████████| 391/391 [00:22<00:00, 17.42it/s, loss=0.762, acc=72.9]


Train Loss: 0.7619, Train Acc: 72.91%
Test Loss: 0.6784, Test Acc: 76.40%

=== Running experiment with augmentation=autoaugment ===

Epoch 1/20


Training: 100%|██████████| 391/391 [00:19<00:00, 20.46it/s, loss=1.92, acc=28.9]


Train Loss: 1.9082, Train Acc: 28.93%
Test Loss: 1.5291, Test Acc: 44.54%

Epoch 2/20


Training: 100%|██████████| 391/391 [00:21<00:00, 18.36it/s, loss=1.58, acc=42.6]


Train Loss: 1.5833, Train Acc: 42.60%
Test Loss: 1.3203, Test Acc: 51.80%

Epoch 3/20


Training: 100%|██████████| 391/391 [00:18<00:00, 20.93it/s, loss=1.49, acc=46.5]


Train Loss: 1.4808, Train Acc: 46.51%
Test Loss: 1.2155, Test Acc: 55.79%

Epoch 4/20


Training: 100%|██████████| 391/391 [00:19<00:00, 20.49it/s, loss=1.41, acc=49.8]


Train Loss: 1.4016, Train Acc: 49.78%
Test Loss: 1.1152, Test Acc: 59.78%

Epoch 5/20


Training: 100%|██████████| 391/391 [00:18<00:00, 20.61it/s, loss=1.34, acc=51.8]


Train Loss: 1.3395, Train Acc: 51.83%
Test Loss: 1.0525, Test Acc: 62.06%

Epoch 6/20


Training: 100%|██████████| 391/391 [00:18<00:00, 20.86it/s, loss=1.29, acc=53.6]


Train Loss: 1.2858, Train Acc: 53.62%
Test Loss: 1.0622, Test Acc: 61.84%

Epoch 7/20


Training: 100%|██████████| 391/391 [00:19<00:00, 20.41it/s, loss=1.24, acc=55.8]


Train Loss: 1.2383, Train Acc: 55.82%
Test Loss: 0.9990, Test Acc: 64.32%

Epoch 8/20


Training: 100%|██████████| 391/391 [00:19<00:00, 20.55it/s, loss=1.19, acc=57.8]


Train Loss: 1.1848, Train Acc: 57.76%
Test Loss: 0.9290, Test Acc: 66.43%

Epoch 9/20


Training: 100%|██████████| 391/391 [00:19<00:00, 20.38it/s, loss=1.15, acc=59.3]


Train Loss: 1.1464, Train Acc: 59.26%
Test Loss: 0.8774, Test Acc: 68.72%

Epoch 10/20


Training: 100%|██████████| 391/391 [00:18<00:00, 20.59it/s, loss=1.1, acc=60.9] 


Train Loss: 1.0960, Train Acc: 60.87%
Test Loss: 0.8666, Test Acc: 68.83%

Epoch 11/20


Training: 100%|██████████| 391/391 [00:19<00:00, 20.51it/s, loss=1.06, acc=62.5]


Train Loss: 1.0558, Train Acc: 62.45%
Test Loss: 0.8180, Test Acc: 71.16%

Epoch 12/20


Training: 100%|██████████| 391/391 [00:18<00:00, 20.65it/s, loss=1.02, acc=63.9]


Train Loss: 1.0186, Train Acc: 63.90%
Test Loss: 0.7851, Test Acc: 72.70%

Epoch 13/20


Training: 100%|██████████| 391/391 [00:18<00:00, 20.58it/s, loss=0.981, acc=65.2]


Train Loss: 0.9812, Train Acc: 65.21%
Test Loss: 0.7488, Test Acc: 73.62%

Epoch 14/20


Training: 100%|██████████| 391/391 [00:18<00:00, 20.86it/s, loss=0.951, acc=66.4]


Train Loss: 0.9466, Train Acc: 66.42%
Test Loss: 0.7175, Test Acc: 74.55%

Epoch 15/20


Training: 100%|██████████| 391/391 [00:18<00:00, 20.63it/s, loss=0.928, acc=67.3]


Train Loss: 0.9256, Train Acc: 67.26%
Test Loss: 0.6918, Test Acc: 75.54%

Epoch 16/20


Training: 100%|██████████| 391/391 [00:18<00:00, 20.61it/s, loss=0.897, acc=68.3]


Train Loss: 0.8943, Train Acc: 68.31%
Test Loss: 0.6916, Test Acc: 75.51%

Epoch 17/20


Training: 100%|██████████| 391/391 [00:18<00:00, 20.71it/s, loss=0.879, acc=69.2]


Train Loss: 0.8748, Train Acc: 69.16%
Test Loss: 0.6981, Test Acc: 75.46%

Epoch 18/20


Training: 100%|██████████| 391/391 [00:18<00:00, 20.65it/s, loss=0.867, acc=69.3]


Train Loss: 0.8647, Train Acc: 69.28%
Test Loss: 0.6661, Test Acc: 76.86%

Epoch 19/20


Training: 100%|██████████| 391/391 [00:21<00:00, 18.46it/s, loss=0.853, acc=70.1]


Train Loss: 0.8525, Train Acc: 70.12%
Test Loss: 0.6620, Test Acc: 76.89%

Epoch 20/20


Training: 100%|██████████| 391/391 [00:20<00:00, 18.77it/s, loss=0.849, acc=70]  


Train Loss: 0.8468, Train Acc: 69.98%
Test Loss: 0.6590, Test Acc: 76.92%

=== Running experiment with augmentation=cutmix ===

Epoch 1/20


Training: 100%|██████████| 391/391 [00:14<00:00, 26.52it/s, loss=1.71, acc=36.5]


Train Loss: 1.7019, Train Acc: 36.53%
Test Loss: 1.4299, Test Acc: 47.24%

Epoch 2/20


Training: 100%|██████████| 391/391 [00:14<00:00, 26.55it/s, loss=1.33, acc=52.1]


Train Loss: 1.3225, Train Acc: 52.08%
Test Loss: 1.1797, Test Acc: 57.22%

Epoch 3/20


Training: 100%|██████████| 391/391 [00:15<00:00, 26.07it/s, loss=1.19, acc=57.3]


Train Loss: 1.1904, Train Acc: 57.29%
Test Loss: 1.1094, Test Acc: 60.46%

Epoch 4/20


Training: 100%|██████████| 391/391 [00:15<00:00, 25.94it/s, loss=1.11, acc=60.4]


Train Loss: 1.1022, Train Acc: 60.38%
Test Loss: 1.0243, Test Acc: 64.31%

Epoch 5/20


Training: 100%|██████████| 391/391 [00:15<00:00, 26.04it/s, loss=1.04, acc=63.2]


Train Loss: 1.0334, Train Acc: 63.21%
Test Loss: 1.0283, Test Acc: 63.18%

Epoch 6/20


Training: 100%|██████████| 391/391 [00:14<00:00, 26.09it/s, loss=0.992, acc=64.8]


Train Loss: 0.9868, Train Acc: 64.78%
Test Loss: 0.9329, Test Acc: 66.31%

Epoch 7/20


Training: 100%|██████████| 391/391 [00:14<00:00, 26.17it/s, loss=0.922, acc=67.4]


Train Loss: 0.9175, Train Acc: 67.36%
Test Loss: 0.8798, Test Acc: 68.58%

Epoch 8/20


Training: 100%|██████████| 391/391 [00:15<00:00, 26.00it/s, loss=0.877, acc=69.1]


Train Loss: 0.8723, Train Acc: 69.07%
Test Loss: 0.8520, Test Acc: 69.41%

Epoch 9/20


Training: 100%|██████████| 391/391 [00:14<00:00, 26.10it/s, loss=0.827, acc=70.5]


Train Loss: 0.8229, Train Acc: 70.53%
Test Loss: 0.8092, Test Acc: 70.47%

Epoch 10/20


Training: 100%|██████████| 391/391 [00:14<00:00, 26.41it/s, loss=0.781, acc=72.7]


Train Loss: 0.7768, Train Acc: 72.66%
Test Loss: 0.7958, Test Acc: 71.54%

Epoch 11/20


Training: 100%|██████████| 391/391 [00:14<00:00, 26.48it/s, loss=0.734, acc=74.1]


Train Loss: 0.7306, Train Acc: 74.15%
Test Loss: 0.7568, Test Acc: 73.76%

Epoch 12/20


Training: 100%|██████████| 391/391 [00:14<00:00, 26.50it/s, loss=0.696, acc=75.8]


Train Loss: 0.6922, Train Acc: 75.79%
Test Loss: 0.7021, Test Acc: 75.15%

Epoch 13/20


Training: 100%|██████████| 391/391 [00:14<00:00, 26.42it/s, loss=0.659, acc=76.8]


Train Loss: 0.6571, Train Acc: 76.84%
Test Loss: 0.6791, Test Acc: 76.16%

Epoch 14/20


Training: 100%|██████████| 391/391 [00:14<00:00, 26.16it/s, loss=0.618, acc=78.4]


Train Loss: 0.6146, Train Acc: 78.41%
Test Loss: 0.6541, Test Acc: 77.27%

Epoch 15/20


Training: 100%|██████████| 391/391 [00:14<00:00, 26.27it/s, loss=0.586, acc=79.4]


Train Loss: 0.5831, Train Acc: 79.40%
Test Loss: 0.6321, Test Acc: 78.06%

Epoch 16/20


Training: 100%|██████████| 391/391 [00:15<00:00, 26.03it/s, loss=0.558, acc=80.3]


Train Loss: 0.5562, Train Acc: 80.25%
Test Loss: 0.6384, Test Acc: 77.92%

Epoch 17/20


Training: 100%|██████████| 391/391 [00:14<00:00, 26.55it/s, loss=0.54, acc=81.1] 


Train Loss: 0.5371, Train Acc: 81.09%
Test Loss: 0.6280, Test Acc: 78.59%

Epoch 18/20


Training: 100%|██████████| 391/391 [00:14<00:00, 26.34it/s, loss=0.521, acc=81.6]


Train Loss: 0.5181, Train Acc: 81.60%
Test Loss: 0.6179, Test Acc: 78.98%

Epoch 19/20


Training: 100%|██████████| 391/391 [00:14<00:00, 26.59it/s, loss=0.51, acc=81.9] 


Train Loss: 0.5073, Train Acc: 81.92%
Test Loss: 0.6086, Test Acc: 79.35%

Epoch 20/20


Training: 100%|██████████| 391/391 [00:14<00:00, 26.51it/s, loss=0.505, acc=82.1]


Train Loss: 0.5027, Train Acc: 82.11%
Test Loss: 0.6100, Test Acc: 79.30%

Final Best Test Accuracies with Different Augmentations:
default: 79.56%
strong: 76.40%
autoaugment: 76.92%
cutmix: 79.35%

Best Augmentation: default (Test Acc: 79.56%)


The default augmentation (simple cropping and flipping) outperforms the more complex augmentation techniques on CIFAR-10. This suggests that Vision Transformers may be sensitive to excessive perturbations on small-scale datasets. Strong augmentation techniques such as random rotations, color jittering, and random erasing actually hurt performance by creating training samples that deviate too far from the test distribution. AutoAugment, despite being optimized for CIFAR-10, also underperforms, likely because its policy was developed for CNNs rather than transformer architectures. CutMix performs nearly as well as the default because it preserves more of the original image information while still providing regularization benefits. The results indicate that for Vision Transformers on small images, simpler augmentation strategies that maintain image integrity work best.

### Position Embedding

In [9]:
import math 

class VisionTransformerWithPosEmbed(nn.Module):
    """Vision Transformer classifier with a selectable positional embedding.

    The image is cut into non-overlapping patches by a strided convolution,
    a class token is prepended, a positional embedding is added, and the
    sequence is passed through ``depth`` encoder layers. The class token's
    final representation is normalised and fed to a linear classification head.

    Args:
        img_size: Side length of the (square) input image, in pixels.
        patch_size: Side length of each square patch, in pixels.
        in_channels: Number of input image channels.
        num_classes: Size of the output logit vector.
        embed_dim: Width of every token embedding.
        depth: Number of encoder layers.
        n_heads: Passed through to each ``TransformerEncoderLayer``.
        mlp_dim: Passed through to each ``TransformerEncoderLayer``.
        dropout: Dropout probability applied after the positional embedding
            and passed through to each encoder layer.
        pos_embedding_type: One of ``'none'``, ``'learned_1d'``,
            ``'learned_2d'`` or ``'sinusoidal'``.

    Raises:
        ValueError: If ``pos_embedding_type`` is not one of the values above,
            or if ``'learned_2d'`` is requested with an odd ``embed_dim``
            (the embedding is split into two equal halves).
    """

    def __init__(
        self,
        img_size=32,
        patch_size=4,
        in_channels=3,
        num_classes=10,
        embed_dim=256,
        depth=6,
        n_heads=8,
        mlp_dim=512,
        dropout=0.1,
        pos_embedding_type='learned_1d'
    ):
        super().__init__()

        self.patch_size = patch_size
        self.num_patches = (img_size // patch_size) ** 2
        self.pos_embedding_type = pos_embedding_type

        # A convolution whose kernel and stride both equal the patch size
        # projects every non-overlapping patch to an embed_dim vector.
        self.patch_embed = nn.Conv2d(
            in_channels, embed_dim,
            kernel_size=patch_size, stride=patch_size
        )

        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))

        if pos_embedding_type == 'none':
            self.pos_embed = None
        elif pos_embedding_type == 'learned_1d':
            # One learned vector per sequence position (class token included).
            self.pos_embed = nn.Parameter(torch.zeros(1, self.num_patches + 1, embed_dim))
        elif pos_embedding_type == 'learned_2d':
            # Each patch position is the concatenation of a per-row half and a
            # per-column half, so the two halves must add up to embed_dim.
            # Fail here rather than with a broadcast error in forward().
            if embed_dim % 2 != 0:
                raise ValueError(
                    f"learned_2d positional embedding requires an even embed_dim, got {embed_dim}"
                )
            h_patches = w_patches = img_size // patch_size
            self.pos_embed_x = nn.Parameter(torch.zeros(1, h_patches, embed_dim // 2))
            self.pos_embed_y = nn.Parameter(torch.zeros(1, w_patches, embed_dim // 2))
        elif pos_embedding_type == 'sinusoidal':
            # Fixed (non-trainable) table; a buffer so it follows .to(device)
            # and is saved in the state dict.
            self.register_buffer('pos_embed', self._create_sinusoidal_embeddings(
                self.num_patches + 1, embed_dim
            ))
        else:
            raise ValueError(f"Unknown positional embedding type: {pos_embedding_type}")

        self.dropout = nn.Dropout(dropout)

        # TransformerEncoderLayer is defined elsewhere in this notebook.
        self.transformer_encoder = nn.ModuleList([
            TransformerEncoderLayer(embed_dim, n_heads, mlp_dim, dropout)
            for _ in range(depth)
        ])

        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, num_classes)

        self._init_weights()

    def _init_weights(self):
        """Initialise the class token, learned position tables and sub-modules."""
        nn.init.normal_(self.cls_token, std=0.02)
        if self.pos_embedding_type == 'learned_1d':
            nn.init.normal_(self.pos_embed, std=0.02)
        elif self.pos_embedding_type == 'learned_2d':
            nn.init.normal_(self.pos_embed_x, std=0.02)
            nn.init.normal_(self.pos_embed_y, std=0.02)

        # Visit every sub-module (recursively) with the per-module initialiser.
        self.apply(self._init_linear_weights)

    def _init_linear_weights(self, m):
        """Initialise one sub-module: N(0, 0.02) Linear weights, identity LayerNorm."""
        if isinstance(m, nn.Linear):
            nn.init.normal_(m.weight, std=0.02)
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, nn.LayerNorm):
            nn.init.zeros_(m.bias)
            nn.init.ones_(m.weight)

    def _create_sinusoidal_embeddings(self, seq_len, embed_dim):
        """Build a fixed sine/cosine position table.

        Even channels hold ``sin(pos * freq)`` and odd channels hold
        ``cos(pos * freq)``, with ``freq = 10000 ** (-2k / embed_dim)``.

        Args:
            seq_len: Number of positions.
            embed_dim: Number of channels per position; may be odd.

        Returns:
            Tensor of shape ``(1, seq_len, embed_dim)``.
        """
        position = torch.arange(seq_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2) * (-math.log(10000.0) / embed_dim))
        angles = position * div_term

        pos_embedding = torch.zeros(1, seq_len, embed_dim)
        pos_embedding[0, :, 0::2] = torch.sin(angles)
        # For an odd embed_dim there is one fewer odd channel than even
        # channel, so drop the surplus frequency; a no-op when embed_dim is even.
        pos_embedding[0, :, 1::2] = torch.cos(angles[:, :embed_dim // 2])

        return pos_embedding

    def _generate_2d_pos_embed(self, batch_size):
        """Assemble the 2-D learned position table for a batch.

        Patch ``(i, j)`` of the grid receives ``pos_embed_x[i]`` concatenated
        with ``pos_embed_y[j]``; the class token receives an all-zero vector.

        Args:
            batch_size: Size of the leading dimension of the returned tensor.

        Returns:
            Tensor of shape ``(batch_size, num_patches + 1, C)`` where ``C`` is
            the sum of the two halves' channel counts. It is an expanded view
            (every batch row shares the same storage), so it must not be
            modified in place.
        """
        h_patches = w_patches = int(math.sqrt(self.num_patches))

        # Build the table once with a batch dimension of 1; expand() creates
        # broadcast views instead of per-sample copies.
        row_emb = self.pos_embed_x.unsqueeze(2).expand(-1, -1, w_patches, -1)
        col_emb = self.pos_embed_y.unsqueeze(1).expand(-1, h_patches, -1, -1)

        pos_2d = torch.cat([row_emb, col_emb], dim=-1)
        pos_2d = pos_2d.reshape(1, self.num_patches, -1)

        # new_zeros keeps the dtype AND device of the learned tables, so a
        # half/double model is not silently promoted to a different precision.
        cls_pos = pos_2d.new_zeros(1, 1, pos_2d.size(-1))
        pos_embedding = torch.cat([cls_pos, pos_2d], dim=1)

        return pos_embedding.expand(batch_size, -1, -1)

    def forward(self, x):
        """Classify a batch of images.

        Args:
            x: Image batch fed to the patch convolution, i.e.
                ``(batch, in_channels, height, width)``.

        Returns:
            Logits of shape ``(batch, num_classes)``.
        """
        batch_size = x.size(0)

        # (B, C, H, W) -> (B, D, H', W') -> (B, D, N) -> (B, N, D)
        x = self.patch_embed(x)
        x = x.flatten(2)
        x = x.transpose(1, 2)

        # Prepend the class token to every sequence.
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)

        if self.pos_embedding_type == 'learned_2d':
            x = x + self._generate_2d_pos_embed(batch_size)
        elif self.pos_embedding_type in ('learned_1d', 'sinusoidal'):
            # Both are stored as a (1, N + 1, D) table that broadcasts over
            # the batch dimension.
            x = x + self.pos_embed
        # 'none': tokens are left without positional information.

        x = self.dropout(x)
        for layer in self.transformer_encoder:
            x = layer(x)
        x = self.norm(x)
        # Only the class token's representation is used for classification.
        x = x[:, 0]
        x = self.head(x)

        return x


In [10]:
def train_with_pos_embedding(pos_embedding_type, device, best_config, num_epochs=15):
    """Train and checkpoint a CIFAR-10 ViT for one positional-embedding variant.

    Args:
        pos_embedding_type: Forwarded to ``VisionTransformerWithPosEmbed`` as
            ``pos_embedding_type``; also used in log lines and in the
            checkpoint filename.
        device: Device the model is moved to and that is passed on to
            ``train_epoch`` / ``evaluate``.
        best_config: Mapping providing ``batch_size``, ``patch_size``,
            ``embed_dim``, ``depth``, ``n_heads``, ``mlp_dim``, ``lr`` and
            ``weight_decay``.
        num_epochs: Number of training epochs; also the ``T_max`` of the
            cosine learning-rate schedule.

    Returns:
        ``(results, model)``. ``results`` is a dict holding the per-epoch
        train/test losses and accuracies, the final and best test accuracy
        and the checkpoint path. ``model`` carries the weights of the epoch
        with the highest test accuracy (when any epoch scored above 0.0).
    """
    print(f"\n=== Training with {pos_embedding_type} positional embedding ===")

    batch_size = best_config['batch_size']

    # One normalization transform shared by the train and test pipelines.
    normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2470, 0.2435, 0.2616))

    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        normalize,
    ])

    train_dataset = torchvision.datasets.CIFAR10(
        root='./data', train=True, download=True, transform=transform_train)

    test_dataset = torchvision.datasets.CIFAR10(
        root='./data', train=False, download=True, transform=transform_test)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)

    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

    model = VisionTransformerWithPosEmbed(
        img_size=32,
        patch_size=best_config['patch_size'],
        in_channels=3,
        num_classes=10,
        embed_dim=best_config['embed_dim'],
        depth=best_config['depth'],
        n_heads=best_config['n_heads'],
        mlp_dim=best_config['mlp_dim'],
        dropout=0.1,
        pos_embedding_type=pos_embedding_type
    ).to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=best_config['lr'], weight_decay=best_config['weight_decay'])
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)

    train_losses, train_accs = [], []
    test_losses, test_accs = [], []
    best_test_acc = 0.0
    best_model_state = None

    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch+1}/{num_epochs}")

        train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
        test_loss, test_acc = evaluate(model, test_loader, criterion, device)

        scheduler.step()

        train_losses.append(train_loss)
        train_accs.append(train_acc)
        test_losses.append(test_loss)
        test_accs.append(test_acc)

        print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%')
        print(f'Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.2f}%')

        # NOTE: the "best" epoch is chosen on the test split itself, so
        # best_test_acc is an optimistic estimate of held-out accuracy.
        if test_acc > best_test_acc:
            best_test_acc = test_acc
            # Deep copy: state_dict() returns references to the live tensors.
            best_model_state = copy.deepcopy(model.state_dict())

    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    # exist_ok avoids the check-then-create race and matches the other
    # os.makedirs calls in this file.
    os.makedirs('saved_models', exist_ok=True)

    model_path = f'saved_models/vit_cifar10_{pos_embedding_type}_best.pth'
    # The model weights are those of the best epoch, whereas the optimizer and
    # scheduler states are whatever they were after the last epoch.
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'best_test_acc': best_test_acc,
        'config': best_config,
        'pos_embedding_type': pos_embedding_type
    }, model_path)

    print(f"Model saved to {model_path}")

    results = {
        'pos_embedding_type': pos_embedding_type,
        'train_losses': train_losses,
        'train_accs': train_accs,
        'test_losses': test_losses,
        'test_accs': test_accs,
        'final_test_acc': test_accs[-1],
        'best_test_acc': best_test_acc,
        'model_path': model_path
    }

    return results, model

def run_positional_embedding_experiment(device, best_config):
    """Train one model per positional-embedding variant and plot the comparison.

    Writes ``vit_positional_embedding_comparison.png`` (2x2 grid of loss and
    accuracy curves) and ``vit_positional_embedding_final_accuracy.png`` (bar
    chart of the best test accuracy), then prints a summary.

    Args:
        device: Passed through to ``train_with_pos_embedding``.
        best_config: Passed through to ``train_with_pos_embedding``.

    Returns:
        ``(all_results, all_models)``: two dicts keyed by variant name.
    """
    pos_embedding_types = ['none', 'learned_1d', 'learned_2d', 'sinusoidal']
    all_results = {}
    all_models = {}

    for variant in pos_embedding_types:
        all_results[variant], all_models[variant] = train_with_pos_embedding(
            variant, device, best_config)

    # (results key, y-axis label, panel title) for each cell of the 2x2 grid.
    panels = (
        ('train_losses', 'Training Loss', 'Training Loss vs. Epoch'),
        ('test_losses', 'Testing Loss', 'Testing Loss vs. Epoch'),
        ('train_accs', 'Training Accuracy (%)', 'Training Accuracy vs. Epoch'),
        ('test_accs', 'Testing Accuracy (%)', 'Testing Accuracy vs. Epoch'),
    )

    plt.figure(figsize=(15, 10))
    for slot, (metric, y_label, panel_title) in enumerate(panels, start=1):
        plt.subplot(2, 2, slot)
        for variant in pos_embedding_types:
            plt.plot(all_results[variant][metric], label=f'{variant}')
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.legend()
        plt.title(panel_title)

    plt.tight_layout()
    plt.savefig('vit_positional_embedding_comparison.png')
    plt.close()

    # Bar chart of the best test accuracy reached by each variant.
    best_accs = [all_results[variant]['best_test_acc'] for variant in pos_embedding_types]

    plt.figure(figsize=(10, 6))
    plt.bar(pos_embedding_types, best_accs)
    plt.xlabel('Positional Embedding Type')
    plt.ylabel('Test Accuracy (%)')
    plt.title('Best Test Accuracy by Positional Embedding Type')
    for bar_pos, acc in enumerate(best_accs):
        # Value label placed slightly above the top of each bar.
        plt.text(bar_pos, acc + 0.5, f'{acc:.2f}%', ha='center')
    plt.ylim(0, 100)
    plt.savefig('vit_positional_embedding_final_accuracy.png')
    plt.close()

    print("\nBest Test Accuracies by Positional Embedding Type:")
    for variant in pos_embedding_types:
        summary = all_results[variant]
        print(f"{variant}: {summary['best_test_acc']:.2f}%")
        print(f"   Model saved at: {summary['model_path']}")

    return all_results, all_models

In [12]:
# Seed the NumPy and PyTorch random number generators.
np.random.seed(42)
torch.manual_seed(42)

# Use CUDA when PyTorch reports it as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Model, batch and optimizer hyperparameters shared by every variant.
best_config = dict(
    patch_size=4,
    embed_dim=256,
    depth=6,
    n_heads=8,
    mlp_dim=512,
    batch_size=128,
    lr=0.0005,
    weight_decay=0.05,
)

# NOTE: the call returns a pair (per-variant results, per-variant models).
results = run_positional_embedding_experiment(device, best_config)

Using device: cuda

=== Training with none positional embedding ===

Epoch 1/15


Training: 100%|██████████| 391/391 [00:16<00:00, 23.09it/s, loss=1.73, acc=35.6]


Train Loss: 1.7218, Train Acc: 35.62%
Test Loss: 1.4960, Test Acc: 46.14%

Epoch 2/15


Training: 100%|██████████| 391/391 [00:14<00:00, 26.49it/s, loss=1.39, acc=50]  


Train Loss: 1.3806, Train Acc: 50.05%
Test Loss: 1.3152, Test Acc: 52.98%

Epoch 3/15


Training: 100%|██████████| 391/391 [00:14<00:00, 26.21it/s, loss=1.28, acc=54.1]


Train Loss: 1.2699, Train Acc: 54.14%
Test Loss: 1.2383, Test Acc: 55.20%

Epoch 4/15


Training: 100%|██████████| 391/391 [00:15<00:00, 26.00it/s, loss=1.21, acc=56.6]


Train Loss: 1.2040, Train Acc: 56.57%
Test Loss: 1.1774, Test Acc: 57.90%

Epoch 5/15


Training: 100%|██████████| 391/391 [00:14<00:00, 26.22it/s, loss=1.15, acc=59.1]


Train Loss: 1.1434, Train Acc: 59.07%
Test Loss: 1.1617, Test Acc: 58.16%

Epoch 6/15


Training: 100%|██████████| 391/391 [00:14<00:00, 26.25it/s, loss=1.1, acc=60.8] 


Train Loss: 1.0909, Train Acc: 60.79%
Test Loss: 1.1239, Test Acc: 59.89%

Epoch 7/15


Training: 100%|██████████| 391/391 [00:14<00:00, 26.09it/s, loss=1.05, acc=62.7]


Train Loss: 1.0437, Train Acc: 62.71%
Test Loss: 1.1242, Test Acc: 59.51%

Epoch 8/15


Training: 100%|██████████| 391/391 [00:14<00:00, 26.09it/s, loss=0.998, acc=64.8]


Train Loss: 0.9931, Train Acc: 64.81%
Test Loss: 1.0100, Test Acc: 64.11%

Epoch 9/15


Training: 100%|██████████| 391/391 [00:15<00:00, 26.06it/s, loss=0.953, acc=66.1]


Train Loss: 0.9478, Train Acc: 66.13%
Test Loss: 0.9865, Test Acc: 64.73%

Epoch 10/15


Training: 100%|██████████| 391/391 [00:15<00:00, 25.94it/s, loss=0.908, acc=67.7]


Train Loss: 0.9037, Train Acc: 67.66%
Test Loss: 0.9448, Test Acc: 66.72%

Epoch 11/15


Training: 100%|██████████| 391/391 [00:15<00:00, 25.92it/s, loss=0.869, acc=69]  


Train Loss: 0.8645, Train Acc: 69.02%
Test Loss: 0.9338, Test Acc: 67.02%

Epoch 12/15


Training: 100%|██████████| 391/391 [00:15<00:00, 25.99it/s, loss=0.831, acc=70.4]


Train Loss: 0.8263, Train Acc: 70.38%
Test Loss: 0.9129, Test Acc: 67.74%

Epoch 13/15


Training: 100%|██████████| 391/391 [00:14<00:00, 26.12it/s, loss=0.807, acc=71.4]


Train Loss: 0.8029, Train Acc: 71.40%
Test Loss: 0.8766, Test Acc: 69.08%

Epoch 14/15


Training: 100%|██████████| 391/391 [00:15<00:00, 25.90it/s, loss=0.781, acc=72.3]


Train Loss: 0.7771, Train Acc: 72.30%
Test Loss: 0.8730, Test Acc: 69.31%

Epoch 15/15


Training: 100%|██████████| 391/391 [00:15<00:00, 25.82it/s, loss=0.777, acc=72.6]


Train Loss: 0.7730, Train Acc: 72.63%
Test Loss: 0.8711, Test Acc: 69.60%
Model saved to saved_models/vit_cifar10_none_best.pth

=== Training with learned_1d positional embedding ===

Epoch 1/15


Training: 100%|██████████| 391/391 [00:15<00:00, 25.66it/s, loss=1.71, acc=37.2]


Train Loss: 1.6966, Train Acc: 37.22%
Test Loss: 1.4089, Test Acc: 48.57%

Epoch 2/15


Training: 100%|██████████| 391/391 [00:15<00:00, 26.01it/s, loss=1.32, acc=52.7]


Train Loss: 1.3097, Train Acc: 52.66%
Test Loss: 1.1801, Test Acc: 58.40%

Epoch 3/15


Training: 100%|██████████| 391/391 [00:14<00:00, 26.08it/s, loss=1.18, acc=57.9]


Train Loss: 1.1703, Train Acc: 57.92%
Test Loss: 1.0797, Test Acc: 61.29%

Epoch 4/15


Training: 100%|██████████| 391/391 [00:15<00:00, 25.90it/s, loss=1.09, acc=61]  


Train Loss: 1.0875, Train Acc: 61.04%
Test Loss: 1.0239, Test Acc: 63.43%

Epoch 5/15


Training: 100%|██████████| 391/391 [00:15<00:00, 25.99it/s, loss=1.01, acc=64]  


Train Loss: 1.0067, Train Acc: 63.98%
Test Loss: 0.9349, Test Acc: 66.55%

Epoch 6/15


Training: 100%|██████████| 391/391 [00:15<00:00, 25.60it/s, loss=0.948, acc=66.3]


Train Loss: 0.9433, Train Acc: 66.30%
Test Loss: 0.9673, Test Acc: 65.39%

Epoch 7/15


Training: 100%|██████████| 391/391 [00:15<00:00, 25.85it/s, loss=0.892, acc=68.3]


Train Loss: 0.8870, Train Acc: 68.30%
Test Loss: 0.8679, Test Acc: 69.00%

Epoch 8/15


Training: 100%|██████████| 391/391 [00:15<00:00, 25.85it/s, loss=0.831, acc=70.6]


Train Loss: 0.8263, Train Acc: 70.58%
Test Loss: 0.7956, Test Acc: 71.83%

Epoch 9/15


Training: 100%|██████████| 391/391 [00:15<00:00, 25.96it/s, loss=0.782, acc=72.5]


Train Loss: 0.7781, Train Acc: 72.45%
Test Loss: 0.7840, Test Acc: 72.95%

Epoch 10/15


Training: 100%|██████████| 391/391 [00:15<00:00, 25.54it/s, loss=0.73, acc=74.3] 


Train Loss: 0.7263, Train Acc: 74.30%
Test Loss: 0.7454, Test Acc: 73.54%

Epoch 11/15


Training: 100%|██████████| 391/391 [00:14<00:00, 26.08it/s, loss=0.693, acc=75.7]


Train Loss: 0.6897, Train Acc: 75.71%
Test Loss: 0.7014, Test Acc: 75.47%

Epoch 12/15


Training: 100%|██████████| 391/391 [00:15<00:00, 26.00it/s, loss=0.653, acc=77]  


Train Loss: 0.6497, Train Acc: 76.99%
Test Loss: 0.6833, Test Acc: 76.33%

Epoch 13/15


Training: 100%|██████████| 391/391 [00:14<00:00, 26.08it/s, loss=0.619, acc=78.3]


Train Loss: 0.6156, Train Acc: 78.27%
Test Loss: 0.6647, Test Acc: 77.24%

Epoch 14/15


Training: 100%|██████████| 391/391 [00:15<00:00, 25.97it/s, loss=0.598, acc=78.9]


Train Loss: 0.5950, Train Acc: 78.86%
Test Loss: 0.6606, Test Acc: 77.32%

Epoch 15/15


Training: 100%|██████████| 391/391 [00:15<00:00, 25.77it/s, loss=0.596, acc=79.2]


Train Loss: 0.5931, Train Acc: 79.17%
Test Loss: 0.6541, Test Acc: 77.57%
Model saved to saved_models/vit_cifar10_learned_1d_best.pth

=== Training with learned_2d positional embedding ===

Epoch 1/15


Training: 100%|██████████| 391/391 [00:15<00:00, 26.02it/s, loss=1.7, acc=37.1] 


Train Loss: 1.6934, Train Acc: 37.14%
Test Loss: 1.4421, Test Acc: 48.56%

Epoch 2/15


Training: 100%|██████████| 391/391 [00:15<00:00, 25.41it/s, loss=1.3, acc=53.2] 


Train Loss: 1.2947, Train Acc: 53.16%
Test Loss: 1.1890, Test Acc: 57.18%

Epoch 3/15


Training: 100%|██████████| 391/391 [00:15<00:00, 25.92it/s, loss=1.16, acc=58.6]


Train Loss: 1.1538, Train Acc: 58.61%
Test Loss: 1.1344, Test Acc: 59.13%

Epoch 4/15


Training: 100%|██████████| 391/391 [00:15<00:00, 25.29it/s, loss=1.07, acc=61.7]


Train Loss: 1.0639, Train Acc: 61.74%
Test Loss: 1.0063, Test Acc: 64.00%

Epoch 5/15


Training: 100%|██████████| 391/391 [00:15<00:00, 25.58it/s, loss=0.992, acc=64.7]


Train Loss: 0.9864, Train Acc: 64.70%
Test Loss: 0.9095, Test Acc: 67.96%

Epoch 6/15


Training: 100%|██████████| 391/391 [00:15<00:00, 25.78it/s, loss=0.93, acc=67.1] 


Train Loss: 0.9250, Train Acc: 67.08%
Test Loss: 0.8688, Test Acc: 69.06%

Epoch 7/15


Training: 100%|██████████| 391/391 [00:15<00:00, 25.83it/s, loss=0.869, acc=69.2]


Train Loss: 0.8649, Train Acc: 69.15%
Test Loss: 0.8337, Test Acc: 70.30%

Epoch 8/15


Training: 100%|██████████| 391/391 [00:14<00:00, 26.17it/s, loss=0.816, acc=71.2]


Train Loss: 0.8118, Train Acc: 71.22%
Test Loss: 0.7743, Test Acc: 72.52%

Epoch 9/15


Training: 100%|██████████| 391/391 [00:15<00:00, 25.91it/s, loss=0.758, acc=73.4]


Train Loss: 0.7541, Train Acc: 73.38%
Test Loss: 0.7401, Test Acc: 74.24%

Epoch 10/15


Training: 100%|██████████| 391/391 [00:15<00:00, 26.05it/s, loss=0.718, acc=74.7]


Train Loss: 0.7139, Train Acc: 74.69%
Test Loss: 0.7153, Test Acc: 74.84%

Epoch 11/15


Training: 100%|██████████| 391/391 [00:15<00:00, 25.89it/s, loss=0.673, acc=76]  


Train Loss: 0.6692, Train Acc: 76.01%
Test Loss: 0.6756, Test Acc: 76.51%

Epoch 12/15


Training: 100%|██████████| 391/391 [00:14<00:00, 26.21it/s, loss=0.636, acc=77.6]


Train Loss: 0.6325, Train Acc: 77.59%
Test Loss: 0.6547, Test Acc: 77.67%

Epoch 13/15


Training: 100%|██████████| 391/391 [00:14<00:00, 26.12it/s, loss=0.611, acc=78.5]


Train Loss: 0.6074, Train Acc: 78.50%
Test Loss: 0.6359, Test Acc: 78.20%

Epoch 14/15


Training: 100%|██████████| 391/391 [00:15<00:00, 26.05it/s, loss=0.586, acc=79.2]


Train Loss: 0.5833, Train Acc: 79.24%
Test Loss: 0.6314, Test Acc: 78.36%

Epoch 15/15


Training: 100%|██████████| 391/391 [00:15<00:00, 25.71it/s, loss=0.577, acc=79.8]


Train Loss: 0.5742, Train Acc: 79.76%
Test Loss: 0.6241, Test Acc: 78.64%
Model saved to saved_models/vit_cifar10_learned_2d_best.pth

=== Training with sinusoidal positional embedding ===

Epoch 1/15


Training: 100%|██████████| 391/391 [00:15<00:00, 25.78it/s, loss=1.68, acc=38.4]


Train Loss: 1.6698, Train Acc: 38.43%
Test Loss: 1.3834, Test Acc: 49.33%

Epoch 2/15


Training: 100%|██████████| 391/391 [00:14<00:00, 26.28it/s, loss=1.36, acc=51]  


Train Loss: 1.3515, Train Acc: 51.00%
Test Loss: 1.2199, Test Acc: 55.47%

Epoch 3/15


Training: 100%|██████████| 391/391 [00:14<00:00, 26.30it/s, loss=1.23, acc=55.5]


Train Loss: 1.2285, Train Acc: 55.55%
Test Loss: 1.1707, Test Acc: 58.17%

Epoch 4/15


Training: 100%|██████████| 391/391 [00:14<00:00, 26.48it/s, loss=1.16, acc=58.6]


Train Loss: 1.1515, Train Acc: 58.63%
Test Loss: 1.0475, Test Acc: 62.55%

Epoch 5/15


Training: 100%|██████████| 391/391 [00:14<00:00, 26.30it/s, loss=1.07, acc=61.6]


Train Loss: 1.0666, Train Acc: 61.65%
Test Loss: 1.0047, Test Acc: 64.05%

Epoch 6/15


Training: 100%|██████████| 391/391 [00:15<00:00, 25.94it/s, loss=1, acc=64.2]    


Train Loss: 0.9970, Train Acc: 64.21%
Test Loss: 0.9205, Test Acc: 67.57%

Epoch 7/15


Training: 100%|██████████| 391/391 [00:15<00:00, 25.55it/s, loss=0.941, acc=66.4]


Train Loss: 0.9358, Train Acc: 66.38%
Test Loss: 0.8956, Test Acc: 68.63%

Epoch 8/15


Training: 100%|██████████| 391/391 [00:15<00:00, 25.95it/s, loss=0.888, acc=68.5]


Train Loss: 0.8833, Train Acc: 68.51%
Test Loss: 0.8599, Test Acc: 68.99%

Epoch 9/15


Training: 100%|██████████| 391/391 [00:14<00:00, 26.25it/s, loss=0.839, acc=70.4]


Train Loss: 0.8347, Train Acc: 70.38%
Test Loss: 0.8018, Test Acc: 71.76%

Epoch 10/15


Training: 100%|██████████| 391/391 [00:15<00:00, 25.96it/s, loss=0.784, acc=72.3]


Train Loss: 0.7803, Train Acc: 72.26%
Test Loss: 0.7903, Test Acc: 72.33%

Epoch 11/15


Training: 100%|██████████| 391/391 [00:14<00:00, 26.14it/s, loss=0.742, acc=74]  


Train Loss: 0.7385, Train Acc: 73.98%
Test Loss: 0.7679, Test Acc: 73.03%

Epoch 12/15


Training: 100%|██████████| 391/391 [00:15<00:00, 26.02it/s, loss=0.704, acc=75.1]


Train Loss: 0.7004, Train Acc: 75.14%
Test Loss: 0.7201, Test Acc: 74.86%

Epoch 13/15


Training: 100%|██████████| 391/391 [00:15<00:00, 25.86it/s, loss=0.676, acc=75.9]


Train Loss: 0.6730, Train Acc: 75.92%
Test Loss: 0.7087, Test Acc: 75.27%

Epoch 14/15


Training: 100%|██████████| 391/391 [00:15<00:00, 25.95it/s, loss=0.656, acc=76.9]


Train Loss: 0.6524, Train Acc: 76.85%
Test Loss: 0.6977, Test Acc: 75.62%

Epoch 15/15


Training: 100%|██████████| 391/391 [00:15<00:00, 25.58it/s, loss=0.645, acc=77.3]


Train Loss: 0.6417, Train Acc: 77.32%
Test Loss: 0.6944, Test Acc: 75.85%
Model saved to saved_models/vit_cifar10_sinusoidal_best.pth

Best Test Accuracies by Positional Embedding Type:
none: 69.60%
   Model saved at: saved_models/vit_cifar10_none_best.pth
learned_1d: 77.57%
   Model saved at: saved_models/vit_cifar10_learned_1d_best.pth
learned_2d: 78.64%
   Model saved at: saved_models/vit_cifar10_learned_2d_best.pth
sinusoidal: 75.85%
   Model saved at: saved_models/vit_cifar10_sinusoidal_best.pth


The positional embedding comparison shows that learned embeddings (2D at 78.64%, 1D at 77.57%) noticeably outperform the fixed sinusoidal pattern (75.85%) and using no positional information at all (69.60%). This makes sense: learned embeddings can adapt to the specific spatial relationships in CIFAR-10 images, whereas the model without positional embeddings cannot tell patches apart by location and therefore struggles to capture spatial structure.
For augmentations, the graph shows all methods eventually converge to similar accuracies, with default and cutmix slightly outperforming others. This suggests Vision Transformers are somewhat robust to augmentation choice on CIFAR-10, but simpler augmentations that maintain original image structure lead to more stable learning curves and marginally better performance.

In [11]:
class VisionTransformerForVisualization(nn.Module):
    """Wrap a trained ViT and record the attention weights of every layer.

    The forward pass re-runs the wrapped model's pipeline step by step so the
    attention weights returned by each layer's ``attention`` module can be
    collected.

    NOTE(review): the per-layer recomputation (attention -> dropout ->
    residual -> norm1 -> mlp -> dropout -> residual -> norm2) must mirror the
    encoder layer's own ``forward``, which is defined elsewhere -- confirm.
    """

    def __init__(self, pretrained_model):
        super().__init__()
        self.pretrained_model = pretrained_model
        # Attention weights of the most recent forward pass, one entry per layer.
        self.attention_weights = []

    def _add_positional_embedding(self, x, batch_size):
        """Add the wrapped model's positional embedding the way its forward does."""
        base = self.pretrained_model
        pos_type = getattr(base, 'pos_embedding_type', None)

        if pos_type == 'none':
            return x
        if pos_type == 'learned_2d':
            # The 2D variant builds its table on the fly rather than storing it
            # in ``pos_embed``; reading only ``pos_embed`` would drop it.
            return x + base._generate_2d_pos_embed(batch_size)

        # 'learned_1d', 'sinusoidal', or a model without ``pos_embedding_type``.
        pos_embed = getattr(base, 'pos_embed', None)
        if pos_embed is not None:
            x = x + pos_embed
        return x

    def forward(self, x, return_attention=True):
        """Run the wrapped model while collecting attention weights.

        Args:
            x: Input batch; dim 0 is the batch size.
            return_attention: When true, also return the list of attention
                weights (one entry per encoder layer).

        Returns:
            ``(output, attention_weights)`` if ``return_attention`` is true,
            otherwise just ``output``.
        """
        batch_size = x.size(0)
        base = self.pretrained_model

        x = base.patch_embed(x)
        x = x.flatten(2)
        x = x.transpose(1, 2)

        cls_tokens = base.cls_token.expand(batch_size, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)

        x = self._add_positional_embedding(x, batch_size)
        x = base.dropout(x)

        # Start from a fresh list so weights from earlier calls do not pile up.
        self.attention_weights = []
        for layer in base.transformer_encoder:
            attn_output, attn_weights = layer.attention(x, x, x)
            self.attention_weights.append(attn_weights)

            x = x + layer.dropout(attn_output)
            x = layer.norm1(x)

            mlp_output = layer.mlp(x)
            x = x + layer.dropout(mlp_output)
            x = layer.norm2(x)

        x = base.norm(x)

        # The classification head reads the first token (the CLS slot).
        output = base.head(x[:, 0])

        if return_attention:
            return output, self.attention_weights
        return output

def visualize_attention_maps(model, image, device, index, save_path='attention_maps'):
    """Save heat maps of the CLS token's attention for one image.

    Three PNG files are written into ``save_path``:
    ``attention_heads_last_layer_{index}.png`` (one panel per head of the
    last layer, at most 8), ``average_attention_last_layer_{index}.png``
    (last layer averaged over heads) and ``attention_across_layers_{index}.png``
    (head-averaged map per layer, at most 6).

    Args:
        model: Wrapper exposing ``pretrained_model.patch_size`` and returning
            ``(output, attention_weights)`` when called with
            ``return_attention=True``.
        image: A tensor (a 3-D tensor gets a leading batch dim) or an
            ``np.ndarray``, which is permuted with ``(2, 0, 1)`` -- presumably
            from height/width/channel order; verify against callers.
        device: Device the image is moved to before the forward pass.
        index: Tag inserted into the output filenames.
        save_path: Output directory; created if missing.
    """
    patch_size = model.pretrained_model.patch_size

    model.eval()
    with torch.no_grad():
        if isinstance(image, np.ndarray):
            image = torch.from_numpy(image).permute(2, 0, 1).unsqueeze(0).float()
        if image.dim() == 3:
            image = image.unsqueeze(0)
        image = image.to(device)

        _, attention_weights = model(image, return_attention=True)

    # Side of the patch grid, taken from the actual input instead of a
    # hard-coded 32; assumes a square image (32 // patch_size for CIFAR-10).
    grid = image.shape[-1] // patch_size

    os.makedirs(save_path, exist_ok=True)

    # Attention of the last layer for the first sample in the batch.
    last_attn = attention_weights[-1][0]
    n_heads = last_attn.shape[0]

    fig, axes = plt.subplots(2, 4, figsize=(16, 8))
    axes = axes.flatten()

    shown_heads = min(n_heads, len(axes))
    for i in range(shown_heads):
        cls_to_all = last_attn[i, 0, 1:]  # Exclude CLS->CLS attention
        cls_to_all = cls_to_all.reshape(grid, grid)

        axes[i].imshow(cls_to_all.cpu().numpy(), cmap='hot')
        axes[i].set_title(f'Head {i+1}')
        axes[i].axis('off')
    # Hide panels left empty when the model has fewer heads than panels.
    for ax in axes[shown_heads:]:
        ax.axis('off')

    plt.tight_layout()
    plt.savefig(os.path.join(save_path, f'attention_heads_last_layer_{index}.png'))
    plt.close()

    # Head-averaged attention from the CLS row to every patch.
    avg_attn = last_attn.mean(0)[0, 1:]
    avg_attn = avg_attn.reshape(grid, grid)

    plt.figure(figsize=(8, 8))
    plt.imshow(avg_attn.cpu().numpy(), cmap='hot')
    plt.title('Average Attention from CLS Token (Last Layer)')
    plt.colorbar()
    plt.savefig(os.path.join(save_path, f'average_attention_last_layer_{index}.png'))
    plt.close()

    n_layers = len(attention_weights)
    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    axes = axes.flatten()

    shown_layers = min(n_layers, len(axes))
    for i in range(shown_layers):
        layer_attn = attention_weights[i][0].mean(0)[0, 1:]
        layer_attn = layer_attn.reshape(grid, grid)

        axes[i].imshow(layer_attn.cpu().numpy(), cmap='hot')
        axes[i].set_title(f'Layer {i+1}')
        axes[i].axis('off')
    # Hide panels left empty when the model has fewer layers than panels.
    for ax in axes[shown_layers:]:
        ax.axis('off')

    plt.tight_layout()
    plt.savefig(os.path.join(save_path, f'attention_across_layers_{index}.png'))
    plt.close()

In [12]:
def attention_rollout(attention_weights, device):
    """Combine per-layer attention into a single rollout matrix.

    For every layer the attention is averaged over dim 1 (the head axis of
    the weights produced by the attention modules in this file), the
    identity is added to account for the residual connection, rows are
    re-normalized to sum to one, and the result is multiplied onto the
    running product.

    Args:
        attention_weights: Non-empty sequence of attention tensors, one per
            layer, whose last two dims are ``(tokens, tokens)``.
        device: Device on which the identity matrix is created; should be
            the device the attention tensors live on.

    Returns:
        The accumulated rollout, one ``(tokens, tokens)`` matrix per sample.
    """
    n_tokens = attention_weights[0].shape[-1]
    # Built once, directly on the target device, instead of being recreated
    # and copied over for every layer.
    identity = torch.eye(n_tokens, device=device)

    rollout = identity
    for layer_attn in attention_weights:
        fused = layer_attn.mean(1) + identity
        fused = fused / fused.sum(dim=-1, keepdim=True)
        rollout = torch.matmul(fused, rollout)

    return rollout

def visualize_attention_rollout(model, image, device, index, save_path='attention_rollout'):
    """Save attention-rollout heat maps for one image.

    Writes ``attention_rollout_{index}.png`` (heat map with colorbar) and
    ``attention_rollout_comparison_{index}.png`` (de-normalized input next to
    the heat map) into ``save_path``.

    Args:
        model: Wrapper exposing ``pretrained_model.patch_size`` and returning
            ``(output, attention_weights)`` when called with
            ``return_attention=True``.
        image: Tensor or ``np.ndarray``; a 3-D input gets a leading batch dim.
        device: Device used for the forward pass and the rollout.
        index: Tag inserted into the output filenames.
        save_path: Output directory; created if missing.
    """
    img_size = 32
    grid = img_size // model.pretrained_model.patch_size

    model.eval()
    with torch.no_grad():
        if isinstance(image, np.ndarray):
            image = torch.from_numpy(image).permute(2, 0, 1).unsqueeze(0).float()
        if image.dim() == 3:
            image = image.unsqueeze(0)
        image = image.to(device)

        _, attention_weights = model(image, return_attention=True)

    # Rollout of the first sample; row 0 is the CLS token, and column 0
    # (CLS -> CLS) is dropped before reshaping to the patch grid.
    first_rollout = attention_rollout(attention_weights, device)[0]
    heatmap = first_rollout[0, 1:].reshape(grid, grid).cpu().numpy()

    os.makedirs(save_path, exist_ok=True)

    plt.figure(figsize=(8, 8))
    plt.imshow(heatmap, cmap='hot')
    plt.title('Attention Rollout from CLS Token')
    plt.colorbar()
    plt.savefig(os.path.join(save_path, f'attention_rollout_{index}.png'))
    plt.close()

    fig, (image_ax, rollout_ax) = plt.subplots(1, 2, figsize=(12, 6))

    # Invert the per-channel normalization (x * std + mean) and clip to
    # [0, 1] so the picture can be displayed.
    mean = np.array([0.4914, 0.4822, 0.4465])
    std = np.array([0.2470, 0.2435, 0.2616])
    channels_last = image[0].cpu().numpy().transpose(1, 2, 0)
    displayable = np.clip(channels_last * std + mean, 0, 1)

    image_ax.imshow(displayable)
    image_ax.set_title('Original Image')
    image_ax.axis('off')

    rollout_ax.imshow(heatmap, cmap='hot')
    rollout_ax.set_title('Attention Rollout')
    rollout_ax.axis('off')

    plt.tight_layout()
    plt.savefig(os.path.join(save_path, f'attention_rollout_comparison_{index}.png'))
    plt.close()


In [24]:
def visualize_positional_embeddings(model, index, save_path='positional_embeddings'):
    """Plot pairwise dot products and L2 distances of the positional embeddings.

    Writes ``pos_embed_similarity_{index}.png`` and
    ``pos_embed_distance_{index}.png`` into ``save_path``. Models without a
    positional embedding are skipped with a message instead of raising.

    Args:
        model: Wrapper exposing the trained network as ``pretrained_model``.
        index: Tag inserted into the output filenames.
        save_path: Output directory; created if missing.
    """
    base = model.pretrained_model
    pos_embed = getattr(base, 'pos_embed', None)
    if pos_embed is None and getattr(base, 'pos_embedding_type', None) == 'learned_2d':
        # The 2D variant assembles its table on demand; batch size 1 is enough.
        pos_embed = base._generate_2d_pos_embed(1)
    if pos_embed is None:
        print("Model has no positional embedding; skipping positional embedding plots.")
        return

    pos_embed = pos_embed[0]
    # Raw (un-normalized) dot product between every pair of positions.
    sim_matrix = torch.matmul(pos_embed, pos_embed.transpose(0, 1))

    os.makedirs(save_path, exist_ok=True)

    plt.figure(figsize=(10, 8))
    plt.imshow(sim_matrix.detach().cpu().numpy(), cmap='viridis')
    plt.title('Positional Embedding Similarity Matrix')
    plt.xlabel('Position')
    plt.ylabel('Position')
    plt.colorbar()
    plt.tight_layout()
    plt.savefig(os.path.join(save_path, f'pos_embed_similarity_{index}.png'))
    plt.close()

    # Pairwise Euclidean (p=2) distance between positions.
    dist_matrix = torch.cdist(pos_embed, pos_embed, p=2)

    plt.figure(figsize=(10, 8))
    plt.imshow(dist_matrix.detach().cpu().numpy(), cmap='viridis')
    plt.title('Positional Embedding Distance Matrix')
    plt.xlabel('Position')
    plt.ylabel('Position')
    plt.colorbar()
    plt.tight_layout()
    plt.savefig(os.path.join(save_path, f'pos_embed_distance_{index}.png'))
    plt.close()


def run_all_visualizations(model_path, device, index, sample_idx=100):
    """Load a saved checkpoint and produce every visualization for one test image.

    Args:
        model_path: Path to a checkpoint containing ``'config'``,
            ``'pos_embedding_type'`` and ``'model_state_dict'``.
        device: Device the checkpoint is mapped to and the model runs on.
        index: Tag inserted into the output filenames.
        sample_idx: Position of the CIFAR-10 test image to visualize
            (defaults to 100).
    """
    checkpoint = torch.load(model_path, map_location=device)

    best_config = checkpoint['config']
    pos_embedding_type = checkpoint['pos_embedding_type']
    print(pos_embedding_type)

    # Rebuild the architecture from the stored config, then restore the weights.
    pretrained_model = VisionTransformerWithPosEmbed(
        img_size=32,
        patch_size=best_config['patch_size'],
        in_channels=3,
        num_classes=10,
        embed_dim=best_config['embed_dim'],
        depth=best_config['depth'],
        n_heads=best_config['n_heads'],
        mlp_dim=best_config['mlp_dim'],
        dropout=0.1,
        pos_embedding_type=pos_embedding_type
    ).to(device)

    pretrained_model.load_state_dict(checkpoint['model_state_dict'])
    vis_model = VisionTransformerForVisualization(pretrained_model)

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2470, 0.2435, 0.2616)),
    ])

    test_dataset = torchvision.datasets.CIFAR10(
        root='./data', train=False, download=True, transform=transform_test)

    # Only the image is needed; the label is discarded.
    sample_image, _ = test_dataset[sample_idx]

    print(f"Visualizing model with {pos_embedding_type} positional embedding...")
    print("Visualizing attention maps...")
    visualize_attention_maps(vis_model, sample_image, device, index, 'attention_maps')

    print("Visualizing attention rollout...")
    visualize_attention_rollout(vis_model, sample_image, device, index, 'attention_rollout')

    if pos_embedding_type == 'none':
        # A model trained without positional embeddings has nothing to plot.
        print("Skipping positional embedding visualization (model has none).")
    else:
        print("Visualizing positional embeddings...")
        visualize_positional_embeddings(vis_model, index, 'positional_embeddings')

    print("All visualizations completed!")


In [25]:
# Produce every visualization for the checkpoint saved by the learned-1D run.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
index = 'learned_1d'  # tag inserted into the output filenames
model_path = './saved_models/vit_cifar10_learned_1d_best.pth'
run_all_visualizations(model_path, device, index)

learned_1d
Visualizing model with learned_1d positional embedding...
Visualizing attention maps...
Visualizing attention rollout...
Visualizing positional embeddings...
All visualizations completed!


In [30]:
# Produce every visualization for the checkpoint saved by the sinusoidal run.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
index = 'sinusoidal'  # tag inserted into the output filenames
model_path = './saved_models/vit_cifar10_sinusoidal_best.pth'
run_all_visualizations(model_path, device, index)

sinusoidal
Visualizing model with sinusoidal positional embedding...
Visualizing attention maps...
Visualizing attention rollout...
Visualizing positional embeddings...
All visualizations completed!


### Attention maps:
Image 1 shows attention patterns across six transformer layers, progressing from broad, diffuse attention in early layers (Layer 1) to more focused, object-specific attention in later layers (Layers 4-6). Image 2 displays different specializations across eight attention heads in the final layer, with some heads focusing on specific features (like edges or shapes) while others attend to broader patterns. Image 3 shows the average attention from the CLS token to all patches, highlighting the model's focus on salient image regions, with brighter areas (particularly at coordinates (4,4), (2,6), and (7,6)) receiving significantly more attention for classification.

### Attention rollout:
The attention rollout visualizations reveal how the ViT model integrates information across transformer layers when classifying images. In Image 1, we see a dog image (left) and its corresponding attention rollout map (right), which shows the model strongly focuses on the central region where the dog's face and body are located, with the brightest spots (white/yellow) highlighting the most important features for classification. Image 2 presents the same information as a heatmap with grid coordinates, clearly showing intense focus on patches (3,3) and (3,4) which correspond to the dog's main features, while maintaining moderate attention (red areas) to surrounding contextual regions. Attention rollout differs from regular attention maps as it accounts for information flow through all layers simultaneously by propagating attention in a principled way, providing a holistic view of how the model makes classification decisions.

### Positional Embeddings
Image 1 shows the L2 distance matrix between positional embeddings. The dark diagonal represents zero distance between identical positions, while increasing brightness (yellow) indicates greater distance between positions that are far apart. The periodic patterns and grid-like structures suggest the model has learned systematic spatial relationships where nearby positions have similar embeddings.
Image 2 displays the dot product similarity matrix of the same embeddings, essentially showing the inverse relationship. Higher similarity (green/yellow) appears between positions close to each other, while lower similarity (dark blue/purple) appears between distant positions. The model exhibits a gradual transition from highly similar adjacent positions to dissimilar distant ones.
Both visualizations confirm that the ViT has successfully learned meaningful 1D positional encodings that respect spatial locality - positions that are close together have similar embeddings, while those far apart have more distinct representations. This spatial awareness is crucial for the model to understand image structure despite the token sequence being flattened.
