In [None]:
#Problem 1, ViT from scratch

import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np

# Device configuration: use the GPU when PyTorch can see one, otherwise the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyperparameters
image_size = 32        # side length (pixels) every image is resized to
patch_size = 4         # side length (pixels) of each square patch
num_classes = 100      # number of output logits (CIFAR-100 labels)
num_epochs = 50
batch_size = 64
learning_rate = .0001
num_heads = 4
num_layers = 4
hidden_dim = 256       # token embedding width (passed to the model as embed_dim)
mlp_dim = 512          # hidden width of each encoder MLP

# Data preparation
# Normalize subtracts a per-channel mean and divides by a per-channel std.
# The std values are dataset-specific, so the mean has to be the matching
# per-channel CIFAR-100 mean; combining them with a flat 0.5 leaves every
# channel off-centre after normalisation.
transform = transforms.Compose([
    transforms.Resize((image_size, image_size)),
    transforms.ToTensor(),
    transforms.Normalize((0.5071, 0.4865, 0.4409), (0.2675, 0.2568, 0.2761))
])

# CIFAR-100 dataset (downloaded into ./data on first use)
train_dataset = torchvision.datasets.CIFAR100(root='./data', train=True,
                                           download=True, transform=transform)
test_dataset = torchvision.datasets.CIFAR100(root='./data', train=False,
                                          download=True, transform=transform)

# Only the training split is shuffled; evaluation order stays fixed
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# Patch embedding layer
class PatchEmbedding(nn.Module):
    """Cut an image into non-overlapping square patches and embed each one.

    A convolution whose kernel size and stride both equal ``patch_size``
    produces one ``embed_dim``-wide vector per patch.

    Args:
        image_size: Side length used to compute ``num_patches``.
        patch_size: Side length of each patch (kernel size and stride).
        in_channels: Number of input channels.
        embed_dim: Width of each patch embedding.
    """

    def __init__(self, image_size, patch_size, in_channels=3, embed_dim=256):
        super().__init__()
        patches_per_side = image_size // patch_size
        self.num_patches = patches_per_side * patches_per_side
        self.proj = nn.Conv2d(
            in_channels=in_channels,
            out_channels=embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
        )

    def forward(self, x):
        """Map ``[B, C, H, W]`` images to ``[B, num_patches, embed_dim]`` tokens."""
        feature_map = self.proj(x)                  # [B, embed_dim, H', W']
        tokens = feature_map.flatten(start_dim=2)   # [B, embed_dim, H' * W']
        return tokens.permute(0, 2, 1)              # [B, H' * W', embed_dim]

# Transformer Encoder
class TransformerEncoder(nn.Module):
    """Pre-norm Transformer encoder block.

    Applies LayerNorm -> multi-head self-attention -> residual add, then
    LayerNorm -> MLP -> residual add. Input and output are both
    ``[batch, tokens, embed_dim]``.

    Args:
        embed_dim: Size of the feature (last) dimension of the tokens.
        num_heads: Number of attention heads (``nn.MultiheadAttention``
            requires it to divide ``embed_dim``).
        mlp_dim: Hidden width of the feed-forward sub-layer.
        dropout: Dropout probability used inside attention and the MLP.
    """

    def __init__(self, embed_dim, num_heads, mlp_dim, dropout=0.1):
        super().__init__()
        self.layer_norm1 = nn.LayerNorm(embed_dim)
        # batch_first=True is essential: the tokens reach this block as
        # [B, N, D], whereas nn.MultiheadAttention defaults to [N, B, D].
        # With the default, attention is computed across the batch axis, so
        # images in a batch attend to each other instead of patches within
        # an image attending to each other.
        self.attention = nn.MultiheadAttention(
            embed_dim, num_heads, dropout=dropout, batch_first=True
        )
        self.layer_norm2 = nn.LayerNorm(embed_dim)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, mlp_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(mlp_dim, embed_dim),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        """Run one encoder block over ``x`` of shape ``[B, N, embed_dim]``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        x2 = self.layer_norm1(x)
        # The attention map itself is discarded, so skip computing it.
        attention_output, _ = self.attention(x2, x2, x2, need_weights=False)
        x = x + attention_output
        x2 = self.layer_norm2(x)
        mlp_output = self.mlp(x2)
        x = x + mlp_output
        return x

# Vision Transformer
class VisionTransformer(nn.Module):
    """Vision Transformer classifier.

    Pipeline: patch embedding -> prepend class token -> add position
    embedding -> dropout -> stack of encoder blocks -> LayerNorm -> linear
    head applied to the class token.

    Args:
        image_size: Side length of the (square) input images.
        patch_size: Side length of each patch.
        num_classes: Number of output logits.
        embed_dim: Token embedding width.
        num_heads: Attention heads per encoder block.
        num_layers: Number of encoder blocks.
        mlp_dim: Hidden width of each encoder MLP.
        dropout: Dropout probability for the embeddings and encoder blocks.
    """

    def __init__(self, image_size, patch_size, num_classes, embed_dim,
                 num_heads, num_layers, mlp_dim, dropout=0.1):
        super().__init__()
        # Inputs are embedded as 3-channel images.
        self.patch_embed = PatchEmbedding(image_size, patch_size, 3, embed_dim)

        # Learnable class token and position table, both starting at zero.
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        patches_per_side = image_size // patch_size
        sequence_length = patches_per_side ** 2 + 1  # +1 slot for the class token
        self.pos_embed = nn.Parameter(torch.zeros(1, sequence_length, embed_dim))
        self.dropout = nn.Dropout(dropout)

        encoder_blocks = []
        for _ in range(num_layers):
            encoder_blocks.append(
                TransformerEncoder(embed_dim, num_heads, mlp_dim, dropout)
            )
        self.transformer = nn.ModuleList(encoder_blocks)

        self.layer_norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """Return ``[B, num_classes]`` logits for a batch of images ``x``."""
        batch = x.size(0)
        patch_tokens = self.patch_embed(x)

        # One copy of the class token per sample, placed at sequence index 0.
        class_tokens = self.cls_token.expand(batch, -1, -1)
        tokens = torch.cat([class_tokens, patch_tokens], dim=1)
        tokens = self.dropout(tokens + self.pos_embed)

        for block in self.transformer:
            tokens = block(tokens)

        tokens = self.layer_norm(tokens)
        # Classify from the class token only.
        return self.head(tokens[:, 0])

# Build the ViT from the hyperparameters above, then move it to the chosen device
model = VisionTransformer(
    image_size,
    patch_size,
    num_classes,
    embed_dim=hidden_dim,
    num_heads=num_heads,
    num_layers=num_layers,
    mlp_dim=mlp_dim,
)
model = model.to(device)

# Objective (cross-entropy over the class logits) and optimiser (Adam over all weights)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

# Training loop
def train():
    """Train the module-level ``model`` for ``num_epochs`` passes over ``train_loader``.

    Uses the module-level ``criterion``, ``optimizer`` and ``device``. Shape
    diagnostics are printed once, for the very first batch.

    Returns:
        tuple: ``(train_losses, train_accuracies)`` with one entry per epoch —
        the mean of the per-batch losses and the accuracy in percent.
    """
    model.train()
    train_losses, train_accuracies = [], []

    for epoch in range(num_epochs):
        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}')
        loss_sum, hits, seen = 0.0, 0, 0

        for step, (images, labels) in enumerate(progress_bar):
            images, labels = images.to(device), labels.to(device)
            is_first_batch = step == 0 and epoch == 0

            # One-off sanity print of the inputs
            if is_first_batch:
                print(f"Input images shape: {images.shape}")
                print(f"Labels shape: {labels.shape}")
                print(f"Labels values: {labels[:10]}")  # Print first 10 labels

            outputs = model(images)

            # One-off sanity print of the outputs
            if is_first_batch:
                print(f"Model outputs shape: {outputs.shape}")
                print(f"Expected outputs shape: {torch.Size([batch_size, num_classes])}")

            loss = criterion(outputs, labels)

            # Standard step: clear old gradients, backpropagate, update weights
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Running statistics for this epoch
            predicted = outputs.detach().max(dim=1).indices
            seen += labels.size(0)
            hits += (predicted == labels).sum().item()
            batch_loss = loss.item()
            loss_sum += batch_loss

            progress_bar.set_postfix({
                'loss': f'{batch_loss:.4f}',
                'acc': f'{100 * hits / seen:.2f}%'
            })

        # Epoch summary: loss is averaged over batches, accuracy over samples
        epoch_loss = loss_sum / len(train_loader)
        epoch_acc = 100 * hits / seen
        train_losses.append(epoch_loss)
        train_accuracies.append(epoch_acc)

        print(f'Epoch {epoch+1}/{num_epochs} - Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.2f}%')

    return train_losses, train_accuracies

# Test the model
def test():
    """Evaluate the module-level ``model`` once over ``test_loader``.

    Runs in eval mode with gradients disabled, using the module-level
    ``criterion`` and ``device``.

    Returns:
        tuple: ``(test_losses, test_accuracies)`` — two single-element lists
        holding the mean per-batch loss and the accuracy in percent.
    """
    model.eval()
    hits, seen, loss_sum = 0, 0, 0.0

    with torch.no_grad():
        progress_bar = tqdm(test_loader, desc='Testing')

        for images, labels in progress_bar:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss_sum += criterion(outputs, labels).item()

            predicted = outputs.detach().max(dim=1).indices
            seen += labels.size(0)
            hits += (predicted == labels).sum().item()

            # Show the accuracy accumulated so far
            progress_bar.set_postfix({'accuracy': f'{100 * hits / seen:.2f}%'})

        final_loss = loss_sum / len(test_loader)
        final_acc = 100 * hits / seen

        print(f'Final Test Loss: {final_loss:.4f}, Final Test Accuracy: {final_acc:.2f}%')

    # Lists (not scalars) so the plotting helper can treat them as series
    return [final_loss], [final_acc]

# Visualize training and testing results
def visualize_results(train_losses, train_accuracies, test_losses, test_accuracies):
    """Plot training curves with the test result marked at the last epoch.

    Draws two side-by-side panels (loss, accuracy), saves the figure to
    ``vit_training_results.png`` and then shows it.

    Args:
        train_losses: Per-epoch training losses.
        train_accuracies: Per-epoch training accuracies (percent).
        test_losses: Test loss values, drawn as a red marker at the last epoch index.
        test_accuracies: Test accuracy values, drawn the same way.
    """
    fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: loss
    loss_ax.plot(train_losses, label='Training Loss')
    loss_ax.plot([len(train_losses) - 1], test_losses, 'ro', label='Test Loss')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Loss')
    loss_ax.set_title('Training and Test Loss')
    loss_ax.legend()
    loss_ax.grid(True)

    # Right panel: accuracy
    acc_ax.plot(train_accuracies, label='Training Accuracy')
    acc_ax.plot([len(train_accuracies) - 1], test_accuracies, 'ro', label='Test Accuracy')
    acc_ax.set_xlabel('Epoch')
    acc_ax.set_ylabel('Accuracy (%)')
    acc_ax.set_title('Training and Test Accuracy')
    acc_ax.legend()
    acc_ax.grid(True)

    fig.tight_layout()
    fig.savefig('vit_training_results.png')
    plt.show()

# Entry point: train, evaluate once on the test split, then plot both
if __name__ == '__main__':
    print("Training started...")
    train_losses, train_accuracies = train()

    print("\nTesting started...")
    test_losses, test_accuracies = test()

    print("\nVisualizing results...")
    visualize_results(
        train_losses=train_losses,
        train_accuracies=train_accuracies,
        test_losses=test_losses,
        test_accuracies=test_accuracies,
    )


# Stale cell output from an earlier run (the code above now calls torch.optim.Adam): AttributeError: module 'torch.optim' has no attribute 'AdamX'

In [None]:
#problem 2: swin finetuning and "from scratch" comparison
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from transformers import SwinForImageClassification, SwinConfig, AutoImageProcessor
from tqdm import tqdm
import time
import pandas as pd
from copy import deepcopy

# Pick the accelerator when PyTorch can see one, otherwise stay on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Hyperparameters shared by every run in this cell
num_epochs, batch_size = 5, 32
learning_rate = 2e-5  # kept small for fine-tuning
image_size = 224  # side length images are resized to (the checkpoints below are the "-224" variants)

# One entry per experiment: which checkpoint/architecture to use, whether to
# load its pretrained weights, and whether to train only the classifier head
models_config = {
    "swin-tiny-pretrained": dict(
        name="microsoft/swin-tiny-patch4-window7-224",
        pretrained=True,
        freeze_backbone=True,
    ),
    "swin-small-pretrained": dict(
        name="microsoft/swin-small-patch4-window7-224",
        pretrained=True,
        freeze_backbone=True,
    ),
    "swin-tiny-scratch": dict(
        name="microsoft/swin-tiny-patch4-window7-224",
        pretrained=False,
        freeze_backbone=False,
    ),
}

# Column-oriented results table; each training run appends one value per column
results = {column: [] for column in ("model", "epoch_train_time", "test_accuracy")}

# CIFAR-100 dataset preparation
def prepare_data(model_name):
    """Build CIFAR-100 train/test loaders preprocessed for ``model_name``.

    Images are resized to ``image_size`` and normalised with the mean/std
    published by the checkpoint's image processor.

    Args:
        model_name: Hub identifier passed to ``AutoImageProcessor.from_pretrained``.

    Returns:
        tuple: ``(train_loader, test_loader)``; only the training loader shuffles.
    """
    processor = AutoImageProcessor.from_pretrained(model_name)

    preprocessing = transforms.Compose([
        transforms.Resize((image_size, image_size)),
        transforms.ToTensor(),
        transforms.Normalize(mean=processor.image_mean, std=processor.image_std),
    ])

    def load_split(is_train):
        # Same root and preprocessing for both splits; downloads on first use
        return torchvision.datasets.CIFAR100(
            root='./data', train=is_train, download=True, transform=preprocessing
        )

    train_split = load_split(True)
    test_split = load_split(False)

    return (
        DataLoader(dataset=train_split, batch_size=batch_size, shuffle=True),
        DataLoader(dataset=test_split, batch_size=batch_size, shuffle=False),
    )

# Create and configure model
def setup_model(config):
    """Create a 100-class Swin classifier and its Adam optimiser from ``config``.

    Args:
        config: Mapping with keys ``"name"`` (hub identifier), ``"pretrained"``
            (load checkpoint weights vs. random initialisation) and
            ``"freeze_backbone"`` (train only the classifier head).

    Returns:
        tuple: ``(model, optimizer)`` with the model already on ``device``.
    """
    checkpoint = config["name"]

    if config["pretrained"]:
        print(f"Loading pretrained {checkpoint}...")
        # The classifier head is replaced by a 100-way one, hence the
        # tolerated size mismatch against the checkpoint.
        model = SwinForImageClassification.from_pretrained(
            checkpoint,
            num_labels=100,
            ignore_mismatched_sizes=True,
        )
    else:
        print(f"Initializing {checkpoint} from scratch...")
        # Same architecture as the checkpoint, but randomly initialised weights
        architecture = SwinConfig.from_pretrained(checkpoint, num_labels=100)
        model = SwinForImageClassification(architecture)
    model = model.to(device)

    if config["freeze_backbone"]:
        print("Freezing backbone parameters...")
        model.swin.requires_grad_(False)
        model.classifier.requires_grad_(True)
        # Only the classifier head is handed to the optimiser
        trainable_params = model.classifier.parameters()
    else:
        print("Training all parameters...")
        trainable_params = model.parameters()

    optimizer = torch.optim.Adam(trainable_params, lr=learning_rate)
    return model, optimizer

# Training function
def train_model(model, optimizer, train_loader, test_loader, model_name):
    """Train ``model`` for ``num_epochs`` epochs, evaluate it, and log the run.

    Appends one row (model name, mean epoch time, test accuracy) to the
    module-level ``results`` table.

    Args:
        model: Classifier whose forward pass returns an object with ``.logits``.
        optimizer: Optimiser already bound to the parameters to be trained.
        train_loader: Iterable of ``(images, labels)`` training batches.
        test_loader: Iterable of ``(images, labels)`` evaluation batches.
        model_name: Label stored in the ``"model"`` column of ``results``.

    Returns:
        tuple: ``(avg_epoch_time, accuracy)`` — mean seconds per training
        epoch (0.0 if no epoch ran) and test accuracy in percent.
    """
    criterion = nn.CrossEntropyLoss()
    epoch_times = []
    # CUDA kernels run asynchronously, so the clock must only be read after
    # the device has drained its queue; otherwise work spills across epochs.
    on_cuda = device.type == 'cuda'

    for epoch in range(num_epochs):
        # NOTE(review): train mode also applies to a frozen backbone, which
        # keeps any dropout/stochastic depth in it active — confirm intended.
        model.train()
        if on_cuda:
            torch.cuda.synchronize()
        # perf_counter is monotonic, unlike the wall clock returned by time.time
        start_time = time.perf_counter()
        progress_bar = tqdm(train_loader, desc=f'Epoch [{epoch+1}/{num_epochs}]')

        for i, (images, labels) in enumerate(progress_bar):
            images = images.to(device)
            labels = labels.to(device)

            # Forward pass
            outputs = model(images).logits
            loss = criterion(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Refresh the displayed loss only every 100 steps (loss.item() forces a sync)
            if (i+1) % 100 == 0:
                progress_bar.set_postfix({'loss': loss.item()})

        if on_cuda:
            torch.cuda.synchronize()
        epoch_time = time.perf_counter() - start_time
        epoch_times.append(epoch_time)
        print(f"Epoch {epoch+1} training time: {epoch_time:.2f} seconds")

    # Calculate average epoch time (guarded so num_epochs == 0 cannot divide by zero)
    avg_epoch_time = sum(epoch_times) / len(epoch_times) if epoch_times else 0.0

    # Test the model
    accuracy = test_model(model, test_loader)

    # Store results
    results["model"].append(model_name)
    results["epoch_train_time"].append(avg_epoch_time)
    results["test_accuracy"].append(accuracy)

    return avg_epoch_time, accuracy

# Testing function
def test_model(model, test_loader):
    """Return the top-1 accuracy (percent) of ``model`` over ``test_loader``.

    The model is switched to eval mode and evaluated with gradients
    disabled; the accuracy is also printed.
    """
    model.eval()
    hits, seen = 0, 0

    with torch.no_grad():
        for images, labels in tqdm(test_loader, desc='Testing'):
            images, labels = images.to(device), labels.to(device)
            logits = model(images).logits
            predicted = logits.detach().max(dim=1).indices
            seen += labels.size(0)
            hits += (predicted == labels).sum().item()

    accuracy = 100 * hits / seen
    print(f'Test Accuracy: {accuracy:.2f}%')
    return accuracy

# Main function
def main():
    """Run every experiment in ``models_config`` and report the comparison.

    For each entry: build the loaders and model, train and evaluate it, and
    release GPU memory. Afterwards the collected ``results`` are printed as
    a table, written to ``swin_comparison_results.csv`` and summarised.
    """
    banner = '=' * 50

    for model_name, config in models_config.items():
        print(f"\n{banner}")
        print(f"Training {model_name}")
        print(banner)

        train_loader, test_loader = prepare_data(config["name"])
        model, optimizer = setup_model(config)
        avg_epoch_time, accuracy = train_model(
            model, optimizer, train_loader, test_loader, model_name
        )

        print(f"Model: {model_name}")
        print(f"Average epoch training time: {avg_epoch_time:.2f} seconds")
        print(f"Test accuracy: {accuracy:.2f}%")

        # Drop the references before asking CUDA to release its cached blocks
        del model, optimizer
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Tabulate, show and persist the collected results
    results_df = pd.DataFrame(results)
    print("\nResults Summary:")
    print(results_df.to_string(index=False))

    results_df.to_csv("swin_comparison_results.csv", index=False)
    print("Results saved to swin_comparison_results.csv")

    def metric(run_name, column):
        # First recorded value of `column` for the run called `run_name`
        matching_rows = results_df[results_df['model'] == run_name]
        return matching_rows[column].values[0]

    print("\nKey Findings for Report:")
    print("1. Fine-tuning vs. Training from Scratch:")
    ft_acc = metric('swin-tiny-pretrained', 'test_accuracy')
    scratch_acc = metric('swin-tiny-scratch', 'test_accuracy')
    print(f"   - Accuracy difference: {ft_acc - scratch_acc:.2f}%")

    print("2. Swin-Tiny vs. Swin-Small:")
    tiny_acc = metric('swin-tiny-pretrained', 'test_accuracy')
    small_acc = metric('swin-small-pretrained', 'test_accuracy')
    print(f"   - Accuracy difference: {small_acc - tiny_acc:.2f}%")

    # Mean seconds per epoch for each run
    tiny_time = metric('swin-tiny-pretrained', 'epoch_train_time')
    small_time = metric('swin-small-pretrained', 'epoch_train_time')
    scratch_time = metric('swin-tiny-scratch', 'epoch_train_time')
    print("3. Training Time Comparison:")
    print(f"   - Swin-Tiny (pretrained): {tiny_time:.2f} seconds/epoch")
    print(f"   - Swin-Small (pretrained): {small_time:.2f} seconds/epoch")
    print(f"   - Swin-Tiny (scratch): {scratch_time:.2f} seconds/epoch")


if __name__ == '__main__':
    main()