### The training code for the ViT is available on [Colab](https://colab.research.google.com/drive/1WZY56BOF6jdAL6uDY3mCh9kcJggncxqg?usp=sharing)

### Import Required Libraries

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, Dataset

from Models.Vit import ViTTransformerForClassification

### Set Up Model Configuration

In [3]:
class ViTconfig:
    """Static hyperparameters handed to ``ViTTransformerForClassification``.

    All values are plain class attributes, so the class itself (not an
    instance) is passed around as the configuration object.
    """

    model_type = "ViT"

    # Input geometry.
    image_size = 224
    patch_size = 16
    in_channels = 3
    # Patches per image: the number of non-overlapping patch positions along
    # one side, squared (14 ** 2 == 196 for the values above).
    patch_length = ((image_size - patch_size) // patch_size + 1) ** 2

    # Transformer dimensions.
    embed_dim = 768
    num_heads = 12
    num_layers = 12

    # Size of the classification output
    # (presumably the 101 Food-101 labels -- confirm against the dataset).
    num_classes = 101

### Load and Prepare Dataset

In [2]:
from datasets import load_dataset

# Load the "ethz/food101" dataset through Hugging Face `datasets`.
# The result is a DatasetDict with `train` and `validation` splits.
ds = load_dataset("ethz/food101")

In [4]:
# Show the available splits, their features and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 75750
    })
    validation: Dataset({
        features: ['image', 'label'],
        num_rows: 25250
    })
})

In [5]:
from torch.utils.data import DataLoader
from torchvision import transforms
from transformers import ViTFeatureExtractor
from datasets import load_dataset

In [6]:
# Take the per-channel normalisation statistics from the preprocessing config
# of the pretrained checkpoint instead of hard-coding them.
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
normalize = transforms.Normalize(
    mean=feature_extractor.image_mean,
    std=feature_extractor.image_std,
)

# Image -> fixed 224x224 size -> tensor -> normalised tensor.
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        normalize,
    ]
)



In [8]:
from torch.utils.data import Dataset as TorchDataset

class HFDatasetWrapper(TorchDataset):
    """Expose an indexable collection of image/label records as a torch Dataset.

    Every record must provide an ``'image'`` entry that has a ``convert``
    method and a ``'label'`` entry. ``transform`` is applied to the
    RGB-converted image on each access.
    """

    def __init__(self, hf_dataset, transform):
        """Store the underlying dataset and the per-image transform."""
        self.transform = transform
        self.dataset = hf_dataset

    def __len__(self):
        """Return the number of records in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``{'pixel_values': transformed image, 'labels': label}``."""
        record = self.dataset[idx]
        # Force three channels so every sample has the same layout.
        rgb_image = record['image'].convert('RGB')
        return {
            'pixel_values': self.transform(rgb_image),
            'labels': record['label'],
        }

In [9]:
# Wrap both splits so that each sample is transformed lazily on access.
train_dataset = HFDatasetWrapper(ds['train'], transform)
val_dataset = HFDatasetWrapper(ds['validation'], transform)

# Only the training loader shuffles; validation keeps the dataset order.
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=32,
    shuffle=False,
    num_workers=4,
)


### Define Model, Loss Criterion and Optimizer

In [7]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the classifier from the static config and move it to the device
# before the optimizer captures its parameters.
model = ViTTransformerForClassification(ViTconfig)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(
    model.parameters(),
    lr=1e-4,
    weight_decay=1e-2,
)

### Define Trainer Class

In [11]:
# Settings handed to `Trainer`. The Trainer class in this notebook reads only
# "device", "criterion", "optimizer" and "epochs"; "batch_size",
# "learning_rate" and "weight_decay" repeat the values already used when the
# DataLoaders and the optimizer were created and are informational here.
TrainerConfig = {
    "epochs": 10,
    "batch_size": 32,
    "learning_rate": 1e-4,
    "weight_decay": 1e-2,
    "device": device,
    "criterion": criterion,
    "optimizer": optimizer
}

In [18]:
from sklearn.metrics import accuracy_score, f1_score
class Trainer:
    """Minimal train/evaluate helper for a classification model.

    Batches are expected to be mappings with a ``'pixel_values'`` entry (the
    model input) and a ``'labels'`` entry (the integer class targets).
    """

    def __init__(self, Trainerconfig):
        """Read the training setup from a config mapping.

        Args:
            Trainerconfig: mapping providing the keys ``"device"``,
                ``"optimizer"``, ``"criterion"`` and ``"epochs"``.
        """
        self.device = Trainerconfig["device"]
        self.optimizer = Trainerconfig["optimizer"]
        self.criterion = Trainerconfig["criterion"]
        self.epochs = Trainerconfig["epochs"]

    def train(self, model, train_loader, val_loader, log_interval=10):
        """Run one pass over ``train_loader`` and update ``model`` in place.

        Args:
            model: module to optimise; called as ``model(inputs)``.
            train_loader: iterable of training batches.
            val_loader: iterable of validation batches used for periodic logging.
            log_interval: evaluate and print metrics every ``log_interval``
                optimisation steps. ``0`` or ``None`` disables logging. Each
                log line runs a full pass over ``val_loader``, so small values
                are expensive.

        Returns:
            float: mean training loss over the processed batches
            (``0.0`` when ``train_loader`` yields nothing).
        """
        model.train()
        total_loss = 0.0
        num_steps = 0
        for num_steps, batch in enumerate(train_loader, start=1):
            inputs = batch['pixel_values'].to(self.device)
            labels = batch['labels'].to(self.device)

            self.optimizer.zero_grad()
            logits = model(inputs)
            loss = self.criterion(logits, labels)
            loss.backward()
            self.optimizer.step()

            batch_loss = loss.item()
            total_loss += batch_loss

            # Log on the current step count, not on the (constant) loader length.
            if log_interval and num_steps % log_interval == 0:
                # Evaluate once and reuse both metrics.
                accuracy, f1 = self.evaluate(model, val_loader)
                print({"step": num_steps, "accuracy": accuracy, "f1_score": f1, "loss": batch_loss})
                # Make sure the remaining steps run in training mode even if
                # evaluation left the model in eval mode.
                model.train()

        return total_loss / num_steps if num_steps else 0.0

    def evaluate(self, model, val_loader):
        """Compute accuracy and macro-F1 of ``model`` over ``val_loader``.

        The model is switched to eval mode for the duration of the call and
        its previous train/eval mode is restored afterwards.

        Returns:
            tuple: ``(accuracy, macro_f1)`` as returned by scikit-learn's
            ``accuracy_score`` and ``f1_score(average='macro')``.
        """
        was_training = model.training
        model.eval()
        preds, targets = [], []
        try:
            with torch.no_grad():
                for batch in val_loader:
                    inputs = batch['pixel_values'].to(self.device)
                    logits = model(inputs)
                    predictions = torch.argmax(logits, dim=1)

                    preds.extend(predictions.cpu().tolist())
                    # Labels are only compared on the host, so they are not
                    # moved to the compute device first.
                    targets.extend(batch['labels'].cpu().tolist())
        finally:
            model.train(was_training)

        return accuracy_score(targets, preds), f1_score(targets, preds, average='macro')

In [19]:
# Build the trainer from the shared config. The instance is named `train`,
# so one epoch is run with `train.train(...)`.
train = Trainer(TrainerConfig)

In [20]:
# One call to `train.train` is one full pass over the training loader.
# NOTE(review): the recorded run of this cell stopped during epoch 1 with a
# CUDA out-of-memory error on a GPU reporting 3.69 GiB total capacity; a
# smaller DataLoader batch size is the likely remedy -- to be confirmed.
num_epochs = TrainerConfig["epochs"]
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train.train(model, train_loader, val_loader, log_interval=100)

Epoch 1/10


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 3.69 GiB of which 21.12 MiB is free. Process 3511 has 1000.00 MiB memory in use. Including non-PyTorch memory, this process has 2.51 GiB memory in use. Of the allocated memory 2.30 GiB is allocated by PyTorch, and 127.22 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)