In [22]:
import sys
import os
from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
load_dotenv()

# ROOT_PATH is expected to hold the project root directory; later cells
# import `utils.*` from it and build the config file path relative to it.
ROOT_DIR_PATH = os.environ.get('ROOT_PATH')
if ROOT_DIR_PATH is None:
    # Fail here with an actionable message instead of the opaque TypeError
    # that os.path.abspath(None) would raise.
    raise RuntimeError(
        "ROOT_PATH is not set. Define it in the environment or in a .env file "
        "so the project root can be added to sys.path."
    )

# Add the project root to sys.path only once, so re-running this cell does
# not keep appending duplicate entries.
_root_abs_path = os.path.abspath(ROOT_DIR_PATH)
if _root_abs_path not in sys.path:
    sys.path.append(_root_abs_path)  # Adds root directory to sys.path

# Model Architecture and Data Loading

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# config numbers
class ViTConfig:
    """Plain attribute container for Vision Transformer hyper-parameters.

    Every constructor argument is stored unchanged as an attribute of the
    same name. The names match the keys that ``VisionTransformer`` reads from
    the ``"model"`` section of the loaded config.

    Args:
        img_size: Input image side length (default 32).
        patch_size: Side length of one square patch (default 4).
        in_channels: Number of image channels (default 3).
        emb_size: Embedding width per token (default 64).
        depth: Number of encoder blocks (default 6).
        num_heads: Attention heads per block (default 4).
        mlp_ratio: Hidden MLP width as a multiple of ``emb_size`` (default 4.0).
        num_classes: Number of output classes (default 10).
        dropout: Dropout probability (default 0.1).
    """

    def __init__(self,
                 img_size=32,
                 patch_size=4,
                 in_channels=3,
                 emb_size=64,
                 depth=6,
                 num_heads=4,
                 mlp_ratio=4.0,
                 num_classes=10,
                 dropout=0.1):
        # Attributes are assigned in declaration order.
        hyperparams = {
            "img_size": img_size,
            "patch_size": patch_size,
            "in_channels": in_channels,
            "emb_size": emb_size,
            "depth": depth,
            "num_heads": num_heads,
            "mlp_ratio": mlp_ratio,
            "num_classes": num_classes,
            "dropout": dropout,
        }
        for attr_name, attr_value in hyperparams.items():
            setattr(self, attr_name, attr_value)

# Patch Embedding
class PatchEmbedding(nn.Module):
    """Split an image into non-overlapping square patches and embed each one.

    A single ``nn.Conv2d`` whose kernel size and stride both equal
    ``patch_size`` performs the patch split and the linear projection in one
    step.

    Args:
        in_channels: Number of channels of the input image.
        patch_size: Side length of one patch (used as kernel size and stride).
        emb_size: Number of output channels, i.e. the embedding width.
        img_size: Image side length; only used to compute ``n_patches``.
    """

    def __init__(self, in_channels, patch_size, emb_size, img_size):
        super().__init__()
        self.patch_size = patch_size
        patches_per_side = img_size // patch_size
        self.n_patches = patches_per_side ** 2
        self.projection = nn.Conv2d(
            in_channels,
            emb_size,
            kernel_size=patch_size,
            stride=patch_size,
        )

    def forward(self, x):
        """Map images ``(B, C, H, W)`` to patch tokens ``(B, N, E)``."""
        feature_map = self.projection(x)             # (B, E, H/patch, W/patch)
        tokens_last = feature_map.flatten(start_dim=2)  # (B, E, N)
        return tokens_last.transpose(1, 2)           # (B, N, E)

# Transformer Encoder Block
class TransformerEncoderBlock(nn.Module):
    """Pre-LayerNorm Transformer encoder block.

    Applies multi-head self-attention and then a two-layer GELU MLP, each
    preceded by its own LayerNorm and wrapped in a residual connection.

    Args:
        emb_size: Token embedding width (LayerNorm / attention / MLP in-out size).
        num_heads: Number of attention heads.
        mlp_ratio: Hidden MLP width as a multiple of ``emb_size``.
        dropout: Dropout probability used inside the MLP (after the GELU and
            after the second linear layer).
    """

    def __init__(self, emb_size, num_heads, mlp_ratio, dropout):
        super().__init__()
        self.ln1 = nn.LayerNorm(emb_size)
        # batch_first=True: inputs and outputs are (B, N, E).
        self.attn = nn.MultiheadAttention(embed_dim=emb_size, num_heads=num_heads, batch_first=True)
        self.ln2 = nn.LayerNorm(emb_size)
        hidden_size = int(emb_size * mlp_ratio)
        self.mlp = nn.Sequential(
            nn.Linear(emb_size, hidden_size),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, emb_size),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        """Return the block output for tokens ``x`` of shape ``(B, N, E)``; the shape is unchanged."""
        # Normalise once and reuse the result as query, key and value.
        # Calling self.ln1(x) three times gave the same values but tripled
        # the LayerNorm work in both the forward and backward pass.
        normed = self.ln1(x)
        # Only the attention output ([0]) is used, so skip computing the
        # averaged attention-weight matrix.
        attn_out = self.attn(normed, normed, normed, need_weights=False)[0]
        x = x + attn_out
        x = x + self.mlp(self.ln2(x))
        return x

# Vit Test Model
class VisionTransformer(nn.Module):
    """Vision Transformer classifier built from PatchEmbedding and TransformerEncoderBlock.

    Pipeline: patch embedding -> prepend a learnable [CLS] token -> add a
    learnable positional embedding -> ``depth`` encoder blocks -> LayerNorm on
    the [CLS] token -> linear classification head.

    Args:
        config: Either a mapping whose ``"model"`` entry provides the keys
            ``in_channels``, ``patch_size``, ``emb_size``, ``img_size``,
            ``num_heads``, ``mlp_ratio``, ``dropout``, ``num_classes`` and
            ``depth`` (e.g. the loaded YAML config), or a ``ViTConfig``
            instance carrying the same names as attributes.
    """

    def __init__(self, config):
        super().__init__()
        # ViTConfig stores the hyper-parameters as attributes, the YAML config
        # nests them under "model"; normalise both to one key lookup.
        if isinstance(config, ViTConfig):
            cfg = vars(config)
        else:
            cfg = config["model"]

        self.CHANNEL = cfg["in_channels"]
        self.PATCH = cfg["patch_size"]
        self.EMBEDDING = cfg["emb_size"]
        self.IMAGE = cfg["img_size"]
        self.NUM_HEADS = cfg["num_heads"]
        self.MLP_RATIO = cfg["mlp_ratio"]
        self.DROPOUT = cfg["dropout"]
        self.NUM_CLASS = cfg["num_classes"]
        self.DEPTH = cfg["depth"]

        self.patch_embed = PatchEmbedding(
            in_channels=self.CHANNEL,
            patch_size=self.PATCH,
            emb_size=self.EMBEDDING,
            img_size=self.IMAGE
        )

        # Reuse the count computed by PatchEmbedding instead of repeating the
        # (img_size // patch_size) ** 2 formula here.
        self.n_patches = self.patch_embed.n_patches
        # One learnable [CLS] token, shared across the batch.
        self.cls_token = nn.Parameter(torch.randn(1, 1, self.EMBEDDING))
        # One learnable position vector per token (patches + [CLS]).
        self.pos_embedding = nn.Parameter(torch.randn(1, self.n_patches + 1, self.EMBEDDING))

        self.encoder = nn.Sequential(*[
            TransformerEncoderBlock(
                emb_size=self.EMBEDDING,
                num_heads=self.NUM_HEADS,
                mlp_ratio=self.MLP_RATIO,
                dropout=self.DROPOUT
            ) for _ in range(self.DEPTH)
        ])

        self.norm = nn.LayerNorm(self.EMBEDDING)
        self.head = nn.Linear(self.EMBEDDING, self.NUM_CLASS)

    def forward(self, x):
        """Return class logits ``(B, num_classes)`` for images ``x`` of shape ``(B, C, H, W)``."""
        B = x.size(0)
        x = self.patch_embed(x)                        # (B, N, E)
        cls_tokens = self.cls_token.expand(B, -1, -1)  # (B, 1, E), no copy
        x = torch.cat((cls_tokens, x), dim=1)          # (B, N+1, E)
        # Slice so that only as many position vectors as tokens are added.
        x = x + self.pos_embedding[:, :x.size(1), :]
        x = self.encoder(x)
        # Classify from the [CLS] token (sequence index 0) only.
        cls_out = self.norm(x[:, 0])
        return self.head(cls_out)


In [26]:
from utils.config_loader import load_config
from utils.data_loader import DatasetLoader

# Load the experiment configuration from the project's YAML file.
# ROOT_DIR_PATH comes from the ROOT_PATH environment variable (first cell).
# Later cells index the result by section: config["data"], config["model"]
# and config["train"].
config = load_config(f"{ROOT_DIR_PATH}/config/vit_test_config.yaml")

In [27]:
config

{'data': {'dataset': 'CIFAR10',
  'data_path': '/home/wd/Documents/work_stuff/ViT_REPLICATION/data/CIFAR10',
  'batch_size': 32,
  'num_workers': 4,
  'img_size': 32},
 'model': {'img_size': 32,
  'patch_size': 4,
  'in_channels': 3,
  'emb_size': 128,
  'depth': 4,
  'num_heads': 4,
  'mlp_ratio': 4.0,
  'num_classes': 10,
  'dropout': 0.1},
 'train': {'batch_size': 32, 'epochs': 20, 'lr': 0.001}}

In [4]:
model = VisionTransformer(config)

print('testing model sanity')
# Build the dummy image from the model config rather than hard-coding
# (1, 3, 32, 32), so the check stays valid when the YAML values change.
model_cfg = config["model"]
dummy_input = torch.randn(1, model_cfg["in_channels"], model_cfg["img_size"], model_cfg["img_size"])
print(dummy_input.shape)
# No gradients are needed for a shape check.
with torch.no_grad():
    output = model(dummy_input)

print("Output shape:", output.shape)
# One row of logits per image, one column per class.
assert output.shape == (1, model_cfg["num_classes"]), (
    f"unexpected output shape {tuple(output.shape)}"
)


print('loading training testing data')
# Dataset settings live in the "data" section of the loaded config.
data_cfg = config["data"]
DATASET = data_cfg["dataset"]
DATA_DIR = data_cfg["data_path"]
BATCH = data_cfg["batch_size"]
NUM_WORKERS = data_cfg["num_workers"]
IMAGE = data_cfg["img_size"]

# loading data
# DatasetLoader is the project helper imported from utils.data_loader;
# get_loaders() returns a pair that is unpacked here as (train, test).
loader = DatasetLoader(dataset_name=DATASET,
                        data_dir=DATA_DIR,
                        batch_size=BATCH,
                        num_workers=NUM_WORKERS,
                        img_size=IMAGE)
train_loader, test_loader = loader.get_loaders()
print(f"Train batches: {len(train_loader)}, Test batches: {len(test_loader)}")
print('data sanity check')
# Inspect the first training batch only: print its shapes, then stop.
for images, labels in train_loader:
    print(f'image shape and labels shape in training data - one batch : {images.shape}, {labels.shape}')
    break

testing model sanity
torch.Size([1, 3, 32, 32])
Output shape: torch.Size([1, 10])
loading training testing data
Train batches: 1563, Test batches: 313
data sanity check
image shape and labels shape in training data - one batch : torch.Size([32, 3, 32, 32]), torch.Size([32])


# Training

### running forward pass for one batch

In [5]:
# iter(train_loader) creates an iterator from the loader (an iterable object).
# next(...) fetches the next element from that iterator: one (images, labels) batch.
#
# Fetch the batch once and reuse it. Calling next(iter(train_loader)) twice
# builds two separate iterators, so the printed shape and the displayed tensor
# are not guaranteed to come from the same batch (e.g. if the loader shuffles).
first_batch_images = next(iter(train_loader))[0]

# 1st batch of 32 images
print(first_batch_images.shape)
first_batch_images

torch.Size([32, 3, 32, 32])


tensor([[[[-0.5451, -0.5216, -0.3725,  ..., -0.0745, -0.1373, -0.2941],
          [-0.5765, -0.5529, -0.4902,  ..., -0.1294, -0.2392, -0.2706],
          [-0.5529, -0.5608, -0.6078,  ..., -0.1608, -0.2157, -0.1686],
          ...,
          [ 0.8431,  0.8667,  0.8431,  ...,  0.8588,  0.8275,  0.7725],
          [ 0.8275,  0.8353,  0.8431,  ...,  0.7412,  0.7333,  0.7412],
          [ 0.7725,  0.7804,  0.7961,  ...,  0.7647,  0.7490,  0.7098]],

         [[-0.3098, -0.2941, -0.1294,  ...,  0.0902,  0.0510, -0.0667],
          [-0.3490, -0.3098, -0.2863,  ...,  0.0667, -0.0275, -0.0510],
          [-0.3333, -0.3098, -0.4353,  ...,  0.0667,  0.0118,  0.0510],
          ...,
          [ 0.6078,  0.6078,  0.6471,  ...,  0.6314,  0.6078,  0.5529],
          [ 0.5765,  0.5686,  0.6314,  ...,  0.5059,  0.5059,  0.5216],
          [ 0.5216,  0.5137,  0.5765,  ...,  0.5608,  0.5608,  0.5216]],

         [[-0.5843, -0.5686, -0.4118,  ..., -0.2784, -0.3569, -0.5137],
          [-0.6000, -0.5608, -

In [6]:
type(next(iter(train_loader)))

list

In [7]:
len(next(iter(train_loader)))

2

In [8]:
a,b = next(iter(train_loader))

In [9]:
a.shape

torch.Size([32, 3, 32, 32])

In [10]:
b.shape

torch.Size([32])

In [11]:
# 1. Get one batch
data_iter = iter(train_loader)
images, labels = next(data_iter)
data_iter

<torch.utils.data.dataloader._MultiProcessingDataLoaderIter at 0x7904f40c9b40>

In [12]:
images.shape

torch.Size([32, 3, 32, 32])

In [13]:
labels.shape

torch.Size([32])

#### 2. Send to device

PyTorch supports running tensors either on:

CPU → default if you're not using a GPU

GPU (CUDA) → much faster for deep learning

#### images.to(device) will -
➡️ "Move this tensor (images) to the computing device (device) you specify — either CPU or GPU."

#### Hardware level

💻 1. Your CPU and GPU Have Separate Memory

| Component | Type of Memory       | Example Usage                     |
| --------- | -------------------- | --------------------------------- |
| CPU       | RAM (main memory)    | `images` tensor when first loaded |
| GPU       | VRAM (device memory) | `images.to("cuda")` sends it here |


The CPU and GPU do not share memory. So if you want to use the GPU to compute something, you must copy the tensor from CPU RAM to GPU VRAM.

⚙️ 2. What Happens Internally?
Here’s the process when you run tensor.to("cuda"):

PyTorch checks which device the tensor is currently on.

Allocates memory in the GPU VRAM to hold the tensor.

Copies data from system RAM to GPU memory using the CUDA driver.

Returns a new tensor that lives on the GPU.

This data transfer uses the PCIe (Peripheral Component Interconnect Express) bus — this is the "highway" that connects your CPU and GPU physically.


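Slow: Moving data over PCIe is slow compared with computation on the GPU, so avoid unnecessary transfers — for example, copying the same tensor back and forth between CPU and GPU inside the training loop will hurt performance.

Plan ahead: Move the model to the GPU once, before training or inference begins. Batches are still moved one at a time inside the loop (the whole dataset usually does not fit in VRAM), which is what `inputs.to(device)` does in the training code later in this notebook.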

#### `Run a GPU memory monitor (for example nvidia-smi) before and after running .to("cuda") — you’ll see the memory usage spike as your tensor lands in VRAM.`

In [14]:
type(images)

torch.Tensor

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
images, labels = images.to(device), labels.to(device)

#### 3. Load model and move to device

In [16]:
model = model.to(device)


In [17]:
model

VisionTransformer(
  (patch_embed): PatchEmbedding(
    (projection): Conv2d(3, 128, kernel_size=(4, 4), stride=(4, 4))
  )
  (encoder): Sequential(
    (0): TransformerEncoderBlock(
      (ln1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
      )
      (ln2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (mlp): Sequential(
        (0): Linear(in_features=128, out_features=512, bias=True)
        (1): GELU(approximate='none')
        (2): Dropout(p=0.1, inplace=False)
        (3): Linear(in_features=512, out_features=128, bias=True)
        (4): Dropout(p=0.1, inplace=False)
      )
    )
    (1): TransformerEncoderBlock(
      (ln1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
      )
      (

In [18]:
type(model)

__main__.VisionTransformer

#### 4. Set model to eval mode (we just want to inspect)

✅ Layers Affected
Dropout (nn.Dropout)

In train() mode: randomly zeroes out some neurons with a probability p (adds noise for regularization).

In eval() mode: no neurons are dropped — it just passes the values through.

BatchNorm (nn.BatchNorm1d/2d/3d)

In train() mode: uses batch statistics (mean and variance) and updates running stats.

In eval() mode: uses the running averages (learned from training), and does not update them.


✅ What model.eval() does not do

It does not:

Disable gradient computation (you need torch.no_grad() for that).

Freeze model weights.

Turn off autograd.

Stop the model from being trained if optimizer.step() is still called

In [20]:
model.eval()

VisionTransformer(
  (patch_embed): PatchEmbedding(
    (projection): Conv2d(3, 128, kernel_size=(4, 4), stride=(4, 4))
  )
  (encoder): Sequential(
    (0): TransformerEncoderBlock(
      (ln1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
      )
      (ln2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (mlp): Sequential(
        (0): Linear(in_features=128, out_features=512, bias=True)
        (1): GELU(approximate='none')
        (2): Dropout(p=0.1, inplace=False)
        (3): Linear(in_features=512, out_features=128, bias=True)
        (4): Dropout(p=0.1, inplace=False)
      )
    )
    (1): TransformerEncoderBlock(
      (ln1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
      )
      (

#### 5. Forward pass with torch.no_grad

#### `torch.no_grad`

❗ If you're only doing a forward pass (no loss.backward() and no optimizer.step()), the model parameters will not get updated — with or without torch.no_grad().

#### Purpose of torch.no_grad() (even during forward-only pass):

Saves Memory:

PyTorch usually tracks every operation on tensors to later compute gradients.

If you're not planning to backpropagate (e.g. during evaluation or debugging), this is unnecessary overhead.

torch.no_grad() disables autograd, making things faster and leaner.

Speeds Up Inference:

Especially helpful in evaluation, validation, and visualizations.

No need to maintain a compute graph → performance is better.

Prevents Accidental Training:

Sometimes, we accidentally call .backward() or .step() inside inference loops.

Wrapping your code in torch.no_grad() makes it safe.

In [22]:
images.shape

torch.Size([32, 3, 32, 32])

In [23]:
with torch.no_grad():
    patch_vectors = model.patch_embed(images)
    print("Patch Vectors:", patch_vectors.shape)  # (B, N, E)


Patch Vectors: torch.Size([32, 64, 128])


The batch size is 32 and the patch size is 4, so each 32 × 32 image is split into (32/4) × (32/4) = 64 patches, and each patch is embedded as a 128-dimensional vector — hence the shape (32, 64, 128).

In [24]:

# Step-by-step replay of VisionTransformer.forward on one batch, printing the
# tensor shape after every stage. Gradients are disabled: inspection only.
with torch.no_grad():
    patch_vectors = model.patch_embed(images)
    print("Patch Vectors:", patch_vectors.shape)  # (B, N, E)

    # Expand the single learnable [CLS] token (1, 1, E) along the batch dimension
    # to (B, 1, E) so it can be concatenated with the patch embeddings.
    B = images.shape[0]
    cls_token = model.cls_token.expand(B, -1, -1)
    
    # Concatenate the [CLS] token at the start of the patch tokens along the sequence dimension (dim=1).
    # Resulting shape: (B, N+1, E) — one extra token.
    patch_with_cls = torch.cat((cls_token, patch_vectors), dim=1)
    print("After Adding CLS Token:", patch_with_cls.shape)  # (B, N+1, E)

    # Add the learnable positional embedding, sliced to the sequence length;
    # it broadcasts over the batch dimension.
    x = patch_with_cls + model.pos_embedding[:, :patch_with_cls.size(1), :]
    print("After Adding Positional Embedding:", x.shape)

    # Pass through the encoder one block at a time (model.encoder is an nn.Sequential).
    for i, block in enumerate(model.encoder):
        x = block(x)
        print(f"After Encoder Block {i+1}: {x.shape}")

    # Final CLS token output: LayerNorm applied to sequence position 0 only.
    cls_output = model.norm(x[:, 0])
    print("🔹 Final CLS Token Output:", cls_output.shape)

    # Final prediction: linear head gives one logit per class; argmax picks the class.
    output = model.head(cls_output)
    print("🔹 Logits Output:", output.shape)
    print("🔹 Predicted Classes:", output.argmax(dim=1))

Patch Vectors: torch.Size([32, 64, 128])
After Adding CLS Token: torch.Size([32, 65, 128])
After Adding Positional Embedding: torch.Size([32, 65, 128])
After Encoder Block 1: torch.Size([32, 65, 128])
After Encoder Block 2: torch.Size([32, 65, 128])
After Encoder Block 3: torch.Size([32, 65, 128])
After Encoder Block 4: torch.Size([32, 65, 128])
🔹 Final CLS Token Output: torch.Size([32, 128])
🔹 Logits Output: torch.Size([32, 10])
🔹 Predicted Classes: tensor([6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 0, 6, 6, 0, 6, 0, 6,
        6, 6, 6, 6, 6, 6, 6, 6], device='cuda:0')


## Running One Epoch

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm

from utils.config_loader import load_config
from utils.data_loader import DatasetLoader


def validate(model, loader, criterion, device):
    """Evaluate ``model`` on every batch of ``loader`` without updating it.

    Puts the model in eval mode and runs under ``torch.no_grad()``.

    Args:
        model: Module mapping an input batch to class logits ``(B, num_classes)``.
        loader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss callable ``criterion(outputs, labels)``. The per-batch
            loss is weighted by the batch size, which assumes the criterion
            returns a batch mean (the default for ``nn.CrossEntropyLoss``).
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(avg_loss, accuracy)``: the sample-weighted mean loss and the
        accuracy in percent. ``(0.0, 0.0)`` if the loader yields no samples.
    """
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    progress_bar = tqdm(loader, desc="Validation", leave=False)

    with torch.no_grad():
        for inputs, labels in progress_bar:
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Weight by batch size so a smaller final batch does not skew the mean.
            running_loss += loss.item() * inputs.size(0)
            _, predicted = outputs.max(1)
            correct += predicted.eq(labels).sum().item()
            total += labels.size(0)

            # Skip the running display while no samples have been counted yet.
            if total > 0:
                progress_bar.set_postfix({
                    "Loss": f"{running_loss / total:.4f}",
                    "Acc": f"{100. * correct / total:.2f}%"
                })

    # An empty loader used to raise ZeroDivisionError here.
    if total == 0:
        return 0.0, 0.0

    avg_loss = running_loss / total
    accuracy = 100. * correct / total

    return avg_loss, accuracy

def train_one_epoch(model, loader, criterion, optimizer, device):
    """Train ``model`` for one pass over ``loader``.

    Puts the model in train mode and, for every batch, zeroes the gradients,
    runs the forward pass, back-propagates the loss and steps the optimizer.

    Args:
        model: Module mapping an input batch to class logits ``(B, num_classes)``.
        loader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss callable ``criterion(outputs, labels)``. The per-batch
            loss is weighted by the batch size, which assumes the criterion
            returns a batch mean (the default for ``nn.CrossEntropyLoss``).
        optimizer: Optimizer over the model's parameters.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(avg_loss, accuracy)``: the sample-weighted mean loss and the
        accuracy in percent, both measured while the weights were being
        updated. ``(0.0, 0.0)`` if the loader yields no samples.
    """
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    progress_bar = tqdm(loader, desc="Training", leave=False)
    for inputs, labels in progress_bar:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Weight by batch size so a smaller final batch does not skew the mean.
        running_loss += loss.item() * inputs.size(0)
        _, predicted = outputs.max(1)
        correct += predicted.eq(labels).sum().item()
        total += labels.size(0)

        # Update progress bar with the running metrics.
        if total > 0:
            progress_bar.set_postfix({
                "Loss": f"{running_loss / total:.4f}",
                "Acc": f"{100. * correct / total:.2f}%"
            })

    # An empty loader used to raise ZeroDivisionError here.
    if total == 0:
        return 0.0, 0.0

    avg_loss = running_loss / total
    accuracy = 100. * correct / total

    return avg_loss, accuracy

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = VisionTransformer(config).to(device)

cuda


In [18]:
model

VisionTransformer(
  (patch_embed): PatchEmbedding(
    (projection): Conv2d(3, 128, kernel_size=(4, 4), stride=(4, 4))
  )
  (encoder): Sequential(
    (0): TransformerEncoderBlock(
      (ln1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
      )
      (ln2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (mlp): Sequential(
        (0): Linear(in_features=128, out_features=512, bias=True)
        (1): GELU(approximate='none')
        (2): Dropout(p=0.1, inplace=False)
        (3): Linear(in_features=512, out_features=128, bias=True)
        (4): Dropout(p=0.1, inplace=False)
      )
    )
    (1): TransformerEncoderBlock(
      (ln1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
      )
      (

In [19]:
# Data loaders
print('loading training testing data')
# Dataset settings live in the "data" section of the loaded config.
data_cfg = config["data"]
DATASET = data_cfg["dataset"]
DATA_DIR = data_cfg["data_path"]
BATCH = data_cfg["batch_size"]
NUM_WORKERS = data_cfg["num_workers"]
IMAGE = data_cfg["img_size"]

# loading data
# Same DatasetLoader call as in the earlier sanity-check cell. The second
# loader returned by get_loaders() (named test_loader there) is used as the
# validation set here.
loader = DatasetLoader(dataset_name=DATASET,
                        data_dir=DATA_DIR,
                        batch_size=BATCH,
                        num_workers=NUM_WORKERS,
                        img_size=IMAGE)
train_loader, val_loader = loader.get_loaders()

loading training testing data


In [20]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=config["train"]["lr"])

# Training loop
# For a full run, iterate over range(config["train"]["epochs"]) instead.

# running for one epoch
for epoch in range(1):

    print(f"\nEpoch {epoch+1}/{config['train']['epochs']}")

    train_loss, train_acc = train_one_epoch(model, train_loader, criterion, optimizer, device)
    val_loss, val_acc = validate(model, val_loader, criterion, device)

    print(f"Train Loss: {train_loss:.4f}, Accuracy: {train_acc:.2f}%")
    print(f"Val   Loss: {val_loss:.4f}, Accuracy: {val_acc:.2f}%")


# Release GPU memory held by this training run.
# The optimizer keeps references to the model's parameters (and Adam keeps
# per-parameter state tensors), so `del model` alone frees nothing: both names
# must go before the allocator can reclaim the memory. gc.collect() clears any
# remaining reference cycles; empty_cache() then returns the unused cached
# blocks to the driver.
import gc

del model, optimizer
gc.collect()
torch.cuda.empty_cache()




Epoch 1/20


                                                                                       

Train Loss: 1.7339, Accuracy: 36.90%
Val   Loss: 1.5392, Accuracy: 44.58%


## Few Important Observations

1. Why does accuracy improve when rerunning training cells multiple times?
This happens because your model is continuing training from where it left off in RAM/GPU memory — you're not reinitializing the model weights in between runs. So the model is learning incrementally across multiple cell runs, just like extra training epochs.

Solution if you want fresh training each time:
Reinitialize the model each time before running training, or restart the kernel to clear memory.

2. Why does accuracy drop after restarting the kernel?
When you restart the kernel, all variables including model weights are reset to their initial random state, so the model forgets everything it had learned in earlier training runs. That’s why accuracy drops back to what you'd expect from a randomly initialized model.

This is expected behavior unless you load a saved model checkpoint.

4. High CPU usage (98%) and persistent GPU memory usage (285 MiB)?

- High CPU usage:
This is from data loading, especially if you set num_workers > 0 in your DataLoader. PyTorch uses CPU subprocesses to load/transform data while GPU runs the model. 98% CPU is okay — it means your CPU is trying to keep up with GPU.

- GPU memory stays occupied (even when not training):
This is due to PyTorch caching.

PyTorch doesn’t free GPU memory immediately after computations.
Also, model weights and buffers stay on GPU unless you delete the model or restart the kernel.

5. Why does GPU memory still show usage after del model and torch.cuda.empty_cache()?

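PyTorch’s caching allocator: when you delete a tensor, PyTorch does not hand its memory back to the GPU driver; it keeps the freed blocks cached so that later allocations are fast. torch.cuda.empty_cache() releases only the cached blocks that are no longer in use — memory that is still referenced by a live object (for example an optimizer that holds the model's parameters, or tensors left over from earlier cells) stays allocated.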

Driver and system reserved memory: Some portion of GPU memory is always reserved by the GPU driver and the CUDA runtime itself, which shows up as occupied but cannot be freed.

Other processes: Sometimes other system or user processes (like display server, background tasks, or other apps) use some GPU memory.

6. Training:  70%|██████▉   | 1089/1563 [00:17<00:07, 64.15it/s]
- Why 1563?

It comes from the size of your training dataset and the batch size you set.

For example:

Suppose your training dataset size is 50,000 images (which is the case for CIFAR-10 training set).

Your batch size is 32.

- The number of batches per epoch (also called iterations per epoch) is:

number of batches = ceil(dataset_size / batch_size) = ceil(50000 / 32) = 1563

- 1562 batches of 32 images each = 1562 * 32 = 49,984 images

- The last batch will have the remaining 16 images to make up the total 50,000.

