In [1]:
import sys
import os
# Absolute path to the project root. This is machine-specific: change it to
# match your own checkout before running the notebook.
CURR_PATH = '/home/wd/Documents/work_stuff/ViT_REPLICATION'

# Make the project's packages importable from this notebook. The membership
# check keeps the cell idempotent: re-running it does not append the same
# directory to sys.path again.
_project_root = os.path.abspath(CURR_PATH)
if _project_root not in sys.path:
    sys.path.append(_project_root)

# Model Architecture and Data Loading

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Hyperparameter container
class ViTConfig:
    """Plain holder for Vision Transformer hyperparameters.

    Each constructor argument is stored, unchanged, as an instance attribute
    with the same name. No validation is performed.

    Defaults: img_size=32, patch_size=4, in_channels=3, emb_size=64, depth=6,
    num_heads=4, mlp_ratio=4.0, num_classes=10, dropout=0.1.
    """

    def __init__(self, img_size=32, patch_size=4, in_channels=3, emb_size=64,
                 depth=6, num_heads=4, mlp_ratio=4.0, num_classes=10,
                 dropout=0.1):
        # Collect the settings in declaration order, then copy them onto the
        # instance one by one.
        settings = {
            "img_size": img_size,
            "patch_size": patch_size,
            "in_channels": in_channels,
            "emb_size": emb_size,
            "depth": depth,
            "num_heads": num_heads,
            "mlp_ratio": mlp_ratio,
            "num_classes": num_classes,
            "dropout": dropout,
        }
        for field, value in settings.items():
            setattr(self, field, value)

# Patch Embedding
class PatchEmbedding(nn.Module):
    """Convert an image batch into a sequence of patch embeddings.

    A single convolution whose kernel size and stride both equal
    ``patch_size`` maps every non-overlapping patch to an
    ``emb_size``-dimensional vector.

    Attributes:
        patch_size: the patch side length given to the constructor.
        n_patches: ``(img_size // patch_size) ** 2``.
        projection: the patch-projecting ``nn.Conv2d``.
    """

    def __init__(self, in_channels, patch_size, emb_size, img_size):
        super().__init__()
        patches_per_side = img_size // patch_size
        self.patch_size = patch_size
        self.n_patches = patches_per_side * patches_per_side
        self.projection = nn.Conv2d(
            in_channels,
            emb_size,
            kernel_size=patch_size,
            stride=patch_size,
        )

    def forward(self, x):
        """Project ``x`` and return the patches as a ``(B, N, E)`` sequence."""
        feature_map = self.projection(x)  # (B, emb_size, H/patch, W/patch)
        # Collapse the two spatial axes into one token axis, then swap it with
        # the channel axis so tokens come before features.
        tokens = feature_map.flatten(start_dim=2)  # (B, emb_size, N)
        return tokens.transpose(1, 2)  # (B, N, emb_size)

# Transformer Encoder Block
class TransformerEncoderBlock(nn.Module):
    """Pre-norm Transformer encoder block.

    Applies LayerNorm -> multi-head self-attention and LayerNorm -> MLP, each
    wrapped in a residual connection.

    Args:
        emb_size: feature width of the tokens (LayerNorm / attention / MLP I/O).
        num_heads: number of attention heads passed to ``nn.MultiheadAttention``.
        mlp_ratio: the MLP hidden width is ``int(emb_size * mlp_ratio)``.
        dropout: probability used by the two ``nn.Dropout`` layers in the MLP.
    """

    def __init__(self, emb_size, num_heads, mlp_ratio, dropout):
        super().__init__()
        hidden_size = int(emb_size * mlp_ratio)
        self.ln1 = nn.LayerNorm(emb_size)
        # NOTE: no dropout argument is passed to the attention layer, so
        # `dropout` only affects the MLP below.
        self.attn = nn.MultiheadAttention(embed_dim=emb_size, num_heads=num_heads, batch_first=True)
        self.ln2 = nn.LayerNorm(emb_size)
        self.mlp = nn.Sequential(
            nn.Linear(emb_size, hidden_size),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, emb_size),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        """Run the block on ``x`` and return a tensor of the same shape.

        ``x`` is batch-first (the attention layer is built with
        ``batch_first=True``).
        """
        # Normalise once and reuse the result as query, key and value instead
        # of running ln1 three times on the same input.
        normed = self.ln1(x)
        # Only the attention output is used, so skip computing the averaged
        # attention weights.
        attn_out, _ = self.attn(normed, normed, normed, need_weights=False)
        x = x + attn_out
        x = x + self.mlp(self.ln2(x))
        return x

# Vit Test Model
class VisionTransformer(nn.Module):
    """Vision Transformer classifier built from PatchEmbedding and TransformerEncoderBlock.

    Args:
        config: either a mapping whose ``"model"`` entry holds the model
            settings (the original calling convention), or a non-subscriptable
            object exposing the same settings as attributes, such as the
            ``ViTConfig`` class in this notebook. The settings read are
            ``in_channels``, ``patch_size``, ``emb_size``, ``img_size``,
            ``num_heads``, ``mlp_ratio``, ``dropout``, ``num_classes`` and
            ``depth``.
    """

    def __init__(self, config):
        super().__init__()
        cfg = self._model_settings(config)

        self.CHANNEL = cfg["in_channels"]
        self.PATCH = cfg["patch_size"]
        self.EMBEDDING = cfg["emb_size"]
        self.IMAGE = cfg["img_size"]
        self.NUM_HEADS = cfg["num_heads"]
        self.MLP_RATIO = cfg["mlp_ratio"]
        self.DROPOUT = cfg["dropout"]
        self.NUM_CLASS = cfg["num_classes"]
        self.DEPTH = cfg["depth"]

        self.patch_embed = PatchEmbedding(
            in_channels=self.CHANNEL,
            patch_size=self.PATCH,
            emb_size=self.EMBEDDING,
            img_size=self.IMAGE
        )

        self.n_patches = (self.IMAGE // self.PATCH) ** 2
        # Learnable class token and positional embedding; the "+ 1" slot is
        # for the class token that forward() prepends to the patch sequence.
        self.cls_token = nn.Parameter(torch.randn(1, 1, self.EMBEDDING))
        self.pos_embedding = nn.Parameter(torch.randn(1, self.n_patches + 1, self.EMBEDDING))

        self.encoder = nn.Sequential(*[
            TransformerEncoderBlock(
                emb_size=self.EMBEDDING,
                num_heads=self.NUM_HEADS,
                mlp_ratio=self.MLP_RATIO,
                dropout=self.DROPOUT
            ) for _ in range(self.DEPTH)
        ])

        self.norm = nn.LayerNorm(self.EMBEDDING)
        self.head = nn.Linear(self.EMBEDDING, self.NUM_CLASS)

    @staticmethod
    def _model_settings(config):
        """Return the model settings as a mapping, whatever form `config` takes."""
        try:
            return config["model"]
        except TypeError:
            # `config` is not subscriptable (e.g. a ViTConfig instance):
            # fall back to its instance attributes.
            return vars(config)

    def forward(self, x):
        """Classify an image batch.

        Args:
            x: image batch handed to the patch embedding; expected to match
                the ``in_channels`` / ``img_size`` the model was built with.

        Returns:
            Output of the linear head applied to the class token:
            shape ``(B, num_classes)``.
        """
        B = x.size(0)
        x = self.patch_embed(x)                          # (B, N, E)
        cls_tokens = self.cls_token.expand(B, -1, -1)    # (B, 1, E)
        x = torch.cat((cls_tokens, x), dim=1)            # (B, N + 1, E)
        x = x + self.pos_embedding[:, :x.size(1), :]
        x = self.encoder(x)
        # The class token was concatenated first, so it sits at index 0.
        cls_out = self.norm(x[:, 0])
        return self.head(cls_out)


In [11]:
from utils.config_loader import load_config
from utils.data_loader import DatasetLoader

# Load the experiment configuration and build the model from it.
config = load_config(f"{CURR_PATH}/config/vit_test_config.yaml")
model = VisionTransformer(config)

print('testing model sanity')
# Build the dummy batch from the model section of the config (the same keys
# VisionTransformer reads) so the check keeps working if those settings change.
model_cfg = config["model"]
dummy_input = torch.randn(1, model_cfg["in_channels"], model_cfg["img_size"], model_cfg["img_size"])
print(dummy_input.shape)
with torch.no_grad():  # shape check only; no gradients needed
    output = model(dummy_input)

print("Output shape:", output.shape)  # (1, num_classes)
# Fail loudly instead of relying on someone reading the printed shape.
assert output.shape == (1, model_cfg["num_classes"]), f"unexpected output shape {tuple(output.shape)}"


print('loading training testing data')
# Read the dataset settings from the "data" section of the config.
data_cfg = config["data"]
DATASET = data_cfg["dataset"]
DATA_DIR = data_cfg["data_path"]
BATCH = data_cfg["batch_size"]
NUM_WORKERS = data_cfg["num_workers"]
IMAGE = data_cfg["img_size"]

# Build the train/test loaders.
loader = DatasetLoader(dataset_name=DATASET,
                        data_dir=DATA_DIR,
                        batch_size=BATCH,
                        num_workers=NUM_WORKERS,
                        img_size=IMAGE)
train_loader, test_loader = loader.get_loaders()
print(f"Train batches: {len(train_loader)}, Test batches: {len(test_loader)}")
print('data sanity check')
# Look at the first training batch only, then stop iterating.
for images, labels in train_loader:
    print(f'image shape and labels shape in training data - one batch : {images.shape}, {labels.shape}')
    break

testing model sanity
torch.Size([1, 3, 32, 32])
Output shape: torch.Size([1, 10])
loading training testing data
Train batches: 1563, Test batches: 313
data sanity check
image shape and labels shape in training data - one batch : torch.Size([32, 3, 32, 32]), torch.Size([32])


# Training

### running forward pass for one batch

In [14]:
# iter(train_loader) creates an iterator over the DataLoader;
# next(...) fetches the first element (one batch) from that iterator.
#
# Fetch the batch once and reuse it: each next(iter(train_loader)) call builds
# a new iterator, so calling it twice does the loading work twice and (if the
# loader shuffles) the printed shape and the displayed tensor could come from
# different batches.

# 1st batch of 32 images
first_images = next(iter(train_loader))[0]
print(first_images.shape)
first_images

torch.Size([32, 3, 32, 32])


tensor([[[[ 0.6235,  0.7255,  0.8118,  ...,  0.0667,  0.2235,  0.4431],
          [ 0.4824,  0.5608,  0.6078,  ...,  0.2784,  0.1686,  0.3412],
          [ 0.5686,  0.5451,  0.5922,  ...,  0.2157,  0.2314,  0.2706],
          ...,
          [-0.2235, -0.2235, -0.3412,  ...,  0.6706,  0.7020,  0.6078],
          [-0.1608, -0.1922, -0.2784,  ...,  0.7176,  0.7412,  0.6863],
          [-0.1216,  0.0588,  0.2000,  ...,  0.7412,  0.7725,  0.7333]],

         [[ 0.5373,  0.6392,  0.7020,  ..., -0.1294,  0.0431,  0.2627],
          [ 0.3647,  0.4510,  0.4902,  ...,  0.0510, -0.0510,  0.1294],
          [ 0.4431,  0.4275,  0.4745,  ..., -0.0431, -0.0275,  0.0275],
          ...,
          [-0.2863, -0.2549, -0.3333,  ...,  0.4745,  0.5059,  0.4118],
          [-0.2314, -0.2235, -0.2706,  ...,  0.5451,  0.5686,  0.5059],
          [-0.1922,  0.0275,  0.2078,  ...,  0.5765,  0.6078,  0.5686]],

         [[ 0.3255,  0.4275,  0.5137,  ..., -0.2863, -0.1216,  0.1059],
          [ 0.1686,  0.2471,  

In [19]:
# Inspect the Python type of one batch produced by train_loader.
type(next(iter(train_loader)))

list

In [25]:
# Count how many items one batch contains.
len(next(iter(train_loader)))

2

In [20]:
# Unpack one batch into its two items.
a,b = next(iter(train_loader))

In [22]:
# Shape of `a`, the first item unpacked from the batch.
a.shape

torch.Size([32, 3, 32, 32])

In [23]:
# Shape of `b`, the second item unpacked from the batch.
b.shape

torch.Size([32])

In [15]:
# 1. Get one batch
# Create the iterator explicitly, then pull its first batch and unpack it.
data_iter = iter(train_loader)
images, labels = next(data_iter)
# Last expression of the cell: display the iterator object itself.
data_iter

<torch.utils.data.dataloader._MultiProcessingDataLoaderIter at 0x700ed5e94b50>

In [16]:
# Shape of the `images` tensor from the batch fetched above.
images.shape

torch.Size([32, 3, 32, 32])

In [17]:
# Shape of the `labels` tensor from the batch fetched above.
labels.shape

torch.Size([32])

#### 2. Send to device
PyTorch supports running tensors either on:

CPU → default if you're not using a GPU

GPU (CUDA) → much faster for deep learning

#### images.to(device) will -
➡️ "Move this tensor (images) to the computing device (device) you specify — either CPU or GPU."

#### Hardware level

💻 1. Your CPU and GPU Have Separate Memory
The table below shows where a tensor lives before and after the move:

| Component | Type of Memory       | Example Usage                     |
| --------- | -------------------- | --------------------------------- |
| CPU       | RAM (main memory)    | `images` tensor when first loaded |
| GPU       | VRAM (device memory) | `images.to("cuda")` sends it here |


The CPU and GPU do not share memory. So if you want to use the GPU to compute something, you must copy the tensor from CPU RAM to GPU VRAM.

⚙️ 2. What Happens Internally?
Here’s the process when you run tensor.to("cuda"):

PyTorch checks which device the tensor is currently on.

Allocates memory in the GPU VRAM to hold the tensor.

Copies data from system RAM to GPU memory using the CUDA driver.

Returns a new tensor that lives on the GPU.

This data transfer uses the PCIe (Peripheral Component Interconnect Express) bus — this is the "highway" that connects your CPU and GPU physically.


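Slow: Moving data over PCIe is relatively slow compared with computation on the GPU, so avoid unnecessary transfers — for example, copying the same tensor to the GPU more than once, or moving results back to the CPU on every step.

Plan ahead: Move the model to the GPU once, before training or inference begins. Input batches are normally moved one at a time inside the loop as the DataLoader produces them (as `train_one_epoch` below does), so the whole dataset never has to fit in VRAM.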

#### `Run a GPU memory monitor (e.g. nvidia-smi) before and after running .to("cuda") — you’ll see the memory usage spike as your tensor lands in VRAM.`

In [27]:
# Check the Python type of `images`.
type(images)

torch.Tensor

In [None]:
# Prefer the GPU when CUDA is available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# .to() returns the tensor on the target device, so rebind both names.
images = images.to(device)
labels = labels.to(device)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm

from models.vit import VisionTransformer
from utils.config_loader import load_config
from utils.data_loader import DataLoaderFactory


def train_one_epoch(model, loader, criterion, optimizer, device):
    """Train `model` for one pass over `loader` and report loss and accuracy.

    Args:
        model: module to optimise; put into training mode via ``model.train()``.
        loader: iterable yielding ``(inputs, labels)`` batches.
        criterion: callable ``criterion(outputs, labels)`` returning the batch
            loss (the running average assumes it is a per-batch mean).
        optimizer: optimizer whose ``zero_grad()`` / ``step()`` are called once
            per batch.
        device: device every batch is moved to before the forward pass.

    Returns:
        Tuple ``(avg_loss, accuracy)``: the sample-weighted mean loss and the
        top-1 accuracy in percent over all samples seen.

    Raises:
        ValueError: if `loader` yields no samples, since the epoch metrics
            would otherwise require dividing by zero.
    """
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for inputs, labels in tqdm(loader, desc="Training", leave=False):
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Weight the batch loss by its size so a smaller final batch does not
        # skew the epoch average.
        running_loss += loss.item() * inputs.size(0)
        # Predicted class = index of the largest output along dim 1.
        _, predicted = outputs.max(1)
        correct += predicted.eq(labels).sum().item()
        total += labels.size(0)

    if total == 0:
        raise ValueError("loader yielded no samples; cannot compute epoch metrics")

    avg_loss = running_loss / total
    accuracy = 100. * correct / total
    return avg_loss, accuracy

In [None]:
# Choose the compute device: CUDA when available, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Build a fresh model from the loaded config, then move it to that device.
model = VisionTransformer(config)
model = model.to(device)

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=config["train"]["lr"])


# `validate` is not defined or imported anywhere earlier in this notebook, so
# it is provided here; it uses the same bookkeeping as train_one_epoch but
# without any weight updates.
def validate(model, loader, criterion, device):
    """Evaluate `model` on `loader` and return ``(avg_loss, accuracy_percent)``.

    The model is put into eval mode and gradients are disabled for the whole
    pass. Raises ValueError if `loader` yields no samples.
    """
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            running_loss += loss.item() * inputs.size(0)
            _, predicted = outputs.max(1)
            correct += predicted.eq(labels).sum().item()
            total += labels.size(0)

    if total == 0:
        raise ValueError("loader yielded no samples; cannot compute validation metrics")

    return running_loss / total, 100. * correct / total


# Training loop
num_epochs = config["train"]["epochs"]
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs}")

    train_loss, train_acc = train_one_epoch(model, train_loader, criterion, optimizer, device)
    # The data-loading cell only creates train_loader and test_loader (there is
    # no val_loader), so the test split is used for the per-epoch evaluation.
    val_loss, val_acc = validate(model, test_loader, criterion, device)

    print(f"Train Loss: {train_loss:.4f}, Accuracy: {train_acc:.2f}%")
    print(f"Val   Loss: {val_loss:.4f}, Accuracy: {val_acc:.2f}%")
