In [4]:
import torch
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
from torch import nn, optim
from torch.utils.data import DataLoader
from tqdm import tqdm
import copy
import timm

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
print(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

cuda


In [6]:
# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters
IMG_SIZE = 224
BATCH_SIZE = 32
NUM_CLASSES = 3
EPOCHS = 5
PATIENCE = 3
LEARNING_RATE = 1e-4

In [7]:
train_path = r'C:\Users\Admin\Desktop\Video Classification\dataset\train'
val_path = r'C:\Users\Admin\Desktop\Video Classification\dataset\val'
checkpoint_path = 'best_vit_model.pt'

In [8]:
# Transforms
transform = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225])
])


In [9]:
# Datasets
train_dataset = ImageFolder(train_path, transform=transform)
val_dataset = ImageFolder(val_path, transform=transform)

# Loaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, num_workers=2, pin_memory=True)

In [10]:
# ✅ Model: ViT-Tiny from timm
model = timm.create_model('vit_tiny_patch16_224', pretrained=True)

# Freeze all layers
for param in model.parameters():
    param.requires_grad = False

# Replace classification head
model.head = nn.Linear(model.head.in_features, NUM_CLASSES)
# Train only the new head
for param in model.head.parameters():
    param.requires_grad = True

model = model.to(device)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [12]:
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [15]:
# Early stopping setup
best_acc = 0.0
patience_counter = 0

In [16]:
# Training loop
for epoch in range(EPOCHS):
    model.train()
    running_loss = 0

    for images, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Validation
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    val_acc = correct / total
    print(f"✅ Validation Accuracy: {val_acc*100:.2f}%")

    # Early stopping based on accuracy
    if val_acc > best_acc:
        best_acc = val_acc
        best_model = copy.deepcopy(model.state_dict())
        patience_counter = 0
        torch.save(best_model, checkpoint_path)
        print("📌 New best model saved")
    else:
        patience_counter += 1
        if patience_counter >= PATIENCE:
            print("⏹️ Early stopping triggered")
            break

Epoch 1/5: 100%|██████████| 365/365 [00:37<00:00,  9.86it/s]


✅ Validation Accuracy: 84.84%
📌 New best model saved


Epoch 2/5: 100%|██████████| 365/365 [00:39<00:00,  9.20it/s]


✅ Validation Accuracy: 86.32%
📌 New best model saved


Epoch 3/5: 100%|██████████| 365/365 [00:41<00:00,  8.90it/s]


✅ Validation Accuracy: 87.52%
📌 New best model saved


Epoch 4/5: 100%|██████████| 365/365 [00:44<00:00,  8.25it/s]


✅ Validation Accuracy: 88.28%
📌 New best model saved


Epoch 5/5: 100%|██████████| 365/365 [00:38<00:00,  9.39it/s]


✅ Validation Accuracy: 88.56%
📌 New best model saved


In [17]:
# Load best model
model.load_state_dict(torch.load(checkpoint_path))
model.to(device)

  model.load_state_dict(torch.load(checkpoint_path))


VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 192, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (patch_drop): Identity()
  (norm_pre): Identity()
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=192, out_features=576, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (norm): Identity()
        (proj): Linear(in_features=192, out_features=192, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=192, out_features=768, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=False)


In [18]:
test_path = r'C:\Users\Admin\Desktop\Video Classification\dataset\test'
# Test dataset and loader
test_dataset = ImageFolder(test_path, transform=transform)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, num_workers=2, pin_memory=True)


In [19]:
model.load_state_dict(torch.load('best_vit_model.pt'))
model = model.to(device)
model.eval()

correct = 0
total = 0

with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)

        outputs = model(images)
        preds = torch.argmax(outputs, dim=1)

        correct += (preds == labels).sum().item()
        total += labels.size(0)

print(f"✅ Test Accuracy: {correct / total * 100:.2f}%")

  model.load_state_dict(torch.load('best_vit_model.pt'))


✅ Test Accuracy: 88.10%
