# VIT 

This notebook trains a Vision Transformer (ViT) model for dog emotion detection.

#### REF
[VIT Transfer Learning](https://huggingface.co/timm/vit_base_patch16_224.augreg2_in21k_ft_in1k)<br>
[PyTorch](https://pytorch.org/docs/stable/index.html)

## Setup

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import timm 
import matplotlib.pyplot as plt
import sys
sys.path.insert(1, '/mnt/c/Eclipse/IU Spring 2025/is_the_dog_happy')

from get_data_loaders import *

  warn(
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


## Data Processing

In [3]:
# Build the train / validation / test loaders and read the number of target
# classes from the dataset rooted at "../data".
# NOTE(review): get_loaders is presumably provided by the star import of
# get_data_loaders in the setup cell -- confirm.
(
    train_loader,
    vali_loader,
    test_loader,
    num_classes,
) = get_loaders("../data")

## Model

In [4]:
# Load the pretrained ViT-Base (16x16 patches, 224x224 input) from timm and
# size its classification head for `num_classes` outputs.
model = timm.create_model(
    "vit_base_patch16_224",
    pretrained=True,
    num_classes=num_classes,
)
# Move the weights to the selected device; as the last expression of the cell
# this also displays the module structure.
model.to(device)

VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (patch_drop): Identity()
  (norm_pre): Identity()
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=False)
        (norm): Identity(

In [12]:
# Training configuration: epoch budget, loss function and optimizer.
num_epochs = 10

# NOTE(review): `creterion` (sic) is the name the training loop references;
# rename both places together if the spelling is corrected.
creterion = nn.CrossEntropyLoss()

# Adam over every model parameter (the whole network is fine-tuned).
optimizer = optim.Adam(
    model.parameters(),
    lr=1e-4,
)

In [16]:
# Fine-tune the model for `num_epochs` epochs. After each epoch, print the mean
# per-batch training loss and the validation accuracy (in percent).
#
# Two checkpoints are written:
#   * "vit_model_best.pth" -- weights from the epoch with the highest
#     validation accuracy (validation accuracy can drop in later epochs while
#     the training loss keeps falling, so the last epoch is not always the
#     best one);
#   * "vit_model.pth"      -- weights after the final epoch.
best_val_accuracy = float("-inf")  # highest validation accuracy seen so far

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    running_loss = 0.0
    for data, target in train_loader:
        data, target = data.to(device), target.to(device)

        optimizer.zero_grad()
        outputs = model(data)
        loss = creterion(outputs, target)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Average of the per-batch losses over the epoch.
    avg_loss = running_loss / len(train_loader)
    print(f"Epoch: {epoch}, Loss: {avg_loss}")

    # ---- validation pass ----
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for data, target in vali_loader:
            data, target = data.to(device), target.to(device)

            outputs = model(data)
            # Predicted class = index of the largest logit in each row.
            predicted = outputs.argmax(dim=1)
            total += target.size(0)
            correct += (predicted == target).sum().item()

    val_accuracy = 100 * correct / total
    print(f"Validation accuracy: {val_accuracy}")

    # Keep the weights of the best-performing epoch; strict ">" means the
    # earliest epoch wins on ties.
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        torch.save(model.state_dict(), "vit_model_best.pth")

# Final-epoch weights (same artifact as before).
torch.save(model.state_dict(), "vit_model.pth")

Epoch: 0, Loss: 1.3490456906794703
Validation accuracy: 39.949748743718594
Epoch: 1, Loss: 1.2665997720643305
Validation accuracy: 46.13693467336683
Epoch: 2, Loss: 1.2152133734781339
Validation accuracy: 42.96482412060301
Epoch: 3, Loss: 1.175412607552418
Validation accuracy: 46.10552763819096
Epoch: 4, Loss: 1.1265816436940101
Validation accuracy: 46.29396984924623
Epoch: 5, Loss: 1.070739634372481
Validation accuracy: 48.14698492462311
Epoch: 6, Loss: 0.9977813722999451
Validation accuracy: 46.38819095477387
Epoch: 7, Loss: 0.9108682803971884
Validation accuracy: 42.71356783919598
Epoch: 8, Loss: 0.801191597378234
Validation accuracy: 40.16959798994975
Epoch: 9, Loss: 0.6862445496883823
Validation accuracy: 43.938442211055275
