# **Extract and Organize Data**

In [2]:
!apt-get install unrar -y


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
unrar is already the newest version (1:6.1.5-1ubuntu0.1).
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.


In [3]:
!unrar x /content/train.rar /content/train/
!unrar x /content/test.rar /content/test/


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Extracting  /content/test/test/test/happy/PrivateTest_22154496.jpg        65%  OK 
Extracting  /content/test/test/test/happy/PrivateTest_2260082.jpg         65%  OK 
Extracting  /content/test/test/test/happy/PrivateTest_22689070.jpg        65%  OK 
Extracting  /content/test/test/test/happy/PrivateTest_22929697.jpg        65%  OK 
Extracting  /content/test/test/test/happy/PrivateTest_22938435.jpg        65%  OK 
Extracting  /content/test/test/test/happy/PrivateTest_22979298.jpg        65%  OK 
Extracting  /content/test/test/test/happy/PrivateTest_2309763.jpg         65%  OK 
Extracting  /content/test/test/test/happy/PrivateTest_23250597.jpg        65%  OK 
Extracting  /content/test/test/test/happy/PrivateTest_23282756.jpg        65%  OK 
Extracting  /content/test/test/test/happy/PrivateTest_23344981.jpg        65%  OK 
Extracting  /co

In [4]:
!ls /content/train
!ls /content/test


train
test


**Load Pretrained Vision Transformer (ViT) Model for Image Classification**

In [None]:
!pip install transformers datasets torchvision --quiet


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m110.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m89.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m58.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from torchvision import transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader

# Standard ViT image size
IMAGE_SIZE = 224

transform = transforms.Compose([
    transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5]*3, std=[0.5]*3),  # Normalize to [-1, 1]
])

# Load datasets
train_dataset = ImageFolder("/content/facial_expression_dataset/train", transform=transform)
test_dataset = ImageFolder("/content/facial_expression_dataset/test", transform=transform)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)


In [None]:
from transformers import ViTForImageClassification, ViTFeatureExtractor
import torch

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load pretrained ViT with 7 output labels
model = ViTForImageClassification.from_pretrained(
    "google/vit-base-patch16-224-in21k",
    num_labels=7  # Number of emotion classes
).to(device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**Train the Vision Transformer Model**

In [None]:
import torch.nn as nn
from torch.optim import AdamW  # <-- Correct import

# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=5e-5)


In [None]:
from tqdm import tqdm

def train(model, train_loader, criterion, optimizer, device, num_epochs=5):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        correct = 0
        total = 0
        loop = tqdm(train_loader, desc=f"Epoch [{epoch+1}/{num_epochs}]")

        for images, labels in loop:
            images, labels = images.to(device), labels.to(device)

            # Forward pass
            outputs = model(pixel_values=images).logits
            loss = criterion(outputs, labels)

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Statistics
            running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            loop.set_postfix(loss=loss.item(), acc=100 * correct / total)

    print("Training complete.")


In [None]:
train(model, train_loader, criterion, optimizer, device, num_epochs=3)


Epoch [1/3]: 100%|██████████| 898/898 [17:40<00:00,  1.18s/it, acc=61.2, loss=0.488]
Epoch [2/3]: 100%|██████████| 898/898 [17:30<00:00,  1.17s/it, acc=73, loss=0.619]
Epoch [3/3]: 100%|██████████| 898/898 [17:31<00:00,  1.17s/it, acc=81.5, loss=0.163]

Training complete.





In [None]:
from transformers import ViTFeatureExtractor

feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]



In [None]:
# Save model and tokenizer
model.save_pretrained("vit_emotion_model")
feature_extractor.save_pretrained("vit_emotion_model")


['vit_emotion_model/preprocessor_config.json']

# **Evaluate on the Test Set**

In [None]:
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from transformers import ViTForImageClassification, ViTImageProcessor  # feature_extractor is now ViTImageProcessor
import os

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define paths
model_dir = "/content/vit_emotion_model"
test_dir = "/content/facial_expression_dataset/test"

# Load saved model and processor
model = ViTForImageClassification.from_pretrained(model_dir).to(device)
feature_extractor = ViTImageProcessor.from_pretrained(model_dir)

# Define transform using the feature extractor
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)
])

# Load test dataset
test_dataset = datasets.ImageFolder(test_dir, transform=transform)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Evaluation function
def evaluate(model, dataloader, device):
    model.eval()
    correct = 0
    total = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for images, labels in dataloader:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            _, preds = torch.max(outputs.logits, 1)

            correct += (preds == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total * 100
    print(f"✅ Test Accuracy: {accuracy:.2f}%")

# Run evaluation
evaluate(model, test_loader, device)


✅ Test Accuracy: 68.88%


In [5]:
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from transformers import ViTModel, ViTFeatureExtractor
import numpy as np
from tqdm import tqdm

# Step 1: Load ViT Pretrained Model and Feature Extractor
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
vit_model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
vit_model.eval().cuda()  # Move to GPU if available

# Step 2: Define transform (same as ViT expects)
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std),
])

# Step 3: Load datasets
train_dataset = datasets.ImageFolder('/content/train', transform=transform)
test_dataset = datasets.ImageFolder('/content/test', transform=transform)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Step 4: Feature extraction function
def extract_vit_features(model, dataloader):
    features = []
    labels = []

    with torch.no_grad():
        for images, lbls in tqdm(dataloader):
            images = images.cuda()
            outputs = model(pixel_values=images)['last_hidden_state'][:, 0, :]  # CLS token
            features.append(outputs.cpu().numpy())
            labels.append(lbls.numpy())

    return np.concatenate(features), np.concatenate(labels)

# Step 5: Extract features from train and test sets
train_features, train_labels = extract_vit_features(vit_model, train_loader)
test_features, test_labels = extract_vit_features(vit_model, test_loader)

# Step 6: Save as .npy
np.save("vit_train_features.npy", train_features)
np.save("vit_train_labels.npy", train_labels)
np.save("vit_test_features.npy", test_features)
np.save("vit_test_labels.npy", test_labels)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

100%|██████████| 898/898 [05:53<00:00,  2.54it/s]
100%|██████████| 449/449 [02:58<00:00,  2.52it/s]


In [6]:
from google.colab import files

files.download("vit_train_features.npy")
files.download("vit_train_labels.npy")
files.download("vit_test_features.npy")
files.download("vit_test_labels.npy")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>