### Importing the libraries

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import ViTModel
import get_training_test_set as gts
import numpy as np
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm
2025-11-21 17:34:59.215204: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-11-21 17:34:59.254890: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-11-21 17:35:00.215439: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.


### Set the device

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


### Importing the dataset

In [3]:
# Get training and test set
X_train, X_test, y_train, y_test = gts.get_data()
X_train, X_test, y_train, y_test = np.array(X_train), np.array(X_test), np.array(y_train), np.array(y_test)

# Convert to torch tensors
X_train = torch.FloatTensor(X_train)
X_test = torch.FloatTensor(X_test)
y_train = torch.FloatTensor(y_train).unsqueeze(1)  # Add dimension for binary classification
y_test = torch.FloatTensor(y_test).unsqueeze(1)

# Convert to CHW (Channels, Height, Width) from HWC (Height, Width, Channels)
X_train = X_train.permute(0, 3, 1, 2)
X_test = X_test.permute(0, 3, 1, 2)

# Normalize to [0, 1]
if X_train.max() > 1.0:
    X_train = X_train / 255.0
    X_test = X_test / 255.0

# Create DataLoaders
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

### Building the model

In [4]:
### Building the model with Vision Transformer
class ViTForBinaryClassification(nn.Module):
    def __init__(self, pretrained_model_name='google/vit-base-patch16-224'):
        super(ViTForBinaryClassification, self).__init__()

        # Load pretrained ViT
        self.vit = ViTModel.from_pretrained(pretrained_model_name)

        # Get hidden size from ViT config
        hidden_size = self.vit.config.hidden_size

        # Classification head
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 16),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(16, 1),
            nn.Sigmoid()
        )

    def forward(self, pixel_values):
        # Get ViT outputs
        outputs = self.vit(pixel_values=pixel_values)

        # Use the [CLS] token representation (first token)
        cls_output = outputs.last_hidden_state[:, 0]

        # Pass through classifier
        logits = self.classifier(cls_output)

        return logits


# Initialize model
model = ViTForBinaryClassification()
model = model.to(device)

print(f"\nModel created with {sum(p.numel() for p in model.parameters())} parameters")
print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model created with 86401569 parameters
Trainable parameters: 86401569


### Train the top layers of the model

In [5]:
# Freeze all ViT layers
for param in model.vit.parameters():
    param.trainable = False

# Define loss and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.classifier.parameters(), lr=0.001)

print("\n=== Training Phase 1: Fine-tuning classifier only ===")

# Training function
def train_epoch(model, dataloader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for inputs, labels in tqdm(dataloader, desc="Training"):
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        predicted = (outputs > 0.5).float()
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    epoch_loss = running_loss / len(dataloader)
    epoch_acc = 100 * correct / total
    return epoch_loss, epoch_acc

# Validation function
def validate(model, dataloader, criterion, device):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            running_loss += loss.item()
            predicted = (outputs > 0.5).float()
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    epoch_loss = running_loss / len(dataloader)
    epoch_acc = 100 * correct / total
    return epoch_loss, epoch_acc

# Train for 20 epochs
for epoch in range(5):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
    val_loss, val_acc = validate(model, test_loader, criterion, device)
    print(f"Epoch {epoch+1}/5 - Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}% - Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%")


=== Training Phase 1: Fine-tuning classifier only ===


Training: 100%|██████████| 50/50 [02:03<00:00,  2.48s/it]


Epoch 1/20 - Train Loss: 0.2576, Train Acc: 90.54% - Val Loss: 0.0512, Val Acc: 98.41%


Training: 100%|██████████| 50/50 [02:02<00:00,  2.46s/it]


Epoch 2/20 - Train Loss: 0.1079, Train Acc: 96.43% - Val Loss: 0.0271, Val Acc: 99.60%


Training: 100%|██████████| 50/50 [02:02<00:00,  2.46s/it]


Epoch 3/20 - Train Loss: 0.0837, Train Acc: 97.12% - Val Loss: 0.0196, Val Acc: 100.00%


Training: 100%|██████████| 50/50 [02:02<00:00,  2.46s/it]


Epoch 4/20 - Train Loss: 0.0549, Train Acc: 98.43% - Val Loss: 0.0147, Val Acc: 99.60%


Training: 100%|██████████| 50/50 [02:02<00:00,  2.46s/it]


Epoch 5/20 - Train Loss: 0.0433, Train Acc: 98.87% - Val Loss: 0.0116, Val Acc: 99.60%


### Visualise the layers

In [6]:
print("\n=== Model Structure ===")
for name, module in model.named_children():
    print(f"{name}: {module.__class__.__name__}")
    if name == 'vit':
        print(f"  - ViT has {len(list(module.parameters()))} parameter tensors")
        print(f"  - Encoder has {len(model.vit.encoder.layer)} transformer blocks")


=== Model Structure ===
vit: ViTModel
  - ViT has 200 parameter tensors
  - Encoder has 12 transformer blocks
classifier: Sequential


### Fine-tuning the transformer layers

In [8]:
print("\n=== Training Phase 2: Fine-tuning last transformer blocks ===")

# Unfreeze the ViT
for param in model.vit.parameters():
    param.requires_grad = True

# Freeze embeddings and early encoder layers
# Keep only the last 3 transformer blocks trainable
num_blocks = len(model.vit.encoder.layer)
blocks_to_train = 3

for param in model.vit.embeddings.parameters():
    param.requires_grad = False

for i, block in enumerate(model.vit.encoder.layer):
    if i < num_blocks - blocks_to_train:
        for param in block.parameters():
            param.requires_grad = False

print(f"Training last {blocks_to_train} transformer blocks out of {num_blocks}")
print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")

# Use SGD with lower learning rate for fine-tuning
optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()),
                      lr=0.00001, momentum=0.9)

# Fine-tune for 20 more epochs
for epoch in range(5):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
    val_loss, val_acc = validate(model, test_loader, criterion, device)
    print(f"Epoch {epoch+1}/5 - Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}% - Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%")


=== Training Phase 2: Fine-tuning last transformer blocks ===
Training last 3 transformer blocks out of 12
Trainable parameters: 21868065


Training: 100%|██████████| 50/50 [00:57<00:00,  1.14s/it]


Epoch 1/5 - Train Loss: 0.0338, Train Acc: 99.25% - Val Loss: 0.0116, Val Acc: 99.60%


Training: 100%|██████████| 50/50 [00:56<00:00,  1.14s/it]


Epoch 2/5 - Train Loss: 0.0337, Train Acc: 99.25% - Val Loss: 0.0116, Val Acc: 99.60%


Training: 100%|██████████| 50/50 [00:56<00:00,  1.14s/it]


Epoch 3/5 - Train Loss: 0.0318, Train Acc: 99.19% - Val Loss: 0.0116, Val Acc: 99.60%


Training: 100%|██████████| 50/50 [00:56<00:00,  1.14s/it]


Epoch 4/5 - Train Loss: 0.0362, Train Acc: 98.87% - Val Loss: 0.0116, Val Acc: 99.60%


Training: 100%|██████████| 50/50 [00:56<00:00,  1.14s/it]


Epoch 5/5 - Train Loss: 0.0336, Train Acc: 99.31% - Val Loss: 0.0116, Val Acc: 99.60%


### Saving the model

In [9]:
# Save the entire model
torch.save(model.state_dict(), "face_recognition_vit_pytorch_V1.pth")
print("\nModel saved successfully!")


Model saved successfully!
