In [1]:
# #In lambdalabs jupyter lab instance, run these:
# pip install transformers
# pip install tf-keras
# pip install --upgrade "numpy<2"
# pip install datasets
# pip install --upgrade datasets pillow
# pip install --upgrade "accelerate>=0.26.0"
# #then check dependency warnings
# pip check
# #if any issues run
# pip install debugpy
# pip install --upgrade argcomplete
# sudo apt-get install python3-cairo

In [2]:
import os
import numpy as np
import torch
from torch import nn
from transformers import (
    AutoImageProcessor,
    AutoModelForImageClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from datasets import load_dataset
from torchvision import transforms as T
from PIL import Image
from functools import partial

  from .autonotebook import tqdm as notebook_tqdm
2025-03-21 14:23:33.029039: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-21 14:23:33.048527: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742567013.067686    2767 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742567013.073957    2767 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1742567013.091754    2767 computation_placer.cc:177] computation placer already r

In [7]:
# --------------------------
# 1. Reload Pretrained Model and Processor
# --------------------------
checkpoint = "trpakov/vit-face-expression"
processor = AutoImageProcessor.from_pretrained(checkpoint,use_fast=True)
model = AutoModelForImageClassification.from_pretrained(checkpoint)
# Load fine-tuned model weights
model.load_state_dict(torch.load("/home/ubuntu/MLexpressionsStorage/final_model_V1.pth"))
#Puts the model into evaluation model->disables dropout, batch norm to ensure consistent results
model.eval()

ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermed

In [8]:
# --------------------------
# 2. Prepare Dataset
# --------------------------
dataset = load_dataset("imagefolder", data_dir="/home/ubuntu/MLexpressionsStorage/img_datasets/combo_ferckja_dataset", split="train")

label_mapping = {
    'anger': 'Angry', 'contempt': 'Disgust', 'disgust': 'Disgust',
    'fear': 'Fear', 'happiness': 'Happy', 'sadness': 'Sad',
    'surprise': 'Surprise', 'neutral': 'Neutral'
}
num_mapping = {
    'Angry': 0, 'Disgust': 1, 'Fear': 2, 'Happy': 3,
    'Sad': 4, 'Surprise': 5, 'Neutral': 6
}

def reconcile_labels(example):
    # Convert label from integer to string if needed
    original_label = dataset.features['label'].int2str(example['label']).lower()
    # Map to pre-trained model's label set to get correct label
    mapped_label = label_mapping.get(original_label)
    #Converts label into a numerical value using num_mapping
    example['label'] = num_mapping.get(mapped_label, -1)
    return example

# Apply label reconciliation and filter out unrecognized labels
dataset = dataset.map(reconcile_labels)
dataset = dataset.filter(lambda x: x['label'] != -1)

Downloading data: 100%|██████████| 37081/37081 [00:01<00:00, 26916.95files/s] 
Computing checksums: 100%|██████████| 37081/37081 [00:29<00:00, 1278.60it/s]
Generating train split: 37081 examples [00:02, 15930.68 examples/s]
Map: 100%|██████████| 37081/37081 [00:03<00:00, 11617.98 examples/s]
Filter: 100%|██████████| 37081/37081 [03:14<00:00, 191.03 examples/s]


In [10]:
# --------------------------
# 3. Data Augmentation and Processing
# --------------------------
data_augment = T.Compose([
    T.RandomHorizontalFlip(),
    T.RandomRotation(10),
    T.ColorJitter(brightness=0.1, contrast=0.1)
])

def transform(example):
    image = example["image"]

    # Ensure image is properly loaded
    if not isinstance(image, Image.Image):
        image = Image.open(image)

    # Convert to RGB if it's in a different mode
    if image.mode != "RGB":
        image = image.convert("RGB")

    # Apply data augmentation
    image = data_augment(image)

    # Convert image to tensor format for model input, ensuring 
    # that dataset format aligns with ViT model's expected input
    inputs = processor(image, return_tensors="pt")
    inputs = {k: v.squeeze(0) for k, v in inputs.items()}
    inputs["labels"] = example["label"]

    return inputs

# Apply transformations and remove unnecessary columns
dataset = dataset.map(partial(transform), remove_columns=dataset.column_names)

Map: 100%|██████████| 37081/37081 [04:53<00:00, 126.26 examples/s]


In [11]:
# --------------------------
# 4. Train-Validation Split
# --------------------------
splits = dataset.train_test_split(test_size=0.2)
train_dataset = splits['train']
eval_dataset = splits['test']

In [13]:
# --------------------------
# 5. Training Arguments
# --------------------------
training_args = TrainingArguments(
    output_dir="./vit_retrained_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_dir="./logs",
    #label smoothing prevents overconfidence
    label_smoothing_factor=0.1,
    #cosine lr scheduler helps avoid overfitting
    lr_scheduler_type="cosine",
    #warmup ratio prevents instability @ start
    warmup_ratio=0.1
)

In [14]:
# --------------------------
# 6. Evaluation Metric
# --------------------------
#function computes accurace using armax() to get predicted labels
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = (predictions == labels).mean()
    return {"accuracy": accuracy}

In [15]:
# --------------------------
# 7. Trainer
# --------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    #early stopping if validation loss doesn't imporve for 2 epochs
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.809995,0.832008
2,0.848600,0.813848,0.835513
3,0.614600,0.832012,0.842659
4,0.501200,0.820087,0.848321
5,0.460100,0.814358,0.851827


TrainOutput(global_step=2320, training_loss=0.5851076454951846, metrics={'train_runtime': 13166.4743, 'train_samples_per_second': 11.265, 'train_steps_per_second': 0.176, 'total_flos': 1.1494126967676273e+19, 'train_loss': 0.5851076454951846, 'epoch': 5.0})

In [16]:
# --------------------------
# 8. Save Final Independent Model
# --------------------------
model.save_pretrained("/home/ubuntu/MLexpressionsStorage/vit_final_independent_V3")
processor.save_pretrained("/home/ubuntu/MLexpressionsStorage/vit_final_independent_V3")

['/home/ubuntu/MLexpressionsStorage/vit_final_independent_V3/preprocessor_config.json']

In [17]:
torch.save(model.state_dict(), '/home/ubuntu/MLexpressionsStorage/final_model_V3.pth')