In [55]:
from pathlib import Path
import shutil
import yaml
import zipfile

import torch
from ultralytics import settings
from ultralytics import YOLO

In [50]:
# Settings
YOLO_V8N_VERSION = "8.3.0"
DATASET_PATH = Path("/workspace/dataset")
DATASET_ZIP_PATH = DATASET_PATH / "Human Face Expression.v20i.yolov8.zip"
DATASET_YAML_PATH = DATASET_PATH / "data.yaml"
MODELS_PATH = Path("/workspace/models")
PRETRAINED_MODEL_PATH = MODELS_PATH / "yolo_pretrained.pt"
RUNS_PATH = Path("/workspace/runs")
LOG_PATH = RUNS_PATH / "detect"
BEST_MODEL_PATH = LOG_PATH / "emotion_detection/weights/best.pt"
OUTPUT_MODEL_NAME = "model.onnx"

In [28]:
# Training hyperparameters (https://docs.ultralytics.com/modes/train/#train-settings)
TRANSFER_EPOCHS = 50      # Number of full passes over training dataset during transfer learning
FINE_TUNING_EPOCHS = 50   # Number of full passes over training dataset during fine tuning
BATCH_SIZE = 16           # Number of training samples processed before updating parameters
IMG_SIZE = 640            # Target image size (assumes square aspect ratio)
FREEZE_LAYERS = 10        # Freeze the first N layers of the model (backbone) for transfer learning
NUM_WORKERS = 4           # Number of worker threads for data loading

In [4]:
# Determine if we should execute on the CPU or GPU
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"   # Apple Silicon (Metal Performance Shaders)
else:
    device = "cpu"

print(f"Using device: {device}")

Using device: cuda


## Download dataset

**Manual step required**: Because Roboflow's API key expires every 24 hours, the easiest way to download the dataset is to manually grab it from Roboflow's site.

1. Sign in or create an account on [Roboflow](https://roboflow.com/).
2. Head to the [Human Face Expression Recognition dataset (v20)](https://universe.roboflow.com/human-face-expression-recognition/human-face-expression/dataset/20) on Roboflow.
3. Click **YOLOv8** under *Popular Download Formats**.
4. In the pop-up window, select **Download dataset** and click **Continue**.
5. In the next window, ensure that *YOLOv8* is selected under *Image and Annotation Format*. Select **Download zip to computer** and click **Continue**.
6. Copy the *.zip* file to the *workspace/dataset/* directory.
7. Run the following cells to unzip the dataset and check it.

In [None]:
# Unzip dataset
with zipfile.ZipFile(DATASET_ZIP_PATH, "r") as zip_ref:
    zip_ref.extractall(DATASET_PATH)

In [7]:
# Load the YAML
with open(DATASET_YAML_PATH, "r") as f:
    data_config = yaml.safe_load(f)

# Display contents
print("data.yaml contents:")
for k, v in data_config.items():
    print(f"{k}: {v}")

data.yaml contents:
train: ../train/images
val: ../valid/images
test: ../test/images
nc: 7
names: ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']
roboflow: {'workspace': 'human-face-expression-recognition', 'project': 'human-face-expression', 'version': 20, 'license': 'CC BY 4.0', 'url': 'https://universe.roboflow.com/human-face-expression-recognition/human-face-expression/dataset/20'}


## Download model

In [17]:
# Download model weights (pre-trained on COCO)
!wget https://github.com/ultralytics/assets/releases/download/v{YOLO_V8N_VERSION}/yolov8n.pt -O {PRETRAINED_MODEL_PATH}

--2025-11-15 16:50:16--  https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8n.pt
Resolving github.com (github.com)... 140.82.112.4
Connecting to github.com (github.com)|140.82.112.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://release-assets.githubusercontent.com/github-production-release-asset/521807533/101dd207-c6a9-4ee0-bfeb-34e12d2b8f40?sp=r&sv=2018-11-09&sr=b&spr=https&se=2025-11-15T17%3A45%3A56Z&rscd=attachment%3B+filename%3Dyolov8n.pt&rsct=application%2Foctet-stream&skoid=96c2d410-5711-43a1-aedd-ab1947aa7ab0&sktid=398a6654-997b-47e9-b12b-9515b896b4de&skt=2025-11-15T16%3A45%3A11Z&ske=2025-11-15T17%3A45%3A56Z&sks=b&skv=2018-11-09&sig=FXsYHAvFt%2F%2BCU09YpNBpy1CgNQEyu0CCzh1aQbMUobA%3D&jwt=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmVsZWFzZS1hc3NldHMuZ2l0aHVidXNlcmNvbnRlbnQuY29tIiwia2V5Ijoia2V5MSIsImV4cCI6MTc2MzIyNTcxNiwibmJmIjoxNzYzMjI1NDE2LCJwYXRoIjoicmVsZWFzZWFzc2V0cHJvZHVjdGlvbi5ibG9iLmNvc

## Transfer Learning

In [32]:
# Enable TensorBoard logging
settings.update({'tensorboard': True})

# Load the YOLOv8-nano model (pretrained on COCO dataset)
model = YOLO(PRETRAINED_MODEL_PATH)

# Print model info
print(f"Model summary: {model.model.info()}")

YOLOv8n summary: 129 layers, 3,157,200 parameters, 0 gradients, 8.9 GFLOPs
Model summary: (129, 3157200, 0, 8.8575488)


In [33]:
# Train with frozen backbone (transfer learning)
results = model.train(
    data=DATASET_YAML_PATH,
    epochs=TRANSFER_EPOCHS,
    freeze=FREEZE_LAYERS,

    # Learning rates
    lr0=0.01,
    lrf=0.01,
    
    # Regularization
    dropout=0.3,           # Add dropout (0.0 to 0.5, start with 0.1)
    weight_decay=0.001,    # L2 regularization (default 0.0005, try 0.001)
    label_smoothing=0.1,   # Smooth hard labels (0.0 to 0.2)
    
    # Data augmentation
    hsv_h=0.02,
    hsv_s=0.8,
    hsv_v=0.5,
    degrees=5,
    translate=0.1,
    scale=0.5,
    fliplr=0.5,

    # Other hyperparameters
    imgsz=IMG_SIZE,
    batch=BATCH_SIZE,
    device=device,
    workers=NUM_WORKERS,
    project=LOG_PATH,
    name='emotion_detection',
    exist_ok=True,
    verbose=True,
)

Ultralytics 8.3.228 üöÄ Python-3.10.12 torch-2.5.1+cu121 CUDA:0 (NVIDIA GeForce RTX 3070 Laptop GPU, 8192MiB)
[34m[1mengine/trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=16, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, compile=False, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=/workspace/dataset/data.yaml, degrees=5, deterministic=True, device=0, dfl=1.5, dnn=False, dropout=0.3, dynamic=False, embed=None, epochs=50, erasing=0.4, exist_ok=True, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=10, half=False, hsv_h=0.02, hsv_s=0.8, hsv_v=0.5, imgsz=640, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.01, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.0, mode=train, model=/workspace/models/yolo_pretrained.pt, momentum=0.937, mosaic=1.0, multi_scale=False, name=emotion_detection, nbs=64, nms=False, opset=None, optimize=False, optimizer=auto

In [34]:
# Load the best model weights (highest mAP50-95) from transfer learning
yolo_fine_tune = YOLO(BEST_MODEL_PATH)

In [35]:
# Fine-tune with unfrozen backbone
results = yolo_fine_tune.train(
    data=DATASET_YAML_PATH,
    epochs=FINE_TUNING_EPOCHS,
    freeze=0,               # Unfreeze all layers
    
    # Lower learning rates
    lr0=0.0001,
    lrf=0.00001,
    
    # Keep regularization
    dropout=0.1,
    weight_decay=0.001,
    label_smoothing=0.1,
    
    # Augmentation
    hsv_h=0.02,
    hsv_s=0.8,
    hsv_v=0.5,
    degrees=5,
    translate=0.1,
    scale=0.5,
    
    # Other params
    imgsz=IMG_SIZE,
    batch=BATCH_SIZE,
    device=device,
    workers=NUM_WORKERS,
    project=LOG_PATH,
    name='emotion_detection_fine_tune',
    exist_ok=True,
    verbose=True
)

Ultralytics 8.3.228 üöÄ Python-3.10.12 torch-2.5.1+cu121 CUDA:0 (NVIDIA GeForce RTX 3070 Laptop GPU, 8192MiB)
[34m[1mengine/trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=16, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, compile=False, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=/workspace/dataset/data.yaml, degrees=5, deterministic=True, device=0, dfl=1.5, dnn=False, dropout=0.1, dynamic=False, embed=None, epochs=50, erasing=0.4, exist_ok=True, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=0, half=False, hsv_h=0.02, hsv_s=0.8, hsv_v=0.5, imgsz=640, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.0001, lrf=1e-05, mask_ratio=4, max_det=300, mixup=0.0, mode=train, model=/workspace/runs/detect/emotion_detection/weights/best.pt, momentum=0.937, mosaic=1.0, multi_scale=False, name=emotion_detection_fine_tune, nbs=64, nms=False, opset=None

## Test retrained model

In [38]:
# Load the best model
best_model = YOLO(BEST_MODEL_PATH)

# Run validation on the test/validation set
metrics = best_model.val(
    data=DATASET_YAML_PATH,
    split='test',  # Or 'val' depending on your data.yaml
    batch=16,
    imgsz=640,
    device=device,
    plots=True,  # Generate confusion matrix, PR curves, etc.
    save_json=False,
    conf=0.001,  # Lower confidence to get all predictions for metrics
    iou=0.6  # NMS IoU threshold
)

Ultralytics 8.3.228 üöÄ Python-3.10.12 torch-2.5.1+cu121 CUDA:0 (NVIDIA GeForce RTX 3070 Laptop GPU, 8192MiB)
Model summary (fused): 72 layers, 3,007,013 parameters, 0 gradients, 8.1 GFLOPs
[34m[1mval: [0mFast image access ‚úÖ (ping: 1.2¬±0.1 ms, read: 14.4¬±5.4 MB/s, size: 30.8 KB)
[K[34m[1mval: [0mScanning /workspace/dataset/test/labels.cache... 122 images, 0 backgrounds, 0 corrupt: 100% ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ 122/122 165.8Kit/s 0.0s
[K                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100% ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ 8/8 4.3it/s 1.9s0.4ss
                   all        122        329      0.682      0.748      0.754      0.486
                 angry         50         77      0.666      0.844      0.839      0.544
                  fear         25         48      0.811      0.715      0.848       0.55
                 happy         47         92      0.688      0.839      0.859      0.548
               neutral   

In [47]:
# Print overall metrics
print("\n=== Overall Metrics ===")
print(f"mAP50: {metrics.box.map50:.4f}")
print(f"mAP50-95: {metrics.box.map:.4f}")
print(f"Precision: {metrics.box.mp:.4f}")
print(f"Recall: {metrics.box.mr:.4f}")

# Print per-class metrics
print("\n=== Per-Class mAP50-95 ===")
for i, class_name in enumerate(class_names):
    print(f"{class_name:15s}: {metrics.box.maps[i]:.4f}")


=== Overall Metrics ===
mAP50: 0.7542
mAP50-95: 0.4862
Precision: 0.6822
Recall: 0.7475

=== Per-Class mAP50-95 ===
angry          : 0.5444
disgust        : 0.4862
fear           : 0.5505
happy          : 0.5483
neutral        : 0.3721
sad            : 0.2373
surprise       : 0.6650


## Convert to ONNX

In [61]:
# Export model
exported_path = best_model.export(
    format='onnx',
    imgsz=IMG_SIZE,
    simplify=True,    # Uses onnxslim for optimization
    dynamic=False,    # Disable dynamic input size (must be static resolution=imgsz)
    half=False,       # Use 32-bit floating point (half is 16-bit)
    nms=True,         # Include non-maximum suppression (NMS)
)

# Copy model to the output directory
model_dest = MODELS_PATH / OUTPUT_MODEL_NAME
shutil.copy(exported_path, model_dest)
print(f"Model saved to {model_dest}")

Ultralytics 8.3.228 üöÄ Python-3.10.12 torch-2.5.1+cu121 CPU (11th Gen Intel Core i7-11800H @ 2.30GHz)

[34m[1mPyTorch:[0m starting from '/workspace/runs/detect/emotion_detection/weights/best.pt' with input shape (1, 3, 640, 640) BCHW and output shape(s) (1, 300, 6) (6.0 MB)

[34m[1mONNX:[0m starting export with onnx 1.19.1 opset 19...
[34m[1mONNX:[0m slimming with onnxslim 0.1.74...
[34m[1mONNX:[0m export success ‚úÖ 1.1s, saved as '/workspace/runs/detect/emotion_detection/weights/best.onnx' (11.7 MB)

Export complete (1.2s)
Results saved to [1m/workspace/runs/detect/emotion_detection/weights[0m
Predict:         yolo predict task=detect model=/workspace/runs/detect/emotion_detection/weights/best.onnx imgsz=640  
Validate:        yolo val task=detect model=/workspace/runs/detect/emotion_detection/weights/best.onnx imgsz=640 data=/workspace/dataset/data.yaml  
Visualize:       https://netron.app
Model saved to /workspace/models/model.onnx


## Test with ONNX Runtime

In [62]:
# TODO: 
#  - do test inference--will need to apply letterboxing (and maybe NMS manually)
#  - figure out why learning rates don't match up (see Claude chat)