In [2]:
# Imports
from pathlib import Path
import torch
from ultralytics import YOLO
import yaml
import zipfile

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/tmp/Ultralytics/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


In [21]:
# Settings: release tag used in the weights download URL, plus dataset/model locations
YOLO_V8N_VERSION = "8.3.0"

# Dataset directory, the manually downloaded Roboflow zip, and the data.yaml it contains
DATASET_PATH = Path("/workspace/dataset")
DATASET_ZIP_PATH = DATASET_PATH.joinpath("Human Face Expression.v20i.yolov8.zip")
DATASET_YAML_PATH = DATASET_PATH.joinpath("data.yaml")

# Model directory, the downloaded pretrained checkpoint, and the state-dict export target
MODELS_PATH = Path("/workspace/models")
PRETRAINED_MODEL_PATH = MODELS_PATH.joinpath("yolo_pretrained.pt")
EXPORT_STATE_PATH = MODELS_PATH.joinpath("yolo_state_dict.pth")

In [22]:
# Constants
# NOTE(review): a "4 box values + 1 objectness score" layout matches older
# anchor-based YOLO heads. The YOLOv8 Detect head appears to emit
# 4 * reg_max box-distribution channels and no objectness score -- confirm
# before using this value to size the head output.
NUM_COORDS_AND_OBJ_SCORE = 5    # 4 coordinates (x, y, w, h) + 1 object score

In [4]:
# Pick the compute backend in order of preference: CUDA GPU, then Apple
# Silicon (Metal Performance Shaders), then plain CPU. The MPS check is only
# evaluated when CUDA is unavailable.
device = (
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

print(f"Using device: {device}")

Using device: cuda


## Download dataset

**Manual step required**: Because Roboflow's API key expires every 24 hours, the easiest way to download the dataset is to manually grab it from Roboflow's site.

1. Sign in or create an account on [Roboflow](https://roboflow.com/).
2. Head to the [Human Face Expression Recognition dataset (v20)](https://universe.roboflow.com/human-face-expression-recognition/human-face-expression/dataset/20) on Roboflow.
3. Click **YOLOv8** under *Popular Download Formats*.
4. In the pop-up window, select **Download dataset** and click **Continue**.
5. In the next window, ensure that *YOLOv8* is selected under *Image and Annotation Format*. Select **Download zip to computer** and click **Continue**.
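6. Copy the *.zip* file to the */workspace/dataset/* directory, keeping its original name (*Human Face Expression.v20i.yolov8.zip*), since that is the file name the notebook's settings expect.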
7. Run the following cells to unzip the dataset and check it.

In [16]:
# Extract the downloaded dataset archive into the dataset directory
with zipfile.ZipFile(DATASET_ZIP_PATH, mode="r") as dataset_archive:
    dataset_archive.extractall(path=DATASET_PATH)

In [17]:
# Load the YAML
# Read as UTF-8 explicitly instead of relying on the platform's default encoding.
with open(DATASET_YAML_PATH, "r", encoding="utf-8") as f:
    data_config = yaml.safe_load(f)

# Extract class names
class_names = data_config["names"]
num_classes = len(class_names)

# Fail early if the declared class count disagrees with the listed names:
# num_classes is what sizes the new classification layers later on.
declared_nc = data_config.get("nc")
if declared_nc is not None and declared_nc != num_classes:
    raise ValueError(
        f"data.yaml declares nc={declared_nc} but lists {num_classes} class names"
    )

# Display contents
print("data.yaml contents:")
for k, v in data_config.items():
    print(f"{k}: {v}")

data.yaml contents:
train: ../train/images
val: ../valid/images
test: ../test/images
nc: 7
names: ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']
roboflow: {'workspace': 'human-face-expression-recognition', 'project': 'human-face-expression', 'version': 20, 'license': 'CC BY 4.0', 'url': 'https://universe.roboflow.com/human-face-expression-recognition/human-face-expression/dataset/20'}


## Download model

In [4]:
# Download model weights (pre-trained on COCO)
!wget https://github.com/ultralytics/assets/releases/download/v{YOLO_V8N_VERSION}/yolov8n.pt -O {PRETRAINED_MODEL_PATH}

--2025-11-07 21:04:52--  https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8n.pt
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://release-assets.githubusercontent.com/github-production-release-asset/521807533/101dd207-c6a9-4ee0-bfeb-34e12d2b8f40?sp=r&sv=2018-11-09&sr=b&spr=https&se=2025-11-07T21%3A48%3A30Z&rscd=attachment%3B+filename%3Dyolov8n.pt&rsct=application%2Foctet-stream&skoid=96c2d410-5711-43a1-aedd-ab1947aa7ab0&sktid=398a6654-997b-47e9-b12b-9515b896b4de&skt=2025-11-07T20%3A47%3A38Z&ske=2025-11-07T21%3A48%3A30Z&sks=b&skv=2018-11-09&sig=fQO2r%2FqX1qD6%2FSVoaf7l8mrYl3HUUNRT4VHigbUDB1g%3D&jwt=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmVsZWFzZS1hc3NldHMuZ2l0aHVidXNlcmNvbnRlbnQuY29tIiwia2V5Ijoia2V5MSIsImV4cCI6MTc2MjU0OTc5MiwibmJmIjoxNzYyNTQ5NDkyLCJwYXRoIjoicmVsZWFzZWFzc2V0cHJvZHVjdGlvbi5ibG9iLmNvc

## Initialize model for transfer learning

We want to replace part of the detect head (final layers) with a new set of layers that output classification guesses (logits) for our new set of categories (face expressions) instead of the old COCO classes.

In [25]:
# Build the Ultralytics wrapper from the downloaded YOLOv8-nano checkpoint
# (pretrained on the COCO dataset)
yolo = YOLO(PRETRAINED_MODEL_PATH)

# Grab the underlying nn.Module and place it on the selected device (CPU or GPU)
model = yolo.model
model.to(device)

# Report the total parameter count, then dump the full layer structure
print(
    "Number of parameters:",
    sum(param.numel() for param in model.parameters()),
)
print(model)

Number of parameters: 3157200
DetectionModel(
  (model): Sequential(
    (0): Conv(
      (conv): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
      (act): SiLU(inplace=True)
    )
    (1): Conv(
      (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
      (act): SiLU(inplace=True)
    )
    (2): C2f(
      (cv1): Conv(
        (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (cv2): Conv(
        (conv): Conv2d(48, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): S

The detection head is composed of 3 parts:
* cv2 - outputs the box regression (decoded to x, y, w, h) as 4 × `reg_max` distribution bins per location
* cv3 - outputs the class logits: 80 categories for COCO
* dfl - converts discrete bins to continuous boxes (to refine box coordinates)

We want to replace the final layer of each cv3 branch with a new layer of the same kind that outputs the new class logits (7 face expressions). We'll keep cv2 and dfl the same.

In [39]:
# Get the detection head from the model
detect = model.model[-1]

# Update YOLO head metadata.
# The YOLOv8 Detect head has no objectness score: each location predicts
# 4 * reg_max box-distribution channels plus one logit per class, so the
# per-location output size is nc + 4 * reg_max (not nc + 5).
detect.nc = num_classes
detect.no = num_classes + detect.reg_max * 4
detect.nl = len(detect.cv3)  # number of detection layers
# NOTE(review): model-level metadata (e.g. class names stored on the model)
# still describes COCO after this cell -- confirm whether later steps need it
# updated as well.

# Replace the final 1x1 conv in each classification branch
for seq in detect.cv3:
    # Each branch ends in a 1x1 Conv2d that emits the class logits for one
    # detection scale
    last_conv = seq[-1]
    in_ch = last_conv.in_channels

    # Create new conv layer with one output channel per class
    new_conv = torch.nn.Conv2d(in_ch, num_classes, kernel_size=1, stride=1)

    # Initialize weights (Kaiming/He normal initialization)
    torch.nn.init.kaiming_normal_(new_conv.weight, mode='fan_out', nonlinearity='relu')
    if new_conv.bias is not None:
        torch.nn.init.zeros_(new_conv.bias)

    # A freshly constructed layer lives on the CPU, while the model may already
    # have been moved to the GPU; match the device/dtype of the layer being
    # replaced so the forward pass does not hit a device mismatch.
    new_conv = new_conv.to(device=last_conv.weight.device, dtype=last_conv.weight.dtype)

    # Replace the old layer
    seq[-1] = new_conv

# Show new cv3 part of detection head
print(detect.cv3)

ModuleList(
  (0): Sequential(
    (0): Conv(
      (conv): Conv2d(64, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(80, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
      (act): SiLU(inplace=True)
    )
    (1): Conv(
      (conv): Conv2d(80, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(80, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
      (act): SiLU(inplace=True)
    )
    (2): Conv2d(80, 7, kernel_size=(1, 1), stride=(1, 1))
  )
  (1): Sequential(
    (0): Conv(
      (conv): Conv2d(128, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(80, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
      (act): SiLU(inplace=True)
    )
    (1): Conv(
      (conv): Conv2d(80, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(80, eps=0.001, momentum=0.03, affine=True,

In [40]:
# TODO: Freeze all parameters except the new cv3 output layers (set requires_grad = False on everything else)