## **0. Environment Setup**




In [9]:
# from google.colab import drive
# drive.mount('/content/drive')

In [10]:
# Dataset path
DATASET_PATH = "../data/"  # contains folders 1,2,...,8

# !pip install mediapipe opencv-python scikit-learn

In [None]:
import os
import cv2
import numpy as np
import mediapipe as mp
import pickle

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
# from google.colab.patches import cv2_imshow
import random

mp_pose = mp.solutions.pose
pose = mp_pose.Pose(
    static_image_mode=False,
    model_complexity=1,
    enable_segmentation=False,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
)

# Add FaceMesh for emotions
mp_face = mp.solutions.face_mesh
face_mesh = mp_face.FaceMesh(
    static_image_mode=False,
    max_num_faces=1,
    refine_landmarks=True,  # includes landmarks for eyes and lips
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
)


I0000 00:00:1766785325.278750 12199637 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 90.5), renderer: Apple M1 Pro
I0000 00:00:1766785325.290223 12199637 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 90.5), renderer: Apple M1 Pro


W0000 00:00:1766785325.293855 12201609 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


In [12]:
def extract_face_features(rgb_frame):
    """
    Extract facial emotion features using FaceMesh.
    """
    result = face_mesh.process(rgb_frame)
    N_FACE_FEATURES = 20
    
    if not result.multi_face_landmarks:
        return np.zeros(N_FACE_FEATURES)
    
    face_landmarks = result.multi_face_landmarks[0].landmark
    nose = face_landmarks[1]
    left_eye = face_landmarks[33]
    right_eye = face_landmarks[263]
    eye_dist = max(np.sqrt((left_eye.x - right_eye.x)**2 + (left_eye.y - right_eye.y)**2), 0.01)
    
    features = []

    features.extend([(face_landmarks[159].y - face_landmarks[145].y) / eye_dist,
                     (face_landmarks[386].y - face_landmarks[374].y) / eye_dist])

    features.extend([(face_landmarks[105].y - face_landmarks[159].y) / eye_dist,
                     (face_landmarks[334].y - face_landmarks[386].y) / eye_dist])

    mouth_open = (face_landmarks[14].y - face_landmarks[13].y) / eye_dist
    mouth_width = (face_landmarks[308].x - face_landmarks[78].x) / eye_dist
    features.extend([mouth_open, mouth_width])
    mouth_center_y = (face_landmarks[13].y + face_landmarks[14].y) / 2
    features.extend([(mouth_center_y - face_landmarks[78].y) / eye_dist,
                     (mouth_center_y - face_landmarks[308].y) / eye_dist])

    features.extend([(left_eye.y - right_eye.y) / eye_dist,
                     (nose.x - (left_eye.x + right_eye.x) / 2) / eye_dist])
    features.append((face_landmarks[13].y - nose.y) / eye_dist)
    for idx in [33, 263, 61, 291, 199, 175, 152, 10, 234]:
        features.append((face_landmarks[idx].x - nose.x) / eye_dist)
    
    return np.array(features[:N_FACE_FEATURES])


def extract_pose_vector(frame):
    """
    Takes BGR frame, returns pose + face + hand features vector.
    """
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    result = pose.process(rgb)

    if not result.pose_landmarks:
        return None

    landmarks = result.pose_landmarks.landmark
    
    LEFT_SHOULDER, RIGHT_SHOULDER = 11, 12
    LEFT_HIP, RIGHT_HIP = 23, 24
    LEFT_WRIST, RIGHT_WRIST = 15, 16
    
    center_x = (landmarks[LEFT_SHOULDER].x + landmarks[RIGHT_SHOULDER].x + 
                landmarks[LEFT_HIP].x + landmarks[RIGHT_HIP].x) / 4
    center_y = (landmarks[LEFT_SHOULDER].y + landmarks[RIGHT_SHOULDER].y + 
                landmarks[LEFT_HIP].y + landmarks[RIGHT_HIP].y) / 4
    
    shoulder_dist = max(np.sqrt(
        (landmarks[LEFT_SHOULDER].x - landmarks[RIGHT_SHOULDER].x) ** 2 +
        (landmarks[LEFT_SHOULDER].y - landmarks[RIGHT_SHOULDER].y) ** 2
    ), 0.01)

    vec = []
    for lm in landmarks:
        norm_x = (lm.x - center_x) / shoulder_dist
        norm_y = (lm.y - center_y) / shoulder_dist
        vec.extend([norm_x, norm_y, lm.visibility])

    # Hand visibility and position features
    left_wrist_vis = landmarks[LEFT_WRIST].visibility
    right_wrist_vis = landmarks[RIGHT_WRIST].visibility
    left_hand_visible = 1.0 if left_wrist_vis > 0.5 else 0.0
    right_hand_visible = 1.0 if right_wrist_vis > 0.5 else 0.0
    any_hand_visible = 1.0 if (left_wrist_vis > 0.5 or right_wrist_vis > 0.5) else 0.0
    left_wrist_above_shoulder = 1.0 if landmarks[LEFT_WRIST].y < landmarks[LEFT_SHOULDER].y else 0.0
    right_wrist_above_shoulder = 1.0 if landmarks[RIGHT_WRIST].y < landmarks[RIGHT_SHOULDER].y else 0.0
    left_hand_dist = np.sqrt((landmarks[LEFT_WRIST].x - center_x)**2 + 
                             (landmarks[LEFT_WRIST].y - center_y)**2) / shoulder_dist
    right_hand_dist = np.sqrt((landmarks[RIGHT_WRIST].x - center_x)**2 + 
                              (landmarks[RIGHT_WRIST].y - center_y)**2) / shoulder_dist
    
    vec.extend([left_hand_visible, right_hand_visible, any_hand_visible,
                left_wrist_above_shoulder, right_wrist_above_shoulder,
                left_hand_dist, right_hand_dist])


    face_features = extract_face_features(rgb)
    vec.extend(face_features)

    return np.array(vec)


W0000 00:00:1766785325.304293 12201610 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


In [13]:
print("DATASET_PATH content:")
!ls "$DATASET_PATH"


W0000 00:00:1766785325.381143 12201604 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1766785325.400707 12201607 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


DATASET_PATH content:
[34m1[m[m         [34m3[m[m         [34m5[m[m         [34m7[m[m         README.md
[34m2[m[m         [34m4[m[m         [34m6[m[m         [34m8[m[m


In [14]:
from torchvision import transforms
from PIL import Image

# Create augmentations
augmentations = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),      # Horizontal flip
    transforms.RandomRotation(degrees=15),       # Rotate +- 15 deg
    transforms.ColorJitter(brightness=0.2),      # Jitter brightness
    transforms.RandomResizedCrop(size=(224, 224), scale=(0.7, 1.0)) # Crop and resize
])

In [15]:
X = []
y = []

video_ext = (".mp4", ".avi", ".mov", ".mkv")

for class_name in sorted(os.listdir(DATASET_PATH)):
    class_path = os.path.join(DATASET_PATH, class_name)
    if not os.path.isdir(class_path):
        continue

    print(f"→ Class {class_name}")

    for fname in sorted(os.listdir(class_path)):
        if not fname.lower().endswith(video_ext):
            continue

        video_path = os.path.join(class_path, fname)
        print(f"    Video: {fname}")

        cap = cv2.VideoCapture(video_path)
        frame_id = 0

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # 1. Take every 2nd frame
            if frame_id % 2 == 0:
                
                # --- AUGMENTATION BLOCK ---
                # Copy frame for augmentation
                aug_frame = frame.copy()


                # Prevents model from bias to one side
                if random.random() > 0.5:
                    aug_frame = cv2.flip(aug_frame, 1)


                angle = random.uniform(-15, 15)
                h, w = aug_frame.shape[:2]
                M = cv2.getRotationMatrix2D((w // 2, h // 2), angle, 1.0)
                aug_frame = cv2.warpAffine(aug_frame, M, (w, h))


                brightness = random.uniform(0.7, 1.3)
                aug_frame = cv2.convertScaleAbs(aug_frame, alpha=brightness, beta=0)
                
                # --- VECTOR EXTRACTION ---
                # Process augmented frame
                vec = extract_pose_vector(aug_frame)
                
                if vec is not None:
                    X.append(vec)
                    y.append(class_name)

            frame_id += 1

        cap.release()

X = np.array(X)
y = np.array(y)

print("Total poses:", len(X))
print("Classes:", np.unique(y))


→ Class 1
    Video: 1_1.mov


W0000 00:00:1766785325.819780 12201603 landmark_projection_calculator.cc:186] Using NORM_RECT without IMAGE_DIMENSIONS is only supported for the square ROI. Provide IMAGE_DIMENSIONS or use PROJECTION_MATRIX.


    Video: 1_2.mov
    Video: 1_3.mov
    Video: 1_4.MOV
    Video: 1_5.mp4
→ Class 2
    Video: 2_1.mov
    Video: 2_2.mov
    Video: 2_3.mov
    Video: 2_4.MOV
    Video: 2_5.mp4
→ Class 3
    Video: 3_1.mov
    Video: 3_2.mov
    Video: 3_3.mov
    Video: 3_4.MOV
    Video: 3_5.mp4
→ Class 4
    Video: 4_1.mov
    Video: 4_2.mov
    Video: 4_3.mov
    Video: 4_4.MOV
    Video: 4_5.mp4
→ Class 5
    Video: 5_1.mov
    Video: 5_2.mov
    Video: 5_3.mov
    Video: 5_4.MOV
    Video: 5_5.mp4
→ Class 6
    Video: 6_1.mov
    Video: 6_2.mov
    Video: 6_3.mov
    Video: 6_4.MOV
    Video: 6_5.mp4
→ Class 7
    Video: 7_1.mov
    Video: 7_2.mov
    Video: 7_3.mov
    Video: 7_4.mov
    Video: 7_5.mp4
→ Class 8
    Video: 8_1.mov
    Video: 8_2.mov
    Video: 8_3.mov
    Video: 8_4.mov
    Video: 8_5.mov
Total poses: 4259
Classes: ['1' '2' '3' '4' '5' '6' '7' '8']


In [16]:
if len(np.unique(y)) < 2:
    raise ValueError(f"Only one class found: {np.unique(y)}. Check folder structure and data.")

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# PCA: keep 98% variance for better quality
pca = PCA(n_components=0.98)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)
print(f"PCA: {X_train_scaled.shape[1]} -> {X_train_pca.shape[1]} components (98% variance)")


PCA: 126 -> 24 components (98% variance)


In [17]:
# Best parameters found: C=200, gamma=0.05
clf = SVC(C=200, gamma=0.05, kernel='rbf', probability=True)
clf.fit(X_train_pca, y_train)

y_pred = clf.predict(X_test_pca)
print("Quality report:\n")
print(classification_report(y_test, y_pred))
print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))


Quality report:

              precision    recall  f1-score   support

           1       0.96      0.88      0.92       146
           2       0.97      0.97      0.97        90
           3       0.96      0.96      0.96       129
           4       1.00      0.92      0.96        61
           5       0.96      0.97      0.96       117
           6       0.92      0.97      0.95       137
           7       1.00      0.96      0.98       134
           8       0.72      1.00      0.84        38

    accuracy                           0.95       852
   macro avg       0.94      0.95      0.94       852
weighted avg       0.95      0.95      0.95       852

Confusion matrix:
[[129   2   0   0   1   8   0   6]
 [  0  87   0   0   0   1   0   2]
 [  0   0 124   0   3   2   0   0]
 [  0   0   0  56   0   0   0   5]
 [  1   0   3   0 113   0   0   0]
 [  3   0   0   0   0 133   0   1]
 [  1   1   2   0   1   0 128   1]
 [  0   0   0   0   0   0   0  38]]


In [18]:
model = {
    "clf": clf,
    "scaler": scaler,
    "pca": pca,
    "classes": sorted(list(np.unique(y)))
}

model_path = "../models/model.pkl"
with open(model_path, "wb") as f:
    pickle.dump(model, f)

print("Model saved at:", model_path)


Model saved at: ../models/model.pkl
