In [61]:
import mediapipe as mp
import cv2
import numpy as np
from tqdm import tqdm
import pandas as pd
import matplotlib as plt
import matplotlib.pyplot as plt

====================================================
# VIDEO UPLOAD
====================================================

In [62]:
video_path = "/Users/williamchalons/code/WiwiC/VERA/data/raw/myvideo.mp4"
cap = cv2.VideoCapture(video_path)

if not cap.isOpened():
    print("❌ Error loading video")
else:
    print("✅ Video loaded")

✅ Video loaded


====================================================
# MODEL LOADING + FACE DETECTION CONFIRMATION
====================================================

In [63]:
face_mesh = mp.solutions.face_mesh.FaceMesh(
    refine_landmarks=True,
    max_num_faces=1,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
)

cap = cv2.VideoCapture(video_path)
ret, frame = cap.read()

if not ret:
    print("❌ Could not read first frame")
else:
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = face_mesh.process(rgb)

    if results.multi_face_landmarks:
        print("✅ FaceMesh detected a face")
    else:
        print("❌ FaceMesh did NOT detect a face")

✅ FaceMesh detected a face


I0000 00:00:1764675144.243232       1 gl_context.cc:344] GL version: 2.1 (2.1 Metal - 90.5), renderer: Apple M2 Pro


====================================================
# 1 - FUNCTIONS
====================================================

### Head stability

In [64]:
def compute_head_center(lm):
    """
    Compute the 3D head center using the left and right ear landmarks.
    """
    left_ear  = np.array([lm[234].x, lm[234].y, lm[234].z])
    right_ear = np.array([lm[454].x, lm[454].y, lm[454].z])
    return (left_ear + right_ear) / 2

### Gaze direction consistency

In [65]:
def compute_iris_centers(lm):
    """
    Compute the 3D midpoint between the left and right iris centers.
    """
    left_iris = np.array([lm[468].x, lm[468].y, lm[468].z])
    right_iris = np.array([lm[473].x, lm[473].y, lm[473].z])
    return (left_iris + right_iris) / 2

def compute_face_center(lm):
    """
    Compute a stable 3D anchor point on the face, using the nose bridge landmark.
    """
    nose = np.array([lm[1].x, lm[1].y, lm[1].z])
    return nose

### Smile activation 

In [66]:
def compute_smile_activation(lm):
    """
    Compute smile activation as the Euclidean distance
    between left and right lip corners.
    """
    left = np.array([lm[61].x, lm[61].y, lm[61].z])
    right = np.array([lm[291].x, lm[291].y, lm[291].z])

    return np.linalg.norm(left - right)

### Video metrics extraction

In [67]:
features = []

fps = cap.get(cv2.CAP_PROP_FPS)
frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

prev_head_center = None
prev_gaze = None

for idx in tqdm(range(frame_count)):
    ret, frame = cap.read()
    if not ret:
        break

    timestamp = idx / fps
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = face_mesh.process(rgb)

    if results.multi_face_landmarks:
        lm = results.multi_face_landmarks[0].landmark

        # ----- HEAD STABILITY -----
        head_center = compute_head_center(lm)
        head_speed = np.linalg.norm(head_center - prev_head_center) if prev_head_center is not None else np.nan
        prev_head_center = head_center

        # ----- GAZE CONSISTENCY -----
        iris_center = compute_iris_centers(lm)
        face_center = compute_face_center(lm)

        gaze_vec = iris_center - face_center
        gaze_vec = gaze_vec / (np.linalg.norm(gaze_vec) + 1e-6)

        dg = np.linalg.norm(gaze_vec - prev_gaze) if prev_gaze is not None else np.nan
        prev_gaze = gaze_vec

        # ----- SMILE ACTIVATION -----
        smile = compute_smile_activation(lm)

    else:
        head_speed = np.nan
        dg = np.nan
        smile = np.nan

    # Append ALL features
    features.append({
        "timestamp": timestamp,
        "head_speed": head_speed,
        "gaze_dg": dg,
        "smile": smile
    })


 98%|█████████▊| 1740/1774 [00:12<00:00, 140.40it/s]


====================================================
# 2 - GROUP TIMESTAMPS OF THE VIDEO PER SEC
====================================================

In [68]:
df = pd.DataFrame(features).set_index("timestamp")
df["second"] = df.index.astype(int)
df

Unnamed: 0_level_0,head_speed,gaze_dg,smile,second
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.000000,,,0.036032,0
0.033378,0.003389,0.041809,0.036482,0
0.066755,0.003727,0.013080,0.036100,0
0.100133,0.001555,0.009895,0.036278,0
0.133510,0.001615,0.006441,0.036408,0
...,...,...,...,...
57.910042,0.000110,0.013984,0.039869,57
57.943420,0.000799,0.023378,0.039556,57
57.976797,0.000899,0.003705,0.038986,57
58.010175,0.000781,0.012703,0.038594,58


====================================================
# 3 - DEFINE SPEED VARIANCE and MEAN PER SECONDE
#### Speed variance -> Head activation and Gaze consistency
#### Mean -> smile activation
====================================================


In [69]:
# HEAD JITTER
jitter_head_1s = df.groupby("second")["head_speed"].var().fillna(0)

# GAZE JITTER
jitter_gaze_1s = df.groupby("second")["gaze_dg"].var().fillna(0)

# SMILE ACTIVATION
smile_1s = df.groupby("second")["smile"].mean().fillna(0)

====================================================
# 4 - DISPLAY OBSERVATIONS ON A 5 SEC WINDOWS/1SEC SLICING
====================================================


## 4.1 - Head Stability grouping

In [43]:
window_size = 5

jitter_5s = []
seconds = jitter_head_1s.index.values

for start in seconds:
    end = start + window_size
    win_vals = jitter_head_1s.loc[start:end]
    if len(win_vals) == window_size + 1:
        jitter_5s.append({
            "start_sec": start,
            "end_sec": end,
            "jitter_5s": win_vals.mean()
        })

df_jitter_5s = pd.DataFrame(jitter_5s)
df_jitter_5s

Unnamed: 0,start_sec,end_sec,jitter_5s
0,0,5,1.898334e-06
1,1,6,1.784012e-06
2,2,7,7.323681e-07
3,3,8,1.01907e-06
4,4,9,9.858126e-07
5,5,10,9.991822e-07
6,6,11,9.998472e-07
7,7,12,1.076596e-06
8,8,13,9.142071e-07
9,9,14,5.472128e-07


## 4.2 - Gaze consistency grouping

In [51]:
window_size = 5
seconds = jitter_gaze_1s.index.values

jitter_gaze_5s = []

for start in seconds:
    end = start + window_size
    win_vals = jitter_gaze_1s.loc[start:end]
    if len(win_vals) == window_size + 1:
        jitter_gaze_5s.append({
            "start_sec": start,
            "end_sec": end,
            "jitter_5s": win_vals.mean()
        })

df_gaze_5s = pd.DataFrame(jitter_gaze_5s)
df_gaze_5s


Unnamed: 0,start_sec,end_sec,jitter_5s
0,0,5,0.000177
1,1,6,0.000162
2,2,7,0.000162
3,3,8,0.000129
4,4,9,0.000118
5,5,10,0.000117
6,6,11,0.000115
7,7,12,0.000126
8,8,13,0.000101
9,9,14,0.000128


## 4.3 - Smile activation grouping

In [70]:
window_size = 5
smile_5s = []

for start in smile_1s.index:
    end = start + window_size
    win = smile_1s.loc[start:end]

    if len(win) == window_size + 1:
        smile_5s.append({
            "start_sec": start,
            "end_sec": end,
            "smile_5s": win.mean()
        })

df_smile_5s = pd.DataFrame(smile_5s)
df_smile_5s

Unnamed: 0,start_sec,end_sec,smile_5s
0,0,5,0.036289
1,1,6,0.035923
2,2,7,0.035207
3,3,8,0.034508
4,4,9,0.03443
5,5,10,0.034348
6,6,11,0.034505
7,7,12,0.034365
8,8,13,0.034282
9,9,14,0.034274


====================================================
# 5 - SCORING
====================================================


## 5.1 - Head Stability score

In [45]:
z = (df_jitter_5s["jitter_5s"] - df_jitter_5s["jitter_5s"].mean()) / df_jitter_5s["jitter_5s"].std()
df_jitter_5s["score"] = 1 / (1 + np.exp(z))

In [54]:
global_head_stability = df_jitter_5s["score"].mean()
print(global_head_stability)

0.5208867123438418


≥ 0.55 — Highly stable head posture (Excellent)
“Controlled, confident, composed delivery.”
Smooth, steady micro-movement
No visible shaking or restlessness
Strong presence on camera or stage
Common among experienced speakers, anchors, presenters
This is the “professional-level composure” zone.

0.45 – 0.55 — Natural and healthy stability (Good)
“Balanced movement: expressive but steady.”
Small natural adjustments
No perceptible instability
Not too stiff, not too mobile
Ideal for pitches and interviews
Most good communicators fall here.

0.30 – 0.45 — Mild instability (Weak)
“Subtle micro-fidgeting or restlessness.”
Slight shaking or frequent small corrections
Noticeable under stress
Does not ruin communication, but is visible
Typical of moderately nervous speakers.

≤ 0.30 — Unstable head movement (Poor)
“Distracting, restless, or shaky posture.”
Frequent jitter or micro-jerking
Strong nervous energy or lack of control
Reduces perceived credibility and presence
This is the zone needing corrective feedback.

## 5.2 - Gaze consistency score

In [56]:
z = (df_gaze_5s["jitter_5s"] - df_gaze_5s["jitter_5s"].mean()) / df_gaze_5s["jitter_5s"].std()
df_gaze_5s["score"] = 1 / (1 + np.exp(z))  # inverted sigmoid

In [57]:
global_gaze_consistency = df_gaze_5s["score"].mean()
print(global_gaze_consistency)

0.5128600847207508


≥ 0.55 — Highly controlled gaze
“Calm, intentional, confident eye behavior.”
Smooth gaze movement
Very few abrupt eye shifts
Audience perceives composure
Excellent for pitches and interviews
This is the “expert presenter” zone.

0.45 – 0.55 — Natural gaze behavior (GOOD)
“Healthy balance between expressiveness and control.”
Natural small adjustments
Not too still (robotic), not too jumpy
Very typical of competent speakers
Visually comfortable and credible
This is where most good speakers land.

0.30 – 0.45 — Slightly unstable gaze
“Occasional darting or scanning behavior.”
Moments of small rapid shifts
Eye instability noticeable under stress
Audience perceives mild distraction or nervousness
Not bad, but room for improvement.

≤ 0.30 — Unsteady or nervous gaze
“Frequent darting eye movements.”
High jitter
Looking around too often
Sudden direction changes
Perceived as discomfort, insecurity, or cognitive overload
This is the improvement-critical zone.

## 5.3 - Smile activation score

In [71]:
z = (df_smile_5s["smile_5s"] - df_smile_5s["smile_5s"].mean()) / df_smile_5s["smile_5s"].std()
df_smile_5s["score"] = 1 / (1 + np.exp(-z))  # normal sigmoid for "more = good"

In [72]:
global_smile_activation = df_smile_5s["score"].mean()
print (global_smile_activation)

0.4955510987779939


≥ 0.55 — Expressive, warm, approachable (Excellent)
“A strong, natural smile that signals openness and positive engagement.”
Visibly activated AU12 (lip-corner puller)
Contributes to warmth and rapport
Very effective in pitches where friendliness matters
Typical of charismatic speakers

0.45 – 0.55 — Balanced, natural smile (Good)
“Occasional or moderate smiling. Pleasant and appropriate.”
Natural social smiling
Not exaggerated
Signals comfort and confidence
Works well for most public speaking contexts
This is where most good communicators fall.

0.30 – 0.45 — Low smile activation (Weak)
“Neutral or minimally expressive. Can feel serious, tense, or distant.”
Little lip-corner activation
Can reduce warmth and perceived approachability
Not necessarily bad — depends on context
Common in nervous speakers or very formal tones

≤ 0.30 — Flat or absent smile (Poor)
“No visible smiling. Can feel closed-off, stressed, or unengaged.”
Almost no AU12 movement
Often correlates with discomfort or disengagement
Pitch may feel rigid or emotionally flat
This is improvement-critical if warmth or persuasion is the goal.

In [None]:

# ---------------------------------------------------------
# Mediapipe setup
# ---------------------------------------------------------
mp_face_mesh = mp.solutions.face_mesh
mp_drawing = mp.solutions.drawing_utils

face_mesh = mp_face_mesh.FaceMesh(
    max_num_faces=1,
    refine_landmarks=True,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
)

# ---------------------------------------------------------
# VIDEO SETUP
# ---------------------------------------------------------
cap = cv2.VideoCapture(video_path)
fps = cap.get(cv2.CAP_PROP_FPS)
width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

output_path = "/Users/williamchalons/code/WiwiC/VERA/data/processed/debug_facemesh_minimal.mp4"

out = cv2.VideoWriter(
    output_path,
    cv2.VideoWriter_fourcc(*'mp4v'),
    fps,
    (width, height)
)

# ---------------------------------------------------------
# LANDMARK GROUPS (COLOR-CODED)
# ---------------------------------------------------------
# Head stability (ears + optional nose)
HEAD_POINTS = [234, 454, 1]               # BLUE

# Gaze (iris centers + nose)
GAZE_POINTS = [468, 473, 1]               # YELLOW

# Facial expressiveness (eyebrows, eyelids, lips, jaw, cheek)
EXPRESS_POINTS = [
    55, 65, 52,         # left eyebrow
    285, 295, 282,      # right eyebrow
    159, 145,           # left eye (upper/lower)
    386, 374,           # right eye
    13, 14,             # upper/lower lip center
    61, 291,            # lip corners
    234, 454            # cheeks
]                                        # GREEN

# Smile activation (lip corners)
SMILE_POINTS = [61, 291]                 # RED

# Create color map
COLOR_HEAD = (255, 0, 0)     # Blue
COLOR_GAZE = (0, 255, 255)   # Yellow
COLOR_EXP  = (0, 255, 0)     # Green
COLOR_SMILE = (0, 0, 255)    # Red


# ---------------------------------------------------------
# PROCESS VIDEO FRAME BY FRAME
# ---------------------------------------------------------
while True:
    ret, frame = cap.read()
    if not ret:
        break

    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = face_mesh.process(rgb)

    annotated = frame.copy()

    if results.multi_face_landmarks:
        lm = results.multi_face_landmarks[0].landmark
        h, w, _ = frame.shape

        # ----------------------
        # Draw Head Stability Points (BLUE)
        # ----------------------
        for i in HEAD_POINTS:
            x = int(lm[i].x * w)
            y = int(lm[i].y * h)
            cv2.circle(annotated, (x, y), 3, COLOR_HEAD, -1)

        # ----------------------
        # Draw Gaze Points (YELLOW)
        # ----------------------
        for i in GAZE_POINTS:
            x = int(lm[i].x * w)
            y = int(lm[i].y * h)
            cv2.circle(annotated, (x, y), 3, COLOR_GAZE, -1)

        # ----------------------
        # Draw Expressiveness Points (GREEN)
        # ----------------------
        for i in EXPRESS_POINTS:
            x = int(lm[i].x * w)
            y = int(lm[i].y * h)
            cv2.circle(annotated, (x, y), 3, COLOR_EXP, -1)

        # ----------------------
        # Draw Smile Points (RED)
        # ----------------------
        for i in SMILE_POINTS:
            x = int(lm[i].x * w)
            y = int(lm[i].y * h)
            cv2.circle(annotated, (x, y), 4, COLOR_SMILE, -1)

    out.write(annotated)

cap.release()
out.release()

print(f"Saved debug video to: {output_path}")

I0000 00:00:1764619023.946490       1 gl_context.cc:344] GL version: 2.1 (2.1 Metal - 90.5), renderer: Apple M2 Pro


Saved debug video to: /Users/williamchalons/code/WiwiC/VERA/data/processed/debug_facemesh_minimal.mp4
