<a href="https://colab.research.google.com/github/Sriganesaniyer/Real-Time-Cognitive-State-Classification-in-Safety-Critical-Environments/blob/main/Real_Time_Cognitive_State_Classification_Using_Facial_Landmark_Dynamics_in_Safety_Critical_Environments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Phase 1 – Data Acquisition
# Features to capture per frame / time window:

# 1. Eye Features
#    - EAR (Eye Aspect Ratio)                   : eye openness
#    - Blink Count (per second)                 : number of blinks
#    - Blink Duration (seconds)                 : average blink duration
#    - IBI (Inter-Blink Interval, seconds)     : time between blinks
#    - PERCLOS (Percentage of Eye Closure)     : eye closure percentage
#    - Gaze Position (gaze_x, gaze_y)          : attention tracking

# 2. Eyebrow Features
#    - Brow Raise Frequency                      : stress/surprise indicator
#    - Brow Furrow Intensity                     : cognitive tension/stress

# 3. Lip / Mouth Features
#    - Mouth Aspect Ratio (MAR)                 : mouth openness
#    - Lip Corner Displacement                  : emotional expression / engagement
#    - Speaking Probability                     : speech activity / engagement

# 4. Jaw Features
#    - Jaw Variance                             : clenching or tension
#    - Mouth Open Duration                       : stress or speaking intensity

# 5. Head Pose Features
#    - Pitch, Yaw                         : head orientation; indicates attention vs. distraction

# 6. Temporal Meta
#    - Window Start Time                         : beginning of the time segment
#    - Window End Time                           : end of the time segment

# Target:
#    - Cognitive State (5 classes):
#        1. Alert
#        2. Fatigued
#        3. Stressed
#        4. Distracted
#        5. Neutral

# Notes:
# - Each time window (e.g., 10 seconds) aggregates features (mean, variance, counts, etc.)
# - Suitable for full-face cognitive state classification using LSTM or other temporal models

In [None]:
# Phase 2 – Preprocessing
# 1. Preprocessing steps:
#    - Normalize feature values (e.g., EAR, MAR, gaze coordinates)
#      to range [0,1] or standardize using z-score.
#    - Aggregate frame-level features into fixed-size time windows (e.g., 10 seconds).
#    - Compute derived metrics per window:
#        * Eye: EAR mean, blink rate, blink duration, PERCLOS, IBI
#        * Eyebrows: raise frequency, furrow intensity
#        * Lips: MAR, lip corner displacement, speaking probability
#        * Jaw: jaw variance, mouth open duration
#        * Head Pose: pitch, yaw
#    - Store preprocessed dataset in a structured format (CSV or HDF5) for LSTM input.
#    - Split data into training, validation, and test sets (typical 70:15:15 split).


In [None]:
import pandas as pd

# Read the cognitive-state dataset from the working directory
df = pd.read_csv("cognitive_states_dataset.csv")

# Preview: emit each of the first five records as a plain tuple, one per line
preview = df.iloc[:5]
for record in preview.itertuples(index=False, name=None):
    print(record)

(0.3342153372601146, 0.0257424097472162, 0.3410478728306235, 0.8033045735180335, 0.1631877792411288, 0.2220600213968829, 0.4555016952192202, 0.0308246186993702, 0.8792043375853826, -0.7365093043181234, 2.0037800901970666, 1.0714693099486168, 0.3419344935747266, 4, 14, 'Stressed')
(15.378328914148671, 0.2559518055153246, 0.1514323143699633, 0.4774161894803537, 0.6018067339454473, 0.5817366474211867, 0.4014617790890432, 0.3380844093662614, 0.1268240225504188, -1.4069272629788503, 2.2701619899099432, 0.023411377326365, 0.2088309354945899, 32, 42, 'Fatigued')
(0.415786062069183, 0.6010826653932156, 0.7098047132723719, 0.9505920130759384, 0.1471605702056861, 0.2054782123528669, 0.5208155861440822, 0.268557140620641, 0.7766068397963906, -0.7643043686175046, 2.635949772711972, 0.7389912201094611, 0.481273951827605, 108, 118, 'Stressed')
(0.0641954859978143, 0.5379230312678858, 0.4788631183600459, 0.3874543535702994, 0.721909573992304, 0.8106657734835124, 0.4118974783621112, 0.7464192328441049

In [None]:
# Packages installation
!pip install --quiet numpy
!pip install --quiet pandas
!pip install --quiet scikit-learn
!pip install --quiet matplotlib

In [None]:
# Preprocessing
# Import packages
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load the dataset
dataset = "cognitive_states_dataset.csv"
df = pd.read_csv(dataset)

# Separate features and labels
feature_cols = [
    "blink_rate", "avg_EAR", "gaze_x", "gaze_y",
    "brow_raise_freq", "brow_furrow_intensity",
    "mouth_aspect_ratio", "lip_corner_disp", "speaking_prob",
    "jaw_variance", "mouth_open_duration", "pitch", "yaw"
]
X = df[feature_cols].values
y = df["cognitive_state"].values

# Aggregate features into non-overlapping windows of `window_size` consecutive rows
# Assuming dataset already has sequential window_start_time / window_end_time
window_size = 10
num_samples = X.shape[0]

X_windows = []
y_windows = []

# Trailing rows that do not fill a complete window are dropped.
for start in range(0, num_samples - window_size + 1, window_size):
    stop = start + window_size
    # Aggregate: mean of the raw (unscaled) features over the window
    X_windows.append(np.mean(X[start:stop], axis=0))
    # Use majority label in window as target (ties resolve to the first mode)
    y_windows.append(pd.Series(y[start:stop]).mode()[0])

if not X_windows:
    raise ValueError(
        f"Dataset has {num_samples} rows; at least {window_size} are needed to form one window."
    )

X_windows = np.array(X_windows)
y_windows = np.array(y_windows)

# Train-validation-test split (70:15:15), stratified by window label
X_train_raw, X_temp_raw, y_train, y_temp = train_test_split(
    X_windows, y_windows, test_size=0.3, random_state=42, stratify=y_windows
)
X_val_raw, X_test_raw, y_val, y_test = train_test_split(
    X_temp_raw, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# Feature normalization (z-score)
# The scaler is fitted on the training split only, so no statistics from the
# validation or test windows leak into training; the same transform is then
# applied to the held-out splits.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)
X_test = scaler.transform(X_test_raw)

print("Preprocessing complete.")
print(f"Training samples: {X_train.shape[0]}")
print(f"Validation samples: {X_val.shape[0]}")
print(f"Test samples: {X_test.shape[0]}")


Preprocessing complete.
Training samples: 70
Validation samples: 15
Test samples: 15


In [None]:
# Phase 3 – LSTM Model
# 1. Define the LSTM architecture:
#    - Input: sequences of feature vectors per time window (e.g., 10-second rolling window)
#    - Hidden layers: 1–3 LSTM layers, optional dropout for regularization
#    - Output layer: 5 neurons (softmax) for the five cognitive states
#
# 2. Training procedure:
#    - Use preprocessed sequences from Phase 2
#    - Loss function: categorical cross-entropy
#    - Optimizer: Adam or RMSProp
#    - Evaluation metrics: accuracy, F1-score per class
#
# 3. Save the trained LSTM model for real-time inference in Phase 5.

In [None]:
# LSTM Model Implementation
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical

# Prepare labels for classification
# Convert string labels to integers if necessary
from sklearn.preprocessing import LabelEncoder

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and batch
# shuffling are repeatable across "Restart & Run All".
tf.keras.utils.set_random_seed(42)

le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_val_enc = le.transform(y_val)
y_test_enc = le.transform(y_test)

# LabelEncoder sorts its classes, so softmax index i means le.classes_[i].
# Any code that decodes predictions later must use this exact order.
print("Class index order:", list(le.classes_))

# One-hot encoding for categorical cross-entropy
y_train_cat = to_categorical(y_train_enc, num_classes=5)
y_val_cat = to_categorical(y_val_enc, num_classes=5)
y_test_cat = to_categorical(y_test_enc, num_classes=5)

# Reshape input for LSTM: (samples, timesteps=1, features)
X_train_seq = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_val_seq = X_val.reshape((X_val.shape[0], 1, X_val.shape[1]))
X_test_seq = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

# Define LSTM architecture
# The input shape is declared with an explicit Input object rather than the
# `input_shape=` layer argument, which current Keras versions warn about.
model = Sequential([
    tf.keras.Input(shape=(X_train_seq.shape[1], X_train_seq.shape[2])),
    LSTM(64, return_sequences=False),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(5, activation='softmax')  # 5 cognitive states
])

# Compile the model
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

# Train the model
history = model.fit(
    X_train_seq, y_train_cat,
    validation_data=(X_val_seq, y_val_cat),
    epochs=50,
    batch_size=16,
    verbose=1
)

# Evaluate on test set
test_loss, test_acc = model.evaluate(X_test_seq, y_test_cat, verbose=0)
print(f"Test Accuracy: {test_acc*100:.2f}%")

# Save the trained model for Phase 5 (Phase 5 loads this exact filename)
model.save("lstm_cognitive_state_model.h5")
print("LSTM model saved as lstm_cognitive_state_model.h5")


Epoch 1/50


  super().__init__(**kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 110ms/step - accuracy: 0.1730 - loss: 1.6131 - val_accuracy: 0.3333 - val_loss: 1.6026
Epoch 2/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.2546 - loss: 1.6028 - val_accuracy: 0.5333 - val_loss: 1.5972
Epoch 3/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.2654 - loss: 1.5990 - val_accuracy: 0.4667 - val_loss: 1.5919
Epoch 4/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.3460 - loss: 1.5928 - val_accuracy: 0.5333 - val_loss: 1.5865
Epoch 5/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.4237 - loss: 1.5857 - val_accuracy: 0.6000 - val_loss: 1.5802
Epoch 6/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.4961 - loss: 1.5767 - val_accuracy: 0.6000 - val_loss: 1.5737
Epoch 7/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37



Test Accuracy: 80.00%
LSTM model saved as lstm_cognitive_state_model.h5


In [None]:
# Phase 4 – Calibration
# 1. Install any packages needed for calibration (e.g., visualization or interactive widgets).
#
# 2. Calibration steps:
#    - Prompt the user for permission to access the webcam.
#    - Capture a few still frames (5 by default, one per click of the 'Capture' button) under relaxed baseline conditions.
#    - Compute baseline metrics for all regions:
#        * Eyes: mean EAR, blink rate, PERCLOS, IBI
#        * Eyebrows: average furrow intensity, raise frequency
#        * Lips: MAR, lip corner displacement, speaking probability
#        * Jaw: jaw variance, mouth open duration
#        * Head Pose: in-plane head tilt (roll); pitch and yaw are not computed yet
#    - Store these baseline values to define dynamic thresholds for feature normalization and anomaly detection.
#    - Provide live webcam feedback to ensure proper calibration (user can adjust position/lighting).

In [None]:
# Imports
from IPython.display import display
from google.colab.output import eval_js
from base64 import b64decode
import cv2
import numpy as np
import mediapipe as mp

# Helper functions
def compute_EAR(landmarks, eye_indices, frame_w, frame_h):
    """Return the eye aspect ratio (EAR) of one eye, measured in pixel space.

    Args:
        landmarks: Indexable collection whose items expose normalized ``x``
            and ``y`` attributes.
        eye_indices: At least six landmark indices. Points 1/5 and 2/4 form
            the two distances in the numerator, points 0/3 the distance in
            the denominator.
        frame_w: Frame width in pixels, used to scale ``x``.
        frame_h: Frame height in pixels, used to scale ``y``.

    Returns:
        ``(d(p1, p5) + d(p2, p4)) / (2 * d(p0, p3))``.
    """
    frame_size = np.array([frame_w, frame_h])
    eye_px = np.array([[landmarks[idx].x, landmarks[idx].y] for idx in eye_indices]) * frame_size

    first_gap = np.linalg.norm(eye_px[1] - eye_px[5])
    second_gap = np.linalg.norm(eye_px[2] - eye_px[4])
    eye_span = np.linalg.norm(eye_px[0] - eye_px[3])

    return (first_gap + second_gap) / (2.0 * eye_span)

def _landmark_centroid(landmarks, indices, w, h):
    """Return the mean pixel position (x, y) of the landmarks at ``indices``."""
    pixels = np.array([(landmarks[i].x * w, landmarks[i].y * h) for i in indices])
    return np.mean(pixels, axis=0)


def _centroid_distance(landmarks, first_idx, second_idx, w, h):
    """Return the pixel distance between the centroids of two landmark groups."""
    return np.linalg.norm(
        _landmark_centroid(landmarks, first_idx, w, h)
        - _landmark_centroid(landmarks, second_idx, w, h)
    )


def compute_brow_raise(landmarks, brow_idx, eye_idx, w, h):
    """Return the pixel distance between the brow centroid and the eye centroid.

    Args:
        landmarks: Indexable collection whose items expose normalized ``x``/``y``.
        brow_idx: Landmark indices of the eyebrow.
        eye_idx: Landmark indices of the eye on the same side.
        w, h: Frame width and height in pixels.
    """
    return _centroid_distance(landmarks, brow_idx, eye_idx, w, h)


def compute_lip_distance(landmarks, upper_idx, lower_idx, w, h):
    """Return the pixel distance between the upper-lip and lower-lip centroids.

    Args:
        landmarks: Indexable collection whose items expose normalized ``x``/``y``.
        upper_idx: Landmark indices of the upper lip.
        lower_idx: Landmark indices of the lower lip.
        w, h: Frame width and height in pixels.
    """
    return _centroid_distance(landmarks, upper_idx, lower_idx, w, h)


def compute_jaw_openness(landmarks, jaw_top_idx, jaw_bottom_idx, w, h):
    """Return the pixel distance between the jaw-top and jaw-bottom centroids.

    Args:
        landmarks: Indexable collection whose items expose normalized ``x``/``y``.
        jaw_top_idx: Landmark indices of the upper reference group.
        jaw_bottom_idx: Landmark indices of the lower reference group.
        w, h: Frame width and height in pixels.
    """
    return _centroid_distance(landmarks, jaw_top_idx, jaw_bottom_idx, w, h)

def compute_head_pose(landmarks, w, h):
    """Return the in-plane head tilt (roll) in degrees.

    The angle is that of the line from landmark 234 to landmark 454 (the
    "left" and "right" reference points) relative to the image x-axis.
    Pitch and yaw are not estimated, and landmark depth (``z``) is not used.

    Args:
        landmarks: Indexable collection whose items expose normalized ``x``/``y``.
        w, h: Frame width and height in pixels.

    Returns:
        ``degrees(atan2(dy, dx))`` of the left-to-right vector in pixel space.
    """
    left_x, left_y = landmarks[234].x * w, landmarks[234].y * h
    right_x, right_y = landmarks[454].x * w, landmarks[454].y * h
    return np.degrees(np.arctan2(right_y - left_y, right_x - left_x))

# JS function to take a photo in Colab
def take_photo(filename='photo.jpg', quality=0.8):
    """Capture one webcam frame through the browser and save it as a JPEG.

    The embedded script adds a 'Capture' button and a live video element to
    the page, waits until the button is clicked, draws the current video
    frame onto a canvas, stops the first video track, removes the elements
    it created, and returns the canvas as a JPEG data URL.

    Args:
        filename: Path the decoded JPEG bytes are written to.
        quality: Value interpolated into the script and passed as the quality
            argument of ``canvas.toDataURL('image/jpeg', quality)``.

    Returns:
        ``filename``, unchanged.
    """
    # Doubled braces are literal JS braces inside the f-string; only
    # {quality} is substituted by Python.
    js_code = f"""
    async function takePhoto(quality) {{
        const div = document.createElement('div');
        const capture = document.createElement('button');
        capture.textContent = 'Capture';
        div.appendChild(capture);
        document.body.appendChild(div);

        const video = document.createElement('video');
        video.style.display = 'block';
        const stream = await navigator.mediaDevices.getUserMedia({{video: true}});
        document.body.appendChild(video);
        video.srcObject = stream;
        await video.play();

        const canvas = document.createElement('canvas');
        canvas.width = video.videoWidth;
        canvas.height = video.videoHeight;
        const context = canvas.getContext('2d');

        await new Promise((resolve) => capture.onclick = resolve);
        context.drawImage(video, 0, 0, canvas.width, canvas.height);

        stream.getVideoTracks()[0].stop();
        video.remove();
        capture.remove();
        div.remove();
        return canvas.toDataURL('image/jpeg', quality);
    }}
    takePhoto({quality})
    """
    # Evaluate the script in the browser; the result is the data URL string.
    data = eval_js(js_code)
    # Keep only the base64 payload that follows the comma in the data URL.
    binary = b64decode(data.split(',')[1])
    with open(filename, 'wb') as f:
        f.write(binary)
    return filename

# Baseline Calibration
# Ask for camera permission
permission = input("This program will access your webcam to collect baseline facial metrics. Allow? (y/n): ")
if permission.strip().lower() != 'y':
    print("Permission denied. Exiting.")
    # End this cell without touching the kernel. The interactive exit() helper
    # is meant for terminal sessions; inside a notebook it can shut down the
    # runtime (losing the trained model) instead of just stopping the cell.
    raise SystemExit

# Capture frames
num_frames = 5
frames = []
print("Capture each frame by clicking the 'Capture' button in the video window.")
for i in range(num_frames):
    fname = f"frame_{i}.jpg"
    take_photo(fname)
    img = cv2.imread(fname)
    if img is None:
        # cv2.imread signals an unreadable file by returning None, not by raising.
        print(f"Frame {i+1}/{num_frames} could not be read; skipping.")
        continue
    frames.append(img)
    print(f"Captured frame {i+1}/{num_frames}")

# Landmark groups (indices into the face-mesh landmark list)
RIGHT_EYE = [33, 160, 158, 133, 153, 144]
LEFT_EYE  = [362, 385, 387, 263, 373, 380]
BROW_LEFT = [70, 63, 105]
BROW_RIGHT = [336, 296, 334]
UPPER_LIP = [13, 14, 312, 82]
LOWER_LIP = [17, 18, 402, 86]
JAW_TOP = [0, 13, 14]
JAW_BOTTOM = [152, 178, 150]

# Process frames and compute metrics
metrics = {"EAR": [], "brow": [], "lip": [], "jaw": [], "head_pose": []}

with mp.solutions.face_mesh.FaceMesh(
    max_num_faces=1,
    refine_landmarks=False,  # Colab-safe
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
) as face_mesh:

    for frame in frames:
        h, w = frame.shape[:2]
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = face_mesh.process(rgb_frame)
        if not results.multi_face_landmarks:
            # No face found in this frame; it contributes nothing to the baseline.
            continue
        lm = results.multi_face_landmarks[0].landmark

        EAR = (compute_EAR(lm, RIGHT_EYE, w, h) + compute_EAR(lm, LEFT_EYE, w, h)) / 2
        brow = (compute_brow_raise(lm, BROW_LEFT, LEFT_EYE, w, h) +
                compute_brow_raise(lm, BROW_RIGHT, RIGHT_EYE, w, h)) / 2
        lip = compute_lip_distance(lm, UPPER_LIP, LOWER_LIP, w, h)
        jaw = compute_jaw_openness(lm, JAW_TOP, JAW_BOTTOM, w, h)
        head_pose = compute_head_pose(lm, w, h)

        metrics["EAR"].append(EAR)
        metrics["brow"].append(brow)
        metrics["lip"].append(lip)
        metrics["jaw"].append(jaw)
        metrics["head_pose"].append(head_pose)

num_usable = len(metrics["EAR"])
if num_usable == 0:
    print("Warning: no face was detected in any captured frame; baseline values are NaN.")
elif num_usable == 1:
    print("Warning: only one usable frame; every std_* value will be 0.")

# Compute baseline averages & thresholds
baseline = {}
for k, v in metrics.items():
    if len(v) > 0:
        baseline[f"avg_{k}"] = np.mean(v)
        baseline[f"std_{k}"] = np.std(v)
    else:
        baseline[f"avg_{k}"] = np.nan
        baseline[f"std_{k}"] = np.nan

# EAR closed-eye threshold: 70% of the relaxed, open-eye average
if "avg_EAR" in baseline and not np.isnan(baseline["avg_EAR"]):
    baseline["EAR_closed_threshold"] = baseline["avg_EAR"] * 0.7

# ------------------------------
# Display baseline results
# ------------------------------
print("\nBaseline calibration complete:")
for k, v in baseline.items():
    print(f"{k}: {v:.4f}")


This program will access your webcam to collect baseline facial metrics. Allow? (y/n): y
Capture each frame by clicking the 'Capture' button in the video window.
Captured frame 1/5
Captured frame 2/5
Captured frame 3/5
Captured frame 4/5
Captured frame 5/5

Baseline calibration complete:
avg_EAR: 0.2815
std_EAR: 0.0147
avg_brow: 77.3738
std_brow: 0.6154
avg_lip: 9.2975
std_lip: 0.4924
avg_jaw: 36.8231
std_jaw: 1.2997
avg_head_pose: -2.0013
std_head_pose: 0.3761
EAR_closed_threshold: 0.1971


In [None]:
# Phase 5 – Real-Time Feature Extraction & LSTM Inference

# 1. Load the trained LSTM model from Phase 3.
# 2. Define helper functions:
#    - Facial landmark detection (MediaPipe Face Mesh)
#    - Feature computation per frame (all regions)
#    - Sequence aggregation into rolling buffer (e.g., last 10 seconds)
# 3. JS snippet (if using Colab) to create a video element for live capture once.
# 4. Apply calibration thresholds from Phase 4 to normalize live features.
# 5. Start live webcam capture:
#    - Extract features per frame
#    - Update rolling buffer
#    - Run LSTM inference on aggregated window
#    - Display predicted cognitive state in real time
# 6. Stop camera stream cleanly when the session ends in Colab.

In [None]:
!pip install --quiet tensorflow

In [None]:
# Phase 5 – Real-Time Feature Extraction & LSTM Inference (Colab)

import cv2
import numpy as np
import mediapipe as mp
from collections import deque
import time
from IPython.display import display, Javascript
from google.colab.output import eval_js
from base64 import b64decode
from tensorflow.keras.models import load_model

# Load trained LSTM model
# The file name matches the one the Phase 3 cell passes to model.save().
lstm_model = load_model("lstm_cognitive_state_model.h5")
# Softmax index i corresponds to class i of the LabelEncoder fitted in Phase 3.
# LabelEncoder stores its classes in sorted order, so the mapping is
# alphabetical, not the order in which the states are listed in Phase 1.
classes = ["alert", "distracted", "fatigued", "neutral", "stressed"]

# Baseline calibration (from Phase 4)
# Fallback values recorded from an earlier calibration session. They are used
# only when the kernel holds no usable `baseline` dict from the Phase 4 cell;
# several of them are pixel distances, so they depend on camera and seating.
DEFAULT_BASELINE = {
    "avg_EAR": 0.2864, "std_EAR": 0.0082,
    "avg_brow": 101.9758, "std_brow": 1.2428,
    "avg_lip": 13.1024, "std_lip": 0.4144,
    "avg_jaw": 45.8287, "std_jaw": 0.5728,
    "avg_head_pose": -3.3198, "std_head_pose": 0.0963,
    "EAR_closed_threshold": 0.2005,
}


def _usable_baseline(candidate):
    """Return True when `candidate` has a finite number for every baseline key
    and a strictly positive value for every std_* key."""
    if not isinstance(candidate, dict):
        return False
    for key in DEFAULT_BASELINE:
        value = candidate.get(key)
        if not isinstance(value, (int, float, np.integer, np.floating)):
            return False
        if not np.isfinite(value):
            return False
        if key.startswith("std_") and value <= 0:
            return False
    return True


# Prefer the calibration measured in this session over the recorded fallback.
_session_baseline = globals().get("baseline")
if _usable_baseline(_session_baseline):
    baseline = {key: float(_session_baseline[key]) for key in DEFAULT_BASELINE}
else:
    baseline = dict(DEFAULT_BASELINE)

EAR_closed_threshold = baseline["EAR_closed_threshold"]  # optional for blink detection

# Landmark indices
RIGHT_EYE = [33, 160, 158, 133, 153, 144]
LEFT_EYE  = [362, 385, 387, 263, 373, 380]
BROW_LEFT = [70, 63, 105]
BROW_RIGHT = [336, 296, 334]
UPPER_LIP = [13, 14, 312, 82]
LOWER_LIP = [17, 18, 402, 86]
JAW_TOP = [0, 13, 14]
JAW_BOTTOM = [152, 178, 150]

# Helper functions
def compute_EAR(landmarks, eye_indices, w, h):
    """Eye aspect ratio of one eye, computed from pixel-space landmark positions.

    ``eye_indices`` must hold at least six landmark indices: pairs (1, 5) and
    (2, 4) feed the numerator, pair (0, 3) the denominator. ``w`` and ``h``
    scale the normalized ``x``/``y`` landmark attributes to pixels.
    """
    def to_pixels(idx):
        point = landmarks[idx]
        return np.array([point.x * w, point.y * h])

    px = [to_pixels(idx) for idx in eye_indices]
    numerator = np.linalg.norm(px[1] - px[5]) + np.linalg.norm(px[2] - px[4])
    denominator = 2.0 * np.linalg.norm(px[0] - px[3])
    return numerator / denominator

def _landmark_centroid(landmarks, indices, w, h):
    """Return the mean pixel position (x, y) of the landmarks at ``indices``."""
    pixels = np.array([(landmarks[i].x*w, landmarks[i].y*h) for i in indices])
    return np.mean(pixels, axis=0)

def _centroid_distance(landmarks, first_idx, second_idx, w, h):
    """Return the pixel distance between the centroids of two landmark groups."""
    return np.linalg.norm(
        _landmark_centroid(landmarks, first_idx, w, h)
        - _landmark_centroid(landmarks, second_idx, w, h)
    )

def compute_brow_raise(landmarks, brow_idx, eye_idx, w, h):
    """Return the pixel distance between the brow centroid and the eye centroid."""
    return _centroid_distance(landmarks, brow_idx, eye_idx, w, h)

def compute_lip_distance(landmarks, upper_idx, lower_idx, w, h):
    """Return the pixel distance between the upper-lip and lower-lip centroids."""
    return _centroid_distance(landmarks, upper_idx, lower_idx, w, h)

def compute_jaw_openness(landmarks, jaw_top_idx, jaw_bottom_idx, w, h):
    """Return the pixel distance between the jaw-top and jaw-bottom centroids."""
    return _centroid_distance(landmarks, jaw_top_idx, jaw_bottom_idx, w, h)

def compute_head_pose(landmarks, w, h):
    """Return the in-plane head tilt (roll) in degrees.

    The angle is that of the line from landmark 234 to landmark 454 relative
    to the image x-axis, in pixel space. Pitch and yaw are not estimated and
    landmark depth (``z``) is not used.
    """
    dx = (landmarks[454].x - landmarks[234].x) * w
    dy = (landmarks[454].y - landmarks[234].y) * h
    return np.degrees(np.arctan2(dy, dx))

def normalize_feature(value, mean, std):
    """Return the z-score of ``value`` with respect to a baseline mean and std.

    A baseline built from a single frame has ``std == 0`` and an empty
    baseline has ``std == NaN``. In both cases no spread is known, so the
    value is only centred (scale treated as 1) instead of dividing by zero
    or propagating NaN from the scale.

    Args:
        value: Live measurement.
        mean: Baseline mean of the same measurement.
        std: Baseline standard deviation of the same measurement.

    Returns:
        ``(value - mean) / std``, or ``value - mean`` when ``std`` is zero or
        not finite.
    """
    if std == 0 or not np.isfinite(std):
        return value - mean
    return (value - mean) / std

# Colab webcam capture JS
# Adds a single 640x480 autoplay <video id="cam"> element to the page and
# attaches the webcam stream to it. The querySelector guard makes re-running
# this cell a no-op while the element still exists. The stream is attached
# asynchronously, so the element may not have frames yet when this call returns.
display(Javascript("""
if (!document.querySelector('video#cam')) {
    const video = document.createElement('video');
    video.id = 'cam';
    video.width = 640;
    video.height = 480;
    video.autoplay = true;
    document.body.appendChild(video);
    navigator.mediaDevices.getUserMedia({video:true}).then(stream => { video.srcObject = stream; });
}
"""))

def grab_frame(max_attempts=5):
    """Grab the current webcam frame from the page's ``video#cam`` element.

    The browser script returns null while the video element is missing or not
    ready; in that case, or when the returned image cannot be decoded, the
    function waits 0.2 s and retries.

    Args:
        max_attempts: Maximum number of capture attempts.

    Returns:
        The decoded frame as returned by ``cv2.imdecode`` (colour image), or
        ``None`` when every attempt failed.
    """
    # The script never changes, so build it once instead of on every attempt.
    js_code = """
        async function captureFrame() {
            const video = document.querySelector('video#cam');
            if (!video || video.readyState < 2) { return null; }
            const canvas = document.createElement('canvas');
            canvas.width = video.videoWidth;
            canvas.height = video.videoHeight;
            canvas.getContext('2d').drawImage(video, 0, 0);
            return canvas.toDataURL('image/jpeg', 0.8);
        }
        captureFrame();
        """
    for _ in range(max_attempts):
        data = eval_js(js_code)
        if data is not None:
            try:
                # Keep only the base64 payload that follows the comma in the data URL.
                binary = b64decode(data.split(',')[1])
                arr = np.frombuffer(binary, dtype=np.uint8)
                frame = cv2.imdecode(arr, cv2.IMREAD_COLOR)
            except Exception:
                # Malformed payload: retry. Catching Exception (not a bare
                # except) lets KeyboardInterrupt reach the caller so the
                # capture loop can still be interrupted.
                frame = None
            if frame is not None:
                return frame
        time.sleep(0.2)
    return None

# Live capture + inference
BUFFER_LENGTH = 30  # approx 10s
NUM_FEATURES = 13   # width of the feature vectors the Phase 3 model was trained on
feature_buffer = deque(maxlen=BUFFER_LENGTH)

face_mesh = mp.solutions.face_mesh.FaceMesh(
    max_num_faces=1,
    refine_landmarks=False,  # same setting as the Phase 4 calibration, so live and baseline landmarks are comparable
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
)

capture_duration = 60  # seconds
prediction_interval = 10  # seconds
start_time = time.time()
next_prediction_time = start_time + prediction_interval
predictions = []

print("Starting live 60-second capture...")

try:
    while time.time() - start_time < capture_duration:
        frame = grab_frame()
        if frame is None:
            # grab_frame already waited between its own attempts; just try again.
            continue

        h, w = frame.shape[:2]
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = face_mesh.process(rgb_frame)

        if results.multi_face_landmarks:
            lm = results.multi_face_landmarks[0].landmark

            # Compute 5 basic features
            EAR_avg = (compute_EAR(lm, RIGHT_EYE, w, h) + compute_EAR(lm, LEFT_EYE, w, h))/2
            brow_avg = (compute_brow_raise(lm, BROW_LEFT, LEFT_EYE, w, h) + compute_brow_raise(lm, BROW_RIGHT, RIGHT_EYE, w, h))/2
            lip_dist = compute_lip_distance(lm, UPPER_LIP, LOWER_LIP, w, h)
            jaw_open = compute_jaw_openness(lm, JAW_TOP, JAW_BOTTOM, w, h)
            head_pose = compute_head_pose(lm, w, h)

            # Normalize features using baseline
            features = [
                normalize_feature(EAR_avg, baseline["avg_EAR"], baseline["std_EAR"]),
                normalize_feature(brow_avg, baseline["avg_brow"], baseline["std_brow"]),
                normalize_feature(lip_dist, baseline["avg_lip"], baseline["std_lip"]),
                normalize_feature(jaw_open, baseline["avg_jaw"], baseline["std_jaw"]),
                normalize_feature(head_pose, baseline["avg_head_pose"], baseline["std_head_pose"])
            ]

            # Placeholder for extra features: pad with zeros up to NUM_FEATURES.
            # NOTE(review): the Phase 2 training columns are blink_rate, avg_EAR,
            # gaze_x, gaze_y, ... in that order, whereas slots 0-4 here hold
            # EAR / brow / lip / jaw / roll z-scores. The columns do not line up
            # with the training data - confirm the mapping before trusting the
            # predictions.
            features += [0]*(NUM_FEATURES - len(features))

            feature_buffer.append(features)

        # Make prediction every prediction_interval, once the buffer is full
        if time.time() >= next_prediction_time and len(feature_buffer) == BUFFER_LENGTH:
            # Phase 2/3 train on ONE timestep per sample: the mean of the
            # features over a window. Aggregate the buffer the same way instead
            # of feeding BUFFER_LENGTH timesteps to a model trained on one.
            window_mean = np.mean(np.array(feature_buffer, dtype=float), axis=0)
            model_input = window_mean.reshape(1, 1, NUM_FEATURES)
            pred_probs = lstm_model.predict(model_input, verbose=0)
            pred_class = classes[int(np.argmax(pred_probs))]
            confidence = float(np.max(pred_probs))
            elapsed = time.time() - start_time
            print(f"[{int(elapsed)}s] Cognitive state: {pred_class} (Confidence: {confidence:.2f})")
            predictions.append((elapsed, pred_class, confidence))
            next_prediction_time += prediction_interval

except KeyboardInterrupt:
    print("Live capture interrupted.")

finally:
    # Always release the landmark model and the webcam, even after an interrupt.
    face_mesh.close()
    print("Capture finished. Predictions:")
    for t, state, conf in predictions:
        print(f"Time {int(t)}s: {state} ({conf:.2f})")

    # Stop webcam
    display(Javascript("""
    (function() {
        const video = document.querySelector('video#cam');
        if (video) {
            let stream = video.srcObject;
            if (stream) {
                let tracks = stream.getTracks();
                tracks.forEach(track => track.stop());
            }
            video.remove();
        }
    })();
    """))



<IPython.core.display.Javascript object>

Starting live 60-second capture...
[14s] Cognitive state: fatigued (Confidence: 0.60)
[20s] Cognitive state: stressed (Confidence: 0.88)
[30s] Cognitive state: distracted (Confidence: 0.68)
[40s] Cognitive state: fatigued (Confidence: 0.59)
[50s] Cognitive state: stressed (Confidence: 1.00)
[60s] Cognitive state: stressed (Confidence: 1.00)
Capture finished. Predictions:
Time 14s: fatigued (0.60)
Time 20s: stressed (0.88)
Time 30s: distracted (0.68)
Time 40s: fatigued (0.59)
Time 50s: stressed (1.00)
Time 60s: stressed (1.00)


<IPython.core.display.Javascript object>