In [1]:
# V10 changes - 
    # section 3 - updated face_filename
        # updated current_face_encodings to account for stale images

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoImageProcessor, AutoModelForImageClassification
import cv2
import face_recognition
from PIL import Image
import os
import glob
from datetime import datetime
from tqdm import tqdm
import pandas as pd
import shutil

In [3]:
# ==============================================================================
# 1. CONFIGURATION
# ==============================================================================
# Root folder under which every run creates its own "V<n>_<timestamp>" output directory.
ANALYSIS_OUTPUT_ROOT = "/Users/natalyagrokh/AI/ml_expressions/img_expressions/data_flywheel"
# Directory of the emotion-classification model; the image processor is loaded from the same place.
MODEL_PATH = "/Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/V29_20250710_082807"
# Checkpoint of the gatekeeper model that screens out face crops it labels as "Non-Emotional".
GATEKEEPER_MODEL_PATH = "/Users/natalyagrokh/AI/ml_expressions/img_expressions/data_flywheel/gatekeeper_models/gatekeeper_V3_20250729_065459/checkpoint-668"
# NOTE(review): these are absolute, machine-specific paths; the notebook will not run elsewhere without editing them.

# --- Filtering Thresholds ---
STATIC_FRAME_THRESHOLD = 60 # Number of consecutive processed frames with an identical face box before it is treated as a static object (e.g., 2 seconds at 30fps)
STABILITY_WINDOW = 5 # Rolling window size (in logged predictions per person) for the stability filter
STABILITY_THRESHOLD = 3 # Minimum number of times the current label must appear inside the rolling window for the row to be kept as stable

In [4]:
# ==============================================================================
# 2. UTILITY FUNCTIONS
# ==============================================================================

# Works out the label ("V<n>") for the next run folder inside base_dir.
def get_next_version(base_dir):
    """Return "V<n+1>" where n is the highest existing run-folder version.

    Only sub-directories of ``base_dir`` named ``V<digits>_<anything>`` are
    counted; plain files and folders without a numeric version are ignored.
    When nothing qualifies the result is "V1".
    """
    highest_seen = 0
    for candidate in glob.glob(os.path.join(base_dir, "V*_*")):
        # Files that happen to match the pattern are not run folders.
        if not os.path.isdir(candidate):
            continue
        folder_name = os.path.basename(candidate)
        if not (folder_name.startswith("V") and "_" in folder_name):
            continue
        # Text between the leading "V" and the first underscore is the version number.
        version_text = folder_name[1:].partition("_")[0]
        if version_text.isdigit():
            highest_seen = max(highest_seen, int(version_text))
    return f"V{highest_seen + 1}"

# Classifies one face crop and returns the winning label plus the full
# probability breakdown as a flat dictionary.
def get_emotion_predictions(face_image, emotion_model, processor, device):
    """Run the emotion model on a single face image.

    Args:
        face_image: Image accepted by ``processor`` (a PIL image in this notebook).
        emotion_model: Classification model exposing ``config.id2label`` and
            returning an object with ``.logits``.
        processor: Callable that turns the image into model inputs.
        device: Device the inputs are moved to before inference.

    Returns:
        dict with, in this order: ``predicted_label``, ``confidence``,
        ``entropy`` (natural-log entropy of the distribution), then one
        ``prob_<label>`` entry per class.
    """
    id2label = emotion_model.config.id2label

    # Preprocess, then run the forward pass without tracking gradients.
    model_inputs = processor(images=face_image, return_tensors="pt").to(device)
    with torch.no_grad():
        model_output = emotion_model(**model_inputs)

    # Softmax over the class axis, then drop the batch axis.
    probabilities = F.softmax(model_output.logits, dim=1).squeeze()

    best = torch.max(probabilities, dim=0)
    results = {
        "predicted_label": id2label[best.indices.item()],
        "confidence": best.values.item(),
        # The small epsilon keeps log() finite when a probability is exactly zero.
        "entropy": -torch.sum(probabilities * torch.log(probabilities + 1e-9)).item(),
    }

    # Append every class probability so the log supports detailed analysis later.
    results.update(
        (f"prob_{id2label[class_idx]}", class_prob)
        for class_idx, class_prob in enumerate(probabilities.tolist())
    )
    return results

# Drops log rows whose predicted emotion is not persistent for that person.
def filter_by_emotion_stability(df, window, threshold):
    """Keep only rows whose label recurs often enough in a trailing window.

    Rows are ordered by ``person_id`` then ``timestamp`` and re-indexed from 0.
    For each person, a row is kept when its ``predicted_label`` appears at
    least ``threshold`` times among that person's last ``window`` rows (the
    row itself included).

    Args:
        df: DataFrame with ``person_id``, ``timestamp`` and ``predicted_label`` columns.
        window: Number of most recent rows per person to inspect.
        threshold: Minimum occurrences of the current label inside the window.

    Returns:
        The input unchanged when it is empty; otherwise a copy of the kept
        rows, carrying the index assigned after sorting.
    """
    if df.empty:
        return df

    print(f"\n--- Applying Emotion Stability Filter (Window={window}, Threshold={threshold}) ---")

    # Chronological order per person is required for the trailing window to make sense.
    ordered = df.sort_values(by=['person_id', 'timestamp']).reset_index(drop=True)

    kept_rows = []
    for _, person_rows in ordered.groupby('person_id'):
        person_labels = person_rows['predicted_label']
        for pos, row_index in enumerate(person_rows.index):
            # Trailing window ends at the current row and is shorter near the start.
            window_start = max(0, pos - window + 1)
            recent_labels = person_labels.iloc[window_start:pos + 1]
            occurrences = (recent_labels == person_labels.iloc[pos]).sum()
            if occurrences >= threshold:
                kept_rows.append(row_index)

    stable_df = ordered.loc[kept_rows].copy()

    print(f"-> Filtered {len(ordered) - len(stable_df)} unstable/transitional frames.")
    print(f"‚úÖ {len(stable_df)} stable emotional events remain.")
    return stable_df

In [5]:
# ==============================================================================
# 3. CORE PROCESSING FUNCTION
# ==============================================================================

# Crops one detected face out of an RGB frame with extra margin around the detector box.
def _crop_padded_face(rgb_frame, face_location, frame_height, frame_width):
    """Return a PIL image of the face box enlarged by the pipeline's padding.

    The box (top, right, bottom, left) is extended upward by 40% of its
    height, downward by a tenth of that amount, and sideways by 15% of its
    width on each side; all edges are clamped to the frame bounds.
    """
    top, right, bottom, left = face_location
    v_pad = int((bottom - top) * 0.40)
    h_pad = int((right - left) * 0.15)
    top_pad = max(0, top - v_pad)
    bottom_pad = min(frame_height, bottom + int(v_pad * 0.1))
    left_pad = max(0, left - h_pad)
    right_pad = min(frame_width, right + h_pad)
    return Image.fromarray(rgb_frame[top_pad:bottom_pad, left_pad:right_pad])


# Processes video with all filters, ensuring file paths and person IDs are correctly logged.
def analyze_video_with_filters(video_path, save_dir, emotion_model, gatekeeper_model, processor, device, static_threshold, process_every_n_frames=1, face_match_tolerance=0.6):
    """Detect, filter, identify and classify faces in a video, frame by frame.

    For every processed frame each detected face goes through, in order:
      1. a static-object filter (a face box seen at the exact same coordinates
         in more than ``static_threshold`` consecutive processed frames is
         ignored from then on),
      2. the gatekeeper model (crops whose predicted label contains
         "Non-Emotional" are skipped),
      3. identity assignment against the faces seen so far in this video,
      4. emotion classification via ``get_emotion_predictions``; the crop is
         saved as a PNG under ``<save_dir>/face_crops``.

    Args:
        video_path: Path of the video file to analyse.
        save_dir: Run directory; a ``face_crops`` sub-folder is created in it.
        emotion_model: Model passed to ``get_emotion_predictions``.
        gatekeeper_model: Classification model exposing ``config.id2label``.
        processor: Image processor used for both models.
        device: Device the model inputs are moved to.
        static_threshold: Consecutive-frame count above which a box is static.
        process_every_n_frames: Analyse only every n-th frame (must be >= 1).
        face_match_tolerance: Maximum face-encoding distance for two faces to
            count as the same person (0.6 is the face_recognition default).

    Returns:
        A list of dicts, one per logged face, with ``timestamp``,
        ``frame_number``, ``person_id``, ``face_crop_path`` and the keys
        returned by ``get_emotion_predictions``. Empty when the video is
        missing or its first frame cannot be read.

    Raises:
        ValueError: If ``process_every_n_frames`` is smaller than 1.
    """
    # A step of 0 would divide by zero below; a negative step breaks the
    # consecutive-frame test of the static filter.
    if process_every_n_frames < 1:
        raise ValueError(f"process_every_n_frames must be >= 1, got {process_every_n_frames}")

    if not os.path.exists(video_path):
        print(f"‚ùå Error: Video file not found at {video_path}")
        return []

    # Data structures for tracking
    static_object_tracker, ignored_locations = {}, set()
    known_face_encodings, known_face_ids = [], []
    next_person_id = 1
    all_results_log = []

    video_capture = cv2.VideoCapture(video_path)
    try:
        total_frames = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
        reported_fps = video_capture.get(cv2.CAP_PROP_FPS)
        fps = reported_fps if reported_fps > 0 else 30  # fall back when the container reports no rate

        # Read one frame to learn the frame size (used to clamp the padded crops).
        ret, frame = video_capture.read()
        if not ret:
            print("‚ùå Error: Could not read the first frame of the video.")
            return []

        frame_height, frame_width, _ = frame.shape
        video_capture.set(cv2.CAP_PROP_POS_FRAMES, 0) # Reset video to the beginning
        print(f"‚úÖ Opened video: {os.path.basename(video_path)} ({total_frames} frames at {fps:.2f} fps)")

        # Create the directory for all face crops at the start
        face_crop_dir = os.path.join(save_dir, "face_crops")
        os.makedirs(face_crop_dir, exist_ok=True)

        with tqdm(total=total_frames, desc="Analyzing Video") as pbar:
            for frame_count in range(total_frames):
                ret, frame = video_capture.read()
                if not ret:
                    break

                if frame_count % process_every_n_frames == 0:
                    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    face_locations = face_recognition.face_locations(rgb_frame)
                    current_face_encodings = face_recognition.face_encodings(rgb_frame, face_locations)

                    for loc_key, face_encoding in zip(face_locations, current_face_encodings):
                        # --- Static Object Filter ---
                        if loc_key in ignored_locations:
                            continue
                        tracked = static_object_tracker.get(loc_key)
                        if tracked is not None and frame_count == tracked["last_frame"] + process_every_n_frames:
                            tracked["count"] += 1
                            tracked["last_frame"] = frame_count
                        else:
                            # First sighting, or the run of consecutive sightings was broken.
                            tracked = {"count": 1, "last_frame": frame_count}
                            static_object_tracker[loc_key] = tracked
                        if tracked["count"] > static_threshold:
                            ignored_locations.add(loc_key)
                            continue

                        # Build the crop only for faces that survived the static filter.
                        face_image = _crop_padded_face(rgb_frame, loc_key, frame_height, frame_width)

                        # --- Gatekeeper Filter ---
                        # NOTE(review): the gatekeeper reuses the emotion model's processor;
                        # confirm both models expect the same preprocessing.
                        gatekeeper_inputs = processor(images=face_image, return_tensors="pt").to(device)
                        with torch.no_grad():
                            gatekeeper_logits = gatekeeper_model(**gatekeeper_inputs).logits
                        gatekeeper_pred = gatekeeper_model.config.id2label[gatekeeper_logits.argmax(-1).item()]

                        if "Non-Emotional" in gatekeeper_pred:
                            continue

                        # --- Face Identification ---
                        # Pick the closest known face rather than the first one within
                        # tolerance, so a face near two known people is not mislabelled.
                        distances = face_recognition.face_distance(known_face_encodings, face_encoding)
                        best_match = int(distances.argmin()) if len(distances) else -1
                        if best_match >= 0 and distances[best_match] <= face_match_tolerance:
                            person_id = known_face_ids[best_match]
                        else:
                            person_id = f"Person_{next_person_id}"
                            known_face_encodings.append(face_encoding)
                            known_face_ids.append(person_id)
                            next_person_id += 1

                        # --- Emotion Classification and Logging ---
                        emotion_results = get_emotion_predictions(face_image, emotion_model, processor, device)
                        # NOTE(review): two faces in one frame that resolve to the same
                        # person_id share this path, so the later crop overwrites the earlier.
                        face_filename = os.path.join(face_crop_dir, f"frame_{frame_count}_{person_id}.png")
                        face_image.save(face_filename)

                        all_results_log.append({
                            "timestamp": frame_count / fps,
                            "frame_number": frame_count,
                            "person_id": person_id,
                            "face_crop_path": face_filename,
                            **emotion_results
                        })

                    # Clean up tracker for objects that have disappeared
                    current_frame_locations = set(face_locations)
                    for stale_key in [k for k in static_object_tracker if k not in current_frame_locations]:
                        del static_object_tracker[stale_key]

                pbar.update(1)
    finally:
        # Always free the decoder handle, including on early return or error.
        video_capture.release()

    print(f"\n--- Video Processing Summary ---")
    print(f"‚úÖ Discovered {len(known_face_ids)} unique person(s).")
    print(f"‚ö†Ô∏è Detected and ignored {len(ignored_locations)} static object(s).")
    print(f"‚úÖ Logged {len(all_results_log)} relevant emotional events.")

    return all_results_log

In [6]:
# ==============================================================================
# 4. MAIN EXECUTION BLOCK
# ==============================================================================

# --- Create a fresh, versioned output directory for this run ---
VERSION = get_next_version(ANALYSIS_OUTPUT_ROOT)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
VERSION_TAG = "_".join([VERSION, timestamp])
SAVE_DIR = os.path.join(ANALYSIS_OUTPUT_ROOT, VERSION_TAG)
os.makedirs(SAVE_DIR, exist_ok=True)
print(f"üìÅ Created analysis directory: {SAVE_DIR}")

# --- Load the processor and both models ---
# Use Apple's MPS backend when available, otherwise run on the CPU.
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
processor = AutoImageProcessor.from_pretrained(MODEL_PATH)
emotion_model = AutoModelForImageClassification.from_pretrained(MODEL_PATH)
emotion_model = emotion_model.to(device).eval()
gatekeeper_model = AutoModelForImageClassification.from_pretrained(GATEKEEPER_MODEL_PATH)
gatekeeper_model = gatekeeper_model.to(device).eval()
print(f"‚úÖ All models and processor loaded onto {device}.")

# --- Run the filtered analysis over the sample video ---
video_path = "/Users/natalyagrokh/AI/ml_expressions/img_expressions/data_flywheel/sample_vids/trevor_noah_interview_2.mp4"

analysis_log = analyze_video_with_filters(
    video_path=video_path,
    save_dir=SAVE_DIR,
    emotion_model=emotion_model,
    gatekeeper_model=gatekeeper_model,
    processor=processor,
    device=device,
    static_threshold=STATIC_FRAME_THRESHOLD
)

# --- Persist the raw log, then the stability-filtered log ---
if not analysis_log:
    print("\n‚ö†Ô∏è No emotional events were detected after all filters.")
else:
    log_df = pd.DataFrame(analysis_log)
    # The unfiltered log is written first so it survives even if filtering fails.
    log_df.to_csv(os.path.join(SAVE_DIR, "emotion_log_before_stability_filter.csv"), index=False)

    stable_log_df = filter_by_emotion_stability(
        log_df,
        window=STABILITY_WINDOW,
        threshold=STABILITY_THRESHOLD,
    )

    stable_log_df.to_csv(os.path.join(SAVE_DIR, "final_stable_emotion_log.csv"), index=False)
    print(f"‚úÖ Final, stable emotion log saved.")

üìÅ Created analysis directory: /Users/natalyagrokh/AI/ml_expressions/img_expressions/data_flywheel/V10_20250729_114729
‚úÖ All models and processor loaded onto mps.
‚úÖ Opened video: trevor_noah_interview_2.mp4 (7554 frames at 29.97 fps)


Analyzing Video: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 7552/7554 [1:06:41<00:01,  1.89it/s]



--- Video Processing Summary ---
‚úÖ Discovered 18 unique person(s).
‚ö†Ô∏è Detected and ignored 7 static object(s).
‚úÖ Logged 6515 relevant emotional events.

--- Applying Emotion Stability Filter (Window=5, Threshold=3) ---
-> Filtered 1747 unstable/transitional frames.
‚úÖ 4768 stable emotional events remain.
‚úÖ Final, stable emotion log saved.
