In [1]:
# V6 changes - adding improved filtering logic for static images
    # section #1 - removed static and stability vars
    # section #2 - reworked get_emotion_predictions
        # removed filter_by_emotion_stability from the pipeline (its definition is still present in section #2, but nothing calls it)
    # section #3 - updated analyze_video_with_filters (defined in this notebook as analyze_video_with_multi_region_filter)
    # section #4 - updated execution block
        # deleted embedding_model
        # simplified analysis_log

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoImageProcessor, AutoModelForImageClassification
import cv2
import face_recognition
from PIL import Image
import os
import glob
from datetime import datetime
from tqdm import tqdm
import numpy as np
import pandas as pd
import shutil

In [3]:
# ==============================================================================
# 1. CONFIGURATION
# ==============================================================================
# Root folder for analysis output. The execution block creates one
# "V<n>_<timestamp>" sub-folder per run underneath it, and get_next_version()
# scans it to pick the next version number.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
ANALYSIS_OUTPUT_ROOT = "/Users/natalyagrokh/AI/ml_expressions/img_expressions/data_flywheel"
# Directory passed to from_pretrained() for both the classification model and
# its image processor in the execution block.
MODEL_PATH = "/Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/V29_20250710_082807"
# Make sure the output root exists up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(ANALYSIS_OUTPUT_ROOT, exist_ok=True)

In [4]:
# ==============================================================================
# 2. UTILITY FUNCTIONS
# ==============================================================================

# Dynamically determines the next version number.
def get_next_version(base_dir):
    """Return the next free version tag ("V<n>") for run folders in ``base_dir``.

    Only sub-directories named ``V<digits>_<anything>`` are considered. The
    result is one more than the highest number found, or ``"V1"`` when no such
    directory exists.
    """
    highest = 0
    for candidate in glob.glob(os.path.join(base_dir, "V*_*")):
        # Plain files that happen to match the pattern do not count as runs.
        if not os.path.isdir(candidate):
            continue
        folder = os.path.basename(candidate)
        if not folder.startswith("V") or "_" not in folder:
            continue
        # Text between the leading "V" and the first underscore.
        number_text = folder[1:].split("_")[0]
        if number_text.isdigit():
            highest = max(highest, int(number_text))
    return f"V{highest + 1}"


# Runs the emotion recognition model on a single face image and returns a
# structured dictionary of probabilities.
def get_emotion_predictions(face_image, model, processor, device):
    """Classify one face crop and return its top emotion.

    Args:
        face_image: Image-like object exposing ``.size`` as (width, height).
        model: Classifier whose output has ``.logits`` and whose
            ``config.id2label`` maps class indices to label names.
        processor: Callable turning the image into model inputs
            (invoked with ``images=...`` and ``return_tensors="pt"``).
        device: Device the prepared inputs are moved to.

    Returns:
        ``{"predicted_label": str, "confidence": float}`` for the most
        probable class, or ``None`` when either side of the crop is under
        10 pixels.
    """
    width, height = face_image.size[0], face_image.size[1]
    if min(width, height) < 10:
        # Crops this small are treated as invalid and skipped.
        return None

    # Preprocess, then move the resulting tensors to the target device.
    batch = processor(images=face_image, return_tensors="pt").to(device)

    # Forward pass without gradient tracking.
    with torch.no_grad():
        raw_scores = model(**batch).logits

    # Softmax over the class dimension; squeeze removes the batch axis.
    class_probs = F.softmax(raw_scores, dim=1).squeeze()

    # torch.max along a dim yields a (values, indices) pair.
    best = torch.max(class_probs, dim=0)
    label = model.config.id2label[best.indices.item()]

    return {"predicted_label": label, "confidence": best.values.item()}
    

# Post-processes the log to filter for stable emotional states.
def filter_by_emotion_stability(df, window, threshold):
    """Keep only rows whose label is repeated often enough in a trailing window.

    Rows are ordered by ``person_id`` then ``timestamp_seconds`` and re-indexed.
    For each person, a row is kept when its ``predicted_label`` occurs at least
    ``threshold`` times among that person's last ``window`` rows (the row itself
    included).

    Args:
        df: DataFrame with ``person_id``, ``timestamp_seconds`` and
            ``predicted_label`` columns.
        window: Number of most recent rows (per person) to inspect.
        threshold: Minimum occurrences of the current label inside the window.

    Returns:
        A copy of the kept rows, indexed by their position in the sorted frame.
        An empty input is returned unchanged.
    """
    if df.empty:
        return df

    print(f"\n--- Applying Emotion Stability Filter (Window={window}, Threshold={threshold}) ---")

    # Work on a sorted, freshly indexed copy so per-person rows are in time order.
    ordered = df.sort_values(by=['person_id', 'timestamp_seconds']).reset_index(drop=True)

    def _stable_rows(person_rows):
        # Yield the index of every row whose label passes the window test.
        series = person_rows['predicted_label']
        for pos, row_id in enumerate(person_rows.index):
            recent = series.iloc[max(0, pos - window + 1): pos + 1]
            if (recent == series.iloc[pos]).sum() >= threshold:
                yield row_id

    kept = [
        row_id
        for _, person_rows in ordered.groupby('person_id')
        for row_id in _stable_rows(person_rows)
    ]

    stable_df = ordered.loc[kept].copy()
    dropped = len(ordered) - len(stable_df)

    print(f"-> Filtered {dropped} unstable/transitional frames.")
    print(f"‚úÖ {len(stable_df)} stable emotional events remain.")
    return stable_df

In [5]:
# ==============================================================================
# 3. CORE PROCESSING FUNCTION (with Multi-Region Filter)
# ==============================================================================
def analyze_video_with_multi_region_filter(video_path, save_dir, model, processor, device, process_every_n_frames=1):
    """Scan a video and log full-face emotion predictions for expressive faces.

    Every ``process_every_n_frames``-th frame is searched for faces. For each
    face the eyebrow-to-eye band ("upper face") is classified first; only when
    that band is classified as something other than ``'neutral'`` is the full
    face classified and logged. Mouth movement alone never triggers a log
    entry, because the mouth region is not consulted.

    Args:
        video_path: Path to the video file.
        save_dir: Unused by this function; kept so existing callers keep working.
        model, processor, device: Forwarded to ``get_emotion_predictions``.
        process_every_n_frames: Frame stride (>= 1). 1 analyses every frame.

    Returns:
        A list of dicts with ``timestamp_seconds``, ``frame_number``,
        ``face_index`` plus the keys returned by ``get_emotion_predictions``
        for the full face. Empty when the file is missing or cannot be opened.

    Raises:
        ValueError: If ``process_every_n_frames`` is smaller than 1.
    """
    if process_every_n_frames < 1:
        # A stride of 0 would otherwise surface as a ZeroDivisionError mid-run.
        raise ValueError("process_every_n_frames must be a positive integer")

    if not os.path.exists(video_path):
        print(f"‚ùå Error: Video file not found at {video_path}")
        return []

    all_results_log = []
    video_capture = cv2.VideoCapture(video_path)
    pbar = None

    try:
        if not video_capture.isOpened():
            print(f"‚ùå Error: Could not open video at {video_path}")
            return []

        total_frames = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
        reported_fps = video_capture.get(cv2.CAP_PROP_FPS)
        fps = reported_fps if reported_fps > 0 else 30  # fall back when the container reports no fps
        print(f"‚úÖ Opened video: {os.path.basename(video_path)} ({total_frames} frames at {fps:.2f} fps)")

        # The reported frame count is only used for the progress bar; reading
        # continues until the decoder runs out of frames, so a missing or
        # too-small count cannot truncate the analysis.
        pbar = tqdm(total=total_frames if total_frames > 0 else None, desc="Analyzing Video")

        frame_count = 0
        while True:
            ret, frame = video_capture.read()
            if not ret:
                break

            if frame_count % process_every_n_frames == 0:
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame_height = rgb_frame.shape[0]

                # Face boxes and the landmarks computed for those same boxes.
                face_locations = face_recognition.face_locations(rgb_frame)
                face_landmarks_list = face_recognition.face_landmarks(rgb_frame, face_locations)

                for i, (location, face_landmarks) in enumerate(zip(face_locations, face_landmarks_list)):
                    top, right, bottom, left = location

                    # Upper-face band: top of the eyebrows down to the bottom of the eyes.
                    brow_points = face_landmarks['left_eyebrow'] + face_landmarks['right_eyebrow']
                    eye_points = face_landmarks['left_eye'] + face_landmarks['right_eye']
                    # Clamp to the frame: a negative row index would wrap around in the slice.
                    upper_top = max(0, min(p[1] for p in brow_points))
                    upper_bottom = min(frame_height, max(p[1] for p in eye_points))

                    # Skip degenerate regions instead of building empty images.
                    if upper_bottom <= upper_top or right <= left or bottom <= top:
                        continue

                    upper_face_image = Image.fromarray(rgb_frame[upper_top:upper_bottom, left:right])
                    upper_face_results = get_emotion_predictions(upper_face_image, model, processor, device)

                    # Gate on the upper face only: neutral (or unusable) eyes/brows
                    # means no log entry, whatever the mouth is doing.
                    if not upper_face_results or upper_face_results['predicted_label'] == 'neutral':
                        continue

                    # The logged prediction comes from the full face crop.
                    full_face_image = Image.fromarray(rgb_frame[top:bottom, left:right])
                    full_face_results = get_emotion_predictions(full_face_image, model, processor, device)

                    if full_face_results:
                        all_results_log.append({
                            "timestamp_seconds": frame_count / fps,
                            "frame_number": frame_count,
                            "face_index": i,
                            **full_face_results
                        })

            frame_count += 1
            pbar.update(1)
    finally:
        # Release the progress bar and the capture even if analysis fails part-way.
        if pbar is not None:
            pbar.close()
        video_capture.release()

    print(f"‚úÖ Video processing complete. Logged {len(all_results_log)} stable emotional events.")
    return all_results_log

In [6]:
# ==============================================================================
# 4. MAIN EXECUTION BLOCK
# ==============================================================================
if __name__ == '__main__':
    # --- Setup: one fresh "V<n>_<timestamp>" folder per run ---
    VERSION = get_next_version(ANALYSIS_OUTPUT_ROOT)
    run_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    SAVE_DIR = os.path.join(ANALYSIS_OUTPUT_ROOT, f"{VERSION}_{run_stamp}")
    os.makedirs(SAVE_DIR, exist_ok=True)
    print(f"üìÅ Created analysis directory: {SAVE_DIR}")

    # --- Load Assets: prefer Apple's MPS backend, otherwise run on CPU ---
    if torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
        device = torch.device("cpu")
    model = AutoModelForImageClassification.from_pretrained(MODEL_PATH)
    model = model.to(device).eval()
    processor = AutoImageProcessor.from_pretrained(MODEL_PATH)
    print(f"‚úÖ Assets loaded onto {device}.")

    # --- Process Video ---
    video_to_process = "/Users/natalyagrokh/AI/ml_expressions/img_expressions/data_flywheel/sample_vids/StreetQs.mp4"
    analysis_options = {
        "video_path": video_to_process,
        "save_dir": SAVE_DIR,
        "model": model,
        "processor": processor,
        "device": device,
        "process_every_n_frames": 15,  # Larger stride keeps test runs fast
    }
    analysis_log = analyze_video_with_multi_region_filter(**analysis_options)

    # --- Save Log: write a CSV only when something was logged ---
    if not analysis_log:
        print("\n‚ö†Ô∏è No stable emotional events were detected.")
    else:
        log_df = pd.DataFrame(analysis_log)
        csv_path = os.path.join(SAVE_DIR, "multi_region_filtered_log.csv")
        log_df.to_csv(csv_path, index=False)
        print(f"\n‚úÖ Successfully saved final analysis log to: {csv_path}")

üìÅ Created analysis directory: /Users/natalyagrokh/AI/ml_expressions/img_expressions/data_flywheel/V6_20250716_112248
‚úÖ Assets loaded onto mps.
‚úÖ Opened video: StreetQs.mp4 (5657 frames at 30.00 fps)


Analyzing Video: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5657/5657 [01:46<00:00, 53.30it/s]

‚úÖ Video processing complete. Logged 361 stable emotional events.

‚úÖ Successfully saved final analysis log to: /Users/natalyagrokh/AI/ml_expressions/img_expressions/data_flywheel/V6_20250716_112248/multi_region_filtered_log.csv



