In [1]:
# using V2 script for V7 run with new video data

In [2]:
import pandas as pd
import os
import glob
import re
import shutil

In [3]:
# ==============================================================================
# 1. CONFIGURATION
# ==============================================================================
# Root folder that holds one sub-directory per analysis run (names starting with "V").
# The default is the original local path; set the DATA_FLYWHEEL_ROOT environment
# variable to point the notebook at another machine's data without editing this cell.
ANALYSIS_OUTPUT_ROOT = (
    os.environ.get("DATA_FLYWHEEL_ROOT")
    or "/Users/natalyagrokh/AI/ml_expressions/img_expressions/data_flywheel"
)

# --- Filtering Thresholds ---
# These are informed by an earlier diagnostic run (not part of this notebook).
# NOTE(review): the original note says 0.88 keeps "the top 75% of confident
# predictions" -- TODO confirm against the diagnostic output.
CONFIDENCE_THRESHOLD = 0.88  # Only rows with confidence strictly above this are considered.
TOP_N_TO_REVIEW = 150        # Max number of highest-entropy rows taken from that pool.

In [7]:
# ==============================================================================
# 2. UTILITY FUNCTIONS
# ==============================================================================

def extract_version_from_path(path):
    """Return the integer run version embedded in a directory name.

    The last component of ``path`` is searched for the first ``V<digits>``
    occurrence, e.g. ``V7_20250717_122112`` yields ``7``.

    Args:
        path: Path to a run directory. Trailing path separators are ignored.

    Returns:
        int: The captured number, or ``-1`` when the name contains no
        ``V<digits>`` sequence.
    """
    # normpath strips trailing separators; without it basename("/runs/V7/")
    # is "" and a valid run directory would be reported as version -1.
    dir_name = os.path.basename(os.path.normpath(path))
    match = re.search(r"V(\d+)", dir_name)
    return int(match.group(1)) if match else -1

def create_review_queue(log_df, run_dir, confidence_thresh, top_n):
    """Build a manual-review folder with the most uncertain high-confidence rows.

    Rows of ``log_df`` whose ``confidence`` is strictly greater than
    ``confidence_thresh`` are ranked by ``entropy`` (highest first) and the
    first ``top_n`` are kept. Each kept row's ``face_crop_path`` file is copied
    into ``<run_dir>/manual_review_queue`` and a ``simplified_review_log.csv``
    (frame number, file name, predicted label, sorted by frame number) is
    written next to the copies.

    Args:
        log_df: DataFrame with ``confidence`` and ``entropy`` columns; the CSV
            step also reads ``frame_number``, ``face_crop_path`` and
            ``predicted_label``.
        run_dir: Directory in which the ``manual_review_queue`` folder is created.
        confidence_thresh: Exclusive lower bound on ``confidence``.
        top_n: Maximum number of rows to keep.

    Returns:
        None. Results are written to disk and progress is printed.

    Raises:
        KeyError: If ``confidence`` or ``entropy`` is missing, or if rows were
            selected and a column needed for the CSV is missing.

    Note:
        Files are copied under their base name, so two source files sharing a
        base name overwrite one another in the review folder.
    """
    print("\n--- Creating Prioritized Manual Review Queue ---")

    # Step 1 & 2: keep confident rows, then take the top N by entropy
    confident_df = log_df[log_df['confidence'] > confidence_thresh]
    review_df = confident_df.sort_values(by='entropy', ascending=False).head(top_n)
    print(f"-> Selected the Top {len(review_df)} most uncertain images for review.")

    # Step 3: copy the selected crops into the review folder
    review_folder_path = os.path.join(run_dir, "manual_review_queue")
    os.makedirs(review_folder_path, exist_ok=True)

    crop_paths = review_df['face_crop_path'] if 'face_crop_path' in review_df.columns else []
    copied_count = 0
    skipped_count = 0
    for source_path in crop_paths:
        # A blank CSV cell is read back as float NaN, which is truthy and makes
        # os.path.exists raise TypeError, so only path-like values are checked.
        is_path = isinstance(source_path, (str, os.PathLike))
        if not (is_path and source_path and os.path.exists(source_path)):
            skipped_count += 1
            continue
        try:
            shutil.copy(source_path, review_folder_path)
            copied_count += 1
        except OSError as e:
            print(f"⚠️ Could not copy file {source_path}. Error: {e}")

    if skipped_count:
        print(f"⚠️ Skipped {skipped_count} rows whose face_crop_path was missing or not found on disk.")
    print(f"\n✅ Success! Copied {copied_count} images to: {review_folder_path}")

    # Step 4: write the simplified CSV used during manual curation
    if not review_df.empty:
        simplified_df = review_df[['frame_number', 'face_crop_path', 'predicted_label']].copy()

        # Reduce each path to its file name; missing values are left untouched
        simplified_df['face_crop_path'] = simplified_df['face_crop_path'].map(
            lambda p: os.path.basename(p) if isinstance(p, (str, os.PathLike)) else p
        )

        # Order by frame number so the log follows the video timeline
        simplified_df = simplified_df.sort_values(by='frame_number').reset_index(drop=True)

        simplified_csv_path = os.path.join(review_folder_path, "simplified_review_log.csv")
        simplified_df.to_csv(simplified_csv_path, index=False)
        print(f"✅ Created a sortable, simplified log for manual curation at: {simplified_csv_path}")

In [8]:
# ==============================================================================
# 3. MAIN EXECUTION BLOCK
# ==============================================================================
if __name__ == '__main__':

    # Collect run directories (names starting with "V") under the output root.
    # A missing root is treated as "no runs" so it is reported with the same
    # message below instead of os.listdir raising FileNotFoundError.
    if os.path.isdir(ANALYSIS_OUTPUT_ROOT):
        all_run_dirs = [
            os.path.join(ANALYSIS_OUTPUT_ROOT, d)
            for d in os.listdir(ANALYSIS_OUTPUT_ROOT)
            if d.startswith("V") and os.path.isdir(os.path.join(ANALYSIS_OUTPUT_ROOT, d))
        ]
    else:
        all_run_dirs = []

    if not all_run_dirs:
        print(f"❌ Error: No run directories found in {ANALYSIS_OUTPUT_ROOT}")
    else:
        # Highest version wins; the directory name breaks ties so the choice
        # between runs of the same version no longer depends on listing order
        # (names look like V7_20250717_122112, so the later timestamp is chosen).
        latest_run_dir = max(
            all_run_dirs,
            key=lambda p: (extract_version_from_path(p), os.path.basename(p)),
        )
        print(f"✅ Automatically analyzing latest run: {os.path.basename(latest_run_dir)}")

        # The run's final log file is the input for the review queue.
        log_path = os.path.join(latest_run_dir, "final_stable_emotion_log.csv")

        if not os.path.exists(log_path):
            print(f"❌ Error: Could not find 'final_stable_emotion_log.csv' in the directory: {latest_run_dir}")
        else:
            log_df = pd.read_csv(log_path)

            # Select the rows to review, copy their images and write the CSV log
            create_review_queue(
                log_df=log_df,
                run_dir=latest_run_dir,
                confidence_thresh=CONFIDENCE_THRESHOLD,
                top_n=TOP_N_TO_REVIEW
            )

✅ Automatically analyzing latest run: V7_20250717_122112

--- Creating Prioritized Manual Review Queue ---
-> Selected the Top 150 most uncertain images for review.

✅ Success! Copied 150 images to: /Users/natalyagrokh/AI/ml_expressions/img_expressions/data_flywheel/V7_20250717_122112/manual_review_queue
✅ Created a sortable, simplified log for manual curation at: /Users/natalyagrokh/AI/ml_expressions/img_expressions/data_flywheel/V7_20250717_122112/manual_review_queue/simplified_review_log.csv
