In [None]:
import pandas as pd
import os
import glob
import re
import shutil

# ==============================================================================
# 1. CONFIGURATION
# ==============================================================================
# Directory that the main block scans for "V*" run sub-directories.
ANALYSIS_OUTPUT_ROOT = "/Users/natalyagrokh/AI/ml_expressions/img_expressions/flywheel"
# Only rows whose 'confidence' is strictly greater than this value are kept.
CONFIDENCE_THRESHOLD = 0.88
# Maximum number of rows (highest 'entropy' first) placed in the review queue.
TOP_N_TO_REVIEW = 150

# ==============================================================================
# 2. HELPER FUNCTIONS
# ==============================================================================

def extract_version_from_path(path):
    """Return the integer version number encoded in a run directory's name.

    The last component of ``path`` is searched for the first occurrence of
    ``V`` followed by digits (e.g. ``V4_20250714_122050`` -> ``4``).

    Args:
        path: Path to a run directory; a trailing path separator is allowed.

    Returns:
        The version as an ``int``, or ``-1`` when the directory name contains
        no ``V<digits>`` marker.
    """
    # os.path.basename("/runs/V4/") is "", so strip trailing separators first;
    # otherwise a path written with a trailing slash would always yield -1.
    dir_name = os.path.basename(os.path.normpath(path))
    match = re.search(r"V(\d+)", dir_name)
    return int(match.group(1)) if match else -1

def create_review_queue(log_df, run_dir, confidence_thresh, top_n):
    """Build the manual review queue for one run.

    Rows of ``log_df`` whose ``confidence`` is strictly greater than
    ``confidence_thresh`` are kept, sorted by ``entropy`` (highest first) and
    truncated to ``top_n`` rows. For each selected row the file referenced by
    ``face_crop_path`` is copied into ``<run_dir>/manual_review_queue``, and
    the selected rows are written to ``simplified_review_log.csv`` in that
    folder with ``face_crop_path`` reduced to the bare file name.

    Args:
        log_df: DataFrame with at least ``confidence`` and ``entropy``
            columns; ``face_crop_path`` is used when present.
        run_dir: Run directory in which the review folder is created.
        confidence_thresh: Exclusive lower bound on ``confidence``.
        top_n: Maximum number of rows to select.

    Returns:
        None. Progress and warnings are printed.
    """
    print("\n--- Creating Prioritized Manual Review Queue ---")

    # Step 1: Filter for confident predictions.
    confident_mask = log_df['confidence'] > confidence_thresh
    confident_df = log_df[confident_mask]
    print(f"-> Found {len(confident_df)} stable predictions with confidence > {confidence_thresh}.")

    if confident_df.empty:
        print("⚠️ No predictions met the confidence threshold. Nothing to review.")
        return

    # Step 2: From the confident pool, sort by entropy and take the top N.
    review_df = confident_df.sort_values(by='entropy', ascending=False).head(top_n)
    print(f"-> Selected the Top {len(review_df)} most uncertain images for review.")

    # Step 3: Copy images to a review folder using the full path.
    review_folder_path = os.path.join(run_dir, "manual_review_queue")
    os.makedirs(review_folder_path, exist_ok=True)

    copied_count = 0
    for _, row in review_df.iterrows():
        source_path = row.get('face_crop_path')
        # Empty CSV cells are read back as float NaN, which is truthy and
        # makes os.path.exists raise TypeError; only real path values may
        # reach the filesystem calls.
        is_path_value = isinstance(source_path, (str, os.PathLike)) and bool(source_path)
        if is_path_value and os.path.exists(source_path):
            try:
                shutil.copy(source_path, review_folder_path)
                copied_count += 1
            except OSError as e:
                # Best effort: one unreadable file must not abort the queue.
                print(f"⚠️ Could not copy file {source_path}. Error: {e}")
        else:
            print(f"⚠️ File not found and could not be copied: {source_path}")

    print(f"\n✅ Success! Copied {copied_count} images to: {review_folder_path}")

    # Step 4: Create and save the simplified CSV for manual curation.
    if not review_df.empty:
        # Start with a copy of the filtered data to preserve all original columns.
        simplified_df = review_df.copy()

        # Reduce each path to its file name so the log lines up with the
        # copied files. Missing values are left untouched instead of being
        # passed to os.path.basename, which would raise on them.
        if 'face_crop_path' in simplified_df.columns:
            simplified_df['face_crop_path'] = simplified_df['face_crop_path'].apply(
                lambda p: os.path.basename(p) if isinstance(p, (str, os.PathLike)) else p
            )

        # Save the new CSV inside the manual_review_queue folder.
        simplified_csv_path = os.path.join(review_folder_path, "simplified_review_log.csv")
        simplified_df.to_csv(simplified_csv_path, index=False)
        print(f"✅ Created a simplified log for manual curation at: {simplified_csv_path}")

# ==============================================================================
# 3. MAIN EXECUTION BLOCK
# ==============================================================================
if __name__ == '__main__':

    # Automatically find the most recent run directory by version number.
    # A missing or unreadable output root is reported through the same
    # "no run directories" message below instead of an uncaught traceback.
    try:
        root_entries = os.listdir(ANALYSIS_OUTPUT_ROOT)
    except OSError:
        root_entries = []

    all_run_dirs = [
        os.path.join(ANALYSIS_OUTPUT_ROOT, d)
        for d in root_entries
        if d.startswith("V") and os.path.isdir(os.path.join(ANALYSIS_OUTPUT_ROOT, d))
    ]

    if not all_run_dirs:
        print(f"❌ Error: No run directories found in {ANALYSIS_OUTPUT_ROOT}")
    else:
        # os.listdir order is arbitrary, so break ties between runs sharing a
        # version number by directory name; the choice is then deterministic.
        latest_run_dir = max(
            all_run_dirs,
            key=lambda run_path: (extract_version_from_path(run_path), os.path.basename(run_path)),
        )
        print(f"✅ Automatically analyzing latest run: {os.path.basename(latest_run_dir)}")

        # Point to the final, stable log file created by the producer.
        log_path = os.path.join(latest_run_dir, "final_stable_emotion_log.csv")

        if not os.path.exists(log_path):
            print(f"❌ Error: Could not find 'final_stable_emotion_log.csv' in the directory: {latest_run_dir}")
        else:
            log_df = pd.read_csv(log_path)

            # Run the final analysis and file-copying function
            create_review_queue(
                log_df=log_df,
                run_dir=latest_run_dir,
                confidence_thresh=CONFIDENCE_THRESHOLD,
                top_n=TOP_N_TO_REVIEW
            )

In [1]:
import pandas as pd
import os
import glob
import re
import shutil

In [2]:
# ==============================================================================
# 1. CONFIGURATION
# ==============================================================================
# Directory that the main block scans for "V*" run sub-directories.
ANALYSIS_OUTPUT_ROOT = "/Users/natalyagrokh/AI/ml_expressions/img_expressions/flywheel"

# --- Filtering Thresholds ---
# These are now informed by our diagnostic run.
# NOTE(review): the "top 75%" figure below is carried over from the original
# note and cannot be derived from this file; confirm against the diagnostic run.
CONFIDENCE_THRESHOLD = 0.88  # Exclusive lower bound on 'confidence' (reportedly keeps the top 75% of confident predictions).
TOP_N_TO_REVIEW = 150        # Take at most the 150 highest-entropy (most uncertain) images from that pool.

In [3]:
# ==============================================================================
# 2. HELPER FUNCTIONS
# ==============================================================================

def extract_version_from_path(path):
    """Extract the integer version number (e.g. V1, V2) from a directory name.

    Only the last component of ``path`` is inspected, and the first ``V``
    followed by digits wins. Returns ``-1`` when no such marker is present.
    """
    # Normalise first: os.path.basename() of a path ending in a separator is
    # "", which would make "/runs/V4_x/" look like it has no version at all.
    last_component = os.path.basename(os.path.normpath(path))
    match = re.search(r"V(\d+)", last_component)
    return int(match.group(1)) if match else -1

def find_high_value_review_images(log_df, run_dir, confidence_thresh, top_n):
    """Select high-value images for manual review and copy their face crops.

    Rows of ``log_df`` whose ``confidence`` is strictly greater than
    ``confidence_thresh`` are kept, sorted by ``entropy`` (highest first) and
    truncated to ``top_n`` rows. The file referenced by each selected row's
    ``face_crop_path`` is copied into ``<run_dir>/manual_review_queue``.

    Args:
        log_df: DataFrame with ``confidence`` and ``entropy`` columns;
            ``face_crop_path`` is read per row when present.
        run_dir: Run directory in which the review folder is created.
        confidence_thresh: Exclusive lower bound on ``confidence``.
        top_n: Maximum number of rows to select.

    Returns:
        None. Progress and warnings are printed.
    """
    print("\n--- Finding High-Value Images for Manual Review ---")

    # Step 1: Filter for confident predictions.
    confident_mask = log_df['confidence'] > confidence_thresh
    confident_df = log_df[confident_mask]
    print(f"-> Found {len(confident_df)} predictions with confidence > {confidence_thresh}.")

    if confident_df.empty:
        print("⚠️ No predictions met the confidence threshold. Nothing to review.")
        return

    # Step 2: From the confident pool, sort by entropy to find the most uncertain.
    review_df = confident_df.sort_values(by='entropy', ascending=False).head(top_n)
    print(f"-> Selected the Top {len(review_df)} most uncertain images from the confident set.")

    # Step 3: Copy the corresponding face crops to a review folder.
    review_folder_path = os.path.join(run_dir, "manual_review_queue")
    os.makedirs(review_folder_path, exist_ok=True)

    copied_count = 0
    for _, row in review_df.iterrows():
        source_path = row.get('face_crop_path')
        # An empty CSV cell comes back as float NaN, which is truthy and makes
        # os.path.exists raise TypeError; treat any non-path value as missing.
        is_path_value = isinstance(source_path, (str, os.PathLike)) and bool(source_path)
        if is_path_value and os.path.exists(source_path):
            try:
                shutil.copy(source_path, review_folder_path)
                copied_count += 1
            except OSError as e:
                # Best effort: keep going when a single file cannot be copied.
                print(f"⚠️ Could not copy file {source_path}. Error: {e}")
        else:
            print(f"⚠️ File not found and could not be copied: {source_path}")

    print(f"\n✅ Success! Copied {copied_count} images to: {review_folder_path}")

In [4]:
# ==============================================================================
# 3. MAIN EXECUTION BLOCK
# ==============================================================================
if __name__ == '__main__':

    # Automatically find the most recent run directory by version number.
    # If the output root is missing or unreadable, fall through to the
    # "no run directories" message rather than failing with a traceback.
    try:
        root_entries = os.listdir(ANALYSIS_OUTPUT_ROOT)
    except OSError:
        root_entries = []

    all_run_dirs = [
        os.path.join(ANALYSIS_OUTPUT_ROOT, d)
        for d in root_entries
        if d.startswith("V") and os.path.isdir(os.path.join(ANALYSIS_OUTPUT_ROOT, d))
    ]

    if not all_run_dirs:
        print(f"❌ Error: No run directories found in {ANALYSIS_OUTPUT_ROOT}")
    else:
        # Several runs can share a version number; os.listdir order is
        # arbitrary, so use the directory name as a deterministic tie-breaker.
        latest_run_dir = max(
            all_run_dirs,
            key=lambda run_path: (extract_version_from_path(run_path), os.path.basename(run_path)),
        )
        print(f"✅ Automatically analyzing latest run: {os.path.basename(latest_run_dir)}")

        log_path = os.path.join(latest_run_dir, "filtered_emotion_log.csv")

        if not os.path.exists(log_path):
            print(f"❌ Error: Could not find 'filtered_emotion_log.csv' in the directory: {latest_run_dir}")
        else:
            log_df = pd.read_csv(log_path)

            # Run the final analysis and file-copying function
            find_high_value_review_images(
                log_df=log_df,
                run_dir=latest_run_dir,
                confidence_thresh=CONFIDENCE_THRESHOLD,
                top_n=TOP_N_TO_REVIEW
            )

✅ Automatically analyzing latest run: V4_20250714_122050

--- Finding High-Value Images for Manual Review ---
-> Found 267 predictions with confidence > 0.88.
-> Selected the Top 150 most uncertain images from the confident set.

✅ Success! Copied 150 images to: /Users/natalyagrokh/AI/ml_expressions/img_expressions/flywheel/V4_20250714_122050/manual_review_queue
