In [2]:
# V6 changes:
    # section #2 - create_review_queue updated and simplified

In [3]:
import pandas as pd
import os
import glob
import re

In [4]:
# ==============================================================================
# 1. CONFIGURATION
# ==============================================================================
# Root folder that holds one sub-directory per analysis run; run directories
# have names starting with "V" (e.g. "V6_20250716_112248").
# Set the ANALYSIS_OUTPUT_ROOT environment variable to run this notebook
# against another location; the original local path remains the default.
ANALYSIS_OUTPUT_ROOT = os.environ.get(
    "ANALYSIS_OUTPUT_ROOT",
    "/Users/natalyagrokh/AI/ml_expressions/img_expressions/data_flywheel",
)

# --- Filtering Thresholds ---
# Passed to create_review_queue as `confidence_thresh`; that function
# currently does not apply it.
CONFIDENCE_THRESHOLD = 0.88
# Passed to create_review_queue as `top_n`: the number of highest-confidence
# rows kept in the review queue.
TOP_N_TO_REVIEW = 150

In [5]:
# ==============================================================================
# 2. UTILITY FUNCTIONS
# ==============================================================================

#  Extracts integer version number (e.g., V1, V2) from directory name.
def extract_version_from_path(path):
    """Return the integer run version encoded in a directory name.

    The last component of ``path`` is searched for the first ``V<digits>``
    occurrence, e.g. ``"V6_20250716_112248"`` gives ``6``.

    Args:
        path: Path (or bare name) of a run directory. A trailing path
            separator is tolerated.

    Returns:
        The version number as an ``int``, or ``-1`` when the directory name
        contains no ``V<digits>`` pattern.
    """
    # normpath drops trailing separators; without it basename("/runs/V6/")
    # is "" and a valid run directory would be reported as version -1.
    dir_name = os.path.basename(os.path.normpath(path))
    match = re.search(r"V(\d+)", dir_name)
    return int(match.group(1)) if match else -1

# Builds the review queue: the `top_n` rows of the log with the highest
# 'confidence' values (no confidence-threshold or entropy filtering is applied).
def create_review_queue(log_df, run_dir, confidence_thresh, top_n):
    """Select the ``top_n`` highest-confidence rows of the log for manual audit.

    Args:
        log_df: DataFrame containing a 'confidence' column.
        run_dir: Unused; kept so existing callers keep working.
        confidence_thresh: Unused; kept so existing callers keep working.
            No threshold filter is applied.
        top_n: Maximum number of rows to return.

    Returns:
        A new DataFrame with at most ``top_n`` rows ordered by descending
        confidence. Rows with equal confidence keep their original relative
        order, so the selection is reproducible between runs. Rows whose
        confidence is missing sort last (pandas default ``na_position``).
        ``log_df`` itself is not modified.

    Raises:
        KeyError: If ``log_df`` has no 'confidence' column.
    """
    if 'confidence' not in log_df.columns:
        raise KeyError(
            "log_df has no 'confidence' column; available columns: "
            f"{list(log_df.columns)}"
        )

    print(f"\n--- Finding the Top {top_n} Most Confident Predictions for Audit ---")

    # A stable sort keeps tied confidences in log order; the default
    # (quicksort) makes no such guarantee, so which tied rows survive the
    # cut-off could otherwise differ from run to run.
    ranked_df = log_df.sort_values(by='confidence', ascending=False, kind='mergesort')
    review_df = ranked_df.head(top_n)

    print(f"✅ Selected the Top {len(review_df)} most confident predictions for review.")

    return review_df

In [7]:
# ==============================================================================
# 3. MAIN EXECUTION BLOCK
# ==============================================================================
if __name__ == '__main__':

    # Collect candidate run directories: sub-directories of the output root
    # whose name starts with "V" and carries a parseable version number.
    if os.path.isdir(ANALYSIS_OUTPUT_ROOT):
        all_run_dirs = [
            os.path.join(ANALYSIS_OUTPUT_ROOT, d)
            for d in os.listdir(ANALYSIS_OUTPUT_ROOT)
            if d.startswith("V")
            and os.path.isdir(os.path.join(ANALYSIS_OUTPUT_ROOT, d))
            # Skip V-prefixed folders without a version (e.g. "Validation"),
            # which would otherwise be eligible as the "latest run".
            and extract_version_from_path(d) >= 0
        ]
    else:
        # os.listdir raises on a missing root; report it through the same
        # "no run directories" message instead of a traceback.
        all_run_dirs = []

    if not all_run_dirs:
        print(f"❌ Error: No run directories found in {ANALYSIS_OUTPUT_ROOT}")
    else:
        # The latest run is the one with the highest version number.
        latest_run_dir = max(all_run_dirs, key=extract_version_from_path)
        print(f"✅ Automatically analyzing latest run: {os.path.basename(latest_run_dir)}")

        # Prediction log expected inside the run directory.
        log_path = os.path.join(latest_run_dir, "multi_region_filtered_log.csv")

        if not os.path.exists(log_path):
            print(f"❌ Error: Could not find 'multi_region_filtered_log.csv' in the directory: {latest_run_dir}")
        else:
            log_df = pd.read_csv(log_path)

            # Select the rows to audit; the CSV itself is written below.
            review_df = create_review_queue(
                log_df=log_df,
                run_dir=latest_run_dir,
                confidence_thresh=CONFIDENCE_THRESHOLD,
                top_n=TOP_N_TO_REVIEW
            )

            if not review_df.empty:
                output_path = os.path.join(latest_run_dir, "top_confidence_review.csv")
                review_df.to_csv(output_path, index=False)
                print(f"\n✅ Successfully saved Top-Confidence review queue to: {output_path}")
            else:
                # Previously this case produced no output at all.
                print("\n⚠️ Review queue is empty; no CSV was written.")

✅ Automatically analyzing latest run: V6_20250716_112248

--- Finding the Top 150 Most Confident Predictions for Audit ---
✅ Selected the Top 150 most confident predictions for review.

✅ Successfully saved Top-Confidence review queue to: /Users/natalyagrokh/AI/ml_expressions/img_expressions/data_flywheel/V6_20250716_112248/top_confidence_review.csv
