In [10]:
import pandas as pd
import os
import glob
import re

# ==============================================================================
# 1. CONFIGURATION
# ==============================================================================
ANALYSIS_OUTPUT_ROOT = "/Users/natalyagrokh/AI/ml_expressions/img_expressions/flywheel"

# ==============================================================================
# 2. HELPER FUNCTIONS
# ==============================================================================
def extract_version_from_path(path):
    """Extracts the integer version number from a directory name."""
    match = re.search(r"V(\d+)", os.path.basename(path))
    return int(match.group(1)) if match else -1

# ==============================================================================
# 3. MAIN EXECUTION BLOCK
# ==============================================================================
if __name__ == '__main__':
    
    # --- Find the latest run directory ---
    all_run_dirs = [
        os.path.join(ANALYSIS_OUTPUT_ROOT, d)
        for d in os.listdir(ANALYSIS_OUTPUT_ROOT)
        if d.startswith("V") and os.path.isdir(os.path.join(ANALYSIS_OUTPUT_ROOT, d))
    ]

    if not all_run_dirs:
        print(f"❌ Error: No run directories found in {ANALYSIS_OUTPUT_ROOT}")
    else:
        latest_run_dir = max(all_run_dirs, key=extract_version_from_path)
        print(f"✅ Analyzing latest run directory: {os.path.basename(latest_run_dir)}")

        log_path = os.path.join(latest_run_dir, "filtered_emotion_log.csv")

        if not os.path.exists(log_path):
            print(f"❌ Error: Could not find 'filtered_emotion_log.csv' in the directory.")
        else:
            df = pd.read_csv(log_path)
            print(f"✅ Loaded '{log_path}' with {len(df)} relevant emotional events.")

            # --- Perform Diagnostic Analysis ---
            print("\n--- DATA DIAGNOSTIC REPORT ---")
            
            # 1. Analyze 'confidence'
            print("\nConfidence Score Distribution:")
            print(df['confidence'].describe())
            
            # 2. Analyze 'entropy'
            print("\nEntropy Score Distribution:")
            print(df['entropy'].describe())
            
            # 3. Analyze the maximum probability for each prediction
            prob_columns = [col for col in df.columns if col.startswith('prob_')]
            df['max_probability'] = df[prob_columns].max(axis=1)
            print("\nMax Emotion Probability Distribution:")
            print(df['max_probability'].describe())
            
            print("\n--- END OF REPORT ---")
            print("\nUse the 'min', 'max', '25%', '50%', and '75%' values above to set new, informed thresholds in the filtering script.")

✅ Analyzing latest run directory: V3_20250714_093845
✅ Loaded '/Users/natalyagrokh/AI/ml_expressions/img_expressions/flywheel/V3_20250714_093845/filtered_emotion_log.csv' with 5152 relevant emotional events.

--- DATA DIAGNOSTIC REPORT ---

Confidence Score Distribution:
count    5152.000000
mean        0.891417
std         0.085860
min         0.411724
25%         0.872956
50%         0.925737
75%         0.943520
max         0.986008
Name: confidence, dtype: float64

Entropy Score Distribution:
count    5152.000000
mean        0.521497
std         0.271253
min         0.103852
25%         0.339708
50%         0.408021
75%         0.579768
max         1.784440
Name: entropy, dtype: float64

Max Emotion Probability Distribution:
count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: max_probability, dtype: float64

--- END OF REPORT ---

Use the 'min', 'max', '25%', '50%', and '75%' values above to set new, informed thresholds in t