In [1]:
import pandas as pd
import os
import shutil
from tqdm import tqdm

In [2]:
# ==============================================================================
# 1. CONFIGURATION
# ==============================================================================

# --- IMPORTANT: Point this to the versioned folder from your last analysis run ---
# Passed as `run_dir` to create_correction_dataset(), which looks inside it for
# the two review sub-folders ("certainty_micro_expression_review" and
# "certainty_stable_emotion_review"), each holding a "simplified_review_log.csv".
# NOTE(review): this is a machine-specific absolute path; edit it before running elsewhere.
RUN_DIRECTORY = "/Users/natalyagrokh/AI/ml_expressions/img_expressions/data_flywheel/V9_20250725_154347"

# Name of the new dataset folder. It is passed as `output_name` and the folder
# is created next to the run folder (in its parent directory), not inside it.
OUTPUT_DATASET_NAME = "CorrectionSet_V2"

In [3]:
# ==============================================================================
# 2. SCRIPT LOGIC
# ==============================================================================

def create_correction_dataset(run_dir, output_name):
    """Sort manually reviewed face crops into a binary classification dataset.

    Reads ``simplified_review_log.csv`` from both review queues under
    ``run_dir`` (``certainty_micro_expression_review`` and
    ``certainty_stable_emotion_review``), merges them (first occurrence of a
    ``face_crop_path`` wins, micro queue first) and copies every reviewed image
    into one of two class folders created next to the run folder:

    * ``<parent of run_dir>/<output_name>/Emotion``
    * ``<parent of run_dir>/<output_name>/Speech_Action``

    Sorting is driven only by the ``actual_label`` column (the ``notes``
    column is not consulted):

    * label contains 'mid-speech', 'mid speech', 'laughter' or 'mixed'
      (case-insensitive) -> ``Speech_Action``
    * anything else -> ``Emotion``. This covers a blank label (the model's
      prediction was accepted) as well as a corrected emotion label.

    Args:
        run_dir: Path to the versioned analysis run folder containing the two
            review sub-folders. A trailing path separator is tolerated.
        output_name: Name of the dataset folder to create beside ``run_dir``.

    Returns:
        None. Progress, warnings and a final summary are printed. If a review
        log is missing, or the logs have no ``face_crop_path`` column, an
        error is printed and nothing is created or copied.
    """
    review_log_name = "simplified_review_log.csv"
    # Substrings of `actual_label` that mark a non-emotional action.
    non_emotion_keywords = ('mid-speech', 'mid speech', 'laughter', 'mixed')

    # --- Setup paths for BOTH review queues ---
    micro_review_folder = os.path.join(run_dir, "certainty_micro_expression_review")
    stable_review_folder = os.path.join(run_dir, "certainty_stable_emotion_review")
    micro_csv_path = os.path.join(micro_review_folder, review_log_name)
    stable_csv_path = os.path.join(stable_review_folder, review_log_name)

    # normpath drops a trailing separator; without it dirname("/runs/V9/") is
    # "/runs/V9" and the dataset would land INSIDE the run folder.
    output_dir = os.path.join(os.path.dirname(os.path.normpath(run_dir)), output_name)
    emotion_dir = os.path.join(output_dir, "Emotion")
    action_dir = os.path.join(output_dir, "Speech_Action")

    # Both curated logs are required; report every missing one, then stop.
    missing_logs = [p for p in (micro_csv_path, stable_csv_path) if not os.path.exists(p)]
    if missing_logs:
        print("❌ Error: One or both review CSV files are missing.")
        for log_path in missing_logs:
            print(f"   - Missing: {log_path}")
        return

    # --- Load and combine both curated CSVs ---
    # Loaded before any folder is created so a bad log leaves no empty dataset behind.
    combined_df = pd.concat([pd.read_csv(micro_csv_path), pd.read_csv(stable_csv_path)])
    if 'face_crop_path' not in combined_df.columns:
        print("❌ Error: The review logs have no 'face_crop_path' column.")
        return

    # Blank paths (e.g. a stray, partially filled row in a hand-edited CSV) are
    # read as NaN and would crash os.path.join, so drop them up front.
    rows_before = len(combined_df)
    combined_df = combined_df.dropna(subset=['face_crop_path'])
    blank_rows = rows_before - len(combined_df)
    if blank_rows:
        print(f"⚠️ Warning: Ignoring {blank_rows} row(s) with an empty face_crop_path.")

    combined_df = combined_df.drop_duplicates(subset=['face_crop_path']).reset_index(drop=True)
    print(f"✅ Loaded and combined both review logs. Total unique images to process: {len(combined_df)}")

    # Create the new dataset directories
    os.makedirs(emotion_dir, exist_ok=True)
    os.makedirs(action_dir, exist_ok=True)

    # A log without an 'actual_label' column is treated as "every label blank".
    if 'actual_label' in combined_df.columns:
        labels = combined_df['actual_label']
    else:
        labels = [''] * len(combined_df)

    # --- Process and sort images ---
    copied_count = 0
    for source_filename, raw_label in tqdm(zip(combined_df['face_crop_path'], labels),
                                           total=len(combined_df), desc="Sorting images"):
        # The source image could be in either review folder, so check both
        # (micro queue first). isfile() also rejects directories.
        source_path = os.path.join(micro_review_folder, source_filename)
        if not os.path.isfile(source_path):
            source_path = os.path.join(stable_review_folder, source_filename)
        if not os.path.isfile(source_path):
            # tqdm.write keeps the progress bar intact, unlike a bare print().
            tqdm.write(f"⚠️ Warning: Source image not found, skipping: {source_path}")
            continue

        # --- The sorting logic ---
        # A blank label means the reviewer accepted the model's prediction, so
        # it is a genuine emotion, exactly like a corrected emotion label.
        label = '' if pd.isna(raw_label) else str(raw_label).lower()
        is_action = any(keyword in label for keyword in non_emotion_keywords)
        shutil.copy(source_path, action_dir if is_action else emotion_dir)
        copied_count += 1

    print(f"\n✅ Success! Created the '{output_name}' dataset with {copied_count} images.")
    print(f"   - Location: {output_dir}")
    print(f"   - Emotion examples: {len(os.listdir(emotion_dir))}")
    print(f"   - Speech Action examples: {len(os.listdir(action_dir))}")

In [4]:
# ==============================================================================
# 3. MAIN EXECUTION BLOCK
# ==============================================================================
if __name__ == '__main__':
    # Only build the dataset when the configured run folder is really there;
    # otherwise report the bad path and do nothing.
    run_dir_exists = os.path.isdir(RUN_DIRECTORY)
    if run_dir_exists:
        create_correction_dataset(run_dir=RUN_DIRECTORY, output_name=OUTPUT_DATASET_NAME)
    else:
        print(f"❌ Error: The specified RUN_DIRECTORY does not exist: {RUN_DIRECTORY}")

✅ Loaded and combined both review logs. Total unique images to process: 254


Sorting images: 100%|███████████████████████| 254/254 [00:00<00:00, 1073.66it/s]


✅ Success! Created the 'CorrectionSet_V2' dataset with 254 images.
   - Location: /Users/natalyagrokh/AI/ml_expressions/img_expressions/data_flywheel/CorrectionSet_V2
   - Emotion examples: 107
   - Speech Action examples: 147



