In [1]:
import pandas as pd
import os
import shutil
from tqdm import tqdm

In [2]:
# ==============================================================================
# 1. CONFIGURATION
# ==============================================================================

# --- IMPORTANT: Point this to the folder containing your two review queues ---
# This is the output folder from your last completed video analysis run. It is
# expected to contain the sub-folders "micro_expression_review_queue" and
# "stable_emotion_review_queue", each holding its review-log CSV and the face
# crops that the CSV refers to.
# NOTE(review): this is an absolute, machine-specific path; edit it before
# running the notebook on another machine.
RUN_DIRECTORY = "/Users/natalyagrokh/AI/ml_expressions/img_expressions/data_flywheel/V8_20250722_130942"

# --- Define the name for the new, consolidated training set ---
# The dataset folder with this name is created in the parent directory of
# RUN_DIRECTORY (i.e. next to the run folder, not inside it).
OUTPUT_DATASET_NAME = "CorrectionSet_V2"

In [3]:
# ==============================================================================
# 2. SCRIPT LOGIC
# ==============================================================================

# Lower-case substrings which, when found in a reviewer's 'actual_label' note,
# mark the crop as a non-emotional action instead of a genuine emotion.
_ACTION_KEYWORDS = ('mid-speech', 'mid speech', 'laughter', 'mixed')


def _is_action_label(raw_label):
    """Return True when a reviewer note marks the crop as a non-emotional action."""
    # An empty CSV cell arrives as NaN/None: the original prediction was accepted.
    if pd.isna(raw_label):
        return False
    label = str(raw_label).lower()
    return any(keyword in label for keyword in _ACTION_KEYWORDS)


def build_final_correction_set(run_dir, output_name):
    """Build a two-class image dataset from the two curated review logs.

    Reads ``simplified_review_log_me.csv`` and ``simplified_review_log_se.csv``
    from the ``micro_expression_review_queue`` and ``stable_emotion_review_queue``
    sub-folders of ``run_dir``, de-duplicates the rows on ``face_crop_path``
    (the first occurrence wins) and copies every referenced face crop into one
    of two class folders:

    * ``speech_Action`` - the ``actual_label`` note contains one of the
      keywords in ``_ACTION_KEYWORDS`` (case-insensitive substring match).
    * ``emotion`` - every other row, including rows with an empty
      ``actual_label`` (the original prediction was judged correct).

    The dataset is intended for training the "V2 Gatekeeper" (per the original
    notes in this notebook).

    Args:
        run_dir: Path of the analysis run folder holding both review queues.
            A trailing path separator is tolerated.
        output_name: Name of the dataset folder, created in the parent
            directory of ``run_dir``.

    Returns:
        None. Progress and a summary are printed; files are copied on disk.

    Raises:
        FileNotFoundError: If either review-log CSV does not exist.
        KeyError: If neither CSV has a ``face_crop_path`` column.
    """
    # --- Setup Paths ---
    micro_expr_folder = os.path.join(run_dir, "micro_expression_review_queue")
    stable_expr_folder = os.path.join(run_dir, "stable_emotion_review_queue")

    micro_expr_csv = os.path.join(micro_expr_folder, "simplified_review_log_me.csv")
    stable_expr_csv = os.path.join(stable_expr_folder, "simplified_review_log_se.csv")

    # normpath drops a trailing separator; without it dirname("/a/run/") would be
    # "/a/run" and the dataset would be created *inside* the run folder.
    parent_dir = os.path.dirname(os.path.normpath(run_dir))
    output_dir = os.path.join(parent_dir, output_name)
    emotion_dir = os.path.join(output_dir, "emotion")
    # NOTE(review): the folder is spelled "speech_Action" while the summary below
    # prints "Speech_Action"; kept as-is because downstream code may rely on it.
    action_dir = os.path.join(output_dir, "speech_Action")

    # --- Load and Combine Both Curated CSVs ---
    # Loaded before any directory is created so that a missing/unreadable log
    # does not leave an empty dataset skeleton behind.
    df_micro = pd.read_csv(micro_expr_csv)
    df_stable = pd.read_csv(stable_expr_csv)
    combined_df = pd.concat([df_micro, df_stable], ignore_index=True)
    if 'face_crop_path' not in combined_df.columns:
        raise KeyError("Review logs must contain a 'face_crop_path' column.")
    combined_df = (
        combined_df
        .dropna(subset=['face_crop_path'])  # blank paths cannot be located on disk
        .drop_duplicates(subset=['face_crop_path'])
        .reset_index(drop=True)
    )
    print(f"✅ Loaded and combined both review logs. Total unique images to process: {len(combined_df)}")

    # --- Create the new dataset directories ---
    os.makedirs(emotion_dir, exist_ok=True)
    os.makedirs(action_dir, exist_ok=True)

    # A log without an 'actual_label' column is treated as "no corrections".
    if 'actual_label' in combined_df.columns:
        labels = combined_df['actual_label']
    else:
        labels = [None] * len(combined_df)

    # --- Process and Sort Images ---
    copied_count = 0
    missing_files = []
    rows = zip(combined_df['face_crop_path'], labels)
    for source_filename, raw_label in tqdm(rows, total=len(combined_df), desc="Sorting Images"):
        # The source images are in two different folders, so we check both
        # (micro-expression queue first, then stable-emotion queue).
        candidates = (
            os.path.join(micro_expr_folder, source_filename),
            os.path.join(stable_expr_folder, source_filename),
        )
        source_path = next((path for path in candidates if os.path.isfile(path)), None)
        if source_path is None:
            missing_files.append(source_filename)
            continue

        # --- The Sorting Logic ---
        # Notes that indicate a non-emotional action go to 'speech_Action'.
        # Everything else is an emotion: either the prediction was confirmed
        # (empty note) or the reviewer supplied the correct emotion for a crop
        # the V29 model mislabeled.
        target_dir = action_dir if _is_action_label(raw_label) else emotion_dir
        shutil.copy(source_path, target_dir)
        copied_count += 1

    print(f"\n✅ Success! Created the '{output_name}' dataset with {copied_count} images.")
    print(f"   - Location: {output_dir}")
    print(f"   - Emotion examples: {len(os.listdir(emotion_dir))}")
    print(f"   - Speech_Action examples: {len(os.listdir(action_dir))}")
    if missing_files:
        # Previously these rows were dropped silently, hiding incomplete datasets.
        preview = ", ".join(str(name) for name in missing_files[:5])
        print(f"⚠️ Skipped {len(missing_files)} image(s) not found in either review queue (e.g. {preview}).")

In [4]:
# ==============================================================================
# 3. MAIN EXECUTION BLOCK
# ==============================================================================
# Build the dataset only when this code is executed directly (a notebook kernel
# also runs cells with __name__ == "__main__"), not when it is imported.
if __name__ == "__main__":
    build_final_correction_set(RUN_DIRECTORY, OUTPUT_DATASET_NAME)

✅ Loaded and combined both review logs. Total unique images to process: 192


Sorting Images: 100%|███████████████████████| 192/192 [00:00<00:00, 1101.95it/s]


✅ Success! Created the 'CorrectionSet_V2' dataset with 192 images.
   - Location: /Users/natalyagrokh/AI/ml_expressions/img_expressions/data_flywheel/CorrectionSet_V2
   - Emotion examples: 47
   - Speech_Action examples: 145



