In [3]:
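# The three cells below all work from V40_full_inference_log.csv:
#   1. The first keeps rows whose true label is one of the "irrelevant" (non-core) labels,
#      drops rows predicted as 'irrelevant', and saves the rest split into
#      core-emotion predictions (List A) and all other predictions (List B).
#   2. The second keeps rows whose true label is a core emotion and saves the rows
#      whose prediction does not match the true label.
#   3. The third acts as an auditor: it re-derives all three outputs from the
#      original log and checks that the saved CSV files match.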

In [1]:
import pandas as pd

# 1. Load your file (REPLACE 'your_file.csv' with your actual file path)
df = pd.read_csv('/Users/natalyagrokh/Desktop/V40_full_inference_log.csv')

# --- DEFINING YOUR LISTS ---
# The specific true_labels you want to keep
keep_labels = [
    "not_quiqte_neutral_to_sort", "polite_smile_sort", "posed_for_photo",
    "posed_happiness", "posed_neutral", "questioning_other",
    "questioning_surprise", "resigned_despair_sort", "sadness_other",
    "skepticism_disbelief", "smugness", "surprise_sort", "to_sort",
    "unknown", "yawn_sort"
]

# The predictions that go into "List A"
core_predictions = [
    "anger", "contempt", "disgust", "fear", "happiness", "neutral",
    "neutral_speech", "questioning", "sadness", "speech_action", "surprise"
]

# --- PROCESSING ---

# Step 1: Filter by true_label (The Whitelist)
df_clean = df[df['true_label'].isin(keep_labels)].copy()

# Step 2: Remove 'irrelevant' predictions
df_clean = df_clean[df_clean['prediction'] != 'irrelevant']

# Step 3: Split into two lists
# List A: Prediction is in the core list
list_a = df_clean[df_clean['prediction'].isin(core_predictions)]

# List B: Prediction is NOT in the core list (The Outliers)
list_b = df_clean[~df_clean['prediction'].isin(core_predictions)]

# --- OUTPUT ---
print(f"Total Rows Processed: {len(df_clean)}")
print(f"List A (Core) Count: {len(list_a)}")
print(f"List B (Outlier) Count: {len(list_b)}")

# Save to files
list_a.to_csv('list_a_core_emotions.csv', index=False)
list_b.to_csv('list_b_outliers.csv', index=False)

print("‚úÖ Files saved: 'list_a_core_emotions.csv' and 'list_b_outliers.csv'")

Total Rows Processed: 3035
List A (Core) Count: 2354
List B (Outlier) Count: 681
‚úÖ Files saved: 'list_a_core_emotions.csv' and 'list_b_outliers.csv'


In [2]:
import pandas as pd

# 1. Load your file
df = pd.read_csv('/Users/natalyagrokh/Desktop/V40_full_inference_log.csv')

# --- DEFINING YOUR CORE LIST ---
# We use this list as the filter for the TRUE_LABEL this time
core_emotions = [
    "anger", "contempt", "disgust", "fear", "happiness", "neutral",
    "neutral_speech", "questioning", "sadness", "speech_action", "surprise"
]

# --- PROCESSING ---

# Step 1: Filter to keep only rows where true_label is a Core Emotion
df_core = df[df['true_label'].isin(core_emotions)].copy()

# Step 2: Find the Mismatches (Errors)
# We keep rows where the Prediction does NOT match the True Label
mismatches = df_core[df_core['prediction'] != df_core['true_label']]

# --- OUTPUT ---
print(f"Total Core Emotion Rows: {len(df_core)}")
print(f"Total Mismatches found: {len(mismatches)}")

# Save to file
mismatches.to_csv('core_emotion_mismatches.csv', index=False)

print("‚úÖ File saved: 'core_emotion_mismatches.csv'")

Total Core Emotion Rows: 5176
Total Mismatches found: 3821
‚úÖ File saved: 'core_emotion_mismatches.csv'


In [4]:
import pandas as pd
import numpy as np

# --- CONFIGURATION ---
# Replace with your actual filenames
ORIGINAL_FILE = '/Users/natalyagrokh/Desktop/V40_full_inference_log.csv'
FILE_A = '/Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/list_a_core_emotions.csv'
FILE_B = '/Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/list_b_outliers.csv'
FILE_MISMATCH = '/Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/core_emotion_mismatches.csv'

# --- THE LOGIC DEFINITIONS (Must match previous scripts exactly) ---

# Logic Set 1: For List A & B
whitelist_labels = [
    "not_quite_neutral_to_sort", "polite_smile_sort", "posed_for_photo",
    "posed_happiness", "posed_neutral", "questioning_other",
    "questioning_surprise", "resigned_despair_sort", "sadness_other",
    "skepticism_disbelief", "smugness", "surprise_sort", "to_sort",
    "unknown", "yawn_sort"
]

core_predictions_list = [
    "anger", "contempt", "disgust", "fear", "happiness", "neutral",
    "neutral_speech", "questioning", "sadness", "speech_action", "surprise"
]

# Logic Set 2: For Mismatches
core_true_labels = [
    "anger", "contempt", "disgust", "fear", "happiness", "neutral",
    "neutral_speech", "questioning", "sadness", "speech_action", "surprise"
]

def verify_data():
    print("--- STARTING AUDIT ---")
    
    # Load the Original Source of Truth
    try:
        df_orig = pd.read_csv(ORIGINAL_FILE)
    except FileNotFoundError:
        print(f"‚ùå CRITICAL: Could not find original file: {ORIGINAL_FILE}")
        return

    # ==============================================================================
    # AUDIT 1: Verify 'List A' and 'List B' (The Whitelist Split)
    # ==============================================================================
    print("\nüîç Auditing List A & List B Generation...")
    
    # 1. Re-calculate expected results from scratch
    # Filter by whitelist
    step1 = df_orig[df_orig['true_label'].isin(whitelist_labels)].copy()
    # Remove irrelevant
    step2 = step1[step1['prediction'] != 'irrelevant']
    
    # Split
    expected_a = step2[step2['prediction'].isin(core_predictions_list)].sort_values('image_path').reset_index(drop=True)
    expected_b = step2[~step2['prediction'].isin(core_predictions_list)].sort_values('image_path').reset_index(drop=True)

    # 2. Load actual files from disk
    try:
        actual_a = pd.read_csv(FILE_A).sort_values('image_path').reset_index(drop=True)
        actual_b = pd.read_csv(FILE_B).sort_values('image_path').reset_index(drop=True)
    except FileNotFoundError:
        print("‚ùå CRITICAL: Could not find one of the output files (List A or List B).")
        return

    # 3. Compare
    if actual_a.equals(expected_a):
        print(f"‚úÖ PASS: 'list_a_core_emotions.csv' is 100% correct ({len(actual_a)} rows).")
    else:
        print(f"‚ùå FAIL: 'list_a_core_emotions.csv' does NOT match expected logic.")
        # Debugging info
        print(f"   Expected rows: {len(expected_a)} vs Actual rows: {len(actual_a)}")

    if actual_b.equals(expected_b):
        print(f"‚úÖ PASS: 'list_b_outliers.csv' is 100% correct ({len(actual_b)} rows).")
    else:
        print(f"‚ùå FAIL: 'list_b_outliers.csv' does NOT match expected logic.")
        print(f"   Expected rows: {len(expected_b)} vs Actual rows: {len(actual_b)}")

    # ==============================================================================
    # AUDIT 2: Verify 'Core Emotion Mismatches'
    # ==============================================================================
    print("\nüîç Auditing Core Mismatches...")

    # 1. Re-calculate expected results
    # Filter for Core True Labels
    core_rows = df_orig[df_orig['true_label'].isin(core_true_labels)].copy()
    # Find mismatches
    expected_mismatch = core_rows[core_rows['prediction'] != core_rows['true_label']].sort_values('image_path').reset_index(drop=True)

    # 2. Load actual file
    try:
        actual_mismatch = pd.read_csv(FILE_MISMATCH).sort_values('image_path').reset_index(drop=True)
    except FileNotFoundError:
        print(f"‚ùå CRITICAL: Could not find output file: {FILE_MISMATCH}")
        return

    # 3. Compare
    if actual_mismatch.equals(expected_mismatch):
        print(f"‚úÖ PASS: 'core_emotion_mismatches.csv' is 100% correct ({len(actual_mismatch)} rows).")
    else:
        print(f"‚ùå FAIL: 'core_emotion_mismatches.csv' does NOT match expected logic.")
        print(f"   Expected rows: {len(expected_mismatch)} vs Actual rows: {len(actual_mismatch)}")

    print("\n--- AUDIT COMPLETE ---")

if __name__ == "__main__":
    verify_data()

--- STARTING AUDIT ---

üîç Auditing List A & List B Generation...
‚úÖ PASS: 'list_a_core_emotions.csv' is 100% correct (2354 rows).
‚úÖ PASS: 'list_b_outliers.csv' is 100% correct (681 rows).

üîç Auditing Core Mismatches...
‚úÖ PASS: 'core_emotion_mismatches.csv' is 100% correct (3821 rows).

--- AUDIT COMPLETE ---
