In [4]:
import pandas as pd
import os

def mine_speech_action_negatives_v2(
    log_file='/Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/V36_20251019_105248/V36_full_inference_log.csv',
    output_file='hard_negatives_for_speech_action.csv',
):
    """
    Mine 'speech_action' hard negatives from a full inference log.

    Analyzes the full inference log to find images the model *thought*
    were 'speech_action' (its raw 'top1_label', before the thresholding
    policy was applied) while their 'true_label' is something else, and
    writes those rows to a CSV file.

    Args:
        log_file: Path to the inference-log CSV. It must contain the
            'top1_label' and 'true_label' columns.
        output_file: Path of the CSV the hard negatives are written to.
            Only written when at least one hard negative is found.

    Returns:
        None. Progress, errors and a per-true-label breakdown are printed
        to stdout; unusable input (missing file, empty file, missing
        columns) is reported and the function returns early.
    """
    print("⛏️  Mining for 'speech_action' hard negatives (V2)...")

    # Keep the try body down to the one call that can actually fail on I/O.
    try:
        df = pd.read_csv(log_file)
    except FileNotFoundError:
        print(f"❌ ERROR: Could not find '{log_file}'. Make sure you have re-run the main script to generate the new log.")
        return
    except pd.errors.EmptyDataError:
        # A zero-byte log has no header to parse, so there is nothing to mine.
        print(f"❌ ERROR: '{log_file}' is empty. Make sure you have re-run the main script to generate the new log.")
        return
    print(f"✅ Successfully loaded '{log_file}'.")

    # Check if the required 'top1_label' column exists
    if 'top1_label' not in df.columns:
        print("❌ ERROR: The log file is missing the 'top1_label' column.")
        print("   Please replace the hierarchical_predict function and re-run your main training script first.")
        return

    # The mask below also reads 'true_label'; fail with a readable message
    # instead of an unhandled KeyError when the log does not carry it.
    if 'true_label' not in df.columns:
        print("❌ ERROR: The log file is missing the 'true_label' column.")
        return

    # Look at the model's raw guess in 'top1_label' rather than the final,
    # post-threshold prediction.
    misclassified_mask = (
        (df['top1_label'] == 'speech_action')
        & (df['true_label'] != 'speech_action')
    )
    hard_negatives_df = df[misclassified_mask]

    if hard_negatives_df.empty:
        print("\n✅ No images were found to be misclassified as 'speech_action'.")
        return

    # Export only the columns of interest that this particular log provides;
    # some of them are optional and may be absent.
    output_columns = ['image_path', 'true_label', 'top1_label', 'prediction', 'confidence', 'entropy']
    final_columns = [col for col in output_columns if col in hard_negatives_df.columns]

    hard_negatives_df[final_columns].to_csv(output_file, index=False)

    print(f"\n✅ Success! Found {len(hard_negatives_df)} images originally predicted as 'speech_action'.")
    print(f"   - Saved results to: '{output_file}'")

    print("\n   --- Breakdown of True Labels ---")
    print(hard_negatives_df['true_label'].value_counts().to_string())

# Entry point: run the mining step only when this code is executed as the
# main module, not when it is imported from elsewhere.
if __name__ == "__main__":
    mine_speech_action_negatives_v2()

⛏️  Mining for 'speech_action' hard negatives (V2)...
✅ Successfully loaded '/Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/V36_20251019_105248/V36_full_inference_log.csv'.
❌ ERROR: The log file is missing the 'top1_label' column.
   Please replace the hierarchical_predict function and re-run your main training script first.
