In [1]:
import os
import pandas as pd
import shutil
from pathlib import Path

In [2]:
# Root of the image dataset; flagged files are looked up beneath this directory.
DATASET_ROOT_PATH = "/Users/natalyagrokh/AI/ml_expressions/img_datasets"

# Hard-negatives CSV written by the V39 run.
HARD_NEGATIVES_CSV = "/Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/V39_20251022_093948/hard_negatives_neutral_speech_vs_speech_action.csv"

# Name of the folder being cleaned.
FOLDER_TO_CLEAN = "speech_action"

# Directory that receives the flagged images.
QUARANTINE_FOLDER = "/Users/natalyagrokh/AI/ml_expressions/img_datasets/quarantined_speech_action"


# Echo the settings as absolute paths so the run is self-describing.
print("Configuration:")
for _label, _location in (
    ("Dataset Root", DATASET_ROOT_PATH),
    ("Source CSV", HARD_NEGATIVES_CSV),
    ("Quarantine Destination", QUARANTINE_FOLDER),
):
    print(f"  - {_label}: '{os.path.abspath(_location)}'")

Configuration:
  - Dataset Root: '/Users/natalyagrokh/AI/ml_expressions/img_datasets'
  - Source CSV: '/Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/V39_20251022_093948/hard_negatives_neutral_speech_vs_speech_action.csv'
  - Quarantine Destination: '/Users/natalyagrokh/AI/ml_expressions/img_datasets/quarantined_speech_action'


In [3]:
def clean_folder(csv_path=None, dataset_root=None, folder_to_clean=None, quarantine_folder=None):
    """
    Quarantine the hard-negative images listed in a CSV and delete them from the dataset.

    Rows whose ``true_label`` equals ``folder_to_clean`` are selected. For each
    selected row only the file name and the immediate parent folder name of
    ``image_path`` are kept and re-rooted under ``dataset_root``; the rest of
    the CSV path is ignored, so the CSV remains usable after the dataset has
    been moved. Each file found is copied into ``quarantine_folder`` and then
    removed from the dataset.

    Note: the folder a file is removed from is the parent directory name found
    in the CSV path, not ``folder_to_clean``; the latter only filters rows.

    Args:
        csv_path: CSV with ``image_path`` and ``true_label`` columns.
            Defaults to the module-level ``HARD_NEGATIVES_CSV``.
        dataset_root: Directory containing the image folders.
            Defaults to the module-level ``DATASET_ROOT_PATH``.
        folder_to_clean: ``true_label`` value used to select rows.
            Defaults to the module-level ``FOLDER_TO_CLEAN``.
        quarantine_folder: Directory that receives the flagged images; it is
            created if missing. Defaults to the module-level ``QUARANTINE_FOLDER``.

    Returns:
        None. Progress, warnings, and a final summary are printed.
    """
    # Fall back to the notebook-level configuration for anything not passed in,
    # so the zero-argument call keeps working exactly as before.
    csv_path = HARD_NEGATIVES_CSV if csv_path is None else csv_path
    dataset_root = DATASET_ROOT_PATH if dataset_root is None else dataset_root
    folder_to_clean = FOLDER_TO_CLEAN if folder_to_clean is None else folder_to_clean
    quarantine_folder = QUARANTINE_FOLDER if quarantine_folder is None else quarantine_folder

    if not os.path.exists(csv_path):
        print(f"‚ùå Error: Cannot find '{csv_path}'.")
        return

    if not os.path.isdir(dataset_root):
        print(f"‚ùå Error: The DATASET_ROOT_PATH does not exist: '{dataset_root}'")
        return

    os.makedirs(quarantine_folder, exist_ok=True)
    print(f"\nüìÅ Quarantine folder is ready at: '{os.path.abspath(quarantine_folder)}'")

    df = pd.read_csv(csv_path)

    if "image_path" not in df.columns or "true_label" not in df.columns:
        print("‚ùå Error: CSV must contain 'image_path' and 'true_label' columns.")
        return

    files_to_process_df = df[df["true_label"] == folder_to_clean]

    if files_to_process_df.empty:
        print(f"‚úÖ No images with true_label '{folder_to_clean}' found in the CSV. No files were moved.")
        return

    print(f"\nFound {len(files_to_process_df)} images to remove from the '{folder_to_clean}' folder...")

    dataset_dir = Path(dataset_root)
    quarantine_dir = Path(quarantine_folder)

    moved_count = 0
    not_found_count = 0
    error_count = 0
    for raw_path in files_to_process_df["image_path"]:
        # An empty CSV cell is read as NaN (a float); Path() would raise on it
        # and abort the whole batch, so report the row and keep going.
        if not isinstance(raw_path, str) or not raw_path.strip():
            print(f"  ‚ùå Error: Row has no usable image_path. Skipping: {raw_path!r}")
            error_count += 1
            continue

        # Rebuild the source path instead of trusting the CSV directly, which
        # keeps the script resilient to the dataset being moved.
        old_path = Path(raw_path)
        source_path = dataset_dir / old_path.parent.name / old_path.name

        if not source_path.exists():
            print(f"  ‚ö†Ô∏è Warning: Source file not found at new path. Skipping: {source_path}")
            not_found_count += 1
            continue

        # Never overwrite something already in quarantine: the source is deleted
        # right after the copy, so an overwrite would destroy the earlier file.
        destination_path = quarantine_dir / source_path.name
        collision_index = 1
        while destination_path.exists():
            destination_path = quarantine_dir / f"{source_path.stem}_{collision_index}{source_path.suffix}"
            collision_index += 1

        try:
            # Explicit copy first, delete second: the source is only removed
            # once the quarantine copy has been written successfully.
            shutil.copy2(source_path, destination_path)
            os.remove(source_path)
            moved_count += 1
        except Exception as e:
            # Best effort: one bad file must not stop the rest of the batch.
            print(f"  ‚ùå Error processing {source_path}: {e}")
            error_count += 1

    print("\n--- Summary ---")
    print(f"‚úÖ Successfully quarantined and removed {moved_count} images.")
    if not_found_count > 0:
        print(f"‚ö†Ô∏è Skipped {not_found_count} files that could not be found.")
    if error_count > 0:
        print(f"‚ùå Encountered {error_count} errors during the process.")
    print("‚ú® Dataset cleaning complete.")

In [4]:
# Run the quarantine pass with the configuration constants defined earlier.
# Destructive: each flagged image is copied to the quarantine folder and then
# deleted from the dataset.
clean_folder()


üìÅ Quarantine folder is ready at: '/Users/natalyagrokh/AI/ml_expressions/img_datasets/quarantined_speech_action'

Found 998 images to remove from the 'speech_action' folder...

--- Summary ---
‚úÖ Successfully quarantined and removed 998 images.
‚ú® Dataset cleaning complete.
