In [3]:
import os

def list_files_in_folder(folder_path, output_file=None):
    """
    Scans a folder and lists all file paths.
    Optionally writes the list to a text file.
    """
    
    # Check if the folder exists
    if not os.path.exists(folder_path):
        print(f"Error: The folder '{folder_path}' does not exist.")
        return

    file_paths = []

    print(f"Scanning '{folder_path}'...\n")

    # os.walk yields a 3-tuple (dirpath, dirnames, filenames)
    for root, dirs, files in os.walk(folder_path):
        for filename in files:
            # Join the root path and filename to get the full absolute path
            full_path = os.path.join(root, filename)
            file_paths.append(full_path)
            
            # Print to console (optional - comment out if too many files)
            print(full_path)

    print(f"\nFound {len(file_paths)} files.")

    # If an output file is specified, write the results there
    if output_file:
        try:
            with open(output_file, 'w', encoding='utf-8') as f:
                for path in file_paths:
                    f.write(path + '\n')
            print(f"Successfully saved list to '{output_file}'")
        except IOError as e:
            print(f"Error writing to file: {e}")

# --- CONFIGURATION ---
# Replace this with the path you want to scan
# Windows Example: r"C:\Users\YourName\Documents"
# Mac/Linux Example: "/home/yourname/documents"
target_directory = r"/Users/natalyagrokh/AI/ml_expressions/img_datasets/ferckjalfaga_dataset_14_labels/speech_action" 

# Optional: Set to "file_list.txt" to save the output, or None to just print
save_to_file = "speech_action_contents_list.txt" 

if __name__ == "__main__":
    list_files_in_folder(target_directory, save_to_file)

Scanning '/Users/natalyagrokh/AI/ml_expressions/img_datasets/ferckjalfaga_dataset_14_labels/speech_action'...

/Users/natalyagrokh/AI/ml_expressions/img_datasets/ferckjalfaga_dataset_14_labels/speech_action/frame_560_Person_1_face_0.png
/Users/natalyagrokh/AI/ml_expressions/img_datasets/ferckjalfaga_dataset_14_labels/speech_action/Sean_Penn_0002.jpg
/Users/natalyagrokh/AI/ml_expressions/img_datasets/ferckjalfaga_dataset_14_labels/speech_action/Training_82073038.jpg
/Users/natalyagrokh/AI/ml_expressions/img_datasets/ferckjalfaga_dataset_14_labels/speech_action/Spencer_Abraham_0009.jpg
/Users/natalyagrokh/AI/ml_expressions/img_datasets/ferckjalfaga_dataset_14_labels/speech_action/Training_69523426.jpg
/Users/natalyagrokh/AI/ml_expressions/img_datasets/ferckjalfaga_dataset_14_labels/speech_action/frame_1847_Person_1_face_0.png
/Users/natalyagrokh/AI/ml_expressions/img_datasets/ferckjalfaga_dataset_14_labels/speech_action/Chok_Tong_Goh_0001 copy.heic
/Users/natalyagrokh/AI/ml_expressions/i

In [4]:
import os

def clean_dataset():
    # --- CONFIGURATION ---
    list1_filename = '/Users/natalyagrokh/Documents/list1.txt' # The Master List (Path + Labels)
    list2_filename = '/Users/natalyagrokh/Documents/list2.txt' # The Blocklist (Paths only)
    output_filename = 'cleaned_list_1.tsv' # The Output file
    
    # Set to store the filenames we want to remove
    blocklist_filenames = set()

    print("--- STEP 1: Processing List 2 (Blocklist) ---")
    try:
        with open(list2_filename, 'r', encoding='utf-8') as f2:
            for line in f2:
                line = line.strip()
                if not line: continue
                
                # Extract the actual filename from the path
                # Example: /Users/name/Desktop/image.jpg -> image.jpg
                filename = os.path.basename(line)
                blocklist_filenames.add(filename)
                
        print(f"Loaded {len(blocklist_filenames)} unique filenames to remove.")
        
    except FileNotFoundError:
        print(f"ERROR: Could not find '{list2_filename}'. Make sure the file exists.")
        return

    print("\n--- STEP 2: Cleaning List 1 ---")
    kept_count = 0
    removed_count = 0
    
    try:
        with open(list1_filename, 'r', encoding='utf-8') as f1, \
             open(output_filename, 'w', encoding='utf-8') as f_out:
            
            for line in f1:
                line = line.strip()
                if not line: continue
                
                # List 1 structure is often: /path/to/image.jpg [TAB/SPACE] label1 [TAB/SPACE] label2
                # We split by whitespace to get the path (first element)
                parts = line.split() 
                full_path = parts[0]
                
                # Extract filename to compare
                filename_in_list1 = os.path.basename(full_path)
                
                # --- THE CHECK ---
                if filename_in_list1 in blocklist_filenames:
                    removed_count += 1
                    # Do nothing (this effectively deletes the line)
                else:
                    kept_count += 1
                    # Write the original line to the new file
                    f_out.write(line + '\n')
                    
        print("-" * 30)
        print(f"Processing Complete.")
        print(f"Original Lines Processed: {kept_count + removed_count}")
        print(f"Rows Removed: {removed_count}")
        print(f"Rows Remaining: {kept_count}")
        print(f"\nSUCCESS! Cleaned data saved to: {output_filename}")
        print("You can copy/paste the content of that file directly into Excel/Sheets.")

    except FileNotFoundError:
        print(f"ERROR: Could not find '{list1_filename}'. Make sure the file exists.")

if __name__ == "__main__":
    clean_dataset()

--- STEP 1: Processing List 2 (Blocklist) ---
Loaded 128 unique filenames to remove.

--- STEP 2: Cleaning List 1 ---
------------------------------
Processing Complete.
Original Lines Processed: 2122
Rows Removed: 1
Rows Remaining: 2121

SUCCESS! Cleaned data saved to: cleaned_list_1.tsv
You can copy/paste the content of that file directly into Excel/Sheets.
