## Description
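This script keeps a specified number of randomly chosen samples in each class folder of a dataset and permanently deletes the rest, so that every processed class ends up with the same number of files and the dataset takes less disk space. Class folders that contain fewer entries than the requested sample size are skipped and left untouched. Class folders are processed in parallel using a thread pool.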

## Library Imports
Imports libraries for file operations (os, shutil), randomization (random), and parallel processing (ThreadPoolExecutor).

In [None]:
import os
import random
import shutil
from concurrent.futures import ThreadPoolExecutor

## Process Single Class
Defines process_class, which first removes any existing 'selected_samples' subfolder from a class folder, then randomly selects a subset of the folder's files to keep and deletes the unselected files.

In [None]:
def process_class(class_dir: str, base_dir: str, sample_size: int = 1000) -> None:
    """Keep a random subset of files in one class folder and delete the rest.

    Any existing ``selected_samples`` subfolder inside the class folder is
    removed first. If the class folder then holds fewer than ``sample_size``
    regular files, nothing is deleted and the class is skipped.

    Args:
        class_dir: Name of the class folder, relative to ``base_dir``.
        base_dir: Directory that contains the class folders.
        sample_size: Number of files to keep in the class folder.

    Returns:
        None. Progress is reported with ``print``.
    """
    class_path = os.path.join(base_dir, class_dir)
    if not os.path.isdir(class_path):
        return

    selected_samples_folder = os.path.join(class_path, "selected_samples")
    if os.path.isdir(selected_samples_folder):
        shutil.rmtree(selected_samples_folder)
        print(f"Deleted 'selected_samples' folder in class '{class_dir}'.")

    # Only regular files are candidates: subdirectories are never deleted, so
    # counting them would let a class pass the size check without enough real
    # files, or let a directory occupy one of the "kept" slots.
    files = [
        name
        for name in os.listdir(class_path)
        if os.path.isfile(os.path.join(class_path, name))
    ]

    # If there are fewer than the requested samples, skip this class
    if len(files) < sample_size:
        print(f"Not enough files in class '{class_dir}' to select {sample_size} samples.")
        return

    # After shuffling, the first `sample_size` entries are kept; everything
    # past that point is surplus. Slicing avoids a per-file membership scan.
    random.shuffle(files)
    for surplus in files[sample_size:]:
        os.remove(os.path.join(class_path, surplus))

    print(f"Kept {sample_size} files and deleted the rest from class '{class_dir}'.")

## Parallel Class Processing
Implements select_samples_from_class_parallel to process multiple class folders in parallel, leveraging ThreadPoolExecutor for scalability.

In [2]:


def select_samples_from_class_parallel(base_dir, sample_size=1000, max_workers=4):
    """Run ``process_class`` on every class folder of ``base_dir`` in parallel.

    Each immediate subdirectory of ``base_dir`` is treated as one class and is
    handed to a thread pool. A failure while processing one class is reported
    with ``print`` and does not stop the remaining classes.

    Args:
        base_dir: Directory whose immediate subdirectories are the classes.
        sample_size: Number of files to keep per class.
        max_workers: Maximum number of worker threads in the pool.

    Returns:
        None.
    """
    # List all class directories (subdirectories)
    class_dirs = [
        entry
        for entry in os.listdir(base_dir)
        if os.path.isdir(os.path.join(base_dir, entry))
    ]

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            class_dir: executor.submit(process_class, class_dir, base_dir, sample_size)
            for class_dir in class_dirs
        }

        # An exception raised in a worker is stored on its future and only
        # surfaces when result() is called; without this loop a failed class
        # would go completely unnoticed.
        for class_dir, future in futures.items():
            try:
                future.result()
            except Exception as exc:
                print(f"Failed to process class '{class_dir}': {exc}")

# Usage Example
# NOTE: running this permanently deletes files under `base_address`.
base_address = r'C:\Users\jamee\Favorites\Downloads\ucf\Test'  # Update this path to your dataset path

# Guarded so that importing this file does not trigger the deletion; running
# it as a script or as a notebook cell still executes the call.
if __name__ == "__main__":
    select_samples_from_class_parallel(base_address, sample_size=1000, max_workers=32)

Not enough files in class 'Abuse' to select 1000 samples.
Deleted 'selected_samples' folder in class 'Arson'.
Deleted 'selected_samples' folder in class 'RoadAccidents'.
Deleted 'selected_samples' folder in class 'Burglary'.
Deleted 'selected_samples' folder in class 'Shooting'.
Deleted 'selected_samples' folder in class 'Fighting'.
Deleted 'selected_samples' folder in class 'Robbery'.
Deleted 'selected_samples' folder in class 'NormalVideos'.
Deleted 'selected_samples' folder in class 'Explosion'.
Not enough files in class 'Robbery' to select 1000 samples.
Not enough files in class 'Fighting' to select 1000 samples.
Deleted 'selected_samples' folder in class 'Arrest'.
Deleted 'selected_samples' folder in class 'Stealing'.
Deleted 'selected_samples' folder in class 'Vandalism'.
Deleted 'selected_samples' folder in class 'Assault'.
Deleted 'selected_samples' folder in class 'Shoplifting'.
Not enough files in class 'Vandalism' to select 1000 samples.
Kept 1000 files and deleted the rest 