In [1]:
import os
import shutil
import random

def separate_samples(main_folder, separated_folder, num_samples_per_class, seed=None):
    """
    Randomly select and move a number of samples from each class folder
    into a parallel directory structure.

    Every immediate subdirectory of ``main_folder`` is treated as one class.
    For each class, up to ``num_samples_per_class`` regular files are chosen
    at random and *moved* (not copied) into
    ``separated_folder/<class_name>/``. Classes holding fewer files than
    requested are moved in full and a warning is printed.

    Args:
        main_folder (str): The path to the main dataset directory.
        separated_folder (str): The path to the destination directory for
            separated files. Created if missing. If it lives inside
            ``main_folder`` it is not treated as a class.
        num_samples_per_class (int): The number of samples to move from each
            class. Must be non-negative.
        seed (int | None): Optional seed for a private random generator so the
            same selection can be reproduced. When ``None`` (default) the
            module-level ``random`` state is used, as before.

    Raises:
        ValueError: If ``num_samples_per_class`` is negative.
        FileNotFoundError: If ``main_folder`` does not exist (from ``os.listdir``).
    """
    # Fail before touching the filesystem; random.sample would otherwise raise
    # the same exception type midway through, after folders were created.
    if num_samples_per_class < 0:
        raise ValueError("num_samples_per_class must be non-negative.")

    # A dedicated generator keeps a seeded call from disturbing global state.
    rng = random.Random(seed) if seed is not None else random

    # Create the main destination folder if it doesn't exist
    if not os.path.exists(separated_folder):
        os.makedirs(separated_folder)
        print(f"Created directory: {separated_folder}")

    # Class names are the subdirectories of the main folder. They are sorted
    # because os.listdir order is arbitrary and the draw order must be stable
    # for a seed to be meaningful. The destination itself is excluded in case
    # it was created inside the main folder.
    dest_real = os.path.realpath(separated_folder)
    class_names = sorted(
        d for d in os.listdir(main_folder)
        if os.path.isdir(os.path.join(main_folder, d))
        and os.path.realpath(os.path.join(main_folder, d)) != dest_real
    )

    if not class_names:
        print("Error: No class subfolders found in the main directory.")
        return

    print(f"Found classes: {', '.join(class_names)}")

    # Process each class
    for class_name in class_names:
        source_class_path = os.path.join(main_folder, class_name)
        dest_class_path = os.path.join(separated_folder, class_name)

        # Create corresponding class folder in the destination directory
        os.makedirs(dest_class_path, exist_ok=True)

        # Regular files only (nested folders are ignored); sorted for the same
        # reproducibility reason as the class list.
        all_files = sorted(
            f for f in os.listdir(source_class_path)
            if os.path.isfile(os.path.join(source_class_path, f))
        )

        # Check if there are enough files to separate
        if len(all_files) < num_samples_per_class:
            print(f"⚠️ Warning: Class '{class_name}' has only {len(all_files)} samples, which is less than {num_samples_per_class}. Moving all available samples.")
            files_to_move = all_files
        else:
            # Randomly select the files to move
            files_to_move = rng.sample(all_files, num_samples_per_class)

        # Move the selected files, counting only moves that actually completed
        moved_count = 0
        for file_name in files_to_move:
            shutil.move(
                os.path.join(source_class_path, file_name),
                os.path.join(dest_class_path, file_name),
            )
            moved_count += 1

        print(f"-> Moved {moved_count} samples for class '{class_name}'.")

    print("\n✅ Separation complete!")


# --- HOW TO USE ---

# 1. Set the path to your main dataset folder
#    Example: 'C:/Users/YourUser/Desktop/Vocal_Disorders_Dataset'
main_dataset_path = 'VoiceDS_SHR/'

# 2. Set the path for the new folder where separated files will be stored
#    Example: 'C:/Users/YourUser/Desktop/Vocal_Disorders_Test_Set'
separated_dataset_path = 'Dataset/'

# 3. Set the number of samples you want to separate from each class
num_samples = 130

# 4. Seed for the random selection, so the split can be reproduced.
#    NOTE: the same files are only guaranteed to be picked again if the
#    directory listing order is unchanged between runs.
RANDOM_SEED = 42

# --- Run the function ---
# Make sure to replace the placeholder paths above before running!
# CAUTION: files are moved, not copied. Re-running this cell moves a further
# batch of samples out of the main dataset folder.
if main_dataset_path == 'path/to/your/main_dataset_folder' or separated_dataset_path == 'path/to/your/separated_dataset_folder':
    print("🚨 PLEASE UPDATE THE FOLDER PATHS IN THE SCRIPT BEFORE RUNNING!")
else:
    random.seed(RANDOM_SEED)
    separate_samples(main_dataset_path, separated_dataset_path, num_samples)

Created directory: Dataset/
Found classes: Dysarthia, Dysphonie, Laryngitis, Laryngozele, parkinson, spasmodische_dysphonie, Vox senilis
-> Moved 130 samples for class 'Dysarthia'.
-> Moved 130 samples for class 'Dysphonie'.
-> Moved 130 samples for class 'Laryngitis'.
-> Moved 85 samples for class 'Laryngozele'.
-> Moved 130 samples for class 'parkinson'.
-> Moved 130 samples for class 'spasmodische_dysphonie'.
-> Moved 130 samples for class 'Vox senilis'.

✅ Separation complete!


In [2]:
import os
import librosa
import soundfile as sf
import numpy as np
from tqdm import tqdm

def process_audio_to_length(input_folder, output_folder, target_seconds, sr=22050):
    """
    Processes all audio files in a directory tree to a uniform length.
    - Trims files that are too long (keeps the beginning).
    - Pads files that are too short with silence at the end.

    Files are discovered recursively by extension (.wav, .mp3, .flac, .ogg),
    loaded with ``librosa.load`` at sample rate ``sr``, and written with
    ``soundfile.write`` under ``output_folder`` at the same relative path and
    file name as in ``input_folder``. A file that fails to load or save is
    reported and skipped; the remaining files are still processed.

    Args:
        input_folder (str): Path to the folder with original audio files.
        output_folder (str): Path to save the new, processed audio files.
            Must not be the same directory as ``input_folder``. If it is
            nested inside ``input_folder`` its contents are not re-processed.
        target_seconds (int | float): The desired uniform length of the audio
            in seconds. Must be positive.
        sr (int): The sample rate to use for processing.

    Raises:
        ValueError: If ``target_seconds`` is not positive, or if
            ``output_folder`` and ``input_folder`` are the same directory.
    """
    if target_seconds <= 0:
        raise ValueError("target_seconds must be positive.")

    # Writing into the input folder would overwrite the original recordings
    # with their trimmed/padded versions, so refuse that outright.
    input_root = os.path.realpath(input_folder)
    output_root = os.path.realpath(output_folder)
    if input_root == output_root:
        raise ValueError("output_folder must be different from input_folder.")

    # Calculate the target length in samples
    target_length = int(target_seconds * sr)

    # A list to store all found audio file paths
    audio_files = []
    for root, dirs, files in os.walk(input_folder):
        # Prune the output folder when it is nested inside the input folder,
        # otherwise a second run would pick up already-processed files.
        dirs[:] = [
            d for d in dirs
            if os.path.realpath(os.path.join(root, d)) != output_root
        ]
        for file in files:
            # Check for common audio file extensions
            if file.lower().endswith(('.wav', '.mp3', '.flac', '.ogg')):
                audio_files.append(os.path.join(root, file))

    if not audio_files:
        print("🚨 Error: No audio files found in the input folder.")
        return

    print(f"Found {len(audio_files)} audio files. Starting processing...")

    failed_count = 0

    # Process each file with a progress bar
    for file_path in tqdm(audio_files, desc="Processing audio files"):
        try:
            # Load the audio file, resampled to `sr`; the returned rate is not needed
            y, _ = librosa.load(file_path, sr=sr)

            current_length = len(y)

            # --- TRIMMING OR PADDING LOGIC ---
            if current_length > target_length:
                # Keep the first `target_length` samples
                y = y[:target_length]
            elif current_length < target_length:
                # Pad the audio with silence (zeros) at the end
                padding_needed = target_length - current_length
                y = np.pad(y, (0, padding_needed), mode='constant')

            # --- SAVE THE PROCESSED FILE ---
            # Mirror the input subfolder structure in the output directory
            relative_path = os.path.relpath(file_path, input_folder)
            output_path = os.path.join(output_folder, relative_path)

            # Ensure the output directory exists
            os.makedirs(os.path.dirname(output_path), exist_ok=True)

            # Save the processed audio
            sf.write(output_path, y, sr)

        except Exception as e:
            # Best effort: report the failure and carry on with the next file
            failed_count += 1
            print(f"⚠️ Error processing file {file_path}: {e}")

    # Make failures visible next to the completion message instead of leaving
    # them scattered through the progress output.
    if failed_count:
        print(f"⚠️ {failed_count} of {len(audio_files)} files could not be processed.")

    print(f"\n✅ Processing complete! Standardized files are saved in: {output_folder}")


# --- HOW TO USE ---

# 1. Folder that holds your class sub-folders with the original audio.
#    Example: 'C:/Vocal_Disorders/Dataset'
input_audio_folder = 'Dataset/'

# 2. Folder that will receive the processed files (created automatically).
#    It must NOT be the same path as the input folder.
#    Example: 'C:/Vocal_Disorders/Processed_Dataset'
output_audio_folder = 'Datasets/'

# 3. Uniform clip length in seconds.
#    Suggested starting point: slightly longer than your shortest files;
#    3-5 seconds is a commonly used range for voice recordings.
TARGET_SECONDS = 4

# --- Run the function ---
# Only run once both folder settings no longer contain the template placeholder.
if not any('path/to/your' in folder for folder in (input_audio_folder, output_audio_folder)):
    process_audio_to_length(input_audio_folder, output_audio_folder, TARGET_SECONDS)
else:
    print("🚨 PLEASE UPDATE THE FOLDER PATHS IN THE SCRIPT BEFORE RUNNING!")

Found 864 audio files. Starting processing...


Processing audio files: 100%|██████████| 864/864 [00:09<00:00, 89.09it/s] 


✅ Processing complete! Standardized files are saved in: Datasets/



