I have a small test audio dataset in `test/` containing just a few files. These need to be split into 5-second audio chunks, with the final chunk of each file zero-padded to the full length where necessary, and placed in a `temp/` folder.

In [None]:
import soundfile as sf
import numpy as np
from pathlib import Path

# Configuration
DATASET_NAME = "test_dataset"

INPUT_DIR = Path("test")  # folder holding the source recordings
# Chunks are written to <DATASET_NAME>/data/; deriving the path from
# DATASET_NAME keeps the two settings from drifting apart.
OUTPUT_DIR = Path(DATASET_NAME) / "data"
CHUNK_DURATION = 5  # seconds

# Create output directory. OUTPUT_DIR is nested, so parents=True is required:
# without it mkdir() raises FileNotFoundError when DATASET_NAME/ is missing.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Supported audio extensions (matched against the lower-cased file suffix)
AUDIO_EXTENSIONS = {".wav", ".ogg", ".flac"}

# Process each audio file. iterdir() yields entries in arbitrary order, so the
# listing is sorted to make the processing order (and the log) reproducible.
for audio_path in sorted(INPUT_DIR.iterdir()):
    # Skip directories and anything without a supported audio extension;
    # a directory named e.g. "foo.wav" would otherwise be passed to sf.read.
    if not audio_path.is_file() or audio_path.suffix.lower() not in AUDIO_EXTENSIONS:
        continue

    print(f"Processing: {audio_path.name}")

    # Load the whole file: y holds the samples, sr the sample rate in Hz
    y, sr = sf.read(audio_path)

    # Handle stereo -> mono conversion if needed by averaging across channels
    # (multi-channel data is 2-D with channels on axis 1)
    if y.ndim > 1:
        y = y.mean(axis=1)

    # Calculate chunk size in samples
    chunk_samples = int(CHUNK_DURATION * sr)

    # Number of chunks needed to cover the signal; the last one may be partial
    num_chunks = int(np.ceil(len(y) / chunk_samples))

    for i in range(num_chunks):
        start = i * chunk_samples
        end = start + chunk_samples
        # Slicing past the end of y is safe and simply yields a shorter chunk
        chunk = y[start:end]

        # Pad with trailing zeros if chunk is shorter than target duration
        if len(chunk) < chunk_samples:
            padding = chunk_samples - len(chunk)
            chunk = np.pad(chunk, (0, padding), mode='constant', constant_values=0)

        # Generate output filename: originalname_starttime_endtime.wav
        # (times are whole seconds, zero-padded to three digits)
        start_sec = i * CHUNK_DURATION
        end_sec = start_sec + CHUNK_DURATION
        output_name = f"{audio_path.stem}_{start_sec:03d}_{end_sec:03d}.wav"
        output_path = OUTPUT_DIR / output_name

        # Save chunk at the source file's sample rate
        sf.write(output_path, chunk, sr)

    print(f"  -> Created {num_chunks} chunks")

print(f"\nDone! Chunks saved to {OUTPUT_DIR}/")

Processing: CHE_01_20190101_163410.wav
  -> Created 13 chunks
Processing: CHE_02_20190101_183410.wav
  -> Created 2 chunks
Processing: CHE_03_20190201_163410.wav
  -> Created 2 chunks
Processing: CHE_04_20190203_175410.wav
  -> Created 2 chunks

Done! Chunks saved to temp/
