In [None]:
# Path of the dataset archive that the pipeline unzips and processes.
# NOTE(review): "/content" is presumably the Colab working directory —
# adjust this path when running in another environment.
zip_path="/content/audiofile.zip"

In [None]:
import os
import zipfile
import librosa
import soundfile as sf
import pandas as pd

# Step 1: Unzip the dataset
def unzip_dataset(zip_path, extract_to='unzipped_data'):
    """Extract every member of a zip archive into a target directory.

    Args:
        zip_path: Path of the zip archive to read.
        extract_to: Directory that receives the extracted files. It is
            created (including missing parent directories) when absent;
            an existing directory is reused as-is.

    Raises:
        FileNotFoundError: If ``zip_path`` does not exist.
        zipfile.BadZipFile: If ``zip_path`` is not a valid zip archive.
    """
    # exist_ok=True replaces a separate exists() check: no window in which
    # the directory can appear between the check and the creation.
    os.makedirs(extract_to, exist_ok=True)
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
    print(f"✅ Unzipped dataset to '{extract_to}'")

In [None]:
# Step 2: Load Audacity label file
def load_labels(label_file_path):
    """Read a label file into a DataFrame.

    The file is parsed as headerless, tab-separated text with three fields
    per row.

    Args:
        label_file_path: Path of the label text file.

    Returns:
        pandas.DataFrame with columns ``start``, ``end`` and ``label``, one
        row per line of the file. ``label`` is always read as text; a row
        with no label field holds a missing value there.
    """
    return pd.read_csv(
        label_file_path,
        sep="\t",
        header=None,
        names=["start", "end", "label"],
        # Without an explicit dtype, a file whose labels are all numeric
        # ("1", "2", ...) is inferred as integers, and string handling such
        # as .strip() on the label then fails.
        dtype={"label": str},
    )


In [None]:
# Step 3: Slice and save audio segments
def slice_audio(audio_path, label_df, output_base_dir, file_prefix):
    """Cut one audio file into labelled segments and write each as a WAV.

    Every usable row of ``label_df`` yields one file at
    ``<output_base_dir>/<label>/<file_prefix>_<row index>.wav``, where the
    label is stripped of surrounding whitespace and lower-cased.

    Rows are skipped (with a printed warning) when the label is missing or
    blank, or when the requested span contains no samples, e.g. a point
    label with ``start == end`` or a span entirely past the end of the
    audio.

    Args:
        audio_path: Path of the audio file to load.
        label_df: DataFrame with ``start``, ``end`` and ``label`` columns;
            ``start`` and ``end`` are times in seconds (they are multiplied
            by the sampling rate to get sample offsets).
        output_base_dir: Root output directory; one sub-directory per label
            is created on demand.
        file_prefix: Prefix used for every output file name.
    """
    # sr=None keeps the file's native sampling rate instead of resampling.
    y, sr = librosa.load(audio_path, sr=None)
    total_samples = len(y)

    for i, row in label_df.iterrows():
        raw_label = row['label']
        # A missing label is a float NaN, which has no .strip(); a blank
        # label would write files straight into output_base_dir.
        if pd.isna(raw_label) or not str(raw_label).strip():
            print(f"    ⚠️ Skipped row {i}: missing label")
            continue
        label = str(raw_label).strip().lower()

        # Clamp to the signal: a negative start would otherwise wrap around
        # and index from the end of the array.
        start_sample = max(0, int(row['start'] * sr))
        end_sample = min(total_samples, int(row['end'] * sr))
        if end_sample <= start_sample:
            print(f"    ⚠️ Skipped row {i}: empty segment for label '{label}'")
            continue
        segment = y[start_sample:end_sample]

        out_dir = os.path.join(output_base_dir, label)
        os.makedirs(out_dir, exist_ok=True)

        out_path = os.path.join(out_dir, f"{file_prefix}_{i}.wav")
        sf.write(out_path, segment, sr)
        print(f"    ➤ Saved: {out_path}")

In [None]:
def process_all(zip_path, extract_to="unzipped_data", output_dir="split_data"):
    """Unzip the dataset and split every labelled recording into segments.

    Expected layout inside the archive::

        audiodata/audio/<name>.wav
        audiodata/labels/<name>_labels.txt

    Audio and label files are paired by ``<name>``, compared
    case-insensitively. Audio files without a matching label file are
    reported and skipped.

    Args:
        zip_path: Path of the dataset zip archive.
        extract_to: Directory the archive is extracted into.
        output_dir: Root directory that receives the per-label segment
            folders.

    Raises:
        FileNotFoundError: If the archive or the expected ``audiodata``
            sub-directories are missing.
    """
    unzip_dataset(zip_path, extract_to)

    audio_dir = os.path.join(extract_to, "audiodata", "audio")
    label_dir = os.path.join(extract_to, "audiodata", "labels")

    os.makedirs(output_dir, exist_ok=True)

    # Map lower-cased recording name -> label file path. The suffix is
    # matched on the lower-cased name so "R4_Labels.TXT" pairs as well, and
    # it is stripped from the end only, never from the middle of a name.
    label_suffix = "_labels.txt"
    label_map = {}
    for label_file in sorted(os.listdir(label_dir)):
        lowered = label_file.lower()
        if lowered.endswith(label_suffix):
            base = lowered[:-len(label_suffix)]
            label_map[base] = os.path.join(label_dir, label_file)

    # sorted() makes the processing order reproducible; os.listdir returns
    # entries in arbitrary order.
    for audio_file in sorted(os.listdir(audio_dir)):
        stem, ext = os.path.splitext(audio_file)
        # Extension check is case-insensitive, consistent with the
        # case-folded name matching below.
        if ext.lower() != ".wav":
            continue

        base_name = stem.lower()
        audio_path = os.path.join(audio_dir, audio_file)

        if base_name in label_map:
            label_path = label_map[base_name]
            print(f"🔄 Processing: {audio_file} with {os.path.basename(label_path)}")
            labels_df = load_labels(label_path)
            slice_audio(audio_path, labels_df, output_dir, base_name)
        else:
            print(f"⚠️ Warning: No label file found for {audio_file}")

    print(f"✅ All files processed and saved to '{output_dir}/'")


In [None]:
# Run the process
# In a notebook kernel __name__ is "__main__", so the guard passes and the
# whole pipeline runs each time this cell is executed. zip_path must already
# have been defined by an earlier cell.
if __name__ == "__main__":
    process_all(zip_path)

✅ Unzipped dataset to 'unzipped_data'
🔄 Processing: R4.wav with R4_labels.txt
    ➤ Saved: split_data/dictation/r4_0.wav
    ➤ Saved: split_data/command/r4_1.wav
    ➤ Saved: split_data/dictation/r4_2.wav
    ➤ Saved: split_data/command/r4_3.wav
    ➤ Saved: split_data/dictation/r4_4.wav
    ➤ Saved: split_data/command/r4_5.wav
    ➤ Saved: split_data/dictation/r4_6.wav
    ➤ Saved: split_data/command/r4_7.wav
    ➤ Saved: split_data/dictation/r4_8.wav
    ➤ Saved: split_data/command/r4_9.wav
🔄 Processing: 10R.wav with 10R_labels.txt
    ➤ Saved: split_data/dictation/10r_0.wav
    ➤ Saved: split_data/command/10r_1.wav
    ➤ Saved: split_data/dictation/10r_2.wav
    ➤ Saved: split_data/command/10r_3.wav
    ➤ Saved: split_data/dictation/10r_4.wav
    ➤ Saved: split_data/command/10r_5.wav
    ➤ Saved: split_data/dictation/10r_6.wav
    ➤ Saved: split_data/command/10r_7.wav
    ➤ Saved: split_data/dictation/10r_8.wav
    ➤ Saved: split_data/command/10r_9.wav
🔄 Processing: R1.wav with R1_l

In [None]:
import shutil
from google.colab import files

# Zip the output folder; make_archive returns the path of the archive it
# wrote, so the file name does not have to be repeated below.
archive_path = shutil.make_archive('split_data', 'zip', 'split_data')

# Download to local system
files.download(archive_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>