In [1]:
import os
import zipfile
import shutil
import random
from pathlib import Path

In [2]:
# ==== CONFIG ====
# Path to the dataset zip archive — change this to your zip file location.
ZIP_PATH = "l2arctic_release_v5.0.zip"

# Folder in which the reduced dataset is created.
OUTPUT_DIR = "L2_ARCTIC_SUBSET"

# Speakers to include in the subset; any number of speaker IDs may be listed.
SELECTED_SPEAKERS = [
    "HJK",
    "BWC",
    "YBAA",
    "SVBI",
    "THV",
]

# Upper bound on the number of WAV files sampled for each speaker.
MAX_FILES_PER_SPK = 500

# Seed for the random sampling, so the selection can be repeated.
SEED = 42


In [3]:
# ==== STEP 1: EXTRACT THE MAIN ZIP FILE TO TEMP ====
# Unpack ZIP_PATH into `extract_path`, reusing a previous extraction if present.
# The archive is first unpacked into a staging directory and only renamed to
# `extract_path` once extraction has finished, so an interrupted run can never
# leave a half-filled `extract_path` that a later run mistakes for a complete one.
extract_path = "temp_l2arctic_extract"
if os.path.exists(extract_path):
    print("✅ Reusing existing extraction at:", extract_path)
else:
    staging_path = extract_path + ".partial"
    # Drop leftovers of an earlier interrupted extraction before starting over.
    shutil.rmtree(staging_path, ignore_errors=True)
    with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
        zip_ref.extractall(staging_path)
    os.rename(staging_path, extract_path)
    print("✅ Full dataset extracted to:", extract_path)

✅ Full dataset extracted to: temp_l2arctic_extract


In [4]:
# ==== STEP 2: EXTRACT SPEAKER ZIPS AND SAMPLE WAV FILES ====
# For every speaker in SELECTED_SPEAKERS: unpack <speaker>.zip found in
# `extract_path`, randomly pick at most MAX_FILES_PER_SPK WAV files, and copy
# them to OUTPUT_DIR/<speaker>/wav. A speaker whose zip or wav folder is
# missing is reported and skipped; the remaining speakers are still processed.
random.seed(SEED)
os.makedirs(OUTPUT_DIR, exist_ok=True)

for speaker in SELECTED_SPEAKERS:
    speaker_zip_path = os.path.join(extract_path, f"{speaker}.zip")  # Path to the speaker's zip
    if not os.path.exists(speaker_zip_path):
        print(f"⚠️ Speaker {speaker} zip not found!")
        continue

    # Extract the speaker's zip (skipped when a previous extraction is present)
    speaker_extract_path = os.path.join(extract_path, speaker)
    if not os.path.exists(speaker_extract_path):
        with zipfile.ZipFile(speaker_zip_path, 'r') as zip_ref:
            zip_ref.extractall(speaker_extract_path)
    print(f"✅ Extracted {speaker}'s zip to: {speaker_extract_path}")

    # Expected layout inside the extracted archive: <speaker>/wav (e.g., HJK/wav)
    src_wav_dir = os.path.join(speaker_extract_path, speaker, "wav")
    if not os.path.exists(src_wav_dir):
        print(f"⚠️ WAV folder not found for speaker {speaker}!")
        continue

    target_wav_dir = os.path.join(OUTPUT_DIR, speaker, "wav")
    os.makedirs(target_wav_dir, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort them so that the
    # seeded sample selects the same files on every machine and filesystem.
    all_wavs = sorted(f for f in os.listdir(src_wav_dir) if f.endswith(".wav"))
    sampled_wavs = random.sample(all_wavs, min(MAX_FILES_PER_SPK, len(all_wavs)))

    # Copy the sampled files (with metadata) to the output directory
    for wav in sampled_wavs:
        shutil.copy2(os.path.join(src_wav_dir, wav), os.path.join(target_wav_dir, wav))

    print(f"✅ Copied {len(sampled_wavs)} files for speaker {speaker}")

print("🎉 Reduced dataset ready at:", OUTPUT_DIR)

# Clean up temporary extraction. This always runs: the next execution of the
# notebook has to extract the main zip again before this cell can be re-run.
shutil.rmtree(extract_path)
print("🗑️ Temporary extraction cleaned up.")

✅ Extracted HJK's zip to: temp_l2arctic_extract\HJK
✅ Copied 500 files for speaker HJK
✅ Extracted BWC's zip to: temp_l2arctic_extract\BWC
✅ Copied 500 files for speaker BWC
✅ Extracted YBAA's zip to: temp_l2arctic_extract\YBAA
✅ Copied 500 files for speaker YBAA
✅ Extracted SVBI's zip to: temp_l2arctic_extract\SVBI
✅ Copied 500 files for speaker SVBI
✅ Extracted THV's zip to: temp_l2arctic_extract\THV
✅ Copied 500 files for speaker THV
🎉 Reduced dataset ready at: L2_ARCTIC_SUBSET
🗑️ Temporary extraction cleaned up.
