In [None]:
import os
import shutil
import random
import glob
from pathlib import Path
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm  # specific for Jupyter progress bars

# ================= CONFIGURATION =================
# Input folder read by split_dataset_notebook(): every immediate subfolder is
# treated as one class.
SOURCE_DIR = Path("processed_data")
# Destination root; files are copied to OUTPUT_DIR/<split>/<class>/.
OUTPUT_DIR = Path("dataset_split")

# Glob patterns applied inside each class folder (non-recursive).
# Add "*.mp3" or others if needed.
EXTENSIONS = ("*.wav", "*.flac")

# Split ratios. TRAIN_RATIO sets the size of the first split; VAL_RATIO and
# TEST_RATIO are only used relative to each other to divide the remainder.
TRAIN_RATIO = 0.8
VAL_RATIO   = 0.1
TEST_RATIO  = 0.1

# Seed for Python's `random` module and the random_state of both splits.
SEED = 42
# =================================================

def split_dataset_notebook():
    """Copy the files of every class in SOURCE_DIR into train/val/test folders.

    Every immediate subfolder of SOURCE_DIR is treated as one class. The files
    in it that match EXTENSIONS are split class by class, so each class keeps
    roughly the configured proportions, and are copied (not moved) to
    ``OUTPUT_DIR/<split>/<class>/``.

    Small classes do not abort the run with a ``ValueError`` from
    ``train_test_split``: a class with a single file is copied to train only,
    and a hold-out remainder of a single file is copied to val only. A warning
    is printed in both cases.

    Files that already exist under OUTPUT_DIR are not removed, and the final
    table counts everything found in the output folders.

    Returns:
        None. Progress, warnings and the summary table are printed.
    """
    # 1. Setup
    random.seed(SEED)

    if not SOURCE_DIR.exists():
        print(f"‚ùå Error: Folder '{SOURCE_DIR}' not found. Check your path!")
        return

    # Detect classes automatically from subfolders
    classes = sorted(d.name for d in SOURCE_DIR.iterdir() if d.is_dir())

    print(f"üìÇ Found {len(classes)} classes: {classes}")
    print("-" * 60)

    # Share of the hold-out part (val + test) that goes to val.
    val_relative_ratio = VAL_RATIO / (VAL_RATIO + TEST_RATIO)

    # 2. Iterate and Split
    # We use tqdm to show a progress bar in Jupyter
    for cls in tqdm(classes, desc="Processing Classes"):

        # Gather files; sorted so the seeded shuffle always starts from the
        # same order regardless of how the filesystem lists them.
        cls_dir = SOURCE_DIR / cls
        files = sorted(f for ext in EXTENSIONS for f in cls_dir.glob(ext))

        if not files:
            print(f"‚ö†Ô∏è Warning: No audio files found in '{cls}'")
            continue

        # --- Stratified Split Logic ---
        # 1. Split Train vs (Val + Test)
        if len(files) < 2:
            # train_test_split cannot split a single sample.
            print(f"Warning: '{cls}' has only 1 file; copying it to train only")
            train_files, temp_files = files, []
        else:
            train_files, temp_files = train_test_split(
                files, test_size=(1 - TRAIN_RATIO), random_state=SEED, shuffle=True
            )

        # 2. Split Val vs Test (adjusting ratio for the smaller temp set)
        if len(temp_files) < 2:
            # A single hold-out file cannot be divided again.
            if temp_files:
                print(
                    f"Warning: '{cls}' leaves only 1 hold-out file; "
                    "copying it to val (test stays empty)"
                )
            val_files, test_files = temp_files, []
        else:
            val_files, test_files = train_test_split(
                temp_files, test_size=(1 - val_relative_ratio), random_state=SEED, shuffle=True
            )

        # --- Copying Files ---
        splits = {
            "train": train_files,
            "val":   val_files,
            "test":  test_files
        }

        for split_name, split_files in splits.items():
            save_dir = OUTPUT_DIR / split_name / cls
            save_dir.mkdir(parents=True, exist_ok=True)

            for f in split_files:
                # copy2 also carries over file metadata such as timestamps.
                shutil.copy2(f, save_dir / f.name)

    # 3. Final Summary
    print("\n‚úÖ Dataset Split Complete!")
    print(f"   Output Location: {OUTPUT_DIR.resolve()}")
    print("-" * 30)
    print(f"{'Class':<25} | {'Train':<5} | {'Val':<5} | {'Test':<5}")
    print("-" * 30)

    for cls in classes:
        # Counts reflect what is on disk, including files from earlier runs.
        n_train = len(list((OUTPUT_DIR / "train" / cls).glob("*")))
        n_val   = len(list((OUTPUT_DIR / "val" / cls).glob("*")))
        n_test  = len(list((OUTPUT_DIR / "test" / cls).glob("*")))
        print(f"{cls:<25} | {n_train:<5} | {n_val:<5} | {n_test:<5}")

# Run the split: copies files into OUTPUT_DIR and prints a per-class summary
split_dataset_notebook()

In [None]:
import os
import glob
import librosa
import soundfile as sf
import numpy as np
from pathlib import Path
from tqdm.notebook import tqdm
from audiomentations import Compose, AddGaussianNoise, TimeStretch, Gain, Shift

# ================= CONFIGURATION =================
# Only the training split is augmented; augmented copies are written next to
# the originals inside each class folder.
TRAIN_DIR = Path("dataset_split/train")
# Glob patterns applied inside each class folder (non-recursive).
EXTENSIONS = ("*.wav", "*.flac")

# Augmentation pipeline for voice pathology recordings.
# Each transform is applied with its own probability p=0.5, so a single call
# may apply any subset of them, including none at all.
augmenter = Compose([
    # Add varying levels of noise (simulates different recording environments)
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),

    # Change speed (rate between 0.8x and 1.25x) without changing pitch
    TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),

    # Change volume by up to +/-12 dB (robustness to microphone gain).
    # NOTE: keyword names are 'min_gain_db' / 'max_gain_db'; an earlier
    # revision of this cell used 'min_gain_in_db'.
    Gain(min_gain_db=-12, max_gain_db=12, p=0.5),

    # Shift the signal in time (start slightly later/earlier).
    # NOTE: keyword names are 'min_shift' / 'max_shift'; an earlier revision
    # used 'min_fraction'. Values are presumably fractions of the signal
    # length (the library's default unit) - confirm against the installed
    # audiomentations version.
    Shift(min_shift=-0.1, max_shift=0.1, p=0.5),
])
# =================================================

def run_offline_augmentation():
    """Write one augmented copy next to every original file in TRAIN_DIR.

    For each class folder in TRAIN_DIR, every file matching EXTENSIONS whose
    stem does not end in ``_aug`` is loaded at its native sample rate, passed
    through the module-level ``augmenter`` and saved as
    ``<stem>_aug<suffix>`` in the same folder. An existing augmented file of
    the same name is overwritten.

    Because every transform in ``augmenter`` fires with its own probability,
    a call can return the input unchanged. The pipeline is re-run a few times
    in that case so that the saved file is not an exact duplicate of the
    original.

    Files that fail to load, augment or save are reported and skipped.

    Returns:
        None. Progress and a final count are printed.
    """
    if not TRAIN_DIR.exists():
        print(f"‚ùå Error: {TRAIN_DIR} does not exist. Run the split step first.")
        return

    # Upper bound on augmenter calls per file when the result equals the input.
    max_attempts = 5

    # Get all classes in train (sorted for a stable processing order)
    classes = sorted(d.name for d in TRAIN_DIR.iterdir() if d.is_dir())
    total_new_files = 0

    print(f"üöÄ Starting Augmentation on {len(classes)} classes...")

    for cls in classes:
        cls_dir = TRAIN_DIR / cls

        # Gather original files. Outputs of a previous run are recognised by
        # the "_aug" stem suffix this function appends; matching only the
        # suffix keeps originals such as "x_august.wav" in the list.
        files = sorted(
            f
            for ext in EXTENSIONS
            for f in cls_dir.glob(ext)
            if not f.stem.endswith("_aug")
        )

        print(f"   ‚Ä¢ {cls}: Augmenting {len(files)} files...")

        for file_path in tqdm(files, desc=f"Augmenting {cls}", leave=False):
            try:
                # 1. Load Audio
                # We use librosa to load as float32, which audiomentations expects
                y, sr = librosa.load(file_path, sr=None)

                # 2. Apply Augmentation, retrying when no transform fired
                augmented_y = augmenter(samples=y, sample_rate=sr)
                attempts = 1
                while attempts < max_attempts and np.array_equal(augmented_y, y):
                    augmented_y = augmenter(samples=y, sample_rate=sr)
                    attempts += 1

                # 3. Save
                # Create output filename: "original_aug.wav"
                out_name = file_path.stem + "_aug" + file_path.suffix
                out_path = cls_dir / out_name

                sf.write(out_path, augmented_y, sr)
                total_new_files += 1

            except Exception as e:
                # Best effort: report the file and continue with the next one.
                print(f"     ‚ö†Ô∏è Error processing {file_path.name}: {e}")

    print("-" * 40)
    print("‚úÖ Augmentation Complete!")
    print(f"üéâ Created {total_new_files} new training samples.")
    print(f"üìÇ Check folder: {TRAIN_DIR}")

# Run the augmentation: writes "<stem>_aug<suffix>" files into TRAIN_DIR
run_offline_augmentation()

In [1]:
import torch
import librosa
import numpy as np
from pathlib import Path
from tqdm import tqdm
from transformers import Wav2Vec2Processor, Wav2Vec2Model

# ================= CONFIG =================
# Root of the train/val/test folders to extract features from.
DATASET_DIR = Path("dataset_split")
# Root for the saved .npy features ("ssl" and "sfm" subtrees are created).
# NOTE: this rebinds OUTPUT_DIR, which the dataset-split cell also defines
# (as "dataset_split"); re-run that cell's config before calling its function
# again in the same kernel.
OUTPUT_DIR  = Path("features")

# Pretrained checkpoint loaded for both the processor and the model.
MODEL_NAME = "facebook/wav2vec2-base-960h"
# Every file is resampled to this rate on load.
TARGET_SR = 16000
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Number of files sent through the model in one forward pass.
BATCH_SIZE = 32   # reduce to 4 if VRAM < 8GB
# =========================================


def align_features(feat: np.ndarray, target_len: int):
    """Resample a (frames, dims) feature matrix to ``target_len`` frames.

    Source and target frames are both placed evenly on [0, 1], and every
    feature dimension is linearly interpolated on its own.

    Args:
        feat: 2-D array indexed as [frame, dimension].
        target_len: Number of frames in the result.

    Returns:
        Array of shape (target_len, feat.shape[1]).
    """
    src_positions = np.linspace(0, 1, feat.shape[0])
    dst_positions = np.linspace(0, 1, target_len)

    resampled_columns = []
    for dim in range(feat.shape[1]):
        column = feat[:, dim]
        resampled_columns.append(np.interp(dst_positions, src_positions, column))

    # Columns are stacked side by side so frames stay on axis 0.
    return np.stack(resampled_columns, axis=1)


def extract_features():
    """Extract and save features for every audio file under DATASET_DIR.

    Loads the wav2vec2 processor and model named by MODEL_NAME, creates the
    ``OUTPUT_DIR/{ssl,sfm}/<split>/<class>`` folders, then loads every
    ``.wav`` / ``.flac`` file found recursively under DATASET_DIR at TARGET_SR
    and hands the audio to ``process_batch`` in groups of BATCH_SIZE.

    Error handling is best effort:
      * a file that cannot be loaded (or decodes to no samples) is reported
        and skipped;
      * a batch that fails inside ``process_batch`` is reported as a batch
        failure and dropped, so later batches keep the configured size.

    Returns:
        None. Progress and the output location are printed.
    """

    print(f"üîÑ Loading wav2vec2 on {DEVICE}")
    processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
    model = Wav2Vec2Model.from_pretrained(MODEL_NAME).to(DEVICE)
    model.eval()

    # Create output directories
    for split in ["train", "val", "test"]:
        split_dir = DATASET_DIR / split
        if not split_dir.is_dir():
            # A missing split should not abort the run before any work starts.
            print(f"Warning: split folder '{split_dir}' not found, skipping it")
            continue
        for cls in split_dir.iterdir():
            if cls.is_dir():
                (OUTPUT_DIR / "ssl" / split / cls.name).mkdir(parents=True, exist_ok=True)
                (OUTPUT_DIR / "sfm" / split / cls.name).mkdir(parents=True, exist_ok=True)

    # Sorted so batch composition is the same on every run.
    audio_files = sorted(
        list(DATASET_DIR.rglob("*.wav")) + list(DATASET_DIR.rglob("*.flac"))
    )
    print(f"üöÄ Found {len(audio_files)} files")

    def run_batch(audio, paths):
        """Process one batch, reporting instead of raising on failure."""
        try:
            process_batch(audio, paths, processor, model)
        except Exception as e:
            print(
                f"Failed processing batch of {len(paths)} files "
                f"starting at {paths[0].name}: {e}"
            )

    batch_audio = []
    batch_paths = []

    for wav_path in tqdm(audio_files, desc="Extracting features"):

        # Only the load is guarded here, so the message below is accurate.
        try:
            y, _ = librosa.load(wav_path, sr=TARGET_SR)
        except Exception as e:
            print(f"‚ùå Failed loading {wav_path.name}: {e}")
            continue

        if len(y) == 0:
            print(f"Warning: {wav_path.name} contains no samples, skipping it")
            continue

        batch_audio.append(y)
        batch_paths.append(wav_path)

        # ---------- PROCESS BATCH ----------
        if len(batch_audio) >= BATCH_SIZE:
            run_batch(batch_audio, batch_paths)
            # Always start a fresh batch, even if the previous one failed;
            # otherwise the buffer would keep growing past BATCH_SIZE.
            batch_audio, batch_paths = [], []

    # Process remaining files
    if len(batch_audio) > 0:
        run_batch(batch_audio, batch_paths)

    print("\n‚úÖ EXTRACTION COMPLETE")
    print(f"Saved to: {OUTPUT_DIR.resolve()}")


def _ssl_frame_count(num_samples, model):
    """Return how many encoder frames the model emits for `num_samples` samples.

    Applies the output-length formula of each convolution in the feature
    encoder, using the kernel sizes and strides from the model config.
    """
    frames = num_samples
    for kernel, stride in zip(model.config.conv_kernel, model.config.conv_stride):
        frames = (frames - kernel) // stride + 1
    return frames


def process_batch(batch_audio, batch_paths, processor, model):
    """Compute and save SSL and source/filter features for one batch of files.

    The batch is padded to its longest item and sent through the model in one
    forward pass. The SSL feature of a file is the mean of the last four
    hidden-state layers, cut to the frames that correspond to that file's own
    samples so that padding frames are not saved. Source/filter features
    (f0, RMS energy, voicing probability and the magnitudes of the first four
    LPC coefficients) are computed per file and linearly resampled to the same
    number of frames.

    Both arrays are saved as float32 ``.npy`` files under
    ``OUTPUT_DIR/ssl/...`` and ``OUTPUT_DIR/sfm/...``, mirroring the file's
    path relative to DATASET_DIR.

    Args:
        batch_audio: Sequence of 1-D waveforms sampled at TARGET_SR.
        batch_paths: Paths of the files the waveforms were loaded from, in the
            same order as ``batch_audio``.
        processor: Callable that pads the batch and returns ``input_values``.
        model: wav2vec2 model returning hidden states.

    Returns:
        None. A file whose per-file features fail is reported and skipped; the
        remaining files of the batch are still saved.

    Note:
        The padded forward pass itself is unchanged, so embeddings of short
        files can still be influenced by the zero padding inside the model.
    """

    # ================= SSL (GPU) =================
    inputs = processor(
        batch_audio,
        sampling_rate=TARGET_SR,
        return_tensors="pt",
        padding=True
    )

    with torch.no_grad():
        outputs = model(
            inputs.input_values.to(DEVICE),
            output_hidden_states=True
        )

    # Average last 4 layers
    ssl_batch = torch.stack(outputs.hidden_states[-4:]).mean(0)
    ssl_batch = ssl_batch.cpu().numpy()  # (B, T_padded, hidden)
    padded_frames = ssl_batch.shape[1]

    # ================= PER FILE: SFM + SAVE =================
    for i, (wav_path, y) in enumerate(zip(batch_paths, batch_audio)):
        try:
            # Reuse the waveform that was already decoded for the forward
            # pass instead of reading the file a second time.
            y = np.asarray(y)

            # Keep only the frames produced by this file's own samples; the
            # rest of the padded sequence comes from zero padding.
            valid_frames = _ssl_frame_count(len(y), model)
            valid_frames = max(1, min(valid_frames, padded_frames))
            ssl_feat = ssl_batch[i, :valid_frames]
            T = ssl_feat.shape[0]

            # -------- SOURCE FEATURES --------
            f0, _, voiced_prob = librosa.pyin(
                y, fmin=50, fmax=500, sr=TARGET_SR
            )
            # pyin marks unvoiced frames with NaN; store them as 0.
            f0 = np.nan_to_num(f0)
            voiced_prob = np.nan_to_num(voiced_prob)

            energy = librosa.feature.rms(y=y)[0]

            # -------- FILTER FEATURES (LPC) --------
            lpc = librosa.lpc(y, order=12)
            lpc = np.abs(lpc[:4])

            # Trim the frame-level tracks to a common length before stacking.
            min_len = min(len(f0), len(energy))
            f0 = f0[:min_len]
            energy = energy[:min_len]
            voiced_prob = voiced_prob[:min_len]

            # The LPC values are per file, so they are repeated on every frame.
            lpc_feat = np.repeat(lpc[:, None], min_len, axis=1)

            sfm_raw = np.vstack([
                f0,
                energy,
                voiced_prob,
                lpc_feat
            ]).T

            sfm_feat = align_features(sfm_raw, T)

            # -------- SAVE --------
            rel = wav_path.relative_to(DATASET_DIR)
            ssl_path = OUTPUT_DIR / "ssl" / rel.with_suffix(".npy")
            sfm_path = OUTPUT_DIR / "sfm" / rel.with_suffix(".npy")

            # Files may sit in folders that were not pre-created.
            ssl_path.parent.mkdir(parents=True, exist_ok=True)
            sfm_path.parent.mkdir(parents=True, exist_ok=True)

            np.save(ssl_path, ssl_feat.astype(np.float32))
            np.save(sfm_path, sfm_feat.astype(np.float32))

        except Exception as e:
            # One unusable recording should not discard the whole batch.
            print(f"Failed extracting features for {wav_path.name}: {e}")


# ================= RUN =================
# Loads the model and writes .npy features for every file under DATASET_DIR.
extract_features()


üîÑ Loading wav2vec2 on cuda


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


üöÄ Found 16585 files


Extracting features: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16585/16585 [57:44<00:00,  4.79it/s] 



‚úÖ EXTRACTION COMPLETE
Saved to: A:\Conferences\VOICE\features
