In [10]:
# Imports and helper functions (enhanced extractor)
import os
from pathlib import Path
import librosa
import numpy as np
import pandas as pd
import json
import warnings
warnings.filterwarnings('ignore')

def _store_mean_std(feats, name, values):
    """Store the mean and standard deviation of ``values`` as ``<name>_mean`` / ``<name>_std``."""
    feats[f'{name}_mean'] = float(np.mean(values))
    feats[f'{name}_std'] = float(np.std(values))


def _store_zero_mean_std(feats, name):
    """Store 0.0 placeholders for ``<name>_mean`` / ``<name>_std``."""
    feats[f'{name}_mean'] = 0.0
    feats[f'{name}_std'] = 0.0


def _estimate_tempo(y, sr):
    """Return the first tempo estimate for ``y`` as a float (0.0 if the estimator returns nothing)."""
    onset_env = librosa.onset.onset_strength(y=y, sr=sr)
    # Newer librosa releases expose the estimator as ``librosa.feature.tempo`` and
    # deprecate ``librosa.beat.tempo``. Prefer the new location and only fall back
    # to the old one, so the caller's blanket ``except`` does not silently turn a
    # missing alias into tempo == 0.0 for every file.
    tempo_fn = getattr(librosa.feature, 'tempo', None)
    if tempo_fn is None:
        tempo_fn = librosa.beat.tempo
    tempo = np.atleast_1d(tempo_fn(onset_envelope=onset_env, sr=sr)).ravel()
    return float(tempo[0]) if tempo.size else 0.0


def extract_basic_features(path, sr=22050):
    """Extract summary audio features for a single file.

    Parameters
    ----------
    path : str or path-like
        Audio file handed to ``librosa.load``.
    sr : int, optional
        Sample rate requested from ``librosa.load`` (default 22050).

    Returns
    -------
    dict or None
        ``None`` if the file cannot be loaded (a message is printed).
        Otherwise a dict of floats, inserted in this order: ``duration``,
        ``sample_rate``, mean/std of RMS and zero-crossing rate, ``tempo``,
        mean/std of spectral centroid / rolloff / bandwidth, per-coefficient
        mean/std of 13 MFCCs with their delta and delta2, per-row mean/std of
        the chroma matrix, and per-row mean/std of spectral contrast.

    Notes
    -----
    Each feature group is computed independently. If a group raises, its
    values are set to 0.0 so every row keeps the same keys, and a message
    naming the group and file is printed so the placeholders are not mistaken
    for real measurements.
    """
    try:
        y, sr = librosa.load(path, sr=sr)
    except Exception as e:
        print('Failed to load', path, e)
        return None

    feats = {}

    def note_failure(group, err):
        # Same reporting style as the load failure above.
        print('Failed to compute', group, 'for', path, err)

    def add_stats(name, compute):
        # mean/std of a single per-frame series
        try:
            _store_mean_std(feats, name, compute())
        except Exception as err:
            note_failure(name, err)
            _store_zero_mean_std(feats, name)

    def add_row_stats(prefix, compute, fallback_rows):
        # mean/std for every row of a 2-D feature matrix; ``fallback_rows`` is the
        # number of zero-filled rows written when the computation fails
        try:
            matrix = compute()
            for i in range(matrix.shape[0]):
                _store_mean_std(feats, f'{prefix}_{i}', matrix[i])
        except Exception as err:
            note_failure(prefix, err)
            for i in range(fallback_rows):
                _store_zero_mean_std(feats, f'{prefix}_{i}')

    # basic metadata
    try:
        feats['duration'] = float(librosa.get_duration(y=y, sr=sr))
    except Exception as err:
        note_failure('duration', err)
        feats['duration'] = 0.0
    feats['sample_rate'] = float(sr)

    # RMS energy and zero-crossing rate (mean/std)
    add_stats('rms', lambda: librosa.feature.rms(y=y)[0])
    add_stats('zcr', lambda: librosa.feature.zero_crossing_rate(y)[0])

    # Tempo
    try:
        feats['tempo'] = _estimate_tempo(y, sr)
    except Exception as err:
        note_failure('tempo', err)
        feats['tempo'] = 0.0

    # Spectral centroid / rolloff / bandwidth (mean/std)
    add_stats('spectral_centroid', lambda: librosa.feature.spectral_centroid(y=y, sr=sr)[0])
    add_stats('spectral_rolloff', lambda: librosa.feature.spectral_rolloff(y=y, sr=sr)[0])
    add_stats('spectral_bandwidth', lambda: librosa.feature.spectral_bandwidth(y=y, sr=sr)[0])

    # MFCCs + deltas + delta2 (means and stds). Keys are interleaved per
    # coefficient (mfcc_i, mfcc_delta_i, mfcc_delta2_i) to keep column order stable.
    mfcc_prefixes = ('mfcc', 'mfcc_delta', 'mfcc_delta2')
    try:
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        stacks = (
            mfcc,
            librosa.feature.delta(mfcc),
            librosa.feature.delta(mfcc, order=2),
        )
        for i in range(mfcc.shape[0]):
            for prefix, stack in zip(mfcc_prefixes, stacks):
                _store_mean_std(feats, f'{prefix}_{i}', stack[i])
    except Exception as err:
        note_failure('mfcc', err)
        for i in range(13):
            for prefix in mfcc_prefixes:
                _store_zero_mean_std(feats, f'{prefix}_{i}')

    # Chroma (mean/std per bin) and spectral contrast (mean/std per band)
    add_row_stats('chroma', lambda: librosa.feature.chroma_stft(y=y, sr=sr), 12)
    add_row_stats('spectral_contrast', lambda: librosa.feature.spectral_contrast(y=y, sr=sr), 7)
    return feats

In [None]:
# Discovery, extraction loop, and saving
from pathlib import Path
import re

def infer_person_phrase(p: Path):
    """Derive a canonical ``(person, phrase)`` pair from a file path.

    Person
        The lowercase path components are searched for a substring, in this
        priority order: 'kerie' -> 'Kerie', 'irais' -> 'Irais',
        'fidel' (which also covers 'fidele') -> 'Fidele'. If none is
        found the person is 'unknown'.

    Phrase
        If the lowercase filename contains 'confirm' or 'transaction' the
        phrase is 'confirm_transaction'; otherwise, if it contains 'approve'
        or 'yes', the phrase is 'yes_approve'. Failing both, the stem is split
        on '_', '-' and whitespace, and the last token that is longer than two
        characters and is neither an augmentation marker nor a person name is
        used. If no token qualifies, the whole lowercase stem is returned.
    """
    filename = p.name.lower()
    stem = p.stem.lower()
    components = [piece.lower() for piece in p.parts if piece not in ('.', '')]

    # Person: first matching needle wins. The filename is itself a path
    # component, so scanning the components also covers the filename.
    person = 'unknown'
    for needle, label in (('kerie', 'Kerie'), ('irais', 'Irais'), ('fidel', 'Fidele')):
        if any(needle in piece for piece in components):
            person = label
            break

    # Phrase keywords are substring tests on the filename. Longer spellings such
    # as 'confirm_tx' or 'approve_yes' already contain one of these keywords, and
    # every stem token is a substring of the filename.
    if 'confirm' in filename or 'transaction' in filename:
        return person, 'confirm_transaction'
    if 'approve' in filename or 'yes' in filename:
        return person, 'yes_approve'

    # Fallback: last usable token of the stem, else the stem itself.
    augmentation_markers = {
        'aug', 'augmented', 'noise', 'pitch', 'time', 'fast', 'slow',
        'volume', 'up', 'down', 'stretch', 'orig', 'original',
    }
    person_tokens = {'fidel', 'fidele', 'kerie', 'irais'}
    phrase = stem
    for token in re.split(r'[_\-\s]+', stem):
        if len(token) <= 2:
            continue
        if token in augmentation_markers or token in person_tokens:
            continue
        phrase = token
    return person, phrase

# Output location for the feature files; created up front if it is missing.
out_dir = Path('features_audio')
out_dir.mkdir(parents=True, exist_ok=True)

# Directories scanned recursively for audio (edit these to match your layout).
search_roots = [Path(name) for name in ('audio_data', 'augmented_audio', 'audio')]
exts = {'.wav', '.flac', '.mp3', '.m4a', '.ogg'}

# Collect matches into a set so each path is kept once, then sort the result.
unique_files = set()
for root in search_roots:
    if not root.exists():
        continue
    unique_files.update(
        entry for entry in root.rglob('*') if entry.suffix.lower() in exts
    )
files = sorted(unique_files)
print(f'Found {len(files)} audio files to process')

rows = []
seen = 0  # remains 0 when there are no files to iterate over
for seen, f in enumerate(files, start=1):
    # Progress line every 50 files (printed before the current file is handled).
    if seen % 50 == 0:
        print('Processed', seen, 'files')
    person, phrase = infer_person_phrase(f)
    feats = extract_basic_features(str(f))
    if feats is None:
        # The extractor returned nothing for this file; leave it out of the table.
        continue
    audio_id, audio_name, audio_path = f.stem, f.name, str(f)
    # A path containing 'aug' anywhere (directory or filename) is labelled augmented.
    augmentation = 'augmented' if 'aug' in audio_path.lower() else 'original'
    is_augmented = augmentation != 'original'
    # Metadata keys first, then the extracted features.
    row = {
        'audio_id': audio_id,
        'person': str(person),
        'phrase': str(phrase),
        'audio_name': audio_name,
        'audio_path': audio_path,
        'augmentation': augmentation,
        'is_augmented': is_augmented,
        **feats,
    }
    rows.append(row)

print('Extraction complete, rows:', len(rows))
if not rows:
    raise RuntimeError('No feature rows were extracted. Check audio file locations.')

import numpy as np

# Build the table from the collected rows.
df = pd.DataFrame(rows)
meta = [
    'audio_id', 'person', 'phrase', 'audio_name',
    'audio_path', 'augmentation', 'is_augmented',
]
feature_cols = [col for col in df.columns if col not in meta]
# Only numeric non-metadata columns are listed in the feature-columns file.
numeric_features = df[feature_cols].select_dtypes(include=[np.number]).columns.tolist()

# Write the full table as CSV.
out_csv = out_dir / 'audio_features.csv'
df.to_csv(out_csv, index=False, encoding='utf-8')
print('Saved merged features to', out_csv)

# Write the numeric feature names, one per line.
feature_list_path = out_dir / 'feature_columns.txt'
with open(feature_list_path, 'w', encoding='utf-8') as fh:
    fh.write('\n'.join(numeric_features))
print('Saved feature column list to', feature_list_path)
print('Columns saved:', len(numeric_features))
print(df.head().to_string(index=False))

Found 76 audio files to process
Processed 50 files
Processed 50 files
Extraction complete, rows: 76
Saved merged features to features_audio\audio_features.csv
Saved feature column list to features_audio\feature_columns.txt
Columns saved: 129
                                audio_id  person phrase                                   audio_name                                                   audio_path augmentation  is_augmented  duration  sample_rate  rms_mean  rms_std  zcr_mean  zcr_std      tempo  spectral_centroid_mean  spectral_centroid_std  spectral_rolloff_mean  spectral_rolloff_std  spectral_bandwidth_mean  spectral_bandwidth_std  mfcc_0_mean  mfcc_0_std  mfcc_delta_0_mean  mfcc_delta_0_std  mfcc_delta2_0_mean  mfcc_delta2_0_std  mfcc_1_mean  mfcc_1_std  mfcc_delta_1_mean  mfcc_delta_1_std  mfcc_delta2_1_mean  mfcc_delta2_1_std  mfcc_2_mean  mfcc_2_std  mfcc_delta_2_mean  mfcc_delta_2_std  mfcc_delta2_2_mean  mfcc_delta2_2_std  mfcc_3_mean  mfcc_3_std  mfcc_delta_3_mean  mfcc_del