# Phoneme Audio Slicing with Context

Slice phoneme-level audio clips from MFA-aligned phoneme intervals **with extended context windows**.

**Key improvements:**
- Extended context: 100 ms of audio added before and after each phoneme (±100 ms)
- Boundary handling: automatically adjusts for utterance start/end
- Optimized for maximum accuracy: captures coarticulation, formant transitions, and VOT

Uses `phoneme_intervals.csv` (full list of all phonemes after MFA alignment).

In [None]:
from pathlib import Path
import pandas as pd
import subprocess
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import multiprocessing
import json
import os
from datetime import datetime

# Root of the project tree; every path below is derived from it.
PROJECT_ROOT = Path('/Volumes/SSanDisk/SpeechRec-German')

# Base dataset configuration
PHONEME_TBL = PROJECT_ROOT / 'artifacts' / 'phoneme_intervals.csv'  # Full MFA output (one row per phoneme interval)
AUDIO_ROOT = PROJECT_ROOT / 'data_wav'  # Source WAV files, indexed by build_audio_index()
PHONEME_AUDIO = PROJECT_ROOT / 'artifacts' / 'phoneme_wav_with_context'  # Default output directory for sliced clips

# Create the output directory up front (no error if it already exists).
PHONEME_AUDIO.mkdir(parents=True, exist_ok=True)

# Context window configuration
CONTEXT_MS = 100  # Context added on each side of a phoneme, in milliseconds (±100ms)
# For paired consonants (d/t, g/k, b/p), this captures:
# - Coarticulation with neighboring phonemes
# - Formant transitions
# - Full Voice Onset Time (VOT)
# - Positional effects

# Uncomment to cap the number of rows pandas prints:
#pd.options.display.max_rows = 30
pd.options.display.max_columns = None  # Do not truncate columns when displaying DataFrames


In [None]:
LOG_PATH = Path('/Volumes/SSanDisk/SpeechRec-German/.cursor/debug.log')


def debug_log(location, message, data, hypothesis_id=None):
    """Append one JSON line describing a debug event to LOG_PATH.

    Logging is strictly best-effort: any failure (unwritable path,
    non-serializable ``data``, ...) is swallowed so that a logging problem
    can never interrupt the pipeline.

    Args:
        location: Short tag identifying the call site.
        message: Human-readable description of the event.
        data: JSON-serializable payload stored under the "data" key.
        hypothesis_id: Optional label stored under "hypothesisId".
    """
    try:
        # Millisecond-resolution Unix timestamp.
        epoch_ms = int(datetime.now().timestamp() * 1000)
        record = {
            "sessionId": "debug-session",
            "runId": "run1",
            "hypothesisId": hypothesis_id,
            "location": location,
            "message": message,
            "data": data,
            "timestamp": epoch_ms,
        }
        with LOG_PATH.open('a', encoding='utf-8') as log_file:
            log_file.write(json.dumps(record) + '\n')
    except Exception:
        # Deliberately silent: debug logging must never raise.
        pass

# Successful ffprobe results, keyed by the path string. Every phoneme of an
# utterance asks for the duration of the same file, so remembering the answer
# avoids spawning one ffprobe process per phoneme. Failures are not cached,
# so a later call retries the probe.
_DURATION_CACHE_MS: dict[str, float] = {}


def get_audio_duration_ms(audio_path: Path) -> "float | None":
    """Return the duration of an audio file in milliseconds using ffprobe.

    Args:
        audio_path: Path of the audio file to probe.

    Returns:
        The duration in milliseconds, or ``None`` when it cannot be
        determined (ffprobe missing, ffprobe exits with an error, or its
        output is not a number).
    """
    cache_key = str(audio_path)
    cached = _DURATION_CACHE_MS.get(cache_key)
    if cached is not None:
        return cached

    cmd = [
        'ffprobe', '-v', 'error',
        '-show_entries', 'format=duration',
        '-of', 'default=noprint_wrappers=1:nokey=1',
        cache_key
    ]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        duration_ms = float(result.stdout.strip()) * 1000.0  # seconds -> ms
    except (OSError, subprocess.SubprocessError, ValueError):
        # OSError: ffprobe executable not found / not runnable.
        # SubprocessError: ffprobe returned a non-zero exit status.
        # ValueError: output was empty or not parseable as a float.
        return None

    # Concurrent callers may at worst probe the same file more than once;
    # they all store the same value.
    _DURATION_CACHE_MS[cache_key] = duration_ms
    return duration_ms

def build_audio_index(audio_root: Path) -> dict[str, Path]:
    """Map each WAV file's stem (used as utterance_id) to its path.

    Only ``audio_root`` itself and its immediate subdirectories are scanned;
    deeper levels are not visited. When several files share a stem, the one
    visited last wins, so files in subdirectories override root-level files.
    """
    debug_log("build_audio_index:entry", "Building audio file index", {
        "audio_root": str(audio_root),
        "audio_root_exists": audio_root.exists()
    }, "A")

    # Root-level files first, then the files of every direct subdirectory.
    wav_paths = list(audio_root.glob("*.wav"))
    for child in audio_root.iterdir():
        if child.is_dir():
            wav_paths.extend(child.glob("*.wav"))

    # Later entries overwrite earlier ones that have the same stem.
    stem_to_path = {wav_path.stem: wav_path for wav_path in wav_paths}

    debug_log("build_audio_index:complete", "Index built", {
        "total_files": len(wav_paths),
        "index_size": len(stem_to_path),
        "sample_keys": list(stem_to_path)[:5]
    }, "A")

    return stem_to_path

def slice_with_context_ffmpeg(src_wav: Path, start_ms: float, end_ms: float,
                               context_ms: float, utterance_duration_ms: float,
                               dst_wav: Path) -> tuple[float, float]:
    """
    Slice an audio segment plus surrounding context using ffmpeg.

    The requested interval is widened by ``context_ms`` on both sides and
    clamped to ``[0, utterance_duration_ms]``. The clip is written as mono
    16 kHz audio to ``dst_wav``, overwriting an existing file.

    Args:
        src_wav: Source audio file path
        start_ms: Phoneme start time (ms)
        end_ms: Phoneme end time (ms)
        context_ms: Context window size (ms) to add before and after
        utterance_duration_ms: Total duration of utterance (ms)
        dst_wav: Destination audio file path

    Returns:
        tuple: (extended_start_ms, extended_end_ms) - the boundaries that were
        actually passed to ffmpeg, including the 1 ms minimum-length
        adjustment when it applies.

    Raises:
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status.
        OSError: the ffmpeg executable could not be started.
    """
    # Widen the interval by the context window, staying inside the utterance.
    extended_start_ms = max(0.0, start_ms - context_ms)
    extended_end_ms = min(utterance_duration_ms, end_ms + context_ms)

    duration_ms = extended_end_ms - extended_start_ms

    # A degenerate window (zero or negative length, e.g. a start beyond the
    # end of the utterance) is replaced by a 1 ms clip. The end boundary is
    # moved along with it so the returned values describe what was cut.
    if duration_ms < 1.0:
        duration_ms = 1.0
        extended_end_ms = extended_start_ms + duration_ms

    # ffmpeg expects seconds; three decimals keep millisecond resolution.
    cmd = [
        'ffmpeg', '-y', '-i', str(src_wav),
        '-ac', '1', '-ar', '16000',
        '-ss', f'{extended_start_ms/1000:.3f}',
        '-t', f'{duration_ms/1000:.3f}',
        str(dst_wav)
    ]
    subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

    return extended_start_ms, extended_end_ms

def slice_single_phoneme_with_context(args):
    """
    Slice a single phoneme interval with context. Designed for parallel execution.

    Never raises for per-clip problems: every failure after the existence
    check (creating the output directory, probing, running ffmpeg) is
    reported through the returned status tuple.

    Args:
        args: tuple of (utt_id, phoneme, start_ms, end_ms, src_path_str, dst_path_str, context_ms)

    Returns:
        tuple: (status, utt_id, error_msg) where status is
            'skipped' - the source file does not exist (nothing is written),
            'created' - the clip was written,
            'error'   - something failed; error_msg holds str(exception).
        error_msg is None unless status is 'error'.
    """
    # The phoneme label is part of the task tuple but is not needed here
    # (it is already encoded in the destination file name).
    utt_id, _phoneme, start_ms, end_ms, src_path_str, dst_path_str, context_ms = args

    src = Path(src_path_str)
    dst = Path(dst_path_str)

    if not src.exists():
        return ('skipped', utt_id, None)

    try:
        # Inside the try so that a directory-creation failure is reported as
        # an 'error' result instead of escaping to the caller.
        dst.parent.mkdir(parents=True, exist_ok=True)

        utterance_duration_ms = get_audio_duration_ms(src)

        if utterance_duration_ms is None:
            # Fallback: use original boundaries without context if duration
            # can't be determined. Zero context plus an unbounded utterance
            # length makes the shared helper cut exactly [start_ms, end_ms]
            # (a negative start_ms is clamped to 0 by the helper).
            context_ms = 0.0
            utterance_duration_ms = float('inf')

        slice_with_context_ffmpeg(
            src, start_ms, end_ms, context_ms, utterance_duration_ms, dst
        )
        return ('created', utt_id, None)
    except Exception as e:
        return ('error', utt_id, str(e))

def slice_all_with_context(phoneme_df: pd.DataFrame, out_root=PHONEME_AUDIO,
                          context_ms=CONTEXT_MS, limit=None, use_tqdm=True, n_workers=None):
    """
    Slice all phoneme intervals into audio clips with context using parallel processing.

    Source audio is looked up under AUDIO_ROOT by utterance_id. Each clip is
    written to ``out_root`` as ``{utt_id}__{phoneme}__{start_ms}-{end_ms}.wav``;
    the name carries the ORIGINAL interval boundaries, while the audio itself
    is extended by the context window. A summary is printed at the end.

    Args:
        phoneme_df: DataFrame with phoneme intervals. Must contain the columns
            'utterance_id', 'start_ms' and 'end_ms'; the label is read from
            'phoneme', else 'mapped_phoneme', else 'unknown' is used.
        out_root: Output directory for sliced phonemes (str or Path)
        context_ms: Context window size in milliseconds
        limit: Limit number of phonemes to process (None for all)
        use_tqdm: Show progress bar
        n_workers: Number of parallel workers (None = use all CPU cores)
    """
    rows = phoneme_df if limit is None else phoneme_df.head(limit)
    out_root = Path(out_root)

    debug_log("slice_all_with_context:entry", "Starting slice_all_with_context", {
        "total_rows": len(rows),
        "audio_root": str(AUDIO_ROOT),
        "audio_root_exists": AUDIO_ROOT.exists(),
        "context_ms": context_ms
    }, "A")

    # Build audio file index once
    print("Building audio file index...")
    audio_index = build_audio_index(AUDIO_ROOT)
    print(f"Index built: {len(audio_index):,} audio files found")

    # Resolve the label column once instead of once per row
    # ('phoneme' first, 'mapped_phoneme' for compatibility).
    if 'phoneme' in rows.columns:
        phonemes = rows['phoneme']
    elif 'mapped_phoneme' in rows.columns:
        phonemes = rows['mapped_phoneme']
    else:
        phonemes = ['unknown'] * len(rows)

    # Process in parallel
    created = 0
    skipped = 0
    errors = 0
    error_messages = []

    # Prepare tasks. Iterating the columns directly avoids building one
    # pandas Series per row, as iterrows() does.
    tasks = []
    for utt_id, phoneme, start_ms, end_ms in zip(
            rows['utterance_id'], phonemes, rows['start_ms'], rows['end_ms']):
        src = audio_index.get(utt_id)

        if src is None:
            # No audio for this utterance: count it so the final summary
            # reflects every interval that produced no clip. (Whether an
            # indexed file still exists is re-checked by the worker.)
            skipped += 1
            continue

        # Output filename includes original boundaries for reference
        # Format: {utt_id}__{phoneme}__{start_ms}-{end_ms}.wav
        # The actual audio will have extended boundaries with context
        dst = out_root / f"{utt_id}__{phoneme}__{int(start_ms)}-{int(end_ms)}.wav"

        # Paths are passed as plain strings inside the task tuple
        tasks.append((
            utt_id,
            phoneme,
            start_ms,
            end_ms,
            str(src),
            str(dst),
            context_ms
        ))

    print(f"Prepared {len(tasks):,} tasks for processing")
    print(f"Context window: ±{context_ms}ms (total extension: {context_ms * 2}ms)")

    # Determine number of workers (default: number of CPU cores)
    if n_workers is None:
        n_workers = multiprocessing.cpu_count()
    print(f"Using {n_workers} parallel workers")

    with ThreadPoolExecutor(max_workers=n_workers) as executor:
        # Submit all tasks
        futures = [executor.submit(slice_single_phoneme_with_context, task) for task in tasks]

        # Process results (optionally with a progress bar) as they finish
        iterator = as_completed(futures)
        if use_tqdm:
            iterator = tqdm(iterator, total=len(futures), desc='Slicing phonemes with context')

        for future in iterator:
            status, utt_id, error_msg = future.result()

            if status == 'created':
                created += 1
            elif status == 'skipped':
                skipped += 1
            elif status == 'error':
                errors += 1
                # Only the first few errors are kept and echoed.
                if len(error_messages) < 5:
                    error_messages.append((utt_id, error_msg))
                    print(f"Error slicing {utt_id}: {error_msg}")

    debug_log("slice_all_with_context:complete", "Slicing complete", {
        "created": created,
        "skipped": skipped,
        "errors": errors,
        "index_size": len(audio_index),
        "n_workers": n_workers,
        "context_ms": context_ms
    }, "A")

    print('\nSlicing complete:')
    print(f'  Created: {created:,} clips')
    print(f'  Skipped: {skipped:,} (source audio missing)')
    print(f'  Errors: {errors:,}')
    print(f'  Output: {out_root}')
    print(f'  Context: ±{context_ms}ms per phoneme')

    if error_messages:
        print(f'\nFirst {len(error_messages)} errors:')
        for utt_id, msg in error_messages:
            print(f'  {utt_id}: {msg}')


## Load Phoneme Intervals


In [3]:
# Load phoneme intervals from MFA alignment (full list, all phonemes)
print(f"Loading {PHONEME_TBL}...")
df = pd.read_csv(PHONEME_TBL)
print(f"Loaded {len(df):,} phoneme intervals")

# Resolve the phoneme label column (try 'phoneme' first, fallback to 'mapped_phoneme')
required_cols = ['utterance_id', 'start_ms', 'end_ms']
if 'phoneme' in df.columns:
    phoneme_col = 'phoneme'
elif 'mapped_phoneme' in df.columns:
    phoneme_col = 'mapped_phoneme'
    # Create 'phoneme' column for consistency
    df['phoneme'] = df['mapped_phoneme']
else:
    raise ValueError("Neither 'phoneme' nor 'mapped_phoneme' column found in CSV")
required_cols.append(phoneme_col)

# Verify required columns before anything below indexes into them
missing = [c for c in required_cols if c not in df.columns]
if missing:
    raise ValueError(f"Missing required columns: {missing}")

print(f"\nUnique phonemes: {df[phoneme_col].nunique()}")
print(f"Unique utterances: {df['utterance_id'].nunique()}")

# Show phoneme distribution. One constant drives both the heading and the
# number of rows so the label cannot drift from what is displayed.
TOP_N_PHONEMES = 60
print(f"\nTop {TOP_N_PHONEMES} phonemes:")
print(df[phoneme_col].value_counts().head(TOP_N_PHONEMES))

# Show statistics (derive the duration column when the CSV does not provide it)
if 'duration_ms' in df.columns:
    print("\nDuration statistics:")
else:
    df['duration_ms'] = df['end_ms'] - df['start_ms']
    print("\nDuration statistics (calculated):")
print(df['duration_ms'].describe())

display(df.head())


Loading /Volumes/SSanDisk/SpeechRec-German/artifacts/phoneme_intervals.csv...
Loaded 1,337,749 phoneme intervals

Unique phonemes: 53
Unique utterances: 37139

Top 20 phonemes:
phoneme
n     126506
ə      92825
t      74454
ɪ      64786
d      58538
       ...  
tʃ       605
ɲ        560
l̩       180
c        117
m̩        13
Name: count, Length: 53, dtype: int64

Duration statistics:
count    1.337749e+06
mean     8.783295e+01
std      7.683843e+01
min      5.000000e+00
25%      5.000000e+01
50%      8.000000e+01
75%      1.100000e+02
max      2.788000e+03
Name: duration_ms, dtype: float64


Unnamed: 0,utterance_id,phoneme,start_ms,end_ms,duration_ms
0,4aeeae88-0777-2c8c-5c93-2e844a462e49---4783573...,d,0.0,30.0,30.0
1,4aeeae88-0777-2c8c-5c93-2e844a462e49---4783573...,iː,30.0,200.0,170.0
2,4aeeae88-0777-2c8c-5c93-2e844a462e49---4783573...,h,200.0,210.0,10.0
3,4aeeae88-0777-2c8c-5c93-2e844a462e49---4783573...,øː,210.0,460.0,250.0
4,4aeeae88-0777-2c8c-5c93-2e844a462e49---4783573...,ə,460.0,600.0,140.0


## Configuration and Execution

In [None]:
# Configuration
SLICE_ALL = True  # Set False to skip slicing
LIMIT = None  # Set to number (e.g., 1000) to limit for testing, None for all
N_WORKERS = None  # Number of parallel workers (None = use all CPU cores, or set specific number like 4, 8, etc.)

# Context window configuration
CONTEXT_MS = 100  # ±100ms context window
# This captures:
# - Coarticulation with neighboring phonemes
# - Formant transitions (F1, F2, F3)
# - Full Voice Onset Time (VOT) for paired consonants
# - Positional effects (word-initial, word-medial, word-final)

if SLICE_ALL:
    # Compare against None rather than truthiness: LIMIT = 0 also limits the
    # run (to zero intervals) and should therefore be announced as well.
    if LIMIT is not None:
        print(f"⚠️  WARNING: Limiting to first {LIMIT:,} intervals for testing")
    slice_all_with_context(df, out_root=PHONEME_AUDIO, context_ms=CONTEXT_MS, limit=LIMIT, n_workers=N_WORKERS)
else:
    print("Slicing disabled (set SLICE_ALL=True to run)")


Building audio file index...
Index built: 39,248 audio files found
Prepared 1,337,749 tasks for processing
Context window: ±100ms (total extension: 200ms)
Using 8 parallel workers


Slicing phonemes with context:   0%|          | 5296/1337749 [01:27<6:08:19, 60.29it/s] 


## Verification

Check a few samples to verify the context extension is working correctly.


In [None]:
from itertools import islice

import librosa
import soundfile as sf
from IPython.display import Audio, display

# Check a few samples. islice stops the directory scan after five matches
# instead of first collecting the path of every clip in the output folder.
sample_files = list(islice(PHONEME_AUDIO.glob("*.wav"), 5))

if sample_files:
    print(f"Checking {len(sample_files)} sample files...")
    for sample_file in sample_files:
        audio, sr = librosa.load(sample_file, sr=16000, mono=True)
        duration_ms = len(audio) / sr * 1000

        # Parse filename to get original boundaries
        # Format: {utt_id}__{phoneme}__{start_ms}-{end_ms}.wav
        # Split from the right so a '__' inside the utterance id cannot
        # shift the phoneme/boundary fields.
        parts = sample_file.stem.rsplit('__', 2)
        if len(parts) == 3:
            boundary_str = parts[2]
            if '-' in boundary_str:
                orig_start, orig_end = map(int, boundary_str.split('-'))
                orig_duration = orig_end - orig_start
                # Less than 2 * context when the window was clamped at the
                # start or end of the utterance.
                context_added = duration_ms - orig_duration

                print(f"\nFile: {sample_file.name}")
                print(f"  Original: {orig_start}-{orig_end}ms ({orig_duration:.1f}ms)")
                print(f"  With context: {duration_ms:.1f}ms")
                print(f"  Context added: {context_added:.1f}ms")
            else:
                print(f"\nFile: {sample_file.name}")
                print(f"  Duration: {duration_ms:.1f}ms")
else:
    print("No sample files found. Run slicing first.")


Checking 5 sample files...

File: 4aeeae88-0777-2c8c-5c93-2e844a462e49---f7108371f2d34bf141a8f47ed562d12f__l__4100-4190.wav
  Original: 4100-4190ms (90.0ms)
  With context: 290.0ms
  Context added: 200.0ms

File: dd01c488-10f3-a683-00cf-4d215f4d9b19---25440f463c2793c8e1ac644192df9f3c__ʁ__1640-1690.wav
  Original: 1640-1690ms (50.0ms)
  With context: 250.0ms
  Context added: 200.0ms

File: 4aeeae88-0777-2c8c-5c93-2e844a462e49---de5cfabc2dbc5cf5d474e066c839754d__z__500-620.wav
  Original: 500-620ms (120.0ms)
  With context: 320.0ms
  Context added: 200.0ms

File: dd01c488-10f3-a683-00cf-4d215f4d9b19---9d4b962eadb910852ad16f9c77babdd2__aː__2360-2430.wav
  Original: 2360-2430ms (70.0ms)
  With context: 270.0ms
  Context added: 200.0ms

File: 4aeeae88-0777-2c8c-5c93-2e844a462e49---7ea325f654b9d558ae0346593800a30c__m__940-1040.wav
  Original: 940-1040ms (100.0ms)
  With context: 300.0ms
  Context added: 200.0ms
