# Phoneme dataset preparation

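Pipeline to build a phoneme-level dataset from `data_wav` while excluding the Hessisch accent. Code and comments are in English. Run the cells in order, one step at a time; heavy steps (e.g. MFA alignment) are disabled by default and only run when their flag (such as `RUN_MFA`) is set to `True`.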

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import subprocess
import json
import time

# Root paths
# The project root is the parent directory when the kernel was started inside
# `notebooks/`, otherwise it is the current working directory itself.
_cwd = Path.cwd()
PROJECT_ROOT = _cwd.parent if _cwd.name == 'notebooks' else _cwd

# Base directories shared by the paths below
_data_wav = PROJECT_ROOT / 'data_wav'
_artifacts = PROJECT_ROOT / 'artifacts'

# Metadata: the full table and the copy with Hessisch rows removed
META_SRC = _data_wav / 'metadata_wav.csv'
META_NO_HESS = _data_wav / 'metadata_wav_no_hessisch.csv'

# Audio already normalized/resampled by `02.convert_parquet_to_wav.ipynb`
# so we reuse `data_wav` directly as the normalized source.
AUDIO_ROOT = _data_wav
AUDIO_OUT = AUDIO_ROOT

# Alignment inputs/outputs and the phoneme-level products derived from them
ALIGN_INPUT = _artifacts / 'align_input'
ALIGN_OUTPUT = _artifacts / 'align_output'
PHONEME_TBL = _artifacts / 'phoneme_intervals.csv'
PHONEME_AUDIO = _artifacts / 'phoneme_wav'

# Debug-log destination and the identifiers stamped on every log record
LOG_PATH = PROJECT_ROOT / '.cursor' / 'debug.log'
SESSION_ID = 'debug-session'
RUN_ID = 'post-fix'

# Make sure every output directory exists before later cells write into it
for _out_dir in (ALIGN_INPUT, ALIGN_OUTPUT, PHONEME_AUDIO):
    _out_dir.mkdir(parents=True, exist_ok=True)

# Show at most 30 rows but never truncate columns in DataFrame previews
pd.set_option('display.max_rows', 30)
pd.set_option('display.max_columns', None)

def dbg_log(hypothesis_id: str, location: str, message: str, data: dict | None = None):
    """Append one JSON record to the debug log at ``LOG_PATH``.

    Each record is a single line (JSON Lines) holding the session and run
    identifiers, the supplied fields and a millisecond Unix timestamp.

    Args:
        hypothesis_id: Label of the hypothesis this record relates to.
        location: Short tag for the call site (e.g. ``'cell6:import'``).
        message: Human-readable description of the event.
        data: Optional extra payload; ``None`` is stored as an empty dict.
    """
    payload = {
        "sessionId": SESSION_ID,
        "runId": RUN_ID,
        "hypothesisId": hypothesis_id,
        "location": location,
        "message": message,
        "data": data or {},
        "timestamp": int(time.time() * 1000),
    }
    # Nothing else in the notebook creates the log directory, so opening the
    # file in append mode would fail with FileNotFoundError on a fresh checkout.
    Path(LOG_PATH).parent.mkdir(parents=True, exist_ok=True)
    with open(LOG_PATH, "a", encoding="utf-8") as f:
        # default=str: a debug logger should not crash the pipeline because a
        # caller passed a Path or another non-JSON value inside `data`.
        f.write(json.dumps(payload, ensure_ascii=False, default=str) + "\n")

In [2]:
# Load metadata, ensure Hessisch is excluded
df = pd.read_csv(META_SRC)

# A row is Hessisch when either `subset` or `style` mentions it
# (case-insensitive; missing values count as "not Hessisch").
hess_in_subset = df['subset'].str.contains('Hessisch', case=False, na=False)
hess_in_style = df['style'].str.contains('Hessisch', case=False, na=False)
mask_no_hess = ~(hess_in_subset | hess_in_style)
df_no_hess = df.loc[mask_no_hess].copy()

n_total, n_kept = len(df), len(df_no_hess)
print(f'Rows original={n_total}, kept={n_kept}, removed={n_total - n_kept}')

# Persist the filtered table, then preview it as the cell output
df_no_hess.to_csv(META_NO_HESS, index=False)
df_no_hess.head()

Rows original=39248, kept=37142, removed=2106


Unnamed: 0,idx,id,subset,style,text,samplerate,durationSeconds,recording_year_month,microphone,language,comment,audio_wav_path
0,0,4aeeae88-0777-2c8c-5c93-2e844a462e49---0e52cfa...,TV-2021.02-Neutral,neutral,"Man könnte sagen, ich sei für diese Aufgabe pr...",16000,4.0,2019-11,bad_usbHeadset,german,,/Volumes/SSanDisk/SpeechRec-German/data_wav/TV...
1,1,4aeeae88-0777-2c8c-5c93-2e844a462e49---2a5f795...,TV-2021.02-Neutral,neutral,Das heutige vereinte System von Postleitzahlen...,16000,7.851562,2020-02,good_rodePodcaster,german,,/Volumes/SSanDisk/SpeechRec-German/data_wav/TV...
2,2,4aeeae88-0777-2c8c-5c93-2e844a462e49---6dba565...,TV-2021.02-Neutral,neutral,Eine komplizierte Story mit unzähligen Charakt...,16000,5.75,2020-02,good_rodePodcaster,german,,/Volumes/SSanDisk/SpeechRec-German/data_wav/TV...
3,3,4aeeae88-0777-2c8c-5c93-2e844a462e49---9d651de...,TV-2021.02-Neutral,neutral,Siehe weiter unten.,16000,1.540039,2019-12,bad_usbHeadset,german,,/Volumes/SSanDisk/SpeechRec-German/data_wav/TV...
4,4,4aeeae88-0777-2c8c-5c93-2e844a462e49---9c33fb6...,TV-2021.02-Neutral,neutral,Bei niedriger Last werden bis zu vier der acht...,16000,5.21875,2019-10,bad_usbHeadset,german,,/Volumes/SSanDisk/SpeechRec-German/data_wav/TV...


In [3]:
# Audio preprocessing is already done in `02.convert_parquet_to_wav.ipynb`
# which outputs mono 16 kHz, peak-normalized WAVs into `data_wav/`.
# Here we simply verify availability and, if needed, create a pointer table;
# no reprocessing or VAD is performed.
try:
    import torch  # optional: only for potential future steps
except ImportError:
    torch = None


def verify_audio_exists(df):
    """Report how many paths in ``df['audio_wav_path']`` are absent from disk.

    Prints the number of rows checked and the number of missing files, followed
    by up to five example paths when anything is missing. Returns nothing.
    """
    absent = [wav for wav in df['audio_wav_path'].tolist() if not Path(wav).exists()]
    print(f"Checked {len(df)} files, missing: {len(absent)}")
    if absent:
        print("Examples of missing:", absent[:5])

# Run a quick check (no processing): prints how many of the Hessisch-free
# rows point at a WAV file that is missing, plus up to five example paths.
verify_audio_exists(df_no_hess)

Checked 37142 files, missing: 0


In [4]:
# Populate MFA corpus with symlinks to normalized WAVs
from pathlib import Path
import shutil
import pandas as pd

ALIGN_CORPUS = ALIGN_INPUT / 'corpus'
ALIGN_CORPUS.mkdir(parents=True, exist_ok=True)

# Metadata sources in order of preference:
#   1. metadata_wav_clean.csv       - written by the later "save" cell, so it
#                                     only exists after a previous full run
#   2. metadata_wav_no_hessisch.csv - written by the load cell above
#   3. metadata_wav.csv             - raw table, still contains Hessisch rows
# Falling straight back to the raw table (as on a fresh kernel) would link
# Hessisch recordings into the corpus, defeating the exclusion step.
META_CLEAN = PROJECT_ROOT / 'data_wav' / 'metadata_wav_clean.csv'
META_FALLBACK = PROJECT_ROOT / 'data_wav' / 'metadata_wav.csv'
meta_path = next((p for p in (META_CLEAN, META_NO_HESS) if p.exists()), META_FALLBACK)
df_meta = pd.read_csv(meta_path)
if meta_path != META_CLEAN:
    print(f'Warning: metadata_wav_clean.csv not found, using {meta_path.name}')
if meta_path == META_FALLBACK:
    # Last resort: apply the same Hessisch filter as the load cell.
    hess_rows = (
        df_meta['subset'].str.contains('Hessisch', case=False, na=False)
        | df_meta['style'].str.contains('Hessisch', case=False, na=False)
    )
    df_meta = df_meta[~hess_rows].copy()
    print(f'Excluded {int(hess_rows.sum())} Hessisch rows from the raw metadata')

USE_SYMLINKS = True  # macOS supports symlinks; set False to copy files

created = 0
# Only two columns are needed, so iterate them directly instead of
# materialising a Series per row with iterrows().
for utt_id, wav_path in zip(df_meta['id'], df_meta['audio_wav_path']):
    src_wav = Path(wav_path)
    dst_wav = ALIGN_CORPUS / f"{utt_id}.wav"
    if dst_wav.parent != ALIGN_CORPUS:
        # Only reachable if an id contains a path separator.
        dst_wav.parent.mkdir(parents=True, exist_ok=True)
    # exists() follows symlinks and is False for a dangling link, which would
    # make symlink_to() fail with FileExistsError; check is_symlink() as well.
    if dst_wav.is_symlink() or dst_wav.exists():
        dst_wav.unlink()
    if USE_SYMLINKS:
        dst_wav.symlink_to(src_wav)
    else:
        shutil.copyfile(src_wav, dst_wav)
    created += 1

print(f'Prepared MFA corpus entries: {created} files at {ALIGN_CORPUS}')

Prepared MFA corpus entries: 37142 files at /Volumes/SSanDisk/SpeechRec-German/artifacts/align_input/corpus


In [5]:
# Text normalization
import re
try:
    from num2words import num2words
except ImportError:
    num2words = None

def normalize_text(text: str) -> str:
    """Return a cleaned single-line version of ``text``.

    Non-string input yields ``''``. Otherwise the text is stripped, every run
    of digits is passed to ``num2words`` with ``lang='de'`` when that package
    was imported (digits are kept unchanged when it was not), and every run of
    whitespace is collapsed to a single space.
    """
    if not isinstance(text, str):
        return ''

    def _spell_out(digits):
        # `digits` is the regex match for one run of digits
        raw = digits.group(0)
        return num2words(raw, lang='de') if num2words else raw

    spelled = re.sub(r'\d+', _spell_out, text.strip())
    return re.sub(r'\s+', ' ', spelled)

# Normalize every transcript, store it as `text_norm`, then preview the result
text_norm_column = df_no_hess['text'].map(normalize_text)
df_no_hess['text_norm'] = text_norm_column
df_no_hess[['id', 'text_norm']].head()

Unnamed: 0,id,text_norm
0,4aeeae88-0777-2c8c-5c93-2e844a462e49---0e52cfa...,"Man könnte sagen, ich sei für diese Aufgabe pr..."
1,4aeeae88-0777-2c8c-5c93-2e844a462e49---2a5f795...,Das heutige vereinte System von Postleitzahlen...
2,4aeeae88-0777-2c8c-5c93-2e844a462e49---6dba565...,Eine komplizierte Story mit unzähligen Charakt...
3,4aeeae88-0777-2c8c-5c93-2e844a462e49---9d651de...,Siehe weiter unten.
4,4aeeae88-0777-2c8c-5c93-2e844a462e49---9c33fb6...,Bei niedriger Last werden bis zu vier der acht...


In [None]:
# G2P using phonemizer + espeak-ng (de)
# Hypotheses:
# A: espeak-ng binary missing or not discoverable on PATH
# B: German voice data missing/unavailable for espeak-ng
# C: Phonemizer binding cannot load espeak-ng library (dylib path issue)

# Ensure phonemizer finds the native espeak/espeak-ng library on macOS.
import os
from pathlib import Path
# Point phonemizer at the first espeak(-ng) dylib found among these locations;
# when none exists the environment variable is left untouched.
_ESPEAK_DYLIBS = (
    '/opt/homebrew/lib/libespeak-ng.dylib',
    '/opt/homebrew/opt/espeak-ng/lib/libespeak-ng.dylib',
    '/opt/homebrew/lib/libespeak.dylib',
    '/opt/homebrew/opt/espeak/lib/libespeak.dylib',
)
_found_dylib = next((Path(lib) for lib in _ESPEAK_DYLIBS if Path(lib).exists()), None)
if _found_dylib is not None:
    os.environ['PHONEMIZER_ESPEAK_LIBRARY'] = str(_found_dylib)
    dbg_log('A', 'cell6:env', 'Set PHONEMIZER_ESPEAK_LIBRARY', {'runId': RUN_ID, 'path': str(_found_dylib)})

# The outer `try` deliberately wraps everything, so an ImportError raised at any
# point inside it (not only by the import line) ends in the "not installed" branch.
try:
    #region agent log
    dbg_log('A', 'cell6:import', 'Attempting to import EspeakBackend', {'runId': RUN_ID})
    #endregion
    from phonemizer.backend import EspeakBackend
    #region agent log
    backend_available = EspeakBackend.is_available()
    dbg_log('A', 'cell6:is_available', 'EspeakBackend availability check', {'runId': RUN_ID, 'available': backend_available})
    #endregion
    espeak = None
    try:
        # NOTE: the `''` in the middle of the literal below closes one string and
        # opens the next (implicit concatenation), so no apostrophe character is
        # actually part of the punctuation set.
        espeak = EspeakBackend(language='de', punctuation_marks=';:,.!?¡¿—…""''“”„”()')
        #region agent log
        dbg_log('B', 'cell6:init', 'EspeakBackend instantiated', {'runId': RUN_ID, 'voice': 'de'})
        #endregion
    except RuntimeError as init_err:
        # Record the failure for hypothesis C, then let it propagate.
        #region agent log
        dbg_log('C', 'cell6:init_error', 'EspeakBackend init failed', {'runId': RUN_ID, 'error': str(init_err)})
        #endregion
        raise
except ImportError:
    # Without a backend, g2p() below returns empty strings.
    espeak = None
    print('phonemizer not installed; install with `pip install phonemizer`')

def g2p(text: str) -> str:
    """Convert ``text`` to a phoneme string using the module-level ``espeak`` backend.

    Args:
        text: Normalized transcript.

    Returns:
        The phonemized text, or ``''`` when ``text`` is blank or not a string,
        when no backend is available (``espeak is None``), or when the backend
        returns no output.
    """
    # normalize_text() yields '' for non-string transcripts, and a column
    # reloaded from CSV can hold NaN; there is nothing to phonemize in either
    # case, so answer without invoking espeak at all.
    if not isinstance(text, str) or not text.strip():
        return ''
    if espeak is None:
        return ''
    phonemized = espeak.phonemize([text], strip=True, njobs=1)
    # Guard the index so an empty backend result cannot raise IndexError.
    return phonemized[0] if phonemized else ''

# Phonemize every normalized transcript via g2p, store the result as
# `phonemes_ipa`, then preview it next to the text it came from
ipa_column = df_no_hess['text_norm'].map(g2p)
df_no_hess['phonemes_ipa'] = ipa_column
df_no_hess[['id', 'text_norm', 'phonemes_ipa']].head()

Unnamed: 0,id,text_norm,phonemes_ipa
0,4aeeae88-0777-2c8c-5c93-2e844a462e49---0e52cfa...,"Man könnte sagen, ich sei für diese Aufgabe pr...",man kœntə zɑːɡən ɪç zaɪ fyːɾ diːzə aʊfɡɑːbə pɾ...
1,4aeeae88-0777-2c8c-5c93-2e844a462e49---2a5f795...,Das heutige vereinte System von Postleitzahlen...,das hɔøtɪɡə fɛɾaɪntə zʏsteːm fɔn pɔstlaɪtsɑːlə...
2,4aeeae88-0777-2c8c-5c93-2e844a462e49---6dba565...,Eine komplizierte Story mit unzähligen Charakt...,aɪnə kɔmpliːtsiːɾtə ʃtoːriː mɪt ʊntsɛːlɪɡən ka...
3,4aeeae88-0777-2c8c-5c93-2e844a462e49---9d651de...,Siehe weiter unten.,ziːə vaɪtɜ ʊntən
4,4aeeae88-0777-2c8c-5c93-2e844a462e49---9c33fb6...,Bei niedriger Last werden bis zu vier der acht...,baɪ niːdɾɪɡɜ last vɛɾdən bɪs tsuː fiːɾ dɛɾ axt...


In [7]:
# Save normalized metadata with G2P
# `df_no_hess` now carries the `text_norm` and `phonemes_ipa` columns added by
# the cells above. The MFA corpus cell reads this file when it exists, so a
# stale copy from an earlier run is what that cell sees until this one has run.
META_CLEAN = PROJECT_ROOT / 'data_wav' / 'metadata_wav_clean.csv'
df_no_hess.to_csv(META_CLEAN, index=False)
print('Saved', META_CLEAN)

Saved /Volumes/SSanDisk/SpeechRec-German/data_wav/metadata_wav_clean.csv


In [8]:
# Prepare MFA input (utterance list and transcriptions)
TRANS_TXT = ALIGN_INPUT / 'transcriptions.txt'
# Directory holding the WAV entries MFA will align; per-utterance transcripts
# must live next to them.
LAB_DIR = ALIGN_INPUT / 'corpus'
LAB_DIR.mkdir(parents=True, exist_ok=True)

n_lab = 0
# Explicit UTF-8: the transcripts contain umlauts/ß and the platform default
# encoding is not guaranteed to be UTF-8.
with open(TRANS_TXT, 'w', encoding='utf-8') as f:
    for utt_id, text in zip(df_no_hess['id'], df_no_hess['text_norm']):
        # Combined "<id> <text>" listing, kept for inspection/other tools.
        f.write(f"{utt_id} {text}\n")
        # MFA pairs each sound file with a transcript of the same base name
        # (.lab/.txt) inside the corpus; it does not read the combined listing.
        # Blank transcripts are skipped so MFA treats those files as untranscribed.
        if isinstance(text, str) and text.strip():
            (LAB_DIR / f"{utt_id}.lab").write_text(text + "\n", encoding='utf-8')
            n_lab += 1
print('Wrote transcripts to', TRANS_TXT)
print(f'Wrote {n_lab} .lab transcript files to {LAB_DIR}')

# MFA alignment (toggle RUN_MFA to actually run; heavy)
import os
from shutil import which

MFA_CORPUS = ALIGN_INPUT / 'corpus'
MFA_DICT = ALIGN_INPUT / 'lexicon.txt'  # set to dictionary path, or use pretrained dict
MFA_MODEL = os.environ.get('MFA_MODEL', 'german_mfa')  # replace with actual acoustic model name
RUN_MFA = False  # set True to run alignment
MFA_JOBS = int(os.environ.get('MFA_JOBS', '4'))

# corpus already populated above with symlinks
MFA_CORPUS.mkdir(parents=True, exist_ok=True)

# Locate the CLI: an explicit MFA_BIN wins over whatever `mfa` is on PATH.
mfa_bin = os.environ.get('MFA_BIN') or which('mfa')
if not mfa_bin:
    cmd = None
    print('mfa CLI not found; install Montreal Forced Aligner and/or set MFA_BIN')
else:
    # Positional arguments in the order: corpus, dictionary, acoustic model, output
    mfa_positionals = [MFA_CORPUS, MFA_DICT, MFA_MODEL, ALIGN_OUTPUT]
    cmd = [
        mfa_bin,
        'align',
        *map(str, mfa_positionals),
        '--clean',
        '--overwrite',
        '--num_jobs', str(MFA_JOBS),
    ]
    print('MFA command:', ' '.join(cmd))

if not (RUN_MFA and cmd):
    # Either the toggle is off or no CLI was found: only show how to run it.
    print('Alignment not executed (set RUN_MFA=True to run).')
    print('Example CLI: mfa align <corpus_dir> <dictionary> <acoustic_model> <output_dir> --clean --overwrite')
else:
    dbg_log('A', 'cell_mfa:run', 'Starting MFA alignment', {'cmd': cmd})
    # check=False: the exit code is inspected below, after both output streams
    # have been written to the debug log.
    result = subprocess.run(cmd, check=False, capture_output=True, text=True)
    dbg_log('A', 'cell_mfa:stdout', 'MFA stdout', {'stdout': result.stdout})
    dbg_log('A', 'cell_mfa:stderr', 'MFA stderr', {'stderr': result.stderr})
    if result.returncode != 0:
        raise RuntimeError(f'MFA failed with code {result.returncode}')
    print('MFA alignment completed.')

Wrote transcripts to /Volumes/SSanDisk/SpeechRec-German/artifacts/align_input/transcriptions.txt
Example MFA CLI:
mfa align <corpus_dir> <dictionary> <acoustic_model> <output_dir> --clean --overwrite


In [9]:
# Parse TextGrid outputs into a phoneme table
try:
    import textgrid
except ImportError:
    textgrid = None
    # The `textgrid` PyPI package provides the `textgrid.TextGrid.fromFile` API
    # used below; praatio exposes a different API under `praatio.textgrid`, so
    # installing it would not satisfy this import.
    print('textgrid not installed; install with `pip install textgrid`.')

# Tier names accepted as the phoneme tier (compared case-insensitively)
PHONE_TIER_NAMES = {'phones', 'phonemes', 'phone', 'phonem'}

records = []
skipped_no_tier = []
# sorted(): glob order depends on the filesystem; sorting keeps the row order
# of the phoneme table stable between runs.
textgrids = sorted(ALIGN_OUTPUT.glob('*.TextGrid')) if ALIGN_OUTPUT.exists() else []
if textgrid is not None and textgrids:
    print(f'Found {len(textgrids)} TextGrid files, parsing...')
    for tg_path in textgrids:
        utt_id = tg_path.stem
        tg = textgrid.TextGrid.fromFile(str(tg_path))
        tier = next((t for t in tg.tiers if t.name.lower() in PHONE_TIER_NAMES), None)
        if tier is None:
            # Remember the file instead of dropping it silently.
            skipped_no_tier.append(tg_path.name)
            continue
        for interval in tier.intervals:
            label = interval.mark.strip()
            if not label:
                # Unlabelled intervals carry no phoneme.
                continue
            records.append({
                'utterance_id': utt_id,
                'phoneme': label,
                'start_ms': interval.minTime * 1000,
                'end_ms': interval.maxTime * 1000,
                'duration_ms': (interval.maxTime - interval.minTime) * 1000,
            })
    if skipped_no_tier:
        print(f'Skipped {len(skipped_no_tier)} TextGrid files without a phone tier, e.g. {skipped_no_tier[:5]}')
elif textgrid is not None:
    print('No TextGrid files found in', ALIGN_OUTPUT)

# `phoneme_df` stays None until at least one interval has been parsed
phoneme_df = None
if records:
    phoneme_df = pd.DataFrame(records)
    phoneme_df.to_csv(PHONEME_TBL, index=False)
    display(phoneme_df.head())
    print('Saved phoneme table to', PHONEME_TBL)
else:
    print('No TextGrid files parsed yet; run MFA cell first.')

No TextGrid files parsed yet.


In [10]:
# Optional: slice phoneme-level audio using ffmpeg
def slice_with_ffmpeg(src_wav: Path, start_ms: float, end_ms: float, dst_wav: Path):
    """Cut the span from ``start_ms`` to ``end_ms`` out of ``src_wav`` into ``dst_wav``.

    Invokes the ``ffmpeg`` executable with mono / 16 kHz output options and
    ``-y`` (overwrite). The clip length is clamped to at least 1 ms. ffmpeg's
    own output is discarded.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
    """
    length_ms = max(end_ms - start_ms, 1.0)
    # ffmpeg takes seconds; format both values with millisecond precision
    seek_arg = f'{start_ms/1000:.3f}'
    length_arg = f'{length_ms/1000:.3f}'

    ffmpeg_argv = ['ffmpeg', '-y']
    ffmpeg_argv += ['-i', str(src_wav)]
    ffmpeg_argv += ['-ac', '1', '-ar', '16000']
    ffmpeg_argv += ['-ss', seek_arg, '-t', length_arg]
    ffmpeg_argv.append(str(dst_wav))

    subprocess.run(ffmpeg_argv, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

def slice_all(phoneme_df: pd.DataFrame, out_root=PHONEME_AUDIO, limit=None):
    """Write one WAV clip per row of ``phoneme_df`` using ``slice_with_ffmpeg``.

    Args:
        phoneme_df: Table with ``utterance_id``, ``phoneme``, ``start_ms`` and
            ``end_ms`` columns.
        out_root: Directory receiving the clips, named
            ``<utterance_id>__<phoneme>__<start>-<end>.wav`` with the
            millisecond bounds truncated to integers.
        limit: When given, only the first ``limit`` rows are processed.

    Raises:
        subprocess.CalledProcessError: if ffmpeg fails for a clip (including
            when the source WAV cannot be found in either location).
    """
    rows = phoneme_df if limit is None else phoneme_df.head(limit)
    # Flat `<id>.wav` entries created by the MFA corpus cell
    corpus_dir = ALIGN_INPUT / 'corpus'
    for _, row in rows.iterrows():
        utt_id = row['utterance_id']
        src = AUDIO_OUT / f"{utt_id}.wav"
        if not src.exists():
            # The metadata stores WAVs under `data_wav/TV...`, not directly as
            # `data_wav/<id>.wav`, so fall back to the corpus entry, which is
            # named by utterance id like the TextGrid the row was parsed from.
            src = corpus_dir / f"{utt_id}.wav"
        dst = out_root / f"{utt_id}__{row['phoneme']}__{int(row['start_ms'])}-{int(row['end_ms'])}.wav"
        dst.parent.mkdir(parents=True, exist_ok=True)
        slice_with_ffmpeg(src, row['start_ms'], row['end_ms'], dst)
    print(f'Sliced {len(rows)} phoneme clips')

# Example usage after alignment is available:
# slice_all(phoneme_df, limit=100)  # limit to avoid long run

In [11]:
# Quick QC: sample a few aligned utterances/phonemes
def qc_sample(phoneme_df: pd.DataFrame, n=5):
    """Return up to ``n`` randomly drawn rows of ``phoneme_df``.

    The draw uses a fixed seed (42), so repeated calls on the same table give
    the same rows. When the table has fewer than ``n`` rows, all are returned.
    """
    n_rows = len(phoneme_df)
    take = n if n < n_rows else n_rows
    return phoneme_df.sample(n=take, random_state=42)

# Example after alignment
# qc_sample(phoneme_df)