# MFA alignment

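End-to-end Montreal Forced Aligner (MFA) run: populates the corpus directory, writes transcripts, runs MFA, and parses the resulting TextGrid files into a phoneme-interval table.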


In [None]:
# Optional one-time setup: uncomment the line below to install pandas and tqdm,
# which the import cell of this notebook requires.
#!pip install pandas tqdm

In [None]:
from pathlib import Path
import os
import subprocess
from shutil import which
import pandas as pd
from tqdm import tqdm

# Resolve the project root: when the kernel was started inside a directory
# named `notebooks`, the project lives one level up; otherwise the current
# working directory is taken to be the root.
_cwd = Path.cwd()
if _cwd.name == 'notebooks':
    PROJECT_ROOT = _cwd.parent
else:
    PROJECT_ROOT = _cwd

# Source audio directory and the cleaned metadata table stored inside it.
AUDIO_ROOT = PROJECT_ROOT / 'data_wav'
META_CLEAN = AUDIO_ROOT / 'metadata_wav_clean.csv'

# Everything this notebook produces goes under artifacts/.
ARTIFACTS = PROJECT_ROOT / 'artifacts'
PHONEME_TBL = ARTIFACTS / 'phoneme_intervals.csv'
ALIGN_OUTPUT = ARTIFACTS / 'align_output'
ALIGN_INPUT = ARTIFACTS / 'align_input'
TRANS_TXT = ALIGN_INPUT / 'transcriptions.txt'
ALIGN_CORPUS = ALIGN_INPUT / 'corpus'

# Create the working directories up front; safe to re-run.
for _work_dir in (ARTIFACTS, ALIGN_INPUT, ALIGN_OUTPUT, ALIGN_CORPUS):
    _work_dir.mkdir(parents=True, exist_ok=True)

# The metadata CSV must already exist before any alignment step can run.
assert META_CLEAN.exists(), f"Missing {META_CLEAN}, run 04 notebook first"
df = pd.read_csv(META_CLEAN)
print('Loaded rows:', len(df))

Loaded rows: 37142


In [None]:
# Populate MFA corpus with symlinks (resumable).
#
# For every utterance id in `df`, link ALIGN_CORPUS/<id>.wav -> AUDIO_ROOT/<id>.wav.
# Counters:
#   created - new links made in this run
#   skipped - destination already present (or created concurrently)
#   missing - source WAV not found, so no link could be made
created = 0
skipped = 0
missing = 0
# Iterate the id column directly: only the id is needed, and this avoids the
# per-row Series construction (and dtype coercion) that iterrows() performs.
for utt_id in tqdm(df['id'], total=len(df), desc='symlinks'):
    src = AUDIO_ROOT / f"{utt_id}.wav"
    dst = ALIGN_CORPUS / f"{utt_id}.wav"
    if dst.exists():
        skipped += 1
        continue
    if not src.exists():
        missing += 1
        continue
    dst.parent.mkdir(parents=True, exist_ok=True)
    # Path.exists() follows symlinks, so a dangling link reaches this point
    # looking "absent". Remove it so the link can be recreated instead of
    # failing with FileExistsError and staying broken on every run.
    if dst.is_symlink():
        dst.unlink()
    try:
        dst.symlink_to(src)
        created += 1
    except FileExistsError:
        # Destination appeared between the check and the link call.
        skipped += 1

print(f'Symlinks created={created}, skipped={skipped}, missing_source={missing}, corpus={ALIGN_CORPUS}')

symlinks: 100%|██████████| 37142/37142 [00:00<00:00, 44321.10it/s]

Symlinks created=0, skipped=37142, corpus=/Volumes/SSanDisk/SpeechRec-German/artifacts/align_input/corpus





In [None]:
# Write transcripts file: one "<id> <normalized text>" line per metadata row.
# The encoding is pinned to UTF-8 so non-ASCII characters in the transcripts
# are written identically regardless of the platform's default encoding.
with open(TRANS_TXT, 'w', encoding='utf-8') as f:
    for utt_id, text in zip(df['id'], df['text_norm']):
        f.write(f"{utt_id} {text}\n")
print('Wrote transcripts to', TRANS_TXT)

Wrote transcripts to /Volumes/SSanDisk/SpeechRec-German/artifacts/align_input/transcriptions.txt


In [None]:
# MFA alignment (instrumented)
#
# Steps performed by this cell:
#   1. Resolve the MFA executable, dictionary, acoustic model and job count
#      (each overridable through an environment variable).
#   2. Write a <utt_id>.lab transcript next to every corpus WAV that has a
#      transcript in `df` and no .lab file yet.
#   3. Run `mfa align` (only when RUN_MFA is True) and report the exit code.
#   4. Count the TextGrid files present in ALIGN_OUTPUT.

# Configure paths/models
# Resolution order for the executable: $MFA_BIN, then the project-local
# install if it exists on this machine, then whatever `mfa` is on PATH.
_DEFAULT_MFA_BIN = '/Volumes/SSanDisk/SpeechRec-German/miniforge/envs/mfa310/bin/mfa'
MFA_BIN = os.environ.get('MFA_BIN') or (
    _DEFAULT_MFA_BIN if Path(_DEFAULT_MFA_BIN).exists() else (which('mfa') or _DEFAULT_MFA_BIN)
)
MFA_DICT = os.environ.get('MFA_DICT') or 'german_mfa'  # set to dictionary path or model name
MFA_MODEL = os.environ.get('MFA_MODEL') or 'german_mfa'  # acoustic model name or path
MFA_JOBS = int(os.environ.get('MFA_JOBS', '4'))
RUN_MFA = True  # set to False to prepare .lab files without launching MFA
RESUME = False  # if True, do not pass --clean/--overwrite

# Count corpus files (glob once and reuse the list for the .lab loop below)
wav_paths = list(ALIGN_CORPUS.glob('*.wav'))
total_wavs = len(wav_paths)
print(f'Corpus contains {total_wavs} WAV files')

# Create .lab files for each .wav (MFA expects .lab files, not transcriptions.txt)
trans_dict = dict(zip(df['id'], df['text_norm']))
lab_created = 0
lab_missing = 0
for wav_path in tqdm(wav_paths, desc='creating .lab files', total=total_wavs):
    lab_path = wav_path.with_suffix('.lab')
    if lab_path.exists():
        continue  # skip if already exists
    text = trans_dict.get(wav_path.stem)
    # Only real strings are written. A missing id or a non-string value
    # (e.g. NaN from an empty CSV field) counts as a missing transcription,
    # rather than leaving behind an empty .lab that later runs would skip.
    if isinstance(text, str):
        lab_path.write_text(text, encoding='utf-8')
        lab_created += 1
    else:
        lab_missing += 1
print(f'Created {lab_created} .lab files, missing transcriptions: {lab_missing}')

# Remove transcriptions.txt if exists (MFA should use .lab files instead)
corpus_trans = ALIGN_CORPUS / 'transcriptions.txt'
if corpus_trans.exists():
    corpus_trans.unlink()

cmd = [MFA_BIN, 'align', str(ALIGN_CORPUS), str(MFA_DICT), str(MFA_MODEL), str(ALIGN_OUTPUT), '--num_jobs', str(MFA_JOBS)]
if not RESUME:
    cmd.extend(['--clean', '--overwrite'])

result = None
mfa_ok = False
if RUN_MFA:
    print('Running MFA:', ' '.join(cmd))
    # Output is not captured, so MFA's own progress/log lines stream into the cell.
    result = subprocess.run(cmd, text=True)
    print('MFA exited with', result.returncode)
    mfa_ok = result.returncode == 0
    if not mfa_ok:
        print('MFA failed. Check output above for errors.')
else:
    print('RUN_MFA is False; skipping alignment. Command would be:', ' '.join(cmd))

# Check if alignment succeeded
textgrids = list(ALIGN_OUTPUT.glob('*.TextGrid')) if ALIGN_OUTPUT.exists() else []
if mfa_ok:
    print(f'MFA alignment completed. TextGrid files created: {len(textgrids)}')
else:
    # Do not claim completion when MFA was skipped or returned an error.
    print(f'MFA alignment did not complete in this run. TextGrid files present: {len(textgrids)}')


creating .lab files: 100%|██████████| 39248/39248 [00:04<00:00, 9730.01it/s] 


Created 37142 .lab files, missing transcriptions: 2106
Running MFA: /Volumes/SSanDisk/SpeechRec-German/miniforge/envs/mfa310/bin/mfa align /Volumes/SSanDisk/SpeechRec-German/artifacts/align_input/corpus german_mfa german_mfa /Volumes/SSanDisk/SpeechRec-German/artifacts/align_output --num_jobs 4 -v DEBUG --clean --overwrite
MFA exited with 0
MFA stdout (tail):
[2K[35m  92%[0m [91m━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m36,031/39,248 [0m [ [33m0:00:35[0m < [36m0:00:03[0m , [31m1,327 it/s[0m ]
[2K[35m  92%[0m [91m━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m36,181/39,248 [0m [ [33m0:00:35[0m < [36m0:00:03[0m , [31m1,328 it/s[0m ]
[2K[35m  93%[0m [91m━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m36,321/39,248 [0m [ [33m0:00:35[0m < [36m0:00:03[0m , [31m1,328 it/s[0m ]
[2K[35m  93%[0m [91m━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m36,461/39,248 [0m [ [33m0:00:35[0m < [36m0:00:03[0m , [31m1,328 it/s[0m ]
[2K[35m 

In [7]:
# Parse TextGrid outputs into a phoneme table with progress.
#
# For every *.TextGrid in ALIGN_OUTPUT, the first tier whose name matches one
# of the phone-tier names below is read, and each non-empty interval becomes
# one row (utterance_id, phoneme, start_ms, end_ms, duration_ms). The table is
# saved to PHONEME_TBL as CSV.
try:
    import textgrid
except ImportError:
    textgrid = None
    # `TextGrid.fromFile` used below is the API of the `textgrid` package.
    print('textgrid not installed; install with `pip install textgrid`.')

# Tier names (compared lower-cased) accepted as the phone tier.
phone_tier_names = {'phones', 'phonemes', 'phone', 'phonem'}

records = []
no_phone_tier = 0  # TextGrid files skipped because no matching tier was found
textgrids = list(ALIGN_OUTPUT.glob('*.TextGrid')) if ALIGN_OUTPUT.exists() else []
if textgrid is not None and textgrids:
    for tg_path in tqdm(textgrids, desc='parse TextGrid'):
        utt_id = tg_path.stem
        tg = textgrid.TextGrid.fromFile(str(tg_path))
        tier = next((t for t in tg.tiers if t.name.lower() in phone_tier_names), None)
        if tier is None:
            no_phone_tier += 1
            continue
        for interval in tier.intervals:
            label = interval.mark.strip()
            if not label:
                continue  # empty mark: nothing to record for this interval
            # Interval times are multiplied by 1000 to store milliseconds
            # (assumes the TextGrid times are in seconds -- TODO confirm).
            records.append({
                'utterance_id': utt_id,
                'phoneme': label,
                'start_ms': interval.minTime * 1000,
                'end_ms': interval.maxTime * 1000,
                'duration_ms': (interval.maxTime - interval.minTime) * 1000,
            })
    if no_phone_tier:
        print(f'Skipped {no_phone_tier} TextGrid files without a phone tier')
elif textgrid is not None:
    print('No TextGrid files found in', ALIGN_OUTPUT)

phoneme_df = None
if records:
    phoneme_df = pd.DataFrame(records)
    phoneme_df.to_csv(PHONEME_TBL, index=False)
    print('Saved phoneme intervals to', PHONEME_TBL)
    display(phoneme_df.head())
elif textgrid is not None:
    # Only blame the MFA step when the parser itself was available.
    print('No TextGrid files parsed yet; run MFA cell first.')

parse TextGrid: 100%|██████████| 37139/37139 [00:27<00:00, 1369.08it/s]


Saved phoneme intervals to /Volumes/SSanDisk/SpeechRec-German/artifacts/phoneme_intervals.csv


Unnamed: 0,utterance_id,phoneme,start_ms,end_ms,duration_ms
0,4aeeae88-0777-2c8c-5c93-2e844a462e49---4783573...,d,0.0,30.0,30.0
1,4aeeae88-0777-2c8c-5c93-2e844a462e49---4783573...,iː,30.0,200.0,170.0
2,4aeeae88-0777-2c8c-5c93-2e844a462e49---4783573...,h,200.0,210.0,10.0
3,4aeeae88-0777-2c8c-5c93-2e844a462e49---4783573...,øː,210.0,460.0,250.0
4,4aeeae88-0777-2c8c-5c93-2e844a462e49---4783573...,ə,460.0,600.0,140.0
