# Word Boundary Forced Alignment Test

This notebook tests the word boundary insertion algorithm that is based on forced alignment with MFA (Montreal Forced Aligner).

## Goals:
1. Test MFA alignment on sample audio files
2. Compare OLD (proportional) vs NEW (MFA-based) word boundary insertion
3. Verify that phonemes are preserved at word boundaries
4. Evaluate boundary accuracy

In [1]:
import sys
from pathlib import Path
sys.path.append(str(Path('.').absolute().parent))

from modules.mfa_aligner import get_mfa_aligner
from modules.word_boundary_utils import insert_word_boundaries_mfa, insert_word_boundaries
from modules.phoneme_recognition import PhonemeRecognizer
from modules.g2p_module import get_g2p
import pandas as pd
import torchaudio
import numpy as np

ModuleNotFoundError: No module named 'modules.mfa_aligner'

In [None]:
# Build the MFA aligner used by the rest of the notebook.
# The binary path, dictionary and model are spelled out explicitly here
# and handed to the factory as keyword arguments.
mfa_settings = {
    "mfa_bin": "/Volumes/SSanDisk/SpeechRec-German/miniforge/envs/mfa310/bin/mfa",
    "mfa_dict": "german_mfa",
    "mfa_model": "german_mfa",
}
mfa_aligner = get_mfa_aligner(**mfa_settings)

# Confirm which binary the aligner object ended up with
print("MFA aligner initialized!")
print(f"MFA binary: {mfa_aligner.mfa_bin}")

MFA aligner initialized!
MFA binary: /Volumes/SSanDisk/SpeechRec-German/miniforge/envs/mfa310/bin/mfa


In [None]:
# Load a few examples from metadata
df = pd.read_csv('/Volumes/SSanDisk/SpeechRec-German-diagnostic/data/dictionaries/metadata_wav_clean_hochdeutsch.csv')

# Fix audio paths: swap the directory prefix stored in the CSV for the one
# the audio is read from in this notebook.
# regex=False forces a literal substring replacement; with pandas < 2.0 the
# default would interpret the path as a regular expression.
df['audio_path_fixed'] = df['audio_wav_path'].str.replace(
    '/Volumes/SSanDisk/SpeechRec-German/',
    '/Volumes/SSanDisk/audio_data/',
    regex=False,
)

# Sample up to 10 records for testing. The fixed random_state keeps the
# selection reproducible; capping n at len(df) avoids the ValueError that
# DataFrame.sample raises when asked for more rows than exist.
test_df = df.sample(n=min(10, len(df)), random_state=42)
print(f"Loaded {len(test_df)} test records")
test_df[['text', 'audio_path_fixed']].head()

Loaded 10 test records


Unnamed: 0,text,audio_path_fixed
17466,Kannst du programmieren?,/Volumes/SSanDisk/audio_data/data_wav/TV-2021....
34962,nichts desto trotz ist das jedoch ein schönes ...,/Volumes/SSanDisk/audio_data/data_wav/TV-2022....
5561,Verschiedene Karrierewege sind denkbar.,/Volumes/SSanDisk/audio_data/data_wav/TV-2021....
10405,Aber Fotosynthese mit F sieht schon komisch aus.,/Volumes/SSanDisk/audio_data/data_wav/TV-2021....
7516,Manchmal muss man einfach ein bisschen Quatsch...,/Volumes/SSanDisk/audio_data/data_wav/TV-2021....


In [None]:
# Run MFA on the first sampled record and list the phoneme timings it returns
test_record = test_df.iloc[0]

print(f"Text: {test_record['text']}")
print(f"Audio: {test_record['audio_path_fixed']}")

# Everything below stays inside the try so that any failure (alignment or
# formatting) is reported as a single "Error: ..." line instead of a traceback.
try:
    mfa_result = mfa_aligner.align_single_file(
        test_record['audio_path_fixed'], test_record['text']
    )

    print(f"\nMFA phonemes ({len(mfa_result)}):")
    # Only the first 20 entries are shown to keep the output short
    for segment in mfa_result[:20]:
        print(f"  {segment['phoneme']:5s} {segment['start_ms']:7.1f} - {segment['end_ms']:7.1f} ms")
except Exception as err:
    print(f"Error: {err}")

Text: Kannst du programmieren?
Audio: /Volumes/SSanDisk/audio_data/data_wav/TV-2021.02-Neutral/4aeeae88-0777-2c8c-5c93-2e844a462e49---6e5325ee54e5d4a685fb3ba30c0efa69.wav

MFA phonemes (18):
  kʰ        0.0 -    80.0 ms
  a        80.0 -   130.0 ms
  n       130.0 -   180.0 ms
  s       180.0 -   260.0 ms
  t       260.0 -   300.0 ms
  d       300.0 -   310.0 ms
  uː      310.0 -   370.0 ms
  p       370.0 -   490.0 ms
  ʁ       490.0 -   500.0 ms
  ɔ       500.0 -   560.0 ms
  ɡ       560.0 -   680.0 ms
  ʁ       680.0 -   690.0 ms
  a       690.0 -   740.0 ms
  m       740.0 -   800.0 ms
  iː      800.0 -   890.0 ms
  ʁ       890.0 -   970.0 ms
  ə       970.0 -  1020.0 ms
  n      1020.0 -  1130.0 ms


In [None]:
# Side-by-side run of the OLD (proportional) and NEW (MFA) boundary insertion
# on the first three sampled records.
recognizer = PhonemeRecognizer()
g2p = get_g2p()


def _boundary_indices(sequence):
    """Return the indices at which the '||' marker occurs in ``sequence``."""
    return [idx for idx, token in enumerate(sequence) if token == '||']


for i, row in test_df.head(3).iterrows():
    text, audio_path = row['text'], row['audio_path_fixed']

    print(f"\n{'='*80}")
    print(f"Text: {text}")

    try:
        # Reference phoneme sequence from G2P; entries without a phoneme are dropped
        expected_dict = g2p.process_sentence(text)
        expected_phonemes = [
            entry.get('phoneme', '')
            for entry in expected_dict
            if entry.get('phoneme')
        ]

        # Phonemes recognized from the audio (decoded string split on whitespace)
        logits, _ = recognizer.recognize_phonemes(audio_path)
        recognized_phonemes = recognizer.decode_phonemes(logits).split()

        # OLD method (proportional)
        old_result = insert_word_boundaries(text, expected_phonemes, recognized_phonemes)

        # NEW method (MFA) additionally needs the audio duration: samples / sample rate
        waveform, sr = torchaudio.load(audio_path)
        audio_duration = waveform.shape[1] / sr
        mfa_kwargs = dict(
            text=text,
            recognized_phonemes=recognized_phonemes,
            audio_path=audio_path,
            logits=logits,
            audio_duration=audio_duration,
            mfa_aligner=mfa_aligner,
        )
        new_result = insert_word_boundaries_mfa(**mfa_kwargs)

        # Show at most the first 50 tokens of each sequence
        print(f"\nExpected:   {' '.join(expected_phonemes[:50])}...")
        print(f"OLD method: {' '.join(old_result[:50])}...")
        print(f"NEW method: {' '.join(new_result[:50])}...")

        # Where does each sequence place its '||' markers?
        expected_boundaries = _boundary_indices(expected_phonemes)
        old_boundaries = _boundary_indices(old_result)
        new_boundaries = _boundary_indices(new_result)

        print(f"\nBoundaries:")
        print(f"  Expected: {len(expected_boundaries)} at positions {expected_boundaries[:5]}...")
        print(f"  OLD:      {len(old_boundaries)} at positions {old_boundaries[:5]}...")
        print(f"  NEW:      {len(new_boundaries)} at positions {new_boundaries[:5]}...")

    except Exception as e:
        # Report the failure with a traceback and move on to the next record
        print(f"Error processing: {e}")
        import traceback
        traceback.print_exc()

Loading model: facebook/wav2vec2-xlsr-53-espeak-cv-ft
Model loaded on device: mps
Vocabulary size: 392
Sample IPA phonemes in vocab: ['n', 's', 't', 'ə', 'l', 'a', 'i', 'k', 'd', 'm', 'ɛ', 'ɾ', 'e', 'ɪ', 'p', 'o', 'ɐ', 'z', 'ð', 'f']

Text: Kannst du programmieren?
Loaded 143249 words from lexicon cache in 0.41 seconds.
Loaded 278343 words from DSL lexicon cache in 0.72 seconds.
Loaded phoneme normalization table from /Volumes/SSanDisk/SpeechRec-German-diagnostic/phoneme_normalization_table.json
  - Phoneme mappings: 4
  - Diacritics to remove: 6
  - Suprasegmentals to remove: 4
  - Invalid patterns: 4
  - Characters to remove: 34

Expected:   k a n s t || ː || p ʀ o ɡ ʀ a m iː ʀ ə n...
OLD method: k a n s t || uː || p ɾ oː ɡ ɾ a m iː r ə n...
NEW method: k a n s || t || uː p ɾ oː ɡ ɾ a m iː r ə n...

Boundaries:
  Expected: 2 at positions [5, 7]...
  OLD:      2 at positions [5, 7]...
  NEW:      2 at positions [4, 6]...

Text: nichts desto trotz ist das jedoch ein schönes zusammentre

In [None]:
# Calculate boundary accuracy
def calculate_boundary_accuracy(expected, predicted, tolerance=1):
    """
    Score how well predicted word boundaries line up with expected ones.

    Boundaries are the list indices at which the '||' marker occurs. Each
    expected boundary counts as correct when a predicted boundary lies within
    ``tolerance`` positions of it. Matching is one-to-one: a predicted
    boundary can be credited to at most one expected boundary, so a single
    prediction sitting between two expected boundaries no longer makes both
    of them count as correct.

    The score is recall-like: predicted boundaries that match nothing do not
    lower it (except when no boundaries are expected at all, see Returns).

    Args:
        expected: List with || markers
        predicted: List with || markers
        tolerance: Allowed position difference (±tolerance)

    Returns:
        Accuracy score (0.0 to 1.0): matched expected boundaries divided by
        the number of expected boundaries. When ``expected`` contains no
        boundary, returns 1.0 if ``predicted`` contains none either and 0.0
        otherwise.
    """
    # enumerate() yields indices in ascending order, so both lists are sorted
    expected_positions = [i for i, p in enumerate(expected) if p == '||']
    predicted_positions = [i for i, p in enumerate(predicted) if p == '||']

    if not expected_positions:
        return 1.0 if not predicted_positions else 0.0

    # Greedy two-pointer matching over the sorted positions. Because every
    # expected boundary has the same window width, pairing each one with the
    # leftmost still-unused prediction inside its window maximises the number
    # of one-to-one matches.
    correct = 0
    pred_idx = 0
    n_predicted = len(predicted_positions)
    for exp_pos in expected_positions:
        # Predictions left of this window cannot match any later expected
        # boundary either, so they are skipped for good.
        while pred_idx < n_predicted and predicted_positions[pred_idx] < exp_pos - tolerance:
            pred_idx += 1
        if pred_idx < n_predicted and predicted_positions[pred_idx] <= exp_pos + tolerance:
            correct += 1
            pred_idx += 1  # consume the prediction so it is not reused

    return correct / len(expected_positions)

# Test on all records
old_accuracies = []
new_accuracies = []

for i, row in test_df.iterrows():
    text = row['text']
    audio_path = row['audio_path_fixed']

    try:
        # Get expected phonemes
        expected_dict = g2p.process_sentence(text)
        expected_phonemes = [p.get('phoneme', '') for p in expected_dict if p.get('phoneme')]

        # Get recognized phonemes
        logits, _ = recognizer.recognize_phonemes(audio_path)
        recognized_str = recognizer.decode_phonemes(logits)
        recognized_phonemes = recognized_str.split()

        # OLD method
        old_result = insert_word_boundaries(text, expected_phonemes, recognized_phonemes)
        old_acc = calculate_boundary_accuracy(expected_phonemes, old_result)

        # NEW method (needs the audio duration: samples / sample rate)
        waveform, sr = torchaudio.load(audio_path)
        audio_duration = waveform.shape[1] / sr

        new_result = insert_word_boundaries_mfa(
            text=text,
            recognized_phonemes=recognized_phonemes,
            audio_path=audio_path,
            logits=logits,
            audio_duration=audio_duration,
            mfa_aligner=mfa_aligner
        )
        new_acc = calculate_boundary_accuracy(expected_phonemes, new_result)

    except Exception as e:
        # Best effort: report the failing record and leave it out of BOTH lists
        print(f"Error on record {i}: {e}")
        continue

    # Scores are recorded only after both methods succeeded, so the two lists
    # always describe the same set of records and the summary below compares
    # like with like.
    old_accuracies.append(old_acc)
    new_accuracies.append(new_acc)

print(f"\n{'='*80}")
print(f"Boundary Accuracy Results:")
if old_accuracies:
    print(f"  OLD method: {np.mean(old_accuracies):.2%} (mean)")
    print(f"  NEW method: {np.mean(new_accuracies):.2%} (mean)")
    print(f"\n  OLD method: {np.median(old_accuracies):.2%} (median)")
    print(f"  NEW method: {np.median(new_accuracies):.2%} (median)")
else:
    # np.mean / np.median of an empty list would only yield nan plus warnings
    print("  No records were processed successfully.")


Boundary Accuracy Results:
  OLD method: 89.58% (mean)
  NEW method: 71.33% (mean)

  OLD method: 100.00% (median)
  NEW method: 69.05% (median)
