# Extracting Unique Phonemes from MFA Dictionaries

This notebook extracts all unique phonemes from MFA dictionaries for the following languages:
- German
- English
- French
- Italian
- Russian
- Spanish

Goal: determine how many unique IPA phonemes the combined dictionary for these six languages contains, and compare that number with the 101 phonemes declared by CommonPhone.

In [22]:
import re
import subprocess
import traceback
from collections import defaultdict
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Set

# Path settings
# When the kernel was started inside `notebooks/`, the project root is one
# level up; otherwise the working directory itself is treated as the root.
_working_dir = Path.cwd()
if _working_dir.name == 'notebooks':
    PROJECT_ROOT = _working_dir.parent
else:
    PROJECT_ROOT = _working_dir

# Project-local folder holding the MFA pronunciation dictionaries.
MFA_DIR = PROJECT_ROOT.joinpath('MFA', 'pretrained_models', 'dictionary')

# Echo the resolved locations so a wrong working directory is spotted early.
for _label, _value in (
    ("Project", PROJECT_ROOT),
    ("MFA dictionaries", MFA_DIR),
    ("Exists", MFA_DIR.exists()),
):
    print(f"{_label}: {_value}")

Project: /Volumes/SSanDisk/SpeechRec-German
MFA dictionaries: /Volumes/SSanDisk/SpeechRec-German/MFA/pretrained_models/dictionary
Exists: True


In [23]:
def find_mfa_dict_path(dict_name: str, search_dirs: Optional[Iterable[Path]] = None) -> Optional[Path]:
    """
    Finds the path to an MFA dictionary.

    Args:
        dict_name: Dictionary name without extension, e.g. ``'german_mfa'``.
        search_dirs: Directories to check, in priority order. When omitted,
            the project-local ``MFA_DIR`` is checked first, followed by the
            two user-level MFA folders listed in the function body.

    Returns:
        Path of the first existing ``<dict_name>.dict`` file, or ``None`` when
        no searched directory contains one.
    """
    if search_dirs is None:
        search_dirs = [
            # Local project folder
            MFA_DIR,
            # Standard MFA locations
            Path.home() / "Documents" / "MFA" / "pretrained_models" / "dictionary",
            Path.home() / ".local" / "share" / "montreal-forced-alignment" / "pretrained_models" / "dictionary",
        ]

    file_name = f"{dict_name}.dict"
    for directory in search_dirs:
        candidate = Path(directory) / file_name
        if candidate.exists():
            return candidate

    # Callers test the result for None, hence the Optional return type.
    return None

# Silence / unknown-word tokens that the extraction skips; they are not phonemes.
_NON_PHONEME_TOKENS = frozenset({'sp', 'sil', 'spn'})

# A plain decimal number (optionally in scientific notation), i.e. a probability column.
_PROBABILITY_FIELD = re.compile(r'(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?')

# The dictionary example in the docstring below has 4 numeric columns after the word.
_MAX_PROBABILITY_FIELDS = 4


def _pronunciation_of(entry: str) -> List[str]:
    """Returns the phone tokens of one dictionary line (word and numeric columns removed)."""
    fields = entry.split()[1:]  # drop the word itself
    start = 0
    # Skip however many leading numeric columns this entry has (0 to 4),
    # instead of assuming that exactly four are always present.
    while (start < len(fields)
           and start < _MAX_PROBABILITY_FIELDS
           and _PROBABILITY_FIELD.fullmatch(fields[start])):
        start += 1
    return fields[start:]


def extract_phonemes_from_dict(dict_path: Optional[Path]) -> Set[str]:
    """
    Extracts all unique phonemes from an MFA dictionary.

    MFA dictionary format (example):
    word    probability1 probability2 probability3 probability4    phoneme1 phoneme2 phoneme3 ...
    's      0.99    0.24    0.19    1.16    s

    Entries that carry fewer probability columns (or none at all) are parsed
    as well: up to four leading numeric columns after the word are skipped and
    everything that follows is treated as the pronunciation.

    Args:
        dict_path: Path of the ``.dict`` file; ``None`` or a non-existent path
            yields an empty set.

    Returns:
        Set of phoneme symbols found in the file, without the silence /
        unknown-word tokens ``sp``, ``sil`` and ``spn``. An empty set is also
        returned when reading the file fails (the error is printed).
    """
    phonemes: Set[str] = set()

    if not dict_path or not dict_path.exists():
        return phonemes

    print(f"  Reading dictionary: {dict_path}")

    try:
        with open(dict_path, 'r', encoding='utf-8') as f:
            line_count = 0
            for line in f:
                line = line.strip()
                # Blank lines and lines starting with '#' are ignored.
                if not line or line.startswith('#'):
                    continue

                word_phonemes = _pronunciation_of(line)
                if not word_phonemes:  # word without any pronunciation
                    continue

                phonemes.update(
                    phoneme for phoneme in word_phonemes
                    if phoneme not in _NON_PHONEME_TOKENS
                )

                line_count += 1
                if line_count % 10000 == 0:
                    print(f"    Processed {line_count} lines...")

        print(f"  Extracted {len(phonemes)} unique phonemes")
        return phonemes

    except Exception as e:
        # Best effort: report the problem and let the caller continue with
        # the remaining dictionaries.
        print(f"  Error reading dictionary: {e}")
        traceback.print_exc()
        return set()

# Confirmation that this cell ran and both helper functions are now defined.
print("Phoneme extraction functions created")

Phoneme extraction functions created


In [24]:
# MFA dictionaries for each language
languages = {
    'german': 'german_mfa',
    'english': 'english_us_mfa',  # Can also try 'english_uk_mfa'
    'french': 'french_mfa',
    'italian': 'italian_mfa',
    'russian': 'russian_mfa',
    'spanish': 'spanish_mfa',
}

print("=" * 80)
print("SEARCHING FOR MFA DICTIONARIES")
print("=" * 80)
print()

# Check which dictionaries are available locally.
# available_dicts maps language -> dictionary path, or None when not found.
available_dicts: Dict[str, Optional[Path]] = {}
for lang_name, dict_name in languages.items():
    dict_path = find_mfa_dict_path(dict_name)
    available_dicts[lang_name] = dict_path
    if dict_path:
        print(f"✓ {lang_name.capitalize():12s} ({dict_name:20s}): found")
    else:
        print(f"✗ {lang_name.capitalize():12s} ({dict_name:20s}): not found")

print()
print(f"Found dictionaries: {sum(1 for v in available_dicts.values() if v is not None)} out of {len(languages)}")

# Attempt to download missing dictionaries
missing_dicts = {k: v for k, v in languages.items() if available_dicts[k] is None}
if missing_dicts:
    print()
    print("=" * 80)
    print("ATTEMPTING TO DOWNLOAD MISSING DICTIONARIES")
    print("=" * 80)
    print()

    # Check if MFA CLI is available
    try:
        result = subprocess.run(['mfa', '--version'], capture_output=True, text=True, timeout=5)
        mfa_available = result.returncode == 0
    except (OSError, subprocess.SubprocessError):
        # OSError: the `mfa` executable could not be started (e.g. not on PATH).
        # SubprocessError: includes the timeout of the version probe.
        # Anything else (notably KeyboardInterrupt) is deliberately not swallowed.
        mfa_available = False

    if mfa_available:
        print("MFA CLI available, attempting to download dictionaries...")
        for lang_name, dict_name in missing_dicts.items():
            print(f"  Downloading {dict_name}...")
            try:
                result = subprocess.run(
                    ['mfa', 'model', 'download', 'dictionary', dict_name],
                    capture_output=True,
                    text=True,
                    timeout=60
                )
                if result.returncode == 0:
                    # Check again after downloading
                    dict_path = find_mfa_dict_path(dict_name)
                    if dict_path:
                        available_dicts[lang_name] = dict_path
                        print(f"    ✓ Successfully downloaded and found")
                    else:
                        print(f"    ⚠ Downloaded but path not found")
                else:
                    # Only the beginning of stderr is shown to keep the output short.
                    print(f"    ✗ Error: {result.stderr[:100]}")
            except subprocess.TimeoutExpired:
                print(f"    ✗ Timeout while downloading")
            except Exception as e:
                # Best effort: one failed download must not stop the others.
                print(f"    ✗ Error: {e}")
        print()
    else:
        print("⚠ MFA CLI not available. Dictionaries can be downloaded manually:")
        print("   mfa model download dictionary <dict_name>")
        print()

SEARCHING FOR MFA DICTIONARIES

✓ German       (german_mfa          ): found
✓ English      (english_us_mfa      ): found
✓ French       (french_mfa          ): found
✗ Italian      (italian_mfa         ): not found
✓ Russian      (russian_mfa         ): found
✓ Spanish      (spanish_mfa         ): found

Found dictionaries: 5 out of 6

ATTEMPTING TO DOWNLOAD MISSING DICTIONARIES

⚠ MFA CLI not available. Dictionaries can be downloaded manually:
   mfa model download dictionary <dict_name>



In [25]:
# Extract phonemes from all available dictionaries
print("=" * 80)
print("EXTRACTING PHONEMES FROM DICTIONARIES")
print("=" * 80)
print()

# Per-language inventories, plus the union over every processed language.
all_phonemes_by_lang: Dict[str, Set[str]] = {}
all_unique_phonemes: Set[str] = set()

for lang_name, dict_path in available_dicts.items():
    if dict_path is None:
        # Keep an (empty) entry so later cells can still look the language up.
        print(f"⚠ {lang_name.capitalize():12s}: dictionary not found, skipping")
        all_phonemes_by_lang[lang_name] = set()
        continue

    print(f"📖 {lang_name.capitalize():12s}:")
    phonemes = extract_phonemes_from_dict(dict_path)
    all_phonemes_by_lang[lang_name] = phonemes
    all_unique_phonemes.update(phonemes)
    print()

EXTRACTING PHONEMES FROM DICTIONARIES

üìñ German      :
  Reading dictionary: /Volumes/SSanDisk/SpeechRec-German/MFA/pretrained_models/dictionary/german_mfa.dict
    Processed 10000 lines...
    Processed 20000 lines...
    Processed 30000 lines...
    Processed 40000 lines...
    Processed 50000 lines...
    Processed 60000 lines...
    Processed 70000 lines...
    Processed 80000 lines...
    Processed 90000 lines...
    Processed 100000 lines...
    Processed 110000 lines...
    Processed 120000 lines...
    Processed 130000 lines...
    Processed 140000 lines...
    Processed 150000 lines...
  Extracted 52 unique phonemes

üìñ English     :
  Reading dictionary: /Volumes/SSanDisk/SpeechRec-German/MFA/pretrained_models/dictionary/english_us_mfa.dict
    Processed 10000 lines...
    Processed 20000 lines...
    Processed 30000 lines...
    Processed 40000 lines...
    Processed 50000 lines...
    Processed 60000 lines...
  Extracted 78 unique phonemes

üìñ French      :
  Reading

In [26]:
# Report: per-language inventories first, then the union over all languages.
_rule = "=" * 80
print(_rule)
print("RESULTS")
print(_rule)
print()

print("Phonemes by language:")
print("-" * 80)
for lang_name, phonemes in all_phonemes_by_lang.items():
    count = len(phonemes)
    ordered = sorted(phonemes)
    print(f"{lang_name.capitalize():12s}: {count:3d} unique phonemes")
    if count > 60:
        # Large inventories are abbreviated to keep the cell output readable.
        print(f"  First 30: {ordered[:30]}")
        print(f"  ... and {count - 30} more phonemes")
    elif count > 0:
        print(f"  {ordered}")
    print()

print(_rule)
print("COMBINED SET OF UNIQUE PHONEMES")
print(_rule)
print(f"Total unique phonemes: {len(all_unique_phonemes)}")
print()
print("All unique phonemes (sorted):")
print(sorted(all_unique_phonemes))
print()

RESULTS

Phonemes by language:
--------------------------------------------------------------------------------
German      :  52 unique phonemes
  ['a', 'aj', 'aw', 'aÀê', 'b', 'c', 'c ∞', 'd', 'eÀê', 'f', 'h', 'iÀê', 'j', 'k', 'k ∞', 'l', 'lÃ©', 'm', 'mÃ©', 'n', 'nÃ©', 'oÀê', 'p', 'pf', 'p ∞', 's', 't', 'ts', 't É', 't ∞', 'uÀê', 'v', 'x', 'yÀê', 'z', '√ß', '√∏Àê', '≈ã', '≈ì', '…ê', '…î', '…î è', '…ô', '…õ', '…ü', '…°', '…™', '…≤', ' Å', ' É', ' ä', ' è']

English     :  78 unique phonemes
  First 30: ['aj', 'aw', 'b', 'b ≤', 'c', 'c ∞', 'c ∑', 'd', 'd í', 'd ≤', 'dÃ™', 'ej', 'f', 'f ≤', 'h', 'i', 'iÀê', 'j', 'k', 'k ∞', 'k ∑', 'l', 'm', 'm ≤', 'mÃ©', 'n', 'nÃ©', 'ow', 'p', 'p ∞']
  ... and 48 more phonemes

French      :  43 unique phonemes
  ['a', 'b', 'c', 'd', 'd í', 'e', 'f', 'i', 'j', 'k', 'l', 'm', 'm ≤', 'n', 'o', 'p', 's', 't', 'ts', 't É', 'u', 'v', 'w', 'y', 'z', '√∏', '≈ã', '≈ì', '…ë', '…ëÃÉ', '…î', '…îÃÉ', '…ô', '…õ', '…õÃÉ', '…ü', '…°', '…•', '…≤', ' Å', ' É', ' é', ' í

In [27]:
# Find common phonemes (present in all languages)
print("=" * 80)
print("ANALYSIS OF COMMON AND UNIQUE PHONEMES")
print("=" * 80)
print()

# Only languages for which dictionaries are available
# (a missing dictionary is represented by an empty phoneme set).
available_languages = {k: v for k, v in all_phonemes_by_lang.items() if len(v) > 0}

if len(available_languages) > 1:
    # Common phonemes (present in all available languages)
    common_phonemes = set.intersection(*available_languages.values())
    print(f"Common phonemes (present in all {len(available_languages)} languages):")
    print(f"  Count: {len(common_phonemes)}")
    print(f"  Phonemes: {sorted(common_phonemes)}")
    print()

    # Language-specific phonemes
    print("Language-specific phonemes:")
    print("-" * 80)
    for lang_name, phonemes in available_languages.items():
        # Union of the inventories of every *other* language ...
        other_phonemes = set().union(
            *(other_ph for other_lang, other_ph in available_languages.items()
              if other_lang != lang_name)
        )

        # ... so the difference is what occurs in this language only.
        specific = phonemes - other_phonemes
        if specific:
            print(f"{lang_name.capitalize():12s}: {len(specific)} unique phonemes")
            print(f"  {sorted(specific)}")
            print()
else:
    print(f"⚠ Insufficient data for comparison (only {len(available_languages)} language(s) available)")

ANALYSIS OF COMMON AND UNIQUE PHONEMES

Common phonemes (present in all 5 languages):
  Count: 10
  Phonemes: ['b', 'c', 'f', 'j', 'k', 'm', 'p', 'ɟ', 'ɡ', 'ɲ']

Language-specific phonemes:
--------------------------------------------------------------------------------
German      : 10 unique phonemes
  ['aː', 'eː', 'l̩', 'oː', 'pf', 'uː', 'yː', 'øː', 'ɔʏ', 'ʏ']

English     : 21 unique phonemes
  ['cʷ', 'ej', 'kʷ', 'ow', 'pʷ', 'tʷ', 'ɑː', 'ɒ', 'ɒː', 'ɔj', 'ɚ', 'ɝ', 'ɟʷ', 'ɡʷ', 'ɫ̩', 'ɱ', 'ɹ', 'ɾʲ', 'ɾ̃', 'ʉː', 'ʔ']

French      : 6 unique phonemes
  ['y', 'ø', 'ɑ̃', 'ɔ̃', 'ɛ̃', 'ɥ']

Russian     : 54 unique phonemes
  ['b ≤Àê', 'bÀê', 'cÀê', 'dz ≤Àê', 'd êÀê', 'd ≤Àê', 'dÃ™zÃ™', 'dÃ™zÃ™Àê', 'dÃ™Àê', 'f ≤Àê', 'fÀê', 'jÀê', 'kÀê', 'm ≤Àê', 'mÀê', 'nÃ™', 'nÃ™Àê', 'p ≤Àê', 'pÀê', 'r ≤', 'r ≤Àê', 'rÀê', 's ≤', 's ≤Àê', 'sÃ™', 'sÃ™Àê', 'ts ≤', 't…ï', 't…ïÀê', 't Ç', 't ÇÀê', 't ≤Àê', 'tÃ™sÃ™', 'tÃ™sÃ™Àê', 'tÃ™Àê', 'v ≤Àê', 'vÀê', 'z ≤', 'zÃ™

In [28]:
# Put the size of the combined MFA inventory next to CommonPhone's 101 phonemes.
_banner = "=" * 80
print(_banner)
print("COMPARISON WITH COMMONPHONE")
print(_banner)
print()

mfa_total = len(all_unique_phonemes)
print(f"MFA unique phonemes (for available languages): {mfa_total}")
print("CommonPhone declares: 101 phonemes")
print(f"Difference: {abs(mfa_total - 101)}")
print()

if mfa_total > 0:
    # Size of each language's inventory, in dictionary order.
    inventory_sizes = [len(inventory) for inventory in available_languages.values()]

    print("Statistics:")
    print(f"  - Languages processed: {len(available_languages)}")
    print(f"  - Average phonemes per language: {sum(inventory_sizes) / len(available_languages):.1f}")
    print(f"  - Minimum phonemes: {min(inventory_sizes)}")
    print(f"  - Maximum phonemes: {max(inventory_sizes)}")
    print()

    # Reminder of the variables that later cells rely on.
    print("Results saved in variables:")
    for _summary_line in (
        "  - all_phonemes_by_lang: dictionary {language: set_of_phonemes}",
        "  - all_unique_phonemes: set of all unique phonemes",
        "  - available_languages: languages with available dictionaries",
    ):
        print(_summary_line)

COMPARISON WITH COMMONPHONE

MFA unique phonemes (for available languages): 162
CommonPhone declares: 101 phonemes
Difference: 61

Statistics:
  - Languages processed: 5
  - Average phonemes per language: 60.0
  - Minimum phonemes: 35
  - Maximum phonemes: 92

Results saved in variables:
  - all_phonemes_by_lang: dictionary {language: set_of_phonemes}
  - all_unique_phonemes: set of all unique phonemes
  - available_languages: languages with available dictionaries


## Filtering Phonemes Similar to German

The cells below identify phonemes from the other languages that are similar, but not identical, to German phonemes. Non-native speakers tend to substitute such near-equivalents from their own language when speaking German, which makes them likely sources of pronunciation errors.

In [29]:
# German phonemes reference (from config or extracted from German dictionary)
german_phonemes = all_phonemes_by_lang.get('german', set())

if len(german_phonemes) == 0:
    # Fallback: use standard German IPA phonemes
    # NOTE(review): this fallback writes the velar plosive as ASCII 'g' and the
    # diphthongs with a non-syllabic mark (U+032F), whereas the inventory this
    # notebook extracts from the MFA German dictionary prints 'ɡ' (U+0261) and
    # 'aj' / 'aw' / 'ɔʏ'. Confirm which notation downstream code expects.
    german_phonemes = {
        # Monophthongs
        'a', 'aː', 'ɛ', 'e', 'eː', 'ə', 'ɪ', 'i', 'iː', 'ɔ', 'o', 'oː', 'œ', 'ø', 'øː',
        'ʊ', 'u', 'uː', 'ʏ', 'y', 'yː',
        # Diphthongs
        'aɪ̯', 'aʊ̯', 'ɔʏ̯',
        # Plosives and fricatives
        'b', 'p', 'd', 't', 'g', 'k', 'kʰ', 'f', 'v', 's', 'z', 'ʃ', 'ʒ', 'ç', 'x', 'h',
        # Sonorants and vocalized r
        'j', 'l', 'm', 'n', 'ŋ', 'ʁ', 'ɐ',
        # Affricates
        'pf', 'ts', 'tʃ', 'dʒ',
    }

print("=" * 80)
print("FILTERING PHONEMES SIMILAR TO GERMAN")
print("=" * 80)
print()
print(f"German phonemes reference: {len(german_phonemes)} phonemes")
print(f"German phonemes: {sorted(german_phonemes)}")
print()

FILTERING PHONEMES SIMILAR TO GERMAN

German phonemes reference: 52 phonemes
German phonemes: ['a', 'aj', 'aw', 'aÀê', 'b', 'c', 'c ∞', 'd', 'eÀê', 'f', 'h', 'iÀê', 'j', 'k', 'k ∞', 'l', 'lÃ©', 'm', 'mÃ©', 'n', 'nÃ©', 'oÀê', 'p', 'pf', 'p ∞', 's', 't', 'ts', 't É', 't ∞', 'uÀê', 'v', 'x', 'yÀê', 'z', '√ß', '√∏Àê', '≈ã', '≈ì', '…ê', '…î', '…î è', '…ô', '…õ', '…ü', '…°', '…™', '…≤', ' Å', ' É', ' ä', ' è']



In [30]:
def are_phonemes_similar(ph1: str, ph2: str, similarity_threshold: float = 0.7) -> bool:
    """
    Check if two phonemes are similar.
    Uses simple character-based similarity and phonetic knowledge.

    Two phonemes count as similar when any of these holds:
      1. they are identical;
      2. they are identical once length, non-syllabic, aspiration,
         palatalization and labialization marks are removed;
      3. their stripped forms are one of the listed substitution pairs;
      4. the Jaccard overlap of the characters of their stripped forms is at
         least ``similarity_threshold``.

    Args:
        ph1: First phoneme symbol.
        ph2: Second phoneme symbol.
        similarity_threshold: Minimum character-set overlap (0..1) for rule 4.

    Returns:
        True when the phonemes are considered similar, False otherwise.
    """
    # Exact match
    if ph1 == ph2:
        return True

    # Marks ignored for the base comparison:
    #   ː (U+02D0) length, ̯ (U+032F) non-syllabic, ʰ (U+02B0) aspiration,
    #   ʲ (U+02B2) palatalization, ʷ (U+02B7) labialization.
    ignored_marks = ('ː', '̯', 'ʰ', 'ʲ', 'ʷ')

    def normalize_phoneme(ph: str) -> str:
        """Returns ``ph`` without the ignored marks."""
        for mark in ignored_marks:
            ph = ph.replace(mark, '')
        return ph

    base1 = normalize_phoneme(ph1)
    base2 = normalize_phoneme(ph2)

    # Same base phoneme
    if base1 == base2:
        return True

    # Similar base phonemes (common substitutions); the relation is symmetric.
    similar_pairs = {
        ('i', 'ɪ'), ('u', 'ʊ'), ('e', 'ɛ'), ('o', 'ɔ'),
        ('a', 'ɑ'), ('y', 'ʏ'), ('ø', 'œ'),
        ('r', 'ʁ'), ('r', 'ɹ'), ('r', 'ɾ'),
        ('ç', 'ʃ'), ('x', 'h'), ('x', 'k'),
        ('ts', 's'), ('tʃ', 'ʃ'), ('dʒ', 'ʒ'),
    }
    if (base1, base2) in similar_pairs or (base2, base1) in similar_pairs:
        return True

    # Character-based similarity (for complex phonemes): shared characters
    # divided by all distinct characters of both stripped forms.
    if base1 and base2:
        chars1, chars2 = set(base1), set(base2)
        similarity = len(chars1 & chars2) / len(chars1 | chars2)
        if similarity >= similarity_threshold:
            return True

    return False

# Find phonemes from other languages that are similar to German phonemes
print("=" * 80)
print("FINDING SIMILAR PHONEMES FROM OTHER LANGUAGES")
print("=" * 80)
print()

similar_phonemes_by_lang = {}
problematic_pairs = {}  # {language: [(other_lang_phoneme, similar_german_phoneme), ...]}

# Candidates are scanned in sorted order so that the German partner recorded
# for each phoneme is the same on every run (set iteration order is not).
german_sorted = sorted(german_phonemes)

for lang_name, lang_phonemes in all_phonemes_by_lang.items():
    if lang_name == 'german' or len(lang_phonemes) == 0:
        continue

    similar = set()
    pairs = []

    for other_ph in sorted(lang_phonemes):
        if other_ph in german_phonemes:
            # An identical German phoneme is always the preferred partner.
            ger_match = other_ph
        else:
            # Otherwise take the first similar German phoneme, if any.
            ger_match = next(
                (ger_ph for ger_ph in german_sorted if are_phonemes_similar(other_ph, ger_ph)),
                None,
            )
        if ger_match is not None:
            similar.add(other_ph)
            pairs.append((other_ph, ger_match))

    similar_phonemes_by_lang[lang_name] = similar
    problematic_pairs[lang_name] = pairs

    print(f"{lang_name.capitalize():12s}: {len(similar)} similar phonemes out of {len(lang_phonemes)}")
    if len(similar) > 0:
        print(f"  Similar phonemes: {sorted(similar)}")
    print()

print("=" * 80)
print("PROBLEMATIC PHONEME PAIRS (may cause pronunciation errors)")
print("=" * 80)
print()
print("These are phonemes from other languages that are similar to German phonemes.")
print("Non-native speakers may incorrectly substitute these when trying to pronounce German.")
print()

for lang_name, pairs in problematic_pairs.items():
    if len(pairs) > 0:
        print(f"{lang_name.capitalize()}:")
        # Group by German phoneme
        by_german = defaultdict(list)
        for other_ph, ger_ph in pairs:
            by_german[ger_ph].append(other_ph)

        for ger_ph in sorted(by_german):
            other_phs = sorted(set(by_german[ger_ph]))
            print(f"  German '{ger_ph}' ← similar to: {other_phs}")
        print()

FINDING SIMILAR PHONEMES FROM OTHER LANGUAGES

English     : 52 similar phonemes out of 78
  Similar phonemes: ['aj', 'aw', 'b', 'b ≤', 'c', 'c ∞', 'c ∑', 'd', 'd ≤', 'f', 'f ≤', 'h', 'i', 'iÀê', 'j', 'k', 'k ∞', 'k ∑', 'l', 'm', 'm ≤', 'mÃ©', 'n', 'nÃ©', 'p', 'p ∞', 'p ≤', 'p ∑', 's', 't', 't É', 't ∞', 't ≤', 't ∑', 'v', 'v ≤', 'z', '√ß', '≈ã', '…ê', '…ë', '…ëÀê', '…ô', '…õ', '…ü', '…ü ∑', '…°', '…° ∑', '…™', '…≤', ' É', ' ä']

French      : 35 similar phonemes out of 43
  Similar phonemes: ['a', 'b', 'c', 'd', 'e', 'f', 'i', 'j', 'k', 'l', 'm', 'm ≤', 'n', 'o', 'p', 's', 't', 'ts', 't É', 'u', 'v', 'y', 'z', '√∏', '≈ã', '≈ì', '…ë', '…î', '…ô', '…õ', '…ü', '…°', '…≤', ' Å', ' É']

Russian     : 56 similar phonemes out of 92
  Similar phonemes: ['a', 'b', 'b ≤', 'b ≤Àê', 'bÀê', 'c', 'cÀê', 'd ≤', 'd ≤Àê', 'e', 'f', 'f ≤', 'f ≤Àê', 'fÀê', 'i', 'j', 'jÀê', 'k', 'kÀê', 'm', 'm ≤', 'm ≤Àê', 'mÀê', 'o', 'p', 'p ≤', 'p ≤Àê', 'pÀê', 'r', 'r ≤', 'r ≤Àê', 'rÀê', 's ≤', 's ≤Àê', 'ts ≤', 't ≤', 

In [31]:
# Training inventory: every German phoneme plus each foreign phoneme that was
# flagged as similar to a German one. Covering both lets a model recognize the
# correct German sounds as well as the typical learner substitutions.
filtered_phonemes = set(german_phonemes).union(*similar_phonemes_by_lang.values())

# Members of the training inventory that are not German phonemes themselves.
non_german_additions = filtered_phonemes - german_phonemes

_divider = "=" * 80
print(_divider)
print("FILTERED PHONEME SET FOR TRAINING")
print(_divider)
print()
print("This set includes:")
print(f"  - All German phonemes: {len(german_phonemes)}")
print(f"  - Similar phonemes from other languages: {len(non_german_additions)}")
print(f"  - Total filtered phonemes: {len(filtered_phonemes)}")
print()
for _rationale_line in (
    "Rationale:",
    "  Including similar phonemes helps the model:",
    "  1. Recognize when learners substitute their native phonemes for German ones",
    "  2. Provide better feedback on pronunciation errors",
    "  3. Distinguish between correct German phonemes and common mispronunciations",
):
    print(_rationale_line)
print()
print("Filtered phonemes (sorted):")
print(sorted(filtered_phonemes))
print()

# Size comparison of the candidate inventories
print(_divider)
print("COMPARISON")
print(_divider)
print()
print(f"Original MFA unique phonemes (all languages): {len(all_unique_phonemes)}")
print(f"German phonemes only: {len(german_phonemes)}")
print(f"Filtered phonemes (German + similar): {len(filtered_phonemes)}")
print("CommonPhone phonemes: 101")
print()
print(f"Filtered set is {len(filtered_phonemes) - 101:+d} phonemes compared to CommonPhone")
print(f"Filtered set includes {len(non_german_additions)} non-German phonemes")
print("  (these are similar to German and may be used incorrectly by learners)")
print()

# Reminder of the variables this section leaves behind
for _variable_note in (
    "Results saved in variables:",
    "  - german_phonemes: set of German phonemes",
    "  - similar_phonemes_by_lang: dict {language: set of similar phonemes}",
    "  - problematic_pairs: dict {language: list of (other_ph, german_ph) pairs}",
    "  - filtered_phonemes: set of German + similar phonemes (recommended for training)",
):
    print(_variable_note)

FILTERED PHONEME SET FOR TRAINING

This set includes:
  - All German phonemes: 52
  - Similar phonemes from other languages: 47
  - Total filtered phonemes: 99

Rationale:
  Including similar phonemes helps the model:
  1. Recognize when learners substitute their native phonemes for German ones
  2. Provide better feedback on pronunciation errors
  3. Distinguish between correct German phonemes and common mispronunciations

Filtered phonemes (sorted):
['a', 'aj', 'aw', 'aÀê', 'b', 'b ≤', 'b ≤Àê', 'bÀê', 'c', 'c ∞', 'c ∑', 'cÀê', 'd', 'd ≤', 'd ≤Àê', 'e', 'eÀê', 'f', 'f ≤', 'f ≤Àê', 'fÀê', 'h', 'i', 'iÀê', 'j', 'jÀê', 'k', 'k ∞', 'k ∑', 'kÀê', 'l', 'lÃ©', 'm', 'm ≤', 'm ≤Àê', 'mÀê', 'mÃ©', 'n', 'nÃ©', 'o', 'oÀê', 'p', 'pf', 'p ∞', 'p ≤', 'p ≤Àê', 'p ∑', 'pÀê', 'r', 'r ≤', 'r ≤Àê', 'rÀê', 's', 's ≤', 's ≤Àê', 't', 'ts', 'ts ≤', 't É', 't ∞', 't ≤', 't ≤Àê', 't ∑', 'u', 'uÀê', 'v', 'v ≤', 'v ≤Àê', 'vÀê', 'x', 'y', 'yÀê', 'z', 'z ≤', '√ß', '√∏', '√∏Àê', '≈ã', '≈ì', '…ê', '…ë', '…ëÀê', '…î'