# Phoneme Symbol Analysis from Dictionaries and Model

This notebook collects and compares all possible phoneme and special symbol representations from:
1. **IPA-Dict-DSL** dictionary
2. **MFA Dictionary** dictionary  
3. **Phoneme recognition model** (Wav2Vec2)

Goal: Identify unique symbols for each source and ensure consistent representation of the same phonemes.

In [13]:
import sys
from pathlib import Path
import re
import json
from typing import Set, Dict, List

# Add project path
project_root = Path('/Volumes/SSanDisk/SpeechRec-German-diagnostic')
sys.path.insert(0, str(project_root))

# Import modules
import config
from modules.g2p_module import DSLG2P, LexiconG2P
from modules.phoneme_recognition import get_phoneme_recognizer

print("Imports completed successfully")

Imports completed successfully


In [14]:
def extract_all_characters_from_text(text: str) -> Set[str]:
    """
    Extract all unique characters from text, including diacritics and special symbols.
    
    Args:
        text: Text to analyze
        
    Returns:
        Set of all unique characters
    """
    return set(text)


def extract_characters_from_dsl_lexicon(dsl_path: Path) -> Set[str]:
    """
    Extract all unique characters from IPA-Dict-DSL dictionary.
    
    Args:
        dsl_path: Path to DSL file
        
    Returns:
        Set of all unique characters
    """
    all_chars: Set[str] = set()
    
    if not dsl_path.exists():
        print(f"⚠ DSL file not found: {dsl_path}")
        return all_chars
    
    print(f"Loading characters from DSL dictionary: {dsl_path}")
    
    try:
        with open(dsl_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.rstrip('\n\r')
                
                # If line without indentation - it's a new word entry
                if line and not line.startswith(' ') and not line.startswith('\t'):
                    continue
                
                # If line with indentation - it's a transcription
                elif line and (line.startswith(' ') or line.startswith('\t')):
                    # Extract IPA from [m1]...[/m] tags
                    match = re.search(r'\[m\d*\](.*?)\[/m\]', line)
                    if match:
                        raw_transcription = match.group(1).strip()
                        if raw_transcription:
                            # Add all characters from transcription
                            all_chars.update(extract_all_characters_from_text(raw_transcription))
        
        print(f"✓ Extracted {len(all_chars)} unique characters from DSL dictionary")
        return all_chars
    except Exception as e:
        print(f"✗ Error loading DSL dictionary: {e}")
        return all_chars


def extract_characters_from_mfa_lexicon(mfa_path: Path) -> Set[str]:
    """
    Extract all unique characters from MFA dictionary.
    
    Args:
        mfa_path: Path to MFA dictionary
        
    Returns:
        Set of all unique characters
    """
    all_chars: Set[str] = set()
    
    if not mfa_path.exists():
        print(f"⚠ MFA file not found: {mfa_path}")
        return all_chars
    
    print(f"Loading characters from MFA dictionary: {mfa_path}")
    
    try:
        with open(mfa_path, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) >= 2:
                    # Process all parts after the word (phonemes)
                    for part in parts[1:]:
                        try:
                            # Skip numbers (probabilities)
                            float(part)
                            continue
                        except ValueError:
                            # This is a phoneme - add all its characters
                            all_chars.update(extract_all_characters_from_text(part))
        
        print(f"✓ Extracted {len(all_chars)} unique characters from MFA dictionary")
        return all_chars
    except Exception as e:
        print(f"✗ Error loading MFA dictionary: {e}")
        return all_chars


def extract_characters_from_model_vocab(model_recognizer) -> Set[str]:
    """
    Extract all unique characters from model vocabulary.
    
    Args:
        model_recognizer: PhonemeRecognizer instance
        
    Returns:
        Set of all unique characters
    """
    all_chars: Set[str] = set()
    
    try:
        vocab = model_recognizer.get_vocab()
        print(f"Loading characters from model: {model_recognizer.model_name}")
        print(f"Model vocabulary size: {len(vocab)}")
        
        # Special tokens to exclude
        skip_tokens = {
            '[PAD]', '[UNK]', '<pad>', '<unk>', '<blank>', '[BLANK]', 
            '<s>', '</s>', '<|endoftext|>', '|', 'h#', 'spn', '',
            '<sos>', '<eos>', '[CLS]', '[SEP]', '[MASK]'
        }
        
        for token in vocab.keys():
            # Skip special tokens
            if token in skip_tokens:
                continue
            # Skip tokens that look like service tokens
            if token.startswith('<') and token.endswith('>'):
                continue
            if token.startswith('[') and token.endswith(']'):
                continue
            
            # Add all characters from token
            all_chars.update(extract_all_characters_from_text(token))
        
        print(f"✓ Extracted {len(all_chars)} unique characters from model")
        return all_chars
    except Exception as e:
        print(f"✗ Error loading model vocabulary: {e}")
        import traceback
        traceback.print_exc()
        return all_chars


print("Character extraction functions created")

Character extraction functions created


In [15]:
# Load characters from IPA-Dict-DSL dictionary
print("=" * 80)
print("1. LOADING CHARACTERS FROM IPA-Dict-DSL DICTIONARY")
print("=" * 80)
dsl_chars = extract_characters_from_dsl_lexicon(config.IPA_DSL_LEXICON_PATH)
print(f"Total unique characters: {len(dsl_chars)}")
print()

1. LOADING CHARACTERS FROM IPA-Dict-DSL DICTIONARY
Loading characters from DSL dictionary: /Volumes/SSanDisk/SpeechRec-German-diagnostic/data/dictionaries/de_ipa.dsl
✓ Extracted 97 unique characters from DSL dictionary
Total unique characters: 97



In [16]:
# Load characters from MFA dictionary
print("=" * 80)
print("2. LOADING CHARACTERS FROM MFA DICTIONARY")
print("=" * 80)
mfa_chars = extract_characters_from_mfa_lexicon(config.MFA_GERMAN_LEXICON_PATH)
print(f"Total unique characters: {len(mfa_chars)}")
print()

2. LOADING CHARACTERS FROM MFA DICTIONARY
Loading characters from MFA dictionary: /Volumes/SSanDisk/SpeechRec-German-diagnostic/data/dictionaries/german_mfa.dict
✓ Extracted 42 unique characters from MFA dictionary
Total unique characters: 42



In [17]:
# Load characters from phoneme recognition model
print("=" * 80)
print("3. LOADING CHARACTERS FROM PHONEME RECOGNITION MODEL")
print("=" * 80)
try:
    model_recognizer = get_phoneme_recognizer(
        model_name=config.MODEL_NAME,
        device=config.MODEL_DEVICE if config.MODEL_DEVICE != "auto" else None
    )
    model_chars = extract_characters_from_model_vocab(model_recognizer)
    print(f"Total unique characters: {len(model_chars)}")
except Exception as e:
    print(f"✗ Error loading model: {e}")
    import traceback
    traceback.print_exc()
    model_chars: Set[str] = set()
print()

3. LOADING CHARACTERS FROM PHONEME RECOGNITION MODEL
Loading characters from model: facebook/wav2vec2-xlsr-53-espeak-cv-ft
Model vocabulary size: 392
✓ Extracted 106 unique characters from model
Total unique characters: 106



In [18]:
# Compare and analyze characters
print("=" * 80)
print("4. CHARACTER COMPARISON AND ANALYSIS")
print("=" * 80)

# Union of all characters
all_chars = dsl_chars | mfa_chars | model_chars
print(f"Total unique characters across all sources: {len(all_chars)}")
print()

# Common characters
common_all = dsl_chars & mfa_chars & model_chars
common_dsl_mfa = dsl_chars & mfa_chars
common_dsl_model = dsl_chars & model_chars
common_mfa_model = mfa_chars & model_chars

print(f"Characters present in all three sources: {len(common_all)}")
print(f"Characters common to DSL and MFA: {len(common_dsl_mfa)}")
print(f"Characters common to DSL and model: {len(common_dsl_model)}")
print(f"Characters common to MFA and model: {len(common_mfa_model)}")
print()

# Unique characters for each source
only_dsl = dsl_chars - mfa_chars - model_chars
only_mfa = mfa_chars - dsl_chars - model_chars
only_model = model_chars - dsl_chars - mfa_chars

print(f"Characters unique to DSL: {len(only_dsl)}")
print(f"Characters unique to MFA: {len(only_mfa)}")
print(f"Characters unique to model: {len(only_model)}")
print()

4. CHARACTER COMPARISON AND ANALYSIS
Total unique characters across all sources: 146

Characters present in all three sources: 38
Characters common to DSL and MFA: 39
Characters common to DSL and model: 57
Characters common to MFA and model: 41

Characters unique to DSL: 39
Characters unique to MFA: 0
Characters unique to model: 46



In [19]:
# Detailed character output
print("=" * 80)
print("5. DETAILED CHARACTER OUTPUT")
print("=" * 80)


def print_char_set(name: str, char_set: Set[str], max_display: int = 100) -> None:
    """Print a set of characters with their Unicode codes."""
    if not char_set:
        print(f"\n{name}: (empty)")
        return
    
    sorted_chars = sorted(char_set)
    print(f"\n{name} ({len(char_set)} characters):")
    print("-" * 80)
    
    # Group by categories
    ipa_vowels: List[str] = []
    ipa_consonants: List[str] = []
    ipa_diacritics: List[str] = []
    ipa_suprasegmentals: List[str] = []
    ipa_other: List[str] = []
    ascii_chars: List[str] = []
    
    for char in sorted_chars:
        # Get Unicode code
        unicode_code = ord(char)
        char_info = f"'{char}' (U+{unicode_code:04X})"
        
        # Classify characters
        if char in 'aeiouyøœɛɔɪʊʏəɐ':
            ipa_vowels.append(char_info)
        elif char in 'pbtdkgfvszʃʒçxhmnlrʁŋ':
            ipa_consonants.append(char_info)
        elif char in 'ːˌˈ̯͜':
            ipa_suprasegmentals.append(char_info)
        elif 0x0300 <= ord(char) <= 0x036F:  # Combining diacritics
            ipa_diacritics.append(char_info)
        elif char.isascii() and char.isprintable():
            ascii_chars.append(char_info)
        else:
            ipa_other.append(char_info)
    
    # Print by categories
    if ipa_vowels:
        print(f"\nIPA Vowels ({len(ipa_vowels)}):")
        for i, char_info in enumerate(ipa_vowels[:max_display], 1):
            print(f"  {i:3d}. {char_info}")
        if len(ipa_vowels) > max_display:
            print(f"  ... and {len(ipa_vowels) - max_display} more characters")
    
    if ipa_consonants:
        print(f"\nIPA Consonants ({len(ipa_consonants)}):")
        for i, char_info in enumerate(ipa_consonants[:max_display], 1):
            print(f"  {i:3d}. {char_info}")
        if len(ipa_consonants) > max_display:
            print(f"  ... and {len(ipa_consonants) - max_display} more characters")
    
    if ipa_diacritics:
        print(f"\nDiacritical Marks ({len(ipa_diacritics)}):")
        for i, char_info in enumerate(ipa_diacritics[:max_display], 1):
            print(f"  {i:3d}. {char_info}")
        if len(ipa_diacritics) > max_display:
            print(f"  ... and {len(ipa_diacritics) - max_display} more characters")
    
    if ipa_suprasegmentals:
        print(f"\nSuprasegmental Marks ({len(ipa_suprasegmentals)}):")
        for i, char_info in enumerate(ipa_suprasegmentals[:max_display], 1):
            print(f"  {i:3d}. {char_info}")
        if len(ipa_suprasegmentals) > max_display:
            print(f"  ... and {len(ipa_suprasegmentals) - max_display} more characters")
    
    if ascii_chars:
        print(f"\nASCII Characters ({len(ascii_chars)}):")
        for i, char_info in enumerate(ascii_chars[:max_display], 1):
            print(f"  {i:3d}. {char_info}")
        if len(ascii_chars) > max_display:
            print(f"  ... and {len(ascii_chars) - max_display} more characters")
    
    if ipa_other:
        print(f"\nOther Symbols ({len(ipa_other)}):")
        for i, char_info in enumerate(ipa_other[:max_display], 1):
            print(f"  {i:3d}. {char_info}")
        if len(ipa_other) > max_display:
            print(f"  ... and {len(ipa_other) - max_display} more characters")


# Print all characters from each source
print_char_set("DSL Dictionary", dsl_chars)
print_char_set("MFA Dictionary", mfa_chars)
print_char_set("Model", model_chars)

5. DETAILED CHARACTER OUTPUT

DSL Dictionary (97 characters):
--------------------------------------------------------------------------------

IPA Vowels (15):
    1. 'a' (U+0061)
    2. 'e' (U+0065)
    3. 'i' (U+0069)
    4. 'o' (U+006F)
    5. 'u' (U+0075)
    6. 'y' (U+0079)
    7. 'ø' (U+00F8)
    8. 'œ' (U+0153)
    9. 'ɐ' (U+0250)
   10. 'ɔ' (U+0254)
   11. 'ə' (U+0259)
   12. 'ɛ' (U+025B)
   13. 'ɪ' (U+026A)
   14. 'ʊ' (U+028A)
   15. 'ʏ' (U+028F)

IPA Consonants (21):
    1. 'b' (U+0062)
    2. 'd' (U+0064)
    3. 'f' (U+0066)
    4. 'g' (U+0067)
    5. 'h' (U+0068)
    6. 'k' (U+006B)
    7. 'l' (U+006C)
    8. 'm' (U+006D)
    9. 'n' (U+006E)
   10. 'p' (U+0070)
   11. 'r' (U+0072)
   12. 's' (U+0073)
   13. 't' (U+0074)
   14. 'v' (U+0076)
   15. 'x' (U+0078)
   16. 'z' (U+007A)
   17. 'ç' (U+00E7)
   18. 'ŋ' (U+014B)
   19. 'ʁ' (U+0281)
   20. 'ʃ' (U+0283)
   21. 'ʒ' (U+0292)

Diacritical Marks (6):
    1. '̃' (U+0303)
    2. '̆' (U+0306)
    3. '̍' (U+030D)
    4. '̥' (U

In [20]:
# Unique characters for each source
print("=" * 80)
print("6. UNIQUE CHARACTERS FOR EACH SOURCE")
print("=" * 80)

print_char_set("Only in DSL dictionary", only_dsl)
print_char_set("Only in MFA dictionary", only_mfa)
print_char_set("Only in model", only_model)

6. UNIQUE CHARACTERS FOR EACH SOURCE

Only in DSL dictionary (39 characters):
--------------------------------------------------------------------------------

IPA Consonants (1):
    1. 'g' (U+0067)

Diacritical Marks (4):
    1. '̆' (U+0306)
    2. '̍' (U+030D)
    3. '̥' (U+0325)
    4. '͡' (U+0361)

Suprasegmental Marks (4):
    1. 'ˈ' (U+02C8)
    2. 'ˌ' (U+02CC)
    3. '̯' (U+032F)
    4. '͜' (U+035C)

ASCII Characters (18):
    1. ''' (U+0027)
    2. '(' (U+0028)
    3. ')' (U+0029)
    4. '/' (U+002F)
    5. 'A' (U+0041)
    6. 'C' (U+0043)
    7. 'D' (U+0044)
    8. 'E' (U+0045)
    9. 'F' (U+0046)
   10. 'O' (U+004F)
   11. 'Q' (U+0051)
   12. 'R' (U+0052)
   13. 'T' (U+0054)
   14. 'U' (U+0055)
   15. '\' (U+005C)
   16. ']' (U+005D)
   17. '|' (U+007C)
   18. '~' (U+007E)

Other Symbols (12):
    1. 'ã' (U+00E3)
    2. 'õ' (U+00F5)
    3. 'ĭ' (U+012D)
    4. 'ɘ' (U+0258)
    5. 'ɱ' (U+0271)
    6. 'ʀ' (U+0280)
    7. 'ʧ' (U+02A7)
    8. 'ˀ' (U+02C0)
    9. 'ˑ' (U+02D1)
   1

In [21]:
# Analysis of potential issues and recommendations
print("=" * 80)
print("7. ANALYSIS AND RECOMMENDATIONS")
print("=" * 80)

# Check for similar symbols that may be different variants of the same phoneme
print("\nPotential encoding issues:")
print("-" * 80)

# Check symbol variants (e.g., different variants of 'a', 'g', 'r')
variants_to_check = {
    'a': ['a', 'ɑ', 'ɐ'],
    'g': ['g', 'ɡ'],
    'r': ['r', 'ɾ', 'ʁ'],
    'e': ['e', 'ɛ', 'ɜ'],
    'i': ['i', 'ɪ'],
    'u': ['u', 'ʊ'],
    'o': ['o', 'ɔ'],
}

for base_char, variants in variants_to_check.items():
    found_variants = []
    for variant in variants:
        if variant in all_chars:
            sources = []
            if variant in dsl_chars:
                sources.append("DSL")
            if variant in mfa_chars:
                sources.append("MFA")
            if variant in model_chars:
                sources.append("Model")
            found_variants.append((variant, sources))
    
    if len(found_variants) > 1:
        print(f"\nVariants of symbol '{base_char}':")
        for variant, sources in found_variants:
            print(f"  '{variant}' (U+{ord(variant):04X}) - found in: {', '.join(sources)}")

# Check diacritical marks
print("\n\nDiacritical and special marks:")
print("-" * 80)
diacritics = [c for c in all_chars if 0x0300 <= ord(c) <= 0x036F]
if diacritics:
    print(f"Found {len(diacritics)} diacritical marks:")
    for diacritic in sorted(diacritics):
        sources = []
        if diacritic in dsl_chars:
            sources.append("DSL")
        if diacritic in mfa_chars:
            sources.append("MFA")
        if diacritic in model_chars:
            sources.append("Model")
        print(f"  '{diacritic}' (U+{ord(diacritic):04X}) - found in: {', '.join(sources)}")

# Suprasegmental marks
suprasegmentals = [c for c in all_chars if c in 'ːˌˈ̯͜']
if suprasegmentals:
    print(f"\nFound {len(suprasegmentals)} suprasegmental marks:")
    for sup in sorted(suprasegmentals):
        sources = []
        if sup in dsl_chars:
            sources.append("DSL")
        if sup in mfa_chars:
            sources.append("MFA")
        if sup in model_chars:
            sources.append("Model")
        print(f"  '{sup}' (U+{ord(sup):04X}) - found in: {', '.join(sources)}")

print("\n\nRecommendations:")
print("-" * 80)
print("1. Unique symbols that are not found elsewhere:")
if only_dsl:
    print(f"   - DSL: {len(only_dsl)} symbols - decide: ignore or map")
if only_mfa:
    print(f"   - MFA: {len(only_mfa)} symbols - decide: ignore or map")
if only_model:
    print(f"   - Model: {len(only_model)} symbols - decide: ignore or map")

print("\n2. Symbol variants (e.g., different 'a', 'g', 'r'):")
print("   - Need to decide which variants to use for normalization")
print("   - Important: do not simplify close but different phonemes")

print("\n3. Special symbols (diacritics, suprasegmentals):")
print("   - Decide how to handle: keep or remove")
print("   - Note that some may be important for phoneme distinction")

7. ANALYSIS AND RECOMMENDATIONS

Potential encoding issues:
--------------------------------------------------------------------------------

Variants of symbol 'a':
  'a' (U+0061) - found in: DSL, MFA, Model
  'ɑ' (U+0251) - found in: DSL, Model
  'ɐ' (U+0250) - found in: DSL, MFA, Model

Variants of symbol 'g':
  'g' (U+0067) - found in: DSL
  'ɡ' (U+0261) - found in: DSL, MFA, Model

Variants of symbol 'r':
  'r' (U+0072) - found in: DSL, Model
  'ɾ' (U+027E) - found in: DSL, Model
  'ʁ' (U+0281) - found in: DSL, MFA, Model

Variants of symbol 'e':
  'e' (U+0065) - found in: DSL, MFA, Model
  'ɛ' (U+025B) - found in: DSL, MFA, Model
  'ɜ' (U+025C) - found in: Model

Variants of symbol 'i':
  'i' (U+0069) - found in: DSL, MFA, Model
  'ɪ' (U+026A) - found in: DSL, MFA, Model

Variants of symbol 'u':
  'u' (U+0075) - found in: DSL, MFA, Model
  'ʊ' (U+028A) - found in: DSL, MFA, Model

Variants of symbol 'o':
  'o' (U+006F) - found in: DSL, MFA, Model
  'ɔ' (U+0254) - found in: DSL, M

## 9. Creating Unified Phoneme Normalization Table

This cell creates a unified normalization table that will be used across all project modules to synchronize phoneme extraction from dictionaries and the model.

**Normalization Strategy:**
- **Unicode normalization**: g → ɡ (only mandatory mapping)
- **Diacritics and special symbols**: keep only those present in the model
- **Affricates**: expand into phoneme sequences
- **Phonemes**: do not map different phonemes, even if they are close (a/ɑ/ɐ, r/ɾ/ʁ, etc.)

In [None]:
# Create unified phoneme normalization table
print("=" * 80)
print("9. CREATING UNIFIED PHONEME NORMALIZATION TABLE")
print("=" * 80)

# 1. Phoneme mapping (Unicode normalization)
# Only mandatory mapping: g → ɡ
phoneme_mapping = {
    'g': 'ɡ',  # U+0067 → U+0261 (LATIN SMALL LETTER G → IPA SMALL LETTER G)
}

# 2. Diacritical marks: decision (keep/remove)
# Rule: keep only those present in the model
diacritics_decision = {}

# Diacritical marks from analysis
diacritics_list = [
    ('̃', 0x0303, 'COMBINING TILDE'),
    ('̆', 0x0306, 'COMBINING BREVE'),
    ('̊', 0x030A, 'COMBINING RING ABOVE'),
    ('̍', 0x030D, 'COMBINING VERTICAL LINE ABOVE'),
    ('̝', 0x031D, 'COMBINING UP TACK BELOW'),
    ('̞', 0x031E, 'COMBINING DOWN TACK BELOW'),
    ('̥', 0x0325, 'COMBINING RING BELOW'),
    ('̩', 0x0329, 'COMBINING VERTICAL LINE BELOW'),
    ('̪', 0x032A, 'COMBINING BRIDGE BELOW'),
    ('̯', 0x032F, 'COMBINING INVERTED BREVE BELOW'),
    ('͜', 0x035C, 'COMBINING DOUBLE BREVE BELOW'),
    ('͡', 0x0361, 'COMBINING DOUBLE INVERTED BREVE'),
]

for char, code, name in diacritics_list:
    in_model = char in model_chars
    in_dsl = char in dsl_chars
    in_mfa = char in mfa_chars
    in_dicts = in_dsl or in_mfa
    
    # Decision: keep only if present in model
    decision = "keep" if in_model else "remove"
    reason = ""
    if in_model and in_dicts:
        reason = "Present in model and dictionaries"
    elif in_model:
        reason = "Present in model (may be used in recognition)"
    else:
        reason = "Not in model - will be removed from dictionaries"
    
    diacritics_decision[char] = {
        "unicode": f"U+{code:04X}",
        "name": name,
        "in_model": in_model,
        "in_dsl": in_dsl,
        "in_mfa": in_mfa,
        "decision": decision,
        "reason": reason
    }

# 3. Suprasegmental marks: decision (keep/remove)
suprasegmentals_decision = {}

suprasegmentals_list = [
    ('ˈ', 0x02C8, 'MODIFIER LETTER PRIMARY STRESS'),
    ('ˌ', 0x02CC, 'MODIFIER LETTER SECONDARY STRESS'),
    ('ː', 0x02D0, 'MODIFIER LETTER TRIANGULAR COLON'),
    ('̯', 0x032F, 'COMBINING INVERTED BREVE BELOW'),  # also a diacritic
    ('͜', 0x035C, 'COMBINING DOUBLE BREVE BELOW'),  # also a diacritic
]

for char, code, name in suprasegmentals_list:
    in_model = char in model_chars
    in_dsl = char in dsl_chars
    in_mfa = char in mfa_chars
    in_dicts = in_dsl or in_mfa
    
    # Decision: keep only if present in model
    decision = "keep" if in_model else "remove"
    reason = ""
    if in_model and in_dicts:
        reason = "Present in model and dictionaries"
    elif in_model:
        reason = "Present in model (may be used in recognition)"
    else:
        reason = "Not in model - will be removed from dictionaries"
    
    suprasegmentals_decision[char] = {
        "unicode": f"U+{code:04X}",
        "name": name,
        "in_model": in_model,
        "in_dsl": in_dsl,
        "in_mfa": in_mfa,
        "decision": decision,
        "reason": reason
    }

# 4. Affricates and composite symbols: expansion rules
# If model doesn't use ligatures (͡, ͜), expand into sequence
affricates_expansion = {
    't͡s': 't s',  # or 'ts' if model supports it
    'd͡ʒ': 'd ʒ',  # or 'dʒ' if model supports it
    't͜s': 't s',
    'd͜ʒ': 'd ʒ',
    'pf': 'p f',  # if expansion needed
    'ts': 't s',  # if expansion needed
}

# 5. Phonemes that should NOT be mapped (preserve distinctions)
# These phonemes are phonologically distinct and should remain separate
phonemes_preserve_distinctions = {
    'vowels': {
        'a': 'a (U+0061)',
        'ɑ': 'ɑ (U+0251) - open back unrounded',
        'ɐ': 'ɐ (U+0250) - near-open central',
    },
    'e_variants': {
        'e': 'e (U+0065)',
        'ɛ': 'ɛ (U+025B) - open-mid front',
        'ɜ': 'ɜ (U+025C) - open-mid central',
    },
    'i_variants': {
        'i': 'i (U+0069)',
        'ɪ': 'ɪ (U+026A) - near-close near-front',
    },
    'u_variants': {
        'u': 'u (U+0075)',
        'ʊ': 'ʊ (U+028A) - near-close near-back',
    },
    'o_variants': {
        'o': 'o (U+006F)',
        'ɔ': 'ɔ (U+0254) - open-mid back',
    },
    'r_variants': {
        'r': 'r (U+0072) - alveolar trill',
        'ɾ': 'ɾ (U+027E) - alveolar tap',
        'ʁ': 'ʁ (U+0281) - uvular fricative',
    },
}

# 6. Symbols to remove from dictionaries (if not in model)
chars_to_remove_from_dicts = []

# Characters that are only in dictionaries, but not in model
for char in (dsl_chars | mfa_chars) - model_chars:
    # Skip ASCII formatting symbols (/, [, ], |, etc.)
    if char.isascii() and char in '()[]/|\\~':
        continue
    # Skip spaces and invisible characters
    if char.isspace() or ord(char) in [0x200B, 0x2060]:  # ZERO WIDTH SPACE, WORD JOINER
        continue
    chars_to_remove_from_dicts.append({
        "char": char,
        "unicode": f"U+{ord(char):04X}",
        "in_dsl": char in dsl_chars,
        "in_mfa": char in mfa_chars,
        "reason": "Not in model vocabulary - will be removed during normalization"
    })

# 7. Model characters not in dictionaries (this is normal - keep them)
chars_only_in_model = sorted(list(only_model))

# 8. Create final normalization table
normalization_table = {
    "version": "1.0",
    "description": "Phoneme Normalization & Comparison Strategy for DSL/MFA/G2P → HF phoneme model",
    "strategy": {
        "principle": "Unicode normalization, not phonological normalization",
        "key_priority": "Dictionaries should not contain symbols that model cannot produce",
        "phoneme_preservation": "Do not merge different phonemes, even if articulatorily similar"
    },
    "phoneme_mapping": phoneme_mapping,
    "diacritics": diacritics_decision,
    "suprasegmentals": suprasegmentals_decision,
    "affricates_expansion": affricates_expansion,
    "phonemes_preserve_distinctions": phonemes_preserve_distinctions,
    "chars_to_remove_from_dicts": chars_to_remove_from_dicts,
    "chars_only_in_model": chars_only_in_model,
    "model_inventory": sorted(list(model_chars)),
    "statistics": {
        "phoneme_mappings": len(phoneme_mapping),
        "diacritics_keep": len([d for d in diacritics_decision.values() if d["decision"] == "keep"]),
        "diacritics_remove": len([d for d in diacritics_decision.values() if d["decision"] == "remove"]),
        "suprasegmentals_keep": len([s for s in suprasegmentals_decision.values() if s["decision"] == "keep"]),
        "suprasegmentals_remove": len([s for s in suprasegmentals_decision.values() if s["decision"] == "remove"]),
        "chars_to_remove_count": len(chars_to_remove_from_dicts),
        "chars_only_in_model_count": len(chars_only_in_model),
        "model_inventory_size": len(model_chars),
    }
}

# Save normalization table
output_path = project_root / "phoneme_normalization_table.json"
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(normalization_table, f, ensure_ascii=False, indent=2)

print(f"\n✓ Normalization table created and saved to: {output_path}")
print(f"\nStatistics:")
print(f"  - Phoneme mappings: {normalization_table['statistics']['phoneme_mappings']}")
print(f"  - Diacritics (keep): {normalization_table['statistics']['diacritics_keep']}")
print(f"  - Diacritics (remove): {normalization_table['statistics']['diacritics_remove']}")
print(f"  - Suprasegmentals (keep): {normalization_table['statistics']['suprasegmentals_keep']}")
print(f"  - Suprasegmentals (remove): {normalization_table['statistics']['suprasegmentals_remove']}")
print(f"  - Characters to remove from dictionaries: {normalization_table['statistics']['chars_to_remove_count']}")
print(f"  - Characters only in model: {normalization_table['statistics']['chars_only_in_model_count']}")
print(f"  - Model inventory size: {normalization_table['statistics']['model_inventory_size']}")

print("\n" + "=" * 80)
print("DETAILED DECISION OUTPUT")
print("=" * 80)

print("\n1. Phoneme mapping (Unicode normalization):")
for from_char, to_char in phoneme_mapping.items():
    print(f"   '{from_char}' (U+{ord(from_char):04X}) → '{to_char}' (U+{ord(to_char):04X})")

print("\n2. Diacritical marks:")
for char, info in sorted(diacritics_decision.items(), key=lambda x: x[1]["unicode"]):
    decision_mark = "✓" if info["decision"] == "keep" else "✗"
    print(f"   {decision_mark} '{char}' ({info['unicode']}) - {info['decision']}: {info['reason']}")

print("\n3. Suprasegmental marks:")
for char, info in sorted(suprasegmentals_decision.items(), key=lambda x: x[1]["unicode"]):
    decision_mark = "✓" if info["decision"] == "keep" else "✗"
    print(f"   {decision_mark} '{char}' ({info['unicode']}) - {info['decision']}: {info['reason']}")

print("\n4. Affricates (expansion):")
for affricate, expansion in affricates_expansion.items():
    print(f"   '{affricate}' → '{expansion}'")

print("\n5. Phonemes preserving distinctions (NOT mapped):")
for category, variants in phonemes_preserve_distinctions.items():
    print(f"   {category}:")
    for char, description in variants.items():
        print(f"     - {description}")

print(f"\n6. Characters to remove from dictionaries ({len(chars_to_remove_from_dicts)}):")
for item in chars_to_remove_from_dicts[:20]:  # Show first 20
    sources = []
    if item["in_dsl"]:
        sources.append("DSL")
    if item["in_mfa"]:
        sources.append("MFA")
    print(f"   '{item['char']}' ({item['unicode']}) - in: {', '.join(sources)}")
if len(chars_to_remove_from_dicts) > 20:
    print(f"   ... and {len(chars_to_remove_from_dicts) - 20} more characters")

print("\n" + "=" * 80)
print("✓ Normalization table ready for use in the project!")
print("=" * 80)
print(f"\nUsage in code:")
print(f"  import json")
print(f"  with open('{output_path.name}', 'r', encoding='utf-8') as f:")
print(f"      normalization = json.load(f)")
print(f"  ")
print(f"  # Apply phoneme mapping")
print(f"  for from_char, to_char in normalization['phoneme_mapping'].items():")
print(f"      text = text.replace(from_char, to_char)")
print(f"  ")
print(f"  # Remove diacritics not in model")
print(f"  for char, info in normalization['diacritics'].items():")
print(f"      if info['decision'] == 'remove':")
print(f"          text = text.replace(char, '')")

9. CREATING UNIFIED PHONEME NORMALIZATION TABLE

✓ Таблица нормализации создана и сохранена в: /Volumes/SSanDisk/SpeechRec-German-diagnostic/phoneme_normalization_table.json

Статистика:
  - Фонемных маппингов: 1
  - Диакритик (оставить): 6
  - Диакритик (удалить): 6
  - Suprasegmentals (keep): 1
  - Suprasegmentals (remove): 4
  - Characters to remove from dictionaries: 31
  - Characters only in model: 46
  - Model inventory size: 106

DETAILED DECISION OUTPUT

1. Phoneme mapping (Unicode normalization):
   'g' (U+0067) → 'ɡ' (U+0261)

2. Diacritical marks:
   ✓ '̃' (U+0303) - keep: Present in model and dictionaries
   ✗ '̆' (U+0306) - remove: Not in model - will be removed from dictionaries
   ✓ '̊' (U+030A) - keep: Present in model (may be used in recognition)
   ✗ '̍' (U+030D) - remove: Not in model - will be removed from dictionaries
   ✓ '̝' (U+031D) - keep: Present in model (may be used in recognition)
   ✓ '̞' (U+031E) - keep: Present in model (may be used in recognition)
   ✗ 

In [None]:
# Save results to JSON for further analysis
print("=" * 80)
print("8. SAVING RESULTS")
print("=" * 80)

results = {
    "dsl_chars": sorted(list(dsl_chars)),
    "mfa_chars": sorted(list(mfa_chars)),
    "model_chars": sorted(list(model_chars)),
    "all_chars": sorted(list(all_chars)),
    "only_dsl": sorted(list(only_dsl)),
    "only_mfa": sorted(list(only_mfa)),
    "only_model": sorted(list(only_model)),
    "common_all": sorted(list(common_all)),
    "common_dsl_mfa": sorted(list(common_dsl_mfa)),
    "common_dsl_model": sorted(list(common_dsl_model)),
    "common_mfa_model": sorted(list(common_mfa_model)),
    "statistics": {
        "dsl_total": len(dsl_chars),
        "mfa_total": len(mfa_chars),
        "model_total": len(model_chars),
        "all_total": len(all_chars),
        "only_dsl_count": len(only_dsl),
        "only_mfa_count": len(only_mfa),
        "only_model_count": len(only_model),
        "common_all_count": len(common_all),
    }
}

# Add Unicode code information
results["char_info"] = {}
for char in all_chars:
    results["char_info"][char] = {
        "unicode": f"U+{ord(char):04X}",
        "unicode_code": ord(char),
        "in_dsl": char in dsl_chars,
        "in_mfa": char in mfa_chars,
        "in_model": char in model_chars,
    }

output_path = project_root / "phoneme_symbols_analysis.json"
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print(f"✓ Results saved to: {output_path}")
print(f"  - Total characters analyzed: {len(all_chars)}")
print(f"  - Unique to DSL: {len(only_dsl)}")
print(f"  - Unique to MFA: {len(only_mfa)}")
print(f"  - Unique to model: {len(only_model)}")

8. SAVING RESULTS
✓ Results saved to: /Volumes/SSanDisk/SpeechRec-German-diagnostic/phoneme_symbols_analysis.json
  - Total characters analyzed: 146
  - Unique to DSL: 39
  - Unique to MFA: 0
  - Unique to model: 46
