# Comparison of phonemes from all models

This notebook lists and compares the phoneme vocabularies (after removing special tokens) of four models:
1. Main model (vitouphy/wav2vec2-xls-r-300m-phoneme) via PhonemeRecognizer
2. Wav2Vec2Phoneme model (vitouphy/wav2vec2-xls-r-300m-phoneme) via Wav2Vec2PhonemeRecognizer
3. Wav2Vec2 Large model (facebook/wav2vec2-large-960h-lv60-self) via PhonemeRecognizer — note: its vocabulary consists of Latin letters, not phonemes
4. **CommonPhone model (pklumpp/Wav2Vec2_CommonPhone)** - trained on IPA phonemes from Common Phone dataset (includes German)

In [8]:
import sys
from pathlib import Path

# Locate the project root so that project modules can be imported no matter
# where the repository is checked out. When the working directory is the
# notebooks/ folder the root is its parent; in every other case the working
# directory itself is assumed to be the root.
current_dir = Path.cwd()
project_root = current_dir.parent if current_dir.name == 'notebooks' else current_dir

# Put the root first on the import path so it wins over other entries
sys.path.insert(0, str(project_root))

# Import project modules. These are expected to resolve through the
# project_root entry inserted into sys.path earlier in this cell.
from modules.phoneme_recognition import get_phoneme_recognizer
from modules.wav2vec2_phoneme_recognition import get_wav2vec2_phoneme_recognizer
import config

# Reached only if none of the imports above raised
print("Imports completed successfully")


Imports completed successfully


In [9]:
def filter_phonemes(vocab):
    """
    Return the sorted phoneme tokens of a vocabulary, dropping special tokens.

    A token is dropped when it is one of the explicitly listed special tokens
    or when it is wrapped in angle brackets (<...>) or square brackets ([...]).

    Args:
        vocab: Dictionary {token: id}

    Returns:
        Sorted list of the remaining tokens
    """
    # Explicitly known special / non-phoneme tokens
    reserved = frozenset((
        '[PAD]', '[UNK]', '<pad>', '<unk>', '<blank>', '[BLANK]',
        '<s>', '</s>', '<|endoftext|>', '|', 'h#', 'spn', '',
        '<sos>', '<eos>', '[CLS]', '[SEP]', '[MASK]',
    ))

    def _looks_like_service_token(candidate):
        # Anything wrapped as <...> or [...] is treated as a service token
        in_angle_brackets = candidate.startswith('<') and candidate.endswith('>')
        in_square_brackets = candidate.startswith('[') and candidate.endswith(']')
        return in_angle_brackets or in_square_brackets

    kept = [
        candidate
        for candidate in vocab.keys()
        if candidate not in reserved and not _looks_like_service_token(candidate)
    ]
    kept.sort()
    return kept

print("Filter function created")


Filter function created


In [10]:
# Model 1: the main model named by config.MODEL_NAME, loaded through
# get_phoneme_recognizer. On any failure the traceback is shown and
# phonemes1 falls back to an empty list.
banner = "=" * 80
print(banner)
print("MODEL 1: Main model (PhonemeRecognizer)")
print(banner)
print(f"Model: {config.MODEL_NAME}")
print()

try:
    # A configured device of "auto" is passed on as None
    requested_device = None if config.MODEL_DEVICE == "auto" else config.MODEL_DEVICE
    model1 = get_phoneme_recognizer(model_name=config.MODEL_NAME, device=requested_device)
    vocab1 = model1.get_vocab()
    phonemes1 = filter_phonemes(vocab1)

    print(f"Vocabulary size: {len(vocab1)}")
    print(f"Number of phonemes (after filtering): {len(phonemes1)}")
    print()
    print("All phonemes from model 1:")
    print("-" * 80)
    # Numbered listing, starting at 1
    for rank, symbol in enumerate(phonemes1, start=1):
        print(f"{rank:3d}. {symbol}")
    print()

except Exception as load_error:
    print(f"Error loading model 1: {load_error}")
    import traceback
    traceback.print_exc()
    phonemes1 = []


MODEL 1: Main model (PhonemeRecognizer)
Model: vitouphy/wav2vec2-xls-r-300m-phoneme

Vocabulary size: 45
Number of phonemes (after filtering): 38

All phonemes from model 1:
--------------------------------------------------------------------------------
  1. aa
  2. ae
  3. ah
  4. aw
  5. ay
  6. b
  7. ch
  8. d
  9. dh
 10. dx
 11. eh
 12. er
 13. ey
 14. f
 15. g
 16. hh
 17. ih
 18. iy
 19. jh
 20. k
 21. l
 22. m
 23. n
 24. ng
 25. ow
 26. oy
 27. p
 28. r
 29. s
 30. sh
 31. t
 32. th
 33. uh
 34. uw
 35. v
 36. w
 37. y
 38. z



In [11]:
# Model 2: the checkpoint named by config.WAV2VEC2_PHONEME_MODEL_NAME, loaded
# through get_wav2vec2_phoneme_recognizer. On any failure the traceback is
# shown and phonemes2 falls back to an empty list.
banner = "=" * 80
print(banner)
print("MODEL 2: Wav2Vec2Phoneme model (Wav2Vec2PhonemeRecognizer)")
print(banner)
print(f"Model: {config.WAV2VEC2_PHONEME_MODEL_NAME}")
print()

try:
    # A configured device of "auto" is passed on as None
    requested_device = None if config.MODEL_DEVICE == "auto" else config.MODEL_DEVICE
    model2 = get_wav2vec2_phoneme_recognizer(
        model_name=config.WAV2VEC2_PHONEME_MODEL_NAME,
        device=requested_device,
    )
    vocab2 = model2.get_vocab()
    phonemes2 = filter_phonemes(vocab2)

    print(f"Vocabulary size: {len(vocab2)}")
    print(f"Number of phonemes (after filtering): {len(phonemes2)}")
    print()
    print("All phonemes from model 2:")
    print("-" * 80)
    # Numbered listing, starting at 1
    for rank, symbol in enumerate(phonemes2, start=1):
        print(f"{rank:3d}. {symbol}")
    print()

except Exception as load_error:
    print(f"Error loading model 2: {load_error}")
    import traceback
    traceback.print_exc()
    phonemes2 = []


MODEL 2: Wav2Vec2Phoneme model (Wav2Vec2PhonemeRecognizer)
Model: vitouphy/wav2vec2-xls-r-300m-phoneme

Vocabulary size: 45
Number of phonemes (after filtering): 38

All phonemes from model 2:
--------------------------------------------------------------------------------
  1. aa
  2. ae
  3. ah
  4. aw
  5. ay
  6. b
  7. ch
  8. d
  9. dh
 10. dx
 11. eh
 12. er
 13. ey
 14. f
 15. g
 16. hh
 17. ih
 18. iy
 19. jh
 20. k
 21. l
 22. m
 23. n
 24. ng
 25. ow
 26. oy
 27. p
 28. r
 29. s
 30. sh
 31. t
 32. th
 33. uh
 34. uw
 35. v
 36. w
 37. y
 38. z



In [12]:
# Model 3: the checkpoint named by config.WAV2VEC2_LARGE_MODEL_NAME, loaded
# through get_phoneme_recognizer. On any failure the traceback is shown and
# phonemes3 falls back to an empty list.
banner = "=" * 80
print(banner)
print("MODEL 3: Wav2Vec2 Large model (PhonemeRecognizer)")
print(banner)
print(f"Model: {config.WAV2VEC2_LARGE_MODEL_NAME}")
print()

try:
    # A configured device of "auto" is passed on as None
    requested_device = None if config.MODEL_DEVICE == "auto" else config.MODEL_DEVICE
    model3 = get_phoneme_recognizer(
        model_name=config.WAV2VEC2_LARGE_MODEL_NAME,
        device=requested_device,
    )
    vocab3 = model3.get_vocab()
    phonemes3 = filter_phonemes(vocab3)

    print(f"Vocabulary size: {len(vocab3)}")
    print(f"Number of phonemes (after filtering): {len(phonemes3)}")
    print()
    print("All phonemes from model 3:")
    print("-" * 80)
    # Numbered listing, starting at 1
    for rank, symbol in enumerate(phonemes3, start=1):
        print(f"{rank:3d}. {symbol}")
    print()

except Exception as load_error:
    print(f"Error loading model 3: {load_error}")
    import traceback
    traceback.print_exc()
    phonemes3 = []


MODEL 3: Wav2Vec2 Large model (PhonemeRecognizer)
Model: facebook/wav2vec2-large-960h-lv60-self

Vocabulary size: 32
Number of phonemes (after filtering): 27

All phonemes from model 3:
--------------------------------------------------------------------------------
  1. '
  2. A
  3. B
  4. C
  5. D
  6. E
  7. F
  8. G
  9. H
 10. I
 11. J
 12. K
 13. L
 14. M
 15. N
 16. O
 17. P
 18. Q
 19. R
 20. S
 21. T
 22. U
 23. V
 24. W
 25. X
 26. Y
 27. Z



In [13]:
# Load fourth model (CommonPhone - IPA phonemes)
#
# Purpose: obtain the {token: id} vocabulary of pklumpp/Wav2Vec2_CommonPhone and
# derive phonemes4 from it. Because the vocabulary may not be downloadable as
# vocab.json, a chain of fallbacks is tried in order:
#   1. download vocab.json
#   2. list the repository and try files whose name contains "vocab"/"tokenizer"
#   3. read config.json / load the model (diagnostic only: yields sizes, not tokens)
#   4. AutoProcessor tokenizer
#   5. tokenizer of facebook/wav2vec2-large-xlsr-53 (approximation only)
#   6. README.md (diagnostic only)
# Methods 2-6 run only when Method 1 raises. If no method yields a vocabulary,
# a summary is printed and phonemes4 is set to [] so later cells can continue.
print("=" * 80)
print("MODEL 4: CommonPhone model (Direct loading)")
print("=" * 80)
print("Model: pklumpp/Wav2Vec2_CommonPhone")
print("Note: This model uses IPA phonemes and is trained on Common Phone dataset")
print("which includes German speech. Should be better for German phoneme recognition.")
print()

try:
    from transformers import Wav2Vec2ForCTC, AutoModelForCTC
    from huggingface_hub import hf_hub_download, list_repo_files
    import json
    from pathlib import Path
    
    model_name = "pklumpp/Wav2Vec2_CommonPhone"
    print(f"Loading CommonPhone model vocabulary: {model_name}")
    
    vocab4 = {}
    
    # Method 1: Try to load vocab.json directly from Hugging Face
    try:
        print("Method 1: Attempting to download vocab.json from Hugging Face...")
        vocab_path = hf_hub_download(repo_id=model_name, filename="vocab.json")
        if vocab_path and Path(vocab_path).exists():
            with open(vocab_path, 'r', encoding='utf-8') as f:
                vocab4 = json.load(f)
            print(f"✓ Successfully loaded vocab.json with {len(vocab4)} tokens")
        else:
            raise FileNotFoundError("vocab.json file not found after download")
    except Exception as e1:
        print(f"✗ Method 1 failed: {e1}")
        
        # Method 2: Check what files are available in the repository
        try:
            print("Method 2: Checking available files in repository...")
            repo_files = list_repo_files(repo_id=model_name, repo_type="model")
            print(f"Available files: {repo_files}")
            
            # Try to find vocab file with different names
            vocab_files = [f for f in repo_files if 'vocab' in f.lower() or 'tokenizer' in f.lower()]
            if vocab_files:
                print(f"Found potential vocab files: {vocab_files}")
                for vocab_file in vocab_files:
                    try:
                        vocab_path = hf_hub_download(repo_id=model_name, filename=vocab_file)
                        if vocab_path and Path(vocab_path).exists():
                            with open(vocab_path, 'r', encoding='utf-8') as f:
                                candidate_vocab = json.load(f)
                            # Files matched only by name (e.g. a tokenizer config) are
                            # not vocabularies; accept only a non-empty token -> id map.
                            is_token_map = (
                                isinstance(candidate_vocab, dict)
                                and bool(candidate_vocab)
                                and all(isinstance(token_id, int) for token_id in candidate_vocab.values())
                            )
                            if not is_token_map:
                                print(f"  ✗ Skipping {vocab_file}: not a token-to-id mapping")
                                continue
                            vocab4 = candidate_vocab
                            print(f"✓ Successfully loaded {vocab_file} with {len(vocab4)} tokens")
                            break
                    except Exception as file_error:
                        # Best effort: report why this candidate was unusable and
                        # move on (KeyboardInterrupt/SystemExit are not swallowed).
                        print(f"  ✗ Could not load {vocab_file}: {file_error}")
                        continue
        except Exception as e2:
            print(f"✗ Method 2 failed: {e2}")
        
        # Method 3: Try to load model and extract vocabulary from model config/state
        # (diagnostic only: this never assigns vocab4)
        if not vocab4:
            try:
                print("Method 3: Attempting to load model and extract vocabulary from model state...")
                # Try to load model config first
                config_path = hf_hub_download(repo_id=model_name, filename="config.json")
                with open(config_path, 'r', encoding='utf-8') as f:
                    model_config = json.load(f)
                
                vocab_size = model_config.get('vocab_size', None)
                print(f"Model vocab_size from config: {vocab_size}")
                
                # Try to load the model (without processor) and see if we can get vocab from it
                try:
                    model = Wav2Vec2ForCTC.from_pretrained(model_name)
                    # Check if model has vocab in its config or attributes
                    if hasattr(model, 'config') and hasattr(model.config, 'vocab_size'):
                        print(f"Model loaded, vocab_size: {model.config.vocab_size}")
                        
                        # Try to get vocab from model's lm_head if possible
                        # Unfortunately, we can't directly extract token strings from model weights
                        print("Note: Cannot extract token strings directly from model weights.")
                        print("Falling back to Method 4 (AutoProcessor)...")
                except Exception as e3:
                    print(f"Could not load model: {e3}")
                    raise
            except Exception as e3:
                print(f"✗ Method 3 failed: {e3}")
        
        # Method 4: Try using AutoProcessor (may work even without vocab.json)
        if not vocab4:
            try:
                print("Method 4: Attempting to load AutoProcessor...")
                from transformers import AutoProcessor
                
                processor = AutoProcessor.from_pretrained(model_name)
                
                if hasattr(processor, 'tokenizer'):
                    tokenizer = processor.tokenizer
                    if hasattr(tokenizer, 'get_vocab'):
                        vocab4 = tokenizer.get_vocab()
                        print(f"✓ Successfully loaded vocabulary from AutoProcessor.tokenizer.get_vocab(): {len(vocab4)} tokens")
                    elif hasattr(tokenizer, 'vocab'):
                        vocab4 = tokenizer.vocab
                        print(f"✓ Successfully loaded vocabulary from AutoProcessor.tokenizer.vocab: {len(vocab4)} tokens")
                    elif hasattr(tokenizer, 'convert_ids_to_tokens'):
                        # Build vocab from convert_ids_to_tokens
                        print("Building vocabulary from AutoProcessor.tokenizer.convert_ids_to_tokens()...")
                        vocab_size = getattr(tokenizer, 'vocab_size', 32)  # CommonPhone uses 32
                        for i in range(vocab_size):
                            try:
                                token = tokenizer.convert_ids_to_tokens(i)
                                if token is not None:
                                    if isinstance(token, str):
                                        vocab4[token] = i
                                    else:
                                        vocab4[str(token)] = i
                            except (IndexError, KeyError, ValueError):
                                # Ids without a token are skipped
                                continue
                        print(f"✓ Built vocabulary from AutoProcessor.tokenizer: {len(vocab4)} tokens")
                    else:
                        raise AttributeError("AutoProcessor.tokenizer does not have expected methods")
                else:
                    raise AttributeError("AutoProcessor does not have tokenizer attribute")
            except Exception as e4:
                print(f"✗ Method 4 failed: {e4}")
        
        # Method 5: Try loading tokenizer from base model and adapt it
        # NOTE(review): if this succeeds, vocab4 holds the BASE model's tokens, not
        # CommonPhone's phonemes, so everything derived from it (phonemes4 and the
        # comparison cell) is only an approximation — confirm this fallback is wanted.
        if not vocab4:
            try:
                print("Method 5: Attempting to load tokenizer from base model and extract vocab...")
                from transformers import Wav2Vec2CTCTokenizer
                
                # Try to load from base model that CommonPhone is based on
                base_model = "facebook/wav2vec2-large-xlsr-53"
                print(f"  → Loading base model tokenizer: {base_model}")
                base_tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(base_model)
                
                # Get vocab from base model
                if hasattr(base_tokenizer, 'get_vocab'):
                    base_vocab = base_tokenizer.get_vocab()
                    print(f"  ✓ Base model vocab size: {len(base_vocab)}")
                    
                    # CommonPhone uses vocab_size=32, so we need to map or extract first 32 tokens
                    # But this won't give us the actual CommonPhone phonemes, just base model tokens
                    print("  ⚠ Note: Base model vocab may not match CommonPhone phonemes exactly")
                    print("  → Trying to use base vocab as fallback...")
                    vocab4 = base_vocab
                    print(f"  ✓ Using base model vocab: {len(vocab4)} tokens (may not be accurate for CommonPhone)")
            except Exception as e5:
                print(f"✗ Method 5 failed: {e5}")
        
        # Method 6: Try to read README.md for vocab information
        # (diagnostic only: this never assigns vocab4)
        if not vocab4:
            try:
                print("Method 6: Attempting to read README.md for vocabulary information...")
                readme_path = hf_hub_download(repo_id=model_name, filename="README.md")
                if readme_path and Path(readme_path).exists():
                    with open(readme_path, 'r', encoding='utf-8') as f:
                        readme_content = f.read()
                    print("  ✓ README.md loaded")
                    # Look for vocab information in README (unlikely but worth trying)
                    if 'vocab' in readme_content.lower() or 'phoneme' in readme_content.lower():
                        print("  → Found vocab/phoneme mentions in README, but cannot extract automatically")
                    print("  ✗ Cannot extract vocab from README automatically")
            except Exception as e6:
                print(f"✗ Method 6 failed: {e6}")
        
        # Final check: every fallback was exhausted without producing a vocabulary
        if not vocab4:
            raise RuntimeError(
                f"Failed to load vocabulary from CommonPhone model '{model_name}'. "
                f"Tried all methods:\n"
                f"  1. vocab.json download (404 - file not found)\n"
                f"  2. Repository file listing (no vocab files found)\n"
                f"  3. Model config (vocab_size=32 found, but no token strings)\n"
                f"  4. AutoProcessor (failed)\n"
                f"  5. Base model tokenizer (failed or not accurate)\n"
                f"  6. README.md (cannot extract automatically)\n\n"
                f"The model repository does not contain vocab.json or tokenizer files.\n"
                f"This appears to be a limitation of how the model was uploaded to Hugging Face.\n"
                f"Model repository: https://huggingface.co/{model_name}\n"
                f"Available files: {repo_files if 'repo_files' in locals() else 'unknown'}"
            )
    
    # Method 1 can succeed yet yield an empty mapping; treat that as a failure too
    if not vocab4:
        raise RuntimeError("Vocabulary is empty after all loading attempts")
    
    phonemes4 = filter_phonemes(vocab4)
    
    print(f"\nVocabulary size: {len(vocab4)}")
    print(f"Number of phonemes (after filtering): {len(phonemes4)}")
    print()
    print("All phonemes from model 4:")
    print("-" * 80)
    for i, phoneme in enumerate(phonemes4, 1):
        print(f"{i:3d}. {phoneme}")
    print()
    
except Exception as e:
    # Any failure above ends here: explain the situation and leave phonemes4
    # empty so the comparison cell can still run with the other models.
    print(f"\n✗ Error: Could not load vocabulary from CommonPhone model")
    print(f"Error details: {e}")
    print("\n" + "="*80)
    print("SUMMARY:")
    print("="*80)
    print("The CommonPhone model repository does not contain vocabulary files (vocab.json, tokenizer files).")
    print("This is a limitation of how the model was uploaded to Hugging Face.")
    print("\nThe model itself is available and can be used for inference, but the vocabulary")
    print("(list of phonemes) cannot be extracted automatically from the repository.")
    print("\nTo get the vocabulary, you would need to:")
    print("  1. Contact the model author (pklumpp) to request vocab.json")
    print("  2. Check the original paper/dataset documentation")
    print("  3. Use the model for inference and observe which phonemes it outputs")
    print("\nFor now, we'll continue with the other three models for comparison.")
    print("="*80)
    phonemes4 = []

MODEL 4: CommonPhone model (Direct loading)
Model: pklumpp/Wav2Vec2_CommonPhone
Note: This model uses IPA phonemes and is trained on Common Phone dataset
which includes German speech. Should be better for German phoneme recognition.

Loading CommonPhone model vocabulary: pklumpp/Wav2Vec2_CommonPhone
Method 1: Attempting to download vocab.json from Hugging Face...
✗ Method 1 failed: 404 Client Error. (Request ID: Root=1-695f868d-62f189ca0248367b111f0fba;9abe477e-6715-40cb-b7c1-4d4ba8f78a8f)

Entry Not Found for url: https://huggingface.co/pklumpp/Wav2Vec2_CommonPhone/resolve/main/vocab.json.
Method 2: Checking available files in repository...
Available files: ['.gitattributes', 'README.md', 'config.json', 'model.safetensors', 'pytorch_model.bin']
Method 3: Attempting to load model and extract vocabulary from model state...
Model vocab_size from config: None


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at pklumpp/Wav2Vec2_CommonPhone and are newly initialized: ['lm_head.bias', 'lm_head.weight', 'wav2vec2.encoder.layer_norm.bias', 'wav2vec2.encoder.layer_norm.weight', 'wav2vec2.encoder.layers.0.attention.k_proj.bias', 'wav2vec2.encoder.layers.0.attention.k_proj.weight', 'wav2vec2.encoder.layers.0.attention.out_proj.bias', 'wav2vec2.encoder.layers.0.attention.out_proj.weight', 'wav2vec2.encoder.layers.0.attention.q_proj.bias', 'wav2vec2.encoder.layers.0.attention.q_proj.weight', 'wav2vec2.encoder.layers.0.attention.v_proj.bias', 'wav2vec2.encoder.layers.0.attention.v_proj.weight', 'wav2vec2.encoder.layers.0.feed_forward.intermediate_dense.bias', 'wav2vec2.encoder.layers.0.feed_forward.intermediate_dense.weight', 'wav2vec2.encoder.layers.0.feed_forward.output_dense.bias', 'wav2vec2.encoder.layers.0.feed_forward.output_dense.weight', 'wav2vec2.encoder.layers.0.final_layer_norm.bias', 'wav2vec2.encoder.layers.0.

Model loaded, vocab_size: 32
Note: Cannot extract token strings directly from model weights.
Trying Method 4: Install protobuf and load tokenizer...
Method 4: Attempting to load AutoProcessor...
✗ Method 4 failed: Unrecognized model in pklumpp/Wav2Vec2_CommonPhone. Should have a `model_type` key in its config.json, or contain one of the following strings in its name: aimv2, aimv2_vision_model, albert, align, altclip, apertus, arcee, aria, aria_text, audio-spectrogram-transformer, autoformer, aya_vision, bamba, bark, bart, beit, bert, bert-generation, big_bird, bigbird_pegasus, biogpt, bit, bitnet, blenderbot, blenderbot-small, blip, blip-2, blip_2_qformer, bloom, blt, bridgetower, bros, camembert, canine, chameleon, chinese_clip, chinese_clip_vision_model, clap, clip, clip_text_model, clip_vision_model, clipseg, clvp, code_llama, codegen, cohere, cohere2, cohere2_vision, colpali, colqwen2, conditional_detr, convbert, convnext, convnextv2, cpmant, csm, ctrl, cvt, d_fine, dab-detr, dac, 

In [14]:
# Compare phonemes from all four models
rule = "=" * 80
print(rule)
print("COMPARISON OF PHONEMES FROM ALL FOUR MODELS")
print(rule)
print()

# Build one set per model; a phoneme list that was never defined counts as empty
set1 = set(locals().get('phonemes1', ()))
set2 = set(locals().get('phonemes2', ()))
set3 = set(locals().get('phonemes3', ()))
set4 = set(locals().get('phonemes4', ()))


def _show_model_summary(heading, members, format_line):
    """Print the heading, phoneme count, format note and sorted phonemes of one model."""
    print(heading)
    print(f"  - Total phonemes: {len(members)}")
    print(format_line)
    print(f"  - Phonemes: {sorted(members)}")
    print()


_show_model_summary(
    f"Model 1 ({config.MODEL_NAME}):", set1,
    "  - Format: ARPABET (English phoneme system)",
)
_show_model_summary(
    f"Model 2 ({config.WAV2VEC2_PHONEME_MODEL_NAME}):", set2,
    "  - Format: ARPABET (English phoneme system)",
)
_show_model_summary(
    f"Model 3 ({config.WAV2VEC2_LARGE_MODEL_NAME}):", set3,
    "  - Format: Latin alphabet (letters, not phonemes)",
)

# Model 4 gets a status note instead of a phoneme list when its vocabulary is empty
print("Model 4 (pklumpp/Wav2Vec2_CommonPhone):")
print(f"  - Total phonemes: {len(set4)}")
print("  - Format: IPA (International Phonetic Alphabet)")
if set4:
    print(f"  - Phonemes: {sorted(set4)}")
else:
    print("  - Status: Vocabulary not available (model repository lacks vocab.json/tokenizer files)")
    print("  - Note: Model can be used for inference, but phoneme list cannot be extracted")
print()

# Set algebra: union, intersections, and symbols exclusive to a single model
all_phonemes = set().union(set1, set2, set3, set4)
common_all = set1.intersection(set2, set3, set4)
common_arpabet = set1.intersection(set2)  # Models 1 and 2 use same ARPABET
only_model1 = set1 - (set2 | set3 | set4)
only_model2 = set2 - (set1 | set3 | set4)
only_model3 = set3 - (set1 | set2 | set4)
only_model4 = set4 - (set1 | set2 | set3)

print(rule)
print("ANALYSIS:")
print(rule)
print(f"Total unique phonemes across all models: {len(all_phonemes)}")
print(f"Common phonemes (present in all four): {len(common_all)}")
print(f"Common phonemes (ARPABET models 1 & 2): {len(common_arpabet)}")
print(f"Only in model 1: {len(only_model1)}")
print(f"Only in model 2: {len(only_model2)}")
print(f"Only in model 3: {len(only_model3)}")
print(f"Only in model 4 (CommonPhone IPA): {len(only_model4)}")
print()

# List the members of every non-empty group, in a fixed order
for title, members in (
    ("Common phonemes (present in all four models):", common_all),
    ("Common phonemes (ARPABET models 1 & 2):", common_arpabet),
    ("Phonemes only in model 1:", only_model1),
    ("Phonemes only in model 2:", only_model2),
    ("Phonemes only in model 3:", only_model3),
    ("Phonemes only in model 4 (CommonPhone IPA):", only_model4),
):
    if members:
        print(title)
        print(sorted(members))
        print()

# Check which German IPA phonemes are present in CommonPhone
print(rule)
print("GERMAN IPA PHONEMES CHECK:")
print(rule)
german_ipa_phonemes = set(config.GERMAN_IPA_PHONEMES)

if not set4:
    # Without a CommonPhone vocabulary only the reference list can be shown
    print(f"German IPA phonemes defined in config: {len(german_ipa_phonemes)}")
    print("⚠ Cannot check CommonPhone model: vocabulary not available")
    print("\nGerman IPA phonemes that should be checked in CommonPhone:")
    print(sorted(german_ipa_phonemes))
    print()
else:
    commonphone_has_german = german_ipa_phonemes.intersection(set4)
    commonphone_missing_german = german_ipa_phonemes.difference(set4)

    print(f"German IPA phonemes defined in config: {len(german_ipa_phonemes)}")
    print(f"Present in CommonPhone model: {len(commonphone_has_german)}")
    print(f"Missing in CommonPhone model: {len(commonphone_missing_german)}")
    print()

    if commonphone_has_german:
        print("German IPA phonemes found in CommonPhone:")
        print(sorted(commonphone_has_german))
        print()

    if commonphone_missing_german:
        print("German IPA phonemes missing in CommonPhone:")
        print(sorted(commonphone_missing_german))
        print()


COMPARISON OF PHONEMES FROM ALL FOUR MODELS

Model 1 (vitouphy/wav2vec2-xls-r-300m-phoneme):
  - Total phonemes: 38
  - Format: ARPABET (English phoneme system)
  - Phonemes: ['aa', 'ae', 'ah', 'aw', 'ay', 'b', 'ch', 'd', 'dh', 'dx', 'eh', 'er', 'ey', 'f', 'g', 'hh', 'ih', 'iy', 'jh', 'k', 'l', 'm', 'n', 'ng', 'ow', 'oy', 'p', 'r', 's', 'sh', 't', 'th', 'uh', 'uw', 'v', 'w', 'y', 'z']

Model 2 (vitouphy/wav2vec2-xls-r-300m-phoneme):
  - Total phonemes: 38
  - Format: ARPABET (English phoneme system)
  - Phonemes: ['aa', 'ae', 'ah', 'aw', 'ay', 'b', 'ch', 'd', 'dh', 'dx', 'eh', 'er', 'ey', 'f', 'g', 'hh', 'ih', 'iy', 'jh', 'k', 'l', 'm', 'n', 'ng', 'ow', 'oy', 'p', 'r', 's', 'sh', 't', 'th', 'uh', 'uw', 'v', 'w', 'y', 'z']

Model 3 (facebook/wav2vec2-large-960h-lv60-self):
  - Total phonemes: 27
  - Format: Latin alphabet (letters, not phonemes)
  - Phonemes: ["'", 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y'