In [7]:
import torch
import torchaudio
from tqdm import tqdm
from underthesea import sent_tokenize

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts



In [8]:
# Device configuration
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Model paths
xtts_checkpoint = "/var/lit2425/pib/XTTSv2-Finetuning-for-New-Languages/checkpoints/GPT_XTTS_FT-April-05-2025_03+25PM-ab5a481/checkpoint_5000.pth"
xtts_config = "/var/lit2425/pib/XTTSv2-Finetuning-for-New-Languages/checkpoints/GPT_XTTS_FT-April-05-2025_03+25PM-ab5a481/config.json"
xtts_vocab = "checkpoints/XTTS_v2.0_original_model_files/vocab.json"

# Load model
config = XttsConfig()
config.load_json(xtts_config)
XTTS_MODEL = Xtts.init_from_config(config)
XTTS_MODEL.load_checkpoint(
    config,
    checkpoint_path=xtts_checkpoint,
    checkpoint_dir="/var/lit2425/pib/XTTSv2-Finetuning-for-New-Languages/checkpoints/GPT_XTTS_FT-April-05-2025_03+25PM-ab5a481",
    vocab_path=xtts_vocab,
    use_deepspeed=False,
    speaker_file_path=None
)


XTTS_MODEL.to(device)

print("Model loaded successfully!")

Model loaded successfully!


In [9]:
# Inference
tts_text = "आपको देखकर अच्छा लगा"
speaker_audio_file = "/var/lit2425/pib/XTTSv2-Finetuning-for-New-Languages/datasets-1/wavs/common_voice_hi_23795238.wav"
lang = "hi"

gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
    audio_path=speaker_audio_file,
    gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
    max_ref_length=XTTS_MODEL.config.max_ref_len,
    sound_norm_refs=XTTS_MODEL.config.sound_norm_refs,
)

tts_texts = sent_tokenize(tts_text)

wav_chunks = []
for text in tqdm(tts_texts):
    wav_chunk = XTTS_MODEL.inference(
        text=text,
        language=lang,
        gpt_cond_latent=gpt_cond_latent,
        speaker_embedding=speaker_embedding,
        temperature=0.1,
        length_penalty=1.0,
        repetition_penalty=10.0,
        top_k=10,
        top_p=0.3,
    )
    wav_chunks.append(torch.tensor(wav_chunk["wav"]))

out_wav = torch.cat(wav_chunks, dim=0).unsqueeze(0).cpu()

# Save the TTS output to a WAV file
torchaudio.save("output_hi_tts.wav", out_wav, sample_rate=24000)

# Play audio (for Jupyter Notebook)
from IPython.display import Audio
Audio(out_wav, rate=24000)

100%|██████████| 1/1 [00:00<00:00,  2.41it/s]


In [10]:

# Save the TTS output to a WAV file
torchaudio.save("output_hi_tts.wav", out_wav, sample_rate=24000)


In [12]:
import os
import torch
import torchaudio
from tqdm import tqdm
from underthesea import sent_tokenize

# Create eval_set directory if it doesn't exist
os.makedirs("eval_set", exist_ok=True)

# 50 Simple Hindi Sentences
hindi_sentences = [
    "हाँ",  
    "नहीं",
    "ठीक है",
    "धन्यवाद",
    "कृपया",
    "माफ कीजिए",
    "रुकिए",
    "जल्दी करो",
    "समझ गया",
    "फिर से बोलो",
    "मदद चाहिए",
    "बहुत अच्छा",
    "यह सही है",
    "गलत है",
    "क्या हुआ?",
    "कैसे हो?",
    "मैं ठीक हूँ",
    "आपका नाम क्या है?",
    "मेरा नाम है",
    "समय क्या हुआ है?",
    "आज कौन सा दिन है?",
    "मुझे याद दिलाओ",
    "बाद में",
    "अभी नहीं",
    "हो गया",
    "शुरू करो",
    "बंद करो",
    "चालू करो",
    "बढ़िया",
    "ऐसा मत करो",
    "यहाँ आओ",
    "वहाँ जाओ",
    "इंतज़ार करो",
    "जाने दो",
    "मुझे पसंद है",
    "मुझे पसंद नहीं",
    "क्या आप तैयार हैं?",
    "मैं तैयार हूँ",
    "खुशी की बात है",
    "दुख हुआ",
    "मैं थक गया हूँ",
    "आराम करो",
    "चिंता मत करो",
    "कोई बात नहीं",
    "मैं भूल गया",
    "याद आ गया",
    "ज़रूर",
    "कभी नहीं",
    "शायद",
    "बिल्कुल"
]

# Generate 50 audio files
for i in tqdm(range(50)):
    tts_text = hindi_sentences[i]
    
    # Tokenize text (though for short sentences this may not split)
    tts_texts = sent_tokenize(tts_text)
    
    # Generate audio
    wav_chunks = []
    for text in tts_texts:
        wav_chunk = XTTS_MODEL.inference(
            text=text,
            language=lang,
            gpt_cond_latent=gpt_cond_latent,
            speaker_embedding=speaker_embedding,
            temperature=0.1,
            length_penalty=1.0,
            repetition_penalty=10.0,
            top_k=10,
            top_p=0.3,
        )
        wav_chunks.append(torch.tensor(wav_chunk["wav"]))
    
    out_wav = torch.cat(wav_chunks, dim=0).unsqueeze(0).cpu()
    
    # Save file
    output_path = f"eval_set/sample_{i+1:02d}.wav"
    torchaudio.save(output_path, out_wav, sample_rate=24000)

print(f"Successfully generated 50 audio files in the eval_set directory.")

100%|██████████| 50/50 [00:16<00:00,  2.95it/s]

Successfully generated 50 audio files in the eval_set directory.





In [14]:
import os
from allosaurus.app import read_recognizer
from tqdm import tqdm

# Initialize Allosaurus
print("Loading Allosaurus model...")
phoneme_model = read_recognizer()
print("Model loaded successfully")

# Create directory for phoneme results
os.makedirs("phoneme_results", exist_ok=True)

# Process all 50 audio files
print("\nExtracting phonemes from audio files:")
results = []
for i in tqdm(range(1, 51)):
    audio_path = f"eval_set/sample_{i:02d}.wav"
    
    # Extract phonemes
    phonemes = phoneme_model.recognize(audio_path)  # 'hin' for Hindi
    
    # Save results
    results.append(f"sample_{i:02d}.wav: {phonemes}")
    
    # Also save to individual text files
    with open(f"phoneme_results/sample_{i:02d}.txt", "w", encoding='utf-8') as f:
        f.write(phonemes)

# Save all results to one file
with open("phoneme_results/all_phonemes.txt", "w", encoding='utf-8') as f:
    f.write("\n".join(results))

# Print first 5 results for verification
print("\nSample results (first 5 files):")
for result in results[:5]:
    print(result)

print("\nPhoneme extraction complete!")
print(f"Individual results saved in 'phoneme_results' folder")
print(f"Combined results saved in 'phoneme_results/all_phonemes.txt'")

Loading Allosaurus model...
Model loaded successfully

Extracting phonemes from audio files:


100%|██████████| 50/50 [00:07<00:00,  6.97it/s]


Sample results (first 5 files):
sample_01.wav: h ʌ
sample_02.wav: n ɒ
sample_03.wav: ts o b iː h æ ʂ ɒ ʔ ɒ
sample_04.wav: bʲ ɛ x ə
sample_05.wav: tʂ ʏ ɾ ij j ɤ

Phoneme extraction complete!
Individual results saved in 'phoneme_results' folder
Combined results saved in 'phoneme_results/all_phonemes.txt'





In [None]:
import os
from phonemizer import phonemize
from allosaurus.app import read_recognizer
from tqdm import tqdm

# Text sentences (same 50 as before)
hindi_sentences = [
    "हाँ", "नहीं", "ठीक है", "धन्यवाद", "कृपया",
    "माफ कीजिए", "रुकिए", "जल्दी करो", "समझ गया", "फिर से बोलो",
    "मदद चाहिए", "बहुत अच्छा", "यह सही है", "गलत है", "क्या हुआ?",
    "कैसे हो?", "मैं ठीक हूँ", "आपका नाम क्या है?", "मेरा नाम है", "समय क्या हुआ है?",
    "आज कौन सा दिन है?", "मुझे याद दिलाओ", "बाद में", "अभी नहीं", "हो गया",
    "शुरू करो", "बंद करो", "चालू करो", "बढ़िया", "ऐसा मत करो",
    "यहाँ आओ", "वहाँ जाओ", "इंतज़ार करो", "जाने दो", "मुझे पसंद है",
    "मुझे पसंद नहीं", "क्या आप तैयार हैं?", "मैं तैयार हूँ", "खुशी की बात है", "दुख हुआ",
    "मैं थक गया हूँ", "आराम करो", "चिंता मत करो", "कोई बात नहीं", "मैं भूल गया",
    "याद आ गया", "ज़रूर", "कभी नहीं", "शायद", "बिल्कुल"
]

# Initialize models
print("Loading models...")
phoneme_model = read_recognizer()  # Allosaurus
print("Allosaurus model loaded")

# Create results directory
os.makedirs("phoneme_comparison", exist_ok=True)

# Process all 50 sentences
results = []
for i in tqdm(range(50), desc="Processing sentences"):
    # Text to Phonemes (using Phonemizer)
    text_phonemes = phonemize(
        hindi_sentences[i],
        language='hi',
        backend='espeak',
        strip=True,
        preserve_punctuation=False,
        with_stress=True
    )
    
    # Audio to Phonemes (using Allosaurus)
    audio_path = f"eval_set/sample_{i+1:02d}.wav"
    audio_phonemes = phoneme_model.recognize(audio_path)
    
    # Store comparison
    results.append({
        'sentence': hindi_sentences[i],
        'text_phonemes': text_phonemes,
        'audio_phonemes': audio_phonemes,
        'match': text_phonemes.strip() == audio_phonemes.strip()
    })
    
    # Save to individual files
    with open(f"phoneme_comparison/sample_{i+1:02d}.txt", "w", encoding='utf-8') as f:
        f.write(f"Text: {hindi_sentences[i]}\n")
        f.write(f"Text Phonemes: {text_phonemes}\n")
        f.write(f"Audio Phonemes: {audio_phonemes}\n")
        f.write(f"Match: {'YES' if results[-1]['match'] else 'NO'}\n")

# Save comprehensive results
with open("phoneme_comparison/summary.csv", "w", encoding='utf-8') as f:
    f.write("ID,Sentence,TextPhonemes,AudioPhonemes,Match\n")
    for i, result in enumerate(results):
        f.write(f"{i+1},\"{result['sentence']}\",\"{result['text_phonemes']}\",\"{result['audio_phonemes']}\",{result['match']}\n")

# Print sample comparison
print("\nSample comparison (first 5 sentences):")
for i in range(5):
    print(f"\nSentence {i+1}: {results[i]['sentence']}")
    print(f"Text Phonemes: {results[i]['text_phonemes']}")
    print(f"Audio Phonemes: {results[i]['audio_phonemes']}")
    print(f"Match: {'✅' if results[i]['match'] else '❌'}")

print("\nProcessing complete!")
print(f"Individual comparisons saved in 'phoneme_comparison' folder")
print(f"Summary saved as 'phoneme_comparison/summary.csv'")

Loading models...
Allosaurus model loaded


Processing sentences:   0%|          | 0/50 [00:00<?, ?it/s]


TypeError: Recognizer.recognize() got an unexpected keyword argument 'lang'