In [3]:
import torch
import torchaudio
from tqdm import tqdm
from underthesea import sent_tokenize

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts



In [5]:
# Device configuration
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Model paths
xtts_checkpoint = "/var/lit2425/humanize/MMFM/XTTSv2-Finetuning-for-New-Languages/checkpoints/GPT_XTTS_FT-April-07-2025_12+30PM-8223ca1/checkpoint_6000.pth"
xtts_config = "/var/lit2425/humanize/MMFM/XTTSv2-Finetuning-for-New-Languages/checkpoints/GPT_XTTS_FT-April-07-2025_12+30PM-8223ca1/config.json"
xtts_vocab = "/var/lit2425/humanize/MMFM/XTTSv2-Finetuning-for-New-Languages/checkpoints/XTTS_v2.0_original_model_files/vocab.json"
# MMFM/XTTSv2-Finetuning-for-New-Languages/checkpoints/XTTS_v2.0_original_model_files/model.pth
# Load model
config = XttsConfig()
config.load_json(xtts_config)
XTTS_MODEL = Xtts.init_from_config(config)
XTTS_MODEL.load_checkpoint(
    config,
    checkpoint_path=xtts_checkpoint,
    checkpoint_dir="MMFM/XTTSv2-Finetuning-for-New-Languages/checkpoints/GPT_XTTS_FT-April-07-2025_12+30PM-8223ca1",
    vocab_path=xtts_vocab,
    use_deepspeed=False,
    speaker_file_path=None
)


XTTS_MODEL.to(device)

print("Model loaded successfully!")

Model loaded successfully!


In [6]:
# Inference
tts_text = "आपको देखकर अच्छा लगा"
speaker_audio_file = "/var/lit2425/humanize/MMFM/XTTSv2-Finetuning-for-New-Languages/datasets/wavs/common_voice_hi_23795238.wav"
lang = "hi"

gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
    audio_path=speaker_audio_file,
    gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
    max_ref_length=XTTS_MODEL.config.max_ref_len,
    sound_norm_refs=XTTS_MODEL.config.sound_norm_refs,
)

tts_texts = sent_tokenize(tts_text)

wav_chunks = []
for text in tqdm(tts_texts):
    wav_chunk = XTTS_MODEL.inference(
        text=text,
        language=lang,
        gpt_cond_latent=gpt_cond_latent,
        speaker_embedding=speaker_embedding,
        temperature=0.1,
        length_penalty=1.0,
        repetition_penalty=10.0,
        top_k=10,
        top_p=0.3,
    )
    wav_chunks.append(torch.tensor(wav_chunk["wav"]))

out_wav = torch.cat(wav_chunks, dim=0).unsqueeze(0).cpu()

# Save the TTS output to a WAV file
torchaudio.save("output_hi_tts.wav", out_wav, sample_rate=24000)

# Play audio (for Jupyter Notebook)
from IPython.display import Audio
Audio(out_wav, rate=24000)

  0%|          | 0/1 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


100%|██████████| 1/1 [00:00<00:00,  1.59it/s]


In [7]:

# Save the TTS output to a WAV file
torchaudio.save("output_hi_tts.wav", out_wav, sample_rate=24000)


In [8]:
import os
import torch
import torchaudio
from tqdm import tqdm
from underthesea import sent_tokenize

# Create eval_set directory if it doesn't exist
os.makedirs("eval_set", exist_ok=True)

# 50 Simple Hindi Sentences
hindi_sentences = [
    "हाँ",  
    "नहीं",
    "ठीक है",
    "धन्यवाद",
    "कृपया",
    "माफ कीजिए",
    "रुकिए",
    "जल्दी करो",
    "समझ गया",
    "फिर से बोलो",
    "मदद चाहिए",
    "बहुत अच्छा",
    "यह सही है",
    "गलत है",
    "क्या हुआ?",
    "कैसे हो?",
    "मैं ठीक हूँ",
    "आपका नाम क्या है?",
    "मेरा नाम है",
    "समय क्या हुआ है?",
    "आज कौन सा दिन है?",
    "मुझे याद दिलाओ",
    "बाद में",
    "अभी नहीं",
    "हो गया",
    "शुरू करो",
    "बंद करो",
    "चालू करो",
    "बढ़िया",
    "ऐसा मत करो",
    "यहाँ आओ",
    "वहाँ जाओ",
    "इंतज़ार करो",
    "जाने दो",
    "मुझे पसंद है",
    "मुझे पसंद नहीं",
    "क्या आप तैयार हैं?",
    "मैं तैयार हूँ",
    "खुशी की बात है",
    "दुख हुआ",
    "मैं थक गया हूँ",
    "आराम करो",
    "चिंता मत करो",
    "कोई बात नहीं",
    "मैं भूल गया",
    "याद आ गया",
    "ज़रूर",
    "कभी नहीं",
    "शायद",
    "बिल्कुल"
]

# Generate 50 audio files
for i in tqdm(range(50)):
    tts_text = hindi_sentences[i]
    
    # Tokenize text (though for short sentences this may not split)
    tts_texts = sent_tokenize(tts_text)
    
    # Generate audio
    wav_chunks = []
    for text in tts_texts:
        wav_chunk = XTTS_MODEL.inference(
            text=text,
            language=lang,
            gpt_cond_latent=gpt_cond_latent,
            speaker_embedding=speaker_embedding,
            temperature=0.1,
            length_penalty=1.0,
            repetition_penalty=10.0,
            top_k=10,
            top_p=0.3,
        )
        wav_chunks.append(torch.tensor(wav_chunk["wav"]))
    
    out_wav = torch.cat(wav_chunks, dim=0).unsqueeze(0).cpu()
    
    # Save file
    output_path = f"eval_set/sample_{i+1:02d}.wav"
    torchaudio.save(output_path, out_wav, sample_rate=24000)

print(f"Successfully generated 50 audio files in the eval_set directory.")

100%|██████████| 50/50 [00:17<00:00,  2.94it/s]

Successfully generated 50 audio files in the eval_set directory.





In [9]:

import os
from allosaurus.app import read_recognizer
from tqdm import tqdm

# Initialize Allosaurus
print("Loading Allosaurus model...")
phoneme_model = read_recognizer()
print("Model loaded successfully")

# Create directory for phoneme results
os.makedirs("phoneme_results", exist_ok=True)

# Process all 50 audio files
print("\nExtracting phonemes from audio files:")
results = []
for i in tqdm(range(1, 51)):
    audio_path = f"eval_set/sample_{i:02d}.wav"
    
    # Extract phonemes
    phonemes = phoneme_model.recognize(audio_path)  # 'hin' for Hindi
    
    # Save results
    results.append(f"sample_{i:02d}.wav: {phonemes}")
    
    # Also save to individual text files
    with open(f"phoneme_results/sample_{i:02d}.txt", "w", encoding='utf-8') as f:
        f.write(phonemes)

# Save all results to one file
with open("phoneme_results/all_phonemes.txt", "w", encoding='utf-8') as f:
    f.write("\n".join(results))

# Print first 5 results for verification
print("\nSample results (first 5 files):")
for result in results[:5]:
    print(result)

print("\nPhoneme extraction complete!")
print(f"Individual results saved in 'phoneme_results' folder")
print(f"Combined results saved in 'phoneme_results/all_phonemes.txt'")

Loading Allosaurus model...
Model loaded successfully

Extracting phonemes from audio files:


  0%|          | 0/50 [00:00<?, ?it/s]

100%|██████████| 50/50 [00:07<00:00,  6.95it/s]


Sample results (first 5 files):
sample_01.wav: ɔ w
sample_02.wav: l ə ɪ
sample_03.wav: ɒ l̪ m i ɛ
sample_04.wav: d ə ɴ ɲ a m ɒ l e
sample_05.wav: p ʏ ɾ ə b̤ e a

Phoneme extraction complete!
Individual results saved in 'phoneme_results' folder
Combined results saved in 'phoneme_results/all_phonemes.txt'





In [10]:
import os
from phonemizer import phonemize
from allosaurus.app import read_recognizer
from tqdm import tqdm

# Text sentences (same 50 as before)
hindi_sentences = [
    "हाँ", "नहीं", "ठीक है", "धन्यवाद", "कृपया",
    "माफ कीजिए", "रुकिए", "जल्दी करो", "समझ गया", "फिर से बोलो",
    "मदद चाहिए", "बहुत अच्छा", "यह सही है", "गलत है", "क्या हुआ?",
    "कैसे हो?", "मैं ठीक हूँ", "आपका नाम क्या है?", "मेरा नाम है", "समय क्या हुआ है?",
    "आज कौन सा दिन है?", "मुझे याद दिलाओ", "बाद में", "अभी नहीं", "हो गया",
    "शुरू करो", "बंद करो", "चालू करो", "बढ़िया", "ऐसा मत करो",
    "यहाँ आओ", "वहाँ जाओ", "इंतज़ार करो", "जाने दो", "मुझे पसंद है",
    "मुझे पसंद नहीं", "क्या आप तैयार हैं?", "मैं तैयार हूँ", "खुशी की बात है", "दुख हुआ",
    "मैं थक गया हूँ", "आराम करो", "चिंता मत करो", "कोई बात नहीं", "मैं भूल गया",
    "याद आ गया", "ज़रूर", "कभी नहीं", "शायद", "बिल्कुल"
]

# Initialize models
print("Loading models...")
phoneme_model = read_recognizer()  # Allosaurus
print("Allosaurus model loaded")

# Create results directory
os.makedirs("phoneme_comparison", exist_ok=True)

# Process all 50 sentences
results = []
for i in tqdm(range(50), desc="Processing sentences"):
    # Text to Phonemes (using Phonemizer)
    text_phonemes = phonemize(
        hindi_sentences[i],
        language='hi',
        backend='espeak',
        strip=True,
        preserve_punctuation=False,
        with_stress=True
    )
    
    # Audio to Phonemes (using Allosaurus)
    audio_path = f"eval_set/sample_{i+1:02d}.wav"
    audio_phonemes = phoneme_model.recognize(audio_path)
    
    # Store comparison
    results.append({
        'sentence': hindi_sentences[i],
        'text_phonemes': text_phonemes,
        'audio_phonemes': audio_phonemes,
        'match': text_phonemes.strip() == audio_phonemes.strip()
    })
    
    # Save to individual files
    with open(f"phoneme_comparison/sample_{i+1:02d}.txt", "w", encoding='utf-8') as f:
        f.write(f"Text: {hindi_sentences[i]}\n")
        f.write(f"Text Phonemes: {text_phonemes}\n")
        f.write(f"Audio Phonemes: {audio_phonemes}\n")
        f.write(f"Match: {'YES' if results[-1]['match'] else 'NO'}\n")

# Save comprehensive results
with open("phoneme_comparison/summary.csv", "w", encoding='utf-8') as f:
    f.write("ID,Sentence,TextPhonemes,AudioPhonemes,Match\n")
    for i, result in enumerate(results):
        f.write(f"{i+1},\"{result['sentence']}\",\"{result['text_phonemes']}\",\"{result['audio_phonemes']}\",{result['match']}\n")

# Print sample comparison
print("\nSample comparison (first 5 sentences):")
for i in range(5):
    print(f"\nSentence {i+1}: {results[i]['sentence']}")
    print(f"Text Phonemes: {results[i]['text_phonemes']}")
    print(f"Audio Phonemes: {results[i]['audio_phonemes']}")
    print(f"Match: {'✅' if results[i]['match'] else '❌'}")

print("\nProcessing complete!")
print(f"Individual comparisons saved in 'phoneme_comparison' folder")
print(f"Summary saved as 'phoneme_comparison/summary.csv'")

Loading models...
Allosaurus model loaded


Processing sentences:   2%|▏         | 1/50 [00:00<00:08,  6.05it/s]

Processing sentences: 100%|██████████| 50/50 [00:07<00:00,  6.28it/s]


Sample comparison (first 5 sentences):

Sentence 1: हाँ
Text Phonemes: hˈã
Audio Phonemes: ɔ w
Match: ❌

Sentence 2: नहीं
Text Phonemes: nˈʌhĩ
Audio Phonemes: l ə ɪ
Match: ❌

Sentence 3: ठीक है
Text Phonemes: ʈʰˈiːk hɛː
Audio Phonemes: ɒ l̪ m i ɛ
Match: ❌

Sentence 4: धन्यवाद
Text Phonemes: dʰˌənjəʋˈaːd
Audio Phonemes: d ə ɴ ɲ a m ɒ l e
Match: ❌

Sentence 5: कृपया
Text Phonemes: kɾˈɪpjˌaː
Audio Phonemes: p ʏ ɾ ə b̤ e a
Match: ❌

Processing complete!
Individual comparisons saved in 'phoneme_comparison' folder
Summary saved as 'phoneme_comparison/summary.csv'





In [11]:
!pip install resemblyzer


Defaulting to user installation because normal site-packages is not writeable


In [15]:
import sys
print("Python executable:", sys.executable)
print("sys.path:", sys.path)


Python executable: /var/lit2425/humanize/MMFM/.venv/bin/python
sys.path: ['/usr/lib/python310.zip', '/usr/lib/python3.10', '/usr/lib/python3.10/lib-dynload', '', '/var/lit2425/humanize/MMFM/.venv/lib/python3.10/site-packages', '/tmp/tmp5q9sfa3t']


In [16]:
!{sys.executable} -m pip install resemblyzer


Collecting resemblyzer
  Using cached Resemblyzer-0.1.4-py3-none-any.whl (15.7 MB)
Collecting typing
  Using cached typing-3.7.4.3-py3-none-any.whl
Collecting webrtcvad>=2.0.10
  Using cached webrtcvad-2.0.10-cp310-cp310-linux_x86_64.whl
Installing collected packages: webrtcvad, typing, resemblyzer
Successfully installed resemblyzer-0.1.4 typing-3.7.4.3 webrtcvad-2.0.10


In [17]:
from resemblyzer import VoiceEncoder, preprocess_wav
import numpy as np

In [18]:
# initialize the encode
encoder = VoiceEncoder()

# load and preprocess the synthesized file
wav_synth = preprocess_wav("output_hi_tts.wav")

# extract the embeding from it
synth_embedding = encoder.embed_utterance(wav_synth)


Loaded the voice encoder model on cuda in 0.02 seconds.


In [20]:
target_speaker_audio_file = "/var/lit2425/humanize/MMFM/XTTSv2-Finetuning-for-New-Languages/datasets/wavs/common_voice_hi_23795238.wav"

# Load and preprocess the target speaker audio file
wav_target = preprocess_wav(target_speaker_audio_file)

# Compute the target speaker embedding
target_embedding_resemblyzer = encoder.embed_utterance(wav_target)

cos_sim = np.dot(synth_embedding, target_embedding_resemblyzer) / (
    np.linalg.norm(synth_embedding) * np.linalg.norm(target_embedding_resemblyzer)
)
print("Cosine Similarity (SECS):", cos_sim)


Cosine Similarity (SECS): 0.75168955


In [21]:
import os
import numpy as np
from resemblyzer import VoiceEncoder, preprocess_wav

# Initialize the encoder
encoder = VoiceEncoder()

# Define the target speaker file and get embedding 
target_speaker_audio_file = "/var/lit2425/humanize/MMFM/XTTSv2-Finetuning-for-New-Languages/datasets/wavs/common_voice_hi_23795238.wav"
wav_target = preprocess_wav(target_speaker_audio_file)
target_embedding_resemblyzer = encoder.embed_utterance(wav_target)

# Directory of samples
eval_set_dir = "eval_set"
cosine_similarities = []

for i in range(1, 51):
    sample_path = os.path.join(eval_set_dir, f"sample_{i:02d}.wav")
    
    # Preprocess and extract embedding for the synthesized sample
    wav_sample = preprocess_wav(sample_path)
    sample_embedding = encoder.embed_utterance(wav_sample)
    
    # Compute cosine similarity
    cos_sim = np.dot(sample_embedding, target_embedding_resemblyzer) / (
        np.linalg.norm(sample_embedding) * np.linalg.norm(target_embedding_resemblyzer)
    )
    cosine_similarities.append(cos_sim)
    print(f"Sample {i:02d} - Cosine Similarity: {cos_sim:.4f}")

# Calculate aggregate metrics
mean_similarity = np.mean(cosine_similarities)
std_similarity = np.std(cosine_similarities)
print(f"\nMean Cosine Similarity: {mean_similarity:.4f}")
print(f"Standard Deviation: {std_similarity:.4f}")

# Write the results to a CSV file for record-keeping
import csv
with open("secs_evaluation_results.csv", mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Sample", "Cosine Similarity"])
    for i, sim in enumerate(cosine_similarities, start=1):
        writer.writerow([f"sample_{i:02d}.wav", sim])


Loaded the voice encoder model on cuda in 0.02 seconds.
Sample 01 - Cosine Similarity: 0.5054
Sample 02 - Cosine Similarity: 0.5838
Sample 03 - Cosine Similarity: 0.7039
Sample 04 - Cosine Similarity: 0.7433
Sample 05 - Cosine Similarity: 0.7084
Sample 06 - Cosine Similarity: 0.7524
Sample 07 - Cosine Similarity: 0.6801
Sample 08 - Cosine Similarity: 0.7174
Sample 09 - Cosine Similarity: 0.7223
Sample 10 - Cosine Similarity: 0.7294
Sample 11 - Cosine Similarity: 0.7322
Sample 12 - Cosine Similarity: 0.7653
Sample 13 - Cosine Similarity: 0.7274
Sample 14 - Cosine Similarity: 0.7488
Sample 15 - Cosine Similarity: 0.7884
Sample 16 - Cosine Similarity: 0.7685
Sample 17 - Cosine Similarity: 0.7403
Sample 18 - Cosine Similarity: 0.8174
Sample 19 - Cosine Similarity: 0.7497
Sample 20 - Cosine Similarity: 0.8136
Sample 21 - Cosine Similarity: 0.7601
Sample 22 - Cosine Similarity: 0.7832
Sample 23 - Cosine Similarity: 0.7190
Sample 24 - Cosine Similarity: 0.7565
Sample 25 - Cosine Similarity: 0

In [22]:
!pip install librosa


Defaulting to user installation because normal site-packages is not writeable


In [29]:
import os
import librosa
import numpy as np
import math

sr = 24000                # Sample rate used in your TTS outputs
n_mfcc = 13               # Number of MFCC coefficients (e.g., 13 is common)
n_fft = 1024              # FFT window size
hop_length = 256          # Hop length (should match your TTS settings)


In [31]:
def compute_mcd(mfcc_ref, mfcc_synth):
    """
    Computes the Mel Cepstral Distortion (MCD) between two aligned MFCC matrices.
    
    Parameters:
      mfcc_ref: np.ndarray of shape (n_mfcc, T) for the reference audio.
      mfcc_synth: np.ndarray of shape (n_mfcc, T) for the synthesized audio.
      
    Returns:
      mcd_value: float, the average MCD over all aligned frames.
    """
    # Calculate the difference between the two matrices
    diff = mfcc_ref - mfcc_synth  # shape: (n_mfcc, number_of_aligned_frames)
    # For each frame, compute √(2 * sum_{m}(difference^2))
    framewise_mcd = np.sqrt(2 * np.sum(diff**2, axis=0))
    mcd_constant = 10 / math.log(10)
    mcd_value = mcd_constant * np.mean(framewise_mcd)
    return mcd_value


In [32]:
# Path to your synthesized output
synth_audio_path = "output_hi_tts.wav"

# Load the synthesized audio and extract MFCCs
y_synth, _ = librosa.load(synth_audio_path, sr=sr)
mfcc_synth = mfcc_synth[1:, :]


In [33]:
# Folder containing your reference audio files
reference_folder = "/var/lit2425/humanize/MMFM/XTTSv2-Finetuning-for-New-Languages/datasets/wavs"

# List all .wav files in the folder
reference_files = [
    os.path.join(reference_folder, fname)
    for fname in os.listdir(reference_folder)
    if fname.lower().endswith(".wav")
]

mcd_values = {}  # To store MCD values for each file

# Ensure the synthesized MFCCs have the 0th coefficient removed too:
mfcc_synth_full = librosa.feature.mfcc(y=y_synth, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)
# Drop 0th coefficient: Typically, we use coefficients 1 through (n_mfcc-1)
mfcc_synth = mfcc_synth_full[1:, :]

for ref_file in reference_files:
    # Load and extract MFCCs from the reference audio file
    y_ref, _ = librosa.load(ref_file, sr=sr)
    mfcc_ref_full = librosa.feature.mfcc(y=y_ref, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)
    # Drop the 0th coefficient
    mfcc_ref = mfcc_ref_full[1:, :]
    
    # Align the MFCCs using DTW (Dynamic Time Warping)
    # D: cost matrix, wp: warping path as a list of (ref_index, synth_index)
    D, wp = librosa.sequence.dtw(X=mfcc_ref, Y=mfcc_synth, metric='euclidean')
    wp = np.array(wp)
    
    # Align the MFCC sequences according to the warping path
    mfcc_ref_aligned = mfcc_ref[:, wp[:, 0]]
    mfcc_synth_aligned = mfcc_synth[:, wp[:, 1]]
    
    # Compute MCD for the current reference-synthesized pair
    mcd_value = compute_mcd(mfcc_ref_aligned, mfcc_synth_aligned)
    
    # Save the value along with the file name
    mcd_values[ref_file] = mcd_value
    print(f"{ref_file} -> MCD: {mcd_value:.4f}")

    print("MFCC sample values:", mfcc_synth[:, 0])



/var/lit2425/humanize/MMFM/XTTSv2-Finetuning-for-New-Languages/datasets/wavs/common_voice_hi_24510939.wav -> MCD: 444.0093
MFCC sample values: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
/var/lit2425/humanize/MMFM/XTTSv2-Finetuning-for-New-Languages/datasets/wavs/common_voice_hi_24258336.wav -> MCD: 341.4555
MFCC sample values: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
/var/lit2425/humanize/MMFM/XTTSv2-Finetuning-for-New-Languages/datasets/wavs/common_voice_hi_27408205.wav -> MCD: 483.8989
MFCC sample values: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
/var/lit2425/humanize/MMFM/XTTSv2-Finetuning-for-New-Languages/datasets/wavs/common_voice_hi_26114703.wav -> MCD: 492.5141
MFCC sample values: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
/var/lit2425/humanize/MMFM/XTTSv2-Finetuning-for-New-Languages/datasets/wavs/common_voice_hi_24957722.wav -> MCD: 605.9939
MFCC sample values: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
/var/lit2425/humanize/MMFM/XTTSv2-Finetuning-for-New-Languages/datasets/wavs/common_voice_hi_29

KeyboardInterrupt: 

In [None]:
def compute_mcd(mfcc_ref, mfcc_synth):
    diff = mfcc_ref - mfcc_synth
    framewise_mcd = np.sqrt(2 * np.sum(diff**2, axis=0))
    mcd_constant = 10 / math.log(10)
    mcd_value = mcd_constant * np.mean(framewise_mcd)
    return mcd_value

In [28]:
# Calculate aggregate metrics (mean and standard deviation)
all_mcd = np.array(list(mcd_values.values()))
mean_mcd = np.mean(all_mcd)
std_mcd = np.std(all_mcd)
print(f"\nMean MCD over all reference files: {mean_mcd:.4f}")
print(f"Standard Deviation: {std_mcd:.4f}")

# Optionally, write the results to a CSV file
import csv
with open("mcd_evaluation_results.csv", mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Reference_File", "MCD"])
    for ref, mcd_val in mcd_values.items():
        writer.writerow([ref, mcd_val])



Mean MCD over all reference files: 768.7485
Standard Deviation: 184.4507
