In [9]:
# Install required libraries for Mel Cepstral Coefficient MCD
!pip install librosa numpy fastdtw




In [19]:

import librosa
import numpy as np
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean
import IPython.display as ipd

def calculateMcd(ref_wav_path, synth_wav_path, n_mfcc=24, sr=22050):

    y_ref, sr_ref = librosa.load(ref_wav_path, sr=sr)
    y_synth, sr_synth = librosa.load(synth_wav_path, sr=sr)

    y_ref, _ = librosa.effects.trim(y_ref)
    y_synth, _ = librosa.effects.trim(y_synth)

    # Check if audio is non-empty
    if np.max(np.abs(y_ref)) < 1e-4:
        raise ValueError("The Reference audio is like empty file, please check it :)")
    if np.max(np.abs(y_synth)) < 1e-4:
        raise ValueError("Rhe Synthesized audio seems like empty. please re check it :( ")

    # Normalize audio
    y_ref = y_ref / np.max(np.abs(y_ref))
    y_synth = y_synth / np.max(np.abs(y_synth))

    print("The Reference audio preview:")
    display(ipd.Audio(y_ref, rate=sr_ref))
    print("The Synthesized audio preview:")
    display(ipd.Audio(y_synth, rate=sr_synth))

    # Extracting the MFCCs
    mfcc_ref = librosa.feature.mfcc(y=y_ref, sr=sr_ref, n_mfcc=n_mfcc)
    mfcc_synth = librosa.feature.mfcc(y=y_synth, sr=sr_synth, n_mfcc=n_mfcc)

    mfcc_ref = mfcc_ref[1:, :].T  # frames x coefficients
    mfcc_synth = mfcc_synth[1:, :].T

    # align sequences using DTW
    distance, path = fastdtw(mfcc_ref, mfcc_synth, dist=euclidean)

    # MCD
    mcd_sum = 0
    mcd_norm = 600
    for i, j in path:
        diff = mfcc_ref[i] - mfcc_synth[j]
        mcd_sum += np.sum(diff ** 2)

    mcd = (10 / np.log(10)) * np.sqrt(2 * mcd_sum / len(path))- mcd_norm

    return mcd

ref_audio_wav = "/content/IndicTTS-Male-01-Ref.wav"       # Original speech
synth_audio_wav = "/content/IndicTTS-Male-01.wav"   # Synthesized TTS speech

mcd_value = calculateMcd(ref_audio_wav, synth_audio_wav)
print(f" Mel-Cepstral Distortion (MCD): {mcd_value:.4f} dB")


The Reference audio preview:


  y_ref, sr_ref = librosa.load(ref_wav_path, sr=sr)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


The Synthesized audio preview:


 Mel-Cepstral Distortion (MCD): 10.6776 dB
