# Mel-Cepstral Distortion (MCD)

### import libraries

In [5]:

import os
import math
import glob
import librosa
import pyworld
import pysptk
import numpy as np
import matplotlib.pyplot as plot


### func for loading wav files

In [7]:
SAMPLING_RATE = 22050
FRAME_PERIOD = 5.0

def load_wav(wav_file, sr):
    
    wav, _ = librosa.load(wav_file, sr=sr, mono=True)

    return wav

### func for the formulation of MCD

In [20]:
def MCD(x, y):
    log_spec_dB_const = 10.0 / math.log(10.0) * math.sqrt(2.0)
    diff = x - y
    
    return log_spec_dB_const * math.sqrt(np.inner(diff, diff))


### loading original and the synthesized speech data

In [8]:
Org_speech_wav_file_paths = glob.glob('./Data/A1/*')
Synth_speech_wav_file_paths = glob.glob('./Data/A2/*')

Org_speech_wav_file_paths[:]



['./Data/A1/A1.wav']

### func for extracting MCEP features (using WORLD analyzer)

In [15]:
SAMPLING_RATE = 22050
FRAME_PERIOD = 5.0

def MCEP(wavfile, mcep_target_directory, alpha=0.65, fft_size=512, mcep_size=34):
    
    if not os.path.exists(mcep_target_directory):
        os.makedirs(mcep_target_directory)

    loaded_wav_file = load_wav(wavfile, sr=SAMPLING_RATE)

   
    _, spectral_envelop, _ = pyworld.wav2world(loaded_wav_file.astype(np.double), fs=SAMPLING_RATE,
                                   frame_period=FRAME_PERIOD, fft_size=fft_size)

    
    mcep = pysptk.sptk.mcep(spectral_envelop, order=mcep_size, alpha=alpha, maxiter=0,
                           etype=1, eps=1.0E-8, min_det=0.0, itype=3)

    fname = os.path.basename(wavfile).split('.')[0]
    np.save(os.path.join(mcep_target_directory, fname + '.npy'),
            mcep,
            allow_pickle=False)


### storing the mcep features as npy files 

In [17]:
alpha = 0.65  
fft_size = 512
mcep_size = 34

dir_org_speech_wav = glob.glob('./Data/A1/*')
dir_org_speech_mcep = './Data/mceps_numpy/trg'
dir_converted_speech_wav = glob.glob('./Data/A2/*')
dir_converted_speech_mcep = './Data/mceps_numpy/conv'

for wav in dir_org_speech_wav:
    MCEP(wav, dir_org_speech_mcep, fft_size=fft_size, mcep_size=mcep_size)

for wav in dir_converted_speech_wav:
    MCEP(wav, dir_converted_speech_mcep, fft_size=fft_size, mcep_size=mcep_size)


### MCD calculation calculation using DTW ( Dynamic Time Warping )

In [18]:
def mcd_cal(mcep_org_files, mcep_synth_files, MCD):
    min_cost_tot = 0.0
    total_frames = 0
    
    for i in mcep_org_files:
        
        for j in mcep_synth_files:
            
            split_org_file,  split_synth_file = os.path.basename(i).split('_'), os.path.basename(j).split('_')
            org_speaker, org_speaker_id = split_org_file[0], split_org_file[-1]
            synth_speaker, synth_speaker_id = split_synth_file[0], split_synth_file[-1]
            
            
            if org_speaker==synth_speaker and org_speaker_id==synth_speaker_id:
                
                org_mcep_npy=np.load('./Data/mceps_numpy/trg/A1.npy')
            
                frame_no = len(org_mcep_npy)
                synth_mcep_npy = np.load('./Data/mceps_numpy/conv/A1.npy')
                
                min_cost, _ = librosa.sequence.dtw(org_mcep_npy[:, 1:].T, synth_mcep_npy[:, 1:].T, 
                                                   metric=MCD)
    
                min_cost_tot += np.mean(min_cost)
                
                total_frames += frame_no
                
    
    mcd = min_cost_tot/total_frames
    
    return mcd, total_frames

### Code execution for MCD calculation between original and converted speech

In [21]:
org_file = glob.glob('./Data/mceps_numpy/trg/*')
synth_file= glob.glob('./Data/mceps_numpy/conv/*')

cost_function = MCD

mcd, frames_used = mcd_cal(org_file, synth_file, cost_function)



print(f' MCD = {mcd} dB and total of frames {frames_used}')


 MCD = 7.420275694517665 dB and total of frames 811
