In [110]:
import os
import math
import glob
import librosa
import pyworld
import pysptk
import numpy as np
import matplotlib.pyplot as plot

### paths of audio files

In [111]:
train_A_dir= './Data/org'
train_B_dir='./Data/tar'

### functions to load files

In [112]:

def load_wavs(wav_dir, sr):
    
    wavs = []  
    ori_wav = os.listdir(wav_dir)
    for i in range(len(ori_wav)):
        file = ori_wav[i]
        file_path = os.path.join(wav_dir, file)
        wav, _ = librosa.load(file_path, sr = sr, mono = True)
        
        wavs.append(wav)

    return wavs

In [113]:
sampling_rate = 16000
num_mcep = 24
frame_period = 5.0
n_frames = 128


wavs_A = load_wavs(wav_dir = train_A_dir, sr = sampling_rate)
wavs_B = load_wavs(wav_dir = train_B_dir, sr = sampling_rate)


### WORLD Analyzer ( log F0 feature calculation )

In [114]:
def world_encode_data(wavs, fs, frame_period = 5.0, coded_dim = 24):

    f0s = []
    
    num_mcep = 24
    for i in range(len(wavs)):
        wav = wavs[i]
        wav = wav.astype(np.float64)
        f0, _  = pyworld.harvest(wav, fs, frame_period = frame_period, f0_floor = 71.0, f0_ceil = 800.0)
        #f0s.append(f0)
        #log_f0s_concatenated0 = np.ma.log(np.concatenate(f0s))
        

    return f0 #log_f0s_concatenated0

In [115]:

f0s_A = world_encode_data(wavs = wavs_A, fs = sampling_rate, frame_period = frame_period, coded_dim = num_mcep)
f0s_B = world_encode_data(wavs = wavs_B, fs = sampling_rate, frame_period = frame_period, coded_dim = num_mcep)

    

In [116]:
frame_len=0
if len(f0s_A)<len(f0s_B):
    frame_len=f0s_A
else:
    frame_len=f0s_B
    

In [117]:
def logf0_rmse(x, y):
    log_spec_dB_const = 1/len(frame_len)
    diff = x - y
    
    return log_spec_dB_const * math.sqrt(np.inner(diff, diff))


In [118]:
cost_function = logf0_rmse

### using dynamic time warping (DTW) F0 RMSE calculation

In [119]:
min_cost, _ = librosa.sequence.dtw(f0s_A[:].T, f0s_B[:].T, 
                                                   metric=cost_function)
min_cost_tot=0
min_cost_tot += np.mean(min_cost)

In [120]:
min_cost_tot

23.363098021922227

In [108]:
#mean_logf0_rmse = min_cost_tot/len(frame_len)
#mean_logf0_rmse