In [1]:
# from utils import process_data
# from data_objects.utterance import Utterance
import os, yaml, sys, random, torchaudio
import pyworld as pw
import soundfile as sf

# Drop every occurrence of the wavenet_vocoder checkout from the import path.
# The list is rebuilt (in place, via slice assignment) rather than calling
# sys.path.remove() inside a loop over sys.path: removing while iterating
# skips the element that follows each removal, so consecutive duplicates
# would survive.
sys.path[:] = [p for p in sys.path if p != '/homes/bdoc3/wavenet_vocoder']

# Make the local helper package importable for the imports that follow.
sys.path.insert(1, '/homes/bdoc3/my_utils')
from audio.worldvocoder import code_harmonic, sp_to_mfsc
from my_os import recursive_file_retrieval

In [14]:
import numpy as np
import math, librosa, warnings, pdb
from scipy.io import wavfile

"""Minimally altered code from https://github.com/Trebolium/Real-Time-Voice-Cloning/tree/master/encoder/data_objects"""

class Utterance:
    """One utterance on disk, from which fixed-length feature chunks are drawn.

    Features are either computed on the fly from the audio file
    (``config.use_audio`` is truthy) or loaded from a precomputed ``.npy``
    array of frames.
    """

    def __init__(self, frames_fpath, wave_fpath, config, feat_params):
        """
        :param frames_fpath: path that ``get_frames`` reads from (a wav file when
            ``config.use_audio`` is truthy, otherwise a ``.npy`` file)
        :param wave_fpath: path to the waveform; stored but not read in this class
        :param config: object exposing at least ``feats_type`` and ``use_audio``;
            the 'mel' feature type also reads ``sampling_rate``, ``fft_size``,
            ``fmin``, ``fmax`` and ``frame_dur_ms``
        :param feat_params: dict of feature parameters ('frame_dur_ms', 'sr',
            'num_harm_feats', 'num_aper_feats' are read in this class)
        """
        self.frames_fpath = frames_fpath
        self.wave_fpath = wave_fpath
        self.config = config
        self.feat_params = feat_params
        if config.feats_type == 'mel':
            num_total_feats = feat_params['num_harm_feats']
            # NOTE(review): `mel` is not defined or imported in the visible part of
            # this file - presumably librosa.filters.mel; confirm it is in scope.
            self.mel_filter = mel(config.sampling_rate, config.fft_size, fmin=config.fmin, fmax=config.fmax, n_mels=num_total_feats).T
            # Amplitude floor corresponding to -100 dB.
            self.min_level = np.exp(-100 / 20 * np.log(10))
            self.hop_size = int((self.config.frame_dur_ms/1000) * self.config.sampling_rate)

    def get_chunk(self, frames, n_frames, start=None):
        """
        Cut a window of ``n_frames`` rows out of ``frames``.

        If ``frames`` is longer than ``n_frames`` the window begins at ``start``
        (a random offset when ``start`` is None). Otherwise ``frames`` is padded
        on both sides with its minimum value so that it is centred in a window
        of at least ``n_frames`` rows, and the window begins at 0 (``start`` is
        ignored in that case).

        :param frames: array whose first axis is indexed by the window
        :param n_frames: window length along the first axis
        :param start: optional first index of the window
        :return: ``(chunk, (start, end))`` with ``end = start + n_frames``
        """
        if frames.shape[0] > n_frames:
            if start is None:
                start = np.random.randint(0, frames.shape[0] - n_frames)
        else:
            start = 0
            # Half of the shortfall on each side. The parentheses matter: without
            # them only frames.shape[0] was halved, which over-padded the front
            # and pushed real frames out of the returned window.
            pad_size = math.ceil((n_frames - frames.shape[0]) / 2)
            pad_shape = (pad_size,) + tuple(frames.shape[1:])
            pad_vec = np.full(pad_shape, np.min(frames), dtype=frames.dtype)
            frames = np.concatenate((pad_vec, frames, pad_vec))

        end = start + n_frames
        return frames[start:end], (start, end)

    def _zero_frames(self, n_frames):
        """Return the all-zero fallback feature array for ``n_frames`` frames."""
        # '+2' at end is for f0_estimation vectors
        # might need to alter if making aper gens conditional of config.use_aper_feats
        n_cols = self.feat_params['num_harm_feats'] + self.feat_params['num_aper_feats'] + 2
        return np.zeros((n_frames, n_cols))

    def get_frames(self, n_frames, start=None):
        """
        Get features, either from audio or from a precomputed npy array.

        :param n_frames: number of feature frames wanted
        :param start: optional start offset of the chunk. It indexes audio
            samples when features are computed from audio, and rows of the
            stored array when features are precomputed. None picks a random
            chunk.
        :return: ``(frames, (start, end))``; ``start``/``end`` are in the same
            unit as ``start`` above
        :raises ValueError: if ``config.feats_type`` is neither 'mel' nor 'world'
            when features must be computed from non-empty audio
        """
        if not self.config.use_audio:
            frames = np.load(self.frames_fpath)
            # Forward `start` so specific_partial is honoured for precomputed
            # features as well (it used to be silently ignored here).
            frames, start_end = self.get_chunk(frames, n_frames, start)
            return frames[:n_frames], start_end

        _, y = wavfile.read(self.frames_fpath)
        samps_per_frame = (self.feat_params['frame_dur_ms']/1000) * self.feat_params['sr']
        required_size = int(samps_per_frame * n_frames)

        if y.shape[0] < 1:
            # Empty audio file: nothing to extract.
            return self._zero_frames(n_frames), (0, required_size)

        feats_type = self.config.feats_type
        if feats_type not in ('mel', 'world'):
            # Fail with a clear message instead of an UnboundLocalError further down.
            raise ValueError(f"Unsupported feats_type: {feats_type!r}")

        # A fixed start always yields the same chunk, so retrying it is pointless.
        max_attempts = 10 if start is None else 1
        for _ in range(max_attempts):
            try:
                y_chunk, start_end = self.get_chunk(y, required_size, start)
                if feats_type == 'mel':
                    db_unnormed_melspec = audio_to_mel_autovc(y_chunk, self.config.fft_size, self.hop_size, self.mel_filter)
                    frames = db_normalize(db_unnormed_melspec, self.min_level)
                else:
                    frames = get_world_feats(y_chunk.astype('double'), self.feat_params, self.config)
                break
            except ValueError as e:
                print(f'ValueError: {e}. Trying another random chunk from uttr: {self.frames_fpath}')
        else:
            # Every attempt failed: fall back to silence-like features.
            print('Could not find vocal segments. Returning zero\'d array instead')
            frames = self._zero_frames(n_frames)
            start_end = (0, required_size)

        return frames[:n_frames], start_end

    def _partial(self, n_frames, num_total_feats, start=None):
        """Shared implementation of ``random_partial`` and ``specific_partial``."""
        frames, start_end = self.get_frames(n_frames, start)
        return frames[:, :num_total_feats], start_end

    def random_partial(self, n_frames, num_total_feats):
        """
        Crops the frames into a randomly positioned partial utterance of n_frames

        :param n_frames: The number of frames of the partial utterance
        :param num_total_feats: number of leading feature columns to keep
        :return: the partial utterance frames and a tuple indicating the start and end of the
        partial utterance in the complete utterance.
        """
        return self._partial(n_frames, num_total_feats)

    def specific_partial(self, n_frames, num_total_feats, start):
        """
        Crops the frames into a partial utterance of n_frames beginning at ``start``

        :param n_frames: The number of frames of the partial utterance
        :param num_total_feats: number of leading feature columns to keep
        :param start: start offset of the partial utterance (see ``get_frames``
            for its unit)
        :return: the partial utterance frames and a tuple indicating the start and end of the
        partial utterance in the complete utterance.
        """
        return self._partial(n_frames, num_total_feats, start)


def nan_helper(y):
    """Helper to handle indices and logical indices of non-finite values.

    Both NaN and +/-inf entries are flagged. The infinite case matters for
    log-scaled pitch, where a frequency of 0 maps to -inf; NaNs are flagged too
    so they cannot leak into a later interpolation.

    Input:
        - y, 1d numpy array with possible NaNs / infs
    Output:
        - nans, logical indices of the non-finite entries
        - index, a function, with signature indices= index(logical_indices),
          to convert logical indices to 'equivalent' integer indices
    Example:
        >>> # linear interpolation of non-finite values
        >>> nans, x= nan_helper(y)
        >>> y[nans]= np.interp(x(nans), x(~nans), y[~nans])
    """

    return ~np.isfinite(y), lambda z: z.nonzero()[0]


#Convert to midi notes, with second vector displaying 1 when there's no pitch detected
def freq_to_vuv_midi(f0):
    """Convert a pitch track to MIDI note numbers plus a no-pitch flag.

    Each value is mapped with ``69 + 12 * log2(f0 / 440)``. Entries flagged by
    ``nan_helper`` are filled by linear interpolation between their neighbours.

    :param f0: 1d numpy array of pitch values
    :return: array of shape ``(len(f0), 2)``; column 0 holds the (interpolated)
        MIDI values, column 1 is 1 where the raw MIDI value was infinite
        (e.g. ``f0 == 0``) and 0 elsewhere
    :raises ValueError: if every entry is flagged, i.e. no pitch was found
    """
    # log2 of a zero frequency emits a RuntimeWarning; it is expected here.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        midi = 69 + 12 * np.log2(f0 / 440)

    missing, to_index = nan_helper(midi)
    if np.all(missing):
        raise ValueError('No voice pitch detected in segment')

    # Record the flag before the gaps are overwritten by interpolation.
    no_pitch = np.isinf(midi)
    midi[missing] = np.interp(to_index(missing), to_index(~missing), midi[~missing])

    n_rows = len(midi)
    midi_col = np.array(midi).reshape([n_rows, 1])
    flag_col = np.array(no_pitch).reshape([n_rows, 1])
    return np.concatenate((midi_col, flag_col), axis=-1)

In [15]:
# with open(os.path.join(data_path, 'feat_params.yaml'), 'rb') as Handle:
#     feat_params = yaml.load(Handle, Loader=yaml.FullLoader)

class Object:
    """Empty attribute container; instances are filled in ad hoc (e.g. as a config)."""

In [4]:
config = Object()
config.use_audio = True
config.use_wav2world = True
config.f0_extract = 'harvest'
config.dim_red_method = 'chandna'
# Utterance.__init__ reads config.feats_type unconditionally; without it the
# constructor raises AttributeError. 'world' is the non-mel branch of get_frames.
config.feats_type = 'world'
feat_params = {"use_wav2world":config.use_wav2world,
                                "f0_extract":config.f0_extract,
                                "dim_red_method":config.dim_red_method,
                                "fmin":71,
                                "fmax":800,
                                'num_feats':40,
                                # Read by Utterance's zero-array fallback.
                                # Assumed equal to 'num_feats' - TODO confirm.
                                'num_harm_feats':40,
                                'num_aper_feats':4,
                                'frame_dur_ms':5,
                                'sr':16000,
                                'fft_size':None}
n_frames = 307
num_feats = 40

data_path = '/homes/bdoc3/my_data/audio_data/deslienced_concat_DAMP'

s_ids = ['1468732648'] #
# s_ids = ['1006811699'] # 
# s_ids = ['424702701'] # there is no audio data saved for this singer
# s_ids = ['1418829056']

# Resolve every singer's files once, looking in 'train' first and then 'val'.
file_paths = []
for s_id in s_ids:
    dir_path = os.path.join(data_path, 'train', s_id)
    if not os.path.exists(dir_path):
        dir_path = os.path.join(data_path, 'val', s_id)
        if not os.path.exists(dir_path):
            raise IOError('path doesn\'t exist')
    _, _, files = next(os.walk(dir_path))
    file_paths.extend(os.path.join(dir_path, file_name) for file_name in files)

# With no files the sampling loop below could never advance its counter.
if not file_paths:
    raise IOError(f'no files found for singers {s_ids} under {data_path}')

uttrs = []
counter = 0
# Keep cycling over the files until at least 3 partials have been drawn.
while counter<3:
    for file_path in file_paths:
        print(f'Processing {os.path.basename(file_path)}, process {counter}')
        u = Utterance(file_path, file_path, config, feat_params)
        u_part = u.random_partial(n_frames, num_feats)
#         u_part = u.specific_partial(n_frames, num_feats, 31000)
        uttrs.append(u_part)
        counter += 1

Processing 1468732648_1822074274.wav, process 0
start 1934796
Processing 1468732648_1822074274.wav, process 1
start 1735895
Processing 1468732648_1822074274.wav, process 2
start 598058


## Use original dataset files

In [12]:
data_path = '/import/c4dm-datasets/DAMP_Intonation_Dataset/vocal_tracks'
_, all_fps = recursive_file_retrieval(data_path)

# Singers that were looked at while debugging:
# issue_singers = ['1468732648'] #
# issue_singers = ['1006811699'] # 
# issue_singers = ['424702701'] # there is no audio data saved for this singer
s_id = '1418829056'

uttrs = []
counter = 0
for fp in all_fps:
    if s_id not in fp:
        continue
    # counter is never reset, so only the first matching file is sampled
    # (three times); later matches find counter already at 3.
    while counter < 3:
        print(f'Processing {fp}, process {counter}')
        u = Utterance(fp, fp, config, feat_params)
        u_part = u.random_partial(n_frames, num_feats)
        # u_part = u.specific_partial(n_frames, num_feats, 31000)
        uttrs.append(u_part)
        counter += 1

Processing /import/c4dm-datasets/DAMP_Intonation_Dataset/vocal_tracks/1418829056_1704865349.m4a, process 0
start 2795860
Processing /import/c4dm-datasets/DAMP_Intonation_Dataset/vocal_tracks/1418829056_1704865349.m4a, process 1
start 4688143
Processing /import/c4dm-datasets/DAMP_Intonation_Dataset/vocal_tracks/1418829056_1704865349.m4a, process 2
start 5744079


## Timeit experiments

In [10]:
# Gather every wav file under the de-silenced DAMP directory tree.
data_path = '/homes/bdoc3/my_data/audio_data/deslienced_concat_DAMP'

all_dirs, files = recursive_file_retrieval(data_path)
wav_files = list(filter(lambda path: path.endswith('wav'), files))

In [11]:
# Gather every m4a file under the original DAMP vocal-tracks directory tree.
data_path = '/import/c4dm-datasets/DAMP_Intonation_Dataset/vocal_tracks'

all_dirs, files = recursive_file_retrieval(data_path)
m4a_files = list(filter(lambda path: path.endswith('m4a'), files))

## Timeit results for loading files only

In [7]:
# %timeit librosa.load(wav_files[random.randint(0, len(wav_files))]) #  1.57 s ± 631 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# %timeit sf.read(wav_files[random.randint(0, len(wav_files))]) #53.8 ms ± 16.5 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# %timeit torchaudio.load(wav_files[random.randint(0, len(wav_files))]) #44.2 ms ± 23.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Timeit results for loading and processing files

In [21]:
%%timeit
# FOR HARM FEATS ONLY
# with librosa IO on wav - 1.98 s ± 220 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# with sf IO on wav - 264 ms ± 14.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# with sf IO on m4a - 367 ms ± 53.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# with scipy.io.wavfile (on wav, obviously) - 221 ms ± 97.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

num_feats = 44
# FOR HARM AND APER FEATS
# WITH  scipy.io.wavfile (on wav, obviously) - 

file_path = wav_files[random.randint(0, len(wav_files))]
u = Utterance(file_path, file_path, config, feat_params)
u_part = u.random_partial(n_frames, num_feats)

start 1951484
start 1072990
start 635304
start 232686
start 2397059
start 1415726
start 2148817
start 910972
235 ms ± 20.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [57]:
# Spot check on one fixed file. The earlier random pick was overwritten on the
# very next line (and its randint upper bound could index past the end of
# wav_files), so it has been dropped; use random.choice(wav_files) to sample
# a random file instead.
file_path = '/homes/bdoc3/my_data/audio_data/deslienced_concat_DAMP/train/374399338/374399338_1741657119.wav'
u = Utterance(file_path, file_path, config, feat_params)
# Last expression of the cell: displays (frames, (start, end)).
u.random_partial(307, 40)

start 664268


(array([[ -5.53413142,  -5.16761401,  -6.13489713, ..., -23.56622401,
         -24.75902111, -35.46205331],
        [ -7.44956002,  -8.1365492 ,  -7.68187535, ..., -22.74666417,
         -23.07679609, -32.91781073],
        [ -7.82786729,  -8.06915859,  -7.88852626, ..., -24.24993222,
         -24.08777826, -32.59682484],
        ...,
        [ -4.24098048,  -5.18928258,  -4.57552981, ..., -21.62389486,
         -20.34128273, -28.53943275],
        [ -4.33712282,  -5.15864007,  -4.64674877, ..., -21.21071475,
         -19.01734806, -30.59034519],
        [ -4.28971528,  -4.9698928 ,  -4.67096784, ..., -19.60396634,
         -22.53332584, -33.21100276]]),
 (664268, 713388))