In [5]:
# from utils import process_data
# from data_objects.utterance import Utterance
import os, yaml, sys, random, torchaudio
import pyworld as pw
import soundfile as sf
sys.path.insert(1, '/homes/bdoc3/my_utils')
from audio.worldvocoder import code_harmonic, sp_to_mfsc
from my_os import recursive_file_retrieval

In [7]:
import numpy as np
import math, librosa, warnings, pdb

# UTTERANCE CLASS

class Utterance:
    """One utterance whose features are computed from audio or loaded from a .npy file.

    ``config.use_audio`` selects the source: when true, ``frames_fpath`` is read as
    audio with ``soundfile`` and converted to features by ``process_data``; otherwise
    ``frames_fpath`` is loaded with ``np.load`` as a precomputed feature array.
    ``wave_fpath`` is stored but not read by any method of this class.
    """

    def __init__(self, frames_fpath, wave_fpath, config, feat_params):
        self.frames_fpath = frames_fpath
        self.wave_fpath = wave_fpath
        self.config = config
        self.feat_params = feat_params

    def get_chunk(self, frames, n_frames, start=None):
        """Cut a window of ``n_frames`` entries along axis 0 of ``frames``.

        :param frames: array to crop (1-D samples or N-D features, first axis is time)
        :param n_frames: length of the window along the first axis
        :param start: first index of the window; drawn uniformly at random when None.
            Ignored (forced to 0) when ``frames`` is not longer than ``n_frames``.
        :return: the window and a ``(start, end)`` tuple. Inputs shorter than
            ``n_frames`` are padded on both sides with their minimum value so that
            the whole input is kept inside the window.
        """
        n_available = frames.shape[0]

        if n_available > n_frames:
            if start is None:
                # randint's upper bound is exclusive; +1 keeps the last valid window reachable.
                start = np.random.randint(0, n_available - n_frames + 1)
        else:
#             print(f'frames.shape[0] {frames.shape[0]}, n_frames {n_frames}')
            start = 0
            # Half of the shortfall on each side (rounded up) is enough to reach n_frames.
            pad_size = math.ceil((n_frames - n_available) / 2)
            # np.min raises on an empty array, so fall back to zero padding there.
            pad_value = np.min(frames) if frames.size else 0
            pad_vec = np.full((pad_size,) + frames.shape[1:], pad_value)
            frames = np.concatenate((pad_vec, frames, pad_vec))

        end = start + n_frames
        print('start', start)
        return frames[start:end], (start, end)

# get features, either from audio or precomputed npy arrays.
    def get_frames(self, n_frames, start=None):
        """Return ``n_frames`` feature frames and the ``(start, end)`` of the chunk used.

        :param n_frames: number of feature frames wanted
        :param start: optional fixed start index (in samples for audio, in frames for
            precomputed features); a random position is used when None
        :return: ``(frames, start_end)``
        :raises Exception: when no chunk could be processed without ``process_data``
            raising ValueError
        """
        if self.config.use_audio:
            # NOTE(review): the sample rate returned by sf.read is discarded, so the
            # audio is presumably already at feat_params['sr'] - confirm for each dataset.
            y, _ = sf.read(self.frames_fpath)
            samps_per_frame = (self.feat_params['frame_dur_ms']/1000) * self.feat_params['sr']
            required_size = int(samps_per_frame * n_frames)
            if y.shape[0] < 1:
                # '+2' at end is for f0_estimation vector
                frames = np.zeros((n_frames, (self.feat_params['num_feats']+self.feat_params['num_aper_feats']+2)))
                start_end = (0, required_size)
            else:
                # With a fixed start, or audio no longer than the window, get_chunk
                # returns the same chunk every time, so retrying cannot help.
                deterministic = start is not None or y.shape[0] <= required_size
                max_attempts = 10
                for _attempt in range(max_attempts):
                    try:
                        y_chunk, start_end = self.get_chunk(y, required_size, start)
                        frames = process_data(y_chunk.astype('double'), self.feat_params, self.config)
                        break
                    except ValueError as e:
                        if deterministic:
                            raise Exception(f'ValueError: {e}. The requested chunk cannot be re-drawn for uttr: {self.frames_fpath}') from e
                        print(f'ValueError: {e}. Trying another random chunk from uttr: {self.frames_fpath}')
                else:
                    raise Exception(f'Could not find vocal segments after randomly selecting 10 segments of length {n_frames}.')

        else:
            frames = np.load(self.frames_fpath)
            # Forward `start` so specific_partial also works on precomputed features.
            frames, start_end = self.get_chunk(frames, n_frames, start)
        # print('another utterance processed')
        return frames[:n_frames], start_end

    def random_partial(self, n_frames, num_feats):
        """
        Crops the frames into a partial utterance of n_frames at a random position

        :param n_frames: The number of frames of the partial utterance
        :param num_feats: The number of leading feature columns to keep
        :return: the partial utterance frames and a tuple indicating the start and end of the
        partial utterance in the complete utterance.
        """
        frames, start_end = self.get_frames(n_frames)
        return frames[:, :num_feats], start_end

    def specific_partial(self, n_frames, num_feats, start):
        """
        Crops the frames into a partial utterance of n_frames beginning at ``start``

        :param n_frames: The number of frames of the partial utterance
        :param num_feats: The number of leading feature columns to keep
        :param start: Index at which the partial utterance begins
        :return: the partial utterance frames and a tuple indicating the start and end of the
        partial utterance in the complete utterance.
        """
        frames, start_end = self.get_frames(n_frames, start)
        return frames[:, :num_feats], start_end

def process_data(y, feat_params, config):
    """Extract WORLD vocoder features from a waveform and reduce their dimensionality.

    :param y: waveform passed to pyworld (callers in this file cast it to double)
    :param feat_params: dict providing 'sr', 'frame_dur_ms', 'fmin', 'fmax',
        'num_feats' and 'num_aper_feats'
    :param config: object with attributes ``use_wav2world``, ``f0_extract``
        ('harvest' or 'dio') and ``dim_red_method`` ('code-h', 'world' or 'chandna')
    :return: per-frame array made of the reduced harmonic features, the reduced
        aperiodic features and the two columns produced by ``freq_to_vuv_midi``,
        concatenated along axis 1
    :raises ValueError: propagated from ``freq_to_vuv_midi`` when no pitch is detected
    :raises Exception: when ``f0_extract`` or ``dim_red_method`` is not recognised
    """
    if config.use_wav2world:
        # wav2world returns (f0, spectral envelope, aperiodicity).
        refined_f0, harm, aper = pw.wav2world(y, feat_params['sr'], frame_period=feat_params['frame_dur_ms'])
    else:
        if config.f0_extract == 'harvest':
            f0, t_stamp = pw.harvest(y, feat_params['sr'], feat_params['fmin'], feat_params['fmax'], feat_params['frame_dur_ms'])
        elif config.f0_extract == 'dio':
            f0, t_stamp = pw.dio(y, feat_params['sr'], feat_params['fmin'], feat_params['fmax'], frame_period=feat_params['frame_dur_ms'])
        else:
            # Not a ValueError: callers treat ValueError as "retry with another chunk".
            raise Exception("The value for f0_extract was not recognised")
        refined_f0 = pw.stonemask(y, f0, t_stamp, feat_params['sr'])
        harm = pw.cheaptrick(y, refined_f0, t_stamp, feat_params['sr'], f0_floor=feat_params['fmin'])
        aper = pw.d4c(y, refined_f0, t_stamp, feat_params['sr'])
#     pdb.set_trace()
    refined_f0 = freq_to_vuv_midi(refined_f0) # <<< this can be done at training time

    # print('basic harm/aper/f0 features extracted')

    if config.dim_red_method == 'code-h':
        harm = code_harmonic(harm, feat_params['num_feats'])
        aper = code_harmonic(aper, feat_params['num_aper_feats'])
    elif config.dim_red_method == 'world':
        harm = pw.code_spectral_envelope(harm, feat_params['sr'], feat_params['num_feats'])
        # code_aperiodicity takes the sample rate, not a feature count.
        aper = pw.code_aperiodicity(aper, feat_params['sr'])
    elif config.dim_red_method == 'chandna':
        harm = 10*np.log10(harm) # previously, using these logs was a separate optional process to 'chandna'
        aper = 10*np.log10(aper**2)
        harm = sp_to_mfsc(harm, feat_params['num_feats'], 0.45)
        # Falls back to the previously hard-coded 4 bands when the key is absent.
        aper = sp_to_mfsc(aper, feat_params.get('num_aper_feats', 4), 0.45)
    else:
        raise Exception("The value for dim_red_method was not recognised")
    # print(f'{random.randint(0,100)}feature dims reduced')

    out_feats = np.concatenate((harm, aper, refined_f0), axis=1)

    return out_feats


def nan_helper(y):
    """Locate the infinite entries of an array and supply a mask-to-index converter.

    Despite the name, the mask is built with ``np.isinf``: it marks +/-inf
    entries and leaves NaN entries unmarked.

    Input:
        - y, numpy array with possible infinite values
    Output:
        - mask, boolean array that is True where y is infinite
        - to_positions, a function mapping a boolean mask to the first-axis
          indices of its True entries
    Example:
        >>> # linear interpolation over the infinite entries
        >>> mask, to_positions = nan_helper(y)
        >>> y[mask] = np.interp(to_positions(mask), to_positions(~mask), y[~mask])
    """

    def to_positions(mask):
        return mask.nonzero()[0]

    infinite_mask = np.isinf(y)
    return infinite_mask, to_positions


def freq_to_vuv_midi(f0):
    """Convert f0 values in Hz to MIDI note numbers plus an unvoiced flag.

    Frames whose MIDI value is infinite (e.g. an f0 of 0 Hz) are filled by
    linear interpolation over the remaining frames and marked with 1 in the
    second output column; all other frames are marked with 0.

    :param f0: array of fundamental-frequency values
    :return: array of shape (len(f0), 2): MIDI pitch, unvoiced flag
    :raises ValueError: when every frame is unvoiced
    """
    with warnings.catch_warnings():
        # log2(0) emits a RuntimeWarning; the resulting -inf is dealt with below.
        warnings.simplefilter("ignore", category=RuntimeWarning)
        midi = 69 + 12 * np.log2(f0 / 440)

    unvoiced, to_positions = nan_helper(midi)
    if np.all(unvoiced):
        raise ValueError('No voice pitch detected in segment')

    # Record the flags before the gaps are overwritten by interpolation.
    unvoiced_flags = np.isinf(midi)
    voiced = ~unvoiced
    midi[unvoiced] = np.interp(to_positions(unvoiced), to_positions(voiced), midi[voiced])

    n_frames = len(midi)
    midi_column = np.array(midi).reshape([n_frames, 1])
    flag_column = np.array(unvoiced_flags).reshape([n_frames, 1])
    return np.concatenate((midi_column, flag_column), axis=-1)

In [8]:
# with open(os.path.join(data_path, 'feat_params.yaml'), 'rb') as Handle:
#     feat_params = yaml.load(Handle, Loader=yaml.FullLoader)

class Object:
    """Empty attribute container; settings are attached to instances after creation."""

In [9]:
# Run-time switches read by Utterance.get_frames and process_data.
config = Object()
vars(config).update(
    use_audio=True,
    use_wav2world=True,
    f0_extract='harvest',
    dim_red_method='chandna',
)

# Feature-extraction parameters; the first three entries mirror the config switches.
feat_params = dict(
    use_wav2world=config.use_wav2world,
    f0_extract=config.f0_extract,
    dim_red_method=config.dim_red_method,
    fmin=71,
    fmax=800,
    num_feats=40,
    num_aper_feats=4,
    frame_dur_ms=5,
    sr=16000,
    fft_size=None,
)

# Size of each partial utterance: frames per chunk and feature columns kept.
n_frames, num_feats = 307, 40

data_path = '/homes/bdoc3/my_data/audio_data/deslienced_concat_DAMP'

# issue_singers = ['1468732648'] #
# issue_singers = ['1006811699'] # 
# issue_singers = ['424702701'] # there is no audio data saved for this singer
issue_singers = ['1418829056']

# Resolve every singer's files once, so a missing or empty directory fails loudly
# instead of raising a bare StopIteration or spinning forever in the while loop.
singer_files = []
for i in issue_singers:
    dir_path = os.path.join(data_path, 'train', i)
    if not os.path.exists(dir_path):
        dir_path = os.path.join(data_path, 'val', i)
    _, _, files = next(os.walk(dir_path), (dir_path, [], []))
    singer_files.extend(os.path.join(dir_path, file_name) for file_name in files)

if not singer_files:
    raise FileNotFoundError(f'No audio files found for singers {issue_singers} under {data_path}')

uttrs = []
counter = 0
# Each pass visits every file; passes repeat until at least 3 partials were drawn.
while counter < 3:
    for file_path in singer_files:
        file_name = os.path.basename(file_path)
        print(f'Processing {file_name}, process {counter}')
        u = Utterance(file_path, file_path, config, feat_params)
        u_part = u.random_partial(n_frames, num_feats)
#         u_part = u.specific_partial(n_frames, num_feats, 31000)
        uttrs.append(u_part)
        counter += 1

Processing 1418829056_1704865349.wav, process 0
start 57
ValueError: No voice pitch detected in segment. Trying another random chunk from uttr: /homes/bdoc3/my_data/audio_data/deslienced_concat_DAMP/train/1418829056/1418829056_1704865349.wav
start 22105
ValueError: No voice pitch detected in segment. Trying another random chunk from uttr: /homes/bdoc3/my_data/audio_data/deslienced_concat_DAMP/train/1418829056/1418829056_1704865349.wav
start 8250
ValueError: No voice pitch detected in segment. Trying another random chunk from uttr: /homes/bdoc3/my_data/audio_data/deslienced_concat_DAMP/train/1418829056/1418829056_1704865349.wav
start 42645
ValueError: No voice pitch detected in segment. Trying another random chunk from uttr: /homes/bdoc3/my_data/audio_data/deslienced_concat_DAMP/train/1418829056/1418829056_1704865349.wav
start 2578
ValueError: No voice pitch detected in segment. Trying another random chunk from uttr: /homes/bdoc3/my_data/audio_data/deslienced_concat_DAMP/train/141882905

Exception: Could not find vocal segments after randomly selecting 10 segments of length 307.

## Use original dataset files

In [12]:
data_path = '/import/c4dm-datasets/DAMP_Intonation_Dataset/vocal_tracks'
_, all_fps = recursive_file_retrieval(data_path)

# issue_singers = ['1468732648'] #
# issue_singers = ['1006811699'] # 
# issue_singers = ['424702701'] # there is no audio data saved for this singer
s_id = '1418829056'

uttrs = []
counter = 0
for fp in all_fps:
    # Only paths containing the singer id are of interest.
    if s_id not in fp:
        continue
    # `counter` is shared across files, so the first matching file receives all
    # three draws and any later matches are skipped.
    while counter < 3:
        print(f'Processing {fp}, process {counter}')
        u = Utterance(fp, fp, config, feat_params)
        # u_part = u.specific_partial(n_frames, num_feats, 31000)
        u_part = u.random_partial(n_frames, num_feats)
        uttrs.append(u_part)
        counter += 1

Processing /import/c4dm-datasets/DAMP_Intonation_Dataset/vocal_tracks/1418829056_1704865349.m4a, process 0
start 2795860
Processing /import/c4dm-datasets/DAMP_Intonation_Dataset/vocal_tracks/1418829056_1704865349.m4a, process 1
start 4688143
Processing /import/c4dm-datasets/DAMP_Intonation_Dataset/vocal_tracks/1418829056_1704865349.m4a, process 2
start 5744079


## Timeit experiments

In [19]:
# data_path = '/homes/bdoc3/my_data/audio_data/deslienced_concat_DAMP'

# all_dirs, files = recursive_file_retrieval(data_path)
# wav_files = [f for f in files if f.endswith('wav')]

In [20]:
data_path = '/import/c4dm-datasets/DAMP_Intonation_Dataset/vocal_tracks'

all_dirs, files = recursive_file_retrieval(data_path)
# The list keeps the name `wav_files` although it holds the paths ending in 'm4a'.
wav_files = list(filter(lambda path: path.endswith('m4a'), files))

## Timeit results for loading files only

In [23]:
# %timeit librosa.load(wav_files[random.randint(0, len(wav_files))]) #  1.57 s ± 631 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# %timeit sf.read(wav_files[random.randint(0, len(wav_files))]) #53.8 ms ± 16.5 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# %timeit torchaudio.load(wav_files[random.randint(0, len(wav_files))]) #44.2 ms ± 23.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Timeit results for loading and processing files

In [22]:
%%timeit
# with librosa IO on wav - 1.98 s ± 220 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# with sf IO on wav - 264 ms ± 14.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# with sf IO on m4a - 367 ms ± 53.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# random.choice picks a valid element; randint(0, len(wav_files)) is inclusive at
# both ends and could index one past the end of the list.
file_path = random.choice(wav_files)
u = Utterance(file_path, file_path, config, feat_params)
u_part = u.random_partial(n_frames, num_feats)

start 2364070
start 4671904
start 551183
start 1747801
ValueError: No voice pitch detected in segment. Trying another random chunk from uttr: /import/c4dm-datasets/DAMP_Intonation_Dataset/vocal_tracks/313835696_1839888461.m4a
start 982368
start 505968
start 2330506
start 5386925
start 2066420
367 ms ± 53.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [57]:
# random.choice picks a valid element; randint(0, len(wav_files)) is inclusive at
# both ends and could index one past the end of the list.
file_path = random.choice(wav_files)
# Debug override: comment this line out to use the randomly chosen file instead.
file_path = '/homes/bdoc3/my_data/audio_data/deslienced_concat_DAMP/train/374399338/374399338_1741657119.wav'
u = Utterance(file_path, file_path, config, feat_params)
u.random_partial(307, 40)

start 664268


(array([[ -5.53413142,  -5.16761401,  -6.13489713, ..., -23.56622401,
         -24.75902111, -35.46205331],
        [ -7.44956002,  -8.1365492 ,  -7.68187535, ..., -22.74666417,
         -23.07679609, -32.91781073],
        [ -7.82786729,  -8.06915859,  -7.88852626, ..., -24.24993222,
         -24.08777826, -32.59682484],
        ...,
        [ -4.24098048,  -5.18928258,  -4.57552981, ..., -21.62389486,
         -20.34128273, -28.53943275],
        [ -4.33712282,  -5.15864007,  -4.64674877, ..., -21.21071475,
         -19.01734806, -30.59034519],
        [ -4.28971528,  -4.9698928 ,  -4.67096784, ..., -19.60396634,
         -22.53332584, -33.21100276]]),
 (664268, 713388))