# Generate z datasets

## Import dependencies

In [1]:
# Ignore a bunch of deprecation warnings
import warnings
warnings.filterwarnings("ignore")
import tensorflow.compat.v2 as tf
import tensorflow_datasets as tfds 
import ddsp
import utils
import os
import gin
import pickle
import matplotlib
import IPython.display as ipd
import numpy as np
import soundfile as sf 
import matplotlib.pyplot as plt
import scipy as sp
import librosa
import librosa.display
 
%matplotlib inline
# Target sample rate in Hz; every audio file is resampled to this rate
# before f0 / loudness extraction.
sample_rate = 16000



## Set the paths to the audio files and the model

### Model path with z encoder

In [2]:
# Folder that holds the pretrained model (the variant with a z encoder).
model_dir_z = 'Pretrained_Models_for_T2/piano_ae'
model_name = os.path.basename(model_dir_z)

# dataset_statistics.pkl inside the model folder
dataset_stats_file_z = os.path.join(model_dir_z, 'dataset_statistics.pkl')

# operative_config-0.gin inside the model folder
gin_file_z = os.path.join(model_dir_z, 'operative_config-0.gin')


def _ckpt_step(name):
    """Return the iteration encoded in a 'ckpt-[iter]' name, or -1 if there is none."""
    step = name.rsplit('-', 1)[-1]
    return int(step) if step.isdigit() else -1


# Checkpoint files are named 'ckpt-[iter].<suffix>'; strip the suffix to get
# the checkpoint prefix. Fail early with a readable message when the folder
# holds no checkpoint instead of hitting an IndexError further down.
ckpt_files = [f for f in tf.io.gfile.listdir(model_dir_z) if 'ckpt' in f]
if not ckpt_files:
    raise FileNotFoundError('no checkpoint file found in ' + model_dir_z)

# If several checkpoints are present, pick the one with the highest iteration
# so the choice does not depend on the directory listing order.
ckpt_name = max(sorted({f.split('.')[0] for f in ckpt_files}), key=_ckpt_step)
ckpt = os.path.join(model_dir_z, ckpt_name)

print(model_dir_z,'\n',dataset_stats_file_z,'\n',gin_file_z)
print('find checkpoint:',ckpt)

Pretrained_Models_for_T2/piano_ae 
 Pretrained_Models_for_T2/piano_ae/dataset_statistics.pkl 
 Pretrained_Models_for_T2/piano_ae/operative_config-0.gin
find checkpoint: Pretrained_Models_for_T2/piano_ae/ckpt-8400


### Audio directory

In [3]:
# Directory that contains the source recordings.
audio_dir = 'Datasets/Piano/Audio'

# Keep only the .wav / .mp3 entries, as full paths in alphabetical order.
files = os.listdir(audio_dir)
audio_folder = sorted(
    os.path.join(audio_dir, name)
    for name in files
    if os.path.splitext(name)[1] in ('.wav', '.mp3')
)

print(audio_folder)
audio_num = len(audio_folder)
print('\n number of audios:', audio_num)

['Datasets/Piano/Audio/Piano_01.wav', 'Datasets/Piano/Audio/Piano_02.wav', 'Datasets/Piano/Audio/Piano_03.wav', 'Datasets/Piano/Audio/Piano_04.wav', 'Datasets/Piano/Audio/Piano_05.wav', 'Datasets/Piano/Audio/Piano_06.wav', 'Datasets/Piano/Audio/Piano_07.wav', 'Datasets/Piano/Audio/Piano_08.wav', 'Datasets/Piano/Audio/Piano_09.wav', 'Datasets/Piano/Audio/Piano_10.wav', 'Datasets/Piano/Audio/Piano_11.wav', 'Datasets/Piano/Audio/Piano_12.wav', 'Datasets/Piano/Audio/Piano_13.wav', 'Datasets/Piano/Audio/Piano_14.wav', 'Datasets/Piano/Audio/Piano_15.wav']

 number of audios: 15


## Load the model

The model is loaded with the `utils.model_loading` function, which is called for each audio file in the generation loop below.

## Generate z datasets

In [None]:
# One z tensor per audio file; each z has shape (1, frame_rate*length_audio, 16).
z_datasets = np.zeros(shape=(audio_num,1,15000,16))

for i, audio_path in enumerate(audio_folder):
    print('processing the audio',i+1,'/',audio_num,':',audio_path)

    x_all, sr = sf.read(audio_path)  # data, samplerate

    # Keep only the first channel: multi-channel files are read as
    # (frames, channels), which would otherwise be resampled along the
    # channel axis.
    sig = x_all[:, 0] if np.ndim(x_all) > 1 else x_all

    # Resample to the working sample rate. Keyword arguments are required by
    # recent librosa releases and accepted by older ones. The whole signal is
    # used (no time cropping).
    sig_re = librosa.resample(sig, orig_sr=sr, target_sr=sample_rate)
    audio = sig_re[np.newaxis,:]  # add a leading batch axis

    # Extract f0 with CREPE
    ddsp.spectral_ops.reset_crepe()
    f0_crepe, f0_confidence = ddsp.spectral_ops.compute_f0(audio[0],
                                                           sample_rate=sample_rate,
                                                           frame_rate=250,
                                                           viterbi=False)
    # Extract loudness
    loudness = ddsp.spectral_ops.compute_loudness(audio[0],
                                                  sample_rate=sample_rate,
                                                  frame_rate=250,
                                                  n_fft=2048,
                                                  ref_db=20.7,
                                                  use_tf=False)

    # Feature dictionary handed to the model
    audio_features = {
        'audio': audio,
        'f0_hz': f0_crepe,
        'f0_confidence': f0_confidence,
        'loudness_db': loudness,
    }

    # Get the z feature of the audio: run the forward pass, add losses, and
    # create a dictionary of outputs. The model folder defined earlier in this
    # notebook is `model_dir_z` (there is no `model_dir`).
    outputs = utils.model_loading(audio, audio_features, model_dir_z, training = False)
    z_feature = outputs['z']
    print('shape of z feature',np.shape(z_feature))
    z_datasets[i] = z_feature

print('finished')

In [None]:
# List the entries returned by the last forward pass of the generation loop.
print(outputs.keys())
# NOTE(review): `model` is never assigned anywhere in this notebook, so the
# next line raises NameError on a fresh kernel. The model object has to be
# created first — TODO confirm how `utils` exposes it.
audio_gen = model.get_audio_from_outputs(outputs)

In [None]:
# Inspect the resynthesized audio and the features of the last processed file.
print(np.shape(audio_gen))
print(audio_features)
print(audio_features.keys())
last_audio = audio_features['audio']
print(last_audio, np.shape(last_audio))

# Shape of the f0 track given to the model, then of the one in its outputs.
for f0_track in (audio_features['f0_hz'], outputs['f0_hz']):
    print(np.shape(f0_track))

### Save z datasets

In [None]:
# Store the stacked z features as ./z_datasets/z_<model_name>.npy
save_path = './z_datasets/z_' + model_name + '.npy'
print(save_path)
# np.save does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
np.save(save_path, z_datasets)
print(np.shape(z_datasets),'\n')