# Generate z datasets

## Import dependencies

In [1]:
# Ignore a bunch of deprecation warnings
import warnings
warnings.filterwarnings("ignore")
import tensorflow.compat.v2 as tf
import tensorflow_datasets as tfds 
import ddsp
import utils
import os
import gin
import pickle
import matplotlib
import IPython.display as ipd
import numpy as np
import soundfile as sf 
import matplotlib.pyplot as plt
import scipy as sp
import librosa
import librosa.display
 
%matplotlib inline
# Working sample rate in Hz: input audio is resampled to this rate before
# the f0 / loudness features are extracted.
sample_rate = 16000



## Set the audio and model paths

### Model path with z encoder

In [10]:
# Directory of the pretrained autoencoder (the model variant with a z encoder).
model_dir_z = 'Pretrained_Models_for_T2/piano_ae'
# The folder name doubles as the model name.
model_name = os.path.basename(model_dir_z)

# dataset_statistics.pkl inside the model folder
dataset_stats_file_z = os.path.join(model_dir_z, 'dataset_statistics.pkl')

# operative_config-0.gin inside the model folder
gin_file_z = os.path.join(model_dir_z, 'operative_config-0.gin')

# Assumes only one checkpoint in the folder, 'ckpt-[iter]'.
# The listing is sorted so the choice is deterministic even if several
# checkpoints are present (listdir order is otherwise arbitrary).
ckpt_files = sorted(f for f in tf.io.gfile.listdir(model_dir_z) if 'ckpt' in f)
if not ckpt_files:
    # Fail early with a readable message instead of a bare IndexError below.
    raise FileNotFoundError(f"No 'ckpt' files found in model folder {model_dir_z!r}")
# Drop the file suffix ('ckpt-8400.index' -> 'ckpt-8400') to get the checkpoint prefix.
ckpt_name = ckpt_files[0].split('.')[0]
ckpt = os.path.join(model_dir_z, ckpt_name)

print(model_dir_z,'\n',dataset_stats_file_z,'\n',gin_file_z)
print('find checkpoint:',ckpt)

Pretrained_Models_for_T2/piano_ae 
 Pretrained_Models_for_T2/piano_ae/dataset_statistics.pkl 
 Pretrained_Models_for_T2/piano_ae/operative_config-0.gin
find checkpoint: Pretrained_Models_for_T2/piano_ae/ckpt-8400


### Audio directory

In [4]:
# Folder that holds the source recordings.
audio_dir = 'Datasets/Piano/Audio'

# Collect every .wav / .mp3 entry of the folder as a full path, in sorted order.
files = os.listdir(audio_dir)
audio_folder = sorted(
    os.path.join(audio_dir, entry)
    for entry in files
    if os.path.splitext(entry)[1] in ('.wav', '.mp3')
)
print(audio_folder)

audio_num = len(audio_folder)
print('\n number of audios:', audio_num)

['Datasets/Piano/Audio/Piano_01.wav', 'Datasets/Piano/Audio/Piano_02.wav', 'Datasets/Piano/Audio/Piano_03.wav', 'Datasets/Piano/Audio/Piano_04.wav', 'Datasets/Piano/Audio/Piano_05.wav', 'Datasets/Piano/Audio/Piano_06.wav', 'Datasets/Piano/Audio/Piano_07.wav', 'Datasets/Piano/Audio/Piano_08.wav', 'Datasets/Piano/Audio/Piano_09.wav', 'Datasets/Piano/Audio/Piano_10.wav', 'Datasets/Piano/Audio/Piano_11.wav', 'Datasets/Piano/Audio/Piano_12.wav', 'Datasets/Piano/Audio/Piano_13.wav', 'Datasets/Piano/Audio/Piano_14.wav', 'Datasets/Piano/Audio/Piano_15.wav']

 number of audios: 15


## Load the model

In [5]:
# Load the dataset statistics.
# NOTE: pickle.load can execute arbitrary code; only load statistics files
# that come from a trusted source.
print(f'Loading dataset statistics from {dataset_stats_file_z}')
# Defined up front so the name exists (as None) even when the file is
# missing or unreadable, instead of raising NameError on a later use.
DATASET_STATS_Z = None
try:
  if tf.io.gfile.exists(dataset_stats_file_z):
    with tf.io.gfile.GFile(dataset_stats_file_z, 'rb') as f:
      DATASET_STATS_Z = pickle.load(f)
  else:
    print('Dataset statistics file not found: {}.'.format(dataset_stats_file_z),'\n')
except Exception as err:
  # Best effort: a failed load is reported and DATASET_STATS_Z stays None.
  print('Loading dataset statistics from pickle failed: {}.'.format(err),'\n')

# Parse the gin config saved with the model and read the training-time sizes.
with gin.unlock_config():
  gin.parse_config_file(gin_file_z, skip_unknown=True)
time_steps_train = gin.query_parameter('F0LoudnessPreprocessor.time_steps')
n_samples_train = gin.query_parameter('Harmonic.n_samples')
# Number of audio samples per feature frame used during training.
hop_size = int(n_samples_train / time_steps_train)

# Set up the model just to predict audio given new conditioning
model = ddsp.training.models.Autoencoder()
model.restore(ckpt)

Loading dataset statistics from Pretrained_Models_for_T2/piano_ae/dataset_statistics.pkl


## Generate z datasets

In [6]:
# One z tensor per audio file. The buffer assumes every file produces a z of
# shape (1, 1000, 16); a different shape makes the assignment below fail.
z_datasets = np.zeros(shape=(audio_num,1,1000,16)) # shape of z =(1, 1000, 16)

for i, audio_path in enumerate(audio_folder):
    print('processing the audio',i+1,'/',audio_num,':',audio_path)

    x_all, sr = sf.read(audio_path) #data,samplerate
    # Multi-channel files are read as (frames, channels): keep the first
    # channel only. The copy makes the 1-D signal contiguous for librosa.
    sig = x_all[:, 0].copy() if x_all.ndim > 1 else x_all

    # Resample to the working sample rate (16 kHz). Keyword arguments are
    # required by recent librosa releases and accepted by older ones.
    audio = librosa.resample(sig, orig_sr=sr, target_sr=sample_rate)
    # Add a leading batch axis: (n,) -> (1, n)
    audio = audio[np.newaxis,:]

    # Extract f0 with CREPE
    # NOTE(review): f0 is computed at 31.25 frames/s while loudness uses
    # 250 frames/s -- confirm this difference is intended.
    ddsp.spectral_ops.reset_crepe()
    f0_crepe, f0_confidence = ddsp.spectral_ops.compute_f0(audio[0],
                                                           sample_rate= sample_rate,
                                                           frame_rate=31.25,
                                                           viterbi=False)
    # Extract loudness
    loudness =ddsp.spectral_ops.compute_loudness(audio[0],
                         sample_rate= sample_rate,
                         frame_rate=250,
                         n_fft=2048,
                         ref_db=20.7,
                         use_tf=False)

    # Feature dictionary handed to the model (built directly, so no stray
    # or misspelled keys end up in it).
    audio_features = {
        'audio': audio,
        'f0_hz': f0_crepe,
        'f0_confidence': f0_confidence,
        'loudness_db': loudness,
    }

    # Trim all input vectors to correct lengths
    time_steps = int(audio.shape[1] / hop_size)
    n_samples = time_steps * hop_size
    for key in ['f0_hz', 'f0_confidence', 'loudness_db']:
        audio_features[key] = audio_features[key][:time_steps]
    # audio is (1, n): trim along the sample axis, not the batch axis.
    audio_features['audio'] = audio_features['audio'][:, :n_samples]

    # Run the forward pass and read the latent z from the model outputs.
    outputs = model(audio_features, training=False)
    z_feature = outputs['z']
    print('shape of z feature',np.shape(z_feature))
    z_datasets[i] = z_feature

print('finished')

processing the audio 1 / 15 : Datasets/Piano/Audio/Piano_01.wav
processing the audio 2 / 15 : Datasets/Piano/Audio/Piano_02.wav
processing the audio 3 / 15 : Datasets/Piano/Audio/Piano_03.wav
processing the audio 4 / 15 : Datasets/Piano/Audio/Piano_04.wav
processing the audio 5 / 15 : Datasets/Piano/Audio/Piano_05.wav
processing the audio 6 / 15 : Datasets/Piano/Audio/Piano_06.wav
processing the audio 7 / 15 : Datasets/Piano/Audio/Piano_07.wav
processing the audio 8 / 15 : Datasets/Piano/Audio/Piano_08.wav
processing the audio 9 / 15 : Datasets/Piano/Audio/Piano_09.wav
processing the audio 10 / 15 : Datasets/Piano/Audio/Piano_10.wav
processing the audio 11 / 15 : Datasets/Piano/Audio/Piano_11.wav
processing the audio 12 / 15 : Datasets/Piano/Audio/Piano_12.wav
processing the audio 13 / 15 : Datasets/Piano/Audio/Piano_13.wav
processing the audio 14 / 15 : Datasets/Piano/Audio/Piano_14.wav
processing the audio 15 / 15 : Datasets/Piano/Audio/Piano_15.wav


### Save z datasets

In [9]:
# Save the collected z tensors as ./z_datasets/z_<model_name>.npy
save_path = './z_datasets/z_' + model_name + '.npy'
print(save_path)
# np.save does not create missing parent folders, so make sure the target
# folder exists before writing.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
np.save(save_path, z_datasets)
print(np.shape(z_datasets),'\n')

./z_datasets/z_piano_ae.npy
(15, 1, 1000, 16) 

