## Generate z datasets

## Import dependencies

In [None]:
# Ignore a bunch of deprecation warnings
import warnings
warnings.filterwarnings("ignore")
import tensorflow.compat.v2 as tf
import tensorflow_datasets as tfds 
import ddsp
import utils
import os
import gin
import pickle
import matplotlib
import IPython.display as ipd
import numpy as np
import soundfile as sf 
import matplotlib.pyplot as plt
import scipy as sp
import librosa
import librosa.display
 
%matplotlib inline
# Target sample rate in Hz; audio is resampled to this rate before feature extraction.
sample_rate = 16000

## Set the paths of the audio files and the model

In [None]:
# Path of the directory that contains the input audio files.
audio_dir = 'Datasets/Piano/Audio'

### Path of the model with a z encoder

In [None]:
# Directory of the model (checkpoint, gin config and dataset statistics live here).
model_dir_z = 'Pretrained_Models_for_T2/piano_ae'

# Path of dataset_statistics.pkl inside the model directory.
dataset_stats_file_z = os.path.join(model_dir_z, 'dataset_statistics.pkl')

# Path of operative_config-0.gin inside the model directory.
gin_file_z = os.path.join(model_dir_z, 'operative_config-0.gin')

## Preprocess audio files

In [None]:
def audio_features(audio):
    """Read one audio file and extract the features used as model input.

    The file is loaded with soundfile, reduced to its first channel,
    resampled to the module-level ``sample_rate`` and then analysed for
    fundamental frequency (CREPE) and loudness.

    Args:
        audio: Path of the audio file to read (passed to ``sf.read``).

    Returns:
        A dict with the keys ``'audio'`` (resampled signal with a leading
        axis of length 1), ``'f0_hz'``, ``'f0_confidence'`` and
        ``'loudness_db'``.
    """
    # Read from the argument rather than from a global `audio_path`.
    x_all, sr = sf.read(audio)  # data, sample rate
    print('shape of original signal:',np.shape(x_all),'\n','original sample rate:',sr)

    # soundfile returns (frames, channels) for multi-channel files; keep only
    # the first channel so the rest of the pipeline sees a 1-D signal.
    sig = x_all[:, 0] if np.ndim(x_all) > 1 else x_all

    # Resample (down-sampling to 16 kHz). Keyword arguments are accepted by
    # both old and new librosa releases; positional rates are rejected by
    # librosa >= 0.10.
    resampled = librosa.resample(sig, orig_sr=sr, target_sr=sample_rate)
    print('audio shape:',np.shape(resampled))
    batched_audio = resampled[np.newaxis, :]

    # Extract f0 with CREPE.
    # NOTE(review): f0 uses frame_rate=31.25 while loudness uses
    # frame_rate=250, so the two tracks presumably have different lengths for
    # the same audio -- confirm this is intended.
    ddsp.spectral_ops.reset_crepe()
    f0_crepe, f0_confidence = ddsp.spectral_ops.compute_f0(batched_audio[0],
                                                           sample_rate=sample_rate,
                                                           frame_rate=31.25,
                                                           viterbi=False)

    # Extract loudness.
    loudness = ddsp.spectral_ops.compute_loudness(batched_audio[0],
                                                  sample_rate=sample_rate,
                                                  frame_rate=250,
                                                  n_fft=2048,
                                                  ref_db=20.7,
                                                  use_tf=False)

    # Build the result under a name that does not shadow this function, and
    # without the misspelled 'loundness_db' key the original left behind.
    features = {
        'audio': batched_audio,
        'f0_hz': f0_crepe,
        'f0_confidence': f0_confidence,
        'loudness_db': loudness,
    }

    return features

In [None]:
feature_datasets = []

# Extract features for every file in the audio directory. The listing is
# sorted so the order of `feature_datasets` is reproducible between runs.
for file_name in sorted(os.listdir(audio_dir)):
    audio_path = os.path.join(audio_dir, file_name)
    # Skip sub-directories and hidden files (e.g. .DS_Store), which are not audio.
    if file_name.startswith('.') or not os.path.isfile(audio_path):
        continue
    feature_datasets.append(audio_features(audio_path))

print(np.shape(feature_datasets))

## Load the model

In [None]:
# Show the three model-related paths, one per line.
print(model_dir_z, dataset_stats_file_z, gin_file_z, sep=' \n ')

In [None]:
# Load the dataset statistics.
# Default to None so the name exists even when the file is missing or unreadable.
DATASET_STATS_Z = None
print(f'Loading dataset statistics from {dataset_stats_file_z}')
try:
  if tf.io.gfile.exists(dataset_stats_file_z):
    with tf.io.gfile.GFile(dataset_stats_file_z, 'rb') as f:
      # NOTE: unpickling can execute arbitrary code; only load statistics
      # files that come from a trusted model directory.
      DATASET_STATS_Z = pickle.load(f)
except Exception as err:
  # Best effort: the statistics are optional, so report the failure and go on.
  print('Loading dataset statistics from pickle failed: {}.'.format(err),'\n')


# Parse gin config,
with gin.unlock_config():
  gin.parse_config_file(gin_file_z, skip_unknown=True)

# Assumes only one checkpoint in the folder, 'ckpt-[iter]`.
ckpt_files = [f for f in tf.io.gfile.listdir(model_dir_z) if 'ckpt' in f]
if not ckpt_files:
  raise FileNotFoundError(f'No checkpoint file found in {model_dir_z}')
ckpt_name = ckpt_files[0].split('.')[0]
ckpt = os.path.join(model_dir_z, ckpt_name)

# Ensure dimensions and sampling rates are equal
time_steps_train = gin.query_parameter('F0LoudnessPreprocessor.time_steps')
n_samples_train = gin.query_parameter('Harmonic.n_samples')
hop_size = int(n_samples_train / time_steps_train)

# The model is configured for one fixed length, so size it for the shortest
# clip; each entry's 'audio' is stored as (1, n_samples_in_clip).
if not feature_datasets:
  raise ValueError('feature_datasets is empty; extract audio features first.')
n_audio_samples = min(features['audio'].shape[-1] for features in feature_datasets)
time_steps = int(n_audio_samples / hop_size)
n_samples = time_steps * hop_size

# print("===Trained model===")
# print("Time Steps", time_steps_train)
# print("Samples", n_samples_train)
# print("Hop Size", hop_size)
# print("\n===Resynthesis===")
# print("Time Steps", time_steps)
# print("Samples", n_samples)
# print('')

gin_params = [
    'Harmonic.n_samples = {}'.format(n_samples),
    'FilteredNoise.n_samples = {}'.format(n_samples),
    'F0LoudnessPreprocessor.time_steps = {}'.format(time_steps),
    'oscillator_bank.use_angular_cumsum = True',  # Avoids cumsum accumulation errors.
]

with gin.unlock_config():
  gin.parse_config(gin_params)

# Set up the model just to predict audio given new conditioning
model = ddsp.training.models.Autoencoder()
model.restore(ckpt)

## Generate z datasets

In [None]:
# Trim all input vectors to correct lengths.
# The loop variable is not called `audio_features`, so the function of that
# name stays usable after this cell runs.
for features in feature_datasets:
    for key in ['f0_hz', 'f0_confidence', 'loudness_db']:
        features[key] = features[key][:time_steps]
    # 'audio' is stored with a leading axis of length 1, so trim along the
    # last (sample) axis rather than the first.
    features['audio'] = features['audio'][..., :n_samples]

In [None]:
# Run every feature dictionary through the restored model and collect the
# latent 'z' it reports in its outputs.
z_datasets = []
for features in feature_datasets:
    # Run the forward pass, add losses, and create a dictionary of outputs.
    outputs = model(features, training=False)
    z_datasets.append(outputs['z'])

print(np.shape(z_datasets))