## Setup Google Drive

In [None]:
import os
from google.colab import files, drive
# Mount Google Drive under /content/drive so the dataset files stored there can be read.
# force_remount=False leaves an already-mounted Drive in place.
drive.mount('/content/drive', force_remount=False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Access Harmonic NSynth Guitar Subset from Google Drive

In [None]:
# Folder on the mounted Drive that is expected to hold train.tfrecord, valid.tfrecord and test.tfrecord.
dataset_dir = '/content/drive/My Drive/new_data'

## Install Dependencies

First we install the required dependencies with `pip`.

In [None]:
# Colab-specific magic that selects the TensorFlow 2.x runtime.
%tensorflow_version 2.x
# Install DDSP pinned to 1.0.1 together with its data_preparation extras (-q: quiet, -U: upgrade).
!pip install -qU ddsp[data_preparation]==1.0.1

In [None]:
pip install git+https://github.com/fabiodimarco/tf-spectral-modeling-synthesis.git

Collecting git+https://github.com/fabiodimarco/tf-spectral-modeling-synthesis.git
  Cloning https://github.com/fabiodimarco/tf-spectral-modeling-synthesis.git to /tmp/pip-req-build-jp1a296s
  Running command git clone -q https://github.com/fabiodimarco/tf-spectral-modeling-synthesis.git /tmp/pip-req-build-jp1a296s
Building wheels for collected packages: tsms
  Building wheel for tsms (setup.py) ... [?25l[?25hdone
  Created wheel for tsms: filename=tsms-0.0.1-cp37-none-any.whl size=10497 sha256=fc5a9602071b05d8a57fa2dccf7687cecc1b9d360eff0005803b1ba110b26919
  Stored in directory: /tmp/pip-ephem-wheel-cache-q_tr9z88/wheels/6f/31/3b/a01e898e4e31b4151712a4c8f139310b8cc2f1eb69f4326291
Successfully built tsms


In [None]:
import tensorflow as tf
import ddsp.training.data as data
import tsms
import numpy as np
import pandas as pd
from tensorflow.keras.layers.experimental.preprocessing import Normalization
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import math

## Define DataProvider class

In [None]:
class CompleteTFRecordProvider(data.RecordProvider):
    """TFRecord data provider that parses every feature stored in an example.

    Args:
        file_pattern: Path or glob of the TFRecord file(s) to read.
        example_secs: Length of one example in seconds.
        sample_rate: Audio sample rate in Hz.
        frame_rate: Feature frame rate in Hz.
        map_func: Optional callable applied to each parsed feature dict; when
            None the parsed dict is returned unchanged.
    """

    def __init__(self,
                 file_pattern=None,
                 example_secs=4,
                 sample_rate=16000,
                 frame_rate=250,
                 map_func=None):
        # Records are read with tf.data.TFRecordDataset (handed to the base class).
        super().__init__(file_pattern, example_secs, sample_rate,
                         frame_rate, tf.data.TFRecordDataset)
        self._map_func = map_func

    def get_dataset(self, shuffle=True):
        """Build the dataset of parsed (and optionally mapped) examples.

        Args:
            shuffle: Forwarded to `tf.data.Dataset.list_files`; controls whether
                the matched file names are shuffled.

        Returns:
            A `tf.data.Dataset` of feature dicts, or of whatever `map_func`
            returns when one was supplied.
        """
        autotune = tf.data.experimental.AUTOTUNE

        def _decode(serialized):
            parsed = tf.io.parse_single_example(serialized, self.features_dict)
            if self._map_func is None:
                return parsed
            return self._map_func(parsed)

        file_names = tf.data.Dataset.list_files(self._file_pattern, shuffle=shuffle)
        # Read up to 40 files concurrently; deterministic=True keeps a fixed element order.
        records = file_names.interleave(
            map_func=self._data_format_map_fn,
            cycle_length=40,
            num_parallel_calls=autotune,
            deterministic=True)
        return records.map(_decode,
                           num_parallel_calls=autotune,
                           deterministic=True)

    @property
    def features_dict(self):
        """Parsing spec (feature name -> FixedLenFeature) for one serialized example."""
        # NOTE(review): _audio_length and _feature_length are not set in this class;
        # presumably the base class derives them from example_secs and the rates - confirm.
        n_samples = self._audio_length
        n_frames = self._feature_length

        def _fixed(shape, dtype):
            return tf.io.FixedLenFeature(shape, dtype=dtype)

        return {
            'sample_name': _fixed([1], tf.string),
            'instrument_id': _fixed([1], tf.int64),
            'note_number': _fixed([1], tf.int64),
            'velocity': _fixed([1], tf.int64),
            'instrument_source': _fixed([1], tf.int64),
            'qualities': _fixed([10], tf.int64),
            'audio': _fixed([n_samples], tf.float32),
            'f0_hz': _fixed([n_frames], tf.float32),
            'f0_confidence': _fixed([n_frames], tf.float32),
            'loudness_db': _fixed([n_frames], tf.float32),
            'f0_scaled': _fixed([n_frames], tf.float32),
            'ld_scaled': _fixed([n_frames], tf.float32),
            'z': _fixed([n_frames * 16], tf.float32),
            # The remaining entries are scalar strings holding serialized tensors.
            'f0_estimate': _fixed([], tf.string),
            'h_freq': _fixed([], tf.string),
            'h_mag': _fixed([], tf.string),
            'h_phase': _fixed([], tf.string),
        }


## Features Map


In [None]:
def features_map(features):
    """Derive the harmonic training features from one parsed TFRecord example.

    The serialized harmonic tracks (`h_freq`, `h_mag`, `h_phase`) are decoded,
    the frequencies are centred on multiples of the mean fundamental, the phase
    is expressed as a deviation from the phase generated from the frequencies,
    and each of the three feature maps is standardised per example.

    Args:
        features: Dict of parsed features; must contain 'sample_name',
            'instrument_id', 'note_number', 'velocity', 'h_freq', 'h_mag'
            and 'h_phase'.

    Returns:
        Dict with the identifying fields, the decoded tracks ('h_freq', 'h_mag',
        'h_phase'), the derived 'd_phase' and 'h_freq_centered', and their
        standardised versions ('h_freq_norm', 'h_mag_norm', 'd_phase_norm').
    """

    def _decode(serialized):
        # The tracks are parsed twice: the stored string decodes to another
        # serialized string tensor, which in turn decodes to the float32 data.
        inner = tf.io.parse_tensor(serialized, out_type=tf.string)
        return tf.io.parse_tensor(inner, out_type=tf.float32)

    def _standardize(tensor):
        # Zero mean / unit variance over the whole example. divide_no_nan yields 0
        # where the standard deviation is 0 (e.g. a constant or all-zero track)
        # instead of NaN/Inf, which would otherwise poison the training loss.
        centered = tensor - tf.reduce_mean(tensor)
        return tf.math.divide_no_nan(centered, tf.math.reduce_std(tensor))

    # Decode and add a leading axis of size 1 for the tsms calls below.
    # NOTE(review): assumes each track is (frames, harmonics) - confirm against the writer.
    h_freq = tf.expand_dims(_decode(features['h_freq']), axis=0)
    h_mag = tf.expand_dims(_decode(features['h_mag']), axis=0)
    h_phase = tf.expand_dims(_decode(features['h_phase']), axis=0)

    # Centre every harmonic track on k * mean(f0), k = 1..number of harmonics.
    f0 = tsms.core.harmonic_analysis_to_f0(h_freq, h_mag)
    f0_mean = tf.math.reduce_mean(f0, axis=1)
    harmonics = tf.shape(h_freq)[-1]
    harmonic_indices = tf.range(1, harmonics + 1, dtype=tf.float32)
    harmonic_indices = harmonic_indices[tf.newaxis, tf.newaxis, :]
    h_freq_centered = h_freq - (f0_mean * harmonic_indices)

    # Phase deviation between the stored phase and the phase generated from h_freq.
    g_phase = tsms.core.generate_phase(h_freq, sample_rate=16000, frame_step=64)
    d_phase = tsms.core.phase_diff(h_phase, g_phase)
    # unwrap d_phase from +/- pi to +/- 2*pi
    d_phase = tsms.core.phase_unwrap(d_phase, axis=1)
    d_phase = (d_phase + 2.0 * np.pi) % (4.0 * np.pi) - 2.0 * np.pi

    # Drop the leading axis again.
    h_freq = tf.squeeze(h_freq, axis=0)
    h_mag = tf.squeeze(h_mag, axis=0)
    h_phase = tf.squeeze(h_phase, axis=0)
    h_freq_centered = tf.squeeze(h_freq_centered, axis=0)
    d_phase = tf.squeeze(d_phase, axis=0)

    return {
        'sample_name': features['sample_name'],
        'instrument_id': features['instrument_id'],
        'note_number': features['note_number'],
        'velocity': features['velocity'],
        'h_freq': h_freq,
        'h_mag': h_mag,
        'h_phase': h_phase,
        'd_phase': d_phase,
        'h_freq_centered': h_freq_centered,
        'h_freq_norm': _standardize(h_freq_centered),
        'h_mag_norm': _standardize(h_mag),
        'd_phase_norm': _standardize(d_phase)
    }

## Create datasets

In [None]:
# Framing parameters shared by all three splits.
example_secs = 4
sample_rate = 16000
frame_rate = 250

# One TFRecord file per split inside dataset_dir.
train_tfrecord_file = os.path.join(dataset_dir, 'train.tfrecord')
valid_tfrecord_file = os.path.join(dataset_dir, 'valid.tfrecord')
test_tfrecord_file = os.path.join(dataset_dir, 'test.tfrecord')


def _make_provider(tfrecord_path):
    """Build a provider for one split using the shared settings and features_map."""
    return CompleteTFRecordProvider(
        file_pattern=tfrecord_path,
        example_secs=example_secs,
        sample_rate=sample_rate,
        frame_rate=frame_rate,
        map_func=features_map)


# Train split (file names are not shuffled).
train_data_provider = _make_provider(train_tfrecord_file)
train_dataset = train_data_provider.get_dataset(shuffle=False)

# Validation split.
valid_data_provider = _make_provider(valid_tfrecord_file)
valid_dataset = valid_data_provider.get_dataset(shuffle=False)

# Test split.
test_data_provider = _make_provider(test_tfrecord_file)
test_dataset = test_data_provider.get_dataset(shuffle=False)

## Filter datasets

In [None]:
def conditioning_function(sample):
    """Return the (note_number, instrument_id, velocity) triple of a sample dict."""
    pitch = sample['note_number']
    instrument = sample['instrument_id']
    velocity = sample['velocity']
    return pitch, instrument, velocity

def return_conditioning(dataset):
    """Map a dataset of sample dicts to their conditioning triples, preserving order."""
    autotune = tf.data.experimental.AUTOTUNE
    return dataset.map(conditioning_function,
                       num_parallel_calls=autotune,
                       deterministic=True)

def pad_function(sample):
    """Zero-pad the second axis of a rank-2 tensor at its end up to a width of 130."""
    # Number of columns still missing to reach 130 (the first axis is left untouched).
    missing = 130 - tf.shape(sample)[1]
    paddings = tf.convert_to_tensor([[0, 0], [0, missing]])
    return tf.pad(sample, paddings)
    

def pad_dataset(dataset):
    """Apply pad_function to every element of the dataset, preserving order."""
    autotune = tf.data.experimental.AUTOTUNE
    return dataset.map(pad_function,
                       num_parallel_calls=autotune,
                       deterministic=True)
    
def filter_instruments(sample):
    """Dataset predicate: True when the sample's instrument_id is not on the ban list."""
    banned_ids = tf.constant([6,11,13,19,23,25,30,48,49,51,64,71,79,80,82,90,92])
    current_id = tf.cast(sample['instrument_id'], banned_ids.dtype)
    # Compare the sample's id against every banned id; keep it only if nothing matches.
    matches = tf.equal(banned_ids, current_id)
    return tf.logical_not(tf.reduce_any(matches))

def filter_pitches(sample):
    """Dataset predicate: True when the sample's note_number is one of 40..88 inclusive."""
    pitches = tf.constant(list(range(40, 89)))
    current_note = tf.cast(sample['note_number'], pitches.dtype)
    # Keep the sample if its note matches any of the allowed pitches.
    matches = tf.equal(pitches, current_note)
    return tf.reduce_any(matches)

def input_preprocessing(dataset):
    """Filter a dataset of feature dicts and turn it into model-ready tuples.

    Samples are kept only if they pass `filter_instruments` and `filter_pitches`.
    Every feature map is padded with `pad_function`.

    All outputs are produced by a single `map`, so each example is read, parsed
    and filtered once. Building one mapped dataset per feature and zipping them
    would re-run the whole upstream pipeline once per branch.

    Args:
        dataset: `tf.data.Dataset` of dicts containing 'h_freq_centered', 'h_mag',
            'd_phase', 'h_freq_norm', 'h_mag_norm', 'd_phase_norm', 'note_number',
            'instrument_id' and 'velocity'.

    Returns:
        A `tf.data.Dataset` whose elements are
        (h_freq_norm, h_mag_norm, d_phase_norm, h_freq_centered, h_mag, d_phase,
         (note_number, instrument_id, velocity)), with the six feature maps padded.
    """

    def _assemble(sample):
        return (pad_function(sample['h_freq_norm']),
                pad_function(sample['h_mag_norm']),
                pad_function(sample['d_phase_norm']),
                pad_function(sample['h_freq_centered']),
                pad_function(sample['h_mag']),
                pad_function(sample['d_phase']),
                conditioning_function(sample))

    dataset_filter = dataset.filter(filter_instruments).filter(filter_pitches)
    return dataset_filter.map(_assemble,
                              num_parallel_calls=tf.data.experimental.AUTOTUNE,
                              deterministic=True)

In [None]:
# Filter and pad each split. Every element is a tuple of three normalised feature maps,
# three un-normalised feature maps and the (note_number, instrument_id, velocity) conditioning.
X_train = input_preprocessing(train_dataset)
X_valid = input_preprocessing(valid_dataset)
X_test = input_preprocessing(test_dataset)

## Build VAE Architecture

In [None]:
class Sampling(layers.Layer):
    """Reparameterisation layer: draws z from (z_mean, z_log_var)."""

    def call(self, inputs):
        """Return z_mean + exp(0.5 * z_log_var) * eps, with eps of shape (batch, dim)."""
        z_mean, z_log_var = inputs
        mean_shape = tf.shape(z_mean)
        # Noise is sampled with shape (batch, dim), taken from the first two axes of z_mean.
        noise = tf.keras.backend.random_normal(shape=(mean_shape[0], mean_shape[1]))
        std_dev = tf.exp(0.5 * z_log_var)
        return z_mean + std_dev * noise

class VAE(keras.Model):
    """Conditional variational auto-encoder with a custom training step.

    Args:
        encoder: Model mapping an input of shape (batch, 1001, 130, 3) to
            (z_mean, z_log_var, z).
        decoder: Model mapping [z, pitch, instrument, velocity] to a reconstruction.
        **kwargs: Forwarded to `keras.Model`.
    """

    def __init__(self, encoder, decoder, **kwargs):
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        # Running means of the three loss terms reported during training.
        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
        self.reconstruction_loss_tracker = keras.metrics.Mean(
            name="reconstruction_loss"
        )
        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")

    @property
    def metrics(self):
        """The loss trackers updated by `train_step`."""
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
        ]

    def train_step(self, data):
        """Run one optimisation step.

        Args:
            data: 7-tuple. Items 0-2 are stacked as the encoder input channels,
                items 3-5 as the reconstruction target channels, item 4 also
                weights the reconstruction error, and item 6 is the
                (pitch, instrument, velocity) conditioning tuple.

        Returns:
            Dict with the running means of "loss", "reconstruction_loss" and
            "kl_loss".
        """
        # Stack the three feature maps as channels on a new last axis. This gives the
        # same tensor as stacking on axis 0 and transposing for a single (1001, 130)
        # example, and additionally accepts batched (batch, 1001, 130) inputs.
        x = tf.reshape(tf.stack(data[:3], axis=-1), [-1, 1001, 130, 3])
        y = tf.reshape(tf.stack(data[3:6], axis=-1), [-1, 1001, 130, 3])
        conditioning = data[6]
        pitch = conditioning[0]
        instrument = conditioning[1]
        velocity = conditioning[2]
        mag = data[4]
        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(x)
            reconstruction = self.decoder([z, pitch, instrument, velocity])
            # Squared error averaged over the channel axis, weighted by `mag`,
            # summed over axes 1 and 2 and averaged over the batch.
            squared_error = keras.losses.mean_squared_error(y, reconstruction)
            reconstruction_loss = tf.reduce_mean(
                tf.reduce_sum(tf.math.multiply(squared_error, mag), axis=(1, 2))
            )
            # KL divergence between N(z_mean, exp(z_log_var)) and N(0, 1).
            kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
            kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))
            total_loss = reconstruction_loss + kl_loss
        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        self.total_loss_tracker.update_state(total_loss)
        self.reconstruction_loss_tracker.update_state(reconstruction_loss)
        self.kl_loss_tracker.update_state(kl_loss)
        return {
            "loss": self.total_loss_tracker.result(),
            "reconstruction_loss": self.reconstruction_loss_tracker.result(),
            "kl_loss": self.kl_loss_tracker.result(),
        }

In [None]:
def build_encoder(latent_dim, lstm_dim, units=[32,32,64,64], kernel_sizes=[3,3,3,3], strides=[2,2,2,2]):
    """Build the convolutional + LSTM encoder of the VAE.

    Args:
        latent_dim: Size of the latent vector.
        lstm_dim: Number of units of the LSTM that summarises the conv features.
        units: Filters of each Conv2D stage.
        kernel_sizes: Kernel size of each Conv2D stage.
        strides: Stride of each Conv2D stage.

    Returns:
        A `keras.Model` named "encoder" mapping an input of shape
        (1001, 130, 3) to [z_mean, z_log_var, z].
    """
    encoder_inputs = keras.Input(shape=(1001, 130, 3))
    x = encoder_inputs
    for unit, kernel_size, stride in zip(units, kernel_sizes, strides):
        x = layers.Conv2D(unit, kernel_size, activation="relu", strides=stride, padding="same")(x)
    # Flatten everything except the first (downsampled) axis so the LSTM can run along it.
    x = layers.TimeDistributed(layers.Flatten())(x)
    # x = layers.TimeDistributed(layers.Dense(lstm_dim, activation="relu"))(x)
    # x = layers.Bidirectional(layers.LSTM(lstm_dim, activation="tanh", return_sequences=True, dropout=0.1))(x)
    # NOTE(review): an LSTM with an unbounded "relu" activation can overflow during
    # training; consider the default "tanh" if losses become NaN.
    x = layers.LSTM(lstm_dim, activation="relu", return_sequences=False, dropout=0.1)(x)
    # The posterior heads are linear: a ReLU here would force z_mean >= 0 and
    # z_log_var >= 0 (variance >= 1), so the posterior could never match N(0, 1).
    z_mean = layers.Dense(latent_dim, name="z_mean")(x)
    z_log_var = layers.Dense(latent_dim, name="z_log_var")(x)
    z = Sampling()([z_mean, z_log_var])
    encoder = keras.Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")
    encoder.summary()

    return encoder

def _conv_shape(strides, dim_size=[1001, 130, 3]):
    for i in strides:
        dim_size = [math.ceil(x / i) for x in dim_size]
    return dim_size

def build_decoder(latent_dim, lstm_dim, units=[32,32,64,64], kernel_sizes=[3,3,3,3], strides=[2,2,2,2]):
    """Build the conditional LSTM + transposed-convolution decoder of the VAE.

    The stage lists are given in encoder order and applied here in reverse.
    Reversed copies are used, so neither the caller's lists nor the default
    arguments are modified.

    Args:
        latent_dim: Size of the latent vector.
        lstm_dim: Number of LSTM units; must be divisible by the second entry
            of the downsampled shape, because the LSTM output is reshaped to it.
        units: Filters of each stage (encoder order).
        kernel_sizes: Kernel size of each stage (encoder order).
        strides: Stride of each stage (encoder order).

    Returns:
        A `keras.Model` named "decoder" mapping
        [latent, pitch, instrument, velocity] to an output with 3 channels.
    """
    conv_shape = _conv_shape(strides)
    units = list(reversed(units))
    kernel_sizes = list(reversed(kernel_sizes))
    strides = list(reversed(strides))

    latent_inputs = keras.Input(shape=(latent_dim,))
    pitch_inputs = keras.Input(shape=(1,))
    instrument_inputs = keras.Input(shape=(1,))
    velocity_inputs = keras.Input(shape=(1,))

    # Learned embeddings of the integer conditioning inputs (vocabulary sizes 128, 97 and 128).
    pitch_embeddings = layers.Flatten()(layers.Embedding(128, 64, input_length=1, name="pitch_emb")(pitch_inputs))
    instrument_embeddings = layers.Flatten()(layers.Embedding(97, 8, input_length=1, name="instrument_emb")(instrument_inputs))
    velocity_embeddings = layers.Flatten()(layers.Embedding(128, 4, input_length=1, name="vel_emb")(velocity_inputs))

    x = tf.keras.layers.Concatenate(axis=1)([latent_inputs, pitch_embeddings,
                                                  instrument_embeddings, velocity_embeddings])
    # x = layers.Dense(lstm_dim, activation="relu")(x)
    # Repeat the conditioned latent once per downsampled step and decode it with an LSTM.
    x = layers.RepeatVector(conv_shape[0])(x)
    x = layers.LSTM(lstm_dim, activation="relu", return_sequences=True, dropout=0.1)(x)
    # x = layers.Bidirectional(layers.LSTM(lstm_dim, activation="tanh", return_sequences=True, dropout=0.1))(x)
    # x = layers.TimeDistributed(layers.Dense(conv_shape[1] * units[0], activation="relu"))(x)
    x = layers.Reshape((conv_shape[0], conv_shape[1], int(x.shape[2]/conv_shape[1])))(x)

    upsampling = 1
    for unit, kernel_size, stride in zip(units, kernel_sizes, strides):
        x = layers.Conv2DTranspose(unit, kernel_size, activation="relu", strides=stride, padding="same")(x)
        upsampling *= stride

    # With padding="same" each stage multiplies both spatial sizes by its stride, so the
    # result overshoots the 1001 x 130 target. Crop the excess, smaller half first.
    # Both stride sets used in this notebook give ((3, 4), (7, 7)).
    extra_rows = conv_shape[0] * upsampling - 1001
    extra_cols = conv_shape[1] * upsampling - 130
    cropping = ((extra_rows // 2, extra_rows - extra_rows // 2),
                (extra_cols // 2, extra_cols - extra_cols // 2))
    x = layers.Cropping2D(cropping=cropping)(x)
    decoder_outputs = layers.Conv2DTranspose(3, 3, activation="linear", padding="same")(x)
    decoder = keras.Model([latent_inputs, pitch_inputs,
                           instrument_inputs, velocity_inputs],
                          decoder_outputs, name="decoder")
    decoder.summary()

    return decoder

def build_vae(latent_dim, lstm_dim, learning_rate=0.001, units=[32,32,64,64], kernel_sizes=[3,3,3,3], strides=[2,2,2,2]):
    """Build the encoder and decoder, wrap them in a VAE and compile it with Adam.

    Args:
        latent_dim: Size of the latent vector.
        lstm_dim: Number of LSTM units in encoder and decoder.
        learning_rate: Learning rate of the Adam optimizer.
        units: Filters of each convolutional stage (encoder order).
        kernel_sizes: Kernel size of each convolutional stage (encoder order).
        strides: Stride of each convolutional stage (encoder order).

    Returns:
        The compiled `VAE` model.
    """
    encoder = build_encoder(latent_dim, lstm_dim, units, kernel_sizes, strides)
    # Hand the decoder its own copies of the stage lists: it applies them in reverse
    # order and may reorder them in place, which would otherwise alter the caller's
    # lists and this function's default arguments for later calls.
    decoder = build_decoder(latent_dim, lstm_dim,
                            list(units), list(kernel_sizes), list(strides))
    vae = VAE(encoder, decoder)
    vae.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate))
    return vae

In [None]:
# Build and compile the VAE: 128-d latent, 128-unit LSTMs and six convolutional stages.
# The strides 3,3,2,2,2,2 multiply to a total downsampling factor of 144 per spatial axis.
vae = build_vae(latent_dim = 128, lstm_dim = 128, learning_rate = 0.001, 
                units = [64,64,128,128,128,128], kernel_sizes = [7,7,5,5,5,5], strides = [3,3,2,2,2,2])

Model: "encoder"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_11 (InputLayer)           [(None, 1001, 130, 3 0                                            
__________________________________________________________________________________________________
conv2d_12 (Conv2D)              (None, 334, 44, 64)  9472        input_11[0][0]                   
__________________________________________________________________________________________________
conv2d_13 (Conv2D)              (None, 112, 15, 64)  200768      conv2d_12[0][0]                  
__________________________________________________________________________________________________
conv2d_14 (Conv2D)              (None, 56, 8, 128)   204928      conv2d_13[0][0]                  
____________________________________________________________________________________________

In [None]:
# NOTE(review): X_train is a tf.data.Dataset that is never batched in this notebook. Keras
# documents that batch_size should not be specified for dataset inputs, so batch_size=16
# presumably has no effect here and every step sees a single example - confirm.
# NOTE(review): the recorded run below reports nan for all losses during the first epoch.
vae.fit(x=X_train, epochs=5, batch_size=16)

Epoch 1/5
   1981/Unknown - 2307s 1s/step - loss: nan - reconstruction_loss: nan - kl_loss: nan

KeyboardInterrupt: ignored