# Mapping model training

## Setup Google Drive

In [93]:
# Mount Google Drive at /content/drive; later cells read the dataset from,
# and write checkpoints to, paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Make directories to save model and data

In [94]:
import os

# Project root on the mounted Drive.
drive_dir = '/content/drive/My Drive/nsynth_guitar'
# Folder that holds the autoencoder weight checkpoints.
checkpoint_dir = os.path.join(drive_dir, 'harmonic_autoencoder/checkpoint')

# Fail fast, with a readable message, when the project folder is missing
# (for example when Drive was not mounted).
assert os.path.exists(drive_dir), 'Drive directory not found: ' + drive_dir
print('Drive Directory Exists:', drive_dir)

# Pure-Python equivalent of `mkdir -p`: creates missing parents and is a
# no-op when the directory already exists. Avoids depending on a shell escape.
os.makedirs(checkpoint_dir, exist_ok=True)

Drive Directory Exists: /content/drive/My Drive/nsynth_guitar


## Clear existing checkpoints

In [95]:
import shutil

# Destructive: removes the checkpoint folder together with everything in it,
# so training always restarts from scratch. A failure is only reported; it
# does not stop the notebook.
try:
    shutil.rmtree(checkpoint_dir)
except OSError as rm_error:
    failure_report = "Error: %s : %s" % (checkpoint_dir, rm_error.strerror)
    print(failure_report)

## Install Dependencies

First we install the required dependencies with `pip`.

In [None]:
# Colab line magic selecting the TensorFlow 2.x runtime.
%tensorflow_version 2.x
# Installed straight from GitHub; presumably provides the `tsms` module
# imported further down -- TODO confirm.
!pip install git+https://github.com/fabiodimarco/tf-spectral-modeling-synthesis
# -q keeps pip's output quiet.
!pip install -q ddsp

## Define DataProvider class

In [97]:
import tensorflow as tf
import ddsp.training.data as data


class CompleteTFRecordProvider(data.RecordProvider):
    """TFRecord-backed provider that parses every stored feature of an example.

    After parsing, each example can optionally be transformed by a
    user-supplied ``map_func``.

    Args:
        file_pattern: Glob pattern of the TFRecord files to read.
        example_secs: Forwarded to the ``RecordProvider`` constructor.
        sample_rate: Forwarded to the ``RecordProvider`` constructor.
        frame_rate: Forwarded to the ``RecordProvider`` constructor.
        map_func: Optional callable applied to the parsed feature dict; its
            return value becomes the dataset element.

    Note:
        ``_file_pattern``, ``_data_format_map_fn``, ``_audio_length`` and
        ``_feature_length`` are not set in this class; they are presumably
        provided by the ddsp base class -- TODO confirm.
    """

    def __init__(self,
                 file_pattern=None,
                 example_secs=4,
                 sample_rate=16000,
                 frame_rate=250,
                 map_func=None):
        super().__init__(file_pattern, example_secs, sample_rate,
                         frame_rate, tf.data.TFRecordDataset)
        self._map_func = map_func

    def get_dataset(self, shuffle=True):
        """Build the unbatched dataset of parsed (and optionally mapped) examples.

        Args:
            shuffle: Whether the list of matching files is shuffled.

        Returns:
            A ``tf.data.Dataset`` whose elements are either the parsed feature
            dict or whatever ``map_func`` returns for it.
        """
        autotune = tf.data.experimental.AUTOTUNE

        def _decode(serialized):
            parsed = tf.io.parse_single_example(serialized, self.features_dict)
            if self._map_func is None:
                return parsed
            return self._map_func(parsed)

        file_names = tf.data.Dataset.list_files(
            self._file_pattern, shuffle=shuffle)
        records = file_names.interleave(
            map_func=self._data_format_map_fn,
            cycle_length=40,
            num_parallel_calls=autotune,
            deterministic=True)
        return records.map(
            _decode,
            num_parallel_calls=autotune,
            deterministic=True)

    @property
    def features_dict(self):
        """Parsing spec mapping each stored feature name to a FixedLenFeature."""
        def fixed(shape, dtype):
            return tf.io.FixedLenFeature(shape, dtype=dtype)

        n_frames = self._feature_length

        spec = {'sample_name': fixed([1], tf.string)}
        for key in ('instrument_id', 'note_number', 'velocity',
                    'instrument_source'):
            spec[key] = fixed([1], tf.int64)
        spec['qualities'] = fixed([10], tf.int64)
        spec['audio'] = fixed([self._audio_length], tf.float32)
        for key in ('f0_hz', 'f0_confidence', 'loudness_db',
                    'f0_scaled', 'ld_scaled'):
            spec[key] = fixed([n_frames], tf.float32)
        spec['z'] = fixed([n_frames * 16], tf.float32)
        # These four are stored as scalar strings (serialized tensors).
        for key in ('f0_estimate', 'h_freq', 'h_mag', 'h_phase'):
            spec[key] = fixed([], tf.string)
        return spec

## Define features map function

In [98]:
import tsms


# Inclusive note-number range; it sizes the note embedding tables and is used
# to turn a note number into a zero-based embedding index.
min_note_number = 40
max_note_number = 88
# Fixed width every harmonic frame is zero-padded to.
max_harmonics = 98
# Audio sample rate (presumably in Hz).
sample_rate = 16000
# Hop between analysis/synthesis frames (presumably in samples).
frame_step = 64
# Frames per time unit of audio: 16000 // 64 = 250.
frame_rate = sample_rate // frame_step
# Example duration passed to the data providers (presumably seconds).
example_secs = 4


def normalize_h_freq(h_freq, h_mag, note_number):
    """Express harmonic frequencies as deviations in semitone-sized units.

    For harmonic k the result is
    ``(h_freq - k * mean_f0) / (k * note_f0 * (2**(1/12) - 1))``, where
    ``mean_f0`` is the f0 estimated from the harmonics, averaged over axis 1,
    and ``note_f0`` is the frequency of ``note_number``.

    Args:
        h_freq: Harmonic frequencies; assumed (batch, frames, harmonics)
            -- TODO confirm.
        h_mag: Harmonic magnitudes, same shape as ``h_freq``.
        note_number: Note number(s); cast to float32 before conversion to Hz.

    Returns:
        Tensor of normalized frequency deviations.

    Note:
        The centre is the *measured* mean f0, whereas ``denormalize_h_freq``
        re-centres on the nominal note frequency, so the pair is not an exact
        round trip unless the two coincide.
    """
    # Relative width of one semitone: 2^(1/12) - 1.
    st_var = (2.0 ** (1.0 / 12.0) - 1.0)

    # Harmonic multipliers 1..H shaped (1, 1, H) for broadcasting.
    n_harmonics = tf.shape(h_freq)[-1]
    multipliers = tf.range(1, n_harmonics + 1, dtype=tf.float32)
    multipliers = multipliers[tf.newaxis, tf.newaxis, :]

    measured_f0 = tsms.core.harmonic_analysis_to_f0(h_freq, h_mag)
    center_grid = tf.math.reduce_mean(measured_f0, axis=1) * multipliers

    nominal_f0 = tsms.core.midi_to_hz(tf.cast(note_number, dtype=tf.float32))
    scale_grid = nominal_f0 * multipliers

    return (h_freq - center_grid) / (scale_grid * st_var)


def denormalize_h_freq(h_freq_norm, note_number):
    """Map normalized frequency deviations back to harmonic frequencies.

    For harmonic k the result is
    ``k * note_f0 * (h_freq_norm * (2**(1/12) - 1) + 1)``, i.e. the deviations
    are re-applied around the nominal harmonic grid of ``note_number``.

    Args:
        h_freq_norm: Normalized deviations; assumed (batch, frames, harmonics)
            -- TODO confirm.
        note_number: Note number(s); cast to float32 before conversion to Hz.

    Returns:
        Tensor of harmonic frequencies.
    """
    # Relative width of one semitone: 2^(1/12) - 1.
    st_var = (2.0 ** (1.0 / 12.0) - 1.0)

    nominal_f0 = tsms.core.midi_to_hz(tf.cast(note_number, dtype=tf.float32))

    # Harmonic multipliers 1..H shaped (1, 1, H) for broadcasting.
    n_harmonics = tf.shape(h_freq_norm)[-1]
    multipliers = tf.range(1, n_harmonics + 1, dtype=tf.float32)
    multipliers = multipliers[tf.newaxis, tf.newaxis, :]

    nominal_grid = nominal_f0 * multipliers

    return nominal_grid * (h_freq_norm * st_var + 1.0)


def normalize_h_mag(h_mag, db_limit=-120.0):
    """Convert magnitudes to dB relative to the peak and rescale to [0, 1].

    The maximum is taken over the whole tensor (no axis), so the loudest entry
    maps to 1 and everything at or below ``db_limit`` dB under it maps to 0.

    Args:
        h_mag: Linear harmonic magnitudes.
        db_limit: Floor, in dB relative to the peak (negative).

    Returns:
        Tensor with the same shape as ``h_mag`` and values in [0, 1].
    """
    level_db = tsms.core.lin_to_db(h_mag)
    relative_db = level_db - tf.math.reduce_max(level_db)
    floored_db = tf.maximum(relative_db, db_limit)
    return (floored_db - db_limit) / (-db_limit)


def denormalize_h_mag(h_mag_norm, db_limit=-120.0):
    """Map normalized magnitudes back to linear scale.

    A normalized value of 0 corresponds to ``db_limit`` dB and 1 to 0 dB. The
    absolute level removed by ``normalize_h_mag`` (peak subtraction) is not
    restored.

    Args:
        h_mag_norm: Normalized magnitudes.
        db_limit: The dB floor used during normalization (negative).

    Returns:
        Tensor of linear magnitudes.
    """
    level_db = h_mag_norm * (-db_limit) + db_limit
    return tsms.core.db_to_lin(level_db)


def map_features(features):
    """Turn one parsed example into the (inputs, targets) pair used for training.

    Steps:
      1. Decode the stored harmonic frequency and magnitude tracks.
      2. Keep only the number of harmonics reported by
         ``tsms.core.get_number_harmonics`` for the note's frequency.
      3. Normalize both tracks, then zero-pad them to ``max_harmonics`` columns.
      4. Build a mask that is 1 for kept harmonics and 0 for padding.

    Args:
        features: Dict of parsed features; reads ``note_number``, ``velocity``,
            ``instrument_id``, ``h_freq`` and ``h_mag``.

    Returns:
        A tuple ``(inputs, targets)``. ``inputs`` is a dict with keys
        ``note_number``, ``velocity``, ``instrument_id``, ``h_freq_norm``,
        ``h_mag_norm``, ``mask`` and ``harmonics``. ``targets`` stacks
        ``h_freq_norm``, ``h_mag_norm`` and ``mask`` along a new last axis.
    """
    note_number = tf.cast(features['note_number'], dtype=tf.int32)
    velocity = tf.cast(features['velocity'], dtype=tf.int32)
    instrument_id = tf.cast(features['instrument_id'], dtype=tf.int32)

    h_freq = features['h_freq']
    h_mag = features['h_mag']

    # The tracks are serialized twice in the records: the outer payload is a
    # string tensor whose content is itself a serialized float32 tensor.
    h_freq = tf.io.parse_tensor(h_freq, out_type=tf.string)
    h_mag = tf.io.parse_tensor(h_mag, out_type=tf.string)

    h_freq = tf.io.parse_tensor(h_freq, out_type=tf.float32)
    h_mag = tf.io.parse_tensor(h_mag, out_type=tf.float32)

    f0_note = tsms.core.midi_to_hz(tf.cast(note_number, dtype=tf.float32))
    # Use the notebook-wide `sample_rate` constant rather than a second
    # hard-coded 16000, so the two cannot drift apart.
    harmonics = tsms.core.get_number_harmonics(f0_note, sample_rate=sample_rate)
    harmonics = tf.squeeze(harmonics)

    h_freq = h_freq[:, :harmonics]
    h_mag = h_mag[:, :harmonics]

    # The normalizers expect a leading batch axis; add it and drop it again.
    h_freq = tf.expand_dims(h_freq, axis=0)
    h_mag = tf.expand_dims(h_mag, axis=0)

    h_freq_norm = normalize_h_freq(h_freq, h_mag, note_number)
    h_mag_norm = normalize_h_mag(h_mag)

    h_freq_norm = tf.squeeze(h_freq_norm, axis=0)
    h_mag_norm = tf.squeeze(h_mag_norm, axis=0)

    # Right-pad the harmonic axis with zeros up to the fixed width.
    pad_size = max_harmonics - tf.shape(h_freq_norm)[1]
    h_freq_norm = tf.pad(h_freq_norm, paddings=((0, 0), (0, pad_size)))
    h_mag_norm = tf.pad(h_mag_norm, paddings=((0, 0), (0, pad_size)))

    # 1 for the first `harmonics` columns, 0 for the padded remainder.
    mask = tf.concat([
        tf.ones(shape=(tf.shape(h_freq_norm)[0], harmonics)),
        tf.zeros(shape=(tf.shape(h_freq_norm)[0], max_harmonics - harmonics)),
    ], axis=1)

    inputs = {
        'note_number': note_number,
        'velocity': velocity,
        'instrument_id': instrument_id,
        'h_freq_norm': h_freq_norm,
        'h_mag_norm': h_mag_norm,
        'mask': mask,
        'harmonics': harmonics
    }

    targets = tf.stack([h_freq_norm, h_mag_norm, mask], axis=-1)

    return inputs, targets

## Create datasets

In [99]:
batch_size = 8

dataset_dir = "/content/drive/My Drive/nsynth_guitar/dataset/new_data"

train_tfrecord_file = os.path.join(dataset_dir, 'train.tfrecord')
valid_tfrecord_file = os.path.join(dataset_dir, 'valid.tfrecord')
test_tfrecord_file = os.path.join(dataset_dir, 'test.tfrecord')


def _make_provider(tfrecord_file):
    """Provider over every file starting with `tfrecord_file`, mapped by map_features."""
    return CompleteTFRecordProvider(
        file_pattern=tfrecord_file + '*',
        example_secs=example_secs,
        sample_rate=sample_rate,
        frame_rate=frame_rate,
        map_func=map_features)


def _make_batches(provider, size, expected_cardinality):
    """One shuffled pass over `provider`, with an asserted element count."""
    batches = provider.get_batch(size, shuffle=True, repeats=1)
    return batches.apply(
        tf.data.experimental.assert_cardinality(expected_cardinality))


train_data_provider = _make_provider(train_tfrecord_file)
valid_data_provider = _make_provider(valid_tfrecord_file)
test_data_provider = _make_provider(test_tfrecord_file)

# NOTE(review): the cardinality is asserted on the dataset returned by
# get_batch, i.e. presumably after batching, so these figures would be batch
# counts rather than example counts -- confirm 10542 / 2906 still hold with
# batch_size = 8.
train_dataset = _make_batches(train_data_provider, batch_size, 10542)
valid_dataset = _make_batches(valid_data_provider, batch_size, 2906)
# The test set is served one example at a time.
test_dataset = _make_batches(test_data_provider, 1, 1588)

In [100]:
# train_count = 0
# for step, batch in enumerate(train_dataset):
#     train_count += 1

# print("train: ", train_count)

# valid_count = 0
# for step, batch in enumerate(valid_dataset):
#     valid_count += 1

# print("valid: ", valid_count)

# test_count = 0
# for step, batch in enumerate(test_dataset):
#     test_count += 1

# print("test: ", test_count)

## Model definition

In [101]:
def ffn(input_shape, num_layers, hidden_units, output_units, name='ffn'):
    """Build a residual feed-forward network as a functional Keras model.

    Layout: Dense -> LayerNorm, then ``num_layers`` residual blocks
    (Dense+ReLU added to the block input, followed by LayerNorm), then a
    final linear Dense with ``output_units`` units.

    Args:
        input_shape: Shape of one input example (without the batch axis).
        num_layers: Number of residual blocks.
        hidden_units: Width of the input projection and of each block.
        output_units: Width of the output projection.
        name: Prefix used for every layer name.

    Returns:
        A ``tf.keras.Model`` mapping the input to the output projection.
    """
    layers = tf.keras.layers

    net_in = layers.Input(shape=input_shape)
    hidden = layers.Dense(hidden_units, name=name + '_in_dense')(net_in)
    hidden = layers.LayerNormalization(name=name + '_in_ln')(hidden)

    for idx in range(num_layers):
        tag = str(idx)
        branch = layers.Dense(hidden_units, activation='relu',
                              name=name + '_dense_' + tag)(hidden)
        merged = layers.Add(name=name + '_add_' + tag)([hidden, branch])
        hidden = layers.LayerNormalization(name=name + '_ln_' + tag)(merged)

    net_out = layers.Dense(output_units, name=name + '_out_dense_')(hidden)
    return tf.keras.Model(net_in, net_out)


def rnn_sandwich(seq_len, inputs, outputs, num_layers, hidden_units,
                 return_sequences=True, name='rnn_sandwich'):
    """Stack a feed-forward net, a GRU and a second feed-forward net.

    Args:
        seq_len: Number of time steps of the input sequence.
        inputs: Number of input channels per time step.
        outputs: Number of output channels.
        num_layers: Residual blocks in each feed-forward net.
        hidden_units: Width of both feed-forward nets and of the GRU.
        return_sequences: If True the GRU emits one vector per time step,
            otherwise only its final vector (the output net is sized to match).
        name: Prefix used for the layer names.

    Returns:
        A ``tf.keras.Sequential`` of the three stages.
    """
    pre_net = ffn(input_shape=(seq_len, inputs),
                  num_layers=num_layers,
                  hidden_units=hidden_units,
                  output_units=hidden_units,
                  name=name + '_fnn_in_')

    recurrent = tf.keras.layers.GRU(hidden_units,
                                    return_sequences=return_sequences,
                                    name=name + '_gru_')

    # The output net sees a sequence or a single vector depending on the GRU.
    if return_sequences:
        post_shape = (seq_len, hidden_units)
    else:
        post_shape = (hidden_units,)

    post_net = ffn(input_shape=post_shape,
                   num_layers=num_layers,
                   hidden_units=hidden_units,
                   output_units=outputs,
                   name=name + '_fnn_out_')

    return tf.keras.Sequential([pre_net, recurrent, post_net])


class Encoder(tf.keras.Model):
    """Encode normalized harmonic tracks into a single latent vector.

    The frequency and magnitude tracks are each concatenated with a note
    embedding and passed through their own ``rnn_sandwich``; the two results
    are concatenated and reduced to one vector by a third ``rnn_sandwich``
    that returns only its final state.

    Args:
        in_ch: Channels fed to each branch; the note embedding has
            ``in_ch - max_harmonics`` dimensions so that harmonics plus
            embedding add up to ``in_ch``.
        h_ch: Hidden width of the two branches.
        z_ch: Size of the latent vector (each branch outputs ``z_ch // 2``).
        seq_len: Number of frames per example.
    """

    def __init__(self, in_ch=128, h_ch=256, z_ch=256, seq_len=1001):
        super().__init__()
        self.seq_len = seq_len

        note_count = max_note_number - min_note_number + 1
        self.note_embedding = tf.keras.layers.Embedding(
            input_dim=note_count,
            output_dim=(in_ch - max_harmonics))

        # Both branches share the same hyper-parameters and differ only in name.
        branch_kwargs = dict(
            seq_len=seq_len,
            inputs=in_ch,
            outputs=z_ch // 2,
            num_layers=2,
            hidden_units=h_ch,
            return_sequences=True)
        self.freq_rnn = rnn_sandwich(name='freq_rnn', **branch_kwargs)
        self.mag_rnn = rnn_sandwich(name='mag_rnn', **branch_kwargs)

        self.downsampler = rnn_sandwich(
            seq_len=seq_len,
            inputs=z_ch,
            outputs=z_ch,
            num_layers=2,
            hidden_units=z_ch,
            return_sequences=False,
            name='downsampler')

    def call(self, inputs, training=None, mask=None):
        """Return the latent vector for a batch of input dicts.

        Reads ``note_number``, ``h_freq_norm`` and ``h_mag_norm`` from
        ``inputs``; ``mask`` is accepted but not used.
        """
        # Note embedding repeated along axis 1 to cover every frame.
        note_emb = self.note_embedding(inputs['note_number'] - min_note_number)
        note_emb = tf.repeat(note_emb, self.seq_len, axis=1)

        freq_in = tf.concat([inputs['h_freq_norm'], note_emb], axis=-1)
        freq_code = self.freq_rnn(freq_in, training=training)

        mag_in = tf.concat([inputs['h_mag_norm'], note_emb], axis=-1)
        mag_code = self.mag_rnn(mag_in, training=training)

        joint = tf.concat([freq_code, mag_code], axis=-1)
        return self.downsampler(joint, training=training)


class Decoder(tf.keras.Model):
    """Decode a latent vector back into normalized harmonic tracks.

    The latent vector is expanded to a sequence by multiplying it with a
    learned ``(1, z_ch, seq_len)`` weight (initialized to ones), processed by
    a shared ``rnn_sandwich``, split into a frequency half and a magnitude
    half, and each half is combined with a note embedding and decoded by its
    own ``rnn_sandwich`` to ``max_harmonics`` channels.

    Args:
        in_ch: Accepted for signature parity with ``Encoder``; not used here.
        h_ch: Width of each half of the shared output and of the embedding.
        z_ch: Size of the latent vector.
        seq_len: Number of frames to generate.
    """

    def __init__(self, in_ch=128, h_ch=64, z_ch=256, seq_len=1001):
        super().__init__()
        self.seq_len = seq_len

        self.upsampler = self.add_weight(
            name='upsampler',
            shape=(1, z_ch, seq_len),
            initializer='ones')

        self.rnn = rnn_sandwich(
            seq_len=seq_len,
            inputs=z_ch,
            outputs=h_ch * 2,
            num_layers=2,
            hidden_units=h_ch,
            return_sequences=True,
            name='rnn')

        note_count = max_note_number - min_note_number + 1
        self.note_embedding = tf.keras.layers.Embedding(
            input_dim=note_count,
            output_dim=h_ch)

        # The two output heads share hyper-parameters and differ only in name.
        head_kwargs = dict(
            seq_len=seq_len,
            inputs=h_ch * 2,
            outputs=max_harmonics,
            num_layers=2,
            hidden_units=h_ch * 2,
            return_sequences=True)
        self.freq_rnn = rnn_sandwich(name='freq_rnn', **head_kwargs)
        self.mag_rnn = rnn_sandwich(name='mag_rnn', **head_kwargs)

    def call(self, inputs, z, training=None, mask=None):
        """Return predictions stacked as ``[h_freq_norm, h_mag_norm]`` on the last axis.

        Reads ``note_number`` and ``mask`` from ``inputs``; the ``mask``
        keyword argument is accepted but not used.
        """
        # (batch, z_ch, 1) * (1, z_ch, seq_len), then move time to axis 1.
        tiled = tf.math.multiply(tf.expand_dims(z, axis=-1), self.upsampler)
        sequence = tf.transpose(tiled, perm=(0, 2, 1))

        shared = self.rnn(sequence, training=training)
        freq_feat, mag_feat = tf.split(shared, 2, axis=-1)

        # Note embedding repeated along axis 1 to cover every frame.
        note_emb = self.note_embedding(inputs['note_number'] - min_note_number)
        note_emb = tf.repeat(note_emb, self.seq_len, axis=1)

        # Zero the outputs wherever the input mask is zero.
        valid = inputs['mask']

        freq_in = tf.concat([freq_feat, note_emb], axis=-1)
        freq_out = self.freq_rnn(freq_in, training=training) * valid

        mag_in = tf.concat([mag_feat, note_emb], axis=-1)
        mag_out = self.mag_rnn(mag_in, training=training) * valid

        return tf.stack([freq_out, mag_out], axis=-1)


class AutoEncoder(tf.keras.Model):
    """Encoder/decoder pair that reconstructs normalized harmonic tracks.

    Args:
        in_ch: Forwarded to ``Encoder`` and ``Decoder``.
        h_ch: Forwarded to ``Encoder`` and ``Decoder``.
        z_ch: Latent size, forwarded to both.
        seq_len: Frames per example, forwarded to both.
    """

    def __init__(self, in_ch=128, h_ch=64, z_ch=256, seq_len=1001):
        super(AutoEncoder, self).__init__()

        self.encoder = Encoder(in_ch, h_ch, z_ch, seq_len)
        self.decoder = Decoder(in_ch, h_ch, z_ch, seq_len)

    def call(self, inputs, training=None, mask=None):
        """Encode ``inputs`` to a latent vector and decode it again.

        Args:
            inputs: Dict of input tensors; passed to both sub-models.
            training: Keras training flag, forwarded to both sub-models.
            mask: Accepted for Keras API compatibility; not used.

        Returns:
            The decoder output for the encoded ``inputs``.
        """
        # Forward the training flag explicitly, as Encoder and Decoder do for
        # their own sub-layers, instead of relying on implicit propagation.
        z = self.encoder(inputs, training=training)
        y = self.decoder(inputs, z, training=training)

        return y

In [102]:
class FreqLoss(tf.keras.losses.Loss):
    """Magnitude-weighted squared error on the normalized harmonic frequencies.

    ``y_true`` stacks ``[h_freq_norm, h_mag_norm, mask]`` on its last axis and
    ``y_pred`` stacks ``[h_freq_norm, h_mag_norm]``. Each squared frequency
    error is weighted by the denormalized true magnitude of that harmonic,
    and the weighted sum is divided by the sum of the weights.
    """

    def __init__(self,
                 reduction=tf.keras.losses.Reduction.AUTO,
                 name='freq_loss'):
        super(FreqLoss, self).__init__(
            reduction=reduction,
            name=name)

    def call(self, y_true, y_pred):
        """Return the scalar weighted frequency loss for one batch."""
        # Only the frequency tracks and the true magnitudes are needed here.
        h_freq_norm_true, h_mag_norm_true, _ = tf.unstack(y_true, axis=-1)
        h_freq_norm_pred, _ = tf.unstack(y_pred, axis=-1)

        # Linear magnitudes serve as weights, so errors on loud harmonics
        # count more than errors on quiet ones.
        weights = denormalize_h_mag(h_mag_norm_true)
        # Normalizer is the *sum* of the weights (the previous local name
        # called it a mean).
        # NOTE(review): padded harmonics have h_mag_norm == 0, which
        # denormalizes to the db_limit floor rather than exactly zero, so
        # they still add a small amount to this sum.
        weight_sum = tf.math.reduce_sum(weights)

        weighted_sq_err = tf.math.reduce_sum(
            weights * tf.square(h_freq_norm_true - h_freq_norm_pred))

        return weighted_sq_err / weight_sum


class MagLoss(tf.keras.losses.Loss):
    """Squared error on the normalized harmonic magnitudes, averaged over the mask.

    ``y_true`` stacks ``[h_freq_norm, h_mag_norm, mask]`` on its last axis and
    ``y_pred`` stacks ``[h_freq_norm, h_mag_norm]``. The summed squared error
    is divided by the sum of the mask.
    """

    def __init__(self,
                 reduction=tf.keras.losses.Reduction.AUTO,
                 name='mag_loss'):
        super().__init__(reduction=reduction, name=name)

    def call(self, y_true, y_pred):
        """Return the scalar magnitude loss for one batch."""
        _, mag_true, valid = tf.unstack(y_true, axis=-1)
        _, mag_pred = tf.unstack(y_pred, axis=-1)

        valid_count = tf.math.reduce_sum(valid)
        sq_err_total = tf.math.reduce_sum(tf.square(mag_true - mag_pred))

        return sq_err_total / valid_count

## Create and compile mapping model

In [103]:
auto_encoder = AutoEncoder()
freq_loss = FreqLoss()
mag_loss = MagLoss()


def _total_loss(y_true, y_pred):
    """Training objective: frequency loss plus magnitude loss, unweighted."""
    return freq_loss(y_true, y_pred) + mag_loss(y_true, y_pred)


# The two loss terms are also registered as metrics so each is reported
# separately next to the combined loss.
auto_encoder.compile(
    loss=_total_loss,
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=[freq_loss, mag_loss],
    run_eagerly=False)

## Build model

In [None]:
# Run one training batch through the model so that all sub-layers are built;
# summary() needs a built model.
x_true, y_true = next(iter(train_dataset))
y_pred = auto_encoder(x_true)

auto_encoder.summary()

## Load model checkpoint

In [105]:
checkpoint_file = os.path.join(checkpoint_dir, 'cp.ckpt')

# Restore weights only when a checkpoint with this exact prefix exists.
# TensorFlow-format weight checkpoints are written as several files sharing
# the prefix, including `<prefix>.index`; testing for that file is more
# precise than "the directory is not empty", which is also true for stray or
# unrelated files and would make load_weights fail.
if os.path.exists(checkpoint_file + '.index'):
    auto_encoder.load_weights(checkpoint_file)

## Create training callbacks

In [106]:
def scheduler(epoch, lr):
    """Keep the learning rate for the first 10 epochs, then shrink it by 10% per epoch."""
    hold_epochs = 10
    decay_factor = 0.9
    return lr if epoch < hold_epochs else lr * decay_factor


# Stop once the validation loss has not improved for 20 consecutive epochs.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=20)

lr_scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler)

# Save the weights (not the full model) at the end of every epoch.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_file,
    save_weights_only=True,
    verbose=0,
    save_freq='epoch')

## Train the model

In [None]:
epochs = 50
steps_per_epoch = 100
validation_steps = 10

# NOTE(review): training consumes up to epochs * steps_per_epoch = 5000
# batches, while train_dataset is built with repeats=1 -- confirm it holds at
# least that many batches.
training_callbacks = [early_stop, lr_scheduler, checkpoint]

auto_encoder.fit(
    x=train_dataset,
    validation_data=valid_dataset,
    epochs=epochs,
    steps_per_epoch=steps_per_epoch,
    validation_steps=validation_steps,
    callbacks=training_callbacks)

## Evaluate model on test dataset

In [None]:
# Evaluate on the first 100 batches drawn from the test dataset.
auto_encoder.evaluate(test_dataset, steps=100)

### Get Predictions

In [108]:
# Kept in its own cell so that re-running the prediction cell below advances
# to the next test example instead of restarting the iteration.
iterator = iter(test_dataset)

In [None]:
import soundfile as sf
import matplotlib.pyplot as plt
import IPython
import numpy as np


def reconstruct_audio(h_freq, h_mag):
    """Synthesize audio from harmonic frequency and magnitude tracks.

    The phase track is generated from ``h_freq`` with
    ``tsms.core.generate_phase``; all three tracks are then passed to
    ``tsms.core.harmonic_synthesis`` using the notebook-wide ``sample_rate``
    and ``frame_step``.

    Args:
        h_freq: Harmonic frequencies.
        h_mag: Harmonic magnitudes.

    Returns:
        The result of ``tsms.core.harmonic_synthesis``.
    """
    generated_phase = tsms.core.generate_phase(h_freq, sample_rate, frame_step)
    synthesized = tsms.core.harmonic_synthesis(
        h_freq, h_mag, generated_phase, sample_rate, frame_step)
    return synthesized


# Take the next test example and run it through the trained model.
x_true, y_true = next(iterator)
y_pred = auto_encoder(x_true)

note_number = x_true['note_number']
# The test dataset uses batch size 1, so this squeezes to a scalar count.
harmonics = tf.squeeze(x_true['harmonics'])

h_freq_norm_true, h_mag_norm_true, mask = tf.unstack(y_true, axis=-1)
h_freq_norm_pred, h_mag_norm_pred = tf.unstack(y_pred, axis=-1)

h_freq_true = denormalize_h_freq(h_freq_norm_true, note_number)
h_freq_pred = denormalize_h_freq(h_freq_norm_pred, note_number)

h_mag_true = denormalize_h_mag(h_mag_norm_true)
h_mag_pred = denormalize_h_mag(h_mag_norm_pred)

# Drop the zero-padded harmonic columns before synthesis.
h_freq_true = h_freq_true[:, :, :harmonics]
h_freq_pred = h_freq_pred[:, :, :harmonics]

h_mag_true = h_mag_true[:, :, :harmonics]
h_mag_pred = h_mag_pred[:, :, :harmonics]

audio_true = reconstruct_audio(h_freq_true, h_mag_true)
audio_pred = reconstruct_audio(h_freq_pred, h_mag_pred)

audio_true = np.squeeze(audio_true.numpy())
audio_pred = np.squeeze(audio_pred.numpy())

sf.write('/content/audio_true.wav', audio_true, sample_rate)
sf.write('/content/audio_pred.wav', audio_pred, sample_rate)

# One titled figure per track so each plot can be identified on its own.
for plot_title, plot_track in (
        ('h_freq (true)', h_freq_true),
        ('h_mag (true)', h_mag_true),
        ('h_freq (pred)', h_freq_pred),
        ('h_mag (pred)', h_mag_pred)):
    plt.figure()
    plt.plot(np.squeeze(plot_track))
    plt.title(plot_title)


# '\T' and '\P' are not valid escape sequences; a leading newline was intended.
print('\nTrue\n')
IPython.display.display(IPython.display.Audio('/content/audio_true.wav'))
print('\nPred\n')
IPython.display.display(IPython.display.Audio('/content/audio_pred.wav'))