# Mapping model training

## Setup Google Drive

In [9]:
from google.colab import drive
# Mount Google Drive at /content/drive. With force_remount=False an existing
# mount is left in place (Colab then only prints an "already mounted" notice).
drive.mount('/content/drive', force_remount=False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Download Complete NSynth Guitar Subset

In [10]:
import os

'''This one download the folder recursively'''
def folder_download(folder_id):
  # authenticate
  from google.colab import auth
  auth.authenticate_user()
  # get folder_name
  from googleapiclient.discovery import build
  service = build('drive', 'v3')
  folder_name = service.files().get(fileId=folder_id).execute()['name']
  # import library and download
  !wget -qnc https://github.com/segnolin/google-drive-folder-downloader/raw/master/download.py
  from download import download_folder
  download_folder(service, folder_id, './', folder_name)
  return folder_name

# Local folder expected to hold the downloaded "complete" dataset.
dataset_dir = '/content/complete_dataset'
# Only hit Google Drive when the folder is not already on disk, so re-running
# this cell is cheap.
# NOTE(review): this presumes the Drive folder is named `complete_dataset` and
# that the notebook's working directory is /content -- confirm; otherwise the
# download lands somewhere other than `dataset_dir`.
if not os.path.exists(dataset_dir):
  folder_name = folder_download('1-lJfBAVswi8JXR_kKbOkfvNHRNvAZ1TB')

## Install Dependencies

First we install the required dependencies with `pip`.

In [None]:
# Ask the runtime for TensorFlow 2.x (line magic; presumably Colab-specific).
%tensorflow_version 2.x
# DDSP with its data-preparation extras, pinned to 1.0.1.
!pip install -qU ddsp[data_preparation]==1.0.1
# Installed straight from the repository with no tag/commit pin, so the code
# that gets installed can change between runs.
# NOTE(review): presumably this provides the `tsms` package imported further
# down -- confirm, and consider pinning a commit for reproducibility.
!pip install git+https://github.com/fabiodimarco/tf-spectral-modeling-synthesis.git

## Define DataProvider class

In [12]:
import tensorflow as tf
import ddsp.training.data as data


class CompleteTFRecordProvider(data.RecordProvider):
  """Record provider for the "complete" TFRecord files.

  Files matching `file_pattern` are read with `tf.data.TFRecordDataset` and
  every record is parsed according to `features_dict`. An optional `map_func`
  is applied to each parsed example.
  """

  def __init__(self,
               file_pattern=None,
               example_secs=4,
               sample_rate=16000,
               frame_rate=250,
               map_func=None):
    """Store the optional per-example transform and configure the base class.

    Args:
      file_pattern: Glob pattern of the TFRecord files to read.
      example_secs: Forwarded to `RecordProvider`.
      sample_rate: Forwarded to `RecordProvider`.
      frame_rate: Forwarded to `RecordProvider`.
      map_func: Optional callable applied to each parsed feature dict; when
        None the parsed features are returned unchanged.
    """
    super().__init__(file_pattern, example_secs, sample_rate,
                      frame_rate, tf.data.TFRecordDataset)
    self._map_func = map_func

  def get_dataset(self, shuffle=True):
    """Build the dataset of parsed (and optionally mapped) examples.

    Args:
      shuffle: Whether the list of matching files is shuffled.

    Returns:
      A `tf.data.Dataset` yielding one parsed example per record.
    """
    autotune = tf.data.experimental.AUTOTUNE

    def decode(serialized):
      parsed = tf.io.parse_single_example(serialized, self.features_dict)
      if self._map_func is None:
        return parsed
      return self._map_func(parsed)

    return (
        tf.data.Dataset.list_files(self._file_pattern, shuffle=shuffle)
        .interleave(
            map_func=self._data_format_map_fn,
            cycle_length=40,
            num_parallel_calls=autotune)
        .map(decode, num_parallel_calls=autotune))

  @property
  def features_dict(self):
    """Parsing spec: feature name -> `tf.io.FixedLenFeature`."""
    # `_audio_length` / `_feature_length` are attributes of the base class
    # (not visible here); presumably samples / frames per example.
    n_samples = self._audio_length
    n_frames = self._feature_length

    def ints(size):
      return tf.io.FixedLenFeature([size], dtype=tf.int64)

    def floats(size):
      return tf.io.FixedLenFeature([size], dtype=tf.float32)

    return {
      'sample_name': tf.io.FixedLenFeature([1], dtype=tf.string),
      'note_number': ints(1),
      'velocity': ints(1),
      'instrument_source': ints(1),
      'qualities': ints(10),
      'audio': floats(n_samples),
      'f0_hz': floats(n_frames),
      'f0_confidence': floats(n_frames),
      'loudness_db': floats(n_frames),
      'f0_scaled': floats(n_frames),
      'ld_scaled': floats(n_frames),
      # Stored flat: 16 values for every frame.
      'z': floats(n_frames * 16),
    }


class HarmonicTFRecordProvider(data.RecordProvider):
  """Record provider for the "harmonic" TFRecord files.

  Files matching `file_pattern` are read with `tf.data.TFRecordDataset` and
  every record is parsed according to `features_dict`. The harmonic features
  ('f0_estimate', 'h_freq', 'h_mag', 'h_phase') are parsed as scalar strings
  and are NOT decoded here; an optional `map_func` is applied to each parsed
  example and is the place to decode them.
  """

  def __init__(self,
               file_pattern=None,
               example_secs=4,
               sample_rate=16000,
               frame_rate=250,
               map_func=None):
    """Store the optional per-example transform and configure the base class.

    Args:
      file_pattern: Glob pattern of the TFRecord files to read.
      example_secs: Forwarded to `RecordProvider`.
      sample_rate: Forwarded to `RecordProvider`.
      frame_rate: Forwarded to `RecordProvider`.
      map_func: Optional callable applied to each parsed feature dict; when
        None the parsed features are returned unchanged.
    """
    super().__init__(file_pattern, example_secs, sample_rate,
                      frame_rate, tf.data.TFRecordDataset)
    self._map_func = map_func

  def get_dataset(self, shuffle=True):
    """Build the dataset of parsed (and optionally mapped) examples.

    Args:
      shuffle: Whether the list of matching files is shuffled.

    Returns:
      A `tf.data.Dataset` yielding one parsed example per record.
    """
    autotune = tf.data.experimental.AUTOTUNE

    def decode(serialized):
      parsed = tf.io.parse_single_example(serialized, self.features_dict)
      if self._map_func is None:
        return parsed
      return self._map_func(parsed)

    return (
        tf.data.Dataset.list_files(self._file_pattern, shuffle=shuffle)
        .interleave(
            map_func=self._data_format_map_fn,
            cycle_length=40,
            num_parallel_calls=autotune)
        .map(decode, num_parallel_calls=autotune))

  @property
  def features_dict(self):
    """Parsing spec: feature name -> `tf.io.FixedLenFeature`."""
    # `_audio_length` / `_feature_length` are attributes of the base class
    # (not visible here); presumably samples / frames per example.
    n_samples = self._audio_length
    n_frames = self._feature_length

    def ints(size):
      return tf.io.FixedLenFeature([size], dtype=tf.int64)

    def floats(size):
      return tf.io.FixedLenFeature([size], dtype=tf.float32)

    def scalar_string():
      return tf.io.FixedLenFeature([], dtype=tf.string)

    return {
      'sample_name': scalar_string(),
      'note_number': ints(1),
      'velocity': ints(1),
      'instrument_source': ints(1),
      'qualities': ints(10),
      'audio': floats(n_samples),
      'f0_hz': floats(n_frames),
      'f0_confidence': floats(n_frames),
      'loudness_db': floats(n_frames),
      # Kept as raw scalar strings by this provider.
      'f0_estimate': scalar_string(),
      'h_freq': scalar_string(),
      'h_mag': scalar_string(),
      'h_phase': scalar_string(),
    }

## Create datasets

In [13]:
# NOTE: `dataset_dir` is read here as the location of the downloaded
# "complete" dataset. A later cell rebinds the same name to the output folder
# on Drive, so this cell has to run before that one.
train_dataset_dir = os.path.join(dataset_dir, 'train')
valid_dataset_dir = os.path.join(dataset_dir, 'valid')
test_dataset_dir = os.path.join(dataset_dir, 'test')

train_tfrecord_file = os.path.join(train_dataset_dir, 'complete.tfrecord')
valid_tfrecord_file = os.path.join(valid_dataset_dir, 'complete.tfrecord')
test_tfrecord_file = os.path.join(test_dataset_dir, 'complete.tfrecord')

# Settings shared by the three providers.
example_secs = 4
sample_rate = 16000
frame_rate = 250


def _complete_split(tfrecord_file):
  """Return `(provider, dataset)` for the shards starting with `tfrecord_file`.

  The dataset is requested with batch size 1, no shuffling and a single pass
  over the data.
  """
  provider = CompleteTFRecordProvider(
      file_pattern=tfrecord_file + '*',
      example_secs=example_secs,
      sample_rate=sample_rate,
      frame_rate=frame_rate)
  return provider, provider.get_batch(1, shuffle=False, repeats=1)


# Create train dataset
train_data_provider, train_dataset = _complete_split(train_tfrecord_file)

# Create valid dataset
valid_data_provider, valid_dataset = _complete_split(valid_tfrecord_file)

# Create test dataset
test_data_provider, test_dataset = _complete_split(test_tfrecord_file)

## Link Google Drive and locate the output directory

In [None]:
!ln -s "/content/drive/My Drive" /gdrive

dataset_dir = os.path.normpath('/gdrive/nsynth_guitar/dataset/harmonic')

assert os.path.exists(dataset_dir)
print('Dataset Directory Exists:', dataset_dir)

In [15]:
import glob
import os
import time
import matplotlib.pyplot as plt
import ddsp
import tsms


def _byte_feature(value):
    """Wrap an iterable of byte strings in a `tf.train.Feature`."""
    bytes_list = tf.train.BytesList(value=value)
    return tf.train.Feature(bytes_list=bytes_list)


def _float_feature(value):
    """Wrap an iterable of floats in a `tf.train.Feature`."""
    float_list = tf.train.FloatList(value=value)
    return tf.train.Feature(float_list=float_list)


def _int64_feature(value):
    """Wrap an iterable of integers in a `tf.train.Feature`."""
    int64_list = tf.train.Int64List(value=value)
    return tf.train.Feature(int64_list=int64_list)
  

def _tensor_feature(value):
    """Serialize `value` as a tensor and store it as a one-entry bytes feature."""
    serialized = tf.io.serialize_tensor(tf.constant(value))
    # Add a leading axis so the bytes list receives exactly one element.
    as_vector = tf.expand_dims(serialized, axis=0)
    return _byte_feature(as_vector.numpy())


def _next_harmonic_index(split_dir):
  """Return the index of the first example not yet covered by a shard.

  Shard names end with the index of their last example
  (`harmonic.tfrecord-<start>-<end>`), so the answer is the largest `<end>`
  found in `split_dir` plus one, or 0 when there is no shard yet. `glob`
  returns paths in arbitrary order, hence the explicit maximum.
  """
  last_index = -1
  for path in glob.glob(os.path.join(split_dir, "harmonic.tfrecord*")):
    suffix = os.path.basename(path).rsplit('-', 1)[-1]
    if suffix.isdigit():
      last_index = max(last_index, int(suffix))
  return last_index + 1


def prepare_harmonic_tfrecord(dataset,
                              save_dir,
                              split='train',
                              sample_rate=16000,
                              frame_rate=250):
  """Run harmonic analysis on `dataset` and write it out as TFRecord shards.

  Examples whose MIDI note number lies in [36, 84] are processed in windows
  of 100 and each window is written to
  `<save_dir>/<split>/harmonic.tfrecord-<start>-<end>`, where the indices
  count the examples that passed the filter. Existing shards are detected and
  the corresponding examples are skipped, so an interrupted run can be
  resumed by calling the function again.

  NOTE(review): a shard is named before it is filled, so a run interrupted in
  the middle of a window leaves a short shard that the next run treats as
  complete. Delete the newest shard after an interruption.

  Args:
    dataset: Dataset of feature dicts batched with batch size 1 (the leading
      axis is indexed away with `[0]`), providing the keys 'sample_name',
      'note_number', 'velocity', 'instrument_source', 'qualities', 'audio',
      'f0_hz', 'f0_confidence' and 'loudness_db'.
    save_dir: Root output directory.
    split: Sub-directory of `save_dir` to write to; created when missing.
    sample_rate: Audio sample rate passed to the `tsms` analysis functions.
    frame_rate: Analysis frames per second; the hop size is
      `sample_rate // frame_rate` (64 samples for the defaults).
  """
  split_dir = os.path.join(save_dir, split)
  # TFRecordWriter does not create missing parent directories.
  os.makedirs(split_dir, exist_ok=True)

  start_index = _next_harmonic_index(split_dir)
  frame_step = sample_rate // frame_rate
  window_size = 100

  def filter_fn(e):
    # Indexing with [0][0] relies on a batch size of 1.
    note_number = e['note_number'][0][0]

    return tf.math.logical_and(
        tf.math.greater_equal(note_number, 36),
        tf.math.less_equal(note_number, 84))

  dataset = dataset.filter(filter_fn)
  dataset = dataset.skip(start_index)
  dataset = dataset.window(window_size)

  for window in dataset:
    # A window over dict elements is a dict of datasets; zip it back into a
    # single dataset of dicts.
    window = tf.data.Dataset.zip(window)
    cardinality = int(window.cardinality().numpy())
    end_index = start_index + cardinality - 1

    # `d` rather than `n`: shard names must not depend on the locale.
    tfrecord_filename = f"harmonic.tfrecord-{start_index:05d}-{end_index:05d}"
    harmonic_tfrecord_file = os.path.join(split_dir, tfrecord_filename)

    print('\nFile: ', tfrecord_filename)
    print('Complete Path: ', harmonic_tfrecord_file)
    print('Start Index: ', start_index)
    print('End Index: ', end_index)
    print('Cardinality: ', cardinality)

    with tf.io.TFRecordWriter(harmonic_tfrecord_file) as writer:
      window_start_time = time.perf_counter()
      for step, e in enumerate(window):
        start_time = time.perf_counter()

        # Drop the batch axis of every feature that is copied through.
        sample_name = e['sample_name'][0].numpy()
        note_number = e['note_number'][0].numpy()
        velocity = e['velocity'][0].numpy()
        instrument_source = e['instrument_source'][0].numpy()
        qualities = e['qualities'][0].numpy()
        audio = e['audio'][0].numpy()
        f0_hz = e['f0_hz'][0].numpy()
        f0_confidence = e['f0_confidence'][0].numpy()
        loudness_db = e['loudness_db'][0].numpy()

        # No newline: the elapsed time is appended to this line below.
        print('{} - sample_name: {}'.format(
            step, e['sample_name'][0][0].numpy().decode('UTF-8')),
            end='')

        # The analysis functions are fed a (1, n_samples) float32 signal.
        signals = tf.cast(audio, dtype=tf.float32)
        signals = tf.reshape(signals, shape=(1, -1))

        # Initial f0 track derived from the MIDI note, then refined on the
        # audio before the harmonic analysis.
        f0_estimate = tsms.core.midi_to_f0_estimate(
            note_number, signals.shape[1], frame_step)
        f0_estimate = tf.cast(f0_estimate, dtype=tf.float32)

        f0_estimate, _, _ = tsms.core.refine_f0(
            signals, f0_estimate, sample_rate, frame_step)

        h_freq, h_mag, h_phase = tsms.core.iterative_harmonic_analysis(
            signals=signals,
            f0_estimate=f0_estimate,
            sample_rate=sample_rate,
            frame_step=frame_step)

        # Remove the leading axis again before serializing.
        f0_estimate = tf.squeeze(f0_estimate, axis=0)
        h_freq = tf.squeeze(h_freq, axis=0)
        h_mag = tf.squeeze(h_mag, axis=0)
        h_phase = tf.squeeze(h_phase, axis=0)

        harmonic_dataset_dict = {
            'sample_name': _byte_feature(sample_name),
            'note_number': _int64_feature(note_number),
            'velocity': _int64_feature(velocity),
            'instrument_source': _int64_feature(instrument_source),
            'qualities': _int64_feature(qualities),
            'audio': _float_feature(audio),
            'f0_hz': _float_feature(f0_hz),
            'f0_confidence': _float_feature(f0_confidence),
            'loudness_db': _float_feature(loudness_db),
            'f0_estimate': _tensor_feature(f0_estimate),
            'h_freq': _tensor_feature(h_freq),
            'h_mag': _tensor_feature(h_mag),
            'h_phase': _tensor_feature(h_phase),
        }

        tf_example = tf.train.Example(
            features=tf.train.Features(feature=harmonic_dataset_dict))

        writer.write(tf_example.SerializeToString())

        stop_time = time.perf_counter()
        elapsed_time = stop_time - start_time
        print(' - elapsed_time: {:.3f}'.format(elapsed_time))

      window_stop_time = time.perf_counter()
      window_elapsed_time = window_stop_time - window_start_time
      print('Window elapsed_time: ', window_elapsed_time)

    # The next window starts right after a full window's worth of examples.
    start_index += window_size

## Prepare Harmonic tfrecord

In [None]:
# Write the harmonic shards for every split, in the order train, valid, test.
for split_name, split_dataset in (('train', train_dataset),
                                  ('valid', valid_dataset),
                                  ('test', test_dataset)):
  prepare_harmonic_tfrecord(
      dataset=split_dataset,
      save_dir=dataset_dir,
      split=split_name,
      sample_rate=16000,
      frame_rate=250)