## Setup Google Drive

In [None]:
import os
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset path used later in this
# notebook lives under this mount point. force_remount=False is passed so an
# existing mount is not forced to remount.
drive.mount('/content/drive', force_remount=False)

### Download Harmonic NSynth Guitar Subset

In [55]:
# '''This one downloads the folder recursively'''
# def folder_download(folder_id):
#   # authenticate
#   from google.colab import auth
#   auth.authenticate_user()
#   # get folder_name
#   from googleapiclient.discovery import build
#   service = build('drive', 'v3')
#   folder_name = service.files().get(fileId=folder_id).execute()['name']
#   # import library and download
#   !wget -qnc https://github.com/segnolin/google-drive-folder-downloader/raw/master/download.py
#   from download import download_folder
#   download_folder(service, folder_id, './', folder_name)
#   return folder_name

# dataset_dir = '/content/harmonic_dataset'
# if not os.path.exists(dataset_dir):
#   folder_name = folder_download('19gLqATp6-fLTOIE1z4pGmey0BvVJBjqG')

### Access Harmonic NSynth Guitar Subset from Google Drive

In [56]:
# Root folder of the harmonic dataset on the mounted Drive; the 'train',
# 'valid' and 'test' sub-folders are joined onto it when the datasets are built.
dataset_dir = '/content/drive/My Drive/nsynth_guitar/dataset/harmonic'

## Install Dependencies

First we install the required dependencies with `pip`.

In [57]:
# Select TensorFlow 2.x, then install the pinned DDSP release (with its
# data_preparation extra) quietly.
# NOTE(review): the recorded output of this cell shows pip reporting an h5py
# requirement conflict with tensorflow 2.5.0 -- confirm it is harmless here.
%tensorflow_version 2.x
!pip install -qU ddsp[data_preparation]==1.0.1

[K     |████████████████████████████████| 2.9MB 3.0MB/s 
[31mERROR: tensorflow 2.5.0 has requirement h5py~=3.1.0, but you'll have h5py 2.10.0 which is incompatible.[0m
[?25h

## Define DataProvider class

In [58]:
import tensorflow as tf
import ddsp.training.data as data


class HarmonicTFRecordProvider(data.RecordProvider):
  """Record provider for the harmonic NSynth guitar TFRecord files.

  Each record is decoded with the schema returned by `features_dict`. When a
  `map_func` was given to the constructor it is applied to every decoded
  example; otherwise the decoded feature dict is returned as is.
  """

  def __init__(self,
               file_pattern=None,
               example_secs=4,
               sample_rate=16000,
               frame_rate=250,
               map_func=None):
    """Configures the provider.

    Args:
      file_pattern: Pattern of the TFRecord files to read.
      example_secs: Duration of one example in seconds.
      sample_rate: Audio sample rate.
      frame_rate: Rate of the frame-wise features.
      map_func: Optional callable applied to each parsed example. `None`
        means the parsed features are returned unchanged.
    """
    super().__init__(file_pattern, example_secs, sample_rate,
                     frame_rate, tf.data.TFRecordDataset)
    self._map_func = map_func

  def get_dataset(self, shuffle=True):
    """Builds the (unbatched) dataset of parsed examples.

    Args:
      shuffle: Forwarded to `tf.data.Dataset.list_files` to control whether
        the matched file names are shuffled.

    Returns:
      A `tf.data.Dataset` whose elements are parsed examples, post-processed
      by `map_func` when one was supplied.
    """
    autotune = tf.data.experimental.AUTOTUNE
    schema = self.features_dict
    post_process = self._map_func

    def parse_tfexample(record):
      parsed = tf.io.parse_single_example(record, schema)
      return parsed if post_process is None else post_process(parsed)

    files = tf.data.Dataset.list_files(self._file_pattern, shuffle=shuffle)
    records = files.interleave(
        map_func=self._data_format_map_fn,
        cycle_length=40,
        num_parallel_calls=autotune)
    return records.map(parse_tfexample, num_parallel_calls=autotune)

  @property
  def features_dict(self):
    """Schema used to decode one serialized example.

    The audio and frame-wise features have fixed lengths taken from the
    provider's `_audio_length` and `_feature_length`; `f0_estimate`,
    `h_freq`, `h_mag` and `h_phase` are read as scalar strings.
    """
    def fixed(shape, dtype):
      return tf.io.FixedLenFeature(shape, dtype=dtype)

    n_samples = self._audio_length
    n_frames = self._feature_length

    return {
      'sample_name': fixed([], tf.string),
      'note_number': fixed([1], tf.int64),
      'velocity': fixed([1], tf.int64),
      'instrument_source': fixed([1], tf.int64),
      'qualities': fixed([10], tf.int64),
      'audio': fixed([n_samples], tf.float32),
      'f0_hz': fixed([n_frames], tf.float32),
      'f0_confidence': fixed([n_frames], tf.float32),
      'loudness_db': fixed([n_frames], tf.float32),
      'f0_estimate': fixed([], tf.string),
      'h_freq': fixed([], tf.string),
      'h_mag': fixed([], tf.string),
      'h_phase': fixed([], tf.string),
    }

## Define features map function

In [59]:
def features_map(features):
    """Turns a parsed example into the element dict used by this notebook.

    The plain features are passed through untouched, while the four
    serialized entries ('f0_estimate', 'h_freq', 'h_mag', 'h_phase') are
    decoded with `tf.io.parse_tensor` as float32 tensors. Keys of `features`
    other than the thirteen listed below are not copied to the result.

    Args:
        features: Mapping holding at least the thirteen keys listed below.

    Returns:
        A new dict with the same thirteen keys, in the same order.
    """
    passthrough_keys = (
        'sample_name', 'note_number', 'velocity', 'instrument_source',
        'qualities', 'audio', 'f0_hz', 'f0_confidence', 'loudness_db')
    serialized_keys = ('f0_estimate', 'h_freq', 'h_mag', 'h_phase')

    # Look every key up first so a missing feature fails before any decoding.
    raw = {key: features[key] for key in passthrough_keys + serialized_keys}

    element_dict = {key: raw[key] for key in passthrough_keys}
    for key in serialized_keys:
        element_dict[key] = tf.io.parse_tensor(raw[key], out_type=tf.float32)

    return element_dict

## Create datasets

In [60]:
# Parameters shared by the three splits.
example_secs = 4
sample_rate = 16000
frame_rate = 250

# Each split lives in its own sub-folder of `dataset_dir`.
train_dataset_dir = os.path.join(dataset_dir, 'train')
valid_dataset_dir = os.path.join(dataset_dir, 'valid')
test_dataset_dir = os.path.join(dataset_dir, 'test')

train_tfrecord_file = os.path.join(train_dataset_dir, 'harmonic.tfrecord')
valid_tfrecord_file = os.path.join(valid_dataset_dir, 'harmonic.tfrecord')
test_tfrecord_file = os.path.join(test_dataset_dir, 'harmonic.tfrecord')


def _build_provider(tfrecord_file):
    """Returns a provider over every file whose path starts with `tfrecord_file`."""
    return HarmonicTFRecordProvider(
        file_pattern=tfrecord_file + '*',
        example_secs=example_secs,
        sample_rate=sample_rate,
        frame_rate=frame_rate,
        map_func=features_map)


# Every split is read once, in file order, one example per batch.
train_data_provider = _build_provider(train_tfrecord_file)
train_dataset = train_data_provider.get_batch(1, shuffle=False, repeats=1)

valid_data_provider = _build_provider(valid_tfrecord_file)
valid_dataset = valid_data_provider.get_batch(1, shuffle=False, repeats=1)

test_data_provider = _build_provider(test_tfrecord_file)
test_dataset = test_data_provider.get_batch(1, shuffle=False, repeats=1)

## Harmonic Model

In [61]:
import numpy as np


@tf.function
def mod_cumsum(x, mod):
    """Cumulative sum along axis 1 that is wrapped modulo `mod` at every step.

    Args:
        x: Tensor indexed as (batch, steps, channels); the final transpose
            assumes the per-step slices are rank 2.
        mod: Modulus applied to the inputs and to the running sum.

    Returns:
        Float32 tensor laid out like `x`, where entry `i` along axis 1 is the
        wrapped sum of entries `0..i`.
    """
    n_steps = x.shape[1]
    wrapped = x % mod

    # Zero-valued accumulator shaped like a single step of the input.
    running = tf.gather(wrapped, 0, axis=1) * 0.0
    collected = tf.TensorArray(tf.float32, size=n_steps, dynamic_size=False)

    for step in tf.range(n_steps):
        running = (running + tf.gather(wrapped, step, axis=1)) % mod
        collected = collected.write(step, running)

    # stack() puts the step axis first; move it back to axis 1.
    return tf.transpose(collected.stack(), perm=(1, 0, 2))


def generate_phase(h_freq, sample_rate, frame_step, initial_h_phase=None):
    """Derives per-frame harmonic phases by integrating the frequency tracks.

    The phase advance between two consecutive frames is the mean of their
    frequencies divided by the frame rate (in cycles). The advances are
    accumulated modulo one cycle and converted to radians.

    Args:
        h_freq: Harmonic frequencies indexed as (batch, frames, harmonics).
        sample_rate: Audio sample rate.
        frame_step: Number of audio samples between consecutive frames.
        initial_h_phase: Optional phase of the first frame, expressed in
            cycles (it is added before the final scaling by 2*pi). Defaults
            to zeros of shape (batch, 1, harmonics).

    Returns:
        Phases in radians, in [0, 2*pi), with as many frames as `h_freq`.
    """
    frame_rate = sample_rate / frame_step

    # Cycles travelled between each pair of neighbouring frames.
    neighbour_sum = h_freq[:, :-1, :] + h_freq[:, 1:, :]
    cycles_per_hop = 0.5 * neighbour_sum / frame_rate
    accumulated = mod_cumsum(cycles_per_hop, 1.0)

    if initial_h_phase is None:
        initial_h_phase = tf.zeros((h_freq.shape[0], 1, h_freq.shape[2]))

    # Prepend a zero so the first frame carries only the initial phase.
    cycles = tf.pad(accumulated, ((0, 0), (1, 0), (0, 0))) + initial_h_phase
    return (cycles % 1.0) * (2.0 * np.pi)


def harmonic_synthesis(h_freq, h_mag, h_phase, sample_rate, frame_step):
    """Synthesizes audio from frame-wise harmonic tracks by overlap-add.

    Around every frame, each harmonic is rendered as a cosine over
    2 * frame_step + 1 samples, the harmonics are summed, the frame is
    weighted with a triangular window and the frames are overlap-added with a
    hop of `frame_step` samples.

    Args:
        h_freq: Harmonic frequencies in Hz, (batch, frames, harmonics).
        h_mag: Harmonic magnitudes, same layout as `h_freq`.
        h_phase: Harmonic phases in radians at the frame positions, same
            layout as `h_freq`.
        sample_rate: Audio sample rate in Hz.
        frame_step: Number of samples between consecutive frames.

    Returns:
        Audio of shape (batch, (frames - 1) * frame_step).
    """
    # Triangular window: ramps 0 -> 1 over frame_step samples, then 1 -> 0.
    ramp = tf.range(0, frame_step + 1, dtype=tf.float32) / frame_step
    window = tf.concat([ramp[:-1], ramp[::-1]], axis=0)
    window = window[tf.newaxis, tf.newaxis, :]

    # Time offsets (seconds) of the samples of one frame, centred on the frame.
    offsets = tf.range(-frame_step, frame_step + 1,
                       dtype=tf.float32) / sample_rate
    offsets = offsets[tf.newaxis, tf.newaxis, tf.newaxis, :]

    # Silence every component at or above the Nyquist frequency.
    nyquist = sample_rate / 2.0
    audible_mag = tf.where(
        tf.greater_equal(h_freq, nyquist), tf.zeros_like(h_mag), h_mag)

    # Trailing axis so the tracks broadcast against the time offsets.
    freq = tf.expand_dims(h_freq, axis=-1)
    start_phase = tf.expand_dims(h_phase, axis=-1)
    mag = tf.expand_dims(audible_mag, axis=-1)

    partials = mag * tf.cos(2.0 * np.pi * freq * offsets + start_phase)
    frames = window * tf.reduce_sum(partials, axis=-2)
    mixed = tf.signal.overlap_and_add(frames, frame_step)

    # Drop the fade-in of the first frame and the fade-out of the last one.
    return mixed[:, frame_step:-(frame_step + 1)]


class HarmonicModel(tf.keras.Model):
    """Harmonic synthesizer whose tracks can be fine-tuned by trainable shifts.

    The model holds fixed base tracks (`_h_freq`, `_h_mag`, `_h_phase`) and a
    single trainable weight `shifts` of shape
    (batches, frames, harmonics, 3), initialized to zeros. The last axis
    stores, in order, the frequency, magnitude and phase shift. The effective
    tracks exposed by the `h_freq`, `h_mag` and `h_phase` properties are the
    base tracks plus the (scaled) shifts, and `call` renders them to audio
    with `harmonic_synthesis`.
    """

    def __init__(self, sample_rate, frame_step, batches, frames, harmonics,
                 h_freq=None, h_mag=None, h_phase=None, generate_phase=False):
        """Creates the model.

        Args:
            sample_rate: Audio sample rate in Hz.
            frame_step: Number of audio samples between consecutive frames.
            batches: Size of the batch axis of the shift weight.
            frames: Size of the frame axis of the shift weight.
            harmonics: Size of the harmonic axis of the shift weight.
            h_freq: Base harmonic frequencies; zeros of shape (1, 1, 1) when
                omitted.
            h_mag: Base harmonic magnitudes; zeros of shape (1, 1, 1) when
                omitted.
            h_phase: Base harmonic phases; zeros of shape (1, 1, 1) when
                omitted.
            generate_phase: When true, `call` ignores the stored phases and
                derives them from the frequency tracks instead.
        """
        super().__init__()
        self.sample_rate = sample_rate
        self.frame_step = frame_step
        self.batches = batches
        self.frames = frames
        self.harmonics = harmonics

        if h_freq is None:
            h_freq = tf.zeros(shape=(1, 1, 1))
        if h_mag is None:
            h_mag = tf.zeros(shape=(1, 1, 1))
        if h_phase is None:
            h_phase = tf.zeros(shape=(1, 1, 1))

        self._h_freq = h_freq
        self._h_mag = h_mag
        self._h_phase = h_phase
        self.generate_phase = generate_phase

        self._shifts = self.add_weight(
            name='shifts',
            shape=(self.batches, self.frames, self.harmonics, 3),
            dtype=tf.float32,
            initializer=tf.keras.initializers.Zeros(),
            trainable=True)

    @property
    def h_freq_shift(self):
        """Trainable frequency shift (channel 0 of the shift weight)."""
        return self._shifts[:, :, :, 0]

    @property
    def h_mag_shift(self):
        """Trainable magnitude shift (channel 1 of the shift weight)."""
        return self._shifts[:, :, :, 1]

    @property
    def h_phase_shift(self):
        """Trainable phase shift (channel 2 of the shift weight)."""
        return self._shifts[:, :, :, 2]

    @property
    def h_freq(self):
        """Effective frequencies: base track plus shift scaled by the frame rate."""
        frame_rate = self.sample_rate / self.frame_step
        return self._h_freq + frame_rate * self.h_freq_shift

    @h_freq.setter
    def h_freq(self, value):
        # Store the base track so the getter returns `value` with the
        # current shift.
        frame_rate = self.sample_rate / self.frame_step
        self._h_freq = value - frame_rate * self.h_freq_shift

    @property
    def h_mag(self):
        """Effective magnitudes: base track plus the magnitude shift."""
        return self._h_mag + self.h_mag_shift

    @h_mag.setter
    def h_mag(self, value):
        # Store the base track so the getter returns `value` with the
        # current shift.
        self._h_mag = value - self.h_mag_shift

    @property
    def h_phase(self):
        """Effective phases: base track plus the shift scaled by 2*pi."""
        return self._h_phase + 2.0 * np.pi * self.h_phase_shift

    @h_phase.setter
    def h_phase(self, value):
        # Store the base track so the getter returns `value` with the
        # current shift.
        self._h_phase = value - 2.0 * np.pi * self.h_phase_shift

    def call(self, inputs=None, training=None, mask=None):
        """Renders the effective harmonic tracks to audio.

        `inputs`, `training` and `mask` are accepted for Keras compatibility
        but are not used: the audio depends only on the model's own tracks.

        Returns:
            The audio tensor produced by `harmonic_synthesis`.
        """
        sample_rate = self.sample_rate
        frame_step = self.frame_step

        h_freq = self.h_freq
        h_phase = self.h_phase
        h_mag = self.h_mag

        if self.generate_phase:
            # Replace the stored phases by phases integrated from h_freq.
            h_phase = generate_phase(h_freq, sample_rate, frame_step,
                                     initial_h_phase=None)

        audio = harmonic_synthesis(
            h_freq, h_mag, h_phase, sample_rate, frame_step)

        return audio

    def get_config(self):
        """Returns the scalar constructor arguments as a plain dict.

        The base tracks (`h_freq`, `h_mag`, `h_phase`) are tensors and are
        deliberately left out; `HarmonicModel(**config)` therefore rebuilds a
        model with the same geometry but default (zero) base tracks.
        """
        return {
            'sample_rate': self.sample_rate,
            'frame_step': self.frame_step,
            'batches': self.batches,
            'frames': self.frames,
            'harmonics': self.harmonics,
            'generate_phase': self.generate_phase,
        }

## Display harmonic decomposition results

In [62]:
# Iterator over the training dataset. It is created in its own cell so that
# re-running the cell that calls next(iterator) advances to another example
# instead of restarting from the first one.
iterator = iter(train_dataset)

In [None]:
import soundfile as sf
import matplotlib.pyplot as plt
import IPython
import warnings

# RuntimeWarnings are silenced from here on.
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Fetch the next example and unpack its features.
e = next(iterator)

sample_name = e['sample_name'][0].numpy().decode('UTF-8')
(note_number, velocity, instrument_source, qualities,
 audio, f0_hz, f0_confidence, loudness_db) = (
    e[key] for key in ('note_number', 'velocity', 'instrument_source',
                       'qualities', 'audio', 'f0_hz', 'f0_confidence',
                       'loudness_db'))

f0_estimate, h_freq, h_mag, h_phase = (
    e[key] for key in ('f0_estimate', 'h_freq', 'h_mag', 'h_phase'))

print(f'sample_name: {sample_name}')
print(f'note_number: {int(note_number[0][0])}')
print(f'velocity: {int(velocity[0][0])}')

# Audio as a float32 row: (1, samples).
audio = tf.reshape(tf.cast(audio, dtype=tf.float32), shape=(1, -1))

frame_step = 64

# Model initialized with the harmonic tracks stored in the example; its
# shift weight is sized from the shape of h_freq.
harmonic_model = HarmonicModel(
    sample_rate=sample_rate,
    frame_step=frame_step,
    batches=h_freq.shape[0],
    frames=h_freq.shape[1],
    harmonics=h_freq.shape[2],
    h_freq=h_freq,
    h_mag=h_mag,
    h_phase=h_phase,
    generate_phase=False)

# Resynthesis using the stored phases, and what it leaves unexplained.
harmonic = harmonic_model([])
residual = audio - harmonic

# Resynthesis with phases regenerated from the frequency tracks.
harmonic_model.generate_phase = True
no_phase = harmonic_model([])

original, harmonic, residual, no_phase = (
    np.squeeze(signal.numpy())
    for signal in (audio, harmonic, residual, no_phase))

audio_clips = (
    ('\nOriginal\n', 'original.wav', original),
    ('\nHarmonic\n', 'harmonic.wav', harmonic),
    ('\nResidual\n', 'residual.wav', residual),
    ('\nNo Phase\n', 'no_phase.wav', no_phase),
)

# Write every clip at half amplitude, then show one audio player per clip.
for _, clip_path, clip_samples in audio_clips:
    sf.write(clip_path, 0.5 * clip_samples, sample_rate)

for clip_heading, clip_path, _ in audio_clips:
    print(clip_heading)
    IPython.display.display(IPython.display.Audio(clip_path))

def specgrams(x):
    """Shows two stacked PSD spectrograms of `x`, with 256- and 1024-point FFTs.

    Both panels use the notebook-level `sample_rate` as sampling frequency
    and overlap consecutive segments so that they advance by `frame_step`
    samples.

    Args:
        x: One-dimensional signal to analyse.
    """
    fft_sizes = (256, 1024)

    plt.figure()
    for row, n_fft in enumerate(fft_sizes, start=1):
        plt.subplot(len(fft_sizes), 1, row)
        plt.specgram(x, NFFT=n_fft, Fs=sample_rate, window=None,
                     noverlap=n_fft - frame_step, mode='psd', vmin=-180)
    plt.show()

# Overlay the original audio and both resyntheses, sample by sample.
print('\nWaveforms\n')
plt.figure()
plt.plot(original, label='original')
plt.plot(harmonic, label='harmonic')
plt.plot(no_phase, label='no_phase')
plt.xlabel('sample')
plt.ylabel('amplitude')
plt.legend()
plt.show()

print('\nF0 estimate\n')
plt.figure()
plt.plot(np.squeeze(f0_estimate.numpy()))
plt.show()

print('\nFrequency tracks\n')
plt.figure()
h_freq = harmonic_model.h_freq
# Entries that are exactly zero are replaced by inf so that matplotlib leaves
# a gap there instead of drawing the track down to 0 Hz.
h_freq = tf.where(h_freq == 0.0, np.inf, h_freq)
plt.plot(np.squeeze(h_freq.numpy()))
plt.xlabel('frame')
plt.ylabel('frequency (Hz)')
plt.show()

# Spectrograms of the original, the harmonic resynthesis and the residual.
print('\nOriginal spectrogram\n')
specgrams(original)
print('\nHarmonic spectrogram\n')
specgrams(harmonic)
print('\nResidual spectrogram\n')
specgrams(residual)