In [None]:
# Fetch the project sources from GitHub, then switch the working directory to
# the cloned repository. The next cell installs from `requirements_colab.txt`
# using a relative path, so it presumably lives in the repository root.
# NOTE(review): re-running this cell without restarting the runtime will clone
# again relative to the already-changed working directory.
!git clone https://github.com/TheSoundOfAIOSR/rg_sound_generation.git
%cd rg_sound_generation

In [None]:
!pip install -q -r requirements_colab.txt

In [None]:
from google.colab import drive

# Mount Google Drive; the dataset, checkpoint and export paths used by later
# cells all live under /content/drive/MyDrive.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point, force_remount=False)

In [None]:
import os
import tensorflow as tf
import numpy as np
import pandas as pd
import soundfile as sf
import tsms
from tqdm import tqdm
from tcae.train import ModelWrapper
from tcae.model import TCAEModel
from tcae.localconfig import LocalConfig
from tcae.dataset import get_dataset


# Project configuration object; it is customised in the next cell and then
# passed to TCAEModel(...) and get_dataset(...).
conf = LocalConfig()

# Google Drive folder that receives the WAV files written by the two
# synthesis/export loops further down in this notebook.
target_dir = "/content/drive/MyDrive/the_sound_of_ai/new_data/sounds_measures"

In [None]:
# Configure `conf` before the model and the datasets are built from it in the
# following cells. NOTE(review): these values presumably have to match the ones
# used when the checkpoint loaded below was trained -- confirm.

# Architecture
conf.use_encoder = True
conf.simple_encoder = True
conf.simple_decoder = True
conf.using_categorical = False
conf.latent_dim = 5
conf.use_embeddings = False
conf.lc_dropout_rate = 0.0

# Conditioning inputs
conf.use_note_number = True
conf.use_velocity = True
conf.use_instrument_id = False
conf.use_heuristics = True
conf.use_one_hot_conditioning = True
conf.create_decoder_function = 'lc'

conf.print_model_summary = False

# Outputs
# Per-output loss weights. `h_phase_diff` is weighted 0.0, so the `loss_keys`
# cell further down (which keeps only weights > 0) will not report it.
conf.data_handler.update_losses_weights(
    f0_shifts=1.0,
    h_freq_shifts=1.0,
    mag_env=1.0,
    h_mag_dist=1.0,
    h_mag=1.0,
    h_phase_diff=0.0,
    measures=1.0)

conf.data_handler.compact_measures_logs = False
conf.data_handler.freq_scale_fn = 'tanh'  # 'none', 'tanh'
conf.data_handler.mag_scale_fn = 'exp_sigmoid'  # 'none', 'exp_sigmoid'
conf.data_handler.phase_scale_fn = 'tanh'  # 'none', 'tanh'

# Training
# `learning_rate` is passed to the Adam optimizer when the model is compiled
# below. NOTE(review): lr_factor, lr_plateau and early_stopping are not
# referenced by any visible cell of this notebook; presumably training-only.
conf.batch_size = 32
conf.learning_rate = 3e-04
conf.lr_factor = 0.5
conf.lr_plateau = 10
# Same prefix as the checkpoint file name loaded further down.
conf.model_name = "mt_new_decoder_5_measures"
conf.early_stopping = 100


conf.dataset_dir = "/content/drive/MyDrive/the_sound_of_ai/new_data"
# Overrides the batch size of 32 assigned above. The evaluation loops below
# take element 0 of each batch (sample name, synthesised audio), so they
# presumably rely on one sample per batch -- TODO confirm and drop the earlier
# assignment if it has no effect.
conf.batch_size = 1

In [None]:
# Wrap a freshly constructed TCAE network together with the data handler's
# loss so it can be compiled and evaluated through the Keras API.
tcae_network = TCAEModel(conf)
model = ModelWrapper(tcae_network, conf.data_handler.loss)

# get_dataset returns three splits; only the last two (validation and test)
# are used in this notebook.
dataset_splits = get_dataset(conf)
_, valid_dataset, test_dataset = dataset_splits

In [None]:
# Push a single validation batch through the model once before loading the
# checkpoint (presumably so the model's variables exist for load_weights).
x, y = next(iter(valid_dataset))
_ = model(x)

# Compile with Adam at the configured learning rate.
optimizer = tf.keras.optimizers.Adam(learning_rate=conf.learning_rate)
model.compile(optimizer=optimizer)

# Restore the trained weights from Google Drive.
checkpoint_path = "/content/drive/MyDrive/the_sound_of_ai/new_data/checkpoints/mt_measures/mt_new_decoder_5_measures_23_0.00930.ckpt"
model.load_weights(checkpoint_path)

In [None]:
# Names of the metrics to collect for every test sample:
#   1. the total "loss",
#   2. "<output>_loss" for every output whose loss weight is strictly positive,
#   3. "<measure>_loss" for every entry of the measures loss weights.
output_weights = conf.data_handler.losses_weights
weighted_outputs = [
    output_name
    for output_name in list(output_weights.keys())
    if output_weights[output_name] > 0
]

loss_keys = ["loss"]
loss_keys.extend(output_name + "_loss" for output_name in weighted_outputs)
loss_keys.extend(
    measure_name + "_loss"
    for measure_name in conf.data_handler.measures_losses_weights.keys()
)

loss_keys

In [None]:
# Accumulator for the evaluation table: one list per tracked loss plus one
# list holding the sample names. Rebuilt from scratch whenever this cell runs.
results = {loss_key: [] for loss_key in loss_keys}
results["name"] = []

class EvalCallback(tf.keras.callbacks.Callback):
    """Keras callback that records the tracked losses after every test batch.

    For each name in the module-level ``loss_keys`` list, the value found in
    the batch ``logs`` is appended to the matching list in the module-level
    ``results`` dict. A name missing from ``logs`` is recorded as ``None``.
    """

    def on_test_batch_end(self, batch, logs=None):
        """Append the losses of the finished test batch to ``results``.

        Args:
            batch: Index of the batch that just finished (unused).
            logs: Mapping from metric name to value for this batch. May be
                ``None``, in which case ``None`` is recorded for every key.
        """
        # The signature allows logs=None, so guard before calling .get().
        logs = logs or {}
        for key in loss_keys:
            results[key].append(logs.get(key))


# Evaluate every test sample, record its losses and name, and export the audio
# synthesised from the model's prediction as "<name>.wav" in `target_dir`.

# sf.write does not create missing directories, so make sure the export
# folder exists before the (long) loop starts.
os.makedirs(target_dir, exist_ok=True)

# The callback only appends to the module-level `results`, so a single
# instance can be shared by all evaluate() calls.
eval_callback = EvalCallback()

for x, y in tqdm(iter(test_dataset)):
    # evaluate() is called for its side effect only: the callback stores the
    # per-batch losses in `results`; the returned value is not needed.
    model.evaluate(x, y, verbose=False, callbacks=[eval_callback])

    # NOTE(review): the key is spelled "name:" (with a trailing colon) --
    # confirm this matches the key produced by the dataset.
    name = x["name:"][0]
    name = name.numpy()[0].decode()
    results["name"].append(name)

    # Turn the one-hot note encoding back into a note number offset by the
    # configured starting MIDI pitch.
    note_number = x["note_number"]
    note_number = tf.argmax(note_number, axis=-1) + conf.starting_midi_pitch

    preds = model.predict(x)
    transformed = conf.data_handler.output_transform({}, preds)
    transformed["mask"] = x["mask"]
    transformed["note_number"] = note_number

    h_freq, h_mag, h_phase = conf.data_handler.denormalize(transformed)
    audio = tsms.core.harmonic_synthesis(h_freq, h_mag, h_phase, conf.sample_rate, conf.frame_size)
    # Keep the first (only) item of the batch.
    audio = np.array(audio[0])

    # Peak-normalise, but skip the division for an all-zero (silent) signal:
    # dividing by a zero peak would fill the exported file with NaNs.
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = audio / peak

    target_path = os.path.join(target_dir, f"{name}.wav")
    sf.write(target_path, audio, samplerate=conf.sample_rate)

In [None]:
# Turn the collected per-sample losses and names into a table (one column per
# key of `results`) and preview its first rows.
df = pd.DataFrame(data=results)
df.head(n=5)

In [None]:
# Save the evaluation table as CSV, relative to the current working directory.
results_csv_path = "test_results_with_measures_loss.csv"
df.to_csv(results_csv_path)

Export audio synthesised from the ground-truth data

In [None]:
# Export the audio synthesised directly from each test sample's own data
# (no model involved) as "<name>_true.wav" in `target_dir`.

# sf.write does not create missing directories, so make sure the export
# folder exists before the loop starts.
os.makedirs(target_dir, exist_ok=True)

for x, y in tqdm(iter(test_dataset)):
    # NOTE(review): the key is spelled "name:" (with a trailing colon) --
    # confirm this matches the key produced by the dataset.
    name = x["name:"][0]
    name = name.numpy()[0].decode()

    # TODO: check denormalize to see whether the following may be required
    # note_number = x["note_number"]
    # note_number = tf.argmax(note_number, axis=-1) + conf.starting_midi_pitch
    # x["note_number"] = note_number

    h_freq, h_mag, h_phase = conf.data_handler.denormalize(x)
    audio = tsms.core.harmonic_synthesis(h_freq, h_mag, h_phase, conf.sample_rate, conf.frame_size)
    # Keep the first (only) item of the batch.
    audio = np.array(audio[0])

    # Peak-normalise, but skip the division for an all-zero (silent) signal:
    # dividing by a zero peak would fill the exported file with NaNs.
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = audio / peak

    target_path = os.path.join(target_dir, f"{name}_true.wav")
    sf.write(target_path, audio, samplerate=conf.sample_rate)