In [1]:
!git clone -b dataset https://github.com/TheSoundOfAIOSR/rg_sound_generation.git
%cd rg_sound_generation

fatal: destination path 'rg_sound_generation' already exists and is not an empty directory.
/content/rg_sound_generation


In [2]:
!pip install -q -r requirements_colab.txt

In [3]:
from google.colab import drive
# Mount Google Drive; the dataset, checkpoint and output paths used later live under it.
drive_mount_point = "/content/drive"
drive.mount(drive_mount_point, force_remount=False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import os
import tensorflow as tf
import numpy as np
import pandas as pd
import soundfile as sf
import tsms
from tqdm import tqdm
from tcae.train import ModelWrapper
from tcae.model import TCAEModel
from tcae.localconfig import LocalConfig
from tcae.dataset import get_dataset


# Project configuration object; its fields are overridden in the next cell.
conf = LocalConfig()

In [5]:
# Paths (both live on the mounted Google Drive)
conf.dataset_dir = "/content/drive/MyDrive/the_sound_of_ai/dataset/nsynth_guitar_splits"
conf.checkpoints_dir = "/content/drive/MyDrive/the_sound_of_ai/dataset/checkpoints"

# Architecture
conf.use_encoder = False
conf.simple_encoder = True
conf.using_categorical = False
conf.latent_dim = 2
conf.use_embeddings = False
conf.lc_dropout_rate = 0.0

# Conditioning inputs
conf.use_note_number = True
conf.use_velocity = True
conf.use_instrument_id = True
conf.use_heuristics = False
conf.use_one_hot_conditioning = False
conf.create_decoder_function = 'lc'

conf.print_model_summary = False

# Outputs: only the audio and h_mag losses carry a non-zero weight
conf.data_handler.update_losses_weights(
    audio=1.0,
    h_freq=0.0,
    f0_shifts=0.0,
    h_freq_shifts=0.0,
    h_mag=1.0,
    mag_env=0.0,
    h_mag_dist=0.0,
    measures=0.0)

conf.data_handler.compact_measures_logs = False
conf.data_handler.freq_scale_fn = 'tanh'  # 'none', 'tanh'
conf.data_handler.mag_scale_fn = 'exp_sigmoid'  # 'none', 'exp_sigmoid'
conf.data_handler.phase_scale_fn = 'tanh'  # 'none', 'tanh'
conf.data_handler.measures_mapping_type = 'linear'

# conf.pretrained_model_path = "/content/drive/MyDrive/the_sound_of_ai/checkpoints/lc_2/lc_2_59_0.00644.ckpt"

# Training
conf.learning_rate = 3e-04
conf.lr_factor = 0.5
conf.lr_plateau = 7
conf.model_name = "aud_mag"

# One sample per batch: the evaluation cells below read a single name per batch
# (x["name"][0]) and record one loss row per evaluated batch. The earlier
# `conf.batch_size = 32` was overwritten before use and has been removed.
conf.batch_size = 1

In [6]:
# Build the TCAE network from the configuration and wrap it together with the
# data handler's loss function.
network = TCAEModel(conf)
model = ModelWrapper(network, conf.data_handler.loss)

# get_dataset yields three values; only the middle one is used in this notebook.
datasets = get_dataset(conf)
_, valid_dataset, _ = datasets

In [7]:
# Run the model once on the first validation batch.
# NOTE(review): presumably done so the model's variables exist before the
# checkpoint is loaded in the next cell - confirm.
first_batch = next(iter(valid_dataset))
x, y = first_batch
p = model(x)

In [8]:
# Compile with an Adam optimizer using the configured learning rate.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=conf.learning_rate))
# Resolve the checkpoint inside conf.checkpoints_dir instead of repeating the
# Drive directory; with the configuration above this is the same path as before.
model.load_weights(os.path.join(conf.checkpoints_dir, "aud_mag_278_0.00911.ckpt"))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f307d7640d0>

In [None]:
# Derive the tracked loss names from the configuration: the total loss, every
# loss with a positive weight, then one entry per measure.
# NOTE(review): the measure losses are appended regardless of the "measures"
# weight; the following cell replaces this list with a hand-written one.
weights = conf.data_handler.losses_weights
loss_keys = ["loss"]
loss_keys.extend(key + "_loss" for key in list(weights.keys()) if weights[key] > 0)
loss_keys.extend(key + "_loss" for key in conf.data_handler.measures_losses_weights.keys())

loss_keys

['loss',
 'audio_loss',
 'h_mag_loss',
 'inharmonic_loss',
 'even_odd_loss',
 'sparse_rich_loss',
 'attack_rms_loss',
 'decay_rms_loss',
 'attack_time_loss',
 'decay_time_loss',
 'bass_loss',
 'mid_loss',
 'high_mid_loss',
 'high_loss']

In [9]:
# Losses recorded per sample during evaluation: the total loss and the two
# components named here.
loss_keys = ["loss", "audio_loss", "h_mag_loss"]

In [10]:
target_dir = "/content/drive/MyDrive/the_sound_of_ai/dataset/eval"
# The wav files are written straight into this folder, so make sure it exists.
os.makedirs(target_dir, exist_ok=True)

# Per-sample results: one list per tracked loss plus the sample names.
results = {key: [] for key in loss_keys}
results["name"] = []


class EvalCallback(tf.keras.callbacks.Callback):
    """Append the tracked loss values of every evaluated batch to `results`."""

    def on_test_batch_end(self, batch, logs=None):
        logs = logs or {}  # the signature allows logs=None
        for key in loss_keys:
            results[key].append(logs.get(key))


# The callback keeps no state of its own, so one instance serves every evaluate call.
eval_callback = EvalCallback()

for x, y in tqdm(iter(valid_dataset)):
    # With one sample per batch, each evaluate call adds exactly one row of
    # losses, matching the single name appended below.
    model.evaluate(x, y, verbose=False, callbacks=[eval_callback])
    name = x["name"][0].numpy()[0].decode()
    results["name"].append(name)

    # NOTE(review): presumably undoes the dataset's note-number scaling to get
    # back a MIDI pitch - confirm against the dataset code.
    note_number = x["note_number"] * conf.num_pitches + conf.starting_midi_pitch

    preds = model.predict(x)
    transformed = conf.data_handler.output_transform({}, preds)
    transformed["mask"] = x["mask"]
    transformed["note_number"] = note_number

    h_freq, h_mag, h_phase = conf.data_handler.denormalize(transformed)
    audio = tsms.core.harmonic_synthesis(h_freq, h_mag, h_phase, conf.sample_rate, conf.frame_size)
    audio = np.array(audio[0])

    # Peak-normalise; a silent signal is written unchanged instead of being
    # divided by zero (which would fill the file with NaNs).
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = audio / peak

    target_path = os.path.join(target_dir, f"{name}.wav")
    sf.write(target_path, audio, samplerate=conf.sample_rate)

1508it [26:12,  1.04s/it]


In [11]:
# Collect the per-sample losses and names into a table and preview the first rows.
df = pd.DataFrame(data=results)
df.head(n=5)

Unnamed: 0,loss,audio_loss,h_mag_loss,name
0,0.008995,0.004387,0.004608,guitar_synthetic_012-046-127
1,0.017445,0.012292,0.005153,guitar_electronic_039-052-025
2,0.003839,0.000162,0.003677,guitar_synthetic_001-077-100
3,0.008353,0.001363,0.006991,guitar_acoustic_007-056-050
4,0.005851,0.001957,0.003893,guitar_acoustic_033-047-025


In [12]:
# Save the per-sample losses. The path is relative to the current working
# directory and, as no index option is given, the row index is written too.
df.to_csv(path_or_buf="valid_losses.csv")

Export audio synthesised from the ground-truth data (saved as `<name>_true.wav` in the same folder as the model predictions)

In [13]:
# The wav files are written straight into target_dir, so make sure it exists.
os.makedirs(target_dir, exist_ok=True)

for x, y in tqdm(iter(valid_dataset)):
    name = x["name"][0].numpy()[0].decode()

    # NOTE(review): presumably undoes the dataset's note-number scaling to get
    # back a MIDI pitch - confirm against the dataset code.
    note_number = x["note_number"] * conf.num_pitches + conf.starting_midi_pitch
    x["note_number"] = note_number

    # Synthesise directly from the dataset batch (no model involved).
    h_freq, h_mag, h_phase = conf.data_handler.denormalize(x)
    audio = tsms.core.harmonic_synthesis(h_freq, h_mag, h_phase, conf.sample_rate, conf.frame_size)
    audio = np.array(audio[0])

    # Peak-normalise; a silent signal is written unchanged instead of being
    # divided by zero (which would fill the file with NaNs).
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = audio / peak

    target_path = os.path.join(target_dir, f"{name}_true.wav")
    sf.write(target_path, audio, samplerate=conf.sample_rate)

1508it [19:00,  1.32it/s]
