# Sound Generation

## Overview

![System](docs/SG_System.jpg)

## Encoder

![Encoder](docs/SG_Encoder.jpg)

## Decoder

![Decoder](docs/SG_Decoder.jpg)

In [5]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import numpy as np
import soundfile as sf
import tsms
from tcvae import dataset, localconfig, model
from tcvae.compute_measures import heuristic_names
from IPython.display import Audio

## Load Config, Data and Model

In [6]:
# Restore the configuration that was saved next to the checkpoint.
conf = localconfig.LocalConfig()
conf.load_config_from_file("checkpoints/mt_no_phase/conf.txt")

# Local overrides: the helper functions in this notebook index element [0]
# and build one-hot arrays with a leading dimension of 1, so batches must
# hold a single example.
conf.dataset_dir = "complete_dataset"
conf.batch_size = 1

# Two-way lookup between heuristic measure names and their column positions.
measure_to_index = {name: position for position, name in enumerate(heuristic_names)}
index_to_measure = {position: name for name, position in measure_to_index.items()}

print("Configuration loaded")

# Only the validation and test splits are used here; the first split is discarded.
_, valid, test = dataset.get_dataset(conf)
test_iter = iter(test)
valid_iter = iter(valid)

print("Data loaded")

mt_model = model.MtVae(conf)
# Run the model once on a real batch before loading weights
# (presumably so its variables exist for load_weights -- TODO confirm).
_ = mt_model(next(valid_iter))

mt_model.load_weights("checkpoints/mt_no_phase/74_mt_no_phase_0.01227.h5")

print("Model loaded")

Configuration loaded
Data loaded
Model loaded


## Helper Functions

In [7]:
def change_params(batch, params):
    """Return a copy of ``batch`` with selected conditioning inputs overridden.

    Args:
        batch: dict holding at least the entries that are being overridden
            ("note_number", "velocity", "measures"). Single-example batches
            are assumed: the replacement one-hot arrays have a leading
            dimension of 1.
        params: dict with any of the optional keys
            "note_number" (int, 40..88),
            "velocity" (number, 25..127),
            "measures" (dict mapping a name from ``heuristic_names`` to the
            new value for that measure).
            Missing, ``None`` or empty entries leave the batch unchanged.

    Returns:
        A shallow copy of ``batch`` in which the overridden entries are
        replaced by new numpy arrays. The dict passed in is not modified.

    Raises:
        AssertionError: if a value is out of range or a measure name is not
            in ``heuristic_names``.
    """
    note_number = params.get("note_number")
    velocity = params.get("velocity")
    measures = params.get("measures")

    # Work on a shallow copy so the caller's dict keeps its original entries.
    batch = batch.copy()

    if note_number is not None:
        assert 40 <= note_number <= 88
        original_note = np.argmax(batch["note_number"], axis=-1)[0] + conf.starting_midi_pitch
        print(f"note_number changed from {original_note} to {note_number}")
        # One-hot position is the MIDI pitch relative to the first supported pitch.
        note_index = note_number - conf.starting_midi_pitch
        updated_note = np.zeros((1, conf.num_pitches))
        updated_note[:, note_index] = 1.
        batch["note_number"] = updated_note

    if velocity is not None:
        assert 25 <= velocity <= 127
        original_vel = (np.argmax(batch["velocity"], axis=-1)[0] + 1) * 25
        print(f"velocity changed from {original_vel} to {velocity}")
        # Velocities are bucketed in steps of 25: 25 -> 0, 50 -> 1, ..., 127 -> 4.
        velocity_index = int(velocity / 25 - 1)
        updated_vel = np.zeros((1, conf.num_velocities))
        updated_vel[:, velocity_index] = 1.
        batch["velocity"] = updated_vel

    # Guard on the requested overrides (not on the module-level lookup table).
    if measures:
        # np.array copies and accepts both framework tensors and ndarrays, so a
        # batch that was already processed by this function can be passed again.
        updated_measures = np.array(batch["measures"])
        for name, val in measures.items():
            assert name in heuristic_names
            column = measure_to_index[name]
            original_value = updated_measures[:, column][0]
            print(f"{name} changed from {original_value} to {val}")
            updated_measures[:, column] = val
        batch["measures"] = updated_measures

    return batch


def get_prediction(batch, conf, prediction=None):
    """Synthesize audio from a batch or from a model prediction.

    Args:
        batch: dict providing "note_number" and "mask"; when ``prediction``
            is None it is also the input to the output transform.
        conf: configuration exposing ``starting_midi_pitch``, ``data_handler``,
            ``sample_rate`` and ``frame_size``.
        prediction: optional model output; when given it is transformed
            instead of the batch, with ``pred=True``.

    Returns:
        The synthesized audio as a numpy array with singleton axes removed.
    """
    features = batch.copy()
    from_model = prediction is not None

    # Ground truth is rendered from the batch, reconstructions from the model output.
    if from_model:
        source = prediction
    else:
        source = features

    pitch = np.argmax(features["note_number"], axis=-1) + conf.starting_midi_pitch
    handler = conf.data_handler
    transformed = handler.output_transform(source, pred=from_model)
    f, m, p = handler.denormalize(transformed, features["mask"], pitch)

    synthesized = tsms.core.harmonic_synthesis(f, m, p, conf.sample_rate, conf.frame_size)
    return np.squeeze(synthesized.numpy())


def get_audios(batch, update_params=None):
    """Render the model's audio and the ground-truth audio for one batch.

    Args:
        batch: input batch dict; a shallow copy is used internally.
        update_params: optional overrides forwarded to ``change_params``
            before the model is run.

    Returns:
        Tuple ``(audio_pred, audio_gt)``: audio synthesized from the model
        prediction and audio synthesized from the unmodified batch.
    """
    working = batch.copy()

    # Ground truth is rendered first, before any override is applied.
    audio_gt = get_prediction(working, conf, prediction=None)

    if update_params is not None:
        working = change_params(working, update_params)

    model_output = mt_model.predict(working)
    audio_pred = get_prediction(working, conf, prediction=model_output)
    return audio_pred, audio_gt

## Get Predictions

In [8]:
# Pull the next example from the test iterator; re-run this cell to move on to another one.
batch = next(test_iter)

In [21]:
# Overrides forwarded to change_params via get_audios; uncomment an entry to apply it.
# change_params asserts 40 <= note_number <= 88 and 25 <= velocity <= 127, and every
# key under "measures" must be one of heuristic_names.
update_params = {
    # "note_number": 40,
    # "velocity": 25,
    "measures": {
        # "bass": 0.,
        # "high_mid": 1.,
        # "inharmonicity": 1.,
        # "even_odd": 1.,
        # "sparse_rich": 1.,
    }
}

# audio_pred comes from the model run on the (possibly overridden) batch,
# audio_gt is synthesized directly from the unmodified batch.
audio_pred, audio_gt = get_audios(batch, update_params)

even_odd changed from 0.43971604108810425 to 1.0


In [22]:
# Ground-truth audio, synthesized from the batch itself.
Audio(audio_gt, rate=conf.sample_rate)

In [23]:
# Audio synthesized from the model prediction (with any overrides applied).
Audio(audio_pred, rate=conf.sample_rate)


## Export Audio Files

In [None]:
# Create a fresh iterator over the test split so the export does not depend on
# how many batches earlier cells already consumed.
test_iter = iter(test)

def write_audio(audio, conf, audio_path):
    """Peak-normalize ``audio`` and write it to ``audio_path``.

    Args:
        audio: array-like of audio samples.
        conf: configuration exposing ``sample_rate``.
        audio_path: destination passed straight to ``sf.write``.

    The signal is scaled so its largest absolute sample is 1. Silent or empty
    input is written unchanged, because dividing by a zero peak would fill the
    file with NaNs (and ``np.max`` raises on an empty array).
    """
    audio = np.asarray(audio)
    peak = np.max(np.abs(audio)) if audio.size else 0.0
    if peak > 0:
        audio = audio / peak
    sf.write(audio_path, audio, samplerate=conf.sample_rate)


# Export the next 20 test examples as ground-truth / reconstruction WAV pairs.
output_dir = os.path.join(os.getcwd(), "predictions")
# Make sure the target folder exists; writing into a missing directory fails.
os.makedirs(output_dir, exist_ok=True)

for i in range(0, 20):
    batch = next(test_iter)

    # get_audios is the helper defined in this notebook; without overrides it
    # returns the plain reconstruction alongside the ground truth.
    audio_pred, audio_gt = get_audios(batch)

    true_path = os.path.join(output_dir, f"{i}_true.wav")
    pred_path = os.path.join(output_dir, f"{i}_pred.wav")

    write_audio(audio_pred, conf, pred_path)
    write_audio(audio_gt, conf, true_path)

    print(i)