In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import numpy as np
from tcvae import dataset, localconfig, model
import tsms
from IPython.display import Audio
import soundfile as sf

In [2]:
# Build the configuration by hand for the "mt_decoder_no_phase" checkpoint.
conf = localconfig.LocalConfig()

# Architecture: no encoder, non-variational, conditioned on note number,
# velocity and heuristics.
conf.use_encoder = False
conf.simple_encoder = False
conf.simple_decoder = False
conf.use_max_pool = False
conf.is_variational = False
conf.use_note_number = True
conf.use_velocity = True
conf.use_heuristics = True
conf.latent_dim = 16
conf.print_model_summary = False

# Outputs: every head except the harmonic phase difference is enabled, and
# the phase-difference loss is weighted to zero accordingly.
conf.mt_outputs["f0_shifts"]["enabled"] = True
conf.mt_outputs["h_freq_shifts"]["enabled"] = True
conf.mt_outputs["mag_env"]["enabled"] = True
conf.mt_outputs["h_mag_dist"]["enabled"] = True
conf.mt_outputs["h_phase_diff"]["enabled"] = False
conf.data_handler.losses_weights["f0_loss"] = 1.0
conf.data_handler.losses_weights["mag_env_loss"] = 1.0
conf.data_handler.losses_weights["h_freq_shifts_loss"] = 1.0
conf.data_handler.losses_weights["h_mag_loss"] = 1.0
conf.data_handler.losses_weights["h_phase_diff_loss"] = 0.0

# Training hyper-parameters carried over from the training configuration.
conf.learning_rate = 1e-3
conf.lr_factor = 0.5
conf.lr_plateau = 3
conf.model_name = "mt_decoder_no_phase"
conf.early_stopping = 10

# Inference settings.
conf.dataset_dir = "complete_dataset"
# One example per batch. This used to be assigned 16 in the training section
# and then overridden here, so only this value ever took effect.
conf.batch_size = 1

# The first split returned by get_dataset is not used in this notebook.
_, valid, test = dataset.get_dataset(conf)
test_iter = iter(test)
valid_iter = iter(valid)

In [3]:
checkpoint_path = "checkpoints/mt_decoder_no_phase/89_mt_decoder_no_phase_0.02012.h5"

_mt_model = model.MtVae(conf)
# Call the fresh model once on a validation batch before loading weights;
# presumably this is what creates the variables the checkpoint is restored
# into (TODO confirm). Note that it consumes the first validation batch.
_ = _mt_model(next(valid_iter))
_mt_model.load_weights(checkpoint_path)

print("Model loaded")

Model loaded


In [4]:
# Next validation batch; the first one was already consumed by the model call
# in the loading cell above.
batch = next(valid_iter)

In [5]:
# Run the loaded model on the batch; `p` is fed to output_transform below.
p = _mt_model.predict(batch)

In [6]:
# "note_number" is argmax-decoded along its last axis and offset by the
# configured starting pitch to obtain the MIDI note of each example.
pitch_index = np.argmax(batch["note_number"], axis=-1)
note = pitch_index + conf.starting_midi_pitch

# Convert the raw model outputs with the data handler's output transform.
transformed = conf.data_handler.output_transform(p)

print(note)

[47]


In [7]:
# De-normalise the transformed outputs using the batch mask and the MIDI note;
# the result is unpacked into (freq, mag, phase) in the next cell.
de_normalized = conf.data_handler.denormalize(transformed, batch["mask"], note)

In [8]:
# Synthesise the predicted harmonic parameters and reduce the result to a
# squeezed NumPy array for playback.
freq, mag, phase = de_normalized
synth = tsms.core.harmonic_synthesis(
    freq, mag, phase, conf.sample_rate, conf.frame_size
)
audio = np.squeeze(synth.numpy())


In [9]:
# Play back at the rate the audio was synthesised with instead of a
# hard-coded 16000, so the two cannot drift apart.
Audio(audio, rate=conf.sample_rate)

In [10]:
# Ground-truth counterpart of the prediction above: transform the batch itself
# (pred=False), de-normalise it and synthesise audio from it.
gt_transform = conf.data_handler.output_transform(batch, pred=False)
# Use dedicated names here. Unpacking into `f, m, p` would overwrite the model
# prediction `p`, so re-running the transform cell would operate on the phase.
gt_freq, gt_mag, gt_phase = conf.data_handler.denormalize(
    gt_transform, batch["mask"], note
)

audio_gt = tsms.core.harmonic_synthesis(
    gt_freq, gt_mag, gt_phase, conf.sample_rate, conf.frame_size
)
audio_gt = np.squeeze(audio_gt.numpy())


In [11]:
# Play back at the rate the audio was synthesised with instead of a
# hard-coded 16000, so the two cannot drift apart.
Audio(audio_gt, rate=conf.sample_rate)


## Export Some Examples

In [12]:
def get_pred_and_get(batch):
    """Synthesise predicted and ground-truth audio for one batch.

    Uses the notebook-level ``_mt_model`` and ``conf``.

    Args:
        batch: Mapping that provides at least the "note_number" and "mask"
            entries. It is also passed whole to the model and to the data
            handler's ``output_transform``.

    Returns:
        Tuple ``(audio_pred, audio_gt)``: the squeezed NumPy audio synthesised
        from the model outputs and from the batch itself, respectively.
    """
    model_outputs = _mt_model.predict(batch)
    midi_pitch = np.argmax(batch["note_number"], axis=-1) + conf.starting_midi_pitch
    handler = conf.data_handler

    def _render(transformed_outputs):
        # De-normalise to (freq, mag, phase) and run harmonic synthesis.
        freq, mag, phase = handler.denormalize(
            transformed_outputs, batch["mask"], midi_pitch
        )
        waveform = tsms.core.harmonic_synthesis(
            freq, mag, phase, conf.sample_rate, conf.frame_size
        )
        return np.squeeze(waveform.numpy())

    audio_pred = _render(handler.output_transform(model_outputs))
    audio_gt = _render(handler.output_transform(batch, pred=False))
    return audio_pred, audio_gt

In [13]:
# Start a fresh pass over the test split for the export loop below.
test_iter = iter(test)

def write_audio(audio, conf, audio_path):
    """Peak-normalise ``audio`` and write it to ``audio_path``.

    Args:
        audio: Array-like of audio samples.
        conf: Object exposing ``sample_rate``, used as the file's sample rate.
        audio_path: Destination path handed to ``sf.write``.

    Silent (all-zero) or empty input is written unchanged. Dividing by a zero
    peak would otherwise fill the file with NaN.
    """
    audio = np.asarray(audio)
    peak = np.max(np.abs(audio)) if audio.size else 0.0
    if peak > 0:
        audio = audio / peak
    sf.write(audio_path, audio, samplerate=conf.sample_rate)


# Export up to 20 test examples as "<i>_pred.wav" / "<i>_true.wav" pairs.
output_dir = os.path.join(os.getcwd(), "predictions")
# Make sure the destination exists before anything is written into it.
os.makedirs(output_dir, exist_ok=True)

# zip() stops at whichever input runs out first, so a test split with fewer
# than 20 batches ends the export cleanly instead of raising StopIteration.
for i, batch in zip(range(20), test_iter):
    audio_pred, audio_gt = get_pred_and_get(batch)

    true_path = os.path.join(output_dir, f"{i}_true.wav")
    pred_path = os.path.join(output_dir, f"{i}_pred.wav")

    write_audio(audio_pred, conf, pred_path)
    write_audio(audio_gt, conf, true_path)

    print(i)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19


In [1]:
import numpy as np
from sound_generator import SoundGenerator
from IPython.display import Audio
from tcvae import dataset

# Config file and weights of the "cnn_ae5_2d" checkpoint used in this notebook.
config = "checkpoints/cnn_ae5_2d/Default_cnn_ae5_2d.json"
checkpoint = "checkpoints/cnn_ae5_2d/79_cnn_ae5_2d_0.01495.h5"

# Build the generator from the config file, then load the checkpoint into it.
sg = SoundGenerator(config)
sg.load_model(checkpoint)

# Point the generator's config at the local dataset and keep only the first
# split returned by get_dataset; the other two are discarded.
sg.conf.dataset_dir = "complete_dataset"
train, _, _ = dataset.get_dataset(sg.conf)

INFO:root:Initializing SoundGenerator
INFO:root:SoundGenerator initialized
INFO:root:Creating complete model from config


Creating Auto Encoder


INFO:root:Loading pretrained weights for complete model
INFO:root:Complete model loaded
INFO:root:Creating decoder
INFO:root:Decoder created


In [2]:
# Iterator over the training split loaded above.
train_iter = iter(train)

In [3]:
# Encode the "h" entry of one training batch and inspect the latent code.
batch = next(train_iter)
z = sg.encoder.predict(batch["h"])

print(z)

# Summary statistics of the latent values, one per line: mean, min, max, std.
for stat in (np.mean, np.min, np.max, np.std):
    print(stat(z))


[[1.0609375 1.2616953 0.5713314 1.0012288 1.4979545]]
1.0786295
0.5713314
1.4979545
0.3075717


In [4]:
# "measures" entry of the same batch; it is passed to sg.get_prediction below.
measures = batch["measures"]
print(measures)

tf.Tensor(
[[0.9971938  0.65744567 0.12538967 0.01675248 0.02236341 0.003996
  0.22877122 0.02519462 0.6991602  0.29892266 0.29033065]], shape=(1, 11), dtype=float32)


In [13]:
# Request a prediction for a fixed pitch and velocity using the batch's
# measures. "z" is left as None here; the commented alternative passes the
# encoded latent instead.
# NOTE(review): what the generator does when "z" is None is not shown here;
# confirm in SoundGenerator.
request = {
    "z": None,  # z[0].tolist(),
    "measures": measures.numpy()[0].tolist(),
    "pitch": 42,
    "velocity": 100,
}
pred = sg.get_prediction(request)

87


In [14]:
# Play the "audio" entry of the prediction.
# NOTE(review): the playback rate is hard-coded to 16000; presumably it should
# follow the generator's configured sample rate. Confirm and replace.
audio = pred["audio"]
Audio(audio, rate=16000)

In [24]:
# Latent code reported back by the generator. The request passed "z": None, so
# these values presumably originate inside get_prediction (verify).
pred["z"]

[-2.1677251774040167,
 -2.2009313405638093,
 2.76934910754696,
 0.2627049180160301,
 4.6762568405942]

In [25]:
# Measures reported back by the generator.
# NOTE(review): the displayed values differ from the `measures` passed in the
# request; presumably get_prediction rescales or replaces them. Confirm.
pred["measures"]

[0.2825229369195575,
 0.862190980888438,
 0.05869520677498814,
 0.7105248969803164,
 -0.47228500590957967,
 -0.2826879948034669,
 0.2154043564434022,
 -0.7093931763034094,
 -0.1222837169921461,
 1.5441439261725305,
 -0.0692045076970951]