This notebook shows how to use the pretrained PhraseVAE and PhraseLDM models.

In [None]:
# Encode and decode phrases using the pretrained PhraseVAE model

import os
from models.vae_inference import PhraseVAE

# Create model: load the pretrained PhraseVAE weights and switch to eval mode
vae = PhraseVAE.from_pretrained("LongshenOu/phrase-vae")
vae.eval()

# Example phrases in the tokenized text form passed to `encode_batch`.
# Presumably REMI-z style tokens (i = instrument, o = onset, p = pitch,
# d = duration, b-1 = bar line) -- TODO confirm against the tokenizer docs.
phrases = [
    'i-0 o-0 p-41 d-18 o-3 p-48 d-15 o-6 p-53 d-9 o-9 p-60 d-15 o-24 p-43 d-15 o-27 p-50 d-12 o-30 p-55 d-12 o-33 p-59 d-9',
    'i-25 o-0 p-69 d-6 o-6 p-67 d-3 o-12 p-65 d-3 o-18 p-74 d-3 o-21 p-76 d-3 o-24 p-74 d-15 o-42 p-60 d-3',
    'b-1',
]

# Encode every phrase to a latent vector with sampling turned off
# (do_sample=False), then decode the latents back to text as a round-trip check.
latents = vae.encode_batch(phrases, do_sample=False)
print(f'Latent shape: {latents.shape}')  # (n_phrase, latent_dim)
# unsqueeze(1) adds a singleton axis at position 1 so the tensor is 3-D, as decode_batch expects.
decoded = vae.decode_batch(latents.unsqueeze(1)) # input shape: (n_song, n_phrase, latent_dim)
# `decoded` is joined as a sequence of strings, one per line.
print('\n'.join(decoded))


  from .autonotebook import tqdm as notebook_tqdm


Latent shape: torch.Size([3, 64])
i-0 o-0 p-41 d-18 o-3 p-48 d-15 o-6 p-53 d-9 o-9 p-60 d-15 o-24 p-43 d-15 o-27 p-50 d-12 o-30 p-55 d-12 o-33 p-59 d-9
i-25 o-0 p-69 d-6 o-6 p-67 d-3 o-12 p-65 d-3 o-18 p-74 d-3 o-21 p-76 d-3 o-24 p-74 d-15 o-42 p-60 d-3
b-1


In [2]:
# Generate phrases using the pretrained PhraseLDM model

# Default to physical GPU 2, but keep any CUDA_VISIBLE_DEVICES value the user
# already exported instead of overwriting it with a machine-specific index.
# The variable only takes effect if it is set before CUDA is first initialized
# in this kernel.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "2")
from models.ldm_inference import PhraseLDM

# Load the unconditional latent diffusion model onto the GPU in eval mode
ldm = PhraseLDM.from_pretrained("LongshenOu/phrase-ldm-uncond").cuda()
ldm.eval()

# The VAE is passed to `generate`, so move it to the GPU as well
# (written the same way as in the conditioned-generation cells).
vae = vae.cuda()

# `generate` returns the decoded outputs together with the latents
decoded, latents = ldm.generate(n_sample=2, vae=vae)
print(decoded)
print(f'Latent shape: {latents.shape}')


VAE scale factor: 0.7590118646621704


  rotary_embed_dim = self.attention_head_dim // 2
  deprecate("Transformer2DModelOutput", "1.0.0", deprecation_message)
  4%|▍         | 41/1000 [00:07<01:09, 13.80it/s] 

KeyboardInterrupt: 

In [12]:
# Save generation as MIDI
import os

from remi_z import MultiTrack

# Make sure the output folder exists so writing does not fail on a fresh checkout.
os.makedirs('misc', exist_ok=True)

# `decoded` must come from a generation cell that ran to completion;
# each entry is parsed from its REMI-z string and written to its own MIDI file.
for i, out_str in enumerate(decoded):
    mt = MultiTrack.from_remiz_str(out_str)
    mt.set_tempo(90)
    mt.to_midi(f'misc/test_output_{i}.mid')

MIDI file successfully written to misc/test_output_0.mid
MIDI file successfully written to misc/test_output_1.mid


In [None]:
# Generate using length conditioned PhraseLDM model

# Default to physical GPU 2, but keep any CUDA_VISIBLE_DEVICES value the user
# already exported instead of overwriting it with a machine-specific index.
# The variable only takes effect if it is set before CUDA is first initialized
# in this kernel.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "2")
from models.ldm_inference import PhraseLDM

# Load the length-conditioned latent diffusion model onto the GPU in eval mode
ldm = PhraseLDM.from_pretrained("LongshenOu/phrase-ldm-lencond").cuda()
ldm.eval()
vae = vae.cuda()

n_bars = 64 # This actually means [60, 70) bars
decoded, latents = ldm.generate(n_sample=2, vae=vae, n_bars=n_bars)
print(decoded)
print(f'Latent shape: {latents.shape}')

from remi_z import MultiTrack

# Make sure the output folder exists so writing does not fail on a fresh checkout.
os.makedirs('misc', exist_ok=True)

# Write each generated sample to its own MIDI file
for i, out_str in enumerate(decoded):
    mt = MultiTrack.from_remiz_str(out_str)
    mt.set_tempo(90)
    mt.to_midi(f'misc/test_output_lencond_{i}.mid')

VAE scale factor: 0.7590118646621704
Length Buckets: tensor([6, 6], device='cuda:0')


  rotary_embed_dim = self.attention_head_dim // 2
  deprecate("Transformer2DModelOutput", "1.0.0", deprecation_message)
100%|██████████| 1000/1000 [00:40<00:00, 24.68it/s]


['i-25 o-0 p-72 d-6 o-6 p-76 d-6 o-12 p-79 d-6 o-18 p-76 d-3 o-21 p-79 d-3 o-24 p-79 d-3 o-30 p-72 d-6 o-36 p-76 d-3 o-39 p-72 d-3 o-42 p-79 d-6 i-0 o-0 p-48 d-15 o-6 p-55 d-9 o-12 p-67 d-12 o-18 p-60 d-6 o-24 p-48 d-15 o-30 p-55 d-6 o-36 p-60 d-12 o-42 p-64 d-6 [INST] b-1 i-25 o-0 p-74 d-18 o-36 p-74 d-6 o-42 p-77 d-3 p-76 d-3 o-45 p-81 d-3 p-77 d-3 i-0 o-0 p-50 d-39 o-6 p-57 d-33 o-12 p-62 d-15 o-18 p-65 d-12 o-24 p-69 d-12 o-30 p-57 d-6 o-36 p-65 d-12 o-42 p-62 d-6 [INST] b-1 i-25 o-0 p-79 d-15 o-12 p-79 d-3 o-18 p-79 d-3 o-24 p-79 d-6 o-30 p-79 d-6 o-36 p-84 d-6 o-42 p-86 d-6 i-0 o-0 p-48 d-15 o-6 p-55 d-9 o-12 p-67 d-12 o-18 p-60 d-6 o-24 p-43 d-18 o-30 p-50 d-12 o-36 p-71 d-12 p-67 d-12 p-55 d-9 o-42 p-62 d-6 [INST] b-1 i-25 o-0 p-84 d-12 i-0 o-0 p-48 d-36 o-6 p-55 d-18 o-12 p-60 d-33 o-18 p-55 d-24 o-24 p-67 d-24 p-64 d-24 [INST] b-1 i-13 o-0 p-76 d-15 o-18 p-76 d-6 o-24 p-76 d-3 o-30 p-74 d-3 o-36 p-74 d-6 o-42 p-76 d-6 i-0 o-0 p-48 d-21 o-6 p-55 d-18 o-12 p-67 d-21 o-18 p-60 d

In [None]:
# Generate using length and section conditioned PhraseLDM model

# Default to physical GPU 2, but keep any CUDA_VISIBLE_DEVICES value the user
# already exported instead of overwriting it with a machine-specific index.
# The variable only takes effect if it is set before CUDA is first initialized
# in this kernel.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "2")
from models.ldm_inference import PhraseLDM

# NOTE(review): this is the same checkpoint id as the length-conditioned cell,
# although this cell also passes `sec_cond` -- confirm that this checkpoint
# supports section conditioning, or point to the intended one.
ldm = PhraseLDM.from_pretrained("LongshenOu/phrase-ldm-lencond").cuda()
ldm.eval()
vae = vae.cuda()

n_bars = 64 # This actually means [60, 70) bars
# Section plan as "<label>-<number>" tokens; the numbers sum to 64, matching n_bars.
# NOTE(review): the labels mix 'x-4' and 'X-4' -- confirm whether the case
# difference is intentional.
sec_cond = 'i-8 A-8 A-8 B-4 B-4 x-4 A-8 B-4 B-4 B-4 B-4 X-4'

decoded, latents = ldm.generate(n_sample=2, vae=vae, n_bars=n_bars, sec_cond=sec_cond)
print(decoded)
print(f'Latent shape: {latents.shape}')

from remi_z import MultiTrack

# Make sure the output folder exists so writing does not fail on a fresh checkout.
os.makedirs('misc', exist_ok=True)

# Write each generated sample to its own MIDI file
for i, out_str in enumerate(decoded):
    mt = MultiTrack.from_remiz_str(out_str)
    mt.set_tempo(90)
    mt.to_midi(f'misc/test_output_seccond_{i}.mid')

VAE scale factor: 0.7590118646621704
Length Buckets: tensor([6, 6], device='cuda:0')


  rotary_embed_dim = self.attention_head_dim // 2
  deprecate("Transformer2DModelOutput", "1.0.0", deprecation_message)
100%|██████████| 1000/1000 [00:40<00:00, 24.58it/s]


['i-0 o-0 p-67 d-6 p-60 d-6 o-6 p-64 d-6 o-12 p-67 d-6 p-60 d-6 o-18 p-60 d-9 o-36 p-67 d-6 p-62 d-6 o-42 p-60 d-6 [INST] [INST] b-1 i-25 o-0 p-67 d-6 o-6 p-60 d-6 o-12 p-67 d-6 o-18 p-60 d-6 o-36 p-60 d-6 o-42 p-62 d-6 i-0 o-0 p-67 d-6 p-64 d-6 p-48 d-18 o-6 p-60 d-6 o-12 p-67 d-3 p-64 d-3 p-60 d-6 o-18 p-59 d-3 o-24 p-67 d-6 p-64 d-6 p-60 d-6 p-45 d-12 o-30 p-57 d-6 o-36 p-64 d-12 p-60 d-6 p-45 d-15 [INST] b-1 i-25 o-0 p-60 d-6 o-6 p-69 d-6 o-12 p-60 d-6 o-18 p-60 d-6 o-24 p-67 d-21 i-0 o-0 p-60 d-9 p-57 d-6 p-53 d-9 p-41 d-18 o-6 p-48 d-6 o-12 p-60 d-9 p-57 d-6 o-18 p-53 d-3 o-24 p-59 d-9 p-55 d-9 p-43 d-15 o-30 p-50 d-12 o-36 p-62 d-9 p-59 d-9 p-55 d-3 o-42 p-55 d-3 [INST] b-1 i-0 o-0 p-67 d-9 p-60 d-12 p-48 d-12 o-6 p-55 d-6 o-12 p-67 d-6 p-64 d-6 p-60 d-6 o-18 p-55 d-6 o-24 p-64 d-12 p-60 d-12 p-55 d-12 p-48 d-12 [INST] [INST] b-1 i-13 o-6 p-72 d-3 o-9 p-72 d-3 o-12 p-72 d-3 o-18 p-72 d-3 o-24 p-71 d-3 o-30 p-71 d-6 o-36 p-71 d-6 o-42 p-67 d-6 i-0 o-0 p-64 d-9 p-60 d-6 p-55 d-3 p