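This notebook shows how to use the pretrained PhraseVAE and PhraseLDM models: extracting phrases from a MIDI file with REMI-z, encoding and decoding them with PhraseVAE, and generating music with the unconditional, length-conditioned, and section-conditioned PhraseLDM checkpoints.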

In [1]:
# Load a MIDI file with REMI-z and extract the phrase sequence of one bar
from remi_z import MultiTrack

mt = MultiTrack.from_midi('misc/test_output_0.mid')
print(mt)

# Index 20 selects a single bar (the 21st one if indexing is zero-based, as is
# usual in Python -- TODO confirm against the remi_z docs).
# with_bar_end=True keeps the bar-end token as the final entry; in the saved
# output below that is the trailing 'b-1' line.
phrases = mt[20].get_phrases(with_bar_end=True)

# One phrase per line; in the saved output each phrase starts with an `i-*` token.
print(*phrases, sep='\n')

MultiTrack: 62 bars
i-25 o-36 p-81 d-6 o-42 p-79 d-6
i-13 o-0 p-69 d-6 o-6 p-60 d-6 o-12 p-60 d-6 o-18 p-62 d-6 o-24 p-60 d-18
i-0 o-0 p-60 d-9 p-57 d-9 p-45 d-39 o-6 p-52 d-9 o-12 p-60 d-6 p-57 d-6 o-18 p-52 d-9 o-24 p-64 d-6 p-60 d-6 p-57 d-9 o-30 p-52 d-9 o-36 p-60 d-6 p-57 d-3 o-42 p-57 d-6
b-1


In [2]:
# Encode and decode phrases using the pretrained PhraseVAE model
import os

# Pick the GPU before the first CUDA call in this kernel. "2" is the device
# index used by the original author; setdefault keeps it as the default but
# lets a CUDA_VISIBLE_DEVICES value that is already set in the environment
# take precedence, so the notebook is not tied to a machine with >= 3 GPUs.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "2")

from models.vae_inference import PhraseVAE

# Create model: load the pretrained checkpoint by name and move it to the GPU
vae = PhraseVAE.from_pretrained("LongshenOu/phrase-vae").cuda()

# Encode the `phrases` produced by the REMI-z cell above.
# do_sample=False presumably returns a deterministic latent instead of
# sampling one -- TODO confirm in models.vae_inference.
latents = vae.encode_batch(phrases, do_sample=False)
print(f'Latent shape: {latents.shape}')  # (n_phrase, latent_dim)

# decode_batch takes (n_song, n_phrase, latent_dim). unsqueeze(1) inserts a
# length-1 phrase axis, so every phrase is decoded as its own one-phrase song.
decoded = vae.decode_batch(latents.unsqueeze(1))

# In the saved output the decoded phrases are identical to the input phrases.
print('\n'.join(decoded))


  from .autonotebook import tqdm as notebook_tqdm


Latent shape: torch.Size([4, 64])
i-25 o-36 p-81 d-6 o-42 p-79 d-6
i-13 o-0 p-69 d-6 o-6 p-60 d-6 o-12 p-60 d-6 o-18 p-62 d-6 o-24 p-60 d-18
i-0 o-0 p-60 d-9 p-57 d-9 p-45 d-39 o-6 p-52 d-9 o-12 p-60 d-6 p-57 d-6 o-18 p-52 d-9 o-24 p-64 d-6 p-60 d-6 p-57 d-9 o-30 p-52 d-9 o-36 p-60 d-6 p-57 d-3 o-42 p-57 d-6
b-1


In [3]:
# Unconditional generation with the pretrained PhraseLDM model
from models.ldm_inference import PhraseLDM
from remi_z import MultiTrack

ldm = PhraseLDM.from_pretrained("LongshenOu/phrase-ldm-uncond").cuda()

# Generate two samples. `vae` comes from the PhraseVAE cell above and is handed
# to generate(); `decoded` prints as a list of REMI-z strings, one per sample.
decoded, latents = ldm.generate(n_sample=2, vae=vae)
print(decoded)
print(f'Latent shape: {latents.shape}')

# Save generation as MIDI, one file per sample, at a fixed tempo of 90.
# NOTE(review): sample 0 is written to 'misc/test_output_0.mid', which is the
# same path the first cell reads, so re-running the notebook changes that
# cell's input -- confirm this is intended.
for idx, remiz_str in enumerate(decoded):
    mt = MultiTrack.from_remiz_str(remiz_str)
    mt.set_tempo(90)
    mt.to_midi(f'misc/test_output_{idx}.mid')


  rotary_embed_dim = self.attention_head_dim // 2
  deprecate("Transformer2DModelOutput", "1.0.0", deprecation_message)
100%|██████████| 1000/1000 [00:10<00:00, 95.52it/s]


['i-25 o-33 p-76 d-3 o-36 p-76 d-3 o-39 p-74 d-3 o-42 p-72 d-3 o-45 p-74 d-3 i-0 o-0 p-45 d-42 o-6 p-52 d-18 o-12 p-57 d-15 o-18 p-60 d-24 o-24 p-64 d-12 o-30 p-52 d-6 o-36 p-69 d-12 o-39 p-57 d-3 o-42 p-60 d-6 [INST] b-1 i-25 o-33 p-76 d-3 o-36 p-76 d-3 o-39 p-74 d-3 o-42 p-72 d-3 o-45 p-74 d-3 i-0 o-0 p-45 d-21 o-6 p-52 d-6 o-12 p-60 d-12 o-18 p-57 d-12 o-24 p-45 d-12 o-30 p-52 d-3 o-33 p-57 d-15 o-36 p-60 d-12 o-42 p-52 d-6 [INST] b-1 i-25 o-0 p-72 d-15 o-15 p-72 d-3 o-18 p-71 d-3 o-21 p-72 d-6 o-33 p-69 d-6 o-42 p-71 d-6 i-0 o-0 p-41 d-39 o-6 p-48 d-30 o-12 p-53 d-15 o-18 p-57 d-18 o-24 p-65 d-24 o-30 p-53 d-6 o-36 p-60 d-12 o-42 p-57 d-6 [INST] b-1 i-25 o-0 p-68 d-12 i-0 o-0 p-43 d-39 o-6 p-50 d-30 o-12 p-55 d-24 o-18 p-59 d-30 o-24 p-62 d-24 p-59 d-18 o-42 p-55 d-6 [INST] b-1 i-13 o-0 p-64 d-3 o-6 p-64 d-3 o-9 p-64 d-3 o-12 p-64 d-3 o-15 p-62 d-3 o-18 p-60 d-3 o-21 p-62 d-9 o-39 p-62 d-3 o-42 p-64 d-3 o-45 p-62 d-3 i-0 o-0 p-36 d-39 o-6 p-43 d-33 o-12 p-48 d-15 o-18 p-60 d-15 o-2

In [4]:
# Length-conditioned generation with PhraseLDM
# Relies on `PhraseLDM` and `vae` from the cells above.
from remi_z import MultiTrack

ldm = PhraseLDM.from_pretrained("LongshenOu/phrase-ldm-lencond").cuda()

# Requested song length in bars. Per the original author's note this is not an
# exact length: 64 actually means the [60, 70) bar range (the saved output
# reports "Length Buckets: tensor([6, 6])" for the two samples).
n_bars = 64
decoded, latents = ldm.generate(n_sample=2, vae=vae, n_bars=n_bars)
print(decoded)
print(f'Latent shape: {latents.shape}')

# Write each generated REMI-z string to its own MIDI file at a fixed tempo of 90
for idx, remiz_str in enumerate(decoded):
    mt = MultiTrack.from_remiz_str(remiz_str)
    mt.set_tempo(90)
    mt.to_midi(f'misc/test_output_lencond_{idx}.mid')

Length Buckets: tensor([6, 6], device='cuda:0')


100%|██████████| 1000/1000 [00:10<00:00, 97.43it/s]


['i-0 o-0 p-64 d-42 p-60 d-42 p-57 d-42 p-45 d-42 [INST] b-1 i-25 o-0 p-72 d-6 o-6 p-69 d-6 o-12 p-72 d-6 o-18 p-69 d-6 o-24 p-74 d-18 i-0 o-0 p-64 d-21 p-60 d-21 p-57 d-18 p-52 d-21 o-24 p-67 d-18 p-62 d-18 p-59 d-18 p-55 d-15 p-43 d-15 [INST] b-1 i-25 o-0 p-69 d-6 o-6 p-67 d-6 o-12 p-69 d-6 o-18 p-67 d-6 o-24 p-72 d-12 o-42 p-67 d-6 i-0 o-0 p-64 d-39 p-60 d-39 p-55 d-39 p-52 d-39 p-36 d-39 [INST] b-1 i-25 o-0 p-67 d-6 o-6 p-69 d-6 o-12 p-69 d-6 o-18 p-67 d-6 o-24 p-67 d-24 i-0 o-0 p-64 d-33 p-60 d-30 p-57 d-27 p-45 d-21 o-33 p-60 d-30 o-45 p-45 d-42 [INST] b-1 i-0 o-0 p-71 d-36 p-64 d-36 p-55 d-36 o-36 p-64 d-9 p-55 d-9 [INST] [INST] b-1 i-0 o-0 p-41 d-18 o-6 p-53 d-12 o-12 p-60 d-12 p-57 d-12 o-24 p-43 d-18 o-30 p-50 d-12 o-36 p-62 d-12 p-55 d-12 [INST] [INST] b-1 [INST] [INST] [INST] b-1 i-0 o-0 p-62 d-48 p-59 d-48 p-55 d-48 p-43 d-48 [INST] [INST] b-1 i-25 o-36 p-76 d-3 o-42 p-69 d-3 [INST] [INST] b-1 i-25 o-0 p-72 d-6 o-6 p-69 d-6 o-12 p-72 d-6 o-18 p-69 d-6 o-24 p-74 d-18 o-42 p

In [None]:
# Generate using length and section conditioned PhraseLDM model
#
# This cell needs `vae` from the PhraseVAE cell above, so that cell must have
# run first. CUDA_VISIBLE_DEVICES is therefore not set again here: the GPU was
# already selected (and CUDA already in use) when `vae` was created, and
# changing the variable afterwards cannot affect device selection.
from models.ldm_inference import PhraseLDM
from remi_z import MultiTrack

ldm = PhraseLDM.from_pretrained("LongshenOu/phrase-ldm-seccond").cuda()

n_bars = 64 # This actually means [60, 70) bars

# Section condition: space-separated `<label>-<n>` tokens, presumably a section
# label followed by its length in bars -- TODO confirm the vocabulary.
# The numbers sum to 64, matching n_bars.
# NOTE(review): both 'x-4' and 'X-4' appear; confirm the case difference is
# intentional.
sec_cond = 'i-8 A-8 A-8 B-4 B-4 x-4 A-8 B-4 B-4 B-4 B-4 X-4'

decoded, latents = ldm.generate(n_sample=2, vae=vae, n_bars=n_bars, sec_cond=sec_cond)
print(decoded)
print(f'Latent shape: {latents.shape}')

# Write each generated REMI-z string to its own MIDI file at a fixed tempo of 90
for i, out_str in enumerate(decoded):
    mt = MultiTrack.from_remiz_str(out_str)
    mt.set_tempo(90)
    mt.to_midi(f'misc/test_output_seccond_{i}.mid')

Length Buckets: tensor([6, 6], device='cuda:0')


100%|██████████| 1000/1000 [00:10<00:00, 93.57it/s]


['i-0 o-0 p-64 d-9 p-60 d-9 p-57 d-9 p-41 d-15 o-12 p-64 d-9 p-60 d-3 p-57 d-3 o-18 p-62 d-3 o-24 p-57 d-9 p-53 d-9 p-38 d-21 o-30 p-45 d-12 o-36 p-65 d-9 p-60 d-12 p-53 d-12 [INST] [INST] b-1 i-0 o-0 p-64 d-9 p-60 d-6 p-55 d-9 p-36 d-18 o-6 p-43 d-12 o-9 p-60 d-3 o-12 p-64 d-3 p-60 d-6 p-55 d-6 o-18 p-47 d-3 o-24 p-60 d-12 p-45 d-12 o-36 p-64 d-3 p-60 d-6 o-42 p-59 d-3 p-52 d-3 [INST] [INST] b-1 i-0 o-0 p-60 d-3 p-53 d-3 p-50 d-9 p-38 d-3 o-12 p-60 d-3 p-55 d-3 o-18 p-53 d-3 o-24 p-64 d-9 p-60 d-9 p-57 d-9 p-45 d-18 o-30 p-52 d-9 o-36 p-64 d-6 p-60 d-6 o-42 p-52 d-6 [INST] [INST] b-1 i-25 o-0 p-69 d-21 i-0 o-0 p-65 d-42 p-60 d-42 p-57 d-39 p-53 d-42 p-41 d-42 [INST] b-1 i-13 o-18 p-69 d-3 o-21 p-72 d-3 o-24 p-72 d-6 o-30 p-71 d-9 o-42 p-69 d-3 o-45 p-71 d-3 i-0 o-0 p-65 d-6 p-60 d-6 p-57 d-6 p-53 d-18 p-41 d-33 o-12 p-65 d-9 p-60 d-3 p-57 d-3 o-24 p-60 d-9 p-57 d-12 p-53 d-9 o-36 p-65 d-6 p-60 d-6 p-57 d-6 o-42 p-53 d-3 p-41 d-6 [INST] b-1 i-13 o-0 p-65 d-3 o-3 p-64 d-3 o-6 p-65 d-3 o