### Hugging Face Unconditional generation

In [None]:
from transformers import MusicgenForConditionalGeneration

# Load the pretrained small MusicGen checkpoint.
model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")

# Ask the model for the inputs it uses when no prompt is given (one sample).
unconditional_inputs = model.get_unconditional_inputs(num_samples=1)

# Sample 256 new tokens from those inputs.
generation_options = {"do_sample": True, "max_new_tokens": 256}
audio_values = model.generate(**unconditional_inputs, **generation_options)

In [None]:
from IPython.display import Audio

# The playback rate is read from the audio encoder's configuration.
audio_encoder_config = model.config.audio_encoder
sampling_rate = audio_encoder_config.sampling_rate

# Build a player for the first generated sample (moved to CPU first).
Audio(audio_values[0].cpu(), rate=sampling_rate)

### Hugging Face Text-conditional generation

In [None]:
import torch
from transformers import AutoProcessor, MusicgenForConditionalGeneration

# initialize the processor and the model from the same checkpoint
processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")

# use the GPU when one is available; otherwise fall back to the CPU so the
# cell still runs (slower) instead of failing on machines without CUDA
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

# our text descriptions for the model (one generated clip per description)
input_text = ["epic movie theme", "sad jazz"]

# create input: padding=True lets the two prompts of different length share
# one batch; the resulting tensors are moved to the same device as the model
inputs = processor(
    text=input_text,
    padding=True,
    return_tensors="pt",
).to(device)

# generate audio
audio_values_from_text = model.generate(**inputs, max_new_tokens=512)

print(audio_values_from_text.shape)

In [None]:
from IPython.display import Audio, display

sampling_rate = model.config.audio_encoder.sampling_rate

# A notebook cell only auto-renders its final expression, so each player is
# passed to display() explicitly; otherwise the first one is silently dropped.

# listen to our first audio sample from input text "epic movie theme"
display(Audio(audio_values_from_text[0].cpu(), rate=sampling_rate))

# listen to our second audio sample from input text "sad jazz"
display(Audio(audio_values_from_text[1].cpu(), rate=sampling_rate))

### Hugging Face Audio-prompted generation

In [None]:
# Pull the second generated clip (the "sad jazz" one) out as a NumPy array.
# The extra [0] selects the first entry along the next axis
# (presumably the channel axis; TODO confirm).
full_clip = audio_values_from_text[1][0].cpu().numpy()

# Keep only the first half of that clip to serve as the audio prompt.
sample = full_clip[: len(full_clip) // 2]

# Combine the audio prompt with its text description into model inputs.
prompt_kwargs = dict(
    audio=sample,
    sampling_rate=sampling_rate,
    text=["sad jazz"],
    padding=True,
    return_tensors="pt",
)
inputs = processor(**prompt_kwargs).to(device)

# Sample 256 new tokens conditioned on both the audio and the text.
audio_values = model.generate(
    **inputs,
    do_sample=True,
    guidance_scale=3,
    max_new_tokens=256,
)

In [None]:
# Build a player for the first item of the audio-prompted generation result.
prompted_clip = audio_values[0].cpu()
Audio(prompted_clip, rate=sampling_rate)