In [None]:
# `export` is shell syntax and raises a SyntaxError in a Python cell, so the variable is set
# through os.environ instead; processes started from this kernel inherit the value.
# NOTE(review): the dynamic loader of the already-running kernel usually reads LD_LIBRARY_PATH
# only at start-up -- confirm whether it has to be set before Jupyter is launched instead.
import os

CUDA_LIB_DIR = "/usr/local/cuda/lib64"
_ld_entries = [entry for entry in os.environ.get("LD_LIBRARY_PATH", "").split(":") if entry]
if CUDA_LIB_DIR not in _ld_entries:
    # Prepend only once so that re-running the cell does not keep growing the variable.
    _ld_entries.insert(0, CUDA_LIB_DIR)
os.environ["LD_LIBRARY_PATH"] = ":".join(_ld_entries)

In [None]:
# Audio / diffusion dependencies; the extra index URL points at the PyTorch "whl/cpu" wheel index.
%pip install -q --extra-index-url https://download.pytorch.org/whl/cpu Pillow scipy "torch>=2.1" torchaudio "diffusers>=0.16.1" "transformers>=4.33.0"
# optimum-intel is installed straight from its git repository; gradio is used for the demo UI.
%pip install -q "git+https://github.com/huggingface/optimum-intel.git" "gradio>=3.34.0"
# OpenVINO runtime and GenAI; `--pre` together with the nightly wheel index allows pre-release builds.
%pip install -q -U --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly "openvino>=2025.1" "openvino-genai>=2025.1"

In [None]:
%pip install openvino-tokenizers

In [2]:
import transformers
transformers.__version__
from transformers import GlmModel

In [9]:
import io
import os
import sys
from pathlib import Path

import gradio as gr
import IPython.display as ipd
import numpy as np
import openvino_genai as ov_genai
import requests
import torch
import torchaudio
from PIL import Image
from scipy.io import wavfile

In [13]:
# Fetch the helper modules published in the openvino_notebooks repository, skipping every file
# that already exists in the working directory.
_HELPERS_BASE_URL = "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest"
_HELPER_SOURCES = {
    "notebook_utils.py": "utils/notebook_utils.py",
    "cmd_helper.py": "utils/cmd_helper.py",
    "gradio_helper.py": "notebooks/riffusion-text-to-music/gradio_helper.py",
}

for _file_name, _repo_path in _HELPER_SOURCES.items():
    _target = Path(_file_name)
    if _target.exists():
        continue
    # A timeout keeps a stalled connection from hanging the notebook indefinitely.
    r = requests.get(url=f"{_HELPERS_BASE_URL}/{_repo_path}", timeout=60)
    # Fail loudly instead of saving an HTTP error page as a .py file.
    r.raise_for_status()
    # write_text closes the file handle, unlike the bare open(...).write(...) it replaces.
    _target.write_text(r.text, encoding="utf-8")

# make_demo is provided by the gradio_helper.py file located next to this notebook.
from gradio_helper import make_demo

# Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry
from notebook_utils import collect_telemetry

collect_telemetry("riffusion-text-to-music.ipynb")

# Model identifier handed to from_pretrained further down in the notebook.
MODEL_ID = "riffusion/riffusion-model-v1"
# NOTE(review): presumably the local directory for the converted pipeline -- confirm the later
# cells actually use this constant rather than repeating the directory name as a string.
MODEL_DIR = Path("riffusion_pipeline")

In [None]:
from huggingface_hub import configure_http_backend
from optimum.intel.openvino import OVStableDiffusionPipeline

In [7]:
def backend_factory() -> requests.Session:
    """
    Build the HTTP session that is registered as the huggingface_hub HTTP backend.

    WARNING: certificate verification is switched off on the returned session, so every
    request made through it accepts any TLS certificate. Only use this on a trusted network.

    Returns:
      session (requests.Session): session whose `verify` attribute is set to False
    """
    insecure_session = requests.Session()
    insecure_session.verify = False
    return insecure_session


configure_http_backend(backend_factory=backend_factory)

In [None]:
# Load the model identified by MODEL_ID and convert it to OpenVINO on the fly.
# Keyword arguments are case-sensitive: the conversion flag is the lowercase `export`,
# so the previous `EXPORT=True` was not the documented option.
# compile=False postpones model compilation.
pipe = OVStableDiffusionPipeline.from_pretrained(
    MODEL_ID,
    export=True,
    device="CPU",
    compile=False,
)

In [7]:
from transformers import CLIPTokenizerFast
from openvino_tokenizers import convert_tokenizer
from openvino.runtime import serialize



In [None]:
# Convert the CLIP tokenizer into OpenVINO tokenizer / detokenizer models.
hf_tok = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch16")
ov_tok, ov_detok = convert_tokenizer(hf_tok, with_detokenizer=True)

# Save the converted pipeline into MODEL_DIR (the previous literal "riffuson_pipeline" was a
# typo that wrote to a different directory than the one the tokenizer files go to).
pipe.save_pretrained(MODEL_DIR, save_config=True)

# The tokenizer IR files have to exist inside the model directory before the GenAI pipeline is
# constructed from it, so they are serialized here rather than in a later cell.
tok_dir = MODEL_DIR / "tokenizer"
tok_dir.mkdir(parents=True, exist_ok=True)
serialize(ov_tok, tok_dir / "openvino_tokenizer.xml")
serialize(ov_detok, tok_dir / "openvino_detokenizer.xml")

# openvino_genai has no `OVGenAIPipeline.from_pretrained`; image generation pipelines are
# created with Text2ImagePipeline(models_path, device). From here on `pipe` is the GenAI
# pipeline whose generate() returns a tensor exposing `.data`.
pipe = ov_genai.Text2ImagePipeline(MODEL_DIR, "CPU")
pipe

In [11]:
def wav_bytes_from_spectrogram_image(image: Image.Image) -> tuple[io.BytesIO, float]:
    """
    Reconstruct a WAV audio clip from a spectrogram image. Also returns the duration in seconds.

    Parameters:
      image (Image.Image): generated spectrogram image
    Returns:
      wav_bytes (io.BytesIO): audio signal encoded in wav bytes
      duration_s (float): duration in seconds
    """

    max_volume = 50
    power_for_image = 0.25
    Sxx = spectrogram_from_image(image, max_volume=max_volume, power_for_image=power_for_image)

    sample_rate = 44100  # [Hz]
    clip_duration_ms = 5000  # [ms]

    bins_per_image = 512
    n_mels = 512

    # FFT parameters
    window_duration_ms = 100  # [ms]
    padded_duration_ms = 400  # [ms]
    step_size_ms = 10  # [ms]

    # Derived parameters
    # The clip length is converted from milliseconds to seconds before it is multiplied by the
    # sample rate; multiplying the millisecond value directly gave a count 1000 times too large.
    clip_duration_s = image.width / float(bins_per_image) * clip_duration_ms / 1000.0
    num_samples = int(clip_duration_s * sample_rate)
    n_fft = int(padded_duration_ms / 1000.0 * sample_rate)
    hop_length = int(step_size_ms / 1000.0 * sample_rate)
    win_length = int(window_duration_ms / 1000.0 * sample_rate)

    samples = waveform_from_spectrogram(
        Sxx=Sxx,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        num_samples=num_samples,
        sample_rate=sample_rate,
        mel_scale=True,
        n_mels=n_mels,
        num_griffin_lim_iters=32,
    )

    # Encode the samples as 16-bit PCM into an in-memory buffer and rewind it for reading.
    wav_bytes = io.BytesIO()
    wavfile.write(wav_bytes, sample_rate, samples.astype(np.int16))
    wav_bytes.seek(0)

    duration_s = float(len(samples)) / sample_rate

    return wav_bytes, duration_s


def spectrogram_from_image(image: Image.Image, max_volume: float = 50, power_for_image: float = 0.25) -> np.ndarray:
    """
    Compute a spectrogram magnitude array from a spectrogram image.

    Parameters:
      image (image.Image): input image; multi-channel and single-channel (2-D) images are accepted
      max_volume (float, *optional*, 50): max volume for spectrogram magnitude
      power_for_image (float, *optional*, 0.25): power for reversing power curve
    Returns:
      data (np.ndarray): 2-D float32 array of magnitudes, vertically flipped relative to the image
    """
    # Convert to a numpy array of floats
    data = np.array(image).astype(np.float32)

    # Flip Y and, for multi-channel images, take a single channel.
    # A 2-D (grayscale) image has no channel axis to index, so it is only flipped.
    if data.ndim == 3:
        data = data[::-1, :, 0]
    else:
        data = data[::-1]

    # Invert
    data = 255 - data

    # Rescale to max volume
    data = data * max_volume / 255

    # Reverse the power curve
    data = np.power(data, 1 / power_for_image)

    return data


def waveform_from_spectrogram(
    Sxx: np.ndarray,
    n_fft: int,
    hop_length: int,
    win_length: int,
    num_samples: int,
    sample_rate: int,
    mel_scale: bool = True,
    n_mels: int = 512,
    num_griffin_lim_iters: int = 32,
    device: str = "cpu",
) -> np.ndarray:
    """
    Turn a magnitude spectrogram back into an approximate waveform.

    When `mel_scale` is set the input is first passed through an inverse mel transform;
    the Griffin-Lim algorithm is then applied to approximate the phase.
    `num_samples` is accepted but not read by this function.
    """
    magnitudes = torch.from_numpy(Sxx).to(device)

    if mel_scale:
        # Undo the mel projection before the phase reconstruction.
        mel_inverter = torchaudio.transforms.InverseMelScale(
            n_stft=n_fft // 2 + 1,
            n_mels=n_mels,
            sample_rate=sample_rate,
            f_min=0,
            f_max=10000,
            norm=None,
            mel_scale="htk",
        )
        magnitudes = mel_inverter.to(device)(magnitudes)

    phase_estimator = torchaudio.transforms.GriffinLim(
        n_fft=n_fft,
        n_iter=num_griffin_lim_iters,
        win_length=win_length,
        hop_length=hop_length,
        power=1.0,
    ).to(device)

    return phase_estimator(magnitudes).cpu().numpy()

In [12]:
def generate(prompt: str, negative_prompt: str = "") -> tuple[Image.Image, str]:
    """
    Generate audio from a text prompt and store it as a WAV file.

    Parameters:
      prompt (str): input prompt for generation.
      negative_prompt (str): undesired concepts that the generation should avoid. Can be empty.
    Returns:
      spec (Image.Image): generated spectrogram image
      wav_path (str): path of the WAV file that was written ("output.wav")
    """
    generated = pipe.generate(prompt, negative_prompt=negative_prompt, num_inference_steps=20)
    spec = Image.fromarray(generated.data[0])

    # Only the encoded bytes are needed here; the reported duration is ignored.
    wav_bytes, _duration_s = wav_bytes_from_spectrogram_image(spec)

    output_path = "output.wav"
    with open(output_path, "wb") as wav_file:
        wav_file.write(wav_bytes.getbuffer())
    return spec, output_path

In [None]:
# Store the converted tokenizer / detokenizer next to the exported pipeline. The directory is
# derived from MODEL_DIR instead of repeating its name as a second string literal.
tok_dir = MODEL_DIR / "tokenizer"
tok_dir.mkdir(parents=True, exist_ok=True)
for _ov_model, _xml_name in ((ov_tok, "openvino_tokenizer.xml"), (ov_detok, "openvino_detokenizer.xml")):
    _xml_path = tok_dir / _xml_name
    # Skip files that already exist so re-running the cell does not rewrite models on disk.
    if not _xml_path.exists():
        serialize(_ov_model, _xml_path)

In [None]:
# Run one generation for a sample prompt; keeps the spectrogram image and the path of the WAV file.
spectrogram, wav_path = generate("Techno beat")

In [None]:
# Display the generated spectrogram image as the cell output.
spectrogram

In [None]:
# Render an audio player for the generated WAV file.
ipd.Audio(wav_path)

In [None]:
def select_device(device_str: str, current_text: str = "", progress: gr.Progress = gr.Progress()):
    """
    Move the pipeline to another device and recompile it when the requested device differs
    from the one the pipeline currently reports.

    Parameters:
      device_str (str): Device name.
      current_text (str): Current content of the user instruction field; passed through unchanged
        (kept only as a backup while the progress bar temporarily replaces it during model loading).
      progress (gr.Progress): gradio progress tracker
    Returns:
      current_text
    """
    # Nothing to do when the pipeline already sits on the requested device.
    if device_str == pipe._device:
        return current_text

    pipe.clear_requests()
    pipe.to(device_str)

    # Single-step progress bar wrapped around the compilation.
    for _ in progress.tqdm(range(1), desc=f"Model loading on {device_str}"):
        pipe.compile()
    return current_text

In [None]:
# Build the Gradio demo from the generation and device-selection callbacks.
demo = make_demo(generate_fn=generate, select_device_fn=select_device)

# Try a regular launch first; if it raises, retry with a public share link (share=True).
try:
    demo.queue().launch(debug=True, height=800)
except Exception:
    demo.queue().launch(debug=True, share=True, height=800)

In [None]:
# NOTE(review): this cell redefines wav_bytes_from_spectrogram_image with a different return type
# (tensor, sample rate) than the earlier definition (WAV bytes, duration); callers written
# against the earlier contract stop working once this cell has run.
from torchaudio.pipelines import HIFIGAN_VOCODER_V3_LJSPEECH

# Initialize HiFi-GAN vocoder bundle
# `device` was never defined in the notebook; the vocoder is placed on the CPU.
device = torch.device("cpu")
bundle = HIFIGAN_VOCODER_V3_LJSPEECH
vocoder = bundle.get_vocoder().to(device)


def wav_bytes_from_spectrogram_image(image: Image.Image) -> tuple[torch.Tensor, int]:
    """
    Convert spectrogram image to waveform using HiFi-GAN for better audio fidelity.

    Parameters:
      image (Image.Image): generated spectrogram image
    Returns:
      waveform (torch.Tensor): vocoder output with the batch dimension removed, on the CPU
      sample_rate (int): sample rate reported by the vocoder bundle
    """
    # Tweak spectrogram normalization
    max_volume = 80
    power_for_image = 0.5
    Sxx = spectrogram_from_image(image, max_volume=max_volume, power_for_image=power_for_image)

    # Convert to tensor and batch
    # NOTE(review): the number of mel bins this vocoder expects has not been checked against
    # the height of the spectrogram image -- confirm the shapes are compatible.
    mel = torch.from_numpy(Sxx).unsqueeze(0).to(device)

    # Generate waveform
    with torch.no_grad():
        waveform = vocoder(mel).squeeze(0).cpu()

    # Use the public sample-rate property of the bundle instead of the private attribute
    sample_rate = bundle.sample_rate
    return waveform, sample_rate

In [None]:
def generate(
    prompt: str,
    negative_prompt: str = "",
    num_inference_steps: int = 50,
    guidance_scale: float = 7.5,
    seed: int | None = None
) -> tuple[Image.Image, str]:
    """
    Generate spectrogram from text, invert to audio, and save WAV.

    Parameters:
      prompt (str): input prompt for generation.
      negative_prompt (str): undesired concepts that the generation should avoid. Can be empty.
      num_inference_steps (int): number of inference steps; cast to int because UI sliders may deliver floats.
      guidance_scale (float): guidance scale forwarded to the pipeline.
      seed (int | None): optional seed; None leaves the random state untouched.
    Returns:
      spec (Image.Image): generated spectrogram image
      out_path (str): path of the written WAV file, "output_<seed>.wav" or "output_latest.wav"
    """
    generation_kwargs = {
        "negative_prompt": negative_prompt,
        "num_inference_steps": int(num_inference_steps),
        "guidance_scale": float(guidance_scale),
    }

    if seed is not None:
        seed = int(seed)
        torch.manual_seed(seed)
        # torch's global seed does not reach the OpenVINO pipeline, so the seed is also passed to it.
        # NOTE(review): assumes `pipe` is an openvino_genai image pipeline accepting `rng_seed` -- confirm.
        generation_kwargs["rng_seed"] = seed

    outputs = pipe.generate(prompt, **generation_kwargs)

    spec = Image.fromarray(outputs.data[0])
    waveform, sr = wav_bytes_from_spectrogram_image(spec)

    # torchaudio.save expects a 2-D (channels, time) tensor; normalise 1-D and 3-D results to that.
    waveform = waveform.reshape(-1, waveform.shape[-1])

    # `seed or 'latest'` treated the valid seed 0 as "no seed"; test for None explicitly.
    seed_tag = "latest" if seed is None else seed
    out_path = f"output_{seed_tag}.wav"
    torchaudio.save(out_path, waveform, sr)
    return spec, out_path

In [None]:
# Build the UI: prompt fields, generation settings, a trigger button and the two outputs.
with gr.Blocks() as demo:
    gr.Markdown("## Riff-2 Text-to-Audio Demo")

    with gr.Row():
        prompt = gr.Textbox(label="Prompt", placeholder="Type description...")
        neg_prompt = gr.Textbox(label="Negative Prompt", placeholder="Avoid words... (optional)")

    with gr.Row():
        steps = gr.Slider(10, 100, value=50, step=5, label="Inference Steps")
        scale = gr.Slider(1.0, 15.0, value=7.5, step=0.5, label="Guidance Scale")
        seed_input = gr.Number(label="Seed (optional)", precision=0)

    generate_btn = gr.Button("Generate Audio")
    spec_out = gr.Image(label="Spectrogram Output")
    audio_out = gr.Audio(label="Generated Audio")

    # The input order matches the positional parameters of generate().
    generate_btn.click(
        fn=generate,
        inputs=[prompt, neg_prompt, steps, scale, seed_input],
        outputs=[spec_out, audio_out]
    )

# Launch once the Blocks context has been closed, as in the Gradio documentation examples.
# NOTE(review): server_name="0.0.0.0" listens on every network interface -- confirm the demo is
# meant to be reachable from other machines.
demo.launch(server_name="0.0.0.0", share=False)

In [1]:
!pip install audiocraft

Collecting audiocraft
  Downloading audiocraft-1.3.0.tar.gz (635 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m635.7/635.7 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting av==11.0.0 (from audiocraft)
  Downloading av-11.0.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (4.5 kB)
Collecting einops (from audiocraft)
  Using cached einops-0.8.1-py3-none-any.whl.metadata (13 kB)
Collecting flashy>=0.0.1 (from audiocraft)
  Downloading flashy-0.0.2.tar.gz (72 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting hydra-core>=1.1 (from audiocraft)
  Downloading hydra_core-1.3.2-py3-none-any.whl.metadata (5.5 kB)
Collecting hydra_colorlog (from audiocraft)
  Downloading hydra_colorlog-1.2.0-py3-none-any.whl.metadata (949 bytes)
Collecting julius (from audiocraft)
  Downloading jul

In [2]:
# NOTE(review): the recorded output of this cell shows the install failing -- no torch 2.1.0
# distribution is offered to this environment (the versions listed start at 2.2.0).
!pip install 'torch==2.1.0'

[31mERROR: Could not find a version that satisfies the requirement torch==2.1.0 (from versions: 2.2.0, 2.2.1, 2.2.2, 2.3.0, 2.3.1, 2.4.0, 2.4.1, 2.5.0, 2.5.1, 2.6.0, 2.7.0)[0m[31m
[0m[31mERROR: No matching distribution found for torch==2.1.0[0m[31m
[0m

In [None]:
# Prepend Homebrew LLVM's bin directory to PATH so that "clang" resolves to the brew build, and
# set CC/CXX explicitly as well. `export` is shell syntax and a SyntaxError in a Python cell, so
# os.environ is updated instead; processes started from this kernel inherit the values.
import os
import subprocess

# Equivalent of "$(brew --prefix llvm)/bin"; raises if brew is missing or the command fails.
_llvm_bin = os.path.join(subprocess.check_output(["brew", "--prefix", "llvm"], text=True).strip(), "bin")

os.environ["PATH"] = _llvm_bin + os.pathsep + os.environ.get("PATH", "")

# (Optional but safer) Also set CC/CXX explicitly:
os.environ["CC"] = os.path.join(_llvm_bin, "clang")
os.environ["CXX"] = os.path.join(_llvm_bin, "clang++")

In [None]:
# Point the preprocessor and the linker at Homebrew's libomp. `export` is shell syntax and a
# SyntaxError in a Python cell, so os.environ is updated instead.
import os
import subprocess

# Equivalent of "$(brew --prefix libomp)"; raises if brew is missing or the command fails.
_libomp_prefix = subprocess.check_output(["brew", "--prefix", "libomp"], text=True).strip()
os.environ["CPPFLAGS"] = f"-I{_libomp_prefix}/include"
os.environ["LDFLAGS"] = f"-L{_libomp_prefix}/lib"

# SECURITY: never store credentials in the notebook. Read tokens at runtime, for example from
# an environment variable or via getpass.getpass().
# NOTE(review): a JWT-style token was committed in this cell as a bare string literal; it has
# been removed here and should be treated as leaked and revoked.

You are a system that rewrites descriptive sentences into short audio event phrases suitable for an audio generation model.

First, identify the main sound-producing source described in the sentence.

Then answer the question: “What sound is this source most likely to make?”

If the described sound in the sentence is incidental (e.g., a lid closing, a switch clicking), and not the primary or characteristic sound of the source, override it with the most likely, contextually accurate, continuous sound the source would produce.

Your answer should be:

Concise

Noun-verb phrased

In continuous tense

Lowercase

No punctuation

No explanations or preambles

Examples:

Input: “The sound of a pressure cooker lid being closed, with the handle being firmly grasped, would likely be described as a ‘crackling’ or ‘clang’ sound.”
Output: a pressure cooker whistling

Input: “The toaster makes a clicking sound when its lever is pushed down.”
Output: a toaster popping

Input: “A blender placed on a counter makes a thud, and its motor begins to spin.”
Output: a blender whirring