In [None]:
#  The MIT License (MIT)
#
#  Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
#
#  Permission is hereby granted, free of charge, to any person obtaining a copy
#  of this software and associated documentation files (the 'Software'), to deal
#  in the Software without restriction, including without limitation the rights
#  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#  copies of the Software, and to permit persons to whom the Software is
#  furnished to do so, subject to the following conditions:
#
#  The above copyright notice and this permission notice shall be included in
#  all copies or substantial portions of the Software.
#
#  THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
#  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
#  THE SOFTWARE.

# Stable Diffusion 2.1

The following example will show how to run `Stable Diffusion 2.1` with `MIGraphX`.

Install the required dependencies.

In [None]:
# Install dependencies
# We need this version to run torch with gpu tensors
!pip install torch==2.1.1 -f https://repo.radeon.com/rocm/manylinux/rocm-rel-6.0/
!pip install optimum[onnxruntime] transformers diffusers accelerate

We will use optimum to generate the onnx files.

In [None]:
# Export the stabilityai/stable-diffusion-2-1 models to onnx with the optimum CLI.
# The output directory is models/sd21-onnx, which the loading helper below expects.
!optimum-cli export onnx --model stabilityai/stable-diffusion-2-1 models/sd21-onnx --task stable-diffusion

We will use torch tensors for all calculations. Everything will be allocated on the GPU to avoid Host-Device copies.

We installed the rocm version of pytorch, let's confirm that we can access the GPU.

In [None]:
import torch
import torch.version

# Sanity check of the torch install: GPU visibility, the name of device 0,
# and the CUDA / HIP version strings reported by torch.
print(f"{torch.cuda.is_available() = }")
print(f"{torch.cuda.get_device_name(0) = }")
print(f"{torch.version.cuda = }")
print(f"{torch.version.hip = }")

If it is not working properly, try restarting the kernel.

Now it is time to load these models with python.

First, we make sure that MIGraphX module is found in the python path.

In [None]:
import sys
# Directory added to the python path so that the migraphx module can be imported.
mgx_lib_path = "/opt/rocm/lib/"
# or if you locally built MIGraphX
# mgx_lib_path = "/code/AMDMIGraphX/build/lib/"
# Append only when missing, so re-running this cell does not add duplicate entries.
if mgx_lib_path not in sys.path:
    sys.path.append(mgx_lib_path)
import migraphx as mgx

Next, a helper method to load and cache the models.

This will use the `models/sd21-onnx` path. If you changed it, make sure to update here as well.

In [None]:
import os
# helper for model loading
def load_mgx_model(name, shapes):
    """Load a MIGraphX model by name, compiling and caching it on first use.

    Looks for ``models/sd21-onnx/{name}/model.mxr`` (a previously saved
    program) first. If it is missing, ``model.onnx`` from the same directory
    is parsed, compiled for the gpu target and saved as ``model.mxr`` so the
    next call can load it directly.

    Args:
        name: Sub-directory of the model under ``models/sd21-onnx``.
        shapes: Mapping of input name to dimensions, passed to
            ``mgx.parse_onnx`` as ``map_input_dims``. It is only used when
            parsing the onnx file; an existing mxr file is loaded as-is.

    Returns:
        The loaded MIGraphX program.

    Raises:
        FileNotFoundError: If neither the mxr nor the onnx file exists.
    """
    file = f"models/sd21-onnx/{name}/model"
    print(f"Loading {name} model from {file}")
    if os.path.isfile(f"{file}.mxr"):
        print("Found mxr, loading it...")
        model = mgx.load(f"{file}.mxr", format="msgpack")
    elif os.path.isfile(f"{file}.onnx"):
        print("Parsing from onnx file...")
        model = mgx.parse_onnx(f"{file}.onnx", map_input_dims=shapes)
        # offload_copy=False: the caller provides the buffers for the program.
        model.compile(mgx.get_target("gpu"), offload_copy=False)
        print(f"Saving {name} model to mxr file...")
        mgx.save(model, f"{file}.mxr", format="msgpack")
    else:
        # Raise instead of calling sys.exit(): exiting from a helper is
        # disruptive inside a notebook kernel, while an exception reports the
        # problem in the calling cell and can be handled by the caller.
        raise FileNotFoundError(
            f"No {name} model found. Please verify the path is correct and re-try, or re-download model."
        )
    return model

With that, we can load the models. This could take several minutes.

In [None]:
# input_ids is fixed to 2 rows of 77 tokens (prompt and negative prompt).
text_encoder = load_mgx_model("text_encoder", {"input_ids": [2, 77]})

In [None]:
# Fix the UNet input dimensions. The leading 2 of sample and
# encoder_hidden_states matches the two prompts processed per run.
unet = load_mgx_model(
        "unet", {
            "sample": [2, 4, 64, 64],
            "encoder_hidden_states": [2, 77, 1024],
            "timestep": [1],
        })

In [None]:
# The VAE decoder takes a single latent sample of shape (1, 4, 64, 64).
vae = load_mgx_model("vae_decoder", {"latent_sample": [1, 4, 64, 64]})

To pass a tensor to MIGraphX, first we need to convert it to an argument.

We avoid the copy via allocating the tensor on the gpu, so we only need to pass the address of the tensor.

First, we need to have a mapping between torch and migraphx data types.

In [None]:
# MIGraphX type string -> torch dtype.
mgx_to_torch_dtype_dict = dict(
    bool_type=torch.bool,
    uint8_type=torch.uint8,
    int8_type=torch.int8,
    int16_type=torch.int16,
    int32_type=torch.int32,
    int64_type=torch.int64,
    float_type=torch.float32,
    double_type=torch.float64,
    half_type=torch.float16,
)

# Inverse lookup: torch dtype -> MIGraphX type string.
torch_to_mgx_dtype_dict = dict(
    (torch_dtype, mgx_type)
    for mgx_type, torch_dtype in mgx_to_torch_dtype_dict.items())

Next, we need a way to allocate the torch buffers for the models.

In [None]:
def allocate_torch_tensors(model):
    """Allocate one zero-filled GPU tensor per parameter of a MIGraphX model.

    Args:
        model: A MIGraphX program; its ``get_parameter_shapes()`` mapping
            determines the names, dimensions and data types of the buffers.

    Returns:
        Dict mapping each parameter name to a torch tensor on the "cuda"
        device, with the parameter's dimensions and the torch dtype that
        corresponds to its MIGraphX type string.
    """
    # Create the tensor with its final dtype directly on the GPU, rather than
    # building a float32 CPU tensor first and then casting and copying it.
    return {
        name: torch.zeros(shape.lens(),
                          dtype=mgx_to_torch_dtype_dict[shape.type_string()],
                          device="cuda")
        for name, shape in model.get_parameter_shapes().items()
    }

Next, we allocate tensors for the models.

In [None]:
# One dict of GPU buffers per model, keyed by the model's parameter names.
text_encoder_tensors = allocate_torch_tensors(text_encoder)
unet_tensors = allocate_torch_tensors(unet)
vae_tensors = allocate_torch_tensors(vae)

Lastly, we need to tell MIGraphX how to access these tensors.

In [None]:
def tensor_to_arg(tensor):
    """Describe a torch tensor to MIGraphX as an argument built from its pointer.

    The MIGraphX shape is assembled from the tensor's dtype, sizes and
    strides; the argument refers to the tensor's memory via ``data_ptr()``.
    """
    mgx_shape = mgx.shape(type=torch_to_mgx_dtype_dict[tensor.dtype],
                          lens=list(tensor.size()),
                          strides=list(tensor.stride()))
    return mgx.argument_from_pointer(mgx_shape, tensor.data_ptr())

def tensors_to_args(tensors):
    """Convert a name -> tensor mapping into a name -> MIGraphX argument mapping."""
    args = {}
    for name, tensor in tensors.items():
        args[name] = tensor_to_arg(tensor)
    return args


Since the tensors won't change, we only need to do this once, and cache it.

In [None]:
# Build the MIGraphX arguments once; they refer to the tensors allocated above.
text_encoder_args = tensors_to_args(text_encoder_tensors)
unet_args = tensors_to_args(unet_tensors)
vae_args = tensors_to_args(vae_tensors)

The model outputs will be called `main:#output_*`. We create a helper to access them more easily.

In [None]:
def get_output_name(idx):
    """Return the parameter name of the model output with index ``idx``."""
    return "main:#output_{}".format(idx)

Import the remaining packages.

In [None]:
from diffusers import EulerDiscreteScheduler
from transformers import CLIPTokenizer
from tqdm.auto import tqdm
from PIL import Image

Time to load the scheduler and tokenizer from the original source.

In [None]:
# Both components are loaded from sub-folders of the original model repository.
model_id = "stabilityai/stable-diffusion-2-1"
scheduler = EulerDiscreteScheduler.from_pretrained(model_id,
                                                   subfolder="scheduler")
tokenizer = CLIPTokenizer.from_pretrained(model_id, subfolder="tokenizer")

Next, we will define all the steps one by one, to make the last step short and simple.

The first step will be to tokenize the user prompt. It will make a `(1, 77)` shaped `input_ids`.

In [None]:
def tokenize(*inputs):
    """Tokenize one or more prompts with the notebook's tokenizer.

    Every prompt is padded / truncated to ``tokenizer.model_max_length`` and
    the result is returned as PyTorch tensors.
    """
    prompts = list(inputs)
    return tokenizer(
        prompts,
        return_tensors="pt",
        truncation=True,
        max_length=tokenizer.model_max_length,
        padding="max_length",
    )

In [None]:
# Optional: tokenize a sample prompt and display the shape of its input_ids.
test_tk = tokenize("test tokenizer to see the tokens")
test_tk.input_ids.shape

We run the tokenized prompts through the `Text Encoder` model. It expects the `(2, 77)` data as `int32`. It is `2` because we will also pass the negative prompt.

In [None]:
# Optional: display the parameter shapes of the text encoder model.
text_encoder.get_parameter_shapes()

In [None]:
def get_embeddings(prompt_tokens):
    """Run the text encoder on tokenized prompts and return its first output.

    The token ids are converted to int32 and copied into the preallocated
    ``input_ids`` buffer. The returned tensor is the text encoder's output
    buffer itself, so a later call overwrites its contents.
    """
    token_ids = prompt_tokens.input_ids.to(torch.int32)
    text_encoder_tensors["input_ids"].copy_(token_ids)
    # Wait for the copy to finish before MIGraphX reads the buffer.
    torch.cuda.synchronize()

    text_encoder.run(text_encoder_args)
    # Wait for the model to finish before the output buffer is used.
    mgx.gpu_sync()

    output_name = get_output_name(0)
    return text_encoder_tensors[output_name]

In [None]:
# Optional: encode a sample prompt and display the shape of the embeddings.
test_emb = get_embeddings(tokenize("test tokenizer to see the tokens"))
test_emb.shape

Another input of the model is the latent representation (pure noise). It will be transformed into a 512x512 image later.
The last input will be the timestep.

In [None]:
def generate_latents(seed):
    """Return seeded random latents of shape (1, 4, 64, 64) on the GPU.

    ``torch.manual_seed(seed)`` supplies the generator, the noise is drawn
    with ``torch.randn`` and the result is then moved to the "cuda" device.
    """
    rng = torch.manual_seed(seed)
    noise = torch.randn((1, 4, 64, 64), generator=rng)
    return noise.to(device="cuda")

In [None]:
# Optional: generate latents from a sample seed and display their shape.
test_latents = generate_latents(42)
test_latents.shape

Now we add two helpers that prepare the UNet inputs (the scaled sample and the timestep) as torch tensors with the proper datatype on the GPU.

In [None]:
def get_scaled_sample(latents, t):
    """Scale the latents for timestep ``t`` and return them as float32 on the GPU."""
    scaled = scheduler.scale_model_input(latents, t)
    as_float = scaled.to(torch.float32)
    return as_float.to(device="cuda")

def get_timestep(t):
    """Return timestep ``t`` as an int64 GPU tensor with at least one dimension."""
    timestep = t.to(torch.int64).to(device="cuda")
    # convert 0D -> 1D
    return torch.atleast_1d(timestep)

The UNet model will be run in a loop. It will predict the noise residual.

In [None]:
# Optional: display the parameter shapes of the UNet model.
unet.get_parameter_shapes()

In [None]:
def denoise(sample, embeddings, timestep):
    """Run the UNet once and return its first output split into two chunks.

    The three inputs are copied into the preallocated UNet buffers, the model
    is run, and the output buffer is split in two along the first dimension.
    """
    unet_inputs = (
        ("sample", sample),
        ("encoder_hidden_states", embeddings),
        ("timestep", timestep),
    )
    for input_name, value in unet_inputs:
        unet_tensors[input_name].copy_(value)
    # Wait for the copies to finish before MIGraphX reads the buffers.
    torch.cuda.synchronize()

    unet.run(unet_args)
    # Wait for the model to finish before the output buffer is used.
    mgx.gpu_sync()

    noise_pred = unet_tensors[get_output_name(0)]
    return torch.tensor_split(noise_pred, 2)

Helpers to perform classifier-free guidance and to compute the previous noisy sample.

In [None]:
def perform_guidance(noise_pred_uncond, noise_pred_text, scale):
    """Combine the two noise predictions for classifier-free guidance.

    Starts from the unconditional prediction and adds ``scale`` times the
    difference between the text prediction and the unconditional one.
    """
    text_direction = noise_pred_text - noise_pred_uncond
    return noise_pred_uncond + scale * text_direction

def compute_previous(noise_pred, t, latents):
    """Step the scheduler once and return the resulting previous sample."""
    # compute the previous noisy sample x_t -> x_t-1
    step_output = scheduler.step(noise_pred, t, latents)
    return step_output.prev_sample


Scale and decode the image latents with VAE.

In [None]:
def scale_denoised(latents, scaling_factor=0.18215):
    """Divide the denoised latents by ``scaling_factor`` before decoding.

    Args:
        latents: The denoised latents (a tensor, or any value that supports
            multiplication by a float).
        scaling_factor: Divisor applied to the latents. The default is the
            constant that was previously hard-coded here.
            NOTE(review): presumably the scaling factor of the model's VAE;
            confirm against the VAE config if a different checkpoint is used.

    Returns:
        ``1 / scaling_factor * latents``.
    """
    return 1 / scaling_factor * latents


def decode(latents):
    """Run the VAE decoder on the latents and return its first output buffer."""
    latent_buffer = vae_tensors["latent_sample"]
    latent_buffer.copy_(latents)
    # Wait for the copy to finish before MIGraphX reads the buffer.
    torch.cuda.synchronize()

    vae.run(vae_args)
    # Wait for the model to finish before the output buffer is used.
    mgx.gpu_sync()

    output_name = get_output_name(0)
    return vae_tensors[output_name]

And lastly, we need to convert it to an image to display or save.

In [None]:
def convert_to_rgb_image(image):
    """Convert the decoder output tensor into a PIL image.

    Values are mapped with ``x / 2 + 0.5`` and clamped to [0, 1], the tensor
    is moved to the CPU with its second axis moved last, scaled to 0..255 as
    uint8, and the first entry of the batch is turned into an image.
    """
    normalized = (image / 2 + 0.5).clamp(0, 1)
    channels_last = normalized.detach().cpu().permute(0, 2, 3, 1).numpy()
    pixels = (channels_last * 255).round().astype("uint8")
    first_image = pixels[0]
    return Image.fromarray(first_image)

def save_image(pil_image, filename="output.png"):
    """Save ``pil_image`` to ``filename`` in png format."""
    image_format = "png"
    pil_image.save(filename, format=image_format)

Feel free to play around with these params.

In [None]:
# Text describing the image to generate.
prompt = "a photograph of an astronaut riding a horse"
# Second prompt, encoded together with the main one and used in the guidance step.
negative_prompt = ""
# Number of timesteps requested from the scheduler.
steps = 20
# Seed for the initial latent noise.
seed = 13
# Guidance scale passed to perform_guidance.
scale = 7.0

And now, to put everything together and run the whole pipeline:

In [None]:
# Configure the scheduler for the requested number of steps, on the GPU.
scheduler.set_timesteps(steps, device="cuda")

# Row 0 of the tokens / embeddings is the prompt, row 1 the negative prompt.
input_tokens = tokenize(prompt, negative_prompt)
text_embeddings = get_embeddings(input_tokens)
# Start from seeded noise multiplied by the scheduler's init_noise_sigma.
latents = generate_latents(seed) * scheduler.init_noise_sigma

for t in tqdm(scheduler.timesteps):
    # Duplicate the latents so that both prompts are handled in one UNet run.
    sample = get_scaled_sample(torch.cat([latents] * 2), t)
    timestep = get_timestep(t)

    # The output is split in two: the first half belongs to the prompt,
    # the second half to the negative prompt.
    noise_pred_text, noise_pred_uncond = denoise(sample, text_embeddings, timestep)

    noise_pred = perform_guidance(noise_pred_uncond, noise_pred_text, scale)
    latents = compute_previous(noise_pred, t, latents)

# Rescale the final latents, decode them with the VAE and convert to an image.
latents = scale_denoised(latents)
result = decode(latents)
image = convert_to_rgb_image(result)

# show the image
image

If you like the generated image, save it with the following:

In [None]:
# Write the image produced by the pipeline cell to output.png.
save_image(image, "output.png")