From 3901935a09c9ded6399b759f1bee0b08f15a1c0c Mon Sep 17 00:00:00 2001 From: sa_ddam213 Date: Thu, 18 Apr 2024 21:27:45 +1200 Subject: [PATCH 1/9] Safetensor to Onnx Olive conversion scripts --- OnnxStack.Converter/README.md | 20 + .../latent_consistency/.gitignore | 2 + .../latent_consistency/README.md | 180 +++++++++ .../latent_consistency/config.py | 8 + .../latent_consistency/config_controlnet.json | 124 +++++++ .../config_safety_checker.json | 124 +++++++ .../config_text_encoder.json | 121 +++++++ .../latent_consistency/config_unet.json | 129 +++++++ .../config_vae_decoder.json | 121 +++++++ .../config_vae_encoder.json | 121 +++++++ .../latent_consistency/convert.py | 272 ++++++++++++++ .../latent_consistency/models.py | 336 +++++++++++++++++ .../latent_consistency/requirements.txt | 9 + .../latent_consistency/sd_utils/ort.py | 172 +++++++++ .../stable_diffusion/.gitignore | 2 + .../stable_diffusion/README.md | 180 +++++++++ .../stable_diffusion/config.py | 8 + .../stable_diffusion/config_controlnet.json | 123 +++++++ .../config_safety_checker.json | 124 +++++++ .../stable_diffusion/config_text_encoder.json | 121 +++++++ .../stable_diffusion/config_unet.json | 128 +++++++ .../stable_diffusion/config_vae_decoder.json | 121 +++++++ .../stable_diffusion/config_vae_encoder.json | 121 +++++++ .../stable_diffusion/convert.py | 273 ++++++++++++++ .../stable_diffusion/models.py | 342 ++++++++++++++++++ .../stable_diffusion/requirements.txt | 9 + .../stable_diffusion/sd_utils/ort.py | 172 +++++++++ 27 files changed, 3463 insertions(+) create mode 100644 OnnxStack.Converter/README.md create mode 100644 OnnxStack.Converter/latent_consistency/.gitignore create mode 100644 OnnxStack.Converter/latent_consistency/README.md create mode 100644 OnnxStack.Converter/latent_consistency/config.py create mode 100644 OnnxStack.Converter/latent_consistency/config_controlnet.json create mode 100644 OnnxStack.Converter/latent_consistency/config_safety_checker.json create mode 100644 OnnxStack.Converter/latent_consistency/config_text_encoder.json create mode 100644 OnnxStack.Converter/latent_consistency/config_unet.json create mode 100644 OnnxStack.Converter/latent_consistency/config_vae_decoder.json create mode 100644 OnnxStack.Converter/latent_consistency/config_vae_encoder.json create mode 100644 OnnxStack.Converter/latent_consistency/convert.py create mode 100644 OnnxStack.Converter/latent_consistency/models.py create mode 100644 OnnxStack.Converter/latent_consistency/requirements.txt create mode 100644 OnnxStack.Converter/latent_consistency/sd_utils/ort.py create mode 100644 OnnxStack.Converter/stable_diffusion/.gitignore create mode 100644 OnnxStack.Converter/stable_diffusion/README.md create mode 100644 OnnxStack.Converter/stable_diffusion/config.py create mode 100644 OnnxStack.Converter/stable_diffusion/config_controlnet.json create mode 100644 OnnxStack.Converter/stable_diffusion/config_safety_checker.json create mode 100644 OnnxStack.Converter/stable_diffusion/config_text_encoder.json create mode 100644 OnnxStack.Converter/stable_diffusion/config_unet.json create mode 100644 OnnxStack.Converter/stable_diffusion/config_vae_decoder.json create mode 100644 OnnxStack.Converter/stable_diffusion/config_vae_encoder.json create mode 100644 OnnxStack.Converter/stable_diffusion/convert.py create mode 100644 OnnxStack.Converter/stable_diffusion/models.py create mode 100644 OnnxStack.Converter/stable_diffusion/requirements.txt create mode 100644 OnnxStack.Converter/stable_diffusion/sd_utils/ort.py diff --git 
a/OnnxStack.Converter/README.md b/OnnxStack.Converter/README.md new file mode 100644 index 0000000..00108ee --- /dev/null +++ b/OnnxStack.Converter/README.md @@ -0,0 +1,20 @@ +# OnnxStack.Converter + +## Requirements +```bash +pip install onnxruntime-directml +pip install olive-ai[directml] +python -m pip install -r requirements.txt +``` + +## Usage +```bash +convert.py --optimize --model_input '..\stable-diffusion-v1-5' --model_output '..\converted' --controlnet +``` +`--optimize` - Run the model optimization + +`--model_input` - Safetensor model to convert + +`--model_output` - Output for converted ONNX model + +`--controlnet` - Create a ControlNet enabled Unet model \ No newline at end of file diff --git a/OnnxStack.Converter/latent_consistency/.gitignore b/OnnxStack.Converter/latent_consistency/.gitignore new file mode 100644 index 0000000..324c183 --- /dev/null +++ b/OnnxStack.Converter/latent_consistency/.gitignore @@ -0,0 +1,2 @@ +/footprints/ +/result_*.png diff --git a/OnnxStack.Converter/latent_consistency/README.md b/OnnxStack.Converter/latent_consistency/README.md new file mode 100644 index 0000000..a12b762 --- /dev/null +++ b/OnnxStack.Converter/latent_consistency/README.md @@ -0,0 +1,180 @@ +# Stable Diffusion Optimization + +This folder contains sample use cases of Olive with ONNX Runtime and OpenVINO to optimize: +- Stable Diffusion: [Stable Diffusion v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4), [Stable Diffusion v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5), [Stable Diffusion v2](https://huggingface.co/stabilityai/stable-diffusion-2) +- Stable Diffusion XL: [Stable Diffusion XL Base](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0), [Stable Diffusion XL Refiner](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0) + +Stable Diffusion comprises multiple PyTorch models tied together into a *pipeline*. + +The ONNX Runtime optimization sample will convert each PyTorch model to ONNX, and then run the converted ONNX models through the `OrtTransformersOptimization` pass. The transformer optimization pass performs several time-consuming graph transformations that make the models more efficient for inference at runtime. + +The OpenVINO optimization sample will convert each PyTorch model to OpenVINO IR model by `OpenVINOConversion` pass, and create an `OpenVINOStableDiffusionPipeline` for inference. + +- ONNX Runtime with + - [CUDA EP](#stable-diffusion-and-stable-diffusion-xl-optimization-with-onnx-runtime-cuda-ep) + - DirectML EP: go to examples [Stable Diffusion](../directml/stable_diffusion/README.md), [Stable Diffusion XL](../directml/stable_diffusion_xl/README.md) +- [OpenVINO](#stable-diffusion-optimization-with-openvino) + +## Stable Diffusion and Stable Diffusion XL Optimization with ONNX Runtime CUDA EP + +This sample performs the following optimization workflow for each model in the Stable Diffusion pipeline: +- *PyTorch Model -> Onnx Model -> Transformers Optimized Onnx Model fp16* +
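The two steps in this flow correspond to the `convert` and `optimize` passes defined in the `config_*.json` files added by this commit. As a rough, hypothetical sketch (not part of the patch itself), the snippet below shows the shape of such a two-pass workflow when driven from Python with Olive's `run` API; the model path, loader names, and pass options are placeholder values.

```python
# Hypothetical, trimmed-down Olive workflow illustrating the convert -> optimize
# pass flow described above; the real configurations are the config_*.json files
# in this commit, which also define io_config, evaluators, and accelerator systems.
from olive.workflows import run as olive_run

workflow = {
    "input_model": {
        "type": "PyTorchModel",
        "config": {
            "model_path": "runwayml/stable-diffusion-v1-5",  # placeholder model id
            "model_loader": "unet_load",                     # loader defined in models.py
            "model_script": "models.py",
            "dummy_inputs_func": "unet_conversion_inputs",
        },
    },
    "passes": {
        # PyTorch -> ONNX
        "convert": {"type": "OnnxConversion", "config": {"target_opset": 14}},
        # ONNX -> transformer-optimized ONNX (fp16)
        "optimize": {
            "type": "OrtTransformersOptimization",
            "config": {"model_type": "unet", "float16": True, "use_gpu": True},
        },
    },
    "pass_flows": [["convert", "optimize"]],
    "engine": {"cache_dir": "cache", "output_dir": "footprints"},
}

olive_run(workflow)
```

The full configs in this folder additionally wire up a latency evaluator and the DirectML (or CUDA) execution provider, which are omitted here for brevity.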

+ +Transformers optimization uses the following optimizations to speed up Stable Diffusion in CUDA: +* [Flash Attention](https://arxiv.org/abs/2205.14135) for float16 precision. Flash Attention uses tiling to reduce the number of GPU memory reads/writes, and improves performance with less memory for long sequence lengths. The kernel requires GPUs of Compute Capability >= 7.5 (like T4, A100, and RTX 2060~4090). Only available on Linux. +* [Memory Efficient Attention](https://arxiv.org/abs/2112.05682v2) for float32 precision or older GPUs (like V100). We used the fused multi-head attention kernel in CUTLASS, and the kernel was contributed by xFormers. +* Channel-last (NHWC) convolution. For NVIDIA GPUs with Tensor Core support, the NHWC tensor layout is recommended for convolution. See [Tensor Layouts In Memory: NCHW vs NHWC](https://docs.nvidia.com/deeplearning/performance/dl-performance-convolutional/index.html#tensor-layout). +* GroupNorm for NHWC tensor layout, and SkipGroupNorm fusion, which fuses GroupNorm with Add bias and residual inputs. +* SkipLayerNormalization, which fuses LayerNormalization with Add bias and residual inputs. +* BiasSplitGelu is a fusion of Add bias with SplitGelu activation. +* BiasAdd fuses Add bias and residual. +* Reduction of Transpose nodes by graph transformation. + +#### Prerequisites +##### Clone the repository and install Olive + +Refer to the instructions in the [examples README](../README.md) to clone the repository and install Olive. + + +We use the same Olive workflow config files and scripts as the DirectML examples. The only difference is the `--provider cuda` option provided to the `stable_diffusion.py` and `stable_diffusion_xl.py` scripts. + +So, cd into the corresponding DirectML example folder from the root of the cloned repository: + +**_Stable Diffusion_** +```bash +cd examples/stable_diffusion +``` + +**_Stable Diffusion XL_** +```bash +cd examples/directml/stable_diffusion_xl +``` + +##### Install onnxruntime + +This example requires the latest onnxruntime-gpu code, which can either be built from source or installed from the nightly builds. The following command can be used to install the latest nightly build of onnxruntime-gpu: + +```bash +# uninstall any pre-existing onnxruntime packages +pip uninstall -y onnxruntime onnxruntime-gpu onnxruntime-directml ort-nightly ort-nightly-gpu ort-nightly-directml + +# install onnxruntime-gpu nightly build +pip install ort-nightly-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple/ +``` + +##### Install other dependencies + +Install the necessary Python packages: + +```bash +python -m pip install -r requirements-common.txt +``` + +#### Conversion to ONNX and Latency Optimization + +The easiest way to optimize the pipeline is with the `stable_diffusion.py` and `stable_diffusion_xl.py` scripts. These scripts will enumerate the `config_.json` files and optimize each with Olive, then gather the optimized models into a directory structure suitable for testing inference.
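Concretely, the enumeration these driver scripts perform boils down to a loop like the sketch below; the `convert.py` added by this commit follows the same pattern, with extra handling for the execution provider, ControlNet, and the safety checker. The model path shown is a placeholder.

```python
# Simplified sketch of the per-submodel loop: load each config_<name>.json,
# point it at the model being converted, and hand it to Olive.
import json
from pathlib import Path

from olive.workflows import run as olive_run

script_dir = Path(__file__).resolve().parent
submodel_names = ["vae_encoder", "vae_decoder", "unet", "text_encoder"]

for submodel_name in submodel_names:
    print(f"Optimizing {submodel_name}")
    with (script_dir / f"config_{submodel_name}.json").open() as fin:
        olive_config = json.load(fin)
    # Placeholder path: convert.py passes the --model_input directory here.
    olive_config["input_model"]["config"]["model_path"] = "path/to/model"
    olive_run(olive_config)
```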
+ +**_Stable Diffusion_** +```bash +# default model_id is "runwayml/stable-diffusion-v1-5" +python stable_diffusion.py --provider cuda --optimize +``` + +**_Stable Diffusion XL_** +```bash +# default model_id is "stabilityai/stable-diffusion-xl-base-1.0" +python stable_diffusion_xl.py --provider cuda --optimize [--use_fp16_fixed_vae] + +# or specify a different model_id +python stable_diffusion_xl.py --provider cuda --model_id stabilityai/stable-diffusion-xl-refiner-1.0 --optimize [--use_fp16_fixed_vae] +``` + +`--use_fp16_fixed_vae` is optional. If provided, will use [madebyollin/sdxl-vae-fp16-fix](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix) for the vae models and all sub-models will be entirely in fp16. +Otherwise, the vae models (vae-decoder for base and both vae-decoder and vae-encoder for refiner) will be in fp32 and all other sub-models will be in fp16 with fp32 input/output. + +Once the script successfully completes: +- The optimized ONNX pipeline will be stored under `models/optimized-cuda/[model_id]` (for example `models/optimized-cuda/runwayml/stable-diffusion-v1-5` or `models/optimized-cuda/stabilityai/stable-diffusion-xl-base-1.0`). +- The unoptimized ONNX pipeline (models converted to ONNX, but not run through transformer optimization pass) will be stored under `models/unoptimized/[model_id]` (for example `models/unoptimized/runwayml/stable-diffusion-v1-5` or `models/unoptimized/stabilityai/stable-diffusion-xl-base-1.0`). + +Re-running the script with `--optimize` will delete the output models, but it will *not* delete the Olive cache. Subsequent runs will complete much faster since it will simply be copying previously optimized models; you may use the `--clean_cache` option to start from scratch (not typically used unless you are modifying the scripts, for example). + +### Test Inference with CUDA + +Test ONNX runtime inference with the optimized models using `OnnxStableDiffusionPipeline`: + +**_Stable Diffusion_** +```bash +python stable_diffusion.py --provider cuda --num_images 2 +``` +Inference will loop until the generated image passes the safety checker (otherwise you would see black images). The result will be saved as `result_.png` on disk. + +**_Stable Diffusion XL_** +```bash +python stable_diffusion_xl.py --provider cuda --num_images 2 +``` +The result will be saved as `result_.png` on disk. + +Refer to the corresponding section in the DirectML READMEs for more details on the test inference options: +- [Stable Diffusion](../directml/stable_diffusion/README.md#test-inference) +- [Stable Diffusion XL](../directml/stable_diffusion_xl/README.md#test-inference) + + +## Stable Diffusion Optimization with OpenVINO + +**Contents**: +- [Setup](#setup) +- [Conversion to OpenVINO IR model](#convert-to-openvino-ir-model) +- [Test Inference](#test-inference-with-openvino) + +### Setup + +Olive is currently under pre-release, with constant updates and improvements to the functions and usage. This sample code will be frequently updated as Olive evolves, so it is important to install Olive from source when checking out this code from the main branch. See the [README for examples](https://github.com/microsoft/Olive/blob/main/examples/README.md#important) for detailed instructions on how to do this. + +**Alternatively**, you may install a stable release that we have validated. 
For example: + +``` +# Install Olive from main branch +pip install git+https://github.com/microsoft/Olive#egg=olive-ai[openvino] + +# Clone Olive repo to access sample code +git clone https://github.com/microsoft/olive +``` + +Once you've installed Olive, install the requirements for this sample matching the version of the library you are using: +``` +cd olive/examples/stable_diffusion +pip install -r requirements-ov.txt +``` + +### Convert to OpenVINO IR model + +The easiest way to optimize the pipeline is with the `stable_diffusion.py` helper script: + +``` +python stable_diffusion.py --optimize +``` + +The above command will enumerate the `config_.json` files and optimize each with Olive, then gather the optimized models into a directory structure suitable for testing inference. + +The Stable Diffusion models are large, and the optimization process is resource intensive. It is recommended to run optimization on a system with a minimum of 16GB of memory (preferably 32GB). Expect optimization to take several minutes (especially the U-Net model). + +Once the script successfully completes: +- The converted OpenVINO IR model will be stored under `models/optimized-openvino/[model_id]` (for example `models/optimized-openvino/runwayml/stable-diffusion-v1-5`). + +Re-running the script with `--optimize` will delete the output models, but it will *not* delete the Olive cache. Subsequent runs will complete much faster since it will simply be copying previously optimized models; you may use the `--clean_cache` option to start from scratch (not typically used unless you are modifying the scripts, for example). + +### Test Inference with OpenVINO + +This sample code is primarily intended to illustrate model optimization with Olive, but it also provides a simple interface for testing inference with the OpenVINO models. Inference is done by creating an `OVStableDiffusionPipeline` from the saved models. + + +``` +python stable_diffusion.py --inference --provider openvino +``` +Inference will run and the result will be saved as `result_.png` on disk. + + +Run `python stable_diffusion.py --help` for additional options. A few particularly relevant ones: +- `--image_path `: the input image path for image-to-image inference. +- `--img_to_img_example`: image-to-image example. The default input image is `assets/dog.png`, the default prompt is `amazing watercolor painting`. diff --git a/OnnxStack.Converter/latent_consistency/config.py b/OnnxStack.Converter/latent_consistency/config.py new file mode 100644 index 0000000..1806391 --- /dev/null +++ b/OnnxStack.Converter/latent_consistency/config.py @@ -0,0 +1,8 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License.
+# -------------------------------------------------------------------------- + +vae_sample_size = 768 +unet_sample_size = 96 +cross_attention_dim = 768 diff --git a/OnnxStack.Converter/latent_consistency/config_controlnet.json b/OnnxStack.Converter/latent_consistency/config_controlnet.json new file mode 100644 index 0000000..0d9331f --- /dev/null +++ b/OnnxStack.Converter/latent_consistency/config_controlnet.json @@ -0,0 +1,124 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "SimianLuo/LCM_Dreamshaper_v7", + "model_loader": "controlnet_unet_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "sample", "timestep", "encoder_hidden_states", "down_block_0_additional_residual", "down_block_1_additional_residual", "down_block_2_additional_residual", "down_block_3_additional_residual", "down_block_4_additional_residual", "down_block_5_additional_residual", "down_block_6_additional_residual", "down_block_7_additional_residual", "down_block_8_additional_residual", "down_block_9_additional_residual", "down_block_10_additional_residual", "down_block_11_additional_residual", "mid_block_additional_residual", "return_dict" ], + "output_names": [ "out_sample" ], + "dynamic_axes": { + "sample": {"0": "unet_sample_batch", "1": "unet_sample_channels", "2": "unet_sample_height", "3": "unet_sample_width"}, + "timestep": {"0": "unet_time_batch"}, + "encoder_hidden_states": {"0": "unet_hidden_batch", "1": "unet_hidden_sequence"}, + "timestep_cond": { "0": "batch_size" }, + "down_block_0_additional_residual": {"0": "cnet_db0_batch", "1": "cnet_db0_channels", "2": "cnet_db0_height", "3": "cnet_db0_width"}, + "down_block_1_additional_residual": {"0": "cnet_db1_batch", "1": "cnet_db1_channels", "2": "cnet_db1_height", "3": "cnet_db1_width"}, + "down_block_2_additional_residual": {"0": "cnet_db2_batch", "1": "cnet_db2_channels", "2": "cnet_db2_height", "3": "cnet_db2_width"}, + "down_block_3_additional_residual": {"0": "cnet_db3_batch", "1": "cnet_db3_channels", "2": "cnet_db3_height2", "3": "cnet_db3_width2"}, + "down_block_4_additional_residual": {"0": "cnet_db4_batch", "1": "cnet_db4_channels", "2": "cnet_db4_height2", "3": "cnet_db4_width2"}, + "down_block_5_additional_residual": {"0": "cnet_db5_batch", "1": "cnet_db5_channels", "2": "cnet_db5_height2", "3": "cnet_db5_width2"}, + "down_block_6_additional_residual": {"0": "cnet_db6_batch", "1": "cnet_db6_channels", "2": "cnet_db6_height4", "3": "cnet_db6_width4"}, + "down_block_7_additional_residual": {"0": "cnet_db7_batch", "1": "cnet_db7_channels", "2": "cnet_db7_height4", "3": "cnet_db7_width4"}, + "down_block_8_additional_residual": {"0": "cnet_db8_batch", "1": "cnet_db8_channels", "2": "cnet_db8_height4", "3": "cnet_db8_width4"}, + "down_block_9_additional_residual": {"0": "cnet_db9_batch", "1": "cnet_db9_channels", "2": "cnet_db9_height8", "3": "cnet_db9_width8"}, + "down_block_10_additional_residual": {"0": "cnet_db10_batch", "1": "cnet_db10_channels", "2": "cnet_db10_height8", "3": "cnet_db10_width8"}, + "down_block_11_additional_residual": {"0": "cnet_db11_batch", "1": "cnet_db11_channels", "2": "cnet_db11_height8", "3": "cnet_db11_width8"}, + "mid_block_additional_residual": {"0": "cnet_mbar_batch", "1": "cnet_mbar_channels", "2": "cnet_mbar_height8", "3": "cnet_mbar_width8"} + } + }, + "dummy_inputs_func": "controlnet_unet_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + 
"DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "controlnet_unet_data_loader", + "batch_size": 2 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 14, + "save_as_external_data": true, + "all_tensors_to_one_file": true, + "external_data_name": "weights.pb" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "controlnet", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/latent_consistency/config_safety_checker.json b/OnnxStack.Converter/latent_consistency/config_safety_checker.json new file mode 100644 index 0000000..bef935f --- /dev/null +++ b/OnnxStack.Converter/latent_consistency/config_safety_checker.json @@ -0,0 +1,124 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "SimianLuo/LCM_Dreamshaper_v7", + "model_loader": "safety_checker_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "clip_input", "images" ], + "output_names": [ "out_images", "has_nsfw_concepts" ], + "dynamic_axes": { + "clip_input": { "0": "batch", "1": "channels", "2": "height", "3": "width" }, + "images": { "0": "batch", "1": "height", "2": "width", "3": "channels" } + } + }, + "dummy_inputs_func": "safety_checker_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "safety_checker_data_loader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 14 + } + }, + "ov_convert": { + "type": "OpenVINOConversion", + "config": { + "user_script": "models.py", + "example_input_func": "safety_checker_conversion_inputs", + "output_model": "safety_checker" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, 
+ "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "safety_checker", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/latent_consistency/config_text_encoder.json b/OnnxStack.Converter/latent_consistency/config_text_encoder.json new file mode 100644 index 0000000..0a1c5de --- /dev/null +++ b/OnnxStack.Converter/latent_consistency/config_text_encoder.json @@ -0,0 +1,121 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "SimianLuo/LCM_Dreamshaper_v7", + "model_loader": "text_encoder_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "input_ids" ], + "output_names": [ "last_hidden_state", "pooler_output" ], + "dynamic_axes": { "input_ids": { "0": "batch", "1": "sequence" } } + }, + "dummy_inputs_func": "text_encoder_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "text_encoder_data_loader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 14 + } + }, + "ov_convert": { + "type": "OpenVINOConversion", + "config": { + "user_script": "models.py", + "example_input_func": "text_encoder_conversion_inputs", + "output_model": "text_encoder" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "clip", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + 
"force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "clip", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "text_encoder", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/latent_consistency/config_unet.json b/OnnxStack.Converter/latent_consistency/config_unet.json new file mode 100644 index 0000000..1c3b983 --- /dev/null +++ b/OnnxStack.Converter/latent_consistency/config_unet.json @@ -0,0 +1,129 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "SimianLuo/LCM_Dreamshaper_v7", + "model_loader": "unet_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "sample", "timestep", "encoder_hidden_states", "timestep_cond", "return_dict" ], + "output_names": [ "out_sample" ], + "dynamic_axes": { + "sample": {"0": "unet_sample_batch", "1": "unet_sample_channels", "2": "unet_sample_height", "3": "unet_sample_width"}, + "timestep": {"0": "unet_time_batch"}, + "encoder_hidden_states": {"0": "unet_hidden_batch", "1": "unet_hidden_sequence"}, + "timestep_cond": { "0": "batch_size" } + } + }, + "dummy_inputs_func": "unet_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "unet_data_loader", + "batch_size": 2 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 14, + "save_as_external_data": true, + "all_tensors_to_one_file": true, + "external_data_name": "weights.pb" + } + }, + "ov_convert": { + "type": "OpenVINOConversion", + "config": { + "user_script": "models.py", + "example_input_func": "get_unet_ov_example_input", + "output_model": "unet" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + 
["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "unet", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/latent_consistency/config_vae_decoder.json b/OnnxStack.Converter/latent_consistency/config_vae_decoder.json new file mode 100644 index 0000000..755ab9a --- /dev/null +++ b/OnnxStack.Converter/latent_consistency/config_vae_decoder.json @@ -0,0 +1,121 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "SimianLuo/LCM_Dreamshaper_v7", + "model_loader": "vae_decoder_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "latent_sample", "return_dict" ], + "output_names": [ "sample" ], + "dynamic_axes": { "latent_sample": { "0": "batch", "1": "channels", "2": "height", "3": "width" } } + }, + "dummy_inputs_func": "vae_decoder_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "vae_decoder_data_loader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 14 + } + }, + "ov_convert": { + "type": "OpenVINOConversion", + "config": { + "user_script": "models.py", + "example_input_func": "vae_decoder_conversion_inputs", + "output_model": "vae_decoder" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "vae", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "vae", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "vae_decoder", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/latent_consistency/config_vae_encoder.json b/OnnxStack.Converter/latent_consistency/config_vae_encoder.json new file mode 100644 index 0000000..7a664ea --- /dev/null +++ b/OnnxStack.Converter/latent_consistency/config_vae_encoder.json @@ -0,0 +1,121 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + 
"model_path": "SimianLuo/LCM_Dreamshaper_v7", + "model_loader": "vae_encoder_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "sample", "return_dict" ], + "output_names": [ "latent_sample" ], + "dynamic_axes": { "sample": { "0": "batch", "1": "channels", "2": "height", "3": "width" } } + }, + "dummy_inputs_func": "vae_encoder_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "vae_encoder_data_loader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 14 + } + }, + "ov_convert": { + "type": "OpenVINOConversion", + "config": { + "user_script": "models.py", + "example_input_func": "vae_encoder_conversion_inputs", + "output_model": "vae_encoder" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "vae", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "vae", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "vae_encoder", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/latent_consistency/convert.py b/OnnxStack.Converter/latent_consistency/convert.py new file mode 100644 index 0000000..2c476a0 --- /dev/null +++ b/OnnxStack.Converter/latent_consistency/convert.py @@ -0,0 +1,272 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- +import argparse +import json +import shutil +import sys +import warnings +from pathlib import Path +from typing import Dict + +import config +import torch +from diffusers import DiffusionPipeline +from packaging import version + +from olive.common.utils import set_tempdir +from olive.workflows import run as olive_run + +# pylint: disable=redefined-outer-name +# ruff: noqa: TID252, T201 + + +def save_image(result, batch_size, provider, num_images, images_saved, image_callback=None): + passed_safety_checker = 0 + for image_index in range(batch_size): + if result.nsfw_content_detected is None or not result.nsfw_content_detected[image_index]: + passed_safety_checker += 1 + if images_saved < num_images: + output_path = f"result_{images_saved}.png" + result.images[image_index].save(output_path) + if image_callback: + image_callback(images_saved, output_path) + images_saved += 1 + print(f"Generated {output_path}") + print(f"Inference Batch End ({passed_safety_checker}/{batch_size} images).") + print("Images passed the safety checker.") + return images_saved + + +def run_inference_loop( + pipeline, + prompt, + num_images, + batch_size, + image_size, + num_inference_steps, + guidance_scale, + strength: float, + provider: str, + image_callback=None, + step_callback=None, +): + images_saved = 0 + + def update_steps(step, timestep, latents): + if step_callback: + step_callback((images_saved // batch_size) * num_inference_steps + step) + + while images_saved < num_images: + print(f"\nInference Batch Start (batch size = {batch_size}).") + + kwargs = {} + + result = pipeline( + [prompt] * batch_size, + num_inference_steps=num_inference_steps, + callback=update_steps if step_callback else None, + height=image_size, + width=image_size, + guidance_scale=guidance_scale, + **kwargs, + ) + + images_saved = save_image(result, batch_size, provider, num_images, images_saved, image_callback) + + +def update_config_with_provider(config: Dict, provider: str): + if provider == "dml": + # DirectML EP is the default, so no need to update config. + return config + elif provider == "cuda": + from sd_utils.ort import update_cuda_config + + return update_cuda_config(config) + else: + raise ValueError(f"Unsupported provider: {provider}") + + +def optimize( + model_input: str, + model_output: Path, + provider: str, + controlnet: bool +): + from google.protobuf import __version__ as protobuf_version + + # protobuf 4.x aborts with OOM when optimizing unet + if version.parse(protobuf_version) > version.parse("3.20.3"): + print("This script requires protobuf 3.20.3. Please ensure your package version matches requirements.txt.") + sys.exit(1) + + model_dir = model_input + script_dir = Path(__file__).resolve().parent + + # Clean up previously optimized models, if any. + shutil.rmtree(script_dir / "footprints", ignore_errors=True) + shutil.rmtree(model_output, ignore_errors=True) + + + # Load the entire PyTorch pipeline to ensure all models and their configurations are downloaded and cached. + # This avoids an issue where the non-ONNX components (tokenizer, scheduler, and feature extractor) are not + # automatically cached correctly if individual models are fetched one at a time. 
+ print("Download stable diffusion PyTorch pipeline...") + pipeline = DiffusionPipeline.from_pretrained(model_dir, torch_dtype=torch.float32, **{"local_files_only": True}) + config.vae_sample_size = pipeline.vae.config.sample_size + config.cross_attention_dim = pipeline.unet.config.cross_attention_dim + config.unet_sample_size = pipeline.unet.config.sample_size + + model_info = {} + + submodel_names = ["vae_encoder", "vae_decoder", "unet" , "text_encoder"] + + has_safety_checker = getattr(pipeline, "safety_checker", None) is not None + + if has_safety_checker: + submodel_names.append("safety_checker") + + if controlnet: + submodel_names.append("controlnet") + + for submodel_name in submodel_names: + print(f"\nOptimizing {submodel_name}") + + olive_config = None + with (script_dir / f"config_{submodel_name}.json").open() as fin: + olive_config = json.load(fin) + olive_config = update_config_with_provider(olive_config, provider) + + if submodel_name in ("unet", "controlnet", "text_encoder"): + olive_config["input_model"]["config"]["model_path"] = model_dir + else: + # Only the unet & text encoder are affected by LoRA, so it's better to use the base model ID for + # other models: the Olive cache is based on the JSON config, and two LoRA variants with the same + # base model ID should be able to reuse previously optimized copies. + olive_config["input_model"]["config"]["model_path"] = model_dir + + run_res = olive_run(olive_config) + + from sd_utils.ort import save_optimized_onnx_submodel + + save_optimized_onnx_submodel(submodel_name, provider, model_info) + + from sd_utils.ort import save_onnx_pipeline + + save_onnx_pipeline( + has_safety_checker, model_info, model_output, pipeline, submodel_names + ) + + return model_info + + +def parse_common_args(raw_args): + parser = argparse.ArgumentParser("Common arguments") + + parser.add_argument("--model_input", default="stable-diffusion-v1-5", type=str) + parser.add_argument("--model_output", default="stable-diffusion-v1-5", type=Path) + parser.add_argument("--controlnet",action="store_true", help="Create ControlNet Unet Model") + parser.add_argument( + "--provider", default="dml", type=str, choices=["dml", "cuda"], help="Execution provider to use" + ) + parser.add_argument("--optimize", action="store_true", help="Runs the optimization step") + parser.add_argument("--clean_cache", action="store_true", help="Deletes the Olive cache") + parser.add_argument("--test_unoptimized", action="store_true", help="Use unoptimized model for inference") + parser.add_argument("--batch_size", default=1, type=int, help="Number of images to generate per batch") + parser.add_argument( + "--prompt", + default=( + "castle surrounded by water and nature, village, volumetric lighting, photorealistic, " + "detailed and intricate, fantasy, epic cinematic shot, mountains, 8k ultra hd" + ), + type=str, + ) + parser.add_argument( + "--guidance_scale", + default=7.5, + type=float, + help="Guidance scale as defined in Classifier-Free Diffusion Guidance", + ) + parser.add_argument("--num_images", default=1, type=int, help="Number of images to generate") + parser.add_argument("--num_inference_steps", default=50, type=int, help="Number of steps in diffusion process") + parser.add_argument("--tempdir", default=None, type=str, help="Root directory for tempfile directories and files") + parser.add_argument( + "--strength", + default=1.0, + type=float, + help="Value between 0.0 and 1.0, that controls the amount of noise that is added to the input image. 
" + "Values that approach 1.0 enable lots of variations but will also produce images " + "that are not semantically consistent with the input.", + ) + parser.add_argument("--image_size", default=512, type=int, help="Width and height of the images to generate") + + return parser.parse_known_args(raw_args) + + +def parse_ort_args(raw_args): + parser = argparse.ArgumentParser("ONNX Runtime arguments") + + parser.add_argument( + "--static_dims", + action="store_true", + help="DEPRECATED (now enabled by default). Use --dynamic_dims to disable static_dims.", + ) + parser.add_argument("--dynamic_dims", action="store_true", help="Disable static shape optimization") + + return parser.parse_known_args(raw_args) + + +def main(raw_args=None): + common_args, extra_args = parse_common_args(raw_args) + + provider = common_args.provider + model_input = common_args.model_input + model_output = common_args.model_output + + script_dir = Path(__file__).resolve().parent + + + if common_args.clean_cache: + shutil.rmtree(script_dir / "cache", ignore_errors=True) + + guidance_scale = common_args.guidance_scale + + ort_args = None, None + ort_args, extra_args = parse_ort_args(extra_args) + + if common_args.optimize or not model_output.exists(): + set_tempdir(common_args.tempdir) + + # TODO(jstoecker): clean up warning filter (mostly during conversion from torch to ONNX) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + + from sd_utils.ort import validate_args + + validate_args(ort_args, common_args.provider) + optimize(common_args.model_input, common_args.model_output, common_args.provider, common_args.controlnet) + + if not common_args.optimize: + model_dir = model_output / "F32" if common_args.test_unoptimized else model_output / "F16" + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + + from sd_utils.ort import get_ort_pipeline + + pipeline = get_ort_pipeline(model_dir, common_args, ort_args, guidance_scale) + run_inference_loop( + pipeline, + common_args.prompt, + common_args.num_images, + common_args.batch_size, + common_args.image_size, + common_args.num_inference_steps, + guidance_scale, + common_args.strength, + provider=provider, + ) + + +if __name__ == "__main__": + main() diff --git a/OnnxStack.Converter/latent_consistency/models.py b/OnnxStack.Converter/latent_consistency/models.py new file mode 100644 index 0000000..8b3de3f --- /dev/null +++ b/OnnxStack.Converter/latent_consistency/models.py @@ -0,0 +1,336 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- +import config +import torch +from typing import Union, Optional, Tuple +from diffusers import AutoencoderKL, UNet2DConditionModel, ControlNetModel +from diffusers.models.controlnet import ControlNetOutput, BaseOutput as ControlNetBaseOutput +from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker +from transformers.models.clip.modeling_clip import CLIPTextModel +from dataclasses import dataclass + +# Helper latency-only dataloader that creates random tensors with no label +class RandomDataLoader: + def __init__(self, create_inputs_func, batchsize, torch_dtype): + self.create_input_func = create_inputs_func + self.batchsize = batchsize + self.torch_dtype = torch_dtype + + def __getitem__(self, idx): + label = None + return self.create_input_func(self.batchsize, self.torch_dtype), label + + + +# ----------------------------------------------------------------------------- +# TEXT ENCODER +# ----------------------------------------------------------------------------- + + +def text_encoder_inputs(batchsize, torch_dtype): + return torch.zeros((batchsize, 77), dtype=torch_dtype) + + +def text_encoder_load(model_name): + model = CLIPTextModel.from_pretrained(model_name, subfolder="text_encoder") + if is_lora_model(model_name): + merge_lora_weights(model, model_name, "text_encoder") + return model + + +def text_encoder_conversion_inputs(model=None): + return text_encoder_inputs(1, torch.int32) + + +def text_encoder_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(text_encoder_inputs, batchsize, torch.int32) + + +# ----------------------------------------------------------------------------- +# UNET +# ----------------------------------------------------------------------------- + + +def unet_inputs(batchsize, torch_dtype, is_conversion_inputs=False): + # TODO(jstoecker): Rename onnx::Concat_4 to text_embeds and onnx::Shape_5 to time_ids + inputs = { + "sample": torch.rand((batchsize, 4, config.unet_sample_size, config.unet_sample_size), dtype=torch_dtype), + "timestep": torch.rand((batchsize,), dtype=torch_dtype), + "encoder_hidden_states": torch.rand((batchsize, 77, config.cross_attention_dim), dtype=torch_dtype), + } + + # use as kwargs since they won't be in the correct position if passed along with the tuple of inputs + kwargs = { + "timestep_cond": torch.rand((batchsize, 256), dtype=torch_dtype), + "return_dict": False, + } + if is_conversion_inputs: + inputs["additional_inputs"] = { + **kwargs, + "added_cond_kwargs": { + "text_embeds": torch.rand((1, 1280), dtype=torch_dtype), + "time_ids": torch.rand((1, 5), dtype=torch_dtype), + }, + } + else: + inputs.update(kwargs) + inputs["onnx::Concat_4"] = torch.rand((1, 1280), dtype=torch_dtype) + inputs["onnx::Shape_5"] = torch.rand((1, 5), dtype=torch_dtype) + + return inputs + + +def unet_load(model_name): + model = UNet2DConditionModel.from_pretrained(model_name, subfolder="unet") + if is_lora_model(model_name): + merge_lora_weights(model, model_name, "unet") + return model + + +def unet_conversion_inputs(model=None): + return tuple(unet_inputs(1, torch.float32, True).values()) + + +def unet_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(unet_inputs, batchsize, torch.float16) + +# ----------------------------------------------------------------------------- +# CONTROLNET - UNET +# ----------------------------------------------------------------------------- + +class 
PatchedUNet2DConditionModel(UNet2DConditionModel): + def forward( + self, + sample: torch.FloatTensor, + timestep: Union[torch.Tensor, float, int], + encoder_hidden_states: torch.Tensor, + timestep_cond: torch.Tensor, + down_block_0_additional_residual: torch.Tensor, + down_block_1_additional_residual: torch.Tensor, + down_block_2_additional_residual: torch.Tensor, + down_block_3_additional_residual: torch.Tensor, + down_block_4_additional_residual: torch.Tensor, + down_block_5_additional_residual: torch.Tensor, + down_block_6_additional_residual: torch.Tensor, + down_block_7_additional_residual: torch.Tensor, + down_block_8_additional_residual: torch.Tensor, + down_block_9_additional_residual: torch.Tensor, + down_block_10_additional_residual: torch.Tensor, + down_block_11_additional_residual: torch.Tensor, + mid_block_additional_residual: torch.Tensor, + ) -> Union[UNet2DConditionModel, Tuple]: + down_block_add_res = ( + down_block_0_additional_residual, down_block_1_additional_residual, down_block_2_additional_residual, + down_block_3_additional_residual, down_block_4_additional_residual, down_block_5_additional_residual, + down_block_6_additional_residual, down_block_7_additional_residual, down_block_8_additional_residual, + down_block_9_additional_residual, down_block_10_additional_residual, down_block_11_additional_residual) + return super().forward( + sample = sample, + timestep = timestep, + encoder_hidden_states = encoder_hidden_states, + timestep_cond = timestep_cond, + down_block_additional_residuals = down_block_add_res, + mid_block_additional_residual = mid_block_additional_residual, + return_dict = False + ) + +def controlnet_unet_inputs(batchsize, torch_dtype): + return { + "sample": torch.rand((batchsize, 4, config.unet_sample_size, config.unet_sample_size), dtype=torch_dtype), + "timestep": torch.rand((batchsize,), dtype=torch_dtype), + "encoder_hidden_states": torch.rand((batchsize, 77, config.cross_attention_dim), dtype=torch_dtype), + "timestep_cond": torch.rand((batchsize, 256), dtype=torch_dtype), + "down_block_0_additional_residual": torch.rand((batchsize, 320, config.unet_sample_size, config.unet_sample_size), dtype=torch_dtype), + "down_block_1_additional_residual": torch.rand((batchsize, 320, config.unet_sample_size, config.unet_sample_size), dtype=torch_dtype), + "down_block_2_additional_residual": torch.rand((batchsize, 320, config.unet_sample_size, config.unet_sample_size), dtype=torch_dtype), + "down_block_3_additional_residual": torch.rand((batchsize, 320, config.unet_sample_size // 2, config.unet_sample_size // 2), dtype=torch_dtype), + "down_block_4_additional_residual": torch.rand((batchsize, 640, config.unet_sample_size // 2, config.unet_sample_size // 2), dtype=torch_dtype), + "down_block_5_additional_residual": torch.rand((batchsize, 640, config.unet_sample_size // 2, config.unet_sample_size // 2), dtype=torch_dtype), + "down_block_6_additional_residual": torch.rand((batchsize, 640, config.unet_sample_size // 4, config.unet_sample_size // 4), dtype=torch_dtype), + "down_block_7_additional_residual": torch.rand((batchsize, 1280, config.unet_sample_size // 4, config.unet_sample_size // 4), dtype=torch_dtype), + "down_block_8_additional_residual": torch.rand((batchsize, 1280, config.unet_sample_size // 4, config.unet_sample_size // 4), dtype=torch_dtype), + "down_block_9_additional_residual": torch.rand((batchsize, 1280, config.unet_sample_size // 8, config.unet_sample_size // 8), dtype=torch_dtype), + "down_block_10_additional_residual": 
torch.rand((batchsize, 1280, config.unet_sample_size // 8, config.unet_sample_size // 8), dtype=torch_dtype), + "down_block_11_additional_residual": torch.rand((batchsize, 1280, config.unet_sample_size // 8, config.unet_sample_size // 8), dtype=torch_dtype), + "mid_block_additional_residual": torch.rand((batchsize, 1280, config.unet_sample_size // 8, config.unet_sample_size // 8), dtype=torch_dtype) + } + + +def controlnet_unet_load(model_name): + model = PatchedUNet2DConditionModel.from_pretrained(model_name, subfolder="unet") + return model + + +def controlnet_unet_conversion_inputs(model): + return tuple(controlnet_unet_inputs(1, torch.float32).values()) + + +def controlnet_unet_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(controlnet_unet_inputs, batchsize, torch.float16) + +# ----------------------------------------------------------------------------- +# VAE ENCODER +# ----------------------------------------------------------------------------- + + +def vae_encoder_inputs(batchsize, torch_dtype): + return {"sample": torch.rand((batchsize, 3, config.vae_sample_size, config.vae_sample_size), dtype=torch_dtype)} + + +def vae_encoder_load(model_name): + model = AutoencoderKL.from_pretrained(model_name, subfolder="vae") + model.forward = lambda sample: model.encode(sample)[0].sample() + return model + + +def vae_encoder_conversion_inputs(model=None): + return tuple(vae_encoder_inputs(1, torch.float32).values()) + + +def vae_encoder_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(vae_encoder_inputs, batchsize, torch.float16) + + +# ----------------------------------------------------------------------------- +# VAE DECODER +# ----------------------------------------------------------------------------- + + +def vae_decoder_inputs(batchsize, torch_dtype): + return { + "latent_sample": torch.rand((batchsize, 4, config.unet_sample_size, config.unet_sample_size), dtype=torch_dtype) + } + + +def vae_decoder_load(model_name): + model = AutoencoderKL.from_pretrained(model_name, subfolder="vae") + model.forward = model.decode + return model + + +def vae_decoder_conversion_inputs(model=None): + return tuple(vae_decoder_inputs(1, torch.float32).values()) + + +def vae_decoder_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(vae_decoder_inputs, batchsize, torch.float16) + + +# ----------------------------------------------------------------------------- +# SAFETY CHECKER +# ----------------------------------------------------------------------------- + + +def safety_checker_inputs(batchsize, torch_dtype): + return { + "clip_input": torch.rand((batchsize, 3, 224, 224), dtype=torch_dtype), + "images": torch.rand((batchsize, config.vae_sample_size, config.vae_sample_size, 3), dtype=torch_dtype), + } + + +def safety_checker_load(model_name): + model = StableDiffusionSafetyChecker.from_pretrained(model_name, subfolder="safety_checker") + model.forward = model.forward_onnx + return model + + +def safety_checker_conversion_inputs(model=None): + return tuple(safety_checker_inputs(1, torch.float32).values()) + + +def safety_checker_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(safety_checker_inputs, batchsize, torch.float16) + + +# ----------------------------------------------------------------------------- +# LoRA weights +# ----------------------------------------------------------------------------- + +def is_lora_model(model_name): + # TODO(jstoecker): might be a better way to detect 
(e.g. presence of LORA weights file) + return False + + +# Merges LoRA weights into the layers of a base model +def merge_lora_weights(base_model, lora_model_id, submodel_name="unet", scale=1.0): + import inspect + from collections import defaultdict + from functools import reduce + + try: + from diffusers.loaders import LORA_WEIGHT_NAME + except ImportError: + # moved in version 0.24.0 + from diffusers.loaders.lora import LORA_WEIGHT_NAME + from diffusers.models.attention_processor import LoRAAttnProcessor + from diffusers.utils.hub_utils import _get_model_file + + parameters = inspect.signature(_get_model_file).parameters + + kwargs = {} + if "use_auth_token" in parameters: + kwargs["use_auth_token"] = None + elif "token" in parameters: + kwargs["token"] = None + + # Load LoRA weights + model_file = _get_model_file( + lora_model_id, + weights_name=LORA_WEIGHT_NAME, + cache_dir=None, + force_download=False, + resume_download=False, + proxies=None, + local_files_only=False, + revision=None, + subfolder=None, + user_agent={ + "file_type": "attn_procs_weights", + "framework": "pytorch", + }, + **kwargs, + ) + lora_state_dict = torch.load(model_file, map_location="cpu") + + # All keys in the LoRA state dictionary should have 'lora' somewhere in the string. + keys = list(lora_state_dict.keys()) + assert all("lora" in k for k in keys) + + if all(key.startswith(submodel_name) for key in keys): + # New format (https://github.com/huggingface/diffusers/pull/2918) supports LoRA weights in both the + # unet and text encoder where keys are prefixed with 'unet' or 'text_encoder', respectively. + submodel_state_dict = {k: v for k, v in lora_state_dict.items() if k.startswith(submodel_name)} + else: + # Old format. Keys will not have any prefix. This only applies to unet, so exit early if this is + # optimizing the text encoder. 
+ if submodel_name != "unet": + return + submodel_state_dict = lora_state_dict + + # Group LoRA weights into attention processors + attn_processors = {} + lora_grouped_dict = defaultdict(dict) + for key, value in submodel_state_dict.items(): + attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:]) + lora_grouped_dict[attn_processor_key][sub_key] = value + + for key, value_dict in lora_grouped_dict.items(): + rank = value_dict["to_k_lora.down.weight"].shape[0] + cross_attention_dim = value_dict["to_k_lora.down.weight"].shape[1] + hidden_size = value_dict["to_k_lora.up.weight"].shape[0] + + attn_processors[key] = LoRAAttnProcessor( + hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, rank=rank + ) + attn_processors[key].load_state_dict(value_dict) + + # Merge LoRA attention processor weights into existing Q/K/V/Out weights + for name, proc in attn_processors.items(): + attention_name = name[: -len(".processor")] + attention = reduce(getattr, attention_name.split(sep="."), base_model) + attention.to_q.weight.data += scale * torch.mm(proc.to_q_lora.up.weight, proc.to_q_lora.down.weight) + attention.to_k.weight.data += scale * torch.mm(proc.to_k_lora.up.weight, proc.to_k_lora.down.weight) + attention.to_v.weight.data += scale * torch.mm(proc.to_v_lora.up.weight, proc.to_v_lora.down.weight) + attention.to_out[0].weight.data += scale * torch.mm(proc.to_out_lora.up.weight, proc.to_out_lora.down.weight) diff --git a/OnnxStack.Converter/latent_consistency/requirements.txt b/OnnxStack.Converter/latent_consistency/requirements.txt new file mode 100644 index 0000000..15b9198 --- /dev/null +++ b/OnnxStack.Converter/latent_consistency/requirements.txt @@ -0,0 +1,9 @@ +accelerate +diffusers +onnx +pillow +protobuf==3.20.3 # protobuf 4.x aborts with OOM when optimizing unet +tabulate +torch +transformers +onnxruntime-directml>=1.16.0 diff --git a/OnnxStack.Converter/latent_consistency/sd_utils/ort.py b/OnnxStack.Converter/latent_consistency/sd_utils/ort.py new file mode 100644 index 0000000..ad49818 --- /dev/null +++ b/OnnxStack.Converter/latent_consistency/sd_utils/ort.py @@ -0,0 +1,172 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +import os +import json +import shutil +import sys +from pathlib import Path +from typing import Dict + +import onnxruntime as ort +from diffusers import OnnxRuntimeModel, OnnxStableDiffusionPipeline +from onnxruntime import __version__ as OrtVersion +from packaging import version + +from olive.model import ONNXModelHandler + +# ruff: noqa: TID252, T201 + + +def update_cuda_config(config: Dict): + if version.parse(OrtVersion) < version.parse("1.17.0"): + # disable skip_group_norm fusion since there is a shape inference bug which leads to invalid models + config["passes"]["optimize_cuda"]["config"]["optimization_options"] = {"enable_skip_group_norm": False} + config["pass_flows"] = [["convert", "optimize_cuda"]] + config["systems"]["local_system"]["config"]["accelerators"][0]["execution_providers"] = ["CUDAExecutionProvider"] + return config + + +def validate_args(args, provider): + ort.set_default_logger_severity(4) + if args.static_dims: + print( + "WARNING: the --static_dims option is deprecated, and static shape optimization is enabled by default. " + "Use --dynamic_dims to disable static shape optimization." 
+ ) + + validate_ort_version(provider) + + +def validate_ort_version(provider: str): + if provider == "dml" and version.parse(OrtVersion) < version.parse("1.16.0"): + print("This script requires onnxruntime-directml 1.16.0 or newer") + sys.exit(1) + elif provider == "cuda" and version.parse(OrtVersion) < version.parse("1.17.0"): + if version.parse(OrtVersion) < version.parse("1.16.2"): + print("This script requires onnxruntime-gpu 1.16.2 or newer") + sys.exit(1) + print( + f"WARNING: onnxruntime {OrtVersion} has known issues with shape inference for SkipGroupNorm. Will disable" + " skip_group_norm fusion. onnxruntime-gpu 1.17.0 or newer is strongly recommended!" + ) + + +def save_optimized_onnx_submodel(submodel_name, provider, model_info): + footprints_file_path = ( + Path(__file__).resolve().parents[1] / "footprints" / f"{submodel_name}_gpu-{provider}_footprints.json" + ) + with footprints_file_path.open("r") as footprint_file: + footprints = json.load(footprint_file) + + conversion_footprint = None + optimizer_footprint = None + for footprint in footprints.values(): + if footprint["from_pass"] == "OnnxConversion": + conversion_footprint = footprint + elif footprint["from_pass"] == "OrtTransformersOptimization": + optimizer_footprint = footprint + + assert conversion_footprint + assert optimizer_footprint + + unoptimized_olive_model = ONNXModelHandler(**conversion_footprint["model_config"]["config"]) + optimized_olive_model = ONNXModelHandler(**optimizer_footprint["model_config"]["config"]) + + model_info[submodel_name] = { + "unoptimized": { + "path": Path(unoptimized_olive_model.model_path), + }, + "optimized": { + "path": Path(optimized_olive_model.model_path), + }, + } + + print(f"Unoptimized Model : {model_info[submodel_name]['unoptimized']['path']}") + print(f"Optimized Model : {model_info[submodel_name]['optimized']['path']}") + + +def save_onnx_pipeline( + has_safety_checker, model_info, model_output, pipeline, submodel_names +): + # Save the unoptimized models in a directory structure that the diffusers library can load and run. + # This is optional, and the optimized models can be used directly in a custom pipeline if desired. 
+ print("\nCreating ONNX pipeline...") + + optimized_model_dir = model_output / "Optimized" + unoptimized_model_dir = model_output / "Default" + has_controlnet = 'controlnet' in submodel_names + if has_safety_checker: + safety_checker = OnnxRuntimeModel.from_pretrained(model_info["safety_checker"]["unoptimized"]["path"].parent) + else: + safety_checker = None + + onnx_pipeline = OnnxStableDiffusionPipeline( + vae_encoder=OnnxRuntimeModel.from_pretrained(model_info["vae_encoder"]["unoptimized"]["path"].parent), + vae_decoder=OnnxRuntimeModel.from_pretrained(model_info["vae_decoder"]["unoptimized"]["path"].parent), + text_encoder=OnnxRuntimeModel.from_pretrained(model_info["text_encoder"]["unoptimized"]["path"].parent), + tokenizer=pipeline.tokenizer, + unet=OnnxRuntimeModel.from_pretrained(model_info["unet"]["unoptimized"]["path"].parent), + scheduler=pipeline.scheduler, + safety_checker=safety_checker, + feature_extractor=pipeline.feature_extractor, + requires_safety_checker=True, + ) + + if has_controlnet: + controlnet=OnnxRuntimeModel.from_pretrained(model_info["controlnet"]["unoptimized"]["path"].parent) + + print("Saving unoptimized models...") + onnx_pipeline.save_pretrained(unoptimized_model_dir) + if has_controlnet: + controlnet.save_pretrained(unoptimized_model_dir / "controlnet" ) + + # Create a copy of the unoptimized model directory, then overwrite with optimized models from the olive cache. + print("Copying optimized models...") + shutil.copytree(unoptimized_model_dir, optimized_model_dir, ignore=shutil.ignore_patterns("weights.pb")) + for submodel_name in submodel_names: + src_path = model_info[submodel_name]["optimized"]["path"] + dst_path = optimized_model_dir / submodel_name / "model.onnx" + exists = os.path.exists(dst_path) + if not exists: + os.mkdir(optimized_model_dir / submodel_name) + shutil.copyfile(src_path, dst_path) + + print(f"The default pipeline is located here: {unoptimized_model_dir}") + print(f"The optimized pipeline is located here: {optimized_model_dir}") + + +def get_ort_pipeline(model_dir, common_args, ort_args, guidance_scale): + ort.set_default_logger_severity(3) + + print("Loading models into ORT session...") + sess_options = ort.SessionOptions() + sess_options.enable_mem_pattern = False + + static_dims = not ort_args.dynamic_dims + batch_size = common_args.batch_size + image_size = common_args.image_size + provider = common_args.provider + + if static_dims: + hidden_batch_size = batch_size if (guidance_scale == 0.0) else batch_size * 2 + # Not necessary, but helps DML EP further optimize runtime performance. 
+ # batch_size is doubled for sample & hidden state because of classifier free guidance: + # https://github.com/huggingface/diffusers/blob/46c52f9b9607e6ecb29c782c052aea313e6487b7/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py#L672 + sess_options.add_free_dimension_override_by_name("unet_sample_batch", hidden_batch_size) + sess_options.add_free_dimension_override_by_name("unet_sample_channels", 4) + sess_options.add_free_dimension_override_by_name("unet_sample_height", image_size // 8) + sess_options.add_free_dimension_override_by_name("unet_sample_width", image_size // 8) + sess_options.add_free_dimension_override_by_name("unet_time_batch", 1) + sess_options.add_free_dimension_override_by_name("unet_hidden_batch", hidden_batch_size) + sess_options.add_free_dimension_override_by_name("unet_hidden_sequence", 77) + + provider_map = { + "dml": "DmlExecutionProvider", + "cuda": "CUDAExecutionProvider", + } + assert provider in provider_map, f"Unsupported provider: {provider}" + return OnnxStableDiffusionPipeline.from_pretrained( + model_dir, provider=provider_map[provider], sess_options=sess_options + ) diff --git a/OnnxStack.Converter/stable_diffusion/.gitignore b/OnnxStack.Converter/stable_diffusion/.gitignore new file mode 100644 index 0000000..324c183 --- /dev/null +++ b/OnnxStack.Converter/stable_diffusion/.gitignore @@ -0,0 +1,2 @@ +/footprints/ +/result_*.png diff --git a/OnnxStack.Converter/stable_diffusion/README.md b/OnnxStack.Converter/stable_diffusion/README.md new file mode 100644 index 0000000..a12b762 --- /dev/null +++ b/OnnxStack.Converter/stable_diffusion/README.md @@ -0,0 +1,180 @@ +# Stable Diffusion Optimization + +This folder contains sample use cases of Olive with ONNX Runtime and OpenVINO to optimize: +- Stable Diffusion: [Stable Diffusion v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4), [Stable Diffusion v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5), [Stable Diffusion v2](https://huggingface.co/stabilityai/stable-diffusion-2) +- Stable Diffusion XL: [Stable Diffusion XL Base](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0), [Stable Diffusion XL Refiner](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0) + +Stable Diffusion comprises multiple PyTorch models tied together into a *pipeline*. + +The ONNX Runtime optimization sample will convert each PyTorch model to ONNX, and then run the converted ONNX models through the `OrtTransformersOptimization` pass. The transformer optimization pass performs several time-consuming graph transformations that make the models more efficient for inference at runtime. + +The OpenVINO optimization sample will convert each PyTorch model to OpenVINO IR model by `OpenVINOConversion` pass, and create an `OpenVINOStableDiffusionPipeline` for inference. + +- ONNX Runtime with + - [CUDA EP](#stable-diffusion-and-stable-diffusion-xl-optimization-with-onnx-runtime-cuda-ep) + - DirectML EP: go to examples [Stable Diffusion](../directml/stable_diffusion/README.md), [Stable Diffusion XL](../directml/stable_diffusion_xl/README.md) +- [OpenVINO](#stable-diffusion-optimization-with-openvino) + +## Stable Diffusion and Stable Diffusion XL Optimization with ONNX Runtime CUDA EP + +This sample performs the following optimization workflow for each model in the Stable Diffusion pipeline: +- *PyTorch Model -> Onnx Model -> Transformers Optimized Onnx Model fp16* +

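+For reference, the `convert.py` script added in this folder drives the same per-submodel workflow: it loads the matching Olive config, points it at the input model, and hands it to `olive_run`. The snippet below is only a simplified sketch of that loop, not the full script; the `model_dir` value and the submodel list are illustrative.
+
+```python
+# Minimal sketch of the per-submodel Olive loop used by convert.py (simplified).
+import json
+from pathlib import Path
+
+from olive.workflows import run as olive_run
+
+script_dir = Path(__file__).resolve().parent
+model_dir = "stable-diffusion-v1-5"  # illustrative: path to the input diffusers/safetensors model
+
+for submodel_name in ["vae_encoder", "vae_decoder", "unet", "text_encoder"]:
+    # Each submodel has its own config_<name>.json describing the conversion and optimization passes.
+    with (script_dir / f"config_{submodel_name}.json").open() as fin:
+        olive_config = json.load(fin)
+    olive_config["input_model"]["config"]["model_path"] = model_dir
+    olive_run(olive_config)  # results are written to the footprints/ directory
+```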
+ +Transformers optimization uses the following optimizations to speed up Stable Diffusion in CUDA: +* [Flash Attention](https://arxiv.org/abs/2205.14135) for float16 precision. Flash Attention uses tiling to reduce the number of GPU memory reads/writes, and improves performance with less memory for long sequence lengths. The kernel requires GPUs of Compute Capability >= 7.5 (like T4, A100, and RTX 2060~4090). Only available on Linux. +* [Memory Efficient Attention](https://arxiv.org/abs/2112.05682v2) for float32 precision or older GPUs (like V100). We used the fused multi-head attention kernel in CUTLASS, and the kernel was contributed by xFormers. +* Channel-last (NHWC) convolution. For NVIDIA GPUs with Tensor Core support, NHWC tensor layout is recommended for convolution. See [Tensor Layouts In Memory: NCHW vs NHWC](https://docs.nvidia.com/deeplearning/performance/dl-performance-convolutional/index.html#tensor-layout). +* GroupNorm for NHWC tensor layout, and SkipGroupNorm fusion which fuses GroupNorm with Add bias and residual inputs. +* SkipLayerNormalization which fuses LayerNormalization with Add bias and residual inputs. +* BiasSplitGelu is a fusion of Add bias with SplitGelu activation. +* BiasAdd fuses Add bias and residual. +* Reduce Transpose nodes by graph transformation. + +#### Prerequisites +##### Clone the repository and install Olive + +Refer to the instructions in the [examples README](../README.md) to clone the repository and install Olive. + + +We use the same Olive workflow config files and scripts as the DirectML examples. The only difference is the `--provider cuda` option provided to the `stable_diffusion.py` and `stable_diffusion_xl.py` scripts. + +So, cd into the corresponding DirectML example folder from the root of the cloned repository: + +**_Stable Diffusion_** +```bash +cd examples/stable_diffusion +``` + +**_Stable Diffusion XL_** +```bash +cd examples/directml/stable_diffusion_xl +``` + +##### Install onnxruntime + +This example requires the latest onnxruntime-gpu code which can either be built from source or installed from the nightly builds. The following command can be used to install the latest nightly build of onnxruntime-gpu: + +```bash +# uninstall any pre-existing onnxruntime packages +pip uninstall -y onnxruntime onnxruntime-gpu onnxruntime-directml ort-nightly ort-nightly-gpu ort-nightly-directml + +# install onnxruntime-gpu nightly build +pip install ort-nightly-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple/ +``` + +##### Install other dependencies + +Install the necessary python packages: + +```bash +python -m pip install -r requirements-common.txt +``` + +#### Conversion to ONNX and Latency Optimization + +The easiest way to optimize the pipeline is with the `stable_diffusion.py` and `stable_diffusion_xl.py` scripts. These scripts will enumerate the `config_.json` files and optimize each with Olive, then gather the optimized models into a directory structure suitable for testing inference. 
+ +**_Stable Diffusion_** +```bash +# default model_id is "runwayml/stable-diffusion-v1-5" +python stable_diffusion.py --provider cuda --optimize +``` + +**_Stable Diffusion XL_** +```bash +# default model_id is "stabilityai/stable-diffusion-xl-base-1.0" +python stable_diffusion_xl.py --provider cuda --optimize [--use_fp16_fixed_vae] + +# or specify a different model_id +python stable_diffusion_xl.py --provider cuda --model_id stabilityai/stable-diffusion-xl-refiner-1.0 --optimize [--use_fp16_fixed_vae] +``` + +`--use_fp16_fixed_vae` is optional. If provided, will use [madebyollin/sdxl-vae-fp16-fix](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix) for the vae models and all sub-models will be entirely in fp16. +Otherwise, the vae models (vae-decoder for base and both vae-decoder and vae-encoder for refiner) will be in fp32 and all other sub-models will be in fp16 with fp32 input/output. + +Once the script successfully completes: +- The optimized ONNX pipeline will be stored under `models/optimized-cuda/[model_id]` (for example `models/optimized-cuda/runwayml/stable-diffusion-v1-5` or `models/optimized-cuda/stabilityai/stable-diffusion-xl-base-1.0`). +- The unoptimized ONNX pipeline (models converted to ONNX, but not run through transformer optimization pass) will be stored under `models/unoptimized/[model_id]` (for example `models/unoptimized/runwayml/stable-diffusion-v1-5` or `models/unoptimized/stabilityai/stable-diffusion-xl-base-1.0`). + +Re-running the script with `--optimize` will delete the output models, but it will *not* delete the Olive cache. Subsequent runs will complete much faster since it will simply be copying previously optimized models; you may use the `--clean_cache` option to start from scratch (not typically used unless you are modifying the scripts, for example). + +### Test Inference with CUDA + +Test ONNX runtime inference with the optimized models using `OnnxStableDiffusionPipeline`: + +**_Stable Diffusion_** +```bash +python stable_diffusion.py --provider cuda --num_images 2 +``` +Inference will loop until the generated image passes the safety checker (otherwise you would see black images). The result will be saved as `result_.png` on disk. + +**_Stable Diffusion XL_** +```bash +python stable_diffusion_xl.py --provider cuda --num_images 2 +``` +The result will be saved as `result_.png` on disk. + +Refer to the corresponding section in the DirectML READMEs for more details on the test inference options: +- [Stable Diffusion](../directml/stable_diffusion/README.md#test-inference) +- [Stable Diffusion XL](../directml/stable_diffusion_xl/README.md#test-inference) + + +## Stable Diffusion Optimization with OpenVINO + +**Contents**: +- [Setup](#setup) +- [Conversion to OpenVINO IR model](#convert-to-openvino-ir-model) +- [Test Inference](#test-inference-with-openvino) + +### Setup + +Olive is currently under pre-release, with constant updates and improvements to the functions and usage. This sample code will be frequently updated as Olive evolves, so it is important to install Olive from source when checking out this code from the main branch. See the [README for examples](https://github.com/microsoft/Olive/blob/main/examples/README.md#important) for detailed instructions on how to do this. + +**Alternatively**, you may install a stable release that we have validated. 
For example: + +``` +# Install Olive from main branch +pip install git+https://github.com/microsoft/Olive#egg=olive-ai[openvino] + +# Clone Olive repo to access sample code +git clone https://github.com/microsoft/olive +``` + +Once you've installed Olive, install the requirements for this sample matching the version of the library you are using: +``` +cd olive/examples/stable_diffusion +pip install -r requirements-ov.txt +``` + +### Convert to OpenVINO IR model + +The easiest way to optimize the pipeline is with the `stable_diffusion.py` helper script: + +``` +python stable_diffusion.py --optimize +``` + +The above command will enumerate the `config_.json` files and optimize each with Olive, then gather the optimized models into a directory structure suitable for testing inference. + +The Stable Diffusion models are large, and the optimization process is resource intensive. It is recommended to run optimization on a system with a minimum of 16GB of memory (preferably 32GB). Expect optimization to take several minutes (especially the U-Net model). + +Once the script successfully completes: +- The converted OpenVINO IR model will be stored under `models/optimized-openvino/[model_id]` (for example `models/optimized-openvino/runwayml/stable-diffusion-v1-5`). + +Re-running the script with `--optimize` will delete the output models, but it will *not* delete the Olive cache. Subsequent runs will complete much faster since it will simply be copying previously optimized models; you may use the `--clean_cache` option to start from scratch (not typically used unless you are modifying the scripts, for example). + +### Test Inference with OpenVINO + +This sample code is primarily intended to illustrate model optimization with Olive, but it also provides a simple interface for testing inference with the OpenVINO models. Inference is done by creating an `OVStableDiffusionPipeline` from the saved models. + + +``` +python stable_diffusion.py --inference --provider openvino +``` +Inference will loop until an image is generated. The result will be saved as `result_.png` on disk. + + +Run `python stable_diffusion.py --help` for additional options. A few particularly relevant ones: +- `--image_path `: the input image path for image-to-image inference. +- `--img_to_img_example`: image-to-image example. The default input image is `assets/dog.png`, the default prompt is `amazing watercolor painting`. diff --git a/OnnxStack.Converter/stable_diffusion/config.py b/OnnxStack.Converter/stable_diffusion/config.py new file mode 100644 index 0000000..f8cfccd --- /dev/null +++ b/OnnxStack.Converter/stable_diffusion/config.py @@ -0,0 +1,8 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- + +vae_sample_size = 512 +unet_sample_size = 64 +cross_attention_dim = 768 diff --git a/OnnxStack.Converter/stable_diffusion/config_controlnet.json b/OnnxStack.Converter/stable_diffusion/config_controlnet.json new file mode 100644 index 0000000..02902ea --- /dev/null +++ b/OnnxStack.Converter/stable_diffusion/config_controlnet.json @@ -0,0 +1,123 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "runwayml/stable-diffusion-v1-5", + "model_loader": "controlnet_unet_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "sample", "timestep", "encoder_hidden_states", "down_block_0_additional_residual", "down_block_1_additional_residual", "down_block_2_additional_residual", "down_block_3_additional_residual", "down_block_4_additional_residual", "down_block_5_additional_residual", "down_block_6_additional_residual", "down_block_7_additional_residual", "down_block_8_additional_residual", "down_block_9_additional_residual", "down_block_10_additional_residual", "down_block_11_additional_residual", "mid_block_additional_residual", "return_dict" ], + "output_names": [ "out_sample" ], + "dynamic_axes": { + "sample": {"0": "unet_sample_batch", "1": "unet_sample_channels", "2": "unet_sample_height", "3": "unet_sample_width"}, + "timestep": {"0": "unet_time_batch"}, + "encoder_hidden_states": {"0": "unet_hidden_batch", "1": "unet_hidden_sequence"}, + "down_block_0_additional_residual": {"0": "cnet_db0_batch", "1": "cnet_db0_channels", "2": "cnet_db0_height", "3": "cnet_db0_width"}, + "down_block_1_additional_residual": {"0": "cnet_db1_batch", "1": "cnet_db1_channels", "2": "cnet_db1_height", "3": "cnet_db1_width"}, + "down_block_2_additional_residual": {"0": "cnet_db2_batch", "1": "cnet_db2_channels", "2": "cnet_db2_height", "3": "cnet_db2_width"}, + "down_block_3_additional_residual": {"0": "cnet_db3_batch", "1": "cnet_db3_channels", "2": "cnet_db3_height2", "3": "cnet_db3_width2"}, + "down_block_4_additional_residual": {"0": "cnet_db4_batch", "1": "cnet_db4_channels", "2": "cnet_db4_height2", "3": "cnet_db4_width2"}, + "down_block_5_additional_residual": {"0": "cnet_db5_batch", "1": "cnet_db5_channels", "2": "cnet_db5_height2", "3": "cnet_db5_width2"}, + "down_block_6_additional_residual": {"0": "cnet_db6_batch", "1": "cnet_db6_channels", "2": "cnet_db6_height4", "3": "cnet_db6_width4"}, + "down_block_7_additional_residual": {"0": "cnet_db7_batch", "1": "cnet_db7_channels", "2": "cnet_db7_height4", "3": "cnet_db7_width4"}, + "down_block_8_additional_residual": {"0": "cnet_db8_batch", "1": "cnet_db8_channels", "2": "cnet_db8_height4", "3": "cnet_db8_width4"}, + "down_block_9_additional_residual": {"0": "cnet_db9_batch", "1": "cnet_db9_channels", "2": "cnet_db9_height8", "3": "cnet_db9_width8"}, + "down_block_10_additional_residual": {"0": "cnet_db10_batch", "1": "cnet_db10_channels", "2": "cnet_db10_height8", "3": "cnet_db10_width8"}, + "down_block_11_additional_residual": {"0": "cnet_db11_batch", "1": "cnet_db11_channels", "2": "cnet_db11_height8", "3": "cnet_db11_width8"}, + "mid_block_additional_residual": {"0": "cnet_mbar_batch", "1": "cnet_mbar_channels", "2": "cnet_mbar_height8", "3": "cnet_mbar_width8"} + } + }, + "dummy_inputs_func": "controlnet_unet_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + 
"evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "controlnet_unet_data_loader", + "batch_size": 2 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 14, + "save_as_external_data": true, + "all_tensors_to_one_file": true, + "external_data_name": "weights.pb" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "controlnet", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/stable_diffusion/config_safety_checker.json b/OnnxStack.Converter/stable_diffusion/config_safety_checker.json new file mode 100644 index 0000000..f5234a8 --- /dev/null +++ b/OnnxStack.Converter/stable_diffusion/config_safety_checker.json @@ -0,0 +1,124 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "runwayml/stable-diffusion-v1-5", + "model_loader": "safety_checker_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "clip_input", "images" ], + "output_names": [ "out_images", "has_nsfw_concepts" ], + "dynamic_axes": { + "clip_input": { "0": "batch", "1": "channels", "2": "height", "3": "width" }, + "images": { "0": "batch", "1": "height", "2": "width", "3": "channels" } + } + }, + "dummy_inputs_func": "safety_checker_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "safety_checker_data_loader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 14 + } + }, + "ov_convert": { + "type": "OpenVINOConversion", + "config": { + "user_script": "models.py", + "example_input_func": "safety_checker_conversion_inputs", + "output_model": "safety_checker" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + 
"enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "safety_checker", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/stable_diffusion/config_text_encoder.json b/OnnxStack.Converter/stable_diffusion/config_text_encoder.json new file mode 100644 index 0000000..db7115f --- /dev/null +++ b/OnnxStack.Converter/stable_diffusion/config_text_encoder.json @@ -0,0 +1,121 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "runwayml/stable-diffusion-v1-5", + "model_loader": "text_encoder_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "input_ids" ], + "output_names": [ "last_hidden_state", "pooler_output" ], + "dynamic_axes": { "input_ids": { "0": "batch", "1": "sequence" } } + }, + "dummy_inputs_func": "text_encoder_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "text_encoder_data_loader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 14 + } + }, + "ov_convert": { + "type": "OpenVINOConversion", + "config": { + "user_script": "models.py", + "example_input_func": "text_encoder_conversion_inputs", + "output_model": "text_encoder" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "clip", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": 
{ + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "clip", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "text_encoder", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/stable_diffusion/config_unet.json b/OnnxStack.Converter/stable_diffusion/config_unet.json new file mode 100644 index 0000000..d5e4ab2 --- /dev/null +++ b/OnnxStack.Converter/stable_diffusion/config_unet.json @@ -0,0 +1,128 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "runwayml/stable-diffusion-v1-5", + "model_loader": "unet_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "sample", "timestep", "encoder_hidden_states", "return_dict" ], + "output_names": [ "out_sample" ], + "dynamic_axes": { + "sample": {"0": "unet_sample_batch", "1": "unet_sample_channels", "2": "unet_sample_height", "3": "unet_sample_width"}, + "timestep": {"0": "unet_time_batch"}, + "encoder_hidden_states": {"0": "unet_hidden_batch", "1": "unet_hidden_sequence"} + } + }, + "dummy_inputs_func": "unet_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "unet_data_loader", + "batch_size": 2 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 14, + "save_as_external_data": true, + "all_tensors_to_one_file": true, + "external_data_name": "weights.pb" + } + }, + "ov_convert": { + "type": "OpenVINOConversion", + "config": { + "user_script": "models.py", + "example_input_func": "get_unet_ov_example_input", + "output_model": "unet" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": 
false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "unet", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/stable_diffusion/config_vae_decoder.json b/OnnxStack.Converter/stable_diffusion/config_vae_decoder.json new file mode 100644 index 0000000..40c42b8 --- /dev/null +++ b/OnnxStack.Converter/stable_diffusion/config_vae_decoder.json @@ -0,0 +1,121 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "runwayml/stable-diffusion-v1-5", + "model_loader": "vae_decoder_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "latent_sample", "return_dict" ], + "output_names": [ "sample" ], + "dynamic_axes": { "latent_sample": { "0": "batch", "1": "channels", "2": "height", "3": "width" } } + }, + "dummy_inputs_func": "vae_decoder_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "vae_decoder_data_loader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 14 + } + }, + "ov_convert": { + "type": "OpenVINOConversion", + "config": { + "user_script": "models.py", + "example_input_func": "vae_decoder_conversion_inputs", + "output_model": "vae_decoder" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "vae", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "vae", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "vae_decoder", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/stable_diffusion/config_vae_encoder.json b/OnnxStack.Converter/stable_diffusion/config_vae_encoder.json new file mode 100644 index 0000000..780b250 --- /dev/null +++ b/OnnxStack.Converter/stable_diffusion/config_vae_encoder.json @@ -0,0 +1,121 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "runwayml/stable-diffusion-v1-5", + "model_loader": "vae_encoder_load", + "model_script": "models.py", + "io_config": { + "input_names": 
[ "sample", "return_dict" ], + "output_names": [ "latent_sample" ], + "dynamic_axes": { "sample": { "0": "batch", "1": "channels", "2": "height", "3": "width" } } + }, + "dummy_inputs_func": "vae_encoder_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "vae_encoder_data_loader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 14 + } + }, + "ov_convert": { + "type": "OpenVINOConversion", + "config": { + "user_script": "models.py", + "example_input_func": "vae_encoder_conversion_inputs", + "output_model": "vae_encoder" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "vae", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "vae", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "vae_encoder", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/stable_diffusion/convert.py b/OnnxStack.Converter/stable_diffusion/convert.py new file mode 100644 index 0000000..c011d45 --- /dev/null +++ b/OnnxStack.Converter/stable_diffusion/convert.py @@ -0,0 +1,273 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- +import argparse +import json +import shutil +import sys +import warnings +from pathlib import Path +from typing import Dict + +import config +import torch +from diffusers import DiffusionPipeline +from packaging import version + +from olive.common.utils import set_tempdir +from olive.workflows import run as olive_run + + +# pylint: disable=redefined-outer-name +# ruff: noqa: TID252, T201 + + +def save_image(result, batch_size, provider, num_images, images_saved, image_callback=None): + passed_safety_checker = 0 + for image_index in range(batch_size): + if result.nsfw_content_detected is None or not result.nsfw_content_detected[image_index]: + passed_safety_checker += 1 + if images_saved < num_images: + output_path = f"result_{images_saved}.png" + result.images[image_index].save(output_path) + if image_callback: + image_callback(images_saved, output_path) + images_saved += 1 + print(f"Generated {output_path}") + print(f"Inference Batch End ({passed_safety_checker}/{batch_size} images).") + print("Images passed the safety checker.") + return images_saved + + +def run_inference_loop( + pipeline, + prompt, + num_images, + batch_size, + image_size, + num_inference_steps, + guidance_scale, + strength: float, + provider: str, + image_callback=None, + step_callback=None, +): + images_saved = 0 + + def update_steps(step, timestep, latents): + if step_callback: + step_callback((images_saved // batch_size) * num_inference_steps + step) + + while images_saved < num_images: + print(f"\nInference Batch Start (batch size = {batch_size}).") + + kwargs = {} + + result = pipeline( + [prompt] * batch_size, + num_inference_steps=num_inference_steps, + callback=update_steps if step_callback else None, + height=image_size, + width=image_size, + guidance_scale=guidance_scale, + **kwargs, + ) + + images_saved = save_image(result, batch_size, provider, num_images, images_saved, image_callback) + + +def update_config_with_provider(config: Dict, provider: str): + if provider == "dml": + # DirectML EP is the default, so no need to update config. + return config + elif provider == "cuda": + from sd_utils.ort import update_cuda_config + + return update_cuda_config(config) + else: + raise ValueError(f"Unsupported provider: {provider}") + + +def optimize( + model_input: str, + model_output: Path, + provider: str, + controlnet: bool +): + from google.protobuf import __version__ as protobuf_version + + # protobuf 4.x aborts with OOM when optimizing unet + if version.parse(protobuf_version) > version.parse("3.20.3"): + print("This script requires protobuf 3.20.3. Please ensure your package version matches requirements.txt.") + sys.exit(1) + + model_dir = model_input + script_dir = Path(__file__).resolve().parent + + # Clean up previously optimized models, if any. + shutil.rmtree(script_dir / "footprints", ignore_errors=True) + shutil.rmtree(model_output, ignore_errors=True) + + + # Load the entire PyTorch pipeline to ensure all models and their configurations are downloaded and cached. + # This avoids an issue where the non-ONNX components (tokenizer, scheduler, and feature extractor) are not + # automatically cached correctly if individual models are fetched one at a time. 
+ print("Download stable diffusion PyTorch pipeline...") + pipeline = DiffusionPipeline.from_pretrained(model_dir, torch_dtype=torch.float32, **{"local_files_only": True}) + config.vae_sample_size = pipeline.vae.config.sample_size + config.cross_attention_dim = pipeline.unet.config.cross_attention_dim + config.unet_sample_size = pipeline.unet.config.sample_size + + model_info = {} + + submodel_names = ["vae_encoder", "vae_decoder", "unet" , "text_encoder"] + + has_safety_checker = getattr(pipeline, "safety_checker", None) is not None + + if has_safety_checker: + submodel_names.append("safety_checker") + + if controlnet: + submodel_names.append("controlnet") + + for submodel_name in submodel_names: + print(f"\nOptimizing {submodel_name}") + + olive_config = None + with (script_dir / f"config_{submodel_name}.json").open() as fin: + olive_config = json.load(fin) + olive_config = update_config_with_provider(olive_config, provider) + + if submodel_name in ("unet", "controlnet", "text_encoder"): + olive_config["input_model"]["config"]["model_path"] = model_dir + else: + # Only the unet & text encoder are affected by LoRA, so it's better to use the base model ID for + # other models: the Olive cache is based on the JSON config, and two LoRA variants with the same + # base model ID should be able to reuse previously optimized copies. + olive_config["input_model"]["config"]["model_path"] = model_dir + + run_res = olive_run(olive_config) + + from sd_utils.ort import save_optimized_onnx_submodel + + save_optimized_onnx_submodel(submodel_name, provider, model_info) + + from sd_utils.ort import save_onnx_pipeline + + save_onnx_pipeline( + has_safety_checker, model_info, model_output, pipeline, submodel_names + ) + + return model_info + + +def parse_common_args(raw_args): + parser = argparse.ArgumentParser("Common arguments") + + parser.add_argument("--model_input", default="stable-diffusion-v1-5", type=str) + parser.add_argument("--model_output", default="stable-diffusion-v1-5", type=Path) + parser.add_argument("--controlnet",action="store_true", help="Create ControlNet Unet Model") + parser.add_argument( + "--provider", default="dml", type=str, choices=["dml", "cuda"], help="Execution provider to use" + ) + parser.add_argument("--optimize", action="store_true", help="Runs the optimization step") + parser.add_argument("--clean_cache", action="store_true", help="Deletes the Olive cache") + parser.add_argument("--test_unoptimized", action="store_true", help="Use unoptimized model for inference") + parser.add_argument("--batch_size", default=1, type=int, help="Number of images to generate per batch") + parser.add_argument( + "--prompt", + default=( + "castle surrounded by water and nature, village, volumetric lighting, photorealistic, " + "detailed and intricate, fantasy, epic cinematic shot, mountains, 8k ultra hd" + ), + type=str, + ) + parser.add_argument( + "--guidance_scale", + default=7.5, + type=float, + help="Guidance scale as defined in Classifier-Free Diffusion Guidance", + ) + parser.add_argument("--num_images", default=1, type=int, help="Number of images to generate") + parser.add_argument("--num_inference_steps", default=50, type=int, help="Number of steps in diffusion process") + parser.add_argument("--tempdir", default=None, type=str, help="Root directory for tempfile directories and files") + parser.add_argument( + "--strength", + default=1.0, + type=float, + help="Value between 0.0 and 1.0, that controls the amount of noise that is added to the input image. 
" + "Values that approach 1.0 enable lots of variations but will also produce images " + "that are not semantically consistent with the input.", + ) + parser.add_argument("--image_size", default=512, type=int, help="Width and height of the images to generate") + + return parser.parse_known_args(raw_args) + + +def parse_ort_args(raw_args): + parser = argparse.ArgumentParser("ONNX Runtime arguments") + + parser.add_argument( + "--static_dims", + action="store_true", + help="DEPRECATED (now enabled by default). Use --dynamic_dims to disable static_dims.", + ) + parser.add_argument("--dynamic_dims", action="store_true", help="Disable static shape optimization") + + return parser.parse_known_args(raw_args) + + +def main(raw_args=None): + common_args, extra_args = parse_common_args(raw_args) + + provider = common_args.provider + model_input = common_args.model_input + model_output = common_args.model_output + + script_dir = Path(__file__).resolve().parent + + + if common_args.clean_cache: + shutil.rmtree(script_dir / "cache", ignore_errors=True) + + guidance_scale = common_args.guidance_scale + + ort_args = None, None + ort_args, extra_args = parse_ort_args(extra_args) + + if common_args.optimize or not model_output.exists(): + set_tempdir(common_args.tempdir) + + # TODO(jstoecker): clean up warning filter (mostly during conversion from torch to ONNX) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + + from sd_utils.ort import validate_args + + validate_args(ort_args, common_args.provider) + optimize(common_args.model_input, common_args.model_output, common_args.provider, common_args.controlnet) + + if not common_args.optimize: + model_dir = model_output / "F32" if common_args.test_unoptimized else model_output / "F16" + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + + from sd_utils.ort import get_ort_pipeline + + pipeline = get_ort_pipeline(model_dir, common_args, ort_args, guidance_scale) + run_inference_loop( + pipeline, + common_args.prompt, + common_args.num_images, + common_args.batch_size, + common_args.image_size, + common_args.num_inference_steps, + guidance_scale, + common_args.strength, + provider=provider, + ) + + +if __name__ == "__main__": + main() diff --git a/OnnxStack.Converter/stable_diffusion/models.py b/OnnxStack.Converter/stable_diffusion/models.py new file mode 100644 index 0000000..196135d --- /dev/null +++ b/OnnxStack.Converter/stable_diffusion/models.py @@ -0,0 +1,342 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- +import config +import torch +from typing import Union, Optional, Tuple +from diffusers import AutoencoderKL, UNet2DConditionModel, ControlNetModel +from diffusers.models.controlnet import ControlNetOutput, BaseOutput as ControlNetBaseOutput +from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker +from transformers.models.clip.modeling_clip import CLIPTextModel +from dataclasses import dataclass + +# Helper latency-only dataloader that creates random tensors with no label +class RandomDataLoader: + def __init__(self, create_inputs_func, batchsize, torch_dtype): + self.create_input_func = create_inputs_func + self.batchsize = batchsize + self.torch_dtype = torch_dtype + + def __getitem__(self, idx): + label = None + return self.create_input_func(self.batchsize, self.torch_dtype), label + + + +# ----------------------------------------------------------------------------- +# TEXT ENCODER +# ----------------------------------------------------------------------------- + + +def text_encoder_inputs(batchsize, torch_dtype): + return torch.zeros((batchsize, 77), dtype=torch_dtype) + + +def text_encoder_load(model_name): + model = CLIPTextModel.from_pretrained(model_name, subfolder="text_encoder") + if is_lora_model(model_name): + merge_lora_weights(model, model_name, "text_encoder") + return model + + +def text_encoder_conversion_inputs(model=None): + return text_encoder_inputs(1, torch.int32) + + +def text_encoder_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(text_encoder_inputs, batchsize, torch.int32) + + +# ----------------------------------------------------------------------------- +# UNET +# ----------------------------------------------------------------------------- + + +def unet_inputs(batchsize, torch_dtype, is_conversion_inputs=False): + # TODO(jstoecker): Rename onnx::Concat_4 to text_embeds and onnx::Shape_5 to time_ids + inputs = { + "sample": torch.rand((batchsize, 4, config.unet_sample_size, config.unet_sample_size), dtype=torch_dtype), + "timestep": torch.rand((batchsize,), dtype=torch_dtype), + "encoder_hidden_states": torch.rand((batchsize, 77, config.cross_attention_dim), dtype=torch_dtype), + } + + # use as kwargs since they won't be in the correct position if passed along with the tuple of inputs + kwargs = { + "return_dict": False, + } + if is_conversion_inputs: + inputs["additional_inputs"] = { + **kwargs, + "added_cond_kwargs": { + "text_embeds": torch.rand((1, 1280), dtype=torch_dtype), + "time_ids": torch.rand((1, 5), dtype=torch_dtype), + }, + } + else: + inputs.update(kwargs) + inputs["onnx::Concat_4"] = torch.rand((1, 1280), dtype=torch_dtype) + inputs["onnx::Shape_5"] = torch.rand((1, 5), dtype=torch_dtype) + + return inputs + + +def get_unet_ov_example_input(): + import numpy as np + + encoder_hidden_state = torch.ones((2, 77, 768)) + latents_shape = (2, 4, 512 // 8, 512 // 8) + latents = torch.randn(latents_shape) + t = torch.from_numpy(np.array(1, dtype=float)) + return (latents, t, encoder_hidden_state) + + +def unet_load(model_name): + model = UNet2DConditionModel.from_pretrained(model_name, subfolder="unet") + if is_lora_model(model_name): + merge_lora_weights(model, model_name, "unet") + return model + + +def unet_conversion_inputs(model=None): + return tuple(unet_inputs(1, torch.float32, True).values()) + + +def unet_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(unet_inputs, 
batchsize, torch.float16) + +# ----------------------------------------------------------------------------- +# CONTROLNET - UNET +# ----------------------------------------------------------------------------- + +class PatchedUNet2DConditionModel(UNet2DConditionModel): + def forward( + self, + sample: torch.FloatTensor, + timestep: Union[torch.Tensor, float, int], + encoder_hidden_states: torch.Tensor, + down_block_0_additional_residual: torch.Tensor, + down_block_1_additional_residual: torch.Tensor, + down_block_2_additional_residual: torch.Tensor, + down_block_3_additional_residual: torch.Tensor, + down_block_4_additional_residual: torch.Tensor, + down_block_5_additional_residual: torch.Tensor, + down_block_6_additional_residual: torch.Tensor, + down_block_7_additional_residual: torch.Tensor, + down_block_8_additional_residual: torch.Tensor, + down_block_9_additional_residual: torch.Tensor, + down_block_10_additional_residual: torch.Tensor, + down_block_11_additional_residual: torch.Tensor, + mid_block_additional_residual: torch.Tensor, + ) -> Union[UNet2DConditionModel, Tuple]: + down_block_add_res = ( + down_block_0_additional_residual, down_block_1_additional_residual, down_block_2_additional_residual, + down_block_3_additional_residual, down_block_4_additional_residual, down_block_5_additional_residual, + down_block_6_additional_residual, down_block_7_additional_residual, down_block_8_additional_residual, + down_block_9_additional_residual, down_block_10_additional_residual, down_block_11_additional_residual) + return super().forward( + sample = sample, + timestep = timestep, + encoder_hidden_states = encoder_hidden_states, + down_block_additional_residuals = down_block_add_res, + mid_block_additional_residual = mid_block_additional_residual, + return_dict = False + ) + +def controlnet_unet_inputs(batchsize, torch_dtype): + return { + "sample": torch.rand((batchsize, 4, config.unet_sample_size, config.unet_sample_size), dtype=torch_dtype), + "timestep": torch.rand((batchsize,), dtype=torch_dtype), + "encoder_hidden_states": torch.rand((batchsize, 77, config.cross_attention_dim), dtype=torch_dtype), + "down_block_0_additional_residual": torch.rand((batchsize, 320, config.unet_sample_size, config.unet_sample_size), dtype=torch_dtype), + "down_block_1_additional_residual": torch.rand((batchsize, 320, config.unet_sample_size, config.unet_sample_size), dtype=torch_dtype), + "down_block_2_additional_residual": torch.rand((batchsize, 320, config.unet_sample_size, config.unet_sample_size), dtype=torch_dtype), + "down_block_3_additional_residual": torch.rand((batchsize, 320, config.unet_sample_size // 2, config.unet_sample_size // 2), dtype=torch_dtype), + "down_block_4_additional_residual": torch.rand((batchsize, 640, config.unet_sample_size // 2, config.unet_sample_size // 2), dtype=torch_dtype), + "down_block_5_additional_residual": torch.rand((batchsize, 640, config.unet_sample_size // 2, config.unet_sample_size // 2), dtype=torch_dtype), + "down_block_6_additional_residual": torch.rand((batchsize, 640, config.unet_sample_size // 4, config.unet_sample_size // 4), dtype=torch_dtype), + "down_block_7_additional_residual": torch.rand((batchsize, 1280, config.unet_sample_size // 4, config.unet_sample_size // 4), dtype=torch_dtype), + "down_block_8_additional_residual": torch.rand((batchsize, 1280, config.unet_sample_size // 4, config.unet_sample_size // 4), dtype=torch_dtype), + "down_block_9_additional_residual": torch.rand((batchsize, 1280, config.unet_sample_size // 8, 
config.unet_sample_size // 8), dtype=torch_dtype), + "down_block_10_additional_residual": torch.rand((batchsize, 1280, config.unet_sample_size // 8, config.unet_sample_size // 8), dtype=torch_dtype), + "down_block_11_additional_residual": torch.rand((batchsize, 1280, config.unet_sample_size // 8, config.unet_sample_size // 8), dtype=torch_dtype), + "mid_block_additional_residual": torch.rand((batchsize, 1280, config.unet_sample_size // 8, config.unet_sample_size // 8), dtype=torch_dtype) + } + + +def controlnet_unet_load(model_name): + model = PatchedUNet2DConditionModel.from_pretrained(model_name, subfolder="unet") + return model + + +def controlnet_unet_conversion_inputs(model): + return tuple(controlnet_unet_inputs(1, torch.float32).values()) + + +def controlnet_unet_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(controlnet_unet_inputs, batchsize, torch.float16) + +# ----------------------------------------------------------------------------- +# VAE ENCODER +# ----------------------------------------------------------------------------- + + +def vae_encoder_inputs(batchsize, torch_dtype): + return {"sample": torch.rand((batchsize, 3, config.vae_sample_size, config.vae_sample_size), dtype=torch_dtype)} + + +def vae_encoder_load(model_name): + model = AutoencoderKL.from_pretrained(model_name, subfolder="vae") + model.forward = lambda sample: model.encode(sample)[0].sample() + return model + + +def vae_encoder_conversion_inputs(model=None): + return tuple(vae_encoder_inputs(1, torch.float32).values()) + + +def vae_encoder_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(vae_encoder_inputs, batchsize, torch.float16) + + +# ----------------------------------------------------------------------------- +# VAE DECODER +# ----------------------------------------------------------------------------- + + +def vae_decoder_inputs(batchsize, torch_dtype): + return { + "latent_sample": torch.rand((batchsize, 4, config.unet_sample_size, config.unet_sample_size), dtype=torch_dtype) + } + + +def vae_decoder_load(model_name): + model = AutoencoderKL.from_pretrained(model_name, subfolder="vae") + model.forward = model.decode + return model + + +def vae_decoder_conversion_inputs(model=None): + return tuple(vae_decoder_inputs(1, torch.float32).values()) + + +def vae_decoder_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(vae_decoder_inputs, batchsize, torch.float16) + + +# ----------------------------------------------------------------------------- +# SAFETY CHECKER +# ----------------------------------------------------------------------------- + + +def safety_checker_inputs(batchsize, torch_dtype): + return { + "clip_input": torch.rand((batchsize, 3, 224, 224), dtype=torch_dtype), + "images": torch.rand((batchsize, config.vae_sample_size, config.vae_sample_size, 3), dtype=torch_dtype), + } + + +def safety_checker_load(model_name): + model = StableDiffusionSafetyChecker.from_pretrained(model_name, subfolder="safety_checker") + model.forward = model.forward_onnx + return model + + +def safety_checker_conversion_inputs(model=None): + return tuple(safety_checker_inputs(1, torch.float32).values()) + + +def safety_checker_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(safety_checker_inputs, batchsize, torch.float16) + + +# ----------------------------------------------------------------------------- +# LoRA weights +# ----------------------------------------------------------------------------- 
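+# The merge_lora_weights helper below folds each LoRA low-rank update directly into the
+# base model's attention projection weights, i.e. W <- W + scale * (lora_up @ lora_down)
+# for the to_q, to_k, to_v and to_out[0] weights of every attention block.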
+ +def is_lora_model(model_name): + # TODO(jstoecker): might be a better way to detect (e.g. presence of LORA weights file) + return False + + +# Merges LoRA weights into the layers of a base model +def merge_lora_weights(base_model, lora_model_id, submodel_name="unet", scale=1.0): + import inspect + from collections import defaultdict + from functools import reduce + + try: + from diffusers.loaders import LORA_WEIGHT_NAME + except ImportError: + # moved in version 0.24.0 + from diffusers.loaders.lora import LORA_WEIGHT_NAME + from diffusers.models.attention_processor import LoRAAttnProcessor + from diffusers.utils.hub_utils import _get_model_file + + parameters = inspect.signature(_get_model_file).parameters + + kwargs = {} + if "use_auth_token" in parameters: + kwargs["use_auth_token"] = None + elif "token" in parameters: + kwargs["token"] = None + + # Load LoRA weights + model_file = _get_model_file( + lora_model_id, + weights_name=LORA_WEIGHT_NAME, + cache_dir=None, + force_download=False, + resume_download=False, + proxies=None, + local_files_only=False, + revision=None, + subfolder=None, + user_agent={ + "file_type": "attn_procs_weights", + "framework": "pytorch", + }, + **kwargs, + ) + lora_state_dict = torch.load(model_file, map_location="cpu") + + # All keys in the LoRA state dictionary should have 'lora' somewhere in the string. + keys = list(lora_state_dict.keys()) + assert all("lora" in k for k in keys) + + if all(key.startswith(submodel_name) for key in keys): + # New format (https://github.com/huggingface/diffusers/pull/2918) supports LoRA weights in both the + # unet and text encoder where keys are prefixed with 'unet' or 'text_encoder', respectively. + submodel_state_dict = {k: v for k, v in lora_state_dict.items() if k.startswith(submodel_name)} + else: + # Old format. Keys will not have any prefix. This only applies to unet, so exit early if this is + # optimizing the text encoder. 
+ if submodel_name != "unet": + return + submodel_state_dict = lora_state_dict + + # Group LoRA weights into attention processors + attn_processors = {} + lora_grouped_dict = defaultdict(dict) + for key, value in submodel_state_dict.items(): + attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:]) + lora_grouped_dict[attn_processor_key][sub_key] = value + + for key, value_dict in lora_grouped_dict.items(): + rank = value_dict["to_k_lora.down.weight"].shape[0] + cross_attention_dim = value_dict["to_k_lora.down.weight"].shape[1] + hidden_size = value_dict["to_k_lora.up.weight"].shape[0] + + attn_processors[key] = LoRAAttnProcessor( + hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, rank=rank + ) + attn_processors[key].load_state_dict(value_dict) + + # Merge LoRA attention processor weights into existing Q/K/V/Out weights + for name, proc in attn_processors.items(): + attention_name = name[: -len(".processor")] + attention = reduce(getattr, attention_name.split(sep="."), base_model) + attention.to_q.weight.data += scale * torch.mm(proc.to_q_lora.up.weight, proc.to_q_lora.down.weight) + attention.to_k.weight.data += scale * torch.mm(proc.to_k_lora.up.weight, proc.to_k_lora.down.weight) + attention.to_v.weight.data += scale * torch.mm(proc.to_v_lora.up.weight, proc.to_v_lora.down.weight) + attention.to_out[0].weight.data += scale * torch.mm(proc.to_out_lora.up.weight, proc.to_out_lora.down.weight) diff --git a/OnnxStack.Converter/stable_diffusion/requirements.txt b/OnnxStack.Converter/stable_diffusion/requirements.txt new file mode 100644 index 0000000..15b9198 --- /dev/null +++ b/OnnxStack.Converter/stable_diffusion/requirements.txt @@ -0,0 +1,9 @@ +accelerate +diffusers +onnx +pillow +protobuf==3.20.3 # protobuf 4.x aborts with OOM when optimizing unet +tabulate +torch +transformers +onnxruntime-directml>=1.16.0 diff --git a/OnnxStack.Converter/stable_diffusion/sd_utils/ort.py b/OnnxStack.Converter/stable_diffusion/sd_utils/ort.py new file mode 100644 index 0000000..ad49818 --- /dev/null +++ b/OnnxStack.Converter/stable_diffusion/sd_utils/ort.py @@ -0,0 +1,172 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +import os +import json +import shutil +import sys +from pathlib import Path +from typing import Dict + +import onnxruntime as ort +from diffusers import OnnxRuntimeModel, OnnxStableDiffusionPipeline +from onnxruntime import __version__ as OrtVersion +from packaging import version + +from olive.model import ONNXModelHandler + +# ruff: noqa: TID252, T201 + + +def update_cuda_config(config: Dict): + if version.parse(OrtVersion) < version.parse("1.17.0"): + # disable skip_group_norm fusion since there is a shape inference bug which leads to invalid models + config["passes"]["optimize_cuda"]["config"]["optimization_options"] = {"enable_skip_group_norm": False} + config["pass_flows"] = [["convert", "optimize_cuda"]] + config["systems"]["local_system"]["config"]["accelerators"][0]["execution_providers"] = ["CUDAExecutionProvider"] + return config + + +def validate_args(args, provider): + ort.set_default_logger_severity(4) + if args.static_dims: + print( + "WARNING: the --static_dims option is deprecated, and static shape optimization is enabled by default. " + "Use --dynamic_dims to disable static shape optimization." 
+ ) + + validate_ort_version(provider) + + +def validate_ort_version(provider: str): + if provider == "dml" and version.parse(OrtVersion) < version.parse("1.16.0"): + print("This script requires onnxruntime-directml 1.16.0 or newer") + sys.exit(1) + elif provider == "cuda" and version.parse(OrtVersion) < version.parse("1.17.0"): + if version.parse(OrtVersion) < version.parse("1.16.2"): + print("This script requires onnxruntime-gpu 1.16.2 or newer") + sys.exit(1) + print( + f"WARNING: onnxruntime {OrtVersion} has known issues with shape inference for SkipGroupNorm. Will disable" + " skip_group_norm fusion. onnxruntime-gpu 1.17.0 or newer is strongly recommended!" + ) + + +def save_optimized_onnx_submodel(submodel_name, provider, model_info): + footprints_file_path = ( + Path(__file__).resolve().parents[1] / "footprints" / f"{submodel_name}_gpu-{provider}_footprints.json" + ) + with footprints_file_path.open("r") as footprint_file: + footprints = json.load(footprint_file) + + conversion_footprint = None + optimizer_footprint = None + for footprint in footprints.values(): + if footprint["from_pass"] == "OnnxConversion": + conversion_footprint = footprint + elif footprint["from_pass"] == "OrtTransformersOptimization": + optimizer_footprint = footprint + + assert conversion_footprint + assert optimizer_footprint + + unoptimized_olive_model = ONNXModelHandler(**conversion_footprint["model_config"]["config"]) + optimized_olive_model = ONNXModelHandler(**optimizer_footprint["model_config"]["config"]) + + model_info[submodel_name] = { + "unoptimized": { + "path": Path(unoptimized_olive_model.model_path), + }, + "optimized": { + "path": Path(optimized_olive_model.model_path), + }, + } + + print(f"Unoptimized Model : {model_info[submodel_name]['unoptimized']['path']}") + print(f"Optimized Model : {model_info[submodel_name]['optimized']['path']}") + + +def save_onnx_pipeline( + has_safety_checker, model_info, model_output, pipeline, submodel_names +): + # Save the unoptimized models in a directory structure that the diffusers library can load and run. + # This is optional, and the optimized models can be used directly in a custom pipeline if desired. 
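+ # The pipeline is written to <model_output>/Default (plain ONNX conversions) and
+ # <model_output>/Optimized (transformer-optimized copies), one sub-folder per submodel.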
+ print("\nCreating ONNX pipeline...") + + optimized_model_dir = model_output / "Optimized" + unoptimized_model_dir = model_output / "Default" + has_controlnet = 'controlnet' in submodel_names + if has_safety_checker: + safety_checker = OnnxRuntimeModel.from_pretrained(model_info["safety_checker"]["unoptimized"]["path"].parent) + else: + safety_checker = None + + onnx_pipeline = OnnxStableDiffusionPipeline( + vae_encoder=OnnxRuntimeModel.from_pretrained(model_info["vae_encoder"]["unoptimized"]["path"].parent), + vae_decoder=OnnxRuntimeModel.from_pretrained(model_info["vae_decoder"]["unoptimized"]["path"].parent), + text_encoder=OnnxRuntimeModel.from_pretrained(model_info["text_encoder"]["unoptimized"]["path"].parent), + tokenizer=pipeline.tokenizer, + unet=OnnxRuntimeModel.from_pretrained(model_info["unet"]["unoptimized"]["path"].parent), + scheduler=pipeline.scheduler, + safety_checker=safety_checker, + feature_extractor=pipeline.feature_extractor, + requires_safety_checker=True, + ) + + if has_controlnet: + controlnet=OnnxRuntimeModel.from_pretrained(model_info["controlnet"]["unoptimized"]["path"].parent) + + print("Saving unoptimized models...") + onnx_pipeline.save_pretrained(unoptimized_model_dir) + if has_controlnet: + controlnet.save_pretrained(unoptimized_model_dir / "controlnet" ) + + # Create a copy of the unoptimized model directory, then overwrite with optimized models from the olive cache. + print("Copying optimized models...") + shutil.copytree(unoptimized_model_dir, optimized_model_dir, ignore=shutil.ignore_patterns("weights.pb")) + for submodel_name in submodel_names: + src_path = model_info[submodel_name]["optimized"]["path"] + dst_path = optimized_model_dir / submodel_name / "model.onnx" + exists = os.path.exists(dst_path) + if not exists: + os.mkdir(optimized_model_dir / submodel_name) + shutil.copyfile(src_path, dst_path) + + print(f"The default pipeline is located here: {unoptimized_model_dir}") + print(f"The optimized pipeline is located here: {optimized_model_dir}") + + +def get_ort_pipeline(model_dir, common_args, ort_args, guidance_scale): + ort.set_default_logger_severity(3) + + print("Loading models into ORT session...") + sess_options = ort.SessionOptions() + sess_options.enable_mem_pattern = False + + static_dims = not ort_args.dynamic_dims + batch_size = common_args.batch_size + image_size = common_args.image_size + provider = common_args.provider + + if static_dims: + hidden_batch_size = batch_size if (guidance_scale == 0.0) else batch_size * 2 + # Not necessary, but helps DML EP further optimize runtime performance. 
+ # batch_size is doubled for sample & hidden state because of classifier free guidance: + # https://github.com/huggingface/diffusers/blob/46c52f9b9607e6ecb29c782c052aea313e6487b7/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py#L672 + sess_options.add_free_dimension_override_by_name("unet_sample_batch", hidden_batch_size) + sess_options.add_free_dimension_override_by_name("unet_sample_channels", 4) + sess_options.add_free_dimension_override_by_name("unet_sample_height", image_size // 8) + sess_options.add_free_dimension_override_by_name("unet_sample_width", image_size // 8) + sess_options.add_free_dimension_override_by_name("unet_time_batch", 1) + sess_options.add_free_dimension_override_by_name("unet_hidden_batch", hidden_batch_size) + sess_options.add_free_dimension_override_by_name("unet_hidden_sequence", 77) + + provider_map = { + "dml": "DmlExecutionProvider", + "cuda": "CUDAExecutionProvider", + } + assert provider in provider_map, f"Unsupported provider: {provider}" + return OnnxStableDiffusionPipeline.from_pretrained( + model_dir, provider=provider_map[provider], sess_options=sess_options + ) From 1f50960e7444f37ee269fd02242ea69efb81fc27 Mon Sep 17 00:00:00 2001 From: sa_ddam213 Date: Thu, 18 Apr 2024 21:35:31 +1200 Subject: [PATCH 2/9] Stable Cascade converter --- OnnxStack.Converter/stable_cascade/.gitignore | 2 + OnnxStack.Converter/stable_cascade/README.md | 180 ++++++++++++ OnnxStack.Converter/stable_cascade/config.py | 8 + .../stable_cascade/config_decoder.json | 121 ++++++++ .../stable_cascade/config_prior.json | 122 ++++++++ .../stable_cascade/config_text_encoder.json | 121 ++++++++ OnnxStack.Converter/stable_cascade/convert.py | 273 ++++++++++++++++++ OnnxStack.Converter/stable_cascade/models.py | 116 ++++++++ .../stable_cascade/requirements.txt | 9 + .../stable_cascade/sd_utils/ort.py | 163 +++++++++++ 10 files changed, 1115 insertions(+) create mode 100644 OnnxStack.Converter/stable_cascade/.gitignore create mode 100644 OnnxStack.Converter/stable_cascade/README.md create mode 100644 OnnxStack.Converter/stable_cascade/config.py create mode 100644 OnnxStack.Converter/stable_cascade/config_decoder.json create mode 100644 OnnxStack.Converter/stable_cascade/config_prior.json create mode 100644 OnnxStack.Converter/stable_cascade/config_text_encoder.json create mode 100644 OnnxStack.Converter/stable_cascade/convert.py create mode 100644 OnnxStack.Converter/stable_cascade/models.py create mode 100644 OnnxStack.Converter/stable_cascade/requirements.txt create mode 100644 OnnxStack.Converter/stable_cascade/sd_utils/ort.py diff --git a/OnnxStack.Converter/stable_cascade/.gitignore b/OnnxStack.Converter/stable_cascade/.gitignore new file mode 100644 index 0000000..324c183 --- /dev/null +++ b/OnnxStack.Converter/stable_cascade/.gitignore @@ -0,0 +1,2 @@ +/footprints/ +/result_*.png diff --git a/OnnxStack.Converter/stable_cascade/README.md b/OnnxStack.Converter/stable_cascade/README.md new file mode 100644 index 0000000..a12b762 --- /dev/null +++ b/OnnxStack.Converter/stable_cascade/README.md @@ -0,0 +1,180 @@ +# Stable Diffusion Optimization + +This folder contains sample use cases of Olive with ONNX Runtime and OpenVINO to optimize: +- Stable Diffusion: [Stable Diffusion v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4), [Stable Diffusion v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5), [Stable Diffusion v2](https://huggingface.co/stabilityai/stable-diffusion-2) +- Stable Diffusion XL: [Stable Diffusion XL 
Base](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0), [Stable Diffusion XL Refiner](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0) + +Stable Diffusion comprises multiple PyTorch models tied together into a *pipeline*. + +The ONNX Runtime optimization sample will convert each PyTorch model to ONNX, and then run the converted ONNX models through the `OrtTransformersOptimization` pass. The transformer optimization pass performs several time-consuming graph transformations that make the models more efficient for inference at runtime. + +The OpenVINO optimization sample will convert each PyTorch model to OpenVINO IR model by `OpenVINOConversion` pass, and create an `OpenVINOStableDiffusionPipeline` for inference. + +- ONNX Runtime with + - [CUDA EP](#stable-diffusion-and-stable-diffusion-xl-optimization-with-onnx-runtime-cuda-ep) + - DirectML EP: go to examples [Stable Diffusion](../directml/stable_diffusion/README.md), [Stable Diffusion XL](../directml/stable_diffusion_xl/README.md) +- [OpenVINO](#stable-diffusion-optimization-with-openvino) + +## Stable Diffusion and Stable Diffusion XL Optimization with ONNX Runtime CUDA EP + +This sample performs the following optimization workflow for each model in the Stable Diffusion pipeline: +- *PyTorch Model -> Onnx Model -> Transformers Optimized Onnx Model fp16* +

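The per-model `config_*.json` files used by these samples express that two-step workflow as an Olive pass flow. Below is only a trimmed sketch of the relevant fields (the names and values mirror the UNet config added in this patch; it is not a complete workflow config):

```python
# Trimmed sketch of the per-submodel Olive workflow: an ONNX conversion pass
# followed by ONNX Runtime transformer optimization with float16 enabled.
workflow_sketch = {
    "passes": {
        "convert": {
            "type": "OnnxConversion",
            "config": {"target_opset": 14},
        },
        "optimize": {
            "type": "OrtTransformersOptimization",
            "config": {"model_type": "unet", "float16": True, "use_gpu": True},
        },
    },
    # Each submodel is converted first, then optimized.
    "pass_flows": [["convert", "optimize"]],
}
```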
+ +Transformers optimization uses the following optimizations to speed up Stable Diffusion in CUDA: +* [Flash Attention](https://arxiv.org/abs/2205.14135) for float16 precision. Flash Attention uses tiling to reduce the number of GPU memory reads/writes, and improves performance with less memory for long sequence lengths. The kernel requires GPUs of Compute Capability >= 7.5 (like T4, A100, and RTX 2060~4090). Only available on Linux. +* [Memory Efficient Attention](https://arxiv.org/abs/2112.05682v2) for float32 precision or older GPUs (like V100). We use the fused multi-head attention kernel in CUTLASS, which was contributed by xFormers. +* Channel-last (NHWC) convolution. For NVIDIA GPUs with Tensor Core support, the NHWC tensor layout is recommended for convolution. See [Tensor Layouts In Memory: NCHW vs NHWC](https://docs.nvidia.com/deeplearning/performance/dl-performance-convolutional/index.html#tensor-layout). +* GroupNorm for the NHWC tensor layout, and SkipGroupNorm fusion, which fuses GroupNorm with Add bias and residual inputs. +* SkipLayerNormalization, which fuses LayerNormalization with Add bias and residual inputs. +* BiasSplitGelu, a fusion of Add bias with the SplitGelu activation. +* BiasAdd, which fuses Add bias and residual. +* Reduction of Transpose nodes by graph transformation. + +#### Prerequisites +##### Clone the repository and install Olive + +Refer to the instructions in the [examples README](../README.md) to clone the repository and install Olive. + + +We use the same Olive workflow config files and scripts as the DirectML examples. The only difference is the `--provider cuda` option passed to the `stable_diffusion.py` and `stable_diffusion_xl.py` scripts. + +So, cd into the corresponding DirectML example folder from the root of the cloned repository: + +**_Stable Diffusion_** +```bash +cd examples/stable_diffusion +``` + +**_Stable Diffusion XL_** +```bash +cd examples/directml/stable_diffusion_xl +``` + +##### Install onnxruntime + +This example requires the latest onnxruntime-gpu code, which can either be built from source or installed from the nightly builds. The following command can be used to install the latest nightly build of onnxruntime-gpu: + +```bash +# uninstall any pre-existing onnxruntime packages +pip uninstall -y onnxruntime onnxruntime-gpu onnxruntime-directml ort-nightly ort-nightly-gpu ort-nightly-directml + +# install onnxruntime-gpu nightly build +pip install ort-nightly-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple/ +``` + +##### Install other dependencies + +Install the necessary Python packages: + +```bash +python -m pip install -r requirements-common.txt +``` + +#### Conversion to ONNX and Latency Optimization + +The easiest way to optimize the pipeline is with the `stable_diffusion.py` and `stable_diffusion_xl.py` scripts. These scripts will enumerate the `config_.json` files and optimize each with Olive, then gather the optimized models into a directory structure suitable for testing inference. 
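Under the hood, the `--optimize` path of these scripts boils down to an enumerate-and-run loop over those config files. A simplified sketch is shown below (it mirrors the `convert.py` scripts added in this patch; the submodel list and model path here are only illustrative):

```python
import json
from pathlib import Path

from olive.workflows import run as olive_run

script_dir = Path(__file__).resolve().parent

# Illustrative submodel list; the real scripts derive it from the loaded pipeline.
for submodel in ("text_encoder", "unet", "vae_encoder", "vae_decoder"):
    with (script_dir / f"config_{submodel}.json").open() as fin:
        olive_config = json.load(fin)
    # Point the workflow at the model being converted, then run its pass flow.
    olive_config["input_model"]["config"]["model_path"] = "runwayml/stable-diffusion-v1-5"
    olive_run(olive_config)
```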
+ +**_Stable Diffusion_** +```bash +# default model_id is "runwayml/stable-diffusion-v1-5" +python stable_diffusion.py --provider cuda --optimize +``` + +**_Stable Diffusion XL_** +```bash +# default model_id is "stabilityai/stable-diffusion-xl-base-1.0" +python stable_diffusion_xl.py --provider cuda --optimize [--use_fp16_fixed_vae] + +# or specify a different model_id +python stable_diffusion_xl.py --provider cuda --model_id stabilityai/stable-diffusion-xl-refiner-1.0 --optimize [--use_fp16_fixed_vae] +``` + +`--use_fp16_fixed_vae` is optional. If provided, will use [madebyollin/sdxl-vae-fp16-fix](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix) for the vae models and all sub-models will be entirely in fp16. +Otherwise, the vae models (vae-decoder for base and both vae-decoder and vae-encoder for refiner) will be in fp32 and all other sub-models will be in fp16 with fp32 input/output. + +Once the script successfully completes: +- The optimized ONNX pipeline will be stored under `models/optimized-cuda/[model_id]` (for example `models/optimized-cuda/runwayml/stable-diffusion-v1-5` or `models/optimized-cuda/stabilityai/stable-diffusion-xl-base-1.0`). +- The unoptimized ONNX pipeline (models converted to ONNX, but not run through transformer optimization pass) will be stored under `models/unoptimized/[model_id]` (for example `models/unoptimized/runwayml/stable-diffusion-v1-5` or `models/unoptimized/stabilityai/stable-diffusion-xl-base-1.0`). + +Re-running the script with `--optimize` will delete the output models, but it will *not* delete the Olive cache. Subsequent runs will complete much faster since it will simply be copying previously optimized models; you may use the `--clean_cache` option to start from scratch (not typically used unless you are modifying the scripts, for example). + +### Test Inference with CUDA + +Test ONNX runtime inference with the optimized models using `OnnxStableDiffusionPipeline`: + +**_Stable Diffusion_** +```bash +python stable_diffusion.py --provider cuda --num_images 2 +``` +Inference will loop until the generated image passes the safety checker (otherwise you would see black images). The result will be saved as `result_.png` on disk. + +**_Stable Diffusion XL_** +```bash +python stable_diffusion_xl.py --provider cuda --num_images 2 +``` +The result will be saved as `result_.png` on disk. + +Refer to the corresponding section in the DirectML READMEs for more details on the test inference options: +- [Stable Diffusion](../directml/stable_diffusion/README.md#test-inference) +- [Stable Diffusion XL](../directml/stable_diffusion_xl/README.md#test-inference) + + +## Stable Diffusion Optimization with OpenVINO + +**Contents**: +- [Setup](#setup) +- [Conversion to OpenVINO IR model](#convert-to-openvino-ir-model) +- [Test Inference](#test-inference-with-openvino) + +### Setup + +Olive is currently under pre-release, with constant updates and improvements to the functions and usage. This sample code will be frequently updated as Olive evolves, so it is important to install Olive from source when checking out this code from the main branch. See the [README for examples](https://github.com/microsoft/Olive/blob/main/examples/README.md#important) for detailed instructions on how to do this. + +**Alternatively**, you may install a stable release that we have validated. 
For example: + +``` +# Install Olive from main branch +pip install git+https://github.com/microsoft/Olive#egg=olive-ai[openvino] + +# Clone Olive repo to access sample code +git clone https://github.com/microsoft/olive +``` + +Once you've installed Olive, install the requirements for this sample matching the version of the library you are using: +``` +cd olive/examples/stable_diffusion +pip install -r requirements-ov.txt +``` + +### Convert to OpenVINO IR model + +The easiest way to optimize the pipeline is with the `stable_diffusion.py` helper script: + +``` +python stable_diffusion.py --optimize +``` + +The above command will enumerate the `config_.json` files and optimize each with Olive, then gather the optimized models into a directory structure suitable for testing inference. + +The stable diffusion models are large, and the optimization process is resource intensive. It is recommended to run optimization on a system with a minimum of 16GB of memory (preferably 32GB). Expect optimization to take several minutes (especially the U-Net model). + +Once the script successfully completes: +- The converted OpenVINO IR model will be stored under `models/optimized-openvino/[model_id]` (for example `models/optimized-openvino/runwayml/stable-diffusion-v1-5`). + +Re-running the script with `--optimize` will delete the output models, but it will *not* delete the Olive cache. Subsequent runs will complete much faster since it will simply be copying previously optimized models; you may use the `--clean_cache` option to start from scratch (not typically used unless you are modifying the scripts, for example). + +### Test Inference with OpenVINO + +This sample code is primarily intended to illustrate model optimization with Olive, but it also provides a simple interface for testing inference with the OpenVINO models. Inference is done by creating an `OVStableDiffusionPipeline` from the saved models. + + +``` +python stable_diffusion.py --inference --provider openvino +``` +Inference will loop until the generated image. The result will be saved as `result_.png` on disk. + + +Run `python stable_diffusion.py --help` for additional options. A few particularly relevant ones: +- `--image_path `: the input image path for image to image inference. +- `--img_to_img_example`: image to image example. The default input image is `assets/dog.png`, the default prompt is `amazing watercolor painting`. diff --git a/OnnxStack.Converter/stable_cascade/config.py b/OnnxStack.Converter/stable_cascade/config.py new file mode 100644 index 0000000..7b1b47e --- /dev/null +++ b/OnnxStack.Converter/stable_cascade/config.py @@ -0,0 +1,8 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- + +vae_sample_size = 512 +unet_sample_size = 24 +cross_attention_dim = 1280 \ No newline at end of file diff --git a/OnnxStack.Converter/stable_cascade/config_decoder.json b/OnnxStack.Converter/stable_cascade/config_decoder.json new file mode 100644 index 0000000..82622d0 --- /dev/null +++ b/OnnxStack.Converter/stable_cascade/config_decoder.json @@ -0,0 +1,121 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "stabilityai/stable-cascade", + "model_loader": "decoder_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "sample", "timestep_ratio", "clip_text_pooled", "effnet", "return_dict" ], + "output_names": [ "out_sample" ], + "dynamic_axes": { + "sample": {"0": "unet_sample_batch", "1": "unet_sample_channels", "2": "unet_sample_height", "3": "unet_sample_width"}, + "timestep_ratio": {"0": "unet_timestep_ratio"}, + "clip_text_pooled": {"0": "unet_clip_text_pooled_batch", "1": "unet_clip_text_pooled_size"}, + "effnet": {"0": "unet_hidden_batch", "1": "unet_hidden_size"} + } + }, + "dummy_inputs_func": "decoder_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "decoder_data_loader", + "batch_size": 2 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 14, + "save_as_external_data": true, + "all_tensors_to_one_file": true, + "external_data_name": "weights.pb" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "decoder", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "decoder", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/stable_cascade/config_prior.json b/OnnxStack.Converter/stable_cascade/config_prior.json new file mode 100644 index 0000000..17813f3 --- /dev/null +++ b/OnnxStack.Converter/stable_cascade/config_prior.json @@ -0,0 +1,122 @@ +{ + "input_model": { + 
"type": "PyTorchModel", + "config": { + "model_path": "stabilityai/stable-cascade", + "model_loader": "prior_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "sample", "timestep_ratio", "clip_text_pooled", "clip_text", "clip_img", "return_dict" ], + "output_names": [ "out_sample" ], + "dynamic_axes": { + "sample": {"0": "unet_sample_batch", "1": "unet_sample_channels", "2": "unet_sample_height", "3": "unet_sample_width"}, + "timestep_ratio": {"0": "unet_timestep_ratio"}, + "clip_text_pooled": {"0": "unet_clip_text_pooled_batch", "1": "unet_clip_text_pooled_size", "2": "unet_clip_text_pooled_length"}, + "clip_text": {"0": "unet_clip_text_batch", "1": "unet_clip_text_size", "2": "unet_clip_text_length"}, + "clip_img": {"0": "unet_clip_img_batch", "1": "unet_clip_img_size", "2": "unet_clip_img_length"} + } + }, + "dummy_inputs_func": "prior_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "prior_data_loader", + "batch_size": 2 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 14, + "save_as_external_data": true, + "all_tensors_to_one_file": true, + "external_data_name": "weights.pb" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "prior", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/stable_cascade/config_text_encoder.json b/OnnxStack.Converter/stable_cascade/config_text_encoder.json new file mode 100644 index 0000000..dd7fdb8 --- /dev/null +++ b/OnnxStack.Converter/stable_cascade/config_text_encoder.json @@ -0,0 +1,121 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "stabilityai/stable-cascade", + "model_loader": "text_encoder_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "input_ids" ], + "output_names": [ "last_hidden_state", "pooler_output" ], + 
"dynamic_axes": { "input_ids": { "0": "batch", "1": "sequence" } } + }, + "dummy_inputs_func": "text_encoder_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "text_encoder_data_loader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 14 + } + }, + "ov_convert": { + "type": "OpenVINOConversion", + "config": { + "user_script": "models.py", + "example_input_func": "text_encoder_conversion_inputs", + "output_model": "text_encoder" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "clip", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "clip", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "text_encoder", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/stable_cascade/convert.py b/OnnxStack.Converter/stable_cascade/convert.py new file mode 100644 index 0000000..ff6e159 --- /dev/null +++ b/OnnxStack.Converter/stable_cascade/convert.py @@ -0,0 +1,273 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- +import argparse +import json +import shutil +import sys +import warnings +from pathlib import Path +from typing import Dict + +import config +import torch +from diffusers import DiffusionPipeline +from packaging import version + +from olive.common.utils import set_tempdir +from olive.workflows import run as olive_run + + +# pylint: disable=redefined-outer-name +# ruff: noqa: TID252, T201 + + +def save_image(result, batch_size, provider, num_images, images_saved, image_callback=None): + passed_safety_checker = 0 + for image_index in range(batch_size): + if result.nsfw_content_detected is None or not result.nsfw_content_detected[image_index]: + passed_safety_checker += 1 + if images_saved < num_images: + output_path = f"result_{images_saved}.png" + result.images[image_index].save(output_path) + if image_callback: + image_callback(images_saved, output_path) + images_saved += 1 + print(f"Generated {output_path}") + print(f"Inference Batch End ({passed_safety_checker}/{batch_size} images).") + print("Images passed the safety checker.") + return images_saved + + +def run_inference_loop( + pipeline, + prompt, + num_images, + batch_size, + image_size, + num_inference_steps, + guidance_scale, + strength: float, + provider: str, + image_callback=None, + step_callback=None, +): + images_saved = 0 + + def update_steps(step, timestep, latents): + if step_callback: + step_callback((images_saved // batch_size) * num_inference_steps + step) + + while images_saved < num_images: + print(f"\nInference Batch Start (batch size = {batch_size}).") + + kwargs = {} + + result = pipeline( + [prompt] * batch_size, + num_inference_steps=num_inference_steps, + callback=update_steps if step_callback else None, + height=image_size, + width=image_size, + guidance_scale=guidance_scale, + **kwargs, + ) + + images_saved = save_image(result, batch_size, provider, num_images, images_saved, image_callback) + + +def update_config_with_provider(config: Dict, provider: str): + if provider == "dml": + # DirectML EP is the default, so no need to update config. + return config + elif provider == "cuda": + from sd_utils.ort import update_cuda_config + + return update_cuda_config(config) + else: + raise ValueError(f"Unsupported provider: {provider}") + + +def optimize( + model_input: str, + model_output: Path, + provider: str, + controlnet: bool +): + from google.protobuf import __version__ as protobuf_version + + # protobuf 4.x aborts with OOM when optimizing unet + if version.parse(protobuf_version) > version.parse("3.20.3"): + print("This script requires protobuf 3.20.3. Please ensure your package version matches requirements.txt.") + sys.exit(1) + + model_dir = model_input + script_dir = Path(__file__).resolve().parent + + # Clean up previously optimized models, if any. + shutil.rmtree(script_dir / "footprints", ignore_errors=True) + shutil.rmtree(model_output, ignore_errors=True) + + + # Load the entire PyTorch pipeline to ensure all models and their configurations are downloaded and cached. + # This avoids an issue where the non-ONNX components (tokenizer, scheduler, and feature extractor) are not + # automatically cached correctly if individual models are fetched one at a time. 
+ print("Download stable diffusion PyTorch pipeline...") + pipeline = DiffusionPipeline.from_pretrained(model_dir, torch_dtype=torch.float32, **{"local_files_only": True}) + # config.vae_sample_size = pipeline.vae.config.sample_size + # config.cross_attention_dim = pipeline.unet.config.cross_attention_dim + # config.unet_sample_size = pipeline.unet.config.sample_size + + model_info = {} + + submodel_names = [ "text_encoder", "decoder", "prior"] + + has_safety_checker = getattr(pipeline, "safety_checker", None) is not None + + if has_safety_checker: + submodel_names.append("safety_checker") + + if controlnet: + submodel_names.append("controlnet") + + for submodel_name in submodel_names: + print(f"\nOptimizing {submodel_name}") + + olive_config = None + with (script_dir / f"config_{submodel_name}.json").open() as fin: + olive_config = json.load(fin) + olive_config = update_config_with_provider(olive_config, provider) + + if submodel_name in ("unet", "controlnet", "text_encoder"): + olive_config["input_model"]["config"]["model_path"] = model_dir + else: + # Only the unet & text encoder are affected by LoRA, so it's better to use the base model ID for + # other models: the Olive cache is based on the JSON config, and two LoRA variants with the same + # base model ID should be able to reuse previously optimized copies. + olive_config["input_model"]["config"]["model_path"] = model_dir + + run_res = olive_run(olive_config) + + from sd_utils.ort import save_optimized_onnx_submodel + + save_optimized_onnx_submodel(submodel_name, provider, model_info) + + from sd_utils.ort import save_onnx_pipeline + + save_onnx_pipeline( + has_safety_checker, model_info, model_output, pipeline, submodel_names + ) + + return model_info + + +def parse_common_args(raw_args): + parser = argparse.ArgumentParser("Common arguments") + + parser.add_argument("--model_input", default="stable-diffusion-v1-5", type=str) + parser.add_argument("--model_output", default="stable-diffusion-v1-5", type=Path) + parser.add_argument("--controlnet",action="store_true", help="Create ControlNet Unet Model") + parser.add_argument( + "--provider", default="dml", type=str, choices=["dml", "cuda"], help="Execution provider to use" + ) + parser.add_argument("--optimize", action="store_true", help="Runs the optimization step") + parser.add_argument("--clean_cache", action="store_true", help="Deletes the Olive cache") + parser.add_argument("--test_unoptimized", action="store_true", help="Use unoptimized model for inference") + parser.add_argument("--batch_size", default=1, type=int, help="Number of images to generate per batch") + parser.add_argument( + "--prompt", + default=( + "castle surrounded by water and nature, village, volumetric lighting, photorealistic, " + "detailed and intricate, fantasy, epic cinematic shot, mountains, 8k ultra hd" + ), + type=str, + ) + parser.add_argument( + "--guidance_scale", + default=7.5, + type=float, + help="Guidance scale as defined in Classifier-Free Diffusion Guidance", + ) + parser.add_argument("--num_images", default=1, type=int, help="Number of images to generate") + parser.add_argument("--num_inference_steps", default=50, type=int, help="Number of steps in diffusion process") + parser.add_argument("--tempdir", default=None, type=str, help="Root directory for tempfile directories and files") + parser.add_argument( + "--strength", + default=1.0, + type=float, + help="Value between 0.0 and 1.0, that controls the amount of noise that is added to the input image. 
" + "Values that approach 1.0 enable lots of variations but will also produce images " + "that are not semantically consistent with the input.", + ) + parser.add_argument("--image_size", default=512, type=int, help="Width and height of the images to generate") + + return parser.parse_known_args(raw_args) + + +def parse_ort_args(raw_args): + parser = argparse.ArgumentParser("ONNX Runtime arguments") + + parser.add_argument( + "--static_dims", + action="store_true", + help="DEPRECATED (now enabled by default). Use --dynamic_dims to disable static_dims.", + ) + parser.add_argument("--dynamic_dims", action="store_true", help="Disable static shape optimization") + + return parser.parse_known_args(raw_args) + + +def main(raw_args=None): + common_args, extra_args = parse_common_args(raw_args) + + provider = common_args.provider + model_input = common_args.model_input + model_output = common_args.model_output + + script_dir = Path(__file__).resolve().parent + + + if common_args.clean_cache: + shutil.rmtree(script_dir / "cache", ignore_errors=True) + + guidance_scale = common_args.guidance_scale + + ort_args = None, None + ort_args, extra_args = parse_ort_args(extra_args) + + if common_args.optimize or not model_output.exists(): + set_tempdir(common_args.tempdir) + + # TODO(jstoecker): clean up warning filter (mostly during conversion from torch to ONNX) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + + from sd_utils.ort import validate_args + + validate_args(ort_args, common_args.provider) + optimize(common_args.model_input, common_args.model_output, common_args.provider, common_args.controlnet) + + if not common_args.optimize: + model_dir = model_output / "F32" if common_args.test_unoptimized else model_output / "F16" + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + + from sd_utils.ort import get_ort_pipeline + + pipeline = get_ort_pipeline(model_dir, common_args, ort_args, guidance_scale) + run_inference_loop( + pipeline, + common_args.prompt, + common_args.num_images, + common_args.batch_size, + common_args.image_size, + common_args.num_inference_steps, + guidance_scale, + common_args.strength, + provider=provider, + ) + + +if __name__ == "__main__": + main() diff --git a/OnnxStack.Converter/stable_cascade/models.py b/OnnxStack.Converter/stable_cascade/models.py new file mode 100644 index 0000000..b64e846 --- /dev/null +++ b/OnnxStack.Converter/stable_cascade/models.py @@ -0,0 +1,116 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- +import config +import torch +from typing import Union, Optional, Tuple +from diffusers import AutoencoderKL, StableCascadeUNet, ControlNetModel +from diffusers.models.controlnet import ControlNetOutput, BaseOutput as ControlNetBaseOutput +from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker +from transformers.models.clip.modeling_clip import CLIPTextModelWithProjection +from dataclasses import dataclass + +# Helper latency-only dataloader that creates random tensors with no label +class RandomDataLoader: + def __init__(self, create_inputs_func, batchsize, torch_dtype): + self.create_input_func = create_inputs_func + self.batchsize = batchsize + self.torch_dtype = torch_dtype + + def __getitem__(self, idx): + label = None + return self.create_input_func(self.batchsize, self.torch_dtype), label + + + +# ----------------------------------------------------------------------------- +# TEXT ENCODER +# ----------------------------------------------------------------------------- + + +def text_encoder_inputs(batchsize, torch_dtype): + return torch.zeros((batchsize, 77), dtype=torch_dtype) + + +def text_encoder_load(model_name): + model = CLIPTextModelWithProjection.from_pretrained(model_name, subfolder="text_encoder") + return model + + +def text_encoder_conversion_inputs(model=None): + return text_encoder_inputs(1, torch.int32) + + +def text_encoder_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(text_encoder_inputs, batchsize, torch.int32) + + +# ----------------------------------------------------------------------------- +# decoder +# ----------------------------------------------------------------------------- + + +def decoder_inputs(batchsize, torch_dtype, is_conversion_inputs=False): + # TODO(jstoecker): Rename onnx::Concat_4 to text_embeds and onnx::Shape_5 to time_ids + inputs = { + "sample": torch.rand((batchsize, 4, 256, 256), dtype=torch_dtype), + "timestep_ratio": torch.rand((batchsize,), dtype=torch_dtype), + "clip_text_pooled": torch.rand((batchsize , 1, 1280), dtype=torch_dtype), + "effnet": torch.rand((batchsize, 16, 24, 24), dtype=torch_dtype) + } + + # use as kwargs since they won't be in the correct position if passed along with the tuple of inputs + kwargs = { + "return_dict": False, + } + + return inputs + + +def decoder_load(model_name): + model = StableCascadeUNet.from_pretrained(model_name, subfolder="decoder") + return model + + +def decoder_conversion_inputs(model=None): + return tuple(decoder_inputs(1, torch.float32, True).values()) + + +def decoder_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(decoder_inputs, batchsize, torch.float16) + + + +# ----------------------------------------------------------------------------- +# prior +# ----------------------------------------------------------------------------- + +def prior_inputs(batchsize, torch_dtype, is_conversion_inputs=False): + inputs = { + "sample": torch.rand((batchsize, 16, 24, 24), dtype=torch_dtype), + "timestep_ratio": torch.rand(((batchsize *2),), dtype=torch_dtype), + "clip_text_pooled": torch.rand(((batchsize *2) , 1, 1280), dtype=torch_dtype), + "clip_text": torch.rand(((batchsize *2) , 77, 1280), dtype=torch_dtype), + "clip_img": torch.rand(((batchsize *2) , 1, 768), dtype=torch_dtype) + } + + # use as kwargs since they won't be in the correct position if passed along with the tuple of inputs + kwargs = { + "return_dict": 
False, + } + + return inputs + + +def prior_load(model_name): + model = StableCascadeUNet.from_pretrained(model_name, subfolder="prior") + return model + + +def prior_conversion_inputs(model=None): + return tuple(prior_inputs(1, torch.float32, True).values()) + + +def prior_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(prior_inputs, batchsize, torch.float16) \ No newline at end of file diff --git a/OnnxStack.Converter/stable_cascade/requirements.txt b/OnnxStack.Converter/stable_cascade/requirements.txt new file mode 100644 index 0000000..15b9198 --- /dev/null +++ b/OnnxStack.Converter/stable_cascade/requirements.txt @@ -0,0 +1,9 @@ +accelerate +diffusers +onnx +pillow +protobuf==3.20.3 # protobuf 4.x aborts with OOM when optimizing unet +tabulate +torch +transformers +onnxruntime-directml>=1.16.0 diff --git a/OnnxStack.Converter/stable_cascade/sd_utils/ort.py b/OnnxStack.Converter/stable_cascade/sd_utils/ort.py new file mode 100644 index 0000000..5750135 --- /dev/null +++ b/OnnxStack.Converter/stable_cascade/sd_utils/ort.py @@ -0,0 +1,163 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +import os +import json +import shutil +import sys +from pathlib import Path +from typing import Dict + +import onnxruntime as ort +from diffusers import OnnxRuntimeModel, OnnxStableDiffusionPipeline +from onnxruntime import __version__ as OrtVersion +from packaging import version + +from olive.model import ONNXModelHandler + +# ruff: noqa: TID252, T201 + + +def update_cuda_config(config: Dict): + if version.parse(OrtVersion) < version.parse("1.17.0"): + # disable skip_group_norm fusion since there is a shape inference bug which leads to invalid models + config["passes"]["optimize_cuda"]["config"]["optimization_options"] = {"enable_skip_group_norm": False} + config["pass_flows"] = [["convert", "optimize_cuda"]] + config["systems"]["local_system"]["config"]["accelerators"][0]["execution_providers"] = ["CUDAExecutionProvider"] + return config + + +def validate_args(args, provider): + ort.set_default_logger_severity(4) + if args.static_dims: + print( + "WARNING: the --static_dims option is deprecated, and static shape optimization is enabled by default. " + "Use --dynamic_dims to disable static shape optimization." + ) + + validate_ort_version(provider) + + +def validate_ort_version(provider: str): + if provider == "dml" and version.parse(OrtVersion) < version.parse("1.16.0"): + print("This script requires onnxruntime-directml 1.16.0 or newer") + sys.exit(1) + elif provider == "cuda" and version.parse(OrtVersion) < version.parse("1.17.0"): + if version.parse(OrtVersion) < version.parse("1.16.2"): + print("This script requires onnxruntime-gpu 1.16.2 or newer") + sys.exit(1) + print( + f"WARNING: onnxruntime {OrtVersion} has known issues with shape inference for SkipGroupNorm. Will disable" + " skip_group_norm fusion. onnxruntime-gpu 1.17.0 or newer is strongly recommended!" 
+ ) + + +def save_optimized_onnx_submodel(submodel_name, provider, model_info): + footprints_file_path = ( + Path(__file__).resolve().parents[1] / "footprints" / f"{submodel_name}_gpu-{provider}_footprints.json" + ) + with footprints_file_path.open("r") as footprint_file: + footprints = json.load(footprint_file) + + conversion_footprint = None + optimizer_footprint = None + for footprint in footprints.values(): + if footprint["from_pass"] == "OnnxConversion": + conversion_footprint = footprint + elif footprint["from_pass"] == "OrtTransformersOptimization": + optimizer_footprint = footprint + + assert conversion_footprint + assert optimizer_footprint + + unoptimized_olive_model = ONNXModelHandler(**conversion_footprint["model_config"]["config"]) + optimized_olive_model = ONNXModelHandler(**optimizer_footprint["model_config"]["config"]) + + model_info[submodel_name] = { + "unoptimized": { + "path": Path(unoptimized_olive_model.model_path), + }, + "optimized": { + "path": Path(optimized_olive_model.model_path), + }, + } + + print(f"Unoptimized Model : {model_info[submodel_name]['unoptimized']['path']}") + print(f"Optimized Model : {model_info[submodel_name]['optimized']['path']}") + + +def save_onnx_pipeline( + has_safety_checker, model_info, model_output, pipeline, submodel_names +): + # Save the unoptimized models in a directory structure that the diffusers library can load and run. + # This is optional, and the optimized models can be used directly in a custom pipeline if desired. + print("\nCreating ONNX pipeline...") + + optimized_model_dir = model_output / "Optimized" + unoptimized_model_dir = model_output / "Default" + has_controlnet = 'controlnet' in submodel_names + if has_safety_checker: + safety_checker = OnnxRuntimeModel.from_pretrained(model_info["safety_checker"]["unoptimized"]["path"].parent) + else: + safety_checker = None + + text_encoder = OnnxRuntimeModel.from_pretrained(model_info["text_encoder"]["unoptimized"]["path"].parent) + decoder = OnnxRuntimeModel.from_pretrained(model_info["decoder"]["unoptimized"]["path"].parent) + prior = OnnxRuntimeModel.from_pretrained(model_info["prior"]["unoptimized"]["path"].parent) + + print("Saving unoptimized models...") + text_encoder.save_pretrained(unoptimized_model_dir / "text_encoder") + decoder.save_pretrained(unoptimized_model_dir / "decoder") + prior.save_pretrained(unoptimized_model_dir / "prior") + + # Create a copy of the unoptimized model directory, then overwrite with optimized models from the olive cache. 
+ print("Copying optimized models...") + shutil.copytree(unoptimized_model_dir, optimized_model_dir, ignore=shutil.ignore_patterns("weights.pb")) + for submodel_name in submodel_names: + src_path = model_info[submodel_name]["optimized"]["path"] + dst_path = optimized_model_dir / submodel_name / "model.onnx" + exists = os.path.exists(dst_path) + if not exists: + os.mkdir(optimized_model_dir / submodel_name) + shutil.copyfile(src_path, dst_path) + + print(f"The default pipeline is located here: {unoptimized_model_dir}") + print(f"The optimized pipeline is located here: {optimized_model_dir}") + + +def get_ort_pipeline(model_dir, common_args, ort_args, guidance_scale): + ort.set_default_logger_severity(3) + + print("Loading models into ORT session...") + sess_options = ort.SessionOptions() + sess_options.enable_mem_pattern = False + + static_dims = not ort_args.dynamic_dims + batch_size = common_args.batch_size + image_size = common_args.image_size + provider = common_args.provider + + if static_dims: + hidden_batch_size = batch_size if (guidance_scale == 0.0) else batch_size * 2 + # Not necessary, but helps DML EP further optimize runtime performance. + # batch_size is doubled for sample & hidden state because of classifier free guidance: + # https://github.com/huggingface/diffusers/blob/46c52f9b9607e6ecb29c782c052aea313e6487b7/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py#L672 + sess_options.add_free_dimension_override_by_name("unet_sample_batch", hidden_batch_size) + sess_options.add_free_dimension_override_by_name("unet_sample_channels", 4) + sess_options.add_free_dimension_override_by_name("unet_sample_height", image_size // 8) + sess_options.add_free_dimension_override_by_name("unet_sample_width", image_size // 8) + sess_options.add_free_dimension_override_by_name("unet_time_batch", 1) + sess_options.add_free_dimension_override_by_name("unet_hidden_batch", hidden_batch_size) + sess_options.add_free_dimension_override_by_name("unet_hidden_sequence", 77) + + provider_map = { + "dml": "DmlExecutionProvider", + "cuda": "CUDAExecutionProvider", + } + assert provider in provider_map, f"Unsupported provider: {provider}" + return OnnxStableDiffusionPipeline.from_pretrained( + model_dir, provider=provider_map[provider], sess_options=sess_options + ) From 20de49323798d35e9ec52bf64caa0a29975afcd5 Mon Sep 17 00:00:00 2001 From: sa_ddam213 Date: Thu, 18 Apr 2024 21:39:31 +1200 Subject: [PATCH 3/9] Remove old README --- .../latent_consistency/README.md | 180 ------------------ OnnxStack.Converter/stable_cascade/README.md | 180 ------------------ .../stable_diffusion/README.md | 180 ------------------ 3 files changed, 540 deletions(-) delete mode 100644 OnnxStack.Converter/latent_consistency/README.md delete mode 100644 OnnxStack.Converter/stable_cascade/README.md delete mode 100644 OnnxStack.Converter/stable_diffusion/README.md diff --git a/OnnxStack.Converter/latent_consistency/README.md b/OnnxStack.Converter/latent_consistency/README.md deleted file mode 100644 index a12b762..0000000 --- a/OnnxStack.Converter/latent_consistency/README.md +++ /dev/null @@ -1,180 +0,0 @@ -# Stable Diffusion Optimization - -This folder contains sample use cases of Olive with ONNX Runtime and OpenVINO to optimize: -- Stable Diffusion: [Stable Diffusion v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4), [Stable Diffusion v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5), [Stable Diffusion v2](https://huggingface.co/stabilityai/stable-diffusion-2) -- Stable Diffusion XL: 
[Stable Diffusion XL Base](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0), [Stable Diffusion XL Refiner](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0) - -Stable Diffusion comprises multiple PyTorch models tied together into a *pipeline*. - -The ONNX Runtime optimization sample will convert each PyTorch model to ONNX, and then run the converted ONNX models through the `OrtTransformersOptimization` pass. The transformer optimization pass performs several time-consuming graph transformations that make the models more efficient for inference at runtime. - -The OpenVINO optimization sample will convert each PyTorch model to OpenVINO IR model by `OpenVINOConversion` pass, and create an `OpenVINOStableDiffusionPipeline` for inference. - -- ONNX Runtime with - - [CUDA EP](#stable-diffusion-and-stable-diffusion-xl-optimization-with-onnx-runtime-cuda-ep) - - DirectML EP: go to examples [Stable Diffusion](../directml/stable_diffusion/README.md), [Stable Diffusion XL](../directml/stable_diffusion_xl/README.md) -- [OpenVINO](#stable-diffusion-optimization-with-openvino) - -## Stable Diffusion and Stable Diffusion XL Optimization with ONNX Runtime CUDA EP - -This sample performs the following optimization workflow for each model in the Stable Diffusion pipeline: -- *PyTorch Model -> Onnx Model -> Transformers Optimized Onnx Model fp16* -

- -Transformers optimization uses the following optimizations to speed up Stable Diffusion in CUDA: -* [Flash Attention](https://arxiv.org/abs/2205.14135) for float16 precision. Flash Attention uses tiling to reduce number of GPU memory reads/writes, and improves performance with less memory for long sequence length. The kernel requires GPUs of Compute Capability >= 7.5 (like T4, A100, and RTX 2060~4090). Only availanle in Linux. -* [Memory Efficient Attention](https://arxiv.org/abs/2112.05682v2) for float32 precision or older GPUs (like V100). We used the fused multi-head attention kernel in CUTLASS, and the kernel was contributed by xFormers. -* Channel-last (NHWC) convolution. For NVidia GPU with Tensor Cores support, NHWC tensor layout is recommended for convolution. See [Tensor Layouts In Memory: NCHW vs NHWC](https://docs.nvidia.com/deeplearning/performance/dl-performance-convolutional/index.html#tensor-layout). -* GroupNorm for NHWC tensor layout, and SkipGroupNorm fusion which fuses GroupNorm with Add bias and residual inputs -* SkipLayerNormalization which fuses LayerNormalization with Add bias and residual inputs. -* BiasSplitGelu is a fusion of Add bias with SplitGelu activation. -* BiasAdd fuses Add bias and residual. -* Reduce Transpose nodes by graph transformation. - -#### Prerequisites -##### Clone the repository and install Olive - -Refer to the instructions in the [examples README](../README.md) to clone the repository and install Olive. - - -We use the same olive workflow config files and scripts as the DirectML examples. The only difference is the `--provider cuda` option provided to the `stable_diffusion.py` and `stable_diffusion_xl.py` scripts. - -So, cd into the corresponding DirectML example folder from the root of the cloned repository: - -**_Stable Diffusion_** -```bash -cd examples/stable_diffusion -``` - -**_Stable Diffusion XL_** -```bash -cd examples/directml/stable_diffusion_xl -``` - -##### Install onnxruntime - -This example requires the latest onnxruntime-gpu code which can either be built from source or installed from the nightly builds. The following command can be used to install the latest nightly build of onnxruntime-gpu: - -```bash -# uninstall any pre-existing onnxruntime packages -pip uninstall -y onnxruntime onnxruntime-gpu onnxruntime-directml ort-nightly ort-nightly-gpu ort-nightly-directml - -# install onnxruntime-gpu nightly build -pip install ort-nightly-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple/ -``` - -##### Install other dependencies - -Install the necessary python packages: - -```bash -python -m pip install -r requirements-common.txt -``` - -#### Conversion to ONNX and Latency Optimization - -The easiest way to optimize the pipeline is with the `stable_diffusion.py` and `stable_diffusion_xl.py` scripts. These scripts will enumerate the `config_.json` files and optimize each with Olive, then gather the optimized models into a directory structure suitable for testing inference. 
- -**_Stable Diffusion_** -```bash -# default model_id is "runwayml/stable-diffusion-v1-5" -python stable_diffusion.py --provider cuda --optimize -``` - -**_Stable Diffusion XL_** -```bash -# default model_id is "stabilityai/stable-diffusion-xl-base-1.0" -python stable_diffusion_xl.py --provider cuda --optimize [--use_fp16_fixed_vae] - -# or specify a different model_id -python stable_diffusion_xl.py --provider cuda --model_id stabilityai/stable-diffusion-xl-refiner-1.0 --optimize [--use_fp16_fixed_vae] -``` - -`--use_fp16_fixed_vae` is optional. If provided, will use [madebyollin/sdxl-vae-fp16-fix](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix) for the vae models and all sub-models will be entirely in fp16. -Otherwise, the vae models (vae-decoder for base and both vae-decoder and vae-encoder for refiner) will be in fp32 and all other sub-models will be in fp16 with fp32 input/output. - -Once the script successfully completes: -- The optimized ONNX pipeline will be stored under `models/optimized-cuda/[model_id]` (for example `models/optimized-cuda/runwayml/stable-diffusion-v1-5` or `models/optimized-cuda/stabilityai/stable-diffusion-xl-base-1.0`). -- The unoptimized ONNX pipeline (models converted to ONNX, but not run through transformer optimization pass) will be stored under `models/unoptimized/[model_id]` (for example `models/unoptimized/runwayml/stable-diffusion-v1-5` or `models/unoptimized/stabilityai/stable-diffusion-xl-base-1.0`). - -Re-running the script with `--optimize` will delete the output models, but it will *not* delete the Olive cache. Subsequent runs will complete much faster since it will simply be copying previously optimized models; you may use the `--clean_cache` option to start from scratch (not typically used unless you are modifying the scripts, for example). - -### Test Inference with CUDA - -Test ONNX runtime inference with the optimized models using `OnnxStableDiffusionPipeline`: - -**_Stable Diffusion_** -```bash -python stable_diffusion.py --provider cuda --num_images 2 -``` -Inference will loop until the generated image passes the safety checker (otherwise you would see black images). The result will be saved as `result_.png` on disk. - -**_Stable Diffusion XL_** -```bash -python stable_diffusion_xl.py --provider cuda --num_images 2 -``` -The result will be saved as `result_.png` on disk. - -Refer to the corresponding section in the DirectML READMEs for more details on the test inference options: -- [Stable Diffusion](../directml/stable_diffusion/README.md#test-inference) -- [Stable Diffusion XL](../directml/stable_diffusion_xl/README.md#test-inference) - - -## Stable Diffusion Optimization with OpenVINO - -**Contents**: -- [Setup](#setup) -- [Conversion to OpenVINO IR model](#convert-to-openvino-ir-model) -- [Test Inference](#test-inference-with-openvino) - -### Setup - -Olive is currently under pre-release, with constant updates and improvements to the functions and usage. This sample code will be frequently updated as Olive evolves, so it is important to install Olive from source when checking out this code from the main branch. See the [README for examples](https://github.com/microsoft/Olive/blob/main/examples/README.md#important) for detailed instructions on how to do this. - -**Alternatively**, you may install a stable release that we have validated. 
For example: - -``` -# Install Olive from main branch -pip install git+https://github.com/microsoft/Olive#egg=olive-ai[openvino] - -# Clone Olive repo to access sample code -git clone https://github.com/microsoft/olive -``` - -Once you've installed Olive, install the requirements for this sample matching the version of the library you are using: -``` -cd olive/examples/stable_diffusion -pip install -r requirements-ov.txt -``` - -### Convert to OpenVINO IR model - -The easiest way to optimize the pipeline is with the `stable_diffusion.py` helper script: - -``` -python stable_diffusion.py --optimize -``` - -The above command will enumerate the `config_.json` files and optimize each with Olive, then gather the optimized models into a directory structure suitable for testing inference. - -The stable diffusion models are large, and the optimization process is resource intensive. It is recommended to run optimization on a system with a minimum of 16GB of memory (preferably 32GB). Expect optimization to take several minutes (especially the U-Net model). - -Once the script successfully completes: -- The converted OpenVINO IR model will be stored under `models/optimized-openvino/[model_id]` (for example `models/optimized-openvino/runwayml/stable-diffusion-v1-5`). - -Re-running the script with `--optimize` will delete the output models, but it will *not* delete the Olive cache. Subsequent runs will complete much faster since it will simply be copying previously optimized models; you may use the `--clean_cache` option to start from scratch (not typically used unless you are modifying the scripts, for example). - -### Test Inference with OpenVINO - -This sample code is primarily intended to illustrate model optimization with Olive, but it also provides a simple interface for testing inference with the OpenVINO models. Inference is done by creating an `OVStableDiffusionPipeline` from the saved models. - - -``` -python stable_diffusion.py --inference --provider openvino -``` -Inference will loop until the generated image. The result will be saved as `result_.png` on disk. - - -Run `python stable_diffusion.py --help` for additional options. A few particularly relevant ones: -- `--image_path `: the input image path for image to image inference. -- `--img_to_img_example`: image to image example. The default input image is `assets/dog.png`, the default prompt is `amazing watercolor painting`. diff --git a/OnnxStack.Converter/stable_cascade/README.md b/OnnxStack.Converter/stable_cascade/README.md deleted file mode 100644 index a12b762..0000000 --- a/OnnxStack.Converter/stable_cascade/README.md +++ /dev/null @@ -1,180 +0,0 @@ -# Stable Diffusion Optimization - -This folder contains sample use cases of Olive with ONNX Runtime and OpenVINO to optimize: -- Stable Diffusion: [Stable Diffusion v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4), [Stable Diffusion v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5), [Stable Diffusion v2](https://huggingface.co/stabilityai/stable-diffusion-2) -- Stable Diffusion XL: [Stable Diffusion XL Base](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0), [Stable Diffusion XL Refiner](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0) - -Stable Diffusion comprises multiple PyTorch models tied together into a *pipeline*. - -The ONNX Runtime optimization sample will convert each PyTorch model to ONNX, and then run the converted ONNX models through the `OrtTransformersOptimization` pass. 
The transformer optimization pass performs several time-consuming graph transformations that make the models more efficient for inference at runtime. - -The OpenVINO optimization sample will convert each PyTorch model to OpenVINO IR model by `OpenVINOConversion` pass, and create an `OpenVINOStableDiffusionPipeline` for inference. - -- ONNX Runtime with - - [CUDA EP](#stable-diffusion-and-stable-diffusion-xl-optimization-with-onnx-runtime-cuda-ep) - - DirectML EP: go to examples [Stable Diffusion](../directml/stable_diffusion/README.md), [Stable Diffusion XL](../directml/stable_diffusion_xl/README.md) -- [OpenVINO](#stable-diffusion-optimization-with-openvino) - -## Stable Diffusion and Stable Diffusion XL Optimization with ONNX Runtime CUDA EP - -This sample performs the following optimization workflow for each model in the Stable Diffusion pipeline: -- *PyTorch Model -> Onnx Model -> Transformers Optimized Onnx Model fp16* -

- -Transformers optimization uses the following optimizations to speed up Stable Diffusion in CUDA: -* [Flash Attention](https://arxiv.org/abs/2205.14135) for float16 precision. Flash Attention uses tiling to reduce number of GPU memory reads/writes, and improves performance with less memory for long sequence length. The kernel requires GPUs of Compute Capability >= 7.5 (like T4, A100, and RTX 2060~4090). Only availanle in Linux. -* [Memory Efficient Attention](https://arxiv.org/abs/2112.05682v2) for float32 precision or older GPUs (like V100). We used the fused multi-head attention kernel in CUTLASS, and the kernel was contributed by xFormers. -* Channel-last (NHWC) convolution. For NVidia GPU with Tensor Cores support, NHWC tensor layout is recommended for convolution. See [Tensor Layouts In Memory: NCHW vs NHWC](https://docs.nvidia.com/deeplearning/performance/dl-performance-convolutional/index.html#tensor-layout). -* GroupNorm for NHWC tensor layout, and SkipGroupNorm fusion which fuses GroupNorm with Add bias and residual inputs -* SkipLayerNormalization which fuses LayerNormalization with Add bias and residual inputs. -* BiasSplitGelu is a fusion of Add bias with SplitGelu activation. -* BiasAdd fuses Add bias and residual. -* Reduce Transpose nodes by graph transformation. - -#### Prerequisites -##### Clone the repository and install Olive - -Refer to the instructions in the [examples README](../README.md) to clone the repository and install Olive. - - -We use the same olive workflow config files and scripts as the DirectML examples. The only difference is the `--provider cuda` option provided to the `stable_diffusion.py` and `stable_diffusion_xl.py` scripts. - -So, cd into the corresponding DirectML example folder from the root of the cloned repository: - -**_Stable Diffusion_** -```bash -cd examples/stable_diffusion -``` - -**_Stable Diffusion XL_** -```bash -cd examples/directml/stable_diffusion_xl -``` - -##### Install onnxruntime - -This example requires the latest onnxruntime-gpu code which can either be built from source or installed from the nightly builds. The following command can be used to install the latest nightly build of onnxruntime-gpu: - -```bash -# uninstall any pre-existing onnxruntime packages -pip uninstall -y onnxruntime onnxruntime-gpu onnxruntime-directml ort-nightly ort-nightly-gpu ort-nightly-directml - -# install onnxruntime-gpu nightly build -pip install ort-nightly-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple/ -``` - -##### Install other dependencies - -Install the necessary python packages: - -```bash -python -m pip install -r requirements-common.txt -``` - -#### Conversion to ONNX and Latency Optimization - -The easiest way to optimize the pipeline is with the `stable_diffusion.py` and `stable_diffusion_xl.py` scripts. These scripts will enumerate the `config_.json` files and optimize each with Olive, then gather the optimized models into a directory structure suitable for testing inference. 
- -**_Stable Diffusion_** -```bash -# default model_id is "runwayml/stable-diffusion-v1-5" -python stable_diffusion.py --provider cuda --optimize -``` - -**_Stable Diffusion XL_** -```bash -# default model_id is "stabilityai/stable-diffusion-xl-base-1.0" -python stable_diffusion_xl.py --provider cuda --optimize [--use_fp16_fixed_vae] - -# or specify a different model_id -python stable_diffusion_xl.py --provider cuda --model_id stabilityai/stable-diffusion-xl-refiner-1.0 --optimize [--use_fp16_fixed_vae] -``` - -`--use_fp16_fixed_vae` is optional. If provided, will use [madebyollin/sdxl-vae-fp16-fix](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix) for the vae models and all sub-models will be entirely in fp16. -Otherwise, the vae models (vae-decoder for base and both vae-decoder and vae-encoder for refiner) will be in fp32 and all other sub-models will be in fp16 with fp32 input/output. - -Once the script successfully completes: -- The optimized ONNX pipeline will be stored under `models/optimized-cuda/[model_id]` (for example `models/optimized-cuda/runwayml/stable-diffusion-v1-5` or `models/optimized-cuda/stabilityai/stable-diffusion-xl-base-1.0`). -- The unoptimized ONNX pipeline (models converted to ONNX, but not run through transformer optimization pass) will be stored under `models/unoptimized/[model_id]` (for example `models/unoptimized/runwayml/stable-diffusion-v1-5` or `models/unoptimized/stabilityai/stable-diffusion-xl-base-1.0`). - -Re-running the script with `--optimize` will delete the output models, but it will *not* delete the Olive cache. Subsequent runs will complete much faster since it will simply be copying previously optimized models; you may use the `--clean_cache` option to start from scratch (not typically used unless you are modifying the scripts, for example). - -### Test Inference with CUDA - -Test ONNX runtime inference with the optimized models using `OnnxStableDiffusionPipeline`: - -**_Stable Diffusion_** -```bash -python stable_diffusion.py --provider cuda --num_images 2 -``` -Inference will loop until the generated image passes the safety checker (otherwise you would see black images). The result will be saved as `result_.png` on disk. - -**_Stable Diffusion XL_** -```bash -python stable_diffusion_xl.py --provider cuda --num_images 2 -``` -The result will be saved as `result_.png` on disk. - -Refer to the corresponding section in the DirectML READMEs for more details on the test inference options: -- [Stable Diffusion](../directml/stable_diffusion/README.md#test-inference) -- [Stable Diffusion XL](../directml/stable_diffusion_xl/README.md#test-inference) - - -## Stable Diffusion Optimization with OpenVINO - -**Contents**: -- [Setup](#setup) -- [Conversion to OpenVINO IR model](#convert-to-openvino-ir-model) -- [Test Inference](#test-inference-with-openvino) - -### Setup - -Olive is currently under pre-release, with constant updates and improvements to the functions and usage. This sample code will be frequently updated as Olive evolves, so it is important to install Olive from source when checking out this code from the main branch. See the [README for examples](https://github.com/microsoft/Olive/blob/main/examples/README.md#important) for detailed instructions on how to do this. - -**Alternatively**, you may install a stable release that we have validated. 
For example: - -``` -# Install Olive from main branch -pip install git+https://github.com/microsoft/Olive#egg=olive-ai[openvino] - -# Clone Olive repo to access sample code -git clone https://github.com/microsoft/olive -``` - -Once you've installed Olive, install the requirements for this sample matching the version of the library you are using: -``` -cd olive/examples/stable_diffusion -pip install -r requirements-ov.txt -``` - -### Convert to OpenVINO IR model - -The easiest way to optimize the pipeline is with the `stable_diffusion.py` helper script: - -``` -python stable_diffusion.py --optimize -``` - -The above command will enumerate the `config_.json` files and optimize each with Olive, then gather the optimized models into a directory structure suitable for testing inference. - -The stable diffusion models are large, and the optimization process is resource intensive. It is recommended to run optimization on a system with a minimum of 16GB of memory (preferably 32GB). Expect optimization to take several minutes (especially the U-Net model). - -Once the script successfully completes: -- The converted OpenVINO IR model will be stored under `models/optimized-openvino/[model_id]` (for example `models/optimized-openvino/runwayml/stable-diffusion-v1-5`). - -Re-running the script with `--optimize` will delete the output models, but it will *not* delete the Olive cache. Subsequent runs will complete much faster since it will simply be copying previously optimized models; you may use the `--clean_cache` option to start from scratch (not typically used unless you are modifying the scripts, for example). - -### Test Inference with OpenVINO - -This sample code is primarily intended to illustrate model optimization with Olive, but it also provides a simple interface for testing inference with the OpenVINO models. Inference is done by creating an `OVStableDiffusionPipeline` from the saved models. - - -``` -python stable_diffusion.py --inference --provider openvino -``` -Inference will loop until the generated image. The result will be saved as `result_.png` on disk. - - -Run `python stable_diffusion.py --help` for additional options. A few particularly relevant ones: -- `--image_path `: the input image path for image to image inference. -- `--img_to_img_example`: image to image example. The default input image is `assets/dog.png`, the default prompt is `amazing watercolor painting`. diff --git a/OnnxStack.Converter/stable_diffusion/README.md b/OnnxStack.Converter/stable_diffusion/README.md deleted file mode 100644 index a12b762..0000000 --- a/OnnxStack.Converter/stable_diffusion/README.md +++ /dev/null @@ -1,180 +0,0 @@ -# Stable Diffusion Optimization - -This folder contains sample use cases of Olive with ONNX Runtime and OpenVINO to optimize: -- Stable Diffusion: [Stable Diffusion v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4), [Stable Diffusion v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5), [Stable Diffusion v2](https://huggingface.co/stabilityai/stable-diffusion-2) -- Stable Diffusion XL: [Stable Diffusion XL Base](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0), [Stable Diffusion XL Refiner](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0) - -Stable Diffusion comprises multiple PyTorch models tied together into a *pipeline*. - -The ONNX Runtime optimization sample will convert each PyTorch model to ONNX, and then run the converted ONNX models through the `OrtTransformersOptimization` pass. 
The transformer optimization pass performs several time-consuming graph transformations that make the models more efficient for inference at runtime. - -The OpenVINO optimization sample will convert each PyTorch model to OpenVINO IR model by `OpenVINOConversion` pass, and create an `OpenVINOStableDiffusionPipeline` for inference. - -- ONNX Runtime with - - [CUDA EP](#stable-diffusion-and-stable-diffusion-xl-optimization-with-onnx-runtime-cuda-ep) - - DirectML EP: go to examples [Stable Diffusion](../directml/stable_diffusion/README.md), [Stable Diffusion XL](../directml/stable_diffusion_xl/README.md) -- [OpenVINO](#stable-diffusion-optimization-with-openvino) - -## Stable Diffusion and Stable Diffusion XL Optimization with ONNX Runtime CUDA EP - -This sample performs the following optimization workflow for each model in the Stable Diffusion pipeline: -- *PyTorch Model -> Onnx Model -> Transformers Optimized Onnx Model fp16* -

- -Transformers optimization uses the following optimizations to speed up Stable Diffusion in CUDA: -* [Flash Attention](https://arxiv.org/abs/2205.14135) for float16 precision. Flash Attention uses tiling to reduce number of GPU memory reads/writes, and improves performance with less memory for long sequence length. The kernel requires GPUs of Compute Capability >= 7.5 (like T4, A100, and RTX 2060~4090). Only availanle in Linux. -* [Memory Efficient Attention](https://arxiv.org/abs/2112.05682v2) for float32 precision or older GPUs (like V100). We used the fused multi-head attention kernel in CUTLASS, and the kernel was contributed by xFormers. -* Channel-last (NHWC) convolution. For NVidia GPU with Tensor Cores support, NHWC tensor layout is recommended for convolution. See [Tensor Layouts In Memory: NCHW vs NHWC](https://docs.nvidia.com/deeplearning/performance/dl-performance-convolutional/index.html#tensor-layout). -* GroupNorm for NHWC tensor layout, and SkipGroupNorm fusion which fuses GroupNorm with Add bias and residual inputs -* SkipLayerNormalization which fuses LayerNormalization with Add bias and residual inputs. -* BiasSplitGelu is a fusion of Add bias with SplitGelu activation. -* BiasAdd fuses Add bias and residual. -* Reduce Transpose nodes by graph transformation. - -#### Prerequisites -##### Clone the repository and install Olive - -Refer to the instructions in the [examples README](../README.md) to clone the repository and install Olive. - - -We use the same olive workflow config files and scripts as the DirectML examples. The only difference is the `--provider cuda` option provided to the `stable_diffusion.py` and `stable_diffusion_xl.py` scripts. - -So, cd into the corresponding DirectML example folder from the root of the cloned repository: - -**_Stable Diffusion_** -```bash -cd examples/stable_diffusion -``` - -**_Stable Diffusion XL_** -```bash -cd examples/directml/stable_diffusion_xl -``` - -##### Install onnxruntime - -This example requires the latest onnxruntime-gpu code which can either be built from source or installed from the nightly builds. The following command can be used to install the latest nightly build of onnxruntime-gpu: - -```bash -# uninstall any pre-existing onnxruntime packages -pip uninstall -y onnxruntime onnxruntime-gpu onnxruntime-directml ort-nightly ort-nightly-gpu ort-nightly-directml - -# install onnxruntime-gpu nightly build -pip install ort-nightly-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple/ -``` - -##### Install other dependencies - -Install the necessary python packages: - -```bash -python -m pip install -r requirements-common.txt -``` - -#### Conversion to ONNX and Latency Optimization - -The easiest way to optimize the pipeline is with the `stable_diffusion.py` and `stable_diffusion_xl.py` scripts. These scripts will enumerate the `config_.json` files and optimize each with Olive, then gather the optimized models into a directory structure suitable for testing inference. 
- -**_Stable Diffusion_** -```bash -# default model_id is "runwayml/stable-diffusion-v1-5" -python stable_diffusion.py --provider cuda --optimize -``` - -**_Stable Diffusion XL_** -```bash -# default model_id is "stabilityai/stable-diffusion-xl-base-1.0" -python stable_diffusion_xl.py --provider cuda --optimize [--use_fp16_fixed_vae] - -# or specify a different model_id -python stable_diffusion_xl.py --provider cuda --model_id stabilityai/stable-diffusion-xl-refiner-1.0 --optimize [--use_fp16_fixed_vae] -``` - -`--use_fp16_fixed_vae` is optional. If provided, will use [madebyollin/sdxl-vae-fp16-fix](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix) for the vae models and all sub-models will be entirely in fp16. -Otherwise, the vae models (vae-decoder for base and both vae-decoder and vae-encoder for refiner) will be in fp32 and all other sub-models will be in fp16 with fp32 input/output. - -Once the script successfully completes: -- The optimized ONNX pipeline will be stored under `models/optimized-cuda/[model_id]` (for example `models/optimized-cuda/runwayml/stable-diffusion-v1-5` or `models/optimized-cuda/stabilityai/stable-diffusion-xl-base-1.0`). -- The unoptimized ONNX pipeline (models converted to ONNX, but not run through transformer optimization pass) will be stored under `models/unoptimized/[model_id]` (for example `models/unoptimized/runwayml/stable-diffusion-v1-5` or `models/unoptimized/stabilityai/stable-diffusion-xl-base-1.0`). - -Re-running the script with `--optimize` will delete the output models, but it will *not* delete the Olive cache. Subsequent runs will complete much faster since it will simply be copying previously optimized models; you may use the `--clean_cache` option to start from scratch (not typically used unless you are modifying the scripts, for example). - -### Test Inference with CUDA - -Test ONNX runtime inference with the optimized models using `OnnxStableDiffusionPipeline`: - -**_Stable Diffusion_** -```bash -python stable_diffusion.py --provider cuda --num_images 2 -``` -Inference will loop until the generated image passes the safety checker (otherwise you would see black images). The result will be saved as `result_.png` on disk. - -**_Stable Diffusion XL_** -```bash -python stable_diffusion_xl.py --provider cuda --num_images 2 -``` -The result will be saved as `result_.png` on disk. - -Refer to the corresponding section in the DirectML READMEs for more details on the test inference options: -- [Stable Diffusion](../directml/stable_diffusion/README.md#test-inference) -- [Stable Diffusion XL](../directml/stable_diffusion_xl/README.md#test-inference) - - -## Stable Diffusion Optimization with OpenVINO - -**Contents**: -- [Setup](#setup) -- [Conversion to OpenVINO IR model](#convert-to-openvino-ir-model) -- [Test Inference](#test-inference-with-openvino) - -### Setup - -Olive is currently under pre-release, with constant updates and improvements to the functions and usage. This sample code will be frequently updated as Olive evolves, so it is important to install Olive from source when checking out this code from the main branch. See the [README for examples](https://github.com/microsoft/Olive/blob/main/examples/README.md#important) for detailed instructions on how to do this. - -**Alternatively**, you may install a stable release that we have validated. 
For example: - -``` -# Install Olive from main branch -pip install git+https://github.com/microsoft/Olive#egg=olive-ai[openvino] - -# Clone Olive repo to access sample code -git clone https://github.com/microsoft/olive -``` - -Once you've installed Olive, install the requirements for this sample matching the version of the library you are using: -``` -cd olive/examples/stable_diffusion -pip install -r requirements-ov.txt -``` - -### Convert to OpenVINO IR model - -The easiest way to optimize the pipeline is with the `stable_diffusion.py` helper script: - -``` -python stable_diffusion.py --optimize -``` - -The above command will enumerate the `config_.json` files and optimize each with Olive, then gather the optimized models into a directory structure suitable for testing inference. - -The stable diffusion models are large, and the optimization process is resource intensive. It is recommended to run optimization on a system with a minimum of 16GB of memory (preferably 32GB). Expect optimization to take several minutes (especially the U-Net model). - -Once the script successfully completes: -- The converted OpenVINO IR model will be stored under `models/optimized-openvino/[model_id]` (for example `models/optimized-openvino/runwayml/stable-diffusion-v1-5`). - -Re-running the script with `--optimize` will delete the output models, but it will *not* delete the Olive cache. Subsequent runs will complete much faster since it will simply be copying previously optimized models; you may use the `--clean_cache` option to start from scratch (not typically used unless you are modifying the scripts, for example). - -### Test Inference with OpenVINO - -This sample code is primarily intended to illustrate model optimization with Olive, but it also provides a simple interface for testing inference with the OpenVINO models. Inference is done by creating an `OVStableDiffusionPipeline` from the saved models. - - -``` -python stable_diffusion.py --inference --provider openvino -``` -Inference will loop until the generated image. The result will be saved as `result_.png` on disk. - - -Run `python stable_diffusion.py --help` for additional options. A few particularly relevant ones: -- `--image_path `: the input image path for image to image inference. -- `--img_to_img_example`: image to image example. The default input image is `assets/dog.png`, the default prompt is `amazing watercolor painting`. 
From 861a2b00a8e4098d2e50c665b780524ac240e460 Mon Sep 17 00:00:00 2001 From: sa_ddam213 Date: Fri, 19 Apr 2024 09:35:44 +1200 Subject: [PATCH 4/9] Ignore cache files --- OnnxStack.Converter/latent_consistency/.gitignore | 1 + OnnxStack.Converter/stable_cascade/.gitignore | 1 + 2 files changed, 2 insertions(+) diff --git a/OnnxStack.Converter/latent_consistency/.gitignore b/OnnxStack.Converter/latent_consistency/.gitignore index 324c183..4cf6f30 100644 --- a/OnnxStack.Converter/latent_consistency/.gitignore +++ b/OnnxStack.Converter/latent_consistency/.gitignore @@ -1,2 +1,3 @@ /footprints/ +/cache/ /result_*.png diff --git a/OnnxStack.Converter/stable_cascade/.gitignore b/OnnxStack.Converter/stable_cascade/.gitignore index 324c183..4cf6f30 100644 --- a/OnnxStack.Converter/stable_cascade/.gitignore +++ b/OnnxStack.Converter/stable_cascade/.gitignore @@ -1,2 +1,3 @@ /footprints/ +/cache/ /result_*.png From c1b8fee20009f0793f0a824afaa4c13bb1d7d689 Mon Sep 17 00:00:00 2001 From: sa_ddam213 Date: Fri, 19 Apr 2024 09:37:09 +1200 Subject: [PATCH 5/9] Revert prior model test batch size --- OnnxStack.Converter/stable_cascade/models.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/OnnxStack.Converter/stable_cascade/models.py b/OnnxStack.Converter/stable_cascade/models.py index b64e846..97cc9b5 100644 --- a/OnnxStack.Converter/stable_cascade/models.py +++ b/OnnxStack.Converter/stable_cascade/models.py @@ -5,9 +5,7 @@ import config import torch from typing import Union, Optional, Tuple -from diffusers import AutoencoderKL, StableCascadeUNet, ControlNetModel -from diffusers.models.controlnet import ControlNetOutput, BaseOutput as ControlNetBaseOutput -from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker +from diffusers import AutoencoderKL, StableCascadeUNet from transformers.models.clip.modeling_clip import CLIPTextModelWithProjection from dataclasses import dataclass @@ -89,10 +87,10 @@ def decoder_data_loader(data_dir, batchsize, *args, **kwargs): def prior_inputs(batchsize, torch_dtype, is_conversion_inputs=False): inputs = { "sample": torch.rand((batchsize, 16, 24, 24), dtype=torch_dtype), - "timestep_ratio": torch.rand(((batchsize *2),), dtype=torch_dtype), - "clip_text_pooled": torch.rand(((batchsize *2) , 1, 1280), dtype=torch_dtype), - "clip_text": torch.rand(((batchsize *2) , 77, 1280), dtype=torch_dtype), - "clip_img": torch.rand(((batchsize *2) , 1, 768), dtype=torch_dtype) + "timestep_ratio": torch.rand((batchsize,), dtype=torch_dtype), + "clip_text_pooled": torch.rand((batchsize , 1, 1280), dtype=torch_dtype), + "clip_text": torch.rand((batchsize , 77, 1280), dtype=torch_dtype), + "clip_img": torch.rand((batchsize , 1, 768), dtype=torch_dtype) } # use as kwargs since they won't be in the correct position if passed along with the tuple of inputs From 48c18a4b235b56cdab06cf37128e61a077925574 Mon Sep 17 00:00:00 2001 From: sa_ddam213 Date: Fri, 19 Apr 2024 09:37:58 +1200 Subject: [PATCH 6/9] use onnx opset 16 --- .../stable_cascade/config_decoder.json | 4 ++-- OnnxStack.Converter/stable_cascade/config_prior.json | 4 ++-- .../stable_cascade/config_text_encoder.json | 12 ++---------- OnnxStack.Converter/stable_diffusion/.gitignore | 1 + 4 files changed, 7 insertions(+), 14 deletions(-) diff --git a/OnnxStack.Converter/stable_cascade/config_decoder.json b/OnnxStack.Converter/stable_cascade/config_decoder.json index 82622d0..5ad1586 100644 --- a/OnnxStack.Converter/stable_cascade/config_decoder.json +++ 
b/OnnxStack.Converter/stable_cascade/config_decoder.json @@ -53,7 +53,7 @@ "convert": { "type": "OnnxConversion", "config": { - "target_opset": 14, + "target_opset": 16, "save_as_external_data": true, "all_tensors_to_one_file": true, "external_data_name": "weights.pb" @@ -66,7 +66,7 @@ "opt_level": 0, "float16": true, "use_gpu": true, - "keep_io_types": false, + "keep_io_types": true, "optimization_options": { "enable_gelu": true, "enable_layer_norm": true, diff --git a/OnnxStack.Converter/stable_cascade/config_prior.json b/OnnxStack.Converter/stable_cascade/config_prior.json index 17813f3..4b2fb8e 100644 --- a/OnnxStack.Converter/stable_cascade/config_prior.json +++ b/OnnxStack.Converter/stable_cascade/config_prior.json @@ -54,7 +54,7 @@ "convert": { "type": "OnnxConversion", "config": { - "target_opset": 14, + "target_opset": 16, "save_as_external_data": true, "all_tensors_to_one_file": true, "external_data_name": "weights.pb" @@ -67,7 +67,7 @@ "opt_level": 0, "float16": true, "use_gpu": true, - "keep_io_types": false, + "keep_io_types": true, "optimization_options": { "enable_gelu": true, "enable_layer_norm": true, diff --git a/OnnxStack.Converter/stable_cascade/config_text_encoder.json b/OnnxStack.Converter/stable_cascade/config_text_encoder.json index dd7fdb8..a16f5e5 100644 --- a/OnnxStack.Converter/stable_cascade/config_text_encoder.json +++ b/OnnxStack.Converter/stable_cascade/config_text_encoder.json @@ -48,15 +48,7 @@ "convert": { "type": "OnnxConversion", "config": { - "target_opset": 14 - } - }, - "ov_convert": { - "type": "OpenVINOConversion", - "config": { - "user_script": "models.py", - "example_input_func": "text_encoder_conversion_inputs", - "output_model": "text_encoder" + "target_opset": 16 } }, "optimize": { @@ -66,7 +58,7 @@ "opt_level": 0, "float16": true, "use_gpu": true, - "keep_io_types": false, + "keep_io_types": true, "optimization_options": { "enable_gelu": true, "enable_layer_norm": true, diff --git a/OnnxStack.Converter/stable_diffusion/.gitignore b/OnnxStack.Converter/stable_diffusion/.gitignore index 324c183..4cf6f30 100644 --- a/OnnxStack.Converter/stable_diffusion/.gitignore +++ b/OnnxStack.Converter/stable_diffusion/.gitignore @@ -1,2 +1,3 @@ /footprints/ +/cache/ /result_*.png From 8c76a77dc08896b77fe1282c3e61b1880a278b1e Mon Sep 17 00:00:00 2001 From: sa_ddam213 Date: Fri, 19 Apr 2024 10:50:43 +1200 Subject: [PATCH 7/9] StableCascade image_encoder --- .../stable_cascade/config_image_encoder.json | 113 ++++++++++++++++++ OnnxStack.Converter/stable_cascade/convert.py | 2 +- OnnxStack.Converter/stable_cascade/models.py | 31 ++++- 3 files changed, 143 insertions(+), 3 deletions(-) create mode 100644 OnnxStack.Converter/stable_cascade/config_image_encoder.json diff --git a/OnnxStack.Converter/stable_cascade/config_image_encoder.json b/OnnxStack.Converter/stable_cascade/config_image_encoder.json new file mode 100644 index 0000000..1d723c9 --- /dev/null +++ b/OnnxStack.Converter/stable_cascade/config_image_encoder.json @@ -0,0 +1,113 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "stabilityai/stable-cascade", + "model_loader": "image_encoder_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "sample"], + "output_names": [ "latent_sample" ], + "dynamic_axes": { "sample": { "0": "batch", "1": "channels", "2": "height", "3": "width" } } + }, + "dummy_inputs_func": "image_encoder_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + 
"device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "image_encoder_data_loader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 16 + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "clip", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": true, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "clip", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "image_encoder", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/stable_cascade/convert.py b/OnnxStack.Converter/stable_cascade/convert.py index ff6e159..3010e54 100644 --- a/OnnxStack.Converter/stable_cascade/convert.py +++ b/OnnxStack.Converter/stable_cascade/convert.py @@ -121,7 +121,7 @@ def optimize( model_info = {} - submodel_names = [ "text_encoder", "decoder", "prior"] + submodel_names = [ "text_encoder", "decoder", "prior", "image_encoder"] has_safety_checker = getattr(pipeline, "safety_checker", None) is not None diff --git a/OnnxStack.Converter/stable_cascade/models.py b/OnnxStack.Converter/stable_cascade/models.py index 97cc9b5..e163350 100644 --- a/OnnxStack.Converter/stable_cascade/models.py +++ b/OnnxStack.Converter/stable_cascade/models.py @@ -6,7 +6,7 @@ import torch from typing import Union, Optional, Tuple from diffusers import AutoencoderKL, StableCascadeUNet -from transformers.models.clip.modeling_clip import CLIPTextModelWithProjection +from transformers.models.clip.modeling_clip import CLIPTextModelWithProjection, CLIPVisionModelWithProjection from dataclasses import dataclass # Helper latency-only dataloader that creates random tensors with no label @@ -111,4 +111,31 @@ def prior_conversion_inputs(model=None): def prior_data_loader(data_dir, batchsize, *args, **kwargs): - return RandomDataLoader(prior_inputs, batchsize, torch.float16) \ No newline at end of file + return RandomDataLoader(prior_inputs, batchsize, torch.float16) + + + + + +# ----------------------------------------------------------------------------- +# image_encoder +# ----------------------------------------------------------------------------- + +def 
image_encoder_inputs(batchsize, torch_dtype, is_conversion_inputs=False): + inputs = { + "sample": torch.rand((batchsize, 3, 224, 224), dtype=torch_dtype) + } + return inputs + + +def image_encoder_load(model_name): + model = CLIPVisionModelWithProjection.from_pretrained(model_name, subfolder="image_encoder") + return model + + +def image_encoder_conversion_inputs(model=None): + return tuple(image_encoder_inputs(1, torch.float32, True).values()) + + +def image_encoder_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(image_encoder_inputs, batchsize, torch.float16) \ No newline at end of file From 7979acff75a83e1c5826450393feec3f0634fc32 Mon Sep 17 00:00:00 2001 From: sa_ddam213 Date: Fri, 19 Apr 2024 12:41:07 +1200 Subject: [PATCH 8/9] StableCascade vqgan --- .../stable_cascade/config_vqgan.json | 103 ++++++++++++++++++ OnnxStack.Converter/stable_cascade/models.py | 29 ++++- 2 files changed, 130 insertions(+), 2 deletions(-) create mode 100644 OnnxStack.Converter/stable_cascade/config_vqgan.json diff --git a/OnnxStack.Converter/stable_cascade/config_vqgan.json b/OnnxStack.Converter/stable_cascade/config_vqgan.json new file mode 100644 index 0000000..edf5969 --- /dev/null +++ b/OnnxStack.Converter/stable_cascade/config_vqgan.json @@ -0,0 +1,103 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "stabilityai/stable-cascade", + "model_loader": "vqgan_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "sample", "return_dict" ], + "output_names": [ "latent_sample" ], + "dynamic_axes": { "sample": { "0": "batch", "1": "channels", "2": "height", "3": "width" } } + }, + "dummy_inputs_func": "vqgan_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "vqgan_data_loader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 16 + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "vae", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "vqgan", + "output_dir": "footprints" + } +} diff --git 
a/OnnxStack.Converter/stable_cascade/models.py b/OnnxStack.Converter/stable_cascade/models.py index e163350..249017c 100644 --- a/OnnxStack.Converter/stable_cascade/models.py +++ b/OnnxStack.Converter/stable_cascade/models.py @@ -6,6 +6,7 @@ import torch from typing import Union, Optional, Tuple from diffusers import AutoencoderKL, StableCascadeUNet +from diffusers.pipelines.wuerstchen import PaellaVQModel from transformers.models.clip.modeling_clip import CLIPTextModelWithProjection, CLIPVisionModelWithProjection from dataclasses import dataclass @@ -129,7 +130,7 @@ def image_encoder_inputs(batchsize, torch_dtype, is_conversion_inputs=False): def image_encoder_load(model_name): - model = CLIPVisionModelWithProjection.from_pretrained(model_name, subfolder="image_encoder") + model = CLIPVisionModelWithProjection.from_pretrained(model_name, subfolder="image_encoder", use_safetensors=True) return model @@ -138,4 +139,28 @@ def image_encoder_conversion_inputs(model=None): def image_encoder_data_loader(data_dir, batchsize, *args, **kwargs): - return RandomDataLoader(image_encoder_inputs, batchsize, torch.float16) \ No newline at end of file + return RandomDataLoader(image_encoder_inputs, batchsize, torch.float16) + + +# ----------------------------------------------------------------------------- +# vqgan +# ----------------------------------------------------------------------------- + +def vqgan_inputs(batchsize, torch_dtype, is_conversion_inputs=False): + inputs = { + "sample": torch.rand((batchsize, 3, 256, 256), dtype=torch_dtype) + } + return inputs + + +def vqgan_load(model_name): + model = PaellaVQModel.from_pretrained(model_name, subfolder="vqgan", use_safetensors=True) + return model + + +def vqgan_conversion_inputs(model=None): + return tuple(vqgan_inputs(1, torch.float32, True).values()) + + +def vqgan_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(vqgan_inputs, batchsize, torch.float16) \ No newline at end of file From 06e19de96593600bd1a65fd740c6eb03269f5faa Mon Sep 17 00:00:00 2001 From: sa_ddam213 Date: Fri, 19 Apr 2024 15:03:13 +1200 Subject: [PATCH 9/9] copy models, remove obsolete commands --- OnnxStack.Converter/README.md | 2 +- OnnxStack.Converter/stable_cascade/README.md | 20 +++++ .../stable_cascade/config_decoder.json | 3 +- .../stable_cascade/config_image_encoder.json | 2 +- .../stable_cascade/config_prior.json | 3 +- OnnxStack.Converter/stable_cascade/convert.py | 82 +++-------------- OnnxStack.Converter/stable_cascade/models.py | 16 ++-- .../stable_cascade/sd_utils/ort.py | 88 +++++-------------- 8 files changed, 64 insertions(+), 152 deletions(-) create mode 100644 OnnxStack.Converter/stable_cascade/README.md diff --git a/OnnxStack.Converter/README.md b/OnnxStack.Converter/README.md index 00108ee..5b2a146 100644 --- a/OnnxStack.Converter/README.md +++ b/OnnxStack.Converter/README.md @@ -15,6 +15,6 @@ convert.py --optimize --model_input '..\stable-diffusion-v1-5' --model_output '. 
`--model_input` - Safetensor model to convert -`--model_output` - Output for converted ONNX model +`--model_output` - Output for converted ONNX model (NOTE: This folder is deleted before each run) `--controlnet` - Create a ControlNet enabled Unet model \ No newline at end of file diff --git a/OnnxStack.Converter/stable_cascade/README.md b/OnnxStack.Converter/stable_cascade/README.md new file mode 100644 index 0000000..40d584b --- /dev/null +++ b/OnnxStack.Converter/stable_cascade/README.md @@ -0,0 +1,20 @@ +# OnnxStack.Converter + +## Requirements +```bash +pip install onnxruntime-directml +pip install olive-ai[directml] +python -m pip install -r requirements.txt +``` + +## Usage +```bash +convert.py --optimize --model_input '..\stable-cascade' --model_output '..\converted' +``` +`--optimize` - Run the model optimization + +`--model_input` - Safetensor model to convert + +`--model_output` - Output for converted ONNX model (NOTE: This folder is deleted before each run) + +`--image_encoder` - Convert the optional image encoder diff --git a/OnnxStack.Converter/stable_cascade/config_decoder.json b/OnnxStack.Converter/stable_cascade/config_decoder.json index 5ad1586..6d70698 100644 --- a/OnnxStack.Converter/stable_cascade/config_decoder.json +++ b/OnnxStack.Converter/stable_cascade/config_decoder.json @@ -55,8 +55,7 @@ "config": { "target_opset": 16, "save_as_external_data": true, - "all_tensors_to_one_file": true, - "external_data_name": "weights.pb" + "all_tensors_to_one_file": true } }, "optimize": { diff --git a/OnnxStack.Converter/stable_cascade/config_image_encoder.json b/OnnxStack.Converter/stable_cascade/config_image_encoder.json index 1d723c9..08cfc7e 100644 --- a/OnnxStack.Converter/stable_cascade/config_image_encoder.json +++ b/OnnxStack.Converter/stable_cascade/config_image_encoder.json @@ -7,7 +7,7 @@ "model_script": "models.py", "io_config": { "input_names": [ "sample"], - "output_names": [ "latent_sample" ], + "output_names": [ "image_embeds", "last_hidden_state"], "dynamic_axes": { "sample": { "0": "batch", "1": "channels", "2": "height", "3": "width" } } }, "dummy_inputs_func": "image_encoder_conversion_inputs" diff --git a/OnnxStack.Converter/stable_cascade/config_prior.json b/OnnxStack.Converter/stable_cascade/config_prior.json index 4b2fb8e..373e8a5 100644 --- a/OnnxStack.Converter/stable_cascade/config_prior.json +++ b/OnnxStack.Converter/stable_cascade/config_prior.json @@ -56,8 +56,7 @@ "config": { "target_opset": 16, "save_as_external_data": true, - "all_tensors_to_one_file": true, - "external_data_name": "weights.pb" + "all_tensors_to_one_file": true } }, "optimize": { diff --git a/OnnxStack.Converter/stable_cascade/convert.py b/OnnxStack.Converter/stable_cascade/convert.py index 3010e54..17450bc 100644 --- a/OnnxStack.Converter/stable_cascade/convert.py +++ b/OnnxStack.Converter/stable_cascade/convert.py @@ -93,7 +93,7 @@ def optimize( model_input: str, model_output: Path, provider: str, - controlnet: bool + image_encoder: bool ): from google.protobuf import __version__ as protobuf_version @@ -109,7 +109,6 @@ def optimize( shutil.rmtree(script_dir / "footprints", ignore_errors=True) shutil.rmtree(model_output, ignore_errors=True) - # Load the entire PyTorch pipeline to ensure all models and their configurations are downloaded and cached. # This avoids an issue where the non-ONNX components (tokenizer, scheduler, and feature extractor) are not # automatically cached correctly if individual models are fetched one at a time. 
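# Hedged sketch, not part of convert.py: a quick check that a converted submodel really targets
# opset 16, as set by the earlier "use onnx opset 16" change, without pulling in the external
# weight file. The path assumes the "converted" output directory from the README usage example
# and the per-submodel layout produced by the copy step later in this series.
import onnx

model = onnx.load("converted/unoptimized/prior/model.onnx", load_external_data=False)
print({imp.domain or "ai.onnx": imp.version for imp in model.opset_import})  # default domain should report 16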
@@ -121,15 +120,10 @@ def optimize( model_info = {} - submodel_names = [ "text_encoder", "decoder", "prior", "image_encoder"] - - has_safety_checker = getattr(pipeline, "safety_checker", None) is not None - - if has_safety_checker: - submodel_names.append("safety_checker") + submodel_names = [ "text_encoder", "decoder", "prior", "vqgan"] - if controlnet: - submodel_names.append("controlnet") + if image_encoder: + submodel_names.append("image_encoder") for submodel_name in submodel_names: print(f"\nOptimizing {submodel_name}") @@ -138,14 +132,7 @@ def optimize( with (script_dir / f"config_{submodel_name}.json").open() as fin: olive_config = json.load(fin) olive_config = update_config_with_provider(olive_config, provider) - - if submodel_name in ("unet", "controlnet", "text_encoder"): - olive_config["input_model"]["config"]["model_path"] = model_dir - else: - # Only the unet & text encoder are affected by LoRA, so it's better to use the base model ID for - # other models: the Olive cache is based on the JSON config, and two LoRA variants with the same - # base model ID should be able to reuse previously optimized copies. - olive_config["input_model"]["config"]["model_path"] = model_dir + olive_config["input_model"]["config"]["model_path"] = model_dir run_res = olive_run(olive_config) @@ -156,7 +143,7 @@ def optimize( from sd_utils.ort import save_onnx_pipeline save_onnx_pipeline( - has_safety_checker, model_info, model_output, pipeline, submodel_names + model_info, model_output, pipeline, submodel_names ) return model_info @@ -164,44 +151,14 @@ def optimize( def parse_common_args(raw_args): parser = argparse.ArgumentParser("Common arguments") - parser.add_argument("--model_input", default="stable-diffusion-v1-5", type=str) parser.add_argument("--model_output", default="stable-diffusion-v1-5", type=Path) - parser.add_argument("--controlnet",action="store_true", help="Create ControlNet Unet Model") - parser.add_argument( - "--provider", default="dml", type=str, choices=["dml", "cuda"], help="Execution provider to use" - ) + parser.add_argument("--image_encoder",action="store_true", help="Create image encoder model") + parser.add_argument("--provider", default="dml", type=str, choices=["dml", "cuda"], help="Execution provider to use") parser.add_argument("--optimize", action="store_true", help="Runs the optimization step") parser.add_argument("--clean_cache", action="store_true", help="Deletes the Olive cache") parser.add_argument("--test_unoptimized", action="store_true", help="Use unoptimized model for inference") - parser.add_argument("--batch_size", default=1, type=int, help="Number of images to generate per batch") - parser.add_argument( - "--prompt", - default=( - "castle surrounded by water and nature, village, volumetric lighting, photorealistic, " - "detailed and intricate, fantasy, epic cinematic shot, mountains, 8k ultra hd" - ), - type=str, - ) - parser.add_argument( - "--guidance_scale", - default=7.5, - type=float, - help="Guidance scale as defined in Classifier-Free Diffusion Guidance", - ) - parser.add_argument("--num_images", default=1, type=int, help="Number of images to generate") - parser.add_argument("--num_inference_steps", default=50, type=int, help="Number of steps in diffusion process") parser.add_argument("--tempdir", default=None, type=str, help="Root directory for tempfile directories and files") - parser.add_argument( - "--strength", - default=1.0, - type=float, - help="Value between 0.0 and 1.0, that controls the amount of noise that is added to the input image. 
" - "Values that approach 1.0 enable lots of variations but will also produce images " - "that are not semantically consistent with the input.", - ) - parser.add_argument("--image_size", default=512, type=int, help="Width and height of the images to generate") - return parser.parse_known_args(raw_args) @@ -231,8 +188,6 @@ def main(raw_args=None): if common_args.clean_cache: shutil.rmtree(script_dir / "cache", ignore_errors=True) - guidance_scale = common_args.guidance_scale - ort_args = None, None ort_args, extra_args = parse_ort_args(extra_args) @@ -246,27 +201,10 @@ def main(raw_args=None): from sd_utils.ort import validate_args validate_args(ort_args, common_args.provider) - optimize(common_args.model_input, common_args.model_output, common_args.provider, common_args.controlnet) + optimize(common_args.model_input, common_args.model_output, common_args.provider, common_args.image_encoder) if not common_args.optimize: - model_dir = model_output / "F32" if common_args.test_unoptimized else model_output / "F16" - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - - from sd_utils.ort import get_ort_pipeline - - pipeline = get_ort_pipeline(model_dir, common_args, ort_args, guidance_scale) - run_inference_loop( - pipeline, - common_args.prompt, - common_args.num_images, - common_args.batch_size, - common_args.image_size, - common_args.num_inference_steps, - guidance_scale, - common_args.strength, - provider=provider, - ) + print("TODO: Create OnnxStableCascadePipeline") if __name__ == "__main__": diff --git a/OnnxStack.Converter/stable_cascade/models.py b/OnnxStack.Converter/stable_cascade/models.py index 249017c..c8b15b7 100644 --- a/OnnxStack.Converter/stable_cascade/models.py +++ b/OnnxStack.Converter/stable_cascade/models.py @@ -27,7 +27,6 @@ def __getitem__(self, idx): # TEXT ENCODER # ----------------------------------------------------------------------------- - def text_encoder_inputs(batchsize, torch_dtype): return torch.zeros((batchsize, 77), dtype=torch_dtype) @@ -45,11 +44,12 @@ def text_encoder_data_loader(data_dir, batchsize, *args, **kwargs): return RandomDataLoader(text_encoder_inputs, batchsize, torch.int32) + + # ----------------------------------------------------------------------------- -# decoder +# DECODER UNET # ----------------------------------------------------------------------------- - def decoder_inputs(batchsize, torch_dtype, is_conversion_inputs=False): # TODO(jstoecker): Rename onnx::Concat_4 to text_embeds and onnx::Shape_5 to time_ids inputs = { @@ -81,8 +81,9 @@ def decoder_data_loader(data_dir, batchsize, *args, **kwargs): + # ----------------------------------------------------------------------------- -# prior +# PRIOR UNET # ----------------------------------------------------------------------------- def prior_inputs(batchsize, torch_dtype, is_conversion_inputs=False): @@ -116,10 +117,9 @@ def prior_data_loader(data_dir, batchsize, *args, **kwargs): - # ----------------------------------------------------------------------------- -# image_encoder +# IMAGE ENCODER # ----------------------------------------------------------------------------- def image_encoder_inputs(batchsize, torch_dtype, is_conversion_inputs=False): @@ -142,8 +142,10 @@ def image_encoder_data_loader(data_dir, batchsize, *args, **kwargs): return RandomDataLoader(image_encoder_inputs, batchsize, torch.float16) + + # ----------------------------------------------------------------------------- -# vqgan +# VQGAN # 
 
 def vqgan_inputs(batchsize, torch_dtype, is_conversion_inputs=False):
diff --git a/OnnxStack.Converter/stable_cascade/sd_utils/ort.py b/OnnxStack.Converter/stable_cascade/sd_utils/ort.py
index 5750135..72746f7 100644
--- a/OnnxStack.Converter/stable_cascade/sd_utils/ort.py
+++ b/OnnxStack.Converter/stable_cascade/sd_utils/ort.py
@@ -10,7 +10,7 @@
 from typing import Dict
 
 import onnxruntime as ort
-from diffusers import OnnxRuntimeModel, OnnxStableDiffusionPipeline
+from diffusers import OnnxRuntimeModel, StableCascadePriorPipeline
 from onnxruntime import __version__ as OrtVersion
 from packaging import version
 
@@ -77,9 +77,11 @@ def save_optimized_onnx_submodel(submodel_name, provider, model_info):
     model_info[submodel_name] = {
         "unoptimized": {
             "path": Path(unoptimized_olive_model.model_path),
+            "data": Path(unoptimized_olive_model.model_path + ".data"),
         },
         "optimized": {
             "path": Path(optimized_olive_model.model_path),
+            "data": Path(optimized_olive_model.model_path + ".data"),
         },
     }
 
@@ -88,76 +90,28 @@ def save_optimized_onnx_submodel(submodel_name, provider, model_info):
 
 
 def save_onnx_pipeline(
-    has_safety_checker, model_info, model_output, pipeline, submodel_names
+    model_info, model_output, pipeline, submodel_names
 ):
     # Save the unoptimized models in a directory structure that the diffusers library can load and run.
     # This is optional, and the optimized models can be used directly in a custom pipeline if desired.
-    print("\nCreating ONNX pipeline...")
-
-    optimized_model_dir = model_output / "Optimized"
-    unoptimized_model_dir = model_output / "Default"
-    has_controlnet = 'controlnet' in submodel_names
-    if has_safety_checker:
-        safety_checker = OnnxRuntimeModel.from_pretrained(model_info["safety_checker"]["unoptimized"]["path"].parent)
-    else:
-        safety_checker = None
-
-    text_encoder=OnnxRuntimeModel.from_pretrained(model_info["text_encoder"]["unoptimized"]["path"].parent)
-    decoder=OnnxRuntimeModel.from_pretrained(model_info["text_encoder"]["unoptimized"]["path"].parent)
-    prior=OnnxRuntimeModel.from_pretrained(model_info["text_encoder"]["unoptimized"]["path"].parent)
-
+    # print("\nCreating ONNX pipeline...")
+    # TODO: Create OnnxStableCascadePipeline
 
-    print("Saving unoptimized models...")
-    text_encoder.save_pretrained(unoptimized_model_dir / "text_encoder")
-    decoder.save_pretrained(unoptimized_model_dir/ "decoder")
-    prior.save_pretrained(unoptimized_model_dir/ "prior")
-
-    # Create a copy of the unoptimized model directory, then overwrite with optimized models from the olive cache.
-    print("Copying optimized models...")
-    shutil.copytree(unoptimized_model_dir, optimized_model_dir, ignore=shutil.ignore_patterns("weights.pb"))
-    for submodel_name in submodel_names:
-        src_path = model_info[submodel_name]["optimized"]["path"]
-        dst_path = optimized_model_dir / submodel_name / "model.onnx"
-        exists = os.path.exists(dst_path)
-        if not exists:
-            os.mkdir(optimized_model_dir / submodel_name)
-        shutil.copyfile(src_path, dst_path)
-
-    print(f"The default pipeline is located here: {unoptimized_model_dir}")
-    print(f"The optimized pipeline is located here: {optimized_model_dir}")
-
-
-def get_ort_pipeline(model_dir, common_args, ort_args, guidance_scale):
-    ort.set_default_logger_severity(3)
-
-    print("Loading models into ORT session...")
-    sess_options = ort.SessionOptions()
-    sess_options.enable_mem_pattern = False
-
-    static_dims = not ort_args.dynamic_dims
-    batch_size = common_args.batch_size
-    image_size = common_args.image_size
-    provider = common_args.provider
-
-    if static_dims:
-        hidden_batch_size = batch_size if (guidance_scale == 0.0) else batch_size * 2
-        # Not necessary, but helps DML EP further optimize runtime performance.
-        # batch_size is doubled for sample & hidden state because of classifier free guidance:
-        # https://github.com/huggingface/diffusers/blob/46c52f9b9607e6ecb29c782c052aea313e6487b7/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py#L672
-        sess_options.add_free_dimension_override_by_name("unet_sample_batch", hidden_batch_size)
-        sess_options.add_free_dimension_override_by_name("unet_sample_channels", 4)
-        sess_options.add_free_dimension_override_by_name("unet_sample_height", image_size // 8)
-        sess_options.add_free_dimension_override_by_name("unet_sample_width", image_size // 8)
-        sess_options.add_free_dimension_override_by_name("unet_time_batch", 1)
-        sess_options.add_free_dimension_override_by_name("unet_hidden_batch", hidden_batch_size)
-        sess_options.add_free_dimension_override_by_name("unet_hidden_sequence", 77)
-
-    provider_map = {
-        "dml": "DmlExecutionProvider",
-        "cuda": "CUDAExecutionProvider",
-    }
-    assert provider in provider_map, f"Unsupported provider: {provider}"
-    return OnnxStableDiffusionPipeline.from_pretrained(
-        model_dir, provider=provider_map[provider], sess_options=sess_options
-    )
+    for passType in ["optimized", "unoptimized"]:
+        model_dir = model_output / passType
+        for submodel_name in submodel_names:
+            src_path = model_info[submodel_name][passType]["path"]  # model.onnx
+            src_data_path = model_info[submodel_name][passType]["data"]  # model.onnx.data
+
+            dst_path = model_dir / submodel_name
+            if not os.path.exists(dst_path):
+                os.makedirs(dst_path, exist_ok=True)
+
+            shutil.copyfile(src_path, dst_path / "model.onnx")
+            if os.path.exists(src_data_path):
+                shutil.copyfile(src_data_path, dst_path / "model.onnx.data")
+
+    print(f"The converted model is located here: {model_output}")