diff --git a/OnnxStack.Converter/README.md b/OnnxStack.Converter/README.md new file mode 100644 index 0000000..5b2a146 --- /dev/null +++ b/OnnxStack.Converter/README.md @@ -0,0 +1,20 @@ +# OnnxStack.Converter + +## Requirements +```bash +pip install onnxruntime-directml +pip install olive-ai[directml] +python -m pip install -r requirements.txt +``` + +## Usage +```bash +convert.py --optimize --model_input '..\stable-diffusion-v1-5' --model_output '..\converted' --controlnet +``` +`--optimize` - Run the model optimization + +`--model_input` - Safetensor model to convert + +`--model_output` - Output for converted ONNX model (NOTE: This folder is deleted before each run) + +`--controlnet` - Create a ControlNet enabled Unet model \ No newline at end of file diff --git a/OnnxStack.Converter/latent_consistency/.gitignore b/OnnxStack.Converter/latent_consistency/.gitignore new file mode 100644 index 0000000..4cf6f30 --- /dev/null +++ b/OnnxStack.Converter/latent_consistency/.gitignore @@ -0,0 +1,3 @@ +/footprints/ +/cache/ +/result_*.png diff --git a/OnnxStack.Converter/latent_consistency/config.py b/OnnxStack.Converter/latent_consistency/config.py new file mode 100644 index 0000000..1806391 --- /dev/null +++ b/OnnxStack.Converter/latent_consistency/config.py @@ -0,0 +1,8 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +vae_sample_size = 768 +unet_sample_size = 96 +cross_attention_dim = 768 diff --git a/OnnxStack.Converter/latent_consistency/config_controlnet.json b/OnnxStack.Converter/latent_consistency/config_controlnet.json new file mode 100644 index 0000000..0d9331f --- /dev/null +++ b/OnnxStack.Converter/latent_consistency/config_controlnet.json @@ -0,0 +1,124 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "SimianLuo/LCM_Dreamshaper_v7", + "model_loader": "controlnet_unet_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "sample", "timestep", "encoder_hidden_states", "down_block_0_additional_residual", "down_block_1_additional_residual", "down_block_2_additional_residual", "down_block_3_additional_residual", "down_block_4_additional_residual", "down_block_5_additional_residual", "down_block_6_additional_residual", "down_block_7_additional_residual", "down_block_8_additional_residual", "down_block_9_additional_residual", "down_block_10_additional_residual", "down_block_11_additional_residual", "mid_block_additional_residual", "return_dict" ], + "output_names": [ "out_sample" ], + "dynamic_axes": { + "sample": {"0": "unet_sample_batch", "1": "unet_sample_channels", "2": "unet_sample_height", "3": "unet_sample_width"}, + "timestep": {"0": "unet_time_batch"}, + "encoder_hidden_states": {"0": "unet_hidden_batch", "1": "unet_hidden_sequence"}, + "timestep_cond": { "0": "batch_size" }, + "down_block_0_additional_residual": {"0": "cnet_db0_batch", "1": "cnet_db0_channels", "2": "cnet_db0_height", "3": "cnet_db0_width"}, + "down_block_1_additional_residual": {"0": "cnet_db1_batch", "1": "cnet_db1_channels", "2": "cnet_db1_height", "3": "cnet_db1_width"}, + "down_block_2_additional_residual": {"0": "cnet_db2_batch", "1": "cnet_db2_channels", "2": "cnet_db2_height", "3": "cnet_db2_width"}, + "down_block_3_additional_residual": {"0": "cnet_db3_batch", "1": "cnet_db3_channels", "2": "cnet_db3_height2", "3": "cnet_db3_width2"}, + 
"down_block_4_additional_residual": {"0": "cnet_db4_batch", "1": "cnet_db4_channels", "2": "cnet_db4_height2", "3": "cnet_db4_width2"}, + "down_block_5_additional_residual": {"0": "cnet_db5_batch", "1": "cnet_db5_channels", "2": "cnet_db5_height2", "3": "cnet_db5_width2"}, + "down_block_6_additional_residual": {"0": "cnet_db6_batch", "1": "cnet_db6_channels", "2": "cnet_db6_height4", "3": "cnet_db6_width4"}, + "down_block_7_additional_residual": {"0": "cnet_db7_batch", "1": "cnet_db7_channels", "2": "cnet_db7_height4", "3": "cnet_db7_width4"}, + "down_block_8_additional_residual": {"0": "cnet_db8_batch", "1": "cnet_db8_channels", "2": "cnet_db8_height4", "3": "cnet_db8_width4"}, + "down_block_9_additional_residual": {"0": "cnet_db9_batch", "1": "cnet_db9_channels", "2": "cnet_db9_height8", "3": "cnet_db9_width8"}, + "down_block_10_additional_residual": {"0": "cnet_db10_batch", "1": "cnet_db10_channels", "2": "cnet_db10_height8", "3": "cnet_db10_width8"}, + "down_block_11_additional_residual": {"0": "cnet_db11_batch", "1": "cnet_db11_channels", "2": "cnet_db11_height8", "3": "cnet_db11_width8"}, + "mid_block_additional_residual": {"0": "cnet_mbar_batch", "1": "cnet_mbar_channels", "2": "cnet_mbar_height8", "3": "cnet_mbar_width8"} + } + }, + "dummy_inputs_func": "controlnet_unet_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "controlnet_unet_data_loader", + "batch_size": 2 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 14, + "save_as_external_data": true, + "all_tensors_to_one_file": true, + "external_data_name": "weights.pb" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "controlnet", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/latent_consistency/config_safety_checker.json b/OnnxStack.Converter/latent_consistency/config_safety_checker.json new file mode 100644 index 0000000..bef935f --- /dev/null +++ b/OnnxStack.Converter/latent_consistency/config_safety_checker.json @@ -0,0 +1,124 @@ +{ + "input_model": { + 
"type": "PyTorchModel", + "config": { + "model_path": "SimianLuo/LCM_Dreamshaper_v7", + "model_loader": "safety_checker_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "clip_input", "images" ], + "output_names": [ "out_images", "has_nsfw_concepts" ], + "dynamic_axes": { + "clip_input": { "0": "batch", "1": "channels", "2": "height", "3": "width" }, + "images": { "0": "batch", "1": "height", "2": "width", "3": "channels" } + } + }, + "dummy_inputs_func": "safety_checker_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "safety_checker_data_loader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 14 + } + }, + "ov_convert": { + "type": "OpenVINOConversion", + "config": { + "user_script": "models.py", + "example_input_func": "safety_checker_conversion_inputs", + "output_model": "safety_checker" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "safety_checker", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/latent_consistency/config_text_encoder.json b/OnnxStack.Converter/latent_consistency/config_text_encoder.json new file mode 100644 index 0000000..0a1c5de --- /dev/null +++ b/OnnxStack.Converter/latent_consistency/config_text_encoder.json @@ -0,0 +1,121 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "SimianLuo/LCM_Dreamshaper_v7", + "model_loader": "text_encoder_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "input_ids" ], + "output_names": [ "last_hidden_state", "pooler_output" ], + "dynamic_axes": { "input_ids": { "0": "batch", "1": "sequence" } } + }, + "dummy_inputs_func": "text_encoder_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": 
"gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "text_encoder_data_loader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 14 + } + }, + "ov_convert": { + "type": "OpenVINOConversion", + "config": { + "user_script": "models.py", + "example_input_func": "text_encoder_conversion_inputs", + "output_model": "text_encoder" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "clip", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "clip", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "text_encoder", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/latent_consistency/config_unet.json b/OnnxStack.Converter/latent_consistency/config_unet.json new file mode 100644 index 0000000..1c3b983 --- /dev/null +++ b/OnnxStack.Converter/latent_consistency/config_unet.json @@ -0,0 +1,129 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "SimianLuo/LCM_Dreamshaper_v7", + "model_loader": "unet_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "sample", "timestep", "encoder_hidden_states", "timestep_cond", "return_dict" ], + "output_names": [ "out_sample" ], + "dynamic_axes": { + "sample": {"0": "unet_sample_batch", "1": "unet_sample_channels", "2": "unet_sample_height", "3": "unet_sample_width"}, + "timestep": {"0": "unet_time_batch"}, + "encoder_hidden_states": {"0": "unet_hidden_batch", "1": "unet_hidden_sequence"}, + "timestep_cond": { "0": "batch_size" } + } + }, + "dummy_inputs_func": "unet_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "unet_data_loader", + "batch_size": 2 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", 
+ "config": { + "target_opset": 14, + "save_as_external_data": true, + "all_tensors_to_one_file": true, + "external_data_name": "weights.pb" + } + }, + "ov_convert": { + "type": "OpenVINOConversion", + "config": { + "user_script": "models.py", + "example_input_func": "get_unet_ov_example_input", + "output_model": "unet" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "unet", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/latent_consistency/config_vae_decoder.json b/OnnxStack.Converter/latent_consistency/config_vae_decoder.json new file mode 100644 index 0000000..755ab9a --- /dev/null +++ b/OnnxStack.Converter/latent_consistency/config_vae_decoder.json @@ -0,0 +1,121 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "SimianLuo/LCM_Dreamshaper_v7", + "model_loader": "vae_decoder_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "latent_sample", "return_dict" ], + "output_names": [ "sample" ], + "dynamic_axes": { "latent_sample": { "0": "batch", "1": "channels", "2": "height", "3": "width" } } + }, + "dummy_inputs_func": "vae_decoder_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "vae_decoder_data_loader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 14 + } + }, + "ov_convert": { + "type": "OpenVINOConversion", + "config": { + "user_script": "models.py", + "example_input_func": "vae_decoder_conversion_inputs", + "output_model": "vae_decoder" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "vae", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + 
"use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "vae", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "vae_decoder", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/latent_consistency/config_vae_encoder.json b/OnnxStack.Converter/latent_consistency/config_vae_encoder.json new file mode 100644 index 0000000..7a664ea --- /dev/null +++ b/OnnxStack.Converter/latent_consistency/config_vae_encoder.json @@ -0,0 +1,121 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "SimianLuo/LCM_Dreamshaper_v7", + "model_loader": "vae_encoder_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "sample", "return_dict" ], + "output_names": [ "latent_sample" ], + "dynamic_axes": { "sample": { "0": "batch", "1": "channels", "2": "height", "3": "width" } } + }, + "dummy_inputs_func": "vae_encoder_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "vae_encoder_data_loader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 14 + } + }, + "ov_convert": { + "type": "OpenVINOConversion", + "config": { + "user_script": "models.py", + "example_input_func": "vae_encoder_conversion_inputs", + "output_model": "vae_encoder" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "vae", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + 
"optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "vae", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "vae_encoder", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/latent_consistency/convert.py b/OnnxStack.Converter/latent_consistency/convert.py new file mode 100644 index 0000000..2c476a0 --- /dev/null +++ b/OnnxStack.Converter/latent_consistency/convert.py @@ -0,0 +1,272 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +import argparse +import json +import shutil +import sys +import warnings +from pathlib import Path +from typing import Dict + +import config +import torch +from diffusers import DiffusionPipeline +from packaging import version + +from olive.common.utils import set_tempdir +from olive.workflows import run as olive_run + +# pylint: disable=redefined-outer-name +# ruff: noqa: TID252, T201 + + +def save_image(result, batch_size, provider, num_images, images_saved, image_callback=None): + passed_safety_checker = 0 + for image_index in range(batch_size): + if result.nsfw_content_detected is None or not result.nsfw_content_detected[image_index]: + passed_safety_checker += 1 + if images_saved < num_images: + output_path = f"result_{images_saved}.png" + result.images[image_index].save(output_path) + if image_callback: + image_callback(images_saved, output_path) + images_saved += 1 + print(f"Generated {output_path}") + print(f"Inference Batch End ({passed_safety_checker}/{batch_size} images).") + print("Images passed the safety checker.") + return images_saved + + +def run_inference_loop( + pipeline, + prompt, + num_images, + batch_size, + image_size, + num_inference_steps, + guidance_scale, + strength: float, + provider: str, + image_callback=None, + step_callback=None, +): + images_saved = 0 + + def update_steps(step, timestep, latents): + if step_callback: + step_callback((images_saved // batch_size) * num_inference_steps + step) + + while images_saved < num_images: + print(f"\nInference Batch Start (batch size = {batch_size}).") + + kwargs = {} + + result = pipeline( + [prompt] * batch_size, + num_inference_steps=num_inference_steps, + callback=update_steps if step_callback else None, + height=image_size, + width=image_size, + guidance_scale=guidance_scale, + **kwargs, + ) + + images_saved = save_image(result, batch_size, provider, num_images, images_saved, image_callback) + + +def update_config_with_provider(config: Dict, provider: str): + if provider == "dml": + # DirectML EP is the default, so no need to update config. 
+ return config + elif provider == "cuda": + from sd_utils.ort import update_cuda_config + + return update_cuda_config(config) + else: + raise ValueError(f"Unsupported provider: {provider}") + + +def optimize( + model_input: str, + model_output: Path, + provider: str, + controlnet: bool +): + from google.protobuf import __version__ as protobuf_version + + # protobuf 4.x aborts with OOM when optimizing unet + if version.parse(protobuf_version) > version.parse("3.20.3"): + print("This script requires protobuf 3.20.3. Please ensure your package version matches requirements.txt.") + sys.exit(1) + + model_dir = model_input + script_dir = Path(__file__).resolve().parent + + # Clean up previously optimized models, if any. + shutil.rmtree(script_dir / "footprints", ignore_errors=True) + shutil.rmtree(model_output, ignore_errors=True) + + + # Load the entire PyTorch pipeline to ensure all models and their configurations are downloaded and cached. + # This avoids an issue where the non-ONNX components (tokenizer, scheduler, and feature extractor) are not + # automatically cached correctly if individual models are fetched one at a time. + print("Download stable diffusion PyTorch pipeline...") + pipeline = DiffusionPipeline.from_pretrained(model_dir, torch_dtype=torch.float32, **{"local_files_only": True}) + config.vae_sample_size = pipeline.vae.config.sample_size + config.cross_attention_dim = pipeline.unet.config.cross_attention_dim + config.unet_sample_size = pipeline.unet.config.sample_size + + model_info = {} + + submodel_names = ["vae_encoder", "vae_decoder", "unet" , "text_encoder"] + + has_safety_checker = getattr(pipeline, "safety_checker", None) is not None + + if has_safety_checker: + submodel_names.append("safety_checker") + + if controlnet: + submodel_names.append("controlnet") + + for submodel_name in submodel_names: + print(f"\nOptimizing {submodel_name}") + + olive_config = None + with (script_dir / f"config_{submodel_name}.json").open() as fin: + olive_config = json.load(fin) + olive_config = update_config_with_provider(olive_config, provider) + + if submodel_name in ("unet", "controlnet", "text_encoder"): + olive_config["input_model"]["config"]["model_path"] = model_dir + else: + # Only the unet & text encoder are affected by LoRA, so it's better to use the base model ID for + # other models: the Olive cache is based on the JSON config, and two LoRA variants with the same + # base model ID should be able to reuse previously optimized copies. 
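+            # NOTE: as written, this branch also points at the local model_dir; substitute the base model ID
+            # here if the Olive cache reuse across LoRA variants described above is wanted.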
+ olive_config["input_model"]["config"]["model_path"] = model_dir + + run_res = olive_run(olive_config) + + from sd_utils.ort import save_optimized_onnx_submodel + + save_optimized_onnx_submodel(submodel_name, provider, model_info) + + from sd_utils.ort import save_onnx_pipeline + + save_onnx_pipeline( + has_safety_checker, model_info, model_output, pipeline, submodel_names + ) + + return model_info + + +def parse_common_args(raw_args): + parser = argparse.ArgumentParser("Common arguments") + + parser.add_argument("--model_input", default="stable-diffusion-v1-5", type=str) + parser.add_argument("--model_output", default="stable-diffusion-v1-5", type=Path) + parser.add_argument("--controlnet",action="store_true", help="Create ControlNet Unet Model") + parser.add_argument( + "--provider", default="dml", type=str, choices=["dml", "cuda"], help="Execution provider to use" + ) + parser.add_argument("--optimize", action="store_true", help="Runs the optimization step") + parser.add_argument("--clean_cache", action="store_true", help="Deletes the Olive cache") + parser.add_argument("--test_unoptimized", action="store_true", help="Use unoptimized model for inference") + parser.add_argument("--batch_size", default=1, type=int, help="Number of images to generate per batch") + parser.add_argument( + "--prompt", + default=( + "castle surrounded by water and nature, village, volumetric lighting, photorealistic, " + "detailed and intricate, fantasy, epic cinematic shot, mountains, 8k ultra hd" + ), + type=str, + ) + parser.add_argument( + "--guidance_scale", + default=7.5, + type=float, + help="Guidance scale as defined in Classifier-Free Diffusion Guidance", + ) + parser.add_argument("--num_images", default=1, type=int, help="Number of images to generate") + parser.add_argument("--num_inference_steps", default=50, type=int, help="Number of steps in diffusion process") + parser.add_argument("--tempdir", default=None, type=str, help="Root directory for tempfile directories and files") + parser.add_argument( + "--strength", + default=1.0, + type=float, + help="Value between 0.0 and 1.0, that controls the amount of noise that is added to the input image. " + "Values that approach 1.0 enable lots of variations but will also produce images " + "that are not semantically consistent with the input.", + ) + parser.add_argument("--image_size", default=512, type=int, help="Width and height of the images to generate") + + return parser.parse_known_args(raw_args) + + +def parse_ort_args(raw_args): + parser = argparse.ArgumentParser("ONNX Runtime arguments") + + parser.add_argument( + "--static_dims", + action="store_true", + help="DEPRECATED (now enabled by default). 
Use --dynamic_dims to disable static_dims.", + ) + parser.add_argument("--dynamic_dims", action="store_true", help="Disable static shape optimization") + + return parser.parse_known_args(raw_args) + + +def main(raw_args=None): + common_args, extra_args = parse_common_args(raw_args) + + provider = common_args.provider + model_input = common_args.model_input + model_output = common_args.model_output + + script_dir = Path(__file__).resolve().parent + + + if common_args.clean_cache: + shutil.rmtree(script_dir / "cache", ignore_errors=True) + + guidance_scale = common_args.guidance_scale + + ort_args = None, None + ort_args, extra_args = parse_ort_args(extra_args) + + if common_args.optimize or not model_output.exists(): + set_tempdir(common_args.tempdir) + + # TODO(jstoecker): clean up warning filter (mostly during conversion from torch to ONNX) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + + from sd_utils.ort import validate_args + + validate_args(ort_args, common_args.provider) + optimize(common_args.model_input, common_args.model_output, common_args.provider, common_args.controlnet) + + if not common_args.optimize: + model_dir = model_output / "F32" if common_args.test_unoptimized else model_output / "F16" + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + + from sd_utils.ort import get_ort_pipeline + + pipeline = get_ort_pipeline(model_dir, common_args, ort_args, guidance_scale) + run_inference_loop( + pipeline, + common_args.prompt, + common_args.num_images, + common_args.batch_size, + common_args.image_size, + common_args.num_inference_steps, + guidance_scale, + common_args.strength, + provider=provider, + ) + + +if __name__ == "__main__": + main() diff --git a/OnnxStack.Converter/latent_consistency/models.py b/OnnxStack.Converter/latent_consistency/models.py new file mode 100644 index 0000000..8b3de3f --- /dev/null +++ b/OnnxStack.Converter/latent_consistency/models.py @@ -0,0 +1,336 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- +import config +import torch +from typing import Union, Optional, Tuple +from diffusers import AutoencoderKL, UNet2DConditionModel, ControlNetModel +from diffusers.models.controlnet import ControlNetOutput, BaseOutput as ControlNetBaseOutput +from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker +from transformers.models.clip.modeling_clip import CLIPTextModel +from dataclasses import dataclass + +# Helper latency-only dataloader that creates random tensors with no label +class RandomDataLoader: + def __init__(self, create_inputs_func, batchsize, torch_dtype): + self.create_input_func = create_inputs_func + self.batchsize = batchsize + self.torch_dtype = torch_dtype + + def __getitem__(self, idx): + label = None + return self.create_input_func(self.batchsize, self.torch_dtype), label + + + +# ----------------------------------------------------------------------------- +# TEXT ENCODER +# ----------------------------------------------------------------------------- + + +def text_encoder_inputs(batchsize, torch_dtype): + return torch.zeros((batchsize, 77), dtype=torch_dtype) + + +def text_encoder_load(model_name): + model = CLIPTextModel.from_pretrained(model_name, subfolder="text_encoder") + if is_lora_model(model_name): + merge_lora_weights(model, model_name, "text_encoder") + return model + + +def text_encoder_conversion_inputs(model=None): + return text_encoder_inputs(1, torch.int32) + + +def text_encoder_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(text_encoder_inputs, batchsize, torch.int32) + + +# ----------------------------------------------------------------------------- +# UNET +# ----------------------------------------------------------------------------- + + +def unet_inputs(batchsize, torch_dtype, is_conversion_inputs=False): + # TODO(jstoecker): Rename onnx::Concat_4 to text_embeds and onnx::Shape_5 to time_ids + inputs = { + "sample": torch.rand((batchsize, 4, config.unet_sample_size, config.unet_sample_size), dtype=torch_dtype), + "timestep": torch.rand((batchsize,), dtype=torch_dtype), + "encoder_hidden_states": torch.rand((batchsize, 77, config.cross_attention_dim), dtype=torch_dtype), + } + + # use as kwargs since they won't be in the correct position if passed along with the tuple of inputs + kwargs = { + "timestep_cond": torch.rand((batchsize, 256), dtype=torch_dtype), + "return_dict": False, + } + if is_conversion_inputs: + inputs["additional_inputs"] = { + **kwargs, + "added_cond_kwargs": { + "text_embeds": torch.rand((1, 1280), dtype=torch_dtype), + "time_ids": torch.rand((1, 5), dtype=torch_dtype), + }, + } + else: + inputs.update(kwargs) + inputs["onnx::Concat_4"] = torch.rand((1, 1280), dtype=torch_dtype) + inputs["onnx::Shape_5"] = torch.rand((1, 5), dtype=torch_dtype) + + return inputs + + +def unet_load(model_name): + model = UNet2DConditionModel.from_pretrained(model_name, subfolder="unet") + if is_lora_model(model_name): + merge_lora_weights(model, model_name, "unet") + return model + + +def unet_conversion_inputs(model=None): + return tuple(unet_inputs(1, torch.float32, True).values()) + + +def unet_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(unet_inputs, batchsize, torch.float16) + +# ----------------------------------------------------------------------------- +# CONTROLNET - UNET +# ----------------------------------------------------------------------------- + +class 
PatchedUNet2DConditionModel(UNet2DConditionModel): + def forward( + self, + sample: torch.FloatTensor, + timestep: Union[torch.Tensor, float, int], + encoder_hidden_states: torch.Tensor, + timestep_cond: torch.Tensor, + down_block_0_additional_residual: torch.Tensor, + down_block_1_additional_residual: torch.Tensor, + down_block_2_additional_residual: torch.Tensor, + down_block_3_additional_residual: torch.Tensor, + down_block_4_additional_residual: torch.Tensor, + down_block_5_additional_residual: torch.Tensor, + down_block_6_additional_residual: torch.Tensor, + down_block_7_additional_residual: torch.Tensor, + down_block_8_additional_residual: torch.Tensor, + down_block_9_additional_residual: torch.Tensor, + down_block_10_additional_residual: torch.Tensor, + down_block_11_additional_residual: torch.Tensor, + mid_block_additional_residual: torch.Tensor, + ) -> Union[UNet2DConditionModel, Tuple]: + down_block_add_res = ( + down_block_0_additional_residual, down_block_1_additional_residual, down_block_2_additional_residual, + down_block_3_additional_residual, down_block_4_additional_residual, down_block_5_additional_residual, + down_block_6_additional_residual, down_block_7_additional_residual, down_block_8_additional_residual, + down_block_9_additional_residual, down_block_10_additional_residual, down_block_11_additional_residual) + return super().forward( + sample = sample, + timestep = timestep, + encoder_hidden_states = encoder_hidden_states, + timestep_cond = timestep_cond, + down_block_additional_residuals = down_block_add_res, + mid_block_additional_residual = mid_block_additional_residual, + return_dict = False + ) + +def controlnet_unet_inputs(batchsize, torch_dtype): + return { + "sample": torch.rand((batchsize, 4, config.unet_sample_size, config.unet_sample_size), dtype=torch_dtype), + "timestep": torch.rand((batchsize,), dtype=torch_dtype), + "encoder_hidden_states": torch.rand((batchsize, 77, config.cross_attention_dim), dtype=torch_dtype), + "timestep_cond": torch.rand((batchsize, 256), dtype=torch_dtype), + "down_block_0_additional_residual": torch.rand((batchsize, 320, config.unet_sample_size, config.unet_sample_size), dtype=torch_dtype), + "down_block_1_additional_residual": torch.rand((batchsize, 320, config.unet_sample_size, config.unet_sample_size), dtype=torch_dtype), + "down_block_2_additional_residual": torch.rand((batchsize, 320, config.unet_sample_size, config.unet_sample_size), dtype=torch_dtype), + "down_block_3_additional_residual": torch.rand((batchsize, 320, config.unet_sample_size // 2, config.unet_sample_size // 2), dtype=torch_dtype), + "down_block_4_additional_residual": torch.rand((batchsize, 640, config.unet_sample_size // 2, config.unet_sample_size // 2), dtype=torch_dtype), + "down_block_5_additional_residual": torch.rand((batchsize, 640, config.unet_sample_size // 2, config.unet_sample_size // 2), dtype=torch_dtype), + "down_block_6_additional_residual": torch.rand((batchsize, 640, config.unet_sample_size // 4, config.unet_sample_size // 4), dtype=torch_dtype), + "down_block_7_additional_residual": torch.rand((batchsize, 1280, config.unet_sample_size // 4, config.unet_sample_size // 4), dtype=torch_dtype), + "down_block_8_additional_residual": torch.rand((batchsize, 1280, config.unet_sample_size // 4, config.unet_sample_size // 4), dtype=torch_dtype), + "down_block_9_additional_residual": torch.rand((batchsize, 1280, config.unet_sample_size // 8, config.unet_sample_size // 8), dtype=torch_dtype), + "down_block_10_additional_residual": 
torch.rand((batchsize, 1280, config.unet_sample_size // 8, config.unet_sample_size // 8), dtype=torch_dtype), + "down_block_11_additional_residual": torch.rand((batchsize, 1280, config.unet_sample_size // 8, config.unet_sample_size // 8), dtype=torch_dtype), + "mid_block_additional_residual": torch.rand((batchsize, 1280, config.unet_sample_size // 8, config.unet_sample_size // 8), dtype=torch_dtype) + } + + +def controlnet_unet_load(model_name): + model = PatchedUNet2DConditionModel.from_pretrained(model_name, subfolder="unet") + return model + + +def controlnet_unet_conversion_inputs(model): + return tuple(controlnet_unet_inputs(1, torch.float32).values()) + + +def controlnet_unet_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(controlnet_unet_inputs, batchsize, torch.float16) + +# ----------------------------------------------------------------------------- +# VAE ENCODER +# ----------------------------------------------------------------------------- + + +def vae_encoder_inputs(batchsize, torch_dtype): + return {"sample": torch.rand((batchsize, 3, config.vae_sample_size, config.vae_sample_size), dtype=torch_dtype)} + + +def vae_encoder_load(model_name): + model = AutoencoderKL.from_pretrained(model_name, subfolder="vae") + model.forward = lambda sample: model.encode(sample)[0].sample() + return model + + +def vae_encoder_conversion_inputs(model=None): + return tuple(vae_encoder_inputs(1, torch.float32).values()) + + +def vae_encoder_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(vae_encoder_inputs, batchsize, torch.float16) + + +# ----------------------------------------------------------------------------- +# VAE DECODER +# ----------------------------------------------------------------------------- + + +def vae_decoder_inputs(batchsize, torch_dtype): + return { + "latent_sample": torch.rand((batchsize, 4, config.unet_sample_size, config.unet_sample_size), dtype=torch_dtype) + } + + +def vae_decoder_load(model_name): + model = AutoencoderKL.from_pretrained(model_name, subfolder="vae") + model.forward = model.decode + return model + + +def vae_decoder_conversion_inputs(model=None): + return tuple(vae_decoder_inputs(1, torch.float32).values()) + + +def vae_decoder_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(vae_decoder_inputs, batchsize, torch.float16) + + +# ----------------------------------------------------------------------------- +# SAFETY CHECKER +# ----------------------------------------------------------------------------- + + +def safety_checker_inputs(batchsize, torch_dtype): + return { + "clip_input": torch.rand((batchsize, 3, 224, 224), dtype=torch_dtype), + "images": torch.rand((batchsize, config.vae_sample_size, config.vae_sample_size, 3), dtype=torch_dtype), + } + + +def safety_checker_load(model_name): + model = StableDiffusionSafetyChecker.from_pretrained(model_name, subfolder="safety_checker") + model.forward = model.forward_onnx + return model + + +def safety_checker_conversion_inputs(model=None): + return tuple(safety_checker_inputs(1, torch.float32).values()) + + +def safety_checker_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(safety_checker_inputs, batchsize, torch.float16) + + +# ----------------------------------------------------------------------------- +# LoRA weights +# ----------------------------------------------------------------------------- + +def is_lora_model(model_name): + # TODO(jstoecker): might be a better way to detect 
(e.g. presence of LORA weights file) + return False + + +# Merges LoRA weights into the layers of a base model +def merge_lora_weights(base_model, lora_model_id, submodel_name="unet", scale=1.0): + import inspect + from collections import defaultdict + from functools import reduce + + try: + from diffusers.loaders import LORA_WEIGHT_NAME + except ImportError: + # moved in version 0.24.0 + from diffusers.loaders.lora import LORA_WEIGHT_NAME + from diffusers.models.attention_processor import LoRAAttnProcessor + from diffusers.utils.hub_utils import _get_model_file + + parameters = inspect.signature(_get_model_file).parameters + + kwargs = {} + if "use_auth_token" in parameters: + kwargs["use_auth_token"] = None + elif "token" in parameters: + kwargs["token"] = None + + # Load LoRA weights + model_file = _get_model_file( + lora_model_id, + weights_name=LORA_WEIGHT_NAME, + cache_dir=None, + force_download=False, + resume_download=False, + proxies=None, + local_files_only=False, + revision=None, + subfolder=None, + user_agent={ + "file_type": "attn_procs_weights", + "framework": "pytorch", + }, + **kwargs, + ) + lora_state_dict = torch.load(model_file, map_location="cpu") + + # All keys in the LoRA state dictionary should have 'lora' somewhere in the string. + keys = list(lora_state_dict.keys()) + assert all("lora" in k for k in keys) + + if all(key.startswith(submodel_name) for key in keys): + # New format (https://github.com/huggingface/diffusers/pull/2918) supports LoRA weights in both the + # unet and text encoder where keys are prefixed with 'unet' or 'text_encoder', respectively. + submodel_state_dict = {k: v for k, v in lora_state_dict.items() if k.startswith(submodel_name)} + else: + # Old format. Keys will not have any prefix. This only applies to unet, so exit early if this is + # optimizing the text encoder. 
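+        # Old-format files carry no weights for other submodels, so the early return below leaves their
+        # base weights untouched.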
+ if submodel_name != "unet": + return + submodel_state_dict = lora_state_dict + + # Group LoRA weights into attention processors + attn_processors = {} + lora_grouped_dict = defaultdict(dict) + for key, value in submodel_state_dict.items(): + attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:]) + lora_grouped_dict[attn_processor_key][sub_key] = value + + for key, value_dict in lora_grouped_dict.items(): + rank = value_dict["to_k_lora.down.weight"].shape[0] + cross_attention_dim = value_dict["to_k_lora.down.weight"].shape[1] + hidden_size = value_dict["to_k_lora.up.weight"].shape[0] + + attn_processors[key] = LoRAAttnProcessor( + hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, rank=rank + ) + attn_processors[key].load_state_dict(value_dict) + + # Merge LoRA attention processor weights into existing Q/K/V/Out weights + for name, proc in attn_processors.items(): + attention_name = name[: -len(".processor")] + attention = reduce(getattr, attention_name.split(sep="."), base_model) + attention.to_q.weight.data += scale * torch.mm(proc.to_q_lora.up.weight, proc.to_q_lora.down.weight) + attention.to_k.weight.data += scale * torch.mm(proc.to_k_lora.up.weight, proc.to_k_lora.down.weight) + attention.to_v.weight.data += scale * torch.mm(proc.to_v_lora.up.weight, proc.to_v_lora.down.weight) + attention.to_out[0].weight.data += scale * torch.mm(proc.to_out_lora.up.weight, proc.to_out_lora.down.weight) diff --git a/OnnxStack.Converter/latent_consistency/requirements.txt b/OnnxStack.Converter/latent_consistency/requirements.txt new file mode 100644 index 0000000..15b9198 --- /dev/null +++ b/OnnxStack.Converter/latent_consistency/requirements.txt @@ -0,0 +1,9 @@ +accelerate +diffusers +onnx +pillow +protobuf==3.20.3 # protobuf 4.x aborts with OOM when optimizing unet +tabulate +torch +transformers +onnxruntime-directml>=1.16.0 diff --git a/OnnxStack.Converter/latent_consistency/sd_utils/ort.py b/OnnxStack.Converter/latent_consistency/sd_utils/ort.py new file mode 100644 index 0000000..ad49818 --- /dev/null +++ b/OnnxStack.Converter/latent_consistency/sd_utils/ort.py @@ -0,0 +1,172 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +import os +import json +import shutil +import sys +from pathlib import Path +from typing import Dict + +import onnxruntime as ort +from diffusers import OnnxRuntimeModel, OnnxStableDiffusionPipeline +from onnxruntime import __version__ as OrtVersion +from packaging import version + +from olive.model import ONNXModelHandler + +# ruff: noqa: TID252, T201 + + +def update_cuda_config(config: Dict): + if version.parse(OrtVersion) < version.parse("1.17.0"): + # disable skip_group_norm fusion since there is a shape inference bug which leads to invalid models + config["passes"]["optimize_cuda"]["config"]["optimization_options"] = {"enable_skip_group_norm": False} + config["pass_flows"] = [["convert", "optimize_cuda"]] + config["systems"]["local_system"]["config"]["accelerators"][0]["execution_providers"] = ["CUDAExecutionProvider"] + return config + + +def validate_args(args, provider): + ort.set_default_logger_severity(4) + if args.static_dims: + print( + "WARNING: the --static_dims option is deprecated, and static shape optimization is enabled by default. " + "Use --dynamic_dims to disable static shape optimization." 
+ ) + + validate_ort_version(provider) + + +def validate_ort_version(provider: str): + if provider == "dml" and version.parse(OrtVersion) < version.parse("1.16.0"): + print("This script requires onnxruntime-directml 1.16.0 or newer") + sys.exit(1) + elif provider == "cuda" and version.parse(OrtVersion) < version.parse("1.17.0"): + if version.parse(OrtVersion) < version.parse("1.16.2"): + print("This script requires onnxruntime-gpu 1.16.2 or newer") + sys.exit(1) + print( + f"WARNING: onnxruntime {OrtVersion} has known issues with shape inference for SkipGroupNorm. Will disable" + " skip_group_norm fusion. onnxruntime-gpu 1.17.0 or newer is strongly recommended!" + ) + + +def save_optimized_onnx_submodel(submodel_name, provider, model_info): + footprints_file_path = ( + Path(__file__).resolve().parents[1] / "footprints" / f"{submodel_name}_gpu-{provider}_footprints.json" + ) + with footprints_file_path.open("r") as footprint_file: + footprints = json.load(footprint_file) + + conversion_footprint = None + optimizer_footprint = None + for footprint in footprints.values(): + if footprint["from_pass"] == "OnnxConversion": + conversion_footprint = footprint + elif footprint["from_pass"] == "OrtTransformersOptimization": + optimizer_footprint = footprint + + assert conversion_footprint + assert optimizer_footprint + + unoptimized_olive_model = ONNXModelHandler(**conversion_footprint["model_config"]["config"]) + optimized_olive_model = ONNXModelHandler(**optimizer_footprint["model_config"]["config"]) + + model_info[submodel_name] = { + "unoptimized": { + "path": Path(unoptimized_olive_model.model_path), + }, + "optimized": { + "path": Path(optimized_olive_model.model_path), + }, + } + + print(f"Unoptimized Model : {model_info[submodel_name]['unoptimized']['path']}") + print(f"Optimized Model : {model_info[submodel_name]['optimized']['path']}") + + +def save_onnx_pipeline( + has_safety_checker, model_info, model_output, pipeline, submodel_names +): + # Save the unoptimized models in a directory structure that the diffusers library can load and run. + # This is optional, and the optimized models can be used directly in a custom pipeline if desired. 
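+    # Layout produced here: <model_output>/Default holds the unoptimized (FP32) diffusers pipeline, while
+    # <model_output>/Optimized is a copy of it in which each submodel's model.onnx is replaced by the
+    # FP16-optimized model taken from the Olive footprints.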
+ print("\nCreating ONNX pipeline...") + + optimized_model_dir = model_output / "Optimized" + unoptimized_model_dir = model_output / "Default" + has_controlnet = 'controlnet' in submodel_names + if has_safety_checker: + safety_checker = OnnxRuntimeModel.from_pretrained(model_info["safety_checker"]["unoptimized"]["path"].parent) + else: + safety_checker = None + + onnx_pipeline = OnnxStableDiffusionPipeline( + vae_encoder=OnnxRuntimeModel.from_pretrained(model_info["vae_encoder"]["unoptimized"]["path"].parent), + vae_decoder=OnnxRuntimeModel.from_pretrained(model_info["vae_decoder"]["unoptimized"]["path"].parent), + text_encoder=OnnxRuntimeModel.from_pretrained(model_info["text_encoder"]["unoptimized"]["path"].parent), + tokenizer=pipeline.tokenizer, + unet=OnnxRuntimeModel.from_pretrained(model_info["unet"]["unoptimized"]["path"].parent), + scheduler=pipeline.scheduler, + safety_checker=safety_checker, + feature_extractor=pipeline.feature_extractor, + requires_safety_checker=True, + ) + + if has_controlnet: + controlnet=OnnxRuntimeModel.from_pretrained(model_info["controlnet"]["unoptimized"]["path"].parent) + + print("Saving unoptimized models...") + onnx_pipeline.save_pretrained(unoptimized_model_dir) + if has_controlnet: + controlnet.save_pretrained(unoptimized_model_dir / "controlnet" ) + + # Create a copy of the unoptimized model directory, then overwrite with optimized models from the olive cache. + print("Copying optimized models...") + shutil.copytree(unoptimized_model_dir, optimized_model_dir, ignore=shutil.ignore_patterns("weights.pb")) + for submodel_name in submodel_names: + src_path = model_info[submodel_name]["optimized"]["path"] + dst_path = optimized_model_dir / submodel_name / "model.onnx" + exists = os.path.exists(dst_path) + if not exists: + os.mkdir(optimized_model_dir / submodel_name) + shutil.copyfile(src_path, dst_path) + + print(f"The default pipeline is located here: {unoptimized_model_dir}") + print(f"The optimized pipeline is located here: {optimized_model_dir}") + + +def get_ort_pipeline(model_dir, common_args, ort_args, guidance_scale): + ort.set_default_logger_severity(3) + + print("Loading models into ORT session...") + sess_options = ort.SessionOptions() + sess_options.enable_mem_pattern = False + + static_dims = not ort_args.dynamic_dims + batch_size = common_args.batch_size + image_size = common_args.image_size + provider = common_args.provider + + if static_dims: + hidden_batch_size = batch_size if (guidance_scale == 0.0) else batch_size * 2 + # Not necessary, but helps DML EP further optimize runtime performance. 
+ # batch_size is doubled for sample & hidden state because of classifier free guidance: + # https://github.com/huggingface/diffusers/blob/46c52f9b9607e6ecb29c782c052aea313e6487b7/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py#L672 + sess_options.add_free_dimension_override_by_name("unet_sample_batch", hidden_batch_size) + sess_options.add_free_dimension_override_by_name("unet_sample_channels", 4) + sess_options.add_free_dimension_override_by_name("unet_sample_height", image_size // 8) + sess_options.add_free_dimension_override_by_name("unet_sample_width", image_size // 8) + sess_options.add_free_dimension_override_by_name("unet_time_batch", 1) + sess_options.add_free_dimension_override_by_name("unet_hidden_batch", hidden_batch_size) + sess_options.add_free_dimension_override_by_name("unet_hidden_sequence", 77) + + provider_map = { + "dml": "DmlExecutionProvider", + "cuda": "CUDAExecutionProvider", + } + assert provider in provider_map, f"Unsupported provider: {provider}" + return OnnxStableDiffusionPipeline.from_pretrained( + model_dir, provider=provider_map[provider], sess_options=sess_options + ) diff --git a/OnnxStack.Converter/stable_cascade/.gitignore b/OnnxStack.Converter/stable_cascade/.gitignore new file mode 100644 index 0000000..4cf6f30 --- /dev/null +++ b/OnnxStack.Converter/stable_cascade/.gitignore @@ -0,0 +1,3 @@ +/footprints/ +/cache/ +/result_*.png diff --git a/OnnxStack.Converter/stable_cascade/README.md b/OnnxStack.Converter/stable_cascade/README.md new file mode 100644 index 0000000..40d584b --- /dev/null +++ b/OnnxStack.Converter/stable_cascade/README.md @@ -0,0 +1,20 @@ +# OnnxStack.Converter + +## Requirements +```bash +pip install onnxruntime-directml +pip install olive-ai[directml] +python -m pip install -r requirements.txt +``` + +## Usage +```bash +convert.py --optimize --model_input '..\stable-cascade' --model_output '..\converted' +``` +`--optimize` - Run the model optimization + +`--model_input` - Safetensor model to convert + +`--model_output` - Output for converted ONNX model (NOTE: This folder is deleted before each run) + +`--image_encoder` - Convert the optional image encoder diff --git a/OnnxStack.Converter/stable_cascade/config.py b/OnnxStack.Converter/stable_cascade/config.py new file mode 100644 index 0000000..7b1b47e --- /dev/null +++ b/OnnxStack.Converter/stable_cascade/config.py @@ -0,0 +1,8 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- + +vae_sample_size = 512 +unet_sample_size = 24 +cross_attention_dim = 1280 \ No newline at end of file diff --git a/OnnxStack.Converter/stable_cascade/config_decoder.json b/OnnxStack.Converter/stable_cascade/config_decoder.json new file mode 100644 index 0000000..6d70698 --- /dev/null +++ b/OnnxStack.Converter/stable_cascade/config_decoder.json @@ -0,0 +1,120 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "stabilityai/stable-cascade", + "model_loader": "decoder_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "sample", "timestep_ratio", "clip_text_pooled", "effnet", "return_dict" ], + "output_names": [ "out_sample" ], + "dynamic_axes": { + "sample": {"0": "unet_sample_batch", "1": "unet_sample_channels", "2": "unet_sample_height", "3": "unet_sample_width"}, + "timestep_ratio": {"0": "unet_timestep_ratio"}, + "clip_text_pooled": {"0": "unet_clip_text_pooled_batch", "1": "unet_clip_text_pooled_size"}, + "effnet": {"0": "unet_hidden_batch", "1": "unet_hidden_size"} + } + }, + "dummy_inputs_func": "decoder_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "decoder_data_loader", + "batch_size": 2 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 16, + "save_as_external_data": true, + "all_tensors_to_one_file": true + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": true, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "decoder", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "decoder", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/stable_cascade/config_image_encoder.json b/OnnxStack.Converter/stable_cascade/config_image_encoder.json new file mode 100644 index 0000000..08cfc7e --- /dev/null +++ b/OnnxStack.Converter/stable_cascade/config_image_encoder.json @@ -0,0 +1,113 @@ +{ + "input_model": { + "type": 
"PyTorchModel", + "config": { + "model_path": "stabilityai/stable-cascade", + "model_loader": "image_encoder_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "sample"], + "output_names": [ "image_embeds", "last_hidden_state"], + "dynamic_axes": { "sample": { "0": "batch", "1": "channels", "2": "height", "3": "width" } } + }, + "dummy_inputs_func": "image_encoder_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "image_encoder_data_loader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 16 + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "clip", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": true, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "clip", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "image_encoder", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/stable_cascade/config_prior.json b/OnnxStack.Converter/stable_cascade/config_prior.json new file mode 100644 index 0000000..373e8a5 --- /dev/null +++ b/OnnxStack.Converter/stable_cascade/config_prior.json @@ -0,0 +1,121 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "stabilityai/stable-cascade", + "model_loader": "prior_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "sample", "timestep_ratio", "clip_text_pooled", "clip_text", "clip_img", "return_dict" ], + "output_names": [ "out_sample" ], + "dynamic_axes": { + "sample": {"0": "unet_sample_batch", "1": "unet_sample_channels", "2": "unet_sample_height", "3": "unet_sample_width"}, + "timestep_ratio": {"0": "unet_timestep_ratio"}, + "clip_text_pooled": {"0": "unet_clip_text_pooled_batch", "1": "unet_clip_text_pooled_size", "2": "unet_clip_text_pooled_length"}, + "clip_text": {"0": "unet_clip_text_batch", "1": "unet_clip_text_size", "2": "unet_clip_text_length"}, + "clip_img": {"0": "unet_clip_img_batch", "1": "unet_clip_img_size", "2": "unet_clip_img_length"} + } + }, + 
"dummy_inputs_func": "prior_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "prior_data_loader", + "batch_size": 2 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 16, + "save_as_external_data": true, + "all_tensors_to_one_file": true + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": true, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "prior", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/stable_cascade/config_text_encoder.json b/OnnxStack.Converter/stable_cascade/config_text_encoder.json new file mode 100644 index 0000000..a16f5e5 --- /dev/null +++ b/OnnxStack.Converter/stable_cascade/config_text_encoder.json @@ -0,0 +1,113 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "stabilityai/stable-cascade", + "model_loader": "text_encoder_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "input_ids" ], + "output_names": [ "last_hidden_state", "pooler_output" ], + "dynamic_axes": { "input_ids": { "0": "batch", "1": "sequence" } } + }, + "dummy_inputs_func": "text_encoder_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "text_encoder_data_loader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 16 + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "clip", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": true, + 
"optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "clip", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "text_encoder", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/stable_cascade/config_vqgan.json b/OnnxStack.Converter/stable_cascade/config_vqgan.json new file mode 100644 index 0000000..edf5969 --- /dev/null +++ b/OnnxStack.Converter/stable_cascade/config_vqgan.json @@ -0,0 +1,103 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "stabilityai/stable-cascade", + "model_loader": "vqgan_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "sample", "return_dict" ], + "output_names": [ "latent_sample" ], + "dynamic_axes": { "sample": { "0": "batch", "1": "channels", "2": "height", "3": "width" } } + }, + "dummy_inputs_func": "vqgan_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "vqgan_data_loader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 16 + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "vae", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + 
"evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "vqgan", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/stable_cascade/convert.py b/OnnxStack.Converter/stable_cascade/convert.py new file mode 100644 index 0000000..17450bc --- /dev/null +++ b/OnnxStack.Converter/stable_cascade/convert.py @@ -0,0 +1,211 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +import argparse +import json +import shutil +import sys +import warnings +from pathlib import Path +from typing import Dict + +import config +import torch +from diffusers import DiffusionPipeline +from packaging import version + +from olive.common.utils import set_tempdir +from olive.workflows import run as olive_run + + +# pylint: disable=redefined-outer-name +# ruff: noqa: TID252, T201 + + +def save_image(result, batch_size, provider, num_images, images_saved, image_callback=None): + passed_safety_checker = 0 + for image_index in range(batch_size): + if result.nsfw_content_detected is None or not result.nsfw_content_detected[image_index]: + passed_safety_checker += 1 + if images_saved < num_images: + output_path = f"result_{images_saved}.png" + result.images[image_index].save(output_path) + if image_callback: + image_callback(images_saved, output_path) + images_saved += 1 + print(f"Generated {output_path}") + print(f"Inference Batch End ({passed_safety_checker}/{batch_size} images).") + print("Images passed the safety checker.") + return images_saved + + +def run_inference_loop( + pipeline, + prompt, + num_images, + batch_size, + image_size, + num_inference_steps, + guidance_scale, + strength: float, + provider: str, + image_callback=None, + step_callback=None, +): + images_saved = 0 + + def update_steps(step, timestep, latents): + if step_callback: + step_callback((images_saved // batch_size) * num_inference_steps + step) + + while images_saved < num_images: + print(f"\nInference Batch Start (batch size = {batch_size}).") + + kwargs = {} + + result = pipeline( + [prompt] * batch_size, + num_inference_steps=num_inference_steps, + callback=update_steps if step_callback else None, + height=image_size, + width=image_size, + guidance_scale=guidance_scale, + **kwargs, + ) + + images_saved = save_image(result, batch_size, provider, num_images, images_saved, image_callback) + + +def update_config_with_provider(config: Dict, provider: str): + if provider == "dml": + # DirectML EP is the default, so no need to update config. + return config + elif provider == "cuda": + from sd_utils.ort import update_cuda_config + + return update_cuda_config(config) + else: + raise ValueError(f"Unsupported provider: {provider}") + + +def optimize( + model_input: str, + model_output: Path, + provider: str, + image_encoder: bool +): + from google.protobuf import __version__ as protobuf_version + + # protobuf 4.x aborts with OOM when optimizing unet + if version.parse(protobuf_version) > version.parse("3.20.3"): + print("This script requires protobuf 3.20.3. Please ensure your package version matches requirements.txt.") + sys.exit(1) + + model_dir = model_input + script_dir = Path(__file__).resolve().parent + + # Clean up previously optimized models, if any. 
+ shutil.rmtree(script_dir / "footprints", ignore_errors=True) + shutil.rmtree(model_output, ignore_errors=True) + + # Load the entire PyTorch pipeline to ensure all models and their configurations are downloaded and cached. + # This avoids an issue where the non-ONNX components (tokenizer, scheduler, and feature extractor) are not + # automatically cached correctly if individual models are fetched one at a time. + print("Download stable diffusion PyTorch pipeline...") + pipeline = DiffusionPipeline.from_pretrained(model_dir, torch_dtype=torch.float32, **{"local_files_only": True}) + # config.vae_sample_size = pipeline.vae.config.sample_size + # config.cross_attention_dim = pipeline.unet.config.cross_attention_dim + # config.unet_sample_size = pipeline.unet.config.sample_size + + model_info = {} + + submodel_names = [ "text_encoder", "decoder", "prior", "vqgan"] + + if image_encoder: + submodel_names.append("image_encoder") + + for submodel_name in submodel_names: + print(f"\nOptimizing {submodel_name}") + + olive_config = None + with (script_dir / f"config_{submodel_name}.json").open() as fin: + olive_config = json.load(fin) + olive_config = update_config_with_provider(olive_config, provider) + olive_config["input_model"]["config"]["model_path"] = model_dir + + run_res = olive_run(olive_config) + + from sd_utils.ort import save_optimized_onnx_submodel + + save_optimized_onnx_submodel(submodel_name, provider, model_info) + + from sd_utils.ort import save_onnx_pipeline + + save_onnx_pipeline( + model_info, model_output, pipeline, submodel_names + ) + + return model_info + + +def parse_common_args(raw_args): + parser = argparse.ArgumentParser("Common arguments") + parser.add_argument("--model_input", default="stable-diffusion-v1-5", type=str) + parser.add_argument("--model_output", default="stable-diffusion-v1-5", type=Path) + parser.add_argument("--image_encoder",action="store_true", help="Create image encoder model") + parser.add_argument("--provider", default="dml", type=str, choices=["dml", "cuda"], help="Execution provider to use") + parser.add_argument("--optimize", action="store_true", help="Runs the optimization step") + parser.add_argument("--clean_cache", action="store_true", help="Deletes the Olive cache") + parser.add_argument("--test_unoptimized", action="store_true", help="Use unoptimized model for inference") + parser.add_argument("--tempdir", default=None, type=str, help="Root directory for tempfile directories and files") + return parser.parse_known_args(raw_args) + + +def parse_ort_args(raw_args): + parser = argparse.ArgumentParser("ONNX Runtime arguments") + + parser.add_argument( + "--static_dims", + action="store_true", + help="DEPRECATED (now enabled by default). 
Use --dynamic_dims to disable static_dims.", + ) + parser.add_argument("--dynamic_dims", action="store_true", help="Disable static shape optimization") + + return parser.parse_known_args(raw_args) + + +def main(raw_args=None): + common_args, extra_args = parse_common_args(raw_args) + + provider = common_args.provider + model_input = common_args.model_input + model_output = common_args.model_output + + script_dir = Path(__file__).resolve().parent + + + if common_args.clean_cache: + shutil.rmtree(script_dir / "cache", ignore_errors=True) + + ort_args = None, None + ort_args, extra_args = parse_ort_args(extra_args) + + if common_args.optimize or not model_output.exists(): + set_tempdir(common_args.tempdir) + + # TODO(jstoecker): clean up warning filter (mostly during conversion from torch to ONNX) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + + from sd_utils.ort import validate_args + + validate_args(ort_args, common_args.provider) + optimize(common_args.model_input, common_args.model_output, common_args.provider, common_args.image_encoder) + + if not common_args.optimize: + print("TODO: Create OnnxStableCascadePipeline") + + +if __name__ == "__main__": + main() diff --git a/OnnxStack.Converter/stable_cascade/models.py b/OnnxStack.Converter/stable_cascade/models.py new file mode 100644 index 0000000..c8b15b7 --- /dev/null +++ b/OnnxStack.Converter/stable_cascade/models.py @@ -0,0 +1,168 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +import config +import torch +from typing import Union, Optional, Tuple +from diffusers import AutoencoderKL, StableCascadeUNet +from diffusers.pipelines.wuerstchen import PaellaVQModel +from transformers.models.clip.modeling_clip import CLIPTextModelWithProjection, CLIPVisionModelWithProjection +from dataclasses import dataclass + +# Helper latency-only dataloader that creates random tensors with no label +class RandomDataLoader: + def __init__(self, create_inputs_func, batchsize, torch_dtype): + self.create_input_func = create_inputs_func + self.batchsize = batchsize + self.torch_dtype = torch_dtype + + def __getitem__(self, idx): + label = None + return self.create_input_func(self.batchsize, self.torch_dtype), label + + + +# ----------------------------------------------------------------------------- +# TEXT ENCODER +# ----------------------------------------------------------------------------- + +def text_encoder_inputs(batchsize, torch_dtype): + return torch.zeros((batchsize, 77), dtype=torch_dtype) + + +def text_encoder_load(model_name): + model = CLIPTextModelWithProjection.from_pretrained(model_name, subfolder="text_encoder") + return model + + +def text_encoder_conversion_inputs(model=None): + return text_encoder_inputs(1, torch.int32) + + +def text_encoder_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(text_encoder_inputs, batchsize, torch.int32) + + + + +# ----------------------------------------------------------------------------- +# DECODER UNET +# ----------------------------------------------------------------------------- + +def decoder_inputs(batchsize, torch_dtype, is_conversion_inputs=False): + # TODO(jstoecker): Rename onnx::Concat_4 to text_embeds and onnx::Shape_5 to time_ids + inputs = { + "sample": torch.rand((batchsize, 4, 256, 256), dtype=torch_dtype), + "timestep_ratio": 
torch.rand((batchsize,), dtype=torch_dtype), + "clip_text_pooled": torch.rand((batchsize , 1, 1280), dtype=torch_dtype), + "effnet": torch.rand((batchsize, 16, 24, 24), dtype=torch_dtype) + } + + # use as kwargs since they won't be in the correct position if passed along with the tuple of inputs + kwargs = { + "return_dict": False, + } + + return inputs + + +def decoder_load(model_name): + model = StableCascadeUNet.from_pretrained(model_name, subfolder="decoder") + return model + + +def decoder_conversion_inputs(model=None): + return tuple(decoder_inputs(1, torch.float32, True).values()) + + +def decoder_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(decoder_inputs, batchsize, torch.float16) + + + + +# ----------------------------------------------------------------------------- +# PRIOR UNET +# ----------------------------------------------------------------------------- + +def prior_inputs(batchsize, torch_dtype, is_conversion_inputs=False): + inputs = { + "sample": torch.rand((batchsize, 16, 24, 24), dtype=torch_dtype), + "timestep_ratio": torch.rand((batchsize,), dtype=torch_dtype), + "clip_text_pooled": torch.rand((batchsize , 1, 1280), dtype=torch_dtype), + "clip_text": torch.rand((batchsize , 77, 1280), dtype=torch_dtype), + "clip_img": torch.rand((batchsize , 1, 768), dtype=torch_dtype) + } + + # use as kwargs since they won't be in the correct position if passed along with the tuple of inputs + kwargs = { + "return_dict": False, + } + + return inputs + + +def prior_load(model_name): + model = StableCascadeUNet.from_pretrained(model_name, subfolder="prior") + return model + + +def prior_conversion_inputs(model=None): + return tuple(prior_inputs(1, torch.float32, True).values()) + + +def prior_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(prior_inputs, batchsize, torch.float16) + + + + +# ----------------------------------------------------------------------------- +# IMAGE ENCODER +# ----------------------------------------------------------------------------- + +def image_encoder_inputs(batchsize, torch_dtype, is_conversion_inputs=False): + inputs = { + "sample": torch.rand((batchsize, 3, 224, 224), dtype=torch_dtype) + } + return inputs + + +def image_encoder_load(model_name): + model = CLIPVisionModelWithProjection.from_pretrained(model_name, subfolder="image_encoder", use_safetensors=True) + return model + + +def image_encoder_conversion_inputs(model=None): + return tuple(image_encoder_inputs(1, torch.float32, True).values()) + + +def image_encoder_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(image_encoder_inputs, batchsize, torch.float16) + + + + +# ----------------------------------------------------------------------------- +# VQGAN +# ----------------------------------------------------------------------------- + +def vqgan_inputs(batchsize, torch_dtype, is_conversion_inputs=False): + inputs = { + "sample": torch.rand((batchsize, 3, 256, 256), dtype=torch_dtype) + } + return inputs + + +def vqgan_load(model_name): + model = PaellaVQModel.from_pretrained(model_name, subfolder="vqgan", use_safetensors=True) + return model + + +def vqgan_conversion_inputs(model=None): + return tuple(vqgan_inputs(1, torch.float32, True).values()) + + +def vqgan_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(vqgan_inputs, batchsize, torch.float16) \ No newline at end of file diff --git a/OnnxStack.Converter/stable_cascade/requirements.txt 
b/OnnxStack.Converter/stable_cascade/requirements.txt new file mode 100644 index 0000000..15b9198 --- /dev/null +++ b/OnnxStack.Converter/stable_cascade/requirements.txt @@ -0,0 +1,9 @@ +accelerate +diffusers +onnx +pillow +protobuf==3.20.3 # protobuf 4.x aborts with OOM when optimizing unet +tabulate +torch +transformers +onnxruntime-directml>=1.16.0 diff --git a/OnnxStack.Converter/stable_cascade/sd_utils/ort.py b/OnnxStack.Converter/stable_cascade/sd_utils/ort.py new file mode 100644 index 0000000..72746f7 --- /dev/null +++ b/OnnxStack.Converter/stable_cascade/sd_utils/ort.py @@ -0,0 +1,117 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +import os +import json +import shutil +import sys +from pathlib import Path +from typing import Dict + +import onnxruntime as ort +from diffusers import OnnxRuntimeModel, StableCascadePriorPipeline +from onnxruntime import __version__ as OrtVersion +from packaging import version + +from olive.model import ONNXModelHandler + +# ruff: noqa: TID252, T201 + + +def update_cuda_config(config: Dict): + if version.parse(OrtVersion) < version.parse("1.17.0"): + # disable skip_group_norm fusion since there is a shape inference bug which leads to invalid models + config["passes"]["optimize_cuda"]["config"]["optimization_options"] = {"enable_skip_group_norm": False} + config["pass_flows"] = [["convert", "optimize_cuda"]] + config["systems"]["local_system"]["config"]["accelerators"][0]["execution_providers"] = ["CUDAExecutionProvider"] + return config + + +def validate_args(args, provider): + ort.set_default_logger_severity(4) + if args.static_dims: + print( + "WARNING: the --static_dims option is deprecated, and static shape optimization is enabled by default. " + "Use --dynamic_dims to disable static shape optimization." + ) + + validate_ort_version(provider) + + +def validate_ort_version(provider: str): + if provider == "dml" and version.parse(OrtVersion) < version.parse("1.16.0"): + print("This script requires onnxruntime-directml 1.16.0 or newer") + sys.exit(1) + elif provider == "cuda" and version.parse(OrtVersion) < version.parse("1.17.0"): + if version.parse(OrtVersion) < version.parse("1.16.2"): + print("This script requires onnxruntime-gpu 1.16.2 or newer") + sys.exit(1) + print( + f"WARNING: onnxruntime {OrtVersion} has known issues with shape inference for SkipGroupNorm. Will disable" + " skip_group_norm fusion. onnxruntime-gpu 1.17.0 or newer is strongly recommended!" 
+ ) + + +def save_optimized_onnx_submodel(submodel_name, provider, model_info): + footprints_file_path = ( + Path(__file__).resolve().parents[1] / "footprints" / f"{submodel_name}_gpu-{provider}_footprints.json" + ) + with footprints_file_path.open("r") as footprint_file: + footprints = json.load(footprint_file) + + conversion_footprint = None + optimizer_footprint = None + for footprint in footprints.values(): + if footprint["from_pass"] == "OnnxConversion": + conversion_footprint = footprint + elif footprint["from_pass"] == "OrtTransformersOptimization": + optimizer_footprint = footprint + + assert conversion_footprint + assert optimizer_footprint + + unoptimized_olive_model = ONNXModelHandler(**conversion_footprint["model_config"]["config"]) + optimized_olive_model = ONNXModelHandler(**optimizer_footprint["model_config"]["config"]) + + model_info[submodel_name] = { + "unoptimized": { + "path": Path(unoptimized_olive_model.model_path), + "data": Path(unoptimized_olive_model.model_path + ".data"), + }, + "optimized": { + "path": Path(optimized_olive_model.model_path), + "data": Path(optimized_olive_model.model_path + ".data"), + }, + } + + print(f"Unoptimized Model : {model_info[submodel_name]['unoptimized']['path']}") + print(f"Optimized Model : {model_info[submodel_name]['optimized']['path']}") + + +def save_onnx_pipeline( + model_info, model_output, pipeline, submodel_names +): + # Save the unoptimized models in a directory structure that the diffusers library can load and run. + # This is optional, and the optimized models can be used directly in a custom pipeline if desired. + # print("\nCreating ONNX pipeline...") + + # TODO: Create OnnxStableCascadePipeline + + # Create a copy of the unoptimized model directory, then overwrite with optimized models from the olive cache. + print("Copying optimized models...") + for passType in ["optimized", "unoptimized"]: + model_dir = model_output / passType + for submodel_name in submodel_names: + src_path = model_info[submodel_name][passType]["path"] # model.onnx + src_data_path = model_info[submodel_name][passType]["data"]# model.onnx.data + + dst_path = model_dir / submodel_name + if not os.path.exists(dst_path): + os.makedirs(dst_path, exist_ok=True) + + shutil.copyfile(src_path, dst_path / "model.onnx") + if os.path.exists(src_data_path): + shutil.copyfile(src_data_path, dst_path / "model.onnx.data") + + print(f"The converted model is located here: {model_output}") diff --git a/OnnxStack.Converter/stable_diffusion/.gitignore b/OnnxStack.Converter/stable_diffusion/.gitignore new file mode 100644 index 0000000..4cf6f30 --- /dev/null +++ b/OnnxStack.Converter/stable_diffusion/.gitignore @@ -0,0 +1,3 @@ +/footprints/ +/cache/ +/result_*.png diff --git a/OnnxStack.Converter/stable_diffusion/config.py b/OnnxStack.Converter/stable_diffusion/config.py new file mode 100644 index 0000000..f8cfccd --- /dev/null +++ b/OnnxStack.Converter/stable_diffusion/config.py @@ -0,0 +1,8 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- + +vae_sample_size = 512 +unet_sample_size = 64 +cross_attention_dim = 768 diff --git a/OnnxStack.Converter/stable_diffusion/config_controlnet.json b/OnnxStack.Converter/stable_diffusion/config_controlnet.json new file mode 100644 index 0000000..02902ea --- /dev/null +++ b/OnnxStack.Converter/stable_diffusion/config_controlnet.json @@ -0,0 +1,123 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "runwayml/stable-diffusion-v1-5", + "model_loader": "controlnet_unet_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "sample", "timestep", "encoder_hidden_states", "down_block_0_additional_residual", "down_block_1_additional_residual", "down_block_2_additional_residual", "down_block_3_additional_residual", "down_block_4_additional_residual", "down_block_5_additional_residual", "down_block_6_additional_residual", "down_block_7_additional_residual", "down_block_8_additional_residual", "down_block_9_additional_residual", "down_block_10_additional_residual", "down_block_11_additional_residual", "mid_block_additional_residual", "return_dict" ], + "output_names": [ "out_sample" ], + "dynamic_axes": { + "sample": {"0": "unet_sample_batch", "1": "unet_sample_channels", "2": "unet_sample_height", "3": "unet_sample_width"}, + "timestep": {"0": "unet_time_batch"}, + "encoder_hidden_states": {"0": "unet_hidden_batch", "1": "unet_hidden_sequence"}, + "down_block_0_additional_residual": {"0": "cnet_db0_batch", "1": "cnet_db0_channels", "2": "cnet_db0_height", "3": "cnet_db0_width"}, + "down_block_1_additional_residual": {"0": "cnet_db1_batch", "1": "cnet_db1_channels", "2": "cnet_db1_height", "3": "cnet_db1_width"}, + "down_block_2_additional_residual": {"0": "cnet_db2_batch", "1": "cnet_db2_channels", "2": "cnet_db2_height", "3": "cnet_db2_width"}, + "down_block_3_additional_residual": {"0": "cnet_db3_batch", "1": "cnet_db3_channels", "2": "cnet_db3_height2", "3": "cnet_db3_width2"}, + "down_block_4_additional_residual": {"0": "cnet_db4_batch", "1": "cnet_db4_channels", "2": "cnet_db4_height2", "3": "cnet_db4_width2"}, + "down_block_5_additional_residual": {"0": "cnet_db5_batch", "1": "cnet_db5_channels", "2": "cnet_db5_height2", "3": "cnet_db5_width2"}, + "down_block_6_additional_residual": {"0": "cnet_db6_batch", "1": "cnet_db6_channels", "2": "cnet_db6_height4", "3": "cnet_db6_width4"}, + "down_block_7_additional_residual": {"0": "cnet_db7_batch", "1": "cnet_db7_channels", "2": "cnet_db7_height4", "3": "cnet_db7_width4"}, + "down_block_8_additional_residual": {"0": "cnet_db8_batch", "1": "cnet_db8_channels", "2": "cnet_db8_height4", "3": "cnet_db8_width4"}, + "down_block_9_additional_residual": {"0": "cnet_db9_batch", "1": "cnet_db9_channels", "2": "cnet_db9_height8", "3": "cnet_db9_width8"}, + "down_block_10_additional_residual": {"0": "cnet_db10_batch", "1": "cnet_db10_channels", "2": "cnet_db10_height8", "3": "cnet_db10_width8"}, + "down_block_11_additional_residual": {"0": "cnet_db11_batch", "1": "cnet_db11_channels", "2": "cnet_db11_height8", "3": "cnet_db11_width8"}, + "mid_block_additional_residual": {"0": "cnet_mbar_batch", "1": "cnet_mbar_channels", "2": "cnet_mbar_height8", "3": "cnet_mbar_width8"} + } + }, + "dummy_inputs_func": "controlnet_unet_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + 
"evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "controlnet_unet_data_loader", + "batch_size": 2 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 14, + "save_as_external_data": true, + "all_tensors_to_one_file": true, + "external_data_name": "weights.pb" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "controlnet", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/stable_diffusion/config_safety_checker.json b/OnnxStack.Converter/stable_diffusion/config_safety_checker.json new file mode 100644 index 0000000..f5234a8 --- /dev/null +++ b/OnnxStack.Converter/stable_diffusion/config_safety_checker.json @@ -0,0 +1,124 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "runwayml/stable-diffusion-v1-5", + "model_loader": "safety_checker_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "clip_input", "images" ], + "output_names": [ "out_images", "has_nsfw_concepts" ], + "dynamic_axes": { + "clip_input": { "0": "batch", "1": "channels", "2": "height", "3": "width" }, + "images": { "0": "batch", "1": "height", "2": "width", "3": "channels" } + } + }, + "dummy_inputs_func": "safety_checker_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "safety_checker_data_loader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 14 + } + }, + "ov_convert": { + "type": "OpenVINOConversion", + "config": { + "user_script": "models.py", + "example_input_func": "safety_checker_conversion_inputs", + "output_model": "safety_checker" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + 
"enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "safety_checker", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/stable_diffusion/config_text_encoder.json b/OnnxStack.Converter/stable_diffusion/config_text_encoder.json new file mode 100644 index 0000000..db7115f --- /dev/null +++ b/OnnxStack.Converter/stable_diffusion/config_text_encoder.json @@ -0,0 +1,121 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "runwayml/stable-diffusion-v1-5", + "model_loader": "text_encoder_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "input_ids" ], + "output_names": [ "last_hidden_state", "pooler_output" ], + "dynamic_axes": { "input_ids": { "0": "batch", "1": "sequence" } } + }, + "dummy_inputs_func": "text_encoder_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "text_encoder_data_loader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 14 + } + }, + "ov_convert": { + "type": "OpenVINOConversion", + "config": { + "user_script": "models.py", + "example_input_func": "text_encoder_conversion_inputs", + "output_model": "text_encoder" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "clip", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": 
{ + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "clip", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "text_encoder", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/stable_diffusion/config_unet.json b/OnnxStack.Converter/stable_diffusion/config_unet.json new file mode 100644 index 0000000..d5e4ab2 --- /dev/null +++ b/OnnxStack.Converter/stable_diffusion/config_unet.json @@ -0,0 +1,128 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "runwayml/stable-diffusion-v1-5", + "model_loader": "unet_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "sample", "timestep", "encoder_hidden_states", "return_dict" ], + "output_names": [ "out_sample" ], + "dynamic_axes": { + "sample": {"0": "unet_sample_batch", "1": "unet_sample_channels", "2": "unet_sample_height", "3": "unet_sample_width"}, + "timestep": {"0": "unet_time_batch"}, + "encoder_hidden_states": {"0": "unet_hidden_batch", "1": "unet_hidden_sequence"} + } + }, + "dummy_inputs_func": "unet_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "unet_data_loader", + "batch_size": 2 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 14, + "save_as_external_data": true, + "all_tensors_to_one_file": true, + "external_data_name": "weights.pb" + } + }, + "ov_convert": { + "type": "OpenVINOConversion", + "config": { + "user_script": "models.py", + "example_input_func": "get_unet_ov_example_input", + "output_model": "unet" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": 
false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "unet", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/stable_diffusion/config_vae_decoder.json b/OnnxStack.Converter/stable_diffusion/config_vae_decoder.json new file mode 100644 index 0000000..40c42b8 --- /dev/null +++ b/OnnxStack.Converter/stable_diffusion/config_vae_decoder.json @@ -0,0 +1,121 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "runwayml/stable-diffusion-v1-5", + "model_loader": "vae_decoder_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "latent_sample", "return_dict" ], + "output_names": [ "sample" ], + "dynamic_axes": { "latent_sample": { "0": "batch", "1": "channels", "2": "height", "3": "width" } } + }, + "dummy_inputs_func": "vae_decoder_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "vae_decoder_data_loader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 14 + } + }, + "ov_convert": { + "type": "OpenVINOConversion", + "config": { + "user_script": "models.py", + "example_input_func": "vae_decoder_conversion_inputs", + "output_model": "vae_decoder" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "vae", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "vae", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "vae_decoder", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/stable_diffusion/config_vae_encoder.json b/OnnxStack.Converter/stable_diffusion/config_vae_encoder.json new file mode 100644 index 0000000..780b250 --- /dev/null +++ b/OnnxStack.Converter/stable_diffusion/config_vae_encoder.json @@ -0,0 +1,121 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "runwayml/stable-diffusion-v1-5", + "model_loader": "vae_encoder_load", + "model_script": "models.py", + "io_config": { + "input_names": 
[ "sample", "return_dict" ], + "output_names": [ "latent_sample" ], + "dynamic_axes": { "sample": { "0": "batch", "1": "channels", "2": "height", "3": "width" } } + }, + "dummy_inputs_func": "vae_encoder_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "vae_encoder_data_loader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 14 + } + }, + "ov_convert": { + "type": "OpenVINOConversion", + "config": { + "user_script": "models.py", + "example_input_func": "vae_encoder_conversion_inputs", + "output_model": "vae_encoder" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "vae", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "vae", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "vae_encoder", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/stable_diffusion/convert.py b/OnnxStack.Converter/stable_diffusion/convert.py new file mode 100644 index 0000000..c011d45 --- /dev/null +++ b/OnnxStack.Converter/stable_diffusion/convert.py @@ -0,0 +1,273 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- +import argparse +import json +import shutil +import sys +import warnings +from pathlib import Path +from typing import Dict + +import config +import torch +from diffusers import DiffusionPipeline +from packaging import version + +from olive.common.utils import set_tempdir +from olive.workflows import run as olive_run + + +# pylint: disable=redefined-outer-name +# ruff: noqa: TID252, T201 + + +def save_image(result, batch_size, provider, num_images, images_saved, image_callback=None): + passed_safety_checker = 0 + for image_index in range(batch_size): + if result.nsfw_content_detected is None or not result.nsfw_content_detected[image_index]: + passed_safety_checker += 1 + if images_saved < num_images: + output_path = f"result_{images_saved}.png" + result.images[image_index].save(output_path) + if image_callback: + image_callback(images_saved, output_path) + images_saved += 1 + print(f"Generated {output_path}") + print(f"Inference Batch End ({passed_safety_checker}/{batch_size} images).") + print("Images passed the safety checker.") + return images_saved + + +def run_inference_loop( + pipeline, + prompt, + num_images, + batch_size, + image_size, + num_inference_steps, + guidance_scale, + strength: float, + provider: str, + image_callback=None, + step_callback=None, +): + images_saved = 0 + + def update_steps(step, timestep, latents): + if step_callback: + step_callback((images_saved // batch_size) * num_inference_steps + step) + + while images_saved < num_images: + print(f"\nInference Batch Start (batch size = {batch_size}).") + + kwargs = {} + + result = pipeline( + [prompt] * batch_size, + num_inference_steps=num_inference_steps, + callback=update_steps if step_callback else None, + height=image_size, + width=image_size, + guidance_scale=guidance_scale, + **kwargs, + ) + + images_saved = save_image(result, batch_size, provider, num_images, images_saved, image_callback) + + +def update_config_with_provider(config: Dict, provider: str): + if provider == "dml": + # DirectML EP is the default, so no need to update config. + return config + elif provider == "cuda": + from sd_utils.ort import update_cuda_config + + return update_cuda_config(config) + else: + raise ValueError(f"Unsupported provider: {provider}") + + +def optimize( + model_input: str, + model_output: Path, + provider: str, + controlnet: bool +): + from google.protobuf import __version__ as protobuf_version + + # protobuf 4.x aborts with OOM when optimizing unet + if version.parse(protobuf_version) > version.parse("3.20.3"): + print("This script requires protobuf 3.20.3. Please ensure your package version matches requirements.txt.") + sys.exit(1) + + model_dir = model_input + script_dir = Path(__file__).resolve().parent + + # Clean up previously optimized models, if any. + shutil.rmtree(script_dir / "footprints", ignore_errors=True) + shutil.rmtree(model_output, ignore_errors=True) + + + # Load the entire PyTorch pipeline to ensure all models and their configurations are downloaded and cached. + # This avoids an issue where the non-ONNX components (tokenizer, scheduler, and feature extractor) are not + # automatically cached correctly if individual models are fetched one at a time. 
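+    # NOTE: local_files_only=True is passed below, so --model_input must point to an
+    # already-downloaded copy of the pipeline (e.g. a local stable-diffusion-v1-5 folder).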
+ print("Download stable diffusion PyTorch pipeline...") + pipeline = DiffusionPipeline.from_pretrained(model_dir, torch_dtype=torch.float32, **{"local_files_only": True}) + config.vae_sample_size = pipeline.vae.config.sample_size + config.cross_attention_dim = pipeline.unet.config.cross_attention_dim + config.unet_sample_size = pipeline.unet.config.sample_size + + model_info = {} + + submodel_names = ["vae_encoder", "vae_decoder", "unet" , "text_encoder"] + + has_safety_checker = getattr(pipeline, "safety_checker", None) is not None + + if has_safety_checker: + submodel_names.append("safety_checker") + + if controlnet: + submodel_names.append("controlnet") + + for submodel_name in submodel_names: + print(f"\nOptimizing {submodel_name}") + + olive_config = None + with (script_dir / f"config_{submodel_name}.json").open() as fin: + olive_config = json.load(fin) + olive_config = update_config_with_provider(olive_config, provider) + + if submodel_name in ("unet", "controlnet", "text_encoder"): + olive_config["input_model"]["config"]["model_path"] = model_dir + else: + # Only the unet & text encoder are affected by LoRA, so it's better to use the base model ID for + # other models: the Olive cache is based on the JSON config, and two LoRA variants with the same + # base model ID should be able to reuse previously optimized copies. + olive_config["input_model"]["config"]["model_path"] = model_dir + + run_res = olive_run(olive_config) + + from sd_utils.ort import save_optimized_onnx_submodel + + save_optimized_onnx_submodel(submodel_name, provider, model_info) + + from sd_utils.ort import save_onnx_pipeline + + save_onnx_pipeline( + has_safety_checker, model_info, model_output, pipeline, submodel_names + ) + + return model_info + + +def parse_common_args(raw_args): + parser = argparse.ArgumentParser("Common arguments") + + parser.add_argument("--model_input", default="stable-diffusion-v1-5", type=str) + parser.add_argument("--model_output", default="stable-diffusion-v1-5", type=Path) + parser.add_argument("--controlnet",action="store_true", help="Create ControlNet Unet Model") + parser.add_argument( + "--provider", default="dml", type=str, choices=["dml", "cuda"], help="Execution provider to use" + ) + parser.add_argument("--optimize", action="store_true", help="Runs the optimization step") + parser.add_argument("--clean_cache", action="store_true", help="Deletes the Olive cache") + parser.add_argument("--test_unoptimized", action="store_true", help="Use unoptimized model for inference") + parser.add_argument("--batch_size", default=1, type=int, help="Number of images to generate per batch") + parser.add_argument( + "--prompt", + default=( + "castle surrounded by water and nature, village, volumetric lighting, photorealistic, " + "detailed and intricate, fantasy, epic cinematic shot, mountains, 8k ultra hd" + ), + type=str, + ) + parser.add_argument( + "--guidance_scale", + default=7.5, + type=float, + help="Guidance scale as defined in Classifier-Free Diffusion Guidance", + ) + parser.add_argument("--num_images", default=1, type=int, help="Number of images to generate") + parser.add_argument("--num_inference_steps", default=50, type=int, help="Number of steps in diffusion process") + parser.add_argument("--tempdir", default=None, type=str, help="Root directory for tempfile directories and files") + parser.add_argument( + "--strength", + default=1.0, + type=float, + help="Value between 0.0 and 1.0, that controls the amount of noise that is added to the input image. 
" + "Values that approach 1.0 enable lots of variations but will also produce images " + "that are not semantically consistent with the input.", + ) + parser.add_argument("--image_size", default=512, type=int, help="Width and height of the images to generate") + + return parser.parse_known_args(raw_args) + + +def parse_ort_args(raw_args): + parser = argparse.ArgumentParser("ONNX Runtime arguments") + + parser.add_argument( + "--static_dims", + action="store_true", + help="DEPRECATED (now enabled by default). Use --dynamic_dims to disable static_dims.", + ) + parser.add_argument("--dynamic_dims", action="store_true", help="Disable static shape optimization") + + return parser.parse_known_args(raw_args) + + +def main(raw_args=None): + common_args, extra_args = parse_common_args(raw_args) + + provider = common_args.provider + model_input = common_args.model_input + model_output = common_args.model_output + + script_dir = Path(__file__).resolve().parent + + + if common_args.clean_cache: + shutil.rmtree(script_dir / "cache", ignore_errors=True) + + guidance_scale = common_args.guidance_scale + + ort_args = None, None + ort_args, extra_args = parse_ort_args(extra_args) + + if common_args.optimize or not model_output.exists(): + set_tempdir(common_args.tempdir) + + # TODO(jstoecker): clean up warning filter (mostly during conversion from torch to ONNX) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + + from sd_utils.ort import validate_args + + validate_args(ort_args, common_args.provider) + optimize(common_args.model_input, common_args.model_output, common_args.provider, common_args.controlnet) + + if not common_args.optimize: + model_dir = model_output / "F32" if common_args.test_unoptimized else model_output / "F16" + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + + from sd_utils.ort import get_ort_pipeline + + pipeline = get_ort_pipeline(model_dir, common_args, ort_args, guidance_scale) + run_inference_loop( + pipeline, + common_args.prompt, + common_args.num_images, + common_args.batch_size, + common_args.image_size, + common_args.num_inference_steps, + guidance_scale, + common_args.strength, + provider=provider, + ) + + +if __name__ == "__main__": + main() diff --git a/OnnxStack.Converter/stable_diffusion/models.py b/OnnxStack.Converter/stable_diffusion/models.py new file mode 100644 index 0000000..196135d --- /dev/null +++ b/OnnxStack.Converter/stable_diffusion/models.py @@ -0,0 +1,342 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- +import config +import torch +from typing import Union, Optional, Tuple +from diffusers import AutoencoderKL, UNet2DConditionModel, ControlNetModel +from diffusers.models.controlnet import ControlNetOutput, BaseOutput as ControlNetBaseOutput +from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker +from transformers.models.clip.modeling_clip import CLIPTextModel +from dataclasses import dataclass + +# Helper latency-only dataloader that creates random tensors with no label +class RandomDataLoader: + def __init__(self, create_inputs_func, batchsize, torch_dtype): + self.create_input_func = create_inputs_func + self.batchsize = batchsize + self.torch_dtype = torch_dtype + + def __getitem__(self, idx): + label = None + return self.create_input_func(self.batchsize, self.torch_dtype), label + + + +# ----------------------------------------------------------------------------- +# TEXT ENCODER +# ----------------------------------------------------------------------------- + + +def text_encoder_inputs(batchsize, torch_dtype): + return torch.zeros((batchsize, 77), dtype=torch_dtype) + + +def text_encoder_load(model_name): + model = CLIPTextModel.from_pretrained(model_name, subfolder="text_encoder") + if is_lora_model(model_name): + merge_lora_weights(model, model_name, "text_encoder") + return model + + +def text_encoder_conversion_inputs(model=None): + return text_encoder_inputs(1, torch.int32) + + +def text_encoder_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(text_encoder_inputs, batchsize, torch.int32) + + +# ----------------------------------------------------------------------------- +# UNET +# ----------------------------------------------------------------------------- + + +def unet_inputs(batchsize, torch_dtype, is_conversion_inputs=False): + # TODO(jstoecker): Rename onnx::Concat_4 to text_embeds and onnx::Shape_5 to time_ids + inputs = { + "sample": torch.rand((batchsize, 4, config.unet_sample_size, config.unet_sample_size), dtype=torch_dtype), + "timestep": torch.rand((batchsize,), dtype=torch_dtype), + "encoder_hidden_states": torch.rand((batchsize, 77, config.cross_attention_dim), dtype=torch_dtype), + } + + # use as kwargs since they won't be in the correct position if passed along with the tuple of inputs + kwargs = { + "return_dict": False, + } + if is_conversion_inputs: + inputs["additional_inputs"] = { + **kwargs, + "added_cond_kwargs": { + "text_embeds": torch.rand((1, 1280), dtype=torch_dtype), + "time_ids": torch.rand((1, 5), dtype=torch_dtype), + }, + } + else: + inputs.update(kwargs) + inputs["onnx::Concat_4"] = torch.rand((1, 1280), dtype=torch_dtype) + inputs["onnx::Shape_5"] = torch.rand((1, 5), dtype=torch_dtype) + + return inputs + + +def get_unet_ov_example_input(): + import numpy as np + + encoder_hidden_state = torch.ones((2, 77, 768)) + latents_shape = (2, 4, 512 // 8, 512 // 8) + latents = torch.randn(latents_shape) + t = torch.from_numpy(np.array(1, dtype=float)) + return (latents, t, encoder_hidden_state) + + +def unet_load(model_name): + model = UNet2DConditionModel.from_pretrained(model_name, subfolder="unet") + if is_lora_model(model_name): + merge_lora_weights(model, model_name, "unet") + return model + + +def unet_conversion_inputs(model=None): + return tuple(unet_inputs(1, torch.float32, True).values()) + + +def unet_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(unet_inputs, 
batchsize, torch.float16) + +# ----------------------------------------------------------------------------- +# CONTROLNET - UNET +# ----------------------------------------------------------------------------- + +class PatchedUNet2DConditionModel(UNet2DConditionModel): + def forward( + self, + sample: torch.FloatTensor, + timestep: Union[torch.Tensor, float, int], + encoder_hidden_states: torch.Tensor, + down_block_0_additional_residual: torch.Tensor, + down_block_1_additional_residual: torch.Tensor, + down_block_2_additional_residual: torch.Tensor, + down_block_3_additional_residual: torch.Tensor, + down_block_4_additional_residual: torch.Tensor, + down_block_5_additional_residual: torch.Tensor, + down_block_6_additional_residual: torch.Tensor, + down_block_7_additional_residual: torch.Tensor, + down_block_8_additional_residual: torch.Tensor, + down_block_9_additional_residual: torch.Tensor, + down_block_10_additional_residual: torch.Tensor, + down_block_11_additional_residual: torch.Tensor, + mid_block_additional_residual: torch.Tensor, + ) -> Union[UNet2DConditionModel, Tuple]: + down_block_add_res = ( + down_block_0_additional_residual, down_block_1_additional_residual, down_block_2_additional_residual, + down_block_3_additional_residual, down_block_4_additional_residual, down_block_5_additional_residual, + down_block_6_additional_residual, down_block_7_additional_residual, down_block_8_additional_residual, + down_block_9_additional_residual, down_block_10_additional_residual, down_block_11_additional_residual) + return super().forward( + sample = sample, + timestep = timestep, + encoder_hidden_states = encoder_hidden_states, + down_block_additional_residuals = down_block_add_res, + mid_block_additional_residual = mid_block_additional_residual, + return_dict = False + ) + +def controlnet_unet_inputs(batchsize, torch_dtype): + return { + "sample": torch.rand((batchsize, 4, config.unet_sample_size, config.unet_sample_size), dtype=torch_dtype), + "timestep": torch.rand((batchsize,), dtype=torch_dtype), + "encoder_hidden_states": torch.rand((batchsize, 77, config.cross_attention_dim), dtype=torch_dtype), + "down_block_0_additional_residual": torch.rand((batchsize, 320, config.unet_sample_size, config.unet_sample_size), dtype=torch_dtype), + "down_block_1_additional_residual": torch.rand((batchsize, 320, config.unet_sample_size, config.unet_sample_size), dtype=torch_dtype), + "down_block_2_additional_residual": torch.rand((batchsize, 320, config.unet_sample_size, config.unet_sample_size), dtype=torch_dtype), + "down_block_3_additional_residual": torch.rand((batchsize, 320, config.unet_sample_size // 2, config.unet_sample_size // 2), dtype=torch_dtype), + "down_block_4_additional_residual": torch.rand((batchsize, 640, config.unet_sample_size // 2, config.unet_sample_size // 2), dtype=torch_dtype), + "down_block_5_additional_residual": torch.rand((batchsize, 640, config.unet_sample_size // 2, config.unet_sample_size // 2), dtype=torch_dtype), + "down_block_6_additional_residual": torch.rand((batchsize, 640, config.unet_sample_size // 4, config.unet_sample_size // 4), dtype=torch_dtype), + "down_block_7_additional_residual": torch.rand((batchsize, 1280, config.unet_sample_size // 4, config.unet_sample_size // 4), dtype=torch_dtype), + "down_block_8_additional_residual": torch.rand((batchsize, 1280, config.unet_sample_size // 4, config.unet_sample_size // 4), dtype=torch_dtype), + "down_block_9_additional_residual": torch.rand((batchsize, 1280, config.unet_sample_size // 8, 
config.unet_sample_size // 8), dtype=torch_dtype), + "down_block_10_additional_residual": torch.rand((batchsize, 1280, config.unet_sample_size // 8, config.unet_sample_size // 8), dtype=torch_dtype), + "down_block_11_additional_residual": torch.rand((batchsize, 1280, config.unet_sample_size // 8, config.unet_sample_size // 8), dtype=torch_dtype), + "mid_block_additional_residual": torch.rand((batchsize, 1280, config.unet_sample_size // 8, config.unet_sample_size // 8), dtype=torch_dtype) + } + + +def controlnet_unet_load(model_name): + model = PatchedUNet2DConditionModel.from_pretrained(model_name, subfolder="unet") + return model + + +def controlnet_unet_conversion_inputs(model): + return tuple(controlnet_unet_inputs(1, torch.float32).values()) + + +def controlnet_unet_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(controlnet_unet_inputs, batchsize, torch.float16) + +# ----------------------------------------------------------------------------- +# VAE ENCODER +# ----------------------------------------------------------------------------- + + +def vae_encoder_inputs(batchsize, torch_dtype): + return {"sample": torch.rand((batchsize, 3, config.vae_sample_size, config.vae_sample_size), dtype=torch_dtype)} + + +def vae_encoder_load(model_name): + model = AutoencoderKL.from_pretrained(model_name, subfolder="vae") + model.forward = lambda sample: model.encode(sample)[0].sample() + return model + + +def vae_encoder_conversion_inputs(model=None): + return tuple(vae_encoder_inputs(1, torch.float32).values()) + + +def vae_encoder_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(vae_encoder_inputs, batchsize, torch.float16) + + +# ----------------------------------------------------------------------------- +# VAE DECODER +# ----------------------------------------------------------------------------- + + +def vae_decoder_inputs(batchsize, torch_dtype): + return { + "latent_sample": torch.rand((batchsize, 4, config.unet_sample_size, config.unet_sample_size), dtype=torch_dtype) + } + + +def vae_decoder_load(model_name): + model = AutoencoderKL.from_pretrained(model_name, subfolder="vae") + model.forward = model.decode + return model + + +def vae_decoder_conversion_inputs(model=None): + return tuple(vae_decoder_inputs(1, torch.float32).values()) + + +def vae_decoder_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(vae_decoder_inputs, batchsize, torch.float16) + + +# ----------------------------------------------------------------------------- +# SAFETY CHECKER +# ----------------------------------------------------------------------------- + + +def safety_checker_inputs(batchsize, torch_dtype): + return { + "clip_input": torch.rand((batchsize, 3, 224, 224), dtype=torch_dtype), + "images": torch.rand((batchsize, config.vae_sample_size, config.vae_sample_size, 3), dtype=torch_dtype), + } + + +def safety_checker_load(model_name): + model = StableDiffusionSafetyChecker.from_pretrained(model_name, subfolder="safety_checker") + model.forward = model.forward_onnx + return model + + +def safety_checker_conversion_inputs(model=None): + return tuple(safety_checker_inputs(1, torch.float32).values()) + + +def safety_checker_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(safety_checker_inputs, batchsize, torch.float16) + + +# ----------------------------------------------------------------------------- +# LoRA weights +# ----------------------------------------------------------------------------- 
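+#
+# merge_lora_weights folds each low-rank LoRA update directly into the corresponding
+# base attention weight. Conceptually, for a projection weight W with LoRA factors
+# up (out_features x rank) and down (rank x in_features):
+#
+#     W_merged = W + scale * (up @ down)
+#
+# which is what the torch.mm(...) additions at the bottom of this file compute for
+# the to_q / to_k / to_v / to_out projections.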
+ +def is_lora_model(model_name): + # TODO(jstoecker): might be a better way to detect (e.g. presence of LORA weights file) + return False + + +# Merges LoRA weights into the layers of a base model +def merge_lora_weights(base_model, lora_model_id, submodel_name="unet", scale=1.0): + import inspect + from collections import defaultdict + from functools import reduce + + try: + from diffusers.loaders import LORA_WEIGHT_NAME + except ImportError: + # moved in version 0.24.0 + from diffusers.loaders.lora import LORA_WEIGHT_NAME + from diffusers.models.attention_processor import LoRAAttnProcessor + from diffusers.utils.hub_utils import _get_model_file + + parameters = inspect.signature(_get_model_file).parameters + + kwargs = {} + if "use_auth_token" in parameters: + kwargs["use_auth_token"] = None + elif "token" in parameters: + kwargs["token"] = None + + # Load LoRA weights + model_file = _get_model_file( + lora_model_id, + weights_name=LORA_WEIGHT_NAME, + cache_dir=None, + force_download=False, + resume_download=False, + proxies=None, + local_files_only=False, + revision=None, + subfolder=None, + user_agent={ + "file_type": "attn_procs_weights", + "framework": "pytorch", + }, + **kwargs, + ) + lora_state_dict = torch.load(model_file, map_location="cpu") + + # All keys in the LoRA state dictionary should have 'lora' somewhere in the string. + keys = list(lora_state_dict.keys()) + assert all("lora" in k for k in keys) + + if all(key.startswith(submodel_name) for key in keys): + # New format (https://github.com/huggingface/diffusers/pull/2918) supports LoRA weights in both the + # unet and text encoder where keys are prefixed with 'unet' or 'text_encoder', respectively. + submodel_state_dict = {k: v for k, v in lora_state_dict.items() if k.startswith(submodel_name)} + else: + # Old format. Keys will not have any prefix. This only applies to unet, so exit early if this is + # optimizing the text encoder. 
+ if submodel_name != "unet": + return + submodel_state_dict = lora_state_dict + + # Group LoRA weights into attention processors + attn_processors = {} + lora_grouped_dict = defaultdict(dict) + for key, value in submodel_state_dict.items(): + attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:]) + lora_grouped_dict[attn_processor_key][sub_key] = value + + for key, value_dict in lora_grouped_dict.items(): + rank = value_dict["to_k_lora.down.weight"].shape[0] + cross_attention_dim = value_dict["to_k_lora.down.weight"].shape[1] + hidden_size = value_dict["to_k_lora.up.weight"].shape[0] + + attn_processors[key] = LoRAAttnProcessor( + hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, rank=rank + ) + attn_processors[key].load_state_dict(value_dict) + + # Merge LoRA attention processor weights into existing Q/K/V/Out weights + for name, proc in attn_processors.items(): + attention_name = name[: -len(".processor")] + attention = reduce(getattr, attention_name.split(sep="."), base_model) + attention.to_q.weight.data += scale * torch.mm(proc.to_q_lora.up.weight, proc.to_q_lora.down.weight) + attention.to_k.weight.data += scale * torch.mm(proc.to_k_lora.up.weight, proc.to_k_lora.down.weight) + attention.to_v.weight.data += scale * torch.mm(proc.to_v_lora.up.weight, proc.to_v_lora.down.weight) + attention.to_out[0].weight.data += scale * torch.mm(proc.to_out_lora.up.weight, proc.to_out_lora.down.weight) diff --git a/OnnxStack.Converter/stable_diffusion/requirements.txt b/OnnxStack.Converter/stable_diffusion/requirements.txt new file mode 100644 index 0000000..15b9198 --- /dev/null +++ b/OnnxStack.Converter/stable_diffusion/requirements.txt @@ -0,0 +1,9 @@ +accelerate +diffusers +onnx +pillow +protobuf==3.20.3 # protobuf 4.x aborts with OOM when optimizing unet +tabulate +torch +transformers +onnxruntime-directml>=1.16.0 diff --git a/OnnxStack.Converter/stable_diffusion/sd_utils/ort.py b/OnnxStack.Converter/stable_diffusion/sd_utils/ort.py new file mode 100644 index 0000000..ad49818 --- /dev/null +++ b/OnnxStack.Converter/stable_diffusion/sd_utils/ort.py @@ -0,0 +1,172 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +import os +import json +import shutil +import sys +from pathlib import Path +from typing import Dict + +import onnxruntime as ort +from diffusers import OnnxRuntimeModel, OnnxStableDiffusionPipeline +from onnxruntime import __version__ as OrtVersion +from packaging import version + +from olive.model import ONNXModelHandler + +# ruff: noqa: TID252, T201 + + +def update_cuda_config(config: Dict): + if version.parse(OrtVersion) < version.parse("1.17.0"): + # disable skip_group_norm fusion since there is a shape inference bug which leads to invalid models + config["passes"]["optimize_cuda"]["config"]["optimization_options"] = {"enable_skip_group_norm": False} + config["pass_flows"] = [["convert", "optimize_cuda"]] + config["systems"]["local_system"]["config"]["accelerators"][0]["execution_providers"] = ["CUDAExecutionProvider"] + return config + + +def validate_args(args, provider): + ort.set_default_logger_severity(4) + if args.static_dims: + print( + "WARNING: the --static_dims option is deprecated, and static shape optimization is enabled by default. " + "Use --dynamic_dims to disable static shape optimization." 
+ ) + + validate_ort_version(provider) + + +def validate_ort_version(provider: str): + if provider == "dml" and version.parse(OrtVersion) < version.parse("1.16.0"): + print("This script requires onnxruntime-directml 1.16.0 or newer") + sys.exit(1) + elif provider == "cuda" and version.parse(OrtVersion) < version.parse("1.17.0"): + if version.parse(OrtVersion) < version.parse("1.16.2"): + print("This script requires onnxruntime-gpu 1.16.2 or newer") + sys.exit(1) + print( + f"WARNING: onnxruntime {OrtVersion} has known issues with shape inference for SkipGroupNorm. Will disable" + " skip_group_norm fusion. onnxruntime-gpu 1.17.0 or newer is strongly recommended!" + ) + + +def save_optimized_onnx_submodel(submodel_name, provider, model_info): + footprints_file_path = ( + Path(__file__).resolve().parents[1] / "footprints" / f"{submodel_name}_gpu-{provider}_footprints.json" + ) + with footprints_file_path.open("r") as footprint_file: + footprints = json.load(footprint_file) + + conversion_footprint = None + optimizer_footprint = None + for footprint in footprints.values(): + if footprint["from_pass"] == "OnnxConversion": + conversion_footprint = footprint + elif footprint["from_pass"] == "OrtTransformersOptimization": + optimizer_footprint = footprint + + assert conversion_footprint + assert optimizer_footprint + + unoptimized_olive_model = ONNXModelHandler(**conversion_footprint["model_config"]["config"]) + optimized_olive_model = ONNXModelHandler(**optimizer_footprint["model_config"]["config"]) + + model_info[submodel_name] = { + "unoptimized": { + "path": Path(unoptimized_olive_model.model_path), + }, + "optimized": { + "path": Path(optimized_olive_model.model_path), + }, + } + + print(f"Unoptimized Model : {model_info[submodel_name]['unoptimized']['path']}") + print(f"Optimized Model : {model_info[submodel_name]['optimized']['path']}") + + +def save_onnx_pipeline( + has_safety_checker, model_info, model_output, pipeline, submodel_names +): + # Save the unoptimized models in a directory structure that the diffusers library can load and run. + # This is optional, and the optimized models can be used directly in a custom pipeline if desired. 
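+    # As an illustrative aside (not part of this script's flow), an optimized submodel can also
+    # be loaded on its own with ONNX Runtime, e.g.:
+    #   session = ort.InferenceSession(
+    #       str(model_info["unet"]["optimized"]["path"]), providers=["DmlExecutionProvider"]
+    #   )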
+    print("\nCreating ONNX pipeline...")
+
+    optimized_model_dir = model_output / "Optimized"
+    unoptimized_model_dir = model_output / "Default"
+    has_controlnet = "controlnet" in submodel_names
+    if has_safety_checker:
+        safety_checker = OnnxRuntimeModel.from_pretrained(model_info["safety_checker"]["unoptimized"]["path"].parent)
+    else:
+        safety_checker = None
+
+    onnx_pipeline = OnnxStableDiffusionPipeline(
+        vae_encoder=OnnxRuntimeModel.from_pretrained(model_info["vae_encoder"]["unoptimized"]["path"].parent),
+        vae_decoder=OnnxRuntimeModel.from_pretrained(model_info["vae_decoder"]["unoptimized"]["path"].parent),
+        text_encoder=OnnxRuntimeModel.from_pretrained(model_info["text_encoder"]["unoptimized"]["path"].parent),
+        tokenizer=pipeline.tokenizer,
+        unet=OnnxRuntimeModel.from_pretrained(model_info["unet"]["unoptimized"]["path"].parent),
+        scheduler=pipeline.scheduler,
+        safety_checker=safety_checker,
+        feature_extractor=pipeline.feature_extractor,
+        requires_safety_checker=True,
+    )
+
+    if has_controlnet:
+        controlnet = OnnxRuntimeModel.from_pretrained(model_info["controlnet"]["unoptimized"]["path"].parent)
+
+    print("Saving unoptimized models...")
+    onnx_pipeline.save_pretrained(unoptimized_model_dir)
+    if has_controlnet:
+        controlnet.save_pretrained(unoptimized_model_dir / "controlnet")
+
+    # Create a copy of the unoptimized model directory, then overwrite it with optimized models from the Olive cache.
+    print("Copying optimized models...")
+    shutil.copytree(unoptimized_model_dir, optimized_model_dir, ignore=shutil.ignore_patterns("weights.pb"))
+    for submodel_name in submodel_names:
+        src_path = model_info[submodel_name]["optimized"]["path"]
+        dst_path = optimized_model_dir / submodel_name / "model.onnx"
+        # Ensure the submodel directory exists before copying the optimized model into it.
+        os.makedirs(optimized_model_dir / submodel_name, exist_ok=True)
+        shutil.copyfile(src_path, dst_path)
+
+    print(f"The default pipeline is located here: {unoptimized_model_dir}")
+    print(f"The optimized pipeline is located here: {optimized_model_dir}")
+
+
+def get_ort_pipeline(model_dir, common_args, ort_args, guidance_scale):
+    ort.set_default_logger_severity(3)
+
+    print("Loading models into ORT session...")
+    sess_options = ort.SessionOptions()
+    sess_options.enable_mem_pattern = False
+
+    static_dims = not ort_args.dynamic_dims
+    batch_size = common_args.batch_size
+    image_size = common_args.image_size
+    provider = common_args.provider
+
+    if static_dims:
+        hidden_batch_size = batch_size if (guidance_scale == 0.0) else batch_size * 2
+        # Not necessary, but helps DML EP further optimize runtime performance.
+ # batch_size is doubled for sample & hidden state because of classifier free guidance: + # https://github.com/huggingface/diffusers/blob/46c52f9b9607e6ecb29c782c052aea313e6487b7/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py#L672 + sess_options.add_free_dimension_override_by_name("unet_sample_batch", hidden_batch_size) + sess_options.add_free_dimension_override_by_name("unet_sample_channels", 4) + sess_options.add_free_dimension_override_by_name("unet_sample_height", image_size // 8) + sess_options.add_free_dimension_override_by_name("unet_sample_width", image_size // 8) + sess_options.add_free_dimension_override_by_name("unet_time_batch", 1) + sess_options.add_free_dimension_override_by_name("unet_hidden_batch", hidden_batch_size) + sess_options.add_free_dimension_override_by_name("unet_hidden_sequence", 77) + + provider_map = { + "dml": "DmlExecutionProvider", + "cuda": "CUDAExecutionProvider", + } + assert provider in provider_map, f"Unsupported provider: {provider}" + return OnnxStableDiffusionPipeline.from_pretrained( + model_dir, provider=provider_map[provider], sess_options=sess_options + )