diff --git a/OnnxStack.Converter/README.md b/OnnxStack.Converter/README.md new file mode 100644 index 0000000..5b2a146 --- /dev/null +++ b/OnnxStack.Converter/README.md @@ -0,0 +1,20 @@ +# OnnxStack.Converter + +## Requirements +```bash +pip install onnxruntime-directml +pip install olive-ai[directml] +python -m pip install -r requirements.txt +``` + +## Usage +```bash +convert.py --optimize --model_input '..\stable-diffusion-v1-5' --model_output '..\converted' --controlnet +``` +`--optimize` - Run the model optimization + +`--model_input` - Safetensor model to convert + +`--model_output` - Output for converted ONNX model (NOTE: This folder is deleted before each run) + +`--controlnet` - Create a ControlNet enabled Unet model \ No newline at end of file diff --git a/OnnxStack.Converter/latent_consistency/.gitignore b/OnnxStack.Converter/latent_consistency/.gitignore new file mode 100644 index 0000000..4cf6f30 --- /dev/null +++ b/OnnxStack.Converter/latent_consistency/.gitignore @@ -0,0 +1,3 @@ +/footprints/ +/cache/ +/result_*.png diff --git a/OnnxStack.Converter/latent_consistency/config.py b/OnnxStack.Converter/latent_consistency/config.py new file mode 100644 index 0000000..1806391 --- /dev/null +++ b/OnnxStack.Converter/latent_consistency/config.py @@ -0,0 +1,8 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +vae_sample_size = 768 +unet_sample_size = 96 +cross_attention_dim = 768 diff --git a/OnnxStack.Converter/latent_consistency/config_controlnet.json b/OnnxStack.Converter/latent_consistency/config_controlnet.json new file mode 100644 index 0000000..0d9331f --- /dev/null +++ b/OnnxStack.Converter/latent_consistency/config_controlnet.json @@ -0,0 +1,124 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "SimianLuo/LCM_Dreamshaper_v7", + "model_loader": "controlnet_unet_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "sample", "timestep", "encoder_hidden_states", "down_block_0_additional_residual", "down_block_1_additional_residual", "down_block_2_additional_residual", "down_block_3_additional_residual", "down_block_4_additional_residual", "down_block_5_additional_residual", "down_block_6_additional_residual", "down_block_7_additional_residual", "down_block_8_additional_residual", "down_block_9_additional_residual", "down_block_10_additional_residual", "down_block_11_additional_residual", "mid_block_additional_residual", "return_dict" ], + "output_names": [ "out_sample" ], + "dynamic_axes": { + "sample": {"0": "unet_sample_batch", "1": "unet_sample_channels", "2": "unet_sample_height", "3": "unet_sample_width"}, + "timestep": {"0": "unet_time_batch"}, + "encoder_hidden_states": {"0": "unet_hidden_batch", "1": "unet_hidden_sequence"}, + "timestep_cond": { "0": "batch_size" }, + "down_block_0_additional_residual": {"0": "cnet_db0_batch", "1": "cnet_db0_channels", "2": "cnet_db0_height", "3": "cnet_db0_width"}, + "down_block_1_additional_residual": {"0": "cnet_db1_batch", "1": "cnet_db1_channels", "2": "cnet_db1_height", "3": "cnet_db1_width"}, + "down_block_2_additional_residual": {"0": "cnet_db2_batch", "1": "cnet_db2_channels", "2": "cnet_db2_height", "3": "cnet_db2_width"}, + "down_block_3_additional_residual": {"0": "cnet_db3_batch", "1": "cnet_db3_channels", "2": "cnet_db3_height2", "3": "cnet_db3_width2"}, + 
"down_block_4_additional_residual": {"0": "cnet_db4_batch", "1": "cnet_db4_channels", "2": "cnet_db4_height2", "3": "cnet_db4_width2"}, + "down_block_5_additional_residual": {"0": "cnet_db5_batch", "1": "cnet_db5_channels", "2": "cnet_db5_height2", "3": "cnet_db5_width2"}, + "down_block_6_additional_residual": {"0": "cnet_db6_batch", "1": "cnet_db6_channels", "2": "cnet_db6_height4", "3": "cnet_db6_width4"}, + "down_block_7_additional_residual": {"0": "cnet_db7_batch", "1": "cnet_db7_channels", "2": "cnet_db7_height4", "3": "cnet_db7_width4"}, + "down_block_8_additional_residual": {"0": "cnet_db8_batch", "1": "cnet_db8_channels", "2": "cnet_db8_height4", "3": "cnet_db8_width4"}, + "down_block_9_additional_residual": {"0": "cnet_db9_batch", "1": "cnet_db9_channels", "2": "cnet_db9_height8", "3": "cnet_db9_width8"}, + "down_block_10_additional_residual": {"0": "cnet_db10_batch", "1": "cnet_db10_channels", "2": "cnet_db10_height8", "3": "cnet_db10_width8"}, + "down_block_11_additional_residual": {"0": "cnet_db11_batch", "1": "cnet_db11_channels", "2": "cnet_db11_height8", "3": "cnet_db11_width8"}, + "mid_block_additional_residual": {"0": "cnet_mbar_batch", "1": "cnet_mbar_channels", "2": "cnet_mbar_height8", "3": "cnet_mbar_width8"} + } + }, + "dummy_inputs_func": "controlnet_unet_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "controlnet_unet_data_loader", + "batch_size": 2 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 14, + "save_as_external_data": true, + "all_tensors_to_one_file": true, + "external_data_name": "weights.pb" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "controlnet", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/latent_consistency/config_safety_checker.json b/OnnxStack.Converter/latent_consistency/config_safety_checker.json new file mode 100644 index 0000000..bef935f --- /dev/null +++ b/OnnxStack.Converter/latent_consistency/config_safety_checker.json @@ -0,0 +1,124 @@ +{ + "input_model": { + 
"type": "PyTorchModel", + "config": { + "model_path": "SimianLuo/LCM_Dreamshaper_v7", + "model_loader": "safety_checker_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "clip_input", "images" ], + "output_names": [ "out_images", "has_nsfw_concepts" ], + "dynamic_axes": { + "clip_input": { "0": "batch", "1": "channels", "2": "height", "3": "width" }, + "images": { "0": "batch", "1": "height", "2": "width", "3": "channels" } + } + }, + "dummy_inputs_func": "safety_checker_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "safety_checker_data_loader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 14 + } + }, + "ov_convert": { + "type": "OpenVINOConversion", + "config": { + "user_script": "models.py", + "example_input_func": "safety_checker_conversion_inputs", + "output_model": "safety_checker" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "safety_checker", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/latent_consistency/config_text_encoder.json b/OnnxStack.Converter/latent_consistency/config_text_encoder.json new file mode 100644 index 0000000..0a1c5de --- /dev/null +++ b/OnnxStack.Converter/latent_consistency/config_text_encoder.json @@ -0,0 +1,121 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "SimianLuo/LCM_Dreamshaper_v7", + "model_loader": "text_encoder_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "input_ids" ], + "output_names": [ "last_hidden_state", "pooler_output" ], + "dynamic_axes": { "input_ids": { "0": "batch", "1": "sequence" } } + }, + "dummy_inputs_func": "text_encoder_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": 
"gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "text_encoder_data_loader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 14 + } + }, + "ov_convert": { + "type": "OpenVINOConversion", + "config": { + "user_script": "models.py", + "example_input_func": "text_encoder_conversion_inputs", + "output_model": "text_encoder" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "clip", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "clip", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "text_encoder", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/latent_consistency/config_unet.json b/OnnxStack.Converter/latent_consistency/config_unet.json new file mode 100644 index 0000000..1c3b983 --- /dev/null +++ b/OnnxStack.Converter/latent_consistency/config_unet.json @@ -0,0 +1,129 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "SimianLuo/LCM_Dreamshaper_v7", + "model_loader": "unet_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "sample", "timestep", "encoder_hidden_states", "timestep_cond", "return_dict" ], + "output_names": [ "out_sample" ], + "dynamic_axes": { + "sample": {"0": "unet_sample_batch", "1": "unet_sample_channels", "2": "unet_sample_height", "3": "unet_sample_width"}, + "timestep": {"0": "unet_time_batch"}, + "encoder_hidden_states": {"0": "unet_hidden_batch", "1": "unet_hidden_sequence"}, + "timestep_cond": { "0": "batch_size" } + } + }, + "dummy_inputs_func": "unet_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "unet_data_loader", + "batch_size": 2 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", 
+ "config": { + "target_opset": 14, + "save_as_external_data": true, + "all_tensors_to_one_file": true, + "external_data_name": "weights.pb" + } + }, + "ov_convert": { + "type": "OpenVINOConversion", + "config": { + "user_script": "models.py", + "example_input_func": "get_unet_ov_example_input", + "output_model": "unet" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "unet", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/latent_consistency/config_vae_decoder.json b/OnnxStack.Converter/latent_consistency/config_vae_decoder.json new file mode 100644 index 0000000..755ab9a --- /dev/null +++ b/OnnxStack.Converter/latent_consistency/config_vae_decoder.json @@ -0,0 +1,121 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "SimianLuo/LCM_Dreamshaper_v7", + "model_loader": "vae_decoder_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "latent_sample", "return_dict" ], + "output_names": [ "sample" ], + "dynamic_axes": { "latent_sample": { "0": "batch", "1": "channels", "2": "height", "3": "width" } } + }, + "dummy_inputs_func": "vae_decoder_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "vae_decoder_data_loader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 14 + } + }, + "ov_convert": { + "type": "OpenVINOConversion", + "config": { + "user_script": "models.py", + "example_input_func": "vae_decoder_conversion_inputs", + "output_model": "vae_decoder" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "vae", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + 
"use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "vae", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "vae_decoder", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/latent_consistency/config_vae_encoder.json b/OnnxStack.Converter/latent_consistency/config_vae_encoder.json new file mode 100644 index 0000000..7a664ea --- /dev/null +++ b/OnnxStack.Converter/latent_consistency/config_vae_encoder.json @@ -0,0 +1,121 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "SimianLuo/LCM_Dreamshaper_v7", + "model_loader": "vae_encoder_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "sample", "return_dict" ], + "output_names": [ "latent_sample" ], + "dynamic_axes": { "sample": { "0": "batch", "1": "channels", "2": "height", "3": "width" } } + }, + "dummy_inputs_func": "vae_encoder_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "vae_encoder_data_loader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 14 + } + }, + "ov_convert": { + "type": "OpenVINOConversion", + "config": { + "user_script": "models.py", + "example_input_func": "vae_encoder_conversion_inputs", + "output_model": "vae_encoder" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "vae", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + 
"optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "vae", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "vae_encoder", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/latent_consistency/convert.py b/OnnxStack.Converter/latent_consistency/convert.py new file mode 100644 index 0000000..2c476a0 --- /dev/null +++ b/OnnxStack.Converter/latent_consistency/convert.py @@ -0,0 +1,272 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +import argparse +import json +import shutil +import sys +import warnings +from pathlib import Path +from typing import Dict + +import config +import torch +from diffusers import DiffusionPipeline +from packaging import version + +from olive.common.utils import set_tempdir +from olive.workflows import run as olive_run + +# pylint: disable=redefined-outer-name +# ruff: noqa: TID252, T201 + + +def save_image(result, batch_size, provider, num_images, images_saved, image_callback=None): + passed_safety_checker = 0 + for image_index in range(batch_size): + if result.nsfw_content_detected is None or not result.nsfw_content_detected[image_index]: + passed_safety_checker += 1 + if images_saved < num_images: + output_path = f"result_{images_saved}.png" + result.images[image_index].save(output_path) + if image_callback: + image_callback(images_saved, output_path) + images_saved += 1 + print(f"Generated {output_path}") + print(f"Inference Batch End ({passed_safety_checker}/{batch_size} images).") + print("Images passed the safety checker.") + return images_saved + + +def run_inference_loop( + pipeline, + prompt, + num_images, + batch_size, + image_size, + num_inference_steps, + guidance_scale, + strength: float, + provider: str, + image_callback=None, + step_callback=None, +): + images_saved = 0 + + def update_steps(step, timestep, latents): + if step_callback: + step_callback((images_saved // batch_size) * num_inference_steps + step) + + while images_saved < num_images: + print(f"\nInference Batch Start (batch size = {batch_size}).") + + kwargs = {} + + result = pipeline( + [prompt] * batch_size, + num_inference_steps=num_inference_steps, + callback=update_steps if step_callback else None, + height=image_size, + width=image_size, + guidance_scale=guidance_scale, + **kwargs, + ) + + images_saved = save_image(result, batch_size, provider, num_images, images_saved, image_callback) + + +def update_config_with_provider(config: Dict, provider: str): + if provider == "dml": + # DirectML EP is the default, so no need to update config. 
+ return config + elif provider == "cuda": + from sd_utils.ort import update_cuda_config + + return update_cuda_config(config) + else: + raise ValueError(f"Unsupported provider: {provider}") + + +def optimize( + model_input: str, + model_output: Path, + provider: str, + controlnet: bool +): + from google.protobuf import __version__ as protobuf_version + + # protobuf 4.x aborts with OOM when optimizing unet + if version.parse(protobuf_version) > version.parse("3.20.3"): + print("This script requires protobuf 3.20.3. Please ensure your package version matches requirements.txt.") + sys.exit(1) + + model_dir = model_input + script_dir = Path(__file__).resolve().parent + + # Clean up previously optimized models, if any. + shutil.rmtree(script_dir / "footprints", ignore_errors=True) + shutil.rmtree(model_output, ignore_errors=True) + + + # Load the entire PyTorch pipeline to ensure all models and their configurations are downloaded and cached. + # This avoids an issue where the non-ONNX components (tokenizer, scheduler, and feature extractor) are not + # automatically cached correctly if individual models are fetched one at a time. + print("Download stable diffusion PyTorch pipeline...") + pipeline = DiffusionPipeline.from_pretrained(model_dir, torch_dtype=torch.float32, **{"local_files_only": True}) + config.vae_sample_size = pipeline.vae.config.sample_size + config.cross_attention_dim = pipeline.unet.config.cross_attention_dim + config.unet_sample_size = pipeline.unet.config.sample_size + + model_info = {} + + submodel_names = ["vae_encoder", "vae_decoder", "unet" , "text_encoder"] + + has_safety_checker = getattr(pipeline, "safety_checker", None) is not None + + if has_safety_checker: + submodel_names.append("safety_checker") + + if controlnet: + submodel_names.append("controlnet") + + for submodel_name in submodel_names: + print(f"\nOptimizing {submodel_name}") + + olive_config = None + with (script_dir / f"config_{submodel_name}.json").open() as fin: + olive_config = json.load(fin) + olive_config = update_config_with_provider(olive_config, provider) + + if submodel_name in ("unet", "controlnet", "text_encoder"): + olive_config["input_model"]["config"]["model_path"] = model_dir + else: + # Only the unet & text encoder are affected by LoRA, so it's better to use the base model ID for + # other models: the Olive cache is based on the JSON config, and two LoRA variants with the same + # base model ID should be able to reuse previously optimized copies. 
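+            # NOTE: as written, this branch also points at the local model_dir; substitute the base model ID
+            # here if the Olive cache reuse across LoRA variants described above is wanted.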
+ olive_config["input_model"]["config"]["model_path"] = model_dir + + run_res = olive_run(olive_config) + + from sd_utils.ort import save_optimized_onnx_submodel + + save_optimized_onnx_submodel(submodel_name, provider, model_info) + + from sd_utils.ort import save_onnx_pipeline + + save_onnx_pipeline( + has_safety_checker, model_info, model_output, pipeline, submodel_names + ) + + return model_info + + +def parse_common_args(raw_args): + parser = argparse.ArgumentParser("Common arguments") + + parser.add_argument("--model_input", default="stable-diffusion-v1-5", type=str) + parser.add_argument("--model_output", default="stable-diffusion-v1-5", type=Path) + parser.add_argument("--controlnet",action="store_true", help="Create ControlNet Unet Model") + parser.add_argument( + "--provider", default="dml", type=str, choices=["dml", "cuda"], help="Execution provider to use" + ) + parser.add_argument("--optimize", action="store_true", help="Runs the optimization step") + parser.add_argument("--clean_cache", action="store_true", help="Deletes the Olive cache") + parser.add_argument("--test_unoptimized", action="store_true", help="Use unoptimized model for inference") + parser.add_argument("--batch_size", default=1, type=int, help="Number of images to generate per batch") + parser.add_argument( + "--prompt", + default=( + "castle surrounded by water and nature, village, volumetric lighting, photorealistic, " + "detailed and intricate, fantasy, epic cinematic shot, mountains, 8k ultra hd" + ), + type=str, + ) + parser.add_argument( + "--guidance_scale", + default=7.5, + type=float, + help="Guidance scale as defined in Classifier-Free Diffusion Guidance", + ) + parser.add_argument("--num_images", default=1, type=int, help="Number of images to generate") + parser.add_argument("--num_inference_steps", default=50, type=int, help="Number of steps in diffusion process") + parser.add_argument("--tempdir", default=None, type=str, help="Root directory for tempfile directories and files") + parser.add_argument( + "--strength", + default=1.0, + type=float, + help="Value between 0.0 and 1.0, that controls the amount of noise that is added to the input image. " + "Values that approach 1.0 enable lots of variations but will also produce images " + "that are not semantically consistent with the input.", + ) + parser.add_argument("--image_size", default=512, type=int, help="Width and height of the images to generate") + + return parser.parse_known_args(raw_args) + + +def parse_ort_args(raw_args): + parser = argparse.ArgumentParser("ONNX Runtime arguments") + + parser.add_argument( + "--static_dims", + action="store_true", + help="DEPRECATED (now enabled by default). 
Use --dynamic_dims to disable static_dims.", + ) + parser.add_argument("--dynamic_dims", action="store_true", help="Disable static shape optimization") + + return parser.parse_known_args(raw_args) + + +def main(raw_args=None): + common_args, extra_args = parse_common_args(raw_args) + + provider = common_args.provider + model_input = common_args.model_input + model_output = common_args.model_output + + script_dir = Path(__file__).resolve().parent + + + if common_args.clean_cache: + shutil.rmtree(script_dir / "cache", ignore_errors=True) + + guidance_scale = common_args.guidance_scale + + ort_args = None, None + ort_args, extra_args = parse_ort_args(extra_args) + + if common_args.optimize or not model_output.exists(): + set_tempdir(common_args.tempdir) + + # TODO(jstoecker): clean up warning filter (mostly during conversion from torch to ONNX) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + + from sd_utils.ort import validate_args + + validate_args(ort_args, common_args.provider) + optimize(common_args.model_input, common_args.model_output, common_args.provider, common_args.controlnet) + + if not common_args.optimize: + model_dir = model_output / "F32" if common_args.test_unoptimized else model_output / "F16" + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + + from sd_utils.ort import get_ort_pipeline + + pipeline = get_ort_pipeline(model_dir, common_args, ort_args, guidance_scale) + run_inference_loop( + pipeline, + common_args.prompt, + common_args.num_images, + common_args.batch_size, + common_args.image_size, + common_args.num_inference_steps, + guidance_scale, + common_args.strength, + provider=provider, + ) + + +if __name__ == "__main__": + main() diff --git a/OnnxStack.Converter/latent_consistency/models.py b/OnnxStack.Converter/latent_consistency/models.py new file mode 100644 index 0000000..8b3de3f --- /dev/null +++ b/OnnxStack.Converter/latent_consistency/models.py @@ -0,0 +1,336 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- +import config +import torch +from typing import Union, Optional, Tuple +from diffusers import AutoencoderKL, UNet2DConditionModel, ControlNetModel +from diffusers.models.controlnet import ControlNetOutput, BaseOutput as ControlNetBaseOutput +from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker +from transformers.models.clip.modeling_clip import CLIPTextModel +from dataclasses import dataclass + +# Helper latency-only dataloader that creates random tensors with no label +class RandomDataLoader: + def __init__(self, create_inputs_func, batchsize, torch_dtype): + self.create_input_func = create_inputs_func + self.batchsize = batchsize + self.torch_dtype = torch_dtype + + def __getitem__(self, idx): + label = None + return self.create_input_func(self.batchsize, self.torch_dtype), label + + + +# ----------------------------------------------------------------------------- +# TEXT ENCODER +# ----------------------------------------------------------------------------- + + +def text_encoder_inputs(batchsize, torch_dtype): + return torch.zeros((batchsize, 77), dtype=torch_dtype) + + +def text_encoder_load(model_name): + model = CLIPTextModel.from_pretrained(model_name, subfolder="text_encoder") + if is_lora_model(model_name): + merge_lora_weights(model, model_name, "text_encoder") + return model + + +def text_encoder_conversion_inputs(model=None): + return text_encoder_inputs(1, torch.int32) + + +def text_encoder_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(text_encoder_inputs, batchsize, torch.int32) + + +# ----------------------------------------------------------------------------- +# UNET +# ----------------------------------------------------------------------------- + + +def unet_inputs(batchsize, torch_dtype, is_conversion_inputs=False): + # TODO(jstoecker): Rename onnx::Concat_4 to text_embeds and onnx::Shape_5 to time_ids + inputs = { + "sample": torch.rand((batchsize, 4, config.unet_sample_size, config.unet_sample_size), dtype=torch_dtype), + "timestep": torch.rand((batchsize,), dtype=torch_dtype), + "encoder_hidden_states": torch.rand((batchsize, 77, config.cross_attention_dim), dtype=torch_dtype), + } + + # use as kwargs since they won't be in the correct position if passed along with the tuple of inputs + kwargs = { + "timestep_cond": torch.rand((batchsize, 256), dtype=torch_dtype), + "return_dict": False, + } + if is_conversion_inputs: + inputs["additional_inputs"] = { + **kwargs, + "added_cond_kwargs": { + "text_embeds": torch.rand((1, 1280), dtype=torch_dtype), + "time_ids": torch.rand((1, 5), dtype=torch_dtype), + }, + } + else: + inputs.update(kwargs) + inputs["onnx::Concat_4"] = torch.rand((1, 1280), dtype=torch_dtype) + inputs["onnx::Shape_5"] = torch.rand((1, 5), dtype=torch_dtype) + + return inputs + + +def unet_load(model_name): + model = UNet2DConditionModel.from_pretrained(model_name, subfolder="unet") + if is_lora_model(model_name): + merge_lora_weights(model, model_name, "unet") + return model + + +def unet_conversion_inputs(model=None): + return tuple(unet_inputs(1, torch.float32, True).values()) + + +def unet_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(unet_inputs, batchsize, torch.float16) + +# ----------------------------------------------------------------------------- +# CONTROLNET - UNET +# ----------------------------------------------------------------------------- + +class 
PatchedUNet2DConditionModel(UNet2DConditionModel): + def forward( + self, + sample: torch.FloatTensor, + timestep: Union[torch.Tensor, float, int], + encoder_hidden_states: torch.Tensor, + timestep_cond: torch.Tensor, + down_block_0_additional_residual: torch.Tensor, + down_block_1_additional_residual: torch.Tensor, + down_block_2_additional_residual: torch.Tensor, + down_block_3_additional_residual: torch.Tensor, + down_block_4_additional_residual: torch.Tensor, + down_block_5_additional_residual: torch.Tensor, + down_block_6_additional_residual: torch.Tensor, + down_block_7_additional_residual: torch.Tensor, + down_block_8_additional_residual: torch.Tensor, + down_block_9_additional_residual: torch.Tensor, + down_block_10_additional_residual: torch.Tensor, + down_block_11_additional_residual: torch.Tensor, + mid_block_additional_residual: torch.Tensor, + ) -> Union[UNet2DConditionModel, Tuple]: + down_block_add_res = ( + down_block_0_additional_residual, down_block_1_additional_residual, down_block_2_additional_residual, + down_block_3_additional_residual, down_block_4_additional_residual, down_block_5_additional_residual, + down_block_6_additional_residual, down_block_7_additional_residual, down_block_8_additional_residual, + down_block_9_additional_residual, down_block_10_additional_residual, down_block_11_additional_residual) + return super().forward( + sample = sample, + timestep = timestep, + encoder_hidden_states = encoder_hidden_states, + timestep_cond = timestep_cond, + down_block_additional_residuals = down_block_add_res, + mid_block_additional_residual = mid_block_additional_residual, + return_dict = False + ) + +def controlnet_unet_inputs(batchsize, torch_dtype): + return { + "sample": torch.rand((batchsize, 4, config.unet_sample_size, config.unet_sample_size), dtype=torch_dtype), + "timestep": torch.rand((batchsize,), dtype=torch_dtype), + "encoder_hidden_states": torch.rand((batchsize, 77, config.cross_attention_dim), dtype=torch_dtype), + "timestep_cond": torch.rand((batchsize, 256), dtype=torch_dtype), + "down_block_0_additional_residual": torch.rand((batchsize, 320, config.unet_sample_size, config.unet_sample_size), dtype=torch_dtype), + "down_block_1_additional_residual": torch.rand((batchsize, 320, config.unet_sample_size, config.unet_sample_size), dtype=torch_dtype), + "down_block_2_additional_residual": torch.rand((batchsize, 320, config.unet_sample_size, config.unet_sample_size), dtype=torch_dtype), + "down_block_3_additional_residual": torch.rand((batchsize, 320, config.unet_sample_size // 2, config.unet_sample_size // 2), dtype=torch_dtype), + "down_block_4_additional_residual": torch.rand((batchsize, 640, config.unet_sample_size // 2, config.unet_sample_size // 2), dtype=torch_dtype), + "down_block_5_additional_residual": torch.rand((batchsize, 640, config.unet_sample_size // 2, config.unet_sample_size // 2), dtype=torch_dtype), + "down_block_6_additional_residual": torch.rand((batchsize, 640, config.unet_sample_size // 4, config.unet_sample_size // 4), dtype=torch_dtype), + "down_block_7_additional_residual": torch.rand((batchsize, 1280, config.unet_sample_size // 4, config.unet_sample_size // 4), dtype=torch_dtype), + "down_block_8_additional_residual": torch.rand((batchsize, 1280, config.unet_sample_size // 4, config.unet_sample_size // 4), dtype=torch_dtype), + "down_block_9_additional_residual": torch.rand((batchsize, 1280, config.unet_sample_size // 8, config.unet_sample_size // 8), dtype=torch_dtype), + "down_block_10_additional_residual": 
torch.rand((batchsize, 1280, config.unet_sample_size // 8, config.unet_sample_size // 8), dtype=torch_dtype), + "down_block_11_additional_residual": torch.rand((batchsize, 1280, config.unet_sample_size // 8, config.unet_sample_size // 8), dtype=torch_dtype), + "mid_block_additional_residual": torch.rand((batchsize, 1280, config.unet_sample_size // 8, config.unet_sample_size // 8), dtype=torch_dtype) + } + + +def controlnet_unet_load(model_name): + model = PatchedUNet2DConditionModel.from_pretrained(model_name, subfolder="unet") + return model + + +def controlnet_unet_conversion_inputs(model): + return tuple(controlnet_unet_inputs(1, torch.float32).values()) + + +def controlnet_unet_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(controlnet_unet_inputs, batchsize, torch.float16) + +# ----------------------------------------------------------------------------- +# VAE ENCODER +# ----------------------------------------------------------------------------- + + +def vae_encoder_inputs(batchsize, torch_dtype): + return {"sample": torch.rand((batchsize, 3, config.vae_sample_size, config.vae_sample_size), dtype=torch_dtype)} + + +def vae_encoder_load(model_name): + model = AutoencoderKL.from_pretrained(model_name, subfolder="vae") + model.forward = lambda sample: model.encode(sample)[0].sample() + return model + + +def vae_encoder_conversion_inputs(model=None): + return tuple(vae_encoder_inputs(1, torch.float32).values()) + + +def vae_encoder_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(vae_encoder_inputs, batchsize, torch.float16) + + +# ----------------------------------------------------------------------------- +# VAE DECODER +# ----------------------------------------------------------------------------- + + +def vae_decoder_inputs(batchsize, torch_dtype): + return { + "latent_sample": torch.rand((batchsize, 4, config.unet_sample_size, config.unet_sample_size), dtype=torch_dtype) + } + + +def vae_decoder_load(model_name): + model = AutoencoderKL.from_pretrained(model_name, subfolder="vae") + model.forward = model.decode + return model + + +def vae_decoder_conversion_inputs(model=None): + return tuple(vae_decoder_inputs(1, torch.float32).values()) + + +def vae_decoder_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(vae_decoder_inputs, batchsize, torch.float16) + + +# ----------------------------------------------------------------------------- +# SAFETY CHECKER +# ----------------------------------------------------------------------------- + + +def safety_checker_inputs(batchsize, torch_dtype): + return { + "clip_input": torch.rand((batchsize, 3, 224, 224), dtype=torch_dtype), + "images": torch.rand((batchsize, config.vae_sample_size, config.vae_sample_size, 3), dtype=torch_dtype), + } + + +def safety_checker_load(model_name): + model = StableDiffusionSafetyChecker.from_pretrained(model_name, subfolder="safety_checker") + model.forward = model.forward_onnx + return model + + +def safety_checker_conversion_inputs(model=None): + return tuple(safety_checker_inputs(1, torch.float32).values()) + + +def safety_checker_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(safety_checker_inputs, batchsize, torch.float16) + + +# ----------------------------------------------------------------------------- +# LoRA weights +# ----------------------------------------------------------------------------- + +def is_lora_model(model_name): + # TODO(jstoecker): might be a better way to detect 
(e.g. presence of LORA weights file) + return False + + +# Merges LoRA weights into the layers of a base model +def merge_lora_weights(base_model, lora_model_id, submodel_name="unet", scale=1.0): + import inspect + from collections import defaultdict + from functools import reduce + + try: + from diffusers.loaders import LORA_WEIGHT_NAME + except ImportError: + # moved in version 0.24.0 + from diffusers.loaders.lora import LORA_WEIGHT_NAME + from diffusers.models.attention_processor import LoRAAttnProcessor + from diffusers.utils.hub_utils import _get_model_file + + parameters = inspect.signature(_get_model_file).parameters + + kwargs = {} + if "use_auth_token" in parameters: + kwargs["use_auth_token"] = None + elif "token" in parameters: + kwargs["token"] = None + + # Load LoRA weights + model_file = _get_model_file( + lora_model_id, + weights_name=LORA_WEIGHT_NAME, + cache_dir=None, + force_download=False, + resume_download=False, + proxies=None, + local_files_only=False, + revision=None, + subfolder=None, + user_agent={ + "file_type": "attn_procs_weights", + "framework": "pytorch", + }, + **kwargs, + ) + lora_state_dict = torch.load(model_file, map_location="cpu") + + # All keys in the LoRA state dictionary should have 'lora' somewhere in the string. + keys = list(lora_state_dict.keys()) + assert all("lora" in k for k in keys) + + if all(key.startswith(submodel_name) for key in keys): + # New format (https://github.com/huggingface/diffusers/pull/2918) supports LoRA weights in both the + # unet and text encoder where keys are prefixed with 'unet' or 'text_encoder', respectively. + submodel_state_dict = {k: v for k, v in lora_state_dict.items() if k.startswith(submodel_name)} + else: + # Old format. Keys will not have any prefix. This only applies to unet, so exit early if this is + # optimizing the text encoder. 
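+        # Old-format files carry no weights for other submodels, so the early return below leaves their
+        # base weights untouched.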
+ if submodel_name != "unet": + return + submodel_state_dict = lora_state_dict + + # Group LoRA weights into attention processors + attn_processors = {} + lora_grouped_dict = defaultdict(dict) + for key, value in submodel_state_dict.items(): + attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:]) + lora_grouped_dict[attn_processor_key][sub_key] = value + + for key, value_dict in lora_grouped_dict.items(): + rank = value_dict["to_k_lora.down.weight"].shape[0] + cross_attention_dim = value_dict["to_k_lora.down.weight"].shape[1] + hidden_size = value_dict["to_k_lora.up.weight"].shape[0] + + attn_processors[key] = LoRAAttnProcessor( + hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, rank=rank + ) + attn_processors[key].load_state_dict(value_dict) + + # Merge LoRA attention processor weights into existing Q/K/V/Out weights + for name, proc in attn_processors.items(): + attention_name = name[: -len(".processor")] + attention = reduce(getattr, attention_name.split(sep="."), base_model) + attention.to_q.weight.data += scale * torch.mm(proc.to_q_lora.up.weight, proc.to_q_lora.down.weight) + attention.to_k.weight.data += scale * torch.mm(proc.to_k_lora.up.weight, proc.to_k_lora.down.weight) + attention.to_v.weight.data += scale * torch.mm(proc.to_v_lora.up.weight, proc.to_v_lora.down.weight) + attention.to_out[0].weight.data += scale * torch.mm(proc.to_out_lora.up.weight, proc.to_out_lora.down.weight) diff --git a/OnnxStack.Converter/latent_consistency/requirements.txt b/OnnxStack.Converter/latent_consistency/requirements.txt new file mode 100644 index 0000000..15b9198 --- /dev/null +++ b/OnnxStack.Converter/latent_consistency/requirements.txt @@ -0,0 +1,9 @@ +accelerate +diffusers +onnx +pillow +protobuf==3.20.3 # protobuf 4.x aborts with OOM when optimizing unet +tabulate +torch +transformers +onnxruntime-directml>=1.16.0 diff --git a/OnnxStack.Converter/latent_consistency/sd_utils/ort.py b/OnnxStack.Converter/latent_consistency/sd_utils/ort.py new file mode 100644 index 0000000..ad49818 --- /dev/null +++ b/OnnxStack.Converter/latent_consistency/sd_utils/ort.py @@ -0,0 +1,172 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +import os +import json +import shutil +import sys +from pathlib import Path +from typing import Dict + +import onnxruntime as ort +from diffusers import OnnxRuntimeModel, OnnxStableDiffusionPipeline +from onnxruntime import __version__ as OrtVersion +from packaging import version + +from olive.model import ONNXModelHandler + +# ruff: noqa: TID252, T201 + + +def update_cuda_config(config: Dict): + if version.parse(OrtVersion) < version.parse("1.17.0"): + # disable skip_group_norm fusion since there is a shape inference bug which leads to invalid models + config["passes"]["optimize_cuda"]["config"]["optimization_options"] = {"enable_skip_group_norm": False} + config["pass_flows"] = [["convert", "optimize_cuda"]] + config["systems"]["local_system"]["config"]["accelerators"][0]["execution_providers"] = ["CUDAExecutionProvider"] + return config + + +def validate_args(args, provider): + ort.set_default_logger_severity(4) + if args.static_dims: + print( + "WARNING: the --static_dims option is deprecated, and static shape optimization is enabled by default. " + "Use --dynamic_dims to disable static shape optimization." 
+ ) + + validate_ort_version(provider) + + +def validate_ort_version(provider: str): + if provider == "dml" and version.parse(OrtVersion) < version.parse("1.16.0"): + print("This script requires onnxruntime-directml 1.16.0 or newer") + sys.exit(1) + elif provider == "cuda" and version.parse(OrtVersion) < version.parse("1.17.0"): + if version.parse(OrtVersion) < version.parse("1.16.2"): + print("This script requires onnxruntime-gpu 1.16.2 or newer") + sys.exit(1) + print( + f"WARNING: onnxruntime {OrtVersion} has known issues with shape inference for SkipGroupNorm. Will disable" + " skip_group_norm fusion. onnxruntime-gpu 1.17.0 or newer is strongly recommended!" + ) + + +def save_optimized_onnx_submodel(submodel_name, provider, model_info): + footprints_file_path = ( + Path(__file__).resolve().parents[1] / "footprints" / f"{submodel_name}_gpu-{provider}_footprints.json" + ) + with footprints_file_path.open("r") as footprint_file: + footprints = json.load(footprint_file) + + conversion_footprint = None + optimizer_footprint = None + for footprint in footprints.values(): + if footprint["from_pass"] == "OnnxConversion": + conversion_footprint = footprint + elif footprint["from_pass"] == "OrtTransformersOptimization": + optimizer_footprint = footprint + + assert conversion_footprint + assert optimizer_footprint + + unoptimized_olive_model = ONNXModelHandler(**conversion_footprint["model_config"]["config"]) + optimized_olive_model = ONNXModelHandler(**optimizer_footprint["model_config"]["config"]) + + model_info[submodel_name] = { + "unoptimized": { + "path": Path(unoptimized_olive_model.model_path), + }, + "optimized": { + "path": Path(optimized_olive_model.model_path), + }, + } + + print(f"Unoptimized Model : {model_info[submodel_name]['unoptimized']['path']}") + print(f"Optimized Model : {model_info[submodel_name]['optimized']['path']}") + + +def save_onnx_pipeline( + has_safety_checker, model_info, model_output, pipeline, submodel_names +): + # Save the unoptimized models in a directory structure that the diffusers library can load and run. + # This is optional, and the optimized models can be used directly in a custom pipeline if desired. 
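+    # Layout produced here: <model_output>/Default holds the unoptimized (FP32) diffusers pipeline, while
+    # <model_output>/Optimized is a copy of it in which each submodel's model.onnx is replaced by the
+    # FP16-optimized model taken from the Olive footprints.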
+ print("\nCreating ONNX pipeline...") + + optimized_model_dir = model_output / "Optimized" + unoptimized_model_dir = model_output / "Default" + has_controlnet = 'controlnet' in submodel_names + if has_safety_checker: + safety_checker = OnnxRuntimeModel.from_pretrained(model_info["safety_checker"]["unoptimized"]["path"].parent) + else: + safety_checker = None + + onnx_pipeline = OnnxStableDiffusionPipeline( + vae_encoder=OnnxRuntimeModel.from_pretrained(model_info["vae_encoder"]["unoptimized"]["path"].parent), + vae_decoder=OnnxRuntimeModel.from_pretrained(model_info["vae_decoder"]["unoptimized"]["path"].parent), + text_encoder=OnnxRuntimeModel.from_pretrained(model_info["text_encoder"]["unoptimized"]["path"].parent), + tokenizer=pipeline.tokenizer, + unet=OnnxRuntimeModel.from_pretrained(model_info["unet"]["unoptimized"]["path"].parent), + scheduler=pipeline.scheduler, + safety_checker=safety_checker, + feature_extractor=pipeline.feature_extractor, + requires_safety_checker=True, + ) + + if has_controlnet: + controlnet=OnnxRuntimeModel.from_pretrained(model_info["controlnet"]["unoptimized"]["path"].parent) + + print("Saving unoptimized models...") + onnx_pipeline.save_pretrained(unoptimized_model_dir) + if has_controlnet: + controlnet.save_pretrained(unoptimized_model_dir / "controlnet" ) + + # Create a copy of the unoptimized model directory, then overwrite with optimized models from the olive cache. + print("Copying optimized models...") + shutil.copytree(unoptimized_model_dir, optimized_model_dir, ignore=shutil.ignore_patterns("weights.pb")) + for submodel_name in submodel_names: + src_path = model_info[submodel_name]["optimized"]["path"] + dst_path = optimized_model_dir / submodel_name / "model.onnx" + exists = os.path.exists(dst_path) + if not exists: + os.mkdir(optimized_model_dir / submodel_name) + shutil.copyfile(src_path, dst_path) + + print(f"The default pipeline is located here: {unoptimized_model_dir}") + print(f"The optimized pipeline is located here: {optimized_model_dir}") + + +def get_ort_pipeline(model_dir, common_args, ort_args, guidance_scale): + ort.set_default_logger_severity(3) + + print("Loading models into ORT session...") + sess_options = ort.SessionOptions() + sess_options.enable_mem_pattern = False + + static_dims = not ort_args.dynamic_dims + batch_size = common_args.batch_size + image_size = common_args.image_size + provider = common_args.provider + + if static_dims: + hidden_batch_size = batch_size if (guidance_scale == 0.0) else batch_size * 2 + # Not necessary, but helps DML EP further optimize runtime performance. 
+ # batch_size is doubled for sample & hidden state because of classifier free guidance: + # https://github.com/huggingface/diffusers/blob/46c52f9b9607e6ecb29c782c052aea313e6487b7/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py#L672 + sess_options.add_free_dimension_override_by_name("unet_sample_batch", hidden_batch_size) + sess_options.add_free_dimension_override_by_name("unet_sample_channels", 4) + sess_options.add_free_dimension_override_by_name("unet_sample_height", image_size // 8) + sess_options.add_free_dimension_override_by_name("unet_sample_width", image_size // 8) + sess_options.add_free_dimension_override_by_name("unet_time_batch", 1) + sess_options.add_free_dimension_override_by_name("unet_hidden_batch", hidden_batch_size) + sess_options.add_free_dimension_override_by_name("unet_hidden_sequence", 77) + + provider_map = { + "dml": "DmlExecutionProvider", + "cuda": "CUDAExecutionProvider", + } + assert provider in provider_map, f"Unsupported provider: {provider}" + return OnnxStableDiffusionPipeline.from_pretrained( + model_dir, provider=provider_map[provider], sess_options=sess_options + ) diff --git a/OnnxStack.Converter/stable_cascade/.gitignore b/OnnxStack.Converter/stable_cascade/.gitignore new file mode 100644 index 0000000..4cf6f30 --- /dev/null +++ b/OnnxStack.Converter/stable_cascade/.gitignore @@ -0,0 +1,3 @@ +/footprints/ +/cache/ +/result_*.png diff --git a/OnnxStack.Converter/stable_cascade/README.md b/OnnxStack.Converter/stable_cascade/README.md new file mode 100644 index 0000000..40d584b --- /dev/null +++ b/OnnxStack.Converter/stable_cascade/README.md @@ -0,0 +1,20 @@ +# OnnxStack.Converter + +## Requirements +```bash +pip install onnxruntime-directml +pip install olive-ai[directml] +python -m pip install -r requirements.txt +``` + +## Usage +```bash +convert.py --optimize --model_input '..\stable-cascade' --model_output '..\converted' +``` +`--optimize` - Run the model optimization + +`--model_input` - Safetensor model to convert + +`--model_output` - Output for converted ONNX model (NOTE: This folder is deleted before each run) + +`--image_encoder` - Convert the optional image encoder diff --git a/OnnxStack.Converter/stable_cascade/config.py b/OnnxStack.Converter/stable_cascade/config.py new file mode 100644 index 0000000..7b1b47e --- /dev/null +++ b/OnnxStack.Converter/stable_cascade/config.py @@ -0,0 +1,8 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- + +vae_sample_size = 512 +unet_sample_size = 24 +cross_attention_dim = 1280 \ No newline at end of file diff --git a/OnnxStack.Converter/stable_cascade/config_decoder.json b/OnnxStack.Converter/stable_cascade/config_decoder.json new file mode 100644 index 0000000..6d70698 --- /dev/null +++ b/OnnxStack.Converter/stable_cascade/config_decoder.json @@ -0,0 +1,120 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "stabilityai/stable-cascade", + "model_loader": "decoder_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "sample", "timestep_ratio", "clip_text_pooled", "effnet", "return_dict" ], + "output_names": [ "out_sample" ], + "dynamic_axes": { + "sample": {"0": "unet_sample_batch", "1": "unet_sample_channels", "2": "unet_sample_height", "3": "unet_sample_width"}, + "timestep_ratio": {"0": "unet_timestep_ratio"}, + "clip_text_pooled": {"0": "unet_clip_text_pooled_batch", "1": "unet_clip_text_pooled_size"}, + "effnet": {"0": "unet_hidden_batch", "1": "unet_hidden_size"} + } + }, + "dummy_inputs_func": "decoder_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "decoder_data_loader", + "batch_size": 2 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 16, + "save_as_external_data": true, + "all_tensors_to_one_file": true + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": true, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "decoder", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "decoder", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/stable_cascade/config_image_encoder.json b/OnnxStack.Converter/stable_cascade/config_image_encoder.json new file mode 100644 index 0000000..08cfc7e --- /dev/null +++ b/OnnxStack.Converter/stable_cascade/config_image_encoder.json @@ -0,0 +1,113 @@ +{ + "input_model": { + "type": 
"PyTorchModel", + "config": { + "model_path": "stabilityai/stable-cascade", + "model_loader": "image_encoder_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "sample"], + "output_names": [ "image_embeds", "last_hidden_state"], + "dynamic_axes": { "sample": { "0": "batch", "1": "channels", "2": "height", "3": "width" } } + }, + "dummy_inputs_func": "image_encoder_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "image_encoder_data_loader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 16 + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "clip", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": true, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "clip", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "image_encoder", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/stable_cascade/config_prior.json b/OnnxStack.Converter/stable_cascade/config_prior.json new file mode 100644 index 0000000..373e8a5 --- /dev/null +++ b/OnnxStack.Converter/stable_cascade/config_prior.json @@ -0,0 +1,121 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "stabilityai/stable-cascade", + "model_loader": "prior_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "sample", "timestep_ratio", "clip_text_pooled", "clip_text", "clip_img", "return_dict" ], + "output_names": [ "out_sample" ], + "dynamic_axes": { + "sample": {"0": "unet_sample_batch", "1": "unet_sample_channels", "2": "unet_sample_height", "3": "unet_sample_width"}, + "timestep_ratio": {"0": "unet_timestep_ratio"}, + "clip_text_pooled": {"0": "unet_clip_text_pooled_batch", "1": "unet_clip_text_pooled_size", "2": "unet_clip_text_pooled_length"}, + "clip_text": {"0": "unet_clip_text_batch", "1": "unet_clip_text_size", "2": "unet_clip_text_length"}, + "clip_img": {"0": "unet_clip_img_batch", "1": "unet_clip_img_size", "2": "unet_clip_img_length"} + } + }, + 
"dummy_inputs_func": "prior_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "prior_data_loader", + "batch_size": 2 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 16, + "save_as_external_data": true, + "all_tensors_to_one_file": true + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": true, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "prior", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/stable_cascade/config_text_encoder.json b/OnnxStack.Converter/stable_cascade/config_text_encoder.json new file mode 100644 index 0000000..a16f5e5 --- /dev/null +++ b/OnnxStack.Converter/stable_cascade/config_text_encoder.json @@ -0,0 +1,113 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "stabilityai/stable-cascade", + "model_loader": "text_encoder_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "input_ids" ], + "output_names": [ "last_hidden_state", "pooler_output" ], + "dynamic_axes": { "input_ids": { "0": "batch", "1": "sequence" } } + }, + "dummy_inputs_func": "text_encoder_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "text_encoder_data_loader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 16 + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "clip", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": true, + 
"optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "clip", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "text_encoder", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/stable_cascade/config_vqgan.json b/OnnxStack.Converter/stable_cascade/config_vqgan.json new file mode 100644 index 0000000..edf5969 --- /dev/null +++ b/OnnxStack.Converter/stable_cascade/config_vqgan.json @@ -0,0 +1,103 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "stabilityai/stable-cascade", + "model_loader": "vqgan_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "sample", "return_dict" ], + "output_names": [ "latent_sample" ], + "dynamic_axes": { "sample": { "0": "batch", "1": "channels", "2": "height", "3": "width" } } + }, + "dummy_inputs_func": "vqgan_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "vqgan_data_loader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 16 + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "vae", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + 
"evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "vqgan", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/stable_cascade/convert.py b/OnnxStack.Converter/stable_cascade/convert.py new file mode 100644 index 0000000..17450bc --- /dev/null +++ b/OnnxStack.Converter/stable_cascade/convert.py @@ -0,0 +1,211 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +import argparse +import json +import shutil +import sys +import warnings +from pathlib import Path +from typing import Dict + +import config +import torch +from diffusers import DiffusionPipeline +from packaging import version + +from olive.common.utils import set_tempdir +from olive.workflows import run as olive_run + + +# pylint: disable=redefined-outer-name +# ruff: noqa: TID252, T201 + + +def save_image(result, batch_size, provider, num_images, images_saved, image_callback=None): + passed_safety_checker = 0 + for image_index in range(batch_size): + if result.nsfw_content_detected is None or not result.nsfw_content_detected[image_index]: + passed_safety_checker += 1 + if images_saved < num_images: + output_path = f"result_{images_saved}.png" + result.images[image_index].save(output_path) + if image_callback: + image_callback(images_saved, output_path) + images_saved += 1 + print(f"Generated {output_path}") + print(f"Inference Batch End ({passed_safety_checker}/{batch_size} images).") + print("Images passed the safety checker.") + return images_saved + + +def run_inference_loop( + pipeline, + prompt, + num_images, + batch_size, + image_size, + num_inference_steps, + guidance_scale, + strength: float, + provider: str, + image_callback=None, + step_callback=None, +): + images_saved = 0 + + def update_steps(step, timestep, latents): + if step_callback: + step_callback((images_saved // batch_size) * num_inference_steps + step) + + while images_saved < num_images: + print(f"\nInference Batch Start (batch size = {batch_size}).") + + kwargs = {} + + result = pipeline( + [prompt] * batch_size, + num_inference_steps=num_inference_steps, + callback=update_steps if step_callback else None, + height=image_size, + width=image_size, + guidance_scale=guidance_scale, + **kwargs, + ) + + images_saved = save_image(result, batch_size, provider, num_images, images_saved, image_callback) + + +def update_config_with_provider(config: Dict, provider: str): + if provider == "dml": + # DirectML EP is the default, so no need to update config. + return config + elif provider == "cuda": + from sd_utils.ort import update_cuda_config + + return update_cuda_config(config) + else: + raise ValueError(f"Unsupported provider: {provider}") + + +def optimize( + model_input: str, + model_output: Path, + provider: str, + image_encoder: bool +): + from google.protobuf import __version__ as protobuf_version + + # protobuf 4.x aborts with OOM when optimizing unet + if version.parse(protobuf_version) > version.parse("3.20.3"): + print("This script requires protobuf 3.20.3. Please ensure your package version matches requirements.txt.") + sys.exit(1) + + model_dir = model_input + script_dir = Path(__file__).resolve().parent + + # Clean up previously optimized models, if any. 
+ shutil.rmtree(script_dir / "footprints", ignore_errors=True) + shutil.rmtree(model_output, ignore_errors=True) + + # Load the entire PyTorch pipeline to ensure all models and their configurations are downloaded and cached. + # This avoids an issue where the non-ONNX components (tokenizer, scheduler, and feature extractor) are not + # automatically cached correctly if individual models are fetched one at a time. + print("Download stable diffusion PyTorch pipeline...") + pipeline = DiffusionPipeline.from_pretrained(model_dir, torch_dtype=torch.float32, **{"local_files_only": True}) + # config.vae_sample_size = pipeline.vae.config.sample_size + # config.cross_attention_dim = pipeline.unet.config.cross_attention_dim + # config.unet_sample_size = pipeline.unet.config.sample_size + + model_info = {} + + submodel_names = [ "text_encoder", "decoder", "prior", "vqgan"] + + if image_encoder: + submodel_names.append("image_encoder") + + for submodel_name in submodel_names: + print(f"\nOptimizing {submodel_name}") + + olive_config = None + with (script_dir / f"config_{submodel_name}.json").open() as fin: + olive_config = json.load(fin) + olive_config = update_config_with_provider(olive_config, provider) + olive_config["input_model"]["config"]["model_path"] = model_dir + + run_res = olive_run(olive_config) + + from sd_utils.ort import save_optimized_onnx_submodel + + save_optimized_onnx_submodel(submodel_name, provider, model_info) + + from sd_utils.ort import save_onnx_pipeline + + save_onnx_pipeline( + model_info, model_output, pipeline, submodel_names + ) + + return model_info + + +def parse_common_args(raw_args): + parser = argparse.ArgumentParser("Common arguments") + parser.add_argument("--model_input", default="stable-diffusion-v1-5", type=str) + parser.add_argument("--model_output", default="stable-diffusion-v1-5", type=Path) + parser.add_argument("--image_encoder",action="store_true", help="Create image encoder model") + parser.add_argument("--provider", default="dml", type=str, choices=["dml", "cuda"], help="Execution provider to use") + parser.add_argument("--optimize", action="store_true", help="Runs the optimization step") + parser.add_argument("--clean_cache", action="store_true", help="Deletes the Olive cache") + parser.add_argument("--test_unoptimized", action="store_true", help="Use unoptimized model for inference") + parser.add_argument("--tempdir", default=None, type=str, help="Root directory for tempfile directories and files") + return parser.parse_known_args(raw_args) + + +def parse_ort_args(raw_args): + parser = argparse.ArgumentParser("ONNX Runtime arguments") + + parser.add_argument( + "--static_dims", + action="store_true", + help="DEPRECATED (now enabled by default). 
Use --dynamic_dims to disable static_dims.", + ) + parser.add_argument("--dynamic_dims", action="store_true", help="Disable static shape optimization") + + return parser.parse_known_args(raw_args) + + +def main(raw_args=None): + common_args, extra_args = parse_common_args(raw_args) + + provider = common_args.provider + model_input = common_args.model_input + model_output = common_args.model_output + + script_dir = Path(__file__).resolve().parent + + + if common_args.clean_cache: + shutil.rmtree(script_dir / "cache", ignore_errors=True) + + ort_args = None, None + ort_args, extra_args = parse_ort_args(extra_args) + + if common_args.optimize or not model_output.exists(): + set_tempdir(common_args.tempdir) + + # TODO(jstoecker): clean up warning filter (mostly during conversion from torch to ONNX) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + + from sd_utils.ort import validate_args + + validate_args(ort_args, common_args.provider) + optimize(common_args.model_input, common_args.model_output, common_args.provider, common_args.image_encoder) + + if not common_args.optimize: + print("TODO: Create OnnxStableCascadePipeline") + + +if __name__ == "__main__": + main() diff --git a/OnnxStack.Converter/stable_cascade/models.py b/OnnxStack.Converter/stable_cascade/models.py new file mode 100644 index 0000000..c8b15b7 --- /dev/null +++ b/OnnxStack.Converter/stable_cascade/models.py @@ -0,0 +1,168 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +import config +import torch +from typing import Union, Optional, Tuple +from diffusers import AutoencoderKL, StableCascadeUNet +from diffusers.pipelines.wuerstchen import PaellaVQModel +from transformers.models.clip.modeling_clip import CLIPTextModelWithProjection, CLIPVisionModelWithProjection +from dataclasses import dataclass + +# Helper latency-only dataloader that creates random tensors with no label +class RandomDataLoader: + def __init__(self, create_inputs_func, batchsize, torch_dtype): + self.create_input_func = create_inputs_func + self.batchsize = batchsize + self.torch_dtype = torch_dtype + + def __getitem__(self, idx): + label = None + return self.create_input_func(self.batchsize, self.torch_dtype), label + + + +# ----------------------------------------------------------------------------- +# TEXT ENCODER +# ----------------------------------------------------------------------------- + +def text_encoder_inputs(batchsize, torch_dtype): + return torch.zeros((batchsize, 77), dtype=torch_dtype) + + +def text_encoder_load(model_name): + model = CLIPTextModelWithProjection.from_pretrained(model_name, subfolder="text_encoder") + return model + + +def text_encoder_conversion_inputs(model=None): + return text_encoder_inputs(1, torch.int32) + + +def text_encoder_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(text_encoder_inputs, batchsize, torch.int32) + + + + +# ----------------------------------------------------------------------------- +# DECODER UNET +# ----------------------------------------------------------------------------- + +def decoder_inputs(batchsize, torch_dtype, is_conversion_inputs=False): + # TODO(jstoecker): Rename onnx::Concat_4 to text_embeds and onnx::Shape_5 to time_ids + inputs = { + "sample": torch.rand((batchsize, 4, 256, 256), dtype=torch_dtype), + "timestep_ratio": 
torch.rand((batchsize,), dtype=torch_dtype), + "clip_text_pooled": torch.rand((batchsize , 1, 1280), dtype=torch_dtype), + "effnet": torch.rand((batchsize, 16, 24, 24), dtype=torch_dtype) + } + + # use as kwargs since they won't be in the correct position if passed along with the tuple of inputs + kwargs = { + "return_dict": False, + } + + return inputs + + +def decoder_load(model_name): + model = StableCascadeUNet.from_pretrained(model_name, subfolder="decoder") + return model + + +def decoder_conversion_inputs(model=None): + return tuple(decoder_inputs(1, torch.float32, True).values()) + + +def decoder_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(decoder_inputs, batchsize, torch.float16) + + + + +# ----------------------------------------------------------------------------- +# PRIOR UNET +# ----------------------------------------------------------------------------- + +def prior_inputs(batchsize, torch_dtype, is_conversion_inputs=False): + inputs = { + "sample": torch.rand((batchsize, 16, 24, 24), dtype=torch_dtype), + "timestep_ratio": torch.rand((batchsize,), dtype=torch_dtype), + "clip_text_pooled": torch.rand((batchsize , 1, 1280), dtype=torch_dtype), + "clip_text": torch.rand((batchsize , 77, 1280), dtype=torch_dtype), + "clip_img": torch.rand((batchsize , 1, 768), dtype=torch_dtype) + } + + # use as kwargs since they won't be in the correct position if passed along with the tuple of inputs + kwargs = { + "return_dict": False, + } + + return inputs + + +def prior_load(model_name): + model = StableCascadeUNet.from_pretrained(model_name, subfolder="prior") + return model + + +def prior_conversion_inputs(model=None): + return tuple(prior_inputs(1, torch.float32, True).values()) + + +def prior_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(prior_inputs, batchsize, torch.float16) + + + + +# ----------------------------------------------------------------------------- +# IMAGE ENCODER +# ----------------------------------------------------------------------------- + +def image_encoder_inputs(batchsize, torch_dtype, is_conversion_inputs=False): + inputs = { + "sample": torch.rand((batchsize, 3, 224, 224), dtype=torch_dtype) + } + return inputs + + +def image_encoder_load(model_name): + model = CLIPVisionModelWithProjection.from_pretrained(model_name, subfolder="image_encoder", use_safetensors=True) + return model + + +def image_encoder_conversion_inputs(model=None): + return tuple(image_encoder_inputs(1, torch.float32, True).values()) + + +def image_encoder_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(image_encoder_inputs, batchsize, torch.float16) + + + + +# ----------------------------------------------------------------------------- +# VQGAN +# ----------------------------------------------------------------------------- + +def vqgan_inputs(batchsize, torch_dtype, is_conversion_inputs=False): + inputs = { + "sample": torch.rand((batchsize, 3, 256, 256), dtype=torch_dtype) + } + return inputs + + +def vqgan_load(model_name): + model = PaellaVQModel.from_pretrained(model_name, subfolder="vqgan", use_safetensors=True) + return model + + +def vqgan_conversion_inputs(model=None): + return tuple(vqgan_inputs(1, torch.float32, True).values()) + + +def vqgan_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(vqgan_inputs, batchsize, torch.float16) \ No newline at end of file diff --git a/OnnxStack.Converter/stable_cascade/requirements.txt 
b/OnnxStack.Converter/stable_cascade/requirements.txt new file mode 100644 index 0000000..15b9198 --- /dev/null +++ b/OnnxStack.Converter/stable_cascade/requirements.txt @@ -0,0 +1,9 @@ +accelerate +diffusers +onnx +pillow +protobuf==3.20.3 # protobuf 4.x aborts with OOM when optimizing unet +tabulate +torch +transformers +onnxruntime-directml>=1.16.0 diff --git a/OnnxStack.Converter/stable_cascade/sd_utils/ort.py b/OnnxStack.Converter/stable_cascade/sd_utils/ort.py new file mode 100644 index 0000000..72746f7 --- /dev/null +++ b/OnnxStack.Converter/stable_cascade/sd_utils/ort.py @@ -0,0 +1,117 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +import os +import json +import shutil +import sys +from pathlib import Path +from typing import Dict + +import onnxruntime as ort +from diffusers import OnnxRuntimeModel, StableCascadePriorPipeline +from onnxruntime import __version__ as OrtVersion +from packaging import version + +from olive.model import ONNXModelHandler + +# ruff: noqa: TID252, T201 + + +def update_cuda_config(config: Dict): + if version.parse(OrtVersion) < version.parse("1.17.0"): + # disable skip_group_norm fusion since there is a shape inference bug which leads to invalid models + config["passes"]["optimize_cuda"]["config"]["optimization_options"] = {"enable_skip_group_norm": False} + config["pass_flows"] = [["convert", "optimize_cuda"]] + config["systems"]["local_system"]["config"]["accelerators"][0]["execution_providers"] = ["CUDAExecutionProvider"] + return config + + +def validate_args(args, provider): + ort.set_default_logger_severity(4) + if args.static_dims: + print( + "WARNING: the --static_dims option is deprecated, and static shape optimization is enabled by default. " + "Use --dynamic_dims to disable static shape optimization." + ) + + validate_ort_version(provider) + + +def validate_ort_version(provider: str): + if provider == "dml" and version.parse(OrtVersion) < version.parse("1.16.0"): + print("This script requires onnxruntime-directml 1.16.0 or newer") + sys.exit(1) + elif provider == "cuda" and version.parse(OrtVersion) < version.parse("1.17.0"): + if version.parse(OrtVersion) < version.parse("1.16.2"): + print("This script requires onnxruntime-gpu 1.16.2 or newer") + sys.exit(1) + print( + f"WARNING: onnxruntime {OrtVersion} has known issues with shape inference for SkipGroupNorm. Will disable" + " skip_group_norm fusion. onnxruntime-gpu 1.17.0 or newer is strongly recommended!" 
+ ) + + +def save_optimized_onnx_submodel(submodel_name, provider, model_info): + footprints_file_path = ( + Path(__file__).resolve().parents[1] / "footprints" / f"{submodel_name}_gpu-{provider}_footprints.json" + ) + with footprints_file_path.open("r") as footprint_file: + footprints = json.load(footprint_file) + + conversion_footprint = None + optimizer_footprint = None + for footprint in footprints.values(): + if footprint["from_pass"] == "OnnxConversion": + conversion_footprint = footprint + elif footprint["from_pass"] == "OrtTransformersOptimization": + optimizer_footprint = footprint + + assert conversion_footprint + assert optimizer_footprint + + unoptimized_olive_model = ONNXModelHandler(**conversion_footprint["model_config"]["config"]) + optimized_olive_model = ONNXModelHandler(**optimizer_footprint["model_config"]["config"]) + + model_info[submodel_name] = { + "unoptimized": { + "path": Path(unoptimized_olive_model.model_path), + "data": Path(unoptimized_olive_model.model_path + ".data"), + }, + "optimized": { + "path": Path(optimized_olive_model.model_path), + "data": Path(optimized_olive_model.model_path + ".data"), + }, + } + + print(f"Unoptimized Model : {model_info[submodel_name]['unoptimized']['path']}") + print(f"Optimized Model : {model_info[submodel_name]['optimized']['path']}") + + +def save_onnx_pipeline( + model_info, model_output, pipeline, submodel_names +): + # Save the unoptimized models in a directory structure that the diffusers library can load and run. + # This is optional, and the optimized models can be used directly in a custom pipeline if desired. + # print("\nCreating ONNX pipeline...") + + # TODO: Create OnnxStableCascadePipeline + + # Create a copy of the unoptimized model directory, then overwrite with optimized models from the olive cache. + print("Copying optimized models...") + for passType in ["optimized", "unoptimized"]: + model_dir = model_output / passType + for submodel_name in submodel_names: + src_path = model_info[submodel_name][passType]["path"] # model.onnx + src_data_path = model_info[submodel_name][passType]["data"]# model.onnx.data + + dst_path = model_dir / submodel_name + if not os.path.exists(dst_path): + os.makedirs(dst_path, exist_ok=True) + + shutil.copyfile(src_path, dst_path / "model.onnx") + if os.path.exists(src_data_path): + shutil.copyfile(src_data_path, dst_path / "model.onnx.data") + + print(f"The converted model is located here: {model_output}") diff --git a/OnnxStack.Converter/stable_diffusion/.gitignore b/OnnxStack.Converter/stable_diffusion/.gitignore new file mode 100644 index 0000000..4cf6f30 --- /dev/null +++ b/OnnxStack.Converter/stable_diffusion/.gitignore @@ -0,0 +1,3 @@ +/footprints/ +/cache/ +/result_*.png diff --git a/OnnxStack.Converter/stable_diffusion/config.py b/OnnxStack.Converter/stable_diffusion/config.py new file mode 100644 index 0000000..f8cfccd --- /dev/null +++ b/OnnxStack.Converter/stable_diffusion/config.py @@ -0,0 +1,8 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- + +vae_sample_size = 512 +unet_sample_size = 64 +cross_attention_dim = 768 diff --git a/OnnxStack.Converter/stable_diffusion/config_controlnet.json b/OnnxStack.Converter/stable_diffusion/config_controlnet.json new file mode 100644 index 0000000..02902ea --- /dev/null +++ b/OnnxStack.Converter/stable_diffusion/config_controlnet.json @@ -0,0 +1,123 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "runwayml/stable-diffusion-v1-5", + "model_loader": "controlnet_unet_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "sample", "timestep", "encoder_hidden_states", "down_block_0_additional_residual", "down_block_1_additional_residual", "down_block_2_additional_residual", "down_block_3_additional_residual", "down_block_4_additional_residual", "down_block_5_additional_residual", "down_block_6_additional_residual", "down_block_7_additional_residual", "down_block_8_additional_residual", "down_block_9_additional_residual", "down_block_10_additional_residual", "down_block_11_additional_residual", "mid_block_additional_residual", "return_dict" ], + "output_names": [ "out_sample" ], + "dynamic_axes": { + "sample": {"0": "unet_sample_batch", "1": "unet_sample_channels", "2": "unet_sample_height", "3": "unet_sample_width"}, + "timestep": {"0": "unet_time_batch"}, + "encoder_hidden_states": {"0": "unet_hidden_batch", "1": "unet_hidden_sequence"}, + "down_block_0_additional_residual": {"0": "cnet_db0_batch", "1": "cnet_db0_channels", "2": "cnet_db0_height", "3": "cnet_db0_width"}, + "down_block_1_additional_residual": {"0": "cnet_db1_batch", "1": "cnet_db1_channels", "2": "cnet_db1_height", "3": "cnet_db1_width"}, + "down_block_2_additional_residual": {"0": "cnet_db2_batch", "1": "cnet_db2_channels", "2": "cnet_db2_height", "3": "cnet_db2_width"}, + "down_block_3_additional_residual": {"0": "cnet_db3_batch", "1": "cnet_db3_channels", "2": "cnet_db3_height2", "3": "cnet_db3_width2"}, + "down_block_4_additional_residual": {"0": "cnet_db4_batch", "1": "cnet_db4_channels", "2": "cnet_db4_height2", "3": "cnet_db4_width2"}, + "down_block_5_additional_residual": {"0": "cnet_db5_batch", "1": "cnet_db5_channels", "2": "cnet_db5_height2", "3": "cnet_db5_width2"}, + "down_block_6_additional_residual": {"0": "cnet_db6_batch", "1": "cnet_db6_channels", "2": "cnet_db6_height4", "3": "cnet_db6_width4"}, + "down_block_7_additional_residual": {"0": "cnet_db7_batch", "1": "cnet_db7_channels", "2": "cnet_db7_height4", "3": "cnet_db7_width4"}, + "down_block_8_additional_residual": {"0": "cnet_db8_batch", "1": "cnet_db8_channels", "2": "cnet_db8_height4", "3": "cnet_db8_width4"}, + "down_block_9_additional_residual": {"0": "cnet_db9_batch", "1": "cnet_db9_channels", "2": "cnet_db9_height8", "3": "cnet_db9_width8"}, + "down_block_10_additional_residual": {"0": "cnet_db10_batch", "1": "cnet_db10_channels", "2": "cnet_db10_height8", "3": "cnet_db10_width8"}, + "down_block_11_additional_residual": {"0": "cnet_db11_batch", "1": "cnet_db11_channels", "2": "cnet_db11_height8", "3": "cnet_db11_width8"}, + "mid_block_additional_residual": {"0": "cnet_mbar_batch", "1": "cnet_mbar_channels", "2": "cnet_mbar_height8", "3": "cnet_mbar_width8"} + } + }, + "dummy_inputs_func": "controlnet_unet_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + 
"evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "controlnet_unet_data_loader", + "batch_size": 2 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 14, + "save_as_external_data": true, + "all_tensors_to_one_file": true, + "external_data_name": "weights.pb" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "controlnet", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/stable_diffusion/config_safety_checker.json b/OnnxStack.Converter/stable_diffusion/config_safety_checker.json new file mode 100644 index 0000000..f5234a8 --- /dev/null +++ b/OnnxStack.Converter/stable_diffusion/config_safety_checker.json @@ -0,0 +1,124 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "runwayml/stable-diffusion-v1-5", + "model_loader": "safety_checker_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "clip_input", "images" ], + "output_names": [ "out_images", "has_nsfw_concepts" ], + "dynamic_axes": { + "clip_input": { "0": "batch", "1": "channels", "2": "height", "3": "width" }, + "images": { "0": "batch", "1": "height", "2": "width", "3": "channels" } + } + }, + "dummy_inputs_func": "safety_checker_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "safety_checker_data_loader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 14 + } + }, + "ov_convert": { + "type": "OpenVINOConversion", + "config": { + "user_script": "models.py", + "example_input_func": "safety_checker_conversion_inputs", + "output_model": "safety_checker" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + 
"enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "safety_checker", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/stable_diffusion/config_text_encoder.json b/OnnxStack.Converter/stable_diffusion/config_text_encoder.json new file mode 100644 index 0000000..db7115f --- /dev/null +++ b/OnnxStack.Converter/stable_diffusion/config_text_encoder.json @@ -0,0 +1,121 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "runwayml/stable-diffusion-v1-5", + "model_loader": "text_encoder_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "input_ids" ], + "output_names": [ "last_hidden_state", "pooler_output" ], + "dynamic_axes": { "input_ids": { "0": "batch", "1": "sequence" } } + }, + "dummy_inputs_func": "text_encoder_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "text_encoder_data_loader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 14 + } + }, + "ov_convert": { + "type": "OpenVINOConversion", + "config": { + "user_script": "models.py", + "example_input_func": "text_encoder_conversion_inputs", + "output_model": "text_encoder" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "clip", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": 
{ + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "clip", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "text_encoder", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/stable_diffusion/config_unet.json b/OnnxStack.Converter/stable_diffusion/config_unet.json new file mode 100644 index 0000000..d5e4ab2 --- /dev/null +++ b/OnnxStack.Converter/stable_diffusion/config_unet.json @@ -0,0 +1,128 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "runwayml/stable-diffusion-v1-5", + "model_loader": "unet_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "sample", "timestep", "encoder_hidden_states", "return_dict" ], + "output_names": [ "out_sample" ], + "dynamic_axes": { + "sample": {"0": "unet_sample_batch", "1": "unet_sample_channels", "2": "unet_sample_height", "3": "unet_sample_width"}, + "timestep": {"0": "unet_time_batch"}, + "encoder_hidden_states": {"0": "unet_hidden_batch", "1": "unet_hidden_sequence"} + } + }, + "dummy_inputs_func": "unet_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "unet_data_loader", + "batch_size": 2 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 14, + "save_as_external_data": true, + "all_tensors_to_one_file": true, + "external_data_name": "weights.pb" + } + }, + "ov_convert": { + "type": "OpenVINOConversion", + "config": { + "user_script": "models.py", + "example_input_func": "get_unet_ov_example_input", + "output_model": "unet" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": 
false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "unet", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/stable_diffusion/config_vae_decoder.json b/OnnxStack.Converter/stable_diffusion/config_vae_decoder.json new file mode 100644 index 0000000..40c42b8 --- /dev/null +++ b/OnnxStack.Converter/stable_diffusion/config_vae_decoder.json @@ -0,0 +1,121 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "runwayml/stable-diffusion-v1-5", + "model_loader": "vae_decoder_load", + "model_script": "models.py", + "io_config": { + "input_names": [ "latent_sample", "return_dict" ], + "output_names": [ "sample" ], + "dynamic_axes": { "latent_sample": { "0": "batch", "1": "channels", "2": "height", "3": "width" } } + }, + "dummy_inputs_func": "vae_decoder_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "vae_decoder_data_loader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 14 + } + }, + "ov_convert": { + "type": "OpenVINOConversion", + "config": { + "user_script": "models.py", + "example_input_func": "vae_decoder_conversion_inputs", + "output_model": "vae_decoder" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "vae", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "vae", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "vae_decoder", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/stable_diffusion/config_vae_encoder.json b/OnnxStack.Converter/stable_diffusion/config_vae_encoder.json new file mode 100644 index 0000000..780b250 --- /dev/null +++ b/OnnxStack.Converter/stable_diffusion/config_vae_encoder.json @@ -0,0 +1,121 @@ +{ + "input_model": { + "type": "PyTorchModel", + "config": { + "model_path": "runwayml/stable-diffusion-v1-5", + "model_loader": "vae_encoder_load", + "model_script": "models.py", + "io_config": { + "input_names": 
[ "sample", "return_dict" ], + "output_names": [ "latent_sample" ], + "dynamic_axes": { "sample": { "0": "batch", "1": "channels", "2": "height", "3": "width" } } + }, + "dummy_inputs_func": "vae_encoder_conversion_inputs" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "DmlExecutionProvider" + ] + } + ] + } + } + }, + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "sub_types": [{"name": "avg"}], + "user_config": { + "user_script": "models.py", + "dataloader_func": "vae_encoder_data_loader", + "batch_size": 1 + } + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "config": { + "target_opset": 14 + } + }, + "ov_convert": { + "type": "OpenVINOConversion", + "config": { + "user_script": "models.py", + "example_input_func": "vae_encoder_conversion_inputs", + "output_model": "vae_encoder" + } + }, + "optimize": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "vae", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": ["RandomNormalLike"], + "force_fp16_inputs": { + "GroupNorm": [0, 1, 2] + } + } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "config": { + "model_type": "vae", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + } + }, + "pass_flows": [ + ["convert", "optimize"] + ], + "engine": { + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_name": "vae_encoder", + "output_dir": "footprints" + } +} diff --git a/OnnxStack.Converter/stable_diffusion/convert.py b/OnnxStack.Converter/stable_diffusion/convert.py new file mode 100644 index 0000000..c011d45 --- /dev/null +++ b/OnnxStack.Converter/stable_diffusion/convert.py @@ -0,0 +1,273 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- +import argparse +import json +import shutil +import sys +import warnings +from pathlib import Path +from typing import Dict + +import config +import torch +from diffusers import DiffusionPipeline +from packaging import version + +from olive.common.utils import set_tempdir +from olive.workflows import run as olive_run + + +# pylint: disable=redefined-outer-name +# ruff: noqa: TID252, T201 + + +def save_image(result, batch_size, provider, num_images, images_saved, image_callback=None): + passed_safety_checker = 0 + for image_index in range(batch_size): + if result.nsfw_content_detected is None or not result.nsfw_content_detected[image_index]: + passed_safety_checker += 1 + if images_saved < num_images: + output_path = f"result_{images_saved}.png" + result.images[image_index].save(output_path) + if image_callback: + image_callback(images_saved, output_path) + images_saved += 1 + print(f"Generated {output_path}") + print(f"Inference Batch End ({passed_safety_checker}/{batch_size} images).") + print("Images passed the safety checker.") + return images_saved + + +def run_inference_loop( + pipeline, + prompt, + num_images, + batch_size, + image_size, + num_inference_steps, + guidance_scale, + strength: float, + provider: str, + image_callback=None, + step_callback=None, +): + images_saved = 0 + + def update_steps(step, timestep, latents): + if step_callback: + step_callback((images_saved // batch_size) * num_inference_steps + step) + + while images_saved < num_images: + print(f"\nInference Batch Start (batch size = {batch_size}).") + + kwargs = {} + + result = pipeline( + [prompt] * batch_size, + num_inference_steps=num_inference_steps, + callback=update_steps if step_callback else None, + height=image_size, + width=image_size, + guidance_scale=guidance_scale, + **kwargs, + ) + + images_saved = save_image(result, batch_size, provider, num_images, images_saved, image_callback) + + +def update_config_with_provider(config: Dict, provider: str): + if provider == "dml": + # DirectML EP is the default, so no need to update config. + return config + elif provider == "cuda": + from sd_utils.ort import update_cuda_config + + return update_cuda_config(config) + else: + raise ValueError(f"Unsupported provider: {provider}") + + +def optimize( + model_input: str, + model_output: Path, + provider: str, + controlnet: bool +): + from google.protobuf import __version__ as protobuf_version + + # protobuf 4.x aborts with OOM when optimizing unet + if version.parse(protobuf_version) > version.parse("3.20.3"): + print("This script requires protobuf 3.20.3. Please ensure your package version matches requirements.txt.") + sys.exit(1) + + model_dir = model_input + script_dir = Path(__file__).resolve().parent + + # Clean up previously optimized models, if any. + shutil.rmtree(script_dir / "footprints", ignore_errors=True) + shutil.rmtree(model_output, ignore_errors=True) + + + # Load the entire PyTorch pipeline to ensure all models and their configurations are downloaded and cached. + # This avoids an issue where the non-ONNX components (tokenizer, scheduler, and feature extractor) are not + # automatically cached correctly if individual models are fetched one at a time. 
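+    # NOTE: local_files_only=True is passed below, so --model_input must point to an
+    # already-downloaded copy of the pipeline (e.g. a local stable-diffusion-v1-5 folder).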
+ print("Download stable diffusion PyTorch pipeline...") + pipeline = DiffusionPipeline.from_pretrained(model_dir, torch_dtype=torch.float32, **{"local_files_only": True}) + config.vae_sample_size = pipeline.vae.config.sample_size + config.cross_attention_dim = pipeline.unet.config.cross_attention_dim + config.unet_sample_size = pipeline.unet.config.sample_size + + model_info = {} + + submodel_names = ["vae_encoder", "vae_decoder", "unet" , "text_encoder"] + + has_safety_checker = getattr(pipeline, "safety_checker", None) is not None + + if has_safety_checker: + submodel_names.append("safety_checker") + + if controlnet: + submodel_names.append("controlnet") + + for submodel_name in submodel_names: + print(f"\nOptimizing {submodel_name}") + + olive_config = None + with (script_dir / f"config_{submodel_name}.json").open() as fin: + olive_config = json.load(fin) + olive_config = update_config_with_provider(olive_config, provider) + + if submodel_name in ("unet", "controlnet", "text_encoder"): + olive_config["input_model"]["config"]["model_path"] = model_dir + else: + # Only the unet & text encoder are affected by LoRA, so it's better to use the base model ID for + # other models: the Olive cache is based on the JSON config, and two LoRA variants with the same + # base model ID should be able to reuse previously optimized copies. + olive_config["input_model"]["config"]["model_path"] = model_dir + + run_res = olive_run(olive_config) + + from sd_utils.ort import save_optimized_onnx_submodel + + save_optimized_onnx_submodel(submodel_name, provider, model_info) + + from sd_utils.ort import save_onnx_pipeline + + save_onnx_pipeline( + has_safety_checker, model_info, model_output, pipeline, submodel_names + ) + + return model_info + + +def parse_common_args(raw_args): + parser = argparse.ArgumentParser("Common arguments") + + parser.add_argument("--model_input", default="stable-diffusion-v1-5", type=str) + parser.add_argument("--model_output", default="stable-diffusion-v1-5", type=Path) + parser.add_argument("--controlnet",action="store_true", help="Create ControlNet Unet Model") + parser.add_argument( + "--provider", default="dml", type=str, choices=["dml", "cuda"], help="Execution provider to use" + ) + parser.add_argument("--optimize", action="store_true", help="Runs the optimization step") + parser.add_argument("--clean_cache", action="store_true", help="Deletes the Olive cache") + parser.add_argument("--test_unoptimized", action="store_true", help="Use unoptimized model for inference") + parser.add_argument("--batch_size", default=1, type=int, help="Number of images to generate per batch") + parser.add_argument( + "--prompt", + default=( + "castle surrounded by water and nature, village, volumetric lighting, photorealistic, " + "detailed and intricate, fantasy, epic cinematic shot, mountains, 8k ultra hd" + ), + type=str, + ) + parser.add_argument( + "--guidance_scale", + default=7.5, + type=float, + help="Guidance scale as defined in Classifier-Free Diffusion Guidance", + ) + parser.add_argument("--num_images", default=1, type=int, help="Number of images to generate") + parser.add_argument("--num_inference_steps", default=50, type=int, help="Number of steps in diffusion process") + parser.add_argument("--tempdir", default=None, type=str, help="Root directory for tempfile directories and files") + parser.add_argument( + "--strength", + default=1.0, + type=float, + help="Value between 0.0 and 1.0, that controls the amount of noise that is added to the input image. 
" + "Values that approach 1.0 enable lots of variations but will also produce images " + "that are not semantically consistent with the input.", + ) + parser.add_argument("--image_size", default=512, type=int, help="Width and height of the images to generate") + + return parser.parse_known_args(raw_args) + + +def parse_ort_args(raw_args): + parser = argparse.ArgumentParser("ONNX Runtime arguments") + + parser.add_argument( + "--static_dims", + action="store_true", + help="DEPRECATED (now enabled by default). Use --dynamic_dims to disable static_dims.", + ) + parser.add_argument("--dynamic_dims", action="store_true", help="Disable static shape optimization") + + return parser.parse_known_args(raw_args) + + +def main(raw_args=None): + common_args, extra_args = parse_common_args(raw_args) + + provider = common_args.provider + model_input = common_args.model_input + model_output = common_args.model_output + + script_dir = Path(__file__).resolve().parent + + + if common_args.clean_cache: + shutil.rmtree(script_dir / "cache", ignore_errors=True) + + guidance_scale = common_args.guidance_scale + + ort_args = None, None + ort_args, extra_args = parse_ort_args(extra_args) + + if common_args.optimize or not model_output.exists(): + set_tempdir(common_args.tempdir) + + # TODO(jstoecker): clean up warning filter (mostly during conversion from torch to ONNX) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + + from sd_utils.ort import validate_args + + validate_args(ort_args, common_args.provider) + optimize(common_args.model_input, common_args.model_output, common_args.provider, common_args.controlnet) + + if not common_args.optimize: + model_dir = model_output / "F32" if common_args.test_unoptimized else model_output / "F16" + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + + from sd_utils.ort import get_ort_pipeline + + pipeline = get_ort_pipeline(model_dir, common_args, ort_args, guidance_scale) + run_inference_loop( + pipeline, + common_args.prompt, + common_args.num_images, + common_args.batch_size, + common_args.image_size, + common_args.num_inference_steps, + guidance_scale, + common_args.strength, + provider=provider, + ) + + +if __name__ == "__main__": + main() diff --git a/OnnxStack.Converter/stable_diffusion/models.py b/OnnxStack.Converter/stable_diffusion/models.py new file mode 100644 index 0000000..196135d --- /dev/null +++ b/OnnxStack.Converter/stable_diffusion/models.py @@ -0,0 +1,342 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- +import config +import torch +from typing import Union, Optional, Tuple +from diffusers import AutoencoderKL, UNet2DConditionModel, ControlNetModel +from diffusers.models.controlnet import ControlNetOutput, BaseOutput as ControlNetBaseOutput +from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker +from transformers.models.clip.modeling_clip import CLIPTextModel +from dataclasses import dataclass + +# Helper latency-only dataloader that creates random tensors with no label +class RandomDataLoader: + def __init__(self, create_inputs_func, batchsize, torch_dtype): + self.create_input_func = create_inputs_func + self.batchsize = batchsize + self.torch_dtype = torch_dtype + + def __getitem__(self, idx): + label = None + return self.create_input_func(self.batchsize, self.torch_dtype), label + + + +# ----------------------------------------------------------------------------- +# TEXT ENCODER +# ----------------------------------------------------------------------------- + + +def text_encoder_inputs(batchsize, torch_dtype): + return torch.zeros((batchsize, 77), dtype=torch_dtype) + + +def text_encoder_load(model_name): + model = CLIPTextModel.from_pretrained(model_name, subfolder="text_encoder") + if is_lora_model(model_name): + merge_lora_weights(model, model_name, "text_encoder") + return model + + +def text_encoder_conversion_inputs(model=None): + return text_encoder_inputs(1, torch.int32) + + +def text_encoder_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(text_encoder_inputs, batchsize, torch.int32) + + +# ----------------------------------------------------------------------------- +# UNET +# ----------------------------------------------------------------------------- + + +def unet_inputs(batchsize, torch_dtype, is_conversion_inputs=False): + # TODO(jstoecker): Rename onnx::Concat_4 to text_embeds and onnx::Shape_5 to time_ids + inputs = { + "sample": torch.rand((batchsize, 4, config.unet_sample_size, config.unet_sample_size), dtype=torch_dtype), + "timestep": torch.rand((batchsize,), dtype=torch_dtype), + "encoder_hidden_states": torch.rand((batchsize, 77, config.cross_attention_dim), dtype=torch_dtype), + } + + # use as kwargs since they won't be in the correct position if passed along with the tuple of inputs + kwargs = { + "return_dict": False, + } + if is_conversion_inputs: + inputs["additional_inputs"] = { + **kwargs, + "added_cond_kwargs": { + "text_embeds": torch.rand((1, 1280), dtype=torch_dtype), + "time_ids": torch.rand((1, 5), dtype=torch_dtype), + }, + } + else: + inputs.update(kwargs) + inputs["onnx::Concat_4"] = torch.rand((1, 1280), dtype=torch_dtype) + inputs["onnx::Shape_5"] = torch.rand((1, 5), dtype=torch_dtype) + + return inputs + + +def get_unet_ov_example_input(): + import numpy as np + + encoder_hidden_state = torch.ones((2, 77, 768)) + latents_shape = (2, 4, 512 // 8, 512 // 8) + latents = torch.randn(latents_shape) + t = torch.from_numpy(np.array(1, dtype=float)) + return (latents, t, encoder_hidden_state) + + +def unet_load(model_name): + model = UNet2DConditionModel.from_pretrained(model_name, subfolder="unet") + if is_lora_model(model_name): + merge_lora_weights(model, model_name, "unet") + return model + + +def unet_conversion_inputs(model=None): + return tuple(unet_inputs(1, torch.float32, True).values()) + + +def unet_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(unet_inputs, 
batchsize, torch.float16) + +# ----------------------------------------------------------------------------- +# CONTROLNET - UNET +# ----------------------------------------------------------------------------- + +class PatchedUNet2DConditionModel(UNet2DConditionModel): + def forward( + self, + sample: torch.FloatTensor, + timestep: Union[torch.Tensor, float, int], + encoder_hidden_states: torch.Tensor, + down_block_0_additional_residual: torch.Tensor, + down_block_1_additional_residual: torch.Tensor, + down_block_2_additional_residual: torch.Tensor, + down_block_3_additional_residual: torch.Tensor, + down_block_4_additional_residual: torch.Tensor, + down_block_5_additional_residual: torch.Tensor, + down_block_6_additional_residual: torch.Tensor, + down_block_7_additional_residual: torch.Tensor, + down_block_8_additional_residual: torch.Tensor, + down_block_9_additional_residual: torch.Tensor, + down_block_10_additional_residual: torch.Tensor, + down_block_11_additional_residual: torch.Tensor, + mid_block_additional_residual: torch.Tensor, + ) -> Union[UNet2DConditionModel, Tuple]: + down_block_add_res = ( + down_block_0_additional_residual, down_block_1_additional_residual, down_block_2_additional_residual, + down_block_3_additional_residual, down_block_4_additional_residual, down_block_5_additional_residual, + down_block_6_additional_residual, down_block_7_additional_residual, down_block_8_additional_residual, + down_block_9_additional_residual, down_block_10_additional_residual, down_block_11_additional_residual) + return super().forward( + sample = sample, + timestep = timestep, + encoder_hidden_states = encoder_hidden_states, + down_block_additional_residuals = down_block_add_res, + mid_block_additional_residual = mid_block_additional_residual, + return_dict = False + ) + +def controlnet_unet_inputs(batchsize, torch_dtype): + return { + "sample": torch.rand((batchsize, 4, config.unet_sample_size, config.unet_sample_size), dtype=torch_dtype), + "timestep": torch.rand((batchsize,), dtype=torch_dtype), + "encoder_hidden_states": torch.rand((batchsize, 77, config.cross_attention_dim), dtype=torch_dtype), + "down_block_0_additional_residual": torch.rand((batchsize, 320, config.unet_sample_size, config.unet_sample_size), dtype=torch_dtype), + "down_block_1_additional_residual": torch.rand((batchsize, 320, config.unet_sample_size, config.unet_sample_size), dtype=torch_dtype), + "down_block_2_additional_residual": torch.rand((batchsize, 320, config.unet_sample_size, config.unet_sample_size), dtype=torch_dtype), + "down_block_3_additional_residual": torch.rand((batchsize, 320, config.unet_sample_size // 2, config.unet_sample_size // 2), dtype=torch_dtype), + "down_block_4_additional_residual": torch.rand((batchsize, 640, config.unet_sample_size // 2, config.unet_sample_size // 2), dtype=torch_dtype), + "down_block_5_additional_residual": torch.rand((batchsize, 640, config.unet_sample_size // 2, config.unet_sample_size // 2), dtype=torch_dtype), + "down_block_6_additional_residual": torch.rand((batchsize, 640, config.unet_sample_size // 4, config.unet_sample_size // 4), dtype=torch_dtype), + "down_block_7_additional_residual": torch.rand((batchsize, 1280, config.unet_sample_size // 4, config.unet_sample_size // 4), dtype=torch_dtype), + "down_block_8_additional_residual": torch.rand((batchsize, 1280, config.unet_sample_size // 4, config.unet_sample_size // 4), dtype=torch_dtype), + "down_block_9_additional_residual": torch.rand((batchsize, 1280, config.unet_sample_size // 8, 
config.unet_sample_size // 8), dtype=torch_dtype), + "down_block_10_additional_residual": torch.rand((batchsize, 1280, config.unet_sample_size // 8, config.unet_sample_size // 8), dtype=torch_dtype), + "down_block_11_additional_residual": torch.rand((batchsize, 1280, config.unet_sample_size // 8, config.unet_sample_size // 8), dtype=torch_dtype), + "mid_block_additional_residual": torch.rand((batchsize, 1280, config.unet_sample_size // 8, config.unet_sample_size // 8), dtype=torch_dtype) + } + + +def controlnet_unet_load(model_name): + model = PatchedUNet2DConditionModel.from_pretrained(model_name, subfolder="unet") + return model + + +def controlnet_unet_conversion_inputs(model): + return tuple(controlnet_unet_inputs(1, torch.float32).values()) + + +def controlnet_unet_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(controlnet_unet_inputs, batchsize, torch.float16) + +# ----------------------------------------------------------------------------- +# VAE ENCODER +# ----------------------------------------------------------------------------- + + +def vae_encoder_inputs(batchsize, torch_dtype): + return {"sample": torch.rand((batchsize, 3, config.vae_sample_size, config.vae_sample_size), dtype=torch_dtype)} + + +def vae_encoder_load(model_name): + model = AutoencoderKL.from_pretrained(model_name, subfolder="vae") + model.forward = lambda sample: model.encode(sample)[0].sample() + return model + + +def vae_encoder_conversion_inputs(model=None): + return tuple(vae_encoder_inputs(1, torch.float32).values()) + + +def vae_encoder_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(vae_encoder_inputs, batchsize, torch.float16) + + +# ----------------------------------------------------------------------------- +# VAE DECODER +# ----------------------------------------------------------------------------- + + +def vae_decoder_inputs(batchsize, torch_dtype): + return { + "latent_sample": torch.rand((batchsize, 4, config.unet_sample_size, config.unet_sample_size), dtype=torch_dtype) + } + + +def vae_decoder_load(model_name): + model = AutoencoderKL.from_pretrained(model_name, subfolder="vae") + model.forward = model.decode + return model + + +def vae_decoder_conversion_inputs(model=None): + return tuple(vae_decoder_inputs(1, torch.float32).values()) + + +def vae_decoder_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(vae_decoder_inputs, batchsize, torch.float16) + + +# ----------------------------------------------------------------------------- +# SAFETY CHECKER +# ----------------------------------------------------------------------------- + + +def safety_checker_inputs(batchsize, torch_dtype): + return { + "clip_input": torch.rand((batchsize, 3, 224, 224), dtype=torch_dtype), + "images": torch.rand((batchsize, config.vae_sample_size, config.vae_sample_size, 3), dtype=torch_dtype), + } + + +def safety_checker_load(model_name): + model = StableDiffusionSafetyChecker.from_pretrained(model_name, subfolder="safety_checker") + model.forward = model.forward_onnx + return model + + +def safety_checker_conversion_inputs(model=None): + return tuple(safety_checker_inputs(1, torch.float32).values()) + + +def safety_checker_data_loader(data_dir, batchsize, *args, **kwargs): + return RandomDataLoader(safety_checker_inputs, batchsize, torch.float16) + + +# ----------------------------------------------------------------------------- +# LoRA weights +# ----------------------------------------------------------------------------- 
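+#
+# merge_lora_weights folds each low-rank LoRA update directly into the corresponding
+# base attention weight. Conceptually, for a projection weight W with LoRA factors
+# up (out_features x rank) and down (rank x in_features):
+#
+#     W_merged = W + scale * (up @ down)
+#
+# which is what the torch.mm(...) additions at the bottom of this file compute for
+# the to_q / to_k / to_v / to_out projections.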
+ +def is_lora_model(model_name): + # TODO(jstoecker): might be a better way to detect (e.g. presence of LORA weights file) + return False + + +# Merges LoRA weights into the layers of a base model +def merge_lora_weights(base_model, lora_model_id, submodel_name="unet", scale=1.0): + import inspect + from collections import defaultdict + from functools import reduce + + try: + from diffusers.loaders import LORA_WEIGHT_NAME + except ImportError: + # moved in version 0.24.0 + from diffusers.loaders.lora import LORA_WEIGHT_NAME + from diffusers.models.attention_processor import LoRAAttnProcessor + from diffusers.utils.hub_utils import _get_model_file + + parameters = inspect.signature(_get_model_file).parameters + + kwargs = {} + if "use_auth_token" in parameters: + kwargs["use_auth_token"] = None + elif "token" in parameters: + kwargs["token"] = None + + # Load LoRA weights + model_file = _get_model_file( + lora_model_id, + weights_name=LORA_WEIGHT_NAME, + cache_dir=None, + force_download=False, + resume_download=False, + proxies=None, + local_files_only=False, + revision=None, + subfolder=None, + user_agent={ + "file_type": "attn_procs_weights", + "framework": "pytorch", + }, + **kwargs, + ) + lora_state_dict = torch.load(model_file, map_location="cpu") + + # All keys in the LoRA state dictionary should have 'lora' somewhere in the string. + keys = list(lora_state_dict.keys()) + assert all("lora" in k for k in keys) + + if all(key.startswith(submodel_name) for key in keys): + # New format (https://github.com/huggingface/diffusers/pull/2918) supports LoRA weights in both the + # unet and text encoder where keys are prefixed with 'unet' or 'text_encoder', respectively. + submodel_state_dict = {k: v for k, v in lora_state_dict.items() if k.startswith(submodel_name)} + else: + # Old format. Keys will not have any prefix. This only applies to unet, so exit early if this is + # optimizing the text encoder. 
+ if submodel_name != "unet": + return + submodel_state_dict = lora_state_dict + + # Group LoRA weights into attention processors + attn_processors = {} + lora_grouped_dict = defaultdict(dict) + for key, value in submodel_state_dict.items(): + attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:]) + lora_grouped_dict[attn_processor_key][sub_key] = value + + for key, value_dict in lora_grouped_dict.items(): + rank = value_dict["to_k_lora.down.weight"].shape[0] + cross_attention_dim = value_dict["to_k_lora.down.weight"].shape[1] + hidden_size = value_dict["to_k_lora.up.weight"].shape[0] + + attn_processors[key] = LoRAAttnProcessor( + hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, rank=rank + ) + attn_processors[key].load_state_dict(value_dict) + + # Merge LoRA attention processor weights into existing Q/K/V/Out weights + for name, proc in attn_processors.items(): + attention_name = name[: -len(".processor")] + attention = reduce(getattr, attention_name.split(sep="."), base_model) + attention.to_q.weight.data += scale * torch.mm(proc.to_q_lora.up.weight, proc.to_q_lora.down.weight) + attention.to_k.weight.data += scale * torch.mm(proc.to_k_lora.up.weight, proc.to_k_lora.down.weight) + attention.to_v.weight.data += scale * torch.mm(proc.to_v_lora.up.weight, proc.to_v_lora.down.weight) + attention.to_out[0].weight.data += scale * torch.mm(proc.to_out_lora.up.weight, proc.to_out_lora.down.weight) diff --git a/OnnxStack.Converter/stable_diffusion/requirements.txt b/OnnxStack.Converter/stable_diffusion/requirements.txt new file mode 100644 index 0000000..15b9198 --- /dev/null +++ b/OnnxStack.Converter/stable_diffusion/requirements.txt @@ -0,0 +1,9 @@ +accelerate +diffusers +onnx +pillow +protobuf==3.20.3 # protobuf 4.x aborts with OOM when optimizing unet +tabulate +torch +transformers +onnxruntime-directml>=1.16.0 diff --git a/OnnxStack.Converter/stable_diffusion/sd_utils/ort.py b/OnnxStack.Converter/stable_diffusion/sd_utils/ort.py new file mode 100644 index 0000000..ad49818 --- /dev/null +++ b/OnnxStack.Converter/stable_diffusion/sd_utils/ort.py @@ -0,0 +1,172 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +import os +import json +import shutil +import sys +from pathlib import Path +from typing import Dict + +import onnxruntime as ort +from diffusers import OnnxRuntimeModel, OnnxStableDiffusionPipeline +from onnxruntime import __version__ as OrtVersion +from packaging import version + +from olive.model import ONNXModelHandler + +# ruff: noqa: TID252, T201 + + +def update_cuda_config(config: Dict): + if version.parse(OrtVersion) < version.parse("1.17.0"): + # disable skip_group_norm fusion since there is a shape inference bug which leads to invalid models + config["passes"]["optimize_cuda"]["config"]["optimization_options"] = {"enable_skip_group_norm": False} + config["pass_flows"] = [["convert", "optimize_cuda"]] + config["systems"]["local_system"]["config"]["accelerators"][0]["execution_providers"] = ["CUDAExecutionProvider"] + return config + + +def validate_args(args, provider): + ort.set_default_logger_severity(4) + if args.static_dims: + print( + "WARNING: the --static_dims option is deprecated, and static shape optimization is enabled by default. " + "Use --dynamic_dims to disable static shape optimization." 
+ ) + + validate_ort_version(provider) + + +def validate_ort_version(provider: str): + if provider == "dml" and version.parse(OrtVersion) < version.parse("1.16.0"): + print("This script requires onnxruntime-directml 1.16.0 or newer") + sys.exit(1) + elif provider == "cuda" and version.parse(OrtVersion) < version.parse("1.17.0"): + if version.parse(OrtVersion) < version.parse("1.16.2"): + print("This script requires onnxruntime-gpu 1.16.2 or newer") + sys.exit(1) + print( + f"WARNING: onnxruntime {OrtVersion} has known issues with shape inference for SkipGroupNorm. Will disable" + " skip_group_norm fusion. onnxruntime-gpu 1.17.0 or newer is strongly recommended!" + ) + + +def save_optimized_onnx_submodel(submodel_name, provider, model_info): + footprints_file_path = ( + Path(__file__).resolve().parents[1] / "footprints" / f"{submodel_name}_gpu-{provider}_footprints.json" + ) + with footprints_file_path.open("r") as footprint_file: + footprints = json.load(footprint_file) + + conversion_footprint = None + optimizer_footprint = None + for footprint in footprints.values(): + if footprint["from_pass"] == "OnnxConversion": + conversion_footprint = footprint + elif footprint["from_pass"] == "OrtTransformersOptimization": + optimizer_footprint = footprint + + assert conversion_footprint + assert optimizer_footprint + + unoptimized_olive_model = ONNXModelHandler(**conversion_footprint["model_config"]["config"]) + optimized_olive_model = ONNXModelHandler(**optimizer_footprint["model_config"]["config"]) + + model_info[submodel_name] = { + "unoptimized": { + "path": Path(unoptimized_olive_model.model_path), + }, + "optimized": { + "path": Path(optimized_olive_model.model_path), + }, + } + + print(f"Unoptimized Model : {model_info[submodel_name]['unoptimized']['path']}") + print(f"Optimized Model : {model_info[submodel_name]['optimized']['path']}") + + +def save_onnx_pipeline( + has_safety_checker, model_info, model_output, pipeline, submodel_names +): + # Save the unoptimized models in a directory structure that the diffusers library can load and run. + # This is optional, and the optimized models can be used directly in a custom pipeline if desired. 
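+    # As an illustrative aside (not part of this script's flow), an optimized submodel can also
+    # be loaded on its own with ONNX Runtime, e.g.:
+    #   session = ort.InferenceSession(
+    #       str(model_info["unet"]["optimized"]["path"]), providers=["DmlExecutionProvider"]
+    #   )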
+    print("\nCreating ONNX pipeline...")
+
+    optimized_model_dir = model_output / "Optimized"
+    unoptimized_model_dir = model_output / "Default"
+    has_controlnet = "controlnet" in submodel_names
+    if has_safety_checker:
+        safety_checker = OnnxRuntimeModel.from_pretrained(model_info["safety_checker"]["unoptimized"]["path"].parent)
+    else:
+        safety_checker = None
+
+    onnx_pipeline = OnnxStableDiffusionPipeline(
+        vae_encoder=OnnxRuntimeModel.from_pretrained(model_info["vae_encoder"]["unoptimized"]["path"].parent),
+        vae_decoder=OnnxRuntimeModel.from_pretrained(model_info["vae_decoder"]["unoptimized"]["path"].parent),
+        text_encoder=OnnxRuntimeModel.from_pretrained(model_info["text_encoder"]["unoptimized"]["path"].parent),
+        tokenizer=pipeline.tokenizer,
+        unet=OnnxRuntimeModel.from_pretrained(model_info["unet"]["unoptimized"]["path"].parent),
+        scheduler=pipeline.scheduler,
+        safety_checker=safety_checker,
+        feature_extractor=pipeline.feature_extractor,
+        requires_safety_checker=True,
+    )
+
+    if has_controlnet:
+        controlnet = OnnxRuntimeModel.from_pretrained(model_info["controlnet"]["unoptimized"]["path"].parent)
+
+    print("Saving unoptimized models...")
+    onnx_pipeline.save_pretrained(unoptimized_model_dir)
+    if has_controlnet:
+        controlnet.save_pretrained(unoptimized_model_dir / "controlnet")
+
+    # Create a copy of the unoptimized model directory, then overwrite it with optimized models from the Olive cache.
+    print("Copying optimized models...")
+    shutil.copytree(unoptimized_model_dir, optimized_model_dir, ignore=shutil.ignore_patterns("weights.pb"))
+    for submodel_name in submodel_names:
+        src_path = model_info[submodel_name]["optimized"]["path"]
+        dst_path = optimized_model_dir / submodel_name / "model.onnx"
+        # Ensure the submodel directory exists before copying the optimized model into it.
+        os.makedirs(optimized_model_dir / submodel_name, exist_ok=True)
+        shutil.copyfile(src_path, dst_path)
+
+    print(f"The default pipeline is located here: {unoptimized_model_dir}")
+    print(f"The optimized pipeline is located here: {optimized_model_dir}")
+
+
+def get_ort_pipeline(model_dir, common_args, ort_args, guidance_scale):
+    ort.set_default_logger_severity(3)
+
+    print("Loading models into ORT session...")
+    sess_options = ort.SessionOptions()
+    sess_options.enable_mem_pattern = False
+
+    static_dims = not ort_args.dynamic_dims
+    batch_size = common_args.batch_size
+    image_size = common_args.image_size
+    provider = common_args.provider
+
+    if static_dims:
+        hidden_batch_size = batch_size if (guidance_scale == 0.0) else batch_size * 2
+        # Not necessary, but helps DML EP further optimize runtime performance.
+ # batch_size is doubled for sample & hidden state because of classifier free guidance: + # https://github.com/huggingface/diffusers/blob/46c52f9b9607e6ecb29c782c052aea313e6487b7/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py#L672 + sess_options.add_free_dimension_override_by_name("unet_sample_batch", hidden_batch_size) + sess_options.add_free_dimension_override_by_name("unet_sample_channels", 4) + sess_options.add_free_dimension_override_by_name("unet_sample_height", image_size // 8) + sess_options.add_free_dimension_override_by_name("unet_sample_width", image_size // 8) + sess_options.add_free_dimension_override_by_name("unet_time_batch", 1) + sess_options.add_free_dimension_override_by_name("unet_hidden_batch", hidden_batch_size) + sess_options.add_free_dimension_override_by_name("unet_hidden_sequence", 77) + + provider_map = { + "dml": "DmlExecutionProvider", + "cuda": "CUDAExecutionProvider", + } + assert provider in provider_map, f"Unsupported provider: {provider}" + return OnnxStableDiffusionPipeline.from_pretrained( + model_dir, provider=provider_map[provider], sess_options=sess_options + )