In [1]:
import os
import torch
import subprocess
from diffusers import UNet2DConditionModel, AutoencoderKL
from transformers import CLIPTextModel, CLIPTokenizer
from diffusers.models.attention_processor import AttnProcessor


In [2]:
# ============================================================
# CONFIG
# ============================================================

MODEL_PATH = "/teamspace/studios/this_studio/Latent-Diffusion-Model-for-text-to-image-generation/ldm_checkpoints/epoch_11"
OUTPUT_DIR = "./openvino_models"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
# ============================================================
# LOAD MODELS
# ============================================================

print("Loading UNet...")
unet = UNet2DConditionModel.from_pretrained(
    os.path.join(MODEL_PATH, "unet"),
    use_safetensors=True
)
unet.set_attn_processor(AttnProcessor())     # turn OFF optimized attention
unet.to(torch.float32)               # ensure fp32
unet.eval()

print("Loading Text Encoder...")
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32").to(DEVICE).eval()
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")

print("Loading VAE...")
vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse").to(DEVICE).eval()

Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: 
```
pip install accelerate
```
.


Loading UNet...
Loading Text Encoder...


Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: 
```
pip install accelerate
```
.


Loading VAE...


In [4]:
class VAEDecoderWrapper(torch.nn.Module):
    def __init__(self, vae):
        super().__init__()
        self.vae = vae

    def forward(self, latent):
        # This replicates SD's actual forward output
        return self.vae.decode(latent).sample

In [8]:
# ============================================================
# EXPORT TO ONNX
# ============================================================

def export_unet():
    print("\n=== Exporting UNet to ONNX ===")

    # dummy_latent = torch.randn(2, 4, 16, 16)   # batch=2 for CFG (uncond + cond)
    # dummy_timestep = torch.tensor([10], dtype=torch.int64)  # scheduler timestep dtype long
    # dummy_context = torch.randn(2, 77, 512)   # CLIP text encoder output size = 512
    dummy_latent = torch.randn(2, 4, 16, 16).float()
    dummy_timestep = torch.tensor([10.0], dtype=torch.float32) 
    dummy_context = torch.randn(2, 77, 512).float()

    onnx_path = os.path.join(OUTPUT_DIR, "unet.onnx")

    torch.onnx.export(
        unet,
        (dummy_latent, dummy_timestep, dummy_context),
        onnx_path,
        input_names=["latent", "timestep", "context"],
        output_names=["noise_pred"],
        opset_version=17,
        dynamic_axes={
            "latent": {0: "batch", 2: "h", 3: "w"},
            "context": {0: "batch"},
        }
    )
    print("UNet exported:", onnx_path)
    return onnx_path

# def export_text_encoder():
#     print("\n=== Exporting Text Encoder to ONNX ===")
#     # text_encoder.to(DEVICE)
#     tokens = tokenizer(
#         ["a dummy sample text"],
#         padding="max_length",
#         max_length=77,
#         return_tensors="pt"
#     ).to(DEVICE)

#     onnx_path = os.path.join(OUTPUT_DIR, "text_encoder.onnx")

#     torch.onnx.export(
#         text_encoder,
#         (tokens["input_ids"], tokens["attention_mask"]),
#         onnx_path,
#         input_names=["input_ids", "attention_mask"],
#         output_names=["last_hidden_state"],
#         opset_version=17,
#         dynamic_axes={
#             "input_ids": {0: "batch"},
#             "attention_mask": {0: "batch"}
#         }
#     )

#     print("Text Encoder exported:", onnx_path)
#     return onnx_path
def export_text_encoder():
    print("\n=== Exporting Text Encoder to ONNX ===")

    # Move model to CPU for export
    text_encoder_cpu = text_encoder.to(DEVICE).eval()

    # Create CPU dummy inputs
    tokens = tokenizer(
        ["a dummy sample text"],
        padding="max_length",
        max_length=77,
        truncation=True,
        return_tensors="pt"
    )

    input_ids = tokens["input_ids"].to(DEVICE)
    attention_mask = tokens["attention_mask"].to(DEVICE)

    onnx_path = os.path.join(OUTPUT_DIR, "text_encoder.onnx")

    torch.onnx.export(
        text_encoder_cpu,
        (input_ids, attention_mask),
        onnx_path,
        input_names=["input_ids", "attention_mask"],
        output_names=["last_hidden_state"],
        dynamic_axes={
            "input_ids":       {0: "batch", 1: "sequence"},
            "attention_mask":  {0: "batch", 1: "sequence"},
            "last_hidden_state": {0: "batch", 1: "sequence"},
        },
        opset_version=17
    )

    print("Text Encoder exported:", onnx_path)
    return onnx_path

# def export_vae_decoder():
#     print("\n=== Exporting VAE Decoder to ONNX ===")

#     dummy_latent = torch.randn(1, 4, 16, 16).to(DEVICE)   # correct latent resolution for 64Ã—64 VAE

#     onnx_path = os.path.join(OUTPUT_DIR, "vae_decoder.onnx")

#     torch.onnx.export(
#         vae.decoder,
#         (dummy_latent,),
#         onnx_path,
#         input_names=["latent"],
#         output_names=["image"],
#         opset_version=17,
#         dynamic_axes={
#             "latent": {0: "batch", 2: "h", 3: "w"},
#         }
#     )

#     print("VAE Decoder exported:", onnx_path)
#     return onnx_path

def export_vae_decoder():
    print("\n=== Exporting VAE Decoder to ONNX ===")

    vae_cpu = VAEDecoderWrapper(vae).to(DEVICE).eval()

    dummy_latent = torch.randn(1, 4, 16, 16, dtype=torch.float32).to(DEVICE)

    onnx_path = os.path.join(OUTPUT_DIR, "vae_decoder.onnx")

    torch.onnx.export(
        vae_cpu,
        (dummy_latent,),
        onnx_path,
        input_names=["latent"],
        output_names=["image"],
        dynamic_axes={
            "latent": {0: "batch", 2: "height", 3: "width"},
            "image":  {0: "batch"},
        },
        opset_version=17
    )

    print("VAE Decoder exported:", onnx_path)
    return onnx_path


In [9]:

# ============================================================
# CONVERT ONNX â†’ OPENVINO IR
# ============================================================

def convert_to_openvino(onnx_path):
    print(f"\n=== Converting {onnx_path} to OpenVINO IR ===")

    command = [
        "mo",
        "--input_model", onnx_path,
        "--output_dir", OUTPUT_DIR
    ]

    subprocess.run(command, check=True)
    print("Converted:", onnx_path)

In [10]:
# ============================================================
# MAIN PIPELINE
# ============================================================
unet_onnx = export_unet()
text_onnx = export_text_encoder()
vae_onnx = export_vae_decoder()

convert_to_openvino(unet_onnx)
convert_to_openvino(text_onnx)
convert_to_openvino(vae_onnx)

print("\nðŸŽ‰ All models successfully converted to OpenVINO!")
print(f"Files saved in: {OUTPUT_DIR}")


=== Exporting UNet to ONNX ===


  torch.onnx.export(


UNet exported: ./openvino_models/unet.onnx

=== Exporting Text Encoder to ONNX ===


  torch.onnx.export(
  if input_shape[-1] > 1 or self.sliding_window is not None:
  if past_key_values_length > 0:


Text Encoder exported: ./openvino_models/text_encoder.onnx

=== Exporting VAE Decoder to ONNX ===


  torch.onnx.export(


VAE Decoder exported: ./openvino_models/vae_decoder.onnx

=== Converting ./openvino_models/unet.onnx to OpenVINO IR ===
[ INFO ] MO command line tool is considered as the legacy conversion API as of OpenVINO 2023.2 release.
In 2025.0 MO command line tool and openvino.tools.mo.convert_model() will be removed. Please use OpenVINO Model Converter (OVC) or openvino.convert_model(). OVC represents a lightweight alternative of MO and provides simplified model conversion API. 
Find more information about transition from MO to OVC at https://docs.openvino.ai/2023.2/openvino_docs_OV_Converter_UG_prepare_model_convert_model_MO_OVC_transition.html
[ INFO ] Generated IR will be compressed to FP16. If you get lower accuracy, please consider disabling compression explicitly by adding argument --compress_to_fp16=False.
Find more information about compression to FP16 at https://docs.openvino.ai/2023.0/openvino_docs_MO_DG_FP16_Compression.html
Check for a new version of Intel(R) Distribution of OpenVIN