In [1]:
import torch

from ldm_uncond.latent_diffusion_uncond import LDMPipeline
from eval.evaluation import BaseEvaluator, ONNXEvaluator

# Hardware identifier for this run; it is passed as `dev` to every evaluator
# constructed in the cells below.
dev = "rtx_3070"

  from .autonotebook import tqdm as notebook_tqdm


## Baseline - 32-bit Model

In [2]:
# Evaluator for the unoptimized reference run.
baseline_evaluator = BaseEvaluator(dev=dev, perf_cls="baseline")

# Full-precision pipeline and a single (1, 3, 64, 64) noise input, both on the GPU.
DTYPE = torch.float32
DEVICE = torch.device('cuda')
diffusion_pipeline = LDMPipeline().to(device=DEVICE, dtype=DTYPE)

noise = torch.randn((1, 3, 64, 64), dtype=DTYPE, device=DEVICE)

# Evaluate the baseline under no_grad, the same way the JIT sections further
# down are evaluated, so autograd bookkeeping cannot skew the comparison.
with torch.no_grad():
    baseline_evaluator.evaluate(diffusion_pipeline, noise)

# Drop the model and the evaluator, then release cached CUDA memory so the
# next section does not start with this model's allocations still held.
del diffusion_pipeline
del baseline_evaluator
torch.cuda.empty_cache()

Warming up...
Warmup complete!
Evaluation iter: 0
Evaluation iter: 1
Timestep = 000

## Optimization 1 - JIT Compilation

In [None]:
# Evaluator for the FP32 TorchScript run.
optim1_evaluator = BaseEvaluator(dev=dev, perf_cls="32bit_jit")

# Input tensor and precompiled TorchScript model.
DTYPE = torch.float32
DEVICE = torch.device('cuda')

noise = torch.randn((1, 3, 64, 64), dtype=DTYPE, device=DEVICE)
torchscript_model = torch.jit.load("output/optim/model_jit_fp32_cuda.ptl")

# Evaluate optimization-1 with autograd disabled.
with torch.no_grad():
    optim1_evaluator.evaluate(torchscript_model, noise)

# Release the model AND the evaluator (as the baseline section does) before
# clearing the CUDA cache, so neither keeps references alive into later sections.
del torchscript_model
del optim1_evaluator
torch.cuda.empty_cache()

## Optimization 2 - Quantization (16-bit) + JIT Compilation

In [None]:
# Evaluator for the FP16 TorchScript run.
optim2_evaluator = BaseEvaluator(dev=dev, perf_cls="16bit_jit")

# Half-precision input tensor and precompiled TorchScript model.
DTYPE = torch.float16
DEVICE = torch.device('cuda')

noise = torch.randn((1, 3, 64, 64), dtype=DTYPE, device=DEVICE)
torchscript_model = torch.jit.load("output/optim/model_jit_fp16_cuda.ptl")

# Evaluate optimization-2 with autograd disabled.
with torch.no_grad():
    optim2_evaluator.evaluate(torchscript_model, noise)

# Release the model AND the evaluator (as the baseline section does) before
# clearing the CUDA cache, so neither keeps references alive into later sections.
del torchscript_model
del optim2_evaluator
torch.cuda.empty_cache()

## Optimization 3 - ONNX Runtime (Graph optimizations + Transformer-specific optimizations)

In [None]:
# Precision and device settings (DEVICE is not applied to the tensor below).
DEVICE = torch.device('cuda')
DTYPE = torch.float32

# No `device` argument is given, so the noise tensor is created on torch's
# default device; the ONNX evaluator cells that follow consume it as-is.
noise = torch.randn(1, 3, 64, 64, dtype=DTYPE)

#### Vanilla ONNX

In [None]:
# Evaluate the plain ONNX export on the shared noise input.
optim3_1 = ONNXEvaluator(
    "output/optim/model_onnx_fp32_cpu.onnx",
    dev=dev,
    perf_cls="onnx_vanilla",
)
optim3_1.evaluate(noise)

# The evaluator is no longer needed once its run has finished.
del optim3_1

#### Optimized ONNX 

In [None]:
# Evaluate the graph-optimized ONNX export on the shared noise input.
optim3_2 = ONNXEvaluator(
    "output/optim/model_onnx_fp32_cpu_optimized.onnx",
    dev=dev,
    perf_cls="onnx_optim",
)
optim3_2.evaluate(noise)

# The evaluator is no longer needed once its run has finished.
del optim3_2

#### Transformer Optimized ONNX

In [None]:
# Evaluate the transformer-optimized ONNX export on the shared noise input.
optim3_3 = ONNXEvaluator(
    "output/optim/model_onnx_fp32_cpu_optimized_tf.onnx",
    dev=dev,
    perf_cls="onnx_optim_tf",
)
optim3_3.evaluate(noise)

# The evaluator is no longer needed once its run has finished.
del optim3_3

In [None]:
# Stage everything under output/eval_data/, commit it with a fixed message,
# and push to the current branch's configured remote.
# NOTE(review): this cell changes the repository and the remote every time it
# runs (including on "Run All") — confirm that is intended before executing.
# Presumably output/eval_data/ holds the evaluators' results; verify.
!git add output/eval_data/
!git commit -m "Adding evaluation pipeline and notebooks"
!git push

## Optimization 4 - TensorRT (Layer & Tensor fusion + Quantization (16-bit) + JIT Compilation)

### Load optimized UNet

In [None]:
# NOTE(review): this evaluator reuses the Optimization-1 variable name and its
# perf_cls ("32bit_jit"), and no visible cell in this section calls it —
# confirm the intended label before recording TensorRT results through it.
optim1_evaluator = BaseEvaluator(dev=dev, perf_cls="32bit_jit")

# Define precision and device locally, like every other section, instead of
# relying on whatever the ONNX section left in the kernel. The values are the
# same ones that section sets, so in-order execution is unchanged.
# NOTE(review): the UNet file name says fp16 while DTYPE is float32 — confirm
# this combination is intended.
DTYPE = torch.float32
DEVICE = torch.device('cuda')

# Build the pipeline on the target device, then load the optimized UNet from
# "uldm_unet_fp16_sim.ts", switch to eval mode and run the pipeline's warmup.
optimized_diffusion_pipeline = LDMPipeline().to(device=DEVICE, dtype=DTYPE)
optimized_diffusion_pipeline.load_optimized_unet("uldm_unet_fp16_sim.ts")
optimized_diffusion_pipeline.eval()
optimized_diffusion_pipeline.warmup()

### Sample from optimized network

In [None]:
%%time
## Generate sample

noise = torch.randn((1, 3, 64, 64), dtype=DTYPE, device=DEVICE)
# with torch.cuda.amp.autocast():
%timeit %memit sample = optimized_diffusion_pipeline(noise)