In [2]:
# Load the memory_profiler IPython extension; it provides the %memit magic used in the benchmark cells below.
%load_ext memory_profiler

In [3]:
import torch
import matplotlib.pyplot as plt

from ldm_uncond.latent_diffusion_uncond import LDMPipeline
from torch.profiler import profile, record_function, ProfilerActivity
from eval.power import PowerReader

# Benchmark configuration: half precision on the CUDA device.
DTYPE = torch.float16
DEVICE = torch.device("cuda")
# Power reader from eval.power; it is started and stopped around each inference run below.
pr = PowerReader()

### Initialize pipeline, move to device, and warm up

In [4]:
# Build the pipeline, then place it on the configured device with the configured dtype.
diffusion_pipeline = LDMPipeline()
diffusion_pipeline = diffusion_pipeline.to(device=DEVICE, dtype=DTYPE)

# Switch to eval mode and run the pipeline's own warmup before any measurement.
diffusion_pipeline.eval()
diffusion_pipeline.warmup()

Timestep = 000

In [5]:
# Draw the Gaussian noise input for the pipeline: a single sample of shape (1, 3, 64, 64).
noise = torch.randn(1, 3, 64, 64, dtype=DTYPE, device=DEVICE)

In [6]:
## Generate sample
fl = open("output/eval_data/jetson_latency.txt", "a")
# fm = open("output/eval_data/jetson_memory.txt", "a")
fp = open("output/eval_data/jetson_power.txt", "a")
for i in range(10):
    print(f"\rIteration number {i}")
    print(f"\nRun number {i}: \n", file=fl)
    # print(f"\nRun number {i}: \n", file=fm)
    print(f"\nRun number {i}: \n", file=fp)
    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
        with record_function("model_inference"):
            pr.start()
            %memit sample = diffusion_pipeline(noise)
            pr.stop()
    print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=5), file=fl)
    # print(sout.stdout, file=fm)
    print(pr.print_vals(), file=fp)
fl.close()

Iteration number 0
peak memory: 1893.34 MiB, increment: 56.62 MiB
POM_5V_IN  max 	 =====> 	 6606 mW
POM_5V_CPU max 	 =====> 	 3734 mW
POM_5V_GPU max 	 =====> 	 1328 mW
POM_5V_IN  avg 	 =====> 	 6182 mW
POM_5V_CPU avg 	 =====> 	 3023 mW
POM_5V_GPU avg 	 =====> 	 859 mW
Iteration number 1
peak memory: 1830.27 MiB, increment: 50.84 MiB
POM_5V_IN  max 	 =====> 	 6606 mW
POM_5V_CPU max 	 =====> 	 3698 mW
POM_5V_GPU max 	 =====> 	 1436 mW
POM_5V_IN  avg 	 =====> 	 6008 mW
POM_5V_CPU avg 	 =====> 	 2896 mW
POM_5V_GPU avg 	 =====> 	 860 mW
Iteration number 2
peak memory: 1788.05 MiB, increment: 13.10 MiB
POM_5V_IN  max 	 =====> 	 6606 mW
POM_5V_CPU max 	 =====> 	 3704 mW
POM_5V_GPU max 	 =====> 	 1472 mW
POM_5V_IN  avg 	 =====> 	 6203 mW
POM_5V_CPU avg 	 =====> 	 3046 mW
POM_5V_GPU avg 	 =====> 	 889 mW
Iteration number 3
peak memory: 1762.84 MiB, increment: 2.78 MiB
POM_5V_IN  max 	 =====> 	 6582 mW
POM_5V_CPU max 	 =====> 	 3704 mW
POM_5V_GPU max 	 =====> 	 1579 mW
POM_5V_IN  avg 	 =====> 	 

NameError: name 'fm' is not defined

### Visualize sample

In [None]:
# Show the first image of the batch: moved to host memory, cast to float32 and scaled by 1/255.
plt.imshow(sample[0].cpu().float().numpy() / 255)

### JIT Compiled Model

In [None]:
# RUN THIS CELL ONLY ONCE!!
# Trace the pipeline on CPU in float32 and save it as a TorchScript archive.
# NOTE: this cell rebinds the notebook-level DTYPE, DEVICE, diffusion_pipeline and noise;
# the cells below read DTYPE/DEVICE, so they run with these CPU/float32 values afterwards.
DTYPE = torch.float32
DEVICE = torch.device('cpu')

diffusion_pipeline = LDMPipeline().to(device=DEVICE, dtype=DTYPE)
# A traced module keeps the train/eval behaviour it had while being traced, so switch to
# eval mode first (the other cells also call .eval() before running the pipeline).
diffusion_pipeline.eval()
noise = torch.randn((1, 3, 64, 64), dtype=DTYPE, device=DEVICE)
print("Diffusion Model loaded!")


print("Starting JIT Trace...")
# JIT Trace and Save model
# Gradients are not needed for an inference-only trace.
with torch.no_grad():
    torchscript_model = torch.jit.trace(diffusion_pipeline, noise)
print("JIT Trace finished! Saving Model...")
torch.jit.save(torchscript_model, "uldm_jit.ptl")
print("JIT compiled model saved!")

# Drop the reference to the traced module once it has been saved.
del torchscript_model

### Load optimized UNet

In [None]:
# Build a fresh pipeline on the current DEVICE/DTYPE.
optimized_diffusion_pipeline = LDMPipeline().to(device=DEVICE, dtype=DTYPE)

# Load the optimized UNet from its TorchScript file, then switch to eval mode and warm up.
optimized_diffusion_pipeline.load_optimized_unet("uldm_unet_fp16_sim.ts")
optimized_diffusion_pipeline.eval()
optimized_diffusion_pipeline.warmup()

### Sample from optimized network

In [None]:
%%time
## Generate sample
# %%time (which must stay on the first line of the cell) reports the time taken by the whole cell.

noise = torch.randn((1, 3, 64, 64), dtype=DTYPE, device=DEVICE)
# with torch.cuda.amp.autocast():
# %timeit repeats the statement, and the nested %memit reports memory usage on every repetition.
# NOTE(review): the measured time therefore includes %memit's own overhead — confirm this is intended.
%timeit %memit sample = optimized_diffusion_pipeline(noise)

### Visualize sample from optimized model

In [None]:
# Show the first image produced by the optimized pipeline: host float32 data scaled by 1/255.
plt.imshow(sample[0].cpu().float().numpy() / 255)