# What do we have in diffusers?

In [None]:
!pip install -q transformers
!pip install -q diffusers
!pip install -q accelerate

# Text2Image

In [None]:
import torch
from diffusers import StableDiffusionPipeline
from diffusers import DiffusionPipeline
from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler

from PIL import Image
from IPython.display import display

import gc


def free_memory():
    """Release memory that is no longer referenced, on the host and on the GPU.

    Garbage collection runs first so that tensors kept alive only by
    unreachable Python objects are destroyed before the CUDA caching
    allocator is asked to give back its unused blocks.
    """
    gc.collect()
    torch.cuda.empty_cache()

#### SDv1

In [None]:
# IPython help syntax: show the signature and docstring of `from_pretrained`.
StableDiffusionPipeline.from_pretrained?

In [None]:
# Stable Diffusion v1.5 checkpoint, loaded with float16 weights.
model_id = "sd-legacy/stable-diffusion-v1-5"
pipe = StableDiffusionPipeline.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
)
# Let diffusers/accelerate manage device placement of the sub-models.
pipe.enable_model_cpu_offload()

# Seed and prompt are reused by the generation cells that follow.
seed = 42
prompt = "a photo of an astronaut riding a horse on mars"

In [None]:
# A freshly seeded generator makes the sample reproducible.
image = pipe(
    prompt=prompt,
    negative_prompt="bad quality, blurry, lowres",
    num_inference_steps=30,
    guidance_scale=7.5,
    generator=torch.Generator().manual_seed(seed),
).images[0]

display(image)

In [None]:
# Drop the SDv1 pipeline and reclaim its memory before the next model is loaded.
del pipe
free_memory()

#### SDv2

In [None]:
# Stable Diffusion 2.1 checkpoint, loaded with float16 weights.
model_id = "stabilityai/stable-diffusion-2-1"
pipe = StableDiffusionPipeline.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
)

# Replace the bundled scheduler with a DPMSolverMultistepScheduler built from
# the existing scheduler's configuration.
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload()

In [None]:
# Same prompt, seed and sampling settings as the SDv1 run, for comparison.
image = pipe(
    prompt=prompt,
    negative_prompt="bad quality, blurry, lowres",
    num_inference_steps=30,
    guidance_scale=7.5,
    generator=torch.Generator().manual_seed(seed),
).images[0]

display(image)

In [None]:
# Drop the SDv2 pipeline and reclaim its memory before SDXL is loaded.
del pipe
free_memory()

#### SDXL

In [None]:
# Load the SDXL base model only; the refiner is loaded in a later cell.
base = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
    variant="fp16",
    use_safetensors=True,
)
base.enable_model_cpu_offload()

prompt = "a photo of an astronaut riding a horse on mars"
n_steps = 40  # num_inference_steps for both the base and the refiner call
high_noise_frac = 0.8  # denoising_end for the base, denoising_start for the refiner

In [None]:
# Run the base model and keep its output as latents (output_type="latent")
# instead of a decoded image.
latents = base(
    prompt=prompt,
    num_inference_steps=n_steps,
    denoising_end=high_noise_frac,
    output_type="latent",
).images

# The base pipeline is not used again; drop it before the refiner is loaded.
del base.unet
del base
free_memory()

In [None]:
# SDXL refiner, loaded from the fp16 safetensors weights.
refiner = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-refiner-1.0",
    variant="fp16",
    use_safetensors=True,
    torch_dtype=torch.float16,
)
refiner.enable_model_cpu_offload()

In [None]:
# Feed the base model's latents to the refiner. denoising_start is set to the
# same fraction that was used as the base model's denoising_end.
free_memory()
image = refiner(
    prompt=prompt,
    image=latents,
    num_inference_steps=n_steps,
    denoising_start=high_noise_frac,
).images[0]

In [None]:
# Show the image produced by the refiner.
display(image)

In [None]:
# Drop the refiner and reclaim its memory before the Image2Image section.
del refiner
free_memory()

# Image2Image

In [None]:
from diffusers import StableDiffusionXLImg2ImgPipeline
from diffusers.utils import make_image_grid, load_image
from torchvision.transforms.functional import to_pil_image


# Image-to-image pipeline built on the SDXL refiner checkpoint.
pipeline = StableDiffusionXLImg2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-refiner-1.0",
    torch_dtype=torch.float16,
    variant="fp16",
    use_safetensors=True,
)
pipeline.enable_model_cpu_offload()

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png"
init_image = load_image(url)

# Preview the source image at half its size.
display(init_image.resize(tuple(side // 2 for side in init_image.size)))

In [None]:
prompt = "cat wizard, gandalf, lord of the rings, detailed, fantasy, cute, adorable, Pixar, Disney, 8k"
strength = 0.8
generator = torch.Generator(device="cpu").manual_seed(2**24 + 43)

image = pipeline(
    prompt=prompt,
    image=init_image,
    strength=strength,
    generator=generator,
).images[0]

# Source image and result side by side, shown at half size.
res = make_image_grid([init_image, image], rows=1, cols=2)
display(res.resize(tuple(side // 2 for side in res.size)))

In [None]:
@torch.no_grad()
def encode(image):
    """Encode an image into scaled VAE latents using the global ``pipeline``.

    Args:
        image: Any input accepted by ``pipeline.image_processor.preprocess``
            (a PIL image in this notebook).

    Returns:
        A CPU tensor holding one sample from the VAE's latent distribution,
        multiplied by ``vae.config.scaling_factor``.

    When ``vae.config.force_upcast`` is set, the VAE runs in float32 for the
    encode call and is always switched back to its original dtype afterwards,
    even if encoding raises.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    vae = pipeline.vae
    dtype = next(vae.parameters()).dtype

    image = pipeline.image_processor.preprocess(image)
    image = image.to(device=device, dtype=dtype)

    upcast = vae.config.force_upcast
    if upcast:
        image = image.float()
        vae.to(dtype=torch.float32)

    try:
        latents = vae.encode(image).latent_dist.sample()
    finally:
        # Restore the original precision no matter how the encode call ended,
        # so a failure does not leave the shared VAE in float32.
        if upcast:
            vae.to(dtype)

    latents = vae.config.scaling_factor * latents
    latents = latents.cpu()
    gc.collect()
    torch.cuda.empty_cache()
    return latents

def normalize(tensor):
    """Min-max rescale ``tensor`` so that its values span [0, 1].

    Args:
        tensor: A non-empty ``torch.Tensor``.

    Returns:
        ``(tensor - min) / (max - min)``. If every element is equal, the range
        is zero and a tensor of zeros is returned instead of the NaNs that
        0/0 would produce.
    """
    lowest = tensor.min()
    span = tensor.max() - lowest
    if span == 0:
        # Same dtype that the division below would have produced.
        return torch.zeros_like(tensor, dtype=torch.result_type(tensor, 1.0))
    return (tensor - lowest) / span

# Encode the source image and visualise the first item of the latent batch,
# min-max rescaled and shown at twice its size.
encoded = encode(init_image)
encoded_image = to_pil_image(normalize(encoded)[0])
display(encoded_image.resize(tuple(side * 2 for side in encoded_image.size)))

In [None]:
@torch.no_grad()
def decode(encoded):
    """Decode scaled VAE latents back to image space using the global ``pipeline``.

    Args:
        encoded: Latent tensor that was multiplied by
            ``vae.config.scaling_factor`` (as returned by ``encode``).

    Returns:
        The ``sample`` of the VAE decoder output, moved to the CPU.

    When ``vae.config.force_upcast`` is set, the VAE runs in float32 for the
    decode call and is always switched back to its original dtype afterwards,
    even if decoding raises.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    encoded = encoded.to(device)
    vae = pipeline.vae
    dtype = next(vae.parameters()).dtype

    upcast = vae.config.force_upcast
    if upcast:
        encoded = encoded.float()
        vae.to(dtype=torch.float32)

    try:
        # Undo the scaling that was applied when the latents were produced.
        decoded = vae.decode(encoded / vae.config.scaling_factor)
    finally:
        # Restore the original precision no matter how the decode call ended,
        # so a failure does not leave the shared VAE in float32.
        if upcast:
            vae.to(dtype)

    decoded = decoded.sample.cpu()
    gc.collect()
    torch.cuda.empty_cache()
    return decoded

decoded = decode(encoded)
# The image processor feeds the VAE images normalised to [-1, 1], and the
# decoder reconstructs in that same range. Map it back to [0, 1] directly:
# min-max stretching would alter brightness and contrast and make the
# comparison with the source image misleading.
decoded_im = to_pil_image((decoded[0].float() / 2 + 0.5).clamp(0, 1))
display(decoded_im.resize((decoded_im.size[0] // 2, decoded_im.size[1] // 2)))

In [None]:
# Source image next to its VAE reconstruction, shown at half size.
res = make_image_grid([init_image, decoded_im], rows=1, cols=2)
display(res.resize(tuple(side // 2 for side in res.size)))

In [None]:
# Drop the image-to-image pipeline and reclaim its memory.
del pipeline
free_memory()

# ControlNet

In [None]:
import gc

import torch
import cv2
import numpy as np

from torch import autocast
from diffusers import StableDiffusionPipeline
from diffusers import DiffusionPipeline
from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler
from diffusers.utils import load_image, make_image_grid
from transformers import pipeline

from PIL import Image
from IPython.display import display

In [None]:
# Prompt and source image shared by the canny and depth ControlNet examples.
prompt = "An elegant cat with thick, fluffy black and white fur, sitting in a snowy winter landscape. Snowflakes gently fall around."
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png"
original_image = load_image(url)
display(original_image)

In [None]:
# SD v1.5 combined with the canny-edge ControlNet, both with float16 weights.
model_id = "sd-legacy/stable-diffusion-v1-5"
controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/sd-controlnet-canny",
    torch_dtype=torch.float16,
    use_safetensors=True,
)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    model_id,
    controlnet=controlnet,
    torch_dtype=torch.float16,
    use_safetensors=True,
)

pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload()

In [None]:
# Canny edge map of the source image, used as the ControlNet conditioning image.
low_threshold, high_threshold = 100, 200

# The edge map is 2-D; add a trailing channel axis ...
image = cv2.Canny(np.array(original_image), low_threshold, high_threshold)[:, :, None]
# ... and replicate it into three identical channels.
image = np.concatenate([image] * 3, axis=2)
canny_image = Image.fromarray(image)
display(canny_image.resize(tuple(side // 2 for side in canny_image.size)))

In [None]:
generator = torch.Generator(device="cpu").manual_seed(10101)
output = pipe(
    prompt=prompt,
    image=canny_image,
    generator=generator,
).images[0]

# Source image, edge map and generated image side by side, shown at half size.
res = make_image_grid([original_image, canny_image, output], rows=1, cols=3)
display(res.resize(tuple(side // 2 for side in res.size)))

In [None]:
# Drop the canny ControlNet and its pipeline, then reclaim their memory.
del controlnet
del pipe
free_memory()

In [None]:
# Depth-conditioned ControlNet.
# `pipeline` here is the transformers factory imported in the ControlNet
# import cell, not a diffusers pipeline object.
depth_estimator = pipeline('depth-estimation')

model_id = "sd-legacy/stable-diffusion-v1-5"
controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/sd-controlnet-depth",
    torch_dtype=torch.float16,
    use_safetensors=True,
)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    model_id,
    controlnet=controlnet,
    torch_dtype=torch.float16,
    use_safetensors=True,
)
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload()

In [None]:
# Depth estimate of the source image with a trailing channel axis added ...
image = np.array(depth_estimator(original_image)['depth'])[:, :, None]
# ... replicated into three identical channels for the ControlNet input.
image = np.concatenate([image] * 3, axis=2)
depth_image = Image.fromarray(image)

display(depth_image.resize(tuple(side // 2 for side in depth_image.size)))

In [None]:
generator = torch.Generator(device="cpu").manual_seed(10101)
output = pipe(
    prompt=prompt,
    image=depth_image,
    generator=generator,
).images[0]

# Source image, depth map and generated image side by side, shown at half size.
res = make_image_grid([original_image, depth_image, output], rows=1, cols=3)
display(res.resize(tuple(side // 2 for side in res.size)))

In [None]:
# Drop the depth ControlNet and its pipeline, then reclaim their memory.
del controlnet
del pipe
free_memory()

#### IP-Adapter

(This part is taken from the diffusers documentation.)

In [None]:
from diffusers import AutoPipelineForText2Image
from diffusers.utils import load_image
import torch

# NOTE: this rebinds `pipeline`, replacing the transformers `pipeline` factory
# imported in the ControlNet section.
pipeline = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
).to("cuda")
pipeline.load_ip_adapter(
    "h94/IP-Adapter",
    subfolder="sdxl_models",
    weight_name="ip-adapter_sdxl.bin",
)
pipeline.set_ip_adapter_scale(0.6)

In [None]:
# Reference image that is passed to the pipeline as `ip_adapter_image` below.
image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_diner.png")
display(image)

In [None]:
# Text prompt plus the reference image as the IP-Adapter image prompt.
generator = torch.Generator(device="cpu").manual_seed(0)
images = pipeline(
    prompt="a polar bear sitting in a chair drinking a milkshake",
    negative_prompt="deformed, ugly, wrong proportion, low res, bad anatomy, worst quality, low quality",
    ip_adapter_image=image,
    num_inference_steps=100,
    generator=generator,
).images
display(images[0])

In [None]:
# The image embeddings can be computed once, saved, and later passed to the
# pipeline through `ip_adapter_image_embeds` instead of `ip_adapter_image`.
image_embeds = pipeline.prepare_ip_adapter_image_embeds(
    ip_adapter_image=image,
    ip_adapter_image_embeds=None,
    device="cuda",
    num_images_per_prompt=1,
    do_classifier_free_guidance=True,
)

torch.save(image_embeds, "image_embeds.ipadpt")

# Round-trip through disk; the file read here is the one written just above.
image_embeds = torch.load("image_embeds.ipadpt")

# Re-seed instead of reusing the generator from the previous cell: that one has
# already been advanced by the earlier pipeline call, so reusing it would make
# this result depend on how often the cells were run.
generator = torch.Generator(device="cpu").manual_seed(0)
images = pipeline(
    prompt="a polar bear sitting in a chair drinking a milkshake",
    ip_adapter_image_embeds=image_embeds,
    negative_prompt="deformed, ugly, wrong proportion, low res, bad anatomy, worst quality, low quality",
    num_inference_steps=100,
    generator=generator,
).images
display(images[0])