# (P)AID: (Prompt-guided) Attention Interpolation of Text-to-Image Diffusion

Authors: Qiyuan He $^1$, Jinghao Wang $^2$, Ziwei Liu $^2$, Angela Yao $^1$

$^1$ National University of Singapore 

$^2$ S-Lab, Nanyang Technological University

In [None]:
# Install the packages listed in requirements.txt via the IPython `%pip` magic.
%pip install -r requirements.txt

In [None]:
import torch, lpips
from utils import show_images_horizontally, compute_smoothness_and_consistency
from pipeline_interpolated_stable_diffusion import InterpolationStableDiffusionPipeline

In [None]:
# Build the interpolation pipeline from the Stable Diffusion v1.5 checkpoint,
# with guidance_scale=10.0 and scheduler_name="unipc".
pipe = InterpolationStableDiffusionPipeline(
    repo_name="runwayml/stable-diffusion-v1-5",
    guidance_scale=10.0,
    scheduler_name="unipc",
)

# Geometry of the initial latent, read from the UNet configuration.
# NOTE(review): 8 is presumably the VAE's spatial down-sampling factor — confirm.
vae_scale_factor = 8
channel = pipe.unet.config.in_channels
height = pipe.unet.config.sample_size * vae_scale_factor
width = pipe.unet.config.sample_size * vae_scale_factor
torch_device = "cuda"

# Initialize the generator.
# `torch.cuda.manual_seed` returns None, so assigning its result left
# `generator` as None and the `generator=` argument below had no effect.
# Use an explicit, seeded torch.Generator so the latent depends only on `seed`.
seed = 1002
# Keep seeding the global CUDA RNG as before, in case code elsewhere draws
# from it.
torch.cuda.manual_seed(seed)
generator = torch.Generator(device=torch_device).manual_seed(seed)

# Single starting latent; the cells below pass it for both interpolation
# endpoints.
latent = torch.randn(
    (1, channel, height // vae_scale_factor, width // vae_scale_factor),
    generator=generator,
    device=torch_device,
)

num_inference_steps = 50
# LPIPS model (VGG backbone) handed to compute_smoothness_and_consistency;
# placed on the same device as the latent instead of a hard-coded "cuda".
lpips_model = lpips.LPIPS(net="vgg").to(torch_device)

*Note: the cells below call `pipe.interpolate_save_gpu`, which generates the interpolation images one by one. If you have sufficient computational resources (GPU memory), you can replace it with `pipe.interpolate`, which generates all interpolation images at once.*

In [None]:
# Experiment: interpolate between "dog" and "car", guided by the prompt
# "a dog driving a car".

# Endpoint prompts, guidance prompt and negative prompt.
prompt1 = "A photo of dog, best quality, extremely detailed"
prompt2 = "A photo of car, best quality, extremely detailed"
guide_prompt = "A photo of a dog driving a car, logical, best quality, extremely detailed"
negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"

# Interpolation settings forwarded unchanged to the pipeline.
# NOTE(review): alpha/beta presumably parameterize how interpolation
# coefficients are chosen — confirm against the pipeline implementation.
early, late = "fused_inner", "self"
warmup_ratio = 6.1 / 50
alpha, beta = 6, 3

# The same latent is used for both endpoints; three images are requested.
images = pipe.interpolate_save_gpu(
    latent,
    latent,
    prompt1,
    prompt2,
    guide_prompt=guide_prompt,
    negative_prompt=negative_prompt,
    size=3,
    num_inference_steps=num_inference_steps,
    warmup_ratio=warmup_ratio,
    early=early,
    late=late,
    alpha=alpha,
    beta=beta,
)

# Display the sequence, then report the first two returned metrics.
show_images_horizontally(images, interact=True)
smoothness, consistency, _ = compute_smoothness_and_consistency(images, lpips_model)
print(smoothness, consistency)

In [None]:
# Experiment: interpolate between "dog" and "car", guided by the prompt
# "a toy named dog-car".

# Text conditioning.
negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
guide_prompt = "A photo of a toy named dog-car, logical, best quality, extremely detailed"
prompt1 = "A photo of dog, best quality, extremely detailed"
prompt2 = "A photo of car, best quality, extremely detailed"

# Interpolation settings forwarded unchanged to the pipeline.
warmup_ratio = 8.1 / 50
early, late = "fused_inner", "self"
alpha, beta = 8, 8

# Both endpoints start from the same latent; three images are requested.
images = pipe.interpolate_save_gpu(
    latent,
    latent,
    prompt1,
    prompt2,
    guide_prompt=guide_prompt,
    negative_prompt=negative_prompt,
    size=3,
    num_inference_steps=num_inference_steps,
    warmup_ratio=warmup_ratio,
    early=early,
    late=late,
    alpha=alpha,
    beta=beta,
)

# Show the images labelled with short endpoint captions; only the first
# returned metric (smoothness) is reported here.
show_images_horizontally(images, prompt1="a dog", prompt2="a car", interact=True)
smoothness, _, _ = compute_smoothness_and_consistency(images, lpips_model)
print(smoothness)

In [None]:
# Experiment: interpolate between "dog" and "car", guided by the prompt
# "a car with furry texture".

# Interpolation settings forwarded unchanged to the pipeline.
alpha, beta = 8, 8
early, late = "fused_inner", "self"
warmup_ratio = 8.1 / 50

# Text conditioning.
prompt1 = "A photo of dog, best quality, extremely detailed"
prompt2 = "A photo of car, best quality, extremely detailed"
guide_prompt = "A photo of a car with furry texture, logical, best quality, extremely detailed"
negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"

# Both endpoints start from the same latent; three images are requested.
images = pipe.interpolate_save_gpu(
    latent,
    latent,
    prompt1,
    prompt2,
    size=3,
    num_inference_steps=num_inference_steps,
    guide_prompt=guide_prompt,
    negative_prompt=negative_prompt,
    warmup_ratio=warmup_ratio,
    early=early,
    late=late,
    alpha=alpha,
    beta=beta,
)

# Show the images with short endpoint captions and report smoothness only.
show_images_horizontally(images, prompt1="a dog", prompt2="a car", interact=True)
smoothness, _, _ = compute_smoothness_and_consistency(images, lpips_model)
print(smoothness)

In [None]:
# Experiment: interpolate between "dog" and "car", guided by the prompt
# "a car with dog head".

# Text conditioning. This cell's negative prompt omits "bad anatomy".
prompt1 = "A photo of dog, best quality, extremely detailed"
prompt2 = "A photo of car, best quality, extremely detailed"
guide_prompt = "A photo of a car with dog head, logical, best quality, extremely detailed"
negative_prompt = "monochrome, lowres, worst quality, low quality"

# Interpolation settings forwarded unchanged to the pipeline.
# NOTE(review): warmup_ratio=1.0 presumably extends the warm-up phase over
# every inference step — confirm against the pipeline implementation.
warmup_ratio = 1.0
early, late = "fused_inner", "self"
alpha, beta = 29.75, 20

# Both endpoints start from the same latent; three images are requested.
images = pipe.interpolate_save_gpu(
    latent,
    latent,
    prompt1,
    prompt2,
    guide_prompt=guide_prompt,
    negative_prompt=negative_prompt,
    size=3,
    num_inference_steps=num_inference_steps,
    warmup_ratio=warmup_ratio,
    early=early,
    late=late,
    alpha=alpha,
    beta=beta,
)

# Show the images with short endpoint captions and report smoothness only.
show_images_horizontally(images, prompt1="a dog", prompt2="a car", interact=True)
smoothness, _, _ = compute_smoothness_and_consistency(images, lpips_model)
print(smoothness)

In [None]:
# Experiment: interpolate between a Pikachu toy and a Gundam toy, guided by
# the prompt "toy named Gundam-Pikachu", using a longer 7-image sequence.

# Sampling and interpolation settings (num_inference_steps is re-assigned to
# 50 in this cell).
num_inference_steps = 50
warmup_ratio = 20.1 / 50
early, late = "fused_inner", "self"
alpha, beta = 6, 2

# Text conditioning.
# NOTE(review): "Nitendo" in prompt1 looks like a typo for "Nintendo"; it is
# left untouched because editing the prompt would change the generated images.
prompt1 = "A product display of toy named Pikachu, Pokemon created by Game Freak and Nitendo, best quality, extremely detailed"
prompt2 = "A product display of toy named Gundam, Mobile Suit Gundam Figure Gundam Universe, Bandai Tamashii Nations, best quality, extremely detailed"
guide_prompt = "A product display of toy named Gundam-Pikachu, best quality, extremely detailed"
negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"

# Both endpoints start from the same latent; seven images are requested.
images = pipe.interpolate_save_gpu(
    latent,
    latent,
    prompt1,
    prompt2,
    guide_prompt=guide_prompt,
    negative_prompt=negative_prompt,
    size=7,
    num_inference_steps=num_inference_steps,
    warmup_ratio=warmup_ratio,
    early=early,
    late=late,
    alpha=alpha,
    beta=beta,
)

# Display the sequence and report smoothness only.
show_images_horizontally(images, interact=True)
smoothness, _, _ = compute_smoothness_and_consistency(images, lpips_model)
print(smoothness)