# Performance Settings

Set `FAST_MODE = True` for faster inference on smaller GPUs (reduces quality but speeds up generation)

In [1]:
# Set to True for faster inference on smaller GPUs
FAST_MODE = True  # Change to True for faster generation with lower quality

# Fast mode settings
if FAST_MODE:
    FAST_NUM_DIFFUSION_STEPS = 5  # Reduced from 20 (much faster for testing)
    FAST_IMAGE_SIZE = 64  # Reduced from 128 (smaller images)
    FAST_NUM_SEEDS = 1  # Reduced to 1 for quickest testing
    FAST_MAX_ITER_TO_ALTER = 5  # Reduced iterations for Attend-and-Excite
else:
    FAST_NUM_DIFFUSION_STEPS = 50
    FAST_IMAGE_SIZE = 256
    FAST_NUM_SEEDS = None  # Use all provided seeds
    FAST_MAX_ITER_TO_ALTER = 25

In [2]:
from typing import List, Dict, Optional
import torch

import sys 
sys.path.append(".")
sys.path.append("..")

from pipeline_attend_and_excite import AttendAndExcitePipeline
from config import RunConfig
from run import run_on_prompt, get_indices_to_alter
from utils import vis_utils
from utils.ptp_utils import AttentionStore

%load_ext autoreload 
%autoreload 2

# Load Model Weights (may take a few minutes)

In [3]:
NUM_DIFFUSION_STEPS = FAST_NUM_DIFFUSION_STEPS if FAST_MODE else 50
GUIDANCE_SCALE = 7.5
MAX_NUM_WORDS = 77
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")
# stable = AttendAndExcitePipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(device)
# Not enough memory for full precision model
stable = AttendAndExcitePipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16).to(device)
tokenizer = stable.tokenizer

Using device: cuda:0


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

`torch_dtype` is deprecated! Use `dtype` instead!


# Pipeline Wrapper

In [4]:
# configurable parameters (see RunConfig for all parameters)
# scale factor - intensity of shift by gradient
# thresholds - a dictionary for iterative refinement mapping the iteration number to the attention threshold
# max_iter_to_alter- maximal inference timestep to apply Attend-and-Excite
def run_and_display(prompts: List[str],
                    controller: AttentionStore,
                    indices_to_alter: List[int],
                    generator: torch.Generator,
                    run_standard_sd: bool = False,
                    scale_factor: int = 20,
                    thresholds: Dict[int, float] = {0:0.05, 10: 0.5, 20: 0.8},
                    max_iter_to_alter: int = 25,
                    display_output: bool = False,
                    sd_2_1: bool = False):
    # Use faster settings in fast mode
    if FAST_MODE:
        max_iter_to_alter = FAST_MAX_ITER_TO_ALTER
        thresholds = {0: 0.1}  # Simplified threshold for speed
    
    config = RunConfig(prompt=prompts[0],
                       run_standard_sd=run_standard_sd,
                       scale_factor=scale_factor,
                       thresholds=thresholds,
                       max_iter_to_alter=max_iter_to_alter,
                       sd_2_1=sd_2_1)
    image = run_on_prompt(model=stable,
                          prompt=prompts,
                          controller=controller,
                          token_indices=indices_to_alter,
                          seed=generator,
                          config=config)
    if display_output:
        display(image)
    return image

# Run Inference on a Set of Seeds and Generate an Image Grid

In [5]:
def get_indices_to_alter_auto(prompt: str, stable):
    """Automatically extract noun token indices without user input."""
    token_idx_to_word = {idx: stable.tokenizer.decode(t)
                         for idx, t in enumerate(stable.tokenizer(prompt)['input_ids'])
                         if 0 < idx < len(stable.tokenizer(prompt)['input_ids']) - 1}
    
    # Automatically select non-stopword tokens (nouns, verbs, adjectives)
    # Skip common words like 'a', 'and', 'the', 'of', 'in', etc.
    stopwords = {'a', 'an', 'and', 'the', 'of', 'in', 'on', 'at', 'to', 'for', 'with', 'by', 'from'}
    token_indices = [idx for idx, word in token_idx_to_word.items() if word.strip().lower() not in stopwords]
    
    print(f"Token mapping: {token_idx_to_word}")
    print(f"Auto-selected indices: {token_indices} -> {[token_idx_to_word[i] for i in token_indices]}")
    return token_indices

def generate_images_for_method(prompt: str,
                               seeds: List[int],
                               indices_to_alter: Optional[List[int]] = None,
                               is_attend_and_excite: bool = True,
                               sd_2_1: bool = False):
    # Use automatic token selection instead of interactive input
    token_indices = get_indices_to_alter_auto(prompt, stable) if indices_to_alter is None else indices_to_alter
    
    # Use fewer seeds in fast mode
    if FAST_MODE and FAST_NUM_SEEDS is not None:
        seeds = seeds[:FAST_NUM_SEEDS]
    
    images = []
    for seed in seeds:
        g = torch.Generator('cuda').manual_seed(seed)
        prompts = [prompt]
        controller = AttentionStore()
        run_standard_sd = False if is_attend_and_excite else True
        image = run_and_display(prompts=prompts,
                                controller=controller,
                                indices_to_alter=token_indices,
                                generator=g,
                                run_standard_sd=run_standard_sd,
                                sd_2_1=sd_2_1)
        # Use smaller image size in fast mode
        img_size = FAST_IMAGE_SIZE if FAST_MODE else 256
        images.append(image.resize((img_size, img_size)))
    grid = vis_utils.get_image_grid(images)
    display(grid)

# Stable Diffusion vs. Attend-and-Excite

## Stable Diffusion

In [6]:
generate_images_for_method(
    prompt="a cat and a frog",
    seeds=[6141, 9031, 969, 1910],
    is_attend_and_excite=False
)

Token mapping: {1: 'a', 2: 'cat', 3: 'and', 4: 'a', 5: 'frog'}
Auto-selected indices: [2, 5] -> ['cat', 'frog']


  num_channels_latents = self.unet.in_channels


  0%|          | 0/50 [00:00<?, ?it/s]

TypeError: prepare_attention_mask() missing 1 required positional argument: 'batch_size'

## Attend-and-Excite

In [None]:
generate_images_for_method(
    prompt="a cat and a frog",
    seeds=[6141, 9031, 969, 1910],
    is_attend_and_excite=True
)

## Stable Diffusion

In [None]:
generate_images_for_method(
    prompt="a mouse and a red car",
    seeds=[7803, 2098, 15792, 2354],
    is_attend_and_excite=False
)

## Attend-and-Excite

In [None]:
generate_images_for_method(
    prompt="a mouse and a red car",
    seeds=[7803, 2098, 15792, 2354],
    is_attend_and_excite=True
)

# Stable Diffusion 2.1

In [None]:
NUM_DIFFUSION_STEPS = FAST_NUM_DIFFUSION_STEPS if FAST_MODE else 50
GUIDANCE_SCALE = 7.5
MAX_NUM_WORDS = 77
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
stable = AttendAndExcitePipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base").to(device)
tokenizer = stable.tokenizer

In [None]:
generate_images_for_method(
    prompt="a cat and a dog",
    seeds=[39, 63, 68, 62],
    is_attend_and_excite=False,
    sd_2_1=True
)

In [None]:
generate_images_for_method(
    prompt="a cat and a dog",
    seeds=[39, 63, 68, 62],
    is_attend_and_excite=True,
    sd_2_1=True
)