Use this notebook to run a single denoiser step and get updated intermediate results.

In [1]:
import os
# Set before torch is imported in the next cell, so the setting is in place before any CUDA work starts.
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'  # needed to make torch deterministic

In [2]:
import torch
from torch.testing import assert_close
from torch import allclose, nn, tensor
# Allow up to 200 characters per line when printing tensors.
torch.set_printoptions(linewidth=200)

In [3]:
# Pick the best available accelerator; fall back to the CPU when neither CUDA nor MPS is usable
# (previously 'mps' was selected unconditionally whenever CUDA was missing).
if torch.cuda.is_available():
    device = 'cuda'
elif getattr(torch.backends, 'mps', None) is not None and torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'
# Half precision is only selected on CUDA; every other device uses float32.
device_dtype = torch.float16 if device == 'cuda' else torch.float32

## Load the model

In [4]:
from diffusers import StableDiffusionXLPipeline
from diffusers import EulerDiscreteScheduler
from diffusers.models.controlnetxs import ControlNetXSModel
from diffusers.pipelines.controlnet_xs.pipeline_controlnet_xs_sd_xl import StableDiffusionXLControlNetXSPipeline

In [5]:
# Load the SDXL base pipeline from a single-file checkpoint and move it to the selected device.
# NOTE(review): `device_dtype` is defined above but not applied here, so both models keep the
# dtype they are loaded with — confirm this is intended.
sdxl_pipe = StableDiffusionXLPipeline.from_single_file('weights/sd_xl_base_1.0_0.9vae.safetensors').to(device)
# Load the ControlNet-XS model from 'weights/cnxs' and move it to the same device.
cnxs = ControlNetXSModel.from_pretrained('weights/cnxs').to(device)

In [6]:
# Sanity-check the loaded ControlNet-XS: configured attention head dim of 64, and a single head
# in the first self-attention of the second control down block.
assert cnxs.config.control_attention_head_dim==64
assert cnxs.control_model.down_blocks[1].attentions[0].transformer_blocks[0].attn1.heads==1

In [7]:
# Use the SDXL pipeline's UNet as the base model of the ControlNet-XS wrapper.
cnxs.base_model = sdxl_pipe.unet

Heidelberg's example script manually sets `scale_list` to 0.95, so we do the same here.

In [8]:
# Overwrite every entry with 0.95 while keeping the shape of the existing tensor.
cnxs.scale_list = cnxs.scale_list * 0. + 0.95
# Only the first entry is checked here.
assert cnxs.scale_list[0] == .95

Heidelberg uses `timestep_spacing = 'linspace'` in their scheduler, so let's do that as well

In [9]:
# Copy the current scheduler config and change only the timestep spacing.
# (`scheduler_cgf` is a typo for "cfg"; the name is kept as-is.)
scheduler_cgf = dict(sdxl_pipe.scheduler.config)
scheduler_cgf['timestep_spacing'] = 'linspace'
sdxl_pipe.scheduler = EulerDiscreteScheduler.from_config(scheduler_cgf)

# Test it worked: after requesting 50 steps, the first timestep must be 999.
sdxl_pipe.scheduler.set_timesteps(50)
assert sdxl_pipe.scheduler.timesteps[0]==999

# Reset: build a fresh scheduler so the `set_timesteps(50)` call above leaves no state behind.
sdxl_pipe.scheduler = EulerDiscreteScheduler.from_config(scheduler_cgf)

sigmas after (linear) interpolation: [14.61464691 12.93677721 11.49164976 10.24291444  9.16035419] ...


In [10]:
# Assemble the ControlNet-XS pipeline from the SDXL pipeline's own components, so both pipelines
# share the same VAE, text encoders, tokenizers, UNet and scheduler objects; only `controlnet` is new.
cnxs_pipe = StableDiffusionXLControlNetXSPipeline(
    vae=sdxl_pipe.vae,
    text_encoder=sdxl_pipe.text_encoder,
    text_encoder_2=sdxl_pipe.text_encoder_2,
    tokenizer=sdxl_pipe.tokenizer,
    tokenizer_2=sdxl_pipe.tokenizer_2,
    unet=sdxl_pipe.unet,
    controlnet=cnxs,
    scheduler=sdxl_pipe.scheduler,
)

___

## Run 1 step locally

In [11]:
import torch
import random
import numpy as np
import cv2
from diffusers.utils import load_image
import matplotlib.pyplot as plt

class CannyDetector:
    """Callable wrapper around `cv2.Canny`."""

    def __call__(self, img, low_threshold, high_threshold):
        """Run `cv2.Canny` on `img` with the two thresholds and return its result."""
        edge_map = cv2.Canny(img, low_threshold, high_threshold)
        return edge_map

def get_canny_edges(image, threshold=(100, 250)):
    """Return the Canny edge map of `image`, divided by 255.

    Args:
        image: Anything `np.array` accepts; it is cast to uint8 before edge detection.
        threshold: Values unpacked as the (low, high) thresholds of `CannyDetector`.

    Returns:
        The detector output divided by 255, at the original image size.
    """
    detector = CannyDetector()
    pixels = np.array(image)
    pixels = pixels.astype(np.uint8)
    raw_edges = detector(pixels, *threshold)  # original sized greyscale edges
    return raw_edges / 255.

def seed_everything(seed):
    """Seed Python's `random`, NumPy and torch, then enable torch's deterministic algorithms.

    Stand-in for the deprecated `seed_everything` from pytorch lightning that the paper used.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    torch.use_deterministic_algorithms(True)

# Fixed seed passed to `seed_everything`; per its name, the value used in the paper — TODO confirm source.
RANDOM_SEED_IN_PAPER = 1999158951

In [12]:
# Load latents saved by a reference run (per the file name: "cloud", without control).
# NOTE(review): `torch.load` unpickles the file — only load files from a trusted source.
latents_sdxl_cloud = torch.load('latents_cloud_no_control.pth', map_location=torch.device(device))
# Divide the first entry by 14.6146, presumably to recover the unscaled starting noise.
# NOTE(review): 14.6146 looks like a rounded copy of the scheduler's first sigma (14.61464691 in the
# printed output) — confirm the rounding is acceptable for this comparison.
rand_from_cloud = latents_sdxl_cloud[0] / 14.6146

In [13]:
# Text conditioning for the run.
prompt = 'cinematic, shoe in the streets, made from meat, photorealistic shoe, highly detailed'
neg_prompt = 'lowres, bad anatomy, worst quality, low quality'

# Control input: Canny edges of the input picture.
image = load_image('input_images/shoe.png')
edges = get_canny_edges(image)

# Stack the 2D edge map three times along a new leading axis to get a 3-channel control image.
edges_tensor = torch.tensor(edges)
three_edges = torch.stack((edges_tensor,edges_tensor,edges_tensor))
three_edges.shape

torch.Size([3, 768, 768])

In [14]:
from diffusers.umer_debug_logger import udl

In [15]:
# Configure the debug logger: target directory 'logs/local_cuda' (with `clear=True`) and condition 'SUBBLOCK'.
# NOTE(review): the directory name says "cuda" regardless of the selected `device`.
udl.set_dir('logs/local_cuda', clear=True)
udl.set_condition('SUBBLOCK')

In [16]:
# Make sure the ControlNet-XS model is in control mode.
cnxs.toggle_control(to=True)

Model already set to control mode == True


In [17]:
from functools import partial
from util_plot import save_latents

# `lats` is bound into the callback below; presumably `save_latents` appends to it on each
# call — verify in util_plot.
lats = []
save_lats = partial(save_latents, lats=lats)

In [18]:
# Seed all RNGs, then run the ControlNet-XS pipeline from the fixed starting latents, with
# `save_lats` passed as the pipeline's `callback`.
# NOTE(review): in the recorded output this call ends with a SystemExit after one pass through the
# blocks — presumably raised by the debug logger condition set earlier; confirm.
seed_everything(RANDOM_SEED_IN_PAPER)
result = cnxs_pipe(prompt, negative_prompt=neg_prompt,image=three_edges, latents=rand_from_cloud, callback=save_lats)

sigmas after (linear) interpolation: [14.61464691 12.93677721 11.49164976 10.24291444  9.16035419] ...


  0%|          | 0/50 [00:00<?, ?it/s]

control_scale: tensor([0.9500, 0.9500, 0.9500, 0.9500, 0.9500, 0.9500, 0.9500, 0.9500, 0.9500, 0.9500, 0.9500, 0.9500, 0.9500, 0.9500, 0.9500, 0.9500, 0.9500, 0.9500, 0.9500], device='cuda:0')
------ enc ------
>> Applying base block	>> Applying ctrl block	
>> Applying base block	>> Applying ctrl block	
>> Applying base block	>> Applying ctrl block	
>> Applying base block	>> Applying ctrl block	
>> Applying base block	>> Applying ctrl block	
>> Applying base block	>> Applying ctrl block	
>> Applying base block	>> Applying ctrl block	
>> Applying base block	>> Applying ctrl block	
------ mid ------
>> Applying base block	
>> Applying base block	
>> Applying ctrl block	
>> Applying ctrl block	
------ dec ------
>> Applying base block	
>> Applying base block	
>> Applying base block	
>> Applying base block	
>> Applying base block	
>> Applying base block	
>> Applying base block	
>> Applying base block	
>> Applying base block	

The subblocks are cought. Let us gaze into their soul, their ver

SystemExit: The subblocks are cought. Let us gaze into their soul, their very essence.

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


Move the pipeline and the latents to the CPU; otherwise we get a CUDA out-of-memory (OOM) error when decoding the image after each denoising step.

In [None]:
def to_cpu(t):
    """Return `t.cpu()` when `t` has a `cpu` attribute; otherwise return `t` unchanged."""
    if not hasattr(t, 'cpu'):
        return t
    return t.cpu()
def only_lat(l):
    """Return the last element of `l`."""
    last_item = l[-1]
    return last_item

In [None]:
# Keep only the last element of each recorded entry and move it to the CPU.
# NOTE: this rebinds `lats`, so re-running this cell applies `only_lat` again to the already-reduced entries.
lats = [to_cpu(only_lat(l)) for l in lats]

In [None]:
# Move the whole pipeline to the CPU; `pipe` is the handle passed to the plotting helper later on.
pipe=cnxs_pipe.to('cpu')

In [None]:
# Check which device the first latent is on.
lats[0].device

In [None]:
import einops
import numpy as np
from PIL import Image

def lat2img(lat, pipe, resize_to=None, output_type='pil'):
    """Decode latents into images with the pipeline's VAE and image processor.

    Args:
        lat: Latent tensor; a 3-dimensional tensor gets a batch dimension prepended.
        pipe: Pipeline providing `vae` (with `decode` and `config.scaling_factor`)
            and `image_processor` (with `postprocess`).
        resize_to: Optional size passed to each image's `resize`; only applied for 'pil' output.
        output_type: Forwarded to `pipe.image_processor.postprocess`.

    Returns:
        The post-processed images, resized when `resize_to` is given and `output_type` is 'pil'.
    """
    wants_resize = resize_to is not None
    with torch.no_grad():
        # add batch dimension if it is missing
        batch = lat.unsqueeze(0) if lat.dim() == 3 else lat
        decoded = pipe.vae.decode(batch / pipe.vae.config.scaling_factor, return_dict=False)[0]
        images = pipe.image_processor.postprocess(decoded, output_type=output_type)

        if wants_resize and output_type == 'pil':
            images = [picture.resize(resize_to) for picture in images]
        elif wants_resize:
            print(f'Not resizing as output_type = {output_type} requested')
    return images

In [None]:
from functools import partial
from tqdm.notebook import tqdm

import torch
import einops
import matplotlib.pyplot as plt
from PIL import Image, ImageOps, ImageDraw
from tqdm.notebook import tqdm
from functools import partial

def plot_latents_to_pil_grid(lats, pipe, every=1, cols=10, im_size=200, pbar=True, border=2, return_ims=True, output_type='pil'):
    """Decode latents with `lat2img` and paste the resulting images into one grid image.

    Args:
        lats: Sequence of latents; each one is passed to `lat2img`.
        pipe: Pipeline forwarded to `lat2img` for decoding.
        every: Keep only every `every`-th latent; the last latent is always kept.
        cols: Number of images per grid row.
        im_size: Size of each image, either an int (square) or a (width, height) pair.
        pbar: If True, wrap the decoding loop in a tqdm progress bar.
        border: Width in pixels of the black border added around each image.
        return_ims: If True, also return the list of decoded (un-bordered) images.
        output_type: Forwarded to `lat2img`; only 'pil' images can be tiled into a grid.

    Returns:
        `(grid_image, ims)` if `return_ims` is True, otherwise only `grid_image`.

    Raises:
        ValueError: If `output_type` is not 'pil'.
    """
    # Fail early with a clear message: the border/paste steps below only work on PIL images.
    if output_type != 'pil':
        raise ValueError(f"plot_latents_to_pil_grid can only tile PIL images, got output_type={output_type!r}")
    if not isinstance(im_size, (list, tuple)): im_size = (im_size, im_size)

    last_idx = len(lats) - 1
    kept = [lat for i, lat in enumerate(lats) if i % every == 0 or i == last_idx]
    if pbar: kept = tqdm(kept)

    # decode latents -> images
    ims = [lat2img(lat, pipe, resize_to=im_size, output_type=output_type)[0] for lat in kept]

    # add border (honouring the `border` argument); `ImageOps.expand` pads every side,
    # so each grid cell grows by 2*border along both axes
    ims_bordered = [ImageOps.expand(im, border=border, fill='black') for im in ims]
    cell_w, cell_h = im_size[0] + 2*border, im_size[1] + 2*border

    rows = (len(ims) + cols - 1) // cols  # ceil division

    # draw background: grey canvas with white diagonal lines
    grid_image = Image.new('RGB', (cols * cell_w, rows * cell_h), color='grey')
    draw = ImageDraw.Draw(grid_image)
    for xy in range(0, 2*max(cols * cell_w, rows * cell_h)+1, 100):
        draw.line([(xy, 0), (0, xy)], fill="white", width=1)

    # draw images, filling the grid row by row
    for i, img in enumerate(ims_bordered):
        grid_image.paste(img, ((i % cols) * cell_w, (i // cols) * cell_h))

    if return_ims: return grid_image, ims
    return grid_image

In [None]:
# Inspect the container type of the collected latents.
type(lats)

In [None]:
# Decode the collected latents and tile them into one grid image (default layout settings).
grid, ims = plot_latents_to_pil_grid(lats, pipe=pipe)

In [None]:
# Display the assembled grid.
grid

In [None]:
import matplotlib.pyplot as plt

# Show the input image (loaded earlier as `image`) for reference, without axes.
plt.figure(figsize=(4, 4))
plt.imshow(image)
plt.axis('off')
plt.show()