In [1]:
import os
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'  # needed to make torch deterministic

In [2]:
import numpy as np
import torch
from torch import allclose, nn, tensor
# Print tensors on wide lines, with 4 decimals and without scientific notation.
torch.set_printoptions(linewidth=200, precision=4, sci_mode=False)
# Request deterministic torch algorithms (goes together with CUBLAS_WORKSPACE_CONFIG set in the first cell).
torch.use_deterministic_algorithms(True)

In [3]:
def seed_all(seed):
    """Seed every random number generator used in this notebook.

    The same value is handed, in this order, to torch's CPU generator, torch's
    CUDA generators, NumPy's global generator and Python's ``random`` module.

    Args:
        seed: Seed value passed unchanged to each library.
    """
    import random as py_random
    import numpy
    import torch as th

    seeders = (
        th.manual_seed,
        th.cuda.manual_seed_all,
        numpy.random.seed,
        py_random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

In [4]:
def print_dict(d):
    """Print all entries of ``d`` on one line as ``key: value`` pairs.

    A value that has an ``item`` attribute (for example a one-element tensor)
    is converted with ``.item()`` first. Every value is formatted with four
    decimals and followed by five spaces; a newline ends the line.

    Args:
        d: Mapping from labels to numeric values.
    """
    gap = ' ' * 5
    for key, value in d.items():
        scalar = value.item() if hasattr(value, 'item') else value
        print('{}: {:.4f}'.format(key, scalar), end=gap)
    print()

In [5]:
# Constant 2x2 inputs so that every scheduler step below is easy to check by hand:
# an all-ones tensor, an all-minus-ones tensor, and a fixed stand-in for random noise.
pos = torch.full((2, 2), 1.0)
neg = torch.full((2, 2), -1.0)
pseudo_noise = torch.full((2, 2), 100.0)

tuple(t.flatten() for t in (pos, neg, pseudo_noise))

(tensor([1., 1., 1., 1.]),
 tensor([-1., -1., -1., -1.]),
 tensor([100., 100., 100., 100.]))

**Load Heidelberg scheduler**

In [6]:
import scripts.control_utils as cu
from ldm.models.diffusion.ddim import DDIMSampler



Downloading: "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/dpt_hybrid-midas-501f0c75.pt" to /home/ControlNet-XS/annotator/ckpts/dpt_hybrid-midas-501f0c75.pt



100%|██████████| 470M/470M [00:04<00:00, 111MB/s]  
  model = create_fn(


In [7]:
# Config file passed to cu.create_model below (relative path).
path_to_config = 'cnxs_config/sd/sd21_encD_canny_14m.yaml'

If the next cell crashes the kernel, too much GPU memory is probably in use elsewhere. Shut down every other kernel and try again.

In [8]:
# Build the model described by the config file and move it to the CPU.
model = cu.create_model(path_to_config).to('cpu')

TwoStreamControlLDM: Running in eps-prediction mode
DiffusionWrapper has 865.91 M params.
making attention of type 'vanilla-xformers' with 512 in_channels
building MemoryEfficientAttnBlock with 512 in_channels...
Working with z of shape (1, 4, 32, 32) = 4096 dimensions.
making attention of type 'vanilla-xformers' with 512 in_channels
building MemoryEfficientAttnBlock with 512 in_channels...
[404 keys are missing from the model (hint processing and cross connections included)]
['CONTROL WEIGHTS LOADED']
Loaded model config from [cnxs_config/sd/sd21_encD_canny_14m.yaml]


In [9]:
# Show which parameterization the loaded model reports.
model.parameterization

'eps'

In [10]:
# Heidelberg (ldm) DDIM sampler constructed around the loaded model.
h_sampler = DDIMSampler(model)

In [11]:
# Number of DDPM timesteps and the `schedule` setting stored on the sampler.
h_sampler.ddpm_num_timesteps, h_sampler.schedule

(1000, 'linear')

In [12]:
# Build the DDIM schedule for 50 steps with eta=0.5; verbose=True prints the selected timesteps and alphas.
h_sampler.make_schedule(ddim_num_steps=50, ddim_eta=0.5, verbose=True)

Selected timesteps for ddim sampler: [  1  21  41  61  81 101 121 141 161 181 201 221 241 261 281 301 321 341
 361 381 401 421 441 461 481 501 521 541 561 581 601 621 641 661 681 701
 721 741 761 781 801 821 841 861 881 901 921 941 961 981]
Selected alphas for ddim sampler: a_t: tensor([0.9983, 0.9804, 0.9609, 0.9398, 0.9171, 0.8930, 0.8674, 0.8404, 0.8121, 0.7827, 0.7521, 0.7207, 0.6885, 0.6557, 0.6224, 0.5888, 0.5551, 0.5215, 0.4882, 0.4552, 0.4229, 0.3913, 0.3605, 0.3308,
        0.3023, 0.2750, 0.2490, 0.2245, 0.2014, 0.1799, 0.1598, 0.1413, 0.1243, 0.1087, 0.0946, 0.0819, 0.0705, 0.0604, 0.0514, 0.0435, 0.0365, 0.0305, 0.0254, 0.0210, 0.0172, 0.0140, 0.0113, 0.0091,
        0.0073, 0.0058]); a_(t-1): [0.99914998 0.99829602 0.98038077 0.96087277 0.93978298 0.91713792
 0.89298052 0.86737001 0.84038192 0.81210774 0.78265446 0.75214338
 0.72070938 0.68849909 0.65566933 0.62238538 0.58881873 0.55514455
 0.52153981 0.4881804  0.45523876 0.42288151 0.39126703 0.36054322
 0.33084565 0.302

In [13]:
# Timesteps selected by make_schedule above.
h_sampler.ddim_timesteps

array([  1,  21,  41,  61,  81, 101, 121, 141, 161, 181, 201, 221, 241,
       261, 281, 301, 321, 341, 361, 381, 401, 421, 441, 461, 481, 501,
       521, 541, 561, 581, 601, 621, 641, 661, 681, 701, 721, 741, 761,
       781, 801, 821, 841, 861, 881, 901, 921, 941, 961, 981])

In [14]:
x = pos              # sample fed into the scheduler step
model_output = neg   # constant stand-in for the model's prediction

def make_shape(t):
    """Return a (2, 1) tensor filled with the scalar ``t``."""
    return torch.full((2, 1), t)

# Index of the last entry of the DDIM schedule, i.e. the largest selected timestep.
# Derived from the schedule length instead of repeating the literal `50-1`.
step_index = len(h_sampler.ddim_timesteps) - 1

# Per-step constants of the Heidelberg sampler, expanded to (2, 1) tensors.
a_t = make_shape(h_sampler.ddim_alphas[step_index])
a_prev = make_shape(h_sampler.ddim_alphas_prev[step_index])
sqrt_one_minus_at = make_shape(h_sampler.ddim_sqrt_one_minus_alphas[step_index])
sigma_t = make_shape(h_sampler.ddim_sigmas[step_index])

# First row of each constant, kept for printing and for the later comparison
# with the diffusers scheduler.
h_vars = {
    'alpha_t': a_t[0],
    'alpha_(t-1)': a_prev[0],
    'sqrt(1-alpha_t)': sqrt_one_minus_at[0],
    'sigma': sigma_t[0],
}

print_dict(h_vars)

alpha_t: 0.0058     alpha_(t-1): 0.0073     sqrt(1-alpha_t): 0.9971     sigma: 0.2272     


In [15]:
# Scale factor applied to the noise term; 1 leaves it unchanged.
temperature = 1

# One DDIM step of the Heidelberg sampler, written out by hand.
# model.parameterization == 'eps'
# Predicted x_0 from the current sample and the model output.
pred_x0 = (x - sqrt_one_minus_at * model_output) / a_t.sqrt()
# Direction pointing to x_t.
dir_xt = (1. - a_prev - sigma_t**2).sqrt() * model_output
# Noise term; the constant pseudo_noise is used instead of a random draw.
noise = sigma_t * pseudo_noise * temperature

# Sum of the three terms gives the Heidelberg result for this step.
h_res = a_prev.sqrt() * pred_x0 + dir_xt + noise

In [16]:
# Result of the manual Heidelberg step.
h_res

tensor([[23.9955, 23.9955],
        [23.9955, 23.9955]])

**Load diffusers scheduler**

In [17]:
from diffusers import StableDiffusionPipeline
from diffusers import PNDMScheduler, DDIMScheduler

Info: `UmerDebugLogger` created. This is a logging class that will be deleted when the PR to integrate ControlNet-XS is done.


In [18]:
# Load a StableDiffusionPipeline from the local checkpoint file and move it to the CPU.
sd_pipe = StableDiffusionPipeline.from_single_file('weights/sd/sd21/v2-1_512-ema-pruned.ckpt').to('cpu')

You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .
  deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False)


In [19]:
# Start from the DDIMScheduler default config, then take over every setting
# that the pipeline's own scheduler defines under a key DDIMScheduler also has.
scheduler_dict = dict(DDIMScheduler().config)
scheduler_dict.update(
    (k, v) for k, v in sd_pipe.scheduler.config.items() if k in scheduler_dict
)
#scheduler_dict['eta'] = 1.0 -- will be passed as call arg

d_scheduler = DDIMScheduler(**scheduler_dict)

In [20]:
# Configure 50 inference steps, then show the resulting timesteps.
d_scheduler.set_timesteps(50)
d_scheduler.timesteps

tensor([981, 961, 941, 921, 901, 881, 861, 841, 821, 801, 781, 761, 741, 721, 701, 681, 661, 641, 621, 601, 581, 561, 541, 521, 501, 481, 461, 441, 421, 401, 381, 361, 341, 321, 301, 281, 261, 241,
        221, 201, 181, 161, 141, 121, 101,  81,  61,  41,  21,   1])

In [21]:
# Run one diffusers DDIM step at timestep 981 with the same constant inputs and eta=0.5.
# The RNGs are seeded first — presumably because step() draws random noise when eta > 0
# (the result is not constant across elements); confirm against the diffusers docs.
seed_all(0)
d_res = d_scheduler.step(model_output=neg, timestep=981, sample=pos, eta=0.5, return_dict=False)[0]

In [22]:
# Result of the diffusers scheduler step.
d_res

tensor([[1.3353, 0.9185],
        [0.4900, 1.1143]])

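Let's execute one diffusers scheduler step manually, using `pseudo_noise` in place of the random noise so that the result is deterministic.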

In [23]:
# Training and inference step counts; their ratio is the spacing used for prev_timestep below.
d_scheduler.config.num_train_timesteps, d_scheduler.num_inference_steps

(1000, 50)

In [24]:
# One diffusers DDIM step written out by hand, with the constant pseudo_noise
# in place of the random noise so the result can be compared exactly.
seed_all(0)

sample = pos # previous latent
model_output = neg # current noise prediction
eta = 0.5

timestep = 981
# 1. get previous step value (=t-1)
# Derived from `timestep` and the scheduler's own step counts instead of the
# hard-coded `981 - 1000 // 50`.
prev_timestep = timestep - d_scheduler.config.num_train_timesteps // d_scheduler.num_inference_steps

# 2. compute alphas, betas
alpha_prod_t = d_scheduler.alphas_cumprod[timestep]
alpha_prod_t_prev = d_scheduler.alphas_cumprod[prev_timestep]

beta_prod_t = 1 - alpha_prod_t

# 3. compute predicted original sample from predicted noise also called
# "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf

# NOTE: the two formulas below are the v-prediction ones, whereas the manual
# Heidelberg step in this notebook uses the eps formulas.
# self.config.prediction_type == "v_prediction":
pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output
pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample

# 5. compute variance: "sigma_t(η)" -> see formula (16)
# σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1)
variance = d_scheduler._get_variance(timestep, prev_timestep)
std_dev_t = eta * variance ** (0.5)

# Schedule constants under the same keys as h_vars, for a side-by-side print.
d_vars = {
    'alpha_t': alpha_prod_t,
    'alpha_(t-1)': alpha_prod_t_prev,
    'sqrt(1-alpha_t)': (1-alpha_prod_t).sqrt(),
    'sigma': std_dev_t,
}

# 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * pred_epsilon

# 7. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction

# Deterministic stand-in for the random noise.
variance_noise = pseudo_noise

# From here on `variance` holds the noise term sigma_t * noise, not the variance itself.
variance = std_dev_t * variance_noise
prev_sample = prev_sample + variance

d_res = prev_sample

In [25]:
# Schedule constants side by side: Heidelberg first, diffusers second.
print_dict(h_vars)
print_dict(d_vars)

alpha_t: 0.0058     alpha_(t-1): 0.0073     sqrt(1-alpha_t): 0.9971     sigma: 0.2272     
alpha_t: 0.0058     alpha_(t-1): 0.0073     sqrt(1-alpha_t): 0.9971     sigma: 0.2272     


In [26]:
# Result of the manual diffusers step.
d_res

tensor([[23.7083, 23.7083],
        [23.7083, 23.7083]])

In [27]:
# Heidelberg result, shown again for comparison.
h_res

tensor([[23.9955, 23.9955],
        [23.9955, 23.9955]])

**Let's compare the scheduler substeps of Heidelberg and diffusers side-by-side**

In [40]:
# Both computations must start from the same sample.
assert (x==sample).all()
# Short alias for the model output, used in the formulas below.
f = model_output

In [41]:
# Short aliases for the Heidelberg constants collected in h_vars.
h_α      = h_vars['alpha_t']
h_α_prev = h_vars['alpha_(t-1)']
# β = 1 - α, recovered by squaring the stored sqrt(1-alpha_t).
h_β      = h_vars['sqrt(1-alpha_t)']**2
h_σ      = h_vars['sigma']

# Short aliases for the diffusers constants collected in d_vars.
d_α      = d_vars['alpha_t']
d_α_prev = d_vars['alpha_(t-1)']
# β = 1 - α, recovered by squaring the stored sqrt(1-alpha_t).
d_β      = d_vars['sqrt(1-alpha_t)']**2
d_σ      = d_vars['sigma']

In [62]:
temp = 1  # scale factor for the noise term; 1 leaves it unchanged

# Heidelberg DDIM step, using only the Heidelberg constants (h_*).
# Predicted x_0 with the eps formula: (x - sqrt(β) * f) / sqrt(α).
# h_β is used here; the original borrowed the diffusers constant d_β.
h_pred_x0 = (x - h_β.sqrt()*f) / h_α.sqrt()
# Direction pointing to x_t.
h_dir_xt  = (1. - h_α_prev - h_σ**2).sqrt() * f

# Noise term; the constant pseudo_noise stands in for the random draw.
n = h_σ * pseudo_noise * temp

h_x_prev = h_α_prev.sqrt()*h_pred_x0 + h_dir_xt + n

In [63]:
# Heidelberg result of the side-by-side computation.
h_x_prev.flatten()

tensor([23.9955, 23.9955, 23.9955, 23.9955])

In [64]:
# diffusers DDIM step written with the eps formulas, using only the diffusers constants (d_*).
# Predicted x_0: (x - sqrt(β) * f) / sqrt(α).
d_pred_x0 = (x - d_β.sqrt()*f) / d_α.sqrt()
# Direction pointing to x_t. With the eps formulas the predicted noise is the
# model output itself, so `f` is used instead of `pred_epsilon`, which holds
# the v-prediction value computed in the manual diffusers step. d_α_prev
# replaces the Heidelberg constant h_α_prev that the original borrowed.
d_dir_xt = (1. - d_α_prev - d_σ**2).sqrt() * f

# Noise term; the constant pseudo_noise stands in for the random draw.
n = d_σ * pseudo_noise

d_res = d_α_prev.sqrt()*d_pred_x0 + d_dir_xt + n

In [65]:
# diffusers result of the side-by-side computation.
d_res.flatten()

tensor([23.9955, 23.9955, 23.9955, 23.9955])