In [1]:
from diffusers import StableDiffusionPipeline, StableDiffusionXLPipeline

In [2]:
# Load the complete SDXL (base 1.0) and SD 2.1 pipelines by model identifier.
# Only the `unet` attribute of each pipeline is read in the next cell.
sdxl_pipe = StableDiffusionXLPipeline.from_pretrained('stabilityai/stable-diffusion-xl-base-1.0')
sd21_pipe = StableDiffusionPipeline.from_pretrained('stabilityai/stable-diffusion-2-1')

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]

In [3]:
# Short handles to the UNet of each pipeline; these are what the later cells inspect.
sdxl = sdxl_pipe.unet
sd21 = sd21_pipe.unet

In [4]:
def has(o, attr):
    """Tell whether `o` exposes attribute `attr` with a value other than None.

    Returns False both when the attribute is missing and when it is present
    but set to None; returns True otherwise (including falsy values like 0).
    """
    # A missing attribute falls back to None, so both "absent" and "None" collapse to False.
    return getattr(o, attr, None) is not None

def tf_num(attns):
    """Return the number of transformer blocks shared by every module in `attns`.

    Each element must expose a sized `transformer_blocks` attribute.
    Raises AssertionError unless all elements agree on a single count
    (which also means an empty `attns` fails the assertion).
    """
    distinct_counts = {len(module.transformer_blocks) for module in attns}
    assert len(distinct_counts) == 1, "There are different numbers of transformers per attention"
    # Exactly one element remains, so this is the common count.
    return next(iter(distinct_counts))

def print_down_blocks(unet):
    """Print a one-line summary for each entry of `unet.down_blocks`.

    For every block, the line lists how many resnets, attentions (with the
    per-attention transformer count from `tf_num`) and downsamplers it has.
    A component is omitted when the attribute is missing or None.
    """
    for idx, block in enumerate(unet.down_blocks):
        parts = []

        if has(block, 'resnets'):
            parts.append(f'{len(block.resnets)} resnets')

        if has(block, 'attentions'):
            n_attn = len(block.attentions)
            n_tf = tf_num(block.attentions)
            parts.append(f'{n_attn} attentions ({n_tf} transformers each)')

        if has(block, 'downsamplers'):
            parts.append(f'{len(block.downsamplers)} downsamplers')

        print(f'Block {idx}:', ', '.join(parts))

In [5]:
# Summarise the down blocks of the SD 2.1 UNet.
print_down_blocks(sd21)

Block 0: 2 resnets, 2 attentions (1 transformers each), 1 downsamplers
Block 1: 2 resnets, 2 attentions (1 transformers each), 1 downsamplers
Block 2: 2 resnets, 2 attentions (1 transformers each), 1 downsamplers
Block 3: 2 resnets


In [6]:
# Summarise the down blocks of the SDXL UNet, for comparison with SD 2.1.
print_down_blocks(sdxl)

Block 0: 2 resnets, 1 downsamplers
Block 1: 2 resnets, 2 attentions (2 transformers each), 1 downsamplers
Block 2: 2 resnets, 2 attentions (10 transformers each)


In [7]:
# Show the module tree of the first SD 2.1 down block.
sd21.down_blocks[0]

CrossAttnDownBlock2D(
  (attentions): ModuleList(
    (0-1): 2 x Transformer2DModel(
      (norm): GroupNorm(32, 320, eps=1e-06, affine=True)
      (proj_in): LoRACompatibleLinear(in_features=320, out_features=320, bias=True)
      (transformer_blocks): ModuleList(
        (0): BasicTransformerBlock(
          (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
          (attn1): Attention(
            (to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
            (to_k): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
            (to_v): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
            (to_out): ModuleList(
              (0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True)
              (1): Dropout(p=0.0, inplace=False)
            )
          )
          (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
          (attn2): Attention(
            (to_q): LoRACompatibl

In [8]:
from diffusers import ControlNetXSModel

In [9]:
# Build a ControlNet-XS model from the SD 2.1 UNet (hence is_sdxl=False).
# The call reports that `norm_num_groups` is derived automatically because it
# was not set explicitly (see the message in this cell's output).
cnxs = ControlNetXSModel.create_as_in_original_paper(sd21, is_sdxl=False)

`norm_num_groups` was set to `min(block_out_channels)` (=4) so it divides all block_out_channels` ([4, 8, 16, 16]). Set it explicitly to remove this information.


In [10]:
from diffusers import UNet2DConditionModel

In [11]:
import inspect
from typing import get_type_hints

def stringify(o):
    """Return the `str()` of `o`, rendering tuples as if they were lists.

    Converting tuples to lists first makes `(1, 2)` and `[1, 2]` produce the
    same string, so the two can be compared textually.
    """
    return str(list(o)) if isinstance(o, tuple) else str(o)
    
def non_internal_params(unet):
    """Return the entries of `unet.config` whose keys do not start with '_'.

    Every value is passed through `stringify`, so the result maps parameter
    names to strings.
    """
    params = {}
    for name, value in dict(unet.config).items():
        if name.startswith('_'):
            continue  # skip underscore-prefixed entries
        params[name] = stringify(value)
    return params

# Stringified constructor defaults of UNet2DConditionModel (every parameter but `self`),
# alongside the stringified, non-underscore config entries of the two loaded UNets.
params_default = {
    name: stringify(param.default)
    for name, param in inspect.signature(UNet2DConditionModel.__init__).parameters.items()
    if name != 'self'
}
params_sdxl = non_internal_params(sdxl)
params_sd21 = non_internal_params(sd21)

In [12]:
# All three dictionaries must describe exactly the same parameter names,
# otherwise the key-by-key comparisons that follow would hit a KeyError.
assert params_default.keys() == params_sdxl.keys() == params_sd21.keys()

In [13]:
# Report every parameter whose SDXL value differs from the constructor
# default, and collect the names of those parameters in `nondef_sdxl`.
nondef_sdxl = set()

print("Params for which SDXL doesn't use default values:")
for k, v_def in params_default.items():
    v_sdxl = params_sdxl[k]
    if v_sdxl == v_def:
        continue  # matches the default: nothing to report

    # Both values are already strings, so the `<40` padding is plain text alignment.
    print(f'- {k:<30}: {v_sdxl:<40} | def: {v_def:<40}')
    nondef_sdxl.add(k)

Params for which SDXL doesn't use default values:
- sample_size                   : 128                                      | def: None                                    
- down_block_types              : ['DownBlock2D', 'CrossAttnDownBlock2D', 'CrossAttnDownBlock2D'] | def: ['CrossAttnDownBlock2D', 'CrossAttnDownBlock2D', 'CrossAttnDownBlock2D', 'DownBlock2D']
- up_block_types                : ['CrossAttnUpBlock2D', 'CrossAttnUpBlock2D', 'UpBlock2D'] | def: ['UpBlock2D', 'CrossAttnUpBlock2D', 'CrossAttnUpBlock2D', 'CrossAttnUpBlock2D']
- block_out_channels            : [320, 640, 1280]                         | def: [320, 640, 1280, 1280]                  
- cross_attention_dim           : 2048                                     | def: 1280                                    
- transformer_layers_per_block  : [1, 2, 10]                               | def: 1                                       
- attention_head_dim            : [5, 10, 20]                              | def: 8   

In [14]:
# Report every parameter whose SD 2.1 value differs from the constructor
# default, and collect the names of those parameters in `nondef_sd21`.
nondef_sd21 = set()

print("Params for which SD21 doesn't use default values:")
for k, v_def in params_default.items():
    v_sd21 = params_sd21[k]
    if v_sd21 == v_def:
        continue  # matches the default: nothing to report

    # Both values are already strings, so the `<40` padding is plain text alignment.
    print(f'- {k:<30}: {v_sd21:<40} | def: {v_def:<40}')
    nondef_sd21.add(k)

Params for which SD21 doesn't use default values:
- sample_size                   : 96                                       | def: None                                    
- cross_attention_dim           : 1024                                     | def: 1280                                    
- attention_head_dim            : [5, 10, 20, 20]                          | def: 8                                       
- use_linear_projection         : True                                     | def: False                                   
- upcast_attention              : True                                     | def: False                                   


In [15]:
# Names of the parameters for which SD 2.1 deviates from the defaults.
nondef_sd21

{'attention_head_dim',
 'cross_attention_dim',
 'sample_size',
 'upcast_attention',
 'use_linear_projection'}

In [16]:
# Parameters that are non-default in SD 2.1 but left at the default in SDXL.
nondef_sd21 - nondef_sdxl

set()

In [17]:
# Parameters that are non-default in SDXL but left at the default in SD 2.1.
nondef_sdxl - nondef_sd21

{'addition_embed_type',
 'addition_time_embed_dim',
 'block_out_channels',
 'down_block_types',
 'projection_class_embeddings_input_dim',
 'transformer_layers_per_block',
 'up_block_types'}

In [18]:
# Names of the parameters for which SDXL deviates from the defaults.
nondef_sdxl

{'addition_embed_type',
 'addition_time_embed_dim',
 'attention_head_dim',
 'block_out_channels',
 'cross_attention_dim',
 'down_block_types',
 'projection_class_embeddings_input_dim',
 'sample_size',
 'transformer_layers_per_block',
 'up_block_types',
 'upcast_attention',
 'use_linear_projection'}

In [19]:
# Look up one default; values in `params_default` went through stringify(), so this is a str.
params_default['layers_per_block']

'2'