In [6]:
# training_demo.ipynb

# 1. Imports
from pathlib import Path
import torch
from torch import nn
from torch.nn import Module
from torch.utils.data import Dataset
import torchvision.transforms as T
from diffusers.models import AutoencoderKL
from transfusion_pytorch import Transfusion, print_modality_sample
# Image and model size balancing
IMAGE_SIZE = 512  # Base image size
LATENT_FACTOR = 8  # SD VAE's downsampling factor
LATENT_SIZE = IMAGE_SIZE // LATENT_FACTOR  # = 96x96 latent space

In [7]:
# 2. Helper Classes
class Encoder(Module):
    def __init__(self, vae):
        super().__init__()
        self.vae = vae

    def forward(self, image):
        with torch.no_grad():
            latent = self.vae.encode(image * 2 - 1)
        return 0.18215 * latent.latent_dist.sample()

class Decoder(Module):
    def __init__(self, vae):
        super().__init__()
        self.vae = vae

    def forward(self, latents):
        latents = (1 / 0.18215) * latents
        with torch.no_grad():
            image = self.vae.decode(latents).sample
        return (image / 2 + 0.5).clamp(0, 1)

# 3. Model Setup
def setup_model():
    # Load VAE
    vae = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="vae")
    
    # Initialize model
    model = Transfusion(
        num_text_tokens = 256,
        dim_latent = 4,
        channel_first_latent = True,
        modality_default_shape = (LATENT_SIZE, LATENT_SIZE),
        modality_encoder = Encoder(vae),
        modality_decoder = Decoder(vae),
        pre_post_transformer_enc_dec = (
            nn.Sequential(
                nn.Conv2d(4, 640, 3, 2, 1),
                nn.Conv2d(640, 1280, 3, 2, 1),
            ),
            nn.Sequential(
                nn.ConvTranspose2d(1280, 640, 3, 2, 1, output_padding=1),
                nn.ConvTranspose2d(640, 4, 3, 2, 1, output_padding=1),
            )
        ),
        add_pos_emb = True,
        modality_num_dim = 2,
        reconstruction_loss_weight = 0.05,
        transformer = dict(
            dim = 1280,
            depth = 24,
            dim_head = 80,
            heads = 16,
            dropout = 0.1,
            ff_expansion_factor = 2.0
        )
    ).cuda()

    return model

In [8]:

# 4. Load Checkpoint
def load_checkpoint(model, checkpoint_path):
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['model_state_dict'])
    return model

In [9]:
# 5. Generate Samples
def generate_samples(model, num_samples=4):
    with torch.no_grad():
        sample = model.sample(batch_size=num_samples)
        print_modality_sample(sample)
        return sample

# 6. Demo Usage
if __name__ == "__main__":
    # Setup model
    model = setup_model()
    
    # Load checkpoint
    checkpoint_path = Path('./checkpoints/best_model.pt')
    if checkpoint_path.exists():
        model = load_checkpoint(model, checkpoint_path)
        print("Loaded checkpoint successfully!")
    
    # Generate samples
    samples = generate_samples(model)

  checkpoint = torch.load(checkpoint_path)


RuntimeError: Error(s) in loading state_dict for Transfusion:
	Missing key(s) in state_dict: "transformer.layers.12.0.weight", "transformer.layers.13.0.weight", "pos_emb_mlp.0.mlps.0.0.weight", "pos_emb_mlp.0.mlps.0.0.bias", "pos_emb_mlp.0.mlps.0.2.weight", "pos_emb_mlp.0.mlps.0.2.bias", "pos_emb_mlp.0.mlps.0.4.weight", "pos_emb_mlp.0.mlps.0.4.bias", "pos_emb_mlp.0.mlps.1.0.weight", "pos_emb_mlp.0.mlps.1.0.bias", "pos_emb_mlp.0.mlps.1.2.weight", "pos_emb_mlp.0.mlps.1.2.bias", "pos_emb_mlp.0.mlps.1.4.weight", "pos_emb_mlp.0.mlps.1.4.bias", "latent_to_model_projs.0.0.0.weight", "latent_to_model_projs.0.0.0.bias", "latent_to_model_projs.0.0.1.weight", "latent_to_model_projs.0.0.1.bias", "model_to_latent_projs.0.1.0.weight", "model_to_latent_projs.0.1.0.bias", "model_to_latent_projs.0.1.1.weight", "model_to_latent_projs.0.1.1.bias". 
	Unexpected key(s) in state_dict: "transformer.layers.24.0.weight", "transformer.layers.24.1.layernorm_gamma", "transformer.layers.24.1.layerscale", "transformer.layers.24.1.fn.to_qkv.0.weight", "transformer.layers.24.1.fn.to_learned_value_residual.0.weight", "transformer.layers.24.1.fn.to_learned_value_residual.0.bias", "transformer.layers.24.1.fn.to_gates.0.weight", "transformer.layers.24.1.fn.to_out.1.weight", "transformer.layers.24.1.to_film.weight", "transformer.layers.24.1.to_film.bias", "transformer.layers.24.1.to_ada_ln_zero.weight", "transformer.layers.24.1.to_ada_ln_zero.bias", "transformer.layers.24.2.static_beta", "transformer.layers.24.2.static_alpha", "transformer.layers.24.2.dynamic_alpha_fn", "transformer.layers.24.2.dynamic_alpha_scale", "transformer.layers.24.2.dynamic_beta_fn", "transformer.layers.24.2.dynamic_beta_scale", "transformer.layers.24.2.norm.gamma", "transformer.layers.24.3.layernorm_gamma", "transformer.layers.24.3.layerscale", "transformer.layers.24.3.fn.0.weight", "transformer.layers.24.3.fn.0.bias", "transformer.layers.24.3.fn.3.weight", "transformer.layers.24.3.fn.3.bias", "transformer.layers.24.3.to_film.weight", "transformer.layers.24.3.to_film.bias", "transformer.layers.24.3.to_ada_ln_zero.weight", "transformer.layers.24.3.to_ada_ln_zero.bias", "transformer.layers.24.4.static_beta", "transformer.layers.24.4.static_alpha", "transformer.layers.24.4.dynamic_alpha_fn", "transformer.layers.24.4.dynamic_alpha_scale", "transformer.layers.24.4.dynamic_beta_fn", "transformer.layers.24.4.dynamic_beta_scale", "transformer.layers.24.4.norm.gamma", "transformer.layers.25.0.weight", "transformer.layers.25.1.layernorm_gamma", "transformer.layers.25.1.layerscale", "transformer.layers.25.1.fn.to_qkv.0.weight", "transformer.layers.25.1.fn.to_learned_value_residual.0.weight", "transformer.layers.25.1.fn.to_learned_value_residual.0.bias", "transformer.layers.25.1.fn.to_gates.0.weight", "transformer.layers.25.1.fn.to_out.1.weight", "transformer.layers.25.1.to_film.weight", "transformer.layers.25.1.to_film.bias", "transformer.layers.25.1.to_ada_ln_zero.weight", "transformer.layers.25.1.to_ada_ln_zero.bias", "transformer.layers.25.2.static_beta", "transformer.layers.25.2.static_alpha", "transformer.layers.25.2.dynamic_alpha_fn", "transformer.layers.25.2.dynamic_alpha_scale", "transformer.layers.25.2.dynamic_beta_fn", "transformer.layers.25.2.dynamic_beta_scale", "transformer.layers.25.2.norm.gamma", "transformer.layers.25.3.layernorm_gamma", "transformer.layers.25.3.layerscale", "transformer.layers.25.3.fn.0.weight", "transformer.layers.25.3.fn.0.bias", "transformer.layers.25.3.fn.3.weight", "transformer.layers.25.3.fn.3.bias", "transformer.layers.25.3.to_film.weight", "transformer.layers.25.3.to_film.bias", "transformer.layers.25.3.to_ada_ln_zero.weight", "transformer.layers.25.3.to_ada_ln_zero.bias", "transformer.layers.25.4.static_beta", "transformer.layers.25.4.static_alpha", "transformer.layers.25.4.dynamic_alpha_fn", "transformer.layers.25.4.dynamic_alpha_scale", "transformer.layers.25.4.dynamic_beta_fn", "transformer.layers.25.4.dynamic_beta_scale", "transformer.layers.25.4.norm.gamma", "transformer.layers.26.0.weight", "transformer.layers.26.1.layernorm_gamma", "transformer.layers.26.1.layerscale", "transformer.layers.26.1.fn.to_qkv.0.weight", "transformer.layers.26.1.fn.to_learned_value_residual.0.weight", "transformer.layers.26.1.fn.to_learned_value_residual.0.bias", "transformer.layers.26.1.fn.to_gates.0.weight", "transformer.layers.26.1.fn.to_out.1.weight", "transformer.layers.26.1.to_film.weight", "transformer.layers.26.1.to_film.bias", "transformer.layers.26.1.to_ada_ln_zero.weight", "transformer.layers.26.1.to_ada_ln_zero.bias", "transformer.layers.26.2.static_beta", "transformer.layers.26.2.static_alpha", "transformer.layers.26.2.dynamic_alpha_fn", "transformer.layers.26.2.dynamic_alpha_scale", "transformer.layers.26.2.dynamic_beta_fn", "transformer.layers.26.2.dynamic_beta_scale", "transformer.layers.26.2.norm.gamma", "transformer.layers.26.3.layernorm_gamma", "transformer.layers.26.3.layerscale", "transformer.layers.26.3.fn.0.weight", "transformer.layers.26.3.fn.0.bias", "transformer.layers.26.3.fn.3.weight", "transformer.layers.26.3.fn.3.bias", "transformer.layers.26.3.to_film.weight", "transformer.layers.26.3.to_film.bias", "transformer.layers.26.3.to_ada_ln_zero.weight", "transformer.layers.26.3.to_ada_ln_zero.bias", "transformer.layers.26.4.static_beta", "transformer.layers.26.4.static_alpha", "transformer.layers.26.4.dynamic_alpha_fn", "transformer.layers.26.4.dynamic_alpha_scale", "transformer.layers.26.4.dynamic_beta_fn", "transformer.layers.26.4.dynamic_beta_scale", "transformer.layers.26.4.norm.gamma", "transformer.layers.27.0.weight", "transformer.layers.27.1.layernorm_gamma", "transformer.layers.27.1.layerscale", "transformer.layers.27.1.fn.to_qkv.0.weight", "transformer.layers.27.1.fn.to_learned_value_residual.0.weight", "transformer.layers.27.1.fn.to_learned_value_residual.0.bias", "transformer.layers.27.1.fn.to_gates.0.weight", "transformer.layers.27.1.fn.to_out.1.weight", "transformer.layers.27.1.to_film.weight", "transformer.layers.27.1.to_film.bias", "transformer.layers.27.1.to_ada_ln_zero.weight", "transformer.layers.27.1.to_ada_ln_zero.bias", "transformer.layers.27.2.static_beta", "transformer.layers.27.2.static_alpha", "transformer.layers.27.2.dynamic_alpha_fn", "transformer.layers.27.2.dynamic_alpha_scale", "transformer.layers.27.2.dynamic_beta_fn", "transformer.layers.27.2.dynamic_beta_scale", "transformer.layers.27.2.norm.gamma", "transformer.layers.27.3.layernorm_gamma", "transformer.layers.27.3.layerscale", "transformer.layers.27.3.fn.0.weight", "transformer.layers.27.3.fn.0.bias", "transformer.layers.27.3.fn.3.weight", "transformer.layers.27.3.fn.3.bias", "transformer.layers.27.3.to_film.weight", "transformer.layers.27.3.to_film.bias", "transformer.layers.27.3.to_ada_ln_zero.weight", "transformer.layers.27.3.to_ada_ln_zero.bias", "transformer.layers.27.4.static_beta", "transformer.layers.27.4.static_alpha", "transformer.layers.27.4.dynamic_alpha_fn", "transformer.layers.27.4.dynamic_alpha_scale", "transformer.layers.27.4.dynamic_beta_fn", "transformer.layers.27.4.dynamic_beta_scale", "transformer.layers.27.4.norm.gamma", "latent_to_model_projs.0.0.weight", "latent_to_model_projs.0.0.bias", "model_to_latent_projs.0.1.weight", "model_to_latent_projs.0.1.bias". 
	size mismatch for transformer.to_time_cond.0.weights: copying a param with shape torch.Size([256]) from checkpoint, the shape in current model is torch.Size([640]).
	size mismatch for transformer.to_time_cond.1.weight: copying a param with shape torch.Size([2048, 513]) from checkpoint, the shape in current model is torch.Size([5120, 1281]).
	size mismatch for transformer.to_time_cond.1.bias: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([5120]).
	size mismatch for transformer.layers.0.1.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.0.1.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.0.1.fn.to_qkv.0.weight: copying a param with shape torch.Size([4608, 512]) from checkpoint, the shape in current model is torch.Size([3840, 1280]).
	size mismatch for transformer.layers.0.1.fn.to_gates.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.0.1.fn.to_out.1.weight: copying a param with shape torch.Size([512, 1536]) from checkpoint, the shape in current model is torch.Size([1280, 1280]).
	size mismatch for transformer.layers.0.1.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.0.1.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.0.1.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.0.1.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.0.2.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.0.2.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.0.2.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.0.3.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.0.3.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.0.3.fn.0.weight: copying a param with shape torch.Size([2730, 512]) from checkpoint, the shape in current model is torch.Size([3412, 1280]).
	size mismatch for transformer.layers.0.3.fn.0.bias: copying a param with shape torch.Size([2730]) from checkpoint, the shape in current model is torch.Size([3412]).
	size mismatch for transformer.layers.0.3.fn.3.weight: copying a param with shape torch.Size([512, 1365]) from checkpoint, the shape in current model is torch.Size([1280, 1706]).
	size mismatch for transformer.layers.0.3.fn.3.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.0.3.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.0.3.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.0.3.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.0.3.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.0.4.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.0.4.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.0.4.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.1.1.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.1.1.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.1.1.fn.to_qkv.0.weight: copying a param with shape torch.Size([4608, 512]) from checkpoint, the shape in current model is torch.Size([3840, 1280]).
	size mismatch for transformer.layers.1.1.fn.to_learned_value_residual.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.1.1.fn.to_learned_value_residual.0.bias: copying a param with shape torch.Size([24]) from checkpoint, the shape in current model is torch.Size([16]).
	size mismatch for transformer.layers.1.1.fn.to_gates.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.1.1.fn.to_out.1.weight: copying a param with shape torch.Size([512, 1536]) from checkpoint, the shape in current model is torch.Size([1280, 1280]).
	size mismatch for transformer.layers.1.1.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.1.1.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.1.1.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.1.1.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.1.2.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.1.2.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.1.2.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.1.3.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.1.3.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.1.3.fn.0.weight: copying a param with shape torch.Size([2730, 512]) from checkpoint, the shape in current model is torch.Size([3412, 1280]).
	size mismatch for transformer.layers.1.3.fn.0.bias: copying a param with shape torch.Size([2730]) from checkpoint, the shape in current model is torch.Size([3412]).
	size mismatch for transformer.layers.1.3.fn.3.weight: copying a param with shape torch.Size([512, 1365]) from checkpoint, the shape in current model is torch.Size([1280, 1706]).
	size mismatch for transformer.layers.1.3.fn.3.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.1.3.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.1.3.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.1.3.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.1.3.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.1.4.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.1.4.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.1.4.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.2.1.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.2.1.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.2.1.fn.to_qkv.0.weight: copying a param with shape torch.Size([4608, 512]) from checkpoint, the shape in current model is torch.Size([3840, 1280]).
	size mismatch for transformer.layers.2.1.fn.to_learned_value_residual.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.2.1.fn.to_learned_value_residual.0.bias: copying a param with shape torch.Size([24]) from checkpoint, the shape in current model is torch.Size([16]).
	size mismatch for transformer.layers.2.1.fn.to_gates.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.2.1.fn.to_out.1.weight: copying a param with shape torch.Size([512, 1536]) from checkpoint, the shape in current model is torch.Size([1280, 1280]).
	size mismatch for transformer.layers.2.1.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.2.1.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.2.1.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.2.1.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.2.2.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.2.2.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.2.2.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.2.3.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.2.3.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.2.3.fn.0.weight: copying a param with shape torch.Size([2730, 512]) from checkpoint, the shape in current model is torch.Size([3412, 1280]).
	size mismatch for transformer.layers.2.3.fn.0.bias: copying a param with shape torch.Size([2730]) from checkpoint, the shape in current model is torch.Size([3412]).
	size mismatch for transformer.layers.2.3.fn.3.weight: copying a param with shape torch.Size([512, 1365]) from checkpoint, the shape in current model is torch.Size([1280, 1706]).
	size mismatch for transformer.layers.2.3.fn.3.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.2.3.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.2.3.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.2.3.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.2.3.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.2.4.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.2.4.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.2.4.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.3.1.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.3.1.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.3.1.fn.to_qkv.0.weight: copying a param with shape torch.Size([4608, 512]) from checkpoint, the shape in current model is torch.Size([3840, 1280]).
	size mismatch for transformer.layers.3.1.fn.to_learned_value_residual.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.3.1.fn.to_learned_value_residual.0.bias: copying a param with shape torch.Size([24]) from checkpoint, the shape in current model is torch.Size([16]).
	size mismatch for transformer.layers.3.1.fn.to_gates.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.3.1.fn.to_out.1.weight: copying a param with shape torch.Size([512, 1536]) from checkpoint, the shape in current model is torch.Size([1280, 1280]).
	size mismatch for transformer.layers.3.1.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.3.1.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.3.1.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.3.1.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.3.2.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.3.2.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.3.2.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.3.3.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.3.3.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.3.3.fn.0.weight: copying a param with shape torch.Size([2730, 512]) from checkpoint, the shape in current model is torch.Size([3412, 1280]).
	size mismatch for transformer.layers.3.3.fn.0.bias: copying a param with shape torch.Size([2730]) from checkpoint, the shape in current model is torch.Size([3412]).
	size mismatch for transformer.layers.3.3.fn.3.weight: copying a param with shape torch.Size([512, 1365]) from checkpoint, the shape in current model is torch.Size([1280, 1706]).
	size mismatch for transformer.layers.3.3.fn.3.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.3.3.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.3.3.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.3.3.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.3.3.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.3.4.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.3.4.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.3.4.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.4.1.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.4.1.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.4.1.fn.to_qkv.0.weight: copying a param with shape torch.Size([4608, 512]) from checkpoint, the shape in current model is torch.Size([3840, 1280]).
	size mismatch for transformer.layers.4.1.fn.to_learned_value_residual.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.4.1.fn.to_learned_value_residual.0.bias: copying a param with shape torch.Size([24]) from checkpoint, the shape in current model is torch.Size([16]).
	size mismatch for transformer.layers.4.1.fn.to_gates.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.4.1.fn.to_out.1.weight: copying a param with shape torch.Size([512, 1536]) from checkpoint, the shape in current model is torch.Size([1280, 1280]).
	size mismatch for transformer.layers.4.1.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.4.1.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.4.1.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.4.1.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.4.2.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.4.2.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.4.2.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.4.3.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.4.3.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.4.3.fn.0.weight: copying a param with shape torch.Size([2730, 512]) from checkpoint, the shape in current model is torch.Size([3412, 1280]).
	size mismatch for transformer.layers.4.3.fn.0.bias: copying a param with shape torch.Size([2730]) from checkpoint, the shape in current model is torch.Size([3412]).
	size mismatch for transformer.layers.4.3.fn.3.weight: copying a param with shape torch.Size([512, 1365]) from checkpoint, the shape in current model is torch.Size([1280, 1706]).
	size mismatch for transformer.layers.4.3.fn.3.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.4.3.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.4.3.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.4.3.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.4.3.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.4.4.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.4.4.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.4.4.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.5.1.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.5.1.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.5.1.fn.to_qkv.0.weight: copying a param with shape torch.Size([4608, 512]) from checkpoint, the shape in current model is torch.Size([3840, 1280]).
	size mismatch for transformer.layers.5.1.fn.to_learned_value_residual.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.5.1.fn.to_learned_value_residual.0.bias: copying a param with shape torch.Size([24]) from checkpoint, the shape in current model is torch.Size([16]).
	size mismatch for transformer.layers.5.1.fn.to_gates.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.5.1.fn.to_out.1.weight: copying a param with shape torch.Size([512, 1536]) from checkpoint, the shape in current model is torch.Size([1280, 1280]).
	size mismatch for transformer.layers.5.1.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.5.1.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.5.1.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.5.1.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.5.2.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.5.2.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.5.2.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.5.3.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.5.3.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.5.3.fn.0.weight: copying a param with shape torch.Size([2730, 512]) from checkpoint, the shape in current model is torch.Size([3412, 1280]).
	size mismatch for transformer.layers.5.3.fn.0.bias: copying a param with shape torch.Size([2730]) from checkpoint, the shape in current model is torch.Size([3412]).
	size mismatch for transformer.layers.5.3.fn.3.weight: copying a param with shape torch.Size([512, 1365]) from checkpoint, the shape in current model is torch.Size([1280, 1706]).
	size mismatch for transformer.layers.5.3.fn.3.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.5.3.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.5.3.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.5.3.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.5.3.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.5.4.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.5.4.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.5.4.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.6.1.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.6.1.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.6.1.fn.to_qkv.0.weight: copying a param with shape torch.Size([4608, 512]) from checkpoint, the shape in current model is torch.Size([3840, 1280]).
	size mismatch for transformer.layers.6.1.fn.to_learned_value_residual.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.6.1.fn.to_learned_value_residual.0.bias: copying a param with shape torch.Size([24]) from checkpoint, the shape in current model is torch.Size([16]).
	size mismatch for transformer.layers.6.1.fn.to_gates.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.6.1.fn.to_out.1.weight: copying a param with shape torch.Size([512, 1536]) from checkpoint, the shape in current model is torch.Size([1280, 1280]).
	size mismatch for transformer.layers.6.1.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.6.1.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.6.1.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.6.1.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.6.2.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.6.2.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.6.2.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.6.3.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.6.3.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.6.3.fn.0.weight: copying a param with shape torch.Size([2730, 512]) from checkpoint, the shape in current model is torch.Size([3412, 1280]).
	size mismatch for transformer.layers.6.3.fn.0.bias: copying a param with shape torch.Size([2730]) from checkpoint, the shape in current model is torch.Size([3412]).
	size mismatch for transformer.layers.6.3.fn.3.weight: copying a param with shape torch.Size([512, 1365]) from checkpoint, the shape in current model is torch.Size([1280, 1706]).
	size mismatch for transformer.layers.6.3.fn.3.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.6.3.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.6.3.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.6.3.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.6.3.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.6.4.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.6.4.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.6.4.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.7.1.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.7.1.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.7.1.fn.to_qkv.0.weight: copying a param with shape torch.Size([4608, 512]) from checkpoint, the shape in current model is torch.Size([3840, 1280]).
	size mismatch for transformer.layers.7.1.fn.to_learned_value_residual.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.7.1.fn.to_learned_value_residual.0.bias: copying a param with shape torch.Size([24]) from checkpoint, the shape in current model is torch.Size([16]).
	size mismatch for transformer.layers.7.1.fn.to_gates.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.7.1.fn.to_out.1.weight: copying a param with shape torch.Size([512, 1536]) from checkpoint, the shape in current model is torch.Size([1280, 1280]).
	size mismatch for transformer.layers.7.1.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.7.1.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.7.1.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.7.1.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.7.2.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.7.2.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.7.2.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.7.3.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.7.3.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.7.3.fn.0.weight: copying a param with shape torch.Size([2730, 512]) from checkpoint, the shape in current model is torch.Size([3412, 1280]).
	size mismatch for transformer.layers.7.3.fn.0.bias: copying a param with shape torch.Size([2730]) from checkpoint, the shape in current model is torch.Size([3412]).
	size mismatch for transformer.layers.7.3.fn.3.weight: copying a param with shape torch.Size([512, 1365]) from checkpoint, the shape in current model is torch.Size([1280, 1706]).
	size mismatch for transformer.layers.7.3.fn.3.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.7.3.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.7.3.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.7.3.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.7.3.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.7.4.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.7.4.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.7.4.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.8.1.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.8.1.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.8.1.fn.to_qkv.0.weight: copying a param with shape torch.Size([4608, 512]) from checkpoint, the shape in current model is torch.Size([3840, 1280]).
	size mismatch for transformer.layers.8.1.fn.to_learned_value_residual.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.8.1.fn.to_learned_value_residual.0.bias: copying a param with shape torch.Size([24]) from checkpoint, the shape in current model is torch.Size([16]).
	size mismatch for transformer.layers.8.1.fn.to_gates.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.8.1.fn.to_out.1.weight: copying a param with shape torch.Size([512, 1536]) from checkpoint, the shape in current model is torch.Size([1280, 1280]).
	size mismatch for transformer.layers.8.1.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.8.1.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.8.1.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.8.1.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.8.2.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.8.2.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.8.2.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.8.3.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.8.3.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.8.3.fn.0.weight: copying a param with shape torch.Size([2730, 512]) from checkpoint, the shape in current model is torch.Size([3412, 1280]).
	size mismatch for transformer.layers.8.3.fn.0.bias: copying a param with shape torch.Size([2730]) from checkpoint, the shape in current model is torch.Size([3412]).
	size mismatch for transformer.layers.8.3.fn.3.weight: copying a param with shape torch.Size([512, 1365]) from checkpoint, the shape in current model is torch.Size([1280, 1706]).
	size mismatch for transformer.layers.8.3.fn.3.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.8.3.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.8.3.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.8.3.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.8.3.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.8.4.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.8.4.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.8.4.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.9.1.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.9.1.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.9.1.fn.to_qkv.0.weight: copying a param with shape torch.Size([4608, 512]) from checkpoint, the shape in current model is torch.Size([3840, 1280]).
	size mismatch for transformer.layers.9.1.fn.to_learned_value_residual.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.9.1.fn.to_learned_value_residual.0.bias: copying a param with shape torch.Size([24]) from checkpoint, the shape in current model is torch.Size([16]).
	size mismatch for transformer.layers.9.1.fn.to_gates.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.9.1.fn.to_out.1.weight: copying a param with shape torch.Size([512, 1536]) from checkpoint, the shape in current model is torch.Size([1280, 1280]).
	size mismatch for transformer.layers.9.1.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.9.1.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.9.1.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.9.1.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.9.2.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.9.2.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.9.2.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.9.3.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.9.3.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.9.3.fn.0.weight: copying a param with shape torch.Size([2730, 512]) from checkpoint, the shape in current model is torch.Size([3412, 1280]).
	size mismatch for transformer.layers.9.3.fn.0.bias: copying a param with shape torch.Size([2730]) from checkpoint, the shape in current model is torch.Size([3412]).
	size mismatch for transformer.layers.9.3.fn.3.weight: copying a param with shape torch.Size([512, 1365]) from checkpoint, the shape in current model is torch.Size([1280, 1706]).
	size mismatch for transformer.layers.9.3.fn.3.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.9.3.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.9.3.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.9.3.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.9.3.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.9.4.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.9.4.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.9.4.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.10.1.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.10.1.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.10.1.fn.to_qkv.0.weight: copying a param with shape torch.Size([4608, 512]) from checkpoint, the shape in current model is torch.Size([3840, 1280]).
	size mismatch for transformer.layers.10.1.fn.to_learned_value_residual.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.10.1.fn.to_learned_value_residual.0.bias: copying a param with shape torch.Size([24]) from checkpoint, the shape in current model is torch.Size([16]).
	size mismatch for transformer.layers.10.1.fn.to_gates.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.10.1.fn.to_out.1.weight: copying a param with shape torch.Size([512, 1536]) from checkpoint, the shape in current model is torch.Size([1280, 1280]).
	size mismatch for transformer.layers.10.1.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.10.1.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.10.1.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.10.1.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.10.2.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.10.2.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.10.2.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.10.3.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.10.3.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.10.3.fn.0.weight: copying a param with shape torch.Size([2730, 512]) from checkpoint, the shape in current model is torch.Size([3412, 1280]).
	size mismatch for transformer.layers.10.3.fn.0.bias: copying a param with shape torch.Size([2730]) from checkpoint, the shape in current model is torch.Size([3412]).
	size mismatch for transformer.layers.10.3.fn.3.weight: copying a param with shape torch.Size([512, 1365]) from checkpoint, the shape in current model is torch.Size([1280, 1706]).
	size mismatch for transformer.layers.10.3.fn.3.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.10.3.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.10.3.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.10.3.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.10.3.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.10.4.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.10.4.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.10.4.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.11.1.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.11.1.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.11.1.fn.to_qkv.0.weight: copying a param with shape torch.Size([4608, 512]) from checkpoint, the shape in current model is torch.Size([3840, 1280]).
	size mismatch for transformer.layers.11.1.fn.to_learned_value_residual.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.11.1.fn.to_learned_value_residual.0.bias: copying a param with shape torch.Size([24]) from checkpoint, the shape in current model is torch.Size([16]).
	size mismatch for transformer.layers.11.1.fn.to_gates.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.11.1.fn.to_out.1.weight: copying a param with shape torch.Size([512, 1536]) from checkpoint, the shape in current model is torch.Size([1280, 1280]).
	size mismatch for transformer.layers.11.1.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.11.1.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.11.1.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.11.1.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.11.2.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.11.2.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.11.2.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.11.3.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.11.3.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.11.3.fn.0.weight: copying a param with shape torch.Size([2730, 512]) from checkpoint, the shape in current model is torch.Size([3412, 1280]).
	size mismatch for transformer.layers.11.3.fn.0.bias: copying a param with shape torch.Size([2730]) from checkpoint, the shape in current model is torch.Size([3412]).
	size mismatch for transformer.layers.11.3.fn.3.weight: copying a param with shape torch.Size([512, 1365]) from checkpoint, the shape in current model is torch.Size([1280, 1706]).
	size mismatch for transformer.layers.11.3.fn.3.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.11.3.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.11.3.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.11.3.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.11.3.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.11.4.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.11.4.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.11.4.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.12.1.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.12.1.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.12.1.fn.to_qkv.0.weight: copying a param with shape torch.Size([4608, 512]) from checkpoint, the shape in current model is torch.Size([3840, 1280]).
	size mismatch for transformer.layers.12.1.fn.to_learned_value_residual.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.12.1.fn.to_learned_value_residual.0.bias: copying a param with shape torch.Size([24]) from checkpoint, the shape in current model is torch.Size([16]).
	size mismatch for transformer.layers.12.1.fn.to_gates.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.12.1.fn.to_out.1.weight: copying a param with shape torch.Size([512, 1536]) from checkpoint, the shape in current model is torch.Size([1280, 1280]).
	size mismatch for transformer.layers.12.1.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.12.1.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.12.1.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.12.1.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.12.2.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.12.2.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.12.2.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.12.3.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.12.3.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.12.3.fn.0.weight: copying a param with shape torch.Size([2730, 512]) from checkpoint, the shape in current model is torch.Size([3412, 1280]).
	size mismatch for transformer.layers.12.3.fn.0.bias: copying a param with shape torch.Size([2730]) from checkpoint, the shape in current model is torch.Size([3412]).
	size mismatch for transformer.layers.12.3.fn.3.weight: copying a param with shape torch.Size([512, 1365]) from checkpoint, the shape in current model is torch.Size([1280, 1706]).
	size mismatch for transformer.layers.12.3.fn.3.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.12.3.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.12.3.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.12.3.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.12.3.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.12.4.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.12.4.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.12.4.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.13.1.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.13.1.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.13.1.fn.to_qkv.0.weight: copying a param with shape torch.Size([4608, 512]) from checkpoint, the shape in current model is torch.Size([3840, 1280]).
	size mismatch for transformer.layers.13.1.fn.to_learned_value_residual.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.13.1.fn.to_learned_value_residual.0.bias: copying a param with shape torch.Size([24]) from checkpoint, the shape in current model is torch.Size([16]).
	size mismatch for transformer.layers.13.1.fn.to_gates.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.13.1.fn.to_out.1.weight: copying a param with shape torch.Size([512, 1536]) from checkpoint, the shape in current model is torch.Size([1280, 1280]).
	size mismatch for transformer.layers.13.1.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.13.1.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.13.1.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.13.1.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.13.2.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.13.2.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.13.2.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.13.3.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.13.3.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.13.3.fn.0.weight: copying a param with shape torch.Size([2730, 512]) from checkpoint, the shape in current model is torch.Size([3412, 1280]).
	size mismatch for transformer.layers.13.3.fn.0.bias: copying a param with shape torch.Size([2730]) from checkpoint, the shape in current model is torch.Size([3412]).
	size mismatch for transformer.layers.13.3.fn.3.weight: copying a param with shape torch.Size([512, 1365]) from checkpoint, the shape in current model is torch.Size([1280, 1706]).
	size mismatch for transformer.layers.13.3.fn.3.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.13.3.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.13.3.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.13.3.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.13.3.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.13.4.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.13.4.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.13.4.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.14.0.weight: copying a param with shape torch.Size([512, 1024]) from checkpoint, the shape in current model is torch.Size([1280, 2560]).
	size mismatch for transformer.layers.14.1.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.14.1.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.14.1.fn.to_qkv.0.weight: copying a param with shape torch.Size([4608, 512]) from checkpoint, the shape in current model is torch.Size([3840, 1280]).
	size mismatch for transformer.layers.14.1.fn.to_learned_value_residual.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.14.1.fn.to_learned_value_residual.0.bias: copying a param with shape torch.Size([24]) from checkpoint, the shape in current model is torch.Size([16]).
	size mismatch for transformer.layers.14.1.fn.to_gates.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.14.1.fn.to_out.1.weight: copying a param with shape torch.Size([512, 1536]) from checkpoint, the shape in current model is torch.Size([1280, 1280]).
	size mismatch for transformer.layers.14.1.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.14.1.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.14.1.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.14.1.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.14.2.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.14.2.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.14.2.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.14.3.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.14.3.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.14.3.fn.0.weight: copying a param with shape torch.Size([2730, 512]) from checkpoint, the shape in current model is torch.Size([3412, 1280]).
	size mismatch for transformer.layers.14.3.fn.0.bias: copying a param with shape torch.Size([2730]) from checkpoint, the shape in current model is torch.Size([3412]).
	size mismatch for transformer.layers.14.3.fn.3.weight: copying a param with shape torch.Size([512, 1365]) from checkpoint, the shape in current model is torch.Size([1280, 1706]).
	size mismatch for transformer.layers.14.3.fn.3.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.14.3.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.14.3.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.14.3.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.14.3.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.14.4.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.14.4.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.14.4.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.15.0.weight: copying a param with shape torch.Size([512, 1024]) from checkpoint, the shape in current model is torch.Size([1280, 2560]).
	size mismatch for transformer.layers.15.1.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.15.1.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.15.1.fn.to_qkv.0.weight: copying a param with shape torch.Size([4608, 512]) from checkpoint, the shape in current model is torch.Size([3840, 1280]).
	size mismatch for transformer.layers.15.1.fn.to_learned_value_residual.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.15.1.fn.to_learned_value_residual.0.bias: copying a param with shape torch.Size([24]) from checkpoint, the shape in current model is torch.Size([16]).
	size mismatch for transformer.layers.15.1.fn.to_gates.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.15.1.fn.to_out.1.weight: copying a param with shape torch.Size([512, 1536]) from checkpoint, the shape in current model is torch.Size([1280, 1280]).
	size mismatch for transformer.layers.15.1.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.15.1.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.15.1.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.15.1.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.15.2.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.15.2.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.15.2.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.15.3.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.15.3.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.15.3.fn.0.weight: copying a param with shape torch.Size([2730, 512]) from checkpoint, the shape in current model is torch.Size([3412, 1280]).
	size mismatch for transformer.layers.15.3.fn.0.bias: copying a param with shape torch.Size([2730]) from checkpoint, the shape in current model is torch.Size([3412]).
	size mismatch for transformer.layers.15.3.fn.3.weight: copying a param with shape torch.Size([512, 1365]) from checkpoint, the shape in current model is torch.Size([1280, 1706]).
	size mismatch for transformer.layers.15.3.fn.3.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.15.3.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.15.3.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.15.3.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.15.3.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.15.4.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.15.4.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.15.4.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.16.0.weight: copying a param with shape torch.Size([512, 1024]) from checkpoint, the shape in current model is torch.Size([1280, 2560]).
	size mismatch for transformer.layers.16.1.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.16.1.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.16.1.fn.to_qkv.0.weight: copying a param with shape torch.Size([4608, 512]) from checkpoint, the shape in current model is torch.Size([3840, 1280]).
	size mismatch for transformer.layers.16.1.fn.to_learned_value_residual.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.16.1.fn.to_learned_value_residual.0.bias: copying a param with shape torch.Size([24]) from checkpoint, the shape in current model is torch.Size([16]).
	size mismatch for transformer.layers.16.1.fn.to_gates.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.16.1.fn.to_out.1.weight: copying a param with shape torch.Size([512, 1536]) from checkpoint, the shape in current model is torch.Size([1280, 1280]).
	size mismatch for transformer.layers.16.1.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.16.1.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.16.1.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.16.1.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.16.2.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.16.2.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.16.2.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.16.3.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.16.3.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.16.3.fn.0.weight: copying a param with shape torch.Size([2730, 512]) from checkpoint, the shape in current model is torch.Size([3412, 1280]).
	size mismatch for transformer.layers.16.3.fn.0.bias: copying a param with shape torch.Size([2730]) from checkpoint, the shape in current model is torch.Size([3412]).
	size mismatch for transformer.layers.16.3.fn.3.weight: copying a param with shape torch.Size([512, 1365]) from checkpoint, the shape in current model is torch.Size([1280, 1706]).
	size mismatch for transformer.layers.16.3.fn.3.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.16.3.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.16.3.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.16.3.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.16.3.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.16.4.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.16.4.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.16.4.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.17.0.weight: copying a param with shape torch.Size([512, 1024]) from checkpoint, the shape in current model is torch.Size([1280, 2560]).
	size mismatch for transformer.layers.17.1.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.17.1.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.17.1.fn.to_qkv.0.weight: copying a param with shape torch.Size([4608, 512]) from checkpoint, the shape in current model is torch.Size([3840, 1280]).
	size mismatch for transformer.layers.17.1.fn.to_learned_value_residual.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.17.1.fn.to_learned_value_residual.0.bias: copying a param with shape torch.Size([24]) from checkpoint, the shape in current model is torch.Size([16]).
	size mismatch for transformer.layers.17.1.fn.to_gates.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.17.1.fn.to_out.1.weight: copying a param with shape torch.Size([512, 1536]) from checkpoint, the shape in current model is torch.Size([1280, 1280]).
	size mismatch for transformer.layers.17.1.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.17.1.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.17.1.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.17.1.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.17.2.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.17.2.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.17.2.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.17.3.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.17.3.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.17.3.fn.0.weight: copying a param with shape torch.Size([2730, 512]) from checkpoint, the shape in current model is torch.Size([3412, 1280]).
	size mismatch for transformer.layers.17.3.fn.0.bias: copying a param with shape torch.Size([2730]) from checkpoint, the shape in current model is torch.Size([3412]).
	size mismatch for transformer.layers.17.3.fn.3.weight: copying a param with shape torch.Size([512, 1365]) from checkpoint, the shape in current model is torch.Size([1280, 1706]).
	size mismatch for transformer.layers.17.3.fn.3.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.17.3.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.17.3.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.17.3.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.17.3.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.17.4.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.17.4.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.17.4.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.18.0.weight: copying a param with shape torch.Size([512, 1024]) from checkpoint, the shape in current model is torch.Size([1280, 2560]).
	size mismatch for transformer.layers.18.1.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.18.1.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.18.1.fn.to_qkv.0.weight: copying a param with shape torch.Size([4608, 512]) from checkpoint, the shape in current model is torch.Size([3840, 1280]).
	size mismatch for transformer.layers.18.1.fn.to_learned_value_residual.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.18.1.fn.to_learned_value_residual.0.bias: copying a param with shape torch.Size([24]) from checkpoint, the shape in current model is torch.Size([16]).
	size mismatch for transformer.layers.18.1.fn.to_gates.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.18.1.fn.to_out.1.weight: copying a param with shape torch.Size([512, 1536]) from checkpoint, the shape in current model is torch.Size([1280, 1280]).
	size mismatch for transformer.layers.18.1.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.18.1.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.18.1.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.18.1.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.18.2.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.18.2.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.18.2.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.18.3.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.18.3.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.18.3.fn.0.weight: copying a param with shape torch.Size([2730, 512]) from checkpoint, the shape in current model is torch.Size([3412, 1280]).
	size mismatch for transformer.layers.18.3.fn.0.bias: copying a param with shape torch.Size([2730]) from checkpoint, the shape in current model is torch.Size([3412]).
	size mismatch for transformer.layers.18.3.fn.3.weight: copying a param with shape torch.Size([512, 1365]) from checkpoint, the shape in current model is torch.Size([1280, 1706]).
	size mismatch for transformer.layers.18.3.fn.3.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.18.3.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.18.3.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.18.3.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.18.3.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.18.4.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.18.4.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.18.4.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.19.0.weight: copying a param with shape torch.Size([512, 1024]) from checkpoint, the shape in current model is torch.Size([1280, 2560]).
	size mismatch for transformer.layers.19.1.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.19.1.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.19.1.fn.to_qkv.0.weight: copying a param with shape torch.Size([4608, 512]) from checkpoint, the shape in current model is torch.Size([3840, 1280]).
	size mismatch for transformer.layers.19.1.fn.to_learned_value_residual.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.19.1.fn.to_learned_value_residual.0.bias: copying a param with shape torch.Size([24]) from checkpoint, the shape in current model is torch.Size([16]).
	size mismatch for transformer.layers.19.1.fn.to_gates.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.19.1.fn.to_out.1.weight: copying a param with shape torch.Size([512, 1536]) from checkpoint, the shape in current model is torch.Size([1280, 1280]).
	size mismatch for transformer.layers.19.1.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.19.1.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.19.1.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.19.1.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.19.2.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.19.2.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.19.2.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.19.3.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.19.3.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.19.3.fn.0.weight: copying a param with shape torch.Size([2730, 512]) from checkpoint, the shape in current model is torch.Size([3412, 1280]).
	size mismatch for transformer.layers.19.3.fn.0.bias: copying a param with shape torch.Size([2730]) from checkpoint, the shape in current model is torch.Size([3412]).
	size mismatch for transformer.layers.19.3.fn.3.weight: copying a param with shape torch.Size([512, 1365]) from checkpoint, the shape in current model is torch.Size([1280, 1706]).
	size mismatch for transformer.layers.19.3.fn.3.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.19.3.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.19.3.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.19.3.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.19.3.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.19.4.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.19.4.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.19.4.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.20.0.weight: copying a param with shape torch.Size([512, 1024]) from checkpoint, the shape in current model is torch.Size([1280, 2560]).
	size mismatch for transformer.layers.20.1.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.20.1.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.20.1.fn.to_qkv.0.weight: copying a param with shape torch.Size([4608, 512]) from checkpoint, the shape in current model is torch.Size([3840, 1280]).
	size mismatch for transformer.layers.20.1.fn.to_learned_value_residual.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.20.1.fn.to_learned_value_residual.0.bias: copying a param with shape torch.Size([24]) from checkpoint, the shape in current model is torch.Size([16]).
	size mismatch for transformer.layers.20.1.fn.to_gates.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.20.1.fn.to_out.1.weight: copying a param with shape torch.Size([512, 1536]) from checkpoint, the shape in current model is torch.Size([1280, 1280]).
	size mismatch for transformer.layers.20.1.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.20.1.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.20.1.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.20.1.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.20.2.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.20.2.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.20.2.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.20.3.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.20.3.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.20.3.fn.0.weight: copying a param with shape torch.Size([2730, 512]) from checkpoint, the shape in current model is torch.Size([3412, 1280]).
	size mismatch for transformer.layers.20.3.fn.0.bias: copying a param with shape torch.Size([2730]) from checkpoint, the shape in current model is torch.Size([3412]).
	size mismatch for transformer.layers.20.3.fn.3.weight: copying a param with shape torch.Size([512, 1365]) from checkpoint, the shape in current model is torch.Size([1280, 1706]).
	size mismatch for transformer.layers.20.3.fn.3.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.20.3.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.20.3.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.20.3.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.20.3.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.20.4.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.20.4.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.20.4.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.21.0.weight: copying a param with shape torch.Size([512, 1024]) from checkpoint, the shape in current model is torch.Size([1280, 2560]).
	size mismatch for transformer.layers.21.1.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.21.1.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.21.1.fn.to_qkv.0.weight: copying a param with shape torch.Size([4608, 512]) from checkpoint, the shape in current model is torch.Size([3840, 1280]).
	size mismatch for transformer.layers.21.1.fn.to_learned_value_residual.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.21.1.fn.to_learned_value_residual.0.bias: copying a param with shape torch.Size([24]) from checkpoint, the shape in current model is torch.Size([16]).
	size mismatch for transformer.layers.21.1.fn.to_gates.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.21.1.fn.to_out.1.weight: copying a param with shape torch.Size([512, 1536]) from checkpoint, the shape in current model is torch.Size([1280, 1280]).
	size mismatch for transformer.layers.21.1.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.21.1.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.21.1.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.21.1.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.21.2.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.21.2.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.21.2.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.21.3.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.21.3.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.21.3.fn.0.weight: copying a param with shape torch.Size([2730, 512]) from checkpoint, the shape in current model is torch.Size([3412, 1280]).
	size mismatch for transformer.layers.21.3.fn.0.bias: copying a param with shape torch.Size([2730]) from checkpoint, the shape in current model is torch.Size([3412]).
	size mismatch for transformer.layers.21.3.fn.3.weight: copying a param with shape torch.Size([512, 1365]) from checkpoint, the shape in current model is torch.Size([1280, 1706]).
	size mismatch for transformer.layers.21.3.fn.3.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.21.3.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.21.3.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.21.3.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.21.3.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.21.4.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.21.4.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.21.4.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.22.0.weight: copying a param with shape torch.Size([512, 1024]) from checkpoint, the shape in current model is torch.Size([1280, 2560]).
	size mismatch for transformer.layers.22.1.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.22.1.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.22.1.fn.to_qkv.0.weight: copying a param with shape torch.Size([4608, 512]) from checkpoint, the shape in current model is torch.Size([3840, 1280]).
	size mismatch for transformer.layers.22.1.fn.to_learned_value_residual.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.22.1.fn.to_learned_value_residual.0.bias: copying a param with shape torch.Size([24]) from checkpoint, the shape in current model is torch.Size([16]).
	size mismatch for transformer.layers.22.1.fn.to_gates.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.22.1.fn.to_out.1.weight: copying a param with shape torch.Size([512, 1536]) from checkpoint, the shape in current model is torch.Size([1280, 1280]).
	size mismatch for transformer.layers.22.1.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.22.1.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.22.1.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.22.1.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.22.2.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.22.2.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.22.2.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.22.3.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.22.3.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.22.3.fn.0.weight: copying a param with shape torch.Size([2730, 512]) from checkpoint, the shape in current model is torch.Size([3412, 1280]).
	size mismatch for transformer.layers.22.3.fn.0.bias: copying a param with shape torch.Size([2730]) from checkpoint, the shape in current model is torch.Size([3412]).
	size mismatch for transformer.layers.22.3.fn.3.weight: copying a param with shape torch.Size([512, 1365]) from checkpoint, the shape in current model is torch.Size([1280, 1706]).
	size mismatch for transformer.layers.22.3.fn.3.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.22.3.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.22.3.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.22.3.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.22.3.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.22.4.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.22.4.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.22.4.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.23.0.weight: copying a param with shape torch.Size([512, 1024]) from checkpoint, the shape in current model is torch.Size([1280, 2560]).
	size mismatch for transformer.layers.23.1.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.23.1.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.23.1.fn.to_qkv.0.weight: copying a param with shape torch.Size([4608, 512]) from checkpoint, the shape in current model is torch.Size([3840, 1280]).
	size mismatch for transformer.layers.23.1.fn.to_learned_value_residual.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.23.1.fn.to_learned_value_residual.0.bias: copying a param with shape torch.Size([24]) from checkpoint, the shape in current model is torch.Size([16]).
	size mismatch for transformer.layers.23.1.fn.to_gates.0.weight: copying a param with shape torch.Size([24, 512]) from checkpoint, the shape in current model is torch.Size([16, 1280]).
	size mismatch for transformer.layers.23.1.fn.to_out.1.weight: copying a param with shape torch.Size([512, 1536]) from checkpoint, the shape in current model is torch.Size([1280, 1280]).
	size mismatch for transformer.layers.23.1.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.23.1.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.23.1.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.23.1.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.23.2.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.23.2.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.23.2.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.23.3.layernorm_gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.23.3.layerscale: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.23.3.fn.0.weight: copying a param with shape torch.Size([2730, 512]) from checkpoint, the shape in current model is torch.Size([3412, 1280]).
	size mismatch for transformer.layers.23.3.fn.0.bias: copying a param with shape torch.Size([2730]) from checkpoint, the shape in current model is torch.Size([3412]).
	size mismatch for transformer.layers.23.3.fn.3.weight: copying a param with shape torch.Size([512, 1365]) from checkpoint, the shape in current model is torch.Size([1280, 1706]).
	size mismatch for transformer.layers.23.3.fn.3.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.23.3.to_film.weight: copying a param with shape torch.Size([1024, 2048]) from checkpoint, the shape in current model is torch.Size([2560, 5120]).
	size mismatch for transformer.layers.23.3.to_film.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([2560]).
	size mismatch for transformer.layers.23.3.to_ada_ln_zero.weight: copying a param with shape torch.Size([512, 2048]) from checkpoint, the shape in current model is torch.Size([1280, 5120]).
	size mismatch for transformer.layers.23.3.to_ada_ln_zero.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.23.4.dynamic_alpha_fn: copying a param with shape torch.Size([512, 5]) from checkpoint, the shape in current model is torch.Size([1280, 5]).
	size mismatch for transformer.layers.23.4.dynamic_beta_fn: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.layers.23.4.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for transformer.norm.gamma: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1280]).
	size mismatch for rotary_emb.freqs: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([40]).
	size mismatch for text_embed.weight: copying a param with shape torch.Size([389, 512]) from checkpoint, the shape in current model is torch.Size([389, 1280]).
	size mismatch for to_text_logits.weight: copying a param with shape torch.Size([389, 512]) from checkpoint, the shape in current model is torch.Size([389, 1280]).