In [1]:
from diffusers import AutoencoderKL, UNet2DConditionModel, DDPMScheduler
import torch

In [2]:
# Dummy inputs used only to probe the UNet's input/output shapes.
# torch.Tensor(*sizes) returns *uninitialized* memory (arbitrary values, possibly
# NaN/Inf, different on every run), so the float inputs are drawn from a seeded
# generator instead and are created directly on the target device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
shape_probe_rng = torch.Generator(device=device).manual_seed(0)

# presumably (batch, channels, height, width) — TODO confirm against the model config
noisy_latents = torch.randn(40, 64, 64, 64, generator=shape_probe_rng, device=device)
# One integer timestep per batch element (40 values), built as int64 from the start
# rather than going through a float32 tensor and `.long()`.
timesteps = torch.tensor(
    [615, 678, 556,  34, 389, 288,  84, 299, 459, 916, 660, 135, 903, 818,
     978, 724, 331, 354, 636, 679, 211, 213, 288, 469, 964, 393, 515, 459,
      61, 691, 939, 205, 163, 692, 900, 784, 395, 612, 445, 586],
    dtype=torch.long,
    device=device,
)
# presumably (batch, sequence length, embedding dim) — TODO confirm
encoder_hidden_states = torch.randn(40, 77, 768, generator=shape_probe_rng, device=device)

In [4]:
# Load the UNet weights stored under the "unet" subfolder of the
# 'deepghs/animefull-latest' checkpoint, then move the module to the GPU.
# NOTE(review): `ignore_mismatched_sizes=True` is passed together with
# `low_cpu_mem_usage=False`; presumably some weights are expected to differ in
# shape from the model config — confirm which ones.
unet = UNet2DConditionModel.from_pretrained(
    "deepghs/animefull-latest",
    subfolder="unet",
    low_cpu_mem_usage=False,
    ignore_mismatched_sizes=True,
)
unet = unet.cuda()

In [5]:
# Echo the loaded UNet so its module tree is shown as the cell output.
unet

UNet2DConditionModel(
  (conv_in): Conv2d(4, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (time_proj): Timesteps()
  (time_embedding): TimestepEmbedding(
    (linear_1): LoRACompatibleLinear(in_features=320, out_features=1280, bias=True)
    (act): SiLU()
    (linear_2): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
  )
  (down_blocks): ModuleList(
    (0): CrossAttnDownBlock2D(
      (attentions): ModuleList(
        (0-1): 2 x Transformer2DModel(
          (norm): GroupNorm(32, 320, eps=1e-06, affine=True)
          (proj_in): LoRACompatibleConv(320, 320, kernel_size=(1, 1), stride=(1, 1))
          (transformer_blocks): ModuleList(
            (0): BasicTransformerBlock(
              (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
              (attn1): Attention(
                (to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
                (to_k): LoRACompatibleLinear(in_features=320, out_features=320

In [13]:
# Shape probe: a 1x1 convolution with padding=1 grows each spatial dimension by 2
# (64 + 2*1 - 1 + 1 = 66), so a (3, 3, 64, 64) input yields (3, 64, 66, 66).
probe_conv = torch.nn.Conv2d(in_channels=3, out_channels=64, kernel_size=1, stride=1, padding=1)
# Zero-filled input instead of torch.Tensor(3, 3, 64, 64), which would hand the
# convolution uninitialized memory; only the resulting shape is of interest.
probe_input = torch.zeros(3, 3, 64, 64)
probe_conv(probe_input).shape

torch.Size([3, 64, 66, 66])

In [4]:
# Forward pass used only to inspect the output shape. Run it under no_grad so
# no autograd graph is kept alive for the whole batch just to read `.shape`.
# NOTE(review): the recorded output below has 1 channel for a 64-channel input,
# while the module repr printed earlier shows a 4-channel `conv_in`; the saved
# outputs look like they come from different kernel sessions — confirm.
with torch.no_grad():
    unet_out = unet(noisy_latents, timesteps, encoder_hidden_states).sample
unet_out.shape

torch.Size([40, 1, 64, 64])

In [7]:
import torch
import torch.nn as nn

class ImageAdapter(nn.Module):
    """Residual adapter made of two 1x1 convolutions with a ReLU in between.

    The input goes through ``Conv2d(in_channels -> adapter_channels, 1x1)``,
    ``ReLU`` and ``Conv2d(adapter_channels -> in_channels, 1x1)``; that result
    is added back onto the input, so the output has the same shape as the input.

    Args:
        in_channels: Channel count of the input (and therefore the output).
        adapter_channels: Channel count between the two convolutions.
    """

    def __init__(self, in_channels, adapter_channels):
        super().__init__()
        # Layers are created in forward order and stored in one Sequential
        # under the attribute name ``adapter``.
        project_in = nn.Conv2d(in_channels, adapter_channels, kernel_size=1)
        activation = nn.ReLU()
        project_out = nn.Conv2d(adapter_channels, in_channels, kernel_size=1)
        self.adapter = nn.Sequential(project_in, activation, project_out)

    def forward(self, x):
        """Return ``x`` plus the adapter branch evaluated on ``x``."""
        residual = self.adapter(x)
        return x + residual

# --- Smoke test for ImageAdapter ---
# Channel counts for the demo: the input has 64 channels (not a 3-channel RGB
# image), and the adapter's intermediate width is 64 as well.
in_channels = 64
adapter_channels = 64

# Instantiate the adapter before drawing the random input
image_adapter = ImageAdapter(in_channels=in_channels, adapter_channels=adapter_channels)

# Random batch laid out as (batch size, channels, height, width)
input_tensor = torch.randn((4, in_channels, 64, 64))

# The residual adapter returns a tensor with the same shape as its input
image_adapter(input_tensor).size()

torch.Size([4, 64, 64, 64])

In [1]:
from movqgan.util import instantiate_from_config
from movqgan import get_movqgan_model
from omegaconf import OmegaConf
import torch
# Build the VAE from its YAML config and restore trained weights from a checkpoint.
config = OmegaConf.load("./vae/vaegan.yaml")
# Instantiate the model described by the config's `model` section.
model = instantiate_from_config(config['model'])
ckpt_path = "./vae/ckpt/step=13499-model.ckpt"
# SECURITY: depending on the `weights_only` setting, torch.load unpickles the
# file and can execute arbitrary code — only load checkpoints from a trusted source.
# map_location="cpu" keeps the checkpoint tensors off the GPU regardless of the
# device they were saved from; the model itself is moved to the GPU afterwards.
checkpoint = torch.load(ckpt_path, map_location="cpu")
model.load_state_dict(checkpoint['state_dict'])
vae = model.cuda()

Working with z of shape (1, 64, 32, 32) = 65536 dimensions.
loaded pretrained LPIPS loss from movqgan/modules/losses/lpips/vgg.pth
VQLPIPSWithDiscriminator running with hinge loss.
use_ema = True
