# Check U-Net pipeline

In [None]:
import torch
import torch.nn as nn
from tqdm import tqdm
from nuscenes.nuscenes import NuScenes
from nuscenes.utils.splits import create_splits_scenes
from nuscenes.can_bus.can_bus_api import NuScenesCanBus
from einops import rearrange, repeat
from transformers import CLIPTextModel, CLIPTokenizer

import sys
sys.path.append('/home/wxd/video-generation/diffusers/src')
from diffusers import UNetSpatioTemporalConditionModel_Action
from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, UNet2DConditionModel
from diffusers import ActionVideoDiffusionPipeline

In [None]:
# Check the network (5 context layers): build the action-conditioned U-Net
# directly from its constructor (no checkpoint is loaded here) and move it to the GPU.
unet = UNetSpatioTemporalConditionModel_Action(cross_attention_dim=768, in_channels=4)
unet.to('cuda')

In [None]:
# Random stand-in for a latent video; the first two axes are unpacked below
# as (batch, frames). The channel axis (4) matches in_channels of the U-Net.
sample = torch.randn((2, 8, 4, 24, 48)).to('cuda')
# The first frame of every clip serves as the image-context input.
image_context = sample[:, 0]
# A bare expression in the middle of a cell is silently discarded, so print
# the shape explicitly to actually see it.
print(image_context.shape)
# Derive both sizes from the tensor itself so they cannot drift from its shape.
bsz, num_frames = sample.shape[:2]

In [None]:
# One random diffusion timestep per clip. torch.randint already returns int64;
# the explicit .long() only documents the dtype that is expected downstream.
timesteps = torch.randint(0, 1000, (bsz,)).long()
# One flag per (clip, frame). Use num_frames instead of repeating the literal 8
# so this tensor always agrees with the frame axis of `sample`.
image_only_indicator = torch.zeros(bsz, num_frames, dtype=sample.dtype)

In [None]:
# Merge dims 0 and 1 (the (2, 8, ...) tensor built above becomes (16, ...)),
# so the 2-D input convolution sees one image per row.
sample = sample.flatten(0, 1)
# First layer of the U-Net: the input convolution.
sample = unet.conv_in(sample)
# Last expression of the cell: displays the resulting shape.
sample.shape

In [None]:
# Run the context image through every layer of unet.context_block and keep
# each intermediate output. The tuple is seeded with the raw input.
context_frames = (image_context,)

for module in unet.context_block:
    # Each layer consumes the previous layer's output.
    image_context = module(image_context)
    context_frames =  context_frames + (image_context,)
# Drop element 0, i.e. the unprocessed input that seeded the tuple.
# NOTE(review): the original comment read "pop the extra small feature map",
# but [1:] removes the FIRST entry, not the last one -- confirm the intent.
context_frames = context_frames[1:]

In [None]:
# Random stand-in for the time embedding, one 1280-dim row per clip
# (1280 is the output width of unet.time_embedding shown in the model repr).
emb = torch.randn((bsz, 1280))
# Repeat every row num_frames times so there is one row per flattened frame.
emb = emb.repeat_interleave(num_frames, dim=0)

In [None]:
# Checkpoint directory that contains the "text_encoder" and "vae" subfolders.
pretrained_model_name_or_path = "/mnt/workspace/sd-drive-video-ep40"

# CLIP text encoder: load, switch to inference mode, move to the first GPU.
text_encoder = CLIPTextModel.from_pretrained(pretrained_model_name_or_path, subfolder="text_encoder")
text_encoder = text_encoder.eval().to('cuda:0')

# VAE: same treatment as the text encoder.
vae = AutoencoderKL.from_pretrained(pretrained_model_name_or_path, subfolder="vae")
vae = vae.eval().to('cuda:0')

In [None]:
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

# Load the remaining pipeline components from their subfolders of the same
# checkpoint directory: tokenizer, noise scheduler and CLIP image processor.
tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_name_or_path, subfolder="tokenizer")
scheduler = DDPMScheduler.from_pretrained(pretrained_model_name_or_path, subfolder="scheduler")
feature_extractor = CLIPImageProcessor.from_pretrained(pretrained_model_name_or_path, subfolder="feature_extractor")

In [None]:
# Two sample prompts, one per clip in the dummy batch (bsz is 2 above).
texts = ['a car in the street', 'a bus in the street']
# Tokenize to fixed-length PyTorch tensors: pad/truncate to the tokenizer's
# maximum length so both prompts have the same number of tokens.
inputs = tokenizer(
                texts, max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt"
            )
# Move the token ids to the device the text encoder lives on.
inputs["input_ids"] = inputs["input_ids"].to('cuda:0')
print(inputs["input_ids"].shape)
# Earlier experiment, kept for reference:
# encoder_hidden_states = text_encoder(inputs["input_ids"].cuda())
# encoder_hidden_states

In [None]:
# Encode the prompts and keep element 0 of the encoder output as the text
# condition (presumably the per-token last hidden state -- confirm against
# the CLIPTextModel output documentation).
encoder_hidden_states = text_encoder(inputs["input_ids"])[0]

In [None]:
# Text conditions: repeat each prompt's hidden states num_frames times along
# dim 0, mirroring what was done to `emb`, so every flattened frame gets one.
encoder_hidden_states = encoder_hidden_states.repeat_interleave(num_frames, dim=0)

In [None]:
def get_add_time_ids(
    batch_size,
    fps=6,
    motion_bucket_id=127,
    noise_aug_strength=0.02,
    num_videos_per_prompt=1,
    do_classifier_free_guidance=False,
    addition_time_embed_dim=256,
    expected_add_embed_dim=768,
):
    """Build the additional time-id tensor for a batch.

    Each row is ``[fps, motion_bucket_id, noise_aug_strength]``.

    Args:
        batch_size: Number of prompts in the batch.
        fps: Value stored in column 0 of every row.
        motion_bucket_id: Value stored in column 1 of every row.
        noise_aug_strength: Value stored in column 2 of every row.
        num_videos_per_prompt: Multiplier applied to ``batch_size`` to get
            the number of rows.
        do_classifier_free_guidance: If true, the rows are duplicated by
            concatenating the tensor with itself along dim 0.
        addition_time_embed_dim: Embedding width produced per time id.
            Defaults to 256, the value that used to be hard-coded.
        expected_add_embed_dim: Total embedding width the model expects.
            Defaults to 768, the value that used to be hard-coded (it equals
            the input width of ``add_embedding.linear_1`` in the model repr).

    Returns:
        A tensor of shape ``(batch_size * num_videos_per_prompt, 3)``, or
        twice as many rows when ``do_classifier_free_guidance`` is true.

    Raises:
        ValueError: If ``addition_time_embed_dim * 3`` does not equal
            ``expected_add_embed_dim``.
    """
    add_time_ids = [fps, motion_bucket_id, noise_aug_strength]

    # With the two widths as parameters this check is no longer a tautology:
    # callers can validate against the dimensions of the model they use.
    passed_add_embed_dim = addition_time_embed_dim * len(add_time_ids)
    if expected_add_embed_dim != passed_add_embed_dim:
        raise ValueError(
            f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
        )

    # One row, then tiled once per generated video.
    add_time_ids = torch.tensor([add_time_ids])
    add_time_ids = add_time_ids.repeat(batch_size * num_videos_per_prompt, 1)

    if do_classifier_free_guidance:
        add_time_ids = torch.cat([add_time_ids, add_time_ids])

    return add_time_ids

In [None]:
# Default (fps, motion_bucket_id, noise_aug_strength) ids, one row per clip.
added_time_ids = get_add_time_ids(bsz)

In [None]:
# Put every input of the manual forward pass below on the GPU. `emb` and
# `image_only_indicator` were created without a device argument above.
sample = sample.to('cuda')
emb = emb.to('cuda')
encoder_hidden_states = encoder_hidden_states.to('cuda')
image_only_indicator = image_only_indicator.to('cuda')

In [None]:
# Re-derive the frame count from the last dimension of the indicator tensor
# and display it as the cell result.
num_frames = image_only_indicator.shape[-1]
num_frames

In [None]:
# Manual pass through the down path of the U-Net. Block `idx` receives the
# context feature map with the same index; the residual outputs of every
# block are appended to a running tuple that starts with the conv_in output.
down_block_res_samples = (sample,)
for idx, downsample_block in enumerate(unet.down_blocks):
    # Arguments shared by both kinds of down block.
    block_kwargs = dict(
        hidden_states=sample,
        temb=emb,
        image_only_indicator=image_only_indicator,
        action=None,
        image_context=context_frames[idx],
    )
    # Only blocks that advertise cross-attention also take the text condition.
    if getattr(downsample_block, "has_cross_attention", False):
        block_kwargs["encoder_hidden_states"] = encoder_hidden_states

    sample, res_samples = downsample_block(**block_kwargs)
    print(f'{idx}, {context_frames[idx].shape}')

    down_block_res_samples += res_samples

In [None]:
# Manual pass through the middle block, fed with the output of the last down
# block, the text condition and the last collected context feature map.
sample = unet.mid_block(
            hidden_states=sample,
            temb=emb,
            encoder_hidden_states=encoder_hidden_states,
            image_only_indicator=image_only_indicator,
            action=None,
            image_context=context_frames[-1], # smallest feature map
        )
print(sample.shape)

In [None]:
# for i, upsample_block in enumerate(unet.up_blocks):
#     res_samples = down_block_res_samples[-len(upsample_block.resnets) :] # len=2, smallest feature map?
#     down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]

#     if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
#         sample = upsample_block(
#             hidden_states=sample,
#             temb=emb,
#             res_hidden_states_tuple=res_samples,
#             encoder_hidden_states=encoder_hidden_states,
#             image_only_indicator=image_only_indicator,
#         )
#     else:
#         sample = upsample_block(
#             hidden_states=sample,
#             temb=emb,
#             res_hidden_states_tuple=res_samples,
#             image_only_indicator=image_only_indicator,
#         )

In [None]:
# Predict the noise residual and compute loss
# model_pred = unet(noisy_latents, timesteps, encoder_hidden_states, added_time_ids, image_context=image_context).sample

# Eval Action Pipeline

In [1]:
import torch
import torch.nn as nn
from tqdm import tqdm
from nuscenes.nuscenes import NuScenes
from nuscenes.utils.splits import create_splits_scenes
from nuscenes.can_bus.can_bus_api import NuScenesCanBus
from einops import rearrange, repeat
from transformers import CLIPTextModel, CLIPTokenizer

import sys
sys.path.append('/home/wxd/video-generation/diffusers/src')
from diffusers import UNetSpatioTemporalConditionModel_Action
from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, UNet2DConditionModel
from diffusers import ActionVideoDiffusionPipeline

In [2]:
# Check the network (5 context layers): build the U-Net from its constructor;
# the bare `unet` on the last line displays the module tree.
unet = UNetSpatioTemporalConditionModel_Action(cross_attention_dim=768, in_channels=4)
unet

add action and image context parameters
add action and image context parameters
add action and image context parameters
add action and image context parameters
add action and image context parameters


UNetSpatioTemporalConditionModel_Action(
  (conv_in): Conv2d(4, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (time_proj): Timesteps()
  (time_embedding): TimestepEmbedding(
    (linear_1): Linear(in_features=320, out_features=1280, bias=True)
    (act): SiLU()
    (linear_2): Linear(in_features=1280, out_features=1280, bias=True)
  )
  (add_time_proj): Timesteps()
  (add_embedding): TimestepEmbedding(
    (linear_1): Linear(in_features=768, out_features=1280, bias=True)
    (act): SiLU()
    (linear_2): Linear(in_features=1280, out_features=1280, bias=True)
  )
  (down_blocks): ModuleList(
    (0): CrossAttnDownBlockSpatioTemporal(
      (attentions): ModuleList(
        (0-1): 2 x TransformerSpatioTemporalModel(
          (norm): GroupNorm(32, 320, eps=1e-06, affine=True)
          (proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
          (transformer_blocks): ModuleList(
            (0): BasicTransformerBlock(
              (norm1): LayerNorm((320,), eps=1

In [8]:
# Display the layers of the image-context branch of the U-Net.
unet.context_block

ModuleList(
  (0): Conv2d(4, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): Sequential(
    (0): GroupNorm(32, 320, eps=1e-05, affine=True)
    (1): SiLU()
    (2): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): Downsample2D(
      (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    )
  )
  (2): Sequential(
    (0): GroupNorm(32, 320, eps=1e-05, affine=True)
    (1): SiLU()
    (2): Conv2d(320, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): Downsample2D(
      (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    )
  )
  (3): Sequential(
    (0): GroupNorm(32, 640, eps=1e-05, affine=True)
    (1): SiLU()
    (2): Conv2d(640, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): Downsample2D(
      (conv): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    )
  )
  (4): Sequential(
    (0): GroupNorm(32, 1280, eps=1e-05, affine=True)
   

In [4]:
unet.down_blocks[0] # The resnets of course run first and the attention layers after them; the order printed here is only the order in which the submodules are stored.

CrossAttnDownBlockSpatioTemporal(
  (attentions): ModuleList(
    (0-1): 2 x TransformerSpatioTemporalModel(
      (norm): GroupNorm(32, 320, eps=1e-06, affine=True)
      (proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
      (transformer_blocks): ModuleList(
        (0): BasicTransformerBlock(
          (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
          (attn1): Attention(
            (to_q): Linear(in_features=320, out_features=320, bias=False)
            (to_k): Linear(in_features=320, out_features=320, bias=False)
            (to_v): Linear(in_features=320, out_features=320, bias=False)
            (to_out): ModuleList(
              (0): Linear(in_features=320, out_features=320, bias=True)
              (1): Dropout(p=0.0, inplace=False)
            )
          )
          (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
          (attn2): Attention(
            (to_q): Linear(in_features=320, out_features=320, bias=False)
      

In [23]:
# Checkpoint directory used for the evaluation run; note that it differs
# from the path used in the U-Net check section above.
pretrained_model_name_or_path = "/home/wxd/video-generation/diffusers/examples/text_to_image/sd-drive-ep40"
# Load the CLIP text encoder and the VAE from their subfolders.
text_encoder = CLIPTextModel.from_pretrained(
            pretrained_model_name_or_path, subfolder="text_encoder"
)
vae = AutoencoderKL.from_pretrained(
            pretrained_model_name_or_path, subfolder="vae"
)

# Switch both to inference mode. As the last expression of the cell,
# vae.eval() also displays the VAE module tree.
text_encoder.eval()
vae.eval()

AutoencoderKL(
  (encoder): Encoder(
    (conv_in): Conv2d(3, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (down_blocks): ModuleList(
      (0): DownEncoderBlock2D(
        (resnets): ModuleList(
          (0-1): 2 x ResnetBlock2D(
            (norm1): GroupNorm(32, 128, eps=1e-06, affine=True)
            (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            (norm2): GroupNorm(32, 128, eps=1e-06, affine=True)
            (dropout): Dropout(p=0.0, inplace=False)
            (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            (nonlinearity): SiLU()
          )
        )
        (downsamplers): ModuleList(
          (0): Downsample2D(
            (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2))
          )
        )
      )
      (1): DownEncoderBlock2D(
        (resnets): ModuleList(
          (0): ResnetBlock2D(
            (norm1): GroupNorm(32, 128, eps=1e-06, affine=True)
            (c

In [25]:
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

# Load tokenizer, noise scheduler and CLIP image processor from their
# subfolders of the evaluation checkpoint directory.
tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_name_or_path, subfolder="tokenizer")
scheduler = DDPMScheduler.from_pretrained(pretrained_model_name_or_path, subfolder="scheduler")
feature_extractor = CLIPImageProcessor.from_pretrained(pretrained_model_name_or_path, subfolder="feature_extractor")

In [26]:
# Project-local dataset of video frames.
from nuscene_video import Videoframes

# NOTE(review): the meaning of the first positional argument (None here) is
# not visible in this notebook -- check the Videoframes constructor.
dataset = Videoframes(None, tokenizer)

Total samples: 684


In [27]:
# Shuffled loader over the dataset: 6 samples per batch, 6 worker processes.
dataloader = torch.utils.data.DataLoader(
        dataset,
        shuffle=True,
        batch_size=6,
        num_workers=6,
    )

In [28]:
# Fetch and print only the first batch; `batch` stays bound for later cells.
for batch in dataloader:
    print(batch)
    break

{'input_ids': tensor([[49406,   320,  1305,  2012,   593,   320,  3399,  1395,   537,  3346,
          4161,  1136,   585,   269, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407],
        [49406,   320,  2012,   593,  3346, 16487,   530,   518,  2443, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 

In [32]:
# Shape of the image tensor stored under 'label_imgs' in the fetched batch.
batch['label_imgs'].shape

torch.Size([6, 8, 3, 192, 384])

In [None]:
# Load the U-Net weights from the "unet" subfolder of the checkpoint,
# switch to inference mode and move the model to the first GPU.
unet = UNetSpatioTemporalConditionModel_Action.from_pretrained(pretrained_model_name_or_path, subfolder="unet")
unet.eval()
unet = unet.to('cuda:0')

In [None]:
# Assemble the action-conditioned video pipeline from the components loaded
# in the cells above.
pipeline = ActionVideoDiffusionPipeline(
            text_encoder=text_encoder,
            vae=vae,
            unet=unet,
            scheduler=scheduler,
            tokenizer=tokenizer,
            feature_extractor=feature_extractor
        )

In [None]:
# Move the whole pipeline to the first GPU.
pipeline.to('cuda:0')

In [None]:
from PIL import Image
import numpy as np

# Conditioning frame. convert("RGB") guards against PNGs that carry an alpha
# channel or a palette, which would otherwise reach the pipeline with the
# wrong number of channels.
image = Image.open('/mnt/workspace/diffusers/examples/text_to_video/car_2.png').convert("RGB")
# PIL sizes are (width, height); this matches width=384, height=192 used when
# the pipeline is called.
image = image.resize((384, 192))
prompt = "a car is riding on a street"

# Common factor applied to both action channels after unit conversion.
action_scale = 1000
# One value per generated frame (8 frames), wrapped in a batch of one clip.
steers = [[
    17.59999999999991,
    14.299999999999727,
    14.299999999999727,
    14.299999999999727,
    16.799999999999727,
    18.0,
    19.699999999999818,
    23.0,
]]
speeds = [[
    22.17,
    21.29,
    20.38,
    19.48,
    19.03,
    19.150000000000002,
    19.740000000000002,
    20.59,
]]

# pi / 180 converts degrees to radians; 10 / 36 equals 1 / 3.6, which is
# presumably km/h -> m/s (confirm against the units of the recorded data).
steers = torch.tensor(steers) * np.pi / 180 * action_scale
speeds = torch.tensor(speeds) * 10. / 36 * action_scale
print(steers.shape)

# Flatten each channel to a column and pair them up: one (steer, speed) row
# per frame.
steers = steers.view(-1, 1)
speeds = speeds.view(-1, 1)
action = torch.cat([steers, speeds], dim=-1)
action = action.to('cuda:0')
print(action.shape)
# Last expression of the cell: displays the conditioning image.
image

In [None]:
# Run the pipeline on the conditioning image, prompt and action tensor and
# keep the first entry of `.frames`. height/width match the 384x192 resize
# applied to the image above.
frames = pipeline(image, num_frames=8, prompt=prompt, action=action, height=192, width=384).frames[0]

In [None]:
from diffusers.utils import export_to_video
# Write the generated frames to generated.mp4 at 7 frames per second.
export_to_video(frames, "generated.mp4", fps=7)