In [None]:
## For visualisation
!pip install denku

In [None]:
import os
# Expose only GPU 0 to CUDA. Set here, ahead of the `torch` import below,
# so the restriction is in place before any CUDA initialisation.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
# Switch off the Hugging Face tokenizers' internal parallelism
# (presumably to silence fork-related warnings — TODO confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import sys
# Make the parent directory importable; the local `wan_*` modules imported
# below are presumably located there — TODO confirm.
sys.path.append('..')

import torch
from diffusers.utils import load_video, export_to_video
from diffusers import AutoencoderKLWan, FlowMatchEulerDiscreteScheduler, UniPCMultistepScheduler
from transformers import UMT5EncoderModel, T5TokenizerFast
from controlnet_aux import HEDdetector, CannyDetector, MidasDetector
from denku import show_images

from wan_controlnet import WanControlnet
from wan_transformer import CustomWanTransformer3DModel
from wan_t2v_controlnet_pipeline import WanTextToVideoControlnetPipeline

# Autoreload mode 2: re-import edited modules before each cell runs, so changes
# to local source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Base checkpoint; each component below is read from its own subfolder of this repo.
base_model_path = "Wan-AI/Wan2.2-TI2V-5B-Diffusers"

# Text side: tokenizer and text encoder (encoder weights in bfloat16).
tokenizer = T5TokenizerFast.from_pretrained(
    base_model_path,
    subfolder="tokenizer",
)
text_encoder = UMT5EncoderModel.from_pretrained(
    base_model_path,
    subfolder="text_encoder",
    torch_dtype=torch.bfloat16,
)

# The VAE is loaded in float32, unlike the bfloat16 encoder and transformer.
vae = AutoencoderKLWan.from_pretrained(
    base_model_path,
    subfolder="vae",
    torch_dtype=torch.float32,
)

# Denoising transformer (project-local subclass) in bfloat16.
transformer = CustomWanTransformer3DModel.from_pretrained(
    base_model_path,
    subfolder="transformer",
    torch_dtype=torch.bfloat16,
)

# Scheduler configuration shipped with the checkpoint.
scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
    base_model_path,
    subfolder="scheduler",
)

In [None]:
# ControlNet checkpoint (depth variant, per the repository name), loaded in bfloat16.
controlnet_model_path = "TheDenk/wan2.2-ti2v-5b-controlnet-depth-v1"

controlnet = WanControlnet.from_pretrained(
    controlnet_model_path,
    torch_dtype=torch.bfloat16,
)

In [None]:
# Assemble the pipeline from the already-loaded components instead of letting
# `from_pretrained` load its own copies.
pipe = WanTextToVideoControlnetPipeline.from_pretrained(
    pretrained_model_name_or_path=base_model_path,
    # text conditioning
    tokenizer=tokenizer,
    text_encoder=text_encoder,
    # latent codec and denoiser
    vae=vae,
    transformer=transformer,
    # control branch
    controlnet=controlnet,
    # sampler (replaced right below)
    scheduler=scheduler,
)

# Swap in UniPC, reusing the current scheduler's config with flow_shift overridden to 5.0.
base_scheduler_config = pipe.scheduler.config
pipe.scheduler = UniPCMultistepScheduler.from_config(
    base_scheduler_config,
    flow_shift=5.0,
)

# Offload sub-models to CPU when idle to reduce GPU memory use.
pipe.enable_model_cpu_offload()

In [None]:
def init_controlnet(controlnet_type, device='cuda'):
    """Instantiate the annotator registered under ``controlnet_type``.

    The annotator class is looked up in the module-level ``controlnet_mapping``.

    Args:
        controlnet_type: Key of ``controlnet_mapping`` selecting the annotator.
        device: Device the annotator is moved to after loading. Not used for
            ``'canny'``, which is constructed directly.

    Returns:
        The annotator instance.

    Raises:
        KeyError: If ``controlnet_type`` is not a key of ``controlnet_mapping``;
            the message lists the supported types.
    """
    if controlnet_type not in controlnet_mapping:
        supported = ', '.join(repr(key) for key in controlnet_mapping)
        raise KeyError(
            f"Unknown controlnet_type {controlnet_type!r}; supported types: {supported}"
        )

    detector_cls = controlnet_mapping[controlnet_type]
    # Canny is built with its default constructor: no `from_pretrained`, no device move.
    if controlnet_type in ['canny']:
        return detector_cls()
    # Every other annotator loads its weights from the shared annotators repo.
    return detector_cls.from_pretrained('lllyasviel/Annotators').to(device=device)

# Registry of annotator classes, keyed by control type name.
controlnet_mapping = dict(
    canny=CannyDetector,
    hed=HEDdetector,
    depth=MidasDetector,
)

# Annotator used to produce the control frames in this notebook.
controlnet_processor = init_controlnet(controlnet_type="depth")

In [None]:
# Output size and clip length; alternative presets are noted inline.
img_h = 704  # or 480
img_w = 1280  # or 832
num_frames = 121  # or 81 / 49

video_path = '../resources/bubble.mp4'

# Keep the first `num_frames` frames and resize each one to (img_w, img_h).
# NOTE(review): a source clip shorter than `num_frames` yields fewer frames here
# while `num_frames` stays unchanged — confirm the input clip is long enough.
video_frames = [
    frame.resize((img_w, img_h))
    for frame in load_video(video_path)[:num_frames]
]

# Run the annotator over every resized frame to get the control images.
controlnet_frames = list(map(controlnet_processor, video_frames))

# Preview every 25th source frame, then the matching control images.
show_images(video_frames[::25], figsize=(16, 8))
show_images(controlnet_frames[::25], figsize=(16, 8))

In [None]:
# Text conditioning.
prompt = "Close-up shot with soft lighting, focusing sharply on the lower half of a young woman's face. Her lips are slightly parted as she blows an enormous bubblegum bubble. The bubble is semi-transparent, shimmering gently under the light, and surprisingly contains a miniature aquarium inside, where two orange-and-white goldfish slowly swim, their fins delicately fluttering as if in an aquatic universe. The background is a pure light blue color."
negative_prompt = "bad quality, worst quality"

# Fixed seed so repeated runs start from the same noise.
seeded_generator = torch.Generator(device="cuda").manual_seed(42)

pipe_result = pipe(
    # text and sampling settings
    prompt=prompt,
    negative_prompt=negative_prompt,
    guidance_scale=5,
    generator=seeded_generator,
    # output geometry, matching the prepared control frames
    height=img_h,
    width=img_w,
    num_frames=num_frames,
    output_type="pil",
    # control signal; start/end presumably bound the fraction of the denoising
    # schedule during which control is applied — verify against the pipeline
    controlnet_frames=controlnet_frames,
    controlnet_guidance_start=0.0,
    controlnet_guidance_end=0.8,
    controlnet_weight=0.8,
    # keyword spelled as the call site has it; semantics defined by the pipeline
    teacache_treshold=0.6,
)
# First (only) generated clip.
output = pipe_result.frames[0]

# Show the first two generated frames, then write the clip to disk.
# NOTE(review): fps=16 — confirm it matches the intended playback rate.
show_images(output[:2], figsize=(6, 12))
export_to_video(output, "output.mp4", fps=16)