In [None]:
import os
import torch
import random
import torch.nn as nn
import torch.backends.cudnn as cudnn
import numpy as np
from models import build_model
from PIL import Image
from IPython.display import Image as ipython_image
from diffusers.utils import load_image, export_to_video, export_to_gif
from diffusers import DiffusionPipeline

In [None]:
# Configuration cell: seeds the RNGs, selects the GPU, then loads the Video-LaVIT
# model and the text-to-image pipeline that is used to produce keyframes.

# The local directory where the Video-LaVIT checkpoint is saved
model_path = "/home/jinyang06/models/VideoLaVIT-v1"
# The local directory where the text-to-image (keyframe) checkpoint is saved
t2i_model_path = "/home/jinyang06/models/playground-v2"
# 'bf16' selects torch.bfloat16; any other value falls back to torch.float16 (see torch_dtype below)
model_dtype = 'fp16'

# Seed every RNG library imported by this notebook, not only `random` and `torch`
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Set the GPU id to load the models on
device_id = 0
torch.cuda.set_device(device_id)
device = torch.device('cuda')

# If you have already installed xformers, keep `use_xformers = True` to save GPU memory
# (xformers is not supported on V100 GPUs). The flag is applied to BOTH the Video-LaVIT
# model and the text-to-image pipeline, so switching it off here disables it everywhere.
use_xformers = True

# If you have already downloaded the checkpoint, set `local_files_only=True` to avoid auto-downloading from remote
model = build_model(model_path=model_path, model_dtype=model_dtype, local_files_only=True,
                device_id=device_id, use_xformers=use_xformers, understanding=False,)
model = model.to(device)

# Single source of truth for the compute dtype; reused by the pipeline below and by
# the autocast context in the generation cell.
torch_dtype = torch.bfloat16 if model_dtype == "bf16" else torch.float16

# We load a high-aesthetic text-to-image model to produce the keyframe used when generating videos
t2i_pipe = DiffusionPipeline.from_pretrained(
    t2i_model_path,
    torch_dtype=torch_dtype,
    use_safetensors=True,
    add_watermarker=False,
)
if use_xformers:
    t2i_pipe.unet.enable_xformers_memory_efficient_attention()
t2i_pipe.to(device)

print("Building Model Finsished")

### Text-to-Video Generation

A high-aesthetic text-to-image model first generates a keyframe from the text prompt; the keyframe is then passed to Video-LaVIT together with the prompt to guide video generation.

In [None]:
# Text-to-video cell: generate a keyframe with the text-to-image pipeline, feed the
# prompt and keyframe to Video-LaVIT, then export and display the result as a GIF.

prompt = 'Sailboat sailing on a sunny day in a mountain lake'
# prompt = 'Bloomming cherry tree in the garden beautiful sun light'
# prompt = 'A wooden barrel drifting on a river'
# prompt = 'A cute mouse typing on a keyboard'
# prompt = 'A panda playing a ukulele at home'
# prompt = 'Toy poodle dog rides a penny board outdoors'
# prompt = 'Aerial Around Young Hiker Man Standing on Mountain Peak Summit At Sunrise'
# prompt = 'A hamster wearing virtual reality headsets is a dj in a disco'
# prompt = "Beer pouring into glass"
# prompt = "Funny cute pug dog feeling good listening to music with big headphones and swinging head"
# prompt = "A bear is giving a presentation in the classroom"


# Keyframe resolution for each supported ratio label, stored as (height, width)
ratio_dict = {
    '1:1' : (1024, 1024),
    '1:2' : (576, 1024),
}

# Video resolution for each supported ratio label, stored as (video_width, video_height).
# The video width and height should have the same aspect ratio as the generated keyframe.
# Generating high-resolution video requires more GPU memory; you can choose to lower the resolution,
# e.g. use (576, 320) for '1:2' or (512, 512) for '1:1'.
video_size_dict = {
    '1:1' : (768, 768),
    '1:2' : (896, 512),
}

# An unsupported label raises KeyError on the lookups below, before any generation starts
ratio = '1:1'
height, width = ratio_dict[ratio]
video_width, video_height = video_size_dict[ratio]

keyframe = t2i_pipe(prompt=prompt, width=width, height=height, guidance_scale=3.0, num_inference_steps=50).images[0]
print(prompt)


# Manually intervene: supply the keyframe produced by the high-aesthetic text-to-image model
# as the image input, alongside the text prompt
input_prompts = [(prompt, 'text'), (keyframe, 'image')]

with torch.cuda.amp.autocast(enabled=True, dtype=torch_dtype):
    videos, _ = model.multimodal_video_generate(input_prompts, width=width, height=height, video_width=video_width,
        video_height=video_height, guidance_scale_for_llm=4.0, top_k=50)


output_video_path = "generated.gif"
export_to_gif(videos[0], output_video_path)

# Read the GIF inside a context manager so the file handle is closed promptly
with open(output_video_path, 'rb') as gif_file:
    gif_bytes = gif_file.read()
display(ipython_image(gif_bytes))