In [None]:
!pip install diffusers transformers hf_transfer
!pip install opencv-python

In [None]:
!pip install accelerate==0.33.0

In [None]:
!pip install imageio-ffmpeg

!pip install moviepy

In [None]:
import os

# Opt in to hf_transfer-backed downloads; set before the Hugging Face libraries are imported.
os.environ.update({"HF_HUB_ENABLE_HF_TRANSFER": "1"})

In [None]:
import torch
from diffusers import AutoencoderKLCogVideoX, CogVideoXImageToVideoPipeline, CogVideoXTransformer3DModel
from diffusers.utils import export_to_video, load_video
from transformers import T5EncoderModel
from moviepy.editor import VideoFileClip, concatenate_videoclips
import utils

In [None]:
# Checkpoint used by the image-to-video pipeline in this notebook.
# NOTE(review): "THUDM/CogVideoX-2b" and "THUDM/CogVideoX-5b" are the text-to-video
# variants; presumably they are not drop-in replacements when using
# CogVideoXImageToVideoPipeline — confirm before switching model_id.
model_id = "THUDM/CogVideoX-5b-I2V"

In [None]:
# Load the three large sub-models one at a time, all in float16, so they can be
# passed to the pipeline constructor explicitly.
#
# NOTE(review): an earlier version of this comment thanked
# [@camenduru](https://github.com/camenduru) for hosting checkpoints re-saved with a
# "5GB" max_shard_size (the original "10GB" shards exhausted Colab CPU RAM and caused
# a CPU-side OOM). This cell loads from `model_id`, not from those checkpoints —
# confirm whether the shard-size concern still applies to the repo in use.
weights_dtype = torch.float16

transformer = CogVideoXTransformer3DModel.from_pretrained(
    model_id,
    subfolder="transformer",
    torch_dtype=weights_dtype,
)
text_encoder = T5EncoderModel.from_pretrained(
    model_id,
    subfolder="text_encoder",
    torch_dtype=weights_dtype,
)
vae = AutoencoderKLCogVideoX.from_pretrained(
    model_id,
    subfolder="vae",
    torch_dtype=weights_dtype,
)

In [None]:
# Assemble the image-to-video pipeline around the already-loaded sub-models.
# (Inference itself is run in a later cell.)
preloaded_components = dict(
    text_encoder=text_encoder,
    transformer=transformer,
    vae=vae,
)
pipe = CogVideoXImageToVideoPipeline.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    **preloaded_components,
)

In [None]:
# Turn on sequential CPU offload for the pipeline.
# NOTE(review): per the diffusers docs this trades inference speed for lower GPU
# memory use and relies on `accelerate` — confirm against the installed version.
pipe.enable_sequential_cpu_offload()
# VAE tiling is left disabled; uncomment the next line to enable it.
# pipe.vae.enable_tiling()

In [None]:
import cv2
from IPython.display import Image as InlineImage, display

source_path = "inputDog.mp4"

# Frames of the source clip; re-exported later when stitching input and output.
input_video = load_video(source_path)

# Conditioning image for generation: the final frame of the source clip.
# NOTE(review): `utils.get_last_frame` is a project helper; it is treated here as
# returning an OpenCV-style image array (or None on failure) — confirm.
last_frame = utils.get_last_frame(source_path)

prompt = "The dog become a real dog."

# Preview the conditioning frame inline. `cv2.imshow` needs a GUI window plus a
# `waitKey` loop, neither of which a notebook kernel provides, so the frame is
# PNG-encoded and rendered through IPython instead.
if last_frame is not None:
    encoded_ok, png_buffer = cv2.imencode(".png", last_frame)
    if encoded_ok:
        display(InlineImage(data=png_buffer.tobytes()))

In [None]:
# The preview step tolerates a missing frame; stop here with a clear message rather
# than letting the pipeline fail on `image=None`.
if last_frame is None:
    raise ValueError("last_frame is None: could not read the last frame of the input video.")

# NOTE(review): confirm that the object returned by `utils.get_last_frame` is a type
# the pipeline's `image` argument accepts (and in the expected channel order).
video = pipe(
    image=last_frame,
    prompt=prompt,
    guidance_scale=6,
    use_dynamic_cfg=True,
    num_inference_steps=50,
).frames[0]  # keep the frames of the first returned video

In [None]:


# Write the generated frames to output.mp4 at 8 frames per second.
export_to_video(video, "output.mp4", fps=8)

In [None]:
# Re-encode the source frames at the same frame rate (8 fps) used for output.mp4.
# NOTE(review): if the source clip was not recorded at 8 fps this changes its playback speed — confirm this is intended.
export_to_video(input_video, "input.mp4", fps=8)

# Open both clips as context managers so their ffmpeg readers and file handles are
# released even if concatenation or encoding fails.
with VideoFileClip("input.mp4") as video1, VideoFileClip("output.mp4") as video2:
    # Join the clips back to back: source first, generated continuation second.
    final_video = concatenate_videoclips([video1, video2])
    try:
        # Export the stitched result.
        final_video.write_videofile("final_video.mp4", codec="libx264")
    finally:
        final_video.close()

In [None]:
from IPython.display import display, Video

# Show the re-encoded source clip first, then the stitched result, both embedded inline.
for clip_path in ("input.mp4", "final_video.mp4"):
    display(Video(clip_path, embed=True))