In [1]:
import os

import torch
from diffusers import AutoencoderKLWan, WanImageToVideoPipeline, WanTransformer3DModel
from transformers import UMT5EncoderModel, BitsAndBytesConfig
from diffusers.utils import export_to_video, load_image

import numpy as np
from PIL import Image

In [2]:
# Repository id handed to every from_pretrained() call in this cell.
model_id = "Wan-AI/Wan2.2-I2V-A14B-Diffusers"

# bitsandbytes 4-bit (NF4, double quantisation) settings, shared by both
# transformers loaded below; compute runs in bfloat16.
transformer_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# The two transformers take exactly the same loading arguments and differ
# only in the subfolder they are read from.
quantized_load_kwargs = {
    "quantization_config": transformer_config,
    "torch_dtype": torch.bfloat16,
}

print("Загрузка Text Encoder (16-bit)...")
text_encoder = UMT5EncoderModel.from_pretrained(
    model_id, subfolder="text_encoder", dtype=torch.bfloat16
)

print("Загрузка Transformer (4-bit)...")
transformer = WanTransformer3DModel.from_pretrained(
    model_id, subfolder="transformer", **quantized_load_kwargs
)

print("Загрузка Transformer 2 (4-bit)...")
transformer_2 = WanTransformer3DModel.from_pretrained(
    model_id, subfolder="transformer_2", **quantized_load_kwargs
)

print("Загрузка VAE (32-bit)...")
vae = AutoencoderKLWan.from_pretrained(
    model_id,
    subfolder="vae",
    torch_dtype=torch.float32,
)

print("Сборка пайплайна...")
# Components loaded above are injected explicitly; anything not listed here
# is resolved by from_pretrained() from the same repository.
preloaded_components = {
    "text_encoder": text_encoder,
    "transformer": transformer,
    "transformer_2": transformer_2,
    "vae": vae,
}
pipe = WanImageToVideoPipeline.from_pretrained(model_id, **preloaded_components)

# Turn on the pipeline's model-level CPU offloading.
pipe.enable_model_cpu_offload()

print("Готово. Пайплайн загружен.")

Загрузка Text Encoder (16-bit)...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Загрузка Transformer (4-bit)...


Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/12 [00:00<?, ?it/s]

Загрузка Transformer 2 (4-bit)...


Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/12 [00:00<?, ?it/s]

Загрузка VAE (32-bit)...
Сборка пайплайна...


Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]

Готово. Пайплайн загружен.


In [3]:
import pandas as pd
import os

In [4]:
# Output directory for this run's artefacts.
attachment_folder = 'wan_i2v_14b'

# exist_ok=True makes the cell safely re-runnable and removes the
# check-then-create race of os.path.exists() followed by os.mkdir().
os.makedirs(attachment_folder, exist_ok=True)

# Test-set metadata table, read from the current working directory.
test_data = pd.read_csv('metadata_test.csv')

In [None]:
# --- Generation settings -------------------------------------------------
# These are identical for every clip, so they are defined once up front
# instead of being re-assigned on each loop iteration.
FIRST_FRAME_DIR = "/root/.cache/kagglehub/datasets/davidik67/waves-videos-df/versions/3/first_screen/"
NUM_SAMPLES = 5  # number of leading rows of test_data to render

max_area = 832 * 480  # pixel budget used to pick the output resolution
num_frames = 121
num_inference_steps = 50
guidance_scale = 3.5
# NOTE(review): upstream Wan2.2 A14B examples appear to export at 16 fps —
# confirm that 24 fps is intended here.
OUTPUT_FPS = 24

# Height and width are floored to a multiple of this value below.
mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]

# head() selects rows by position, so this always takes the first NUM_SAMPLES
# rows; the previous label slice .loc[:4] only did so for a default 0..n index.
for index, row in test_data.head(NUM_SAMPLES).iterrows():
    # Drop the last two "_"-separated tokens of the video name; the remainder
    # is the stem shared by the first-frame image and the generated video.
    clip_stem = '_'.join(row['video'].split('_')[:-2])
    filename = clip_stem + '_frame.jpg'
    image = load_image(os.path.join(FIRST_FRAME_DIR, filename))

    # Keep the source aspect ratio while fitting roughly max_area pixels.
    aspect_ratio = image.height / image.width
    height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
    width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
    image = image.resize((width, height))

    prompt = row['prompt']

    output = pipe(
        image=image,
        prompt=prompt,
        height=height,
        width=width,
        num_frames=num_frames,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
    )

    # Build the output name from the stem directly; str.replace() on the frame
    # file name would also rewrite any earlier occurrence of the suffix.
    video_path = os.path.join(attachment_folder, clip_stem + '.mp4')
    export_to_video(output.frames[0], video_path, fps=OUTPUT_FPS)

  0%|          | 0/50 [00:00<?, ?it/s]