In [1]:
!pip install boto3

Collecting boto3
  Downloading boto3-1.37.25-py3-none-any.whl.metadata (6.7 kB)
Collecting botocore<1.38.0,>=1.37.25 (from boto3)
  Downloading botocore-1.37.25-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.12.0,>=0.11.0 (from boto3)
  Downloading s3transfer-0.11.4-py3-none-any.whl.metadata (1.7 kB)
Downloading boto3-1.37.25-py3-none-any.whl (139 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.6/139.6 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading botocore-1.37.25-py3-none-any.whl (13.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m87.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Downloading s3transfer-0.11.4-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.4/84.4 kB[0m [31m7.4 MB/s[0m eta [36m0:0

In [2]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
from peft import LoraConfig, get_peft_model
from tqdm import tqdm
import boto3
import tempfile
import random

In [3]:
# ==== Step 0: 初始化环境：支持多卡和 Colab 单卡 ====
is_distributed = "RANK" in os.environ and "WORLD_SIZE" in os.environ
if is_distributed:
    import torch.distributed as dist
    from torch.nn.parallel import DistributedDataParallel as DDP
    dist.init_process_group(backend="nccl")
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)
    device = torch.device("cuda", local_rank)
else:
    local_rank = 0
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# ==== Step 1: 自定义 Dataset 加载 S3 上的 .pt 文件 ====
class S3PTVideoDataset(Dataset):
    def __init__(self, s3_bucket, s3_prefix):
        self.s3 = boto3.client(
            's3',
            aws_access_key_id="AKIAZOZQF46DIM7BU3TG",
            aws_secret_access_key="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"
        )
        self.bucket = s3_bucket
        self.prefix = s3_prefix
        response = self.s3.list_objects_v2(Bucket=self.bucket, Prefix=self.prefix)
        self.keys = [item['Key'] for item in response.get('Contents', []) if item['Key'].endswith(".pt")]

    def __len__(self):
        return len(self.keys)

    def __getitem__(self, idx):
        key = self.keys[idx]
        with tempfile.NamedTemporaryFile(suffix=".pt") as temp:
            self.s3.download_file(self.bucket, key, temp.name)
            sample = torch.load(temp.name)
        return sample["video"], sample["text"]

In [5]:
# ==== Step 2: 加载预训练模型 + 插入 LoRA 层 ====
model_id = "damo-vilab/text-to-video-ms-1.7b"
pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16, variant="fp16")
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
pipe.text_encoder.to(device)
pipe.tokenizer
pipe.vae.to(device)
pipe.unet.requires_grad_(False)
lora_config = LoraConfig(
    r=4,
    lora_alpha=16,
    target_modules=["to_q", "to_v"],
    bias="none"
)
pipe.unet = get_peft_model(pipe.unet, lora_config)
pipe.unet.to(device)
if is_distributed:
    pipe.unet = DDP(pipe.unet, device_ids=[local_rank])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model_index.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

model.fp16.safetensors:   0%|          | 0.00/681M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/755 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/460 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/787 [00:00<?, ?B/s]

diffusion_pytorch_model.fp16.safetensors:   0%|          | 0.00/2.82G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/657 [00:00<?, ?B/s]

diffusion_pytorch_model.fp16.safetensors:   0%|          | 0.00/167M [00:00<?, ?B/s]

scheduler_config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

In [6]:
# ==== Step 3: 加载数据集（来自 S3） ====
s3_bucket = "data298youcook2"
s3_prefix = "ModelScope_T2V_processed_data/train"
dataset = S3PTVideoDataset(s3_bucket, s3_prefix)
if is_distributed:
    from torch.utils.data import DistributedSampler
    sampler = DistributedSampler(dataset)
else:
    sampler = None

dataloader = DataLoader(dataset, batch_size=4, shuffle=(sampler is None), sampler=sampler)

In [8]:
# ==== Step 3.5: 从 S3 下载上一次 3 epoch 的 checkpoint ====
s3 = boto3.client(
    's3',
    aws_access_key_id="AKIAZOZQF46DIM7BU3TG",
    aws_secret_access_key="jAPb1o26QHsPZwE1rSDJLjZoxJ/vxTOJJQMzkxQW"
)
s3_bucket = "data298youcook2"
s3_key = "ModelScope_T2V_finetuned/uniform_3epoch_123.pt"
resume_path = "checkpoints/last_checkpoint.pt"

if not os.path.exists(resume_path):
    os.makedirs("checkpoints", exist_ok=True)
    print(" Downloading uniform_3epoch_123.pt from S3...")
    s3.download_file(s3_bucket, s3_key, resume_path)
    print(" Download complete.")


In [9]:
# ✅ Batch 采样函数
def sample_segment_frames_batch(video_batch, num_frames=8, method="uniform"):
    """
    从 batch 视频中每个视频采样固定数量的帧。
    video_batch: Tensor [B, T, C, H, W]
    Return: Tensor [B, num_frames, C, H, W]
    """
    B, T, C, H, W = video_batch.shape
    sampled = []

    for i in range(B):
        video = video_batch[i]
        if T <= num_frames:
            sampled_video = video
        else:
            if method == "uniform":
                indices = torch.linspace(0, T - 1, num_frames).long()
            elif method == "center":
                start = (T - num_frames) // 2
                indices = torch.arange(start, start + num_frames)
            elif method == "random":
                start = random.randint(0, T - num_frames)
                indices = torch.arange(start, start + num_frames)
            else:
                raise ValueError(f"Unsupported method: {method}")
            sampled_video = video[indices]
        sampled.append(sampled_video)

    return torch.stack(sampled)


Fine-tuning 3 epoch

In [8]:
from peft import get_peft_model_state_dict

# ==== Step 4: 训练循环 + Checkpoint 保存 ====
optimizer = torch.optim.Adam(pipe.unet.parameters(), lr=1e-4)
pipe.unet.train()

save_dir = "checkpoints"
os.makedirs(save_dir, exist_ok=True)
resume_path = os.path.join(save_dir, "last_checkpoint.pt")
if os.path.exists(resume_path):
    map_location = {f"cuda:{0}": f"cuda:{local_rank}"}
    state_dict = torch.load(resume_path, map_location=map_location)
    if is_distributed:
        pipe.unet.module.load_state_dict(state_dict)
    else:
        pipe.unet.load_state_dict(state_dict)

accumulate_steps = 4
scaler = torch.amp.GradScaler('cuda')

for epoch in range(3):
    if is_distributed:
        sampler.set_epoch(epoch)
    optimizer.zero_grad()

    running_loss = 0.0  # ✅ 用于统计本轮平均 loss

    for step, (video, text) in enumerate(tqdm(dataloader, desc=f"Epoch {epoch+1} [Rank {local_rank}]")):
        video = video.to(device)  # [B, T, C, H, W]
        B, T, C, H, W = video.shape

        # ✅ 视频帧采样
        video = sample_segment_frames_batch(video, num_frames=8, method="uniform")
        B, T, C, H, W = video.shape

        # ✅ 文本 tokenizer
        text_inputs = pipe.tokenizer(list(text), return_tensors="pt", padding=True, truncation=True).to(device)
        encoder_hidden_states = pipe.text_encoder(**text_inputs)[0]  # [B, L, D]

        # ✅ VAE 编码
        video_flat = video.view(B * T, C, H, W)
        with torch.no_grad():
            latents_flat = pipe.vae.encode(video_flat).latent_dist.sample()

        latents = latents_flat.view(B, T, -1, H // 8, W // 8) * 0.18215
        latents = latents.permute(0, 2, 1, 3, 4)  # [B, C, T, H, W]

        # ✅ 添加噪声
        noise = torch.randn_like(latents)
        timesteps = torch.randint(0, 1000, (B,), device=latents.device).long()
        noisy_latents = pipe.scheduler.add_noise(latents, noise, timesteps)

        # ✅ forward + loss
        with torch.amp.autocast('cuda'):
            noise_pred = pipe.unet(
                noisy_latents,
                timesteps,
                encoder_hidden_states=encoder_hidden_states
            ).sample
            loss = torch.nn.functional.mse_loss(noise_pred, noise) / accumulate_steps

        # ✅ backward
        scaler.scale(loss).backward()
        running_loss += loss.item()  # ✅ 累加到 epoch loss

        if (step + 1) % accumulate_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        # ✅ 保存 LoRA adapter 权重
        if step % 100 == 0 and local_rank == 0:
            ckpt_path = os.path.join(save_dir, f"epoch{epoch+1}_step{step}.pt")
            lora_state_dict = get_peft_model_state_dict(pipe.unet)
            torch.save(lora_state_dict, ckpt_path)
            torch.save(lora_state_dict, resume_path)
            print(f"💾 Saved LoRA-only checkpoint: {ckpt_path}")

    torch.cuda.empty_cache()

    # ✅ 打印 epoch 平均 loss（仅主进程）
    avg_loss = running_loss / len(dataloader)
    if local_rank == 0:
        print(f"✅ [Epoch {epoch+1}] Average Loss: {avg_loss:.6f}")


Epoch 1 [Rank 0]:   0%|          | 1/250 [00:07<31:02,  7.48s/it]

💾 Saved LoRA-only checkpoint: checkpoints/epoch1_step0.pt


Epoch 1 [Rank 0]:  40%|████      | 101/250 [06:23<10:05,  4.06s/it]

💾 Saved LoRA-only checkpoint: checkpoints/epoch1_step100.pt


Epoch 1 [Rank 0]:  80%|████████  | 201/250 [12:24<03:12,  3.92s/it]

💾 Saved LoRA-only checkpoint: checkpoints/epoch1_step200.pt


Epoch 1 [Rank 0]: 100%|██████████| 250/250 [15:22<00:00,  3.69s/it]


✅ [Epoch 1] Average Loss: 0.069618


Epoch 2 [Rank 0]:   0%|          | 1/250 [00:03<14:18,  3.45s/it]

💾 Saved LoRA-only checkpoint: checkpoints/epoch2_step0.pt


Epoch 2 [Rank 0]:  40%|████      | 101/250 [06:12<09:32,  3.84s/it]

💾 Saved LoRA-only checkpoint: checkpoints/epoch2_step100.pt


Epoch 2 [Rank 0]:  80%|████████  | 201/250 [12:07<03:14,  3.97s/it]

💾 Saved LoRA-only checkpoint: checkpoints/epoch2_step200.pt


Epoch 2 [Rank 0]: 100%|██████████| 250/250 [15:02<00:00,  3.61s/it]


✅ [Epoch 2] Average Loss: 0.046302


Epoch 3 [Rank 0]:   0%|          | 1/250 [00:05<20:59,  5.06s/it]

💾 Saved LoRA-only checkpoint: checkpoints/epoch3_step0.pt


Epoch 3 [Rank 0]:  40%|████      | 101/250 [06:06<10:01,  4.04s/it]

💾 Saved LoRA-only checkpoint: checkpoints/epoch3_step100.pt


Epoch 3 [Rank 0]:  80%|████████  | 201/250 [12:31<03:08,  3.85s/it]

💾 Saved LoRA-only checkpoint: checkpoints/epoch3_step200.pt


Epoch 3 [Rank 0]: 100%|██████████| 250/250 [15:28<00:00,  3.71s/it]

✅ [Epoch 3] Average Loss: 0.039379





In [13]:
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
import torch
import imageio
import numpy as np
from IPython.display import HTML, Video
from base64 import b64encode

# === 加载原始 pipeline ===
pipe = DiffusionPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16, variant="fp16")
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
pipe.to("cuda")

# === 加载你训练好的 LoRA 参数 ===
lora_weights = torch.load("checkpoints/last_checkpoint.pt")
pipe.unet.load_state_dict(lora_weights, strict=False)  # 若是 DDP 训练，可能需要 .module

# === 推理 ===
pipe.enable_model_cpu_offload()
prompt = "crack two eggs into a bowl"
result = pipe(prompt, num_inference_steps=25)
frames = result.frames[0]  # shape: (T, H, W, C)

# === 保存视频 ===
out_path = "uniform_3epoch_egg.mp4"
frames = [(frame * 255).astype(np.uint8) if frame.max() <= 1 else frame.astype(np.uint8) for frame in frames]
imageio.mimsave(out_path, frames, fps=8)

# === 视频预览 ===
mp4 = open(out_path, "rb").read()
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML(f"<video width=512 controls><source src='{data_url}' type='video/mp4'></video>")

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

In [19]:
# === 推理 ===
pipe.enable_model_cpu_offload()
prompt = "A person is slicing vegetables on a cutting board"
result = pipe(prompt, num_inference_steps=25)
frames = result.frames[0]  # shape: (T, H, W, C)

# === 保存视频 ===
out_path = "uniform_3epoch_vegetable.mp4"
frames = [(frame * 255).astype(np.uint8) if frame.max() <= 1 else frame.astype(np.uint8) for frame in frames]
imageio.mimsave(out_path, frames, fps=8)

# === 视频预览 ===
mp4 = open(out_path, "rb").read()
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML(f"<video width=512 controls><source src='{data_url}' type='video/mp4'></video>")

  0%|          | 0/25 [00:00<?, ?it/s]

FFine-tuning 10 epoch

In [11]:
import os
import torch
from tqdm import tqdm
from peft import get_peft_model_state_dict

# ==== Resume Training Settings ====
save_dir = "checkpoints"
resume_path = os.path.join(save_dir, "last_checkpoint.pt")
total_epochs = 10
start_epoch = 0

# 检查是否存在 checkpoint
if os.path.exists(resume_path):
    print(f" Resuming from checkpoint: {resume_path}")
    start_epoch = 3  # 假设上次训练完成了 3 个 epoch
else:
    print(" Starting fresh training.")

# ==== Step 4: 训练循环 + Checkpoint 保存 ====
optimizer = torch.optim.Adam(pipe.unet.parameters(), lr=1e-4)
pipe.unet.train()

os.makedirs(save_dir, exist_ok=True)

if os.path.exists(resume_path):
    map_location = {f"cuda:{0}": f"cuda:{local_rank}"}
    lora_state_dict = torch.load(resume_path, map_location=map_location)

    if is_distributed:
        pipe.unet.module.load_state_dict(lora_state_dict, strict=False)
    else:
        pipe.unet.load_state_dict(lora_state_dict, strict=False)

    print("✅ LoRA adapter weights loaded.")


accumulate_steps = 4
scaler = torch.amp.GradScaler('cuda')

for epoch in range(start_epoch, total_epochs):
    if is_distributed:
        sampler.set_epoch(epoch)
    optimizer.zero_grad()

    running_loss = 0.0  # ✅ 用于统计本轮平均 loss

    for step, (video, text) in enumerate(tqdm(dataloader, desc=f"Epoch {epoch+1} [Rank {local_rank}]")):
        video = video.to(device)  # [B, T, C, H, W]
        B, T, C, H, W = video.shape

        # ✅ 视频帧采样
        video = sample_segment_frames_batch(video, num_frames=8, method="uniform")
        B, T, C, H, W = video.shape

        # ✅ 文本 tokenizer
        text_inputs = pipe.tokenizer(list(text), return_tensors="pt", padding=True, truncation=True).to(device)
        encoder_hidden_states = pipe.text_encoder(**text_inputs)[0]  # [B, L, D]

        # ✅ VAE 编码
        video_flat = video.view(B * T, C, H, W)
        with torch.no_grad():
            latents_flat = pipe.vae.encode(video_flat).latent_dist.sample()

        latents = latents_flat.view(B, T, -1, H // 8, W // 8) * 0.18215
        latents = latents.permute(0, 2, 1, 3, 4)  # [B, C, T, H, W]

        # ✅ 添加噪声
        noise = torch.randn_like(latents)
        timesteps = torch.randint(0, 1000, (B,), device=latents.device).long()
        noisy_latents = pipe.scheduler.add_noise(latents, noise, timesteps)

        # ✅ forward + loss
        with torch.amp.autocast('cuda'):
            noise_pred = pipe.unet(
                noisy_latents,
                timesteps,
                encoder_hidden_states=encoder_hidden_states
            ).sample
            loss = torch.nn.functional.mse_loss(noise_pred, noise) / accumulate_steps

        # ✅ backward
        scaler.scale(loss).backward()
        running_loss += loss.item()  # ✅ 累加到 epoch loss

        if (step + 1) % accumulate_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        # ✅ 保存 LoRA adapter 权重
        if step % 100 == 0 and local_rank == 0:
            ckpt_path = os.path.join(save_dir, f"epoch{epoch+1}_step{step}.pt")
            lora_state_dict = get_peft_model_state_dict(pipe.unet)
            torch.save(lora_state_dict, ckpt_path)
            torch.save(lora_state_dict, resume_path)
            print(f" Saved LoRA-only checkpoint: {ckpt_path}")

    torch.cuda.empty_cache()

    # ✅ 打印 epoch 平均 loss（仅主进程）
    avg_loss = running_loss / len(dataloader)
    if local_rank == 0:
        print(f" [Epoch {epoch+1}] Average Loss: {avg_loss:.6f}")




 Resuming from checkpoint: checkpoints/last_checkpoint.pt
✅ LoRA adapter weights loaded.


Epoch 4 [Rank 0]:   0%|          | 1/250 [00:07<31:38,  7.62s/it]

 Saved LoRA-only checkpoint: checkpoints/epoch4_step0.pt


Epoch 4 [Rank 0]:  40%|████      | 101/250 [06:16<09:37,  3.88s/it]

 Saved LoRA-only checkpoint: checkpoints/epoch4_step100.pt


Epoch 4 [Rank 0]:  80%|████████  | 201/250 [12:09<03:09,  3.86s/it]

 Saved LoRA-only checkpoint: checkpoints/epoch4_step200.pt


Epoch 4 [Rank 0]: 100%|██████████| 250/250 [15:10<00:00,  3.64s/it]


 [Epoch 4] Average Loss: 0.071865


Epoch 5 [Rank 0]:   0%|          | 1/250 [00:05<20:53,  5.03s/it]

 Saved LoRA-only checkpoint: checkpoints/epoch5_step0.pt


Epoch 5 [Rank 0]:  40%|████      | 101/250 [05:57<09:51,  3.97s/it]

 Saved LoRA-only checkpoint: checkpoints/epoch5_step100.pt


Epoch 5 [Rank 0]:  80%|████████  | 201/250 [12:41<03:55,  4.80s/it]

 Saved LoRA-only checkpoint: checkpoints/epoch5_step200.pt


Epoch 5 [Rank 0]: 100%|██████████| 250/250 [15:36<00:00,  3.75s/it]


 [Epoch 5] Average Loss: 0.047686


Epoch 6 [Rank 0]:   0%|          | 1/250 [00:05<21:07,  5.09s/it]

 Saved LoRA-only checkpoint: checkpoints/epoch6_step0.pt


Epoch 6 [Rank 0]:  40%|████      | 101/250 [06:45<09:15,  3.73s/it]

 Saved LoRA-only checkpoint: checkpoints/epoch6_step100.pt


Epoch 6 [Rank 0]:  80%|████████  | 201/250 [13:22<03:23,  4.16s/it]

 Saved LoRA-only checkpoint: checkpoints/epoch6_step200.pt


Epoch 6 [Rank 0]: 100%|██████████| 250/250 [16:21<00:00,  3.92s/it]


 [Epoch 6] Average Loss: 0.034472


Epoch 7 [Rank 0]:   0%|          | 1/250 [00:05<21:10,  5.10s/it]

 Saved LoRA-only checkpoint: checkpoints/epoch7_step0.pt


Epoch 7 [Rank 0]:  40%|████      | 101/250 [06:15<09:51,  3.97s/it]

 Saved LoRA-only checkpoint: checkpoints/epoch7_step100.pt


Epoch 7 [Rank 0]:  80%|████████  | 201/250 [12:22<03:25,  4.19s/it]

 Saved LoRA-only checkpoint: checkpoints/epoch7_step200.pt


Epoch 7 [Rank 0]: 100%|██████████| 250/250 [15:15<00:00,  3.66s/it]


 [Epoch 7] Average Loss: 0.031760


Epoch 8 [Rank 0]:   0%|          | 1/250 [00:05<21:53,  5.28s/it]

 Saved LoRA-only checkpoint: checkpoints/epoch8_step0.pt


Epoch 8 [Rank 0]:  40%|████      | 101/250 [06:49<09:48,  3.95s/it]

 Saved LoRA-only checkpoint: checkpoints/epoch8_step100.pt


Epoch 8 [Rank 0]:  80%|████████  | 201/250 [13:07<04:08,  5.07s/it]

 Saved LoRA-only checkpoint: checkpoints/epoch8_step200.pt


Epoch 8 [Rank 0]: 100%|██████████| 250/250 [16:32<00:00,  3.97s/it]


 [Epoch 8] Average Loss: 0.030559


Epoch 9 [Rank 0]:   0%|          | 1/250 [00:06<25:11,  6.07s/it]

 Saved LoRA-only checkpoint: checkpoints/epoch9_step0.pt


Epoch 9 [Rank 0]:  40%|████      | 101/250 [07:35<09:55,  4.00s/it]

 Saved LoRA-only checkpoint: checkpoints/epoch9_step100.pt


Epoch 9 [Rank 0]:  80%|████████  | 201/250 [14:35<03:43,  4.57s/it]

 Saved LoRA-only checkpoint: checkpoints/epoch9_step200.pt


Epoch 9 [Rank 0]: 100%|██████████| 250/250 [17:51<00:00,  4.28s/it]


 [Epoch 9] Average Loss: 0.030256


Epoch 10 [Rank 0]:   0%|          | 1/250 [00:06<25:27,  6.13s/it]

 Saved LoRA-only checkpoint: checkpoints/epoch10_step0.pt


Epoch 10 [Rank 0]:  40%|████      | 101/250 [06:00<10:05,  4.06s/it]

 Saved LoRA-only checkpoint: checkpoints/epoch10_step100.pt


Epoch 10 [Rank 0]:  80%|████████  | 201/250 [15:31<03:18,  4.04s/it]

 Saved LoRA-only checkpoint: checkpoints/epoch10_step200.pt


Epoch 10 [Rank 0]: 100%|██████████| 250/250 [18:27<00:00,  4.43s/it]

 [Epoch 10] Average Loss: 0.030326





In [20]:
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
import torch
import imageio
import numpy as np
from IPython.display import HTML, Video
from base64 import b64encode

# === 加载原始 pipeline ===
pipe = DiffusionPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16, variant="fp16")
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
pipe.to("cuda")

# === 加载你训练好的 LoRA 参数 ===
lora_weights = torch.load("checkpoints/last_checkpoint.pt")
pipe.unet.load_state_dict(lora_weights, strict=False)  # 若是 DDP 训练，可能需要 .module

# === 推理 ===
pipe.enable_model_cpu_offload()
prompt = "crack two eggs into a bowl"
result = pipe(prompt, num_inference_steps=25)
frames = result.frames[0]  # shape: (T, H, W, C)

# === 保存视频 ===
out_path = "uniform_10epoch_egg.mp4"
frames = [(frame * 255).astype(np.uint8) if frame.max() <= 1 else frame.astype(np.uint8) for frame in frames]
imageio.mimsave(out_path, frames, fps=8)

# === 视频预览 ===
mp4 = open(out_path, "rb").read()
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML(f"<video width=512 controls><source src='{data_url}' type='video/mp4'></video>")

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

In [24]:
# === 推理 ===
pipe.enable_model_cpu_offload()
prompt = "A person is slicing vegetables on a cutting board"
result = pipe(prompt, num_inference_steps=25)
frames = result.frames[0]  # shape: (T, H, W, C)

# === 保存视频 ===
out_path = "uniform_10epoch_vegetable.mp4"
frames = [(frame * 255).astype(np.uint8) if frame.max() <= 1 else frame.astype(np.uint8) for frame in frames]
imageio.mimsave(out_path, frames, fps=8)

# === 视频预览 ===
mp4 = open(out_path, "rb").read()
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML(f"<video width=512 controls><source src='{data_url}' type='video/mp4'></video>")

  0%|          | 0/25 [00:00<?, ?it/s]

In [18]:
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
import torch
import imageio
import numpy as np
from IPython.display import HTML, Video
from base64 import b64encode

# === 加载原始 pipeline ===
pipe = DiffusionPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16, variant="fp16")
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
pipe.to("cuda")

# === 加载你训练好的 LoRA 参数 ===
lora_weights = torch.load("checkpoints/last_checkpoint.pt")
pipe.unet.load_state_dict(lora_weights, strict=False)  # 若是 DDP 训练，可能需要 .module

# === 推理 ===
pipe.enable_model_cpu_offload()
prompt = "crack two eggs into a bowl"
result = pipe(prompt, num_inference_steps=25)
frames = result.frames[0]  # shape: (T, H, W, C)

# === 保存视频 ===
out_path = "finetuned_output.mp4"
frames = [(frame * 255).astype(np.uint8) if frame.max() <= 1 else frame.astype(np.uint8) for frame in frames]
imageio.mimsave(out_path, frames, fps=8)

# === 视频预览 ===
mp4 = open(out_path, "rb").read()
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML(f"<video width=512 controls><source src='{data_url}' type='video/mp4'></video>")

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

In [20]:
# === 推理 ===
pipe.enable_model_cpu_offload()
prompt = "A person is slicing vegetables on a cutting board"
result = pipe(prompt, num_inference_steps=25)
frames = result.frames[0]  # shape: (T, H, W, C)

# === 保存视频 ===
out_path = "finetuned_output.mp4"
frames = [(frame * 255).astype(np.uint8) if frame.max() <= 1 else frame.astype(np.uint8) for frame in frames]
imageio.mimsave(out_path, frames, fps=8)

# === 视频预览 ===
mp4 = open(out_path, "rb").read()
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML(f"<video width=512 controls><source src='{data_url}' type='video/mp4'></video>")

  0%|          | 0/25 [00:00<?, ?it/s]

In [25]:
# ==== Step 5: 上传最终模型到 S3（仅主进程） ====
if local_rank == 0:
    s3 = boto3.client(
        's3',
        aws_access_key_id="AKIAZOZQF46DIM7BU3TG",
        aws_secret_access_key="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"
    )
    final_ckpt = resume_path
    s3_upload_key = "ModelScope_T2V_finetuned/uniform_10epoch_123.pt"
    s3.upload_file(final_ckpt, s3_bucket, s3_upload_key)
    print(f"LoRA weights uploaded to s3://{s3_bucket}/{s3_upload_key}")

LoRA weights uploaded to s3://data298youcook2/ModelScope_T2V_finetuned/uniform_10epoch_123.pt


In [None]:
# ✅ 推理函数：generate_video(prompt)
def generate_video(prompt, lora_path="checkpoints/last_checkpoint.pt", steps=25):
    print("🔧 Loading pipeline and LoRA weights...")
    pipe = DiffusionPipeline.from_pretrained(
        "damo-vilab/text-to-video-ms-1.7b",
        torch_dtype=torch.float16,
        variant="fp16"
    )
    pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
    pipe.to("cuda")

    # 加载 LoRA 权重
    lora_weights = torch.load(lora_path, map_location="cuda")
    pipe.unet.load_state_dict(lora_weights, strict=False)

    print(f"🎬 Generating video for: '{prompt}'")
    pipe.enable_model_cpu_offload()
    result = pipe(prompt, num_inference_steps=steps)

    frames = result.frames[0]
    out_path = "preview_output.mp4"
    frames = [(f * 255).astype(np.uint8) if f.max() <= 1 else f.astype(np.uint8) for f in frames]
    imageio.mimsave(out_path, frames, fps=8)

    mp4 = open(out_path, "rb").read()
    data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
    return HTML(f"<video width=512 controls><source src='{data_url}' type='video/mp4'></video>")
