In [None]:
!pip install -q diffusers transformers accelerate peft torchvision

In [None]:
import os, json, glob
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from transformers import CLIPTokenizer
from google.colab import drive
drive.mount('/content/drive')

# Set up the device
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16

tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")

class YouCook2SDDataset(Dataset):
    def __init__(self, annotation_path, frame_dir, tokenizer, transform=None, max_frames=8):
        with open(annotation_path, "r") as f:
            self.annotations = json.load(f)

        self.tokenizer = tokenizer
        self.frame_dir = frame_dir
        self.transform = transform or transforms.Compose([
            transforms.Resize((512, 512)),
            transforms.ToTensor()
        ])
        self.max_frames = max_frames

        # Only include videos that have frames extracted
        self.existing_video_ids = set([
            f.split("_sentence")[0] for f in os.listdir(self.frame_dir) if "_frames" in f
        ])

        self.data = self.prepare_data()

    def prepare_data(self):
        data = []
        for video_id, video_data in self.annotations["database"].items():
            if video_id not in self.existing_video_ids:
                continue
            for idx, ann in enumerate(video_data["annotations"]):
                sentence = ann["sentence"]
                sentence_folder = f"{video_id}_sentence{idx}_frames"
                full_path = os.path.join(self.frame_dir, sentence_folder)

                if not os.path.exists(full_path):
                    continue

                frame_list = sorted(glob.glob(os.path.join(full_path, "frame_*.jpg")))
                if not frame_list:
                    continue

                # Trim or pad to max_frames
                selected_frames = frame_list[:self.max_frames]
                if len(selected_frames) < self.max_frames:
                    selected_frames += [selected_frames[-1]] * (self.max_frames - len(selected_frames))

                data.append((selected_frames, sentence))
        return data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        frame_paths, caption = self.data[idx]
        frames = torch.stack([
            self.transform(Image.open(fp).convert("RGB")) for fp in frame_paths
        ])

        # Tokenize using provided tokenizer
        text_input = self.tokenizer(
            caption,
            padding="max_length",
            max_length=77,
            truncation=True,
            return_tensors="pt"
        )

        return frames, text_input.input_ids.squeeze()

tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")

annotation_path = "/content/drive/Shareddrives/DATA 298B Team 8/298A/youcook/annotations/youcookii_annotations_trainval.json"
frame_dir = "/content/drive/Shareddrives/DATA 298B Team 8/298A/processed_data/train"

dataset = YouCook2SDDataset(annotation_path, frame_dir, tokenizer)
dataloader = DataLoader(
    dataset,
    batch_size=1,
    shuffle=True,
    num_workers=2,
    pin_memory=True
)

print(f"✅ Dataset loaded with {len(dataset)} samples")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


✅ Dataset loaded with 8634 samples


In [None]:
from diffusers import StableDiffusion3Pipeline
from peft import get_peft_model, LoraConfig, TaskType
import torch

model_id = "stabilityai/stable-diffusion-3.5-medium"
pipe = StableDiffusion3Pipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")

# 设置 LoRA（作用于transformer）
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["to_q", "to_k", "to_v", "to_out.0"],
    lora_dropout=0.1,
)

pipe.transformer = get_peft_model(pipe.transformer, lora_config)

Loading pipeline components...:   0%|          | 0/9 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


In [None]:
from tqdm import tqdm
import torch
from torch import nn, optim
from diffusers import DDPMScheduler
import os

# 设置环境变量防止碎片化
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

torch.cuda.empty_cache()
torch.cuda.ipc_collect()

pipe.scheduler = DDPMScheduler.from_config(pipe.scheduler.config)
vae = pipe.vae
text_encoder = pipe.text_encoder
noise_scheduler = pipe.scheduler

# 初始化优化器和损失函数
optimizer = optim.Adam(pipe.transformer.parameters(), lr=1e-4)
loss_fn = nn.MSELoss()

num_epochs = 3
max_batches_per_epoch = 200  # 每轮epoch最多训练200个batch
device = "cuda"

# 修改后的训练循环
for epoch in range(num_epochs):
    pipe.transformer.train()

    # 创建有限迭代器，只取前200个batch
    limited_dataloader = []
    for i, batch in enumerate(dataloader):
        if i >= max_batches_per_epoch:
            break
        limited_dataloader.append(batch)

    loop = tqdm(limited_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")
    total_loss = 0

    for batch in loop:
        frames, text_input_ids = batch
        frames = frames.to(device, dtype=torch.float16)
        text_input_ids = text_input_ids.to(device)

        # 获取文本编码器输出
        with torch.no_grad():  # 冻结text_encoder以减少内存
            text_encoder_output = text_encoder(text_input_ids, return_dict=True)
            encoder_hidden_states = text_encoder_output["last_hidden_state"]
            pooled_projections = text_encoder_output["text_embeds"]

        # 调整维度（确保dtype匹配）
        if encoder_hidden_states.shape[-1] != 4096:
            linear = nn.Linear(encoder_hidden_states.shape[-1], 4096).to(device)
            linear = linear.to(dtype=encoder_hidden_states.dtype)
            encoder_hidden_states = linear(encoder_hidden_states)
        if pooled_projections.shape[-1] != 2048:
            linear = nn.Linear(pooled_projections.shape[-1], 2048).to(device)
            linear = linear.to(dtype=pooled_projections.dtype)
            pooled_projections = linear(pooled_projections)

        # 处理视频帧 - 只取最后一帧以减少计算量
        last_frames = frames[:, -1]  # [B, 3, 512, 512]

        # VAE编码
        with torch.no_grad():  # 冻结VAE以减少内存
            latents = vae.encode(last_frames).latent_dist.sample()  # [B, 4, 64, 64]

        # 准备噪声和时间步
        timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps,
                                (frames.shape[0],), device=device, dtype=torch.long)
        noise = torch.randn_like(latents)
        noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)  # [B, 4, 64, 64]

        # 前向传递
        preds = pipe.transformer(
            hidden_states=noisy_latents,
            encoder_hidden_states=encoder_hidden_states,
            pooled_projections=pooled_projections,
            timestep=timesteps,
        )["sample"]

        # 计算损失
        loss = loss_fn(preds, noise)

        optimizer.zero_grad()
        loss.backward()

        # 梯度裁剪防止爆炸
        torch.nn.utils.clip_grad_norm_(pipe.transformer.parameters(), 1.0)

        optimizer.step()

        total_loss += loss.item()
        loop.set_postfix(loss=loss.item())

        # 手动释放内存
        del frames, text_input_ids, text_encoder_output, last_frames, latents, noise, noisy_latents, preds
        torch.cuda.empty_cache()

    avg_loss = total_loss / max(len(dataloader), max_batches_per_epoch)
    print(f"Epoch {epoch+1} Loss: {avg_loss:.4f}")

The config attributes {'shift': 3.0} were passed to DDPMScheduler, but are not expected and will be ignored. Please verify your scheduler_config.json configuration file.
Epoch 1/3: 100%|██████████| 200/200 [01:30<00:00,  2.20it/s, loss=nan]

Epoch 1 Loss: nan



Epoch 2/3: 100%|██████████| 200/200 [01:31<00:00,  2.19it/s, loss=nan]

Epoch 2 Loss: nan



Epoch 3/3: 100%|██████████| 200/200 [01:30<00:00,  2.20it/s, loss=nan]

Epoch 3 Loss: nan





In [None]:
# 保存fine-tune后的模型（简洁版）
def save_finetuned_model(pipe, base_path="/content/drive/MyDrive"):
    """保存微调后的模型和权重"""
    import os
    from datetime import datetime

    # 1. 创建保存目录（带时间戳）
    save_dir = os.path.join(base_path, f"sd3_finetuned_{datetime.now().strftime('%Y%m%d_%H%M')}")
    os.makedirs(save_dir, exist_ok=True)

    # 2. 保存整个pipeline（包含LoRA权重）
    pipe.save_pretrained(save_dir)

    # 3. 单独保存关键组件（可选）
    transformer_path = os.path.join(save_dir, "transformer_weights.safetensors")
    torch.save(pipe.transformer.state_dict(), transformer_path)

    # 4. 验证保存结果
    assert os.path.exists(os.path.join(save_dir, "model_index.json")), "模型保存失败"
    print(f"✅ 模型已保存到: {save_dir}")
    return save_dir

# 在训练结束后调用保存
model_path = save_finetuned_model(pipe)

self.transformer=PeftModel(
  (base_model): LoraModel(
    (model): SD3Transformer2DModel(
      (pos_embed): PatchEmbed(
        (proj): Conv2d(16, 1536, kernel_size=(2, 2), stride=(2, 2))
      )
      (time_text_embed): CombinedTimestepTextProjEmbeddings(
        (time_proj): Timesteps()
        (timestep_embedder): TimestepEmbedding(
          (linear_1): Linear(in_features=256, out_features=1536, bias=True)
          (act): SiLU()
          (linear_2): Linear(in_features=1536, out_features=1536, bias=True)
        )
        (text_embedder): PixArtAlphaTextProjection(
          (linear_1): Linear(in_features=2048, out_features=1536, bias=True)
          (act_1): SiLU()
          (linear_2): Linear(in_features=1536, out_features=1536, bias=True)
        )
      )
      (context_embedder): Linear(in_features=4096, out_features=1536, bias=True)
      (transformer_blocks): ModuleList(
        (0-12): 13 x JointTransformerBlock(
          (norm1): SD35AdaLayerNormZeroX(
            (sil

✅ 模型已保存到: /content/drive/MyDrive/sd3_finetuned_20250408_0137


In [None]:
pip install diffusers transformers accelerate torchvision scipy
pip install lpips
pip install torchmetrics
pip install av

In [None]:
import os
import torch
from torchvision.transforms import ToTensor, Compose, Resize, CenterCrop, Normalize
from diffusers import StableDiffusion3Pipeline
from PIL import Image
import numpy as np
from tqdm import tqdm
from transformers import CLIPProcessor, CLIPModel
from scipy import linalg
import torch.nn.functional as F
from torchvision import models
import imageio.v2 as imageio  # ✅ 用于写入视频

device = "cuda" if torch.cuda.is_available() else "cpu"
from google.colab import drive
drive.mount('/content/drive')


# 登录 HuggingFace
from huggingface_hub import login
login("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")

# 加载模型
model_id = "stabilityai/stable-diffusion-3.5-medium"
pipe = StableDiffusion3Pipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")

# ✅ 修改后的视频生成函数
def generate_video_from_text(
    pipe,
    prompt,
    num_frames=8,
    output_path="generated_video.mp4",
    fps=8,
    seed=None
):
    if seed is not None:
        torch.manual_seed(seed)

    frames = []

    for i in tqdm(range(num_frames), desc="Generating frames"):
        frame_prompt = f"{prompt}, cinematic, frame {i+1}/{num_frames}"

        with torch.autocast("cuda"):
            image = pipe(
                frame_prompt,
                num_inference_steps=20,
                generator=torch.Generator(device="cuda").manual_seed(seed+i) if seed else None
            ).images[0]

        image_np = np.array(image)  # PIL → NumPy [H, W, C]
        frames.append(image_np)

    imageio.mimsave(output_path, frames, fps=fps)
    print(f"🎞️ Video saved to {output_path}")
    return frames

# Step 1: 生成视频帧
prompt = "A chef is cooking pasta in a professional kitchen"
generated_frames = generate_video_from_text(pipe, prompt, num_frames=8, output_path="/content/drive/MyDrive/generated_pasta.mp4")

# ---- Metrics Section ----
real_frames = generated_frames  # 暂时使用相同帧

clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(device)

transform = Compose([
    Resize(299),
    CenterCrop(299),
    ToTensor(),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

def calculate_metrics(generated_frames, real_frames, prompt):
    gen_tensors = torch.stack([transform(Image.fromarray(f)).to(device) for f in generated_frames])
    real_tensors = torch.stack([transform(Image.fromarray(f)).to(device) for f in real_frames])
    min_frames = min(len(gen_tensors), len(real_tensors))
    gen_tensors = gen_tensors[:min_frames]
    real_tensors = real_tensors[:min_frames]

    def ssim(img1, img2):
        C1 = 0.01 ** 2
        C2 = 0.03 ** 2
        mu1 = F.avg_pool2d(img1, 3, 1, 1)
        mu2 = F.avg_pool2d(img2, 3, 1, 1)
        sigma1 = F.avg_pool2d(img1 * img1, 3, 1, 1) - mu1 ** 2
        sigma2 = F.avg_pool2d(img2 * img2, 3, 1, 1) - mu2 ** 2
        sigma12 = F.avg_pool2d(img1 * img2, 3, 1, 1) - mu1 * mu2
        ssim_map = ((2 * mu1 * mu2 + C1) * (2 * sigma12 + C2)) / \
                   ((mu1 ** 2 + mu2 ** 2 + C1) * (sigma1 + sigma2 + C2))
        return ssim_map.mean()

    def psnr(img1, img2):
        mse = F.mse_loss(img1, img2)
        return 20 * torch.log10(1.0 / torch.sqrt(mse))

    ssim_vals, psnr_vals = [], []
    for g, r in zip(gen_tensors, real_tensors):
        ssim_vals.append(ssim(g.unsqueeze(0), r.unsqueeze(0)))
        psnr_vals.append(psnr(g.unsqueeze(0), r.unsqueeze(0)))

    with torch.no_grad():
        inputs = clip_processor(text=[prompt], images=[Image.fromarray(f) for f in generated_frames], return_tensors="pt", padding=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        outputs = clip_model(**inputs)
        clip_score = outputs.logits_per_image.mean().item()

    inception = models.inception_v3(pretrained=True, aux_logits=False).to(device)
    inception.fc = torch.nn.Identity()
    inception.eval()

    def extract_features(frames):
        feats = []
        with torch.no_grad():
            for img in frames:
                img_tensor = transform(Image.fromarray(img)).unsqueeze(0).to(device)
                feat = inception(img_tensor)
                feats.append(feat)
        return torch.cat(feats)

    gen_feats = extract_features(generated_frames)
    real_feats = extract_features(real_frames)

    mu_gen = gen_feats.mean(dim=0)
    sigma_gen = torch.cov(gen_feats.T)
    mu_real = real_feats.mean(dim=0)
    sigma_real = torch.cov(real_feats.T)

    diff = mu_gen - mu_real
    covmean, _ = linalg.sqrtm(sigma_gen.cpu().numpy() @ sigma_real.cpu().numpy(), disp=False)
    if np.iscomplexobj(covmean):
        covmean = covmean.real
    fvd = diff @ diff + torch.trace(sigma_gen) + torch.trace(sigma_real) - 2 * torch.tensor(np.trace(covmean))

    return {
        "SSIM": torch.stack(ssim_vals).mean().item(),
        "PSNR": torch.stack(psnr_vals).mean().item(),
        "FVD": fvd.item(),
        "CLIP_SCORE": clip_score
    }

# Step 2: Evaluate metrics
metrics = calculate_metrics(generated_frames, real_frames, prompt)
print("📊 Evaluation Metrics:")
for k, v in metrics.items():
    print(f"{k}: {v:.4f}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading pipeline components...:   0%|          | 0/9 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
Generating frames:   0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Generating frames:  12%|█▎        | 1/8 [00:05<00:39,  5.66s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

Generating frames:  25%|██▌       | 2/8 [00:10<00:31,  5.19s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

Generating frames:  38%|███▊      | 3/8 [00:15<00:25,  5.04s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

Generating frames:  50%|█████     | 4/8 [00:20<00:19,  4.96s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

Generating frames:  62%|██████▎   | 5/8 [00:25<00:14,  4.93s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

Generating frames:  75%|███████▌  | 6/8 [00:29<00:09,  4.91s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

Generating frames:  88%|████████▊ | 7/8 [00:34<00:04,  4.89s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

Generating frames: 100%|██████████| 8/8 [00:39<00:00,  4.96s/it]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


🎞️ Video saved to /content/drive/MyDrive/generated_pasta.mp4




ValueError: The parameter 'aux_logits' expected value True but got False instead.

In [22]:
!pip install lpips decord transformers accelerate -q
!pip install torchmetrics imageio scikit-image einops -q

In [29]:
import torch
import torchvision.transforms as T
import numpy as np
import imageio
from skimage.metrics import peak_signal_noise_ratio, structural_similarity
import lpips
from decord import VideoReader, cpu
from torchvision.transforms import Resize, Compose, ToTensor
from transformers import CLIPProcessor, CLIPModel
from torchmetrics.image.fid import FrechetInceptionDistance
from einops import rearrange
import os
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# 设置
device = "cuda" if torch.cuda.is_available() else "cpu"
resize_to = 224  # Resize for CLIP, LPIPS, etc.
max_frames = 20  # Use a fixed number of frames

# 检查文件是否存在
real_video_path ="/content/real_video.mp4"
gen_video_path = "/content/generated_video.mp4"


if not os.path.exists(real_video_path):
    raise FileNotFoundError(f"Real video not found at {real_video_path}")
if not os.path.exists(gen_video_path):
    raise FileNotFoundError(f"Generated video not found at {gen_video_path}")

# 加载视频并转换为帧
def load_video_frames(path, max_frames=20, size=224):
    try:
        vr = VideoReader(path, ctx=cpu(0))
        total = min(len(vr), max_frames)
        indices = np.linspace(0, len(vr)-1, total).astype(int)
        transform = Compose([
            T.ToPILImage(),
            Resize((size, size)),
            ToTensor()
        ])
        frames = [transform(vr[i].asnumpy()) for i in indices]
        return torch.stack(frames)
    except Exception as e:
        raise RuntimeError(f"Error loading video {path}: {str(e)}")

try:
    # 加载帧
    print("Loading real video frames...")
    real_frames = load_video_frames(real_video_path, max_frames, resize_to).to(device)
    print("Loading generated video frames...")
    gen_frames = load_video_frames(gen_video_path, max_frames, resize_to).to(device)

    # 保证一致大小
    assert gen_frames.shape == real_frames.shape, "Frames must be same shape"

    # ---------------------------
    # SSIM & PSNR
    # ---------------------------
    def compute_ssim_psnr(gen, real):
        ssim_list, psnr_list = [], []
        for g, r in zip(gen, real):
            g_np = g.permute(1, 2, 0).cpu().numpy()
            r_np = r.permute(1, 2, 0).cpu().numpy()
            ssim_list.append(structural_similarity(r_np, g_np, multichannel=True, data_range=1.0))
            psnr_list.append(peak_signal_noise_ratio(r_np, g_np, data_range=1.0))
        return np.mean(ssim_list), np.mean(psnr_list)

    # ---------------------------
    # LPIPS
    # ---------------------------
    def compute_lpips(gen, real):
        loss_fn = lpips.LPIPS(net='alex').to(device)
        lpips_list = []
        for g, r in zip(gen, real):
            lpips_list.append(loss_fn(g.unsqueeze(0), r.unsqueeze(0)).item())
        return np.mean(lpips_list)

    # ---------------------------
    # FVD
    # ---------------------------
    def compute_fvd(gen, real):
        fvd = FrechetInceptionDistance(feature=64).to(device)
        fvd.update(rearrange(gen, 't c h w -> t c h w'), real=False)
        fvd.update(rearrange(real, 't c h w -> t c h w'), real=True)
        return fvd.compute().item()

    # ---------------------------
    # CLIP Score
    # ---------------------------
    def compute_clip_score(frames, prompt="a bowl of delicious pasta"):
        model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
        processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        images = [T.ToPILImage()(frame.cpu()) for frame in frames]
        inputs = processor(text=[prompt] * len(images), images=images, return_tensors="pt", padding=True).to(device)
        outputs = model(**inputs)
        logits_per_image = outputs.logits_per_image
        scores = logits_per_image.softmax(dim=1)[:, 0]  # assume first prompt
        return scores.mean().item()

    # ---------------------------
    # Run all metrics
    # ---------------------------
    print("Computing metrics...")
    ssim, psnr = compute_ssim_psnr(gen_frames, real_frames)
    lpips_score = compute_lpips(gen_frames, real_frames)
    fvd_score = compute_fvd(gen_frames, real_frames)
    clip_score = compute_clip_score(gen_frames, prompt="a bowl of pasta")

    print("\n📊 Evaluation Metrics:")
    print(f"SSIM: {ssim:.4f} (higher is better, max=1)")
    print(f"PSNR: {psnr:.2f} dB (higher is better)")
    print(f"LPIPS: {lpips_score:.4f} (lower is better, min=0)")
    print(f"FVD: {fvd_score:.2f} (lower is better)")
    print(f"CLIP Score: {clip_score:.4f} (higher is better)")

except Exception as e:
    print(f"An error occurred: {str(e)}")
    print("Please check:")
    print(f"1. The video files exist at the specified paths")
    print(f"2. The videos are valid MP4 files")
    print(f"3. You have permission to access the files")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


FileNotFoundError: Real video not found at /content/real_video.mp4